Pyspark Commands
#spark-shell
#pyspark
#SparkContext
#SQLContext
#HiveContext
#spark-sql (only in later versions)
#JDBC
#To connect to a remote database using JDBC
#This works only in Spark 1.3.0 or later
#Either run pyspark with --driver-class-path or set the classpath environment
#variable via os.environ
pyspark --driver-class-path /usr/share/java/mysql-connector-java.jar
import os
os.environ['SPARK_CLASSPATH'] = "/usr/share/java/mysql-connector-java.jar"
sqlContext = SQLContext(sc)
jdbcurl = "jdbc:mysql://quickstart.cloudera:3306/retail_db?user=retail_dba&password=cloudera"
df = sqlContext.load(source="jdbc", url=jdbcurl, dbtable="departments")
df.count()
##############################################################################
# Load data from HDFS and storing results back to HDFS using Spark
from pyspark import SparkContext
dataRDD = sc.textFile("/user/cloudera/sqoop_import/departments")
for line in dataRDD.collect():
    print(line)
print(dataRDD.count())
dataRDD.saveAsTextFile("/user/cloudera/pyspark/departments")
#saveAsSequenceFile
dataRDD.map(lambda x: (None, x)).saveAsSequenceFile("/user/cloudera/pyspark/departmentsSeq")
dataRDD.map(lambda x: tuple(x.split(",", 1))).saveAsSequenceFile("/user/cloudera/pyspark/departmentsSeq")
path = "/user/cloudera/pyspark/departmentsSeq"
dataRDD.map(lambda x: tuple(x.split(",", 1))).saveAsNewAPIHadoopFile(path, "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", keyClass="org.apache.hadoop.io.Text", valueClass="org.apache.hadoop.io.Text")
#We can run Hive INSERT, LOAD, and any valid Hive query in HiveContext
##############################################################################
# Developing word count program
# Create a file, type a few lines, save it as wordcount.txt, and copy it to HDFS
# at /user/cloudera/wordcount.txt
data = sc.textFile("/user/cloudera/wordcount.txt")
dataFlatMap = data.flatMap(lambda x: x.split(" "))
dataMap = dataFlatMap.map(lambda x: (x, 1))
dataReduceByKey = dataMap.reduceByKey(lambda x,y: x + y)
dataReduceByKey.saveAsTextFile("/user/cloudera/wordcountoutput")
for i in dataReduceByKey.collect():
    print(i)
##############################################################################
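#The join below assumes orders and order items have already been keyed for the
#join (ordersParsedRDD keyed by order_id, orderItemsParsedRDD keyed by
#order_item_order_id); a minimal sketch of that parsing, not part of the
#original notes:
ordersRDD = sc.textFile("/user/cloudera/sqoop_import/orders")
orderItemsRDD = sc.textFile("/user/cloudera/sqoop_import/order_items")
ordersParsedRDD = ordersRDD.map(lambda rec: (int(rec.split(",")[0]), rec))
orderItemsParsedRDD = orderItemsRDD.map(lambda rec: (int(rec.split(",")[1]), rec))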
ordersJoinOrderItems = orderItemsParsedRDD.join(ordersParsedRDD)
revenuePerOrderPerDay = ordersJoinOrderItems.map(lambda t: (t[1][1].split(",")[1], float(t[1][0].split(",")[4])))
# Using Hive
from pyspark.sql import HiveContext, Row
sqlContext = HiveContext(sc)
sqlContext.sql("set spark.sql.shuffle.partitions=10")
ordersRDD = sc.textFile("/user/cloudera/sqoop_import/orders")
ordersMap = ordersRDD.map(lambda o: o.split(","))
orders = ordersMap.map(lambda o: Row(order_id=int(o[0]), order_date=o[1], \
order_customer_id=int(o[2]), order_status=o[3]))
ordersSchema = sqlContext.inferSchema(orders)
ordersSchema.registerTempTable("orders")
orderItemsRDD = sc.textFile("/user/cloudera/sqoop_import/order_items")
orderItemsMap = orderItemsRDD.map(lambda oi: oi.split(","))
orderItems = orderItemsMap.map(lambda oi: Row(order_item_id=int(oi[0]), \
    order_item_order_id=int(oi[1]), \
    order_item_product_id=int(oi[2]), order_item_quantity=int(oi[3]), \
    order_item_subtotal=float(oi[4]), \
    order_item_product_price=float(oi[5])))
orderItemsSchema = sqlContext.inferSchema(orderItems)
orderItemsSchema.registerTempTable("order_items")
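#With both temp tables registered, any valid query can now be run against them;
#an illustrative sketch (not from the original notes) joining the two tables:
joinAggData = sqlContext.sql("select o.order_date, sum(oi.order_item_subtotal) \
    from orders o join order_items oi on o.order_id = oi.order_item_order_id \
    group by o.order_date")
for rec in joinAggData.take(5):
    print(rec)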
##############################################################################
orderItemsRDD = sc.textFile("/user/cloudera/sqoop_import/order_items")
orderItemsMap = orderItemsRDD.map(lambda rec: float(rec.split(",")[4]))
for i in orderItemsMap.take(5):
    print(i)
#avg
revenue = sc.textFile("/user/cloudera/sqoop_import/order_items").map(lambda rec: float(rec.split(",")[4])).reduce(lambda rev1, rev2: rev1 + rev2)
totalOrders = sc.textFile("/user/cloudera/sqoop_import/order_items").map(lambda rec: int(rec.split(",")[1])).distinct().count()
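#Completing the avg step above: the average is total revenue over the distinct
#order count
print(revenue / totalOrders)
#The loop below reads ordersByStatusPerDay, which is not defined in these notes;
#a plausible sketch, assuming it counts orders per (order_date, order_status):
ordersByStatusPerDay = ordersRDD.map(lambda rec: ((rec.split(",")[1], rec.split(",")[3]), 1)).reduceByKey(lambda x, y: x + y)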
for i in ordersByStatusPerDay.collect():
    print(i)
ordersJoinOrderItems = orderItemsParsedRDD.join(ordersParsedRDD)
ordersJoinOrderItemsMap = ordersJoinOrderItems.map(lambda t: (t[1][1].split(",")[1], float(t[1][0].split(",")[4])))
#average revenue per day
#Parse Orders (key order_id)
#Parse Order items (key order_item_order_id)
#Join the data sets
#Parse joined data and get (order_date, order_id) as key and order_item_subtotal as value
#Use appropriate aggregate function to get sum(order_item_subtotal) for each (order_date, order_id) combination
#Parse data to discard order_id and get order_date as key and sum(order_item_subtotal) per order as value
#Use appropriate aggregate function to get sum(order_item_subtotal) per day and count(distinct order_id) per day
#Parse data and apply average logic
ordersRDD = sc.textFile("/user/cloudera/sqoop_import/orders")
orderItemsRDD = sc.textFile("/user/cloudera/sqoop_import/order_items")
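#Per the parse steps in the outline above, both data sets need to be keyed
#before the join; a minimal sketch, assuming the same keying as earlier sections:
ordersParsedRDD = ordersRDD.map(lambda rec: (int(rec.split(",")[0]), rec))
orderItemsParsedRDD = orderItemsRDD.map(lambda rec: (int(rec.split(",")[1]), rec))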
ordersJoinOrderItems = orderItemsParsedRDD.join(ordersParsedRDD)
ordersJoinOrderItemsMap = ordersJoinOrderItems.map(lambda t: ((t[1][1].split(",")[1], t[0]), float(t[1][0].split(",")[4])))
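#revenuePerDayPerOrderMap, used below, is not defined in these notes; a
#plausible sketch per the outline (sum per (order_date, order_id) key, then
#discard order_id so order_date becomes the key):
revenuePerDayPerOrder = ordersJoinOrderItemsMap.reduceByKey(lambda x, y: x + y)
revenuePerDayPerOrderMap = revenuePerDayPerOrder.map(lambda rec: (rec[0][0], rec[1]))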
revenuePerDay = revenuePerDayPerOrderMap.combineByKey( \
lambda x: (x, 1), \
lambda acc, revenue: (acc[0] + revenue, acc[1] + 1), \
lambda total1, total2: (round(total1[0] + total2[0], 2), total1[1] + total2[1]) \
)
revenuePerDay = revenuePerDayPerOrderMap.aggregateByKey( \
(0, 0), \
lambda acc, revenue: (acc[0] + revenue, acc[1] + 1), \
lambda total1, total2: (round(total1[0] + total2[0], 2), total1[1] + total2[1]) \
)
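#Completing the outline's final step, the per-day average divides each day's
#revenue sum by its order count (a sketch, not in the original notes):
avgRevenuePerDay = revenuePerDay.map(lambda rec: (rec[0], rec[1][0] / rec[1][1]))
for i in avgRevenuePerDay.collect():
    print(i)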
ordersJoinOrderItems = orderItemsParsedRDD.join(ordersParsedRDD)
ordersPerDayPerCustomer = ordersJoinOrderItems.map(lambda rec: ((rec[1][1].split(",")[1], rec[1][1].split(",")[2]), float(rec[1][0].split(",")[4])))
revenuePerDayPerCustomer = ordersPerDayPerCustomer.reduceByKey(lambda x, y: x + y)
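#The reduce below uses revenuePerDayPerCustomerMap and findMax, neither of
#which is defined in these notes; a plausible sketch (re-key by date, then keep
#the (customer_id, revenue) tuple with the larger revenue):
revenuePerDayPerCustomerMap = revenuePerDayPerCustomer.map(lambda rec: (rec[0][0], (rec[0][1], rec[1])))
def findMax(x, y):
    #Return whichever (customer_id, revenue) tuple has the higher revenue
    return x if x[1] >= y[1] else y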
topCustomerPerDaybyRevenue = revenuePerDayPerCustomerMap.reduceByKey(lambda x, y: findMax(x, y))
data = sqlContext.sql(" \
select * from ( \
select o.order_date, o.order_customer_id, sum(oi.order_item_subtotal) order_item_subtotal \
from orders o join order_items oi \
on o.order_id = oi.order_item_order_id \
group by o.order_date, o.order_customer_id) q1 \
join \
(select q.order_date, max(q.order_item_subtotal) order_item_subtotal \
from (select o.order_date, o.order_customer_id, sum(oi.order_item_subtotal) order_item_subtotal \
from orders o join order_items oi \
on o.order_id = oi.order_item_order_id \
group by o.order_date, o.order_customer_id) q \
group by q.order_date) q2 \
on q1.order_date = q2.order_date and q1.order_item_subtotal = q2.order_item_subtotal \
order by q1.order_date")
#The same query, to run directly in spark-sql or the Hive shell:
select * from (
select o.order_date, o.order_customer_id, sum(oi.order_item_subtotal) order_item_subtotal
from orders o join order_items oi
on o.order_id = oi.order_item_order_id
group by o.order_date, o.order_customer_id) q1
join
(select q.order_date, max(q.order_item_subtotal) order_item_subtotal
from (select o.order_date, o.order_customer_id, sum(oi.order_item_subtotal) order_item_subtotal
from orders o join order_items oi
on o.order_id = oi.order_item_order_id
group by o.order_date, o.order_customer_id) q
group by q.order_date) q2
on q1.order_date = q2.order_date and q1.order_item_subtotal = q2.order_item_subtotal
order by q1.order_date;
##############################################################################
#Check if there are any cancelled orders with amount greater than $1000
#Get only cancelled orders
#Join orders and order items
#Generate sum(order_item_subtotal) per order
#Filter data which amounts to greater than $1000
ordersRDD = sc.textFile("/user/cloudera/sqoop_import/orders")
orderItemsRDD = sc.textFile("/user/cloudera/sqoop_import/order_items")
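#orderItemsAgg and ordersParsedRDD, used in the join below, are not defined in
#these notes; a plausible sketch per the outline (filter cancelled orders, key
#both sides by order id, sum subtotals per order; assumes the status literal is
#CANCELED as in the retail_db sample data):
ordersParsedRDD = ordersRDD.filter(lambda rec: rec.split(",")[3] == "CANCELED").map(lambda rec: (int(rec.split(",")[0]), rec))
orderItemsAgg = orderItemsRDD.map(lambda rec: (int(rec.split(",")[1]), float(rec.split(",")[4]))).reduceByKey(lambda x, y: x + y)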
ordersJoinOrderItems = orderItemsAgg.join(ordersParsedRDD)
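#Completing the outline's last step, keep only joined orders whose summed
#subtotal exceeds 1000 (a sketch; the aggregated amount sits at rec[1][0]):
ordersGT1000 = ordersJoinOrderItems.filter(lambda rec: rec[1][0] > 1000)
for i in ordersGT1000.take(5):
    print(i)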
##########################################################
def getFirstTwo(rec):
    #Return the first two records from the grouped values
    x = []
    ctr = 0
    for i in rec[1]:
        if(ctr < 2):
            x.append(i)
            ctr = ctr + 1
    return (y for y in x)

def getTop(rec):
    #Return all records from the grouped values that carry the highest price
    #(renamed max to topPrice to avoid shadowing the builtin)
    x = []
    topPrice = 0
    for i in rec[1]:
        prodPrice = float(i.split(",")[4])
        if(prodPrice > topPrice):
            topPrice = prodPrice
    for j in rec[1]:
        if(float(j.split(",")[4]) == topPrice):
            x.append(j)
    return (y for y in x)
products = sc.textFile("/user/cloudera/sqoop_import/products")
productsMap = products.map(lambda rec: (rec.split(",")[1], rec))
productsGroupBy = productsMap.groupByKey()
for i in productsGroupBy.collect():
    print(i)
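#getFirstTwo and getTop can then be applied per group via flatMap; an
#illustrative usage (not part of the original notes), printing the
#highest-priced product(s) in each category:
for i in productsGroupBy.flatMap(getTop).collect():
    print(i)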