Pyspark Commands

This document provides examples of using Spark for various data processing tasks like connecting to remote databases, developing Spark applications, loading and storing data in HDFS, working with HiveContext and SQLContext, joining datasets, and calculating aggregate statistics. It includes code samples for connecting to MySQL, developing Spark jobs in Python, loading and saving data from and to HDFS, working with Hive and SQL contexts, joining order and order item data, and calculating sums, averages, and counts.

#Introduction

#spark-shell
#pyspark
#SparkContext
#SQLContext
#HiveContext
#spark-sql (only latest version)
#JDBC
#To connect to remote database using jdbc
#It works only from spark 1.3.0 or later
#Either you need to run pyspark with --driver-class-path or set the environment variable with os.environ
pyspark --driver-class-path /usr/share/java/mysql-connector-java.jar
os.environ['SPARK_CLASSPATH'] = "/usr/share/java/mysql-connector-java.jar"

from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)
jdbcurl = "jdbc:mysql://quickstart.cloudera:3306/retail_db?user=retail_dba&password=cloudera"
df = sqlContext.load(source="jdbc", url=jdbcurl, dbtable="departments")

for rec in df.collect():
    print(rec)

df.count()
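#On Spark 1.4 or later the DataFrameReader API can be used instead of sqlContext.load (a sketch):
df = sqlContext.read.format("jdbc").options(url=jdbcurl, dbtable="departments").load()
df.show()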

##############################################################################

#Developing simple Python based applications for Spark


#Save this to a file with py extension
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("pyspark")
sc = SparkContext(conf=conf)
dataRDD = sc.textFile("/user/cloudera/sqoop_import/departments")
for line in dataRDD.collect():
    print(line)
dataRDD.saveAsTextFile("/user/cloudera/pyspark/departmentsTesting")

#Run using this command


#--master local runs the job in Spark local mode (on the local machine)
spark-submit --master local saveFile.py

#--master yarn runs the job on YARN


spark-submit --master yarn saveFile.py
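#Resource options can be passed as well (the values below are only illustrative):
spark-submit --master yarn --num-executors 2 --executor-memory 1G saveFile.py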
##############################################################################

# Load data from HDFS and storing results back to HDFS using Spark
from pyspark import SparkContext

dataRDD = sc.textFile("/user/cloudera/sqoop_import/departments")
for line in dataRDD.collect():
    print(line)

print(dataRDD.count())

dataRDD.saveAsTextFile("/user/cloudera/pyspark/departments")

#Object files are not available in python


dataRDD.saveAsObjectFile("/user/cloudera/pyspark/departmentsObject")
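#A Python-friendly alternative is pickle files, which PySpark does provide (a sketch):
dataRDD.saveAsPickleFile("/user/cloudera/pyspark/departmentsPickle")
pickledRDD = sc.pickleFile("/user/cloudera/pyspark/departmentsPickle")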

#saveAsSequenceFile
dataRDD.map(lambda x: (None, x)).saveAsSequenceFile("/user/cloudera/pyspark/departmentsSeq")
dataRDD.map(lambda x: tuple(x.split(",", 1))).saveAsSequenceFile("/user/cloudera/pyspark/departmentsSeq")
dataRDD.map(lambda x: tuple(x.split(",", 1))).saveAsSequenceFile("/user/cloudera/pyspark/orders")
path = "/user/cloudera/pyspark/departmentsSeq"

dataRDD.map(lambda x: tuple(x.split(",", 1))).saveAsNewAPIHadoopFile(path, "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", keyClass="org.apache.hadoop.io.Text", valueClass="org.apache.hadoop.io.Text")

#reading sequence file


data = sc.sequenceFile("/user/cloudera/pyspark/departmentsSeq")
data = sc.sequenceFile("/user/cloudera/pyspark/orders")
data = sc.sequenceFile("/user/cloudera/pyspark/departmentsSeq", "org.apache.hadoop.io.IntWritable", "org.apache.hadoop.io.Text")
for rec in data.collect():
    print(rec)

from pyspark.sql import HiveContext


sqlContext = HiveContext(sc)
depts = sqlContext.sql("select * from departments")
for rec in depts.collect():
    print(rec)

sqlContext.sql("create table departmentsSpark as select * from departments")


depts = sqlContext.sql("select * from departmentsSpark")
for rec in depts.collect():
    print(rec)

#We can run hive INSERT, LOAD and any valid hive query in Hive context
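#For example (a sketch; the filter value and HDFS path below are only illustrative):
sqlContext.sql("insert into table departmentsSpark select * from departments where department_id > 7")
sqlContext.sql("load data inpath '/user/cloudera/pyspark/departments_extra' into table departmentsSpark")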

#Make sure you copy departments.json to HDFS


#create departments.json on Linux file system
{"department_id":2, "department_name":"Fitness"}
{"department_id":3, "department_name":"Footwear"}
{"department_id":4, "department_name":"Apparel"}
{"department_id":5, "department_name":"Golf"}
{"department_id":6, "department_name":"Outdoors"}
{"department_id":7, "department_name":"Fan Shop"}
{"department_id":8, "department_name":"TESTING"}
{"department_id":8000, "department_name":"TESTING"}

#copying to HDFS (using linux command line)


hadoop fs -put departments.json /user/cloudera/pyspark

from pyspark.sql import SQLContext


sqlContext = SQLContext(sc)
departmentsJson = sqlContext.jsonFile("/user/cloudera/pyspark/departments.json")
departmentsJson.registerTempTable("departmentsTable")
departmentsData = sqlContext.sql("select * from departmentsTable")
for rec in departmentsData.collect():
    print(rec)

#Writing data in json format


departmentsData.toJSON().saveAsTextFile("/user/cloudera/pyspark/departmentsJson")

#Validating the data


hadoop fs -cat /user/cloudera/pyspark/departmentsJson/part*

##############################################################################
# Developing word count program
# Create a file and type few lines and save it as wordcount.txt and copy to HDFS
# to /user/cloudera/wordcount.txt

data = sc.textFile("/user/cloudera/wordcount.txt")
dataFlatMap = data.flatMap(lambda x: x.split(" "))
dataMap = dataFlatMap.map(lambda x: (x, 1))
dataReduceByKey = dataMap.reduceByKey(lambda x,y: x + y)

dataReduceByKey.saveAsTextFile("/user/cloudera/wordcountoutput")

for i in dataReduceByKey.collect():
    print(i)
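#Top 5 most frequent words (a sketch using takeOrdered):
for i in dataReduceByKey.takeOrdered(5, key=lambda x: -x[1]):
    print(i)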
##############################################################################

# Join disparate datasets together using Spark


# Problem statement: get the revenue and number of orders from order_items on a daily basis
ordersRDD = sc.textFile("/user/cloudera/sqoop_import/orders")
orderItemsRDD = sc.textFile("/user/cloudera/sqoop_import/order_items")

ordersParsedRDD = ordersRDD.map(lambda rec: (int(rec.split(",")[0]), rec))


orderItemsParsedRDD = orderItemsRDD.map(lambda rec: (int(rec.split(",")[1]), rec))

ordersJoinOrderItems = orderItemsParsedRDD.join(ordersParsedRDD)
revenuePerOrderPerDay = ordersJoinOrderItems.map(lambda t: (t[1][1].split(",")[1], float(t[1][0].split(",")[4])))

# Get order count per day


ordersPerDay = ordersJoinOrderItems.map(lambda rec: rec[1][1].split(",")[1] + "," + str(rec[0])).distinct()
ordersPerDayParsedRDD = ordersPerDay.map(lambda rec: (rec.split(",")[0], 1))
totalOrdersPerDay = ordersPerDayParsedRDD.reduceByKey(lambda x, y: x + y)

# Get revenue per day from joined data


totalRevenuePerDay = revenuePerOrderPerDay.reduceByKey(lambda total1, total2: total1 + total2)

for data in totalRevenuePerDay.collect():
    print(data)

# Joining order count per day and revenue per day


finalJoinRDD = totalOrdersPerDay.join(totalRevenuePerDay)
for data in finalJoinRDD.take(5):
    print(data)

# Using Hive
from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)
sqlContext.sql("set spark.sql.shuffle.partitions=10");

joinAggData = sqlContext.sql("select o.order_date, round(sum(oi.order_item_subtotal), 2), \
count(distinct o.order_id) from orders o join order_items oi \
on o.order_id = oi.order_item_order_id \
group by o.order_date order by o.order_date")

for data in joinAggData.collect():
    print(data)

# Using spark native sql


from pyspark.sql import SQLContext, Row
sqlContext = SQLContext(sc)
sqlContext.sql("set spark.sql.shuffle.partitions=10");

ordersRDD = sc.textFile("/user/cloudera/sqoop_import/orders")
ordersMap = ordersRDD.map(lambda o: o.split(","))
orders = ordersMap.map(lambda o: Row(order_id=int(o[0]), order_date=o[1], \
order_customer_id=int(o[2]), order_status=o[3]))
ordersSchema = sqlContext.inferSchema(orders)
ordersSchema.registerTempTable("orders")

orderItemsRDD = sc.textFile("/user/cloudera/sqoop_import/order_items")
orderItemsMap = orderItemsRDD.map(lambda oi: oi.split(","))
orderItems = orderItemsMap.map(lambda oi: Row(order_item_id=int(oi[0]), order_item_order_id=int(oi[1]), \
    order_item_product_id=int(oi[2]), order_item_quantity=int(oi[3]), \
    order_item_subtotal=float(oi[4]), order_item_product_price=float(oi[5])))
orderItemsSchema = sqlContext.inferSchema(orderItems)
orderItemsSchema.registerTempTable("order_items")
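#On Spark 1.3 or later inferSchema is deprecated; createDataFrame does the same job (a sketch):
ordersDF = sqlContext.createDataFrame(orders)
ordersDF.registerTempTable("orders")
orderItemsDF = sqlContext.createDataFrame(orderItems)
orderItemsDF.registerTempTable("order_items")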

joinAggData = sqlContext.sql("select o.order_date, sum(oi.order_item_subtotal), \
count(distinct o.order_id) from orders o join order_items oi \
on o.order_id = oi.order_item_order_id \
group by o.order_date order by o.order_date")

for data in joinAggData.collect():
    print(data)

##############################################################################

# Calculate aggregate statistics (e.g., average or sum) using Spark


#sum
ordersRDD = sc.textFile("/user/cloudera/sqoop_import/orders")
ordersRDD.count()

orderItemsRDD = sc.textFile("/user/cloudera/sqoop_import/order_items")
orderItemsMap = orderItemsRDD.map(lambda rec: float(rec.split(",")[4]))
for i in orderItemsMap.take(5):
    print(i)

orderItemsReduce = orderItemsMap.reduce(lambda rev1, rev2: rev1 + rev2)
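#The built-in sum() action gives the same total:
orderItemsMap.sum()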

#Get max priced product from products table


#There is one record with a comma inside a field, which breaks the default "," delimiter
#Clean it up (we will see how we can filter without deleting the record later)
hadoop fs -get /user/cloudera/sqoop_import/products
#Delete the record with product_id 685
hadoop fs -put -f products/part* /user/cloudera/sqoop_import/products

#pyspark script to get the max priced product


productsRDD = sc.textFile("/user/cloudera/sqoop_import/products")
productsMap = productsRDD.map(lambda rec: rec)
productsMap.reduce(lambda rec1, rec2: (rec1 if ((rec1.split(",")[4] != "" and rec2.split(",")[4] != "") and float(rec1.split(",")[4]) >= float(rec2.split(",")[4])) else rec2))
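#A filter based alternative that keeps the original file intact (a sketch; it assumes the
#malformed record is the one whose 5th field is not a valid number):
def hasValidPrice(rec):
    try:
        float(rec.split(",")[4])
        return True
    except ValueError:
        return False

productsRDD.filter(hasValidPrice).reduce(lambda rec1, rec2: rec1 if float(rec1.split(",")[4]) >= float(rec2.split(",")[4]) else rec2)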

#avg
revenue = sc.textFile("/user/cloudera/sqoop_import/order_items").map(lambda rec: float(rec.split(",")[4])).reduce(lambda rev1, rev2: rev1 + rev2)
totalOrders = sc.textFile("/user/cloudera/sqoop_import/order_items").map(lambda rec: int(rec.split(",")[1])).distinct().count()
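#Average revenue per order, completing the two values above:
avgRevenuePerOrder = revenue / totalOrders
print(avgRevenuePerOrder)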

#Number of orders by status


ordersRDD = sc.textFile("/user/cloudera/sqoop_import/orders")
ordersMap = ordersRDD.map(lambda rec: (rec.split(",")[3], 1))
for i in ordersMap.countByKey().items(): print(i)
#groupByKey is not very efficient (it shuffles every value); reduceByKey, aggregateByKey and combineByKey below combine on the map side
ordersByStatus = ordersMap.groupByKey().map(lambda t: (t[0], sum(t[1])))
ordersByStatus = ordersMap.reduceByKey(lambda acc, value: acc + value)
ordersMap = ordersRDD.map(lambda rec: (rec.split(",")[3], rec))
ordersByStatus = ordersMap.aggregateByKey(0, lambda acc, value: acc + 1, lambda acc, value: acc + value)
ordersByStatus = ordersMap.combineByKey(lambda value: 1, lambda acc, value: acc + 1, lambda acc, value: acc + value)

for recs in ordersByStatus.collect():
    print(recs)

#Number of orders by order date and order status


#Key orderDate and orderStatus
ordersRDD = sc.textFile("/user/cloudera/sqoop_import/orders")
ordersMapRDD = ordersRDD.map(lambda rec: ((rec.split(",")[1], rec.split(",")[3]), 1))
ordersByStatusPerDay = ordersMapRDD.reduceByKey(lambda v1, v2: v1+v2)

for i in ordersByStatusPerDay.collect():
    print(i)

#Total Revenue per day


ordersRDD = sc.textFile("/user/cloudera/sqoop_import/orders")
orderItemsRDD = sc.textFile("/user/cloudera/sqoop_import/order_items")

ordersParsedRDD = ordersRDD.map(lambda rec: (rec.split(",")[0], rec))


orderItemsParsedRDD = orderItemsRDD.map(lambda rec: (rec.split(",")[1], rec))

ordersJoinOrderItems = orderItemsParsedRDD.join(ordersParsedRDD)
ordersJoinOrderItemsMap = ordersJoinOrderItems.map(lambda t: (t[1][1].split(",")[1], float(t[1][0].split(",")[4])))

revenuePerDay = ordersJoinOrderItemsMap.reduceByKey(lambda acc, value: acc + value)


for i in revenuePerDay.collect(): print(i)

#average
#average revenue per day
#Parse Orders (key order_id)
#Parse Order items (key order_item_order_id)
#Join the data sets
#Parse joined data and get (order_date, order_id) as key and order_item_subtotal as value
#Use appropriate aggregate function to get sum(order_item_subtotal) for each order_date, order_id combination
#Parse data to discard order_id and get order_date as key and sum(order_item_subtotal) per order as value
#Use appropriate aggregate function to get sum(order_item_subtotal) per day and count(distinct order_id) per day
#Parse data and apply average logic
ordersRDD = sc.textFile("/user/cloudera/sqoop_import/orders")
orderItemsRDD = sc.textFile("/user/cloudera/sqoop_import/order_items")

ordersParsedRDD = ordersRDD.map(lambda rec: (rec.split(",")[0], rec))


orderItemsParsedRDD = orderItemsRDD.map(lambda rec: (rec.split(",")[1], rec))

ordersJoinOrderItems = orderItemsParsedRDD.join(ordersParsedRDD)
ordersJoinOrderItemsMap = ordersJoinOrderItems.map(lambda t: ((t[1][1].split(",")[1], t[0]), float(t[1][0].split(",")[4])))

revenuePerDayPerOrder = ordersJoinOrderItemsMap.reduceByKey(lambda acc, value: acc + value)
revenuePerDayPerOrderMap = revenuePerDayPerOrder.map(lambda rec: (rec[0][0], rec[1]))

revenuePerDay = revenuePerDayPerOrderMap.combineByKey( \
lambda x: (x, 1), \
lambda acc, revenue: (acc[0] + revenue, acc[1] + 1), \
lambda total1, total2: (round(total1[0] + total2[0], 2), total1[1] + total2[1]) \
)

revenuePerDay = revenuePerDayPerOrderMap.aggregateByKey( \
(0, 0), \
lambda acc, revenue: (acc[0] + revenue, acc[1] + 1), \
lambda total1, total2: (round(total1[0] + total2[0], 2), total1[1] + total2[1]) \
)

for data in revenuePerDay.collect():
    print(data)

avgRevenuePerDay = revenuePerDay.map(lambda x: (x[0], x[1][0] / x[1][1]))
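#Preview the average revenue per day:
for data in avgRevenuePerDay.collect():
    print(data)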

#Customer id with max revenue


ordersRDD = sc.textFile("/user/cloudera/sqoop_import/orders")
orderItemsRDD = sc.textFile("/user/cloudera/sqoop_import/order_items")

ordersParsedRDD = ordersRDD.map(lambda rec: (rec.split(",")[0], rec))


orderItemsParsedRDD = orderItemsRDD.map(lambda rec: (rec.split(",")[1], rec))

ordersJoinOrderItems = orderItemsParsedRDD.join(ordersParsedRDD)
ordersPerDayPerCustomer = ordersJoinOrderItems.map(lambda rec: ((rec[1][1].split(",")[1], rec[1][1].split(",")[2]), float(rec[1][0].split(",")[4])))
revenuePerDayPerCustomer = ordersPerDayPerCustomer.reduceByKey(lambda x, y: x + y)

revenuePerDayPerCustomerMap = revenuePerDayPerCustomer.map(lambda rec: (rec[0][0], (rec[0][1], rec[1])))
topCustomerPerDaybyRevenue = revenuePerDayPerCustomerMap.reduceByKey(lambda x, y: (x if x[1] >= y[1] else y))

#Using regular function


def findMax(x, y):
    if(x[1] >= y[1]):
        return x
    else:
        return y

topCustomerPerDaybyRevenue = revenuePerDayPerCustomerMap.reduceByKey(lambda x, y: findMax(x, y))

# Using Hive Context


from pyspark.sql import HiveContext
hiveContext = HiveContext(sc)
hiveContext.sql("set spark.sql.shuffle.partitions=10");

data = hiveContext.sql(" \
select * from ( \
select o.order_date, o.order_customer_id, sum(oi.order_item_subtotal) order_item_subtotal \
from orders o join order_items oi \
on o.order_id = oi.order_item_order_id \
group by o.order_date, o.order_customer_id) q1 \
join \
(select q.order_date, max(q.order_item_subtotal) order_item_subtotal \
from (select o.order_date, o.order_customer_id, sum(oi.order_item_subtotal) order_item_subtotal \
from orders o join order_items oi \
on o.order_id = oi.order_item_order_id \
group by o.order_date, o.order_customer_id) q \
group by q.order_date) q2 \
on q1.order_date = q2.order_date and q1.order_item_subtotal = q2.order_item_subtotal \
order by q1.order_date")

# This query works in hive


select * from (select q.order_date, q.order_customer_id, q.order_item_subtotal,
max(q.order_item_subtotal) over (partition by q.order_date) max_order_item_subtotal
from (select o.order_date, o.order_customer_id, sum(oi.order_item_subtotal) order_item_subtotal
from orders o join order_items oi
on o.order_id = oi.order_item_order_id
group by o.order_date, o.order_customer_id) q) s
where s.order_item_subtotal = s.max_order_item_subtotal
order by s.order_date;

select * from (
select o.order_date, o.order_customer_id, sum(oi.order_item_subtotal) order_item_subtotal
from orders o join order_items oi
on o.order_id = oi.order_item_order_id
group by o.order_date, o.order_customer_id) q1
join
(select q.order_date, max(q.order_item_subtotal) order_item_subtotal
from (select o.order_date, o.order_customer_id, sum(oi.order_item_subtotal) order_item_subtotal
from orders o join order_items oi
on o.order_id = oi.order_item_order_id
group by o.order_date, o.order_customer_id) q
group by q.order_date) q2
on q1.order_date = q2.order_date and q1.order_item_subtotal = q2.order_item_subtotal
order by q1.order_date;

##############################################################################

# Filter data into a smaller dataset using Spark


ordersRDD = sc.textFile("/user/cloudera/sqoop_import/orders")
for i in ordersRDD.filter(lambda line: line.split(",")[3] == "COMPLETE").take(5): print(i)

for i in ordersRDD.filter(lambda line: "PENDING" in line.split(",")[3]).take(5): print(i)

for i in ordersRDD.filter(lambda line: int(line.split(",")[0]) > 100).take(5): print(i)

for i in ordersRDD.filter(lambda line: int(line.split(",")[0]) > 100 or line.split(",")[3] in


"PENDING").take(5): print(i)

for i in ordersRDD.filter(lambda line: int(line.split(",")[0]) > 1000 and ("PENDING" in


line.split(",")[3] or line.split(",")[3] == ("CANCELLED"))).take(5): print(i)
for i in ordersRDD.filter(lambda line: int(line.split(",")[0]) > 1000 and line.split(",")[3] !=
("COMPLETE")).take(5): print(i)

#Check if there are any cancelled orders with amount greater than 1000$
#Get only cancelled orders
#Join orders and order items
#Generate sum(order_item_subtotal) per order
#Filter data which amount to greater than 1000$

ordersRDD = sc.textFile("/user/cloudera/sqoop_import/orders")
orderItemsRDD = sc.textFile("/user/cloudera/sqoop_import/order_items")

ordersParsedRDD = ordersRDD.filter(lambda rec: rec.split(",")[3] in "CANCELED").map(lambda rec: (int(rec.split(",")[0]), rec))
orderItemsParsedRDD = orderItemsRDD.map(lambda rec: (int(rec.split(",")[1]), float(rec.split(",")[4])))
orderItemsAgg = orderItemsParsedRDD.reduceByKey(lambda acc, value: (acc + value))

ordersJoinOrderItems = orderItemsAgg.join(ordersParsedRDD)

for i in ordersJoinOrderItems.filter(lambda rec: rec[1][0] >= 1000).take(5): print(i)

##########################################################

# Write a query that produces ranked or sorted data using Spark

#Global sorting and ranking


orders = sc.textFile("/user/cloudera/sqoop_import/orders")
for i in orders.map(lambda rec: (int(rec.split(",")[0]), rec)).sortByKey().collect(): print(i)
for i in orders.map(lambda rec: (int(rec.split(",")[0]), rec)).sortByKey(False).take(5): print(i)
for i in orders.map(lambda rec: (int(rec.split(",")[0]), rec)).top(5): print(i)
for i in orders.map(lambda rec: (int(rec.split(",")[0]), rec)).takeOrdered(5, lambda x: x[0]):
print(i)
for i in orders.map(lambda rec: (int(rec.split(",")[0]), rec)).takeOrdered(5, lambda x: -x[0]):
print(i)
for i in orders.takeOrdered(5, lambda x: int(x.split(",")[0])): print(i)
for i in orders.takeOrdered(5, lambda x: -int(x.split(",")[0])): print(i)
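#sortBy gives the same result without building a key/value pair first (a sketch):
for i in orders.sortBy(lambda rec: -int(rec.split(",")[0])).take(5): print(i)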

#By key sorting and ranking


def getAllSortByPrice(rec, bool):
    if(bool == False):
        x = sorted(rec[1], key=lambda k: -float(k.split(",")[4]))
    else:
        x = sorted(rec[1], key=lambda k: float(k.split(",")[4]))
    return (y for y in x)

def getAll(rec):
    return (x for x in rec[1])

def getFirstTwo(rec):
    x = []
    ctr = 0
    for i in rec[1]:
        if(ctr < 2):
            x.append(i)
            ctr = ctr + 1
    return (y for y in x)

def getTop(rec):
    x = []
    max = 0
    for i in rec[1]:
        prodPrice = float(i.split(",")[4])
        if(prodPrice > max):
            max = prodPrice
    for j in rec[1]:
        if(float(j.split(",")[4]) == max):
            x.append(j)
    return (y for y in x)

products = sc.textFile("/user/cloudera/sqoop_import/products")
productsMap = products.map(lambda rec: (rec.split(",")[1], rec))
productsGroupBy = productsMap.groupByKey()
for i in productsGroupBy.collect(): print(i)

#Get data sorted by product price per category


#You can use map or flatMap; if you want to see one record per line you need to use flatMap
#Map will return the list
for i in productsGroupBy.map(lambda rec: sorted(rec[1], key=lambda k: float(k.split(",")[4]))).take(100): print(i)
for i in productsGroupBy.map(lambda rec: sorted(rec[1], key=lambda k: float(k.split(",")[4]), reverse=True)).take(100): print(i)

#To get topN products by price in each category


def getTopN(rec, topN):
    x = []
    x = list(sorted(rec[1], key=lambda k: float(k.split(",")[4]), reverse=True))
    import itertools
    return (y for y in list(itertools.islice(x, 0, topN)))

for i in productsMap.groupByKey().flatMap(lambda x: getTopN(x, 2)).collect(): print(i)


#To get topN priced products by category
def getTopDenseN(rec, topN):
    x = []
    topNPrices = []
    prodPrices = []
    prodPricesDesc = []
    for i in rec[1]:
        prodPrices.append(float(i.split(",")[4]))
    prodPricesDesc = list(sorted(set(prodPrices), reverse=True))
    import itertools
    topNPrices = list(itertools.islice(prodPricesDesc, 0, topN))
    for j in sorted(rec[1], key=lambda k: float(k.split(",")[4]), reverse=True):
        if(float(j.split(",")[4]) in topNPrices):
            x.append(j)
    return (y for y in x)

for i in productsMap.groupByKey().flatMap(lambda x: getTopDenseN(x, 2)).collect(): print(i)

productsFlatMap = products.flatMap(lambda rec: (rec.split(",")[1], float(rec.split(",")[4])))


for i in productsMap.groupByKey().flatMap(lambda x: getFirstTwo(x)).collect(): print(i)
for i in productsMap.groupByKey().flatMap(lambda x: getAllSortByPrice(x, True)).collect(): print(i)
for i in productsMap.groupByKey().flatMap(getAll).collect(): print(i)
for i in productsMap.groupByKey().flatMap(getTop).collect(): print(i)

#Sorting using queries


#Global sorting and ranking
select * from products order by product_price desc;
select * from products order by product_price desc limit 10;

#By key sorting


#Using order by is not efficient; it serializes the sort through a single reducer
select * from products order by product_category_id, product_price desc;

#Using distribute by sort by (to distribute sorting and scale it up)


select * from products distribute by product_category_id sort by product_price desc;

#By key ranking (in Hive we can use windowing/analytic functions)


select * from (select p.*,
dense_rank() over (partition by product_category_id order by product_price desc) dr
from products p
distribute by product_category_id) q
where dr <= 2 order by product_category_id, dr;
