0% found this document useful (0 votes)
95 views

Mycart Documentation Add Jar /home/acadgild/ecommerce/hive-serdes-1.0-SNAPSHOT - Jar Products - Info - Raw Table Creation

This document contains documentation for Mycart data warehousing. It describes: 1. The creation of various tables like products_info_raw, users_info_raw, user_activity_raw to store raw data and products_info_stg, users_info_stg, user_activity_stg for staging the data. 2. It also describes the creation of core tables like products_info_core, users_info_core, user_activity_core to store cleansed data and exception tables to store records that fail validation rules. 3. The document also covers loading raw data from files into raw tables, staging it into staging tables, validating and inserting records into core

Uploaded by

Ram Guggul
Copyright
© © All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
95 views

Mycart Documentation Add Jar /home/acadgild/ecommerce/hive-serdes-1.0-SNAPSHOT - Jar Products - Info - Raw Table Creation

This document contains documentation for Mycart data warehousing. It describes: 1. The creation of various tables like products_info_raw, users_info_raw, user_activity_raw to store raw data and products_info_stg, users_info_stg, user_activity_stg for staging the data. 2. It also describes the creation of core tables like products_info_core, users_info_core, user_activity_core to store cleansed data and exception tables to store records that fail validation rules. 3. The document also covers loading raw data from files into raw tables, staging it into staging tables, validating and inserting records into core

Uploaded by

Ram Guggul
Copyright
© © All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 36

Mycart Documentation

Add jar /home/acadgild/ecommerce/hive-serdes-1.0-SNAPSHOT.jar

Products_info_raw table creation

-- Raw landing table for product JSON records, parsed with the Cloudera
-- JSON SerDe (jar registered at the top of this document).
CREATE TABLE products_info_raw (
    id             STRING,
    name           STRING,
    reseller       STRING,
    category       STRING,
    price          DOUBLE,
    discount       INT,
    profit_percent INT
)
ROW FORMAT SERDE 'com.cloudera.hive.serde.JSONSerDe';

Products_info_stg table creation

-- ORC staging table for products, partitioned by reporting date and
-- bucketed on product_id for sampling/join efficiency.
CREATE TABLE products_info_stg (
    product_id     STRING,
    product_name   STRING,
    reseller       STRING,
    category       STRING,
    price          BIGINT,
    discount       FLOAT,
    profit_percent FLOAT
)
PARTITIONED BY (rptg_dt STRING)
CLUSTERED BY (product_id) INTO 8 BUCKETS
STORED AS ORC;

Creating products_info_core table

-- Core table holding cleansed product records.
-- Fix: the original statement was missing the opening and closing
-- parentheses around the column list, which is a HiveQL syntax error.
CREATE TABLE products_info_core (
    product_id     STRING,
    product_name   STRING,
    reseller       STRING,
    category       STRING,
    price          BIGINT,
    discount       FLOAT,
    profit_percent FLOAT
)
PARTITIONED BY (rptg_dt STRING)
CLUSTERED BY (product_id) INTO 8 BUCKETS
STORED AS ORC;
Creating table products_info_excp

-- Exception table for product rows that fail a validation rule; the
-- failing rule id is recorded in rule_failed.
-- Fix: the original statement was missing the opening and closing
-- parentheses around the column list, which is a HiveQL syntax error.
CREATE TABLE products_info_excp (
    product_id     STRING,
    product_name   STRING,
    reseller       STRING,
    category       STRING,
    price          BIGINT,
    discount       FLOAT,
    profit_percent FLOAT,
    rule_failed    STRING
)
PARTITIONED BY (rptg_dt STRING)
CLUSTERED BY (product_id) INTO 8 BUCKETS
STORED AS ORC;
User_activity_raw table creation

-- Raw landing table for user-activity JSON events (all fields as strings;
-- date fields are parsed/validated later in the staging step).
CREATE TABLE user_activity_raw (
    product_id          STRING,
    user_id             STRING,
    cancellation        STRING,
    return              STRING,
    cancellation_reason STRING,
    return_reason       STRING,
    order_date          STRING,
    shipment_date       STRING,
    delivery_date       STRING,
    cancellation_date   STRING,
    return_date         STRING
)
ROW FORMAT SERDE 'com.cloudera.hive.serde.JSONSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';

user_activity_stg table creation

-- ORC staging table for user activity, bucketed on (product_id, user_id).
CREATE TABLE user_activity_stg (
    product_id          STRING,
    user_id             STRING,
    cancellation        STRING,
    return              STRING,
    cancellation_reason STRING,
    return_reason       STRING,
    order_date          STRING,
    shipment_date       STRING,
    delivery_date       STRING,
    cancellation_date   STRING,
    return_date         STRING
)
PARTITIONED BY (rptg_dt STRING)
CLUSTERED BY (product_id, user_id) INTO 8 BUCKETS
STORED AS ORC;
user_activity_excp table creation

-- Exception table for user-activity rows failing validation; rule_failed
-- records which rule rejected the row.
CREATE TABLE user_activity_excp (
    product_id          STRING,
    user_id             STRING,
    cancellation        STRING,
    return              STRING,
    cancellation_reason STRING,
    return_reason       STRING,
    order_date          STRING,
    shipment_date       STRING,
    delivery_date       STRING,
    cancellation_date   STRING,
    return_date         STRING,
    rule_failed         STRING
)
PARTITIONED BY (rptg_dt STRING)
CLUSTERED BY (product_id, user_id) INTO 8 BUCKETS
STORED AS ORC;

user_activity_core table creation

-- Core table holding cleansed user-activity rows (same layout as the
-- staging table).
CREATE TABLE user_activity_core (
    product_id          STRING,
    user_id             STRING,
    cancellation        STRING,
    return              STRING,
    cancellation_reason STRING,
    return_reason       STRING,
    order_date          STRING,
    shipment_date       STRING,
    delivery_date       STRING,
    cancellation_date   STRING,
    return_date         STRING
)
PARTITIONED BY (rptg_dt STRING)
CLUSTERED BY (product_id, user_id) INTO 8 BUCKETS
STORED AS ORC;

users_info_raw table creation

-- Raw landing table for user JSON records; location is a nested
-- (city, state) struct as it appears in the JSON.
CREATE TABLE users_info_raw (
    id       STRING,
    name     STRING,
    location STRUCT<city:STRING,state:STRING>,
    age      INT,
    category STRING
)
ROW FORMAT SERDE 'com.cloudera.hive.serde.JSONSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';

users_info_stg table creation

-- ORC staging table for users, bucketed on user_id.
CREATE TABLE users_info_stg (
    user_id    STRING,
    name       STRING,
    location   STRUCT<city:STRING,state:STRING>,
    age        BIGINT,
    occupation STRING
)
PARTITIONED BY (rptg_dt STRING)
CLUSTERED BY (user_id) INTO 8 BUCKETS
STORED AS ORC;

users_info_excp table creation

-- Exception table for user rows failing validation; rule_failed records
-- which rule rejected the row.
CREATE TABLE users_info_excp (
    user_id     STRING,
    name        STRING,
    location    STRUCT<city:STRING,state:STRING>,
    age         BIGINT,
    occupation  STRING,
    rule_failed STRING
)
PARTITIONED BY (rptg_dt STRING)
CLUSTERED BY (user_id) INTO 8 BUCKETS
STORED AS ORC;

users_info_core table creation

-- Core table holding cleansed user records (same layout as staging).
CREATE TABLE users_info_core (
    user_id    STRING,
    name       STRING,
    location   STRUCT<city:STRING,state:STRING>,
    age        BIGINT,
    occupation STRING
)
PARTITIONED BY (rptg_dt STRING)
CLUSTERED BY (user_id) INTO 8 BUCKETS
STORED AS ORC;

Hbase table creation

production_category hbase table creation

create 'production_category', 'prod_details'

user_location hbase table creation

create 'user_location', 'user_details'


************************************************************************
DATA INSERTION
************************************************************************

Loading data to products_info_raw table

-- Load the merged product JSON file from the local FS into the raw table.
LOAD DATA LOCAL INPATH '/home/acadgild/ecommerce/data/product_info_merge.json'
INTO TABLE products_info_raw;

Loading data into users_info_raw table

-- Load the user JSON file from the local FS into the raw table.
LOAD DATA LOCAL INPATH '/home/acadgild/ecommerce/data/user_info_1.json'
INTO TABLE users_info_raw;

Loading data into user_activity_raw table


-- Load the user-activity JSON file from the local FS into the raw table.
LOAD DATA LOCAL INPATH '/home/acadgild/ecommerce/data/user_activity_1.json'
INTO TABLE user_activity_raw;

Displaying the inserted data

Set the below property


set hive.exec.dynamic.partition.mode=nonstrict

Loading data into products_info_stg table


-- Stage raw products into the ORC staging table, stamping today's date as
-- the dynamic reporting-date partition (requires nonstrict partition mode,
-- set above).
-- NOTE(review): raw price is DOUBLE but products_info_stg.price is BIGINT,
-- so the fractional part is implicitly dropped here -- confirm intended.
INSERT OVERWRITE TABLE products_info_stg PARTITION (rptg_dt) SELECT id, name,
reseller, category, price, discount, profit_percent, from_unixtime(cast(unix_timestamp() as
bigint),'yyyy-MM-dd') as rptg_dt FROM products_info_raw;

Loading data into users_info_stg table

-- Stage raw users into the ORC staging table with today's partition stamp.
-- NOTE(review): the columns map positionally -- raw `id` feeds stg
-- `user_id` and raw `category` feeds stg `occupation`; confirm that rename
-- of meaning is intentional.
INSERT OVERWRITE TABLE users_info_stg PARTITION (rptg_dt) SELECT id,name, location,


age, category, from_unixtime(cast(unix_timestamp() as bigint),'yyyy-MM-dd') as rptg_dt FROM
users_info_raw;

Loading data into user_activity_stg table

-- Stage raw user-activity rows into ORC, stamped with today's partition.
INSERT OVERWRITE TABLE user_activity_stg PARTITION (rptg_dt)
SELECT product_id, user_id, cancellation, return, cancellation_reason,
       return_reason, order_date, shipment_date, delivery_date,
       cancellation_date, return_date,
       from_unixtime(cast(unix_timestamp() as bigint), 'yyyy-MM-dd') AS rptg_dt
FROM user_activity_raw;

Displaying the inserted data

creating prod_details table

-- External Hive view over the 'production_category' HBase table:
-- rowkey -> id, prod_details:id -> prod_id, prod_details:category -> category.
CREATE EXTERNAL TABLE prod_details (
    id       STRING COMMENT 'from deserializer',
    prod_id  STRING COMMENT 'from deserializer',
    category STRING COMMENT 'from deserializer'
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.hbase.HBaseSerDe'
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
    'hbase.columns.mapping' = ':key,prod_details:id,prod_details:category',
    'serialization.format' = '1'
)
TBLPROPERTIES ('hbase.table.name' = 'production_category');

Creating prod_details_stg table

-- CSV staging table used to feed the HBase-backed prod_details table.
CREATE TABLE prod_details_stg (
    id       STRING,
    prod_id  STRING,
    category STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

Inserting data into prod_details_stg table

-- Load the product-details CSV into the staging table.
LOAD DATA LOCAL INPATH '/home/acadgild/ecommerce/hbase_data/prod_details.txt'
INTO TABLE prod_details_stg;
Inserting data into prod_details table

-- Point the MR job at the target HBase table before writing to it.
set hbase.mapred.output.outputtable=production_category;

-- Copy the staged CSV rows into the HBase-backed table.
INSERT OVERWRITE TABLE prod_details
SELECT id, prod_id, category FROM prod_details_stg;

Creating user_location table

-- External Hive view over the 'user_location' HBase table:
-- rowkey -> id, user_details:id -> user_id, plus city and state.
-- Fix: the original hbase.columns.mapping literal contained newlines and
-- indentation inside the string; the mapping must be a single
-- comma-separated list with no embedded whitespace or the HBase SerDe
-- cannot resolve the column qualifiers.
CREATE EXTERNAL TABLE user_location (
    id      STRING,
    user_id STRING,
    city    STRING,
    state   STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.hbase.HBaseSerDe'
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
    'hbase.columns.mapping' = ':key,user_details:id,user_details:city,user_details:state',
    'serialization.format' = '1'
)
TBLPROPERTIES ('hbase.table.name' = 'user_location');
Creating user_location_stg table

-- CSV staging table used to feed the HBase-backed user_location table.
CREATE TABLE user_location_stg (
    id      STRING,
    user_id STRING,
    city    STRING,
    state   STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

Inserting data into user_location_stg table

-- Load the user-location CSV into the staging table.
LOAD DATA LOCAL INPATH '/home/acadgild/ecommerce/hbase_data/user_location.txt'
INTO TABLE user_location_stg;
Inserting data into user_location table

-- Point the MR job at the target HBase table before writing to it.
set hbase.mapred.output.outputtable=user_location;

-- Copy the staged rows into the HBase-backed user_location table.
INSERT OVERWRITE TABLE user_location
SELECT id, user_id, city, state FROM user_location_stg;
Inserting data into products_info_excp and products_info_core tables

set hive.exec.dynamic.partition.mode=nonstrict;
set hive.auto.convert.join=false;

-- Multi-insert: route staged product rows to the exception table (rule
-- violations) or the core table (clean rows) in a single scan.
-- Fixes:
--   * the 'yyyy-MM-dd' format literal was split across two lines by the
--     document extraction, breaking the string constant;
--   * a row with discount = price matched BOTH inserts (excp uses >=, core
--     used <=); the core predicate is now strictly discount < price.
FROM products_info_stg p
LEFT OUTER JOIN prod_details l
  ON p.product_id = l.prod_id
 AND p.rptg_dt = from_unixtime(cast(unix_timestamp() as bigint), 'yyyy-MM-dd')
INSERT OVERWRITE TABLE products_info_excp PARTITION (rptg_dt)
SELECT p.product_id, p.product_name, p.reseller, p.category, p.price,
       p.discount, p.profit_percent,
       CASE WHEN p.product_id IS NULL THEN 'R1'   -- R1: missing product id
            WHEN p.discount >= p.price THEN 'R2'  -- R2: discount not below price
       END AS rule_failed,
       p.rptg_dt
WHERE (p.product_id IS NULL) OR (p.discount >= p.price)
INSERT OVERWRITE TABLE products_info_core PARTITION (rptg_dt)
SELECT p.product_id, p.product_name, p.reseller,
       -- backfill a NULL category from the HBase product lookup
       CASE WHEN p.category IS NULL THEN l.category ELSE p.category END AS category,
       p.price, p.discount, p.profit_percent, p.rptg_dt
WHERE (p.product_id IS NOT NULL) AND (p.discount < p.price);
Displaying the results

Inserting data into users_info_excp and users_info_core tables

-- Multi-insert: route staged user rows to the exception table (rule hits)
-- or the core table (clean rows, with missing location pieces backfilled
-- from the HBase user_location lookup).
-- Fix: the 'yyyy-MM-dd' format literal was split across two lines by the
-- document extraction, breaking the string constant.
FROM users_info_stg p
LEFT OUTER JOIN user_location l
  ON p.user_id = l.user_id
 AND p.rptg_dt = from_unixtime(cast(unix_timestamp() as bigint), 'yyyy-MM-dd')
INSERT OVERWRITE TABLE users_info_excp PARTITION (rptg_dt)
SELECT p.user_id, p.name, p.location, p.age, p.occupation,
       CASE WHEN p.user_id IS NULL THEN 'R1'  -- R1: missing user id
            WHEN p.age <= 0 THEN 'R3'         -- R3: non-positive age
       END AS rule_failed,
       p.rptg_dt
WHERE (p.user_id IS NULL) OR (p.age < 1)
INSERT OVERWRITE TABLE users_info_core PARTITION (rptg_dt)
SELECT p.user_id, p.name,
       -- Backfill whichever of city/state is missing from the lookup.
       CASE WHEN (p.location.city IS NULL) AND (p.location.state IS NULL) THEN named_struct('city', l.city, 'state', l.state)
            WHEN (p.location.city IS NULL) AND (p.location.state IS NOT NULL) THEN named_struct('city', l.city, 'state', p.location.state)
            WHEN (p.location.city IS NOT NULL) AND (p.location.state IS NULL) THEN named_struct('city', p.location.city, 'state', l.state)
            ELSE p.location
       END AS location,
       p.age, p.occupation, p.rptg_dt
WHERE (p.user_id IS NOT NULL) AND (p.age >= 1);
Displaying the data

Inserting data into user_activity_excp and user_activity_core tables

-- Multi-insert: split staged activity rows into the exception table
-- (missing ids, or order_date after shipment_date) and the core table.
-- NOTE(review): the prod_details join (alias pd) is never referenced by
-- either insert, so it only adds shuffle cost -- confirm it can be dropped.
-- NOTE(review): the ELSE 'NA' branch of rule_failed is unreachable: the
-- WHERE clause only admits rows that already match R1 or R2.
FROM user_activity_stg p
LEFT OUTER JOIN user_location l ON p.user_id=l.user_id AND
p.rptg_dt=from_unixtime(cast(unix_timestamp() as bigint),'yyyy-MM-dd') LEFT OUTER JOIN prod_details pd
ON p.product_id=pd.id
INSERT OVERWRITE TABLE user_activity_excp PARTITION (rptg_dt) SELECT p.product_id, p.user_id,
p.cancellation, p.return, p.cancellation_reason, p.return_reason, p.order_date, p.shipment_date,
p.delivery_date, p.cancellation_date, p.return_date,
CASE WHEN (p.product_id IS NULL) OR (p.user_id IS NULL) THEN 'R1'
WHEN (p.order_date > p.shipment_date) THEN 'R2' ELSE 'NA' END AS rule_failed , p.rptg_dt
WHERE (p.user_id IS NULL) OR (p.product_id IS NULL) OR (p.order_date > p.shipment_date)
INSERT OVERWRITE TABLE user_activity_core PARTITION (rptg_dt) SELECT p.product_id, p.user_id,
p.cancellation, p.return, p.cancellation_reason, p.return_reason, p.order_date, p.shipment_date,
p.delivery_date, p.cancellation_date, p.return_date, p.rptg_dt
WHERE (p.user_id IS NOT NULL) AND (p.product_id IS NOT NULL) AND (p.order_date <=
p.shipment_date);
Data validation & Rules checking
Rules checking on user_activity_excp table

1.
hive -e "SELECT COUNT(*) FROM ecom.user_activity_excp WHERE rule_failed = 'R1'" >
user_activity_excp_r1.txt

Checking the data in user_activity_excp_r1.txt file

2.

hive -e "SELECT COUNT(*) FROM ecom.user_activity_excp WHERE rule_failed = 'R2'" >


user_activity_excp_r2.txt
Checking the data in user_activity_excp_r2.txt

3.

hive -e "SELECT COUNT(*) FROM ecom.user_activity_excp WHERE rule_failed = 'R3'" >


user_activity_excp_r3.txt

Checking the data in user_activity_excp_r3.txt


Checking the count of rows in user_activity_core table

hive -e "SELECT COUNT(*) FROM ecom.user_activity_core " > user_activity_core.txt

Checking the data in user_activity_core

Rules checking on user_info table

1.

hive -e "SELECT COUNT(*) FROM ecom.users_info_excp WHERE rule_failed = 'R1'" >


users_info_excp_r1.txt
Checking the data in users_info_excp_r1.txt

2.

hive -e "SELECT COUNT(*) FROM ecom.users_info_excp WHERE rule_failed = 'R2'" >


users_info_excp_r2.txt

Checking the data in users_info_excp_r2.txt


3.

hive -e "SELECT COUNT(*) FROM ecom.users_info_excp WHERE rule_failed = 'R3'" >


users_info_excp_r3.txt

Checking the data in users_info_excp_r3.txt

Checking the data in users_info_core table

hive -e "SELECT COUNT(*) FROM ecom.users_info_core" > users_info_core.txt


Checking the data in users_info_core.txt

Rules checking on products_info table

1.

hive -e "SELECT COUNT(*) FROM ecom.products_info_excp WHERE rule_failed = 'R1'" >


products_info_excp_r1.txt

Checking the data in products_info_excp_r1


2.

hive -e "SELECT COUNT(*) FROM ecom.products_info_excp WHERE rule_failed = 'R2'" >


products_info_excp_r2.txt

Checking the data in products_info_excp_r2.txt

3.

hive -e "SELECT COUNT(*) FROM ecom.products_info_excp WHERE rule_failed = 'R3'" >


products_info_excp_r3.txt
Checking the data in products_info_excp_r3.txt

Checking the contents of products_info_core table

hive -e "SELECT COUNT(*) FROM ecom.products_info_core" > products_info_core.txt

Checking the data in products_info_core.txt file


Python code

user_activity_excp_r1_cnt = float(file('user_activity_excp_r1.txt','r').read()[0])
user_activity_excp_r2_cnt = float(file('user_activity_excp_r2.txt','r').read()[0])

user_activity_excp_r3_cnt = float(file('user_activity_excp_r3.txt','r').read()[0])
user_activity_core_cnt = float(file('user_activity_core.txt','r').read()[0])

users_info_excp_r1_cnt = float(file('users_info_excp_r1.txt','r').read()[0])
users_info_excp_r2_cnt = float(file('users_info_excp_r2.txt','r').read()[0])
users_info_excp_r3_cnt = float(file('users_info_excp_r3.txt','r').read()[0])
users_info_core_cnt = float(file('users_info_core.txt','r').read()[0])

products_info_excp_r1_cnt = float(file('user_activity_excp_r1.txt','r').read()[0])
products_info_excp_r2_cnt = float(file('user_activity_excp_r2.txt','r').read()[0])
products_info_excp_r3_cnt = float(file('user_activity_excp_r3.txt','r').read()[0])
products_info_core_cnt = float(file('products_info_core.txt','r').read()[0])

threshold = file('rules_threshold.txt','r').read().strip().split(',')
r1_threshold, r2_threshold, r3_threshold = float(threshold[0])/100, float(threshold[1])/100,
float(threshold[2])/100

usr_activity_cnt = user_activity_excp_r1_cnt + user_activity_excp_r2_cnt +


user_activity_excp_r3_cnt + user_activity_core_cnt
users_info_cnt = users_info_excp_r1_cnt + users_info_excp_r2_cnt + users_info_excp_r3_cnt +
users_info_core_cnt
products_info_cnt = products_info_excp_r1_cnt + products_info_excp_r2_cnt +
products_info_excp_r3_cnt + products_info_core_cnt

if (user_activity_excp_r1_cnt/usr_activity_cnt > r1_threshold or


user_activity_excp_r2_cnt/usr_activity_cnt > r2_threshold or
user_activity_excp_r3_cnt/usr_activity_cnt > r3_threshold):
print("User activity records are invalid")
elif (users_info_excp_r1_cnt/users_info_cnt > r1_threshold or
users_info_excp_r2_cnt/users_info_cnt > r2_threshold or users_info_excp_r3_cnt/users_info_cnt >
r3_threshold):
print("User info records are invalid")
elif (products_info_excp_r1_cnt/products_info_core_cnt > r1_threshold or
products_info_excp_r2_cnt/products_info_core_cnt > r2_threshold or
products_info_excp_r3_cnt/products_info_core_cnt > r3_threshold):
print("Products info records are invalid")

If the number of invalid records are more than the threshold the project should be stopped
ideally.

Data Analysis

Purchase Pattern Detection


1.What is the most purchased category for every user? Identify the users with maximum
amount of valid purchase.

Creating table usr_category_agr_wrk

-- Work table: purchase frequency per (user, category).
CREATE TABLE usr_category_agr_wrk (
    user_id   STRING,
    category  STRING,
    frequency BIGINT
)
PARTITIONED BY (rptg_dt STRING)
STORED AS ORC;

Query to find the frequency of most purchased category and inserting into the created table

-- Count how often each user bought each product category.
INSERT OVERWRITE TABLE usr_category_agr_wrk PARTITION (rptg_dt)
SELECT u.user_id AS user_id,
       p.category AS category,
       count(*) AS cnt,
       from_unixtime(cast(unix_timestamp() as bigint), 'yyyy-MM-dd')
FROM user_activity_core u
LEFT OUTER JOIN products_info_core p ON (u.product_id = p.product_id)
GROUP BY u.user_id, p.category;

Checking the most purchased category frequency

Creating table usr_category_agr

-- Final table: each user's most purchased category.
CREATE TABLE usr_category_agr (
    user_id                 STRING,
    most_purchased_category STRING
)
PARTITIONED BY (rptg_dt STRING)
STORED AS ORC;
Query to find the most purchased category and inserting into the created table

-- Keep each user's top-frequency category; rank() keeps ties and the final
-- GROUP BY de-duplicates identical (user, category) rows.
INSERT OVERWRITE TABLE usr_category_agr PARTITION (rptg_dt)
SELECT user_id, category, rptg_dt
FROM (
    SELECT user_id, category, rptg_dt,
           rank() OVER (PARTITION BY user_id ORDER BY frequency DESC) AS rank
    FROM usr_category_agr_wrk
) a
WHERE a.rank = 1
GROUP BY user_id, category, rptg_dt;

Checking the data


2.Which products are generating the maximum profit? (Profit = (price - discount) *
profit_percentage)

Creating table prod_profit_agr_wrk

-- Work table: completed-purchase count per product.
CREATE TABLE prod_profit_agr_wrk (
    product_id STRING,
    count      BIGINT
)
PARTITIONED BY (rptg_dt STRING)
STORED AS ORC;

Inserting data into the table

-- Count completed (not cancelled, not returned) purchases per product/day.
-- Fix: the cancellation flag was compared with lower-case 'false' while
-- every other query in this document uses the capitalised 'True'/'False'
-- values; Hive string comparison is case sensitive, so 'false' matched
-- nothing.
INSERT OVERWRITE TABLE prod_profit_agr_wrk PARTITION (rptg_dt)
SELECT u.product_id,
       count(*),
       u.rptg_dt
FROM user_activity_core u
LEFT OUTER JOIN products_info_core p
  ON u.product_id = p.product_id
WHERE u.cancellation = 'False' AND u.return = 'False'
GROUP BY u.product_id, u.rptg_dt;
Checking the data

Creating table prod_profit_agr

-- Per-product completed-purchase count and net profit.
CREATE TABLE prod_profit_agr (
    product_id STRING,
    count      BIGINT,
    net_profit BIGINT
)
PARTITIONED BY (rptg_dt STRING)
STORED AS ORC;
Inserting data into the table

-- Net profit per product: completed-purchase count times per-unit profit,
-- where per-unit profit = (price - discount) * profit_percent / 100.
INSERT OVERWRITE TABLE prod_profit_agr PARTITION (rptg_dt)
SELECT u.product_id,
       count,
       count * (cast((price - cast(discount as bigint)) as bigint) * cast(profit_percent as bigint) / 100) AS net_profit,
       u.rptg_dt
FROM prod_profit_agr_wrk u
LEFT OUTER JOIN products_info_core p
  ON u.product_id = p.product_id
GROUP BY u.product_id, count,
         count * (cast((price - cast(discount as bigint)) as bigint) * cast(profit_percent as bigint) / 100),
         u.rptg_dt;

3.Which resellers are generating the maximum profit?

-- Final table: the top-profit product together with its reseller.
CREATE TABLE prod_profit_aggr (
    product_id          STRING,
    most_profit_product STRING,
    reseller            STRING
)
PARTITIONED BY (rptg_dt STRING)
STORED AS ORC;

-- Rank products by net profit and keep the top one (rank() keeps ties).
-- Fix: prod_profit_aggr has three data columns plus the partition key, but
-- the outer SELECT only produced product_id, most_profit_product and
-- rptg_dt -- the reseller column was never selected, a column-count
-- mismatch.
INSERT OVERWRITE TABLE prod_profit_aggr PARTITION (rptg_dt)
SELECT product_id, most_profit_product, reseller, rptg_dt
FROM (
    SELECT p.product_id,
           p.net_profit AS most_profit_product,
           pi.reseller,
           p.rptg_dt,
           rank() OVER (ORDER BY net_profit DESC) AS rank
    FROM prod_profit_agr p
    LEFT OUTER JOIN products_info_core pi ON p.product_id = pi.product_id
) a
WHERE a.rank = 1
GROUP BY product_id, most_profit_product, reseller, rptg_dt;

4. Which is the most sought-after category corresponding to every occupation?

-- Work table: purchase count per (user, occupation, category).
-- (The 'ocupation' spelling in the table name is kept: later queries
-- reference it by this name.)
CREATE TABLE ocupation_category_aggr_wrk (
    user_id    STRING,
    occupation STRING,
    category   STRING,
    count      BIGINT
)
PARTITIONED BY (rptg_dt STRING)
STORED AS ORC;

Query

-- Count purchases per (user, occupation, category).
-- Fixes:
--   * users_info_core has no `id` column; the join used u.id -> u.user_id;
--   * GROUP BY was missing ua.user_id even though it is selected, which
--     Hive rejects.
INSERT OVERWRITE TABLE ocupation_category_aggr_wrk PARTITION (rptg_dt)
SELECT ua.user_id, u.occupation, p.category, count(*), ua.rptg_dt
FROM user_activity_core ua
LEFT OUTER JOIN users_info_core u ON u.user_id = ua.user_id
LEFT OUTER JOIN products_info_core p ON ua.product_id = p.product_id
GROUP BY ua.user_id, u.occupation, p.category, ua.rptg_dt;

Table creating

-- Final table: most sought-after category for each occupation.
-- (The 'ocupation' spelling in the table name is kept: later queries
-- reference it by this name.)
CREATE TABLE ocupation_category_aggr (
    user_id    STRING,
    occupation STRING,
    category   STRING
)
PARTITIONED BY (rptg_dt STRING)
STORED AS ORC;

Query

-- Keep each occupation's top category by purchase count.
-- Fix: the outer query selects user_id but the inner subquery never
-- projected it, so the statement could not compile.
INSERT OVERWRITE TABLE ocupation_category_aggr PARTITION (rptg_dt)
SELECT user_id, occupation, category, rptg_dt
FROM (
    SELECT user_id, occupation, category, rptg_dt,
           rank() OVER (PARTITION BY occupation ORDER BY count DESC) AS rank
    FROM ocupation_category_aggr_wrk
) a
WHERE a.rank = 1;

Fraud detection:

1.Which user has performed most returns? What is the valid purchase made by those users?

-- Work table: per-user return counts.
CREATE TABLE fraud_detection_work1 (
    user_id STRING,
    return  BIGINT
    --valid_purchase bigint  (tracked in fraud_detection_work2 instead)
)
PARTITIONED BY (rptg_dt STRING)
STORED AS ORC;

Query

-- Per-user, per-day count of returned orders.
INSERT OVERWRITE TABLE fraud_detection_work1 PARTITION (rptg_dt)
SELECT user_id, count(*), rptg_dt
FROM user_activity_core u
WHERE return = 'True'
GROUP BY user_id, rptg_dt;

Table
-- Work table: per-user valid (non-returned) purchase counts.
CREATE TABLE fraud_detection_work2 (
    user_id        STRING,
    --return bigint  (tracked in fraud_detection_work1 instead)
    valid_purchase BIGINT
)
PARTITIONED BY (rptg_dt STRING)
STORED AS ORC;

Query

-- Per-user, per-day count of kept (non-returned) purchases.
INSERT OVERWRITE TABLE fraud_detection_work2 PARTITION (rptg_dt)
SELECT user_id, count(*), rptg_dt
FROM user_activity_core u
WHERE return = 'False'
GROUP BY user_id, rptg_dt;
-- Final table: top-returning user(s) with their valid-purchase count.
CREATE TABLE fraud_detection (
    user_id        STRING,
    return         BIGINT,
    valid_purchase BIGINT
)
PARTITIONED BY (rptg_dt STRING)
STORED AS ORC;

Query

-- Keep the user(s) with the highest return count (rank() = 1 keeps ties)
-- and attach their valid-purchase count from work2.
-- NOTE(review): the LEFT JOIN leaves valid_purchase NULL for a user with
-- no non-returned purchases -- confirm downstream consumers handle NULL.
INSERT OVERWRITE TABLE fraud_detection


PARTITION (rptg_dt)
select user_id,return,valid_purchase,rptg_dt from (
select w1.user_id as user_id,w1.return as return,w2.valid_purchase as valid_purchase,w1.rptg_dt as
rptg_dt,rank () over( order by w1.return desc) as rank
from fraud_detection_work1 w1
left outer join fraud_detection_work2 w2
ON w1.user_id=w2.user_id
)a
where a.rank=1;

2.Which location is getting most cancellation?

-- Work table: event counts per user location.
CREATE TABLE return_cancel_work (
    location STRUCT<city:STRING,state:STRING>,
    count    BIGINT
)
PARTITIONED BY (rptg_dt STRING)
STORED AS ORC;

Query

-- Count per-location events, substituting 'NA' placeholders where the
-- joined user row (and hence location / rptg_dt) is missing.
-- NOTE(review): this section asks about CANCELLATIONS, but the filter is
-- ua.return='True' (returns).  The same work table also feeds the
-- "most returns" query (return_aggr) below, so changing the filter here
-- would change that answer too -- confirm which metric is intended.
INSERT OVERWRITE TABLE return_cancel_work


PARTITION (rptg_dt)
select
CASE WHEN u.location IS NULL THEN named_struct('city','NA','state','NA')
ELSE u.location END AS location,
count(*),
CASE WHEN u.rptg_dt IS NULL THEN 'NA'
ELSE u.rptg_dt END AS rptg_dt from
user_activity_core ua
LEFT OUTER JOIN users_info_core u ON
ua.user_id=u.user_id
WHERE
ua.return='True'
group by u.location,u.rptg_dt ;

3.Which location is getting most returns?

Creating table
-- Final table: the location(s) with the highest return count.
CREATE TABLE return_aggr (
    location STRUCT<city:STRING,state:STRING>,
    count    BIGINT
)
PARTITIONED BY (rptg_dt STRING)
STORED AS ORC;

Query

-- Pick the location with the highest count (rank() keeps all ties; the
-- GROUP BY de-duplicates identical rows).
INSERT OVERWRITE TABLE return_aggr PARTITION (rptg_dt)
SELECT location, count, rptg_dt
FROM (
    SELECT location, count, rptg_dt,
           rank() OVER (ORDER BY count DESC) AS rank
    FROM return_cancel_work
) a
WHERE a.rank = 1
GROUP BY location, count, rptg_dt;

QUESTION - Final Hive table listing the most purchased category for users whose
fraud-detection return value is true

-- Final table: most purchased category for users flagged by the fraud
-- detection step, partitioned by month.
CREATE TABLE most_valid_purch_ctgr (
    user_id  STRING,
    category STRING,
    purchase BIGINT
)
PARTITIONED BY (month STRING)
STORED AS ORC;

Query

-- Join flagged fraud-detection users to their category aggregates.
-- Fixes:
--   * 'MM-YYYY' -> 'MM-yyyy': in the SimpleDateFormat patterns Hive uses,
--     YYYY is the ISO week-year, which differs from the calendar year
--     around new year;
--   * fraud_detection.return is a BIGINT return count, so comparing it to
--     the string 'True' matches nothing; filter on a positive count.
-- NOTE(review): unix_timestamp(fr.rptg_dt) expects 'yyyy-MM-dd HH:mm:ss'
-- input by default; if rptg_dt is a bare 'yyyy-MM-dd' date, pass the
-- pattern explicitly -- confirm against the data.
INSERT OVERWRITE TABLE most_valid_purch_ctgr PARTITION (month)
SELECT u.user_id,
       u.category,
       fr.valid_purchase,
       from_unixtime(unix_timestamp(fr.rptg_dt), 'MM-yyyy') AS month
FROM ocupation_category_aggr u
LEFT OUTER JOIN fraud_detection fr ON (u.user_id = fr.user_id)
WHERE fr.return > 0;

Sqoop final hive table data to MySQL using multiexport


=======================================================================
==================================================

One way to do it is via HCatalog.

sqoop export --connect jdbc:mysql://localhost/test --driver com.mysql.jdbc.Driver --username hive -


-password hive --table mysql_table_export --hcatalog-table table_text --input-fields-terminated-by
'|' --input-lines-terminated-by '#'

else use the hadoop directory directly with normal export command as below - use /"*" for
recursive export of partitions

sqoop export --connect jdbc:mysql://localhost/db --username root --table employee --export-dir


/emp/emp_data/*

You might also like