Mycart Documentation

ADD JAR /home/acadgild/ecommerce/hive-serdes-1.0-SNAPSHOT.jar;

Creating table products_info_raw
create table products_info_raw (
id STRING,
name STRING,
reseller STRING,
category STRING,
price DOUBLE,
discount INT,
profit_percent INT
)
ROW FORMAT SERDE 'com.cloudera.hive.serde.JSONSerDe';
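With the SerDe jar added, the raw JSON feed can be loaded straight into this table. A minimal sketch, assuming the feed sits in a local JSON file (the path is hypothetical):

LOAD DATA LOCAL INPATH '/home/acadgild/ecommerce/products_info.json'
OVERWRITE INTO TABLE products_info_raw;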
Creating table products_info_stg

create table products_info_stg (
product_id STRING,
product_name STRING,
reseller STRING,
category STRING,
price BIGINT,
discount FLOAT,
profit_percent FLOAT
)
PARTITIONED BY (
rptg_dt STRING
)
CLUSTERED BY (
product_id
)
INTO 8 BUCKETS
STORED AS ORC;
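The staging table is then populated from the raw table. A minimal sketch, assuming the partition date is the load date and casting to the staging column types:

set hive.exec.dynamic.partition.mode=nonstrict;
-- required on older Hive so the insert honors the bucket definition
set hive.enforce.bucketing=true;

INSERT OVERWRITE TABLE products_info_stg PARTITION (rptg_dt)
SELECT id, name, reseller, category,
CAST(price AS BIGINT),
CAST(discount AS FLOAT),
CAST(profit_percent AS FLOAT),
from_unixtime(cast(unix_timestamp() as bigint), 'yyyy-MM-dd') AS rptg_dt
FROM products_info_raw;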
Creating table products_info_excp

create table products_info_excp (
product_id STRING,
product_name STRING,
reseller STRING,
category STRING,
price BIGINT,
discount FLOAT,
profit_percent FLOAT,
rule_failed STRING
)
PARTITIONED BY (
rptg_dt STRING
)
CLUSTERED BY (
product_id
)
INTO 8 BUCKETS
STORED AS ORC;
user_activity_raw table creation

PARTITIONED BY (
rptg_dt STRING
)
CLUSTERED BY (
product_id,
user_id
)
INTO 8 BUCKETS
STORED AS ORC;
user_activity_excp table creation

PARTITIONED BY (
rptg_dt STRING
)
CLUSTERED BY (
product_id,
user_id
)
INTO 8 BUCKETS
STORED AS ORC;
Creating table users_info_raw

create table users_info_raw (
id string,
name string,
location struct<city:string,state:string>,
age INT,
category string
)
ROW FORMAT SERDE 'com.cloudera.hive.serde.JSONSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';
Creating table users_info_stg

create table users_info_stg (
user_id string,
name string,
location struct<city:string,state:string>,
age bigint,
occupation string
)
PARTITIONED BY (
rptg_dt STRING
)
CLUSTERED BY (
user_id
)
INTO 8 BUCKETS
STORED AS ORC;
Creating table users_info_excp

create table users_info_excp (
user_id string,
name string,
location struct<city:string,state:string>,
age bigint,
occupation string,
rule_failed string
)
PARTITIONED BY (
rptg_dt STRING
)
CLUSTERED BY (
user_id
)
INTO 8 BUCKETS
STORED AS ORC;
set hbase.mapred.output.outputtable=production_category;
CREATE EXTERNAL TABLE user_location (
id string,
user_id string,
city string,
state string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.hbase.HBaseSerDe'
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
'hbase.columns.mapping' = ':key,user_details:id,user_details:city,user_details:state',
'serialization.format' = '1'
)
TBLPROPERTIES ('hbase.table.name' = 'user_location');
Creating user_location_stg table
set hbase.mapred.output.outputtable=user_location;
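With the output table set, the HBase-backed table can be populated from the staging table. A minimal sketch, assuming user_location_stg carries the same four columns in the same order:

INSERT OVERWRITE TABLE user_location
SELECT id, user_id, city, state
FROM user_location_stg;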
The following settings enable dynamic partition inserts and disable automatic map-joins before running the multi-table inserts:

set hive.exec.dynamic.partition.mode=nonstrict;
set hive.auto.convert.join=false;
FROM products_info_stg p
LEFT OUTER JOIN prod_details l
ON p.product_id = l.prod_id AND p.rptg_dt = from_unixtime(cast(unix_timestamp() as bigint), 'yyyy-MM-dd')
INSERT OVERWRITE TABLE products_info_excp PARTITION (rptg_dt)
SELECT p.product_id, p.product_name, p.reseller, p.category, p.price, p.discount, p.profit_percent,
CASE WHEN p.product_id IS NULL THEN 'R1'
FROM users_info_stg p
LEFT OUTER JOIN user_location l
ON p.user_id = l.user_id AND p.rptg_dt = from_unixtime(cast(unix_timestamp() as bigint), 'yyyy-MM-dd')
INSERT OVERWRITE TABLE users_info_excp PARTITION (rptg_dt)
SELECT p.user_id, p.name, p.location, p.age, p.occupation,
CASE WHEN p.user_id IS NULL THEN 'R1'
WHEN p.age <= 0 THEN 'R3'
END AS rule_failed, p.rptg_dt
WHERE (p.user_id IS NULL) OR (p.age < 1)
INSERT OVERWRITE TABLE users_info_core PARTITION (rptg_dt)
SELECT p.user_id, p.name,
CASE WHEN (p.location.city IS NULL) AND (p.location.state IS NULL) THEN named_struct('city', l.city, 'state', l.state)
WHEN (p.location.city IS NULL) AND (p.location.state IS NOT NULL) THEN named_struct('city', l.city, 'state', p.location.state)
WHEN (p.location.city IS NOT NULL) AND (p.location.state IS NULL) THEN named_struct('city', p.location.city, 'state', l.state)
ELSE p.location
END AS location,
p.age, p.occupation, p.rptg_dt
WHERE (p.user_id IS NOT NULL) AND (p.age >= 1);
Displaying the data
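A minimal sketch of such a display query, assuming the freshly loaded users_info_core partition is the one of interest:

SELECT * FROM users_info_core
WHERE rptg_dt = from_unixtime(cast(unix_timestamp() as bigint), 'yyyy-MM-dd')
LIMIT 10;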
FROM user_activity_stg p
LEFT OUTER JOIN user_location l
ON p.user_id = l.user_id AND p.rptg_dt = from_unixtime(cast(unix_timestamp() as bigint), 'yyyy-MM-dd')
LEFT OUTER JOIN prod_details pd
ON p.product_id = pd.id
INSERT OVERWRITE TABLE user_activity_excp PARTITION (rptg_dt)
SELECT p.product_id, p.user_id, p.cancellation, p.return, p.cancellation_reason, p.return_reason,
p.order_date, p.shipment_date, p.delivery_date, p.cancellation_date, p.return_date,
CASE WHEN (p.product_id IS NULL) OR (p.user_id IS NULL) THEN 'R1'
WHEN (p.order_date > p.shipment_date) THEN 'R2'
ELSE 'NA'
END AS rule_failed, p.rptg_dt
WHERE (p.user_id IS NULL) OR (p.product_id IS NULL) OR (p.order_date > p.shipment_date)
INSERT OVERWRITE TABLE user_activity_core PARTITION (rptg_dt)
SELECT p.product_id, p.user_id, p.cancellation, p.return, p.cancellation_reason, p.return_reason,
p.order_date, p.shipment_date, p.delivery_date, p.cancellation_date, p.return_date, p.rptg_dt
WHERE (p.user_id IS NOT NULL) AND (p.product_id IS NOT NULL) AND (p.order_date <= p.shipment_date);
Data validation & rules checking

Rules checking on the user_activity_excp table:

1. hive -e "SELECT COUNT(*) FROM ecom.user_activity_excp WHERE rule_failed = 'R1'" > user_activity_excp_r1.txt
2. hive -e "SELECT COUNT(*) FROM ecom.user_activity_excp WHERE rule_failed = 'R2'" > user_activity_excp_r2.txt
3. hive -e "SELECT COUNT(*) FROM ecom.user_activity_excp WHERE rule_failed = 'R3'" > user_activity_excp_r3.txt

The same per-rule counts are exported for users_info_excp and products_info_excp, along with plain row counts for the three core tables, producing the remaining files read by the script below.
user_activity_excp_r1_cnt = float(open('user_activity_excp_r1.txt').read().strip())
user_activity_excp_r2_cnt = float(open('user_activity_excp_r2.txt').read().strip())
user_activity_excp_r3_cnt = float(open('user_activity_excp_r3.txt').read().strip())
user_activity_core_cnt = float(open('user_activity_core.txt').read().strip())
users_info_excp_r1_cnt = float(open('users_info_excp_r1.txt').read().strip())
users_info_excp_r2_cnt = float(open('users_info_excp_r2.txt').read().strip())
users_info_excp_r3_cnt = float(open('users_info_excp_r3.txt').read().strip())
users_info_core_cnt = float(open('users_info_core.txt').read().strip())
products_info_excp_r1_cnt = float(open('products_info_excp_r1.txt').read().strip())
products_info_excp_r2_cnt = float(open('products_info_excp_r2.txt').read().strip())
products_info_excp_r3_cnt = float(open('products_info_excp_r3.txt').read().strip())
products_info_core_cnt = float(open('products_info_core.txt').read().strip())

# rules_threshold.txt holds three comma-separated percentages, one per rule
threshold = open('rules_threshold.txt').read().strip().split(',')
r1_threshold, r2_threshold, r3_threshold = (float(t) / 100 for t in threshold)
Ideally, if the share of invalid records for any rule exceeds its threshold, the load should be stopped at this point.
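A minimal sketch of that check for the user_activity feed, assuming the failure ratio is measured against the feed's total record count (the other two feeds follow the same pattern):

# total records processed for the user_activity feed
total = (user_activity_excp_r1_cnt + user_activity_excp_r2_cnt +
         user_activity_excp_r3_cnt + user_activity_core_cnt)
checks = [(user_activity_excp_r1_cnt, r1_threshold, 'R1'),
          (user_activity_excp_r2_cnt, r2_threshold, 'R2'),
          (user_activity_excp_r3_cnt, r3_threshold, 'R3')]
for cnt, limit, rule in checks:
    if total > 0 and cnt / total > limit:
        # stop the pipeline when a rule's failure ratio breaches its threshold
        raise SystemExit('rule %s failure ratio above threshold' % rule)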
Data Analysis
Query to find the frequency of the most purchased categories and insert the result into the table created below
Query
Table creating
Query
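A minimal sketch of the aggregation described above, assuming a target table category_frequency(category string, frequency bigint) partitioned by rptg_dt (the table name and schema are hypothetical):

INSERT OVERWRITE TABLE category_frequency PARTITION (rptg_dt)
SELECT p.category, COUNT(*) AS frequency, a.rptg_dt
FROM user_activity_core a
JOIN products_info_core p ON a.product_id = p.product_id
GROUP BY p.category, a.rptg_dt;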
Fraud detection:
1. Which user has performed the most returns, and how many valid purchases did those users make?
Query
Table
create table fraud_detection_work2 (
user_id string,
-- return bigint
valid_purchase bigint
)
partitioned by (
rptg_dt string
)
stored as ORC;
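A minimal sketch of the kind of aggregation that would populate this work table, assuming user_activity_core holds one row per order and the return flag takes the string values 'true'/'false' (that flag encoding is an assumption):

INSERT OVERWRITE TABLE fraud_detection_work2 PARTITION (rptg_dt)
SELECT user_id,
SUM(CASE WHEN return = 'false' THEN 1 ELSE 0 END) AS valid_purchase,
rptg_dt
FROM user_activity_core
GROUP BY user_id, rptg_dt;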
Query
Query
Query
Creating table return_aggr

create table return_aggr (
location struct<city:string,state:string>,
count bigint
)
PARTITIONED BY (
rptg_dt string
)
STORED AS ORC;
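A minimal sketch of an aggregation that would fill return_aggr, again assuming return = 'true' marks a returned order; the struct is rebuilt with named_struct so the grouping runs on plain columns:

INSERT OVERWRITE TABLE return_aggr PARTITION (rptg_dt)
SELECT named_struct('city', u.location.city, 'state', u.location.state) AS location,
COUNT(*) AS count,
a.rptg_dt
FROM user_activity_core a
JOIN users_info_core u ON a.user_id = u.user_id
WHERE a.return = 'true'
GROUP BY u.location.city, u.location.state, a.rptg_dt;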
Query
QUESTION - Final Hive table to generate the most purchased category where the fraud-detection return value is true
Query
Otherwise, use the Hadoop warehouse directory directly with a normal export command, as below; use /* to export all partitions recursively.
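A minimal sketch of such an export, assuming the default warehouse location for the ecom database and a local target directory (both paths are hypothetical):

hadoop fs -copyToLocal /user/hive/warehouse/ecom.db/return_aggr/* /home/acadgild/ecommerce/export/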