Project 4: Retail Analysis With Walmart Data
Project 4: Retail Analysis With Walmart Data
Objective:
Statistical Model
For Store 1 – build prediction models to forecast demand.
Linear Regression – use variables such as the date, re-encoding dates as numbers with 1 for 5 Feb 2010
(the earliest date) and counting upward in chronological order. Test the hypothesis that CPI,
unemployment, and fuel price have an impact on sales.
Convert dates into days by creating a new variable.
Select the model that gives the best accuracy.
CODE {}
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from patsy import dmatrices
import sklearn
import seaborn as sns
import os
# The CSV below is opened by relative path, so print the directory it resolves against.
print(os.getcwd())

# Load the weekly sales records and preview the first rows.
walmartdata = pd.read_csv('Walmart_Store_sales.csv')
walmartdata.head()

# Per-store statistics of Weekly_Sales: the total as a Series, the standard
# deviation as a one-column DataFrame.
_weekly_sales_by_store = walmartdata.groupby('Store')['Weekly_Sales']
walmartdata_groupby = _weekly_sales_by_store.sum()
walmartdata_std = _weekly_sales_by_store.std().to_frame()
walmartdata_std.head()
# `walmartdata_Q32012` is never assigned anywhere in the visible script, so the
# aggregation below raised NameError. Build the subset here before using it.
# NOTE(review): assumes "Q32012" means calendar Q3 (Jul-Sep) of 2012 and that
# `Date` holds day-first strings such as '05-02-2010' - confirm against the data.
_sale_dates = pd.to_datetime(walmartdata['Date'], dayfirst=True)
walmartdata_Q32012 = walmartdata[
    (_sale_dates.dt.year == 2012) & (_sale_dates.dt.quarter == 3)
]

# Total Weekly_Sales per store within that subset.
walmartdata_growth = walmartdata_Q32012.groupby(['Store'])['Weekly_Sales'].sum()
# Partition the records on Holiday_Flag: rows flagged 1 go to the holiday
# frame, rows flagged 0 to the non-holiday frame.
_holiday_flag = walmartdata['Holiday_Flag']
stores_holiday_sales = walmartdata.loc[_holiday_flag.eq(1)]
stores_nonholiday_sales = walmartdata.loc[_holiday_flag.eq(0)]
# Parse the Date column once, explicitly day-first. The original re-parsed the
# column for every single comparison and left the day/month order to pandas'
# inference, which reads an ambiguous string such as '12-02-2010' month-first
# and, on pandas 2.x, rejects any later value that does not fit the format
# inferred from the first one.
_holiday_dates = pd.to_datetime(stores_holiday_sales['Date'], dayfirst=True)


def _holiday_rows(day_first_dates):
    """Return the rows of ``stores_holiday_sales`` dated on any of the given days.

    ``day_first_dates`` is an iterable of 'dd-mm-yyyy' strings. Dates that do
    not occur in the data simply contribute no rows.
    """
    wanted = pd.to_datetime(list(day_first_dates), format='%d-%m-%Y')
    return stores_holiday_sales[_holiday_dates.isin(wanted)]


#Stores Sales in Super Bowl
#Super Bowl: 12-Feb-10, 11-Feb-11, 10-Feb-12, 8-Feb-13
stores_holiday_sales_superBowl = _holiday_rows(
    ['12-02-2010', '11-02-2011', '10-02-2012', '08-02-2013']
)

#Stores Sales in Labour Day
#Labour Day: 10-Sep-10, 9-Sep-11, 7-Sep-12, 6-Sep-13
stores_holiday_sales_labourDay = _holiday_rows(
    ['10-09-2010', '09-09-2011', '07-09-2012', '06-09-2013']
)

#Stores Sales in Thanksgiving
#Thanksgiving: 26-Nov-10, 25-Nov-11, 23-Nov-12, 29-Nov-13
stores_holiday_sales_thanksgiving = _holiday_rows(
    ['26-11-2010', '25-11-2011', '23-11-2012', '29-11-2013']
)

#Stores Sales in Christmas
# Christmas: 31-Dec-10, 30-Dec-11, 28-Dec-12, 27-Dec-13
stores_holiday_sales_Christmas = _holiday_rows(
    ['31-12-2010', '30-12-2011', '28-12-2012', '27-12-2013']
)
# Mean Weekly_Sales per Date over the non-holiday rows, with Date restored as
# an ordinary column. Parentheses keep the multi-line expression valid; the
# original broke the line right after `=`, which is a SyntaxError.
stores_nonholiday_sales_mean = (
    stores_nonholiday_sales.groupby(['Date'])
    .agg({'Weekly_Sales': 'mean'})
    .reset_index()
)

# Total Weekly_Sales per Date over the holiday rows, in the same layout.
stores_holiday_sales_sum = (
    stores_holiday_sales.groupby(['Date'])
    .agg({'Weekly_Sales': 'sum'})
    .reset_index()
)
# Report every holiday date whose summed sales exceed the mean sales of at
# least one non-holiday date.
#
# The original nested loop printed at the first non-holiday row that was
# exceeded and then broke out of the inner loop (the lost indentation is read
# here with `break` inside the `if`). That is the same as comparing against
# the smallest non-holiday mean, so compute it once instead of rescanning the
# whole non-holiday frame for every holiday date.
# NOTE(review): this compares a per-date SUM with a per-date MEAN - confirm
# that mixing the two aggregates is intended.
_lowest_nonholiday_mean = stores_nonholiday_sales_mean['Weekly_Sales'].min()

for row in stores_holiday_sales_sum.itertuples():
    # An empty or all-NaN non-holiday frame yields NaN here, and any
    # comparison with NaN is False, so nothing is printed - as before.
    if row.Weekly_Sales > _lowest_nonholiday_mean:
        print(
            "On this Date {} Holiday Sales is greater than Non Holiday Sales "
            "and the Sales :- {}".format(row.Date, row.Weekly_Sales)
        )
# Print the total Weekly_Sales of each holiday subset, one labelled line per holiday.
for _label, _holiday_frame in (
    ("Super Bowl Day Sale", stores_holiday_sales_superBowl),
    ("Labour Day Sale", stores_holiday_sales_labourDay),
    ("Thanksgiving Day Sale", stores_holiday_sales_thanksgiving),
    ("Christmas Day Sale", stores_holiday_sales_Christmas),
):
    print(_label, _holiday_frame['Weekly_Sales'].sum())
# Linear Regression – Utilize variables like date and restructure dates as 1 for 5 Feb 2010
# (starting from the earliest date in order).
# Hypothesize if CPI, unemployment, and fuel price have any impact on sales.
# Fit the estimator held in `logreg` on the unemployment training split, then
# predict on the matching test split.
# NOTE(review): `logreg`, `x_train_unemp`, `y_train_unemp` and `x_test_unemp`
# are not assigned anywhere in the visible part of this script - confirm they
# are created before this point.
logreg.fit(x_train_unemp,y_train_unemp)
y_pred_unemp = logreg.predict(x_test_unemp)
from sklearn import metrics
# Score two sets of predictions against their test targets.
# NOTE(review): `y_test_cpi` and `y_pred` are likewise not defined in view;
# presumably they come from an earlier CPI model - verify.
# NOTE(review): accuracy_score is a classification metric. If the target is
# the continuous Weekly_Sales that the "Linear Regression" objective implies,
# a regression metric is presumably what is wanted - confirm.
print(metrics.accuracy_score(y_test_cpi,y_pred))
print(metrics.accuracy_score(y_test_unemp,y_pred_unemp))