Exp_5_Exploratory_Data_Analysis_sdk_ok
Exp_5_Exploratory_Data_Analysis_sdk_ok
Example no.1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
df = pd.read_csv('../input/auto_clean.csv')
df.head()
output :-
Example no.2
print(df.dtypes)
output :-
symboling int64
normalized-losses int64
make object
aspiration object
num-of-doors object
body-style object
drive-wheels object
engine-location object
wheel-base float64
length float64
width float64
height float64
curb-weight int64
engine-type object
num-of-cylinders object
engine-size int64
fuel-system object
bore float64
stroke float64
compression-ratio float64
horsepower float64
peak-rpm float64
city-mpg int64
highway-mpg int64
price float64
city-L/100km float64
horsepower-binned object
diesel int64
gas int64
dtype: object
Example no.3
# Pairwise correlation between the numeric columns only. The frame also holds
# object columns (make, body-style, ...), which pandas >= 2.0 refuses to
# correlate, so the numeric dtypes are selected explicitly.
df.select_dtypes(include='number').corr()
output :-
Example no.4
df[['bore', 'stroke', 'compression-ratio', 'horsepower']].corr()
output:-
Example no.5
# Engine size as potential predictor variable of price
sns.regplot(x="engine-size", y="price", data=df)
plt.ylim(0,)
output :-
(0, 55963.75647760324)
Example no.6
df[["engine-size", "price"]].corr()
output :-
Example no.7
# The correlation is 0.0823 (the off-diagonal elements of the table).
df[["stroke","price"]].corr()
output :-
Example no.8
df[['peak-rpm','price']].corr()
output :-
Example no.9
# There is a weak correlation between the variables 'stroke' and 'price',
# so a linear regression will not fit well. We can use "regplot" to
# demonstrate this.
sns.regplot(x="stroke", y="price", data=df)
output :-
Example no.10
# Box plot of price for each body style (the original line was cut off after
# the y argument; the data source is restored here).
sns.boxplot(x="body-style", y="price", data=df)
output :-
<matplotlib.axes._subplots.AxesSubplot at 0x7f8fe0c5b908>
Example no.11
# Box plot of price for each engine location (the original line was cut off
# in the middle of the y argument; the call is completed here).
sns.boxplot(x="engine-location", y="price", data=df)
output :-
Descriptive Statistical Analysis
Example no.12
df.describe()
output :-
Example no.13
df.describe(include=['object'])
output :-
Example no.14
df['drive-wheels'].value_counts()
output :
fwd 118
rwd 75
4wd 8
Name: drive-wheels, dtype: int64
Example no.15
# Count the cars per drive-wheel type and keep the result as a one-column
# DataFrame. Naming the column in to_frame() avoids depending on the name
# value_counts() gives its result, which differs between pandas versions
# ('drive-wheels' before 2.0, 'count' from 2.0 on).
drive_wheels_counts = df['drive-wheels'].value_counts().to_frame(name='value_counts')
drive_wheels_counts
output :
value_counts
fwd 118
rwd 75
4wd 8
Example no.16
# Grouping results: mean price for every (drive-wheels, body-style)
# combination. as_index=False keeps the two grouping keys as ordinary columns
# instead of moving them into the index.
df_gptest = df[['drive-wheels', 'body-style', 'price']]
grouped_test1 = df_gptest.groupby(['drive-wheels', 'body-style'], as_index=False).mean()
grouped_test1
output :
Example no.17
grouped_pivot = grouped_test1.pivot(index='drive-wheels',columns='body-style')
grouped_pivot
output :
Example no.18
grouped_pivot = grouped_pivot.fillna(0) #fill missing values with 0
grouped_pivot
output :
Example no.19
#use the grouped results
plt.pcolor(grouped_pivot, cmap='RdBu')
plt.colorbar()
plt.show()
output :
Example no.20
# Heatmap of the pivot table with readable axis labels.
fig, ax = plt.subplots()
im = ax.pcolor(grouped_pivot, cmap='RdBu')

# Label names: the second level of the pivot's column index goes on the
# x axis, the pivot's index goes on the y axis.
row_labels = grouped_pivot.columns.levels[1]
col_labels = grouped_pivot.index

# Put one tick at the centre of every cell. Without explicit tick positions
# the labels below would be attached to whatever default ticks matplotlib
# chose, which do not line up with the cells.
ax.set_xticks(np.arange(grouped_pivot.shape[1]) + 0.5, minor=False)
ax.set_yticks(np.arange(grouped_pivot.shape[0]) + 0.5, minor=False)

# Insert labels
ax.set_xticklabels(row_labels, minor=False)
ax.set_yticklabels(col_labels, minor=False)

# Colour scale for the plotted values (the original line was cut off at
# "fig.colorb").
fig.colorbar(im)
plt.show()
output :
Example no.21
# Annotated heatmap of the pivot table.
# NOTE(review): the original line was cut off right after "f"; "fmt" is the
# matching seaborn keyword, but the format string itself is a reconstruction
# -- confirm against the original notebook.
sns.heatmap(grouped_pivot, annot=True, fmt=".0f")
output :