Data Analysis Report
Data Analysis Report
# File-upload prompt (presumably google.colab.files - confirm). The return
# value is discarded here; the data set itself is read from GitHub below.
files.upload()

# Code Cell
# Load the car-price data set from the canonical raw.githubusercontent.com
# host. The previous URL was routed through a third-party mirror prefix,
# which is not a stable source for the file.
df = pd.read_csv(
    "https://raw.githubusercontent.com/amankharwal/Website-data/master/CarPrice.csv"
)
Code Cell
# First rows of the frame.
df.head()

# Code Cell
# Last rows of the frame.
df.tail()

# Code Cell
# (rows, columns)
df.shape

# Code Cell
# Column dtypes and non-null counts.
df.info()

# Code Cell
# Summary statistics of the numeric columns.
df.describe()

# Code Cell
# Missing values per column (isna is the same method as isnull).
df.isna().sum()

# Code Cell
# Number of fully duplicated rows.
print("Duplicate Values =", df.duplicated().sum())

# Code Cell
# Preview of the int/float columns only.
df.select_dtypes(include=["int", "float"]).head()
Markdown Cell
• # **DATA CLEANING**
Code Cell
# The company is the first space-separated token of CarName.
Company_Name = df["CarName"].str.split(" ").str[0]

# Put the new column at position 2, then drop the raw name column.
df.insert(2, "CompanyName", Company_Name)
df.drop(columns=["CarName"], inplace=True)

# Code Cell
df.head()
Code Cell
def replace(a, b):
    """Replace company name ``a`` with ``b`` in the global ``df``.

    Args:
        a: Misspelled value to look for in ``df["CompanyName"]``.
        b: Value to store in its place.

    The column is updated on the module-level ``df``; nothing is returned.
    """
    # Assign the result back instead of calling replace(..., inplace=True)
    # on the column selection: the in-place form is a chained assignment,
    # which pandas warns about and which does not update ``df`` under
    # Copy-on-Write.
    df["CompanyName"] = df["CompanyName"].replace(a, b)
# Known misspellings in CompanyName -> corrected spelling.
for wrong, right in (
    ('maxda', 'mazda'),
    ('porcshce', 'porsche'),
    ('toyouta', 'toyota'),
    ('vokswagen', 'volkswagen'),
):
    replace(wrong, right)

# Code Cell
# Check the remaining distinct company names.
df["CompanyName"].unique()
Markdown Cell
• # **EDA**
Code Cell
# Price distribution: histogram + KDE on the left, box plot on the right.
plt.figure(figsize=(20, 6))

plt.subplot(1, 2, 1)
# sns.distplot is deprecated; histplot with kde=True and stat="density"
# draws the same kind of density-scaled histogram with a KDE overlay.
sns.histplot(df["price"], color="red", kde=True, stat="density")
plt.title("Car Price Distribution", fontweight="black", pad=20, fontsize=20)

plt.subplot(1, 2, 2)
# A single box has no hue grouping, so pass one colour from the Set2
# palette instead of palette= (palette without hue is deprecated).
sns.boxplot(y=df["price"], color=sns.color_palette("Set2")[0])
plt.show()
Code Cell
# One-row table of summary statistics for price.
price_summary = df["price"].agg(
    ["min", "mean", "median", "max", "std", "skew"]
)
price_summary.to_frame().T
Code Cell
# Bar chart of the number of rows per company, most frequent first.
counts = df["CompanyName"].value_counts()

plt.figure(figsize=(14, 6))
ax = sns.barplot(x=counts.index, y=counts.values)
ax.set_xlabel("Car Company")
ax.set_ylabel("Total No. of cars sold")
ax.set_title(
    "Total Cars produced by Companies",
    pad=20,
    fontweight="black",
    fontsize=20,
)
plt.xticks(rotation=90)
plt.show()
Code Cell
# Rows for individual companies, inspected one cell at a time.
df.loc[df["CompanyName"].eq("renault")]

# Code Cell
df.loc[df["CompanyName"].eq("mercury")]

# Code Cell
df.loc[df["CompanyName"].eq("porshe")]
Code Cell
def clean_company_names(df, column):
    """Normalise the company names stored in ``df[column]``.

    Values are stripped of surrounding whitespace, lower-cased, and known
    misspellings / abbreviations are mapped to the canonical company name.

    Args:
        df: DataFrame holding the column; it is modified in place.
        column: Name of the string column to clean.

    Returns:
        The same ``df`` object, so the call can be used in an assignment.
    """
    # One map for every known variant, so the result does not depend on
    # earlier ad-hoc replace() calls having been run first.
    corrections = {
        'maxda': 'mazda',
        'porcshce': 'porsche',
        'porshe': 'porsche',
        'toyouta': 'toyota',
        'vokswagen': 'volkswagen',
        'vw': 'volkswagen',
    }
    df[column] = df[column].str.strip().str.lower().replace(corrections)
    return df
# The original call was cut off after the first argument; "CompanyName" is
# the company column created earlier in this notebook.
df = clean_company_names(df, "CompanyName")

# Code Cell
df["fueltype"].unique()
Code Cell
def categorical_visualization(cols):
    """Draw a count plot for the column ``cols`` of the global ``df``.

    Bars follow the order of ``df[cols].value_counts()``. The plot goes in
    the first slot of a 1x3 grid on a new 20x10 figure.
    """
    plt.figure(figsize=(20, 10))
    plt.subplot(1, 3, 1)

    category_order = df[cols].value_counts().index
    sns.countplot(data=df, x=cols, order=category_order, palette="Set2")

    plt.title(f"{cols} Distribution", fontsize=18, fontweight="black", pad=10)
    plt.xticks(rotation=90)
Code Cell
# Walk through the categorical columns: plot each one, then look at the
# rows belonging to selected categories.
df["aspiration"].unique()

# Code Cell
categorical_visualization("aspiration")

# Code Cell
categorical_visualization("doornumber")

# Code Cell
categorical_visualization("carbody")

# Code Cell
categorical_visualization("drivewheel")

# Code Cell
categorical_visualization("enginelocation")

# Code Cell
df.loc[df["enginelocation"].eq("rear")]

# Code Cell
categorical_visualization("enginetype")

# Code Cell
df.loc[df["enginetype"].eq("rotor")]

# Code Cell
df.loc[df["enginetype"].eq("dohcv")]

# Code Cell
categorical_visualization("cylindernumber")

# Code Cell
df.loc[df["cylindernumber"].eq("three")]

# Code Cell
df.loc[df["cylindernumber"].eq("twelve")]

# Code Cell
categorical_visualization("fuelsystem")

# Code Cell
df.loc[df["fuelsystem"].eq("mfi")]

# Code Cell
df.loc[df["fuelsystem"].eq("spfi")]

# Code Cell
categorical_visualization("symboling")
Code Cell
def scatter_plot(cols):
    """Scatter each column in ``cols`` against ``price`` from the global ``df``.

    One panel per column, side by side on a single 15x6 figure.

    Args:
        cols: Sequence of column names to put on the x axis.
    """
    # Size the grid from the input instead of hard-coding three panels.
    n_panels = len(cols)
    plt.figure(figsize=(15, 6))
    for position, col in enumerate(cols, start=1):
        plt.subplot(1, n_panels, position)
        sns.scatterplot(x=col, y="price", data=df, color="blue")
        # NOTE(review): the source was cut off after 'f"{col} vs'; the rest
        # of the title is reconstructed - confirm the intended wording and
        # any extra title styling.
        plt.title(f"{col} vs price")
    plt.tight_layout()
Code Cell
# Body dimensions vs price.
scatter_plot(["carlength", "carwidth", "carheight"])

# Code Cell
# Engine geometry vs price.
scatter_plot(["enginesize", "boreratio", "stroke"])

# Code Cell
# Engine performance vs price.
scatter_plot(["compressionratio", "horsepower", "peakrpm"])
Code Cell
def scatter_plot(cols):
    """Scatter each column in ``cols`` against ``price``, ignoring price outliers.

    Rows whose price lies at or outside the 1st/99th percentile of the
    global ``df`` are left out. One panel per column on a 15x6 figure.

    Args:
        cols: Sequence of column names to put on the x axis.
    """
    q_low = df["price"].quantile(0.01)
    q_hi = df["price"].quantile(0.99)
    df_filtered = df[(df["price"] > q_low) & (df["price"] < q_hi)]

    # Size the grid from the input instead of hard-coding two panels.
    n_panels = len(cols)
    plt.figure(figsize=(15, 6))
    for position, col in enumerate(cols, start=1):
        plt.subplot(1, n_panels, position)
        # NOTE(review): the source was cut off after plt.subplot(...); the
        # plotting calls below are reconstructed on the assumption that the
        # filtered frame was meant to be plotted - confirm.
        sns.scatterplot(x=col, y="price", data=df_filtered, color="blue")
        plt.title(f"{col} vs price")
    plt.tight_layout()
Code Cell
# Size / weight vs price.
scatter_plot(["wheelbase", "curbweight"])

# Code Cell
# Fuel economy vs price.
scatter_plot(["citympg", "highwaympg"])
Code Cell
# Mean price per company, rounded to 2 decimals, shown as a single row.
f = round(df.groupby(["CompanyName"])["price"].agg(["mean"]), 2).T
f

# Code Cell
# Attach each row's company mean as column "mean". transform() yields the
# same values as merging f.T back on CompanyName, but re-running the cell
# overwrites the column instead of producing mean_x / mean_y duplicates.
# NOTE(review): this feature is derived from the target (price) over all
# rows, including those later used for testing - confirm this is intended.
df["mean"] = df.groupby("CompanyName")["price"].transform("mean").round(2)
Code Cell
# Left-closed price bands: [0, 10000), [10000, 20000), [20000, 40000).
bins = [0, 10000, 20000, 40000]
cars_bin = ['Budget', 'Medium', 'Highend']

# Label each row by the band its company mean price falls in.
df['CarsRange'] = pd.cut(x=df['mean'], bins=bins, labels=cars_bin, right=False)
df.head()
Code Cell
# Columns kept for modelling: categorical features, numeric features,
# the target (price) and the derived CarsRange band.
new_df = df[
    [
        'fueltype', 'aspiration', 'doornumber', 'carbody', 'drivewheel',
        'enginetype', 'cylindernumber', 'fuelsystem',
        'wheelbase', 'carlength', 'carwidth', 'curbweight', 'enginesize',
        'boreratio', 'horsepower', 'citympg', 'highwaympg',
        'price', 'CarsRange',
    ]
]

# Code Cell
new_df.head()
Code Cell
# One-hot encode every categorical column.
new_df = pd.get_dummies(
    new_df,
    columns=[
        "fueltype", "aspiration", "doornumber", "carbody", "drivewheel",
        "enginetype", "cylindernumber", "fuelsystem", "CarsRange",
    ],
)

# Code Cell
new_df.head()
Code Cell
scaler = StandardScaler()

# Code Cell
# Numeric feature columns to standardise.
num_cols = [
    'wheelbase', 'carlength', 'carwidth', 'curbweight', 'enginesize',
    'boreratio', 'horsepower', 'citympg', 'highwaympg',
]

# Code Cell
new_df.head()

# Code Cell
# Features / target.
x = new_df.drop(columns=["price"])
y = new_df["price"]

# Code Cell
x.shape

# Code Cell
y.shape

# Code Cell
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows. Fitting on the full data set before splitting (as the original
# cell did) lets test-set means/variances leak into the training features.
# Work on copies so the assignments do not write into views of ``x``.
x_train = x_train.copy()
x_test = x_test.copy()
x_train[num_cols] = scaler.fit_transform(x_train[num_cols])
x_test[num_cols] = scaler.transform(x_test[num_cols])

# Code Cell
print("x_train - > ", x_train.shape)
print("x_test - > ", x_test.shape)
print("y_train - > ", y_train.shape)
print("y_test - > ", y_test.shape)
Markdown Cell
• # **MODEL BUILDING**
Code Cell
# R² percentages collected per model, in the order the models are run.
training_score, testing_score = [], []
Code Cell
def model_prediction(model):
    """Fit ``model`` on the training split and record its R² scores.

    Uses the module-level ``x_train`` / ``y_train`` / ``x_test`` / ``y_test``
    and appends the scores, as percentages, to the module-level
    ``training_score`` and ``testing_score`` lists.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        Tuple ``(train_r2_pct, test_r2_pct)`` - the same two values that
        were appended to the score lists.
    """
    model.fit(x_train, y_train)

    # Predictions are target (y) values for the given feature rows.
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    train_r2_pct = r2_score(y_train, y_train_pred) * 100
    test_r2_pct = r2_score(y_test, y_test_pred) * 100

    training_score.append(train_r2_pct)
    testing_score.append(test_r2_pct)
    return train_r2_pct, test_r2_pct
Code Cell
# Fit and score the three regressors in a fixed order; the score lists are
# filled in this same order.
for estimator in (
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
):
    model_prediction(estimator)
Code Cell
# Labels in the same order the models were fitted above.
models = ["Linear Regression", "Decision Tree", "Random Forest"]

# Keep the results in their own name: rebinding ``df`` here would replace
# the car data set that earlier cells and the plotting helpers read through
# the global ``df``, so re-running any of them afterwards would break.
results_df = pd.DataFrame(
    {
        "Algorithms": models,
        "Training Score": training_score,
        "Testing Score": testing_score,
    }
)
results_df

# Code Cell
# Grouped bar chart: training vs testing R² (%) per algorithm.
results_df.plot(
    x="Algorithms",
    y=["Training Score", "Testing Score"],
    figsize=(16, 6),
    kind="bar",
    title="Performance Visualization of Different Models",
    colormap="Set1",
)
plt.show()