
Lec_ExploratoryDataAnalysis1Unit5Part1

October 3, 2023

[4]: import pandas as pd
import numpy as np

# Create a sample dataset with missing data and duplicates
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Alice'],
    'Age': [25, 30, np.nan, 35, 28, 25],
    'Gender': ['Female', 'Male', 'Male', 'Male', 'Female', 'Female'],
    'Salary': [50000, 60000, 45000, 70000, np.nan, 55000]
}

df = pd.DataFrame(data)
print("Dataset", "\n", df)


# Handling Missing Data
# Check for missing values
print("Missing values in the DataFrame:")
print(df.isnull())

# Remove rows with missing values


df.dropna(inplace=True)
print("\nDataFrame after removing rows with missing values:")
print(df)

# Data Transformation: Removing Duplicates


# Remove duplicate rows
df.drop_duplicates(inplace=True)
print("\nDataFrame after removing duplicates:")
print(df)

# Transforming Data Using a Function or Mapping


# Example: Transform 'Age' column by squaring each value
df['Age'] = df['Age'].apply(lambda x: x ** 2)
print("\nDataFrame after transforming 'Age' column:")
print(df)

# Replacing Values

# Replace 'Female' with 'F' and 'Male' with 'M' in the 'Gender' column
df['Gender'] = df['Gender'].replace({'Female': 'F', 'Male': 'M'})
print("\nDataFrame after replacing values in 'Gender' column:")
print(df)

# Detecting and Filtering Outliers


# Detect outliers in the 'Salary' column using z-score
from scipy import stats
z_scores = np.abs(stats.zscore(df['Salary']))
outliers = df[z_scores > 2]
print("\nOutliers in 'Salary' column:")
print(outliers)

# Functions in pandas
# Calculate the mean salary
mean_salary = df['Salary'].mean()
print("\nMean Salary:", mean_salary)

Dataset
Name Age Gender Salary
0 Alice 25.0 Female 50000.0
1 Bob 30.0 Male 60000.0
2 Charlie NaN Male 45000.0
3 David 35.0 Male 70000.0
4 Eva 28.0 Female NaN
5 Alice 25.0 Female 55000.0
Missing values in the DataFrame:
Name Age Gender Salary
0 False False False False
1 False False False False
2 False True False False
3 False False False False
4 False False False True
5 False False False False

DataFrame after removing rows with missing values:


Name Age Gender Salary
0 Alice 25.0 Female 50000.0
1 Bob 30.0 Male 60000.0
3 David 35.0 Male 70000.0
5 Alice 25.0 Female 55000.0

DataFrame after removing duplicates:


Name Age Gender Salary
0 Alice 25.0 Female 50000.0
1 Bob 30.0 Male 60000.0
3 David 35.0 Male 70000.0

5 Alice 25.0 Female 55000.0

DataFrame after transforming 'Age' column:


Name Age Gender Salary
0 Alice 625.0 Female 50000.0
1 Bob 900.0 Male 60000.0
3 David 1225.0 Male 70000.0
5 Alice 625.0 Female 55000.0

DataFrame after replacing values in 'Gender' column:


Name Age Gender Salary
0 Alice 625.0 F 50000.0
1 Bob 900.0 M 60000.0
3 David 1225.0 M 70000.0
5 Alice 625.0 F 55000.0

Outliers in 'Salary' column:


Empty DataFrame
Columns: [Name, Age, Gender, Salary]
Index: []

Mean Salary: 58750.0
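As a complement to the z-score check in the cell above, an IQR-based rule is another common way to flag outliers. Below is a minimal sketch on the same toy DataFrame; the 1.5 multiplier is the conventional choice, not something prescribed by this notebook.

[ ]: # Flag salaries outside 1.5 * IQR (sketch; conventional multiplier assumed)
q1, q3 = df['Salary'].quantile([0.25, 0.75])
iqr = q3 - q1
iqr_outliers = df[(df['Salary'] < q1 - 1.5 * iqr) | (df['Salary'] > q3 + 1.5 * iqr)]
print("\nIQR-based outliers in 'Salary':")
print(iqr_outliers)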

1 Handling Missing Data


[5]: import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt # Matlab-style plotting

[6]: df=pd.read_csv("data.csv")

[7]: df.head()

[7]: Make Model Year Engine Fuel Type Engine HP \


0 BMW 1 Series M 2011 premium unleaded (required) 335.0
1 BMW 1 Series 2011 premium unleaded (required) 300.0
2 BMW 1 Series 2011 premium unleaded (required) 300.0
3 BMW 1 Series 2011 premium unleaded (required) 230.0
4 BMW 1 Series 2011 premium unleaded (required) 230.0

Engine Cylinders Transmission Type Driven_Wheels Number of Doors \


0 6.0 MANUAL rear wheel drive 2.0
1 6.0 MANUAL rear wheel drive 2.0
2 6.0 MANUAL rear wheel drive 2.0
3 6.0 MANUAL rear wheel drive 2.0
4 6.0 MANUAL rear wheel drive 2.0

Market Category Vehicle Size Vehicle Style \
0 Factory Tuner,Luxury,High-Performance Compact Coupe
1 Luxury,Performance Compact Convertible
2 Luxury,High-Performance Compact Coupe
3 Luxury,Performance Compact Coupe
4 Luxury Compact Convertible

highway MPG city mpg Popularity MSRP


0 26 19 3916 46135
1 28 19 3916 40650
2 28 20 3916 36350
3 28 18 3916 29450
4 28 18 3916 34500

[9]: df.columns.tolist()

[9]: ['Make',
'Model',
'Year',
'Engine Fuel Type',
'Engine HP',
'Engine Cylinders',
'Transmission Type',
'Driven_Wheels',
'Number of Doors',
'Market Category',
'Vehicle Size',
'Vehicle Style',
'highway MPG',
'city mpg',
'Popularity',
'MSRP']

[10]: #Uniform format


df.columns = df.columns.str.lower().str.replace(' ', '_')
df.columns.tolist()

[10]: ['make',
'model',
'year',
'engine_fuel_type',
'engine_hp',
'engine_cylinders',
'transmission_type',
'driven_wheels',
'number_of_doors',

'market_category',
'vehicle_size',
'vehicle_style',
'highway_mpg',
'city_mpg',
'popularity',
'msrp']

[11]: df = df.rename(columns={'msrp': 'price'})


df.columns.tolist()

[11]: ['make',
'model',
'year',
'engine_fuel_type',
'engine_hp',
'engine_cylinders',
'transmission_type',
'driven_wheels',
'number_of_doors',
'market_category',
'vehicle_size',
'vehicle_style',
'highway_mpg',
'city_mpg',
'popularity',
'price']

[12]: string_columns = list(df.dtypes[df.dtypes == 'object'].index)

# Apply the same uniform format to the string values themselves
for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')
df.columns.tolist()

[12]: ['make',
'model',
'year',
'engine_fuel_type',
'engine_hp',
'engine_cylinders',
'transmission_type',
'driven_wheels',
'number_of_doors',
'market_category',
'vehicle_size',
'vehicle_style',
'highway_mpg',
'city_mpg',

'popularity',
'price']

[13]: df.head()

[13]: make model year engine_fuel_type engine_hp \


0 bmw 1_series_m 2011 premium_unleaded_(required) 335.0
1 bmw 1_series 2011 premium_unleaded_(required) 300.0
2 bmw 1_series 2011 premium_unleaded_(required) 300.0
3 bmw 1_series 2011 premium_unleaded_(required) 230.0
4 bmw 1_series 2011 premium_unleaded_(required) 230.0

engine_cylinders transmission_type driven_wheels number_of_doors \


0 6.0 manual rear_wheel_drive 2.0
1 6.0 manual rear_wheel_drive 2.0
2 6.0 manual rear_wheel_drive 2.0
3 6.0 manual rear_wheel_drive 2.0
4 6.0 manual rear_wheel_drive 2.0

market_category vehicle_size vehicle_style \


0 factory_tuner,luxury,high-performance compact coupe
1 luxury,performance compact convertible
2 luxury,high-performance compact coupe
3 luxury,performance compact coupe
4 luxury compact convertible

highway_mpg city_mpg popularity price


0 26 19 3916 46135
1 28 19 3916 40650
2 28 20 3916 36350
3 28 18 3916 29450
4 28 18 3916 34500

[15]: from scipy import stats


from scipy.stats import norm
from sklearn.preprocessing import StandardScaler

[18]: sns.distplot(df['price']);

C:\Users\agarw\anaconda3\lib\site-packages\seaborn\distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed in a
future version. Please adapt your code to use either `displot` (a figure-level
function with similar flexibility) or `histplot` (an axes-level function for
histograms).
warnings.warn(msg, FutureWarning)
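The warning above recommends moving away from `distplot`. A roughly equivalent plot with the non-deprecated API (a minimal sketch, assuming seaborn >= 0.11 is installed) could be:

[ ]: # Histogram with a KDE overlay, the modern counterpart of distplot
sns.histplot(df['price'], kde=True, stat='density')
plt.show()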

[19]: sns.distplot(df['price'] , fit=norm);

C:\Users\agarw\anaconda3\lib\site-packages\seaborn\distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed in a
future version. Please adapt your code to use either `displot` (a figure-level
function with similar flexibility) or `histplot` (an axes-level function for
histograms).
warnings.warn(msg, FutureWarning)

[25]: sns.distplot(df['price'] , fit=norm);
(mu, sigma) = norm.fit(df['price'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
#check skewness of the data
print("Skewness: %f" % df['price'].skew())
print("Kurtosis: %f" % df['price'].kurt())

#Now plot the distribution


plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('Price distribution')

#Get also the QQ-plot


fig = plt.figure()
res = stats.probplot(df['price'], plot=plt)
plt.show()

C:\Users\agarw\anaconda3\lib\site-packages\seaborn\distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed in a
future version. Please adapt your code to use either `displot` (a figure-level
function with similar flexibility) or `histplot` (an axes-level function for
histograms).

warnings.warn(msg, FutureWarning)

mu = 40594.74 and sigma = 60106.58

Skewness: 11.771987
Kurtosis: 268.926276

Kurtosis
In probability theory and statistics, kurtosis is a measure of the “tailedness” of the probability distribution of a real-valued random variable. Like skewness, kurtosis describes a particular aspect of a probability distribution. There are different ways to quantify kurtosis for a theoretical distribution, and there are corresponding ways of estimating it from a sample drawn from a population. Different measures of kurtosis may have different interpretations.
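As a small illustration of that last point (a sketch using the price column loaded above), pandas and SciPy expose slightly different estimators: `Series.kurt()` reports bias-corrected excess kurtosis, while `scipy.stats.kurtosis` defaults to the uncorrected Fisher (excess) estimate and can also report the Pearson definition.

[ ]: from scipy.stats import kurtosis

# pandas: bias-corrected excess kurtosis (normal distribution -> 0)
print("pandas .kurt():        ", df['price'].kurt())
# SciPy default: Fisher (excess) definition without bias correction
print("scipy excess kurtosis: ", kurtosis(df['price']))
# Pearson definition (normal distribution -> 3)
print("scipy Pearson kurtosis:", kurtosis(df['price'], fisher=False))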
[29]: # Set the variable and data for the scatter plot
engine_col = 'engine_hp'
engine_data = pd.concat([df['price'], df[engine_col]], axis=1)

engine_data.head()

[29]: price engine_hp


0 46135 335.0
1 40650 300.0
2 36350 300.0
3 29450 230.0
4 34500 230.0

[30]: # Create the scatter plot
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x=engine_data[engine_col], y=engine_data['price'])
ax.set_ylim([0, 800000])
ax.set_title("Scatter plot of car popularity and price")
ax.set_xlabel("Engine Horsepower (rpm)")
ax.set_ylabel("Price ($)")

# Show the plot


plt.show()

[32]: engine_cylinders_col = 'engine_cylinders'


engine_cylinders_price_data = pd.concat([df['price'], df[engine_cylinders_col]], axis=1)

# Create the box plot


fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=engine_cylinders_col, y='price', data=engine_cylinders_price_data, ax=ax)

ax.set_title('Box plot of engine cylinders and price')

ax.set_xlabel('Number of Engine Cylinders')
ax.set_ylabel('Price ($)')
plt.xticks(rotation=45)

# Show the plot


plt.show()

[42]: # Select the top 5 car makes by frequency
make_col = 'make'
top_makes = df['make'].value_counts().nlargest(5)
top_makes

[42]: chevrolet 1123


ford 881
volkswagen 809
toyota 746
dodge 626
Name: make, dtype: int64

[43]: top_makes = df['make'].value_counts().nlargest(5).index.tolist()
print(top_makes)
# Create a new DataFrame that only includes the top makes
top_make_data = df[df[make_col].isin(top_makes)]
top_make_data.head()

['chevrolet', 'ford', 'volkswagen', 'toyota', 'dodge']

[43]: make model year engine_fuel_type engine_hp engine_cylinders \


479 toyota 4runner 2014 regular_unleaded 270.0 6.0
480 toyota 4runner 2014 regular_unleaded 270.0 6.0
481 toyota 4runner 2014 regular_unleaded 270.0 6.0
482 toyota 4runner 2014 regular_unleaded 270.0 6.0
483 toyota 4runner 2014 regular_unleaded 270.0 6.0

transmission_type driven_wheels number_of_doors market_category \


479 automatic rear_wheel_drive 4.0 NaN
480 automatic rear_wheel_drive 4.0 NaN
481 automatic four_wheel_drive 4.0 NaN
482 automatic four_wheel_drive 4.0 NaN
483 automatic four_wheel_drive 4.0 NaN

vehicle_size vehicle_style highway_mpg city_mpg popularity price


479 midsize 4dr_suv 23 17 2031 41365
480 midsize 4dr_suv 23 17 2031 35740
481 midsize 4dr_suv 22 17 2031 37615
482 midsize 4dr_suv 22 17 2031 34695
483 midsize 4dr_suv 22 17 2031 35725

[44]: # Create the box plot with the top makes


fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=make_col, y='price', data=top_make_data, ax=ax)
ax.set_title('Box plot of top car brands and price')
ax.set_xlabel('Car Brand')
ax.set_ylabel('Price ($)')
plt.xticks(rotation=45)

# Show the plot


plt.show()

[45]: # correlation matrix

[47]: plt.figure(figsize=(7, 6))
# Note: recent pandas versions (>= 2.0) need df.corr(numeric_only=True) to skip the text columns
correlation = df.corr()
sns.heatmap(correlation, annot=True)
correlation

[47]: year engine_hp engine_cylinders number_of_doors \


year 1.000000 0.351794 -0.041479 0.263787
engine_hp 0.351794 1.000000 0.779988 -0.102713
engine_cylinders -0.041479 0.779988 1.000000 -0.140088
number_of_doors 0.263787 -0.102713 -0.140088 1.000000
highway_mpg 0.258240 -0.406563 -0.621606 0.118570
city_mpg 0.198171 -0.439371 -0.600776 0.120881
popularity 0.073049 0.037501 0.041145 -0.048272
price 0.227590 0.662008 0.531312 -0.126635

highway_mpg city_mpg popularity price

year 0.258240 0.198171 0.073049 0.227590
engine_hp -0.406563 -0.439371 0.037501 0.662008
engine_cylinders -0.621606 -0.600776 0.041145 0.531312
number_of_doors 0.118570 0.120881 -0.048272 -0.126635
highway_mpg 1.000000 0.886829 -0.020991 -0.160043
city_mpg 0.886829 1.000000 -0.003217 -0.157676
popularity -0.020991 -0.003217 1.000000 -0.048476
price -0.160043 -0.157676 -0.048476 1.000000

pairplot
[48]: sns.set()
cols = ['year', 'engine_hp', 'engine_cylinders', 'number_of_doors', 'price',]
sns.pairplot(df[cols], height = 2.5)
plt.show();

2 DATA CLEANSING
[53]: # Check the missing ratio for each column
data_na = (df.isnull().sum() / len(df)) * 100
print(data_na)
# Keep only the columns that actually have missing values (drop the zero-ratio columns)
data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)[:30]

missing_data = pd.DataFrame({'Missing Ratio': data_na})
missing_data.head(20)

make 0.000000
model 0.000000
year 0.000000
engine_fuel_type 0.025180
engine_hp 0.579151
engine_cylinders 0.251805
transmission_type 0.000000
driven_wheels 0.000000
number_of_doors 0.050361
market_category 31.408427
vehicle_size 0.000000
vehicle_style 0.000000
highway_mpg 0.000000
city_mpg 0.000000
popularity 0.000000
price 0.000000
dtype: float64

[53]: Missing Ratio


market_category 31.408427
engine_hp 0.579151
engine_cylinders 0.251805
number_of_doors 0.050361
engine_fuel_type 0.025180

[54]: fig, ax = plt.subplots(figsize=(15, 12))


sns.barplot(x=data_na.index, y=data_na, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set(xlabel='Features', ylabel='Percent of missing values',
       title='Percent missing data by feature')
ax.grid(True)
plt.show()

Drop Duplicates
[56]: print(df.shape)
duplicate_rows_df = df[df.duplicated()]
print("number of duplicate rows: ", duplicate_rows_df.shape)

(11914, 16)
number of duplicate rows: (715, 16)

[58]: #drop duplicates


df = df.drop_duplicates()
print(df.shape)

(11199, 16)

3 Deal with missing values
[60]: df.head().T

[60]: 0 \
make bmw
model 1_series_m
year 2011
engine_fuel_type premium_unleaded_(required)
engine_hp 335.0
engine_cylinders 6.0
transmission_type manual
driven_wheels rear_wheel_drive
number_of_doors 2.0
market_category factory_tuner,luxury,high-performance
vehicle_size compact
vehicle_style coupe
highway_mpg 26
city_mpg 19
popularity 3916
price 46135

1 2 \
make bmw bmw
model 1_series 1_series
year 2011 2011
engine_fuel_type premium_unleaded_(required) premium_unleaded_(required)
engine_hp 300.0 300.0
engine_cylinders 6.0 6.0
transmission_type manual manual
driven_wheels rear_wheel_drive rear_wheel_drive
number_of_doors 2.0 2.0
market_category luxury,performance luxury,high-performance
vehicle_size compact compact
vehicle_style convertible coupe
highway_mpg 28 28
city_mpg 19 20
popularity 3916 3916
price 40650 36350

3 4
make bmw bmw
model 1_series 1_series
year 2011 2011
engine_fuel_type premium_unleaded_(required) premium_unleaded_(required)
engine_hp 230.0 230.0
engine_cylinders 6.0 6.0

transmission_type manual manual
driven_wheels rear_wheel_drive rear_wheel_drive
number_of_doors 2.0 2.0
market_category luxury,performance luxury
vehicle_size compact compact
vehicle_style coupe convertible
highway_mpg 28 28
city_mpg 18 18
popularity 3916 3916
price 29450 34500

[61]: df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11199 entries, 0 to 11913
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 make 11199 non-null object
1 model 11199 non-null object
2 year 11199 non-null int64
3 engine_fuel_type 11196 non-null object
4 engine_hp 11130 non-null float64
5 engine_cylinders 11169 non-null float64
6 transmission_type 11199 non-null object
7 driven_wheels 11199 non-null object
8 number_of_doors 11193 non-null float64
9 market_category 7823 non-null object
10 vehicle_size 11199 non-null object
11 vehicle_style 11199 non-null object
12 highway_mpg 11199 non-null int64
13 city_mpg 11199 non-null int64
14 popularity 11199 non-null int64
15 price 11199 non-null int64
dtypes: float64(3), int64(5), object(8)
memory usage: 1.5+ MB

[62]: # engine_fuel_type: 11196 non-null (object)
# Fill missing values with the mode within the same 'model' group
# (the column holds category labels, not numeric values).
# If a group's mode is empty, fall back to None.
df['engine_fuel_type'] = df.groupby('model')['engine_fuel_type'].transform(
    lambda x: x.fillna(x.mode().iloc[0] if not x.mode().empty else None))

[64]: df['engine_fuel_type'].isnull().sum()

[64]: 0

[65]: # number_of_doors: 11193 non-null (float64)
# Fill missing values with the mean of 'number_of_doors' within the same 'model' group.
df['number_of_doors'] = df.groupby('model')['number_of_doors'].transform(
    lambda x: x.fillna(x.mean()))

[67]: df['number_of_doors'].isnull().sum()

[67]: 0

[70]: # engine_cylinders: 11169 non-null (float64)
# Fill missing values with the mean of 'engine_cylinders' within the same 'model' group.
df['engine_cylinders'] = df.groupby('model')['engine_cylinders'].transform(
    lambda x: x.fillna(x.mean()))

[71]: df['engine_cylinders'].isnull().sum()

[71]: 29

[73]: # Because the fill is computed per 'model' group, groups whose values are all
# missing have no statistic to borrow, so some null values remain.
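A quick way to see where those remaining nulls come from (a small illustrative check, using the columns defined above) is to list the models whose engine_cylinders value is still missing, i.e. the groups whose values were all NaN so the group mean itself was NaN:

[ ]: # Groups whose values are all NaN have a NaN group mean, so the fill above leaves them untouched
print(df.loc[df['engine_cylinders'].isnull(), 'model'].unique())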

[74]: # engine_hp: fill missing values with the mean within the same ('model', 'year') group
df['engine_hp'] = df.groupby(['model', 'year'])['engine_hp'].transform(
    lambda x: x.fillna(x.mean()))

[75]: df['engine_hp'].isnull().sum()

[75]: 47

[76]: df.isnull().sum()

[76]: make 0
model 0
year 0
engine_fuel_type 0
engine_hp 47
engine_cylinders 29
transmission_type 0
driven_wheels 0
number_of_doors 0
market_category 3376
vehicle_size 0
vehicle_style 0
highway_mpg 0

city_mpg 0
popularity 0
price 0
dtype: int64

[77]: # Because the fills are computed per group, a few null values still remain (see the counts above).

To address this issue, we can use a rule-based method for imputing the remaining missing values; a sketch of one such fallback follows.
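One possible set of rules (a minimal sketch, not the notebook's prescribed solution): fall back from the model-level fill to the make-level median, then to the overall median, and mark missing market_category entries with an explicit label. The column names match the cleaned dataset above; the 'unknown' label is an assumption for illustration.

[ ]: # Hypothetical fallback rules for the values the group-wise fill could not cover
for col in ['engine_hp', 'engine_cylinders']:
    # rule 1: median within the same make
    df[col] = df.groupby('make')[col].transform(lambda x: x.fillna(x.median()))
    # rule 2: overall median as the last resort
    df[col] = df[col].fillna(df[col].median())

# market_category is a text label; mark missing entries explicitly (assumed label)
df['market_category'] = df['market_category'].fillna('unknown')
print(df.isnull().sum())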
[ ]:
