5) RandomForest.ipynb - Colaboratory
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
%matplotlib inline
from google.colab import files
uploaded = files.upload()
import io
df = pd.read_csv(io.BytesIO(uploaded['car_evaluation(1).csv']))
df
df.shape
(1727, 7)
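# Aside (not in the original notebook): the CSV actually holds 1728 records and has no
# header row, so read_csv consumed the first record as column names; that is why the
# shape above shows 1727 rows. A sketch of a read that keeps every record
# (df_full is an illustrative name, used only here):
df_full = pd.read_csv(io.BytesIO(uploaded['car_evaluation(1).csv']), header=None,
                      names=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class'])
df_full.shape  # (1728, 7)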
df.head()
# output columns: vhigh, vhigh.1, 2, 2.1, small, low, unacc (the first record was read in as the header)
col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
df.columns = col_names
col_names
df.head()
  buying  maint doors persons lug_boot safety  class
0  vhigh  vhigh     2       2    small    med  unacc
1  vhigh  vhigh     2       2    small   high  unacc
2  vhigh  vhigh     2       2      med    low  unacc
3  vhigh  vhigh     2       2      med    med  unacc
4  vhigh  vhigh     2       2      med   high  unacc
# summary of the data set
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1727 entries, 0 to 1726
dtypes: object(7)
# Frequency distribution of values in variables
# Now, check the frequency counts of the categorical variables.
col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
for col in col_names:
    print(df[col].value_counts())
high     432
med      432
low      432
vhigh    431
Name: buying, dtype: int64
high     432
med      432
low      432
vhigh    431
Name: maint, dtype: int64
3        432
4        432
5more    432
2        431
Name: doors, dtype: int64
4       576
more    576
2       575
Name: persons, dtype: int64
big      576
med      576
small    575
Name: lug_boot, dtype: int64
high    576
med     576
low     575
Name: safety, dtype: int64
unacc    1209
acc       384
good       69
vgood      65
Name: class, dtype: int64
df['class'].value_counts()
unacc 1209
acc 384
good 69
vgood 65
# check missing values in variables
df.isnull().sum()
buying 0
maint 0
doors 0
persons 0
lug_boot 0
safety 0
class 0
dtype: int64
#Declare feature vector and target variable
X = df.drop(['class'], axis=1)
y = df['class']
X
     buying  maint  doors persons lug_boot safety
0     vhigh  vhigh      2       2    small    med
1     vhigh  vhigh      2       2    small   high
...     ...    ...    ...     ...      ...    ...
1725    low    low  5more    more      big    med
1726    low    low  5more    more      big   high

[1727 rows x 6 columns]
#Split data into separate training and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# check the shape of X_train and X_test
X_train.shape, X_test.shape
((1157, 6), (570, 6))
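# Aside (not in the original notebook): the target is imbalanced (unacc dominates the
# value counts above), so a stratified split that preserves class proportions is worth
# considering. A sketch with the same split parameters (X_tr etc. are illustrative names):
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)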
#Feature Engineering
# check data types in X_train
X_train.dtypes
buying object
maint object
doors object
persons object
lug_boot object
safety object
dtype: object
#Encode categorical variables
X_train.head()
# (output: the first five rows of X_train, still as raw category strings)
# import category encoders
import category_encoders as ce
# encode categorical variables with ordinal encoding
encoder = ce.OrdinalEncoder(cols=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)
X_train.head()
      buying  maint  doors  persons  lug_boot  safety
83         1      1      1        1         1       1
48         1      1      2        2         1       2
468        2      1      2        3         2       2
155        1      2      2        2         1       1
1043       3      2      3        2         2       1
X_test.head()
      buying  maint  doors  persons  lug_boot  safety
599        2      2      3        1         3       1
932        3      1      3        3         3       1
628        2      2      1        1         3       3
1497       4      2      1        3         1       2
1262       3      4      3        2         1       1
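# Aside (not in the original notebook): the fitted OrdinalEncoder stores its
# category-to-integer assignments in the `mapping` attribute (a list with one entry
# per encoded column), which makes the encoded tables above easy to interpret:
for m in encoder.mapping:
    print(m['col'], dict(m['mapping']))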
# import Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
# instantiate the classifier with n_estimators = 10
rfc = RandomForestClassifier(n_estimators=10, random_state=0)
# fit the model
rfc.fit(X_train, y_train)
RandomForestClassifier(n_estimators=10, random_state=0)
# Predict the Test set results
y_pred = rfc.predict(X_test)
# Check accuracy score
from sklearn.metrics import accuracy_score
print('Model accuracy score with 10 decision-trees : {0:0.4f}'.format(accuracy_score(y_test, y_pred)))
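# Aside (not in the original notebook): comparing train and test accuracy is a quick
# overfitting check, reusing the objects already defined above:
print('Train accuracy : {0:0.4f}'.format(accuracy_score(y_train, rfc.predict(X_train))))
print('Test accuracy  : {0:0.4f}'.format(accuracy_score(y_test, y_pred)))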
# instantiate the classifier with n_estimators = 100
rfc_100 = RandomForestClassifier(n_estimators=100, random_state=0)
# fit the model to the training set
rfc_100.fit(X_train, y_train)
RandomForestClassifier(n_estimators=100, random_state=0)
# Predict on the test set results
y_pred_100 = rfc_100.predict(X_test)
# Check accuracy score
print('Model accuracy score with 100 decision-trees : {0:0.4f}'.format(accuracy_score(y_test, y_pred_100)))
#Find important features with Random Forest model
# create the classifier with n_estimators = 100
clf = RandomForestClassifier(n_estimators=100, random_state=0)
# fit the model to the training set
clf.fit(X_train, y_train)
RandomForestClassifier(n_estimators=100, random_state=0)
# view the feature scores
feature_scores = pd.Series(clf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
feature_scores
safety 0.291657
persons 0.235380
buying 0.160692
maint 0.134143
lug_boot 0.111595
doors 0.066533
dtype: float64
# Creating a seaborn bar plot
sns.barplot(x=feature_scores, y=feature_scores.index)
# Add title to the graph
plt.title("Visualizing Important Features")
# Visualize the graph
plt.show()
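# Aside (not in the original notebook): impurity-based importances such as
# feature_importances_ can be biased toward features with many distinct values;
# scikit-learn's permutation_importance is a useful cross-check on the held-out test set:
from sklearn.inspection import permutation_importance
result = permutation_importance(clf, X_test, y_test, n_repeats=10, random_state=0)
perm_scores = pd.Series(result.importances_mean, index=X_test.columns).sort_values(ascending=False)
print(perm_scores)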
#Build Random Forest model on selected features
# declare feature vector and target variable
X = df.drop(['class', 'doors'], axis=1)
y = df['class']
# split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X
#encode categorical variables with ordinal encoding
encoder = ce.OrdinalEncoder(cols=['buying', 'maint', 'persons', 'lug_boot', 'safety'])
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)
# instantiate the classifier with n_estimators = 100
clf = RandomForestClassifier(n_estimators=100, random_state=0)
# fit the model to the training set
clf.fit(X_train, y_train)
RandomForestClassifier(n_estimators=100, random_state=0)
# Predict on the test set results
y_pred = clf.predict(X_test)
# Check accuracy score
print('Model accuracy score with doors variable removed : {0:0.4f}'.format(accuracy_score(y_test, y_pred)))
# Classification Report
# The classification report is another way to evaluate classification model performance. It
# displays the precision, recall, F1 and support scores for the model.
# We can print a classification report as follows:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
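# Aside (not in the original notebook): a confusion matrix complements the report by
# showing exactly which classes get confused with which; a sketch using the fitted
# model's class order:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
print(pd.DataFrame(cm, index=clf.classes_, columns=clf.classes_))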