In [35]: import numpy as np
import pandas as pd
import seaborn as sns

In [36]: df = pd.read_csv("Default.csv")
df.head(5)

Out[36]:   Unnamed: 0 default student balance   income
        0           1      No      No  729.53 44361.63
        1           2      No     Yes  817.18 12106.13
        2           3      No      No 1073.55 31767.14
        3           4      No      No  529.25 35704.49
        4           5      No      No  785.66 38463.50

In [37]: df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [38]: df.shape

Out[38]: (10000, 4)

In [39]: import matplotlib.pyplot as plt
from scipy import stats, integrate
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
%matplotlib inline
pd.options.display.float_format = '{:.2f}'.format
plt.rcParams['figure.figsize'] = (8, 6)
plt.rcParams['font.size'] = 14

In [40]: sns.boxplot(x='default', y='income', data=df)
plt.show()

In [41]: sns.lmplot(x='balance', y='income', hue='default', data=df, aspect=1.5, fit_reg=False)
plt.show()
In [42]: df.default.value_counts()

Out[42]: default
No 9667
Yes 333
Name: count, dtype: int64

In [43]: df.student.value_counts()

Out[43]: student
No 7056
Yes 2944
Name: count, dtype: int64

In [44]: df['default 1'] = df.default.factorize()[0]

In [45]: df['student 1'] = df.student.factorize()[0]
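
pd.factorize assigns integer codes in order of first appearance, which is why 'No' becomes 0 and 'Yes' becomes 1 in both columns here. A minimal sketch verifying that behaviour on a toy Series:

codes, uniques = pd.factorize(pd.Series(['No', 'Yes', 'No', 'No']))
print(codes)    # [0 1 0 0] -- codes follow order of first appearance
print(uniques)  # Index(['No', 'Yes'], dtype='object')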

In [46]: df.head()

Out[46]:   default student balance   income  default 1  student 1
        0      No      No  729.53 44361.63          0          0
        1      No     Yes  817.18 12106.13          0          1
        2      No      No 1073.55 31767.14          0          0
        3      No      No  529.25 35704.49          0          0
        4      No      No  785.66 38463.50          0          0

In [47]: df.drop(["default", "student"], axis=1, inplace=True)

In [48]: pd.crosstab(df['default 1'], df['student 1'], rownames=['Default'], colnames=['Student'])

Out[48]: Student     0     1
         Default
         0        6850  2817
         1         206   127

In [57]: X = df[['balance']]
y = df['default 1']

In [58]: y

Out[58]: 0 0
1 0
2 0
3 0
4 0
..
9995 0
9996 0
9997 0
9998 0
9999 0
Name: default 1, Length: 10000, dtype: int64
In [59]: from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(X, y)
print(logreg.coef_)
print(logreg.intercept_)

y_pred = logreg.predict_proba(X)
plt.scatter(X.values, y_pred[:,1])
#plt.scatter(X.values, y)
plt.show()

[[0.00549892]]
[-10.65132824]
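
The printed coefficient and intercept define the fitted curve above: P(default=1 | balance) = 1 / (1 + exp(-(intercept + coef * balance))). A minimal sketch reproducing the first predicted probability by hand, using the values printed above:

b0, b1 = -10.65132824, 0.00549892  # intercept and coefficient printed above
balance = 729.53                   # balance of the first row
p = 1 / (1 + np.exp(-(b0 + b1 * balance)))
print(p)  # ~0.0013, matching the first entry of y_pred[:, 1] below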

In [60]: y_pred

Out[60]: array([[9.98694319e-01, 1.30568146e-03],
       [9.97887402e-01, 2.11259754e-03],
       [9.91405252e-01, 8.59474814e-03],
       ...,
       [9.97533484e-01, 2.46651596e-03],
       [8.83240365e-01, 1.16759635e-01],
       [9.99928552e-01, 7.14476480e-05]])
In [61]: y_pred[:,0]

Out[61]: array([0.99869432, 0.9978874 , 0.99140525, ..., 0.99753348, 0.88324037,
       0.99992855])
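
predict_proba returns one column per class, ordered as in logreg.classes_: column 0 is P(default=0), column 1 is P(default=1), and each row sums to 1. A quick check on the fitted model:

print(logreg.classes_)         # [0 1]
print(y_pred.sum(axis=1)[:5])  # [1. 1. 1. 1. 1.] -- each row is a probability distribution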

In [62]: X.head()

Out[62]:   balance
        0   729.53
        1   817.18
        2  1073.55
        3   529.25
        4   785.66

In [63]: # split the data into train and test sets with a 70:30 ratio
from sklearn.model_selection import train_test_split
xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.30, random_state=1)  # seed value truncated in the source; 1 is a placeholder
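
Since only about 3% of the rows are defaulters, a purely random split can leave the train and test sets with somewhat different default rates. A hedged variant (not in the original notebook) passes stratify=y so both splits keep the 9667:333 class ratio:

xTrain, xTest, yTrain, yTest = train_test_split(
    X, y, test_size=0.30, random_state=1, stratify=y)  # stratified alternative; the seed is again a placeholder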

In [64]: from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression(class_weight='balanced')
logreg.fit(X, y)
print(logreg.coef_)
print(logreg.intercept_)

[[0.005602]]
[-7.42029855]
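
class_weight='balanced' weights each class by n_samples / (n_classes * bincount(y)), i.e. inversely to its frequency, which is why the intercept moves from about -10.65 to -7.42. A sketch of the weights this implies for this data:

counts = np.bincount(y)          # [9667, 333]
weights = len(y) / (2 * counts)
print(weights)                   # ~[0.52, 15.02] -- each default counts about 15x as much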

In [65]: logreg.fit(xTrain, yTrain)

Out[65]: LogisticRegression(class_weight='balanced')

In [66]: # apply the fitted model to both the test and the train data
y_log_pred_test = logreg.predict(xTest)
y_log_pred_train = logreg.predict(xTrain)

In [67]: y_log_pred_test.shape

Out[67]: (3000,)

In [68]: y_log_pred_train.shape

Out[68]: (7000,)
In [69]: y_log_pred_test

Out[69]: array([0, 0, 0, ..., 0, 0, 0])

In [70]: conf = metrics.confusion_matrix(yTest, y_log_pred_test)
conf

Out[70]: array([[2514,  377],
       [  22,   87]])

In [77]: confusion = metrics.confusion_matrix(yTest, y_log_pred_test)
print(confusion)
# sklearn convention: rows are true labels, columns are predicted labels,
# so for this 0/1 problem the layout is [[TN, FP], [FN, TP]]
TN = confusion[0, 0]
TP = confusion[1, 1]
FP = confusion[0, 1]
FN = confusion[1, 0]
print("TP", TP)
print("TN", TN)
print("FN", FN)
print("FP", FP)

[[2514  377]
 [  22   87]]
TP 87
TN 2514
FN 22
FP 377
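
As a cross-check on the cell above, sklearn lays a binary confusion matrix out as [[TN, FP], [FN, TP]], so the four counts can also be unpacked with ravel():

TN, FP, FN, TP = metrics.confusion_matrix(yTest, y_log_pred_test).ravel()
print(TN, FP, FN, TP)  # 2514 377 22 87
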
In [79]: cmap = sns.cubehelix_palette(50, hue=0.05, rot=0, light=0.9, dark=0, as_cmap=True)
sns.heatmap(conf, cmap=cmap, xticklabels=['predicted_default_yes=0', 'predicted_default_yes=1'])

Out[79]: <Axes: >

In [80]: # print the first 15 true and predicted responses
print('True', yTest.values[0:15])
print('Pred', y_log_pred_test[0:15])

True [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
Pred [0 0 0 0 0 0 1 0 1 0 1 1 1 1 1]

In [81]: # compare the predicted labels with the true labels of the test data
print('Accuracy_Score:', metrics.accuracy_score(yTest, y_log_pred_test))

Accuracy_Score: 0.867

In [82]: print('Classification Error:', 1 - metrics.accuracy_score(yTest, y_log_pred_test))

Classification Error: 0.133

In [83]: print('Sensitivity or Recall:', metrics.recall_score(yTest, y_log_pred_test))

Sensitivity or Recall: 0.7981651376146789


In [84]: specificity = TN / (TN + FP)
print(round(specificity, 4))

0.8696
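
Specificity is simply the recall of the negative class, so it can also be obtained directly from sklearn:

print(metrics.recall_score(yTest, y_log_pred_test, pos_label=0))  # ~0.8696, the class-0 recall in the report below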

In [85]: from sklearn.metrics import classification_report
print(classification_report(yTest, y_log_pred_test))

              precision    recall  f1-score   support

           0       0.99      0.87      0.93      2891
           1       0.19      0.80      0.30       109

    accuracy                           0.87      3000
   macro avg       0.59      0.83      0.62      3000
weighted avg       0.96      0.87      0.90      3000
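
With only 109 defaulters among the 3000 test rows, the 0.87 accuracy is driven almost entirely by the majority class; the macro-averaged recall of 0.83 is the fairer single-number summary. sklearn exposes exactly that quantity as balanced accuracy:

from sklearn.metrics import balanced_accuracy_score
print(balanced_accuracy_score(yTest, y_log_pred_test))  # mean of the two class recalls, ~0.83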

