0% found this document useful (0 votes)
65 views17 pages

Project3: Loading Library

1. The document loads libraries and data to analyze a personal loan dataset with 5000 customers and 14 variables. 2. Exploratory data analysis is performed including checking for missing data, outliers, and negative values. The Experience variable is made positive. 3. The data is imputed for missing values in the Family variable and categorical variables are converted to factors.

Uploaded by

Shreya Garg
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
65 views17 pages

Project3: Loading Library

1. The document loads libraries and data to analyze a personal loan dataset with 5000 customers and 14 variables. 2. Exploratory data analysis is performed including checking for missing data, outliers, and negative values. The Experience variable is made positive. 3. The data is imputed for missing values in the Family variable and categorical variables are converted to factors.

Uploaded by

Shreya Garg
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 17

project3

#LOADING LIBRARY
library(readr)
library(readxl)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)
library(ROCR)
library(ineq)
library(StatMeasures)
library(rattle)

# LOADING DATA
# Read the Thera Bank personal-loan workbook into a tibble.
# attach() was removed: the steps below always reach columns through
# `Customerdata$...`, and an attached copy would keep the pre-rename column
# names and go stale as soon as the data frame is modified.
Customerdata <- read_excel("Thera Bank_Personal_Loan_Modelling-dataset-1.xlsx")

#Exploratory Data Analysis


#1 coloum name treatment
names(Customerdata)

## [1] "ID" "Age (in years)"


## [3] "Experience (in years)" "Income (in K/month)"
## [5] "ZIP Code" "Family members"
## [7] "CCAvg" "Education"
## [9] "Mortgage" "Personal Loan"
## [11] "Securities Account" "CD Account"
## [13] "Online" "CreditCard"

# Turn the spreadsheet headers into syntactically valid R names
# (e.g. "ZIP Code" becomes "ZIP.Code").
names(Customerdata) <- make.names(names(Customerdata))

#2 Data overview
head(Customerdata)

## # A tibble: 6 x 14
## ID Age..in.years. Experience..in.~ Income..in.K.mo~ ZIP.Code
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 25 1 49 91107
## 2 2 45 19 34 90089
## 3 3 39 15 11 94720
## 4 4 35 9 100 94112
## 5 5 35 8 45 91330
## 6 6 37 13 29 92121
## # ... with 9 more variables: Family.members <dbl>, CCAvg <dbl>,
## # Education <dbl>, Mortgage <dbl>, Personal.Loan <dbl>,
## # Securities.Account <dbl>, CD.Account <dbl>, Online <dbl>,
## # CreditCard <dbl>

summary(Customerdata)

## ID Age..in.years. Experience..in.years. Income..in.K.month.


## Min. : 1 Min. :23.00 Min. :-3.0 Min. : 8.00
## 1st Qu.:1251 1st Qu.:35.00 1st Qu.:10.0 1st Qu.: 39.00
## Median :2500 Median :45.00 Median :20.0 Median : 64.00
## Mean :2500 Mean :45.34 Mean :20.1 Mean : 73.77
## 3rd Qu.:3750 3rd Qu.:55.00 3rd Qu.:30.0 3rd Qu.: 98.00
## Max. :5000 Max. :67.00 Max. :43.0 Max. :224.00
##
## ZIP.Code Family.members CCAvg Education
## Min. : 9307 Min. :1.000 Min. : 0.000 Min. :1.000
## 1st Qu.:91911 1st Qu.:1.000 1st Qu.: 0.700 1st Qu.:1.000
## Median :93437 Median :2.000 Median : 1.500 Median :2.000
## Mean :93153 Mean :2.397 Mean : 1.938 Mean :1.881
## 3rd Qu.:94608 3rd Qu.:3.000 3rd Qu.: 2.500 3rd Qu.:3.000
## Max. :96651 Max. :4.000 Max. :10.000 Max. :3.000
## NA's :18
## Mortgage Personal.Loan Securities.Account CD.Account
## Min. : 0.0 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.0 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 0.0 Median :0.000 Median :0.0000 Median :0.0000
## Mean : 56.5 Mean :0.096 Mean :0.1044 Mean :0.0604
## 3rd Qu.:101.0 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :635.0 Max. :1.000 Max. :1.0000 Max. :1.0000
##
## Online CreditCard
## Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.000
## Median :1.0000 Median :0.000
## Mean :0.5968 Mean :0.294
## 3rd Qu.:1.0000 3rd Qu.:1.000
## Max. :1.0000 Max. :1.000
##

str(Customerdata)

## Classes 'tbl_df', 'tbl' and 'data.frame': 5000 obs. of 14 variables:


## $ ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Age..in.years. : num 25 45 39 35 35 37 53 50 35 34 ...
## $ Experience..in.years.: num 1 19 15 9 8 13 27 24 10 9 ...
## $ Income..in.K.month. : num 49 34 11 100 45 29 72 22 81 180 ...
## $ ZIP.Code : num 91107 90089 94720 94112 91330 ...
## $ Family.members : num 4 3 1 1 4 4 2 1 3 1 ...
## $ CCAvg : num 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
## $ Education : num 1 1 1 2 2 2 2 3 2 3 ...
## $ Mortgage : num 0 0 0 0 0 155 0 0 104 0 ...
## $ Personal.Loan : num 0 0 0 0 0 0 0 0 0 1 ...
## $ Securities.Account : num 1 1 0 0 0 0 0 0 0 0 ...
## $ CD.Account : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Online : num 0 0 0 0 0 1 1 0 1 0 ...
## $ CreditCard : num 0 0 0 0 1 0 0 1 0 0 ...

# Recode the categorical / 0-1 indicator columns as factors so that the
# classification models below treat them as classes, not numbers.
factor_cols <- c(
  "Education", "Personal.Loan", "Securities.Account",
  "CD.Account", "Online", "CreditCard"
)
Customerdata[factor_cols] <- lapply(Customerdata[factor_cols], as.factor)

str(Customerdata)

## Classes 'tbl_df', 'tbl' and 'data.frame': 5000 obs. of 14 variables:


## $ ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Age..in.years. : num 25 45 39 35 35 37 53 50 35 34 ...
## $ Experience..in.years.: num 1 19 15 9 8 13 27 24 10 9 ...
## $ Income..in.K.month. : num 49 34 11 100 45 29 72 22 81 180 ...
## $ ZIP.Code : num 91107 90089 94720 94112 91330 ...
## $ Family.members : num 4 3 1 1 4 4 2 1 3 1 ...
## $ CCAvg : num 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
## $ Education : Factor w/ 3 levels "1","2","3": 1 1 1 2 2 2 2 3
2 3 ...
## $ Mortgage : num 0 0 0 0 0 155 0 0 104 0 ...
## $ Personal.Loan : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1
2 ...
## $ Securities.Account : Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 1
1 ...
## $ CD.Account : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1
1 ...
## $ Online : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 2 1 2
1 ...
## $ CreditCard : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 2 1
1 ...

# Drop the two identifier-like columns (column 1 = ID, column 5 = ZIP.Code);
# selecting by name keeps the remaining columns in their original order.
Customerdata <- Customerdata[, setdiff(names(Customerdata), c("ID", "ZIP.Code"))]

# 4 Checking the data set for missing values
library(mice)
library(VIM)
# TRUE when at least one cell anywhere in the table is NA.
any(is.na(Customerdata))

## [1] TRUE

# Multiple imputation with mice: build m = 3 imputed data sets, with a fixed
# seed so the imputed values can be reproduced.
impute <- mice(Customerdata, m = 3, seed = 123)

##
## iter imp variable
## 1 1 Family.members
## 1 2 Family.members
## 1 3 Family.members
## 2 1 Family.members
## 2 2 Family.members
## 2 3 Family.members
## 3 1 Family.members
## 3 2 Family.members
## 3 3 Family.members
## 4 1 Family.members
## 4 2 Family.members
## 4 3 Family.members
## 5 1 Family.members
## 5 2 Family.members
## 5 3 Family.members

print(impute)

## Class: mids
## Number of multiple imputations: 3
## Imputation methods:
## Age..in.years. Experience..in.years. Income..in.K.month.
## "" "" ""
## Family.members CCAvg Education
## "pmm" "" ""
## Mortgage Personal.Loan Securities.Account
## "" "" ""
## CD.Account Online CreditCard
## "" "" ""
## PredictorMatrix:
## Age..in.years. Experience..in.years.
## Age..in.years. 0 1
## Experience..in.years. 1 0
## Income..in.K.month. 1 1
## Family.members 1 1
## CCAvg 1 1
## Education 1 1
## Income..in.K.month. Family.members CCAvg Education
## Age..in.years. 1 1 1 1
## Experience..in.years. 1 1 1 1
## Income..in.K.month. 0 1 1 1
## Family.members 1 0 1 1
## CCAvg 1 1 0 1
## Education 1 1 1 0
## Mortgage Personal.Loan Securities.Account CD.Account
## Age..in.years. 1 1 1 1
## Experience..in.years. 1 1 1 1
## Income..in.K.month. 1 1 1 1
## Family.members 1 1 1 1
## CCAvg 1 1 1 1
## Education 1 1 1 1
## Online CreditCard
## Age..in.years. 1 1
## Experience..in.years. 1 1
## Income..in.K.month. 1 1
## Family.members 1 1
## CCAvg 1 1
## Education 1 1

# Use the first completed data set as the working table, then confirm that
# no missing values remain.
newdata <- complete(impute, action = 1)
any(is.na(newdata))

## [1] FALSE

#3checking for outliers


boxplot(newdata)

# 5 Negative value treatment
# Experience has a recorded minimum of -3 years; replace every value with its
# magnitude so that no negative experience remains, then re-check the ranges.
newdata[["Experience..in.years."]] <- abs(newdata[["Experience..in.years."]])
summary(newdata)

## Age..in.years. Experience..in.years. Income..in.K.month. Family.members


## Min. :23.00 Min. : 0.00 Min. : 8.00 Min. :1.000
## 1st Qu.:35.00 1st Qu.:10.00 1st Qu.: 39.00 1st Qu.:1.000
## Median :45.00 Median :20.00 Median : 64.00 Median :2.000
## Mean :45.34 Mean :20.13 Mean : 73.77 Mean :2.396
## 3rd Qu.:55.00 3rd Qu.:30.00 3rd Qu.: 98.00 3rd Qu.:3.000
## Max. :67.00 Max. :43.00 Max. :224.00 Max. :4.000
## CCAvg Education Mortgage Personal.Loan
## Min. : 0.000 1:2096 Min. : 0.0 0:4520
## 1st Qu.: 0.700 2:1403 1st Qu.: 0.0 1: 480
## Median : 1.500 3:1501 Median : 0.0
## Mean : 1.938 Mean : 56.5
## 3rd Qu.: 2.500 3rd Qu.:101.0
## Max. :10.000 Max. :635.0
## Securities.Account CD.Account Online CreditCard
## 0:4478 0:4698 0:2016 0:3530
## 1: 522 1: 302 1:2984 1:1470
##
##
##
##

# Decision Tree.
# Splitting data into train and test data.
seed <- 2000
set.seed(seed)
# sample.split() expects the vector of outcome labels, not the data frame.
# Given the data frame it produced one flag per COLUMN (12 flags), which
# subset() then recycled down the rows -- consistent with the recorded
# 3333 / 1667 row counts (8/12 of 5000) rather than a 70/30 split.
# Passing the response gives one flag per row and keeps the share of loan
# acceptors the same in both parts.
# NOTE: outputs recorded further down were produced with the earlier split.
sample <- sample.split(newdata$Personal.Loan, SplitRatio = 0.7)
train_data <- subset(newdata, sample == TRUE)
test_data <- subset(newdata, sample == FALSE)
nrow(train_data)

## [1] 3333

nrow(test_data)

## [1] 1667

#Checking Response variable


table(train_data$Personal.Loan)

##
## 0 1
## 3025 308

str(train_data)

## 'data.frame': 3333 obs. of 12 variables:


## $ Age..in.years. : num 25 45 39 35 37 34 65 29 48 59 ...
## $ Experience..in.years.: num 1 19 15 9 13 9 39 5 23 32 ...
## $ Income..in.K.month. : num 49 34 11 100 29 180 105 45 114 40 ...
## $ Family.members : num 4 3 1 1 4 1 4 3 2 4 ...
## $ CCAvg : num 1.6 1.5 1 2.7 0.4 8.9 2.4 0.1 3.8 2.5 ...
## $ Education : Factor w/ 3 levels "1","2","3": 1 1 1 2 2 3 3 2
3 2 ...
## $ Mortgage : num 0 0 0 0 155 0 0 0 0 0 ...
## $ Personal.Loan : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1
1 ...
## $ Securities.Account : Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 2
1 ...
## $ CD.Account : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1
1 ...
## $ Online : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 2 1
2 ...
## $ CreditCard : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1
1 ...

# Separate working copies per model (prediction columns are later added to
# DT and DS only).
DT <- train_data  # decision tree: training data
RF <- train_data  # random forest: training data
DS <- test_data   # decision tree: test data
RS <- test_data   # random forest: test data

# Building the CART model
# Classification tree on all predictors with cp = 0, so no split is rejected
# on complexity grounds; only minsplit / minbucket limit the growth.
# xval = 10 requests 10 cross-validations for the complexity table.
Model1 <- rpart(
  formula = Personal.Loan ~ .,
  data = DT,
  method = "class",
  cp = 0,
  minsplit = 100,
  minbucket = 10,
  xval = 10
)
fancyRpartPlot(Model1)

print(Model1)
## n= 3333
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3333 308 0 (0.907590759 0.092409241)
## 2) Income..in.K.month.< 113.5 2671 56 0 (0.979034070 0.020965930)
## 4) CCAvg< 2.95 2475 11 0 (0.995555556 0.004444444) *
## 5) CCAvg>=2.95 196 45 0 (0.770408163 0.229591837)
## 10) CD.Account=0 178 32 0 (0.820224719 0.179775281) *
## 11) CD.Account=1 18 5 1 (0.277777778 0.722222222) *
## 3) Income..in.K.month.>=113.5 662 252 0 (0.619335347 0.380664653)
## 6) Education=1 449 50 0 (0.888641425 0.111358575)
## 12) Family.members< 2.5 399 0 0 (1.000000000 0.000000000) *
## 13) Family.members>=2.5 50 0 1 (0.000000000 1.000000000) *
## 7) Education=2,3 213 11 1 (0.051643192 0.948356808)
## 14) Income..in.K.month.< 116.5 18 7 0 (0.611111111 0.388888889) *
## 15) Income..in.K.month.>=116.5 195 0 1 (0.000000000 1.000000000) *

# Complexity parameter (cp) table for Model1: error by number of splits


printcp(Model1)

##
## Classification tree:
## rpart(formula = Personal.Loan ~ ., data = DT, method = "class",
## cp = 0, minsplit = 100, minbucket = 10, xval = 10)
##
## Variables actually used in tree construction:
## [1] CCAvg CD.Account Education
## [4] Family.members Income..in.K.month.
##
## Root node error: 308/3333 = 0.092409
##
## n= 3333
##
## CP nsplit rel error xerror xstd
## 1 0.310065 0 1.00000 1.00000 0.054284
## 2 0.162338 2 0.37987 0.48052 0.038612
## 3 0.012987 3 0.21753 0.24351 0.027800
## 4 0.000000 6 0.17857 0.22727 0.026878

# Plot cross-validated error against cp, then fit the pruned tree with a
# stricter complexity threshold (cp = 0.04).  Note that minbucket is 100
# here, whereas Model1 used 10.
plotcp(Model1)
Pmodel <- rpart(
  formula = Personal.Loan ~ .,
  data = DT,
  method = "class",
  cp = 0.04,
  minsplit = 100,
  minbucket = 100,
  xval = 10
)
fancyRpartPlot(Pmodel)
print(Pmodel)

## n= 3333
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3333 308 0 (0.90759076 0.09240924)
## 2) Income..in.K.month.< 113.5 2671 56 0 (0.97903407 0.02096593) *
## 3) Income..in.K.month.>=113.5 662 252 0 (0.61933535 0.38066465)
## 6) Education=1 449 50 0 (0.88864143 0.11135857) *
## 7) Education=2,3 213 11 1 (0.05164319 0.94835681) *

#Cart Model output Explanation


# The root node shows that about 91% of the training customers did not accept the
# personal loan (class 0) and about 9% accepted it (class 1).
# Monthly income is the first variable the decision tree splits on, hence it is the
# most important variable for building a strategy.
# The largest acceptance segment holds about 6% of the training customers (213 of 3333):
# Monthly Income >= 113.5 and Education = 2 or 3, where about 95% accepted the personal loan.
# A second segment of about 2% (50 of 3333) appears in the unpruned tree (Model1):
# Monthly Income >= 113.5, Education = 1 and Family.members >= 2.5, where every customer
# accepted the personal loan.
# Prediction on the training data
# predict() for rpart models takes `newdata`; the earlier `data = DT` matched
# no argument and was ignored, so the call only worked because fitted values
# for the training frame are returned when `newdata` is missing.
DT$Prediction <- predict(Pmodel, newdata = DT, type = "class")
DT$probability <- predict(Pmodel, newdata = DT, type = "prob")
tbl <- table(Actual = DT$Personal.Loan, prediction = DT$Prediction)
# Accuracy = correctly classified cases (diagonal) / all cases, taken from
# the table itself instead of hand-typed counts (the earlier 252 is not a
# cell of this table; the true-positive count is 202).
sum(diag(tbl)) / sum(tbl)

## [1] 0.9648965

Z <- Pmodel

# Accuracy of the above model on the training data is about 96.5 %
# ((3014 + 202) / 3333).


# PREDICTION USING THE SAME MODEL ON THE TEST DATA.
DS$Predict.class <- predict(Pmodel, newdata = DS, type = "class")
tbl1 <- table(
  Actual.test = DS$Personal.Loan,
  prediction.test = DS$Predict.class
)
tbl1

## prediction.test
## Actual.test 0 1
## 0 1489 6
## 1 47 125

# Accuracy from the table's diagonal; the earlier hand-typed 144 does not
# appear in the table (the true-positive count is 125).
sum(diag(tbl1)) / sum(tbl1)

## [1] 0.9682064

# Decision Tree Model Performance and Validation - Train Data.

# Confusion matrix (rows = actual class, columns = predicted class)
DT$Prediction <- predict(Pmodel, newdata = DT, type = "class")
tbl <- table(Actual = DT$Personal.Loan, prediction = DT$Prediction)
tbl

## prediction
## Actual 0 1
## 0 3014 11
## 1 106 202

# All metrics below are read from `tbl` rather than typed in by hand; the
# earlier hand-typed counts (252 and 56) are not cells of this table.

# 1) Accuracy of the model: correct predictions / all cases
sum(diag(tbl)) / sum(tbl)

## [1] 0.9648965

# 2) Classification error: wrong predictions / all cases
1 - sum(diag(tbl)) / sum(tbl)

## [1] 0.03510351

# 3) Sensitivity (true positive rate): predicted 1 among actual 1
tbl["1", "1"] / sum(tbl["1", ])

## [1] 0.6558442

# 4) Specificity (true negative rate): predicted 0 among actual 0
tbl["0", "0"] / sum(tbl["0", ])

## [1] 0.9963636

# ROC curve, KS statistic and AUC of the pruned tree on the training data.
pobj <- prediction(DT$probability[, 2], DT$Personal.Loan)
perf <- performance(pobj, "tpr", "fpr")
plot(perf, main = "ROC curve")

# KS = largest gap between true- and false-positive rate along the curve.
# Both rates are stored in `perf`; the earlier expression read y.values from
# `pobj`, which has no such slot, so max() ran on an empty vector (this is
# the -Inf recorded below).
KS <- max(perf@y.values[[1]] - perf@x.values[[1]])

# performance(..., "auc") returns the area in its y.values slot.
auc <- performance(pobj, "auc")
auc <- as.numeric(auc@y.values)
print(KS)

## [1] -Inf

auc

## [1] 0.8842803
# Gini coefficient of the predicted acceptance probabilities (class "1").
# The earlier call passed the whole two-column probability matrix, mixing
# class-0 and class-1 probabilities, and gave "gini" positionally, where it
# fills the second argument rather than `type`.
# NOTE: the value recorded below was produced by the earlier call.
gini <- ineq(DT$probability[, 2], type = "Gini")
print(gini)

## [1] 0.4767402

# Decision Tree Model Performance and Validation - Test Data.

# Confusion matrix (rows = actual class, columns = predicted class)
DS$Predict.class <- predict(Pmodel, newdata = DS, type = "class")
DS$probability1 <- predict(Pmodel, newdata = DS, type = "prob")
tbl1 <- table(
  Actual.test = DS$Personal.Loan,
  prediction.test = DS$Predict.class
)
tbl1

## prediction.test
## Actual.test 0 1
## 0 1489 6
## 1 47 125

# All metrics below are read from `tbl1` rather than typed in by hand; the
# earlier hand-typed counts (144 and 28) are not cells of this table.  The
# accuracy, previously printed twice, is computed once.

# 1) Accuracy of the model: correct predictions / all cases
sum(diag(tbl1)) / sum(tbl1)

## [1] 0.9682064

# 2) Classification error: wrong predictions / all cases
1 - sum(diag(tbl1)) / sum(tbl1)

## [1] 0.03179364

# 3) Sensitivity (true positive rate): predicted 1 among actual 1
tbl1["1", "1"] / sum(tbl1["1", ])

## [1] 0.7267442

# 4) Specificity (true negative rate): predicted 0 among actual 0
tbl1["0", "0"] / sum(tbl1["0", ])

## [1] 0.9959866

# ROC curve of the pruned tree on the test data.
pobj1 <- prediction(DS$probability1[, 2], DS$Personal.Loan)
perf1 <- performance(pobj1, measure = "tpr", x.measure = "fpr")
plot(perf1, main = "ROC curve")
# Random forest
# Share of loan acceptors (class "1") in the random-forest training data.
print(sum(RF$Personal.Loan == 1) / nrow(RF))

## [1] 0.09240924

# Random forest with 501 trees, 3 candidate variables per split and terminal
# nodes of at least 10 cases.  The argument name `importance` had been split
# across two lines ("i" / "mportance"), which does not parse.
# The seed is fixed so that the forest can be reproduced.
set.seed(seed)
rndforest <- randomForest(
  Personal.Loan ~ .,
  data = RF,
  ntree = 501,
  mtry = 3,
  nodesize = 10,
  importance = TRUE
)
rndforest

##
## Call:
## randomForest(formula = Personal.Loan ~ ., data = RF, ntree = 501,
mtry = 3, nodesize = 10, importance = TRUE)
## Type of random forest: classification
## Number of trees: 501
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 1.56%
## Confusion matrix:
## 0 1 class.error
## 0 3018 7 0.00231405
## 1 45 263 0.14610390

print(rndforest$err.rate)
## OOB 0 1
## [1,] 0.02977667 0.020720721 0.1313131
## [2,] 0.02933738 0.015512465 0.1744186
## [3,] 0.03137570 0.017621145 0.1759259
## [4,] 0.03066378 0.015013829 0.1950207
## [5,] 0.02920443 0.014380531 0.1797753
## [6,] 0.02917602 0.012707377 0.1923077
## [7,] 0.02753442 0.012064805 0.1796610
## [8,] 0.02718567 0.011576439 0.1800000
## [9,] 0.02687023 0.010781671 0.1824104
## [10,] 0.02363636 0.008018710 0.1758958

# Error rate against the number of trees for the forest fitted above.
plot(rndforest)

# Refit with 101 trees.  The argument name `importance` had been split
# across two lines ("i" / "mportance"), which does not parse.
# The seed is fixed so that the forest can be reproduced.
set.seed(seed)
rndforest <- randomForest(
  Personal.Loan ~ .,
  data = RF,
  ntree = 101,
  mtry = 3,
  nodesize = 10,
  importance = TRUE
)
print(rndforest$err.rate)

## OOB 0 1
## [1,] 0.04358553 0.019090909 0.2758621
## [2,] 0.03241895 0.012700166 0.2164948
## [3,] 0.02891566 0.011968085 0.1923077
## [4,] 0.03797922 0.014624506 0.2643678
## [5,] 0.03268846 0.011037528 0.2428571
## [6,] 0.03160920 0.009513742 0.2448980
## [7,] 0.03031250 0.010344828 0.2233333
## [8,] 0.02709360 0.010522743 0.1887417
## [9,] 0.02440513 0.008739496 0.1782178
## [10,] 0.02605271 0.008344459 0.2000000

plot(rndforest)

# TUNING
set.seed(seed)
# The predictor table must not contain the response: with Personal.Loan left
# inside `x` the forest reads the answer straight back (the "OOB error = 0%"
# recorded below).  stepFactor must be greater than 1 for mtry to change
# between steps; at 1 the search never leaves the starting value.
X <- tuneRF(
  x = RF[, names(RF) != "Personal.Loan"],
  y = RF$Personal.Loan,
  mtryStart = 3,
  stepFactor = 1.5,
  ntreeTry = 500,
  trace = TRUE,
  plot = TRUE,
  doBest = TRUE,
  nodesize = 5,
  importance = TRUE
)

## mtry = 3 OOB error = 0%


## Searching left ...
## Searching right ...

You might also like