# Project 3: Thera Bank Personal Loan Modelling
# LOADING LIBRARIES ----
library(readr)
library(readxl)
library(mice)          # added: mice()/complete() are called below but were never loaded
library(caTools)
library(rpart)
library(rpart.plot)
library(rattle)
library(randomForest)
library(ROCR)
library(ineq)
library(StatMeasures)
# LOADING DATA ----
# Read the Thera Bank sheet; read_excel() returns a tibble.
Customerdata <- read_excel("Thera Bank_Personal_Loan_Modelling-dataset-1.xlsx")
# Make the column names syntactically valid, e.g. "Age (in years)" becomes
# "Age..in.years." — all later code uses these sanitized names.
colnames(Customerdata) <- make.names(colnames(Customerdata))
# NOTE(review): removed attach(Customerdata). attach() is discouraged (it
# masks objects and the attached copies would carry the pre-make.names()
# column names anyway); all visible code accesses columns explicitly via
# Customerdata$..., so nothing depends on the attached frame.
# 2 Data overview ----
# First rows, five-number summaries, and structure of the raw data
# (the echoed tibble showed 14 columns, all read in as <dbl>).
head(Customerdata)
summary(Customerdata)
str(Customerdata)

# Convert the categorical/binary columns to factors in a single pass
# (replaces six separate as.factor() assignments with one lapply()).
factor_cols <- c("Education", "Personal.Loan", "Securities.Account",
                 "CD.Account", "Online", "CreditCard")
Customerdata[factor_cols] <- lapply(Customerdata[factor_cols], as.factor)
str(Customerdata)

# Drop column 1 (ID) and column 5 (ZIP.Code): identifiers, not predictors.
Customerdata <- Customerdata[, -c(1, 5)]
# Missing-value imputation ----
# An anyNA-style check evaluated TRUE here (the check itself is in a chunk
# not visible in this file): Family.members has missing values.
# Impute with mice (m = 3 completed datasets, fixed seed for
# reproducibility); the echoed log showed Family.members as the only
# imputed variable, via predictive mean matching ("pmm").
impute <- mice(Customerdata, m = 3, seed = 123)
print(impute)
# Use the first of the m = 3 completed datasets.
newdata <- complete(impute, 1)
# Confirm the imputation left no missing values (echoed FALSE).
any(is.na(newdata))
# Decision Tree ----
# Split the data into 70% train / 30% test.
# FIX: sample.split() expects the OUTCOME VECTOR so the split preserves the
# class ratio of Personal.Loan in both partitions; the original passed the
# whole data frame (sample.split(newdata, ...)), which recycles the split
# flags over the columns instead of stratifying on the response.
seed <- 2000
set.seed(seed)
sample <- sample.split(newdata$Personal.Loan, SplitRatio = 0.7)
train_data <- subset(newdata, sample == TRUE)
test_data  <- subset(newdata, sample == FALSE)
nrow(train_data)   # echoed 3333 under the old split; ~3500 when stratified
nrow(test_data)    # echoed 1667 under the old split; ~1500 when stratified
# (Echoed class counts on train: 3025 zeros / 308 ones.)
str(train_data)
# Inspect the full (unpruned, cp = 0) classification tree.
# NOTE(review): Model1 is printed and plotted here but its rpart() call is
# not in the visible code — the echoed call below shows it was
#   rpart(Personal.Loan ~ ., data = DT, method = "class",
#         cp = 0, minsplit = 100, minbucket = 10, xval = 10)
# and DT itself is also undefined in the visible code (presumably the
# training set) — confirm against the missing chunk.
Model1
## n= 3333
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3333 308 0 (0.907590759 0.092409241)
## 2) Income..in.K.month.< 113.5 2671 56 0 (0.979034070 0.020965930)
## 4) CCAvg< 2.95 2475 11 0 (0.995555556 0.004444444) *
## 5) CCAvg>=2.95 196 45 0 (0.770408163 0.229591837)
## 10) CD.Account=0 178 32 0 (0.820224719 0.179775281) *
## 11) CD.Account=1 18 5 1 (0.277777778 0.722222222) *
## 3) Income..in.K.month.>=113.5 662 252 0 (0.619335347 0.380664653)
## 6) Education=1 449 50 0 (0.888641425 0.111358575)
## 12) Family.members< 2.5 399 0 0 (1.000000000 0.000000000) *
## 13) Family.members>=2.5 50 0 1 (0.000000000 1.000000000) *
## 7) Education=2,3 213 11 1 (0.051643192 0.948356808)
## 14) Income..in.K.month.< 116.5 18 7 0 (0.611111111 0.388888889) *
## 15) Income..in.K.month.>=116.5 195 0 1 (0.000000000 1.000000000) *
##
## Classification tree:
## rpart(formula = Personal.Loan ~ ., data = DT, method = "class",
## cp = 0, minsplit = 100, minbucket = 10, xval = 10)
##
## Variables actually used in tree construction:
## [1] CCAvg CD.Account Education
## [4] Family.members Income..in.K.month.
##
## Root node error: 308/3333 = 0.092409
##
## n= 3333
##
## CP nsplit rel error xerror xstd
## 1 0.310065 0 1.00000 1.00000 0.054284
## 2 0.162338 2 0.37987 0.48052 0.038612
## 3 0.012987 3 0.21753 0.24351 0.027800
## 4 0.000000 6 0.17857 0.22727 0.026878
# Complexity-parameter (cross-validated error) plot, used to pick the cp
# value for pruning in the next step.
plotcp(Model1)
# Pruned tree: refit with cp = 0.04 (chosen from the plotcp() output above)
# and a larger minbucket (100 vs 10 in Model1).
# NOTE(review): data = DT is undefined in the visible code — presumably the
# training set; confirm against the missing chunk.
Pmodel=rpart(formula = Personal.Loan~.,data=DT,method =
"class",cp=0.04,minsplit=100,minbucket=100,xval=10)
# Visualize the pruned tree (rattle's annotated plot).
fancyRpartPlot(Pmodel)
Pmodel
## n= 3333
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3333 308 0 (0.90759076 0.09240924)
## 2) Income..in.K.month.< 113.5 2671 56 0 (0.97903407 0.02096593) *
## 3) Income..in.K.month.>=113.5 662 252 0 (0.61933535 0.38066465)
## 6) Education=1 449 50 0 (0.88864143 0.11135857) *
## 7) Education=2,3 213 11 1 (0.05164319 0.94835681) *
## [1] 0.979898
# Keep a copy of the pruned model (Z is not used in the visible code —
# presumably referenced in a later, missing chunk).
Z=Pmodel
# Model-performance figures computed from confusion matrices.
# NOTE(review): tbl and tbl1 (the confusion-matrix tables), DT$probability,
# and auc are all built in chunks not visible in this file — confirm them.
# The test-set matrix echoed here:
## prediction.test
## Actual.test 0 1
## 0 1489 6
## 1 47 125
# NOTE(review): the hard-coded 144 below does not match the 125 true
# positives printed above — verify against the chunk that built tbl1. Prefer
# sum(diag(tbl1))/sum(tbl1) over hand-typed cell values.
(1489+144)/sum(tbl1)
## [1] 0.9796041
# Training-set confusion matrix:
## prediction
## Actual 0 1
## 0 3014 11
## 1 106 202
#1)Accuracy Of Model
# NOTE(review): 252 and 56 below do not match the 202/106 cells printed
# above — verify these hand-typed figures against tbl.
(3014+252)/sum(tbl)
## [1] 0.979898
#2classification error
(11+56)/sum(tbl)
## [1] 0.02010201
#3)sensitivity(True Positive Rate)
252/(252+56)
## [1] 0.8181818
## [1] 0.9963636
# Training-set ROC curve from the class-1 predicted probabilities.
pobj=prediction(DT$probability[,2],DT$Personal.Loan)
perf <- performance(pobj, "tpr", "fpr")
plot(perf,main = "ROC curve")
## [1] -Inf
# NOTE(review): auc is computed in a missing chunk (echoed 0.8842803).
auc
## [1] 0.8842803
# Gini coefficient of the predicted probabilities (inequality measure).
gini=ineq(DT$probability,"gini")
print(gini)
## [1] 0.4767402
# Test-set evaluation.
# NOTE(review): tbl1 (test confusion matrix) and DS (test data with
# predicted probabilities, DS$probability1) are built in chunks not visible
# in this file — confirm them. Echoed matrix:
## prediction.test
## Actual.test 0 1
## 0 1489 6
## 1 47 125
#1)Accuracy Of Model
# FIX: removed the duplicated accuracy line and the "1489++144" typo (the
# stray unary "+" evaluated to the same value but was clearly a slip).
# NOTE(review): 144 does not match the 125 true positives printed above, and
# the error cells (6+28) do not match the 6/47 off-diagonal counts — verify
# these hand-typed figures; sum(diag(tbl1))/sum(tbl1) would be safer.
(1489+144)/sum(tbl1)
## [1] 0.9796041
#2classification error
(6+28)/sum(tbl1)
## [1] 0.02039592
## [1] 0.8372093
## [1] 0.9959866
# Test-set ROC curve from the class-1 predicted probabilities.
pobj1 <- prediction(DS$probability1[,2], DS$Personal.Loan)
perf1 <- performance(pobj1, "tpr", "fpr")
plot(perf1, main = "ROC curve")
# Random forest ----
# NOTE(review): RF is undefined in the visible code — presumably the
# training data; confirm against the missing chunk.
# Baseline event rate: share of Personal.Loan == 1 (echoed 0.09240924).
print(sum(RF$Personal.Loan == 1) / nrow(RF))
# FIX: both randomForest() calls below were split mid-word by a copy/paste
# line wrap ("...,i" / "mportance=TRUE)"), which is a syntax error; the
# calls are rejoined with importance = TRUE intact.
rndforest <- randomForest(Personal.Loan ~ ., data = RF, ntree = 501,
                          mtry = 3, nodesize = 10, importance = TRUE)
rndforest
# (Echoed OOB error: 1.56%; class errors 0.0023 / 0.1461.)
# Per-tree cumulative OOB/class error rates, then the error-vs-trees plot.
print(rndforest$err.rate)
plot(rndforest)
# Refit with fewer trees — the error curve stabilizes well before 501.
rndforest <- randomForest(Personal.Loan ~ ., data = RF, ntree = 101,
                          mtry = 3, nodesize = 10, importance = TRUE)
print(rndforest$err.rate)
plot(rndforest)
# TUNING ----
# Search for the best mtry around mtryStart using OOB error.
# FIX: x must contain PREDICTORS ONLY — the original passed the whole frame
# (x = RF), leaking the response Personal.Loan into the predictor matrix.
# NOTE(review): stepFactor = 1 never changes mtry, so no search actually
# happens — a value > 1 (e.g. 1.5) is needed for tuneRF to explore; confirm
# the intended setting.
set.seed(seed)
X <- tuneRF(x = RF[, setdiff(names(RF), "Personal.Loan")],
            y = RF$Personal.Loan,
            mtryStart = 3, stepFactor = 1, ntreeTry = 500,
            trace = TRUE, plot = TRUE, doBest = TRUE,
            nodesize = 5, importance = TRUE)