Ds
Ds
install.packages("dplyr") head(data4)
"Data Manipulation in R" dim(data4)
library(dplyr) names(data4)
student<-read.csv("students.csv",sep = ",",header = T) modeldata<-lm(index~written+language+tech+gk,data=data4)
View(student) …str(student) …dim(student) summary(modeldata)
filter(student,Class=="TYCS") data4$pred<-predict(modeldata,data4)
filter(student,Marks>500) head(data4)
filter(student,Class=="TYCS",Marks>500) modeldata$residuals
filter(student,Class=="TYCS" & Marks>500) Aim : Practical of Logistic Regression Algorithm
filter(student,Class=="TYCS" | Marks>500) library("MASS") …data("biopsy") …View(biopsy) …str(biopsy) …
filter(student,Class=="TYCS" | Marks<=500) names(biopsy) …
select(student,Name,Class) summary(biopsy) ….colSums((is.na(biopsy)))
data1<-filter(student,Class=="TYCS" | Marks<=500) biopsy1=na.omit(biopsy) …
select(data1,Name,Class,Marks) colSums((is.na(biopsy1))) …
arrange(student,Name) biopsy$ID=NULL …boxplot(biopsy)
arrange(student,Marks) fit<-glm(class~.,family = binomial,data = biopsy1)
arrange(student,desc(Marks)) summary(fit)
data2<-arrange(student,desc(Marks)) biopsy1$prob<-predict(fit,type = "response")
filter(data2,Class=="TYCS") View(biopsy1)
mutate(student,Perc=Marks/10) biopsy1$predict=rep("benign",683)
summarize(student,n()) biopsy1$predict[biopsy1$prob>0.99]="malignant"
summarise(student,max(Marks)) View(biopsy1)
summarise(student,IQR(Marks)) table(biopsy1$predict,biopsy1$class)
summarise(student,mean(Marks)) mean(biopsy1$predict==biopsy1$class)
summarise(student,sum(Marks)) fit2<-glm(class~V1+V4+V6+V7,family = binomial,data=biopsy1)
summarise(student,sd(Marks)) summary(fit2)
grp=group_by(student,Class) biopsy1$prob=predict(fit2,type = "response")
summarise(grp,mean(Marks)) View(biopsy1)
summarise(grp,min(Marks)) biopsy1$predict=rep("benign",683)
student%>%filter(Class=="TYCS")%>%select(Name) biopsy1$predict[biopsy1$prob>0.5]="malignant"
student%>%filter(Class=="TYCS")%>%arrange(Marks) View(biopsy1)
student%>%filter(Class=="TYCS")%>%arrange(desc(Marks)) table(biopsy1$predict,biopsy1$class)
student%>%filter(Class=="TYCS")%>%summarise(n()) mean(biopsy1$predict==biopsy1$class)
count(student,Class)
summarise(grp,n()) Practical of Decision Tree
student%>%group_by(Class)%>%summarise(n()) Regression tree
# Plots data<-read.csv("Hitters.csv",sep = ",",header = T)
hist(student$Marks,xlab="Student Marks",main = "Histogram of View(data)...str(data) …summary(data)... names(data) …library(rpart)
Student Marks") regtree<-rpart(Salary~Hits+Runs+Years,data=data)
barplot(student$Marks,xlab="Student Marks",main = "Barplot of regtree …plot(regtree) …plot(regtree)... text(regtree)
Student Marks") install.packages("rpart.plot") library(rpart.plot) …rpart.plot(regtree) …
boxplot(student$Marks) View(regtree) …
data<-read.csv(file.choose(),sep = ",",header = T) # cp: complexity parameter
plot(data$Year,data$Rainfall,type="l",col="red",lwd=3) regtree$cptable …cp=min(regtree$cptable[5,]) …
data<-read.csv(file.choose(),sep = ",",header = T) pr=prune(regtree,cp=cp) …rpart.plot(pr) …
plot(data$Year,data$Rainfall,type="l",col="red",lwd=3) # Classification Tree
plot(data$Year,data$Population,type="l",lty="dotted",col="blue",lwd=3 library("MASS") …data("biopsy") …View(biopsy) …str(biopsy) …
) …….data("mtcars") names(biopsy) …summary(biopsy) …biopsy$ID=NULL
install.packages("corrplot") classtree<-rpart(class~.,data=biopsy)
library(corrplot) rpart.plot(classtree)
M<-cor(mtcars) biopsy$pred=predict(classtree,biopsy,type = "class")
corrplot(M,method = "ellipse") table(biopsy$pred,biopsy$class)
corrplot(M,method = "ellipse",col = "red") install.packages("titanic")
data("iris") library("titanic")
plot(iris$Petal.Length,iris$Petal.Width,col=iris$Species,pch=15) data("titanic_train")
str(titanic_train)
Practical of Simple/Multiple Linear Regression View(titanic_train)
#simple linear regression titanic_train$Name=NULL
data3<-read.csv("studweight.csv",sep = ",",header = T) titanictree<-rpart(Survived~Pclass+Age+Parch,data = titanic_train)
summary(data3) rpart.plot(titanictree)
str(data3)
fit<-lm(Weight~Height,data=data3) 'Classification tree'
summary(fit) golf<-read.csv("Golf.csv",sep = ',',header = T)
"Height is very significant in determining the weight" View(golf)...str(golf) …names(golf) …library("rpart") …
plot(data3$Height,data3$Weight) install.packages("rpart.plot") …library("rpart.plot") …
abline(fit,lwd=3,col="blue") tree<-rpart(Play~.,data=golf,control = rpart.control(minsplit =
#multiple linear regression 1,minbucket = 1,cp=0)) …rpart.plot(tree)
data4<-read.csv("emp_index.csv",sep = ",",header = T)
summary(data4)
Practical of Hypothesis Testing #one sample t-test dim(newdata)
data<-read.csv("onesample.csv",sep = ",",header = T) # Hierarchical clustering on IRIS dataset
View(data) …str(data) …summary(data) …boxplot(data) # dist function is used to compute the distance matrix
t.test(data$Time,mu=80,alternative="greater") # i.e. Euclidean distance between every pair of observations
#two sample t-test clust<-hclust(dist(iris[,3:4]))
my_data<-read.csv("twosample.csv",sep = ",",header = T) plot(clust)
View(my_data) …str(my_data) … clusterCut<-cutree(clust,3)
summary(my_data) …boxplot(my_data) … table(clusterCut,iris$Species)
var.test(my_data$time_g1,my_data$time_g2,alternative="two.sided") clust<-hclust(dist(iris[,3:4]),method = "average")
t.test(my_data$time_g1,my_data$time_g2,alternative="two.sided") plot(clust)
#paired t-test clusterCut<-cutree(clust,3)
time<-read.csv("paired_t_test.csv",sep = ",",header = T) table(clusterCut,iris$Species)
View(time) …str(time) …
summary(time) …boxplot(time) Aim : Practical of Time-Series Forecasting
t.test(time$time_before,time$time_after,alternative="greater",paired = # Time Series Analysis and Forecasting on AirPassengers
T) install.packages("forecast")
#correlation library(forecast)
cor<-read.csv("correlation.csv",sep = ",",header = T) data("AirPassengers")
View(cor) class(AirPassengers)
str(cor) head(AirPassengers)
summary(cor) sum(is.na(AirPassengers))
cor.test(cor$aptitude,cor$job_prof,alternative = "two.sided",method = summary(AirPassengers)
"pearson") plot(AirPassengers)
#paired t-test application tsdata<-ts(AirPassengers,frequency = 12)
stud<-read.csv("student.csv",sep = ",",header = T) ddata<-decompose(tsdata)
View(stud) plot(ddata)
str(stud) holt<-HoltWinters(tsdata,beta = FALSE,gamma = FALSE)
summary(stud) plot(holt)
boxplot(stud) # Time Series Analysis on Rainfall dataset
t.test(stud$Test1,stud$Test2,alternative="less",paired = T) rainfall<-read.csv("rainfall.csv",sep = ",",header = T)
#correlation - Ice cream head(rainfall)
ice<-read.csv("icecream.csv",sep = ",",header = T) summary(rainfall)
View(ice) class(rainfall)
str(ice) tsdata<-ts(rainfall,frequency = 12,start = c(2012,1))
summary(ice) class(tsdata)
boxplot(ice) plot(tsdata)
cor.test(ice$Total.sales,ice$Temp,alternative = "two.sided",method =
"pearson") Aim : Practical of Principal Component Analysis.
# Principal Component Analysis upon IRIS dataset
Aim : Practical of Analysis of Variance data("iris")
#one-way-anova test str(iris)
data1<-read.csv("one-way-anova.csv",sep = ",", header = T) summary(iris)
names(data1) …str(data1) mypr<-prcomp(iris[,-5])
data1$dept<-as.factor(data1$dept) mypr
str(data1) …summary(data1) …View(data1) …head(data1) summary(mypr)
anv1<-aov(formula=satindex~dept,data=data1)...summary(anv1) plot(mypr,type="l")
biplot(mypr)
#two-way-anova test
data2<-read.csv("crop-data.csv",sep = ",",header = T) db.student.insert({_id:101,RollNo:4,Name:"Laxmi",Marks:450,
names(data2) Hobbies:["Reading","Dancing"]});
str(data2)
data2$density<-as.factor(data2$density) db.student.find({Class:"TYCS"},{Name:1,Class:1,_id:0})
str(data2) db.student.find({Class:{$ne:”TYCS”}},{Name:1,Class:1,_id:0})
summary(data2) db.student.find().sort({Marks:1}) //ascending
head(data2) db.student.find({Class:"TYCS",Marks:{$gt:400}})
View(data2) //or, and, not
anv2<-aov(formula=yield~density+block+fertilizer,data=data2) db.student.find({$or:[{Class:"TYCS"},{Marks:{$gt:500}}]})
summary(anv2) db.student.find({Class:{$ne:”TYCS”}},{Name:1,Class:1,_id:0}) ->
will display the name and class of those students whose class is not TYCS
library(readxl) db.student.update({RollNo:2},{$set:{Marks:531}})
mydata<-read.csv("newsadv.csv") …View(mydata) …names(mydata) db.student.remove({Class:”FYCS”})
anv<-aov(formula=Count~Day+Section,data=mydata) db.student.updateMany({Class:”TYCS”},{$inc:{Marks:5}})
summary(anv) db.Employee.aggregate({$group:{“_id”:”$Dept”,”Count”:{$sum:
Practical of Clustering 1}}}) -> This will retrieve the number of employees in each
# K-means clustering on IRIS dataset department
data("iris")...names(iris)...newdata<-iris[,-5]...head(newdata) db.Employee.aggregate({$group:{"_id":"$Dept","Count":{$avg:"$Salary"}}})
fit<-kmeans(newdata,3)
library(cluster) db.student.find({},{Name:1,Marks:1,_id:0}).sort({Marks:1}) ->
clusplot(newdata,fit$cluster,color=T,shade=T,labels=2,lines=0) shows only name and marks (via the projection argument), sorted by marks in ascending order
fit… fit$size