0% found this document useful (0 votes)
8 views2 pages

Ds

The document provides a comprehensive guide on data management operations in R using the 'dplyr' package, including data manipulation techniques such as filtering, selecting, and summarizing data. It also covers practical applications of various statistical methods, including logistic regression, decision trees, hypothesis testing, time-series forecasting, and principal component analysis. Additionally, the document discusses clustering techniques and includes examples with datasets like 'students.csv', 'biopsy', and 'iris'.

Uploaded by

sefami1889
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
8 views2 pages

Ds

The document provides a comprehensive guide on data management operations in R using the 'dplyr' package, including data manipulation techniques such as filtering, selecting, and summarizing data. It also covers practical applications of various statistical methods, including logistic regression, decision trees, hypothesis testing, time-series forecasting, and principal component analysis. Additionally, the document discusses clustering techniques and includes examples with datasets like 'students.csv', 'biopsy', and 'iris'.

Uploaded by

sefami1889
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 2

# ============================================================
# Practical: Data Management Operations in R using 'dplyr'
# NOTE(review): the original scan interleaved two page columns
# on each line; the two scripts are de-interleaved below and
# invalid "//" comments / "..." statement fusions repaired.
# ============================================================

# "Data Manipulation in R"
install.packages("dplyr")
library(dplyr)

# Load the student data set and inspect it
student <- read.csv("students.csv", sep = ",", header = TRUE)
View(student)
str(student)
dim(student)

# filter(): pick rows by condition (comma between conditions means AND)
filter(student, Class == "TYCS")
filter(student, Marks > 500)
filter(student, Class == "TYCS", Marks > 500)
filter(student, Class == "TYCS" & Marks > 500)
filter(student, Class == "TYCS" | Marks > 500)
filter(student, Class == "TYCS" | Marks <= 500)

# select(): pick columns
select(student, Name, Class)
data1 <- filter(student, Class == "TYCS" | Marks <= 500)
select(data1, Name, Class, Marks)

# arrange(): sort rows
arrange(student, Name)
arrange(student, Marks)
arrange(student, desc(Marks))
data2 <- arrange(student, desc(Marks))
filter(data2, Class == "TYCS")

# mutate(): derive new columns; summarise(): aggregate whole table
mutate(student, Perc = Marks / 10)
summarize(student, n())
summarise(student, max(Marks))
summarise(student, IQR(Marks))
summarise(student, mean(Marks))
summarise(student, sum(Marks))
summarise(student, sd(Marks))

# group_by(): per-class summaries
grp <- group_by(student, Class)
summarise(grp, mean(Marks))
summarise(grp, min(Marks))
summarise(grp, n())
count(student, Class)

# Pipe (%>%) versions of the same operations
student %>% filter(Class == "TYCS") %>% select(Name)
student %>% filter(Class == "TYCS") %>% arrange(Marks)
student %>% filter(Class == "TYCS") %>% arrange(desc(Marks))
student %>% filter(Class == "TYCS") %>% summarise(n())
student %>% group_by(Class) %>% summarise(n())

# ---- Plots ----
hist(student$Marks, xlab = "Student Marks",
     main = "Histogram of Student Marks")
barplot(student$Marks, xlab = "Student Marks",
        main = "Barplot of Student Marks")
boxplot(student$Marks)

# Line plots from a user-chosen CSV
# (assumes Year/Rainfall/Population columns -- TODO confirm data file)
data <- read.csv(file.choose(), sep = ",", header = TRUE)
plot(data$Year, data$Rainfall, type = "l", col = "red", lwd = 3)
plot(data$Year, data$Population, type = "l", lty = "dotted",
     col = "blue", lwd = 3)

# Correlation plot on the built-in mtcars data
data("mtcars")
install.packages("corrplot")
library(corrplot)
M <- cor(mtcars)
corrplot(M, method = "ellipse")
corrplot(M, method = "ellipse", col = "red")

# Scatter plot of iris petals, coloured by species
data("iris")
plot(iris$Petal.Length, iris$Petal.Width, col = iris$Species, pch = 15)

# ============================================================
# Practical: Simple / Multiple Linear Regression
# ============================================================

# ---- Simple linear regression ----
data3 <- read.csv("studweight.csv", sep = ",", header = TRUE)
summary(data3)
str(data3)
fit <- lm(Weight ~ Height, data = data3)
summary(fit)
# "Height is very significant in determining the weight"
plot(data3$Height, data3$Weight)
abline(fit, lwd = 3, col = "blue")

# ---- Multiple linear regression ----
data4 <- read.csv("emp_index.csv", sep = ",", header = TRUE)
summary(data4)
str(data4)
head(data4)
dim(data4)
names(data4)
modeldata <- lm(index ~ written + language + tech + gk, data = data4)
summary(modeldata)
data4$pred <- predict(modeldata, data4)   # fitted values added as a column
head(data4)
modeldata$residuals

# ============================================================
# Practical: Logistic Regression Algorithm
# ============================================================
library("MASS")
data("biopsy")
View(biopsy)
str(biopsy)
names(biopsy)
summary(biopsy)
colSums(is.na(biopsy))
biopsy1 <- na.omit(biopsy)     # drop rows with missing values (683 remain)
colSums(is.na(biopsy1))
biopsy$ID <- NULL              # drop identifier column (as in original order)
boxplot(biopsy)

# Full model: all predictors
fit <- glm(class ~ ., family = binomial, data = biopsy1)
summary(fit)
biopsy1$prob <- predict(fit, type = "response")
View(biopsy1)
biopsy1$predict <- rep("benign", 683)
biopsy1$predict[biopsy1$prob > 0.99] <- "malignant"
View(biopsy1)
table(biopsy1$predict, biopsy1$class)    # confusion matrix
mean(biopsy1$predict == biopsy1$class)   # accuracy

# Reduced model: selected predictors only
fit2 <- glm(class ~ V1 + V4 + V6 + V7, family = binomial, data = biopsy1)
summary(fit2)
biopsy1$prob <- predict(fit2, type = "response")
View(biopsy1)
biopsy1$predict <- rep("benign", 683)
biopsy1$predict[biopsy1$prob > 0.5] <- "malignant"
View(biopsy1)
table(biopsy1$predict, biopsy1$class)
mean(biopsy1$predict == biopsy1$class)

# ============================================================
# Practical: Decision Tree
# ============================================================

# ---- Regression tree (Hitters salary data) ----
data <- read.csv("Hitters.csv", sep = ",", header = TRUE)
View(data)
str(data)
summary(data)
names(data)
library(rpart)
regtree <- rpart(Salary ~ Hits + Runs + Years, data = data)
regtree
plot(regtree)
text(regtree)
install.packages("rpart.plot")
library(rpart.plot)
rpart.plot(regtree)
View(regtree)

# Prune using cp (complexity parameter)
regtree$cptable
cp <- min(regtree$cptable[5, ])
pr <- prune(regtree, cp = cp)
rpart.plot(pr)

# ---- Classification tree (biopsy) ----
library("MASS")
data("biopsy")
View(biopsy)
str(biopsy)
names(biopsy)
summary(biopsy)
biopsy$ID <- NULL
classtree <- rpart(class ~ ., data = biopsy)
rpart.plot(classtree)
biopsy$pred <- predict(classtree, biopsy, type = "class")
table(biopsy$pred, biopsy$class)

# ---- Classification tree (titanic) ----
install.packages("titanic")
library("titanic")
data("titanic_train")
str(titanic_train)
View(titanic_train)
titanic_train$Name <- NULL
titanictree <- rpart(Survived ~ Pclass + Age + Parch, data = titanic_train)
rpart.plot(titanictree)

# ---- Classification tree (golf, fully grown) ----
golf <- read.csv("Golf.csv", sep = ",", header = TRUE)
View(golf)
str(golf)
names(golf)
library("rpart")
install.packages("rpart.plot")
library("rpart.plot")
# minsplit/minbucket = 1 and cp = 0 grow the tree without pruning
tree <- rpart(Play ~ ., data = golf,
              control = rpart.control(minsplit = 1, minbucket = 1, cp = 0))
rpart.plot(tree)
# ============================================================
# Practical: Hypothesis Testing
# NOTE(review): the original scan interleaved two page columns
# on each line; the two scripts are de-interleaved below.
# ============================================================

# ---- One-sample t-test ----
data <- read.csv("onesample.csv", sep = ",", header = TRUE)
View(data)
str(data)
summary(data)
boxplot(data)
t.test(data$Time, mu = 80, alternative = "greater")

# ---- Two-sample t-test (variance test first) ----
my_data <- read.csv("twosample.csv", sep = ",", header = TRUE)
View(my_data)
str(my_data)
summary(my_data)
boxplot(my_data)
var.test(my_data$time_g1, my_data$time_g2, alternative = "two.sided")
t.test(my_data$time_g1, my_data$time_g2, alternative = "two.sided")

# ---- Paired t-test ----
time <- read.csv("paired_t_test.csv", sep = ",", header = TRUE)
View(time)
str(time)
summary(time)
boxplot(time)
t.test(time$time_before, time$time_after,
       alternative = "greater", paired = TRUE)

# ---- Correlation ----
cor <- read.csv("correlation.csv", sep = ",", header = TRUE)
View(cor)
str(cor)
summary(cor)
cor.test(cor$aptitude, cor$job_prof,
         alternative = "two.sided", method = "pearson")

# ---- Paired t-test application ----
stud <- read.csv("student.csv", sep = ",", header = TRUE)
View(stud)
str(stud)
summary(stud)
boxplot(stud)
t.test(stud$Test1, stud$Test2, alternative = "less", paired = TRUE)

# ---- Correlation: ice-cream sales vs temperature ----
ice <- read.csv("icecream.csv", sep = ",", header = TRUE)
View(ice)
str(ice)
summary(ice)
boxplot(ice)
cor.test(ice$Total.sales, ice$Temp,
         alternative = "two.sided", method = "pearson")

# ============================================================
# Practical: Analysis of Variance
# ============================================================

# ---- One-way ANOVA ----
data1 <- read.csv("one-way-anova.csv", sep = ",", header = TRUE)
names(data1)
str(data1)
data1$dept <- as.factor(data1$dept)   # treatment must be a factor
str(data1)
summary(data1)
View(data1)
head(data1)
anv1 <- aov(formula = satindex ~ dept, data = data1)
summary(anv1)

# ---- Two-way ANOVA ----
data2 <- read.csv("crop-data.csv", sep = ",", header = TRUE)
names(data2)
str(data2)
data2$density <- as.factor(data2$density)
str(data2)
summary(data2)
head(data2)
View(data2)
anv2 <- aov(formula = yield ~ density + block + fertilizer, data = data2)
summary(anv2)

library(readxl)
mydata <- read.csv("newsadv.csv")
View(mydata)
names(mydata)
anv <- aov(formula = Count ~ Day + Section, data = mydata)
summary(anv)

# ============================================================
# Practical: Clustering
# ============================================================

# ---- K-means clustering on the iris data ----
data("iris")
names(iris)
newdata <- iris[, -5]     # drop the Species label column
head(newdata)
dim(newdata)
fit <- kmeans(newdata, 3)
library(cluster)
clusplot(newdata, fit$cluster, color = TRUE, shade = TRUE,
         labels = 2, lines = 0)
fit
fit$size

# ---- Hierarchical clustering on the iris data ----
# dist() computes the Euclidean distance between every pair of rows
clust <- hclust(dist(iris[, 3:4]))
plot(clust)
clusterCut <- cutree(clust, 3)
table(clusterCut, iris$Species)
clust <- hclust(dist(iris[, 3:4]), method = "average")
plot(clust)
clusterCut <- cutree(clust, 3)
table(clusterCut, iris$Species)

# ============================================================
# Practical: Time-Series Forecasting
# ============================================================

# ---- AirPassengers ----
install.packages("forecast")
library(forecast)
data("AirPassengers")
class(AirPassengers)
head(AirPassengers)
sum(is.na(AirPassengers))
summary(AirPassengers)
plot(AirPassengers)
tsdata <- ts(AirPassengers, frequency = 12)
ddata <- decompose(tsdata)
plot(ddata)
# Simple exponential smoothing: no trend (beta) or seasonal (gamma) terms
holt <- HoltWinters(tsdata, beta = FALSE, gamma = FALSE)
plot(holt)

# ---- Rainfall dataset ----
rainfall <- read.csv("rainfall.csv", sep = ",", header = TRUE)
head(rainfall)
summary(rainfall)
class(rainfall)
tsdata <- ts(rainfall, frequency = 12, start = c(2012, 1))
class(tsdata)
plot(tsdata)

# ============================================================
# Practical: Principal Component Analysis
# ============================================================
# Principal Component Analysis on the iris dataset
data("iris")
str(iris)
summary(iris)
mypr <- prcomp(iris[, -5])
mypr
summary(mypr)
plot(mypr, type = "l")   # scree plot
biplot(mypr)

# ============================================================
# MongoDB shell commands (NOT R -- kept as reference comments;
# run these in the mongo shell). Smart quotes, '=' instead of
# ':' and '_is' were extraction artifacts; corrected below.
# ============================================================
# db.student.insert({_id: 101, RollNo: 4, Name: "Laxmi", Marks: 450,
#                    Hobbies: ["Reading", "Dancing"]});
# db.student.find({Class: "TYCS"}, {Name: 1, Class: 1, _id: 0})
# db.student.find({Class: {$ne: "TYCS"}}, {Name: 1, Class: 1, _id: 0})
#     -> name and class of students whose class is not TYCS
# db.student.find().sort({Marks: 1})                   // ascending
# db.student.find({Class: "TYCS", Marks: {$gt: 400}})
# // or, and, not
# db.student.find({$or: [{Class: "TYCS"}, {Marks: {$gt: 500}}]})
# db.student.update({RollNo: 2}, {$set: {Marks: 531}})
# db.student.remove({Class: "FYCS"})
# db.student.updateMany({Class: "TYCS"}, {$inc: {Marks: 5}})
# db.Employee.aggregate({$group: {"_id": "$Dept", "Count": {$sum: 1}}})
#     -> number of employees in each department
# db.Employee.aggregate({$group: {"_id": "$Dept", "Count": {$avg: "$Salary"}}})
#     -> average salary per department
# db.student.find({}, {Name: 1, Marks: 1, _id: 0}).sort({Marks: 1})
#     -> sort name and marks using the projection argument

You might also like