Aditya Garg DMDW
Aditya Garg DMDW
CXS – 425
Data Mining and Data Warehousing Lab
Assignment 1
1.
a. Find matrix – matrix multiplication
b. Find (AB)T and (AB)-1
c. Find the mean, standard deviation for each column and row for the
matrices A, B, AB, (AB)-1.
# Assignment 1: matrix product AB, its transpose and inverse, and
# per-row / per-column means plus standard deviations.
A <- rbind(c(3, -2, 1), c(-1, 4, -2))
B <- rbind(c(-7, 4), c(9, 5), c(2, -1))
print("Matrix A : ")
print(A)
print("Matrix B :")
print(B)
# AB (2x3 %*% 3x2 -> 2x2)
C <- A %*% B
print("Multiplication AB :")
print(C)
# t(AB) -- stored as AB_t; naming it `T` would mask the TRUE shorthand
AB_t <- t(C)
print("Transpose of Matrix AB :")
print(AB_t)
# (AB)^-1 via solve(); `I` avoided for the same masking reason
AB_inv <- solve(C)
print("Inverse of Matrix AB :")
print(AB_inv)
# Means: rowMeans()/colMeans() cover every row and column at once
print("Mean of Matrix A :")
print(rowMeans(A))   # row means
print(colMeans(A))   # column means
# Standard deviations: sd() requires a vector (it errors on a matrix
# in modern R), so flatten each matrix with c() first.
print("Standard deviation of matrix A :")
print(sd(c(A)))
print("Standard deviation of matrix B :")
print(sd(c(B)))
print("Standard deviation of matrix AB :")
print(sd(c(C)))
print("Standard deviation of matrix inverse of AB :")
print(sd(c(AB_inv)))
OUTPUT
17103045 3
17103045 4
2. Write a "Function" program in R to find n!. Hence find 13! and 32!. Do not name
the function "Factorial". You can initialize 0! = 1 and 1! = 1.
# Compute n! iteratively. Named findfactorial (not "factorial", which
# would mask base::factorial, per the assignment). 0! = 1! = 1.
findfactorial <- function(n) {
  result <- 1
  if (n == 0 || n == 1) {
    result <- 1
  } else {
    for (i in seq_len(n)) {
      result <- result * i
    }
  }
  result
}
# 13! fits exactly in a double; 32! (~2.6e35) exceeds 2^53, so the
# printed value is a close floating-point approximation.
print(findfactorial(13))
print(findfactorial(32))
OUTPUT
17103045 5
# Scan vector1 (length l, both defined earlier) for its minimum and
# maximum. The original sentinels 10000 / -10000 silently return wrong
# answers when every element lies outside [-10000, 10000]; Inf / -Inf
# are always safe starting values.
min1 <- Inf
max1 <- -Inf
for (i in seq_len(l)) {
  if (min1 > vector1[i]) {
    min1 <- vector1[i]
  }
  if (max1 < vector1[i]) {
    max1 <- vector1[i]
  }
}
# (Idiomatic one-liners: min1 <- min(vector1); max1 <- max(vector1))
OUTPUT
17103045 6
ASSIGNMENT 2
In R, we can write data frames easily to a file, using the write.table() command.
# Write the cars1 data frame to cars1.txt. Use the full logical
# FALSE, never the reassignable shorthand F, to disable quoting.
write.table(cars1, file = "cars1.txt", quote = FALSE)
The first argument refers to the data frame to be written to the output file, the second is
the name of the output file. By default, R will surround each entry in the output file by
quotes, so we use quote=F.
The function read.table(“/location”) can then be used to read the data frame directly
Code:
OUTPUT:
17103045 7
CODE:
print("********Data Tail***********")
# Last element of the sequence 1..5
last_value <- tail(1:5, n = 1)
print(last_value)
print("******Names Data ***********")
# `data` resolves to base::data (a function), so names() yields NULL
print(names(data))
17103045 8
OUTPUT:
17103045 9
Code:
# Descriptive statistics and plots for the BEPS survey workbook.
library(readxl)  # read_excel() is unavailable until readxl is loaded
dataset <- read_excel("BEPSxls.xlsx")
mean(dataset$age)
median(dataset$age)
summary(dataset)
hist(dataset$age, main = 'AGE HISTOGRAM')
# NOTE(review): assumes a column named Blair exists -- confirm header
plot(dataset$Blair)
OUTPUT:
17103045 10
# Demonstrates attach()/detach(): attach() puts the dataset's columns
# on the search path so a column can be used as a bare name.
library(readxl)
# file.choose() opens an interactive picker for the Excel file
dataset=read_excel(file.choose())
#For dataset
attach(dataset)
cat(gender)
detach(dataset)
# After detach() the column is off the search path, so this second
# cat(gender) errors with "object 'gender' not found" -- that failure
# is the point of the demo. (attach() is discouraged in real code.)
cat(gender)
17103045 11
ASSIGNMENT 3
Code:
mtcars
# Impute any missing mpg values with the column mean. na.rm must be
# the logical TRUE -- the string 'TRUE' only worked by accident of
# if()'s character-to-logical coercion inside mean.default().
mtcars$mpg <- ifelse(is.na(mtcars$mpg),
                     ave(mtcars$mpg, FUN = function(x) mean(x, na.rm = TRUE)),
                     mtcars$mpg)
OUTPUT:
CODE:
library(dplyr)
OUTPUT:
17103045 13
CODE:
print("Arrange : ")
arrange(mtcars, desc(disp))        # rows by descending displacement
print("Group By : ")
group_by(mtcars, drat)
print("Summarise : ")
summarise(mtcars, mean(disp))
print("Select : ")
select(mtcars, qsec)
print("Intersect :")
A <- subset(mtcars, disp == 160)
# BUG FIX: `cyl = 100` was a named argument silently swallowed by
# subset()'s ..., so B became ALL of mtcars. Comparison needs `==`.
B <- subset(mtcars, cyl == 100)
intersect(A, B)
print("SetDiff :")
setdiff(B, A)
17103045 14
17103045 15
17103045 16
17103045 17
Code:
# Keep only the first nine columns of mtcars (drops gear and carb).
DATA <- mtcars[, 1:9]
print(DATA)
Code:
# Small frame mixing integer, character, logical and double columns,
# each with deliberate NAs, to demonstrate is.na() on a data frame.
myData <- data.frame(
  col1 = c(1L, 2L, 3L, NA),
  col2 = c("this", NA, "is", "text"),
  col3 = c(TRUE, FALSE, TRUE, TRUE),
  col4 = c(2.5, NA, 3.2, NA)
)
is.na(myData)
17103045 18
CODE:
# Logical matrix, same shape as mtcars, flagging each missing cell;
# mtcars ships with no NAs, so every entry prints FALSE.
is.na(mtcars)
17103045 19
ASSIGNMENT 4
In other words, the H0 hypothesis implies that there is not enough evidence to prove
that the means of the groups (factor levels) differ from one another.
17103045 20
Code:
OUTPUT:
17103045 21
2. Repeat question 1 and perform a one-way analysis of variance using an inbuilt
dataset in RStudio.
Code:
# Load the built-in PlantGrowth dataset (plant dry weights under one
# control and two treatment conditions) for the one-way ANOVA.
my_data<- PlantGrowth
# Show the factor levels of the grouping variable used by the ANOVA
levels(my_data$group)
OUTPUT:
17103045 23
17103045 24
ASSIGNMENT 5
1. Consider dataset “Groceries” and apply apriori algorithm on it. What are the
first 5 rules generated when the min support is 0.001 and min confidence is 0.9
Code:
library(arules)
# The Groceries market-basket data ships with arules as a transactions
# object -- read_excel() cannot parse a .csv (and readxl was never
# loaded), and apriori() needs transactions, not a plain data frame.
data("Groceries")
rules <- apriori(data = Groceries,
                 parameter = list(support = 0.001, confidence = 0.9))
# First five rules at min support 0.001 / min confidence 0.9
inspect(rules[1:5])
OUTPUT:
2. The database has four transaction. What association rule can be found in this set,
if the minimum support is 60% and minimum confidence is 80%.
Code:
library(arules)
library(readr)
# LAB5-2.csv is comma-separated text; read_excel() only parses real
# .xls/.xlsx files, so use readr's CSV reader instead.
groceries2 <- read_csv("LAB5-2.csv")
Output:
17103045 25
Code:
library(arules)
library(readr)
titanic <- read_csv("titanic.csv")
# (data(titanic) removed: data() loads packaged datasets by name and
# only warns for an object that was just read from disk.)
rules <- apriori(data = titanic,
                 parameter = list(support = 0.6, confidence = 0.8))
rules
inspect(rules[1:5])
OUTPUT:
17103045 26
ASSIGNMENT 6
Code:
# Simple linear regression of Y on X from the LAB6 workbook.
library(readxl)  # provides read_excel()
dataset <- read_excel("LAB6.xlsx")
summary(dataset)
hist(dataset$X)
plot(Y ~ X, data = dataset)        # scatter of the raw relationship
dataset.lm <- lm(Y ~ X, dataset)   # least-squares fit
summary(dataset.lm)                # coefficients, R^2, p-values
Output:
17103045 27
Code:
Output:
17103045 28
ASSIGNMENT 7
Code:
# Decision-tree lab: load and inspect the Austin weather data.
library(rpart.plot)
library(rpart)
library(readr)  # read_csv() comes from readr, not from rpart
dataset <- read_csv("austin_weather.csv")
head(dataset)
# Shuffle rows so any later train/test split is random
shuffle_index <- sample(seq_len(nrow(dataset)))
dataset <- dataset[shuffle_index, ]
ls(dataset)
sum(is.na(dataset$Events))
dim(dataset)
sum(is.na(dataset$DewPointAvgF))
summary(dataset$TempHighF)
str(dataset)
# Coerce every column to numeric; non-numeric character columns
# (e.g. Events) become NA with a coercion warning.
dataset[] <- lapply(dataset, as.numeric)
Output:
Code:
# Load the Telco customer-churn data and inspect its shape/columns.
library(readr)  # read_csv() was used without loading readr
dataset <- read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
dim(dataset)
ls(dataset)  # column names (alphabetically sorted)
Output:
17103045 30
17103045 31
ASSIGNMENT 8
1. Write a procedure for clustering customer data using Simple KMeans Algorithm
Code:
# K-means segmentation of mall customers on the last two columns
# (annual income and spending score).
library(readr)
dataset <- read_csv("Mall_Customers.csv")
dataset <- dataset[4:5]
set.seed(6)
# Elbow method: total within-cluster sum of squares for k = 1..10
wcss <- numeric(10)
for (i in seq_len(10)) {
  wcss[i] <- sum(kmeans(dataset, i)$withinss)
}
plot(seq_len(10),
     wcss,
     type = 'b',
     main = paste('The Elbow Method'),
     xlab = 'Number of clusters',
     ylab = 'WCSS')
# k = 5 chosen from the elbow. Do NOT name the result `kmeans`: that
# shadowed stats::kmeans and would break the loop above on a re-run.
set.seed(6)  # reproducible final fit
km_fit <- kmeans(x = dataset, centers = 5)
y_kmeans <- km_fit$cluster
Output:
17103045 33
17103045 34
ASSIGNMENT 9
Code:
# Installing Packages (one-time setup; re-running re-downloads them)
install.packages("e1071")
install.packages("caTools")
install.packages("caret")
# Loading packages
library(e1071)
library(caTools)
library(caret)
library(dplyr)
library(readr)  # read_csv() below comes from readr, not the above
dataset <- read_csv("Mall_Customers.csv")
Output:
Code:
# Two-component Gaussian mixture on the spending score: first a
# hand-written EM loop, then mixtools::normalmixEM for comparison.
install.packages("mixtools")
library(readr)
dataset <- read_csv("Mall_Customers.csv")
# NOTE(review): assumes the spending-score column is literally named
# `Score` -- confirm against the CSV header.
summary(dataset$Score)
x <- dataset$Score
plot(density(x))

# Sum that ignores NA/Inf values (was called but never defined)
sum.finite <- function(v) sum(v[is.finite(v)])

# Initial parameter guesses (were used uninitialized): equal weights,
# means at the lower/upper quartiles, common sd from the data.
pi1 <- 0.5
pi2 <- 0.5
mu1 <- quantile(x, 0.25)
mu2 <- quantile(x, 0.75)
sigma1 <- sd(x)
sigma2 <- sd(x)

Q <- 0
# starting value of expected value of the log likelihood
Q[2] <- sum.finite(log(pi1) + log(dnorm(x, mu1, sigma1))) +
  sum.finite(log(pi2) + log(dnorm(x, mu2, sigma2)))
k <- 2
while (abs(Q[k] - Q[k - 1]) >= 1e-6) {
  # E step: posterior responsibility of each component for each point
  comp1 <- pi1 * dnorm(x, mu1, sigma1)
  comp2 <- pi2 * dnorm(x, mu2, sigma2)
  comp.sum <- comp1 + comp2
  p1 <- comp1 / comp.sum
  p2 <- comp2 / comp.sum
  # M step: re-estimate weights, means AND sds from responsibilities.
  # (The original updated only the weights, so the means/sds never
  # moved and the loop could not actually fit the mixture.)
  pi1 <- sum.finite(p1) / length(x)
  pi2 <- sum.finite(p2) / length(x)
  mu1 <- sum.finite(p1 * x) / sum.finite(p1)
  mu2 <- sum.finite(p2 * x) / sum.finite(p2)
  sigma1 <- sqrt(sum.finite(p1 * (x - mu1)^2) / sum.finite(p1))
  sigma2 <- sqrt(sum.finite(p2 * (x - mu2)^2) / sum.finite(p2))
  k <- k + 1
  Q[k] <- sum(log(comp.sum))  # observed-data log likelihood
}

library(mixtools)
gm <- normalmixEM(x, k = 2, lambda = c(0.9, 0.1), mu = c(0.4, 0.3),
                  sigma = c(0.05, 0.02))
gm$mu
gm$sigma
gm$lambda
# Overlay the kernel density and the fitted two-component density
hist(x, prob = TRUE, breaks = 32, xlim = c(range(x)[1], range(x)[2]), main = '')
lines(density(x), col = "green", lwd = 2)
x1 <- seq(from = range(x)[1], to = range(x)[2], length.out = 1000)
y <- pi1 * dnorm(x1, mean = mu1, sd = sigma1) +
  pi2 * dnorm(x1, mean = mu2, sd = sigma2)
lines(x1, y, col = "red", lwd = 2)
legend('topright', col = c("green", 'red'), lwd = 2, legend = c("kernal", "fitted"))
Output:
17103045 37
17103045 38
ASSIGNMENT 10
Dataset:
Min_Support : 50%
Using WEKA,
We got the rules as