Sahanashree Ex-2 ML (2)
Sahanashree Ex-2 ML (2)
(i) For your dataset, identify any duplicates and remove them, retaining only their first occurrence.
(ii) Going by each attribute, use the two techniques discussed in class to identify outliers. You may find it
useful to write two functions - OutlierFromIQR() and OutlierFromZscore() - to process your dataset. An
outlier that lies more than 3*IQR below Q1 or above Q3 is an extreme outlier; these may also be identified.
#Q1
# Load the drug dataset (path is relative to the working directory) and
# display it.
Drug <- read.csv(file = "Drug.csv")
Drug
list(Outliers = outliers)
}
# Print results: a header line followed by the contents of `outliers_iqr`.
# NOTE(review): `outliers_iqr` is not assigned anywhere in the visible part
# of this file - presumably it holds the output of the IQR outlier check;
# confirm it is defined before this point.
print("Outliers detected using IQR method:")
print(outliers_iqr)
library(ggplot2)
#boxplot of Na
# NOTE(review): this file defines OutlierFromIQR more than once; when the
# file is run top to bottom, the last definition is the one in effect.

#' Flag values that fall outside the IQR fences of a vector.
#'
#' A value is flagged when it lies more than `multiplier` interquartile
#' ranges below the first quartile or above the third quartile.
#'
#' @param column A numeric vector. `NA`s are ignored when the quartiles are
#'   computed.
#' @param multiplier Fence width in IQR units (a single number). The default
#'   of 1.5 gives the usual outlier rule; 3 flags only extreme outliers.
#' @return A logical vector the same length as `column`; `TRUE` marks an
#'   outlier. Missing values are reported as `FALSE` so the result can be
#'   used directly as a subsetting mask without producing `NA` rows.
OutlierFromIQR <- function(column, multiplier = 1.5) {
  stopifnot(is.numeric(multiplier), length(multiplier) == 1L)
  quartiles <- quantile(
    column,
    probs = c(0.25, 0.75),
    na.rm = TRUE,
    names = FALSE
  )
  # Named `spread` rather than `IQR` to avoid masking stats::IQR().
  spread <- quartiles[2] - quartiles[1]
  lower_bound <- quartiles[1] - multiplier * spread
  upper_bound <- quartiles[2] + multiplier * spread
  is_outlier <- column < lower_bound | column > upper_bound
  # Comparisons against NA yield NA; a missing value is not an outlier.
  is_outlier[is.na(is_outlier)] <- FALSE
  is_outlier
}
#scatter plot of Na
# NOTE(review): OutlierFromIQR is also defined earlier in this file; this
# later definition is the one in effect when the file is run top to bottom.

#' Flag values that fall outside the IQR fences of a vector.
#'
#' A value is flagged when it lies more than `multiplier` interquartile
#' ranges below the first quartile or above the third quartile.
#'
#' @param column A numeric vector. `NA`s are ignored when the quartiles are
#'   computed.
#' @param multiplier Fence width in IQR units (a single number). The default
#'   of 1.5 gives the usual outlier rule; 3 flags only extreme outliers.
#' @return A logical vector the same length as `column`; `TRUE` marks an
#'   outlier. Missing values are reported as `FALSE` so the result can be
#'   used directly as a subsetting mask without producing `NA` rows.
OutlierFromIQR <- function(column, multiplier = 1.5) {
  stopifnot(is.numeric(multiplier), length(multiplier) == 1L)
  quartiles <- quantile(
    column,
    probs = c(0.25, 0.75),
    na.rm = TRUE,
    names = FALSE
  )
  # Named `spread` rather than `IQR` to avoid masking stats::IQR().
  spread <- quartiles[2] - quartiles[1]
  lower_bound <- quartiles[1] - multiplier * spread
  upper_bound <- quartiles[2] + multiplier * spread
  is_outlier <- column < lower_bound | column > upper_bound
  # Comparisons against NA yield NA; a missing value is not an outlier.
  is_outlier[is.na(is_outlier)] <- FALSE
  is_outlier
}
# Report how many values are missing in each column before imputing.
colSums(is.na(Drug))
# Impute numeric columns with median and categorical columns with mode.
for (col in names(Drug)) {
  is_missing <- is.na(Drug[[col]])
  # Skip columns with nothing to fill, and all-missing columns for which no
  # median or mode can be computed.
  if (!any(is_missing) || all(is_missing)) {
    next
  }
  observed <- Drug[[col]][!is_missing]
  if (is.numeric(observed)) {
    Drug[[col]][is_missing] <- median(observed)
  } else {
    # which.max() returns the first entry holding the highest count.
    mode_label <- names(which.max(table(observed)))
    # Fill with an observed element rather than its text label so that the
    # column keeps its type (e.g. a logical column stays logical).
    mode_value <- observed[match(mode_label, as.character(observed))]
    Drug[[col]][is_missing] <- mode_value
  }
}
#Q3