ML LAB34
import pandas as pd
data = pd.read_csv('large_housing_data_mumbai.csv')
print("Original Data:")
print(data.head())
Original Data:
House_ID Bedrooms Size (sq ft) Price (INR) Location Year_Built
0 1 4.0 855.0 31356226.0 Juhu 2002.0
1 2 5.0 1847.0 27775439.0 Andheri 2004.0
2 3 NaN 2363.0 37325149.0 Bandra 2000.0
3 4 5.0 626.0 6147116.0 South Mumbai 2002.0
4 5 5.0 NaN 49899606.0 Worli NaN
#Imputation
#Handle missing values using the median for numerical columns and the most
#frequent value for categorical columns.
from sklearn.impute import SimpleImputer
num_features = ['Bedrooms', 'Size (sq ft)', 'Price (INR)', 'Year_Built']
cat_features = ['Location']
num_imputer = SimpleImputer(strategy='median')
data[num_features] = num_imputer.fit_transform(data[num_features])
cat_imputer = SimpleImputer(strategy='most_frequent')
data[cat_features] = cat_imputer.fit_transform(data[cat_features])
print("\nData After Imputation:")
print(data.head())
#Anomaly Detection
#Detect anomalies in the dataset. Here, we use Z-scores (|z| > 3) to flag
#anomalies across the numerical columns.
from scipy import stats
z_scores = stats.zscore(data[num_features])
data['Anomaly'] = (abs(z_scores) > 3).any(axis=1) # Mark anomalies
print("\nData After Anomaly Detection:")
print(data.head())
#Rule-Based Anomaly Detection
#Define simple rules:
#A house with less than 1000 sq ft should have 1 to 2 bedrooms.
#A house with 1000-2000 sq ft should have 2 to 4 bedrooms.
#A house with more than 2000 sq ft should have 3 or more bedrooms.
def is_bedroom_size_reasonable(row):
    if row['Size (sq ft)'] < 1000:
        return 1 <= row['Bedrooms'] <= 2
    elif row['Size (sq ft)'] <= 2000:
        return 2 <= row['Bedrooms'] <= 4
    else:
        return row['Bedrooms'] >= 3
data['Bed_Size_Anomaly'] = ~data.apply(is_bedroom_size_reasonable, axis=1)
print("\nData After Rule-Based Anomaly Detection:")
print(data.head())
(only the trailing columns of the two head() outputs were captured)
   Anomaly
0    False
1    False
2    False
3    False
4    False

   Anomaly  Bed_Size_Anomaly
0    False              True
1    False              True
2    False             False
3    False              True
4    False              True
#Standardization
#Standardize numerical features so they have a mean of 0 and a standard
#deviation of 1.
from sklearn.preprocessing import StandardScaler
# Standardize numericals
scaler = StandardScaler()
data[num_features] = scaler.fit_transform(data[num_features])
print("\nData After Standardization:")
print(data.head())
(head() output truncated: the standardized numeric columns were not captured; Anomaly and Bed_Size_Anomaly are unchanged)
#Normalization
#Normalize the (already standardized) numerical features to fit within the range [0, 1]
from sklearn.preprocessing import MinMaxScaler
normalizer = MinMaxScaler()
data[num_features] = normalizer.fit_transform(data[num_features])
print("\nData After Normalization:")
print(data.head())
(head() output truncated: the normalized numeric columns were not captured; Anomaly and Bed_Size_Anomaly are unchanged)
#Encoding
#One-Hot Encode the categorical feature Location.
from sklearn.preprocessing import OneHotEncoder
# One-Hot Encoding for 'Location'
encoder = OneHotEncoder(sparse_output=False)
encoded_location = encoder.fit_transform(data[['Location']])
encoded_df = pd.DataFrame(encoded_location, columns=encoder.get_feature_names_out(['Location']))
(encoded_df output truncated: only the final Location_Worli indicator column was captured)
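A typical follow-up step (not part of the lab as shown) is to join the encoded columns back onto the frame and drop the original Location column:
data = pd.concat([data.drop(columns=['Location']), encoded_df], axis=1)
print(data.head())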
Experiment 2
import numpy as np
class GradientDescentMSE:
def __init__(self, lr=0.01, n_iters=1000):
self.lr = lr
self.n_iters = n_iters
self.x1 = None
self.x2 = None
for _ in range(self.n_iters):
# Compute predictions
y_pred = self.x1 * X[:, 0] + self.x2 * X[:, 1]
# Update parameters
self.x1 = self.x1 - self.lr * grad_x1
self.x2 = self.x2 - self.lr * grad_x2
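A quick usage sketch on synthetic data (the names X_demo and y_demo are illustrative, not from the lab):
X_demo = np.random.rand(100, 2)
y_demo = 3 * X_demo[:, 0] + 5 * X_demo[:, 1]
gd = GradientDescentMSE(lr=0.1, n_iters=5000)
gd.fit(X_demo, y_demo)
print(gd.x1, gd.x2)  # should approach 3 and 5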
Height Weight
0 151 63
1 174 81
2 138 56
3 186 91
4 128 47
5 136 57
6 179 76
7 163 72
8 152 62
9 131 48
#Display
print(f"Mean of Height (x_mean): {x_mean}")
print(f"Mean of Weight (y_mean): {y_mean}")
# Calculate b1 (slope)
b1 = sum_xiyi_xbar_ybar / sum_sq_xi_xbar
# Display b1 (slope)
print(f"Slope (b1): {b1}")
# Calculate b0 (intercept)
b0 = y_mean - b1 * x_mean
# Display b0 (intercept)
print(f"Intercept (b0): {b0}")
# Example prediction
height_new = 160
weight_prediction = predict(height_new)
print(f'Predicted weight for height {height_new} cm is
{weight_prediction:.2f} kg')
class LogisticRegression():
    def fit(self, X, y):
        for _ in range(self.n_iters):
            linear_pred = np.dot(X, self.weights) + self.bias
            # Sigmoid is the only logistic addition; the rest matches linear regression.
            y_pred = sigmoid(linear_pred)
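Only the loop above survives from the class; a minimal complete sketch consistent with it (the sigmoid helper, the gradient-descent fit, and the 0.5-threshold predict are assumptions beyond what is shown) is:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

class LogisticRegression():
    def __init__(self, lr=0.001, n_iters=1000):
        self.lr = lr
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        for _ in range(self.n_iters):
            linear_pred = np.dot(X, self.weights) + self.bias
            y_pred = sigmoid(linear_pred)  # logistic addition; the rest is linear regression
            # Gradient of the loss with respect to the weights and bias
            dw = (1 / n_samples) * np.dot(X.T, (y_pred - y))
            db = (1 / n_samples) * np.sum(y_pred - y)
            self.weights = self.weights - self.lr * dw
            self.bias = self.bias - self.lr * db

    def predict(self, X):
        y_pred = sigmoid(np.dot(X, self.weights) + self.bias)
        return np.array([0 if p <= 0.5 else 1 for p in y_pred])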
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt
bc = datasets.load_breast_cancer()
X, y = bc.data, bc.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=1234)
model = LogisticRegression(lr=0.01)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
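The accuracy value printed below was presumably produced by a cell along these lines (not shown in the excerpt):
accuracy = np.sum(y_pred == y_test) / len(y_test)
print(accuracy)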
0.9210526315789473
C:\Users\rohra\AppData\Local\Temp\ipykernel_19392\4033946986.py:2:
RuntimeWarning: overflow encountered in exp
return 1/(1+np.exp(-x))
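The overflow comes from np.exp on large-magnitude inputs; one common remedy (not part of the original lab) is to clip the sigmoid argument:
def sigmoid(x):
    x = np.clip(x, -500, 500)  # keep np.exp(-x) inside floating-point range
    return 1 / (1 + np.exp(-x))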
import pandas as pd
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(results)
Actual Predicted
0 1 1
1 1 1
2 1 1
3 1 1
4 1 1
.. ... ...
109 1 1
110 0 0
111 1 0
112 0 0
113 0 0
# EDA
# Missing Data (df is the dataset loaded in an earlier cell, not shown here)
df.info()
df.isna().sum()
df.head()
df['sex'].unique()
df['island'].unique()
df = df[df['sex']!='.']  # drop the row with an invalid '.' entry in sex
# Feature Engineering
pd.get_dummies(df)  # preview one-hot encoding of every categorical column
X = pd.get_dummies(df.drop('species', axis=1), drop_first=True)
y = df['species']
plt.show()
# Baseline decision tree (split/fit cells reconstructed; the split parameters are assumed)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
base_pred = model.predict(X_test)
print(classification_report(y_test, base_pred))
model.feature_importances_
pd.DataFrame(index=X.columns, data=model.feature_importances_, columns=['Feature Importance'])
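The lone plt.show() that follows suggests a feature-importance plot was produced here; a sketch consistent with the DataFrame above (the plotting details are assumptions) is:
imp = pd.DataFrame(index=X.columns,
                   data=model.feature_importances_,
                   columns=['Feature Importance']).sort_values('Feature Importance')
plt.figure(figsize=(10, 4), dpi=150)
plt.bar(imp.index, imp['Feature Importance'])
plt.xticks(rotation=90)
plt.ylabel('Feature Importance')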
plt.show()
def report_model(model):
    model_preds = model.predict(X_test)
    print(classification_report(y_test, model_preds))
    print('\n')
    plt.figure(figsize=(12,8), dpi=150)
    plot_tree(model, filled=True, feature_names=X.columns);
pruned_tree = DecisionTreeClassifier(max_depth=2)
pruned_tree.fit(X_train,y_train)
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
def report_model(model):
    model_preds = model.predict(X_test)
    # Classification report (precision, recall, f1) followed by a plot of the fitted tree
    print(classification_report(y_test, model_preds))
    print('\n')
    plot_tree(model, filled=True, feature_names=X.columns);
pruned_tree = DecisionTreeClassifier(max_leaf_nodes=3)
pruned_tree.fit(X_train,y_train)
report_model(pruned_tree)
entropy_tree = DecisionTreeClassifier(criterion='entropy')
entropy_tree.fit(X_train,y_train)
report_model(entropy_tree)
Experiment 6
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
print(data)  # dataset for this experiment, loaded in an earlier cell (not shown)
# Use only the first two features for training and visualization
X = data.iloc[:, :2].values  # First two features
y = data.iloc[:, -1].values  # Target variable (last column)
# Train/test split and RBF-SVM fit (cells reconstructed; the split parameters are assumed;
# the same svm_rbf definition appears again in the plotting section below)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
svm_rbf = SVC(kernel='rbf', gamma='auto', probability=True)
svm_rbf.fit(X_train, y_train)
# Make predictions
y_pred = svm_rbf.predict(X_test)
# Accuracy
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"Accuracy: {accuracy:.4f}\n")
# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)
print()
print()
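classification_report is imported for this experiment but never called in the cells shown; the usual call would be:
print(classification_report(y_test, y_pred))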
# Use only the first two features for training and visualization
X = data.iloc[:, :2].values # First two features
y = data.iloc[:, -1].values # Target variable (last column)
# 1. SVM Model
svm_rbf = SVC(kernel='rbf', gamma='auto', probability=True)
svm_rbf.fit(X_train, y_train)
y_pred_svm = svm_rbf.predict(X_test)
# From inside the plot_decision_boundary helper:
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
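Only those two lines of the plot_decision_boundary helper survive in this excerpt; a minimal sketch consistent with them (the grid step, the plot styling, and the assumption that y is numeric are all guesses) is:
def plot_decision_boundary(X, y, model):
    # Mesh over the two plotted feature dimensions
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))
    # Predict a class for every grid point and reshape back to the mesh
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.3)
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Decision boundary')
    plt.show()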
# Since we can't train an ensemble model directly, we just plot the decision boundary using the SVM model
plot_decision_boundary(X_test, y_test, svm_rbf)
Experiment 8
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Assuming the last column is the target label and the rest are features (data is loaded in an earlier cell, not shown)
X = data.iloc[:, :-1].values # Features (all rows, all columns except the last)
y = data.iloc[:, -1].values # Target (all rows, last column)
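StandardScaler is imported above but its cell is missing from this excerpt; the step it presumably performs (an assumption) is:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # zero mean, unit variance per feature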