# Experiment: basic data pre-processing with pandas and scikit-learn.
# Demonstrates the three standard steps — missing-value imputation,
# categorical encoding, and feature standardization — on a tiny dataset.
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder

# 1. Create a sample dataset with missing values (np.nan marks the gaps)
data = {'Age': [25, 30, np.nan, 35, 40],
        'Salary': [50000, 60000, 55000, np.nan, 65000],
        'Department': ['IT', 'HR', 'IT', 'Marketing', 'HR']}
df = pd.DataFrame(data)
print("Original Dataset:\n", df, "\n")

# 2. Handling Missing Values (Imputation): replace each NaN with its column mean
imputer = SimpleImputer(strategy='mean')
df[['Age', 'Salary']] = imputer.fit_transform(df[['Age', 'Salary']])

# 3. Encoding Categorical Data: map department names to integer codes
# (LabelEncoder assigns codes alphabetically: HR=0, IT=1, Marketing=2)
encoder = LabelEncoder()
df['Department'] = encoder.fit_transform(df['Department'])

# 4. Feature Scaling (Standardization): rescale to zero mean, unit variance
scaler = StandardScaler()
df[['Age', 'Salary']] = scaler.fit_transform(df[['Age', 'Salary']])
print("Pre-processed Dataset:\n", df)
Output:
Experiment 3: Association Rule Mining with the Apriori Algorithm
# Experiment: frequent-itemset mining and association-rule generation
# with the Apriori algorithm (mlxtend).
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# 1. Sample Transactional Dataset (one-hot encoded: 1 = item in basket)
data = {'Milk': [1, 0, 1, 1, 0],
        'Bread': [1, 1, 1, 0, 1],
        'Butter': [0, 1, 1, 1, 1],
        'Eggs': [1, 0, 0, 1, 1]}
df = pd.DataFrame(data)

# 2. Apply Apriori: keep itemsets present in at least 40% of transactions
frequent_itemsets = apriori(df, min_support=0.4, use_colnames=True)

# 3. Generate Association Rules with confidence >= 0.7
rules = association_rules(frequent_itemsets, metric="confidence",
                          min_threshold=0.7)
print("Association Rules:\n", rules[['antecedents', 'consequents',
                                     'support', 'confidence', 'lift']])
Output:
# Experiment: compare a linear SVM and Gaussian Naive Bayes on the
# breast-cancer dataset, reporting per-class metrics, 5-fold cross-validation
# accuracy, and a heatmap of each confusion matrix.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Using a more complex dataset for classification
data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.3, random_state=42)

# --- SVM with Detailed Metrics ---
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)
print("--- SVM Classification Report ---")
print(classification_report(y_test, y_pred,
                            target_names=data.target_names))

# Cross-Validation (run on the full dataset, not just the training split)
cv_scores = cross_val_score(svm_classifier, data.data, data.target, cv=5)
print(f"5-Fold Cross Validation Accuracy: {cv_scores.mean():.2f} "
      f"(+/- {cv_scores.std() * 2:.2f})")

# Plotting Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=data.target_names, yticklabels=data.target_names)
plt.title('SVM Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Initialize Gaussian Naïve Bayes
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)
print("--- Naïve Bayes Classification Report ---")
print(classification_report(y_test, y_pred,
                            target_names=data.target_names))

# Cross-Validation
cv_scores = cross_val_score(nb_classifier, data.data, data.target, cv=5)
print(f"5-Fold Cross Validation Accuracy: {cv_scores.mean():.2f} "
      f"(+/- {cv_scores.std() * 2:.2f})")

# Plotting Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=data.target_names, yticklabels=data.target_names)
plt.title('Naïve Bayes Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
Output:
# Experiment: three supervised-learning models on scikit-learn's built-in
# datasets — Linear Regression (California Housing), Logistic Regression and
# a Multi-Layer Perceptron (Breast Cancer) — each with feature scaling.
import pandas as pd
from sklearn.datasets import fetch_california_housing, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (mean_squared_error, r2_score,
                             accuracy_score, classification_report)

print("--- 1. Linear Regression (California Housing Dataset) ---")
# Load and prepare data
california = fetch_california_housing()
X_lin, y_lin = california.data, california.target
X_train_lin, X_test_lin, y_train_lin, y_test_lin = train_test_split(
    X_lin, y_lin, test_size=0.2, random_state=42)
# Scale features
scaler_lin = StandardScaler()
X_train_lin_scaled = scaler_lin.fit_transform(X_train_lin)
X_test_lin_scaled = scaler_lin.transform(X_test_lin)
# Train and evaluate
linear_model = LinearRegression()
linear_model.fit(X_train_lin_scaled, y_train_lin)
y_pred_lin = linear_model.predict(X_test_lin_scaled)
print(f"Mean Squared Error (MSE): "
      f"{mean_squared_error(y_test_lin, y_pred_lin):.4f}")
print(f"R-squared Score: {r2_score(y_test_lin, y_pred_lin):.4f}\n")

print("--- 2. Logistic Regression (Breast Cancer Dataset) ---")
# Load and prepare data
cancer_data = load_breast_cancer()
X_clf, y_clf = cancer_data.data, cancer_data.target
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42)
# Scale features (critical for Logistic Regression and Neural Networks)
scaler_clf = StandardScaler()
X_train_clf_scaled = scaler_clf.fit_transform(X_train_clf)
X_test_clf_scaled = scaler_clf.transform(X_test_clf)
# Train and evaluate (max_iter raised so the solver converges)
logistic_model = LogisticRegression(random_state=42, max_iter=1000)
logistic_model.fit(X_train_clf_scaled, y_train_clf)
y_pred_log = logistic_model.predict(X_test_clf_scaled)
print(f"Accuracy: {accuracy_score(y_test_clf, y_pred_log):.4f}")
print("Classification Report Overview:")
print(classification_report(y_test_clf, y_pred_log,
                            target_names=cancer_data.target_names))

print("--- 3. Neural Network (Breast Cancer Dataset) ---")
# Train and evaluate using the same scaled dataset from Part 2.
# Configuration: 2 hidden layers (64 & 32 neurons), ReLU activation,
# Adam optimizer.
ann_model = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42
)
print("Training Neural Network...")
ann_model.fit(X_train_clf_scaled, y_train_clf)
y_pred_ann = ann_model.predict(X_test_clf_scaled)
print(f"Accuracy: {accuracy_score(y_test_clf, y_pred_ann):.4f}")
print("Classification Report Overview:")
print(classification_report(y_test_clf, y_pred_ann,
                            target_names=cancer_data.target_names))
Output: