AML_code_for_m2

The document outlines a comprehensive data analysis and machine learning workflow using Python, including data loading, preprocessing, and visualization techniques. It covers handling missing values, encoding categorical variables, splitting the dataset into training and testing sets, and training various models with hyperparameter tuning using GridSearchCV. Finally, it evaluates the performance of classification and regression models with metrics such as accuracy, mean squared error, and R2 score.


# 1. Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, r2_score

# 2. Loading CSV
# Replace 'your_dataset.csv' with your actual dataset
df = pd.read_csv('your_dataset.csv')
df.head()

# Drop the identifier column, which carries no predictive signal
df.drop(columns=["Employee_ID"], inplace=True)

# Replace 'your_target_column' with the column you want to predict
target_col = 'your_target_column'

# 3. Dataset Overview
print(df.head())
print(df.info())
print(df.describe())
print("Shape of the dataset:", df.shape)

# 4. Handling Missing Values
print("Missing values:\n", df.isnull().sum())

# Mean imputation for numerical columns
df.fillna(df.mean(numeric_only=True), inplace=True)

# Constant imputation for categorical columns
df["Mental_Health_Condition"] = df["Mental_Health_Condition"].fillna("Unknown")
df["Physical_Activity"] = df["Physical_Activity"].fillna("Unknown")

df.isnull().sum()
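
# (Optional) The same imputation can be written with scikit-learn's
# SimpleImputer, which fits into Pipelines; a minimal sketch under the same
# column assumptions:
# from sklearn.impute import SimpleImputer
# num_cols = df.select_dtypes(include=['int64', 'float64']).columns
# df[num_cols] = SimpleImputer(strategy='mean').fit_transform(df[num_cols])
# cat_cols = ["Mental_Health_Condition", "Physical_Activity"]
# df[cat_cols] = SimpleImputer(strategy='constant', fill_value='Unknown').fit_transform(df[cat_cols])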

# 5. Splitting Numerical and Categorical Columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print("Numerical Columns:", numerical_cols)
print("Categorical Columns:", categorical_cols)

# 6. Plotting bar plots for Categorical columns (Univariate)
for col in categorical_cols:
    counts = df[col].value_counts()
    plt.bar(counts.index, counts.values)
    plt.title(col)
    plt.show()

# 7. Plotting pie plots for Categorical columns (Univariate)
for col in categorical_cols:
    counts = df[col].value_counts()
    plt.pie(counts.values, labels=counts.index, autopct='%1.1f%%')
    plt.title(col)
    plt.show()

# 8. Plotting histograms for Numerical columns (Univariate)
for col in numerical_cols:
    sns.histplot(df[col], kde=True)
    plt.title(f"Distribution of {col}")
    plt.show()

# 9. Plotting boxplots of Numerical columns
plt.figure(figsize=(12, 12))
sns.boxplot(data=df[numerical_cols])
plt.title("Boxplot of Numerical Columns")
plt.show()

# 10. Bivariate Plots
sns.pairplot(df, hue=target_col if target_col in categorical_cols else None)
plt.show()

# 11. Scatter Plots
for col in numerical_cols:
    if col != target_col:
        sns.scatterplot(x=df[col], y=df[target_col])
        plt.title(f"{col} vs {target_col}")
        plt.show()

# 12. Multivariate Plots
corr = df[numerical_cols].corr()
sns.heatmap(corr, annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

# 13. Value counts of Categorical columns
for col in categorical_cols:
    print(df[col].value_counts())
    print()

# 14. Ordinal Encoding
# Define ordinal columns with their category order
ordinal_columns = {
    "Stress_Level": ["Low", "Medium", "High"],
    "Productivity_Change": ["Increase", "No Change", "Decrease"]
}

# Prepare an empty DataFrame for the encoded data
ordinal_encoded_data = pd.DataFrame()

# Process each ordinal column
for col, order in ordinal_columns.items():
    if col in df.columns:
        encoder = OrdinalEncoder(categories=[order])
        # Flatten the 2D array returned by fit_transform
        ordinal_encoded_data[col] = encoder.fit_transform(df[[col]]).flatten()

# Drop the original columns and add the encoded columns
df.drop(columns=ordinal_columns.keys(), inplace=True, errors='ignore')
df = pd.concat([df, ordinal_encoded_data], axis=1)
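
# Quick sanity check (optional): each encoded column should now hold integer
# codes in the order listed above, e.g. Low=0.0, Medium=1.0, High=2.0
print(df[list(ordinal_columns.keys())].head())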

# 15. One-hot encoding
# Remaining (non-ordinal) categorical columns, excluding the target
ohe_cols = [c for c in df.select_dtypes(include=['object']).columns if c != target_col]
encoded_df = pd.get_dummies(df[ohe_cols], drop_first=True)
df.drop(columns=ohe_cols, inplace=True)
df = pd.concat([df, encoded_df], axis=1)

# 16. Separate Features (X) and Target (y)
X = df.drop(columns=[target_col])
y = df[target_col]

# Encode the target if it's categorical
if y.dtype == 'object':
    le = LabelEncoder()
    y = le.fit_transform(y)
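
# The fitted LabelEncoder can later map encoded predictions back to the
# original class labels, e.g.:
# original_labels = le.inverse_transform(y_pred)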

# 17. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
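
# For a classification target, stratifying the split keeps class proportions
# consistent across the train and test sets; an optional variant of the call
# above:
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y)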

# 18. Feature Scaling
# Fit the scaler on the training set only, to avoid leaking test-set statistics
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 19. Model Training and Hyperparameter Tuning for Classification
models = {
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
}

# 20. Define parameter grids for hyperparameter tuning
param_grids = {
    "DecisionTreeClassifier": {
        'max_depth': [5, 10, 15, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    "RandomForestClassifier": {
        'n_estimators': [50, 100, 150],
        'max_depth': [5, 10, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
}

# Create a dictionary to store the best model for each algorithm
best_models = {}

# 21. Loop through the models and parameter grids to perform GridSearchCV
for model_name, model in models.items():
    print(f"Training {model_name}...")

    # Get the parameter grid for the current model
    param_grid = param_grids[model_name]

    # Create the GridSearchCV object
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5,
                               n_jobs=-1, verbose=1)

    # Fit GridSearchCV to the training data
    grid_search.fit(X_train, y_train)

    # Store the best model in the dictionary
    best_model = grid_search.best_estimator_
    best_models[model_name] = best_model

    # Predict on the test set with the best model
    y_pred = best_model.predict(X_test)

    # Print best parameters and evaluation metrics
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best score for {model_name}: {grid_search.best_score_}")
    print(f"\n{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))

# 22. Model Training and Hyperparameter Tuning for Regression
# (This assumes the target is a continuous numeric variable; for a
# label-encoded categorical target, use the classification section above.)
models = {
    "DecisionTreeRegressor": DecisionTreeRegressor(),
    "RandomForestRegressor": RandomForestRegressor(),
}

# 23. Define parameter grids for hyperparameter tuning
param_grids = {
    "DecisionTreeRegressor": {
        "max_depth": [3, 5, 7, 10],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    },
    "RandomForestRegressor": {
        "n_estimators": [100, 200],
        "max_depth": [5, 10, None],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2],
        "bootstrap": [True, False],
    },
}

# 24. Loop through models and perform GridSearchCV
for name, model in models.items():
    print(f"\n{name} Hyperparameter Tuning:")

    # GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(estimator=model, param_grid=param_grids[name],
                               cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    # Get the best model from GridSearchCV
    best_model = grid_search.best_estimator_

    # Predict and evaluate the best model
    y_pred = best_model.predict(X_test)

    # Print the metrics
    print("Best Hyperparameters:", grid_search.best_params_)
    print("MSE:", mean_squared_error(y_test, y_pred))
    print("R2 Score:", r2_score(y_test, y_pred))
    print()
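
# MSE is in squared target units; the root mean squared error is often easier
# to interpret. One extra line that could go inside the loop above, using the
# numpy import from the top of the script:
# print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))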
