ML Project 2: HR Analytics Employee Attrition
import pandas as pd

df = pd.read_csv("HR_Analytics.csv")
df.head()
[5 rows x 38 columns]
df.isnull().sum()
EmpID 0
Age 0
AgeGroup 0
Attrition 0
BusinessTravel 0
DailyRate 0
Department 0
DistanceFromHome 0
Education 0
EducationField 0
EmployeeCount 0
EmployeeNumber 0
EnvironmentSatisfaction 0
Gender 0
HourlyRate 0
JobInvolvement 0
JobLevel 0
JobRole 0
JobSatisfaction 0
MaritalStatus 0
MonthlyIncome 0
SalarySlab 0
MonthlyRate 0
NumCompaniesWorked 0
Over18 0
OverTime 0
PercentSalaryHike 0
PerformanceRating 0
RelationshipSatisfaction 0
StandardHours 0
StockOptionLevel 0
TotalWorkingYears 0
TrainingTimesLastYear 0
WorkLifeBalance 0
YearsAtCompany 0
YearsInCurrentRole 0
YearsSinceLastPromotion 0
YearsWithCurrManager 57
dtype: int64
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1480 entries, 0 to 1479
Data columns (total 38 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 EmpID 1480 non-null object
1 Age 1480 non-null int64
2 AgeGroup 1480 non-null object
3 Attrition 1480 non-null object
4 BusinessTravel 1480 non-null object
5 DailyRate 1480 non-null int64
6 Department 1480 non-null object
7 DistanceFromHome 1480 non-null int64
8 Education 1480 non-null int64
9 EducationField 1480 non-null object
10 EmployeeCount 1480 non-null int64
11 EmployeeNumber 1480 non-null int64
12 EnvironmentSatisfaction 1480 non-null int64
13 Gender 1480 non-null object
14 HourlyRate 1480 non-null int64
15 JobInvolvement 1480 non-null int64
16 JobLevel 1480 non-null int64
17 JobRole 1480 non-null object
18 JobSatisfaction 1480 non-null int64
19 MaritalStatus 1480 non-null object
20 MonthlyIncome 1480 non-null int64
21 SalarySlab 1480 non-null object
22 MonthlyRate 1480 non-null int64
23 NumCompaniesWorked 1480 non-null int64
24 Over18 1480 non-null object
25 OverTime 1480 non-null object
26 PercentSalaryHike 1480 non-null int64
27 PerformanceRating 1480 non-null int64
28 RelationshipSatisfaction 1480 non-null int64
29 StandardHours 1480 non-null int64
30 StockOptionLevel 1480 non-null int64
31 TotalWorkingYears 1480 non-null int64
32 TrainingTimesLastYear 1480 non-null int64
33 WorkLifeBalance 1480 non-null int64
34 YearsAtCompany 1480 non-null int64
35 YearsInCurrentRole 1480 non-null int64
36 YearsSinceLastPromotion 1480 non-null int64
37 YearsWithCurrManager 1423 non-null float64
dtypes: float64(1), int64(25), object(12)
memory usage: 439.5+ KB
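The cell that produced the 48-column encoded frame shown next is not included above; a minimal sketch of the likely preprocessing, assuming median imputation for the missing YearsWithCurrManager values, dropping of the identifier/constant columns, label encoding for the binary columns, and one-hot encoding with drop_first=True for the remaining nominal columns:

from sklearn.preprocessing import LabelEncoder

# Impute the 57 missing YearsWithCurrManager values (assumption: median imputation)
df['YearsWithCurrManager'] = df['YearsWithCurrManager'].fillna(df['YearsWithCurrManager'].median())

# Drop identifier and constant columns that do not appear in the encoded frame below
df = df.drop(columns=['EmpID', 'EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'])

# Label-encode the binary columns (Attrition and Gender appear as integer dtypes below)
df['Attrition'] = LabelEncoder().fit_transform(df['Attrition'])  # No -> 0, Yes -> 1
df['Gender'] = LabelEncoder().fit_transform(df['Gender'])        # Female -> 0, Male -> 1

# One-hot encode the remaining nominal columns, dropping the first level of each
df = pd.get_dummies(df, columns=['BusinessTravel', 'Department', 'EducationField',
                                 'JobRole', 'MaritalStatus', 'OverTime'], drop_first=True)

df.head()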
OverTime_Yes
0 0
1 0
2 1
3 0
4 0
[5 rows x 48 columns]
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1480 entries, 0 to 1479
Data columns (total 48 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Age 1480 non-null float64
1 AgeGroup 1480 non-null object
2 Attrition 1480 non-null int32
3 DailyRate 1480 non-null float64
4 DistanceFromHome 1480 non-null float64
5 Education 1480 non-null int64
6 EnvironmentSatisfaction 1480 non-null int64
7 Gender 1480 non-null int64
8 HourlyRate 1480 non-null float64
9 JobInvolvement 1480 non-null int64
10 JobLevel 1480 non-null int64
11 JobSatisfaction 1480 non-null int64
12 MonthlyIncome 1480 non-null float64
13 SalarySlab 1480 non-null object
14 MonthlyRate 1480 non-null float64
15 NumCompaniesWorked 1480 non-null int64
16 PercentSalaryHike 1480 non-null int64
17 PerformanceRating 1480 non-null int64
18 RelationshipSatisfaction 1480 non-null int64
19 StockOptionLevel 1480 non-null int64
20 TotalWorkingYears 1480 non-null int64
21 TrainingTimesLastYear 1480 non-null int64
22 WorkLifeBalance 1480 non-null int64
23 YearsAtCompany 1480 non-null float64
24 YearsInCurrentRole 1480 non-null int64
25 YearsSinceLastPromotion 1480 non-null int64
26 YearsWithCurrManager 1480 non-null float64
27 BusinessTravel_TravelRarely 1480 non-null uint8
28 BusinessTravel_Travel_Frequently 1480 non-null uint8
29 BusinessTravel_Travel_Rarely 1480 non-null uint8
30 Department_Research & Development 1480 non-null uint8
31 Department_Sales 1480 non-null uint8
32 EducationField_Life Sciences 1480 non-null uint8
33 EducationField_Marketing 1480 non-null uint8
34 EducationField_Medical 1480 non-null uint8
35 EducationField_Other 1480 non-null uint8
36 EducationField_Technical Degree 1480 non-null uint8
37 JobRole_Human Resources 1480 non-null uint8
38 JobRole_Laboratory Technician 1480 non-null uint8
39 JobRole_Manager 1480 non-null uint8
40 JobRole_Manufacturing Director 1480 non-null uint8
41 JobRole_Research Director 1480 non-null uint8
42 JobRole_Research Scientist 1480 non-null uint8
43 JobRole_Sales Executive 1480 non-null uint8
44 JobRole_Sales Representative 1480 non-null uint8
45 MaritalStatus_Married 1480 non-null uint8
46 MaritalStatus_Single 1480 non-null uint8
47 OverTime_Yes 1480 non-null uint8
dtypes: float64(8), int32(1), int64(16), object(2), uint8(21)
memory usage: 336.9+ KB
# Encoding Order
# 1) SalarySlab ("Upto 5k", "5k-10k", etc.)
#    "Upto 5k" → 0
#    "5k-10k"  → 1
#    "10k-15k" → 2
#    "15k+"    → 3
# 2) AgeGroup ("26-35", "36-45", etc.)
#    "26-35" → 1
#    "36-45" → 2
#    "46-55" → 3
#    "55+"   → 4
import matplotlib.pyplot as plt
import seaborn as sns

# Countplot of Attrition
plt.figure(figsize=(6, 4))
sns.countplot(x=df['Attrition'], palette="coolwarm")
plt.title("Employee Attrition Count")
plt.xlabel("Attrition (0 = No, 1 = Yes)")
plt.ylabel("Count")
plt.show()
plt.figure(figsize=(8, 5))
sns.scatterplot(x=df['Age'], y=df['MonthlyIncome'],
hue=df['Attrition'], alpha=0.7, palette="coolwarm")
plt.title("Age vs Monthly Income (Colored by Attrition)")
plt.xlabel("Age")
plt.ylabel("Monthly Income")
plt.legend(title="Attrition", labels=["No", "Yes"])
plt.show()
plt.figure(figsize=(6, 4))
sns.barplot(x=df['SalarySlab'], y=df['Attrition'], palette="coolwarm")
plt.title("Attrition Rate by Salary Slab")
plt.xlabel("Salary Slab (Encoded)")
plt.ylabel("Attrition Rate")
plt.show()
plt.figure(figsize=(6, 4))
sns.boxplot(x=df['WorkLifeBalance'], y=df['Attrition'],
palette="coolwarm")
plt.title("Work-Life Balance vs Attrition")
plt.xlabel("Work-Life Balance (1 = Worst, 4 = Best)")
plt.ylabel("Attrition")
plt.show()
plt.figure(figsize=(8,5))
sns.boxplot(x=df['Attrition'], y=df['YearsAtCompany'], palette='Set2')
plt.title("Attrition vs Years at Company")
plt.xlabel("Attrition (0 = No, 1 = Yes)")
plt.ylabel("Years at Company")
plt.show()
plt.figure(figsize=(8,5))
sns.histplot(df['MonthlyIncome'], bins=30, kde=True, color='purple')
plt.title("Distribution of Monthly Income")
plt.xlabel("Monthly Income")
plt.ylabel("Count")
plt.show()
plt.figure(figsize=(8,6))
sns.scatterplot(x=df['YearsAtCompany'], y=df['MonthlyIncome'],
hue=df['Attrition'], alpha=0.6, palette='coolwarm')
plt.title("Years at Company vs Monthly Income (Colored by Attrition)")
plt.xlabel("Years at Company")
plt.ylabel("Monthly Income")
plt.show()
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = df.drop(columns=['Attrition'])  # Features
y = df['Attrition']                 # Target

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVM": SVC(kernel='linear', probability=True)
}
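The loop that produced the per-model results below is not shown; a minimal sketch, assuming each model is fit on the scaled training data and evaluated on the held-out test set:

# Train and evaluate each model (assumed evaluation loop)
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Model:", name)
    print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("-" * 50)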
Confusion Matrix:
[[242 6]
[ 19 29]]
--------------------------------------------------
Model: Random Forest
Accuracy: 0.8682
Classification Report:
precision recall f1-score support
Confusion Matrix:
[[247 1]
[ 38 10]]
--------------------------------------------------
Model: Decision Tree
Accuracy: 0.8209
Classification Report:
precision recall f1-score support
Confusion Matrix:
[[232 16]
[ 37 11]]
--------------------------------------------------
Model: SVM
Accuracy: 0.9088
Classification Report:
precision recall f1-score support
Confusion Matrix:
[[243 5]
[ 22 26]]
--------------------------------------------------
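Neither the SMOTE resampling that defines X_train_resampled / y_train_resampled nor the logistic-regression param_grid appears in the cells above; a minimal sketch of both, with the grid values being assumptions:

# Rebalance the training set with SMOTE (step not shown above)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Hyperparameter grid for the tuned logistic regression (assumed values)
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2']
}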
log_reg = LogisticRegression(class_weight='balanced',
solver='liblinear')
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train_resampled, y_train_resampled)
param_grid_svm = {
'C': [0.1, 1, 10],
'kernel': ['linear', 'rbf']
}
svm = SVC(class_weight='balanced', probability=True)  # probability=True is required for the soft-voting ensemble below
grid_search_svm = GridSearchCV(svm, param_grid_svm, cv=5,
scoring='f1')
grid_search_svm.fit(X_train_resampled, y_train_resampled)
param_grid_rf = {
'n_estimators': [50, 100, 200],
'max_depth': [None, 10, 20],
'min_samples_split': [2, 5, 10]
}
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='f1')
grid_search_rf.fit(X_train_resampled, y_train_resampled)
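The lr estimator passed to the VotingClassifier below is never defined in the cells shown; presumably the tuned models are pulled out of the grid searches, e.g.:

# Retrieve the tuned estimators from the grid searches (assumed step)
lr = grid_search.best_estimator_
svm = grid_search_svm.best_estimator_
rf = grid_search_rf.best_estimator_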
# Create ensemble
ensemble = VotingClassifier(estimators=[
('log_reg', lr),
('svm', svm)
], voting='soft', weights=[2, 1]) # Give LR more importance
# Soft voting uses probabilities
# Train & evaluate
ensemble.fit(X_train, y_train)
y_pred_ensemble = ensemble.predict(X_test)
# Report
print("accuracy ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred_ensemble))
accuracy 0.8581081081081081
precision recall f1-score support
# Report
print("Accuracy:", accuracy_score(y_test, y_pred_ensemble))
print(classification_report(y_test, y_pred_ensemble))
Accuracy: 0.8614864864864865
precision recall f1-score support
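The y_pred_adjusted predictions evaluated next are not defined in the cells shown; presumably a custom decision threshold was applied to the ensemble's predicted probabilities, e.g.:

# Apply a lower decision threshold to the ensemble probabilities (assumed step; 0.4 is illustrative)
y_prob_ensemble = ensemble.predict_proba(X_test)[:, 1]
y_pred_adjusted = (y_prob_ensemble >= 0.4).astype(int)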
# New Evaluation
print("Adjusted Accuracy:", accuracy_score(y_test, y_pred_adjusted))
print(classification_report(y_test, y_pred_adjusted))