0% found this document useful (0 votes)
5 views

Coding

Uploaded by

Soniya Singh
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
5 views

Coding

Uploaded by

Soniya Singh
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 9

import pandas as pd

# Load the dataset from the uploaded file
file_path = '/content/employee_attrition_data.csv'
employee_attrition_data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(employee_attrition_data.head())

# Display summary information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line
# after the report.
print("\nSummary Information of the dataset:")
employee_attrition_data.info()

# Calculate basic statistics for numerical columns
print("\nBasic Statistics of the dataset:")
print(employee_attrition_data.describe())

First few rows of the dataset:


   Employee_ID  Age  Gender   Department Job_Title  Years_at_Company  \
0            0   27    Male    Marketing   Manager                 9
1            1   53  Female        Sales  Engineer                10
2            2   59  Female    Marketing   Analyst                 8
3            3   42  Female  Engineering   Manager                 1
4            4   44  Female        Sales  Engineer                10

   Satisfaction_Level  Average_Monthly_Hours  Promotion_Last_5Years  Salary  \
0            0.586251                    151                      0   60132
1            0.261161                    221                      1   79947
2            0.304382                    184                      0   46958
3            0.480779                    242                      0   40662
4            0.636244                    229                      1   74307

   Attrition
0          0
1          0
2          1
3          0
4          0
Summary Information of the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Employee_ID 1000 non-null int64
1 Age 1000 non-null int64
2 Gender 1000 non-null object
3 Department 1000 non-null object
4 Job_Title 1000 non-null object
5 Years_at_Company 1000 non-null int64
6 Satisfaction_Level 1000 non-null float64
7 Average_Monthly_Hours 1000 non-null int64
8 Promotion_Last_5Years 1000 non-null int64
9 Salary 1000 non-null int64
10 Attrition 1000 non-null int64
dtypes: float64(1), int64(7), object(3)
memory usage: 86.1+ KB
None

Basic Statistics of the dataset:


Employee_ID Age Years_at_Company Satisfaction_Level
\
count 1000.000000 1000.000000 1000.000000 1000.000000

mean 499.500000 42.205000 5.605000 0.505995

std 288.819436 10.016452 2.822223 0.289797

min 0.000000 25.000000 1.000000 0.001376

25% 249.750000 33.000000 3.000000 0.258866

50% 499.500000 43.000000 6.000000 0.505675

75% 749.250000 51.000000 8.000000 0.761135

max 999.000000 59.000000 10.000000 0.999979

Average_Monthly_Hours Promotion_Last_5Years Salary


Attrition
count 1000.000000 1000.000000 1000.000000
1000.000000
mean 199.493000 0.486000 64624.980000
0.495000
std 29.631908 0.500054 20262.984333
0.500225
min 150.000000 0.000000 30099.000000
0.000000
25% 173.000000 0.000000 47613.500000
0.000000
50% 201.000000 0.000000 64525.000000
0.000000
75% 225.000000 1.000000 81921.000000
1.000000
max 249.000000 1.000000 99991.000000
1.000000

import pandas as pd

# Read the raw CSV into a DataFrame
file_path = '/content/employee_attrition_data.csv'
employee_attrition_data = pd.read_csv(file_path)

# Count the null entries in every column and report them
null_mask = employee_attrition_data.isna()
missing_values = null_mask.sum()
print("Missing values in each column:", missing_values, sep="\n")

# Expand each categorical column into one indicator column per category
categorical_columns = ['Gender', 'Department', 'Job_Title']
encoded_data = pd.get_dummies(
    employee_attrition_data,
    columns=categorical_columns,
)

# Preview the encoded frame
print("First few rows of the encoded dataset:", encoded_data.head(), sep="\n")

Missing values in each column:


Employee_ID 0
Age 0
Gender 0
Department 0
Job_Title 0
Years_at_Company 0
Satisfaction_Level 0
Average_Monthly_Hours 0
Promotion_Last_5Years 0
Salary 0
Attrition 0
dtype: int64
First few rows of the encoded dataset:
Employee_ID Age Years_at_Company Satisfaction_Level \
0 0 27 9 0.586251
1 1 53 10 0.261161
2 2 59 8 0.304382
3 3 42 1 0.480779
4 4 44 10 0.636244
Average_Monthly_Hours Promotion_Last_5Years Salary Attrition \
0 151 0 60132 0
1 221 1 79947 0
2 184 0 46958 1
3 242 0 40662 0
4 229 1 74307 0

Gender_Female Gender_Male Department_Engineering


Department_Finance \
0 False True False
False
1 True False False
False
2 True False False
False
3 True False True
False
4 True False False
False

Department_HR Department_Marketing Department_Sales \


0 False True False
1 False False True
2 False True False
3 False False False
4 False False True

Job_Title_Accountant Job_Title_Analyst Job_Title_Engineer \


0 False False False
1 False False True
2 False True False
3 False False False
4 False False True

Job_Title_HR Specialist Job_Title_Manager


0 False True
1 False False
2 False False
3 False True
4 False False

import matplotlib.pyplot as plt


import seaborn as sns

# Summary statistics of the encoded frame (describe() defaults)
summary_statistics = encoded_data.describe()
print("Summary Statistics:", summary_statistics, sep="\n")

# Histograms with a KDE overlay, one panel per numerical variable
histogram_specs = [
    ('Age', 'Age Distribution'),
    ('Satisfaction_Level', 'Satisfaction Level Distribution'),
    ('Salary', 'Salary Distribution'),
]
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for axis, (column, title) in zip(axes, histogram_specs):
    sns.histplot(encoded_data[column], kde=True, ax=axis)
    axis.set_title(title)
plt.show()

# Count plots use the un-encoded frame, where the categories are still
# single columns
countplot_specs = [
    ('Department', 'Department Count'),
    ('Job_Title', 'Job Title Count'),
]
fig, axes = plt.subplots(1, 2, figsize=(18, 5))
for axis, (column, title) in zip(axes, countplot_specs):
    sns.countplot(data=employee_attrition_data, x=column, ax=axis)
    axis.set_title(title)
plt.show()

# Pairwise correlations between the columns of the encoded frame
correlation_matrix = encoded_data.corr()

# Annotated heatmap of the correlation matrix
heatmap_options = {
    'annot': True,
    'cmap': 'coolwarm',
    'fmt': '.2f',
    'linewidths': 0.5,
}
plt.figure(figsize=(16, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix')
plt.show()

Summary Statistics:
Employee_ID Age Years_at_Company Satisfaction_Level
\
count 1000.000000 1000.000000 1000.000000 1000.000000

mean 499.500000 42.205000 5.605000 0.505995

std 288.819436 10.016452 2.822223 0.289797

min 0.000000 25.000000 1.000000 0.001376

25% 249.750000 33.000000 3.000000 0.258866

50% 499.500000 43.000000 6.000000 0.505675

75% 749.250000 51.000000 8.000000 0.761135


max 999.000000 59.000000 10.000000 0.999979

Average_Monthly_Hours Promotion_Last_5Years Salary


Attrition
count 1000.000000 1000.000000 1000.000000
1000.000000
mean 199.493000 0.486000 64624.980000
0.495000
std 29.631908 0.500054 20262.984333
0.500225
min 150.000000 0.000000 30099.000000
0.000000
25% 173.000000 0.000000 47613.500000
0.000000
50% 201.000000 0.000000 64525.000000
0.000000
75% 225.000000 1.000000 81921.000000
1.000000
max 249.000000 1.000000 99991.000000
1.000000
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Select features for clustering (excluding the target variable 'Attrition'
# and the identifier 'Employee_ID'). 'Cluster' is dropped as well, if present,
# so that re-running this cell does not feed the previous labels back in as a
# feature.
features = encoded_data.drop(
    columns=['Employee_ID', 'Attrition', 'Cluster'],
    errors='ignore',
)

# K-means groups points by Euclidean distance, so features on large scales
# (Salary is in the tens of thousands) would swamp features on small scales
# (Satisfaction_Level lies between 0 and 1). Standardize every column to zero
# mean and unit variance first; the boolean indicator columns are cast to
# float so the whole matrix is numeric.
scaled_features = StandardScaler().fit_transform(features.astype(float))

# Apply K-means clustering. n_init is set explicitly to the current default
# (10) to silence the FutureWarning about the default changing to 'auto'.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
encoded_data['Cluster'] = kmeans.fit_predict(scaled_features)

# Visualize the clusters on two of the original (unscaled) features
plt.figure(figsize=(12, 6))
sns.scatterplot(
    data=encoded_data,
    x='Satisfaction_Level',
    y='Average_Monthly_Hours',
    hue='Cluster',
    palette='viridis',
)
plt.title('K-means Clustering of Employees')
plt.show()

/usr/local/lib/python3.10/dist-packages/sklearn/cluster/
_kmeans.py:870: FutureWarning: The default value of `n_init` will
change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly
to suppress the warning
warnings.warn(

from sklearn.model_selection import train_test_split


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Build the design matrix and target: everything except the identifier,
# the label itself and the cluster assignment is used as a predictor
non_feature_columns = ['Employee_ID', 'Attrition', 'Cluster']
X = encoded_data.drop(columns=non_feature_columns)
y = encoded_data['Attrition']

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic regression classifier on the training split
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = logreg.predict(X_test)

# Score the predictions against the true test labels
classification_report_logreg = classification_report(y_test, y_pred)
confusion_matrix_logreg = confusion_matrix(y_test, y_pred)

# Print each result under its heading
for heading, result in (
    ("Classification Report:", classification_report_logreg),
    ("\nConfusion Matrix:", confusion_matrix_logreg),
):
    print(heading)
    print(result)

Classification Report:
precision recall f1-score support

0 0.51 0.59 0.55 102


1 0.49 0.41 0.44 98

accuracy 0.50 200


macro avg 0.50 0.50 0.49 200
weighted avg 0.50 0.50 0.50 200

Confusion Matrix:
[[60 42]
[58 40]]

You might also like