Bi 6 New
Bi 6 New
1. Classification
In [1]: import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn import metrics
In [2]: df=pd.read_csv('emails.csv')
In [3]: df.head()  # preview the first five rows (pandas' default n=5)
Out[3]:
Email
the to ect and for of a you hou ... connevey jay valued lay infrastructure military allowin
No.
Email
0 0 0 1 0 0 0 2 0 0 ... 0 0 0 0 0 0
1
Email
1 8 13 24 6 6 2 102 1 27 ... 0 0 0 0 0 0
2
Email
2 0 0 1 0 0 0 8 0 0 ... 0 0 0 0 0 0
3
Email
3 0 5 22 0 5 1 51 2 10 ... 0 0 0 0 0 0
4
Email
4 7 6 17 1 5 2 57 0 9 ... 0 0 0 0 0 0
5
In [4]: df.tail()  # preview the last five rows (pandas' default n=5)
Out[4]:
Email
the to ect and for of a you hou ... connevey jay valued lay infrastructure military allo
No.
Email
5167 2 2 2 3 0 0 32 0 0 ... 0 0 0 0 0 0
5168
Email
5168 35 27 11 2 6 5 151 4 3 ... 0 0 0 0 0 0
5169
Email
5169 0 0 1 1 0 0 11 0 0 ... 0 0 0 0 0 0
5170
Email
5170 2 7 1 0 2 1 28 2 0 ... 0 0 0 0 0 0
5171
Email
5171 22 24 5 1 6 5 148 8 2 ... 0 0 0 0 0 0
5172
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB
In [6]: df.describe()  # summary statistics; the rendered table lists only the numeric word columns
Out[6]:
the to ect and for of a you
In [7]: df.columns  # column labels: 'Email No.', the word columns, then 'Prediction'
Out[7]: Index(['Email No.', 'the', 'to', 'ect', 'and', 'for', 'of', 'a', 'you', 'hou',
...
'connevey', 'jay', 'valued', 'lay', 'infrastructure', 'military',
'allowing', 'ff', 'dry', 'Prediction'],
dtype='object', length=3002)
In [8]: df.dtypes  # dtype of each column
In [9]: df.size  # total number of cells, i.e. rows * columns (5172 * 3002 = 15526344)
Out[9]: 15526344
In [10]: df.isna().sum()  # number of missing values in each column
Out[10]: Email No. 0
the 0
to 0
ect 0
and 0
..
military 0
allowing 0
ff 0
dry 0
Prediction 0
Length: 3002, dtype: int64
In [11]: df.dropna(inplace=True)  # drops rows containing NaN, modifying df in place; appears to remove nothing here (5172 rows before, and y still has 5172 entries afterwards)
In [13]: X = df.drop(['Prediction'],axis = 1)
X
Out[13]:
the to ect and for of a you hou in ... enhancements connevey jay valued lay infrastructure
0 0 0 1 0 0 0 2 0 0 0 ... 0 0 0 0 0 0
1 8 13 24 6 6 2 102 1 27 18 ... 0 0 0 0 0 0
2 0 0 1 0 0 0 8 0 0 4 ... 0 0 0 0 0 0
3 0 5 22 0 5 1 51 2 10 1 ... 0 0 0 0 0 0
4 7 6 17 1 5 2 57 0 9 3 ... 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5167 2 2 2 3 0 0 32 0 0 5 ... 0 0 0 0 0 0
5169 0 0 1 1 0 0 11 0 0 1 ... 0 0 0 0 0 0
5170 2 7 1 0 2 1 28 2 0 8 ... 0 0 0 0 0 0
In [14]: y = df['Prediction']
y
Out[14]: 0 0
1 0
2 0
3 0
4 0
..
5167 0
5168 0
5169 1
5170 1
5171 0
Name: Prediction, Length: 5172, dtype: int64
In [17]: print("Prediction",y_pred)  # NOTE(review): y_pred is assigned in a cell not shown here - presumably the first classifier's predictions on x_test; verify
Prediction [0 0 1 ... 1 1 1]
Confusion Matrix:
[[804 293]
[ 16 439]]
Classification Report:
precision recall f1-score support
SVM Classifier
In [24]: from sklearn.svm import SVC
model=SVC(C=1)
model.fit(x_train,y_train)
y_pred=model.predict(x_test)
Confusion Matrix:
[[1091 6]
[ 90 365]]
In [26]: print("SVM accuracy: ",metrics.accuracy_score(y_test,y_pred))
In [33]: # Clustering
kmeans = KMeans(n_clusters=3, random_state=42)
labels = kmeans.fit_predict(X)