Gaurav - Data Mining Lab Assignment
29 Jun 2021
1. Linear Regression
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
linearreg=LinearRegression()
x=np.array([1.1,1.3,1.5,2,2.2,2.9,3,3.2,3.2,3.7,3.9,4,4,4.1,4.5,4.9,5.1,5.3,5.9,6,6.8,7.1,7.9,8.2,8.7,9,9.5,9.6,10.3,10.5,11.2,11.5,12.3,12.9,13.5])
y=np.array([39343,46205,37731,43525,39891,56642,60150,54445,64445,5718,63218,55794,56957,57081,61111,67938,66029,83088,81363,93940,91738,98273,101302,113812,109431,105582,116969,112635,122391,121872,127345,126756,128765,135675,139465])
x=x.reshape(-1,1)
linearreg.fit(x,y)
y_pred=linearreg.predict(x)
plt.scatter(x,y)
plt.plot(x,y_pred,color='red')
plt.show()
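A quick way to inspect the fitted line (a small addition, not part of the original listing) is to print its parameters and the R² score:
print("slope:", linearreg.coef_[0])
print("intercept:", linearreg.intercept_)
print("R^2 on the training data:", linearreg.score(x, y))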
import numpy as np
import matplotlib.pyplot as plt
linearreg=LinearRegression()
x=np.array([108,19,13,124,40,57,23,14,45,10,5,48,11,23,7,2,24,6,3,23,6,9,9,3,29,7,4,20,7,4,0,25,6,5,22,11,61,12,4,16,13,60,41,37,55,41,11,27,8,3,17,13,13,15,8,29,30,24,9,31,14,53,26])
y=np.array([392.5,46.2,15.7,422.2,119.4,170.9,56.9,77.5,214,65.3,20.9,248.1,23.5,39.6,48.8,6.6,134.9,50.9,4.4,113,14.8,48.7,52.1,13.2,103.9,77.5,11.8,98.1,27.9,38.1,0,69.2,14.6,40.3,161.5,57.2,217.6,58.1,12.6,59.6,89.9,202.4,181.3,152.8,162.8,73.4,21.3,92.6,76.1,39.9,142.1,93,31.9,32.1,55.6,133.3,194.5,137.9,87.4,209.8,95.5,244.6,187.5])
x=x.reshape(-1,1)
linearreg.fit(x,y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
y_pred=linearreg.predict(x)
plt.scatter(x,y)
plt.plot(x,y_pred,color='red')
plt.show()
2. Multiple Linear Regression
import pandas as pd
import matplotlib.pyplot as plt
data=pd.read_csv("mlr.csv")
data.head()
[output: first rows of mlr.csv with columns x1 … x6]
X = data.iloc[:,data.columns != "x1"]
Y = data.iloc[:, 0]
X
[output: DataFrame X with columns x2 … x6]
Y.head()
0 0.283
1 0.276
2 0.281
3 0.328
4 0.290
Name: x1, dtype: float64
plt.scatter(X.x2,X.x3,marker="*",color="orange")
plt.scatter(X.x2,X.x4,marker="*",color="green")
plt.scatter(X.x2,X.x5,marker="*",color="red")
plt.scatter(X.x2,X.x6,marker="*",color="orange")
plt.scatter(X.x3,X.x4,marker="*",color="green")
plt.scatter(X.x3,X.x5,marker="*",color="red")
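The listing then fits on X_train/Y_train without showing the split; a minimal version (imports and split parameters are assumptions, since the original does not show this step) would be:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
# Assumed split parameters; not shown in the original listing.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)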
model = linear_model.LinearRegression()
model.fit(X_train, Y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
Y_pred = model.predict(X_test)
Y_pred
array([0.23386571, 0.31718764, 0.33311851, 0.29940205, 0.27633325,
0.24228869, 0.23924197, 0.29401786, 0.29011389])
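A short check of the fit on the held-out rows (a sketch, not part of the original output):
from sklearn.metrics import mean_squared_error, r2_score
print("MSE:", mean_squared_error(Y_test, Y_pred))
print("R^2:", r2_score(Y_test, Y_pred))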
import pandas as pd
import numpy as np
import io
import matplotlib.pyplot as plt
from google.colab import files   # assumed: the `uploaded` dict comes from a Colab file upload
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['insurance-1.csv']))
ivar = df[['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']].copy()
# sex, smoker and region are strings; encode them so that .corr() and the matrix inverse are defined
for col in ['sex', 'smoker', 'region']:
    ivar[col] = ivar[col].astype('category').cat.codes
fig = plt.figure()
ax = fig.add_subplot(111)
csx = ax.matshow(np.linalg.inv(ivar.corr()), cmap='Blues')
fig.colorbar(csx)
ax.set_xticklabels([''] + list(ivar.columns), rotation=90)
ax.set_yticklabels([''] + list(ivar.columns))
df.head()
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
df.columns = ['age','sex','bmi','children','smoker','region','charges']
X = df.iloc[:,df.columns != "age"]
Y = df.iloc[:, 0]
X
[output: DataFrame X with columns sex, bmi, children, smoker, region, charges]
Y.head()
0 19
1 18
2 28
3 33
4 32
plt.scatter(X.sex,X.bmi,marker="*",color="green")
plt.scatter(X.sex,X.children,marker="*",color="orange")
plt.scatter(X.sex,X.charges,marker="*",color="red")
plt.scatter(X.bmi,X.charges,marker="*",color="red")
plt.scatter(X.children,X.charges,marker="*",color="green")
plt.scatter(X.smoker,X.charges,marker="*",color="orange")
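The scatter plots above only explore the data; the regression itself is never fitted in the listing. A minimal sketch, assuming the string columns (sex, smoker, region) are label-encoded first:
from sklearn.preprocessing import LabelEncoder
X_enc = X.copy()
for col in ['sex', 'smoker', 'region']:
    X_enc[col] = LabelEncoder().fit_transform(X_enc[col])   # encode categorical columns
reg = linear_model.LinearRegression()
reg.fit(X_enc, Y)                      # Y is the 'age' column selected earlier
print("coefficients:", reg.coef_)
print("intercept:", reg.intercept_)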
3. Logistic Regression
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
age=np.array([22,25,47,52,46,56,55,60,62,61,18,28,27,29,49,55,25,58,19,18,21,26,40,45,50,54,23])
bought_insurance=np.array([0,0,1,0,1,1,0,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1,0])
print(age)
print(bought_insurance)
[22 25 47 52 46 56 55 60 62 61 18 28 27 29 49 55 25 58 19 18 21 26 40 45
50 54 23]
[0 0 1 0 1 1 0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 0]
plt.scatter(age,bought_insurance,c=bought_insurance,cmap='rainbow')
plt.title("Scatter Plot Of Logistic Regression")
plt.show()
from sklearn.model_selection import train_test_split
age_train, age_test, bought_insurance_train, bought_insurance_test = train_test_split(age, bought_insurance, random_state=1)
print(age_train.shape, age_test.shape, bought_insurance_train.shape, bought_insurance_test.shape)
(20,) (7,) (20,) (7,)
log_reg = LogisticRegression()
from sklearn.metrics import mean_squared_error
mean_squared_error(age, bought_insurance)
1760.2592592592594
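The classifier itself is never trained in the listing; a minimal sketch using the split above (age reshaped to a column vector, as LogisticRegression expects 2-D input):
from sklearn.metrics import accuracy_score
log_reg.fit(age_train.reshape(-1, 1), bought_insurance_train)
pred = log_reg.predict(age_test.reshape(-1, 1))
print("Test accuracy:", accuracy_score(bought_insurance_test, pred))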
4. K-Means Algorithm
import pandas as pd
import io
df = pd.read_csv(io.BytesIO(uploaded['cars.csv']))
df.head()
[output: first rows of cars.csv with columns mpg, cylinders, cubicinches, hp, weightlbs, time-to-60, year, brand]
X = df.iloc[:, :-1].copy()   # all columns except the 'brand' label
X.head()
[output: first five rows of X]
X.describe()
[output: X.describe() shows 261 rows; several columns are stored as text, hence the numeric conversions below]
X['cylinders'] = pd.to_numeric(X['cylinders'], errors='coerce').astype('float64')
X['cubicinches'] = pd.to_numeric(X['cubicinches'], errors='coerce').astype('float64')
X['hp'] = pd.to_numeric(X['hp'], errors='coerce').astype('float64')
X['weightlbs'] = pd.to_numeric(X['weightlbs'], errors='coerce').astype('float64')
X['time-to-60'] = pd.to_numeric(X['time-to-60'], errors='coerce').astype('float64')
X['year'] = pd.to_numeric(X['year'], errors='coerce').astype('float64')
import numpy as np
X = X.replace([np.inf, -np.inf], np.nan)
print(X.isna().sum())
mean1=X['cubicinches'].mean()
X['cubicinches']=X['cubicinches'].fillna(mean1)
mean2=X['weightlbs'].mean()
X['weightlbs']=X['weightlbs'].fillna(mean2)
print(mean1,mean2)
print(X.isna().sum())
mpg 0
cylinders 0
cubicinches 2
hp 0
weightlbs 3
time-to-60 0
year 0
dtype: int64
200.9189189189189 3009.8333333333335
mpg 0
cylinders 0
cubicinches 0
hp 0
weightlbs 0
time-to-60 0
year 0
dtype: int64
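Before clustering, the choice of k can be checked with the elbow method (a sketch, not part of the original listing; the plot below uses three clusters for US/Japan/Europe):
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# assumes every column of X is numeric after the conversions and NaN filling above
wcss = []
for k in range(1, 11):
    km = KMeans(n_clusters=k, random_state=0)
    km.fit(X)
    wcss.append(km.inertia_)          # within-cluster sum of squares
plt.plot(range(1, 11), wcss, marker='o')
plt.xlabel('Number of clusters k')
plt.ylabel('WCSS')
plt.show()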
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=0)   # the fit step is missing from the listing; parameters assumed
y_kmeans = kmeans.fit_predict(X)
X = X.to_numpy()                                # integer indexing below needs a NumPy array
plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s=100, c='red', label='US')   # colour assumed
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s=100, c='blue', label='Japan')
plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1], s=100, c='green', label='Europe')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='yellow', label='Centroids')
plt.title('Clusters of car brands')
plt.legend()
plt.show()
5. Decision Tree
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import LabelEncoder          # for encoding the target
from sklearn.model_selection import train_test_split    # for train test splitting
from sklearn.tree import DecisionTreeClassifier         # for the decision tree object
from sklearn.metrics import classification_report, confusion_matrix   # for checking testing results
from sklearn.tree import plot_tree                      # for visualizing the tree
import io
df = pd.read_csv(io.BytesIO(uploaded['Iris.csv']))
df.head()
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Id 150 non-null int64
1 SepalLengthCm 150 non-null float64
2 SepalWidthCm 150 non-null float64
3 PetalLengthCm 150 non-null float64
4 PetalWidthCm 150 non-null float64
5 Species 150 non-null object
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
df.shape
(150, 6)
df.isnull().any()
Id False
SepalLengthCm False
SepalWidthCm False
PetalLengthCm False
PetalWidthCm False
Species False
dtype: bool
df['species'] = np.random.choice(2, 150)   # random binary labels used as the target for this exercise
sns.pairplot(df, hue="species", height=2.5)
sns.heatmap(df.corr(numeric_only=True))
target = df['species']
df1 = df.copy()
df1 = df1.drop(['species', 'Species'], axis=1)   # drop both label columns so only numeric features remain
X = df1
target
0 0
1 1
2 1
3 1
4 0
..
145 1
146 0
147 1
148 1
149 1
Name: species, Length: 150, dtype: int64
le = LabelEncoder()
target = le.fit_transform(target)
target
array([0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0,
0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0,
1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1])
y = target
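The split and the tree itself are not shown in the listing; a minimal sketch (split parameters are assumptions, chosen so that 30 of the 150 samples land in the test set, matching the report below):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # assumed parameters
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)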
y_pred = dtree.predict(X_test)
print("Classification report - \n", classification_report(y_test,y_pred))
Classification report -
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5, 5))
sns.heatmap(data=cm, linewidths=.5, annot=True, square=True, cmap='Blues')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(dtree.score(X_test, y_test))
plt.title(all_sample_title, size=15)
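plot_tree is imported above but never called; a minimal sketch of visualising the fitted tree:
plt.figure(figsize=(12, 6))
plot_tree(dtree, feature_names=list(X.columns), filled=True)
plt.show()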
6. Neural Networks
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
observations=1000
xs=np.random.uniform(-10,10,(observations,1))
zs=np.random.uniform(-10,10,(observations,1))
generated_inputs=np.column_stack((xs,zs))
noise=np.random.uniform(-10,10,(observations,1))
generated_target=2*xs-3*zs+5+noise
np.savez('TF_intro',input=generated_inputs,targets=generated_target)
training_data=np.load('TF_intro.npz')
input_size=2
output_size=1
models = tf.keras.Sequential([
tf.keras.layers.Dense(output_size)
])
custom_optimizer=tf.keras.optimizers.SGD(learning_rate=0.02)
models.compile(optimizer=custom_optimizer,loss='mean_squared_error')
models.fit(training_data['input'], training_data['targets'], epochs=100, verbose=1)
Epoch 1/100
32/32 [==============================] - 1s 1ms/step - loss: 60.4431
Epoch 2/100
32/32 [==============================] - 0s 1ms/step - loss: 40.1109
Epoch 3/100
32/32 [==============================] - 0s 1ms/step - loss: 39.9997
Epoch 4/100
32/32 [==============================] - 0s 1ms/step - loss: 34.7306
Epoch 5/100
32/32 [==============================] - 0s 1ms/step - loss: 36.7232
models.layers[0].get_weights()
[array([[ 2.0985565],
[-2.907345 ]], dtype=float32), array([5.32353], dtype=float32)]
weights=models.layers[0].get_weights()[0]
bias=models.layers[0].get_weights()[1]
out=training_data['targets'].round(1)
from sklearn.metrics import mean_squared_error
mean_squared_error(generated_target, out, squared=False)
0.02858235386343541
plt.scatter(np.squeeze(models.predict_on_batch(training_data['input'])), np.squeeze(training_data['targets']), c='#88c999')
plt.xlabel('Input')
plt.ylabel('Predicted Output')
plt.show()
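Since the targets were generated as 2·xs − 3·zs + 5 plus noise, the learned parameters can be compared directly with those values (using the weights and bias extracted above):
print("learned weights:", weights.flatten(), "vs. true [2, -3]")
print("learned bias:   ", bias, "vs. true [5]")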
7. Naive Bayes
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
%matplotlib inline
# Note: `classifier`, X_val/Y_val and the train_* metrics come from cells not shown in this listing.
val_predictions = classifier.predict(X_val)
val_accuracy = accuracy_score(Y_val, val_predictions)     # y_true first, then predictions
val_recall = recall_score(Y_val, val_predictions)
val_precision = precision_score(Y_val, val_predictions)
print('Model metrics')
print('Accuracy  Train: %.2f, Validation: %.2f' % (train_accuracy, val_accuracy))
print('Recall    Train: %.2f, Validation: %.2f' % (train_recall, val_recall))
print('Precision Train: %.2f, Validation: %.2f' % (train_precision, val_precision))
import io
train_data = pd.read_csv(io.StringIO(uploaded['train.csv'].decode('utf-8')))
import io
test_data = pd.read_csv(io.StringIO(uploaded['test.csv'].decode('utf-8')))
train_data['train'] = 1
test_data['train'] = 0
data = pd.concat([train_data, test_data], sort=False)   # combine train and test rows for joint preprocessing
test_ids = test_data['PassengerId'].values
data.head()
[output: first rows with columns PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked, train]
data.describe()
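The frames referenced below (train, test, X_train, X_val, and the two training halves X_train1/X_train2) are never constructed in the listing. A minimal preprocessing sketch, with feature set, encoding and split sizes as assumptions rather than the original lab's choices:
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']   # hypothetical feature subset
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})
data[features] = data[features].fillna(data[features].mean())   # impute missing numeric values
train = data[data['train'] == 1]
test = data[data['train'] == 0][features]
X_train = train[features]
Y_train = train['Survived'].astype(int)
# hold out a validation set, then split the remaining training rows into two halves
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=0)
X_train1, X_train2, Y_train1, Y_train2 = train_test_split(X_train, Y_train, test_size=0.5, random_state=0)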
train.head()
X_train.head()
classifier = GaussianNB()
classifier.fit(X_train2, Y_train2)
GaussianNB(priors=None, var_smoothing=1e-09)
classifier.partial_fit(X_train1, Y_train1)
GaussianNB(priors=None, var_smoothing=1e-09)
test.fillna(test.mean(), inplace=True)
test_predictions = classifier.predict(test)
submission = pd.DataFrame({'PassengerId': test_ids})
submission['Survived'] = test_predictions.astype('int')
submission.to_csv('submission.csv', index=False)
submission.head(10)
PassengerId Survived
0 892 0
1 893 1
2 894 0
3 895 0
4 896 1
5 897 0
6 898 1
7 899 0
8 900 1
9 901 0