MACHINE LEARNING LAB MANUAL
Implement and demonstrate the FIND-S algorithm for finding the most specific hypothesis based on a given set of training data samples.
Read the training data from a .CSV file.
import random
import csv

attributes = [['Sunny','Rainy'],
              ['Warm','Cold'],
              ['Normal','High'],
              ['Strong','Weak'],
              ['Warm','Cool'],
              ['Same','Change']]
num_attributes = len(attributes)

print("\n The most general hypothesis : ['?','?','?','?','?','?']\n")
print("\n The most specific hypothesis : ['0','0','0','0','0','0']\n")

a = []
print("\n The Given Training Data Set \n")
# read the training data from the CSV file (file name assumed; adjust to your data set)
with open('enjoysport.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        a.append(row)
        print(row)

print("\n The initial value of hypothesis: ")
hypothesis = ['0'] * num_attributes
print(hypothesis)

# Find-S: start from the first training example, then generalise on every positive example
for j in range(0, num_attributes):
    hypothesis[j] = a[0][j]

for i in range(0, len(a)):
    if a[i][num_attributes] == 'Yes':
        for j in range(0, num_attributes):
            if a[i][j] != hypothesis[j]:
                hypothesis[j] = '?'
            else:
                hypothesis[j] = a[i][j]
    print(" For Training Example No :{0} the hypothesis is ".format(i), hypothesis)

print("\n The Maximally Specific Hypothesis for a given Training Examples :\n")
print(hypothesis)
OUTPUT :
The most general hypothesis : ['?','?','?','?','?','?']
import random
import csv
def g_0(n):
    return ("?",) * n

def s_0(n):
    return ('0',) * n
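The program below also relies on four helper functions (more_general, fulfills, min_generalizations and min_specializations) that are not reproduced in this scan; a minimal sketch of the definitions it assumes:

def more_general(h1, h2):
    # True when hypothesis h1 is more general than (or equal to) h2
    more_general_parts = []
    for x, y in zip(h1, h2):
        mg = x == "?" or (x != "0" and (x == y or y == "0"))
        more_general_parts.append(mg)
    return all(more_general_parts)

def fulfills(example, hypothesis):
    # an example fulfills a hypothesis when the hypothesis is more general than the example
    return more_general(hypothesis, example)

def min_generalizations(h, x):
    # smallest generalization of h that covers the positive example x
    h_new = list(h)
    for i in range(len(h)):
        if not fulfills(x[i:i+1], h[i:i+1]):
            h_new[i] = '?' if h[i] != '0' else x[i]
    return [tuple(h_new)]

def min_specializations(h, domains, x):
    # smallest specializations of h that exclude the negative example x
    results = []
    for i in range(len(h)):
        if h[i] == "?":
            for val in domains[i]:
                if x[i] != val:
                    h_new = h[:i] + (val,) + h[i+1:]
                    results.append(h_new)
        elif h[i] != "0":
            h_new = h[:i] + ('0',) + h[i+1:]
            results.append(h_new)
    return results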
min_specializations(h=('?', 'x'),
                    domains=[['a', 'b', 'c'], ['x', 'y']],
                    x=('b', 'x'))
def get_domains(examples):
    d = [set() for i in examples[0]]
    for x in examples:
        for i, xi in enumerate(x):
            d[i].add(xi)
    return [list(sorted(x)) for x in d]
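The training data this notebook operates on is not shown; a sketch of the classic EnjoySport examples, consistent with the domains printed below:

examples = [
    ('Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same', 'Y'),
    ('Sunny', 'Warm', 'High',   'Strong', 'Warm', 'Same', 'Y'),
    ('Rainy', 'Cold', 'High',   'Strong', 'Warm', 'Change', 'N'),
    ('Sunny', 'Warm', 'High',   'Strong', 'Cool', 'Change', 'Y'),
]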
get_domains(examples)
[['Rainy', 'Sunny'],
['Cold', 'Warm'],
['High', 'Normal'],
['Strong'],
['Cool', 'Warm'],
['Change', 'Same'],
['N', 'Y']]
def candidate_elimination(examples):
    domains = get_domains(examples)[:-1]
    G = set([g_0(len(domains))])
    S = set([s_0(len(domains))])
    i = 0
    print("\n G[{0}]:".format(i), G)
    print("\n S[{0}]:".format(i), S)
    for xcx in examples:
        i = i + 1
        x, cx = xcx[:-1], xcx[-1]  # splitting data into attributes and decision
        if cx == 'Y':  # x is a positive example
            G = {g for g in G if fulfills(x, g)}
            S = generalize_S(x, G, S)
        else:  # x is a negative example
            S = {s for s in S if not fulfills(x, s)}
            G = specialize_G(x, domains, G, S)
        print("\n G[{0}]:".format(i), G)
        print("\n S[{0}]:".format(i), S)
    return
def generalize_S(x, G, S):
    S_prev = list(S)
    for s in S_prev:
        if s not in S:
            continue
        if not fulfills(x, s):
            S.remove(s)
            Splus = min_generalizations(s, x)
            ## keep only generalizations that have a counterpart in G
            S.update([h for h in Splus if any([more_general(g, h)
                                               for g in G])])
            ## remove hypotheses less specific than any other in S
            S.difference_update([h for h in S if
                                 any([more_general(h, h1)
                                      for h1 in S if h != h1])])
    return S
def specialize_G(x, domains, G, S):
    G_prev = list(G)
    for g in G_prev:
        if g not in G:
            continue
        if fulfills(x, g):
            G.remove(g)
            Gminus = min_specializations(g, domains, x)
            ## keep only specializations that have a counterpart in S
            G.update([h for h in Gminus if any([more_general(h, s)
                                                for s in S])])
            ## remove hypotheses less general than any other in G
            G.difference_update([h for h in G if
                                 any([more_general(g1, h)
                                      for g1 in G if h != g1])])
    return G
candidate_elimination(examples)
G[0]: {('?', '?', '?', '?', '?', '?')}
G[3]: {('Sunny', '?', '?', '?', '?', '?'), ('?', 'Warm', '?', '?', '?', '?'), ('?', '?', '?', '?', '?',
'Same')}
G[4]: {('Sunny', '?', '?', '?', '?', '?'), ('?', 'Warm', '?', '?', '?', '?')}
import pandas as pd
df_tennis = pd.read_csv('C:\\Users\\Desktop\\Data\\PlayTennis.csv', index_col=0)
#df_tennis.columns[0]
df_tennis.keys()[0]
'PlayTennis'
# entropy = sum over classes of -p * log2(p)
def entropy(probs):
    import math
    # entropy of a list of class probabilities
    return sum([-prob * math.log(prob, 2) for prob in probs])

# Function to calculate the entropy of the given data set/list with respect to the target attribute
def entropy_of_list(a_list):
    from collections import Counter
    cnt = Counter(x for x in a_list)        # class counts, e.g. Counter({'Yes': 9, 'No': 5})
    num_instances = len(a_list) * 1.0       # = 14 for the full PlayTennis data
    print("\n Classes:", min(cnt), max(cnt))
    probs = [x / num_instances for x in cnt.values()]
    return entropy(probs)

total_entropy = entropy_of_list(df_tennis['PlayTennis'])
0 No
1 No
2 Yes
3 Yes
4 Yes
5 No
6 Yes
7 No
8 Yes
9 Yes
10 Yes
11 Yes
12 Yes
13 No
Classes: No Yes
def information_gain(df, split_attribute_name, target_attribute_name):
    # group the data by the attribute we are splitting on
    df_split = df.groupby(split_attribute_name)
    nobs = len(df.index) * 1.0
    # entropy and proportion of observations for each group of the split attribute
    df_agg_ent = df_split.agg({target_attribute_name: [entropy_of_list,
                                                        lambda x: len(x) / nobs]})[target_attribute_name]
    df_agg_ent.columns = ['Entropy', 'PropObservations']
    # Calculate Information Gain:
    new_entropy = sum(df_agg_ent['Entropy'] * df_agg_ent['PropObservations'])
    old_entropy = entropy_of_list(df[target_attribute_name])
    return old_entropy - new_entropy

print('\n Info-gain for Humidity is: ' + str(information_gain(df_tennis, 'Humidity', 'PlayTennis')), "\n")
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
def id3(df, target_attribute_name, attribute_names, default_class=None):
    from collections import Counter
    cnt = Counter(x for x in df[target_attribute_name])  # class counts of the target attribute
    if len(cnt) == 1:
        return next(iter(cnt))  # only one class left: return it as a leaf
    elif df.empty or (not attribute_names):
        return default_class    # no data or no attributes left to split on
    else:
        default_class = max(cnt.keys())
        # choose the attribute with the highest information gain
        gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names]
        index_of_max = gainz.index(max(gainz))
        best_attr = attribute_names[index_of_max]
        # Create an empty tree, to be populated in a moment
        tree = {best_attr: {}}
        remaining_attribute_names = [i for i in attribute_names if i != best_attr]
        # Split dataset and recurse on each subset
        for attr_val, data_subset in df.groupby(best_attr):
            subtree = id3(data_subset,
                          target_attribute_name,
                          remaining_attribute_names,
                          default_class)
            tree[best_attr][attr_val] = subtree
        return tree
# Predicting Attributes
attribute_names = list(df_tennis.columns)
attribute_names.remove('PlayTennis')   # remove the class attribute, keep only the predictors
# Run Algorithm:
from pprint import pprint
tree = id3(df_tennis, 'PlayTennis', attribute_names)
#print(tree)
pprint(tree)
attribute = next(iter(tree))
print("Tree Keys:\n", tree[attribute].keys())
Classes: No Yes
Classes: No Yes
Classes: No Yes
Probabilities of Class No is 0.25:
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No Yes
Classes: No No
Classes: No Yes
Classes: No No
Classes: No Yes
Classes: No Yes
Classes: No No
Classes: No Yes
Classes: No Yes
Classes: No Yes
Probabilities of Class No is 0.3333333333333333:
Classes: No Yes
Best Attribute :
Outlook
Tree Keys:
Description:
CODE :
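The scan omits the start of this experiment (flood prediction from the Kerala rainfall data), including the imports and the loading of the data set; presumably something along these lines (the file name is an assumption):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('kerala.csv')   # Kerala monthly rainfall data, 1901-2018 (file name assumed)
print(data)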
SUBDIVISION YEAR JAN FEB MAR APR MAY JUN JUL AUG \
0 KERALA 1901 28.7 44.7 51.6 160.0 174.7 824.6 743.0 357.5
1 KERALA 1902 6.7 2.6 57.3 83.9 134.5 390.9 1205.0 315.8
2 KERALA 1903 3.2 18.6 3.1 83.6 249.7 558.6 1022.5 420.2
3 KERALA 1904 23.7 3.0 32.2 71.5 235.7 1098.2 725.5 351.8
4 KERALA 1905 1.2 22.3 9.4 105.9 263.3 850.2 520.5 293.6
.. ... ... ... ... ... ... ... ... ... ...
113 KERALA 2014 4.6 10.3 17.9 95.7 251.0 454.4 677.8 733.9
114 KERALA 2015 3.1 5.8 50.1 214.1 201.8 563.6 406.0 252.2
115 KERALA 2016 2.4 3.8 35.9 143.0 186.4 522.2 412.3 325.5
116 KERALA 2017 1.9 6.8 8.9 43.6 173.5 498.5 319.6 531.8
117 KERALA 2018 29.1 52.1 48.6 116.4 183.8 625.4 1048.5 1398.9
   SUBDIVISION  YEAR   JAN   FEB   MAR    APR    MAY     JUN     JUL     AUG    SEP    OCT    NOV    DEC  ANNUAL RAINFALL  FLOODS
0       KERALA  1901  28.7  44.7  51.6  160.0  174.7   824.6   743.0   357.5  197.7  266.9  350.8   48.4           3248.6     YES
1       KERALA  1902   6.7   2.6  57.3   83.9  134.5   390.9  1205.0   315.8  491.6  358.4  158.3  121.5           3326.6     YES
2       KERALA  1903   3.2  18.6   3.1   83.6  249.7   558.6  1022.5   420.2  341.8  354.1  157.0   59.0           3271.2     YES
3       KERALA  1904  23.7   3.0  32.2   71.5  235.7  1098.2   725.5   351.8  222.7  328.1   33.9    3.3           3129.7     YES
data.tail()
     SUBDIVISION  YEAR   JAN   FEB   MAR    APR    MAY    JUN     JUL     AUG    SEP    OCT    NOV   DEC  ANNUAL RAINFALL  FLOODS
113       KERALA  2014   4.6  10.3  17.9   95.7  251.0  454.4   677.8   733.9  298.8  355.5   99.5  47.2           3046.4     YES
...
117       KERALA  2018  29.1  52.1  48.6  116.4  183.8  625.4  1048.5  1398.9  423.6  356.1  125.4  65.1           4473.0     YES
# Finding number of missing values
data.isnull().sum()  # checking whether any column has missing values
SUBDIVISION 0
YEAR 0
JAN 0
FEB 0
MAR 0
APR 0
MAY 0
JUN 0
JUL 0
AUG 0
SEP 0
OCT 0
NOV 0
DEC 0
ANNUAL RAINFALL 0
FLOODS 0
dtype: int64
print(data.shape)
(118, 16)
data.describe()
                  count     mean      std     min      25%      50%      75%     max
YEAR              118.0  1959.50    34.21  1901.0  1930.25  1959.50  1988.75  2018.0
JAN               118.0    12.22    15.47     0.0     2.18     5.80    18.18    83.5
FEB               118.0    15.63    16.41     0.0     4.70     8.35    21.40    79.0
MAR               118.0    36.67    30.06     0.1    18.10    28.40    49.83   217.2
APR               118.0   110.33    44.63    13.1    74.35   110.40   136.45   238.0
MAY               118.0   228.64   147.55    53.4   125.05   184.60   264.88   738.8
JUN               118.0   651.62   186.18   196.8   535.55   625.60   786.98  1098.2
JUL               118.0   698.22   228.99   167.5   533.20   691.65   832.43  1526.5
AUG               118.0   430.37   181.98   178.6   316.73   386.25   500.10  1398.9
SEP               118.0   246.21   121.90    41.3   155.43   223.55   334.50   526.7
OCT               118.0   293.21    93.71    68.5   222.13   284.30   355.15   567.9
NOV               118.0   162.31    83.20    31.5    93.03   152.45   218.33   365.6
DEC               118.0    40.01    36.68     0.1    10.35    31.10    54.03   202.3
ANNUAL RAINFALL   118.0  2925.41   452.17  2068.8  2613.53  2934.30  3170.40  4473.0
data.info
SUBDIVISION YEAR JAN FEB MAR APR MAY JUN JUL AUG \
0 KERALA 1901 28.7 44.7 51.6 160.0 174.7 824.6 743.0 357.5
1 KERALA 1902 6.7 2.6 57.3 83.9 134.5 390.9 1205.0 315.8
2 KERALA 1903 3.2 18.6 3.1 83.6 249.7 558.6 1022.5 420.2
3 KERALA 1904 23.7 3.0 32.2 71.5 235.7 1098.2 725.5 351.8
4 KERALA 1905 1.2 22.3 9.4 105.9 263.3 850.2 520.5 293.6
.. ... ... ... ... ... ... ... ... ... ...
113 KERALA 2014 4.6 10.3 17.9 95.7 251.0 454.4 677.8 733.9
114 KERALA 2015 3.1 5.8 50.1 214.1 201.8 563.6 406.0 252.2
115 KERALA 2016 2.4 3.8 35.9 143.0 186.4 522.2 412.3 325.5
116 KERALA 2017 1.9 6.8 8.9 43.6 173.5 498.5 319.6 531.8
117 KERALA 2018 29.1 52.1 48.6 116.4 183.8 625.4 1048.5 1398.9
data.cov()
(output: the 14 × 14 covariance matrix over YEAR, the twelve monthly rainfall columns and ANNUAL RAINFALL)
data.corr()
YEAR JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC
ANNUAL RAINFALL
YEAR 1.000000 -0.225531 0.003879 -0.012842 0.086865 -0.059661
-0.174938 -0.223403 0.044173 0.107655 -0.030223 -0.130129
-0.123643 -0.198048
JAN -0.225531 1.000000 0.019613 0.078626 0.034807 0.071420
0.189375 0.034423 0.008677 -0.113502 -0.035044 -0.011034
-0.089809 0.118648
FEB 0.003879 0.019613 1.000000 0.245375 0.123706 -0.083500
0.054114 0.005789 0.023259 0.066317 0.053133 -0.162880
-0.127025 0.061457
MAR -0.012842 0.078626 0.245375 1.000000 0.074014 -0.102961
0.019000 0.018330 0.042411 0.143850 -0.023066 -0.032612
0.026292 0.116103
APR 0.086865 0.034807 0.123706 0.074014 1.000000 -0.114566
0.072990 0.014977 -0.047842 0.012928 0.113172 0.022206
-0.110392 0.112358
MAY -0.059661 0.071420 -0.083500 -0.102961 -0.114566 1.000000
0.001235 -0.046518 -0.124412 0.116860 0.197102 0.094934
-0.118077 0.314723
JUN -0.174938 0.189375 0.054114 0.019000 0.072990 0.001235
1.000000 0.094939 -0.014549 -0.052634 0.001156 0.015967
-0.085188 0.453407
JUL -0.223403 0.034423 0.005789 0.018330 0.014977 -0.046518
0.094939 1.000000 0.154467 0.209441 0.025223 -0.028526
-0.013573 0.651990
AUG 0.044173 0.008677 0.023259 0.042411 -0.047842 -0.124412
-0.014549 0.154467 1.000000 0.098215 -0.181496 -0.112729
0.142090 0.413036
SEP 0.107655 -0.113502 0.066317 0.143850 0.012928 0.116860
-0.052634 0.209441 0.098215 1.000000 -0.032348 -0.027615
-0.011007 0.428344
OCT -0.030223 -0.035044 0.053133 -0.023066 0.113172 0.197102
0.001156 0.025223 -0.181496 -0.032348 1.000000 -0.024060
-0.039067 0.205861
NOV -0.130129 -0.011034 -0.162880 -0.032612 0.022206 0.094934
0.015967 -0.028526 -0.112729 -0.027615 -0.024060 1.000000
0.070720 0.148783
DEC -0.123643 -0.089809 -0.127025 0.026292 -0.110392 -0.118077
-0.085188 -0.013573 0.142090 -0.011007 -0.039067 0.070720
1.000000 0.042967
ANNUAL RAINFALL -0.198048 0.118648 0.061457 0.116103 0.112358
0.314723 0.453407 0.651990 0.413036 0.428344 0.205861
0.148783 0.042967 1.000000
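Between the correlation table above and the listing below, the FLOODS column changes from YES/NO to 1/0; the encoding step is not shown, but it is presumably something like:

# map the categorical flood flag to integers (YES/NO values assumed)
data['FLOODS'] = data['FLOODS'].map({'YES': 1, 'NO': 0})
data.head()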
SUBDIVISION YEAR JAN FEB MAR APR MAY JUN JUL AUG SEP OCT
NOV DEC ANNUAL RAINFALL FLOODS
0 KERALA 1901 28.7 44.7 51.6 160.0 174.7 824.6 743.0 357.5 197.7
266.9 350.8 48.4 3248.6 1
1 KERALA 1902 6.7 2.6 57.3 83.9 134.5 390.9 1205.0 315.8 491.6
358.4 158.3 121.5 3326.6 1
2 KERALA 1903 3.2 18.6 3.1 83.6 249.7 558.6 1022.5 420.2 341.8
354.1 157.0 59.0 3271.2 1
3 KERALA 1904 23.7 3.0 32.2 71.5 235.7 1098.2 725.5 351.8 222.7
328.1 33.9 3.3 3129.7 1
4 KERALA 1905 1.2 22.3 9.4 105.9 263.3 850.2 520.5 293.6 217.2
383.5 74.4 0.2 2741.6 0
x=data.iloc[:,1:14]
x.head()
YEAR JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC
0 1901 28.7 44.7 51.6 160.0 174.7 824.6 743.0 357.5 197.7 266.9 350.8
48.4
1 1902 6.7 2.6 57.3 83.9 134.5 390.9 1205.0 315.8 491.6 358.4 158.3
121.5
2 1903 3.2 18.6 3.1 83.6 249.7 558.6 1022.5 420.2 341.8 354.1 157.0
59.0
3 1904 23.7 3.0 32.2 71.5 235.7 1098.2 725.5 351.8 222.7 328.1 33.9
3.3
4 1905 1.2 22.3 9.4 105.9 263.3 850.2 520.5 293.6 217.2 383.5 74.4
0.2
y=data.iloc[:,-1]
y
0 1
1 1
2 1
3 1
4 0
..
113 1
114 0
115 0
116 0
117 1
Name: FLOODS, Length: 118, dtype: int64
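The train/test split that produces x_train, x_test, y_train and y_test is not shown; a sketch consistent with the sizes seen below (94 training rows and 24 test rows):

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
x_train.head()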
8 1909 54.1 11.8 61.3 93.8 473.2 704.7 782.3 258.0 195.4 212.1 171.1 32.3
73 1974 1.6 5.4 16.0 128.0 221.5 266.9 1004.2 533.6 383.6 142.1 61.0 3.6
77 1978 3.3 14.7 31.4 73.9 396.8 758.1 686.7 516.8 119.4 171.0 365.6 39.0
114 2015 3.1 5.8 50.1 214.1 201.8 563.6 406.0 252.2 292.9 308.1 223.6 79.4
80 1981 7.0 6.8 28.5 75.9 166.3 912.4 489.8 495.6 376.6 265.0 138.6 43.3
x_train.dtypes
YEAR int64
JAN float64
FEB float64
MAR float64
APR float64
MAY float64
JUN float64
JUL float64
AUG float64
SEP float64
OCT float64
NOV float64
DEC float64
dtype: object
x_test.head()
YEAR JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC
16 1917 2.9 47.6 79.4 38.1 122.9 703.7 342.7 335.1 470.3 264.1 256.4 41.6
9 1910 2.7 25.7 23.3 124.5 148.8 680.0 484.1 473.8 248.6 356.6 280.4 0.1
18 1919 43.0 6.1 33.9 65.9 247.0 636.8 648.0 484.2 255.9 249.2 280.1 53.0
88 1989 10.3 0.0 30.1 141.5 169.4 657.5 450.7 285.5 271.1 308.0 92.9 5.6
72 1973 0.0 0.3 12.3 131.5 119.9 617.0 583.5 487.5 61.3 260.8 84.5 53.8
# type casting.
y_train=y_train.astype('int')
y_train
8 1
73 0
77 1
114 0
80 1
..
96 1
102 0
46 1
42 1
43 0
Name: FLOODS, Length: 94, dtype: int64
y_test=y_test.astype('int')
y_test
16 0
9 0
18 1
88 0
72 0
100 0
4 0
64 0
67 1
101 0
30 1
11 1
23 1
66 0
99 0
106 1
33 0
95 0
41 1
76 1
108 0
78 0
117 1
84 0
Name: FLOODS, dtype: int64
# 1. KNN Classifier
from sklearn import neighbors

clf = neighbors.KNeighborsClassifier()
clf.fit(x_train,y_train)
KNeighborsClassifier()
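How knn_acc, knn_proba and the printed scores are produced is not shown; a plausible sketch, following the cross-validation pattern used later for the random forest and standard sklearn metrics:

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, confusion_matrix

# cross-validated accuracy and class probabilities on the training data
knn_acc = cross_val_score(clf, x_train, y_train, cv=5, scoring="accuracy", n_jobs=-1)
knn_proba = cross_val_predict(clf, x_train, y_train, cv=5, method='predict_proba')

# scores on the held-out test data
y_pred = clf.predict(x_test)
print("Accuracy Score:%f" % (accuracy_score(y_test, y_pred) * 100))
print("Recall Score:%f" % (recall_score(y_test, y_pred) * 100))
print("ROC score:%f" % (roc_auc_score(y_test, y_pred) * 100))
print(confusion_matrix(y_test, y_pred))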
knn_acc
knn_proba
array([[0.2, 0.8], [0.8, 0.2], [0. , 1. ], [0.6, 0.4], [0.2, 0.8], [0.2,
0.8], [1. , 0. ], [0.4, 0.6], [0.8, 0.2], [0.8, 0.2], [0.2, 0.8], [0.2, 0.8],
[0.6, 0.4], [0.6, 0.4], [1. , 0. ], [0.4, 0.6], [0.8, 0.2], [0.6, 0.4], [1. ,
0. ], [0.2, 0.8], [0.6, 0.4], [0.6, 0.4], [0.4, 0.6], [0.2, 0.8], [0.6, 0.4],
[0.6, 0.4], [0.4, 0.6], [0.6, 0.4], [0.6, 0.4], [0.6, 0.4], [0.6, 0.4], [0.8,
0.2], [0.6, 0.4], [0.8, 0.2], [1. , 0. ], [0.8, 0.2], [0.2, 0.8], [0.2, 0.8],
[0.4, 0.6], [0. , 1. ], [0.6, 0.4], [0.8, 0.2], [0.6, 0.4], [0.6, 0.4], [1. ,
0. ], [0. , 1. ], [0. , 1. ], [0.6, 0.4], [1. , 0. ], [0.4, 0.6], [0.6, 0.4],
[0.2, 0.8], [0.4, 0.6], [0.8, 0.2], [0.2, 0.8], [0.6, 0.4], [0.2, 0.8], [0.4,
0.6], [0.2, 0.8], [0.4, 0.6], [1. , 0. ], [0.2, 0.8], [0.4, 0.6], [0.4, 0.6],
[1. , 0. ], [0.8, 0.2], [0.4, 0.6], [0.6, 0.4], [0.6, 0.4], [0.8, 0.2], [0.4,
0.6], [0.8, 0.2], [0.4, 0.6], [0.2, 0.8], [0.4, 0.6], [0.8, 0.2], [0.8, 0.2],
[0.8, 0.2], [0.4, 0.6], [0.2, 0.8], [0.4, 0.6], [1. , 0. ], [0.4, 0.6], [0.4,
0.6], [0.4, 0.6], [0.8, 0.2], [0.6, 0.4], [1. , 0. ], [0. , 1. ], [0.4, 0.6],
[0.8, 0.2], [0.6, 0.4], [0.2, 0.8], [0.6, 0.4]])
Accuracy Score:79.166667
Recall Score:77.777778
ROC score:78.888889
[[12 3]
[ 2 7]]
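The logistic-regression model itself is not reproduced; a sketch consistent with the output below (lr_proba, despite its name, holds the predicted test-set labels):

from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression()
lr_clf.fit(x_train, y_train)
lr_proba = lr_clf.predict(x_test)   # predicted flood labels for the test set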
lr_proba
array([0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,
0, 1, 0])
print(y_test.values)
[0 0 1 0 0 0 0 0 1 0 1 1 1 0 0 1 0 0 1 1 0 0 1 0]
accuracy score:87.500000
recall score:88.888889
roc score:87.777778
[[13 2]
[ 1 8]]
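The SVM classifier that produces svc_classifier and svc_proba is likewise not reproduced; a sketch (probability=True is required for predict_proba; the cv value is an assumption):

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict

svc_classifier = SVC(probability=True)
svc_classifier.fit(x_train, y_train)
# cross-validated class probabilities on the training data (matches the 94 x 2 array below)
svc_proba = cross_val_predict(svc_classifier, x_train, y_train, cv=5, method='predict_proba')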
svc_proba
array([[2.53477212e-01, 7.46522788e-01], [6.26215813e-01, 3.73784187e-01],
[8.85692145e-02, 9.11430785e-01], [6.11301593e-01, 3.88698407e-01],
[1.31856926e-01, 8.68143074e-01], [1.82732776e-02, 9.81726722e-01],
[8.09704178e-01, 1.90295822e-01], [1.12116148e-01, 8.87883852e-01],
[8.11536808e-01, 1.88463192e-01], [8.50419768e-01, 1.49580232e-01],
[1.46441543e-01, 8.53558457e-01], [9.61272383e-02, 9.03872762e-01],
[8.73845038e-01, 1.26154962e-01], [4.74410079e-01, 5.25589921e-01],
[9.23344720e-01, 7.66552803e-02], [1.48414321e-01, 8.51585679e-01],
[8.64377127e-01, 1.35622873e-01], [6.01890924e-01, 3.98109076e-01],
[9.58578971e-01, 4.14210286e-02], [2.09878503e-01, 7.90121497e-01],
[9.02162919e-01, 9.78370813e-02], [2.86267143e-01, 7.13732857e-01],
[1.75833318e-01, 8.24166682e-01], [5.25702332e-01, 4.74297668e-01],
[6.00511908e-01, 3.99488092e-01], [1.22322092e-01, 8.77677908e-01],
[2.05031520e-01, 7.94968480e-01], [6.33213733e-01, 3.66786267e-01],
[4.66490741e-01, 5.33509259e-01], [7.42615883e-01, 2.57384117e-01],
[9.16207567e-01, 8.37924333e-02], [2.15335464e-01, 7.84664536e-01],
[6.94553329e-01, 3.05446671e-01], [6.35611529e-01, 3.64388471e-01],
[5.62736716e-01, 4.37263284e-01], [9.74313518e-01, 2.56864820e-02],
[1.13881509e-01, 8.86118491e-01], [2.61365437e-02, 9.73863456e-01],
[5.12240780e-02, 9.48775922e-01], [1.33584926e-05, 9.99986642e-01],
[9.93063512e-01, 6.93648822e-03], [6.43663018e-01, 3.56336982e-01],
[3.59580083e-01, 6.40419917e-01], [8.37230882e-01, 1.62769118e-01],
[9.93599759e-01, 6.40024146e-03], [4.62520080e-02, 9.53747992e-01],
[2.34057595e-02, 9.76594240e-01], [7.00738214e-01, 2.99261786e-01],
[9.86154637e-01, 1.38453627e-02], [2.08709962e-01, 7.91290038e-01],
[9.33933463e-01, 6.60665373e-02], [5.34954399e-03, 9.94650456e-01],
[2.04084161e-01, 7.95915839e-01], [9.90751572e-01, 9.24842834e-03],
[4.45372682e-03, 9.95546273e-01], [1.66091700e-01, 8.33908300e-01],
[3.86929555e-06, 9.99996131e-01], [6.69515754e-01, 3.30484246e-01],
[2.04190402e-01, 7.95809598e-01], [2.08019626e-02, 9.79198037e-01],
[7.87668011e-01, 2.12331989e-01], [1.92560722e-02, 9.80743928e-01],
[4.02508642e-01, 5.97491358e-01], [8.97733715e-03, 9.91022663e-01],
[9.98020874e-01, 1.97912611e-03], [2.16682834e-01, 7.83317166e-01],
[4.53850253e-01, 5.46149747e-01], [3.86311001e-01, 6.13688999e-01],
[8.49607998e-01, 1.50392002e-01], [9.91714321e-01, 8.28567937e-03],
[1.22655934e-01, 8.77344066e-01], [9.91825902e-01, 8.17409787e-03],
[1.98061346e-01, 8.01938654e-01], [5.13864541e-02, 9.48613546e-01],
[1.10499781e-01, 8.89500219e-01], [9.92124948e-01, 7.87505233e-03],
[7.55786228e-01, 2.44213772e-01], [9.86458934e-01, 1.35410660e-02],
[2.32739746e-02, 9.76726025e-01], [1.85677493e-01, 8.14322507e-01],
[2.63622292e-02, 9.73637771e-01], [9.98743897e-01, 1.25610287e-03],
[3.13956197e-01, 6.86043803e-01], [1.50578196e-02, 9.84942180e-01],
[4.14229509e-01, 5.85770491e-01], [9.33259709e-01, 6.67402914e-02],
[5.39538365e-01, 4.60461635e-01], [9.99054569e-01, 9.45431174e-04],
[4.76500210e-06, 9.99995235e-01], [1.70654294e-01, 8.29345706e-01],
[9.88791890e-01, 1.12081100e-02], [4.32883215e-01, 5.67116785e-01],
[1.77166530e-02, 9.82283347e-01], [9.78291777e-01, 2.17082225e-02]])
svc_scores=svc_proba[:,1]
svc_scores
y_pred=svc_classifier.predict(x_test)
print("Actual Flood Values:")
print(y_test.values)
recall score:100.000000
roc score:90.000000
[[12 3]
[ 0 9]]
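The decision-tree classifier dtc_clf used below is never defined in the scan; presumably:

from sklearn.tree import DecisionTreeClassifier

dtc_clf = DecisionTreeClassifier()
dtc_clf.fit(x_train, y_train)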
print("Predicted Values:")
y_pred=dtc_clf.predict(x_test)
y_pred
Predicted Values:
array([1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0,
0, 1, 0])
print("Actual Values:")
print(y_test.values)
Actual Values:
[0 0 1 0 0 0 0 0 1 0 1 1 1 0 0 1 0 0 1 1 0 0 1 0]
accuracy score:62.500000
recall score:66.666667
roc score:63.333333
[[9 6]
[3 6]]
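The random-forest model and the standardised features x_train_std it is evaluated on are not defined in the excerpt; a sketch of the assumed setup:

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# standardise the features; x_train_std is consumed by the cross-validation calls below
scaler = StandardScaler()
x_train_std = scaler.fit_transform(x_train)

rmf_clf = RandomForestClassifier(max_depth=3, random_state=0)
rmf_clf.fit(x_train_std, y_train)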
RandomForestClassifier(max_depth=3, random_state=0)
rmf_clf_acc=cross_val_score(rmf_clf,x_train_std,y_train,cv=3,scoring="accuracy",n_jobs=-1)
rmf_proba=cross_val_predict(rmf_clf,x_train_std,y_train,cv=3,method='predict_proba')
rmf_clf_acc
rmf_proba
accuracy score:62.500000
recall score:66.666667
roc score:63.333333
[[9 6]
[3 6]]
models = []
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
models.append(('KNN', KNeighborsClassifier()))
models.append(('LR', LogisticRegression()))
models.append(('SVC', SVC()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('RF', RandomForestClassifier()))
from sklearn.metrics import accuracy_score
import pandas as pd
import matplotlib.pyplot as plt

names = []
scores = []
for name, model in models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    scores.append(accuracy_score(y_test, y_pred))
    names.append(name)
tr_split = pd.DataFrame({'Name': names, 'Score': scores})
tr_split
Name Score
0 KNN 0.791667
1 LR 0.875000
2 SVC 0.875000
3 DT 0.625000
4 RF 0.625000
plt.show()
tr_split['Score'].max()
0.875
# So, we can see and choose the best model for Prediction.
5. Develop a program for Bias, Variance, Remove Duplicates, Cross Validation
Code:
import numpy as np
import pickle
import matplotlib.pyplot as plot
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import pandas as pd
reg = LinearRegression()
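The arrays filled inside the loop below and the training/test splits it consumes are not set up in the excerpt; a sketch of the assumed preparation (10 resampled training sets and one 80-point test set, e.g. unpickled from a file whose name is hypothetical):

# assumed: x_train/y_train hold 10 resampled training sets, x_test/y_test one 80-point test set
with open('train_test_data.pkl', 'rb') as f:     # file name hypothetical
    x_train, y_train, x_test, y_test = pickle.load(f)

bias = np.zeros(21)          # index 0 unused; polynomial degrees 1..20
biassq = np.zeros(21)
variance = np.zeros(21)
irred_error = np.zeros(21)
tot_err = np.zeros(21)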
#for each degree run fit on 10 models,800 data points each,pass test data and get output
for i in range(1, 21):
    difference = np.zeros((10, 80))
    temp = np.zeros(80)
    # create output array to get predicted values
    y_return = np.zeros((10, 80))
    for j in range(10):
        polynomial = PolynomialFeatures(degree=i, include_bias=False)
        X_TRAIN = polynomial.fit_transform(x_train[j])
        X_TEST = polynomial.fit_transform(x_test)
        reg.fit(X_TRAIN, y_train[j])
        prediction = reg.predict(X_TEST)
        y_return[j] = prediction
        difference[j] = (y_test - y_return[j])**2
    # getting average mse
    for j in range(10):
        temp += difference[j]
    temp /= 10
    tot_err[i] = np.mean(temp)
    # for each degree, calculate bias
    y_mean = np.mean(y_return, axis=0)
    bias[i] = np.mean(abs(y_mean - y_test))
    biassq[i] = np.mean((y_mean - y_test)**2)
    # for each degree, calculate variance
    y_var = np.var(y_return, axis=0)
    variance[i] = np.mean(y_var)
    # calculate irreducible error
    irred_error[i] = np.mean(temp) - (biassq[i] + variance[i])

bias[0] = None
biassq[0] = None
variance[0] = None
irred_error[0] = None
tot_err[0] = None
table_bias=pd.DataFrame({'Degree':np.array(range(0,21)),'Bias':bias,'Variance': variance, \
'irreducible error': irred_error})
print(table_bias.to_string(index=False))
plot.legend()
6. Write a program to implement Categorical Encoding, One-hot Encoding
Code:
from numpy import array
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# the sample text tokens shown in the output below
data = ['can', 'i', 'eat', 'the', 'pizza', 'you', 'can', 'eat', 'the', 'pizza']
values = array(data)
print(values)
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)
['can' 'i' 'eat' 'the' 'pizza' 'you' 'can' 'eat' 'the' 'pizza']
[0 2 1 4 3 5 0 1 4 3]
# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)
[[1. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0.]
[0. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 0.]
[0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 1.]
[1. 0. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 0.]
[0. 0. 0. 1. 0. 0.]]
BACKPROPAGATION Algorithm
• Create a feed-forward network with ni inputs, nhidden hidden units, and nout output
units.
• Initialize all network weights to small random numbers
• Until the termination condition is met, Do
Training Examples:

Example   Sleep   Study   Expected % in Exams
1         2       9       92
2         1       5       86
3         3       6       89
Program:
import numpy as np
X = np.array(([2, 9], [1, 5], [3, 6]), dtype=float)
y = np.array(([92], [86], [89]), dtype=float)
X = X/np.amax(X,axis=0) # maximum of X array longitudinally
y = y/100
#Sigmoid Function
def sigmoid (x):
return 1/(1 + np.exp(-x))
#Variable initialization
epoch=5000 #Setting training iterations
lr=0.1 #Setting learning rate
inputlayer_neurons = 2 #number of features in data set
hiddenlayer_neurons = 3 #number of hidden layers neurons
output_neurons = 1 #number of neurons at output layer
#Forward Propagation
hinp1=np.dot(X,wh)
hinp=hinp1 + bh
hlayer_act = sigmoid(hinp)
outinp1=np.dot(hlayer_act,wout)
outinp= outinp1+ bout
output = sigmoid(outinp)
#Backpropagation
EO = y-output
outgrad = derivatives_sigmoid(output)
d_output = EO* outgrad
EH = d_output.dot(wout.T)
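The scan omits the weight initialisation, the sigmoid derivative, the epoch loop wrapper and the weight-update step; a sketch of those missing pieces as the exercise is normally completed (the forward and backward statements above all sit inside "for i in range(epoch):"):

def derivatives_sigmoid(x):
    # derivative of the sigmoid, given an already-activated value
    return x * (1 - x)

# weight and bias initialisation (placed before the training loop)
wh = np.random.uniform(size=(inputlayer_neurons, hiddenlayer_neurons))
bh = np.random.uniform(size=(1, hiddenlayer_neurons))
wout = np.random.uniform(size=(hiddenlayer_neurons, output_neurons))
bout = np.random.uniform(size=(1, output_neurons))

# remaining backward-pass and update steps (inside the epoch loop)
hiddengrad = derivatives_sigmoid(hlayer_act)    # how much the hidden layer contributed to the error
d_hiddenlayer = EH * hiddengrad
wout += hlayer_act.T.dot(d_output) * lr         # update output-layer weights and biases
bout += np.sum(d_output, axis=0, keepdims=True) * lr
wh += X.T.dot(d_hiddenlayer) * lr               # update hidden-layer weights and biases
bh += np.sum(d_hiddenlayer, axis=0, keepdims=True) * lr

print("Input: \n" + str(X))
print("Actual Output: \n" + str(y))
print("Predicted Output: \n", output)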
Output:
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.89726759]
[0.87196896]
[0.9000671]]
Exp. No. 8. Write a program to implement k-Nearest Neighbor algorithm to classify the iris
data set. Print both correct and wrong predictions. Java/Python ML library classes can be
used for this problem.
Data Set:
Iris Plants Dataset: Dataset contains 150 instances (50 in each of three classes) Number of
Attributes: 4 numeric, predictive attributes and the Class.
Python Program to Implement and Demonstrate KNN
Algorithm :
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# load the Iris data set (file name assumed; column names as in the output below)
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']
dataset = pd.read_csv("iris.csv", names=names)

X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]
print(X.head())

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.30)
classifier = KNeighborsClassifier().fit(Xtrain, ytrain)
ypred = classifier.predict(Xtest)

i = 0
print("\n-------------------------------------------------------------------------")
print('%-25s %-25s %-25s' % ('Original Label', 'Predicted Label', 'Correct/Wrong'))
print("-------------------------------------------------------------------------")
for label in ytest:
    print('%-25s %-25s %-25s' % (label, ypred[i], 'Correct' if label == ypred[i] else 'Wrong'))
    i = i + 1
print("-------------------------------------------------------------------------")
print("\nConfusion Matrix:\n", metrics.confusion_matrix(ytest, ypred))
print("\nClassification Report:\n", metrics.classification_report(ytest, ypred))
print("-------------------------------------------------------------------------")
print('Accuracy of the classifier is %0.2f' % metrics.accuracy_score(ytest, ypred))
print("-------------------------------------------------------------------------")
OUTPUT :
sepal-length sepal-width petal-length petal-width
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
----------------------------------------------------------------
Original Label Predicted Label
Correct/Wrong
----------------------------------------------------------------
Iris-setosa Iris-setosa Correct
Iris-versicolor Iris-versicolor Correct
Iris-versicolor Iris-versicolor Correct
Iris-versicolor Iris-versicolor Correct
Iris-virginica Iris-versicolor Wrong
Iris-virginica Iris-virginica Correct
Iris-versicolor Iris-versicolor Correct
Iris-versicolor Iris-versicolor Correct
Iris-virginica Iris-virginica Correct
Iris-virginica Iris-virginica Correct
Iris-versicolor Iris-versicolor Correct
Iris-versicolor Iris-versicolor Correct
Iris-versicolor Iris-versicolor Correct
Iris-setosa Iris-setosa Correct
Iris-virginica Iris-virginica Correct
Iris-virginica Iris-virginica Correct
Iris-versicolor Iris-versicolor Correct
Iris-versicolor Iris-virginica Wrong
Iris-versicolor Iris-versicolor Correct
Iris-setosa Iris-setosa Correct
Iris-versicolor Iris-versicolor Correct
Iris-setosa Iris-setosa Correct
Iris-virginica Iris-virginica Correct
Iris-versicolor Iris-versicolor Correct
Iris-setosa Iris-setosa Correct
Iris-setosa Iris-setosa Correct
Iris-setosa Iris-setosa Correct
Iris-virginica Iris-virginica Correct
Iris-versicolor Iris-versicolor Correct
Iris-setosa Iris-setosa Correct
Iris-setosa Iris-setosa Correct
Iris-setosa Iris-setosa Correct
Iris-virginica Iris-virginica Correct
Iris-versicolor Iris-versicolor Correct
Iris-virginica Iris-virginica Correct
Iris-virginica Iris-virginica Correct
Iris-virginica Iris-virginica Correct
Iris-setosa Iris-setosa Correct
Iris-virginica Iris-virginica Correct
Iris-versicolor Iris-versicolor Correct
Iris-setosa Iris-setosa Correct
Iris-virginica Iris-virginica Correct
Iris-setosa Iris-setosa Correct
Iris-versicolor Iris-versicolor Correct
Iris-versicolor Iris-versicolor Correct
----------------------------------------------------------------
Confusion Matrix:
[[13 0 0]
[ 0 17 1]
[ 0 1 13]]
----------------------------------------------------------------
Classification Report:
precision recall f1-score support
accuracy 0.96 45
macro avg 0.96 0.96 0.96 45
weighted avg 0.96 0.96 0.96 45
----------------------------------------------------------------
Accuracy of the classifer is 0.96
----------------------------------------------------------------
9.Implement the non-parametric Locally Weighted Regression algorithm in order to fit data
points. Select appropriate data set for your experiment and draw graphs.
Regression:
• Regression is a technique from statistics that is used to predict values of a desired
target quantity when the target quantity is continuous.
• In regression, we seek to identify (or estimate) a continuous variable y associated with
a given input vector x.
• y is called the dependent variable.
• x is called the independent variable.
Loess/Lowess Regression:
Loess regression is a nonparametric technique that uses local weighted regression to fit a
smooth curve through points in a scatter plot.
Lowess Algorithm:
• Locally weighted regression is a very powerful nonparametric model used in statistical
learning.
• Given a dataset X, y, we attempt to find a model parameter β(x) that minimizes
residual sum of weighted squared errors.
• The weights are given by a kernel function (k or w) which can be chosen arbitrarily
Algorithm
1. Read the Given data Sample to X and the curve (linear or non linear) to Y
2. Set the value for Smoothening parameter or Free parameter say τ
3. Set the bias /Point of interest set x0 which is a subset of X
4. Determine the weight matrix using: wᵢ(x₀) = exp( −(xᵢ − x₀)² / (2τ²) )
5. Determine the value of the model parameter β using: β(x₀) = (XᵀWX)⁻¹ XᵀWy
6. Prediction = x₀·β(x₀)
Program
import numpy as np
from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import gridplot
from bokeh.io import push_notebook
def local_regression(x0, X, Y, tau):
    # add bias term
    x0 = np.r_[1, x0]
    X = np.c_[np.ones(len(X)), X]
    # fit model: solve the kernel-weighted normal equations
    xw = X.T * radial_kernel(x0, X, tau)
    beta = np.linalg.pinv(xw @ X) @ xw @ Y
    # predict value
    return x0 @ beta  # @ Matrix Multiplication or Dot Product for prediction

def radial_kernel(x0, X, tau):
    # Weight or Radial Kernel Bias Function
    return np.exp(np.sum((X - x0) ** 2, axis=1) / (-2 * tau * tau))
n = 1000
# generate dataset
X = np.linspace(-3, 3, num=n)
print("The Data Set ( 10 Samples) X :\n",X[1:10])
Y = np.log(np.abs(X ** 2 - 1) + .5)
print("The Fitting Curve Data Set (10 Samples) Y
:\n",Y[1:10])
# jitter X
X += np.random.normal(scale=.1, size=n)
print("Normalised (10 Samples) X :\n",X[1:10])
show(gridplot([
[plot_lwr(10.), plot_lwr(1.)],
[plot_lwr(0.1), plot_lwr(0.01)]]))
Output
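The second version of the program below works with numpy matrices on the tips data set; the imports and the kernel function it calls are not reproduced in the scan, so here is a minimal sketch (numpy is assumed to be imported as np1, since the code below uses that name):

import numpy as np1
import pandas as pd
import matplotlib.pyplot as plt

def kernel(point, xmat, k):
    # diagonal matrix of Gaussian weights, one weight per training point
    m, n = np1.shape(xmat)
    weights = np1.mat(np1.eye(m))
    for j in range(m):
        diff = point - xmat[j]
        weights[j, j] = np1.exp(diff * diff.T / (-2.0 * k ** 2))
    return weights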
def localWeight(point, xmat, ymat, k):
    wei = kernel(point, xmat, k)
    W = (xmat.T * (wei * xmat)).I * (xmat.T * (wei * ymat.T))
    return W
def localWeightRegression(xmat, ymat, k):
    m, n = np1.shape(xmat)
    ypred = np1.zeros(m)
    for i in range(m):
        ypred[i] = xmat[i] * localWeight(xmat[i], xmat, ymat, k)
    return ypred
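The plotting code that follows relies on bill, tip, ypred, SortIndex and xsort; a sketch of the assumed glue code (a tips.csv with total_bill and tip columns and a bandwidth of 0.5 are assumptions):

data = pd.read_csv('tips.csv')            # file name assumed
bill = np1.array(data.total_bill)
tip = np1.array(data.tip)
mbill = np1.mat(bill)
mtip = np1.mat(tip)
m = np1.shape(mbill)[1]
one = np1.mat(np1.ones(m))
X = np1.hstack((one.T, mbill.T))          # design matrix [1, bill]
ypred = localWeightRegression(X, mtip, 0.5)
SortIndex = X[:, 1].argsort(0)
xsort = X[SortIndex][:, 0]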
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.scatter(bill,tip, color='green')
ax.plot(xsort[:,1],ypred[SortIndex], color = 'red', linewidth=5)
plt.xlabel('Total bill')
plt.ylabel('Tip')
plt.show();
Experiment - 10:
AIM: Assuming a set of documents that need to be classified, use the naive Bayesian classifier model to perform this task. Built-in Java classes/API can be used to write the program. Calculate the accuracy, precision and recall for your dataset.
import pandas as pd
msg=pd.read_csv("naivetext.csv",names=["message","label"])
print(" The dimensions of the dataset", msg.shape)
msg["labelnum"]=msg.label.map({"pos":1,"neg":0})
X=msg.message
Y=msg.labelnum
print(X)
print(Y)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)
print("\n The total number of training data:", Y_train.shape)
print("\n The total number of test data:", Y_test.shape)

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_dtm = count_vect.fit_transform(X_train)
X_test_dtm = count_vect.transform(X_test)
print("\n The words or tokens in the text documents \n")
print(count_vect.get_feature_names())

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_dtm, Y_train)
predicted = clf.predict(X_test_dtm)

from sklearn import metrics
print("\n Accuracy of the classifier is", metrics.accuracy_score(Y_test, predicted))
print("\n Confusion matrix")
print(metrics.confusion_matrix(Y_test, predicted))
print("\n The value of precision", metrics.precision_score(Y_test, predicted))
print("\n The value of recall", metrics.recall_score(Y_test, predicted))
Experiment-11: Apply EM algorithm to cluster a Data Set. Use the same data set for clustering
using k-Means algorithm. Compare the results of these two algorithms and comment on the
quality of clustering. You can add Java/Python ML library classes/API in the program.
CODE :
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
import pandas as pd
X=pd.read_csv('/content/kmeansdata.csv')
x1 = X['Distance_Feature'].values
x2 = X['Speeding_Feature'].values
X = np.array(list(zip(x1, x2))).reshape(len(x1), 2)
plt.plot()
plt.xlim([0, 100])
plt.ylim([0, 50])
plt.title('Dataset')
plt.scatter(x1, x2)
plt.show()
#code for EM
gmm = GaussianMixture(n_components=3)
gmm.fit(X)
em_predictions = gmm.predict(X)
print("\nEM predictions")
print(em_predictions)
print("mean:\n",gmm.means_)
print('\n')
print("Covariances\n",gmm.covariances_)
print(X)
plt.title('Expectation Maximization')
plt.scatter(X[:,0], X[:,1],c=em_predictions,s=50)
plt.show()
EM predictions
[1 1 1 ... 0 0 0]
mean:
[[180.12995794 10.18334766]
[ 50.04762937 8.82874097]
[179.34576209 66.43976809]]
Covariances
[[[ 3.58889799e+02 -3.86609795e-02]
[-3.86609795e-02 2.50576436e+01]]
[[ 1.02463951e+02 1.38088891e+00]
[ 1.38088891e+00 1.00051507e+02]]
[[ 4.20956286e+02 -4.07566432e+01]
[-4.07566432e+01 3.87123236e+02]]]
[[ 71.24 28. ]
[ 52.53 25. ]
[ 64.54 27. ]
...
[170.91 12. ]
[176.14 5. ]
[168.03 9. ]]
#code for Kmeans
import matplotlib.pyplot as plt1
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
print(kmeans.cluster_centers_)
print(kmeans.labels_)
plt.title('KMEANS')
plt1.scatter(X[:,0], X[:,1], c=kmeans.labels_, cmap='rainbow')
plt1.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='black')
[[180.34311782 10.52011494]
[ 50.04763438 8.82875 ]
[177.83509615 70.28846154]]
[1 1 1 ... 0 0 0]
<matplotlib.collections.PathCollection at 0x7f652d2554d0>
Experiment-12: Exploratory Data Analysis for Classification using Pandas or Matplotlib.
Description:
The purpose of this EDA is to find insights that will serve us later in another notebook for data cleaning/preparation/transformation, which will ultimately be fed into a machine learning algorithm. We will proceed as follows:
CODE:
# Preparations: first import the necessary libraries and load the files needed for our EDA.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('bmh')
df = pd.read_csv('/content/train.csv')
df.head()
(output: the first five rows of the training set, running from Id, MSSubClass and MSZoning through SaleType, SaleCondition and SalePrice; 5 rows × 81 columns)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Id 1460 non-null int64
1 MSSubClass 1460 non-null int64
2 MSZoning 1460 non-null object
3 LotFrontage 1201 non-null float64
4 LotArea 1460 non-null int64
5 Street 1460 non-null object
6 Alley 91 non-null object
7 LotShape 1460 non-null object
8 LandContour 1460 non-null object
9 Utilities 1460 non-null object
10 LotConfig 1460 non-null object
11 LandSlope 1460 non-null object
12 Neighborhood 1460 non-null object
13 Condition1 1460 non-null object
14 Condition2 1460 non-null object
15 BldgType 1460 non-null object
16 HouseStyle 1460 non-null object
17 OverallQual 1460 non-null int64
18 OverallCond 1460 non-null int64
19 YearBuilt 1460 non-null int64
20 YearRemodAdd 1460 non-null int64
21 RoofStyle 1460 non-null object
22 RoofMatl 1460 non-null object
23 Exterior1st 1460 non-null object
24 Exterior2nd 1460 non-null object
25 MasVnrType 1452 non-null object
26 MasVnrArea 1452 non-null float64
27 ExterQual 1460 non-null object
28 ExterCond 1460 non-null object
29 Foundation 1460 non-null object
30 BsmtQual 1423 non-null object
31 BsmtCond 1423 non-null object
32 BsmtExposure 1422 non-null object
33 BsmtFinType1 1423 non-null object
34 BsmtFinSF1 1460 non-null int64
35 BsmtFinType2 1422 non-null object
36 BsmtFinSF2 1460 non-null int64
37 BsmtUnfSF 1460 non-null int64
38 TotalBsmtSF 1460 non-null int64
39 Heating 1460 non-null object
40 HeatingQC 1460 non-null object
41 CentralAir 1460 non-null object
42 Electrical 1459 non-null object
43 1stFlrSF 1460 non-null int64
44 2ndFlrSF 1460 non-null int64
45 LowQualFinSF 1460 non-null int64
46 GrLivArea 1460 non-null int64
47 BsmtFullBath 1460 non-null int64
48 BsmtHalfBath 1460 non-null int64
49 FullBath 1460 non-null int64
50 HalfBath 1460 non-null int64
51 BedroomAbvGr 1460 non-null int64
52 KitchenAbvGr 1460 non-null int64
53 KitchenQual 1460 non-null object
54 TotRmsAbvGrd 1460 non-null int64
55 Functional 1460 non-null object
56 Fireplaces 1460 non-null int64
57 FireplaceQu 770 non-null object
58 GarageType 1379 non-null object
59 GarageYrBlt 1379 non-null float64
60 GarageFinish 1379 non-null object
61 GarageCars 1460 non-null int64
62 GarageArea 1460 non-null int64
63 GarageQual 1379 non-null object
64 GarageCond 1379 non-null object
65 PavedDrive 1460 non-null object
66 WoodDeckSF 1460 non-null int64
67 OpenPorchSF 1460 non-null int64
68 EnclosedPorch 1460 non-null int64
69 3SsnPorch 1460 non-null int64
70 ScreenPorch 1460 non-null int64
71 PoolArea 1460 non-null int64
72 PoolQC 7 non-null object
73 Fence 281 non-null object
74 MiscFeature 54 non-null object
75 MiscVal 1460 non-null int64
76 MoSold 1460 non-null int64
77 YrSold 1460 non-null int64
78 SaleType 1460 non-null object
79 SaleCondition 1460 non-null object
80 SalePrice 1460 non-null int64
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
# From these observations we can see that some features won't be relevant to our exploratory analysis,
# as there are too many missing values (such as Alley and PoolQC). There are also so many features to
# analyse that it may be better to concentrate on the ones that can give us real insights. Let's just
# remove Id and the features with 30% or less NaN values.
print(df['SalePrice'].describe())
plt.figure(figsize=(9, 8))
sns.distplot(df['SalePrice'], color='g', bins=100, hist_kws={'alpha': 0.4});
count 1460.000000
mean 180921.195890
std 79442.502883
min 34900.000000
25% 129975.000000
50% 163000.000000
75% 214000.000000
max 755000.000000
Name: SalePrice, dtype: float64
/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot`
is a deprecated function and will be removed in a future version. Please adapt your code to use
either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level
function for histograms).warnings.warn(msg, FutureWarning)
# With this information we can see that the prices are skewed right and some outliers lie above ~500,000.
# We will eventually want to get rid of them to get a normal distribution of the target variable
# (`SalePrice`) for machine learning.
list(set(df.dtypes.tolist()))
[dtype('O'), dtype('float64'), dtype('int64')]
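The frame df_num used from here on is not defined in the excerpt; it is presumably the numeric-only subset of the data, e.g.:

# keep only the numerical columns (int64 / float64) for the distribution and correlation analysis
df_num = df.select_dtypes(include=['float64', 'int64'])
df_num.head()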
(output: the first five rows of the 37 numeric columns, ending with SalePrice; 5 rows × 37 columns)
df_num.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8);  # the trailing ';' suppresses matplotlib's verbose output
# Features such as `1stFlrSF`, `TotalBsmtSF`, `LotFrontage`, `GrLivArea`... seem to share a similar
# distribution to the one we have with `SalePrice`. Let's see if we can find new clues later.
# Correlation
# Now we'll try to find which features are strongly correlated with SalePrice. We'll store them in a
# variable called golden_features_list. We'll reuse our df_num dataset to do so.
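The code that actually computes golden_features_list is not reproduced; a sketch of the usual approach (the 0.5 threshold on the absolute correlation is an assumption):

# correlation of every numeric feature with SalePrice (drop SalePrice itself, the last entry)
df_num_corr = df_num.corr()['SalePrice'][:-1]
golden_features_list = df_num_corr[abs(df_num_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with SalePrice:\n{}".format(
    len(golden_features_list), golden_features_list))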
# Perfect, we now have a list of strongly correlated values, but this list is incomplete as we know that
# correlation is affected by outliers. So we could proceed as follows:
• Plot the numerical features and see which ones have very few or explainable outliers
• Remove the outliers from these features and see which one can have a good correlation without
their outliers
By the way, correlation by itself does not always explain the relationship between data, so plotting the features could lead us to new insights; in the same manner, we should check that our correlated values have a linear relationship to SalePrice.
For example, relationships such as curvilinear ones cannot be guessed just by looking at the correlation value, so let's take the features we excluded from our correlation table and plot them to see if they show some kind of pattern.
import operator

individual_features_df = []
for i in range(0, len(df_num.columns) - 1):   # -1 because the last column is SalePrice
    tmpDf = df_num[[df_num.columns[i], 'SalePrice']]
    tmpDf = tmpDf[tmpDf[df_num.columns[i]] != 0]
    individual_features_df.append(tmpDf)
# We found strongly correlated predictors with `SalePrice`. Later, with feature engineering, we may add
# dummy values where a value of a given feature > 0 would be 1 (presence of such feature) and 0 otherwise.
# For `2ndFlrSF`, for example, we could create a dummy value for its presence or absence and finally sum
# it up with `1stFlrSF`.
# Conclusion
# By looking at the correlation between numerical values we discovered 11 features which have a strong
# relationship to the house price. Besides correlation, we didn't find any notable pattern in the data
# that is not correlated.
plt.figure(figsize=(12, 10))
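The figure created on the last line is presumably followed by a correlation heat-map; a minimal sketch using seaborn (styling choices are assumptions):

corr = df_num.corr()
sns.heatmap(corr, vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=False, square=True, cmap='viridis')
plt.show()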
13. Write a program to construct a Bayesian network considering medical data. Use this model
to demonstrate the diagnosis of heart patients using standard Heart Disease Data Set. You can
use Java/Python ML library classes/API
Theory
A Bayesian network is a directed acyclic graph in which each edge corresponds to a conditional
dependency, and each node corresponds to a unique random variable.
Bayesian network consists of two major parts: a directed acyclic graph and a set of conditional
probability distributions
• The directed acyclic graph is a set of random variables represented by nodes.
• The conditional probability distribution of a node (random variable) is defined for every
possible outcome of the preceding causal node(s).
For illustration, consider the following example. Suppose we attempt to turn on our computer,
but the computer does not start (observation/evidence). We would like to know which of the
possible causes of computer failure is more likely. In this simplified illustration, we assume
only two possible causes of this misfortune: electricity failure and computer malfunction.
The corresponding directed acyclic graph is depicted in below figure.
Fig: Directed acyclic graph representing two independent possible causes of a computer failure.
The goal is to calculate the posterior conditional probability distribution of each of the possible
unobserved causes given the observed evidence, i.e. P [Cause | Evidence].
Data Set:
Title: Heart Disease Databases
The Cleveland database contains 76 attributes, but all published experiments refer to using a
subset of 14 of them. In particular, the Cleveland database is the only one that has been used
by ML researchers to this date. The "Heartdisease" field refers to the presence of heart disease
in the patient. It is integer valued from 0 (no presence) to 4.
Database: 0 1 2 3 4 Total
Cleveland: 164 55 36 35 13 303
Attribute Information:
1. age: age in years
2. sex: sex (1 = male; 0 = female)
3. cp: chest pain type
• Value 1: typical angina
• Value 2: atypical angina
• Value 3: non-anginal pain
• Value 4: asymptomatic
4. trestbps: resting blood pressure (in mm Hg on admission to the hospital)
5. chol: serum cholestoral in mg/dl
6. fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
7. restecg: resting electrocardiographic results
• Value 0: normal
• Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation
or depression of > 0.05 mV)
• Value 2: showing probable or definite left ventricular hypertrophy by Estes'
criteria
8. thalach: maximum heart rate achieved
9. exang: exercise induced angina (1 = yes; 0 = no)
10. oldpeak = ST depression induced by exercise relative to rest
11. slope: the slope of the peak exercise ST segment
• Value 1: upsloping
• Value 2: flat
• Value 3: downsloping
12. thal: 3 = normal; 6 = fixed defect; 7 = reversable defect
13. Heartdisease: It is integer valued from 0 (no presence) to 4.
Program:
import numpy as np
import pandas as pd
import csv
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.models import BayesianModel
from pgmpy.inference import VariableElimination
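The body of the program is not reproduced in the scan; a minimal sketch of how this exercise is usually completed with pgmpy, assuming a heart.csv containing the 14 Cleveland attributes (file and column names are assumptions):

# load and clean the Cleveland heart-disease data
heartDisease = pd.read_csv('heart.csv')
heartDisease = heartDisease.replace('?', np.nan)

# network structure: risk factors -> heartdisease -> observed test results
model = BayesianModel([('age', 'heartdisease'), ('sex', 'heartdisease'),
                       ('exang', 'heartdisease'), ('cp', 'heartdisease'),
                       ('heartdisease', 'restecg'), ('heartdisease', 'chol')])

# learn the conditional probability tables from the data
model.fit(heartDisease, estimator=MaximumLikelihoodEstimator)

# query the posterior P(heartdisease | evidence) by variable elimination
infer = VariableElimination(model)
print(infer.query(variables=['heartdisease'], evidence={'restecg': 1}))
print(infer.query(variables=['heartdisease'], evidence={'cp': 2}))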
Output:
Experiment-14: Write a program to Implement Support Vector Machines
CODE:
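The imports and the data-set loading for this experiment are not shown; judging by the Age / Estimated Salary axes in the plots, it presumably uses a Social_Network_Ads-style CSV, along these lines (file name assumed):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

datasets = pd.read_csv('Social_Network_Ads.csv')   # columns assumed: User ID, Gender, Age, EstimatedSalary, Purchased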
X = datasets.iloc[:, [2,3]].values
Y = datasets.iloc[:, 4].values
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.25, random_state = 0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_Train = sc_X.fit_transform(X_Train)
X_Test = sc_X.transform(X_Test)
# Fitting the classifier into the Training set
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_Train, Y_Train)
SVC(kernel='linear', random_state=0)
# Predicting the test set results
Y_Pred = classifier.predict(X_Test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_Test, Y_Pred)
# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_Set, Y_Set = X_Train, Y_Train
X1, X2 = np.meshgrid(np.arange(start = X_Set[:, 0].min() - 1, stop = X_Set[:, 0].max() + 1, step = 0.01),
np.arange(start = X_Set[:, 1].min() - 1, stop = X_Set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(Y_Set)):
plt.scatter(X_Set[Y_Set == j, 0], X_Set[Y_Set == j, 1],
c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Support Vector Machine (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()