MACHINE LEARNING LAB MANUAL

Program 1: Implement and demonstrate the FIND-S algorithm for finding the most specific
hypothesis based on a given set of training data samples. Read the training data from a
.CSV file.
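The training file read below (/content/sample_data/ws.csv) holds one example per row: six
attribute values followed by the Yes/No class label. Based on the output shown further down,
its contents would look like:

sunny,warm,normal,strong,warm,same,Yes
sunny,warm,high,strong,warm,same,Yes
rainy,cold,high,strong,warm,change,No
sunny,warm,high,strong,cool,change,Yes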

import random
import csv

attributes = [['Sunny','Rainy'],
['Warm','Cold'],
['Normal','High'],
['Strong','Weak'],
['Warm','Cool'],
['Same','Change']]

num_attributes = len(attributes)

print (" \n The most general hypothesis : ['?','?','?','?','?','?']\n")


print ("\n The most specific hypothesis : ['0','0','0','0','0','0']\n")

a=[]
print("\n The Given Training Data Set \n")

with open('/content/sample_data/ws.csv', 'r') as csvFile:
    reader = csv.reader(csvFile)
    for row in reader:
        a.append(row)
        print(row)

print("\n The initial value of hypothesis: ")
hypothesis = ['0'] * num_attributes
print(hypothesis)

# Comparing with First Training Example


for j in range(0, num_attributes):
    hypothesis[j] = a[0][j]

# Comparing with Remaining Training Examples of Given Data Set

print("\n Find S: Finding a Maximally Specific Hypothesis\n")

for i in range(0, len(a)):
    if a[i][num_attributes] == 'Yes':
        for j in range(0, num_attributes):
            if a[i][j] != hypothesis[j]:
                hypothesis[j] = '?'
            else:
                hypothesis[j] = a[i][j]
    print(" For Training Example No :{0} the hypothesis is ".format(i), hypothesis)

print("\n The Maximally Specific Hypothesis for a given Training Examples :\n")
print(hypothesis)
OUTPUT :
The most general hypothesis : ['?','?','?','?','?','?']

The most specific hypothesis : ['0','0','0','0','0','0']

The Given Training Data Set

['sunny', 'warm', 'normal', 'strong', 'warm', 'same', 'Yes']


['sunny', 'warm', 'high', 'strong', 'warm', 'same', 'Yes']
['rainy', 'cold', 'high', 'strong', 'warm', 'change', 'No']
['sunny', 'warm', 'high', 'strong', 'cool', 'change', 'Yes']

The initial value of hypothesis:


['0', '0', '0', '0', '0', '0']

Find S: Finding a Maximally Specific Hypothesis

For Training Example No :0 the hypothesis is ['sunny', 'warm', 'normal', 'strong',


'warm', 'same']
For Training Example No :1 the hypothesis is ['sunny', 'warm', '?', 'strong', 'warm',
'same']
For Training Example No :2 the hypothesis is ['sunny', 'warm', '?', 'strong', 'warm',
'same']
For Training Example No :3 the hypothesis is ['sunny', 'warm', '?', 'strong', '?', '?']

The Maximally Specific Hypothesis for a given Training Examples :

['sunny', 'warm', '?', 'strong', '?', '?']


Problem 2: For a given set of training data examples stored in a .CSV file,
implement and demonstrate the Candidate-Elimination algorithm to
output a description of the set of all hypotheses consistent with the
training examples.
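
The CSV read below (/content/sample_data/wsce.csv) follows the same layout, with a final Y/N
decision column. Based on the examples list shown later, its contents would be:

Sunny,Warm,Normal,Strong,Warm,Same,Y
Sunny,Warm,High,Strong,Warm,Same,Y
Rainy,Cold,High,Strong,Warm,Change,N
Sunny,Warm,High,Strong,Cool,Change,Y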

import random
import csv

def g_0(n):
    return ("?",) * n

def s_0(n):
    return ('0',) * n

def more_general(h1, h2):
    more_general_parts = []
    for x, y in zip(h1, h2):
        mg = x == "?" or (x != "0" and (x == y or y == "0"))
        more_general_parts.append(mg)
    return all(more_general_parts)

# zip() pairs up corresponding elements of two sequences:
l1 = [1, 2, 3]
l2 = [3, 4, 5]
list(zip(l1, l2))

[(1, 3), (2, 4), (3, 5)]


# min_generalizations
def fulfills(example, hypothesis):
    ### the implementation is the same as for hypotheses:
    return more_general(hypothesis, example)

def min_generalizations(h, x):
    h_new = list(h)
    for i in range(len(h)):
        if not fulfills(x[i:i+1], h[i:i+1]):
            h_new[i] = '?' if h[i] != '0' else x[i]
    return [tuple(h_new)]

min_generalizations(h=('0', '0', 'sunny'),
                    x=('rainy', 'windy', 'cloudy'))

[('rainy', 'windy', '?')]

def min_specializations(h, domains, x):
    results = []
    for i in range(len(h)):
        if h[i] == "?":
            for val in domains[i]:
                if x[i] != val:
                    h_new = h[:i] + (val,) + h[i+1:]
                    results.append(h_new)
        elif h[i] != "0":
            h_new = h[:i] + ('0',) + h[i+1:]
            results.append(h_new)
    return results

min_specializations(h=('?', 'x',),
domains=[['a', 'b', 'c'], ['x', 'y']],
x=('b', 'x'))

[('a', 'x'), ('c', 'x'), ('?', '0')]

with open('/content/sample_data/wsce.csv') as csvFile:
    examples = [tuple(line) for line in csv.reader(csvFile)]
#examples = [('sunny', 'warm', 'normal', 'strong', 'warm', 'same',True),
# ('sunny', 'warm', 'high', 'strong', 'warm', 'same',True),
# ('rainy', 'cold', 'high', 'strong', 'warm', 'change',False),
# ('sunny', 'warm', 'high', 'strong', 'cool', 'change',True)]
examples
[('Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same', 'Y'),
('Sunny', 'Warm', 'High', 'Strong', 'Warm', 'Same', 'Y'),
('Rainy', 'Cold', 'High', 'Strong', 'Warm', 'Change', 'N'),
('Sunny', 'Warm', 'High', 'Strong', 'Cool', 'Change', 'Y')]

def get_domains(examples):
    d = [set() for i in examples[0]]
    for x in examples:
        for i, xi in enumerate(x):
            d[i].add(xi)
    return [list(sorted(x)) for x in d]

get_domains(examples)

[['Rainy', 'Sunny'],
['Cold', 'Warm'],
['High', 'Normal'],
['Strong'],
['Cool', 'Warm'],
['Change', 'Same'],
['N', 'Y']]
def candidate_elimination(examples):
    domains = get_domains(examples)[:-1]
    G = set([g_0(len(domains))])
    S = set([s_0(len(domains))])
    i = 0
    print("\n G[{0}]:".format(i), G)
    print("\n S[{0}]:".format(i), S)
    for xcx in examples:
        i = i + 1
        x, cx = xcx[:-1], xcx[-1]  # split each row into attributes and decision
        if cx == 'Y':  # x is a positive example
            G = {g for g in G if fulfills(x, g)}
            S = generalize_S(x, G, S)
        else:          # x is a negative example
            S = {s for s in S if not fulfills(x, s)}
            G = specialize_G(x, domains, G, S)
        print("\n G[{0}]:".format(i), G)
        print("\n S[{0}]:".format(i), S)
    return
def generalize_S(x, G, S):
    S_prev = list(S)
    for s in S_prev:
        if s not in S:
            continue
        if not fulfills(x, s):
            S.remove(s)
            Splus = min_generalizations(s, x)
            ## keep only generalizations that have a counterpart in G
            S.update([h for h in Splus if any([more_general(g, h) for g in G])])
            ## remove hypotheses less specific than any other in S
            S.difference_update([h for h in S if
                                 any([more_general(h, h1) for h1 in S if h != h1])])
    return S
def specialize_G(x, domains, G, S):
    G_prev = list(G)
    for g in G_prev:
        if g not in G:
            continue
        if fulfills(x, g):
            G.remove(g)
            Gminus = min_specializations(g, domains, x)
            ## keep only specializations that have a counterpart in S
            G.update([h for h in Gminus if any([more_general(h, s) for s in S])])
            ## remove hypotheses less general than any other in G
            G.difference_update([h for h in G if
                                 any([more_general(g1, h) for g1 in G if h != g1])])
    return G

candidate_elimination(examples)
G[0]: {('?', '?', '?', '?', '?', '?')}

S[0]: {('0', '0', '0', '0', '0', '0')}

G[1]: {('?', '?', '?', '?', '?', '?')}

S[1]: {('Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same')}

G[2]: {('?', '?', '?', '?', '?', '?')}

S[2]: {('Sunny', 'Warm', '?', 'Strong', 'Warm', 'Same')}

G[3]: {('Sunny', '?', '?', '?', '?', '?'), ('?', 'Warm', '?', '?', '?', '?'), ('?', '?', '?', '?', '?',
'Same')}

S[3]: {('Sunny', 'Warm', '?', 'Strong', 'Warm', 'Same')}

G[4]: {('Sunny', '?', '?', '?', '?', '?'), ('?', 'Warm', '?', '?', '?', '?')}

S[4]: {('Sunny', 'Warm', '?', 'Strong', '?', '?')}


# Import Play Tennis Data
import pandas as pd

df_tennis = pd.read_csv('C:\\Users\\Desktop\\Data\\PlayTennis.csv')

print("\n Given Play Tennis Data Set:\n\n", df_tennis)

Given Play Tennis Data Set:

PlayTennis Outlook Temperature Humidity Wind

0 No Sunny Hot High Weak

1 No Sunny Hot High Strong

2 Yes Overcast Hot High Weak

3 Yes Rain Mild High Weak

4 Yes Rain Cool Normal Weak

5 No Rain Cool Normal Strong

6 Yes Overcast Cool Normal Strong

7 No Sunny Mild High Weak

8 Yes Sunny Cool Normal Weak

9 Yes Rain Mild Normal Weak

10 Yes Sunny Mild Normal Strong

11 Yes Overcast Mild High Strong

12 Yes Overcast Hot Normal Weak

13 No Rain Mild High Strong

#df_tennis.columns[0]
df_tennis.keys()[0]

'PlayTennis'

# Entropy of the Training Data Set

# Function to calculate the entropy of a list of class probabilities:
# Entropy = sum over all classes of  -p * log2(p)

def entropy(probs):
    import math
    return sum([-prob * math.log(prob, 2) for prob in probs])
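
As a quick sanity check against the output shown further below (the PlayTennis column has
9 Yes and 5 No out of 14 rows):

entropy([9/14, 5/14])
# = -(9/14)*log2(9/14) - (5/14)*log2(5/14) ≈ 0.9403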

# Function to calculate the entropy of the given data set/list with respect to the target attribute

def entropy_of_list(a_list):
    from collections import Counter
    cnt = Counter(x for x in a_list)  # Counter tallies the occurrences of each class
    num_instances = len(a_list) * 1.0  # = 14 for the full data set
    print("\n Number of Instances of the Current Sub Class is {0}:".format(num_instances))
    probs = [x / num_instances for x in cnt.values()]  # x is the count of YES/NO
    print("\n Classes:", min(cnt), max(cnt))
    print(" \n Probabilities of Class {0} is {1}:".format(min(cnt), min(probs)))
    print(" \n Probabilities of Class {0} is {1}:".format(max(cnt), max(probs)))
    return entropy(probs)  # call entropy on the class probabilities


# The initial entropy of the YES/NO attribute for our dataset.

print("\n INPUT DATA SET FOR ENTROPY CALCULATION:\n", df_tennis['PlayTennis'])

total_entropy = entropy_of_list(df_tennis['PlayTennis'])

print("\n Total Entropy of PlayTennis Data Set:",total_entropy)

INPUT DATA SET FOR ENTROPY CALCULATION:

0 No

1 No

2 Yes

3 Yes

4 Yes

5 No

6 Yes

7 No

8 Yes

9 Yes

10 Yes

11 Yes

12 Yes

13 No

Name: PlayTennis, dtype: object

Number of Instances of the Current Sub Class is 14.0:

Classes: No Yes

Probabilities of Class No is 0.35714285714285715:


Probabilities of Class Yes is 0.6428571428571429:

Total Entropy of PlayTennis Data Set: 0.9402859586706309

# Information Gain of Attributes

def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    print("Information Gain Calculation of ", split_attribute_name)
    # Split the data by the possible values of the attribute:
    df_split = df.groupby(split_attribute_name)
    # Calculate the entropy of the target attribute, as well as the
    # proportion of observations in each split:
    nobs = len(df.index) * 1.0
    df_agg_ent = df_split.agg({target_attribute_name: [entropy_of_list,
                                                       lambda x: len(x) / nobs]})[target_attribute_name]
    df_agg_ent.columns = ['Entropy', 'PropObservations']
    # Information Gain = entropy before the split - weighted entropy after the split:
    new_entropy = sum(df_agg_ent['Entropy'] * df_agg_ent['PropObservations'])
    old_entropy = entropy_of_list(df[target_attribute_name])
    return old_entropy - new_entropy

print('Info-gain for Outlook is :'+str( information_gain(df_tennis, 'Outlook', 'PlayTennis')),"\n")

print('\n Info-gain for Humidity is: ' + str( information_gain(df_tennis, 'Humidity', 'PlayTennis')),"\n")

print('\n Info-gain for Wind is:' + str( information_gain(df_tennis, 'Wind', 'PlayTennis')),"\n")

print('\n Info-gain for Temperature is:' + str( information_gain(df_tennis, 'Temperature', 'PlayTennis')),"\n")

Information Gain Calculation of Outlook

Number of Instances of the Current Sub Class is 4.0:

Classes: Yes Yes

Probabilities of Class Yes is 1.0:

Probabilities of Class Yes is 1.0:

Number of Instances of the Current Sub Class is 5.0:

Classes: No Yes

Probabilities of Class No is 0.4:


Probabilities of Class Yes is 0.6:

Number of Instances of the Current Sub Class is 5.0:

Classes: No Yes

Probabilities of Class No is 0.4:

Probabilities of Class Yes is 0.6:

Number of Instances of the Current Sub Class is 14.0:

Classes: No Yes

Probabilities of Class No is 0.35714285714285715:

Probabilities of Class Yes is 0.6428571428571429:

Info-gain for Outlook is :0.246749819774
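
The Outlook figure can be verified by hand: the pure 4-instance sub-class (Overcast) has
entropy 0, while each 0.4/0.6 sub-class above has entropy ≈ 0.9710, so

# Gain(S, Outlook) = Entropy(S) - sum over values v of |S_v|/|S| * Entropy(S_v)
#                  = 0.9403 - (4/14)*0.0 - (5/14)*0.9710 - (5/14)*0.9710 ≈ 0.2467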

Information Gain Calculation of Humidity

Number of Instances of the Current Sub Class is 7.0:

Classes: No Yes

Probabilities of Class No is 0.42857142857142855:


Probabilities of Class Yes is 0.5714285714285714:

Number of Instances of the Current Sub Class is 7.0:

Classes: No Yes

Probabilities of Class No is 0.14285714285714285:

Probabilities of Class Yes is 0.8571428571428571:

Number of Instances of the Current Sub Class is 14.0:

Classes: No Yes

Probabilities of Class No is 0.35714285714285715:

Probabilities of Class Yes is 0.6428571428571429:

Info-gain for Humidity is: 0.151835501362

Information Gain Calculation of Wind

Number of Instances of the Current Sub Class is 6.0:

Classes: No Yes

Probabilities of Class No is 0.5:


Probabilities of Class Yes is 0.5:

Number of Instances of the Current Sub Class is 8.0:

Classes: No Yes

Probabilities of Class No is 0.25:

Probabilities of Class Yes is 0.75:

Number of Instances of the Current Sub Class is 14.0:

Classes: No Yes

Probabilities of Class No is 0.35714285714285715:

Probabilities of Class Yes is 0.6428571428571429:

Info-gain for Wind is:0.0481270304083

Information Gain Calculation of Temperature

Number of Instances of the Current Sub Class is 4.0:

Classes: No Yes

Probabilities of Class No is 0.25:


Probabilities of Class Yes is 0.75:

Number of Instances of the Current Sub Class is 4.0:

Classes: No Yes

Probabilities of Class No is 0.5:

Probabilities of Class Yes is 0.5:

Number of Instances of the Current Sub Class is 6.0:

Classes: No Yes

Probabilities of Class No is 0.3333333333333333:

Probabilities of Class Yes is 0.6666666666666666:

Number of Instances of the Current Sub Class is 14.0:

Classes: No Yes

Probabilities of Class No is 0.35714285714285715:

Probabilities of Class Yes is 0.6428571428571429:

Info-gain for Temperature is:0.029222565659


# ID3 Algorithm
def id3(df, target_attribute_name, attribute_names, default_class=None):
    ## Tally the target attribute:
    from collections import Counter
    cnt = Counter(x for x in df[target_attribute_name])  # counts of the YES/NO classes

    ## First check: Is this split of the dataset homogeneous?
    if len(cnt) == 1:
        return next(iter(cnt))  # return the single remaining class label

    ## Second check: Is this split of the dataset empty?
    ## If yes, return a default value.
    elif df.empty or (not attribute_names):
        return default_class  # return None for an empty data set

    ## Otherwise: this dataset is ready to be divided up!
    else:
        # Get the default value for the next recursive call of this function:
        default_class = max(cnt.keys())  # default prediction taken from the YES/NO classes
        # Compute the Information Gain of the attributes:
        gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names]
        index_of_max = gainz.index(max(gainz))  # index of the best attribute
        # Choose the best attribute to split on:
        best_attr = attribute_names[index_of_max]
        # Create an empty tree, to be populated in a moment:
        tree = {best_attr: {}}  # initiate the tree with the best attribute as a node
        remaining_attribute_names = [i for i in attribute_names if i != best_attr]
        # Split the dataset; on each split, recursively call this algorithm and
        # populate the empty tree with the resulting subtrees.
        for attr_val, data_subset in df.groupby(best_attr):
            subtree = id3(data_subset,
                          target_attribute_name,
                          remaining_attribute_names,
                          default_class)
            tree[best_attr][attr_val] = subtree
        return tree

# Predicting Attributes

# Get Predictor Names (all but 'class')

attribute_names = list(df_tennis.columns)

print("List of Attributes:", attribute_names)

attribute_names.remove('PlayTennis') #Remove the class attribute

print("Predicting Attributes:", attribute_names)


List of Attributes: ['PlayTennis', 'Outlook', 'Temperature', 'Humidity', 'Wind']

Predicting Attributes: ['Outlook', 'Temperature', 'Humidity', 'Wind']

# Run Algorithm:

from pprint import pprint

tree = id3(df_tennis,'PlayTennis',attribute_names)

print("\n\nThe Resultant Decision Tree is :\n")

#print(tree)

pprint(tree)

attribute = next(iter(tree))

print("Best Attribute :\n",attribute)

print("Tree Keys:\n",tree[attribute].keys())

Information Gain Calculation of Outlook

Number of Instances of the Current Sub Class is 4.0:

Classes: Yes Yes

Probabilities of Class Yes is 1.0:

Probabilities of Class Yes is 1.0:

Number of Instances of the Current Sub Class is 5.0:


Classes: No Yes

Probabilities of Class No is 0.4:

Probabilities of Class Yes is 0.6:

Number of Instances of the Current Sub Class is 5.0:

Classes: No Yes

Probabilities of Class No is 0.4:

Probabilities of Class Yes is 0.6:

Number of Instances of the Current Sub Class is 14.0:

Classes: No Yes

Probabilities of Class No is 0.35714285714285715:

Probabilities of Class Yes is 0.6428571428571429:

Information Gain Calculation of Temperature

Number of Instances of the Current Sub Class is 4.0:

Classes: No Yes
Probabilities of Class No is 0.25:

Probabilities of Class Yes is 0.75:

Number of Instances of the Current Sub Class is 4.0:

Classes: No Yes

Probabilities of Class No is 0.5:

Probabilities of Class Yes is 0.5:

Number of Instances of the Current Sub Class is 6.0:

Classes: No Yes

Probabilities of Class No is 0.3333333333333333:

Probabilities of Class Yes is 0.6666666666666666:

Number of Instances of the Current Sub Class is 14.0:

Classes: No Yes

Probabilities of Class No is 0.35714285714285715:

Probabilities of Class Yes is 0.6428571428571429:

Information Gain Calculation of Humidity


Number of Instances of the Current Sub Class is 7.0:

Classes: No Yes

Probabilities of Class No is 0.42857142857142855:

Probabilities of Class Yes is 0.5714285714285714:

Number of Instances of the Current Sub Class is 7.0:

Classes: No Yes

Probabilities of Class No is 0.14285714285714285:

Probabilities of Class Yes is 0.8571428571428571:

Number of Instances of the Current Sub Class is 14.0:

Classes: No Yes

Probabilities of Class No is 0.35714285714285715:

Probabilities of Class Yes is 0.6428571428571429:

Information Gain Calculation of Wind

Number of Instances of the Current Sub Class is 6.0:


Classes: No Yes

Probabilities of Class No is 0.5:

Probabilities of Class Yes is 0.5:

Number of Instances of the Current Sub Class is 8.0:

Classes: No Yes

Probabilities of Class No is 0.25:

Probabilities of Class Yes is 0.75:

Number of Instances of the Current Sub Class is 14.0:

Classes: No Yes

Probabilities of Class No is 0.35714285714285715:

Probabilities of Class Yes is 0.6428571428571429:

Information Gain Calculation of Temperature

Number of Instances of the Current Sub Class is 2.0:

Classes: No Yes

Probabilities of Class No is 0.5:


Probabilities of Class Yes is 0.5:

Number of Instances of the Current Sub Class is 3.0:

Classes: No Yes

Probabilities of Class No is 0.3333333333333333:

Probabilities of Class Yes is 0.6666666666666666:

Number of Instances of the Current Sub Class is 5.0:

Classes: No Yes

Probabilities of Class No is 0.4:

Probabilities of Class Yes is 0.6:

Information Gain Calculation of Humidity

Number of Instances of the Current Sub Class is 2.0:

Classes: No Yes

Probabilities of Class No is 0.5:

Probabilities of Class Yes is 0.5:


Number of Instances of the Current Sub Class is 3.0:

Classes: No Yes

Probabilities of Class No is 0.3333333333333333:

Probabilities of Class Yes is 0.6666666666666666:

Number of Instances of the Current Sub Class is 5.0:

Classes: No Yes

Probabilities of Class No is 0.4:

Probabilities of Class Yes is 0.6:

Information Gain Calculation of Wind

Number of Instances of the Current Sub Class is 2.0:

Classes: No No

Probabilities of Class No is 1.0:

Probabilities of Class No is 1.0:

Number of Instances of the Current Sub Class is 3.0:

Classes: Yes Yes


Probabilities of Class Yes is 1.0:

Probabilities of Class Yes is 1.0:

Number of Instances of the Current Sub Class is 5.0:

Classes: No Yes

Probabilities of Class No is 0.4:

Probabilities of Class Yes is 0.6:

Information Gain Calculation of Temperature

Number of Instances of the Current Sub Class is 1.0:

Classes: Yes Yes

Probabilities of Class Yes is 1.0:

Probabilities of Class Yes is 1.0:

Number of Instances of the Current Sub Class is 2.0:

Classes: No No

Probabilities of Class No is 1.0:


Probabilities of Class No is 1.0:

Number of Instances of the Current Sub Class is 2.0:

Classes: No Yes

Probabilities of Class No is 0.5:

Probabilities of Class Yes is 0.5:

Number of Instances of the Current Sub Class is 5.0:

Classes: No Yes

Probabilities of Class No is 0.4:

Probabilities of Class Yes is 0.6:

Information Gain Calculation of Humidity

Number of Instances of the Current Sub Class is 3.0:

Classes: No No

Probabilities of Class No is 1.0:

Probabilities of Class No is 1.0:

Number of Instances of the Current Sub Class is 2.0:


Classes: Yes Yes

Probabilities of Class Yes is 1.0:

Probabilities of Class Yes is 1.0:

Number of Instances of the Current Sub Class is 5.0:

Classes: No Yes

Probabilities of Class No is 0.4:

Probabilities of Class Yes is 0.6:

Information Gain Calculation of Wind

Number of Instances of the Current Sub Class is 2.0:

Classes: No Yes

Probabilities of Class No is 0.5:

Probabilities of Class Yes is 0.5:

Number of Instances of the Current Sub Class is 3.0:

Classes: No Yes
Probabilities of Class No is 0.3333333333333333:

Probabilities of Class Yes is 0.6666666666666666:

Number of Instances of the Current Sub Class is 5.0:

Classes: No Yes

Probabilities of Class No is 0.4:

Probabilities of Class Yes is 0.6:

The Resultant Decision Tree is :

{'Outlook': {'Overcast': 'Yes',

'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}},

'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}

Best Attribute :

Outlook

Tree Keys:

dict_keys(['Overcast', 'Rain', 'Sunny'])
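
The nested dictionary returned by id3 can also be used to classify new examples. A minimal
sketch of such a helper (classify is not part of the original listing; the default argument
is an assumption):

def classify(instance, tree, default=None):
    # Walk the nested dict: each level is {attribute: {value: subtree_or_label}}.
    attribute = next(iter(tree))              # attribute tested at this node
    if instance[attribute] in tree[attribute]:
        result = tree[attribute][instance[attribute]]
        if isinstance(result, dict):          # still an internal node -> recurse
            return classify(instance, result, default)
        return result                         # leaf: class label
    return default                            # unseen attribute value

# Example: with the tree above, a Sunny/Cool/High/Strong day is classified as 'No'.
sample = {'Outlook': 'Sunny', 'Temperature': 'Cool', 'Humidity': 'High', 'Wind': 'Strong'}
print(classify(sample, tree))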


Experiment-4: Exercises to solve real-world problems using the following machine learning
methods: c) Binary Classifier

Description:

BINARY CLASSIFICATION - FLOOD PREDICTION MODEL

• Flood prediction is a binary classification problem with class labels YES and NO.
• Machine learning algorithms are used to predict the chance of a flood in the state of
  Kerala using the Kerala flood dataset.
• This model uses five machine learning algorithms, namely KNN Classification, Logistic
  Regression, Support Vector Machine, Decision Tree and Random Forest, to find the best
  possible model for predicting floods from the Kerala rainfall data.

CODE :

# Importing Required Libraries.


import numpy as np
import pandas as pd
data = pd.read_csv('/content/kerala.csv')
print(data)

SUBDIVISION YEAR JAN FEB MAR APR MAY JUN JUL AUG \
0 KERALA 1901 28.7 44.7 51.6 160.0 174.7 824.6 743.0 357.5
1 KERALA 1902 6.7 2.6 57.3 83.9 134.5 390.9 1205.0 315.8
2 KERALA 1903 3.2 18.6 3.1 83.6 249.7 558.6 1022.5 420.2
3 KERALA 1904 23.7 3.0 32.2 71.5 235.7 1098.2 725.5 351.8
4 KERALA 1905 1.2 22.3 9.4 105.9 263.3 850.2 520.5 293.6
.. ... ... ... ... ... ... ... ... ... ...
113 KERALA 2014 4.6 10.3 17.9 95.7 251.0 454.4 677.8 733.9
114 KERALA 2015 3.1 5.8 50.1 214.1 201.8 563.6 406.0 252.2
115 KERALA 2016 2.4 3.8 35.9 143.0 186.4 522.2 412.3 325.5
116 KERALA 2017 1.9 6.8 8.9 43.6 173.5 498.5 319.6 531.8
117 KERALA 2018 29.1 52.1 48.6 116.4 183.8 625.4 1048.5 1398.9

SEP OCT NOV DEC ANNUAL RAINFALL FLOODS


0 197.7 266.9 350.8 48.4 3248.6 YES
1 491.6 358.4 158.3 121.5 3326.6 YES
2 341.8 354.1 157.0 59.0 3271.2 YES
3 222.7 328.1 33.9 3.3 3129.7 YES
4 217.2 383.5 74.4 0.2 2741.6 NO
.. ... ... ... ... ... ...
113 298.8 355.5 99.5 47.2 3046.4 YES
114 292.9 308.1 223.6 79.4 2600.6 NO
115 173.2 225.9 125.4 23.6 2176.6 NO
116 209.5 192.4 92.5 38.1 2117.1 NO
117 423.6 356.1 125.4 65.1 4473.0 YES

[118 rows x 16 columns]


data.head()

  SUBDIVISION  YEAR   JAN   FEB   MAR    APR    MAY     JUN     JUL    AUG    SEP    OCT    NOV    DEC  ANNUAL RAINFALL FLOODS
0      KERALA  1901  28.7  44.7  51.6  160.0  174.7   824.6   743.0  357.5  197.7  266.9  350.8   48.4           3248.6    YES
1      KERALA  1902   6.7   2.6  57.3   83.9  134.5   390.9  1205.0  315.8  491.6  358.4  158.3  121.5           3326.6    YES
2      KERALA  1903   3.2  18.6   3.1   83.6  249.7   558.6  1022.5  420.2  341.8  354.1  157.0   59.0           3271.2    YES
3      KERALA  1904  23.7   3.0  32.2   71.5  235.7  1098.2   725.5  351.8  222.7  328.1   33.9    3.3           3129.7    YES
4      KERALA  1905   1.2  22.3   9.4  105.9  263.3   850.2   520.5  293.6  217.2  383.5   74.4    0.2           2741.6     NO

data.tail()

    SUBDIVISION  YEAR   JAN   FEB   MAR    APR    MAY    JUN     JUL     AUG    SEP    OCT    NOV   DEC  ANNUAL RAINFALL FLOODS
113      KERALA  2014   4.6  10.3  17.9   95.7  251.0  454.4   677.8   733.9  298.8  355.5   99.5  47.2           3046.4    YES
114      KERALA  2015   3.1   5.8  50.1  214.1  201.8  563.6   406.0   252.2  292.9  308.1  223.6  79.4           2600.6     NO
115      KERALA  2016   2.4   3.8  35.9  143.0  186.4  522.2   412.3   325.5  173.2  225.9  125.4  23.6           2176.6     NO
116      KERALA  2017   1.9   6.8   8.9   43.6  173.5  498.5   319.6   531.8  209.5  192.4   92.5  38.1           2117.1     NO
117      KERALA  2018  29.1  52.1  48.6  116.4  183.8  625.4  1048.5  1398.9  423.6  356.1  125.4  65.1           4473.0    YES
# Finding the number of missing values
data.isnull().sum()  # checking whether any column has empty entries
SUBDIVISION 0
YEAR 0
JAN 0
FEB 0
MAR 0
APR 0
MAY 0
JUN 0
JUL 0
AUG 0
SEP 0
OCT 0
NOV 0
DEC 0
ANNUAL RAINFALL 0
FLOODS 0
dtype: int64

# Analyzing the Dataset

print(data.shape)
(118, 16)

data.describe()

              YEAR         JAN         FEB         MAR         APR         MAY          JUN          JUL          AUG         SEP         OCT         NOV         DEC  ANNUAL RAINFALL
count   118.000000  118.000000  118.000000  118.000000  118.000000  118.000000   118.000000   118.000000   118.000000  118.000000  118.000000  118.000000  118.000000       118.000000
mean   1959.500000   12.218644   15.633898   36.670339  110.330508  228.644915   651.617797   698.220339   430.369492  246.207627  293.207627  162.311017   40.009322      2925.405085
std      34.207699   15.473766   16.406290   30.063862   44.633452  147.548778   186.181363   228.988966   181.980463  121.901131   93.705253   83.200485   36.676330       452.169407
min    1901.000000    0.000000    0.000000    0.100000   13.100000   53.400000   196.800000   167.500000   178.600000   41.300000   68.500000   31.500000    0.100000      2068.800000
25%    1930.250000    2.175000    4.700000   18.100000   74.350000  125.050000   535.550000   533.200000   316.725000  155.425000  222.125000   93.025000   10.350000      2613.525000
50%    1959.500000    5.800000    8.350000   28.400000  110.400000  184.600000   625.600000   691.650000   386.250000  223.550000  284.300000  152.450000   31.100000      2934.300000
75%    1988.750000   18.175000   21.400000   49.825000  136.450000  264.875000   786.975000   832.425000   500.100000  334.500000  355.150000  218.325000   54.025000      3170.400000
max    2018.000000   83.500000   79.000000  217.200000  238.000000  738.800000  1098.200000  1526.500000  1398.900000  526.700000  567.900000  365.600000  202.300000      4473.000000

data.info
SUBDIVISION YEAR JAN FEB MAR APR MAY JUN JUL AUG \
0 KERALA 1901 28.7 44.7 51.6 160.0 174.7 824.6 743.0 357.5
1 KERALA 1902 6.7 2.6 57.3 83.9 134.5 390.9 1205.0 315.8
2 KERALA 1903 3.2 18.6 3.1 83.6 249.7 558.6 1022.5 420.2
3 KERALA 1904 23.7 3.0 32.2 71.5 235.7 1098.2 725.5 351.8
4 KERALA 1905 1.2 22.3 9.4 105.9 263.3 850.2 520.5 293.6
.. ... ... ... ... ... ... ... ... ... ...
113 KERALA 2014 4.6 10.3 17.9 95.7 251.0 454.4 677.8 733.9
114 KERALA 2015 3.1 5.8 50.1 214.1 201.8 563.6 406.0 252.2
115 KERALA 2016 2.4 3.8 35.9 143.0 186.4 522.2 412.3 325.5
116 KERALA 2017 1.9 6.8 8.9 43.6 173.5 498.5 319.6 531.8
117 KERALA 2018 29.1 52.1 48.6 116.4 183.8 625.4 1048.5 1398.9

SEP OCT NOV DEC ANNUAL RAINFALL FLOODS


0 197.7 266.9 350.8 48.4 3248.6 YES
1 491.6 358.4 158.3 121.5 3326.6 YES
2 341.8 354.1 157.0 59.0 3271.2 YES
3 222.7 328.1 33.9 3.3 3129.7 YES
4 217.2 383.5 74.4 0.2 2741.6 NO
.. ... ... ... ... ... ...
113 298.8 355.5 99.5 47.2 3046.4 YES
114 292.9 308.1 223.6 79.4 2600.6 NO
115 173.2 225.9 125.4 23.6 2176.6 NO
116 209.5 192.4 92.5 38.1 2117.1 NO
117 423.6 356.1 125.4 65.1 4473.0 YES

[118 rows x 16 columns]>

data.cov()

(This prints the 14 x 14 covariance matrix between YEAR, the twelve monthly rainfall totals
and ANNUAL RAINFALL; the normalized version of the same information is the correlation
matrix shown next.)

data.corr()

YEAR JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC
ANNUAL RAINFALL
YEAR 1.000000 -0.225531 0.003879 -0.012842 0.086865 -0.059661
-0.174938 -0.223403 0.044173 0.107655 -0.030223 -0.130129
-0.123643 -0.198048
JAN -0.225531 1.000000 0.019613 0.078626 0.034807 0.071420
0.189375 0.034423 0.008677 -0.113502 -0.035044 -0.011034
-0.089809 0.118648
FEB 0.003879 0.019613 1.000000 0.245375 0.123706 -0.083500
0.054114 0.005789 0.023259 0.066317 0.053133 -0.162880
-0.127025 0.061457
MAR -0.012842 0.078626 0.245375 1.000000 0.074014 -0.102961
0.019000 0.018330 0.042411 0.143850 -0.023066 -0.032612
0.026292 0.116103
APR 0.086865 0.034807 0.123706 0.074014 1.000000 -0.114566
0.072990 0.014977 -0.047842 0.012928 0.113172 0.022206
-0.110392 0.112358
MAY -0.059661 0.071420 -0.083500 -0.102961 -0.114566 1.000000
0.001235 -0.046518 -0.124412 0.116860 0.197102 0.094934
-0.118077 0.314723
JUN -0.174938 0.189375 0.054114 0.019000 0.072990 0.001235
1.000000 0.094939 -0.014549 -0.052634 0.001156 0.015967
-0.085188 0.453407
JUL -0.223403 0.034423 0.005789 0.018330 0.014977 -0.046518
0.094939 1.000000 0.154467 0.209441 0.025223 -0.028526
-0.013573 0.651990
AUG 0.044173 0.008677 0.023259 0.042411 -0.047842 -0.124412
-0.014549 0.154467 1.000000 0.098215 -0.181496 -0.112729
0.142090 0.413036
SEP 0.107655 -0.113502 0.066317 0.143850 0.012928 0.116860
-0.052634 0.209441 0.098215 1.000000 -0.032348 -0.027615
-0.011007 0.428344
OCT -0.030223 -0.035044 0.053133 -0.023066 0.113172 0.197102
0.001156 0.025223 -0.181496 -0.032348 1.000000 -0.024060
-0.039067 0.205861
NOV -0.130129 -0.011034 -0.162880 -0.032612 0.022206 0.094934
0.015967 -0.028526 -0.112729 -0.027615 -0.024060 1.000000
0.070720 0.148783
DEC -0.123643 -0.089809 -0.127025 0.026292 -0.110392 -0.118077
-0.085188 -0.013573 0.142090 -0.011007 -0.039067 0.070720
1.000000 0.042967
ANNUAL RAINFALL -0.198048 0.118648 0.061457 0.116103 0.112358
0.314723 0.453407 0.651990 0.413036 0.428344 0.205861
0.148783 0.042967 1.000000

# Replacing the YES/NO values in the FLOODS column by 1/0

data['FLOODS'].replace(['YES','NO'],[1,0],inplace=True)

#Printing the clean data


data.head()

SUBDIVISION YEAR JAN FEB MAR APR MAY JUN JUL AUG SEP OCT
NOV DEC ANNUAL RAINFALL FLOODS
0 KERALA 1901 28.7 44.7 51.6 160.0 174.7 824.6 743.0 357.5 197.7
266.9 350.8 48.4 3248.6 1
1 KERALA 1902 6.7 2.6 57.3 83.9 134.5 390.9 1205.0 315.8 491.6
358.4 158.3 121.5 3326.6 1
2 KERALA 1903 3.2 18.6 3.1 83.6 249.7 558.6 1022.5 420.2 341.8
354.1 157.0 59.0 3271.2 1
3 KERALA 1904 23.7 3.0 32.2 71.5 235.7 1098.2 725.5 351.8 222.7
328.1 33.9 3.3 3129.7 1
4 KERALA 1905 1.2 22.3 9.4 105.9 263.3 850.2 520.5 293.6 217.2
383.5 74.4 0.2 2741.6 0

# Separating the data which we are going to use for prediction.

x=data.iloc[:,1:14]
x.head()

YEAR JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC
0 1901 28.7 44.7 51.6 160.0 174.7 824.6 743.0 357.5 197.7 266.9 350.8
48.4
1 1902 6.7 2.6 57.3 83.9 134.5 390.9 1205.0 315.8 491.6 358.4 158.3
121.5
2 1903 3.2 18.6 3.1 83.6 249.7 558.6 1022.5 420.2 341.8 354.1 157.0
59.0
3 1904 23.7 3.0 32.2 71.5 235.7 1098.2 725.5 351.8 222.7 328.1 33.9
3.3
4 1905 1.2 22.3 9.4 105.9 263.3 850.2 520.5 293.6 217.2 383.5 74.4
0.2

# Now separate the flood label from the dataset.

y=data.iloc[:,-1]
y

0 1
1 1
2 1
3 1
4 0
..
113 1
114 0
115 0
116 0
117 1
Name: FLOODS, Length: 118, dtype: int64

# Plotting the Data


import matplotlib.pyplot as plt
# sets the backend of matplotlib to the 'inline' backend.
%matplotlib inline
c = data[['JUN','JUL','AUG','SEP']]
c.hist()
plt.show()

# How the rainfall index varies over the months
ax = data[['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']].mean().plot.bar(
        width=0.5, edgecolor='k', align='center', linewidth=2, figsize=(14,6))
plt.xlabel('Month', fontsize=30)
plt.ylabel('Monthly Rainfall', fontsize=20)
plt.title('Rainfall in Kerala for all Months', fontsize=25)
ax.tick_params(labelsize=20)
plt.grid()
plt.ioff()
# Using sklearn to develop the ML Model

# Scaling the data between 0 and 1.


from sklearn import preprocessing
minmax = preprocessing.MinMaxScaler(feature_range=(0,1))
minmax.fit(x).transform(x)

array([[0. , 0.34371257, 0.56582278, ..., 0.39727673, 0.95570189, 0.2388724 ], [0.00854701, 0.08023952,


0.03291139, ..., 0.5804966 , 0.37952709, 0.60039565], [0.01709402, 0.03832335, 0.23544304, ..., 0.57188626,
0.37563604, 0.29129575], ..., [0.98290598, 0.02874251, 0.04810127, ..., 0.31517821, 0.28105358, 0.11622156],
[0.99145299, 0.02275449, 0.08607595, ..., 0.24809772, 0.18258007, 0.18793274], [1. , 0.34850299, 0.65949367, ...,
0.57589107, 0.28105358, 0.3214639 ]])
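
For reference, MinMaxScaler rescales each column as x_scaled = (x - x_min) / (x_max - x_min),
so every feature lies in [0, 1]; for example the YEAR column maps 1901 to 0.0 and 2018 to 1.0,
which matches the first and last rows of the array above.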

#dividing the dataset into training dataset and test dataset.


from sklearn import model_selection,neighbors
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
x_train.head()
YEAR JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC

8 1909 54.1 11.8 61.3 93.8 473.2 704.7 782.3 258.0 195.4 212.1 171.1 32.3

73 1974 1.6 5.4 16.0 128.0 221.5 266.9 1004.2 533.6 383.6 142.1 61.0 3.6

77 1978 3.3 14.7 31.4 73.9 396.8 758.1 686.7 516.8 119.4 171.0 365.6 39.0

114 2015 3.1 5.8 50.1 214.1 201.8 563.6 406.0 252.2 292.9 308.1 223.6 79.4

80 1981 7.0 6.8 28.5 75.9 166.3 912.4 489.8 495.6 376.6 265.0 138.6 43.3

x_train.dtypes

YEAR int64
JAN float64
FEB float64
MAR float64
APR float64
MAY float64
JUN float64
JUL float64
AUG float64
SEP float64
OCT float64
NOV float64
DEC float64
dtype: object

x_test.head()

YEAR JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC

16 1917 2.9 47.6 79.4 38.1 122.9 703.7 342.7 335.1 470.3 264.1 256.4 41.6

9 1910 2.7 25.7 23.3 124.5 148.8 680.0 484.1 473.8 248.6 356.6 280.4 0.1

18 1919 43.0 6.1 33.9 65.9 247.0 636.8 648.0 484.2 255.9 249.2 280.1 53.0

88 1989 10.3 0.0 30.1 141.5 169.4 657.5 450.7 285.5 271.1 308.0 92.9 5.6

72 1973 0.0 0.3 12.3 131.5 119.9 617.0 583.5 487.5 61.3 260.8 84.5 53.8
# type casting.
y_train=y_train.astype('int')
y_train

8 1
73 0
77 1
114 0
80 1
..
96 1
102 0
46 1
42 1
43 0
Name: FLOODS, Length: 94, dtype: int64

y_test=y_test.astype('int')
y_test

16 0
9 0
18 1
88 0
72 0
100 0
4 0
64 0
67 1
101 0
30 1
11 1
23 1
66 0
99 0
106 1
33 0
95 0
41 1
76 1
108 0
78 0
117 1
84 0
Name: FLOODS, dtype: int64

# Using Multiple Binary Classifiers

# 1. KNN Classifier
clf=neighbors.KNeighborsClassifier()
clf.fit(x_train,y_train)

KNeighborsClassifier()

# Predicted chance of Flood.


print("Predicted Values for the Floods:")
y_predict=clf.predict(x_test)
y_predict

Predicted Values for the Floods:


array([0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
1, 0])

print("Actual Values for the Floods:")


print(y_test)

Actual Values for the Floods:


16 0
9 0
18 1
88 0
72 0
100 0
4 0
64 0
67 1
101 0
30 1
11 1
23 1
66 0
99 0
106 1
33 0
95 0
41 1
76 1
108 0
78 0
117 1
84 0
Name: FLOODS, dtype: int64

print("List of the Predicted Values:")


print(y_predict)

List of the Predicted Values:


[0 1 1 0 0 0 1 0 1 0 1 1 1 0 0 1 0 0 0 0 1 0 1 0]

# Scaling the training and test data, then cross-validating the KNN model

from sklearn.model_selection import cross_val_score,cross_val_predict
x_train_std = minmax.fit_transform(x_train)
x_test_std = minmax.fit_transform(x_test)
knn_acc = cross_val_score(clf,x_train_std,y_train,cv=3,scoring='accuracy',n_jobs=-1)
knn_proba = cross_val_predict(clf,x_train_std,y_train,cv=3,method='predict_proba')

knn_acc

array([0.8125 , 0.87096774, 0.80645161])

knn_proba

array([[0.2, 0.8], [0.8, 0.2], [0. , 1. ], [0.6, 0.4], [0.2, 0.8], [0.2,
0.8], [1. , 0. ], [0.4, 0.6], [0.8, 0.2], [0.8, 0.2], [0.2, 0.8], [0.2, 0.8],
[0.6, 0.4], [0.6, 0.4], [1. , 0. ], [0.4, 0.6], [0.8, 0.2], [0.6, 0.4], [1. ,
0. ], [0.2, 0.8], [0.6, 0.4], [0.6, 0.4], [0.4, 0.6], [0.2, 0.8], [0.6, 0.4],
[0.6, 0.4], [0.4, 0.6], [0.6, 0.4], [0.6, 0.4], [0.6, 0.4], [0.6, 0.4], [0.8,
0.2], [0.6, 0.4], [0.8, 0.2], [1. , 0. ], [0.8, 0.2], [0.2, 0.8], [0.2, 0.8],
[0.4, 0.6], [0. , 1. ], [0.6, 0.4], [0.8, 0.2], [0.6, 0.4], [0.6, 0.4], [1. ,
0. ], [0. , 1. ], [0. , 1. ], [0.6, 0.4], [1. , 0. ], [0.4, 0.6], [0.6, 0.4],
[0.2, 0.8], [0.4, 0.6], [0.8, 0.2], [0.2, 0.8], [0.6, 0.4], [0.2, 0.8], [0.4,
0.6], [0.2, 0.8], [0.4, 0.6], [1. , 0. ], [0.2, 0.8], [0.4, 0.6], [0.4, 0.6],
[1. , 0. ], [0.8, 0.2], [0.4, 0.6], [0.6, 0.4], [0.6, 0.4], [0.8, 0.2], [0.4,
0.6], [0.8, 0.2], [0.4, 0.6], [0.2, 0.8], [0.4, 0.6], [0.8, 0.2], [0.8, 0.2],
[0.8, 0.2], [0.4, 0.6], [0.2, 0.8], [0.4, 0.6], [1. , 0. ], [0.4, 0.6], [0.4,
0.6], [0.4, 0.6], [0.8, 0.2], [0.6, 0.4], [1. , 0. ], [0. , 1. ], [0.4, 0.6],
[0.8, 0.2], [0.6, 0.4], [0.2, 0.8], [0.6, 0.4]])

# Accuracy of KNN Model :

from sklearn.metrics import accuracy_score,recall_score,roc_auc_score,confusion_matrix


print("\nAccuracy Score:%f"%(accuracy_score(y_test,y_predict)*100))
print("Recall Score:%f"%(recall_score(y_test,y_predict)*100))
print("ROC score:%f"%(roc_auc_score(y_test,y_predict)*100))
print(confusion_matrix(y_test,y_predict))

Accuracy Score:79.166667
Recall Score:77.777778
ROC score:78.888889
[[12 3]
[ 2 7]]
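
For reference, scikit-learn lays the confusion matrix out as [[TN, FP], [FN, TP]], so the
values above give accuracy = (12 + 7) / 24 ≈ 0.7917 and recall = 7 / (7 + 2) ≈ 0.7778,
matching the scores printed.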

# 2. Logistic Regression Classification Algorithm

x_train_std = minmax.fit_transform(x_train)  # scale the training features into [0, 1]
x_test_std = minmax.transform(x_test)        # apply the same scaling to the test features

from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train,y_train)
lr_acc = cross_val_score(lr,x_train_std,y_train,cv=3,scoring='accuracy',n_jobs=-1)
lr_proba = cross_val_predict(lr,x_train_std,y_train,cv=3,method='predict_proba')
lr_acc

array([0.9375 , 0.83870968, 0.93548387])

lr_proba

array([[0.32188546, 0.67811454], [0.51429733, 0.48570267], [0.38307406,


0.61692594], [0.60799446, 0.39200554], [0.31303735, 0.68696265], [0.07953431,
0.92046569], [0.55944247, 0.44055753], [0.31399097, 0.68600903], [0.70898459,
0.29101541], [0.67202443, 0.32797557], [0.32020787, 0.67979213], [0.2342836 ,
0.7657164 ], [0.58522222, 0.41477778], [0.55545261, 0.44454739], [0.56565991,
0.43434009], [0.26706465, 0.73293535], [0.62297474, 0.37702526], [0.42498329,
0.57501671], [0.76999382, 0.23000618], [0.38728749, 0.61271251], [0.67832896,
0.32167104], [0.33411549, 0.66588451], [0.26198974, 0.73801026], [0.48580865,
0.51419135], [0.57278318, 0.42721682], [0.260131 , 0.739869 ], [0.35464424,
0.64535576], [0.5948436 , 0.4051564 ], [0.41877983, 0.58122017], [0.53720388,
0.46279612], [0.60982153, 0.39017847], [0.34443545, 0.65556455], [0.50440352,
0.49559648], [0.47634302, 0.52365698], [0.48930742, 0.51069258], [0.69280423,
0.30719577], [0.31601947, 0.68398053], [0.31060663, 0.68939337], [0.30373674,
0.69626326], [0.19719304, 0.80280696], [0.78025688, 0.21974312], [0.45147895,
0.54852105], [0.4299277 , 0.5700723 ], [0.49761976, 0.50238024], [0.64465749,
0.35534251], [0.34362757, 0.65637243], [0.20706422, 0.79293578], [0.54199546,
0.45800454], [0.74937673, 0.25062327], [0.36765109, 0.63234891], [0.60889347,
0.39110653], [0.20394419, 0.79605581], [0.31837192, 0.68162808], [0.78899401,
0.21100599], [0.26672507, 0.73327493], [0.48173419, 0.51826581], [0.15882893,
0.84117107], [0.50769024, 0.49230976], [0.46166574, 0.53833426], [0.2123488 ,
0.7876512 ], [0.43957275, 0.56042725], [0.31307136, 0.68692864], [0.43194973,
0.56805027], [0.28329656, 0.71670344], [0.76994794, 0.23005206], [0.34658513,
0.65341487], [0.51665225, 0.48334775], [0.43759609, 0.56240391], [0.57281821,
0.42718179], [0.6306284 , 0.3693716 ], [0.27596537, 0.72403463], [0.65702989,
0.34297011], [0.48621437, 0.51378563], [0.2590243 , 0.7409757 ], [0.32410574,
0.67589426], [0.7374675 , 0.2625325 ], [0.64910538, 0.35089462], [0.65066629,
0.34933371], [0.37069844, 0.62930156], [0.53668991, 0.46331009], [0.27269691,
0.72730309], [0.78490638, 0.21509362], [0.46926733, 0.53073267], [0.2580524 ,
0.7419476 ], [0.48937679, 0.51062321], [0.6042879 , 0.3957121 ], [0.48959182,
0.51040818], [0.77205283, 0.22794717], [0.12206253, 0.87793747], [0.39385481,
0.60614519], [0.78790479, 0.21209521], [0.42515393, 0.57484607], [0.16321164,
0.83678836], [0.64833641, 0.35166359]])
y_pred=lr.predict(x_test)
y_pred

array([0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,
0, 1, 0])

print(y_test.values)

[0 0 1 0 0 0 0 0 1 0 1 1 1 0 0 1 0 0 1 1 0 0 1 0]

print("List of the Predicted Values:")


print(y_pred)

List of the Predicted Values:


[0 1 1 0 0 1 0 0 1 0 1 1 1 0 0 1 0 0 1 0 0 0 1 0]

# Accuracy and Efficiency of LR Model :


from sklearn.metrics import accuracy_score,recall_score,roc_auc_score,confusion_matrix
print("\naccuracy score:%f"%(accuracy_score(y_test,y_pred)*100))
print("recall score:%f"%(recall_score(y_test,y_pred)*100))
print("roc score:%f"%(roc_auc_score(y_test,y_pred)*100))
print(confusion_matrix(y_test,y_pred))

accuracy score:87.500000
recall score:88.888889
roc score:87.777778
[[13 2]
[ 1 8]]

# 3. Support Vector Classification

from sklearn.svm import SVC
svc = SVC(kernel='rbf', probability=True)
svc_classifier = svc.fit(x_train, y_train)
svc_acc = cross_val_score(svc_classifier, x_train_std, y_train, cv=3, scoring="accuracy", n_jobs=-1)
svc_proba = cross_val_predict(svc_classifier, x_train_std, y_train, cv=3, method='predict_proba')
svc_acc
array([0.90625 , 0.90322581, 0.90322581])

svc_proba
array([[2.53477212e-01, 7.46522788e-01], [6.26215813e-01, 3.73784187e-01],
[8.85692145e-02, 9.11430785e-01], [6.11301593e-01, 3.88698407e-01],
[1.31856926e-01, 8.68143074e-01], [1.82732776e-02, 9.81726722e-01],
[8.09704178e-01, 1.90295822e-01], [1.12116148e-01, 8.87883852e-01],
[8.11536808e-01, 1.88463192e-01], [8.50419768e-01, 1.49580232e-01],
[1.46441543e-01, 8.53558457e-01], [9.61272383e-02, 9.03872762e-01],
[8.73845038e-01, 1.26154962e-01], [4.74410079e-01, 5.25589921e-01],
[9.23344720e-01, 7.66552803e-02], [1.48414321e-01, 8.51585679e-01],
[8.64377127e-01, 1.35622873e-01], [6.01890924e-01, 3.98109076e-01],
[9.58578971e-01, 4.14210286e-02], [2.09878503e-01, 7.90121497e-01],
[9.02162919e-01, 9.78370813e-02], [2.86267143e-01, 7.13732857e-01],
[1.75833318e-01, 8.24166682e-01], [5.25702332e-01, 4.74297668e-01],
[6.00511908e-01, 3.99488092e-01], [1.22322092e-01, 8.77677908e-01],
[2.05031520e-01, 7.94968480e-01], [6.33213733e-01, 3.66786267e-01],
[4.66490741e-01, 5.33509259e-01], [7.42615883e-01, 2.57384117e-01],
[9.16207567e-01, 8.37924333e-02], [2.15335464e-01, 7.84664536e-01],
[6.94553329e-01, 3.05446671e-01], [6.35611529e-01, 3.64388471e-01],
[5.62736716e-01, 4.37263284e-01], [9.74313518e-01, 2.56864820e-02],
[1.13881509e-01, 8.86118491e-01], [2.61365437e-02, 9.73863456e-01],
[5.12240780e-02, 9.48775922e-01], [1.33584926e-05, 9.99986642e-01],
[9.93063512e-01, 6.93648822e-03], [6.43663018e-01, 3.56336982e-01],
[3.59580083e-01, 6.40419917e-01], [8.37230882e-01, 1.62769118e-01],
[9.93599759e-01, 6.40024146e-03], [4.62520080e-02, 9.53747992e-01],
[2.34057595e-02, 9.76594240e-01], [7.00738214e-01, 2.99261786e-01],
[9.86154637e-01, 1.38453627e-02], [2.08709962e-01, 7.91290038e-01],
[9.33933463e-01, 6.60665373e-02], [5.34954399e-03, 9.94650456e-01],
[2.04084161e-01, 7.95915839e-01], [9.90751572e-01, 9.24842834e-03],
[4.45372682e-03, 9.95546273e-01], [1.66091700e-01, 8.33908300e-01],
[3.86929555e-06, 9.99996131e-01], [6.69515754e-01, 3.30484246e-01],
[2.04190402e-01, 7.95809598e-01], [2.08019626e-02, 9.79198037e-01],
[7.87668011e-01, 2.12331989e-01], [1.92560722e-02, 9.80743928e-01],
[4.02508642e-01, 5.97491358e-01], [8.97733715e-03, 9.91022663e-01],
[9.98020874e-01, 1.97912611e-03], [2.16682834e-01, 7.83317166e-01],
[4.53850253e-01, 5.46149747e-01], [3.86311001e-01, 6.13688999e-01],
[8.49607998e-01, 1.50392002e-01], [9.91714321e-01, 8.28567937e-03],
[1.22655934e-01, 8.77344066e-01], [9.91825902e-01, 8.17409787e-03],
[1.98061346e-01, 8.01938654e-01], [5.13864541e-02, 9.48613546e-01],
[1.10499781e-01, 8.89500219e-01], [9.92124948e-01, 7.87505233e-03],
[7.55786228e-01, 2.44213772e-01], [9.86458934e-01, 1.35410660e-02],
[2.32739746e-02, 9.76726025e-01], [1.85677493e-01, 8.14322507e-01],
[2.63622292e-02, 9.73637771e-01], [9.98743897e-01, 1.25610287e-03],
[3.13956197e-01, 6.86043803e-01], [1.50578196e-02, 9.84942180e-01],
[4.14229509e-01, 5.85770491e-01], [9.33259709e-01, 6.67402914e-02],
[5.39538365e-01, 4.60461635e-01], [9.99054569e-01, 9.45431174e-04],
[4.76500210e-06, 9.99995235e-01], [1.70654294e-01, 8.29345706e-01],
[9.88791890e-01, 1.12081100e-02], [4.32883215e-01, 5.67116785e-01],
[1.77166530e-02, 9.82283347e-01], [9.78291777e-01, 2.17082225e-02]])
svc_scores=svc_proba[:,1]
svc_scores

array([7.46522788e-01, 3.73784187e-01, 9.11430785e-01, 3.88698407e-01, 8.68143074e-01,


9.81726722e-01, 1.90295822e-01, 8.87883852e-01, 1.88463192e-01, 1.49580232e-01,
8.53558457e-01, 9.03872762e-01, 1.26154962e-01, 5.25589921e-01, 7.66552803e-02,
8.51585679e-01, 1.35622873e-01, 3.98109076e-01, 4.14210286e-02, 7.90121497e-01,
9.78370813e-02, 7.13732857e-01, 8.24166682e-01, 4.74297668e-01, 3.99488092e-01,
8.77677908e-01, 7.94968480e-01, 3.66786267e-01, 5.33509259e-01, 2.57384117e-01,
8.37924333e-02, 7.84664536e-01, 3.05446671e-01, 3.64388471e-01, 4.37263284e-01,
2.56864820e-02, 8.86118491e-01, 9.73863456e-01, 9.48775922e-01, 9.99986642e-01,
6.93648822e-03, 3.56336982e-01, 6.40419917e-01, 1.62769118e-01, 6.40024146e-03,
9.53747992e-01, 9.76594240e-01, 2.99261786e-01, 1.38453627e-02, 7.91290038e-01,
6.60665373e-02, 9.94650456e-01, 7.95915839e-01, 9.24842834e-03, 9.95546273e-01,
8.33908300e-01, 9.99996131e-01, 3.30484246e-01, 7.95809598e-01, 9.79198037e-01,
2.12331989e-01, 9.80743928e-01, 5.97491358e-01, 9.91022663e-01, 1.97912611e-03,
7.83317166e-01, 5.46149747e-01, 6.13688999e-01, 1.50392002e-01, 8.28567937e-03,
8.77344066e-01, 8.17409787e-03, 8.01938654e-01, 9.48613546e-01, 8.89500219e-01,
7.87505233e-03, 2.44213772e-01, 1.35410660e-02, 9.76726025e-01, 8.14322507e-01,
9.73637771e-01, 1.25610287e-03, 6.86043803e-01, 9.84942180e-01, 5.85770491e-01,
6.67402914e-02, 4.60461635e-01, 9.45431174e-04, 9.99995235e-01, 8.29345706e-01,
1.12081100e-02, 5.67116785e-01, 9.82283347e-01, 2.17082225e-02])

y_pred=svc_classifier.predict(x_test)
print("Actual Flood Values:")
print(y_test.values)

Actual Flood Values:


[0 0 1 0 0 0 0 0 1 0 1 1 1 0 0 1 0 0 1 1 0 0 1 0]

print("Predicted Flood Values:")


print(y_pred)

Predicted Flood Values:


[0 0 1 0 0 1 1 0 1 0 1 1 1 0 0 1 0 0 1 1 1 0 1 0]

# Accuracy of SVM model


from sklearn.metrics import accuracy_score,recall_score,roc_auc_score,confusion_matrix
print("\naccuracy score:%f"%(accuracy_score(y_test,y_pred)*100))
print("recall score:%f"%(recall_score(y_test,y_pred)*100))
print("roc score:%f"%(roc_auc_score(y_test,y_pred)*100))
print(confusion_matrix(y_test,y_pred))
accuracy score:87.500000

recall score:100.000000
roc score:90.000000
[[12 3]
[ 0 9]]

# 4.Decision Tree Classification


from sklearn.tree import DecisionTreeClassifier
dtc_clf=DecisionTreeClassifier()
dtc_clf.fit(x_train,y_train)
dtc_clf_acc=cross_val_score(dtc_clf,x_train_std,y_train,cv=3,scoring="accuracy",n_jobs=-1)
dtc_clf_acc

array([0.8125 , 0.77419355, 0.64516129])

print("Predicted Values:")
y_pred=dtc_clf.predict(x_test)
y_pred

Predicted Values:
array([1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0,
0, 1, 0])

print("Actual Values:")
print(y_test.values)

Actual Values:
[0 0 1 0 0 0 0 0 1 0 1 1 1 0 0 1 0 0 1 1 0 0 1 0]

# Accuracy of Decision Tree Model


from sklearn.metrics import accuracy_score,recall_score,roc_auc_score,confusion_matrix
print("\naccuracy score:%f"%(accuracy_score(y_test,y_pred)*100))
print("recall score:%f"%(recall_score(y_test,y_pred)*100))
print("roc score:%f"%(roc_auc_score(y_test,y_pred)*100))
print(confusion_matrix(y_test,y_pred))

accuracy score:62.500000
recall score:66.666667
roc score:63.333333
[[9 6]
[3 6]]

# 5. Random Forest Classifier


from sklearn.ensemble import RandomForestClassifier
rmf=RandomForestClassifier(max_depth=3,random_state=0)
rmf_clf=rmf.fit(x_train,y_train)
rmf_clf

RandomForestClassifier(max_depth=3, random_state=0)
rmf_clf_acc=cross_val_score(rmf_clf,x_train_std,y_train,cv=3,scoring="accuracy",n_jobs=-1)
rmf_proba=cross_val_predict(rmf_clf,x_train_std,y_train,cv=3,method='predict_proba')
rmf_clf_acc

array([0.9375 , 0.83870968, 0.83870968])

rmf_proba

array([[0.22572259, 0.77427741], [0.33685086, 0.66314914], [0.41815907,
0.58184093], [0.18985469, 0.81014531], [0.64764947, 0.35235053],
[0.25820426, 0.74179574], [0.40834267, 0.59165733], [0.53694728,
0.46305272], [0.71020202, 0.28979798], [0.47070073, 0.52929927],
[0.23385605, 0.76614395], [0.4006628 , 0.5993372 ], [0.71323606,
0.28676394], [0.36114793, 0.63885207], [0.630473 , 0.369527 ],
[0.21351883, 0.78648117], [0.24236241, 0.75763759], [0.76991099,
0.23008901], [0.17839735, 0.82160265], [0.34069168, 0.65930832],
[0.13744217, 0.86255783], [0.21754333, 0.78245667], [0.22615267,
0.77384733], [0.33654063, 0.66345937], [0.32431382, 0.67568618],
[0.42711438, 0.57288562], [0.21908131, 0.78091869], [0.26258563,
0.73741437], [0.7568743 , 0.2431257 ], [0.54485407, 0.45514593],
[0.59965978, 0.40034022], [0.3663423 , 0.6336577 ], [0.54966046,
0.45033954], [0.65991867, 0.34008133], [0.40901872, 0.59098128],
[0.71098249, 0.28901751], [0.52046983, 0.47953017], [0.1934226 ,
0.8065774 ], [0.39983088, 0.60016912], [0.67707878, 0.32292122],
[0.67339827, 0.32660173], [0.64379148, 0.35620852], [0.48688986,
0.51311014], [0.64902433, 0.35097567], [0.46477574, 0.53522426],
[0.76015278, 0.23984722], [0.46613516, 0.53386484], [0.29167134,
0.70832866], [0.39725643, 0.60274357], [0.70980882, 0.29019118],
[0.4052448 , 0.5947552 ], [0.83353977, 0.16646023], [0.2171972 ,
0.7828028 ], [0.55762025, 0.44237975], [0.79246657, 0.20753343],
[0.46054338, 0.53945662], [0.23723346, 0.76276654], [0.7488818 ,
0.2511182 ]])

from sklearn.metrics import accuracy_score,recall_score,roc_auc_score,confusion_matrix


print("\naccuracy score:%f"%(accuracy_score(y_test,y_pred)*100))
print("recall score:%f"%(recall_score(y_test,y_pred)*100))
print("roc score:%f"%(roc_auc_score(y_test,y_pred)*100))
print(confusion_matrix(y_test,y_pred))

accuracy score:62.500000
recall score:66.666667
roc score:63.333333
[[9 6]
[3 6]]

# Final accuracy of our models, obtained by comparing all the prediction models

models = []
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
models.append(('KNN', KNeighborsClassifier()))
models.append(('LR', LogisticRegression()))
models.append(('SVC', SVC()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('RF', RandomForestClassifier()))

names = []
scores = []
for name, model in models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    scores.append(accuracy_score(y_test, y_pred))
    names.append(name)
tr_split = pd.DataFrame({'Name': names, 'Score': scores})
tr_split

Name Score
0 KNN 0.791667
1 LR 0.875000
2 SVC 0.875000
3 DT 0.625000
4 RF 0.625000

import seaborn as sns

axis = sns.barplot(x='Name', y='Score', data=tr_split)
axis.set(xlabel='Classifier Models', ylabel='Accuracy of the Model')
for p in axis.patches:
    height = p.get_height()
    axis.text(p.get_x() + p.get_width()/2, height + 0.01, '{:1.4f}'.format(height), ha="center")

plt.show()
tr_split['Score'].max()

0.875

# So, we can see and choose the best model for prediction.

5: Develop a program for Bias, Variance, Remove Duplicates and Cross Validation

Code:

import numpy as np
import pickle
import matplotlib.pyplot as plot
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import pandas as pd

# get train data
with open('/content/train.pkl', 'rb') as train:
    data1 = pickle.load(train)
# shuffle data every time to get different models
np.random.shuffle(data1)
# getting x and y coordinates separately
x_tr = data1[:,:-1]
y_tr = data1[:,1]
# get test data
with open('/content/test.pkl','rb') as test:
    data2 = pickle.load(test)
x_test = data2[:,:-1]
y_test = data2[:,1]
#print(y_test)
#split train data into 10 sets
x_train=np.array(np.array_split(x_tr,10))
y_train=np.array(np.array_split(y_tr,10))
bias=np.zeros(21)
biassq=np.zeros(21)
variance=np.zeros(21)
degree=np.zeros(21)
irred_error=np.zeros(21)
tot_err = np.zeros(21)

reg = LinearRegression()
# for each degree, fit 10 models (800 data points each), pass in the test data and get the output
for i in range(1,21):
    difference = np.zeros((10,80))
    temp = np.zeros(80)
    # create output array to hold the predicted values
    y_return = np.zeros((10,80))
    for j in range(10):
        polynomial = PolynomialFeatures(degree=i, include_bias=False)
        X_TRAIN = polynomial.fit_transform(x_train[j])
        X_TEST = polynomial.fit_transform(x_test)
        reg.fit(X_TRAIN, y_train[j])
        prediction = reg.predict(X_TEST)
        y_return[j] = prediction
        difference[j] = (y_test - y_return[j])**2
    # getting the average MSE
    for j in range(10):
        temp += difference[j]
    temp /= 10
    tot_err[i] = np.mean(temp)
    # for each degree, calculate bias
    y_mean = np.mean(y_return, axis=0)
    bias[i] = np.mean(abs(y_mean - y_test))
    biassq[i] = np.mean((y_mean - y_test)**2)
    # for each degree, calculate variance
    y_var = np.var(y_return, axis=0)
    variance[i] = np.mean(y_var)
    # calculate the irreducible error
    irred_error[i] = np.mean(temp) - (biassq[i] + variance[i])

bias[0]=None
biassq[0]=None
variance[0]=None
irred_error[0]=None
tot_err[0]=None
table_bias=pd.DataFrame({'Degree':np.array(range(0,21)),'Bias':bias,'Variance': variance, \
'irreducible error': irred_error})
print(table_bias.to_string(index=False))

Degree Bias Variance irreducible error


0 NaN NaN NaN
1 819.616296 52502.851726 0.000000e+00
2 810.723177 63038.338967 1.164153e-10
3 66.759783 82511.802026 -2.910383e-11
4 75.086506 119504.738659 1.455192e-11
5 74.118251 142927.888359 0.000000e+00
6 73.672010 155691.412463 2.910383e-11
7 76.976841 188606.361625 0.000000e+00
8 83.027855 197845.570631 -5.820766e-11
9 86.125443 221867.004600 2.910383e-11
10 88.543096 226648.113716 0.000000e+00
11 85.650841 244186.973565 2.910383e-11
12 110.271938 247779.685373 0.000000e+00
13 90.934839 269844.732601 5.820766e-11
14 121.178328 260209.925547 5.820766e-11
15 159.138647 274458.587049 5.820766e-11
16 163.106983 276973.564430 -5.820766e-11
17 232.105387 281008.916077 1.164153e-10
18 233.036935 284071.981987 -5.820766e-11
19 302.110220 278882.173174 -5.820766e-11
20 301.952975 284937.633646 1.164153e-10
plot.xlabel('Model Complexity')
plot.ylabel('Error')
plot.title("Bias^2 vs Variance")
#plotting the graph of bias squared and variance
plot.plot(tot_err,label='total error', color = 'green')
# plot.plot(bias**2,label='Bias^2', color = 'blue')
plot.plot(biassq,label='Bias^2', color = 'blue')
plot.plot(variance,label='Variance', color = 'red')

plot.legend()
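# The experiment title also asks for duplicate removal and cross-validation, which the script above
# does not demonstrate. Below is a minimal sketch under the assumption that the training points are
# held in a pandas DataFrame; the column names 'x' and 'y' here are placeholders, not part of the
# original data set.

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# hypothetical data frame with one feature column 'x' and a target column 'y'
df = pd.DataFrame({'x': np.linspace(0, 10, 100), 'y': np.linspace(0, 10, 100) ** 2})
df = pd.concat([df, df.iloc[:5]])   # add a few duplicate rows for the demonstration
df = df.drop_duplicates()           # remove duplicate rows

X = df[['x']].values
y = df['y'].values

# 10-fold cross-validation of a linear model (scores are negative MSE by sklearn convention)
scores = cross_val_score(LinearRegression(), X, y, cv=10, scoring='neg_mean_squared_error')
print("Mean cross-validated MSE:", -scores.mean())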
6. Write a program to implement Categorical Encoding, One-hot Encoding

Code:

from numpy import array, argmax


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# define example
# data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot']

doc1 = "Can I eat the Pizza".lower()


doc2 = "You can eat the Pizza".lower()
doc1 = doc1.split()
doc2 = doc2.split()
doc1_array = array(doc1)
doc2_array = array(doc2)
doc3 = doc1+doc2
# doc3 = set(doc3)
data = list(doc3)

values = array(data)
print(values)
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)

['can' 'i' 'eat' 'the' 'pizza' 'you' 'can' 'eat' 'the' 'pizza']
[0 2 1 4 3 5 0 1 4 3]

# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

[[1. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0.]
[0. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 0.]
[0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 1.]
[1. 0. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 0.]
[0. 0. 0. 1. 0. 0.]]

# invert first example


inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0, :])])
print(inverted)
['can']
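# For comparison, pandas can produce the same kind of one-hot table directly from the values array
# built above (a sketch; get_dummies works on a Series of labels):
import pandas as pd
onehot_df = pd.get_dummies(pd.Series(values))
print(onehot_df)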

7.Build an Artificial Neural Network by implementing the Backpropagation algorithm


andtest the same using appropriate data sets.

BACKPROPAGATION Algorithm

BACKPROPAGATION (training_examples, η, n_in, n_out, n_hidden)


Each training example is a pair of the form (x⃗, t⃗), where x⃗ is the vector of network
input values and t⃗ is the vector of target network output values.
η is the learning rate (e.g., 0.05). n_in is the number of network inputs, n_hidden the number
of units in the hidden layer, and n_out the number of output units.
The input from unit i into unit j is denoted x_ji, and the weight from unit i to unit j is
denoted w_ji.

• Create a feed-forward network with n_in inputs, n_hidden hidden units, and n_out output units.
• Initialize all network weights to small random numbers.
• Until the termination condition is met, Do

• For each (x⃗, t⃗) in training_examples, Do

Propagate the input forward through the network:


1. Input the instance x⃗ to the network and compute the output o_u of every unit u in the network.

Propagate the errors backward through the network:

2. For each network output unit k, calculate its error term δ_k:
   δ_k ← o_k (1 − o_k) (t_k − o_k)
3. For each hidden unit h, calculate its error term δ_h:
   δ_h ← o_h (1 − o_h) Σ_(k ∈ outputs) w_kh δ_k
4. Update each network weight w_ji:
   w_ji ← w_ji + Δw_ji, where Δw_ji = η δ_j x_ji

Training Examples:

Example   Sleep   Study   Expected % in Exams
1         2       9       92
2         1       5       86
3         3       6       89

Normalize the input


Example   Sleep              Study               Expected % in Exams
1         2/3 = 0.66666667   9/9 = 1             0.92
2         1/3 = 0.33333333   5/9 = 0.55555556    0.86
3         3/3 = 1            6/9 = 0.66666667    0.89

Program:

import numpy as np
X = np.array(([2, 9], [1, 5], [3, 6]), dtype=float)
y = np.array(([92], [86], [89]), dtype=float)
X = X/np.amax(X,axis=0) # maximum of X array longitudinally
y = y/100

#Sigmoid Function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

#Derivative of Sigmoid Function


def derivatives_sigmoid(x):
    return x * (1 - x)

#Variable initialization
epoch=5000 #Setting training iterations
lr=0.1 #Setting learning rate
inputlayer_neurons = 2 #number of features in data set
hiddenlayer_neurons = 3 #number of hidden layers neurons
output_neurons = 1 #number of neurons at output layer


#weight and bias initialization


wh = np.random.uniform(size=(inputlayer_neurons, hiddenlayer_neurons))
bh = np.random.uniform(size=(1, hiddenlayer_neurons))
wout = np.random.uniform(size=(hiddenlayer_neurons, output_neurons))
bout = np.random.uniform(size=(1, output_neurons))

#draws a random range of numbers uniformly of dim x*y


for i in range(epoch):

    # Forward Propagation
    hinp1 = np.dot(X, wh)
    hinp = hinp1 + bh
    hlayer_act = sigmoid(hinp)
    outinp1 = np.dot(hlayer_act, wout)
    outinp = outinp1 + bout
    output = sigmoid(outinp)

    # Backpropagation
    EO = y - output
    outgrad = derivatives_sigmoid(output)
    d_output = EO * outgrad
    EH = d_output.dot(wout.T)

    # how much the hidden layer weights contributed to the error
    hiddengrad = derivatives_sigmoid(hlayer_act)
    d_hiddenlayer = EH * hiddengrad

    # dot product of next-layer error and current-layer output
    wout += hlayer_act.T.dot(d_output) * lr
    wh += X.T.dot(d_hiddenlayer) * lr

print("Input: \n" + str(X))


print("Actual Output: \n" + str(y))
print("Predicted Output: \n" ,output)


Output:

Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]

Actual Output:
[[0.92]
[0.86]
[0.89]]

Predicted Output:
[[0.89726759]
[0.87196896]
[0.9000671]]
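Note that the biases bh and bout are initialized above but never updated inside the training loop.
A common refinement, sketched below with the variable names of the program above, is to update them
from the same error terms; these two lines would sit next to the weight updates inside the loop:

    # hedged addition: update the biases from the summed error terms, scaled by the learning rate
    bout += np.sum(d_output, axis=0, keepdims=True) * lr
    bh += np.sum(d_hiddenlayer, axis=0, keepdims=True) * lr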

Exp. No. 8. Write a program to implement k-Nearest Neighbor algorithm to classify the iris
data set. Print both correct and wrong predictions. Java/Python ML library classes can be
used for this problem.

Data Set:

Iris Plants Dataset: Dataset contains 150 instances (50 in each of three classes) Number of
Attributes: 4 numeric, predictive attributes and the Class.
Python Program to Implement and Demonstrate KNN
Algorithm :

import numpy as np

import pandas as pd

from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split

from sklearn import metrics

names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']

# Read dataset to pandas dataframe

dataset = pd.read_csv("/content/IRIS.csv", names=names)

X = dataset.iloc[:, :-1]

y = dataset.iloc[:, -1]

print(X.head())

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.30)

classifier = KNeighborsClassifier(n_neighbors=5).fit(Xtrain, ytrain)

ypred = classifier.predict(Xtest)

i=0

print ("\n-------------------------------------------------------------------------")

print ('%-25s %-25s %-25s' % ('Original Label', 'Predicted Label', 'Correct/Wrong'))

print ("-------------------------------------------------------------------------")

for label in ytest:
    print('%-25s %-25s' % (label, ypred[i]), end="")
    if (label == ypred[i]):
        print(' %-25s' % ('Correct'))
    else:
        print(' %-25s' % ('Wrong'))
    i = i + 1

print ("-------------------------------------------------------------------------")

print("\nConfusion Matrix:\n",metrics.confusion_matrix(ytest, ypred))

print ("-------------------------------------------------------------------------")

print("\nClassification Report:\n",metrics.classification_report(ytest, ypred))

print ("-------------------------------------------------------------------------")

print('Accuracy of the classifer is %0.2f' % metrics.accuracy_score(ytest,ypred))

print ("-------------------------------------------------------------------------")
OUTPUT :
sepal-length sepal-width petal-length petal-width
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2

----------------------------------------------------------------
Original Label            Predicted Label           Correct/Wrong
----------------------------------------------------------------
Iris-setosa Iris-setosa Correct
Iris-versicolor Iris-versicolor Correct
Iris-versicolor Iris-versicolor Correct
Iris-versicolor Iris-versicolor Correct
Iris-virginica Iris-versicolor Wrong
Iris-virginica Iris-virginica Correct
Iris-versicolor Iris-versicolor Correct
Iris-versicolor Iris-versicolor Correct
Iris-virginica Iris-virginica Correct
Iris-virginica Iris-virginica Correct
Iris-versicolor Iris-versicolor Correct
Iris-versicolor Iris-versicolor Correct
Iris-versicolor Iris-versicolor Correct
Iris-setosa Iris-setosa Correct
Iris-virginica Iris-virginica Correct
Iris-virginica Iris-virginica Correct
Iris-versicolor Iris-versicolor Correct
Iris-versicolor Iris-virginica Wrong
Iris-versicolor Iris-versicolor Correct
Iris-setosa Iris-setosa Correct
Iris-versicolor Iris-versicolor Correct
Iris-setosa Iris-setosa Correct
Iris-virginica Iris-virginica Correct
Iris-versicolor Iris-versicolor Correct
Iris-setosa Iris-setosa Correct
Iris-setosa Iris-setosa Correct
Iris-setosa Iris-setosa Correct
Iris-virginica Iris-virginica Correct
Iris-versicolor Iris-versicolor Correct
Iris-setosa Iris-setosa Correct
Iris-setosa Iris-setosa Correct
Iris-setosa Iris-setosa Correct
Iris-virginica Iris-virginica Correct
Iris-versicolor Iris-versicolor Correct
Iris-virginica Iris-virginica Correct
Iris-virginica Iris-virginica Correct
Iris-virginica Iris-virginica Correct
Iris-setosa Iris-setosa Correct
Iris-virginica Iris-virginica Correct
Iris-versicolor Iris-versicolor Correct
Iris-setosa Iris-setosa Correct
Iris-virginica Iris-virginica Correct
Iris-setosa Iris-setosa Correct
Iris-versicolor Iris-versicolor Correct
Iris-versicolor Iris-versicolor Correct
----------------------------------------------------------------
Confusion Matrix:
[[13 0 0]
[ 0 17 1]
[ 0 1 13]]
----------------------------------------------------------------
Classification Report:
precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 13


Iris-versicolor 0.94 0.94 0.94 18
Iris-virginica 0.93 0.93 0.93 14

accuracy 0.96 45
macro avg 0.96 0.96 0.96 45
weighted avg 0.96 0.96 0.96 45

----------------------------------------------------------------
Accuracy of the classifer is 0.96
----------------------------------------------------------------
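# The program fixes n_neighbors=5. A short hedged sketch of choosing k by cross-validation
# instead, reusing X and y from the program above:
from sklearn.model_selection import cross_val_score
for k in range(1, 11):
    cv_acc = cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=5).mean()
    print("k =", k, " mean CV accuracy = %.3f" % cv_acc)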

9.Implement the non-parametric Locally Weighted Regression algorithm in order to fit data
points. Select appropriate data set for your experiment and draw graphs.

Locally Weighted Regression Algorithm

Regression:
• Regression is a technique from statistics that is used to predict values of a desired
target quantity when the target quantity is continuous.
• In regression, we seek to identify (or estimate) a continuous variable y associated with
a given input vector x.
• y is called the dependent variable.
• x is called the independent variable.

Loess/Lowess Regression:
Loess regression is a nonparametric technique that uses local weighted regression to fit a
smooth curve through points in a scatter plot.


Lowess Algorithm:
• Locally weighted regression is a very powerful nonparametric model used in statistical
learning.
• Given a dataset X, y, we attempt to find a model parameter β(x) that minimizes
residual sum of weighted squared errors.
• The weights are given by a kernel function (k or w) which can be chosen arbitrarily

Algorithm

1. Read the Given data Sample to X and the curve (linear or non linear) to Y
2. Set the value for Smoothening parameter or Free parameter say τ
3. Set the bias /Point of interest set x0 which is a subset of X
4. Determine the weight matrix using the Gaussian kernel:
   w_i = exp( -(x_i - x0)^2 / (2 τ^2) ),   W = diag(w_1, ..., w_m)

5. Determine the value of the model parameter β using the weighted normal equation:
   β = (XᵀWX)⁻¹ XᵀW y

6. Prediction = x0 · β

Program

import numpy as np
from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import gridplot
from bokeh.io import push_notebook

def local_regression(x0, X, Y, tau):
    # add bias term so no information is lost
    x0 = np.r_[1, x0]
    X = np.c_[np.ones(len(X)), X]

    # fit model: normal equations with kernel
    xw = X.T * radial_kernel(x0, X, tau)        # X transpose * W
    beta = np.linalg.pinv(xw @ X) @ xw @ Y      # @ is matrix multiplication (dot product)

    # predict value at x0
    return x0 @ beta

def radial_kernel(x0, X, tau):
    # weight (radial kernel) bias function
    return np.exp(np.sum((X - x0) ** 2, axis=1) / (-2 * tau * tau))

n = 1000
# generate dataset
X = np.linspace(-3, 3, num=n)
print("The Data Set (10 Samples) X :\n", X[1:10])
Y = np.log(np.abs(X ** 2 - 1) + .5)
print("The Fitting Curve Data Set (10 Samples) Y :\n", Y[1:10])
# jitter X
X += np.random.normal(scale=.1, size=n)
print("Normalised (10 Samples) X :\n", X[1:10])

domain = np.linspace(-3, 3, num=300)
print("Xo Domain Space (10 Samples) :\n", domain[1:10])

def plot_lwr(tau):
    # prediction through locally weighted regression
    prediction = [local_regression(x0, X, Y, tau) for x0 in domain]
    plot = figure(plot_width=400, plot_height=400)
    plot.title.text = 'tau=%g' % tau
    plot.scatter(X, Y, alpha=.3)
    plot.line(domain, prediction, line_width=2, color='red')
    return plot

show(gridplot([
    [plot_lwr(10.), plot_lwr(1.)],
    [plot_lwr(0.1), plot_lwr(0.01)]]))

Output

[Bokeh grid of four scatter plots with the fitted LWR curve, one per tau value: 10, 1, 0.1, 0.01]

# Alternative implementation: locally weighted regression on the tips dataset
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np1

def kernel(point, xmat, k):
    # build a diagonal weight matrix: points close to 'point' get weights near 1
    m, n = np1.shape(xmat)
    weights = np1.mat(np1.eye(m))
    for j in range(m):
        diff = point - xmat[j]
        weights[j, j] = np1.exp(diff * diff.T / (-2.0 * k ** 2))
    return weights

def localWeight(point, xmat, ymat, k):
    # solve the weighted normal equation for the local model parameters
    wei = kernel(point, xmat, k)
    W = (xmat.T * (wei * xmat)).I * (xmat.T * (wei * ymat.T))
    return W

def localWeightRegression(xmat, ymat, k):
    # predict every point with its own locally weighted model
    m, n = np1.shape(xmat)
    ypred = np1.zeros(m)
    for i in range(m):
        ypred[i] = xmat[i] * localWeight(xmat[i], xmat, ymat, k)
    return ypred

# load data points


data = pd.read_csv('tips.csv')
bill = np1.array(data.total_bill)
tip = np1.array(data.tip)

#preparing and add 1 in bill


mbill = np1.mat(bill)
mtip = np1.mat(tip)  # np.mat converts the 1-D arrays to 2-D matrix form
m = np1.shape(mbill)[1]
# print(m)  -> 244 records are stored in m
one = np1.mat(np1.ones(m))
X = np1.hstack((one.T, mbill.T))  # prepend a column of ones (bias term) to the bill column
#print(X)
#set k here
ypred = localWeightRegression(X,mtip,0.3)
SortIndex = X[:,1].argsort(0)
xsort = X[SortIndex][:,0]

fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.scatter(bill,tip, color='green')
ax.plot(xsort[:,1],ypred[SortIndex], color = 'red', linewidth=5)
plt.xlabel('Total bill')
plt.ylabel('Tip')
plt.show();

Experiment - 10:

AIM: Assuming a set of Documents that need to be classified, use the naive Bayesian Classifier model to perform thi
s task. Built - in Java classes API can be used to write the program. Calculate the accuracy, Precision and recall for
your dataset.

import pandas as pd
msg = pd.read_csv("naivetext.csv", names=["message", "label"])
print("The dimensions of the dataset", msg.shape)
msg["labelnum"] = msg.label.map({"pos": 1, "neg": 0})
X = msg.message
Y = msg.labelnum
print(X)
print(Y)

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)
print("\nThe total number of training data:", Y_train.shape)
print("\nThe total number of test data:", Y_test.shape)

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_dtm = count_vect.fit_transform(X_train)
X_test_dtm = count_vect.transform(X_test)
print("\nThe words or tokens in the text documents\n")
print(count_vect.get_feature_names())

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_dtm, Y_train)
predicted = clf.predict(X_test_dtm)

from sklearn import metrics
print("\nAccuracy of the classifier is", metrics.accuracy_score(Y_test, predicted))
print("\nConfusion matrix")
print(metrics.confusion_matrix(Y_test, predicted))
print("\nThe value of precision", metrics.precision_score(Y_test, predicted))
print("\nThe value of recall", metrics.recall_score(Y_test, predicted))
Experiment-11: Apply EM algorithm to cluster a Data Set. Use the same data set for clustering
using k-Means algorithm. Compare the results of these two algorithms and comment on the
quality of clustering. You can add Java/Python ML library classes/API in the program.

CODE :
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
import pandas as pd
X=pd.read_csv('/content/kmeansdata.csv')
x1 = X['Distance_Feature'].values
x2 = X['Speeding_Feature'].values
X = np.array(list(zip(x1, x2))).reshape(len(x1), 2)
plt.plot()
plt.xlim([0, 100])
plt.ylim([0, 50])
plt.title('Dataset')
plt.scatter(x1, x2)
plt.show()
#code for EM
gmm = GaussianMixture(n_components=3)
gmm.fit(X)
em_predictions = gmm.predict(X)
print("\nEM predictions")
print(em_predictions)
print("mean:\n",gmm.means_)
print('\n')
print("Covariances\n",gmm.covariances_)
print(X)
plt.title('Expectation Maximization')
plt.scatter(X[:,0], X[:,1],c=em_predictions,s=50)
plt.show()

EM predictions
[1 1 1 ... 0 0 0]
mean:
[[180.12995794 10.18334766]
[ 50.04762937 8.82874097]
[179.34576209 66.43976809]]

Covariances
[[[ 3.58889799e+02 -3.86609795e-02]
[-3.86609795e-02 2.50576436e+01]]

[[ 1.02463951e+02 1.38088891e+00]
[ 1.38088891e+00 1.00051507e+02]]

[[ 4.20956286e+02 -4.07566432e+01]
[-4.07566432e+01 3.87123236e+02]]]
[[ 71.24 28. ]
[ 52.53 25. ]
[ 64.54 27. ]
...
[170.91 12. ]
[176.14 5. ]
[168.03 9. ]]
#code for Kmeans
import matplotlib.pyplot as plt1
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)

print(kmeans.cluster_centers_)
print(kmeans.labels_)

plt.title('KMEANS')
plt1.scatter(X[:,0], X[:,1], c=kmeans.labels_, cmap='rainbow')
plt1.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='black')

[[180.34311782 10.52011494]
[ 50.04763438 8.82875 ]
[177.83509615 70.28846154]]
[1 1 1 ... 0 0 0]
<matplotlib.collections.PathCollection at 0x7f652d2554d0>
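# The experiment asks us to compare the two clusterings and comment on their quality. One simple,
# hedged way to quantify this is the silhouette score (higher is better), reusing X, em_predictions
# and kmeans from the code above:
from sklearn.metrics import silhouette_score
print("Silhouette score (EM / Gaussian mixture): %.3f" % silhouette_score(X, em_predictions))
print("Silhouette score (k-Means):               %.3f" % silhouette_score(X, kmeans.labels_))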
Experiment-12: Exploratory Data Analysis for Classification using Pandas or Matplotlib.

Description:

The purpose of this EDA is to find insights which will serve us later, in another notebook, for data
cleaning/preparation/transformation, and which will ultimately be fed into a machine learning algorithm. We will
proceed as follows:

Example Dataset: House Prices - Advanced Regression Techniques

CODE:
# Preparations: For the preparations lets first import the necessary libraries and load the
files needed for our EDA.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('bmh')

df = pd.read_csv('/content/train.csv')
df.head()
[df.head() output: the first 5 rows of the 81-column training DataFrame (Id, MSSubClass, MSZoning, LotFrontage,
LotArea, Street, Alley, LotShape, ..., MiscVal, MoSold, YrSold, SaleType, SaleCondition, SalePrice);
the wide table is omitted here.]

5 rows × 81 columns

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Id 1460 non-null int64
1 MSSubClass 1460 non-null int64
2 MSZoning 1460 non-null object
3 LotFrontage 1201 non-null float64
4 LotArea 1460 non-null int64
5 Street 1460 non-null object
6 Alley 91 non-null object
7 LotShape 1460 non-null object
8 LandContour 1460 non-null object
9 Utilities 1460 non-null object
10 LotConfig 1460 non-null object
11 LandSlope 1460 non-null object
12 Neighborhood 1460 non-null object
13 Condition1 1460 non-null object
14 Condition2 1460 non-null object
15 BldgType 1460 non-null object
16 HouseStyle 1460 non-null object
17 OverallQual 1460 non-null int64
18 OverallCond 1460 non-null int64
19 YearBuilt 1460 non-null int64
20 YearRemodAdd 1460 non-null int64
21 RoofStyle 1460 non-null object
22 RoofMatl 1460 non-null object
23 Exterior1st 1460 non-null object
24 Exterior2nd 1460 non-null object
25 MasVnrType 1452 non-null object
26 MasVnrArea 1452 non-null float64
27 ExterQual 1460 non-null object
28 ExterCond 1460 non-null object
29 Foundation 1460 non-null object
30 BsmtQual 1423 non-null object
31 BsmtCond 1423 non-null object
32 BsmtExposure 1422 non-null object
33 BsmtFinType1 1423 non-null object
34 BsmtFinSF1 1460 non-null int64
35 BsmtFinType2 1422 non-null object
36 BsmtFinSF2 1460 non-null int64
37 BsmtUnfSF 1460 non-null int64
38 TotalBsmtSF 1460 non-null int64
39 Heating 1460 non-null object
40 HeatingQC 1460 non-null object
41 CentralAir 1460 non-null object
42 Electrical 1459 non-null object
43 1stFlrSF 1460 non-null int64
44 2ndFlrSF 1460 non-null int64
45 LowQualFinSF 1460 non-null int64
46 GrLivArea 1460 non-null int64
47 BsmtFullBath 1460 non-null int64
48 BsmtHalfBath 1460 non-null int64
49 FullBath 1460 non-null int64
50 HalfBath 1460 non-null int64
51 BedroomAbvGr 1460 non-null int64
52 KitchenAbvGr 1460 non-null int64
53 KitchenQual 1460 non-null object
54 TotRmsAbvGrd 1460 non-null int64
55 Functional 1460 non-null object
56 Fireplaces 1460 non-null int64
57 FireplaceQu 770 non-null object
58 GarageType 1379 non-null object
59 GarageYrBlt 1379 non-null float64
60 GarageFinish 1379 non-null object
61 GarageCars 1460 non-null int64
62 GarageArea 1460 non-null int64
63 GarageQual 1379 non-null object
64 GarageCond 1379 non-null object
65 PavedDrive 1460 non-null object
66 WoodDeckSF 1460 non-null int64
67 OpenPorchSF 1460 non-null int64
68 EnclosedPorch 1460 non-null int64
69 3SsnPorch 1460 non-null int64
70 ScreenPorch 1460 non-null int64
71 PoolArea 1460 non-null int64
72 PoolQC 7 non-null object
73 Fence 281 non-null object
74 MiscFeature 54 non-null object
75 MiscVal 1460 non-null int64
76 MoSold 1460 non-null int64
77 YrSold 1460 non-null int64
78 SaleType 1460 non-null object
79 SaleCondition 1460 non-null object
80 SalePrice 1460 non-null int64
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB

# From these observations, we can see that some features won't be relevant in our exploratory analysis as there are too
many missing values (such as Alley and PoolQC). Plus, there are so many features to analyse that it may be better to
concentrate on the ones which can give us real insights. Let's just remove Id and the features with fewer than 30% non-NaN values.

# df.count() does not include NaN values


df2 = df[[column for column in df if df[column].count() / len(df) >= 0.3]]
del df2['Id']
print("List of dropped columns:", end=" ")
for c in df.columns:
    if c not in df2.columns:
        print(c, end=", ")
print('\n')
df = df2

List of dropped columns: Id, Alley, PoolQC, Fence, MiscFeature,

# Now, let’s take a look at how the housing price is distributed

print(df['SalePrice'].describe())
plt.figure(figsize=(9, 8))
sns.distplot(df['SalePrice'], color='g', bins=100, hist_kws={'alpha': 0.4});

count 1460.000000
mean 180921.195890
std 79442.502883
min 34900.000000
25% 129975.000000
50% 163000.000000
75% 214000.000000
max 755000.000000
Name: SalePrice, dtype: float64
/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot`
is a deprecated function and will be removed in a future version. Please adapt your code to use
either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level
function for histograms).
  warnings.warn(msg, FutureWarning)
# With this information we can see that the prices are skewed right and some outliers lie above ~500,000.
We will eventually want to get rid of them to get a normal distribution of the target variable
(`SalePrice`) for machine learning.

# Numerical data distribution

list(set(df.dtypes.tolist()))
[dtype('O'), dtype('float64'), dtype('int64')]

df_num = df.select_dtypes(include = ['float64', 'int64'])


df_num.head()
[df_num.head() output: the first 5 rows of the 37 numeric columns (MSSubClass, LotFrontage, LotArea, OverallQual,
OverallCond, YearBuilt, YearRemodAdd, MasVnrArea, BsmtFinSF1, BsmtFinSF2, ..., MiscVal, MoSold, YrSold, SalePrice);
the wide table is omitted here.]

5 rows × 37 columns

df_num.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8); # the ; suppresses the verbose matplotlib output
# Features such as `1stFlrSF`, `TotalBsmtSF`, `LotFrontage`, `GrLivArea`... seem to share a distribution similar to
the one we have for `SalePrice`. Let's see if we can find new clues later.
# Correlation

# Now we'll try to find which features are strongly correlated with SalePrice. We'll store them in a var
called golden_features_list. We'll reuse our df_num dataset to do so.

df_num_corr = df_num.corr()['SalePrice'][:-1] # -1 because the latest row is SalePrice


golden_features_list = df_num_corr[abs(df_num_corr) > 0.5].sort_values(ascending=False)
print("There is {} strongly correlated values with SalePrice:\n{}".format(len(golden_features_list),
golden_features_list))

There is 10 strongly correlated values with SalePrice:


OverallQual 0.790982
GrLivArea 0.708624
GarageCars 0.640409
GarageArea 0.623431
TotalBsmtSF 0.613581
1stFlrSF 0.605852
FullBath 0.560664
TotRmsAbvGrd 0.533723
YearBuilt 0.522897
YearRemodAdd 0.507101
Name: SalePrice, dtype: float64

# Perfect, we now have a list of strongly correlated values, but this list is incomplete, as we know that
correlation is affected by outliers. So we could proceed as follows:

• Plot the numerical features and see which ones have very few or explainable outliers
• Remove the outliers from these features and see which ones keep a good correlation without
their outliers
By the way, correlation by itself does not always explain the relationship between data, so plotting them could even
lead us to new insights; in the same manner, we should check that our correlated values have a linear relationship to
the SalePrice.
For example, a curvilinear relationship cannot be guessed just by looking at the correlation value, so let's take the
features we excluded from our correlation table and plot them to see if they show some kind of pattern.

for i in range(0, len(df_num.columns), 5):
    sns.pairplot(data=df_num,
                 x_vars=df_num.columns[i:i+5],
                 y_vars=['SalePrice'])
# We can clearly identify some relationships. Most of them seem to have a linear relationship with
the SalePrice, and if we look closely at the data we can see that a lot of data points are located at x = 0, which
may indicate the absence of such a feature in the house.
Now, let's remove these 0 values and repeat the process of finding correlated values:

import operator

individual_features_df = []
for i in range(0, len(df_num.columns) - 1):  # -1 because the last column is SalePrice
    tmpDf = df_num[[df_num.columns[i], 'SalePrice']]
    tmpDf = tmpDf[tmpDf[df_num.columns[i]] != 0]
    individual_features_df.append(tmpDf)

all_correlations = {feature.columns[0]: feature.corr()['SalePrice'][0] for feature in individual_features_df}
all_correlations = sorted(all_correlations.items(), key=operator.itemgetter(1))
for (key, value) in all_correlations:
    print("{:>15}: {:>15}".format(key, value))


KitchenAbvGr: -0.1392006921778576
HalfBath: -0.08439171127179902
MSSubClass: -0.08428413512659509
OverallCond: -0.07785589404867797
YrSold: -0.028922585168736813
BsmtHalfBath: -0.02883456718548182
PoolArea: -0.014091521506356765
BsmtFullBath: 0.011439163340408606
MoSold: 0.046432245223819446
3SsnPorch: 0.06393243256889088
OpenPorchSF: 0.08645298857147718
MiscVal: 0.08896338917298921
Fireplaces: 0.12166058421363891
BsmtUnfSF: 0.16926100049514173
BedroomAbvGr: 0.18093669310848806
WoodDeckSF: 0.1937060123752066
BsmtFinSF2: 0.19895609430836594
EnclosedPorch: 0.24127883630117497
ScreenPorch: 0.2554300795487841
LotArea: 0.2638433538714051
LowQualFinSF: 0.30007501655501323
LotFrontage: 0.35179909657067737
MasVnrArea: 0.43409021975689227
BsmtFinSF1: 0.47169042652357296
GarageYrBlt: 0.4863616774878596
YearRemodAdd: 0.5071009671113866
YearBuilt: 0.5228973328794967
TotRmsAbvGrd: 0.5337231555820284
FullBath: 0.5745626737760822
1stFlrSF: 0.6058521846919153
GarageArea: 0.6084052829168346
TotalBsmtSF: 0.6096808188074374
GarageCars: 0.6370954062078923
2ndFlrSF: 0.6733048324568376
GrLivArea: 0.7086244776126515
OverallQual: 0.7909816005838053
# Very interesting! We found another strongly correlated value by cleaning up the data a bit. Now
our golden_features_list var looks like this:

golden_features_list = [key for key, value in all_correlations if abs(value) >= 0.5]
print("There is {} strongly correlated values with SalePrice:\n{}".format(len(golden_features_list),
                                                                           golden_features_list))

There is 11 strongly correlated values with SalePrice:

['YearRemodAdd', 'YearBuilt', 'TotRmsAbvGrd', 'FullBath', '1stFlrSF', 'GarageArea', 'TotalBsmtSF',


'GarageCars', '2ndFlrSF', 'GrLivArea', 'OverallQual']

# We found strongly correlated predictors with `SalePrice`. Later, with feature engineering, we may add
dummy values where a value of a given feature > 0 becomes 1 (presence of such a feature) and 0 stays 0 (absence).
For `2ndFlrSF`, for example, we could create a dummy value for its presence or absence and finally sum
it up with `1stFlrSF` (see the sketch below).
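# A minimal sketch of that feature-engineering idea (the column names 1stFlrSF and 2ndFlrSF come
# from the dataset; Has2ndFlr and TotalFlrSF are names introduced here for illustration):
df['Has2ndFlr'] = (df['2ndFlrSF'] > 0).astype(int)   # presence/absence dummy
df['TotalFlrSF'] = df['1stFlrSF'] + df['2ndFlrSF']   # combined floor area
print(df[['1stFlrSF', '2ndFlrSF', 'Has2ndFlr', 'TotalFlrSF']].head())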

# Conclusion

# By looking at correlation between numerical values we discovered 11 features which have a strong
relationship to the house price. Besides correlation, we didn't find any notable pattern in the features
that are not correlated with it.

# Feature to feature relationship


# Trying to plot all the numerical features in a seaborn pairplot will take us too much time and will be hard to
interpret.

corr = df_num.drop('SalePrice', axis=1).corr() # We already examined SalePrice correlations

plt.figure(figsize=(12, 10))

sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.4)],

cmap='viridis', vmax=1.0, vmin=-1.0, linewidths=0.1,

annot=True, annot_kws={"size": 8}, square=True);


# We can conclude that, in essence, some of those features may be combined with each other in order to
reduce the number of features (`1stFlrSF`/`TotalBsmtSF`, `GarageCars`/`GarageArea`), and others indicate
that people expect multiple features to be packaged together.

13. Write a program to construct a Bayesian network considering medical data. Use this model
to demonstrate the diagnosis of heart patients using standard Heart Disease Data Set. You can
use Java/Python ML library classes/API

Theory
A Bayesian network is a directed acyclic graph in which each edge corresponds to a conditional
dependency, and each node corresponds to a unique random variable.

Bayesian network consists of two major parts: a directed acyclic graph and a set of conditional
probability distributions
• The directed acyclic graph is a set of random variables represented by nodes.
• The conditional probability distribution of a node (random variable) is defined for every
possible outcome of the preceding causal node(s).

For illustration, consider the following example. Suppose we attempt to turn on our computer,
but the computer does not start (observation/evidence). We would like to know which of the
possible causes of computer failure is more likely. In this simplified illustration, we assume
only two possible causes of this misfortune: electricity failure and computer malfunction.
The corresponding directed acyclic graph is depicted in below figure.

Fig: Directed acyclic graph representing two independent possible causes of a computer failure.

The goal is to calculate the posterior conditional probability distribution of each of the possible
unobserved causes given the observed evidence, i.e. P [Cause | Evidence].
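To make the illustration concrete, the sketch below encodes this two-cause network with pgmpy and queries
P[Cause | Evidence]; the probability numbers are arbitrary values chosen only for the example, not taken
from any data set:

from pgmpy.models import BayesianModel
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination

# two independent causes pointing at one observed effect
model = BayesianModel([('Electricity', 'Computer'), ('Malfunction', 'Computer')])

# state 0 = OK, state 1 = failure; the probabilities below are illustrative assumptions
cpd_e = TabularCPD('Electricity', 2, [[0.95], [0.05]])
cpd_m = TabularCPD('Malfunction', 2, [[0.99], [0.01]])
cpd_c = TabularCPD('Computer', 2,
                   values=[[0.99, 0.05, 0.10, 0.01],   # P(Computer starts | E, M)
                           [0.01, 0.95, 0.90, 0.99]],  # P(Computer does not start | E, M)
                   evidence=['Electricity', 'Malfunction'], evidence_card=[2, 2])
model.add_cpds(cpd_e, cpd_m, cpd_c)

infer = VariableElimination(model)
# posterior of each cause given the evidence "computer does not start"
print(infer.query(['Electricity'], evidence={'Computer': 1}))
print(infer.query(['Malfunction'], evidence={'Computer': 1}))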

Data Set:
Title: Heart Disease Databases
The Cleveland database contains 76 attributes, but all published experiments refer to using a
subset of 14 of them. In particular, the Cleveland database is the only one that has been used
by ML researchers to this date. The "Heartdisease" field refers to the presence of heart disease
in the patient. It is integer valued from 0 (no presence) to 4.
Database: 0 1 2 3 4 Total
Cleveland: 164 55 36 35 13 303
Attribute Information:
1. age: age in years
2. sex: sex (1 = male; 0 = female)
3. cp: chest pain type
• Value 1: typical angina
• Value 2: atypical angina
• Value 3: non-anginal pain
• Value 4: asymptomatic
4. trestbps: resting blood pressure (in mm Hg on admission to the hospital)
5. chol: serum cholestoral in mg/dl
6. fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
7. restecg: resting electrocardiographic results
• Value 0: normal
• Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation
or depression of > 0.05 mV)
• Value 2: showing probable or definite left ventricular hypertrophy by Estes'
criteria
8. thalach: maximum heart rate achieved
9. exang: exercise induced angina (1 = yes; 0 = no)
10. oldpeak = ST depression induced by exercise relative to rest
11. slope: the slope of the peak exercise ST segment
• Value 1: upsloping
• Value 2: flat
• Value 3: downsloping
12. thal: 3 = normal; 6 = fixed defect; 7 = reversable defect
13. Heartdisease: It is integer valued from 0 (no presence) to 4.

Some instance from the dataset:


age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal Heartdisease
63 1 1 145 233 1 2 150 0 2.3 3 0 6 0
67 1 4 160 286 0 2 108 1 1.5 2 3 3 2
67 1 4 120 229 0 2 129 1 2.6 2 2 7 1
41 0 2 130 204 0 2 172 0 1.4 1 0 3 0
62 0 4 140 268 0 2 160 0 3.6 3 2 3 3
60 1 4 130 206 0 2 132 1 2.4 2 2 7 4

Program:

import numpy as np
import pandas as pd
import csv
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.models import BayesianModel
from pgmpy.inference import VariableElimination

#read Cleveland Heart Disease data


heartDisease = pd.read_csv('heart.csv')
heartDisease = heartDisease.replace('?',np.nan)

#display the data


print('Few examples from the dataset are given below')
print(heartDisease.head())

#display the Attributes names and datatyes

print('\n Attributes and datatypes')


print(heartDisease.dtypes)

#Create Model - Bayesian Network

model = BayesianModel([('age', 'heartdisease'), ('sex', 'heartdisease'),
                       ('exang', 'heartdisease'), ('cp', 'heartdisease'),
                       ('heartdisease', 'restecg'), ('heartdisease', 'chol')])
#Learning CPDs using Maximum Likelihood Estimators

print('\n Learning CPD using Maximum likelihood estimators')


model.fit(heartDisease,estimator=MaximumLikelihoodEstimator)

# Inferencing with Bayesian Network

print('\n Inferencing with Bayesian Network:')


HeartDiseasetest_infer = VariableElimination(model)

#computing the Probability of HeartDisease given restecg
print('\n 1. Probability of HeartDisease given evidence restecg = 1')
q1 = HeartDiseasetest_infer.query(variables=['heartdisease'], evidence={'restecg': 1})
print(q1)

#computing the Probability of HeartDisease given cp
print('\n 2. Probability of HeartDisease given evidence cp = 2')
q2 = HeartDiseasetest_infer.query(variables=['heartdisease'], evidence={'cp': 2})
print(q2)

Output:

[Posterior probability tables of heartdisease given evidence restecg = 1 and given evidence cp = 2]
Experiment-14: Write a program to Implement Support Vector Machines

CODE:

# Importing the libraries


import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the datasets
datasets = pd.read_csv('/content/Social_Network_Ads.csv')

X = datasets.iloc[:, [2,3]].values
Y = datasets.iloc[:, 4].values
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.25, random_state = 0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_Train = sc_X.fit_transform(X_Train)
X_Test = sc_X.transform(X_Test)
# Fitting the classifier into the Training set
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_Train, Y_Train)

SVC(kernel='linear', random_state=0)
# Predicting the test set results
Y_Pred = classifier.predict(X_Test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_Test, Y_Pred)
# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_Set, Y_Set = X_Train, Y_Train
X1, X2 = np.meshgrid(np.arange(start = X_Set[:, 0].min() - 1, stop = X_Set[:, 0].max() + 1, step = 0.01),
np.arange(start = X_Set[:, 1].min() - 1, stop = X_Set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(Y_Set)):
    plt.scatter(X_Set[Y_Set == j, 0], X_Set[Y_Set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Support Vector Machine (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

# Visualising the Test set results


from matplotlib.colors import ListedColormap
X_Set, Y_Set = X_Test, Y_Test
X1, X2 = np.meshgrid(np.arange(start = X_Set[:, 0].min() - 1, stop = X_Set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_Set[:, 1].min() - 1, stop = X_Set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(Y_Set)):
    plt.scatter(X_Set[Y_Set == j, 0], X_Set[Y_Set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Support Vector Machine (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
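# The confusion matrix cm is computed above but never displayed; a short addition to report it
# together with the test accuracy (reusing the variables from the program above):
from sklearn.metrics import accuracy_score
print("Confusion matrix:\n", cm)
print("Accuracy on the test set: %.2f" % accuracy_score(Y_Test, Y_Pred))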
