Lab Manual ML
AL 405 (Machine Learning LAB)
LIST OF PRACTICALS
Practical #  Objective
1   Implement and demonstrate the FIND-S algorithm on training data read from a .CSV file
2   Implement and demonstrate the Candidate-Elimination algorithm
3   Demonstrate the working of the decision tree based ID3 algorithm
4   Build an Artificial Neural Network using the Backpropagation algorithm
5   Study how the Naïve Bayes classifier works for a sample of training data
6   Classify documents with the Naïve Bayes classifier using built-in API classes
7   Construct a Bayesian network and use it for inference
8   Cluster a data set with the EM algorithm and k-Means, and compare the results
9   Implement the k-Nearest Neighbour algorithm on the iris data set
10  Implement the Locally Weighted Regression algorithm
Experiment 1: FIND-S Algorithm
Aim: Implement and demonstrate the FIND-S algorithm for finding the most
specific hypothesis based on a given set of training data samples. Read the
training data from a .CSV file.
import pandas as pd

data = pd.read_csv('EnjoySport.csv')   # pandas removed DataFrame.from_csv; read_csv replaces it
columnLength = data.shape[1]
print(data.values)

h = ['0'] * (columnLength - 1)   # most specific hypothesis
hp = []                          # positive examples
hn = []                          # negative examples
for trainingExample in data.values:
    if trainingExample[-1] != 'no':
        hp.append(list(trainingExample))
    else:
        hn.append(list(trainingExample))

# FIND-S: generalize h just enough to cover each positive example
for i in range(len(hp)):
    for j in range(columnLength - 1):
        if h[j] == '0':
            h[j] = hp[i][j]
        if h[j] != hp[i][j]:
            h[j] = '?'
        else:
            h[j] = hp[i][j]

print('\nThe positive Hypotheses are:', hp)
print('\nThe negative Hypotheses are:', hn)
print('\nThe Maximally Specific Hypothesis h is:', h)
Output:
Dataset:
Source: https://github.com/praahas/machine-learning-vtu
Experiment 2: Candidate-Elimination Algorithm
Aim: For a given set of training data examples stored in a .CSV file, implement
and demonstrate the Candidate-Elimination algorithm to output a description
of the set of all hypotheses consistent with the training examples.
import pandas as pd

data = pd.read_csv('EnjoySport.csv')   # pandas removed DataFrame.from_csv
concepts = data.values[:, :-1]   # attribute columns of every training example
target = data.values[:, -1]      # class label column
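The printed excerpt stops after loading the data. A minimal sketch of the elimination loop itself follows; the function name candidate_elimination and the variables specific_h/general_h are illustrative, not from the original listing, and the labels are assumed to be 'yes'/'no' as in the FIND-S experiment above.
def candidate_elimination(concepts, target):
    # Initialize S to the first positive example and G to the most general boundary.
    specific_h = concepts[0].copy()
    n = len(specific_h)
    general_h = [['?' for _ in range(n)] for _ in range(n)]
    for i, example in enumerate(concepts):
        if target[i] == 'yes':
            # Positive example: generalize S; relax G entries it contradicts.
            for j in range(n):
                if example[j] != specific_h[j]:
                    specific_h[j] = '?'
                    general_h[j][j] = '?'
        else:
            # Negative example: specialize G against S.
            for j in range(n):
                if example[j] != specific_h[j]:
                    general_h[j][j] = specific_h[j]
                else:
                    general_h[j][j] = '?'
    # Drop fully general rows, leaving the version-space boundaries.
    general_h = [g for g in general_h if g != ['?'] * n]
    return specific_h, general_h

s_final, g_final = candidate_elimination(concepts, target)
print('Final S:', s_final)
print('Final G:', g_final)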
Output:
Dataset:
Source: https://github.com/ggrao1/Candidate-Elimination
Experiment 3: Decision Tree based ID3 Algorithm
Aim: Write a program to demonstrate the working of the decision tree based
ID3 algorithm. Use an appropriate data set for building the decision tree and
apply this knowledge to classify a new sample.
import math

def infoGain(P, N):
    # Entropy of a two-class split with P examples of one class and N of the other
    return -P / (P + N) * math.log2(P / (P + N)) - N / (P + N) * math.log2(N / (P + N))

def getNextNode(data, AttributeList, concept, conceptVals, tree, addTo):
    Total = data.shape[0]
    countC = {}
    for cVal in conceptVals:
        dataCC = data[data[concept] == cVal]
        countC[cVal] = dataCC.shape[0]
    # Pure subset: every example has the same concept value, so insert a leaf
    if countC[conceptVals[0]] == 0:
        return insertConcept(tree, addTo, conceptVals[1])
    if countC[conceptVals[1]] == 0:
        return insertConcept(tree, addTo, conceptVals[0])
    if not AttributeList:
        # No attributes left to split on: label with the majority value
        return insertConcept(tree, addTo, max(countC, key=countC.get))
    ClassEntropy = infoGain(countC[conceptVals[1]], countC[conceptVals[0]])
    Attr = {}
    for a in AttributeList:
        Attr[a] = list(set(data[a]))
    AttrCount = {}
    EntropyAttr = {}
    for att in Attr:
        EntropyAttr[att] = 0
        for vals in Attr[att]:
            for c in conceptVals:
                iData = data[data[att] == vals]
                dataAtt = iData[iData[concept] == c]
                AttrCount[c] = dataAtt.shape[0]
            TotalInfo = AttrCount[conceptVals[1]] + AttrCount[conceptVals[0]]
            if AttrCount[conceptVals[1]] == 0 or AttrCount[conceptVals[0]] == 0:
                InfoGain = 0
            else:
                InfoGain = infoGain(AttrCount[conceptVals[1]], AttrCount[conceptVals[0]])
            # Expected entropy after splitting on att, weighted by subset size
            EntropyAttr[att] = EntropyAttr[att] + InfoGain * (TotalInfo / Total)
    Gain = {}
    for g in EntropyAttr:
        Gain[g] = ClassEntropy - EntropyAttr[g]
    Node = max(Gain, key=Gain.get)  # attribute with the highest information gain
    # The manual's excerpt ends here; the expansion step is sketched after the Source line
    return expandNode(data, AttributeList, concept, conceptVals, tree, addTo, Node)

def main():
    import pandas as pd
    data = pd.read_csv('PlayTennis.csv')   # pandas removed DataFrame.from_csv
    print(data)
    AttributeList = list(data)[:-1]
    concept = str(list(data)[-1])
    conceptVals = list(set(data[concept]))
    tree = getNextNode(data, AttributeList, concept, conceptVals, {'root': 'None'}, 'root')
    print(tree)

main()
Output:
Dataset:
Source: https://github.com/ggrao1/decision-tree
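The printed listing never defines insertConcept or the step that expands the chosen attribute into the tree. A minimal sketch of both is given below; expandNode is a hypothetical reconstruction, not the repository's exact code, and both definitions must appear above the main() call for the program to run.
def insertConcept(tree, addTo, concept):
    # Recursively find the placeholder named addTo and replace it with a leaf
    # label (or with a subtree, when called from expandNode below).
    for k, v in tree.items():
        if k == addTo:
            tree[k] = concept
        elif isinstance(v, dict):
            insertConcept(v, addTo, concept)
    return tree

def expandNode(data, AttributeList, concept, conceptVals, tree, addTo, Node):
    # Grow one branch per value of the chosen attribute, recursing on the
    # matching subset with Node removed from the candidate attributes.
    branches = {}
    remaining = [a for a in AttributeList if a != Node]
    for val in set(data[Node]):
        subset = data[data[Node] == val]
        subtree = getNextNode(subset, remaining, concept, conceptVals,
                              {'root': 'None'}, 'root')
        branches[val] = subtree['root']
    return insertConcept(tree, addTo, {Node: branches})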
Experiment 4: Artificial Neural Network using Backpropagation Algorithm
Aim: Build an Artificial Neural Network by implementing the Backpropagation
algorithm and test the same using appropriate data sets.
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))
def derivatives_sigmoid(x):
    return x * (1 - x)

# X and y are assumed here (the manual's listing omits them): two scaled
# input features and one target score per row.
X = np.array(([2, 9], [1, 5], [3, 6]), dtype=float)
y = np.array(([92], [86], [89]), dtype=float)
X = X / np.amax(X, axis=0)
y = y / 100
epoch = 7000
learning_rate = 0.1
inputlayer_neurons = 2
hiddenlayer_neurons = 3
output_neurons = 1
wh = np.random.uniform(size=(inputlayer_neurons, hiddenlayer_neurons))
bh = np.random.uniform(size=(1, hiddenlayer_neurons))
wo = np.random.uniform(size=(hiddenlayer_neurons, output_neurons))
bo = np.random.uniform(size=(1, output_neurons))
for i in range(epoch):
    # forward pass
    net_h = np.dot(X, wh) + bh
    sigma_h = sigmoid(net_h)
    net_o = np.dot(sigma_h, wo) + bo
    output = sigmoid(net_o)
    # backward pass: output- and hidden-layer error terms
    deltaK = (y - output) * derivatives_sigmoid(output)
    deltaH = deltaK.dot(wo.T) * derivatives_sigmoid(sigma_h)
    wo = wo + sigma_h.T.dot(deltaK) * learning_rate
    wh = wh + X.T.dot(deltaH) * learning_rate
    # bias updates (missing from the printed listing)
    bo = bo + np.sum(deltaK, axis=0, keepdims=True) * learning_rate
    bh = bh + np.sum(deltaH, axis=0, keepdims=True) * learning_rate
print("Actual Output:\n", y)
print("Predicted Output:\n", output)
Experiment 5: Naïve Bayes Classifier
Aim: Write a program to implement the Naïve Bayes classifier for a sample
training data set stored as a .CSV file. Compute the accuracy of the classifier,
considering a few test data sets.
import pandas as pd

def probAttr(data, attr, val):
    # Count and relative frequency of attr == val in the data
    Total = data.shape[0]
    cnt = len(data[data[attr] == val])
    return cnt, cnt / Total

def train(data, Attr, conceptVals, concept):
    conceptProbs = {}    # P(A): prior of each concept value
    countConcept = {}
    for cVal in conceptVals:
        countConcept[cVal], conceptProbs[cVal] = probAttr(data, concept, cVal)
    AttrConcept = {}        # P(X/A): likelihood of each attribute value given the concept
    probability_list = {}   # P(X): marginal of each attribute value
    for att in Attr:
        AttrConcept[att] = {}
        probability_list[att] = {}
        for val in Attr[att]:
            AttrConcept[att][val] = {}
            a, probability_list[att][val] = probAttr(data, att, val)
            for cVal in conceptVals:
                dataTemp = data[data[att] == val]
                AttrConcept[att][val][cVal] = len(dataTemp[dataTemp[concept] == cVal]) / countConcept[cVal]
    print("P(A) : ", conceptProbs, "\n")
    print("P(X/A) : ", AttrConcept, "\n")
    print("P(X) : ", probability_list, "\n")
    return conceptProbs, AttrConcept, probability_list

def test(examples, Attr, concept_list, conceptProbs, AttrConcept, probability_list):
    misclassification_count = 0
    Total = len(examples)
    for ex in examples:
        px = {}
        # Accumulate the posterior for each concept value; this matching scheme
        # assumes attribute values are distinct across columns.
        for a in Attr:
            for x in ex:
                for c in concept_list:
                    if x in AttrConcept[a]:
                        if c not in px:
                            px[c] = conceptProbs[c] * AttrConcept[a][x][c] / probability_list[a][x]
                        else:
                            px[c] = px[c] * AttrConcept[a][x][c] / probability_list[a][x]
        print(px)
        classification = max(px, key=px.get)
        print("Classification :", classification, "Expected :", ex[-1])
        if classification != ex[-1]:
            misclassification_count += 1
    misclassification_rate = misclassification_count * 100 / Total
    accuracy = 100 - misclassification_rate
    print("Misclassification Count={}".format(misclassification_count))
    print("Misclassification Rate={}%".format(misclassification_rate))
    print("Accuracy={}%".format(accuracy))

def main():
    data = pd.read_csv('PlayTennis_train1.csv')   # pandas removed DataFrame.from_csv
    concept = str(list(data)[-1])
    concept_list = set(data[concept])
    Attr = {}
    for a in list(data)[:-1]:
        Attr[a] = set(data[a])
    conceptProbs, AttrConcept, probability_list = train(data, Attr, concept_list, concept)
    examples = pd.read_csv('PlayTennis_test1.csv')   # missing opening quote fixed
    test(examples.values, Attr, concept_list, conceptProbs, AttrConcept, probability_list)

main()
Output:
Dataset:
Training Set
Testing example
Source: https://github.com/ggrao1/NaiveBayes
Experiment 6: Naïve Bayes Classifier using API
Aim: Assuming a set of documents that need to be classified, use the naïve
Bayesian Classifier model to perform this task. Built-in Java classes/API can be
used to write the program. Calculate the accuracy, precision, and recall for
your data set.
import pandas as pd
msg = pd.read_csv('document.csv', names=['message', 'label'])
print("Total Instances of Dataset: ", msg.shape[0])
msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0})
X = msg.message
y = msg.labelnum
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)
from sklearn.feature_extraction.text import CountVectorizer
count_v = CountVectorizer()
Xtrain_dm = count_v.fit_transform(Xtrain)
Xtest_dm = count_v.transform(Xtest)
df = pd.DataFrame(Xtrain_dm.toarray(), columns=count_v.get_feature_names_out())  # get_feature_names was removed in newer scikit-learn
print(df[0:5])
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(Xtrain_dm, ytrain)
pred = clf.predict(Xtest_dm)
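The listing stops at the predictions, while the aim asks for accuracy, precision, and recall. A short addition using sklearn.metrics on the held-out test split:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

print('Accuracy Metrics:')
print('Accuracy: ', accuracy_score(ytest, pred))
print('Recall: ', recall_score(ytest, pred))
print('Precision: ', precision_score(ytest, pred))
print('Confusion Matrix:\n', confusion_matrix(ytest, pred))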
Source: https://github.com/rumaan/machine-learning-lab-vtu
Dataset:
Experiment 7: Bayesian Network
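The listing below queries cancer_model without showing its construction. A minimal sketch that would make those calls work, assuming pgmpy and the textbook lung-cancer network (Pollution and Smoker as parents of Cancer, which points to Xray and Dyspnoea); the CPD values are illustrative assumptions, and in older pgmpy releases the model class is named BayesianModel:
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD

cancer_model = BayesianNetwork([('Pollution', 'Cancer'), ('Smoker', 'Cancer'),
                                ('Cancer', 'Xray'), ('Cancer', 'Dyspnoea')])

# Illustrative CPDs (values assumed, following the textbook cancer example)
cpd_poll = TabularCPD(variable='Pollution', variable_card=2, values=[[0.9], [0.1]])
cpd_smoke = TabularCPD(variable='Smoker', variable_card=2, values=[[0.3], [0.7]])
cpd_cancer = TabularCPD(variable='Cancer', variable_card=2,
                        values=[[0.03, 0.05, 0.001, 0.02],
                                [0.97, 0.95, 0.999, 0.98]],
                        evidence=['Smoker', 'Pollution'], evidence_card=[2, 2])
cpd_xray = TabularCPD(variable='Xray', variable_card=2,
                      values=[[0.9, 0.2], [0.1, 0.8]],
                      evidence=['Cancer'], evidence_card=[2])
cpd_dysp = TabularCPD(variable='Dyspnoea', variable_card=2,
                      values=[[0.65, 0.3], [0.35, 0.7]],
                      evidence=['Cancer'], evidence_card=[2])

cancer_model.add_cpds(cpd_poll, cpd_smoke, cpd_cancer, cpd_xray, cpd_dysp)
print(cancer_model.check_model())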
cancer_model.get_cpds()
print(cancer_model.get_cpds('Pollution'))
print(cancer_model.get_cpds('Smoker'))
print(cancer_model.get_cpds('Xray'))
print(cancer_model.get_cpds('Dyspnoea'))
print(cancer_model.get_cpds('Cancer'))
cancer_model.local_independencies('Xray')
cancer_model.local_independencies('Pollution')
cancer_model.local_independencies('Smoker')
cancer_model.local_independencies('Dyspnoea')
cancer_model.local_independencies('Cancer')
cancer_model.get_independencies()
Inferencing:
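The inferencing step itself is not printed. A minimal sketch using pgmpy's VariableElimination on the cancer model above; the query variable and evidence are illustrative:
from pgmpy.inference import VariableElimination

cancer_infer = VariableElimination(cancer_model)
q = cancer_infer.query(variables=['Cancer'], evidence={'Smoker': 1})
print(q)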
Diagnosis of heart patients using the standard Heart Disease Data Set:
import sys
import numpy as np
from urllib.request import urlopen
import matplotlib.pyplot as plt  # visuals
import seaborn as sns
import sklearn as skl
import pandas as pd

Cleveland_data_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.hungarian.data'
np.set_printoptions(threshold=sys.maxsize)  # print whole arrays; np.nan is no longer accepted here
names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'heartdisease']
heartDisease = pd.read_csv(urlopen(Cleveland_data_URL), names=names)  # load the heart-disease data
# Drop sparse columns before fitting the network
del heartDisease['ca']
del heartDisease['slope']
del heartDisease['thal']
del heartDisease['oldpeak']
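The queries below use a variable named model that the excerpt never builds. A sketch assuming pgmpy, with an illustrative edge structure over the remaining columns (the original repository's structure may differ), fitted by maximum likelihood:
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator

# Edge structure is an assumption for illustration, not the source's exact network.
model = BayesianNetwork([('age', 'trestbps'), ('age', 'fbs'), ('sex', 'trestbps'),
                         ('trestbps', 'heartdisease'), ('fbs', 'heartdisease'),
                         ('heartdisease', 'restecg'), ('heartdisease', 'thalach'),
                         ('heartdisease', 'chol')])
model.fit(heartDisease, estimator=MaximumLikelihoodEstimator)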
print(model.get_cpds('age'))
print(model.get_cpds('chol'))
print(model.get_cpds('sex'))
model.get_independencies()
Experiment 8: EM Algorithm and k-Means Clustering
Aim: Apply EM algorithm to cluster a set of data stored in a .CSV file. Use the
same data set for clustering using the k-Means algorithm. Compare the results of
these two algorithms and comment on the quality of clustering. You can add
Java/Python ML library classes/API in the program.
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.mixture import GaussianMixture
from sklearn.datasets import load_iris
import sklearn.metrics as sm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
dataset=load_iris()
X=pd.DataFrame(dataset.data)
X.columns=['Sepal_Length','Sepal_Width','Petal_Length','Petal_Width']
y=pd.DataFrame(dataset.target)
y.columns=['Targets']
plt.figure(figsize=(14,7))
colormap=np.array(['red','lime','black'])
#REAL PLOT
plt.subplot(1,3,1)
plt.scatter(X.Petal_Length,X.Petal_Width,c=colormap[y.Targets],s=40)
plt.title('Real')
#KMeans -PLOT
plt.subplot(1,3,2)
model=KMeans(n_clusters=3)
model.fit(X)
predY=np.choose(model.labels_,[0,1,2]).astype(np.int64)
plt.scatter(X.Petal_Length,X.Petal_Width,c=colormap[predY],s=40)
plt.title('KMeans')
#GMM PLOT
scaler=preprocessing.StandardScaler()
scaler.fit(X)
xsa=scaler.transform(X)
xs=pd.DataFrame(xsa,columns=X.columns)
gmm=GaussianMixture(n_components=3)
gmm.fit(xs)
y_cluster_gmm=gmm.predict(xs)
plt.subplot(1,3,3)
plt.scatter(X.Petal_Length,X.Petal_Width,c=colormap[y_cluster_gmm],s=40)
plt.title('GMM Classification')
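The aim asks for a comparison of the two clusterings. A short addition using sklearn.metrics (already imported as sm); note that cluster labels are arbitrary permutations of the true labels, so the confusion matrices are the more reliable guide:
print('K-Means accuracy:', sm.accuracy_score(y.Targets, model.labels_))
print('K-Means confusion matrix:\n', sm.confusion_matrix(y.Targets, model.labels_))
print('GMM accuracy:', sm.accuracy_score(y.Targets, y_cluster_gmm))
print('GMM confusion matrix:\n', sm.confusion_matrix(y.Targets, y_cluster_gmm))
plt.show()  # display the three scatter plots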
Output:
Dataset:
Experiment 9: k-Nearest Neighbour Algorithm
Aim: Write a program to implement k-Nearest Neighbour algorithm to classify the iris
data set. Print both correct and wrong predictions. Java/Python ML library classes can
be used for this problem.
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import numpy as np
dataset=load_iris()
X_train,X_test,y_train,y_test=train_test_split(dataset["data"],dataset["target"],random_state=0)
clf=KNeighborsClassifier(n_neighbors=1)
clf.fit(X_train,y_train)
for i in range(len(X_test)):
x=X_test[i]
x_new=np.array([x])
prediction=clf.predict(x_new)
print("TARGET=",y_test[i],dataset["target_names"][y_test[i]],"PREDICTED=",prediction,dataset["target_
names"][prediction])
print(clf.score(X_test,y_test))
Output:
TARGET= 2 virginica PREDICTED= [2] ['virginica']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 2 virginica PREDICTED= [2] ['virginica']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 2 virginica PREDICTED= [2] ['virginica']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 2 virginica PREDICTED= [2] ['virginica']
TARGET= 2 virginica PREDICTED= [2] ['virginica']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 0 setosa PREDICTED= [0] ['setosa']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 1 versicolor PREDICTED= [1] ['versicolor']
TARGET= 1 versicolor PREDICTED= [2] ['virginica']
Dataset:
Source: https://github.com/praahas/machine-learning-vtu
Experiment 10: Locally Weighted Regression Algorithm
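The printed listing picks up midway through the lowess routine: only the robustness-weight update and the driver survive. Below is a sketch of the elided head of the function, assuming the widely used NumPy lowess implementation that this tail matches (tricube kernel, local linear fits, robustness weights delta); it is meant to be read as one function together with the lines that follow it:
import numpy as np

def lowess(x, y, f, iterations):
    n = len(x)
    r = int(np.ceil(f * n))  # number of neighbours used per local fit
    # Distance to the r-th nearest neighbour of each point sets the bandwidth
    h = [np.sort(np.abs(x - x[i]))[r] for i in range(n)]
    w = np.clip(np.abs((x[:, None] - x[None, :]) / h), 0.0, 1.0)
    w = (1 - w ** 3) ** 3  # tricube kernel weights
    yest = np.zeros(n)
    delta = np.ones(n)  # robustness weights, updated each iteration
    for iteration in range(iterations):
        for i in range(n):
            weights = delta * w[:, i]
            b = np.array([np.sum(weights * y), np.sum(weights * y * x)])
            A = np.array([[np.sum(weights), np.sum(weights * x)],
                          [np.sum(weights * x), np.sum(weights * x * x)]])
            beta = np.linalg.solve(A, b)  # weighted least-squares line
            yest[i] = beta[0] + beta[1] * x[i]
        # ...the residual re-weighting printed in the manual completes this loop: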
        residuals = y - yest
        s = np.median(np.abs(residuals))
        delta = np.clip(residuals / (6.0 * s), -1, 1)
        delta = (1 - delta ** 2) ** 2  # bisquare robustness weights
    return yest

def main():
    import math
    import matplotlib.pyplot as plt
    n = 100
    x = np.linspace(0, 2 * math.pi, n)
    y = np.sin(x) + 0.3 * np.random.randn(n)  # noisy sine data
    f = 0.25
    iterations = 3
    yest = lowess(x, y, f, iterations)
    # plot the noisy data and the smoothed estimate
    plt.plot(x, y, 'r.', label='noisy data')
    plt.plot(x, yest, 'b-', label='lowess fit')
    plt.legend()
    plt.show()

main()
Output: