PESIT Bangalore South Campus: VII Semester Lab Manual Subject: Machine Learning
VII SEMESTER
LAB MANUAL

1. Implement and demonstrate the FIND-S algorithm for finding the most specific hypothesis based on a given set of training data samples. Read the training data from a .CSV file.
import csv
def loadCsv(filename):
    # Read the training examples from the .CSV file into a list of rows
    lines = csv.reader(open(filename, "rt"))
    dataset = list(lines)
    return dataset
attributes = ['Sky','Temp','Humidity','Wind','Water','Forecast']
print(attributes)
num_attributes = len(attributes)
filename = "Weather.csv"
dataset = loadCsv(filename)
print(dataset)
target=['Yes','Yes','No','Yes']
print(target)
hypothesis=['0'] * num_attributes
print(hypothesis)
for i in range(len(target)):
    # FIND-S only generalises the hypothesis on positive examples
    if(target[i] == 'Yes'):
        for j in range(num_attributes):
            if(hypothesis[j] == '0'):
                hypothesis[j] = dataset[i][j]
            if(hypothesis[j] != dataset[i][j]):
                hypothesis[j] = '?'
    print(i+1, '=', hypothesis)
print("Final Hypothesis")
print(hypothesis)
OUTPUT:
Final Hypothesis
['Sunny ', 'Warm', '?', 'Strong ', '?', '?']
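The contents of Weather.csv are not reproduced in the manual. The final hypothesis above is consistent with the classic EnjoySport examples, so a plausible (assumed) file, matching the target list ['Yes','Yes','No','Yes'], would be:

Sunny,Warm,Normal,Strong,Warm,Same
Sunny,Warm,High,Strong,Warm,Same
Rainy,Cold,High,Strong,Warm,Change
Sunny,Warm,High,Strong,Cool,Change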
2. For a given set of training data examples stored in a .CSV file, implement and
demonstrate the Candidate-Elimination algorithm to output a description of the set of
all hypotheses consistent with the training examples.
import numpy as np
import pandas as pd

def learn(concepts, target):
    '''learn() implements the learning method of the Candidate-Elimination algorithm.
    Arguments:
        concepts - an array with all the feature vectors
        target   - an array with the corresponding output values
    '''
    # Initialise S0 with the first instance from concepts.
    # .copy() makes sure a new array is created instead of just pointing to
    # the same memory location
    specific_h = concepts[0].copy()
    print("initialization of specific_h and general_h")
    print(specific_h)
    general_h = [["?" for i in range(len(specific_h))] for i in range(len(specific_h))]
    print(general_h)
    # The learning iterations
    for i, h in enumerate(concepts):
        if target[i] == "Yes":        # positive example: generalise the specific boundary
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    specific_h[x] = '?'
                    general_h[x][x] = '?'
        if target[i] == "No":         # negative example: specialise the general boundary
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    general_h[x][x] = specific_h[x]
                else:
                    general_h[x][x] = '?'
        print("Steps of Candidate Elimination Algorithm", i+1)
        print(specific_h)
        print(general_h)
    return specific_h, general_h
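The listing does not show how learn() is called. A minimal driver (a sketch; the file name trainingdata.csv and its column layout are assumptions) that prints the final boundaries as in the output below:

# Minimal driver (sketch): the CSV file name and its column layout are assumptions.
data = pd.read_csv('trainingdata.csv')
concepts = np.array(data.iloc[:, 0:-1])   # all columns except the last are features
target = np.array(data.iloc[:, -1])       # the last column holds the Yes/No labels
s_final, g_final = learn(concepts, target)
# Drop the rows of general_h that are still maximally general
g_final = [h for h in g_final if h != ['?'] * len(s_final)]
print("Final Specific_h:")
print(s_final)
print("Final General_h:")
print(g_final)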
OUTPUT:
initialization of specific_h and general_h
['Sunny' 'Warm' 'High' 'Strong' 'Warm' 'Same']
[['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'],
['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?']]
Steps of Candidate Elimination Algorithm 1
['Sunny' 'Warm' 'High' 'Strong' 'Warm' 'Same']
[['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'],
['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?']]
Steps of Candidate Elimination Algorithm 2
['Sunny' 'Warm' 'High' 'Strong' 'Warm' 'Same']
[['Sunny', '?', '?', '?', '?', '?'], ['?', 'Warm', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?',
'?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', 'Same']]
Steps of Candidate Elimination Algorithm 3
['Sunny' 'Warm' 'High' 'Strong' '?' '?']
[['Sunny', '?', '?', '?', '?', '?'], ['?', 'Warm', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?',
'?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?']]
Final Specific_h:
['Sunny' 'Warm' '?' 'Strong' '?' '?']
Final General_h:
[['Sunny', '?', '?', '?', '?', '?'], ['?', 'Warm', '?', '?', '?', '?']]
3. Write a program to demonstrate the working of the decision tree based ID3 algorithm.
Use an appropriate data set for building the decision tree and apply this knowledge to
classify a new sample.
import pandas as pd
import numpy as np
#Import the dataset and define the feature as well as the target datasets / columns
dataset = pd.read_csv('playtennis.csv',
                      names=['outlook','temperature','humidity','wind','class'])
#The last column ('class') holds the target; the remaining columns are the features
attributes = ('Outlook', 'Temperature', 'Humidity', 'Wind', 'PlayTennis')
def entropy(target_col):
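    # (Reconstructed body - the original listing omits it.)
    # Entropy H(S) = -sum_i p_i * log2(p_i) over the distinct target values.
    elements, counts = np.unique(target_col, return_counts=True)
    entropy = np.sum([(-counts[i]/np.sum(counts)) * np.log2(counts[i]/np.sum(counts))
                      for i in range(len(elements))])
    return entropy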
def InfoGain(data, split_attribute_name, target_name="class"):
    #Calculate the entropy of the total dataset
    total_entropy = entropy(data[target_name])
    #Calculate the values and the corresponding counts for the split attribute
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    #Calculate the weighted entropy of the subsets produced by the split
    Weighted_Entropy = np.sum([(counts[i]/np.sum(counts)) *
        entropy(data.where(data[split_attribute_name] == vals[i]).dropna()[target_name])
        for i in range(len(vals))])
    #Information gain = total entropy minus the weighted entropy after the split
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain
def ID3(data, originaldata, features, target_attribute_name="class", parent_node_class=None):
    #Define the stopping criteria --> if one of these is satisfied, we return a leaf node
    #If all target_values have the same value, return this value
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]
    #If the dataset is empty, return the mode target feature value in the original dataset
    elif len(data) == 0:
        return np.unique(originaldata[target_attribute_name])[
            np.argmax(np.unique(originaldata[target_attribute_name], return_counts=True)[1])]
    #If the feature space is empty, return the mode target feature value of the parent node
    elif len(features) == 0:
        return parent_node_class
    else:
        #Set the default value for this node --> the mode target feature value of the current node
        parent_node_class = np.unique(data[target_attribute_name])[
            np.argmax(np.unique(data[target_attribute_name], return_counts=True)[1])]
        #Select the feature which best splits the dataset (largest information gain)
        item_values = [InfoGain(data, feature, target_attribute_name) for feature in features]
        best_feature_index = np.argmax(item_values)
        best_feature = features[best_feature_index]
        #Create the tree structure. The root gets the name of the feature (best_feature) with the
        #maximum information gain in the first run
        tree = {best_feature: {}}
        #Remove the feature with the best information gain from the feature space
        features = [i for i in features if i != best_feature]
        #Grow a branch under the root node for each possible value of the root node feature
        for value in np.unique(data[best_feature]):
            #Split the dataset along the value of the feature with the largest information gain
            sub_data = data.where(data[best_feature] == value).dropna()
            #Call the ID3 algorithm for each of those sub_datasets with the new parameters
            #--> here the recursion comes in!
            subtree = ID3(sub_data, dataset, features, target_attribute_name, parent_node_class)
            #Add the sub tree, grown from the sub_dataset, to the tree under the root node
            tree[best_feature][value] = subtree
        return tree
def predict(query, tree, default=1):
    #1. Check for every feature in the query whether it appears in the learned tree
    for key in list(query.keys()):
        if key in list(tree.keys()):
            #2. If the feature value was never seen during training, return the default class
            try:
                result = tree[key][query[key]]
            except:
                return default
            #3. Follow the branch stored under this feature value
            result = tree[key][query[key]]
            #4. If the result is another dict it is a subtree: recurse; otherwise it is a leaf value
            if isinstance(result, dict):
                return predict(query, result)
            else:
                return result
def train_test_split(dataset):
    training_data = dataset.iloc[:14].reset_index(drop=True)
    # We drop / relabel the index starting from 0, because we do not want to
    # run into errors regarding the row labels / indexes
    #testing_data = dataset.iloc[10:].reset_index(drop=True)
    return training_data
    #,testing_data
def test(data, tree):
    # Create new query instances by removing the target feature column from the
    # original dataset and converting it to a dictionary
    queries = data.iloc[:, :-1].to_dict(orient="records")
    # Create an empty DataFrame in whose columns the predictions of the tree are stored
    predicted = pd.DataFrame(columns=["predicted"])
    # Calculate the prediction accuracy
    for i in range(len(data)):
        predicted.loc[i, "predicted"] = predict(queries[i], tree, 1.0)
    print('The prediction accuracy is: ',
          (np.sum(predicted["predicted"] == data["class"]) / len(data)) * 100, '%')
"""
Train the tree, Print the tree and predict the accuracy
"""
XX = train_test_split(dataset)
training_data=XX
#testing_data=XX[1]
tree = ID3(training_data,training_data,training_data.columns[:-1])
print(' Display Tree',tree)
print('len=',len(training_data))
test(training_data,tree)
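To classify a new sample, as the exercise asks, the learned tree can be queried with predict(). The sample below is hypothetical and uses the same label-encoded values that appear in the printed tree:

# Sketch: classify one new, hypothetical sample (the encoding of the values is an assumption)
new_sample = {'outlook': 0, 'temperature': 1, 'humidity': 0, 'wind': 0}
print('Predicted class for the new sample:', predict(new_sample, tree, 1.0))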
OUTPUT:
Display Tree {'outlook': {0: {'humidity': {0.0: 0.0, 1.0: 1.0}}, 1: 1.0, 2: {'wind': {0.0:
1.0, 1.0: 0.0}}}}
len= 14
The prediction accuracy is: 100.0 %
4. Build an Artificial Neural Network by implementing the Backpropagation algorithm
and test the same using appropriate data sets.
# Initialize a network
from random import random   # random() is used below; the import is missing in the original listing
def initialize_network(n_inputs, n_hidden, n_outputs):
    network = list()
    # Hidden layer: each neuron gets one weight per input plus a bias weight
    hidden_layer = [{'weights': [random() for i in range(n_inputs + 1)]} for i in range(n_hidden)]
    network.append(hidden_layer)
    # Output layer: each neuron gets one weight per hidden neuron plus a bias weight
    output_layer = [{'weights': [random() for i in range(n_hidden + 1)]} for i in range(n_outputs)]
    network.append(output_layer)
    return network
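The listing above only initialises the network. The remaining steps of a tutorial-style backpropagation implementation (forward propagation, backpropagation of the error and the weight update) are not reproduced in the manual; the sketch below shows one common way to write them for the weight-dictionary structure created by initialize_network(). The sigmoid transfer function and all function names here are assumptions.

from math import exp

# Calculate a neuron's activation for an input (the last weight is the bias)
def activate(weights, inputs):
    activation = weights[-1]
    for i in range(len(weights) - 1):
        activation += weights[i] * inputs[i]
    return activation

# Sigmoid transfer function and its derivative
def transfer(activation):
    return 1.0 / (1.0 + exp(-activation))

def transfer_derivative(output):
    return output * (1.0 - output)

# Forward-propagate an input row through the network and return the outputs
def forward_propagate(network, row):
    inputs = row
    for layer in network:
        new_inputs = []
        for neuron in layer:
            neuron['output'] = transfer(activate(neuron['weights'], inputs))
            new_inputs.append(neuron['output'])
        inputs = new_inputs
    return inputs

# Backpropagate the error and store a 'delta' in every neuron
def backward_propagate_error(network, expected):
    for i in reversed(range(len(network))):
        layer = network[i]
        errors = []
        if i != len(network) - 1:
            # Hidden layer: error is the weighted sum of the deltas of the next layer
            for j in range(len(layer)):
                error = 0.0
                for neuron in network[i + 1]:
                    error += neuron['weights'][j] * neuron['delta']
                errors.append(error)
        else:
            # Output layer: error is the difference between expected and actual output
            for j in range(len(layer)):
                neuron = layer[j]
                errors.append(expected[j] - neuron['output'])
        for j in range(len(layer)):
            neuron = layer[j]
            neuron['delta'] = errors[j] * transfer_derivative(neuron['output'])

# Update the network weights with the stored deltas (one stochastic gradient descent step)
def update_weights(network, row, l_rate):
    for i in range(len(network)):
        inputs = row[:-1]
        if i != 0:
            inputs = [neuron['output'] for neuron in network[i - 1]]
        for neuron in network[i]:
            for j in range(len(inputs)):
                neuron['weights'][j] += l_rate * neuron['delta'] * inputs[j]
            neuron['weights'][-1] += l_rate * neuron['delta']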
5. Write a program to implement the naive Bayesian classifier for a sample training data set stored as a .CSV file. Compute the accuracy of the classifier using a test data set.

import csv
import random
import math
#1. Load Data
def loadCsv(filename):
    # Read the CSV file and convert every attribute value to float
    lines = csv.reader(open(filename, "rt"))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset
#Separate data by class
def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated
#Calculate Mean
def mean(numbers):
    return sum(numbers)/float(len(numbers))
#Make a Prediction
def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel
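The driver below also calls splitDataset, summarizeByClass, getPredictions and getAccuracy, which are not reproduced in the listing. Sketches of those helpers (and of the Gaussian probability routine they rely on), in the same style as the functions above, might look like this; the exact split and summary logic is an assumption:

# --- Sketches of the helpers used by the driver but missing from the listing ---
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
    return math.sqrt(variance)

def splitDataset(dataset, splitRatio):
    # Randomly move splitRatio of the rows into the training set
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

def summarize(dataset):
    # (mean, stdev) for every attribute column, dropping the class column
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries

def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries

def calculateProbability(x, mean, stdev):
    # Gaussian probability density function
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean_, stdev_ = classSummaries[i]
            probabilities[classValue] *= calculateProbability(inputVector[i], mean_, stdev_)
    return probabilities

def getPredictions(summaries, testSet):
    return [predict(summaries, row) for row in testSet]

def getAccuracy(testSet, predictions):
    correct = sum(1 for i in range(len(testSet)) if testSet[i][-1] == predictions[i])
    return (correct / float(len(testSet))) * 100.0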
filename = 'DBetes.csv'
splitRatio = 0.70
dataset = loadCsv(filename)
trainingSet, testSet = splitDataset(dataset, splitRatio)
print('Split {0} rows into train={1} and test={2} rows'.format(len(dataset), len(trainingSet), len(testSet)))
# prepare model
summaries = summarizeByClass(trainingSet)
# test model
predictions = getPredictions(summaries, testSet)
accuracy = getAccuracy(testSet, predictions)
print('Accuracy: {0}%'.format(accuracy))
OUTPUT:

6. Assuming a set of documents that need to be classified, use the naive Bayesian classifier model to perform this task. Built-in library classes/API can be used. Calculate the accuracy, precision and recall for your data set.
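The listing starts at step 3, so the objects twenty_train and X_train_counts that it uses are never created. A minimal sketch of the missing first two steps, using the same scikit-learn API the rest of the listing relies on:

#1 Load the training data (sketch of the missing step)
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)

#2 Extract bag-of-words count features from the text files (sketch of the missing step)
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
print(X_train_counts.shape)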
#3 TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)
# Machine Learning
#4 Training Naive Bayes (NB) classifier on training data.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
# Building a pipeline: we can write less code and do all of the above by building a pipeline as follows:
# The names ‘vect’ , ‘tfidf’ and ‘clf’ are arbitrary but will be used later.
# We will be using the 'text_clf' going forward.
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
# Performance of NB Classifier
import numpy as np
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted = text_clf.predict(twenty_test.data)
accuracy=np.mean(predicted == twenty_test.target)
print("Predicted Accuracy = ",accuracy)
7. Write a program to construct a Bayesian network considering medical data. Use this model to demonstrate the diagnosis of heart patients using a standard Heart Disease data set. Java/Python ML library classes/API can be used.

import bayespy as bp
import numpy as np
import csv
from colorama import init
from colorama import Fore, Back, Style
init()
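# (Sketch) The enumerations and the CSV-reading loop that feed the data.append(...) call
# below are missing from the listing. The category names, their codes and the file name
# 'heart_disease_data.csv' are assumptions; only the number of categories per variable is
# constrained by the Dirichlet priors used further down (2, 2, 3, 4, 3, and 2 for the target).
ageEnum = {'SuperSeniorCitizen': 0, 'SeniorCitizen': 1, 'MiddleAged': 2, 'Youth': 3, 'Teen': 4}
genderEnum = {'Male': 0, 'Female': 1}
familyHistoryEnum = {'Yes': 0, 'No': 1}
dietEnum = {'High': 0, 'Medium': 1, 'Low': 2}
lifeStyleEnum = {'Athlete': 0, 'Active': 1, 'Moderate': 2, 'Sedentary': 3}
cholesterolEnum = {'High': 0, 'BorderLine': 1, 'Normal': 2}
heartDiseaseEnum = {'Yes': 0, 'No': 1}
data = []
with open('heart_disease_data.csv') as csvfile:
    lines = csv.reader(csvfile)
    for x in lines: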
        data.append([ageEnum[x[0]], genderEnum[x[1]], familyHistoryEnum[x[2]], dietEnum[x[3]],
                     lifeStyleEnum[x[4]], cholesterolEnum[x[5]], heartDiseaseEnum[x[6]]])
# Training data for machine learning todo: should import from csv
data = np.array(data)
N = len(data)
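# (Sketch) The age node is missing from the listing but is required by the interactive test
# below; five age categories are assumed, matching ageEnum above.
p_age = bp.nodes.Dirichlet(1.0*np.ones(5))
age = bp.nodes.Categorical(p_age, plates=(N,))
age.observe(data[:, 0])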
p_gender = bp.nodes.Dirichlet(1.0*np.ones(2))
gender = bp.nodes.Categorical(p_gender, plates=(N,))
gender.observe(data[:,1])
p_familyhistory = bp.nodes.Dirichlet(1.0*np.ones(2))
familyhistory = bp.nodes.Categorical(p_familyhistory, plates=(N,))
familyhistory.observe(data[:,2])
p_diet = bp.nodes.Dirichlet(1.0*np.ones(3))
diet = bp.nodes.Categorical(p_diet, plates=(N,))
diet.observe(data[:,3])
p_lifestyle = bp.nodes.Dirichlet(1.0*np.ones(4))
lifestyle = bp.nodes.Categorical(p_lifestyle, plates=(N,))
lifestyle.observe(data[:,4])
p_cholesterol = bp.nodes.Dirichlet(1.0*np.ones(3))
cholesterol = bp.nodes.Categorical(p_cholesterol, plates=(N,))
cholesterol.observe(data[:,5])
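# (Sketch) The heart-disease node and its inference step are missing from the listing, but
# p_heartdisease is referenced by the interactive test below. The plate sizes follow the
# category counts assumed above.
p_heartdisease = bp.nodes.Dirichlet(np.ones(2), plates=(5, 2, 2, 3, 4, 3))
heartdisease = bp.nodes.MultiMixture([age, gender, familyhistory, diet, lifestyle, cholesterol],
                                     bp.nodes.Categorical, p_heartdisease)
heartdisease.observe(data[:, 6])
p_heartdisease.update()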
# Interactive Test
m = 0
while m == 0:
    print("\n")
    res = bp.nodes.MultiMixture([int(input('Enter Age: ' + str(ageEnum))),
                                 int(input('Enter Gender: ' + str(genderEnum))),
                                 int(input('Enter FamilyHistory: ' + str(familyHistoryEnum))),
                                 int(input('Enter dietEnum: ' + str(dietEnum))),
                                 int(input('Enter LifeStyle: ' + str(lifeStyleEnum))),
                                 int(input('Enter Cholesterol: ' + str(cholesterolEnum)))],
                                bp.nodes.Categorical, p_heartdisease).get_moments()[0][heartDiseaseEnum['Yes']]
    print("Probability(HeartDisease) = " + str(res))
    #print(Style.RESET_ALL)
    m = int(input("Enter 0 to continue, 1 to exit: "))   # loop-exit prompt (missing in the original listing)
8. Apply the EM algorithm to cluster a set of data. Use the same data set for clustering with the k-Means algorithm and compare the results of the two algorithms.

EM algorithm:
import numpy as np
from scipy import stats
np.random.seed(110)
red_mean = 3
red_std = 0.8
blue_mean = 7
blue_std = 1
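# (Sketch) The data-generation step is missing from the listing; 'both_colours', used below,
# is assumed to be drawn from the two Gaussians above (the sample sizes are assumptions).
red = np.random.normal(red_mean, red_std, size=40)
blue = np.random.normal(blue_mean, blue_std, size=40)
both_colours = np.sort(np.concatenate((red, blue)))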
#Since the colours are hidden from us, we will start the EM process
#Starting guesses are very critical because the EM algorithm converges to a local
#maximum, so different starting points can give different answers
#One reasonably good guess would be to take the value from a different but less
#robust algorithm
# estimates for the mean
red_mean_guess = 2.1
blue_mean_guess = 6
#The variable both_colours holds each data point. The function stats.norm computes
#the probability of the point under a normal distribution with the given parameters:
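# (Sketch) Initial guesses for the standard deviations and the weighted estimators used in
# the M-step are missing from the listing; the guess values below are assumptions.
red_std_guess = 1.5
blue_std_guess = 1.5

def estimate_mean(data, weight):
    # Weighted mean: each point contributes in proportion to its responsibility
    return np.sum(data * weight) / np.sum(weight)

def estimate_std(data, weight, mean):
    # Weighted standard deviation around the newly estimated mean
    variance = np.sum(weight * (data - mean)**2) / np.sum(weight)
    return np.sqrt(variance)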
for i in range(10):
    # E-step: likelihood of every point under the current red and blue estimates
    likelihood_of_red = stats.norm(red_mean_guess, red_std_guess).pdf(both_colours)
    likelihood_of_blue = stats.norm(blue_mean_guess, blue_std_guess).pdf(both_colours)
    # Normalise these likelihoods so that the weights for each point total 1
    likelihood_total = likelihood_of_red + likelihood_of_blue
    # With our current estimates and our newly computed weights, we can now compute new,
    # probably better, estimates for the parameters (step 4). We need a function for the
    # mean and a function for the standard deviation:
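    # (Sketch) The M-step updates are missing from the listing: turn the likelihoods into
    # responsibilities and re-estimate the parameters of both Gaussians.
    red_weight = likelihood_of_red / likelihood_total
    blue_weight = likelihood_of_blue / likelihood_total
    red_mean_guess = estimate_mean(both_colours, red_weight)
    blue_mean_guess = estimate_mean(both_colours, blue_weight)
    red_std_guess = estimate_std(both_colours, red_weight, red_mean_guess)
    blue_std_guess = estimate_std(both_colours, blue_weight, blue_mean_guess)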
#Lets print the model parameters (The means and the std deviation in our case)
print("red mean:", red_mean_guess, ":::::::::", "blue mean:", blue_mean_guess)
print("red std:", red_std_guess, ":::::::::", "blue std:", blue_std_guess)
# Plot the current estimate of the blue distribution
import matplotlib.pyplot as plt        # missing import in the original listing
mublue = blue_mean_guess
sigmablue = blue_std_guess
y = np.linspace(mublue - 2.5*sigmablue, mublue + 2.5*sigmablue, 100)
plt.plot(y, stats.norm.pdf(y, mublue, sigmablue))   # mlab.normpdf has been removed from matplotlib
plt.show()
# set parameters
red_mean = 3
red_std = 0.8
blue_mean = 7
blue_std = 1
#We will need the elbow curve for calculating the exact value of k,
#but we will use k=2 for now
from sklearn.cluster import KMeans     # missing import in the original listing
import pylab as pl                     # missing import in the original listing ('pl' is used below)
kmeans = KMeans(n_clusters=2)
kmeansoutput = kmeans.fit(both_colours.reshape(-1, 1))
pl.xlabel('Number of Clusters')
pl.ylabel('Score')
pl.title('Elbow Curve')
pl.show()
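To compare the two clusterings, the k-Means cluster centres can be printed next to the final EM mean estimates (a sketch using the standard cluster_centers_ attribute):

# Sketch: compare the results of the two algorithms
print("k-Means cluster centres:", kmeansoutput.cluster_centers_.ravel())
print("EM mean estimates      :", red_mean_guess, blue_mean_guess)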
OUTPUT:
9. Write a program to implement k-Nearest Neighbour algorithm to classify the iris data
set. Print both correct and wrong predictions. Java/Python ML library classes can be
used for this problem.
#1.Import Data
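# (Sketch) The data-loading and training steps are missing from the listing; scikit-learn
# classes are assumed, as the problem statement allows. k=3 and the default 75/25 split
# (which gives the 38 test samples seen in the output) are assumptions.
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

iris = load_iris()
print("Feature Names:", iris.feature_names)
print("Iris Data:")
print(iris.data)
print("Target Names:", iris.target_names)
print("Target:", iris.target)

#2. Split into train/test sets and fit a k-NN classifier
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)
print("Accuracy=", clf.score(X_test, y_test))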
print("Predicted Data")
print(clf.predict(X_test))
prediction = clf.predict(X_test)
print("Test data :")
print(y_test)
diff = prediction - y_test
print("Result is ")
print(diff)
print('Total no of samples misclassified =', sum(abs(diff)))
OUTPUT:
Feature Names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Iris Data:
[[5.1 3.5 1.4 0.2]
[4.9 3. 1.4 0.2]
[4.7 3.2 1.3 0.2]
[4.6 3.1 1.5 0.2]
[5. 3.6 1.4 0.2]
[5.4 3.9 1.7 0.4]
[4.6 3.4 1.4 0.3]
[5. 3.4 1.5 0.2]
[4.4 2.9 1.4 0.2]
[4.9 3.1 1.5 0.1]
[5.4 3.7 1.5 0.2]
[4.8 3.4 1.6 0.2]
[4.8 3. 1.4 0.1]
[4.3 3. 1.1 0.1]
[5.8 4. 1.2 0.2]
[5.7 4.4 1.5 0.4]
[5.4 3.9 1.3 0.4]
[5.1 3.5 1.4 0.3]
[5.7 3.8 1.7 0.3]
[5.1 3.8 1.5 0.3]
[5.4 3.4 1.7 0.2]
[5.1 3.7 1.5 0.4]
[4.6 3.6 1. 0.2]
[5.1 3.3 1.7 0.5]
[4.8 3.4 1.9 0.2]
[5. 3. 1.6 0.2]
[5. 3.4 1.6 0.4]
[5.2 3.5 1.5 0.2]
[5.2 3.4 1.4 0.2]
[4.7 3.2 1.6 0.2]
[4.8 3.1 1.6 0.2]
[5.4 3.4 1.5 0.4]
[5.2 4.1 1.5 0.1]
[5.5 4.2 1.4 0.2]
[4.9 3.1 1.5 0.1]
[5. 3.2 1.2 0.2]
[5.5 3.5 1.3 0.2]
[4.9 3.1 1.5 0.1]
[4.4 3. 1.3 0.2]
[5.1 3.4 1.5 0.2]
[5. 3.5 1.3 0.3]
[4.5 2.3 1.3 0.3]
[4.4 3.2 1.3 0.2]
[5. 3.5 1.6 0.6]
[5.1 3.8 1.9 0.4]
[4.8 3. 1.4 0.3]
[5.1 3.8 1.6 0.2]
[4.6 3.2 1.4 0.2]
[5.3 3.7 1.5 0.2]
[5. 3.3 1.4 0.2]
[7. 3.2 4.7 1.4]
[6.4 3.2 4.5 1.5]
[6.9 3.1 4.9 1.5]
[5.5 2.3 4. 1.3]
[6.5 2.8 4.6 1.5]
[5.7 2.8 4.5 1.3]
[6.3 3.3 4.7 1.6]
[4.9 2.4 3.3 1. ]
[6.6 2.9 4.6 1.3]
[5.2 2.7 3.9 1.4]
[5. 2. 3.5 1. ]
[5.9 3. 4.2 1.5]
[6. 2.2 4. 1. ]
[6.1 2.9 4.7 1.4]
[5.6 2.9 3.6 1.3]
[6.7 3.1 4.4 1.4]
[5.6 3. 4.5 1.5]
[5.8 2.7 4.1 1. ]
[6.2 2.2 4.5 1.5]
[5.6 2.5 3.9 1.1]
[5.9 3.2 4.8 1.8]
[6.1 2.8 4. 1.3]
[6.3 2.5 4.9 1.5]
[6.1 2.8 4.7 1.2]
[6.4 2.9 4.3 1.3]
[6.6 3. 4.4 1.4]
[6.8 2.8 4.8 1.4]
[6.7 3. 5. 1.7]
[6. 2.9 4.5 1.5]
[5.7 2.6 3.5 1. ]
[5.5 2.4 3.8 1.1]
[5.5 2.4 3.7 1. ]
[5.8 2.7 3.9 1.2]
[6. 2.7 5.1 1.6]
[5.4 3. 4.5 1.5]
[6. 3.4 4.5 1.6]
[6.7 3.1 4.7 1.5]
[6.3 2.3 4.4 1.3]
[5.6 3. 4.1 1.3]
[5.5 2.5 4. 1.3]
[5.5 2.6 4.4 1.2]
[6.1 3. 4.6 1.4]
[5.8 2.6 4. 1.2]
[5. 2.3 3.3 1. ]
[5.6 2.7 4.2 1.3]
[5.7 3. 4.2 1.2]
[5.7 2.9 4.2 1.3]
[6.2 2.9 4.3 1.3]
[5.1 2.5 3. 1.1]
[5.7 2.8 4.1 1.3]
[6.3 3.3 6. 2.5]
[5.8 2.7 5.1 1.9]
[7.1 3. 5.9 2.1]
[6.3 2.9 5.6 1.8]
[6.5 3. 5.8 2.2]
[7.6 3. 6.6 2.1]
[4.9 2.5 4.5 1.7]
[7.3 2.9 6.3 1.8]
[6.7 2.5 5.8 1.8]
[7.2 3.6 6.1 2.5]
[6.5 3.2 5.1 2. ]
[6.4 2.7 5.3 1.9]
[6.8 3. 5.5 2.1]
[5.7 2.5 5. 2. ]
[5.8 2.8 5.1 2.4]
[6.4 3.2 5.3 2.3]
[6.5 3. 5.5 1.8]
[7.7 3.8 6.7 2.2]
[7.7 2.6 6.9 2.3]
[6. 2.2 5. 1.5]
[6.9 3.2 5.7 2.3]
[5.6 2.8 4.9 2. ]
[7.7 2.8 6.7 2. ]
[6.3 2.7 4.9 1.8]
[6.7 3.3 5.7 2.1]
[7.2 3.2 6. 1.8]
[6.2 2.8 4.8 1.8]
[6.1 3. 4.9 1.8]
[6.4 2.8 5.6 2.1]
[7.2 3. 5.8 1.6]
[7.4 2.8 6.1 1.9]
[7.9 3.8 6.4 2. ]
[6.4 2.8 5.6 2.2]
[6.3 2.8 5.1 1.5]
[6.1 2.6 5.6 1.4]
[7.7 3. 6.1 2.3]
[6.3 3.4 5.6 2.4]
[6.4 3.1 5.5 1.8]
[6. 3. 4.8 1.8]
[6.9 3.1 5.4 2.1]
[6.7 3.1 5.6 2.4]
[6.9 3.1 5.1 2.3]
[5.8 2.7 5.1 1.9]
[6.8 3.2 5.9 2.3]
[6.7 3.3 5.7 2.5]
[6.7 3. 5.2 2.3]
[6.3 2.5 5. 1.9]
[6.5 3. 5.2 2. ]
[6.2 3.4 5.4 2.3]
[5.9 3. 5.1 1.8]]
Target Names: ['setosa' 'versicolor' 'virginica']
Target: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
Accuracy= 0.9473684210526315
Predicted Data
[2 1 2 1 0 1 1 1 2 2 0 2 1 1 1 2 2 1 2 2 1 1 1 2 0 0 1 0 1 0 2 0 1 1 0 1 1 2]
Test data :
[2 1 2 1 0 1 1 1 1 2 0 2 1 1 1 2 2 1 2 1 1 1 1 2 0 0 1 0 1 0 2 0 1 1 0 1 1 2]
Result is
[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Total no of samples misclassified = 2
10. Implement the non-parametric Locally Weighted Regression algorithm in order to fit
data points. Select appropriate data set for your experiment and draw graphs.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#Weigh each point by its distance to the reference point. We are considering
# All points here. If KNN was the topic, we could restrict this to "K"
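# (Sketch) The Gaussian kernel used by localWeight() below is missing from the listing.
def kernel(point, xmat, k):
    m, n = np.shape(xmat)
    weights = np.mat(np.eye(m))          # identity matrix -> one weight per training point
    for j in range(m):
        diff = point - xmat[j]
        weights[j, j] = np.exp(diff * diff.T / (-2.0 * k**2))
    return weights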
def localWeight(point, xmat, ymat, k):
    wei = kernel(point, xmat, k)
    W = (X.T*(wei*X)).I*(X.T*(wei*ymat.T))
    return W
def localWeightRegression(xmat, ymat, k):
    m, n = np.shape(xmat)
    ypred = np.zeros(m)
    for i in range(m):
        # predicted value y = w*x. Here w = weights we have computed.
        # Remember that both w and x are vectors here (2x1 and 1x2 respectively),
        # so the resulting value of y is a scalar
        ypred[i] = xmat[i]*localWeight(xmat[i], xmat, ymat, k)
    return ypred
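# (Sketch) Loading the data and building the column matrices used below; the file name
# 'data.csv' and the column names colA/colB are assumptions based on the variable names
# that appear in the listing.
data = pd.read_csv('data.csv')
colA = np.array(data.colA)
colB = np.array(data.colB)
mcolA = np.mat(colA)
mcolB = np.mat(colB)
m = np.shape(mcolA)[1]
one = np.mat(np.ones(m))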
#horizontally stack
X= np.hstack((one.T,mcolA.T))
print(X.shape)
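The listing stops after building X; a sketch of the remaining steps (running the regression and drawing the graph the exercise asks for) is shown below. The bandwidth value k=0.5 is an assumption.

# Sketch: run locally weighted regression and plot the fitted curve over the data
ypred = localWeightRegression(X, mcolB, 0.5)
SortIndex = X[:, 1].argsort(0)
xsort = X[SortIndex][:, 0]
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(colA, colB, color='green')
ax.plot(xsort[:, 1], ypred[SortIndex], color='red', linewidth=3)
plt.xlabel('colA')
plt.ylabel('colB')
plt.show()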
OUTPUT:
(80, 2)