ML Lab R20
INDEX
5. Develop a program for Bias, Variance, Remove duplicates, Cross Validation
6. Write a program to implement Categorical Encoding, One-hot Encoding
15. Write a program to Implement Principal Component Analysis
Experiment-1
AIM: Implement and demonstrate the FIND-S algorithm for finding the most specific hypothesis based on a given set of training data samples. Read the training data from a .CSV file.
Dataset:
sky Airtemp Humidity Wind Water Forecast EnjoySports
sunny warm normal strong warm same yes
sunny warm high strong warm same yes
rainy cold high strong warm change no
sunny warm high strong cool change yes
PROGRAM:
import csv

num_attributes = 6
a = []
print("\nThe Given Training Data Set\n")
with open('enjoyysports.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        a.append(row)
        print(row)

# Start from the first training example and generalize from there
hypothesis = ['0'] * num_attributes
print("\nThe initial value of hypothesis:")
print(hypothesis)
for j in range(0, num_attributes):
    hypothesis[j] = a[0][j]

print("\nFind S: Finding a Maximally Specific Hypothesis\n")
for i in range(0, len(a)):
    # Only positive examples can generalize the hypothesis
    if a[i][num_attributes] == 'yes':
        for j in range(0, num_attributes):
            if a[i][j] != hypothesis[j]:
                hypothesis[j] = '?'
            else:
                hypothesis[j] = a[i][j]
    print("For Training instance No:", i, "the hypothesis is", hypothesis)

print("\nThe Maximally Specific Hypothesis for a given Training Examples:")
print(hypothesis)
OUTPUT:
The Given Training Data Set

['sunny', 'warm', 'normal', 'strong', 'warm', 'same', 'yes']
['sunny', 'warm', 'high', 'strong', 'warm', 'same', 'yes']
['rainy', 'cold', 'high', 'strong', 'warm', 'change', 'no']
['sunny', 'warm', 'high', 'strong', 'cool', 'change', 'yes']

The initial value of hypothesis:
['0', '0', '0', '0', '0', '0']

Find S: Finding a Maximally Specific Hypothesis

For Training instance No: 0 the hypothesis is ['sunny', 'warm', 'normal', 'strong', 'warm', 'same']
For Training instance No: 1 the hypothesis is ['sunny', 'warm', '?', 'strong', 'warm', 'same']
For Training instance No: 2 the hypothesis is ['sunny', 'warm', '?', 'strong', 'warm', 'same']
For Training instance No: 3 the hypothesis is ['sunny', 'warm', '?', 'strong', '?', '?']

The Maximally Specific Hypothesis for a given Training Examples:
['sunny', 'warm', '?', 'strong', '?', '?']
Experiment-2
AIM: For a given set of training data examples stored in a .CSV file, implement and
demonstrate the Candidate-Elimination algorithm to output a description of the set of all
hypotheses consistent with the training examples.
Dataset:
sky Airtemp Humidity Wind Water Forecast PlayTennis
sunny warm normal strong warm same yes
sunny warm high strong warm same yes
rainy cold high strong warm change no
sunny warm high strong cool change yes
PROGRAM:
import numpy as np
import pandas as pd

data = pd.DataFrame(data=pd.read_csv('enjoyysports.csv'))
concepts = np.array(data.iloc[:, 0:-1])
print(concepts)
target = np.array(data.iloc[:, -1])
print(target)

def learn(concepts, target):
    specific_h = concepts[0].copy()
    print("Initialization of specific_h and general_h")
    print(specific_h)
    general_h = [["?" for i in range(len(specific_h))] for i in range(len(specific_h))]
    print(general_h)
    for i, h in enumerate(concepts):
        # Positive examples generalize specific_h
        if target[i] == "yes":
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    specific_h[x] = '?'
                    general_h[x][x] = '?'
        # Negative examples specialize general_h
        if target[i] == "no":
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    general_h[x][x] = specific_h[x]
                else:
                    general_h[x][x] = '?'
        print("Specific Hypothesis after step", i + 1)
        print(specific_h)
        print("General Hypothesis after step", i + 1)
        print(general_h)
    # Drop the fully general rows left over in general_h
    indices = [i for i, val in enumerate(general_h) if val == ['?', '?', '?', '?', '?', '?']]
    for i in indices:
        general_h.remove(['?', '?', '?', '?', '?', '?'])
    return specific_h, general_h

s_final, g_final = learn(concepts, target)
print("Final Specific_h:", s_final, sep="\n")
print("Final General_h:", g_final, sep="\n")
OUTPUT:
[['sunny' 'warm' 'normal' 'strong' 'warm' 'same']
 ['sunny' 'warm' 'high' 'strong' 'warm' 'same']
 ['rainy' 'cold' 'high' 'strong' 'warm' 'change']
 ['sunny' 'warm' 'high' 'strong' 'cool' 'change']]
['yes' 'yes' 'no' 'yes']
Specific Hypothesis after step 1
['sunny' 'warm' 'normal' 'strong' 'warm' 'same']
General Hypothesis after step 1
[['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'],
 ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?']]
General Hypothesis after step 2
[['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'],
 ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?']]
Specific Hypothesis after step 4
['sunny' 'warm' '?' 'strong' '?' '?']
General Hypothesis after step 4
[['sunny', '?', '?', '?', '?', '?'], ['?', 'warm', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'],
 ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?']]
Final Specific_h:
['sunny' 'warm' '?' 'strong' '?' '?']
Final General_h:
[['sunny', '?', '?', '?', '?', '?'], ['?', 'warm', '?', '?', '?', '?']]
Experiment-3
AIM: Write a program to demonstrate the working of the decision tree based ID3 algorithm. Use an appropriate data set for building the decision tree and apply this knowledge to classify a new sample.
Dataset:
outlook temp humidity windy pt
sunny hot high weak no
sunny hot high strong no
overcast hot high weak yes
rainy mild high weak yes
rainy cool normal weak yes
rainy cool normal strong no
overcast cool normal strong yes
sunny mild high weak no
sunny cool normal weak yes
rainy mild normal weak yes
sunny mild normal strong yes
overcast mild high strong yes
overcast hot normal weak yes
rainy mild high strong no
PROGRAM:
import numpy as np
import math
import csv

def read_data(filename):
    with open(filename, 'r') as csvfile:
        datareader = csv.reader(csvfile, delimiter=',')
        headers = next(datareader)
        metadata = []
        traindata = []
        for name in headers:
            metadata.append(name)
        for row in datareader:
            traindata.append(row)
    return (metadata, traindata)

class Node:
    def __init__(self, attribute):
        self.attribute = attribute
        self.children = []
        self.answer = ""

    def __str__(self):
        return self.attribute

def subtables(data, col, delete):
    dict = {}
    items = np.unique(data[:, col])
    count = np.zeros((items.shape[0], 1), dtype=np.int32)
    for x in range(items.shape[0]):
        for y in range(data.shape[0]):
            if data[y, col] == items[x]:
                count[x] += 1
    for x in range(items.shape[0]):
        dict[items[x]] = np.empty((int(count[x]), data.shape[1]), dtype="|S32")
        pos = 0
        for y in range(data.shape[0]):
            if data[y, col] == items[x]:
                dict[items[x]][pos] = data[y]
                pos += 1
        if delete:
            dict[items[x]] = np.delete(dict[items[x]], col, 1)
    return items, dict

def entropy(S):
    items = np.unique(S)
    if items.size == 1:
        return 0
    counts = np.zeros((items.shape[0], 1))
    sums = 0
    for x in range(items.shape[0]):
        counts[x] = sum(S == items[x]) / (S.size * 1.0)
    for count in counts:
        sums += -1 * count * math.log(count, 2)
    return sums

def gain_ratio(data, col):
    items, dict = subtables(data, col, delete=False)
    total_size = data.shape[0]
    entropies = np.zeros((items.shape[0], 1))
    intrinsic = np.zeros((items.shape[0], 1))
    for x in range(items.shape[0]):
        ratio = dict[items[x]].shape[0] / (total_size * 1.0)
        entropies[x] = ratio * entropy(dict[items[x]][:, -1])
        intrinsic[x] = ratio * math.log(ratio, 2)
    total_entropy = entropy(data[:, -1])
    iv = -1 * sum(intrinsic)
    for x in range(entropies.shape[0]):
        total_entropy -= entropies[x]
    return total_entropy / iv

def create_node(data, metadata):
    # If all examples share one label, return a leaf node
    if (np.unique(data[:, -1])).shape[0] == 1:
        node = Node("")
        node.answer = np.unique(data[:, -1])[0]
        return node
    # Otherwise split on the attribute with the highest gain ratio
    gains = np.zeros((data.shape[1] - 1, 1))
    for col in range(data.shape[1] - 1):
        gains[col] = gain_ratio(data, col)
    split = np.argmax(gains)
    node = Node(metadata[split])
    metadata = np.delete(metadata, split, 0)
    items, dict = subtables(data, split, delete=True)
    for x in range(items.shape[0]):
        child = create_node(dict[items[x]], metadata)
        node.children.append((items[x], child))
    return node

def empty(size):
    # Indentation string: three spaces per tree level
    s = ""
    for x in range(size):
        s += "   "
    return s

def print_tree(node, level):
    if node.answer != "":
        print(empty(level), node.answer)
        return
    print(empty(level), node.attribute)
    for value, n in node.children:
        print(empty(level + 1), value)
        print_tree(n, level + 2)

metadata, traindata = read_data("ID3.csv")
data = np.array(traindata)
node = create_node(data, metadata)
print_tree(node, 0)
OUTPUT:
outlook
   overcast
      b'yes'
   rainy
      windy
         b'strong'
            b'no'
         b'weak'
            b'yes'
   sunny
      humidity
         b'high'
            b'no'
         b'normal'
            b'yes'
Experiment-4
AIM: Exercises to solve the real-world problems using the following machine learning methods:
a) Linear Regression
b) Logistic Regression
c) Binary Classifier
a) Linear Regression:
Linear regression is probably one of the most important and widely used regression techniques. It is among the simplest regression methods, and one of its main advantages is the ease of interpreting results.
PROGRAM:
from sklearn import datasets
from sklearn import metrics
import numpy as np

disease = datasets.load_diabetes()
print(disease.keys())
# Use a single feature (column 2) for simple one-dimensional regression
disease_X = disease.data[:, np.newaxis, 2]
disease_X_train = disease_X[:-30]
disease_X_test = disease_X[-20:]
disease_Y_train = disease.target[:-30]
disease_Y_test = disease.target[-20:]

from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit(disease_X_train, disease_Y_train)
y_predict = reg.predict(disease_X_test)
accuracy = metrics.mean_squared_error(disease_Y_test, y_predict)
print("accuracy=", accuracy)
weights = reg.coef_
intercept = reg.intercept_
print(weights, intercept)

import matplotlib.pyplot as plt
plt.scatter(disease_X_test, disease_Y_test)
plt.plot(disease_X_test, y_predict)
plt.show()
OUTPUT:
dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])
accuracy= 2561.3204277283867
[941.43097333] 153.39713623331698
b) Logistic Regression:
It is a classification algorithm that is used where the target variable is of a categorical nature. The main objective behind logistic regression is to determine the relationship between the features and the probability of a particular outcome.
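As a toy illustration of that link (the weights below are made up for illustration, not learned from the dataset that follows), the model turns a weighted sum of the features into a probability through the sigmoid function:

import numpy as np

# Hypothetical weights and bias for two standardized features
# (e.g. Age and EstimatedSalary); illustrative values only.
w = np.array([0.8, 0.5])
b = -0.2

def predict_proba(x):
    # P(y = 1 | x) = sigmoid(w . x + b)
    z = np.dot(w, x) + b
    return 1.0 / (1.0 + np.exp(-z))

print(predict_proba(np.array([1.2, -0.4])))  # about 0.64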
Dataset:
UserID Gender Age EstimatedSalary Purchased
15624510 Male 19 19000 0
15810944 Male 25 20000 0
15668575 Female 26 43000 0
15603246 Female 27 57000 0
15804002 Male 19 76000 0
15728773 Male 27 58000 0
15598044 Female 27 84000 0
15694829 Female 32 150000 1
15600575 Male 25 33000 0
15727311 Female 35 65000 0
15570769 Female 26 80000 0
15606274 Female 26 52000 0
15746139 Male 20 86000 0
15704987 Male 32 18000 0
15628972 Male 18 82000 0
15697686 Male 29 80000 0
15733883 Male 47 25000 1
15617482 Male 45 26000 1
15704583 Male 46 28000 1
PROGRAM:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

dataset = pd.read_csv("User_Data.csv")
x = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25, random_state=0)

from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
xtrain = sc_x.fit_transform(xtrain)
xtest = sc_x.transform(xtest)
print(xtrain[0:10, :])

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(xtrain, ytrain)
y_pred = classifier.predict(xtest)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(ytest, y_pred)
print("Confusion Matrix:\n", cm)
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(ytest, y_pred))

# Visualize the decision boundary on the test set
from matplotlib.colors import ListedColormap
X_set, y_set = xtest, ytest
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c=ListedColormap(('red', 'green'))(i), label=j)
plt.title('Classifier (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
OUTPUT:
[[ 2.149452   -1.02601437]
 [-0.28717375  0.70708966]
 [-1.26182405  0.4720925 ]
 [-0.40900504 -0.49727077]
 [-0.28717375 -0.0566511 ]
 [ 0.32198269 -1.23163688]
 [ 0.68747655  0.14897141]
 [ 0.32198269  2.6458162 ]
 [ 1.90578942 -0.99663973]
 [-0.40900504 -0.23289897]]
Confusion Matrix:
[[4 0]
 [0 1]]
Accuracy: 1.0
c) Binary Classifier:
In machine learning, binary classification is a supervised learning task that categorizes new observations into one of two classes. If the model successfully predicts a patient as positive, the case is called a True Positive (TP). If the model successfully predicts a patient as negative, it is called a True Negative (TN). The binary classifier may also misdiagnose some patients: if a diseased patient is classified as healthy by a negative test result, the error is called a False Negative (FN); similarly, if a healthy patient is classified as diseased by a positive test result, the error is called a False Positive (FP).
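A minimal sketch of those four counts, on hypothetical label arrays (1 = diseased, 0 = healthy) rather than the data generated by the program below:

import numpy as np

# Hypothetical ground truth and model predictions (1 = diseased, 0 = healthy)
y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])

tp = np.sum((y_true == 1) & (y_pred == 1))  # diseased, predicted diseased
tn = np.sum((y_true == 0) & (y_pred == 0))  # healthy, predicted healthy
fn = np.sum((y_true == 1) & (y_pred == 0))  # diseased, predicted healthy
fp = np.sum((y_true == 0) & (y_pred == 1))  # healthy, predicted diseased
print("TP:", tp, "TN:", tn, "FN:", fn, "FP:", fp)  # TP: 3 TN: 3 FN: 1 FP: 1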
PROGRAM:
from numpy import where
from collections import Counter
from sklearn.datasets import make_blobs
from matplotlib import pyplot

# Generate a two-class synthetic dataset
X, y = make_blobs(n_samples=1000, centers=2, random_state=1)
print(X.shape, y.shape)
counter = Counter(y)
print(counter)
for i in range(10):
    print(X[i], y[i])
# Scatter plot of the samples, colored by class label
for label, _ in counter.items():
    row_ix = where(y == label)[0]
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(label))
pyplot.legend()
pyplot.show()
OUTPUT:
(1000, 2) (1000,)
Counter({0: 500, 1: 500})
[-3.05837272  4.48825769] 0
[-8.60973869 -3.72714879] 1
[ 1.37129721  5.23107449] 0
[-9.33917563 -2.9544469 ] 1
[-11.57178593  -3.85275513] 1
[-11.42257341  -4.85679127] 1
[-10.44518578  -3.76476563] 1
[-10.44603561  -3.26065964] 1
[-0.61947075  3.48804983] 0
[-10.91115591  -4.5772537 ] 1
Experiment-5
AIM: Develop a program for Bias, Variance, Remove duplicates, Cross Validation.
PROGRAM:
from mlxtend.evaluate import bias_variance_decomp
import warnings
warnings.filterwarnings('ignore')
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
model_lr = LinearRegression()
# Decompose the expected loss of the model into bias and variance
mse, bias, var = bias_variance_decomp(model_lr, X_train, y_train, X_test, y_test,
                                      loss='mse', num_rounds=200, random_seed=123)
model_lr.fit(X_train, y_train)
y_pred = model_lr.predict(X_test)
print('MSE from bias_variance lib [avg expected loss]: %.3f' % mse)
print('Avg Bias: %.3f' % bias)
print('Avg Variance: %.3f' % var)
print('Mean Square error by Scikit-learn lib: %.3f' % metrics.mean_squared_error(y_test, y_pred))
OUTPUT:
MSE from bias_variance lib [avg expected loss]: 0.527
Avg Bias: 0.525
Avg Variance: 0.002
Mean Square error by Scikit-learn lib: 0.527
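The listing above covers only the bias and variance part of the AIM. A minimal sketch of the remaining tasks, duplicate removal and cross validation, assuming the same California-housing data (the source program omits these steps):

import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

X, y = fetch_california_housing(return_X_y=True)

# Remove duplicates: drop repeated rows, keeping the first occurrence
df = pd.DataFrame(X)
df['target'] = y
df = df.drop_duplicates()
X_clean = df.drop('target', axis=1).values
y_clean = df['target'].values

# 5-fold cross validation: average R^2 of linear regression across folds
scores = cross_val_score(LinearRegression(), X_clean, y_clean, cv=5)
print("Cross-validation scores:", scores)
print("Mean CV score: %.3f" % scores.mean())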
Experiment-6
AIM: Write a program to implement Categorical Encoding, One-hot Encoding.
Dataset:
Sno Empid Gender Remarks
0 45 male Nice
1 78 female Good
2 56 female Great
3 12 male Great
4 7 female Nice
PROGRAM:
import pandas as pd
from numpy import asarray
from sklearn.preprocessing import OneHotEncoder

data = pd.read_csv("emp.csv")
print(data.head())
print(data['Gender'].unique())
print(data['Remarks'].unique())
print(data['Gender'].value_counts())
print(data['Remarks'].value_counts())

encoder = OneHotEncoder(sparse=False)
onehot = encoder.fit_transform(data)
print(onehot)
OUTPUT:
   Sno  Empid  Gender Remarks
0    0     45    male    Nice
1    1     78  female    Good
2    2     56  female   Great
3    3     12    male   Great
4    4      7  female    Nice
['male' 'female']
['Nice' 'Good' 'Great']
female    3
male      2
Name: Gender, dtype: int64
Nice     2
Great    2
Good     1
Name: Remarks, dtype: int64
[[1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0.]
 [0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1.]]
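Note that the encoder above one-hot encodes every column, including the numeric Sno and Empid, which is why the output matrix has 15 columns. A sketch that encodes only the categorical columns, using pandas' get_dummies as an alternative to OneHotEncoder (assuming the same emp.csv):

import pandas as pd

# Hypothetical: same emp.csv as above; encode only the categorical columns
data = pd.read_csv("emp.csv")
onehot = pd.get_dummies(data, columns=['Gender', 'Remarks'])
print(onehot.head())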
Experiment-7
AIM: Build an Artificial Neural Network by implementing the Backpropagation algorithm and test the same using appropriate data sets.
PROGRAM:
import numpy as np

X = np.array(([2, 9], [1, 5], [3, 6]), dtype=float)
y = np.array(([92], [86], [89]), dtype=float)
X = X / np.amax(X, axis=0)   # normalize inputs column-wise
y = y / 100                  # normalize outputs to [0, 1]

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def derivatives_sigmoid(x):
    return x * (1 - x)

epoch = 1
lr = 0.1
inputlayer_neurons = 2
hiddenlayer_neurons = 3
output_neurons = 1

wh = np.random.uniform(size=(inputlayer_neurons, hiddenlayer_neurons))
bh = np.random.uniform(size=(1, hiddenlayer_neurons))
wout = np.random.uniform(size=(hiddenlayer_neurons, output_neurons))
bout = np.random.uniform(size=(1, output_neurons))

for i in range(epoch):
    # Forward pass
    hinp1 = np.dot(X, wh)
    hinp = hinp1 + bh
    hlayer_act = sigmoid(hinp)
    outinp1 = np.dot(hlayer_act, wout)
    outinp = outinp1 + bout
    output = sigmoid(outinp)
    # Backpropagation of the error
    EO = y - output
    outgrad = derivatives_sigmoid(output)
    d_output = EO * outgrad
    EH = d_output.dot(wout.T)
    hiddengrad = derivatives_sigmoid(hlayer_act)
    d_hiddenlayer = EH * hiddengrad
    # Weight and bias updates
    wout += hlayer_act.T.dot(d_output) * lr
    bout += np.sum(d_output, axis=0, keepdims=True) * lr
    wh += X.T.dot(d_hiddenlayer) * lr
    bh += np.sum(d_hiddenlayer, axis=0, keepdims=True) * lr

print("Input:\n", X)
print("Actual Output:\n", y)
print("Predicted Output:\n", output)
OUTPUT:
Input:
[[0.66666667 1.        ]
 [0.33333333 0.55555556]
 [1.         0.66666667]]
Actual Output:
[[0.92]
 [0.86]
 [0.89]]
Predicted Output:
[[0.75910827]
 [0.75067151]
 [0.76258194]]
Exercise-8
AIM: Write a program to implement the k-Nearest Neighbor algorithm to classify the iris data set. Print both correct and wrong predictions.
PROGRAM:
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()
iris.keys()
df = pd.DataFrame(iris['data'])
print(df)
print(iris['target_names'])
X = df
y = iris['target']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

import numpy as np
import warnings
warnings.filterwarnings('ignore')
# Classify a new, unseen sample
x_new = np.array([[5, 2.9, 1, 0.2]])
prediction = knn.predict(x_new)
iris['target_names'][prediction]

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
y_pred = knn.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print("correct prediction", accuracy_score(y_test, y_pred))
print("wrong prediction", (1 - accuracy_score(y_test, y_pred)))
OUTPUT:
       0    1    2    3
146  6.3  2.5  5.0  1.9
147  6.5  3.0  5.2  2.0
148  6.2  3.4  5.4  2.3
149  5.9  3.0  5.1  1.8
['setosa' 'versicolor' 'virginica']
[[19  0  0]
 [ 0 15  0]
 [ 0  1 15]]
correct prediction 0.98
wrong prediction 0.020000000000000018
Experiment-9
AIM: Implement the non-parametric Locally Weighted Regression algorithm in order to fit data points. Select an appropriate data set for your experiment and draw graphs.
PROGRAM:
from math import ceil
import numpy as np
from scipy import linalg

def lowess(x, y, f, iterations):
    n = len(x)
    r = int(ceil(f * n))
    # Tricube weights based on distance to the r-th nearest neighbor
    h = [np.sort(np.abs(x - x[i]))[r] for i in range(n)]
    w = np.clip(np.abs((x[:, None] - x[None, :]) / h), 0.0, 1.0)
    w = (1 - w ** 3) ** 3
    yest = np.zeros(n)
    delta = np.ones(n)
    for iteration in range(iterations):
        for i in range(n):
            weights = delta * w[:, i]
            # Solve the local weighted least-squares system for a line fit
            b = np.array([np.sum(weights * y), np.sum(weights * y * x)])
            A = np.array([[np.sum(weights), np.sum(weights * x)],
                          [np.sum(weights * x), np.sum(weights * x * x)]])
            beta = linalg.solve(A, b)
            yest[i] = beta[0] + beta[1] * x[i]
        # Robustifying step: down-weight points with large residuals
        residuals = y - yest
        s = np.median(np.abs(residuals))
        delta = np.clip(residuals / (6.0 * s), -1, 1)
        delta = (1 - delta ** 2) ** 2
    return yest

import math
n = 100
x = np.linspace(0, 2 * math.pi, n)
y = np.sin(x) + 0.3 * np.random.randn(n)
f = 0.25
iterations = 10
yest = lowess(x, y, f, iterations)

import matplotlib.pyplot as plt
plt.plot(x, y, "r.")
plt.plot(x, yest, "b-")
OUTPUT:
[<matplotlib.lines.Line2D at 0x1755ac36cd0>]
Exercise-10
AIM: Assuming a set of documents that need to be classified, use the naïve Bayesian
Classifier model to perform this task. Built-in Java classes/API can be used to write the
program. Calculate the accuracy, precision, and recall for your data set.
Dataset:
I love this sandwich    pos
This is an amazing place    pos
I feel very good about these beers    pos
This is my best work    pos
What an awesome view    pos
I do not like this restaurant    neg
I am tired of this stuff    neg
I can't deal with this    neg
He is my sworn enemy    neg
My boss is horrible    neg
This is an awesome place    pos
PROGRAM:
import pandas as pd
msg = pd.read_csv('ex10.csv', names=['message', 'label'])
print("Total Instances of Dataset:", msg.shape[0])
# Map the text labels to numbers: pos -> 1, neg -> 0
msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0})
import warnings
warnings.filterwarnings("ignore")
X = msg.message
y = msg.labelnum

from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)
from sklearn.feature_extraction.text import CountVectorizer
count_v = CountVectorizer()
Xtrain_dm = count_v.fit_transform(Xtrain)
Xtest_dm = count_v.transform(Xtest)
df = pd.DataFrame(Xtrain_dm.toarray(), columns=count_v.get_feature_names())
print(df[0:5])

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(Xtrain_dm, ytrain)
pred = clf.predict(Xtest_dm)
for doc, p in zip(Xtest, pred):
    p = 'pos' if p == 1 else 'neg'
    print("%s -> %s" % (doc, p))

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
print('Accuracy Metrics: \n')
print('Accuracy:', accuracy_score(ytest, pred))
print('Recall:', recall_score(ytest, pred))
print('Precision:', precision_score(ytest, pred))
print('Confusion Matrix:\n', confusion_matrix(ytest, pred))
OUTPUT:
Total Instances of Dataset: 11
   about  am  an  awesome  beers  best  boss  can  deal  do  ...  restaurant  \
0      1   0   0        0      1     0     0    0     0   0  ...           0
1      0   0   0        0      0     1     0    0     0   0  ...           0
2      0   0   0        0      0     0     0    1     1   0  ...           0
3      0   0   1        1      0     0     0    0     0   0  ...           0
4      0   0   1        1      0     0     0    0     0   0  ...           0

   stuff  these  this  tired  very  view  what  with  work
0      0      1     0      0     1     0     0     0     0
1      0      0     1      0     0     0     0     0     1
2      0      0     1      0     0     0     0     1     0
3      0      0     0      0     0     1     1     0     0
4      0      0     1      0     0     0     0     0     0

[5 rows x 29 columns]
I feel very good about these beers -> pos
This
I can't deal with this -> pos
Accuracy Metrics:

Accuracy: 0.3333333333333333
Recall: 0.5
Precision: 0.5
Confusion Matrix:
[[0 1]
 [1 1]]
Exercise-11
AIM: Apply EM algorithm to cluster a Heart Disease Data Set. Use the same data set for clustering using k-Means algorithm. Compare the results of these two algorithms and comment on the quality of clustering. You can add Java/Python ML library classes/API in the program.
PROGRAM:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np

iris = datasets.load_iris()
X = pd.DataFrame(iris.data)
X.columns = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']
y = pd.DataFrame(iris.target)
y.columns = ['Targets']

model = KMeans(n_clusters=3)
model.fit(X)

plt.figure(figsize=(14, 7))
colormap = np.array(['red', 'lime', 'black'])

# Plot 1: the real class labels
plt.subplot(1, 3, 1)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y.Targets], s=40)
plt.title('Real Clusters')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

# Plot 2: K-Means cluster assignments
plt.subplot(1, 3, 2)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[model.labels_], s=40)
plt.title('K-Means Clustering')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

# Standardize the features before fitting the Gaussian mixture
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
scaler.fit(X)
xsa = scaler.transform(X)
xs = pd.DataFrame(xsa, columns=X.columns)

# Plot 3: GMM (EM algorithm) cluster assignments
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=3)
gmm.fit(xs)
y_cluster_gmm = gmm.predict(xs)
plt.subplot(1, 3, 3)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y_cluster_gmm], s=40)
plt.title('GMM Clustering')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

print('Observation: The GMM using EM algorithm based clustering matched the true labels more closely than the Kmeans.')
OUTPUT:
Observation: The GMM using EM algorithm based clustering matched the true labels more closely than the Kmeans.
Exercise-12
AIM: Exploratory Data Analysis for Classification using Pandas or Matplotlib.
PROGRAM:
import pandas as pd
import matplotlib.pyplot as pp

df = pd.DataFrame({"nums": list(range(1, 8)),
                   "sqrs": [i**2 for i in range(1, 8)]})
print("The data is :\n", df)
print("\nCalling head:\n", df.head())
print("\nCalling head with 'n':\n", df.head(3))
print("\nCalling tail:\n", df.tail())
print("\nCalling tail with 'n':\n", df.tail(3))
print("\nGetting the description of our dataset:\n", df.describe())
print("\nGetting all correlations of our dataset:\n", df.corr())
# Histogram of each column
for i in df:
    print(f"\nFor {i}:\n")
    df[i].hist()
    pp.show()
OUTPUT:
The data is :
    nums  sqrs
0      1     1
1      2     4
2      3     9
3      4    16
4      5    25
5      6    36
6      7    49

Calling head:
    nums  sqrs
0      1     1
1      2     4
2      3     9
3      4    16
4      5    25

Calling head with 'n':
    nums  sqrs
0      1     1
1      2     4
2      3     9

Calling tail:
    nums  sqrs
2      3     9
3      4    16
4      5    25
5      6    36
6      7    49

Calling tail with 'n':
    nums  sqrs
4      5    25
5      6    36
6      7    49

Getting the description of our dataset:
           nums       sqrs
count  7.000000   7.000000
mean   4.000000  20.000000
std    2.160247  17.682383
min    1.000000   1.000000
25%    2.500000   6.500000
50%    4.000000  16.000000
75%    5.500000  30.500000
max    7.000000  49.000000

Getting all correlations of our dataset:
          nums      sqrs
nums  1.000000  0.977356
sqrs  0.977356  1.000000

For nums:

For sqrs:
Exercise-13
AIM: Write a Python program to construct a Bayesian network considering medical data. Use this model to demonstrate the diagnosis of heart patients using standard Heart Disease Data Set.
Dataset:
age Gender Family diet Lifestyle cholestrol heartdisease
0 0 1 1 3 0 1
0 1 1 1 3 0 1
1 0 0 0 2 1 1
4 0 1 1 3 2 0
3 1 1 0 0 2 0
2 0 1 1 1 0 1
4 0 1 0 2 0 1
0 0 1 1 3 0 1
3 1 1 0 0 2 0
1 1 0 0 0 2 1
4 1 0 1 2 0 1
4 0 1 1 3 2 0
2 1 0 0 0 0 0
2 0 1 1 1 0 1
3 1 1 0 0 1 0
0 0 1 0 0 2 1
1 1 0 1 2 1 1
3 1 1 1 0 1 0
4 0 1 1 3 2 0
PROGRAM:
import pandas as pd
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.models import BayesianNetwork
from pgmpy.inference import VariableElimination

data = pd.read_csv("heart.csv")
heart_disease = pd.DataFrame(data)
print(heart_disease)

# Network structure: edges from causes to effects
model = BayesianNetwork([('age', 'Lifestyle'), ('Gender', 'Lifestyle'),
                         ('Family', 'heartdisease'), ('diet', 'cholestrol'),
                         ('Lifestyle', 'diet'), ('cholestrol', 'heartdisease')])
model.fit(heart_disease, estimator=MaximumLikelihoodEstimator)
HeartDisease_infer = VariableElimination(model)

print('For Age enter SuperSeniorCitizen:0, SeniorCitizen:1, MiddleAged:2, Youth:3, Teen:4')
print('For Gender enter Male:0, Female:1')
print('For Family History enter Yes:1, No:0')
print('For Diet enter High:0, Medium:1')
print('for LifeStyle enter Athlete:0, Active:1, Moderate:2, Sedentary:3')
print('for Cholesterol enter High:0, BorderLine:1, Normal:2')

q = HeartDisease_infer.query(variables=['heartdisease'], evidence={
    'age': int(input('Enter Age: ')),
    'Gender': int(input('Enter Gender: ')),
    'Family': int(input('Enter Family History: ')),
    'diet': int(input('Enter Diet: ')),
    'Lifestyle': int(input('Enter Lifestyle: ')),
    'cholestrol': int(input('Enter Cholestrol: '))})
print(q)
OUTPUT:
    age  Gender  Family  diet  Lifestyle  cholestrol  heartdisease
0     0       0       1     1          3           0             1
1     0       1       1     1          3           0             1
2     1       0       0     0          2           1             1
3     4       0       1     1          3           2             0
4     3       1       1     0          0           2             0
5     2       0       1     1          1           0             1
6     4       0       1     0          2           0             1
7     0       0       1     1          3           0             1
8     3       1       1     0          0           2             0
9     1       1       0     0          0           2             1
10    4       1       0     1          2           0             1
11    4       0       1     1          3           2             0
12    2       1       0     0          0           0             0
13    2       0       1     1          1           0             1
14    3       1       1     0          0           1             0
15    0       0       1     0          0           2             1
16    1       1       0     1          2           1             1
17    3       1       1     1          0           1             0
18    4       0       1     1          3           2             0
For Age enter SuperSeniorCitizen:0, SeniorCitizen:1, MiddleAged:2, Youth:3, Teen:4
For Gender enter Male:0, Female:1
For Family History enter Yes:1, No:0
For Diet enter High:0, Medium:1
for LifeStyle enter Athlete:0, Active:1, Moderate:2, Sedentary:3
for Cholesterol enter High:0, BorderLine:1, Normal:2
Enter Age: 2
Enter Gender: 1
Enter Family History: 1
Enter Diet: 0
Enter Lifestyle: 2
Enter Cholestrol: 2
+------------------+---------------------+
| heartdisease     |   phi(heartdisease) |
+==================+=====================+
| heartdisease(0)  |              0.8333 |
+------------------+---------------------+
| heartdisease(1)  |              0.1667 |
+------------------+---------------------+
Experiment-14
AIM: Write a program to Implement Support Vector Machines and Principal Component Analysis.
PROGRAM:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

iris = pd.read_csv(r'iris.csv', header=0)
print(iris.columns)
print(iris.shape)
iris['Species'] = iris['Species'].astype('category')
X = iris.drop('Species', axis=1)
y = iris['Species'].cat.codes   # numeric category codes as class labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reduce the features to two principal components
n_components = 2
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Train a linear SVM on the PCA-transformed data
svm = SVC(kernel='linear', C=1.0)
svm.fit(X_train_pca, y_train)
y_pred = svm.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of SVM with PCA:", accuracy)

plt.figure(figsize=(8, 6))
plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1],
            c=y_train, cmap=plt.cm.Paired, marker='o')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')

# Plot the decision boundary over a mesh grid
h = .02
x_min, x_max = X_train_pca[:, 0].min() - 1, X_train_pca[:, 0].max() + 1
y_min, y_max = X_train_pca[:, 1].min() - 1, X_train_pca[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = svm.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
plt.title('SVM Decision Boundary with PCA')
plt.show()
OUTPUT:
Index(['sno', 'Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm',
       'PetalWidthCm', 'Species'], dtype='object')
(150, 7)
Accuracy of SVM with PCA: 0.9666666666666667
Exercise-15
AIM: Write a program to Implement Principal Component Analysis.
PROGRAM:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
df = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])
df.head()

# Standardize the features before PCA
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
scalar.fit(df)
scaled_data = scalar.transform(df)

# Project the 30-dimensional data onto the first two principal components
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)

plt.scatter(x_pca[:, 0], x_pca[:, 1], c=cancer['target'], cmap='plasma')
plt.ylabel('Second Principal Component')
OUTPUT:
Text(0, 0.5, 'Second Principal Component')