Naive Bayes Project
Naive Bayes Project
ipynb - Colaboratory
Imports
1 import numpy as np
2 import pandas as pd
3 import scipy.stats as stats
4 import matplotlib.pyplot as plt
5 import seaborn as sns
6 from sklearn.metrics import confusion_matrix,f1_score
7
8 sns.set(style="whitegrid")
1 import numpy as np
2 import pandas as pd
3 import os
4 from keras.datasets import mnist
5 import matplotlib.pyplot as plt
6 import seaborn as sns
7 from sklearn.preprocessing import scale
8 from sklearn.model_selection import train_test_split
9 from sklearn import datasets
class NaiveBayes():
    """Gaussian naive Bayes classifier for numeric features.

    Each feature is modelled per class as an independent normal
    distribution.  ``fit`` estimates the class priors and the per-class
    feature means/variances; ``predict`` returns, for every row, the class
    with the largest log-posterior.

    Parameters
    ----------
    var_smoothing : float, default 1000
        Constant added to every estimated variance so that features that
        are constant within a class do not produce a zero variance.
    """

    def __init__(self, var_smoothing=1000):
        self.var_smoothing = var_smoothing

    @staticmethod
    def _group_by_class(features, target):
        """Group the rows of *features* by the labels in *target*.

        The labels are passed as a plain array so that grouping is
        positional (no index alignment) and the groups come out in sorted
        label order, the same order ``np.unique`` gives ``self.classes``.
        """
        return pd.DataFrame(features).groupby(np.asarray(target))

    def calc_prior(self, features, target):
        """Estimate P(class) as the fraction of rows carrying each label.

        Stores the result in ``self.prior`` (one entry per class, in sorted
        label order) and returns it.
        """
        class_sizes = self._group_by_class(features, target).size()
        self.prior = (class_sizes / len(features)).to_numpy()
        return self.prior

    def calc_statistics(self, features, target):
        """Estimate the per-class mean and variance of every feature.

        Stores and returns ``(self.mean, self.var)``, both of shape
        ``(n_classes, n_features)``.  The variance is the population
        variance (``ddof=0``) plus ``self.var_smoothing``.
        """
        grouped = self._group_by_class(features, target)
        # Explicit GroupBy reductions give one value per (class, feature)
        # and do not depend on how np.mean / np.var are dispatched on a
        # DataFrame group, which varies between pandas versions.
        self.mean = grouped.mean().to_numpy()
        self.var = grouped.var(ddof=0).to_numpy() + self.var_smoothing
        return self.mean, self.var

    def calculate_likelihood_gaussian(self, class_idx, x):
        """Return the Gaussian density of each feature of *x* for a class.

        ``class_idx`` is the row index into ``self.mean`` / ``self.var``.
        """
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        # N(x; mean, var) = exp(-(x - mean)^2 / (2 var)) / sqrt(2 pi var)
        return np.exp(-((x - mean) ** 2) / (2 * var)) / np.sqrt(2 * np.pi * var)

    def _log_likelihood_gaussian(self, class_idx, x):
        """Return the log of the Gaussian density, computed in log space.

        Working in log space avoids ``log(0)`` when the density of a
        far-away sample underflows to zero.
        """
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        return -0.5 * np.log(2 * np.pi * var) - ((x - mean) ** 2) / (2 * var)

    def calc_posterior(self, x):
        """Return the most probable class label for one sample *x*.

        The score of each class is ``log(prior) + sum(log(likelihood))``,
        i.e. the log-posterior up to a constant shared by all classes.
        """
        log_posteriors = [
            np.log(self.prior[i]) + np.sum(self._log_likelihood_gaussian(i, x))
            for i in range(self.Classes_count)
        ]
        return self.classes[np.argmax(log_posteriors)]

    def fit(self, features, target):
        """Learn class labels, priors and Gaussian parameters from data.

        ``features`` is a 2-D table (DataFrame or array) with one row per
        sample; ``target`` holds one label per row.
        """
        self.classes = np.unique(target)
        self.Classes_count = len(self.classes)
        self.feature_nums = features.shape[1]
        self.rows = features.shape[0]
        self.calc_statistics(features, target)
        self.calc_prior(features, target)

    def predict(self, features):
        """Return a list with the predicted class label of every row."""
        return [self.calc_posterior(row) for row in np.asarray(features)]

    def accuracy(self, y_test, y_pred):
        """Print and return the fraction of predictions equal to the truth."""
        # Compare as plain arrays so a pandas index on y_test cannot
        # influence the element-wise comparison.
        accuracy = np.sum(np.asarray(y_test) == np.asarray(y_pred)) / len(y_test)
        print(f'The accuracy is :{accuracy*100}%')
        return accuracy

    def visualize(self, y_true, y_pred, target):
        """Plot side-by-side count plots of true and predicted labels.

        ``target`` is accepted for backward compatibility and is not used.
        """
        fig, ax = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(15, 6))
        sns.countplot(x=y_true, ax=ax[0])
        sns.countplot(x=np.sort(y_pred), ax=ax[1])
        fig.suptitle('True vs Predicted Comparison', fontsize=20)
        ax[0].tick_params(labelsize=12)
        ax[1].tick_params(labelsize=12)
        ax[0].set_title("True values", fontsize=18)
        ax[1].set_title("Predicted values", fontsize=18)
        plt.show()

    def confusionMatrix(self, actual, predict):
        """Show the confusion matrix of actual vs. predicted labels.

        ``display`` is not imported in this file; it is expected to be
        supplied by the notebook environment.
        """
        cfm = pd.DataFrame(confusion_matrix(actual, predict))
        print('Testing Confusion Matrix: Actual vs. Prediction')
        display(cfm)
Load DataSet
1 from sklearn.datasets import fetch_openml
2 from sklearn.utils import check_random_state
3 from sklearn.model_selection import train_test_split
4 from sklearn.preprocessing import StandardScaler
5
6
def fetch_data(test_size=10000, randomize=False, standardize=False):
    """Download MNIST from OpenML and split it into train and test parts.

    Parameters
    ----------
    test_size : int or float, default 10000
        Forwarded to ``train_test_split``; the split is taken without
        shuffling, so the test part is the tail of the data.
    randomize : bool, default False
        When true, permute the samples with a fixed seed (0) before
        splitting.
    standardize : bool, default False
        When true, fit a ``StandardScaler`` on the training part and apply
        it to both parts (the scaled features are whatever the scaler
        returns, not the original container).

    Returns
    -------
    X_train, y_train, X_test, y_test
    """
    X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
    if randomize:
        random_state = check_random_state(0)
        permutation = random_state.permutation(X.shape[0])
        # fetch_openml may hand back pandas objects, where plain
        # ``X[permutation]`` would look up columns/labels instead of rows;
        # select rows by position in that case.
        X = X.iloc[permutation] if hasattr(X, 'iloc') else X[permutation]
        y = y.iloc[permutation] if hasattr(y, 'iloc') else y[permutation]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=False)
    if standardize:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
    return X_train, y_train, X_test, y_test
21
22
if __name__ == '__main__':
    # Fetch the train/test split using fetch_data's default arguments.
    train_image, train_label, test_image, test_label = fetch_data()

# Wrap every part in a DataFrame and squeeze it, so that single-column
# parts (the labels) collapse to a Series.
X_train, Y_train, X_test, Y_test = (
    pd.DataFrame(part).squeeze()
    for part in (train_image, train_label, test_image, test_label)
)
6
Testing Model
# Fit the hand-written classifier on the training split and label the
# test split.
x = NaiveBayes()
x.fit(X_train, Y_train)
predictions = x.predict(X_test)

# Report accuracy and the confusion matrix on the test split.
x.accuracy(Y_test, predictions)
x.confusionMatrix(Y_test, predictions)
https://colab.research.google.com/drive/12kQfNMiMLZ1-gDeUNOaNQogO479yX12H?usp=sharing#scrollTo=rTtSqFq_ll4g&printMode=true 2/5
6/26/23, 11:53 PM Copy of Naive_Bayes_Project2.ipynb - Colaboratory
0 852 2 0 4 2 6 46 1 41 26
1 0 1125 1 2 0 0 4 0 2 1
3 3 151 13 697 2 8 18 13 39 66
4 1 52 0 0 471 0 21 1 9 427
6 7 62 3 0 9 7 862 0 6 2
7 0 102 4 4 15 0 3 798 13 89
# average=None asks f1_score for one value per class instead of an average.
score = f1_score(Y_test, predictions, average=None)
print(f'f1 score is: {score}')
8 6 226 3 27 19 4 14 10 541 124
9 4 63 0 7 23 0 1 12 9 890
f1 score is: [0.90349947 0.70268582 0.68447205 0.72984293 0.60191693 0.56897896
0.82606612 0.84221636 0.59417902 0.64939803]
# Plot the true and predicted label counts of the custom classifier.
x.visualize(Y_test, predictions, 'label')

# compare to sklearn Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Fit scikit-learn's Gaussian naive Bayes on the same training split
# (the MNIST data loaded above, not iris).
clf = GaussianNB()
clf.fit(X_train, Y_train)
# Recorded cell output: GaussianNB()

# Predict once and score those predictions; clf.score would run the
# prediction a second time to compute the same accuracy.
preds = clf.predict(X_test)
score = accuracy_score(Y_test, preds)
print(f'the accuracy is: {score*100}%')
3
https://colab.research.google.com/drive/12kQfNMiMLZ1-gDeUNOaNQogO479yX12H?usp=sharing#scrollTo=rTtSqFq_ll4g&printMode=true 3/5
6/26/23, 11:53 PM Copy of Naive_Bayes_Project2.ipynb - Colaboratory
Categorical NB(Bonus)
class categorical_NB:
    """Categorical naive Bayes for integer-valued features.

    Every feature is treated as a categorical variable taking values in
    ``0 .. n_values - 1``.  ``fit`` estimates, per class, the probability of
    each value of each feature; prediction picks the class with the largest
    ``log(prior) + sum(log(P(feature value | class)))``.

    Parameters
    ----------
    alpha : float, default 1.0
        Additive (Laplace) smoothing added to every value count, so values
        never seen for a class keep a small non-zero probability instead of
        driving the class score to ``-inf``.  ``alpha=0`` gives the raw
        relative frequencies.
    n_values : int, default 256
        Number of distinct values a feature can take (0 to 255 by default).
    """

    def __init__(self, alpha=1.0, n_values=256):
        self.alpha = alpha
        self.n_values = n_values

    def calc_prior_value(self, X_train, Y_train):
        """Store the fraction of training rows per class in ``_priorsvalues``.

        Relies on ``self._uniqueclasses`` having been set (``fit`` does so).
        """
        labels = np.asarray(Y_train)
        total = float(np.asarray(X_train).shape[0])
        self._priorsvalues = np.array(
            [np.count_nonzero(labels == cls) / total for cls in self._uniqueclasses],
            dtype=np.float64,
        )

    def _class_Probability_matrix(self, X_train, Y_train, class_no):
        """Return P(feature = value | class) as an (n_features, n_values) array.

        Raises
        ------
        IndexError
            If a feature value of the selected rows lies outside
            ``0 .. n_values - 1``.
        """
        rows = np.asarray(X_train)[np.asarray(Y_train) == class_no].astype(np.intp)
        n_samples, n_features = rows.shape
        if rows.size and (rows.min() < 0 or rows.max() >= self.n_values):
            raise IndexError(
                f'feature values must lie in [0, {self.n_values - 1}]')
        # Give every feature its own block of n_values bins so that a single
        # bincount call tallies all features at once.
        offsets = np.arange(n_features) * self.n_values
        counts = np.bincount(
            (rows + offsets).ravel(), minlength=n_features * self.n_values
        ).reshape(n_features, self.n_values)
        return (counts + self.alpha) / (n_samples + self.alpha * self.n_values)

    def fit(self, X_train, Y_train):
        """Estimate class priors and per-class value probabilities.

        ``X_train`` is a 2-D array of integer-valued features (one row per
        sample); ``Y_train`` holds one label per row.
        """
        X_train = np.asarray(X_train)
        Y_train = np.asarray(Y_train)
        self._uniqueclasses = np.unique(Y_train)
        self.calc_prior_value(X_train, Y_train)

        # One (n_features, n_values) probability table per class; the number
        # of features is taken from the data.
        self._classes_matrices = np.zeros(
            (len(self._uniqueclasses), X_train.shape[1], self.n_values))
        for i, cls in enumerate(self._uniqueclasses):
            self._classes_matrices[i] = self._class_Probability_matrix(
                X_train, Y_train, cls)

    def predict_label(self, x):
        """Return the most probable class label for one sample *x*."""
        values = np.asarray(x).astype(np.intp)
        feature_idx = np.arange(values.shape[0])
        # probs[c, j] = P(feature j == values[j] | class c)
        probs = self._classes_matrices[:, feature_idx, values]
        # With alpha == 0 a probability can be exactly zero; log then gives
        # -inf, which correctly marks that class as impossible.
        with np.errstate(divide='ignore'):
            log_prob = np.log(probs).sum(axis=1) + np.log(self._priorsvalues)
        return self._uniqueclasses[np.argmax(log_prob)]

    def predict(self, X_test):
        """Return an array with the predicted label of every row of X_test."""
        return np.array([self.predict_label(row) for row in np.asarray(X_test)])

    def accuracy(self, y_test, y_pred):
        """Print and return the fraction of predictions equal to the truth."""
        accuracy = np.sum(np.asarray(y_test) == np.asarray(y_pred)) / len(y_test)
        print(f'The accuracy is :{accuracy*100}%')
        return accuracy

    def confusionMatrix(self, actual, predict):
        """Show the confusion matrix of actual vs. predicted labels.

        ``display`` is not imported in this file; it is expected to be
        supplied by the notebook environment.
        """
        cfm = pd.DataFrame(confusion_matrix(actual, predict))
        print('Testing Confusion Matrix: Actual vs. Prediction')
        display(cfm)
# Train the categorical classifier on plain NumPy copies of the training
# split, then label the test split.
model = categorical_NB()
model.fit(np.array(X_train), np.array(Y_train))
predictions = model.predict(np.array(X_test))
https://colab.research.google.com/drive/12kQfNMiMLZ1-gDeUNOaNQogO479yX12H?usp=sharing#scrollTo=rTtSqFq_ll4g&printMode=true 4/5
6/26/23, 11:53 PM Copy of Naive_Bayes_Project2.ipynb - Colaboratory
# Share of test samples whose predicted label equals the true label.
accuracy = np.sum(predictions == Y_test) / len(Y_test)
print(f'The accuracy is :{accuracy*100}%')

# One F1 value per class (average=None), then the confusion matrix.
score = f1_score(Y_test, predictions, average=None)
print(f'f1 score is: {score}')
model.confusionMatrix(Y_test, predictions)
0 880 0 17 17 2 20 8 0 36 0
2 492 0 443 36 2 16 5 2 36 0
3 337 0 88 460 5 52 3 9 44 12
4 338 0 27 31 374 77 5 4 61 65
5 342 0 17 72 14 344 9 6 85 3
6 370 1 54 8 3 14 491 0 17 0
7 358 0 2 36 27 13 0 478 22 92
8 286 0 40 67 32 91 1 7 434 16
9 263 1 1 18 89 30 0 57 48 502
https://colab.research.google.com/drive/12kQfNMiMLZ1-gDeUNOaNQogO479yX12H?usp=sharing#scrollTo=rTtSqFq_ll4g&printMode=true 5/5