
5. BUILD REGRESSION MODELS

PROGRAM :

from math import ceil

import numpy as np
from scipy import linalg

def lowess(x, y, f, iterations):
    """Locally weighted regression: fit y as a smooth function of x.

    f is the smoothing span (fraction of points used in each local fit)
    and iterations is the number of robustifying iterations.
    """
    n = len(x)
    r = int(ceil(f * n))
    # h[i] is the distance from x[i] to its r-th nearest neighbour
    h = [np.sort(np.abs(x - x[i]))[r] for i in range(n)]
    yest = np.zeros(n)
    delta = np.ones(n)
    for iteration in range(iterations):
        for i in range(n):
            # Tricube kernel on normalized distances, scaled by the
            # robustness weights from the previous iteration
            w = np.clip(np.abs((x - x[i]) / h[i]), 0.0, 1.0)
            weights = delta * (1 - w ** 3) ** 3
            # Weighted least squares for a local linear fit at x[i]
            b = np.array([np.sum(weights * y), np.sum(weights * y * x)])
            A = np.array([[np.sum(weights), np.sum(weights * x)],
                          [np.sum(weights * x), np.sum(weights * x * x)]])
            beta = linalg.solve(A, b)
            yest[i] = beta[0] + beta[1] * x[i]
        # Bisquare robustness weights: downweight large residuals
        residuals = y - yest
        s = np.median(np.abs(residuals))
        delta = np.clip(residuals / (6.0 * s), -1, 1)
        delta = (1 - delta ** 2) ** 2
    return yest

import math

import matplotlib.pyplot as plt

n = 100
x = np.linspace(0, 2 * math.pi, n)
y = np.sin(x) + 0.3 * np.random.randn(n)
f = 0.25
iterations = 3

yest = lowess(x, y, f, iterations)

plt.plot(x, y, 'r.', label='Original data')
plt.plot(x, yest, 'b-', label='LOWESS fit (f=0.25)')
plt.legend()
plt.show()
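As an optional sanity check, the fit can be compared against the reference LOWESS implementation shipped with statsmodels. This assumes the statsmodels package is installed; it is not required by the program above:

# Optional cross-check against statsmodels' LOWESS (assumed installed)
from statsmodels.nonparametric.smoothers_lowess import lowess as sm_lowess

smoothed = sm_lowess(y, x, frac=0.25, it=3)  # returns sorted (x, yest) pairs
plt.plot(x, y, 'r.', label='Original data')
plt.plot(smoothed[:, 0], smoothed[:, 1], 'g-', label='statsmodels LOWESS')
plt.legend()
plt.show()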

OUTPUT :

[Plot: original noisy data (red points) overlaid with the LOWESS fit (blue curve).]
6. BUILD DECISION TREES AND RANDOM FORESTS

PROGRAM :

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from matplotlib.colors import ListedColormap
from io import StringIO
from IPython.display import Image
import pydotplus

data = pd.read_csv('/Users/ganesh/PycharmProjects/DecisionTree/Social_Network_Ads.csv')
feature_cols = ['Age', 'EstimatedSalary']
x = data.iloc[:, [2, 3]].values
y = data.iloc[:, 4].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25,
                                                    random_state=0)

# Standardize features so both axes contribute on a comparable scale
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

classifier = DecisionTreeClassifier()
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print('Accuracy Score:', metrics.accuracy_score(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)

# Plot the decision regions on the test set
x_set, y_set = x_test, y_test
x1, x2 = np.meshgrid(np.arange(start=x_set[:, 0].min() - 1,
                               stop=x_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=x_set[:, 1].min() - 1,
                               stop=x_set[:, 1].max() + 1, step=0.01))
plt.contourf(x1, x2, classifier.predict(np.array([x1.ravel(),
             x2.ravel()]).T).reshape(x1.shape),
             alpha=0.75, cmap=ListedColormap(("red", "green")))
plt.xlim(x1.min(), x1.max())
plt.ylim(x2.min(), x2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],
                c=ListedColormap(("red", "green"))(i), label=j)
plt.title('Decision Tree (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

# Export the unpruned tree as an image
dot_data = StringIO()
export_graphviz(classifier, out_file=dot_data, filled=True, rounded=True,
                special_characters=True, feature_names=feature_cols,
                class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

# Prune the tree: Gini criterion with a depth limit of 3
classifier = DecisionTreeClassifier(criterion="gini", max_depth=3)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print("Accuracy (Optimized):", metrics.accuracy_score(y_test, y_pred))
dot_data = StringIO()
export_graphviz(classifier, out_file=dot_data, filled=True, rounded=True,
                special_characters=True, feature_names=feature_cols,
                class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
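The experiment title also calls for a random forest, but the original program stops at the decision tree. The following is a minimal sketch on the same standardized split; the n_estimators and random_state values are illustrative assumptions, not part of the original program:

from sklearn.ensemble import RandomForestClassifier

# Random forest: an ensemble of decision trees grown on bootstrap samples
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(x_train, y_train)
rf_pred = rf.predict(x_test)
print('Random Forest Accuracy:', metrics.accuracy_score(y_test, rf_pred))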

OUTPUT OF DECISION TREE WITHOUT PRUNING:

[Graphviz tree diagram: full-depth decision tree on Age and EstimatedSalary.]

OPTIMIZED OUTPUT OF DECISION TREE USING GINI INDEX (CART):

[Graphviz tree diagram: depth-3 tree grown with the Gini criterion.]
7. BUILD SVM MODELS.

PROGRAM :

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string

from nltk import word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn import model_selection
from sklearn import svm
from matplotlib import pyplot
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay
class data_read_write(object):
    """Thin wrapper around pandas CSV I/O used by the classes below."""
    def __init__(self, file_link=None):
        if file_link is not None:
            self.data_frame = pd.read_csv(file_link)

    def read_csv_file(self, file_link):
        # Read the CSV and keep it on the instance for later writing
        self.data_frame = pd.read_csv(file_link)
        return self.data_frame

    def write_to_csvfile(self, file_link):
        self.data_frame.to_csv(file_link, encoding='utf-8',
                               index=False, header=True)

class generate_word_cloud(data_read_write):
    def __init__(self):
        pass

    def variance_column(self, data):
        return np.var(data)

    def word_cloud(self, data_frame_column, output_image_file):
        # Join all messages into one string and render a word cloud
        text = " ".join(review for review in data_frame_column)
        stopwords = set(STOPWORDS)
        stopwords.update(["subject"])
        wordcloud = WordCloud(width=1200, height=800, stopwords=stopwords,
                              max_font_size=50, margin=0,
                              background_color="white").generate(text)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.savefig("Distribution.png")
        plt.show()
        wordcloud.to_file(output_image_file)

class data_cleaning(data_read_write):
    def __init__(self):
        pass

    def message_cleaning(self, message):
        # Drop punctuation characters, then drop English stop words
        Test_punc_removed = [char for char in message
                             if char not in string.punctuation]
        Test_punc_removed_join = ''.join(Test_punc_removed)
        Test_punc_removed_join_clean = [
            word for word in Test_punc_removed_join.split()
            if word.lower() not in stopwords.words('english')]
        final_join = ' '.join(Test_punc_removed_join_clean)
        return final_join

    def apply_to_column(self, data_column_text):
        data_processed = data_column_text.apply(self.message_cleaning)
        return data_processed
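# Quick illustration (hypothetical message): punctuation and English stop
# words are removed, e.g.
#   data_cleaning().message_cleaning("Win a FREE prize, click now!!")
# returns roughly "Win FREE prize click".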

class apply_embedding_and_model(data_read_write):
    def __init__(self):
        pass

    def apply_count_vector(self, v_data_column):
        # Bag-of-words counts; min_df=2 drops words seen in <2 documents
        vectorizer = CountVectorizer(min_df=2, analyzer="word",
                                     tokenizer=None, preprocessor=None,
                                     stop_words=None)
        return vectorizer.fit_transform(v_data_column)

    def apply_svm(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=0.2)
        # Linear-kernel SVM; probability=True enables predict_proba
        params = {'kernel': 'linear', 'C': 2, 'gamma': 1}
        svm_cv = svm.SVC(C=params['C'], kernel=params['kernel'],
                         gamma=params['gamma'], probability=True)
        svm_cv.fit(X_train, y_train)
        y_predict_test = svm_cv.predict(X_test)
        cm = confusion_matrix(y_test, y_predict_test)
        sns.heatmap(cm, annot=True)
        print(classification_report(y_test, y_predict_test))
        print("test set")
        print("\nAccuracy Score: " +
              str(metrics.accuracy_score(y_test, y_predict_test)))
        print("F1 Score: " + str(metrics.f1_score(y_test, y_predict_test)))
        print("Recall: " + str(metrics.recall_score(y_test, y_predict_test)))
        print("Precision: " +
              str(metrics.precision_score(y_test, y_predict_test)))

        class_names = ['ham', 'spam']
        titles_options = [("Confusion matrix, without normalization", None),
                          ("Normalized confusion matrix", 'true')]
        for title, normalize in titles_options:
            # ConfusionMatrixDisplay.from_estimator replaces the
            # plot_confusion_matrix helper removed in scikit-learn 1.2
            disp = ConfusionMatrixDisplay.from_estimator(
                svm_cv, X_test, y_test, display_labels=class_names,
                cmap=plt.cm.Blues, normalize=normalize)
            disp.ax_.set_title(title)
            print(title)
            print(disp.confusion_matrix)
        plt.savefig("SVM.png")
        plt.show()

        # ROC curves: a no-skill baseline versus the SVM
        ns_probs = [0 for _ in range(len(y_test))]
        lr_probs = svm_cv.predict_proba(X_test)[:, 1]
        ns_auc = roc_auc_score(y_test, ns_probs)
        lr_auc = roc_auc_score(y_test, lr_probs)
        print('No Skill: ROC AUC=%.3f' % ns_auc)
        print('SVM: ROC AUC=%.3f' % lr_auc)

        ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
        lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
        pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
        pyplot.plot(lr_fpr, lr_tpr, marker='.', label='SVM')
        pyplot.xlabel('False Positive Rate')
        pyplot.ylabel('True Positive Rate')
        pyplot.legend()
        pyplot.savefig("SVMMat.png")
        pyplot.show()

data_obj = data_read_write()
data_frame = data_obj.read_csv_file("processed.csv")
data_frame.head()
data_frame.tail()
data_frame.describe()
data_frame.info()
data_frame.groupby('spam').describe()

# Length of each raw message in characters
data_frame['length'] = data_frame['text'].apply(len)
data_frame['length'].max()

sns.set(rc={'figure.figsize': (11.7, 8.27)})
ham_messages_length = data_frame[data_frame['spam'] == 0]
spam_messages_length = data_frame[data_frame['spam'] == 1]
ham_messages_length['length'].plot(bins=100, kind='hist', label='Ham')
spam_messages_length['length'].plot(bins=100, kind='hist', label='Spam')
plt.title('Distribution of Length of Email Text')
plt.xlabel('Length of Email Text')
plt.legend()

# Word counts per message
ham_words_length = [len(word_tokenize(title)) for title in
                    data_frame[data_frame['spam'] == 0].text.values]
spam_words_length = [len(word_tokenize(title)) for title in
                     data_frame[data_frame['spam'] == 1].text.values]
print(max(ham_words_length))
print(max(spam_words_length))

sns.set(rc={'figure.figsize': (11.7, 8.27)})
# Note: distplot is deprecated in recent seaborn releases;
# histplot/displot are the replacements.
ax = sns.distplot(ham_words_length, norm_hist=True, bins=30, label='Ham')
ax = sns.distplot(spam_words_length, norm_hist=True, bins=30, label='Spam')
plt.title('Distribution of Number of Words')
plt.xlabel('Number of Words')
plt.legend()
plt.savefig("SVMGraph.png")
plt.show()

def mean_word_length(x):
    # Average token length within a single message
    word_lengths = np.array([])
    for word in word_tokenize(x):
        word_lengths = np.append(word_lengths, len(word))
    return word_lengths.mean()

ham_meanword_length = \
    data_frame[data_frame['spam'] == 0].text.apply(mean_word_length)
spam_meanword_length = \
    data_frame[data_frame['spam'] == 1].text.apply(mean_word_length)

sns.distplot(ham_meanword_length, norm_hist=True, bins=30, label='Ham')
sns.distplot(spam_meanword_length, norm_hist=True, bins=30, label='Spam')
plt.title('Distribution of Mean Word Length')
plt.xlabel('Mean Word Length')
plt.legend()
plt.savefig("Graph.png")
plt.show()

stop_words = set(stopwords.words('english'))

def stop_words_ratio(x):
    # Fraction of tokens in a message that are English stop words
    num_total_words = 0
    num_stop_words = 0
    for word in word_tokenize(x):
        if word in stop_words:
            num_stop_words += 1
        num_total_words += 1
    return num_stop_words / num_total_words

ham_stopwords = data_frame[data_frame['spam'] == 0].text.apply(stop_words_ratio)
spam_stopwords = data_frame[data_frame['spam'] == 1].text.apply(stop_words_ratio)

sns.distplot(ham_stopwords, norm_hist=True, label='Ham')
sns.distplot(spam_stopwords, norm_hist=True, label='Spam')
plt.title('Distribution of Stop-word Ratio')
plt.xlabel('Stop Word Ratio')
plt.legend()

ham = data_frame[data_frame['spam'] == 0]
spam = data_frame[data_frame['spam'] == 1]
spam['length'].plot(bins=60, kind='hist')
ham['length'].plot(bins=60, kind='hist')

data_frame['Ham(0) and Spam(1)'] = data_frame['spam']
print('Spam percentage =', (len(spam) / len(data_frame)) * 100, "%")
print('Ham percentage =', (len(ham) / len(data_frame)) * 100, "%")
sns.countplot(x=data_frame['Ham(0) and Spam(1)'], label="Count")

# Clean the raw text and persist the processed frame
data_clean_obj = data_cleaning()
data_frame['clean_text'] = data_clean_obj.apply_to_column(data_frame['text'])
data_frame.head()
data_obj.data_frame.head()
data_obj.write_to_csvfile("processed_file.csv")

# Vectorize the cleaned text and train/evaluate the SVM
cv_object = apply_embedding_and_model()
spamham_countvectorizer = cv_object.apply_count_vector(data_frame['clean_text'])
X = spamham_countvectorizer
y = data_frame['spam'].values
cv_object.apply_svm(X, y)
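For comparison, the same vectorize-train-evaluate flow can be written far more compactly with a scikit-learn Pipeline. A minimal sketch, assuming the data_frame loaded above with its clean_text and spam columns; this is an alternative, not part of the original program:

from sklearn.pipeline import Pipeline

# Compact equivalent: vectorizer and SVM chained into one estimator
pipe = Pipeline([
    ('vectorizer', CountVectorizer(min_df=2)),
    ('svm', svm.SVC(kernel='linear', C=2, probability=True)),
])
X_tr, X_te, y_tr, y_te = train_test_split(
    data_frame['clean_text'], data_frame['spam'].values, test_size=0.2)
pipe.fit(X_tr, y_tr)
print(classification_report(y_te, pipe.predict(X_te)))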

OUTPUT :

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       877
           1       0.98      0.97      0.98       269

    accuracy                           0.99      1146
   macro avg       0.99      0.98      0.99      1146
weighted avg       0.99      0.99      0.99      1146

test set:

Accuracy Score: 0.9895287958115183
F1 Score: 0.9776119402985075
Recall: 0.9739776951672863
Precision: 0.9812734082397003

Normalized confusion matrix:
[[0.99429875 0.00570125]
 [0.0260223  0.9739777 ]]
8. IMPLEMENT ENSEMBLING TECHNIQUES.

PROGRAM :
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

df = pd.read_csv("train_data.csv")
target = df["target"]
train = df.drop("target", axis=1)

# Split the data 70% train / 20% validation / 10% test
train_ratio = 0.70
validation_ratio = 0.20
test_ratio = 0.10

x_train, x_rest, y_train, y_rest = train_test_split(
    train, target, test_size=1 - train_ratio)
x_val, x_test, y_val, y_test = train_test_split(
    x_rest, y_rest, test_size=test_ratio / (test_ratio + validation_ratio))

# Reset indices so the prediction columns align in the concats below
x_val = x_val.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Level-0 models: fit each base learner on the training fold
model_1 = LinearRegression()
model_2 = xgb.XGBRegressor()
model_3 = RandomForestRegressor()
model_1.fit(x_train, y_train)
model_2.fit(x_train, y_train)
model_3.fit(x_train, y_train)

# Base-model predictions on the validation fold become extra features
val_pred_1 = pd.DataFrame(model_1.predict(x_val), columns=['pred_lr'])
val_pred_2 = pd.DataFrame(model_2.predict(x_val), columns=['pred_xgb'])
val_pred_3 = pd.DataFrame(model_3.predict(x_val), columns=['pred_rf'])
df_val = pd.concat([x_val, val_pred_1, val_pred_2, val_pred_3], axis=1)

test_pred_1 = pd.DataFrame(model_1.predict(x_test), columns=['pred_lr'])
test_pred_2 = pd.DataFrame(model_2.predict(x_test), columns=['pred_xgb'])
test_pred_3 = pd.DataFrame(model_3.predict(x_test), columns=['pred_rf'])
df_test = pd.concat([x_test, test_pred_1, test_pred_2, test_pred_3], axis=1)

# Level-1 (meta) model: learn to blend the base predictions
final_model = LinearRegression()
final_model.fit(df_val, y_val)
final_pred = final_model.predict(df_test)
mse = mean_squared_error(y_test, final_pred)
print("Mean Squared Error:", mse)

OUTPUT :

Mean Squared Error: 4790
