AIML 5-8
5. BUILD REGRESSION MODELS
PROGRAM :
import math
import numpy as np
from scipy import linalg

def lowess(x, y, f, iterations):
    # Locally weighted regression (LOWESS) with robust bisquare re-weighting.
    # The loop structure around the surviving fragment is an assumption,
    # following the standard LOWESS formulation.
    n = len(x)
    r = int(math.ceil(f * n))
    h = [np.sort(np.abs(x - x[i]))[r] for i in range(n)]
    w = np.clip(np.abs((x[:, None] - x[None, :]) / h), 0.0, 1.0)
    w = (1 - w ** 3) ** 3  # tricube kernel weights
    yest = np.zeros(n)
    delta = np.ones(n)
    for _ in range(iterations):
        for i in range(n):
            weights = delta * w[:, i]
            b = np.array([np.sum(weights * y), np.sum(weights * y * x)])
            A = np.array([[np.sum(weights), np.sum(weights * x)],
                          [np.sum(weights * x), np.sum(weights * x * x)]])
            beta = linalg.solve(A, b)  # weighted least-squares line at x[i]
            yest[i] = beta[0] + beta[1] * x[i]
        residuals = y - yest
        s = np.median(np.abs(residuals))
        delta = np.clip(residuals / (6.0 * s), -1, 1)
        delta = (1 - delta ** 2) ** 2  # bisquare robustness weights
    return yest

n = 100
x = np.linspace(0, 2 * math.pi, n)
y = np.sin(x) + 0.3 * np.random.randn(n)
f = 0.25
iterations = 3
yest = lowess(x, y, f, iterations)
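The OUTPUT for this exercise is presumably a plot that did not survive extraction; a minimal snippet to reproduce it, assuming matplotlib (the styling choices here are illustrative, not from the original):

import matplotlib.pyplot as plt

plt.scatter(x, y, s=10, label='noisy data')
plt.plot(x, yest, color='red', label='LOWESS fit')
plt.legend()
plt.show()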
OUTPUT :
6. BUILD DECISION TREES AND RANDOM FORESTS
PROGRAM :
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from matplotlib.colors import ListedColormap
from six import StringIO
from IPython.display import Image
import pydotplus
data = pd.read_csv('/Users/ganesh/PycharmProjects/DecisionTree/Social_Network_Ads.csv')
feature_cols = ['Age', 'EstimatedSalary']
x = data.iloc[:, [2, 3]].values
y = data.iloc[:, 4].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)
classifier = DecisionTreeClassifier()
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print('Accuracy Score:', metrics.accuracy_score(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)
x_set, y_set = x_test, y_test
x1, x2 = np.meshgrid(np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01))
plt.contourf(x1, x2, classifier.predict(np.array([x1.ravel(), x2.ravel()]).T).reshape(x1.shape),
             alpha=0.75, cmap=ListedColormap(("red", "green")))
plt.xlim(x1.min(), x1.max())
plt.ylim(x2.min(), x2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],
                c=ListedColormap(("red", "green"))(i), label=j)
plt.title('Decision Tree (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
dot_data = StringIO()
export_graphviz(classifier, out_file=dot_data,
                filled=True, rounded=True, special_characters=True,
                feature_names=feature_cols, class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
classifier = DecisionTreeClassifier(criterion="gini", max_depth=3)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print("Accuracy (Optimized):",
metrics.accuracy_score(y_test, y_pred))
dot_data = StringIO()
export_graphviz(classifier, out_file=dot_data,
                filled=True, rounded=True, special_characters=True,
                feature_names=feature_cols, class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
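The exercise title also asks for a random forest, which the listing never builds; a minimal sketch on the same scaled features, assuming sklearn's RandomForestClassifier (the hyperparameters are illustrative, not from the original):

from sklearn.ensemble import RandomForestClassifier

# an ensemble of trees usually smooths the single tree's decision boundary
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(x_train, y_train)
rf_pred = rf.predict(x_test)
print('Random Forest Accuracy:', metrics.accuracy_score(y_test, rf_pred))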
7. BUILD SVM MODELS
PROGRAM :
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
# the imports below are not in the original listing but the code needs them
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, precision_score
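The three classes that follow all inherit from a data_read_write base class the listing never shows; a minimal sketch consistent with the calls made later (read_csv_file, write_to_csvfile, and the data_frame attribute are inferred from those call sites; the bodies are assumptions):

class data_read_write(object):
    # hypothetical base class; only the helpers the script actually calls
    def read_csv_file(self, file_name):
        # load the dataset and keep a reference for write_to_csvfile()
        self.data_frame = pd.read_csv(file_name)
        return self.data_frame

    def write_to_csvfile(self, file_name):
        self.data_frame.to_csv(file_name, index=False)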
class generate_word_cloud(data_read_write):
    def __init__(self):
        pass

class data_cleaning(data_read_write):
    def apply_to_column(self, column):
        # assumed cleaning step: lower-case and strip punctuation
        return column.apply(lambda t: t.translate(
            str.maketrans('', '', string.punctuation)).lower())

class apply_embeddding_and_model(data_read_write):
    def apply_count_vector(self, column):
        return CountVectorizer().fit_transform(column)  # bag-of-words (assumed)

    def apply_svm(self, X, y):
        # assumed model: default RBF-kernel SVC on an 80/20 split
        X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
        y_pred = svm.SVC().fit(X_tr, y_tr).predict(X_te)
        print("F1 Score:", f1_score(y_te, y_pred))
        print("Recall:", recall_score(y_te, y_pred))
        print("Precision:", precision_score(y_te, y_pred))
data_obj = data_read_write()
data_frame = data_obj.read_csv_file("processed.csv")
data_frame.head()
data_frame.tail()
data_frame.describe()
data_frame.info()
data_frame.head()
data_frame.groupby('spam').describe()
data_frame['length'] = data_frame['text'].apply(len)
data_frame['length'].max()
sns.set(rc={'figure.figsize':(11.7,8.27)})
ham_messages_length = data_frame[data_frame['spam']==0]
spam_messages_length = data_frame[data_frame['spam']==1]
ham_messages_length['length'].plot(bins=100, kind='hist', label='Ham')
spam_messages_length['length'].plot(bins=100, kind='hist', label='Spam')
plt.title('Distribution of Length of Email Text')
plt.xlabel('Length of Email Text')
plt.legend()
data_frame[data_frame['spam']==0].text.values
# word counts per message; these definitions are missing from the listing
# and are reconstructed from the variable names used just below
ham_words_length = data_frame[data_frame['spam']==0].text.apply(lambda t: len(word_tokenize(t)))
spam_words_length = data_frame[data_frame['spam']==1].text.apply(lambda t: len(word_tokenize(t)))
sns.set(rc={'figure.figsize':(11.7,8.27)})
ax = sns.distplot(ham_words_length, norm_hist=True, bins=30, label='Ham')
ax = sns.distplot(spam_words_length, norm_hist=True, bins=30, label='Spam')
plt.title('Distribution of Number of Words')
plt.xlabel('Number of Words')
plt.legend()
plt.savefig("SVMGraph.png")
plt.show()
def mean_word_length(x):
    word_lengths = np.array([])
    for word in word_tokenize(x):
        word_lengths = np.append(word_lengths, len(word))
    return word_lengths.mean()

ham_meanword_length = data_frame[data_frame['spam']==0].text.apply(mean_word_length)
spam_meanword_length = data_frame[data_frame['spam']==1].text.apply(mean_word_length)
sns.distplot(ham_meanword_length, norm_hist=True, bins=30, label='Ham')
sns.distplot(spam_meanword_length, norm_hist=True, bins=30, label='Spam')
plt.title('Distribution of Mean Word Length')
plt.xlabel('Mean Word Length')
plt.legend()
plt.savefig("Graph.png")
plt.show()
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def stop_words_ratio(x):
    num_total_words = 0
    num_stop_words = 0
    for word in word_tokenize(x):
        if word in stop_words:
            num_stop_words += 1
        num_total_words += 1
    # guard against empty messages to avoid division by zero
    return num_stop_words / num_total_words if num_total_words else 0.0

ham_stopwords = data_frame[data_frame['spam'] == 0].text.apply(stop_words_ratio)
spam_stopwords = data_frame[data_frame['spam'] == 1].text.apply(stop_words_ratio)
ham = data_frame[data_frame['spam']==0]
spam = data_frame[data_frame['spam']==1]
spam['length'].plot(bins=60, kind='hist')
ham['length'].plot(bins=60, kind='hist')
data_clean_obj = data_cleaning()
data_frame['clean_text'] = data_clean_obj.apply_to_column(data_frame['text'])
data_frame.head()
data_obj.data_frame.head()
data_obj.write_to_csvfile("processed_file.csv")
cv_object = apply_embeddding_and_model()
spamham_countvectorizer = cv_object.apply_count_vector(data_frame['clean_text'])
X = spamham_countvectorizer
label = data_frame['spam'].values
y = label
cv_object.apply_svm(X, y)
OUTPUT :
F1 Score: 0.9776119402985075
Recall: 0.9739776951672863
Precision: 0.9812734082397003
8. IMPLEMENT ENSEMBLING TECHNIQUES
PROGRAM :
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
df = pd.read_csv("train_data.csv")
target = df["target"]
train = df.drop("target", axis=1)
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.20)
train_ratio = 0.70
validation_ratio = 0.20
test_ratio = 0.10
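final_model below is fit on df_val and df_test, which the listing never constructs; that missing middle is the blending step the ratios above set up. A sketch of it, assuming the usual blending recipe (base-model predictions on the validation and test sets are appended as features for the meta-model; the choice of three base models follows the imports above but is still an assumption):

# re-split into train / validation / test using the ratios above
x_train, x_test, y_train, y_test = train_test_split(train, target, test_size=1 - train_ratio)
x_val, x_test, y_val, y_test = train_test_split(
    x_test, y_test, test_size=test_ratio / (test_ratio + validation_ratio))

# base models, fit on the training portion only
models = [LinearRegression(), xgb.XGBRegressor(), RandomForestRegressor()]
for m in models:
    m.fit(x_train, y_train)

# each base model's predictions become an extra feature for the meta-model
df_val = x_val.assign(**{f"pred_{i}": m.predict(x_val) for i, m in enumerate(models)})
df_test = x_test.assign(**{f"pred_{i}": m.predict(x_test) for i, m in enumerate(models)})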
final_model = LinearRegression()
final_model.fit(df_val, y_val)
final_pred = final_model.predict(df_test)
mse = mean_squared_error(y_test, final_pred)
print("Mean Squared Error:", mse)
OUTPUT :
4790