Untitled28.ipynb - Colaboratory
Untitled28.ipynb - Colaboratory
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import re
import string
import nltk
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import precision_score, recall_score, confusion_matrix
from sklearn.metrics import f1_score, roc_auc_score, roc_curve
print("Setup Complete")
!pip install keras
!pip install --upgrade tensorflow
!pip install --upgrade keras
data.head()
All-New
Fire HD 8
Electronics,iPad &
Tablet, 8"
0 Amazon Tablets,All Tablets,Fire Electronics 2016-12-26T00:00:00
HD
Ta...
Display,
Wi-Fi...
Amazon -
Echo Plus Amazon Echo,Smart
1 w/ Built- Amazon Home,Networking,Home Electronics,Hardware 2018-01-17T00:00:00
In Hub - & Tools...
Silver
Amazon
Echo
1 of 16 15/12/23, 01:14
Untitled28.ipynb - Colaboratory https://colab.research.google.com/drive/1sAgmbXSp...
Positive['sentiment'].value_counts()
Positive 3749
Name: sentiment, dtype: int64
Neutral['sentiment'].value_counts()
Neutral 158
Name: sentiment, dtype: int64
Negative['sentiment'].value_counts()
Negative 93
Name: sentiment, dtype: int64
data1.head()
sentiment reviews.text
print('Shape : ',data1.shape)
data1.head()
Shape : (4000, 2)
sentiment reviews.text
2 of 16 15/12/23, 01:14
Untitled28.ipynb - Colaboratory https://colab.research.google.com/drive/1sAgmbXSp...
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

nltk.download('wordnet')
# Download Stopwords
nltk.download('stopwords')

wordnet_lemmatizer = WordNetLemmatizer()
# Other cells in this notebook rebind the global name `tokenizer` to
# unrelated objects, so preprocess() keeps its own private reference and
# stays correct regardless of cell execution order.
_word_tokenizer = RegexpTokenizer(r'[a-z]+')
tokenizer = _word_tokenizer  # public name kept for backward compatibility
stop_words = set(stopwords.words('english'))
# Parts of speech tried, in order, when lemmatizing each word.
_LEMMA_POS = (wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV)


def preprocess(document):
    """Normalize one review into a space-separated string of lemmas.

    The text is lower-cased, split into runs of the letters a-z (digits
    and punctuation are therefore dropped), filtered against the English
    stop-word set, and every remaining word is lemmatized successively as
    a noun, verb, adjective and adverb.

    Args:
        document: Raw review text (a single string).

    Returns:
        The cleaned words joined by single spaces.
    """
    words = _word_tokenizer.tokenize(document.lower())
    words = [w for w in words if w not in stop_words]  # Removing stopwords
    # Lemmatizing: each pass feeds the next, so a word may be reduced
    # under more than one part of speech.
    for pos in _LEMMA_POS:
        words = [wordnet_lemmatizer.lemmatize(w, pos) for w in words]
    return " ".join(words)


print("Setup Complete")
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
Setup Complete
[nltk_data] Unzipping corpora/stopwords.zip.
# data1 was derived by slicing another DataFrame, so adding a column to it
# directly raises pandas' SettingWithCopyWarning and may not persist.
# Taking an explicit copy first makes the assignment unambiguous.
data1 = data1.copy()
data1['Processed_Review'] = data1['reviews.text'].apply(preprocess)
data1.head()
<ipython-input-28-7a0be1bed3d0>:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
sentiment Processed_Review
1 Positive purchase two amazon echo plus two dot plus fou...
3 of 16 15/12/23, 01:14
Untitled28.ipynb - Colaboratory https://colab.research.google.com/drive/1sAgmbXSp...
# for tf-idf
def textPreprocessing(data2):
    """Strip punctuation and English stop words from one review string.

    Args:
        data2: The review text as a single string. The parameter name is
            kept for backward compatibility even though it shadows the
            notebook-level ``data2`` DataFrame.

    Returns:
        A list of the remaining whitespace-separated words, in their
        original casing and order.
    """
    import string
    from nltk.corpus import stopwords

    # Remove punctuation character by character, then rebuild the sentence.
    sentenceWithoutPunctuations = ''.join(
        char for char in data2 if char not in string.punctuation
    )
    words = sentenceWithoutPunctuations.split()

    # Stop-word removal. The stop-word list is loaded once per call and
    # turned into a set, instead of being reloaded and scanned linearly
    # for every single word.
    englishStopwords = set(stopwords.words('english'))
    return [word for word in words if word.lower() not in englishStopwords]
data2.groupby('sentiment').describe()
Processed_Review
sentiment
#Text preprocessing
data2['Processed_Review'].head(2).apply(textPreprocessing)
0 [purchase, black, fridaypros, great, price, ev...
1 [purchase, two, amazon, echo, plus, two, dot, ...
Name: Processed_Review, dtype: object
len(bow.vocabulary_)
3407
reviews_bow = bow.transform(data2['Processed_Review'])
4 of 16 15/12/23, 01:14
Untitled28.ipynb - Colaboratory https://colab.research.google.com/drive/1sAgmbXSp...
tfidfDataFinal.shape
(4000, 3407)
model
▾ MultinomialNB
MultinomialNB()
columns
['Processed_Review']
print(data2.sentiment.value_counts())
Positive 3749
Neutral 158
Negative 93
Name: sentiment, dtype: int64
5 of 16 15/12/23, 01:14
Untitled28.ipynb - Colaboratory https://colab.research.google.com/drive/1sAgmbXSp...
X_res.shape,Y_res.shape
((11247, 1), (11247,))
6 of 16 15/12/23, 01:14
Untitled28.ipynb - Colaboratory https://colab.research.google.com/drive/1sAgmbXSp...
Y1=pd.DataFrame(Y_res,columns=['sentiment'])
Processed_Review sentiment
1 purchase two amazon echo plus two dot plus fou... Positive
Final_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11247 entries, 0 to 11246
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Processed_Review 11247 non-null object
1 sentiment 11247 non-null object
dtypes: object(2)
memory usage: 175.9+ KB
# Using Matplotlib to plot the final data & show the distribution of review
# sentiment in it: print the per-class counts, then draw them as a bar chart.
print(Final_data.sentiment.value_counts())
Final_data['sentiment'].value_counts().plot(kind='bar')
plt.title("Distribution of Reviews Sentiment", size=18)
Positive 3749
Neutral 3749
Negative 3749
Name: sentiment, dtype: int64
Text(0.5, 1.0, 'Distribution of Reviews Sentiment')
7 of 16 15/12/23, 01:14
Untitled28.ipynb - Colaboratory https://colab.research.google.com/drive/1sAgmbXSp...
df = Final_data.sample(frac=0.1, random_state=0)
df.head()
Processed_Review sentiment
8805 buy think would great read book play game howe... Neutral
10143 great beginner like child limit use many apps ... Neutral
10937 buy kindle past time one come defective port b... Neutral
if stemming==True # stemming
8 of 16 15/12/23, 01:14
Untitled28.ipynb - Colaboratory https://colab.research.google.com/drive/1sAgmbXSp...
if stemming==True: # stemming
# stemmer = PorterStemmer()
stemmer = SnowballStemmer('english')
words = [stemmer.stem(w) for w in words]
# Clean every training review with cleanText() and collect the results.
for d in X_train:
    X_train_cleaned.append(cleanText(d))
print('Show a cleaned review in the training set : \n', X_train_cleaned[10])

# Apply the same cleaning to the test reviews.
for d in X_test:
    X_test_cleaned.append(cleanText(d))
Show a cleaned review in the training set :
daughter love easy navigate hard break
# Fit and transform the training data to a document-term matrix using CountVectorizer
countVect = CountVectorizer()
X_train_countVect = countVect.fit_transform(X_train_cleaned)
▾ MultinomialNB
MultinomialNB()
def modelEvaluation(predictions):
    '''
    Print model evaluation to predicted result

    Compares ``predictions`` against the notebook-level ``y_test`` labels
    and prints the accuracy, the per-class classification report and the
    confusion matrix. Nothing is returned.

    Relies on ``y_test``, ``accuracy_score`` and the ``metrics`` module
    being defined in the notebook before this function is called.

    predictions : predicted labels, aligned one-to-one with ``y_test``
    '''
    print("\nAccuracy on validation set: {:.4f}".format(accuracy_score(y_test, predictions)))
    # NOTE(review): AUC is left disabled - presumably because roc_auc_score
    # on hard multi-class label predictions is not applicable; confirm.
    #print("\nAUC score : {:.4f}".format(roc_auc_score(y_test, predictions)))
    print("\nClassification report : \n", metrics.classification_report(y_test, predictions))
    print("\nConfusion Matrix : \n", metrics.confusion_matrix(y_test, predictions))
Classification report :
precision recall f1-score support
9 of 16 15/12/23, 01:14
Untitled28.ipynb - Colaboratory https://colab.research.google.com/drive/1sAgmbXSp...
Confusion Matrix :
[[37 0 2]
[ 3 35 1]
[ 0 6 29]]
# Fitting and transforming the training data to a document-term matrix using TfidfVectorizer
# NOTE(review): this fits on X_train, whereas the CountVectorizer cell fits on
# X_train_cleaned - confirm that skipping cleanText() here is intentional.
tfidf = TfidfVectorizer(min_df=5) #minimum document frequency of 5
X_train_tfidf = tfidf.fit_transform(X_train)
# Logistic Regression (default hyper-parameters) trained on the TF-IDF features
lr = LogisticRegression()
lr.fit(X_train_tfidf, y_train)
▾ LogisticRegression
LogisticRegression()
Classification report :
precision recall f1-score support
Confusion Matrix :
[[39 0 0]
[ 3 36 0]
[ 0 5 30]]
10 of 16 15/12/23, 01:14
Untitled28.ipynb - Colaboratory https://colab.research.google.com/drive/1sAgmbXSp...
Classification report :
precision recall f1-score support
Confusion Matrix :
[[39 0 0]
[ 2 37 0]
[ 1 7 27]]
# Building a pipeline: TF-IDF features feeding a logistic-regression classifier
estimators = [
    ("tfidf", TfidfVectorizer()),
    ("lr", LogisticRegression()),
]
model = Pipeline(estimators)

# Grid search space. Keys use the pipeline's "<step name>__<parameter>" form.
params = {
    # inverse regularization strength of the logistic regression
    "lr__C": [0.1, 1, 10],
    # minimum document frequency a term needs to be kept
    "tfidf__min_df": [1, 3],
    # cap on the vocabulary size (None = no cap)
    "tfidf__max_features": [1000, None],
    # unigrams only, or unigrams plus bigrams
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    # keep all words, or drop the built-in English stop words
    "tfidf__stop_words": [None, "english"],
}
Classification report :
precision recall f1-score support
Confusion Matrix :
11 of 16 15/12/23, 01:14
Untitled28.ipynb - Colaboratory https://colab.research.google.com/drive/1sAgmbXSp...
Confusion Matrix :
[[38 0 1]
[ 1 37 1]
[ 0 4 31]]
import nltk
nltk.download('punkt')
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
True
# Splitting review text into parsed sentences using NLTK's punkt tokenizer
# NOTE(review): this rebinds the module-level name `tokenizer`, which an
# earlier cell bound to a RegexpTokenizer. Any code that looks up `tokenizer`
# at call time will see the punkt sentence tokenizer once this cell has run,
# so results depend on cell execution order.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
12 of 16 15/12/23, 01:14
Untitled28.ipynb - Colaboratory https://colab.research.google.com/drive/1sAgmbXSp...
13 of 16 15/12/23, 01:14
Untitled28.ipynb - Colaboratory https://colab.research.google.com/drive/1sAgmbXSp...
#Applying lstm
df = Final_data.sample(frac=0.1, random_state=0)
df.head()
Processed_Review sentiment
8805 buy think would great read book play game howe... 2
10143 great beginner like child limit use many apps ... 2
10937 buy kindle past time one come defective port b... 2
top_words = 20000  # vocabulary cap - presumably used by the tokenizer/embedding; TODO confirm (definition not visible here)
maxlen = 100  # presumably the padded sequence length - TODO confirm
batch_size = 32  # mini-batch size; passed to model1.evaluate below
nb_classes = 3  # number of sentiment classes (Positive / Neutral / Negative)
nb_epoch = 3  # presumably the number of training epochs - TODO confirm
# Convert each review into a sequence via tokenizer.texts_to_sequences.
# NOTE(review): `tokenizer` must expose texts_to_sequences here; the same
# name is bound to NLTK tokenizers in earlier cells, so it has to be
# re-created before this point - confirm the cell that does so.
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)
14 of 16 15/12/23, 01:14
Untitled28.ipynb - Colaboratory https://colab.research.google.com/drive/1sAgmbXSp...
# Compiling LSTM
# This is a 3-class problem (nb_classes = 3), so the loss must be the
# multi-class cross-entropy. 'binary_crossentropy' scores each output unit as
# an independent yes/no decision, which distorts the loss and inflates the
# reported accuracy.
# NOTE(review): assumes the targets (y_*_seq) are one-hot encoded with
# nb_classes columns; use 'sparse_categorical_crossentropy' instead if they
# are integer class ids - confirm against the cell that builds them.
model1.compile(loss='categorical_crossentropy',
               optimizer='adam',
               metrics=['accuracy'])
# Model Evaluation
# NOTE(review): the training call is not visible in this extract; evaluation
# is only meaningful after the model has been fitted.
score = model1.evaluate(X_test_seq, y_test_seq, batch_size=batch_size)
print('Test loss : {:.4f}'.format(score[0]))
print('Test accuracy : {:.4f}'.format(score[1]))
Model: "sequential_5"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_3 (Embedding) (None, None, 128) 2560000
=================================================================
Total params: 2691971 (10.27 MB)
Trainable params: 2691971 (10.27 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/3
32/32 [==============================] - 14s 293ms/step - loss: 0.6464 - accuracy
Epoch 2/3
32/32 [==============================] - 7s 204ms/step - loss: 0.5352 - accuracy:
Epoch 3/3
32/32 [==============================] - 8s 258ms/step - loss: 0.3841 - accuracy:
15 of 16 15/12/23, 01:14
Untitled28.ipynb - Colaboratory https://colab.research.google.com/drive/1sAgmbXSp...
16 of 16 15/12/23, 01:14