Topic Classifier by David Caleb

The document outlines a Python script for text classification using the 20 Newsgroups dataset, employing various machine learning models including Naive Bayes, SVM, Random Forest, and Logistic Regression. It includes data preprocessing steps, feature extraction with TF-IDF, model training, hyperparameter tuning, and evaluation through accuracy scores and confusion matrices. The best model is saved for future use, demonstrating a comprehensive approach to text classification tasks.


# Import necessary libraries

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import (train_test_split, GridSearchCV,
                                     cross_val_score)
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, confusion_matrix,
                             accuracy_score, roc_curve, auc)
from sklearn.pipeline import Pipeline
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
import string

# Download NLTK data (only required once)


nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...

True

# Load the dataset


categories = ['sci.space', 'comp.graphics', 'talk.politics.guns',
              'rec.sport.baseball', 'sci.med']
newsgroups = fetch_20newsgroups(subset='all', categories=categories,
                                shuffle=True, random_state=42)
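
A quick diagnostic check of what was fetched can help before plotting; the exact document count depends on the dataset version:

# Confirm corpus size and label names (diagnostic only)
print(f"{len(newsgroups.data)} documents, {len(newsgroups.target_names)} categories")
print(newsgroups.target_names)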

# Figure adjustments
fig = plt.figure(figsize=(12, 15))  # Larger size for better visualization
gs = fig.add_gridspec(2, 2)
gs.update(wspace=0.4, hspace=0.7)  # Increase space between subplots

# Background and color settings


background_color = "#f8fafc"
color_palette = ["#94a3b8", "#64748b", "#475569", "#334155", "#1e293b"]
fig.patch.set_facecolor(background_color)
axes = [fig.add_subplot(gs[i, j]) for i in range(2) for j in range(2)]
for ax in axes:
    ax.set_facecolor(background_color)

# Title of the plot


axes[0].spines["bottom"].set_visible(False)
axes[0].spines["left"].set_visible(False)
axes[0].spines["top"].set_visible(False)
axes[0].spines["right"].set_visible(False)
axes[0].tick_params(left=False, bottom=False)
axes[0].set_xticklabels([])
axes[0].set_yticklabels([])
axes[0].text(0.5, 0.5,
             'Topic Distribution and\nDocument Analysis\n_________________',
             horizontalalignment='center',
             verticalalignment='center',
             fontsize=18, fontweight='bold',
             fontfamily='serif',
             color="#000000")

# Plot 1: Distribution of Topics


ax1 = axes[1]
ax1.set_title("Distribution of Topics", loc='center', fontsize=14,
              fontweight='bold', fontfamily="serif", color="#000000")
# Assign `hue` and disable the legend to avoid seaborn's palette deprecation warning
sns.countplot(x=newsgroups.target, hue=newsgroups.target,
              palette=color_palette, legend=False, ax=ax1)
ax1.set_xticks(np.arange(len(categories)))
ax1.set_xticklabels(newsgroups.target_names, rotation=30, ha='right',
                    fontsize=10)
ax1.set_xlabel("")
ax1.set_ylabel("Count", fontsize=12)

# Plot 2: Document Length Distribution


ax2 = axes[2]
ax2.set_title("Document Length Distribution", loc='center', fontsize=14,
              fontweight='bold', fontfamily="serif", color="#000000")
doc_lengths = [len(doc.split()) for doc in newsgroups.data]
sns.histplot(doc_lengths, bins=50, kde=True, color=color_palette[2], ax=ax2)
ax2.set_xlabel("Number of Words", fontsize=12)
ax2.set_ylabel("Frequency", fontsize=12)

# Plot 3: Most Common Words (Optional)


from sklearn.feature_extraction.text import CountVectorizer

ax3 = axes[3]
ax3.set_title("Most Common Words", loc='center', fontsize=14,
              fontweight='bold', fontfamily="serif", color="#000000")
vectorizer = CountVectorizer(stop_words='english', max_features=20)
word_counts = vectorizer.fit_transform(newsgroups.data)
word_freq = dict(zip(vectorizer.get_feature_names_out(),
                     word_counts.sum(axis=0).tolist()[0]))
# A single color avoids cycling the 5-color palette over 20 bars
sns.barplot(x=list(word_freq.values()), y=list(word_freq.keys()),
            color=color_palette[2], ax=ax3)
ax3.set_xlabel("Frequency", fontsize=12)
ax3.set_ylabel("Words", fontsize=12)

# Adjust and show


plt.show()

# Advanced Text Preprocessing
# Define a function to clean and lemmatize text
lemmatizer = WordNetLemmatizer()  # Create once and reuse across all documents

def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize text
    tokens = word_tokenize(text.lower())
    # Lemmatize tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)
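
A quick sanity check of the function might look like this (the sample sentence is purely illustrative, not from the dataset):

# Illustrative input: punctuation stripped, text lowercased, plurals lemmatized
print(preprocess_text("The rockets were launching into orbit!"))
# Expected output: 'the rocket were launching into orbit'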

# Apply preprocessing to the dataset


preprocessed_data = [preprocess_text(doc) for doc in newsgroups.data]

# Feature Extraction with TF-IDF


vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7,
                             ngram_range=(1, 2))
X = vectorizer.fit_transform(preprocessed_data)
y = newsgroups.target
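
Before splitting, it can help to inspect the resulting feature space; the exact dimensions depend on the preprocessing above (diagnostic only):

# Inspect the TF-IDF feature space (diagnostic)
print(f"Feature matrix shape: {X.shape}")        # (n_documents, n_features)
print(vectorizer.get_feature_names_out()[:10])   # first few unigram/bigram features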

# Split the dataset into training and testing sets


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42)

# Compare Multiple Models


models = {
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(kernel='linear', probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
}

results = {}
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    # Make predictions
    y_pred = model.predict(X_test)
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.4f}")

Naive Bayes Accuracy: 0.9808
SVM Accuracy: 0.9787
Random Forest Accuracy: 0.9238
Logistic Regression Accuracy: 0.9780
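
For convenience, the strongest baseline can be read straight out of the results dictionary (a small convenience snippet):

# Identify the best-performing baseline from the results dict
best_name = max(results, key=results.get)
print(f"Best baseline model: {best_name} ({results[best_name]:.4f})")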

# Hyperparameter Tuning with Grid Search (Example: SVM)


param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
grid_search = GridSearchCV(SVC(probability=True, random_state=42),
                           param_grid, cv=3)
grid_search.fit(X_train, y_train)
print(f"Best SVM Parameters: {grid_search.best_params_}")

Best SVM Parameters: {'C': 10, 'kernel': 'linear'}
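
The mean cross-validated accuracy of the winning combination is also stored on the fitted search object, and is worth printing alongside the parameters:

# Mean CV accuracy of the best parameter combination found by the grid search
print(f"Best CV Accuracy: {grid_search.best_score_:.4f}")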

# Cross-Validation
cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5)
print(f"Cross-Validation Accuracy: {np.mean(cv_scores):.4f}")

Cross-Validation Accuracy: 0.9794

# Visualize Results
# Confusion Matrix for the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=newsgroups.target_names,
            yticklabels=newsgroups.target_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
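
Since classification_report, roc_curve, and auc are imported above, a per-class breakdown and one-vs-rest ROC curves are natural complements to the confusion matrix. The sketch below assumes probability estimates are available, which they are here because probability=True was set on the SVC:

# Per-class precision/recall/F1 for the best model
print(classification_report(y_test, y_pred, target_names=newsgroups.target_names))

# One-vs-rest ROC curves (sketch; uses the probability estimates enabled above)
from sklearn.preprocessing import label_binarize
y_test_bin = label_binarize(y_test, classes=range(len(categories)))
y_score = best_model.predict_proba(X_test)
plt.figure(figsize=(8, 6))
for i, name in enumerate(newsgroups.target_names):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    plt.plot(fpr, tpr, label=f"{name} (AUC = {auc(fpr, tpr):.3f})")
plt.plot([0, 1], [0, 1], 'k--')  # Chance line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('One-vs-Rest ROC Curves')
plt.legend(loc='lower right')
plt.show()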
# Save the Model
import joblib
joblib.dump(best_model, 'text_classification_model.pkl')

['text_classification_model.pkl']
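
One caveat: the saved estimator expects TF-IDF features, not raw text, so the fitted vectorizer must be persisted as well. A sketch using the Pipeline class imported earlier bundles both steps into a single artifact (the file name is arbitrary, and the pipeline is refit on the preprocessed corpus):

# Sketch: bundle vectorization and classification so the saved artifact accepts raw text
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.7, ngram_range=(1, 2))),
    ('clf', SVC(**grid_search.best_params_, probability=True, random_state=42))
])
pipeline.fit(preprocessed_data, y)
joblib.dump(pipeline, 'topic_classifier_pipeline.pkl')

# Later, load the bundle and classify new (preprocessed) text
loaded = joblib.load('topic_classifier_pipeline.pkl')
sample = preprocess_text("The shuttle launch was delayed by bad weather.")
print(newsgroups.target_names[loaded.predict([sample])[0]])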
