# Topic Classifier by David Caleb
# Topic Classifier by David Caleb
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import seaborn as sns
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    roc_curve,
    auc,
)
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    cross_val_score,
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
True
# Figure adjustments
# Larger size for better visualization
fig = plt.figure(figsize=(12, 15))
gs = fig.add_gridspec(2, 2)
gs.update(wspace=0.4, hspace=0.7)  # Increases space between subplots

# "Most Common Words" panel: horizontal bar chart of the 20 most frequent
# non-stop-word terms in the corpus.
# NOTE(review): `axes`, `newsgroups` and `color_palette` are not defined in
# this section; presumably they come from earlier cells - confirm.
ax3 = axes[3]
ax3.set_title("Most Common Words", loc='center', fontsize=14,
              fontweight='bold', fontfamily="serif", color="#000000")

# max_features=20 keeps only the 20 terms with the highest corpus frequency.
vectorizer = CountVectorizer(stop_words='english', max_features=20)
word_counts = vectorizer.fit_transform(newsgroups.data)
# Column sums of the document-term matrix give the total count per term.
word_freq = dict(zip(vectorizer.get_feature_names_out(),
                     word_counts.sum(axis=0).tolist()[0]))

top_words = list(word_freq.keys())
# Repeat the base palette explicitly so there is exactly one colour per bar.
# Passing the shorter list directly makes seaborn cycle it implicitly and emit
# a UserWarning; the resulting colours are the same.
bar_colors = [color_palette[i % len(color_palette)]
              for i in range(len(top_words))]
# `palette` without `hue` is deprecated in seaborn; mapping hue to the same
# variable as y (with the legend disabled) is the documented replacement.
sns.barplot(x=list(word_freq.values()), y=top_words, hue=top_words,
            palette=bar_colors, legend=False, ax=ax3)
ax3.set_xlabel("Frequency", fontsize=12)
ax3.set_ylabel("Words", fontsize=12)
# Recorded notebook output (warnings raised by the sns.barplot call):
# C:\Users\Usuario\AppData\Local\Temp\ipykernel_9492\3698852670.py:33: FutureWarning:
# (warning message text was not captured in this export)
#   sns.barplot(x=list(word_freq.values()), y=list(word_freq.keys()), palette=color_palette, ax=ax3)
# C:\Users\Usuario\AppData\Local\Temp\ipykernel_9492\3698852670.py:55: UserWarning:
# The palette list has fewer values (5) than needed (20) and will cycle, which may produce an uninterpretable plot.
#   sns.barplot(x=list(word_freq.values()), y=list(word_freq.keys()), palette=color_palette, ax=ax3)
# Advanced Text Preprocessing
# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every preprocess_text call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


# Define a function to clean and lemmatize text
def preprocess_text(text: str) -> str:
    """Return *text* stripped of punctuation, lower-cased and lemmatized.

    The text is first stripped of all ``string.punctuation`` characters,
    then lower-cased and split with ``word_tokenize``; each token is passed
    through ``WordNetLemmatizer.lemmatize`` and the results are joined with
    single spaces.

    NOTE(review): presumably requires the NLTK tokenizer and WordNet data
    to be downloaded beforehand - confirm against the setup cell.

    Args:
        text: Raw document text.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # Tokenize text
    tokens = word_tokenize(text.lower())
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)
# Fit every candidate model on the training split and record its accuracy on
# the test split, keyed by the model's name in `models`.
results = {}
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    # Make predictions
    y_pred = model.predict(X_test)
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.4f}")
# Cross-Validation
# Score the tuned estimator with 5-fold cross-validation over X / y and
# report the mean of the per-fold scores.
cv_scores = cross_val_score(
    estimator=grid_search.best_estimator_,
    X=X,
    y=y,
    cv=5,
)
print(f"Cross-Validation Accuracy: {np.mean(cv_scores):.4f}")
# Visualize Results
# Confusion Matrix for the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap; both axes are labelled with the
# dataset's target names (x = predicted, y = actual).
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=newsgroups.target_names,
    yticklabels=newsgroups.target_names,
)
heatmap_ax.set_title('Confusion Matrix')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
plt.show()
# Save the Model
# Persist the selected estimator to disk with joblib.
# NOTE: joblib files are pickle-based; only load this file back from a
# trusted location.
import joblib

joblib.dump(value=best_model, filename='text_classification_model.pkl')
['text_classification_model.pkl']