# Experiment 11: Feature Selection on the Breast Cancer Dataset
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
# Load dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target
# Standardize features (note: for a strictly leak-free pipeline, fit the
# scaler on the training split only; this follows the notebook's simpler flow)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42)
# ------------------------------
# 1. Univariate Feature Selection (SelectKBest)
# ------------------------------
select_k = SelectKBest(score_func=f_classif, k=10)
select_k.fit(X_train, y_train)
selected_features_kbest = X.columns[select_k.get_support()]
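# Optional sketch (not in the original flow): f_classif returns ANOVA
# F-statistics; higher scores indicate stronger univariate association
# with the class label.
kbest_scores = pd.Series(select_k.scores_, index=X.columns).sort_values(ascending=False)
print("Selected by SelectKBest:", list(selected_features_kbest))
print("Top 10 ANOVA F-scores:\n", kbest_scores.head(10))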
# ------------------------------
# 2. Recursive Feature Elimination (RFE)
# ------------------------------
model = LogisticRegression(max_iter=10000)
rfe = RFE(estimator=model, n_features_to_select=10)
rfe.fit(X_train, y_train)
selected_features_rfe = X.columns[rfe.support_]
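# Optional sketch: RFE also exposes ranking_, where 1 marks a retained
# feature and larger values mark features eliminated in earlier rounds.
rfe_ranking = pd.Series(rfe.ranking_, index=X.columns).sort_values()
print("Selected by RFE:", list(selected_features_rfe))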
# ------------------------------
# 3. Feature Importances from Random Forest
# ------------------------------
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
top_10_rf = X.columns[indices[:10]]
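# Optional sketch: pair the top-10 features with their importance scores
# for a quick textual summary alongside the bar plot below.
rf_top = pd.Series(importances[indices[:10]], index=top_10_rf)
print("Random forest top 10 importances:\n", rf_top)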
# ------------------------------
# 4. Lasso (L1-based) Feature Selection
# ------------------------------
# An L1 penalty drives uninformative coefficients to exactly zero, so the
# nonzero coefficients mark the selected features.
lasso = LogisticRegression(penalty='l1', solver='liblinear', C=0.1,
                           max_iter=10000)
lasso.fit(X_train, y_train)
selected_features_lasso = X.columns[lasso.coef_[0] != 0]
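# Optional sketch: count how aggressively the L1 penalty pruned features;
# C=0.1 is a fairly strong penalty, so a sparse set is expected.
print(f"L1 selection kept {len(selected_features_lasso)} of {X.shape[1]} features:")
print(list(selected_features_lasso))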
# ------------------------------
# 5. Visualization of Random Forest Importances
# ------------------------------
sns.barplot(x=importances[indices[:10]], y=X.columns[indices[:10]],
            palette="viridis")
plt.title("Top 10 Random Forest Feature Importances")
plt.show()
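# Optional sketch: a simple consistency check across the four methods;
# features chosen by every method are strong candidates for a reduced model.
common = (set(selected_features_kbest) & set(selected_features_rfe)
          & set(top_10_rf) & set(selected_features_lasso))
print("Features selected by all four methods:", sorted(common))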