ML Internal questions

Q1)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset
# Replace 'fish_data.csv' with the actual file path
# Ensure that the dataset contains columns including 'Weight' and relevant features like 'Length', 'Height', etc.
data = pd.read_csv('fish_data.csv')

# Separate features and target variable
X = data.drop(columns=['Weight'])  # Input features
y = data['Weight']                 # Target variable

# Define train-test split ratios
split_ratios = [(0.8, 0.2), (0.7, 0.3), (0.6, 0.4)]

# Store results
results = []

# Train and evaluate the model for each split ratio
for train_ratio, test_ratio in split_ratios:
    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=42)

    # Initialize and train the model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate performance
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Store the results
    results.append({
        'Train Ratio': train_ratio,
        'Test Ratio': test_ratio,
        'MSE': mse,
        'R^2': r2
    })

# Display results
results_df = pd.DataFrame(results)
print(results_df)

# Compare and explain results
def explain_results(results):
    print("\nModel Performance Explanation:")
    for result in results:
        train_ratio = result['Train Ratio']
        test_ratio = result['Test Ratio']
        mse = result['MSE']
        r2 = result['R^2']
        print(f"Train-Test Split ({train_ratio*100}% - {test_ratio*100}%):")
        print(f"  Mean Squared Error (MSE): {mse:.2f}")
        print(f"  R-squared (R^2): {r2:.2f}\n")

explain_results(results)
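For reference, the MSE and R^2 reported above can be recomputed directly from the residuals. A minimal sketch (assuming y_test and y_pred from the last split in the loop are still in scope):

import numpy as np

# Minimal sketch: recompute the two metrics by hand.
y_true = np.asarray(y_test)
residuals = y_true - np.asarray(y_pred)
mse_manual = np.mean(residuals ** 2)                # mean squared error
ss_res = np.sum(residuals ** 2)                     # residual sum of squares
ss_tot = np.sum((y_true - y_true.mean()) ** 2)      # total sum of squares
r2_manual = 1 - ss_res / ss_tot                     # coefficient of determination
print(f"Manual MSE: {mse_manual:.2f}, Manual R^2: {r2_manual:.2f}")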

Q2)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset
# Replace 'fish_data.csv' with the actual file path
# Ensure that the dataset contains columns including 'Weight' and relevant features like 'Length', 'Height', etc.
data = pd.read_csv('fish_data.csv')

# Separate features and target variable
X = data.drop(columns=['Weight'])  # Input features
y = data['Weight']                 # Target variable

# Define train-test split ratios
split_ratios = [(0.8, 0.2), (0.7, 0.3), (0.6, 0.4)]

# Store results
results = []

# Train and evaluate the models for each split ratio
for train_ratio, test_ratio in split_ratios:
    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=42)

    # Simple Linear Regression
    linear_model = LinearRegression()
    linear_model.fit(X_train, y_train)
    y_pred_linear = linear_model.predict(X_test)
    mse_linear = mean_squared_error(y_test, y_pred_linear)
    r2_linear = r2_score(y_test, y_pred_linear)

    # Polynomial Regression (degree 2)
    poly_pipeline = Pipeline([
        ('poly_features', PolynomialFeatures(degree=2)),
        ('linear_model', LinearRegression())
    ])
    poly_pipeline.fit(X_train, y_train)
    y_pred_poly = poly_pipeline.predict(X_test)
    mse_poly = mean_squared_error(y_test, y_pred_poly)
    r2_poly = r2_score(y_test, y_pred_poly)

    # Store the results
    results.append({
        'Train Ratio': train_ratio,
        'Test Ratio': test_ratio,
        'Linear MSE': mse_linear,
        'Linear R^2': r2_linear,
        'Poly MSE': mse_poly,
        'Poly R^2': r2_poly
    })

# Display results
results_df = pd.DataFrame(results)
print(results_df)

# Compare and explain results
def explain_results(results):
    print("\nModel Performance Comparison:")
    for result in results:
        train_ratio = result['Train Ratio']
        test_ratio = result['Test Ratio']
        mse_linear = result['Linear MSE']
        r2_linear = result['Linear R^2']
        mse_poly = result['Poly MSE']
        r2_poly = result['Poly R^2']
        print(f"Train-Test Split ({train_ratio*100}% - {test_ratio*100}%):")
        print(f"  Linear Regression - MSE: {mse_linear:.2f}, R^2: {r2_linear:.2f}")
        print(f"  Polynomial Regression - MSE: {mse_poly:.2f}, R^2: {r2_poly:.2f}\n")

explain_results(results)
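To make the polynomial pipeline concrete, here is a small hedged sketch on a made-up two-feature sample showing what PolynomialFeatures(degree=2) feeds into the linear model (the feature names are illustrative; get_feature_names_out requires scikit-learn 1.0+):

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# Hypothetical sample with two features, e.g. [Length, Height]
sample = np.array([[25.0, 10.0]])
poly = PolynomialFeatures(degree=2)
expanded = poly.fit_transform(sample)
# Degree-2 expansion: 1, Length, Height, Length^2, Length*Height, Height^2
print(poly.get_feature_names_out(['Length', 'Height']))
print(expanded)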

Q3)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Load dataset
# Replace 'fish_data.csv' with the actual file path
# Ensure that the dataset contains columns including 'Species', 'Length1', 'Length2', 'Length3', 'Height', and 'Width'
data = pd.read_csv('fish_data.csv')

# Encode the Species column into numerical values
label_encoder = LabelEncoder()
data['Species'] = label_encoder.fit_transform(data['Species'])

# Separate features and target variable
X = data[['Length1', 'Length2', 'Length3', 'Height', 'Width']]  # Input features
y = data['Species']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define SVM kernels to evaluate
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
results = []

# Train and evaluate the SVM model for each kernel
for kernel in kernels:
    # Initialize and train the model
    svm_model = SVC(kernel=kernel, random_state=42)
    svm_model.fit(X_train, y_train)

    # Make predictions
    y_pred = svm_model.predict(X_test)

    # Evaluate performance
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Store the results
    results.append({
        'Kernel': kernel,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix
    })

    # Print confusion matrix for the current kernel
    print(f"\nKernel: {kernel}")
    print(f"Accuracy: {accuracy:.2f}")
    print("Confusion Matrix:")
    print(conf_matrix)

    # Plot confusion matrix
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
    plt.title(f"Confusion Matrix - Kernel: {kernel}")
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

# Identify the best-performing kernel
best_result = max(results, key=lambda x: x['Accuracy'])
print(f"\nBest Kernel: {best_result['Kernel']}")
print(f"Accuracy: {best_result['Accuracy']:.2f}")

# Explanation of findings
def explain_findings(best_result):
    print("\nExplanation of Findings:")
    print(f"The best-performing kernel is '{best_result['Kernel']}' with an accuracy of {best_result['Accuracy']:.2f}.")
    print(f"This suggests that the '{best_result['Kernel']}' kernel is most effective at capturing the relationship between the features and the target variable.")

explain_findings(best_result)

Q4)
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Wine dataset
data = load_wine()
X = data.data
y = data.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define scaling techniques
scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler()
}

# Define SVM kernels
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

# Store results
results = []

for scaler_name, scaler in scalers.items():
    # Scale the data
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    for kernel in kernels:
        # Train SVM model
        svm = SVC(kernel=kernel, random_state=42)
        svm.fit(X_train_scaled, y_train)

        # Make predictions
        y_pred = svm.predict(X_test_scaled)

        # Evaluate performance
        accuracy = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)

        results.append({
            'Scaler': scaler_name,
            'Kernel': kernel,
            'Accuracy': accuracy,
            'Confusion Matrix': cm
        })

# Display results
best_result = max(results, key=lambda x: x['Accuracy'])

print("Best Result:")
print(f"Scaler: {best_result['Scaler']}")
print(f"Kernel: {best_result['Kernel']}")
print(f"Accuracy: {best_result['Accuracy']:.4f}")
print("Confusion Matrix:")
print(best_result['Confusion Matrix'])

# Visualize confusion matrix of the best result
sns.heatmap(best_result['Confusion Matrix'], annot=True, fmt='d', cmap='Blues',
            xticklabels=data.target_names, yticklabels=data.target_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title(f"Best Combination: {best_result['Scaler']} + {best_result['Kernel']}\nAccuracy: {best_result['Accuracy']:.4f}")
plt.show()

# Summary of findings
print("\nSummary of Findings:")
print(f"The best performance was achieved with the {best_result['Scaler']} scaling method and the "
      f"{best_result['Kernel']} kernel. This scaling method is likely best suited for the dataset because "
      "it effectively handles feature distributions and scales them appropriately for the SVM kernel.")
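As a small illustration of how the three scalers differ, here is a hedged sketch on a made-up feature column containing an outlier (not part of the assignment data):

import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Toy feature column with an outlier to show how each scaler reacts.
feature = np.array([[1.0], [2.0], [3.0], [4.0], [100.0]])
for name, scaler in [('StandardScaler', StandardScaler()),
                     ('MinMaxScaler', MinMaxScaler()),
                     ('RobustScaler', RobustScaler())]:
    print(name, scaler.fit_transform(feature).ravel().round(2))
# StandardScaler centers on the mean and divides by the standard deviation,
# MinMaxScaler maps values to [0, 1], and RobustScaler uses the median and IQR,
# so it is the least affected by the outlier.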

Q5)
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Load the Wine dataset
data = load_wine()
X = data.data
y = data.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train logistic regression using One-vs-Rest (OvR) strategy
ovr_model = OneVsRestClassifier(LogisticRegression(random_state=42, max_iter=10000))
ovr_model.fit(X_train_scaled, y_train)
ovr_preds = ovr_model.predict(X_test_scaled)

# Train logistic regression using One-vs-One (OvO) strategy
ovo_model = OneVsOneClassifier(LogisticRegression(random_state=42, max_iter=10000))
ovo_model.fit(X_train_scaled, y_train)
ovo_preds = ovo_model.predict(X_test_scaled)

# Evaluate models
metrics = {}

# One-vs-Rest (OvR) metrics
metrics['OvR'] = {
    'Accuracy': accuracy_score(y_test, ovr_preds),
    'Precision': precision_score(y_test, ovr_preds, average='weighted'),
    'Recall': recall_score(y_test, ovr_preds, average='weighted'),
    'Confusion Matrix': confusion_matrix(y_test, ovr_preds)
}

# One-vs-One (OvO) metrics
metrics['OvO'] = {
    'Accuracy': accuracy_score(y_test, ovo_preds),
    'Precision': precision_score(y_test, ovo_preds, average='weighted'),
    'Recall': recall_score(y_test, ovo_preds, average='weighted'),
    'Confusion Matrix': confusion_matrix(y_test, ovo_preds)
}

# Display results
for strategy, result in metrics.items():
    print(f"\n{strategy} Strategy:")
    print(f"Accuracy: {result['Accuracy']:.4f}")
    print(f"Precision: {result['Precision']:.4f}")
    print(f"Recall: {result['Recall']:.4f}")
    print("Confusion Matrix:")
    print(result['Confusion Matrix'])

    sns.heatmap(result['Confusion Matrix'], annot=True, fmt='d', cmap='Blues',
                xticklabels=data.target_names, yticklabels=data.target_names)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f"{strategy} Strategy Confusion Matrix")
    plt.show()

# Summary of findings
if metrics['OvR']['Accuracy'] > metrics['OvO']['Accuracy']:
    better_strategy = 'One-vs-Rest (OvR)'
else:
    better_strategy = 'One-vs-One (OvO)'

print("\nSummary of Findings:")
print(f"The better-performing strategy is {better_strategy}. OvR is generally simpler and may perform well "
      "on datasets with a large number of classes, whereas OvO can be more computationally intensive but "
      "may better capture pairwise class distinctions.")
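A quick way to see the structural difference between the two strategies is to count the fitted binary classifiers (a short sketch reusing the ovr_model and ovo_model trained above):

# OvR fits one binary classifier per class; OvO fits one per pair of classes.
n_classes = len(data.target_names)
print(f"Classes: {n_classes}")
print(f"OvR estimators: {len(ovr_model.estimators_)}")   # expected: n_classes
print(f"OvO estimators: {len(ovo_model.estimators_)}")   # expected: n_classes * (n_classes - 1) / 2
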
Q6)
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Wine dataset
data = load_wine()
X = data.data
y = data.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize results storage
results = []

# Try different numbers of principal components
for n_components in range(1, X.shape[1] + 1):
    # Apply PCA
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # Train Logistic Regression model
    model = LogisticRegression(random_state=42, max_iter=10000)
    model.fit(X_train_pca, y_train)

    # Evaluate the model
    y_pred = model.predict(X_test_pca)
    accuracy = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # Store results
    results.append({
        'n_components': n_components,
        'Accuracy': accuracy,
        'Confusion Matrix': cm
    })

# Find the best result
best_result = max(results, key=lambda x: x['Accuracy'])

# Print results
print("Optimal Number of Principal Components:")
print(f"Number of Components: {best_result['n_components']}")
print(f"Accuracy: {best_result['Accuracy']:.4f}")
print("Confusion Matrix:")
print(best_result['Confusion Matrix'])

# Plot accuracy vs. number of principal components
n_components = [result['n_components'] for result in results]
accuracies = [result['Accuracy'] for result in results]
plt.plot(n_components, accuracies, marker='o')
plt.title('Accuracy vs. Number of Principal Components')
plt.xlabel('Number of Principal Components')
plt.ylabel('Accuracy')
plt.grid()
plt.show()

# Visualize the confusion matrix for the best result
sns.heatmap(best_result['Confusion Matrix'], annot=True, fmt='d', cmap='Blues',
            xticklabels=data.target_names, yticklabels=data.target_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title(f"Confusion Matrix (n_components={best_result['n_components']})")
plt.show()

# Summary of findings
print("\nSummary of Findings:")
print(f"The optimal number of principal components is {best_result['n_components']}, achieving an "
      f"accuracy of {best_result['Accuracy']:.4f}. PCA helps reduce the dimensionality of the dataset "
      "while retaining essential information, improving model performance or interpretability.")
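To back up the claim about retained information, a short sketch of the cumulative explained variance on the scaled training data used above:

# Cumulative explained variance for a full PCA fit on the scaled training set.
pca_full = PCA().fit(X_train_scaled)
cumulative = np.cumsum(pca_full.explained_variance_ratio_)
for i, var in enumerate(cumulative, start=1):
    print(f"{i} components: {var:.2%} of variance retained")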

Q7)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load dataset
# Replace 'your_dataset.csv' with the actual file path
data = pd.read_csv('your_dataset.csv')

# Inspect the dataset
print("Dataset Head:")
print(data.head())
print("\nDataset Info:")
print(data.info())
print("\nSummary Statistics:")
print(data.describe())

# Visualize the distribution of key features
features_to_plot = ['Relative Compactness', 'Surface Area', 'Wall Area', 'Roof Area', 'Heating Load', 'Cooling Load']
for feature in features_to_plot:
    plt.figure(figsize=(6, 4))
    sns.histplot(data[feature], kde=True, bins=20)
    plt.title(f'Distribution of {feature}')
    plt.show()

# Pair plot for relationships between features
sns.pairplot(data[features_to_plot], diag_kind='kde')
plt.show()

# Check for missing values
print("\nMissing Values:")
print(data.isnull().sum())

# Handle missing values (if any)
data = data.dropna()

# Normalize the features for clustering
scaler = StandardScaler()
normalized_data = scaler.fit_transform(data[features_to_plot])

# Reduce to 2D with PCA for visualization
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(normalized_data)

# Perform K-Means clustering with 3 and 4 clusters
clusters = [3, 4]
kmeans_results = {}

for k in clusters:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(normalized_data)
    kmeans_results[k] = kmeans  # keep the fitted model for each k
    data[f'Cluster_{k}'] = labels

    # Visualize the clusters for this value of k
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=reduced_data[:, 0], y=reduced_data[:, 1], hue=labels, palette='Set1', s=50)
    plt.title(f'K-Means Clustering with {k} Clusters (PCA Reduced)')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.legend(title='Cluster')
    plt.show()

# Interpretation of clusters
for k in clusters:
    print(f"\nCluster Centers for K={k}:")
    centers = pd.DataFrame(
        scaler.inverse_transform(kmeans_results[k].cluster_centers_),
        columns=features_to_plot
    )
    print(centers)

# Save the results to a CSV file
data.to_csv('clustered_data.csv', index=False)

print("\nClustering completed. Results saved to 'clustered_data.csv'.")

Q8)
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA

# Load the dataset
data = pd.read_csv('building_energy_efficiency.csv')

# Display basic information about the dataset
print("Dataset Info:")
print(data.info())

# Display the first few rows of the dataset
print("\nFirst 5 Rows:")
print(data.head())

# Display summary statistics
print("\nSummary Statistics:")
print(data.describe())

# Check for missing values
print("\nMissing Values:")
print(data.isnull().sum())

# Data Visualization
# Histogram for each numerical feature
data.hist(figsize=(12, 8))
plt.tight_layout()
plt.show()

# Scatter plot matrix
sns.pairplot(data)
plt.show()

# Box plots for target variables
plt.figure(figsize=(10, 6))
sns.boxplot(data=data[['Relative Compactness', 'Surface Area', 'Wall Area', 'Roof Area', 'Heating Load', 'Cooling Load']])
plt.title('Box plots of features')
plt.show()

# Normalize the features
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Apply DBSCAN clustering
dbscan = DBSCAN(eps=0.3, min_samples=5)
clusters = dbscan.fit_predict(scaled_data)

# Add cluster labels to the dataset
data['Cluster'] = clusters

# Visualize the clusters using PCA for dimensionality reduction
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)

plt.figure(figsize=(8, 6))
plt.scatter(pca_data[:, 0], pca_data[:, 1], c=clusters, cmap='viridis', s=50, alpha=0.6)
plt.title('DBSCAN Clustering Results')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(label='Cluster Label')
plt.show()

# Interpretation of results
print("\nDBSCAN Clustering Results:")
unique_clusters = np.unique(clusters)
print(f"Number of clusters: {len(unique_clusters) - (1 if -1 in clusters else 0)}")
print(f"Noise points: {np.sum(clusters == -1)}")

# Provide interpretation based on clusters
for cluster in unique_clusters:
    if cluster != -1:
        print(f"Cluster {cluster}:")
        cluster_data = data[data['Cluster'] == cluster]
        print(cluster_data.describe())
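The eps=0.3 used above is a starting guess; a common heuristic for choosing eps is the k-distance plot. A hedged sketch using the scaled data from above:

from sklearn.neighbors import NearestNeighbors

# Sort each point's distance to its min_samples-th neighbor and look for the "elbow"
# as a candidate eps. n_neighbors = min_samples + 1 because each point counts itself
# at distance 0 when querying the training data.
neighbors = NearestNeighbors(n_neighbors=6).fit(scaled_data)
distances, _ = neighbors.kneighbors(scaled_data)
k_distances = np.sort(distances[:, -1])
plt.plot(k_distances)
plt.xlabel('Points sorted by distance')
plt.ylabel('Distance to 5th nearest neighbor')
plt.title('k-distance plot for choosing eps')
plt.show()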

Q9)
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
df = pd.read_csv('path_to_your_dataset.csv')

# Handle missing values (fill with median for numerical features and mode for categorical features)
imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

df['age'] = imputer_num.fit_transform(df[['age']])
df['education'] = imputer_cat.fit_transform(df[['education']])
df['occupation'] = imputer_cat.fit_transform(df[['occupation']])

# Encode categorical features
label_enc = LabelEncoder()
df['education'] = label_enc.fit_transform(df['education'])
df['occupation'] = label_enc.fit_transform(df['occupation'])

# Separate features and target variable
X = df.drop('income', axis=1)
y = df['income']

# Normalize the numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.select_dtypes(include=['int64', 'float64']))

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train Logistic Regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

# Train Decision Tree model
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
y_pred_tree = decision_tree.predict(X_test)

# Evaluate models
def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, pos_label='>50K')
    recall = recall_score(y_true, y_pred, pos_label='>50K')
    f1 = f1_score(y_true, y_pred, pos_label='>50K')
    return accuracy, precision, recall, f1

log_reg_metrics = evaluate_model(y_test, y_pred_log)
tree_metrics = evaluate_model(y_test, y_pred_tree)

print("Logistic Regression Metrics:")
print(f"Accuracy: {log_reg_metrics[0]}, Precision: {log_reg_metrics[1]}, Recall: {log_reg_metrics[2]}, F1 Score: {log_reg_metrics[3]}")

print("\nDecision Tree Metrics:")
print(f"Accuracy: {tree_metrics[0]}, Precision: {tree_metrics[1]}, Recall: {tree_metrics[2]}, F1 Score: {tree_metrics[3]}")

# Comparison of models
if log_reg_metrics[3] > tree_metrics[3]:
    best_model = "Logistic Regression"
    best_metrics = log_reg_metrics
else:
    best_model = "Decision Tree"
    best_metrics = tree_metrics

print(f"\nBest model: {best_model} with F1 Score: {best_metrics[3]}")

Q10)
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
data = pd.read_csv('car_evaluation.csv')

# Encode categorical features
# Define categorical columns
categorical_columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']

# Use One-Hot Encoding
# Drop one category per feature to avoid multicollinearity.
# Note: for scikit-learn < 1.2, use sparse=False instead of sparse_output=False.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_features = encoder.fit_transform(data[categorical_columns])

# Convert the encoded array back to a DataFrame
encoded_df = pd.DataFrame(encoded_features,
                          columns=encoder.get_feature_names_out(categorical_columns))

# Drop original categorical columns and concatenate with encoded features
data = data.drop(categorical_columns, axis=1)
data = pd.concat([data, encoded_df], axis=1)

# Split the data into features (X) and target (y)
X = data.drop('class', axis=1)
y = data['class']

# Split data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Logistic Regression model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
