ML Internal questions
Q1)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Load dataset
# Replace 'fish_data.csv' with the actual file path
# Ensure that the dataset contains columns including 'Weight' and relevant
# features like 'Length', 'Height', etc.
data = pd.read_csv('fish_data.csv')
# Store results
results = []
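# (Assumed step, not in the original: select features/target, split, and fit the
#  model used below. 'Length1', 'Height', 'Width' are placeholder column names.)
X = data[['Length1', 'Height', 'Width']]
y = data['Weight']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)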
# Make predictions
y_pred = model.predict(X_test)
# Evaluate performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
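# (Assumed step: record the metrics so results_df below has content.)
results.append({'Model': 'LinearRegression', 'MSE': mse, 'R2': r2})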
# Display results
results_df = pd.DataFrame(results)
print(results_df)
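# (Assumed helper, not shown in the original: a minimal explain_results sketch.)
def explain_results(results):
    for r in results:
        print(f"{r['Model']}: MSE={r['MSE']:.2f}, R2={r['R2']:.2f} "
              "(higher R2 and lower MSE indicate a better fit)")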
explain_results(results)
Q2)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
# Load dataset
# Replace 'fish_data.csv' with the actual file path
# Ensure that the dataset contains columns including 'Weight' and relevant
# features like 'Length', 'Height', etc.
data = pd.read_csv('fish_data.csv')
# Store results
results = []
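# (Assumed step, not in the original: split the data and fit polynomial-regression
#  pipelines of several degrees, recording each model's test metrics. The feature
#  columns are placeholders.)
X = data[['Length1', 'Height', 'Width']]
y = data['Weight']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
for degree in [1, 2, 3]:
    pipeline = Pipeline([
        ('poly', PolynomialFeatures(degree=degree)),
        ('linreg', LinearRegression())
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    results.append({
        'Degree': degree,
        'MSE': mean_squared_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred)
    })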
# Display results
results_df = pd.DataFrame(results)
print(results_df)
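# (Assumed helper, not shown in the original: a minimal explain_results sketch.)
def explain_results(results):
    best = min(results, key=lambda r: r['MSE'])
    print(f"Degree {best['Degree']} gives the lowest MSE ({best['MSE']:.2f}) "
          f"with R2 = {best['R2']:.2f}; higher degrees risk overfitting.")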
explain_results(results)
Q3)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Load dataset
# Replace 'fish_data.csv' with the actual file path
# Ensure that the dataset contains columns including 'Species', 'Length1',
# 'Length2', 'Length3', 'Height', and 'Width'
data = pd.read_csv('fish_data.csv')
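# (Assumed step, not in the original: encode the species labels, split, and fit an
#  SVM; 'rbf' is shown as one example kernel of the several being compared.)
le = LabelEncoder()
X = data[['Length1', 'Length2', 'Length3', 'Height', 'Width']]
y = le.fit_transform(data['Species'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)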
# Make predictions
y_pred = svm_model.predict(X_test)
# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
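# (Assumed step, not in the original: record this kernel's result so best_result
#  exists for explain_findings below; with several kernels this list would
#  accumulate one entry per kernel.)
results = [{'Kernel': 'rbf', 'Accuracy': accuracy, 'Confusion Matrix': conf_matrix}]
best_result = max(results, key=lambda r: r['Accuracy'])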
# Explanation of findings
def explain_findings(best_result):
    print("\nExplanation of Findings:")
    print(f"The best-performing kernel is '{best_result['Kernel']}' "
          f"with an accuracy of {best_result['Accuracy']:.2f}.")
    print(f"This suggests that the '{best_result['Kernel']}' kernel is most "
          "effective at capturing the relationship between the features and "
          "the target variable.")
explain_findings(best_result)
Q4)
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Store results
results = []
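# (Assumed step, not in the original: load and split the wine data, then loop over
#  scaler/kernel combinations; the prediction and evaluation lines below run
#  inside this loop.)
wine = load_wine()
X_train, X_test, y_train, y_test = train_test_split(
    wine.data, wine.target, test_size=0.2, random_state=42)
scalers = {'StandardScaler': StandardScaler(),
           'MinMaxScaler': MinMaxScaler(),
           'RobustScaler': RobustScaler()}
for scaler_name, scaler in scalers.items():
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    for kernel in ['linear', 'rbf', 'poly']:
        svm = SVC(kernel=kernel, random_state=42)
        svm.fit(X_train_scaled, y_train)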
        # Make predictions
        y_pred = svm.predict(X_test_scaled)
        # Evaluate performance
        accuracy = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)
        results.append({
            'Scaler': scaler_name,
            'Kernel': kernel,
            'Accuracy': accuracy,
            'Confusion Matrix': cm
        })
# Display results
best_result = max(results, key=lambda x: x['Accuracy'])
print("Best Result:")
print(f"Scaler: {best_result['Scaler']}")
print(f"Kernel: {best_result['Kernel']}")
print(f"Accuracy: {best_result['Accuracy']:.4f}")
print("Confusion Matrix:")
print(best_result['Confusion Matrix'])
# Summary of findings
print("\nSummary of Findings:")
print(f"The best performance was achieved with the {best_result['Scaler']} scaling "
      f"method and the {best_result['Kernel']} kernel. This scaling method is "
      "likely best suited for the dataset because it effectively handles feature "
      "distributions and scales them appropriately for the SVM kernel.")
Q5)
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             confusion_matrix, classification_report)
import seaborn as sns
import matplotlib.pyplot as plt
# Evaluate models
metrics = {}
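# (Assumed step, not in the original: load and scale the wine data, then fit an
#  OvR and an OvO wrapper around LogisticRegression and record their metrics;
#  'macro' averaging is one reasonable choice for this multiclass problem.)
wine = load_wine()
X_train, X_test, y_train, y_test = train_test_split(
    wine.data, wine.target, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
models = {'OvR': OneVsRestClassifier(LogisticRegression(max_iter=1000)),
          'OvO': OneVsOneClassifier(LogisticRegression(max_iter=1000))}
for strategy, clf in models.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    metrics[strategy] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='macro'),
        'Recall': recall_score(y_test, y_pred, average='macro'),
        'Confusion Matrix': confusion_matrix(y_test, y_pred)
    }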
# Display results
for strategy, result in metrics.items():
    print(f"\n{strategy} Strategy:")
    print(f"Accuracy: {result['Accuracy']:.4f}")
    print(f"Precision: {result['Precision']:.4f}")
    print(f"Recall: {result['Recall']:.4f}")
    print("Confusion Matrix:")
    print(result['Confusion Matrix'])
# Summary of findings
if metrics['OvR']['Accuracy'] > metrics['OvO']['Accuracy']:
    better_strategy = 'One-vs-Rest (OvR)'
else:
    better_strategy = 'One-vs-One (OvO)'
print("\nSummary of Findings:")
print(f"The better-performing strategy is {better_strategy}. OvR is generally simpler and may perform well
on datasets with a large number of classes, whereas OvO could be more computationally intensive but
might better capture pairwise class distinctions.")
Q6)
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
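# (Assumed step, not in the original: load and scale the wine data, then sweep the
#  number of principal components; the results.append below runs inside this loop.)
wine = load_wine()
X_train, X_test, y_train, y_test = train_test_split(
    wine.data, wine.target, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
results = []
for n_components in range(1, X_train.shape[1] + 1):
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train_pca, y_train)
    y_pred = clf.predict(X_test_pca)
    accuracy = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)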
    # Store results (runs inside the loop above)
    results.append({
        'n_components': n_components,
        'Accuracy': accuracy,
        'Confusion Matrix': cm
    })
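# (Assumed step: select the best component count so best_result exists below.)
best_result = max(results, key=lambda r: r['Accuracy'])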
# Summary of findings
print("\nSummary of Findings:")
print(f"The optimal number of principal components is {best_result['n_components']}, "
      f"achieving an accuracy of {best_result['Accuracy']:.4f}. PCA helps reduce "
      "the dimensionality of the dataset while retaining essential information, "
      "improving model performance or interpretability.")
Q7)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Load dataset
# Replace ’your_dataset.csv’ with the actual file path
data = pd.read_csv('your_dataset.csv')
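# (Assumed step, not in the original: select the numeric features, standardize
#  them, and choose candidate cluster counts; names here are placeholders.)
features_to_plot = data.select_dtypes(include=np.number).columns.tolist()
scaler = StandardScaler()
normalized_data = scaler.fit_transform(data[features_to_plot])
clusters = [2, 3, 4]
kmeans_results = {}
kmeans_models = {}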
for k in clusters:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(normalized_data)
    kmeans_results[k] = labels
    kmeans_models[k] = kmeans  # keep each fitted model for the interpretation step
    data[f'Cluster_{k}'] = labels
# Interpretation of clusters (centers mapped back to the original feature scale)
for k in clusters:
    print(f"\nCluster Centers for K={k}:")
    centers = pd.DataFrame(
        scaler.inverse_transform(kmeans_models[k].cluster_centers_),
        columns=features_to_plot
    )
    print(centers)
# Save the results to a CSV file
data.to_csv('clustered_data.csv', index=False)
Q8)
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
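# (Assumed step, not in the original: load the data; 'your_dataset.csv' is a
#  placeholder path.)
data = pd.read_csv('your_dataset.csv')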
# Data Visualization
# Histogram for each numerical feature
data.hist(figsize=(12, 8))
plt.tight_layout()
plt.show()
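# (Assumed step, not in the original: standardize the numeric features, run
#  DBSCAN, and project to 2D with PCA for the scatter plot below. The eps and
#  min_samples values are illustrative, not tuned.)
scaled = StandardScaler().fit_transform(data.select_dtypes(include=np.number))
clusters = DBSCAN(eps=0.5, min_samples=5).fit_predict(scaled)
pca_data = PCA(n_components=2).fit_transform(scaled)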
plt.figure(figsize=(8, 6))
plt.scatter(pca_data[:, 0], pca_data[:, 1], c=clusters, cmap='viridis', s=50, alpha=0.6)
plt.title('DBSCAN Clustering Results')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(label='Cluster Label')
plt.show()
# Interpretation of results
print("\nDBSCAN Clustering Results:")
unique_clusters = np.unique(clusters)
print(f"Number of clusters: {len(unique_clusters) - (1 if -1 in clusters else 0)}")
print(f"Noise points: {np.sum(clusters == -1)}")
Q9)
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
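# Load dataset
# (Assumed step, not in the original: 'adult.csv' is a placeholder path for the
#  Adult income dataset, whose 'income' column holds '<=50K'/'>50K' labels.)
df = pd.read_csv('adult.csv')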
# Handle missing values (fill with median for numerical features and mode for categorical features)
imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')
df['age'] = imputer_num.fit_transform(df[['age']])
df['education'] = imputer_cat.fit_transform(df[['education']])
df['occupation'] = imputer_cat.fit_transform(df[['occupation']])
# Evaluate models
def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, pos_label='>50K')
    recall = recall_score(y_true, y_pred, pos_label='>50K')
    f1 = f1_score(y_true, y_pred, pos_label='>50K')
    return accuracy, precision, recall, f1
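# (Assumed step, not in the original: label-encode the categorical features,
#  split, scale, and train the two models compared below. 'income' is assumed to
#  be the string target column, so pos_label='>50K' above applies.)
for col in df.select_dtypes(include='object').columns.drop('income'):
    df[col] = LabelEncoder().fit_transform(df[col])
X = df.drop('income', axis=1)
y = df['income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
log_reg_metrics = evaluate_model(y_test, log_reg.predict(X_test))
tree_metrics = evaluate_model(y_test, tree.predict(X_test))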
# Comparison of models
if log_reg_metrics[3] > tree_metrics[3]:
best_model = "Logistic Regression"
best_metrics = log_reg_metrics
else:
best_model = "Decision Tree"
best_metrics = tree_metrics
print(f"\nBest model: {best_model} with F1 Score: {best_metrics[3]}")
Q10)
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
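# Load dataset
# (Assumed step, not in the original: the file path and the 'target' column name
#  are placeholders. Categorical features are one-hot encoded with the imported
#  OneHotEncoder; the result is a sparse matrix, which LogisticRegression accepts.)
df = pd.read_csv('your_dataset.csv')
y = LabelEncoder().fit_transform(df['target'])
encoder = OneHotEncoder(handle_unknown='ignore')
X = encoder.fit_transform(df.drop('target', axis=1))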
# Split data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
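# (Assumed step: fit the model and report the metrics implied by the imports;
#  'macro' averaging is one reasonable choice for a possibly multiclass target.)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Accuracy:  {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='macro'):.4f}")
print(f"Recall:    {recall_score(y_test, y_pred, average='macro'):.4f}")
print(f"F1 Score:  {f1_score(y_test, y_pred, average='macro'):.4f}")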