# Lab 1 — FIND-S algorithm
import pandas as pd
# NOTE(review): this import was garbled in the paste; `files` is the Colab
# upload helper and is unused below — presumably intended for interactive use.
from google.colab import files
def find_s(data):
    """
    Implements the FIND-S algorithm to find the most specific hypothesis.

    Args:
        data (pd.DataFrame): Training data; every column except the last
            holds attribute values, the last column is the target where
            'yes' marks a positive example.

    Returns:
        list or None: The most specific hypothesis consistent with all
        positive examples, or None when there are no positive examples.
    """
    target = data.iloc[:, -1]
    positive_examples = data[target == 'yes']
    if positive_examples.empty:
        return None
    # Start from the first positive example and generalize attribute by
    # attribute: any disagreement with a later positive example becomes '?'.
    hypothesis = positive_examples.iloc[0, :-1].tolist()
    for _, row in positive_examples.iloc[1:].iterrows():
        example = row.iloc[:-1].tolist()
        for i, value in enumerate(example):
            if hypothesis[i] != value:
                hypothesis[i] = '?'
    return hypothesis
def main():
    """
    Reads training data from a CSV file and demonstrates the FIND-S
    algorithm.
    """
    try:
        # NOTE(review): the original path string was garbled; assumes the
        # Colab upload location of the enjoySport dataset — adjust as needed.
        filepath = r"/content/enjoysport.csv"
        data = pd.read_csv(filepath)
        hypothesis = find_s(data)
        if hypothesis:
            print("Most Specific Hypothesis (FIND-S):", hypothesis)
        else:
            print("No positive examples found in the dataset.")
    except FileNotFoundError:
        print("Error: The specified CSV file was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    main()
# Lab 2 — Candidate Elimination algorithm
import pandas as pd
def initialize_hypotheses(num_attributes):
    """Return the initial (most specific, most general) hypothesis pair.

    Args:
        num_attributes (int): Number of attribute columns.

    Returns:
        tuple: ('Φ'-filled specific hypothesis, list holding the single
        all-'?' general hypothesis).
    """
    specific_h = ['Φ'] * num_attributes
    general_h = [['?'] * num_attributes]
    return specific_h, general_h
def update_specific(specific_h, instance):
    """Generalize the specific hypothesis just enough to cover a positive
    instance.

    Args:
        specific_h (list): Current specific hypothesis (mutated in place).
        instance (list): Attribute values of a positive example.

    Returns:
        list: The updated specific hypothesis.
    """
    for i in range(len(specific_h)):
        if specific_h[i] == 'Φ':
            # First positive example fills in the concrete attribute value.
            specific_h[i] = instance[i]
        elif specific_h[i] != instance[i]:
            # Conflicting value: relax the attribute to "don't care".
            specific_h[i] = '?'
    return specific_h
def update_general(general_h, specific_h, instance):
    """Relax general hypotheses that conflict with the specific hypothesis.

    For every position whose value is concrete and disagrees with the
    specific hypothesis, emit a copy with that position replaced by '?'.
    Copies identical to the specific hypothesis are dropped.

    Args:
        general_h (list): Current list of general hypotheses.
        specific_h (list): Current specific hypothesis.
        instance (list): Attribute values of the current example (unused
            here; kept for interface compatibility).

    Returns:
        list: The relaxed general hypotheses.
    """
    new_general_h = []
    for hypothesis in general_h:
        for i in range(len(hypothesis)):
            if hypothesis[i] != specific_h[i] and hypothesis[i] != '?':
                new_hypothesis = hypothesis[:]
                new_hypothesis[i] = '?'
                new_general_h.append(new_hypothesis)
    return [h for h in new_general_h if h != specific_h]
def candidate_elimination(data):
    """
    Runs the Candidate Elimination algorithm over a labelled dataset.

    Args:
        data (pd.DataFrame): Attribute columns followed by a final target
            column whose values are 'yes' / 'no'.

    Returns:
        tuple: (final specific hypothesis, list of general hypotheses).
    """
    num_attributes = len(data.columns) - 1
    specific_h, general_h = initialize_hypotheses(num_attributes)
    for _, row in data.iterrows():
        instance = row.iloc[:-1].tolist()
        label = row.iloc[-1]
        if label == 'yes':
            specific_h = update_specific(specific_h, instance)
            # Keep only general hypotheses consistent with the positive
            # example.
            general_h = [h for h in general_h if all(
                h[i] == '?' or h[i] == instance[i]
                for i in range(num_attributes)
            )]
        elif label == 'no':
            # Drop general hypotheses that wrongly cover the negative
            # example.
            general_h = [h for h in general_h if any(
                h[i] != '?' and h[i] != instance[i]
                for i in range(num_attributes)
            )]
    return specific_h, general_h
def main():
    """Load the training CSV and print the final hypothesis boundaries."""
    # NOTE(review): the original path string was garbled; assumes a Colab
    # upload location — adjust to the actual CSV file name.
    filepath = r"/content/enjoysport.csv"
    data = pd.read_csv(filepath)
    specific_h, general_h = candidate_elimination(data)
    print("Final Specific Hypothesis:", specific_h)
    print("Final General Hypothesis:", general_h)


if __name__ == "__main__":
    main()
# Lab 3 — ID3 decision tree
import pandas as pd
from math import log2
def entropy(target_col):
    """
    Calculates the entropy of the target column.

    Args:
        target_col (pd.Series): The target column.

    Returns:
        float: Entropy in bits (0.0 when the column is pure).
    """
    probabilities = target_col.value_counts(normalize=True)
    # value_counts never yields zero probabilities, so log2 is safe here.
    return -sum(p * log2(p) for p in probabilities)
def information_gain(data, split_attribute_name, target_attribute_name):
    """
    Calculates information gain for a split attribute.

    Args:
        data (pd.DataFrame): The dataset.
        split_attribute_name (str): The name of the attribute to split on.
        target_attribute_name (str): The name of the target column.

    Returns:
        float: The information gain value.
    """
    total_entropy = entropy(data[target_attribute_name])
    weighted_entropy = 0.0
    # Weight each subset's entropy by its share of the rows.
    for value in data[split_attribute_name].unique():
        subset = data[data[split_attribute_name] == value]
        subset_entropy = entropy(subset[target_attribute_name])
        weighted_entropy += (len(subset) / len(data)) * subset_entropy
    return total_entropy - weighted_entropy
def id3(data, features, target_attribute_name):
    """
    Recursively builds the decision tree using the ID3 algorithm.

    Args:
        data (pd.DataFrame): The dataset.
        features (list): List of candidate feature names.
        target_attribute_name (str): The name of the target column.

    Returns:
        dict or scalar: A nested {feature: {value: subtree}} dict, or a
        class label at a leaf.
    """
    # Pure node: every remaining row shares one label.
    if len(data[target_attribute_name].unique()) == 1:
        return data[target_attribute_name].iloc[0]
    # No attributes left to split on: fall back to the majority class.
    if not features:
        return data[target_attribute_name].mode()[0]
    info_gains = {
        feature: information_gain(data, feature, target_attribute_name)
        for feature in features
    }
    best_feature = max(info_gains, key=info_gains.get)
    tree = {best_feature: {}}
    remaining_features = [f for f in features if f != best_feature]
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        subtree = id3(subset, remaining_features, target_attribute_name)
        tree[best_feature][value] = subtree
    return tree
if __name__ == "__main__":
    # Mitchell's classic PlayTennis toy dataset.
    data_dict = {
        'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain',
                    'Overcast', 'Sunny', 'Sunny', 'Rain', 'Sunny',
                    'Overcast', 'Overcast', 'Rain'],
        'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool',
                        'Cool', 'Mild', 'Cool', 'Mild', 'Mild', 'Mild',
                        'Hot', 'Mild'],
        'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal',
                     'Normal', 'High', 'Normal', 'Normal', 'Normal',
                     'High', 'Normal', 'High'],
        'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong',
                 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong',
                 'Weak', 'Strong'],
        'PlayTennis': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
                       'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No'],
    }
    data = pd.DataFrame(data_dict)
    # Every column except the target is a candidate split attribute.
    features = list(data.columns[:-1])
    target = 'PlayTennis'
    decision_tree = id3(data, features, target)
    print("Decision Tree:")
    print(decision_tree)
# Lab 8 — Gaussian Mixture Model clustering
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn import preprocessing
import matplotlib.pyplot as plt

# Replace 'your_data.csv' with your file.
df = pd.read_csv('/content/your_data.csv')

# Separate features and target.
features = df.drop('enjoySport', axis=1)
target = df['enjoySport']

# Identify categorical columns.
categorical_features = features.select_dtypes(include=['object']).columns

# Apply one-hot encoding to categorical features.
features_encoded = pd.get_dummies(features, columns=categorical_features)

# Scale the numerical features.
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(features_encoded)
X_scaled = pd.DataFrame(X_scaled, columns=features_encoded.columns)

n_clusters = 3  # Replace with your desired number of clusters.
gmm = GaussianMixture(n_components=n_clusters)
gmm.fit(X_scaled)
cluster_labels = gmm.predict(X_scaled)
df['cluster'] = cluster_labels

# Visualize the clusters using the first two scaled features for
# simplicity. You might need dimensionality reduction (like PCA) for
# more than 2 features.
plt.figure(figsize=(8, 6))
plt.scatter(X_scaled.iloc[:, 0], X_scaled.iloc[:, 1], c=cluster_labels,
            cmap='viridis')
plt.title('Clusters')
plt.xlabel('Scaled Feature 1')
plt.ylabel('Scaled Feature 2')
plt.show()
# Lab 9 — k-Nearest Neighbours on the Iris dataset
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the Iris dataset.
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Fit the scaler on the training split only to avoid data leakage.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

k = 3
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Lab 10 — Locally Weighted Regression
import numpy as np
import matplotlib.pyplot as plt
def gaussian_kernel(x, x0, tau):
    """
    Computes the Gaussian kernel value.

    Args:
        x: Data point (scalar or array of attribute values).
        x0: Query point.
        tau: Bandwidth parameter controlling how quickly weights decay
            with distance.

    Returns:
        float: Kernel weight in (0, 1].
    """
    return np.exp(-np.sum((x - x0) ** 2) / (2 * tau ** 2))
def compute_weights(X, x0, tau):
    """
    Computes the weights for each data point relative to the query point.

    Args:
        X: Input data points; first axis indexes the m samples.
        x0: Query point.
        tau: Bandwidth parameter.

    Returns:
        np.ndarray: (m, m) diagonal weight matrix.
    """
    m = X.shape[0]
    weights = np.zeros(m)
    for i in range(m):
        weights[i] = gaussian_kernel(X[i], x0, tau)
    return np.diag(weights)
def locally_weighted_regression(X, y, x0, tau):
    """
    Fits a weighted linear regression model around the query point.

    Args:
        X: Input data points, shape (m, 1) — presumably; see caller.
        y: Output data points, shape (m,).
        x0: Query point.
        tau: Bandwidth parameter.

    Returns:
        float: Predicted value at the query point.
    """
    # Prepend a bias column so theta[0] is the intercept.
    X_b = np.c_[np.ones((X.shape[0], 1)), X]
    x0_b = np.r_[1, x0]
    W = compute_weights(X, x0, tau)
    # Solve the weighted normal equations; pinv stays stable when the
    # weighted design matrix is near-singular (tiny tau).
    theta = np.linalg.pinv(X_b.T @ W @ X_b) @ (X_b.T @ W @ y)
    return x0_b @ theta
def plot_lwr(X, y, tau):
    """
    Predicts over a dense grid and plots the LWR fit against the data.

    Args:
        X: Input data points.
        y: Output data points.
        tau: Bandwidth parameter.
    """
    # Evaluate the local fit at 300 evenly spaced query points.
    X_range = np.linspace(np.min(X), np.max(X), 300)
    y_pred = [locally_weighted_regression(X, y, x0, tau)
              for x0 in X_range]
    plt.scatter(X, y, color='blue', label='Data points')
    plt.plot(X_range, y_pred, color='red', label='LWR fit')
    plt.xlabel('X')
    plt.ylabel('y')
    plt.legend()
    plt.title('Locally Weighted Regression')
    plt.show()
if __name__ == '__main__':
    # Noisy sine curve: 100 sorted points in [0, 5).
    X = np.sort(5 * np.random.rand(100, 1), axis=0)
    y = np.sin(X).ravel() + 0.3 * np.random.randn(100)
    tau = 0.08
    plot_lwr(X, y, tau)