Assignment 1

Uploaded by

AKANTO NANDI
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
3 views2 pages

Assignment 1

Uploaded by

AKANTO NANDI
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 2

import numpy as np
import pandas as pd

# Step 1: Load the dataset
data = pd.read_csv('ANandi.csv')

# Step 2: Standardize the data (mean = 0, variance = 1)
# Exclude non-numeric columns if present
numeric_data = data.select_dtypes(include=[np.number])
mean = numeric_data.mean()
std = numeric_data.std()
standardized_data = (numeric_data - mean) / std

# Step 3: Compute the covariance matrix
cov_matrix = np.cov(standardized_data, rowvar=False)

# Step 4: Perform eigen decomposition (eigh suits the symmetric
# covariance matrix and returns real eigenvalues)
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Step 5: Sort eigenvalues and eigenvectors in descending order
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

# Select the first two principal components
top_eigenvectors = eigenvectors[:, :2]

# Step 6: Transform the data by projecting onto the top components
data_reduced = np.dot(standardized_data, top_eigenvectors)

# Step 7: Save the reduced data
data_reduced_df = pd.DataFrame(data_reduced, columns=['PC1', 'PC2'])
data_reduced_df.to_csv('data_reduced.csv', index=False)

print("PCA completed. The first two principal components are saved in 'data_reduced.csv'.")

PCA completed. The first two principal components are saved in 'data_reduced.csv'.
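
A quick optional check, added here for illustration and not part of the original script: since eigenvalues is still in scope after the code above, the share of total variance captured by the first two components can be reported directly.

# Optional: fraction of total variance explained by the first two PCs
explained_variance_ratio = eigenvalues / eigenvalues.sum()
print("Variance explained by PC1 and PC2:", explained_variance_ratio[:2])
print("Total for the first two PCs:", explained_variance_ratio[:2].sum())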

import numpy as np
import pandas as pd

# Step 1: Load the reduced data
data = pd.read_csv('data_reduced.csv').to_numpy()

# Parameters
k = 3                 # Number of clusters (adjust as needed)
max_iterations = 100  # Maximum number of iterations
tolerance = 1e-4      # Convergence threshold

# Step 2: Initialize cluster centers by sampling k distinct data points
np.random.seed(42)  # For reproducibility
centers = data[np.random.choice(data.shape[0], k, replace=False)]

# Steps 3-5: K-Means algorithm
for iteration in range(max_iterations):
    # Step 3: Assign each point to the closest center
    distances = np.linalg.norm(data[:, np.newaxis] - centers, axis=2)
    cluster_assignments = np.argmin(distances, axis=1)

    # Step 4: Update each center to the mean of its assigned points
    new_centers = np.array([data[cluster_assignments == i].mean(axis=0) for i in range(k)])

    # Step 5: Check for convergence (accept the updated centers first,
    # so the saved centers reflect the final update)
    center_shift = np.linalg.norm(new_centers - centers)
    centers = new_centers
    if center_shift < tolerance:
        print(f"Converged after {iteration + 1} iterations.")
        break
else:
    print("Reached maximum iterations.")

# Step 6: Save results
# Create a DataFrame with the cluster assignments
output_data = pd.DataFrame(data, columns=['PC1', 'PC2'])
output_data['Cluster'] = cluster_assignments

# Save cluster assignments and cluster centers
output_data.to_csv('cluster_assignments.csv', index=False)
centers_df = pd.DataFrame(centers, columns=['PC1', 'PC2'])
centers_df.to_csv('cluster_centers.csv', index=False)

print("Clustering completed.")
print("Cluster assignments saved to 'cluster_assignments.csv'.")
print("Cluster centers saved to 'cluster_centers.csv'.")
Converged after 9 iterations.
Clustering completed.
Cluster assignments saved to 'cluster_assignments.csv'.
Cluster centers saved to 'cluster_centers.csv'.
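
As an optional follow-up, added for illustration and assuming it runs in the same session as the clustering script above (so np, pd, and k are in scope), the within-cluster sum of squares (inertia) can be computed from the two saved files to gauge how tight the clusters are.

# Optional: within-cluster sum of squares (inertia) of the final clustering
points = pd.read_csv('cluster_assignments.csv')
final_centers = pd.read_csv('cluster_centers.csv').to_numpy()
coords = points[['PC1', 'PC2']].to_numpy()
labels = points['Cluster'].to_numpy()
inertia = sum(np.sum((coords[labels == i] - final_centers[i]) ** 2) for i in range(k))
print(f"Inertia for k={k}: {inertia:.4f}")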