import numpy as np
import pandas as pd
# Step 1: Load the dataset
data = pd.read_csv('ANandi.csv')
# Step 2: Standardize the data (mean = 0, variance = 1)
# Exclude non-numeric columns if present
numeric_data = data.select_dtypes(include=[np.number])
mean = numeric_data.mean()
std = numeric_data.std()
standardized_data = (numeric_data - mean) / std
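# Quick sanity check (an addition, not part of the original pipeline): after z-scoring,
# every numeric column should have mean ~0 and standard deviation ~1. This assumes no
# column is constant; a zero std above would produce NaN/inf values here.
assert np.allclose(standardized_data.mean(), 0, atol=1e-8)
assert np.allclose(standardized_data.std(), 1)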
# Step 3: Compute the covariance matrix
cov_matrix = np.cov(standardized_data, rowvar=False)
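# Optional check (also an addition): for z-scored data the covariance matrix equals the
# correlation matrix of the original features, so the two results should agree up to
# floating-point error.
corr_matrix = np.corrcoef(numeric_data, rowvar=False)
print("Covariance of z-scores equals the correlation matrix:",
      np.allclose(cov_matrix, corr_matrix))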
# Step 4: Perform eigen decomposition
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
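# Spot check (illustrative only): np.linalg.eigh is appropriate here because the
# covariance matrix is symmetric; every eigenpair should satisfy C v = lambda v.
for i in range(len(eigenvalues)):
    assert np.allclose(cov_matrix @ eigenvectors[:, i],
                       eigenvalues[i] * eigenvectors[:, i])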
# Step 5: Sort eigenvalues and eigenvectors
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]
# Select the first two principal components
top_eigenvectors = eigenvectors[:, :2]
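# How much variance do PC1 and PC2 retain? This ratio is a common diagnostic for whether
# two components are enough; the acceptable threshold depends on the dataset, so treat
# the printout as informational rather than a rule.
explained_variance_ratio = eigenvalues[:2] / eigenvalues.sum()
print(f"Variance explained by the first two components: {explained_variance_ratio.sum():.2%}")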
# Step 6: Transform the data
data_reduced = np.dot(standardized_data, top_eigenvectors)
# Step 7: Save the reduced data
data_reduced_df = pd.DataFrame(data_reduced, columns=['PC1', 'PC2'])
data_reduced_df.to_csv('data_reduced.csv', index=False)
print("PCA completed. The first two principal components are saved in 'data_reduced.csv'.")
# Observed output:
# PCA completed. The first two principal components are saved in 'data_reduced.csv'.
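# Optional cross-check, assuming scikit-learn is installed (it is not required anywhere
# else in this script) and that the top two eigenvalues are distinct. Principal components
# are only defined up to a sign flip, so the comparison matches columns up to sign.
from sklearn.decomposition import PCA

sk_reduced = PCA(n_components=2).fit_transform(standardized_data)
signs = np.sign((data_reduced * sk_reduced).sum(axis=0))
print("Matches scikit-learn's PCA (up to sign):",
      np.allclose(data_reduced, sk_reduced * signs))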
# Step 1: Load the reduced data
data = pd.read_csv('data_reduced.csv').to_numpy()
# Parameters
k = 3 # Number of clusters (adjust as needed)
max_iterations = 100 # Maximum number of iterations
tolerance = 1e-4 # Convergence threshold
# Step 2: Initialize the centers with k distinct data points chosen at random
np.random.seed(42)  # For reproducibility
centers = data[np.random.choice(data.shape[0], k, replace=False)]
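# Alternative seeding (a sketch, not used by the script above): k-means++-style
# initialization picks each subsequent center with probability proportional to the
# squared distance to the nearest center already chosen, which tends to give better
# starting points than purely random selection. The function name is illustrative.
def kmeans_plus_plus_init(points, n_clusters, rng):
    chosen = [points[rng.integers(points.shape[0])]]
    for _ in range(n_clusters - 1):
        # Squared distance of every point to its nearest already-chosen center
        d2 = np.min(np.linalg.norm(points[:, np.newaxis] - np.array(chosen), axis=2) ** 2, axis=1)
        chosen.append(points[rng.choice(points.shape[0], p=d2 / d2.sum())])
    return np.array(chosen)
# Example usage: centers = kmeans_plus_plus_init(data, k, np.random.default_rng(42))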
# Steps 3-5: K-Means algorithm (Lloyd's iterations)
for iteration in range(max_iterations):
    # Step 3: Assign each point to the closest center
    distances = np.linalg.norm(data[:, np.newaxis] - centers, axis=2)
    cluster_assignments = np.argmin(distances, axis=1)
    # Step 4: Update each center to the mean of its assigned points
    # (keep the previous center if a cluster receives no points, to avoid NaNs)
    new_centers = np.array([
        data[cluster_assignments == i].mean(axis=0)
        if np.any(cluster_assignments == i) else centers[i]
        for i in range(k)
    ])
    # Step 5: Check for convergence (small total movement of the centers)
    if np.linalg.norm(new_centers - centers) < tolerance:
        print(f"Converged after {iteration + 1} iterations.")
        break
    centers = new_centers
else:
    # The else clause of a for loop runs only if the loop finished without a break
    print("Reached the maximum number of iterations without converging.")
# Step 6: Save results
# Create a DataFrame with the cluster assignments
output_data = pd.DataFrame(data, columns=['PC1', 'PC2'])
output_data['Cluster'] = cluster_assignments
# Save cluster assignments and cluster centers
output_data.to_csv('cluster_assignments.csv', index=False)
centers_df = pd.DataFrame(centers, columns=['PC1', 'PC2'])
centers_df.to_csv('cluster_centers.csv', index=False)
print("Clustering completed.")
print("Cluster assignments saved to 'cluster_assignments.csv'.")
print("Cluster centers saved to 'cluster_centers.csv'.")
# Observed output:
# Converged after 9 iterations.
# Clustering completed.
# Cluster assignments saved to 'cluster_assignments.csv'.
# Cluster centers saved to 'cluster_centers.csv'.
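# A simple quality measure (added as an illustration): the within-cluster sum of
# squared distances, often called inertia. For a fixed k, lower is better; comparing
# inertia across several values of k is the usual "elbow" heuristic for choosing k.
inertia = sum(
    np.sum((data[cluster_assignments == i] - centers[i]) ** 2)
    for i in range(k)
)
print(f"Within-cluster sum of squares (inertia): {inertia:.4f}")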