# Suppress warnings to keep the notebook output clean
import warnings
warnings.filterwarnings('ignore')
Clustering refers to the mathematical and algorithmic process of grouping n elements into k distinct groups such that members of the same group are more similar to one another than to members of other groups.
TASK: Implementation of clustering algorithms, by type.
libraries
# !pip install kmodes   # uncomment and run once if kmodes is not installed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification # dataset function
#for Centroid-based Clustering
from sklearn.cluster import KMeans
from kmodes.kmodes import KModes
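1. Centroid-based Clustering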
# Generate a synthetic dataset with make_classification
X, y = make_classification(n_samples=1000, n_features=2, n_informative=2, n_redundant=0, n_clusters_per_class=1, random_state=42)
# Apply K-Means clustering
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)  # n_init set explicitly for cross-version reproducibility
kmeans.fit(X)
# Get cluster assignments for each data point
labels = kmeans.labels_
# Plot the original data points and the cluster centers
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', edgecolors='k', alpha=0.7)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='red', marker='x', s=200, label='Centroids')
plt.title('K-Means Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.show()
# We visualize the original data points, colored by cluster assignment,
# and mark the cluster centers with red 'x' markers.
# Convert the numerical features to pseudo-categorical features by rounding
X_categorical = np.round(X)
# Convert to DataFrame for easier handling
df = pd.DataFrame(data=X_categorical, columns=["Feature_1", "Feature_2"])
# Apply K-Modes clustering
k = 2 # Number of clusters
km = KModes(n_clusters=k, init='Huang', n_init=5, verbose=1, random_state=42)
clusters = km.fit_predict(df)
# Visualize the results (note: the K-Modes centroids lie in the rounded feature space)
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', edgecolors='k', alpha=0.7)
plt.scatter(km.cluster_centroids_[:, 0], km.cluster_centroids_[:, 1], c='red', marker='x', s=200, label='Centroids')
plt.title('K-Modes Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.show()
# We visualize the results by plotting the original data points, colored by
# cluster assignment, and marking the cluster centroids with red 'x' markers.
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 81, cost: 1194.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 0, cost: 943.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 70, cost: 1058.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 155, cost: 1105.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 155, cost: 1105.0
Best run was number 2
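Rounding numeric features only approximates categorical data. As a minimal sketch, K-Modes can be run directly on genuinely categorical values (the toy table below is hypothetical, not drawn from the data above):
toy = pd.DataFrame({
    'color': ['red', 'red', 'blue', 'blue', 'green', 'red'],
    'shape': ['circle', 'circle', 'square', 'square', 'square', 'circle']
})
km_toy = KModes(n_clusters=2, init='Huang', n_init=5, random_state=42)
print(km_toy.fit_predict(toy))      # one cluster label per row
print(km_toy.cluster_centroids_)    # the modal value of each feature per cluster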
libraries
from sklearn.datasets import make_blobs #dataset function
# for Density-based Clustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift
from sklearn.cluster import AffinityPropagation
from sklearn.preprocessing import StandardScaler
#for Distribution-based Clustering
from sklearn.mixture import GaussianMixture
#for Hierarchical Clustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
from sklearn.cluster import SpectralClustering
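2. Density-based Clustering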
# make_classification generates overlapping numerical features, which may not suit DBSCAN well,
# because DBSCAN finds clusters as dense regions separated by sparser areas.
# We'll use the make_blobs function to create a dataset with well-defined clusters for this case
# Generate a synthetic dataset with make_blobs
X, y = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
# Standardize the features (important for DBSCAN)
X = StandardScaler().fit_transform(X)
# Apply DBSCAN clustering
dbscan = DBSCAN(eps=0.3, min_samples=5)
clusters = dbscan.fit_predict(X)
# Visualize the results
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', edgecolors='k', alpha=0.7)
plt.title('DBSCAN Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
# We visualize the results by plotting the data points, colored by cluster assignment;
# points DBSCAN flags as noise receive the label -1.
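A common heuristic for choosing eps (a sketch, not part of the original analysis) is to plot each point's distance to its min_samples-th nearest neighbor in ascending order; a pronounced elbow in the curve suggests a reasonable eps:
from sklearn.neighbors import NearestNeighbors
k = 5  # match min_samples used above
neigh_dists, _ = NearestNeighbors(n_neighbors=k).fit(X).kneighbors(X)
# Column -1 is the k-th neighbor distance (the query set includes each point itself)
plt.plot(np.sort(neigh_dists[:, -1]))
plt.title('k-Distance Plot for Choosing eps')
plt.xlabel('Points sorted by distance')
plt.ylabel('Distance to 5th nearest neighbor')
plt.show()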
# Generate a synthetic dataset with make_blobs
X, y = make_blobs(n_samples=500, centers=4, cluster_std=0.60, random_state=0)
# Standardize the features (important for Mean-Shift)
X = StandardScaler().fit_transform(X)
# Apply Mean-Shift clustering
meanshift = MeanShift(bandwidth=0.8)
clusters = meanshift.fit_predict(X)
# Visualize the results
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', edgecolors='k', alpha=0.7)
plt.title('Mean-Shift Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
#We visualize the results by plotting the data points with different colors based on their cluster assignments
# Generate a synthetic dataset with make_blobs
X, y = make_blobs(n_samples=400, centers=4, cluster_std=0.60, random_state=0)
# Standardize the features (important for Affinity Propagation)
X = StandardScaler().fit_transform(X)
# Apply Affinity Propagation clustering
affinity_propagation = AffinityPropagation(damping=0.9, preference=-200)
clusters = affinity_propagation.fit_predict(X)
# Visualize the results
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', edgecolors='k', alpha=0.7)
plt.title('Affinity Propagation Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
#We visualize the results by plotting the data points with different colors based on their cluster assignments.
3. Distribution-based Clustering
We will implement simple versions of these algorithms, namely the Gaussian Mixture Model and Expectation-Maximization (EM) clustering.
For these implementations we will again use the make_blobs function to create a dataset.
# Generate a synthetic dataset with make_blobs
X, y = make_blobs(n_samples=1000, centers=4, cluster_std=0.60, random_state=0)
# Standardize the features (important for GMM)
X = StandardScaler().fit_transform(X)
# Apply Gaussian Mixture Model (GMM) clustering
gmm = GaussianMixture(n_components=4, random_state=0)
clusters = gmm.fit_predict(X)
# Visualize the results
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', edgecolors='k', alpha=0.7)
plt.title('Gaussian Mixture Model (GMM) Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
#We visualize the results by plotting the data points with different colors based on their cluster assignments.
Expectation-Maximization (EM) is often used in conjunction with Gaussian Mixture Models (GMMs), since the EM algorithm is what estimates the parameters of the GMM; a hand-rolled sketch of the loop follows.
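To make the connection concrete, here is a minimal sketch of the EM iterations for a two-component one-dimensional mixture (the data and starting values are illustrative assumptions, not from this notebook); scikit-learn's GaussianMixture runs essentially this loop in higher dimensions:
rng = np.random.default_rng(0)
x = np.concatenate([rng.normal(-2, 0.5, 200), rng.normal(2, 0.5, 200)])
mu, var, w = np.array([-1.0, 1.0]), np.array([1.0, 1.0]), np.array([0.5, 0.5])
for _ in range(50):
    # E-step: responsibility of each component for each point
    dens = w * np.exp(-(x[:, None] - mu) ** 2 / (2 * var)) / np.sqrt(2 * np.pi * var)
    resp = dens / dens.sum(axis=1, keepdims=True)
    # M-step: re-estimate weights, means, and variances from the responsibilities
    nk = resp.sum(axis=0)
    w, mu = nk / len(x), (resp * x[:, None]).sum(axis=0) / nk
    var = (resp * (x[:, None] - mu) ** 2).sum(axis=0) / nk
print(mu, var, w)  # should approach means (-2, 2), variances (0.25, 0.25), weights (0.5, 0.5)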
# Generate a synthetic dataset with make_blobs
X, y = make_blobs(n_samples=600, centers=4, cluster_std=0.60, random_state=0)
# Standardize the features (important for GMM)
X = StandardScaler().fit_transform(X)
# Apply Gaussian Mixture Model (GMM) clustering using EM
gmm = GaussianMixture(n_components=4, random_state=0)
gmm.fit(X)
# Get cluster assignments for each data point
clusters = gmm.predict(X)
# Visualize the results
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', edgecolors='k', alpha=0.7)
plt.scatter(gmm.means_[:, 0], gmm.means_[:, 1], c='red', marker='x', s=200, label='Cluster Centers')
plt.title('Expectation-Maximization (EM) Clustering with Gaussian Mixture Model (GMM)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.show()
#We visualize the results by plotting the data points with different colors based
#on their cluster assignments and marking the cluster centers with red 'x' markers.
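Unlike K-Means, a GMM produces soft assignments. As a brief illustration using the gmm model fitted above:
probs = gmm.predict_proba(X)  # shape (n_samples, n_components); each row sums to 1
print(probs[:5].round(3))     # membership probabilities for the first five points

4. Hierarchical Clustering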
libraries
from scipy.cluster.hierarchy import dendrogram, linkage
#Generate a synthetic dataset with make_blobs
X, y = make_blobs(n_samples=400, centers=4, cluster_std=0.60, random_state=0)
# Apply Agglomerative Hierarchical Clustering
agg_cluster = AgglomerativeClustering(n_clusters=4, linkage='single')
clusters = agg_cluster.fit_predict(X)
# Visualize the results
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', edgecolors='k', alpha=0.7)
plt.title('Agglomerative Hierarchical Clustering with Single Linkage')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
# Plot the dendrogram
linkage_matrix = linkage(X, method='single')
dendrogram(linkage_matrix)
plt.title('Hierarchical Clustering Dendrogram with Single Linkage')
plt.xlabel('Data Points')
plt.ylabel('Distance')
plt.show()
#We visualize the results by plotting the data points with different colors based on their cluster assignments.
#We also plot the dendrogram to visualize the hierarchy of cluster mergers.
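The dendrogram also supports extracting a flat clustering by cutting it at a chosen distance; a minimal sketch using the linkage_matrix computed above (the threshold of 0.7 is illustrative):
from scipy.cluster.hierarchy import fcluster
flat_labels = fcluster(linkage_matrix, t=0.7, criterion='distance')
print(np.unique(flat_labels))  # cluster ids produced by the cut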
# Generate a synthetic dataset with make_blobs
X, y = make_blobs(n_samples=450, centers=4, cluster_std=0.60, random_state=0)
# Standardize the features (important for BIRCH)
X = StandardScaler().fit_transform(X)
# Apply BIRCH clustering
birch_cluster = Birch(threshold=0.5, n_clusters=4)
clusters = birch_cluster.fit_predict(X)
# Visualize the results
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', edgecolors='k', alpha=0.7)
plt.title('BIRCH Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
#We visualize the results by plotting the data points with different colors based on their cluster assignments.
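Because BIRCH summarizes data in a CF-tree, it can also be trained incrementally; a minimal sketch using partial_fit (the three-way batch split is arbitrary):
birch_inc = Birch(threshold=0.5, n_clusters=4)
for batch in np.array_split(X, 3):
    birch_inc.partial_fit(batch)
print(birch_inc.predict(X)[:10])  # labels from the incrementally built model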
# Generate a synthetic dataset with make_blobs
X, y = make_blobs(n_samples=430, centers=4, cluster_std=0.60, random_state=0)
# Standardize the features (important for Spectral Clustering)
X = StandardScaler().fit_transform(X)
# Apply Spectral Clustering
spectral_cluster = SpectralClustering(n_clusters=4, affinity='nearest_neighbors', random_state=0)
clusters = spectral_cluster.fit_predict(X)
# Visualize the results
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', edgecolors='k', alpha=0.7)
plt.title('Spectral Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
#We visualize the results by plotting the data points with different colors based on their cluster assignments.
Finally, we compare the performance of some of these clustering algorithms using metrics such as the silhouette score, the Davies-Bouldin index, and runtime.
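For reference, the silhouette of a point i is s(i) = (b - a) / max(a, b), where a is its mean distance to members of its own cluster and b is its smallest mean distance to any other cluster. A minimal sketch of that computation (for illustration only; it ignores single-member clusters, and sklearn's silhouette_score averages s(i) over all points):
from sklearn.metrics import pairwise_distances
def silhouette_of_point(X, labels, i):
    d = pairwise_distances(X[i].reshape(1, -1), X).ravel()
    same = labels == labels[i]
    same[i] = False  # exclude the point itself
    a = d[same].mean()  # mean intra-cluster distance
    b = min(d[labels == c].mean() for c in np.unique(labels) if c != labels[i])
    return (b - a) / max(a, b)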
import time
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score
# Generate a synthetic dataset with make_blobs
X, y = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
# Standardize the features (important for some clustering algorithms)
X_standardized = StandardScaler().fit_transform(X)
# Define the clustering algorithms; more can be added to this dictionary
algorithms = {
    'K-Means': KMeans(n_clusters=4, n_init=10, random_state=0),
    'Agglomerative': AgglomerativeClustering(n_clusters=4),
    'Spectral': SpectralClustering(n_clusters=4, affinity='nearest_neighbors', random_state=0)
}
# Evaluate and compare the performance
for name, algorithm in algorithms.items():
    start_time = time.time()
    # Fit the algorithm (Spectral Clustering is fitted on the standardized features)
    if name == 'Spectral':
        clusters = algorithm.fit_predict(X_standardized)
    else:
        clusters = algorithm.fit_predict(X)
    # Evaluate performance metrics on the original features
    silhouette = silhouette_score(X, clusters)
    davies_bouldin = davies_bouldin_score(X, clusters)
    # Measure computational efficiency
    runtime = time.time() - start_time
    # Print results
    print(f"{name} clustering:")
    print(f"Silhouette Score: {silhouette:.4f}")
    print(f"Davies-Bouldin Index: {davies_bouldin:.4f}")
    print(f"Runtime: {runtime:.4f} seconds\n")
# Visualize the results (you can choose one algorithm for visualization)
algorithm = algorithms['K-Means']
clusters = algorithm.fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', edgecolors='k', alpha=0.7)
plt.title('K-Means Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
K-Means clustering:
Silhouette Score: 0.6820
Davies-Bouldin Index: 0.4376
Runtime: 1.6680 seconds

Agglomerative clustering:
Silhouette Score: 0.6820
Davies-Bouldin Index: 0.4376
Runtime: 0.0150 seconds

Spectral clustering:
Silhouette Score: 0.6820
Davies-Bouldin Index: 0.4376
Runtime: 1.6600 seconds
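All three algorithms recover essentially the same partition of this well-separated dataset, which is why the silhouette and Davies-Bouldin values coincide; the runtimes are what distinguish them here.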