# Suppress warnings to keep the notebook output clean
import warnings
warnings.filterwarnings('ignore')
Clustering refers to the mathematical and algorithmic process of grouping n elements into k distinct groups such that members of the same group are more similar to one another than to members of other groups.
TASK: Implementation of clustering algorithms, by type.
libraries
# !pip install kmodes   # uncomment and run once if kmodes is not installed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification # dataset function
#for Centroid-based Clustering
from sklearn.cluster import KMeans
from kmodes.kmodes import KModes
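1. Centroid-based Clustering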
# Generate a synthetic dataset with make_classification
X, y = make_classification(n_samples=1000, n_features=2, n_informative=2, n_redundant=0, n_clusters_per_class=1, random_state=42)
# Apply K-Means clustering
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)  # n_init set explicitly for cross-version reproducibility
kmeans.fit(X)
# Get cluster assignments for each data point
labels = kmeans.labels_
# Plot the original data points and the cluster centers
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', edgecolors='k', alpha=0.7)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='red', marker='x', s=200, label='Centroids')
plt.title('K-Means Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.show()
# We visualize the original data points, colored by cluster assignment,
# and mark the cluster centers with red 'x' markers.
# Convert the numerical features to pseudo-categorical features by rounding
X_categorical = np.round(X)
# Convert to DataFrame for easier handling
df = pd.DataFrame(data=X_categorical, columns=["Feature_1", "Feature_2"])
# Apply K-Modes clustering
k = 2 # Number of clusters
km = KModes(n_clusters=k, init='Huang', n_init=5, verbose=1, random_state=42)
clusters = km.fit_predict(df)
# Visualize the results (note: the K-Modes centroids lie in the rounded feature space)
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', edgecolors='k', alpha=0.7)
plt.scatter(km.cluster_centroids_[:, 0], km.cluster_centroids_[:, 1], c='red', marker='x', s=200, label='Centroids')
plt.title('K-Modes Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.show()
# We visualize the results by plotting the original data points, colored by
# cluster assignment, and marking the cluster centroids with red 'x' markers.
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 81, cost: 1194.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 0, cost: 943.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 70, cost: 1058.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 155, cost: 1105.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 155, cost: 1105.0
Best run was number 2
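Rounding numeric features only approximates categorical data. As a minimal sketch, K-Modes can be run directly on genuinely categorical values (the toy table below is hypothetical, not drawn from the data above):
toy = pd.DataFrame({
    'color': ['red', 'red', 'blue', 'blue', 'green', 'red'],
    'shape': ['circle', 'circle', 'square', 'square', 'square', 'circle']
})
km_toy = KModes(n_clusters=2, init='Huang', n_init=5, random_state=42)
print(km_toy.fit_predict(toy))      # one cluster label per row
print(km_toy.cluster_centroids_)    # the modal value of each feature per cluster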
libraries
from sklearn.datasets import make_blobs #dataset function
# for Density-based Clustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift
from sklearn.cluster import AffinityPropagation
from sklearn.preprocessing import StandardScaler
#for Distribution-based Clustering
from sklearn.mixture import GaussianMixture
#for Hierarchical Clustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
from sklearn.cluster import SpectralClustering
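2. Density-based Clustering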
# make_classification generates overlapping numerical features, which may not suit DBSCAN well,
# because DBSCAN finds clusters as dense regions separated by sparser areas.
# We'll use the make_blobs function to create a dataset with well-defined clusters for this case
# Generate a synthetic dataset with make_blobs
X, y = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
# Standardize the features (important for DBSCAN)
X = StandardScaler().fit_transform(X)
# Apply DBSCAN clustering
dbscan = DBSCAN(eps=0.3, min_samples=5)
clusters = dbscan.fit_predict(X)
# Visualize the results
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', edgecolors='k', alpha=0.7)
plt.title('DBSCAN Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
# We visualize the results by plotting the data points, colored by cluster assignment;
# points DBSCAN flags as noise receive the label -1.
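A common heuristic for choosing eps (a sketch, not part of the original analysis) is to plot each point's distance to its min_samples-th nearest neighbor in ascending order; a pronounced elbow in the curve suggests a reasonable eps:
from sklearn.neighbors import NearestNeighbors
k = 5  # match min_samples used above
neigh_dists, _ = NearestNeighbors(n_neighbors=k).fit(X).kneighbors(X)
# Column -1 is the k-th neighbor distance (the query set includes each point itself)
plt.plot(np.sort(neigh_dists[:, -1]))
plt.title('k-Distance Plot for Choosing eps')
plt.xlabel('Points sorted by distance')
plt.ylabel('Distance to 5th nearest neighbor')
plt.show()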
# Generate a synthetic dataset with make_blobs
X, y = make_blobs(n_samples=500, centers=4, cluster_std=0.60, random_state=0)
# Standardize the features (important for Mean-Shift)
X = StandardScaler().fit_transform(X)
# Apply Mean-Shift clustering
meanshift = MeanShift(bandwidth=0.8)
clusters = meanshift.fit_predict(X)
# Visualize the results
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', edgecolors='k', alpha=0.7)
plt.title('Mean-Shift Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
#We visualize the results by plotting the data points with different colors based on their cluster assignments
# Generate a synthetic dataset with make_blobs
X, y = make_blobs(n_samples=400, centers=4, cluster_std=0.60, random_state=0)
# Standardize the features (important for Affinity Propagation)
X = StandardScaler().fit_transform(X)
# Apply Affinity Propagation clustering
affinity_propagation = AffinityPropagation(damping=0.9, preference=-200)
clusters = affinity_propagation.fit_predict(X)
# Visualize the results
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', edgecolors='k', alpha=0.7)
plt.title('Affinity Propagation Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
#We visualize the results by plotting the data points with different colors based on their cluster assignments.
3. Distribution-based Clustering
We will implement simple versions of these algorithms, namely the Gaussian Mixture Model and Expectation-Maximization (EM) clustering.
For these implementations we will again use the make_blobs function to create a dataset.
# Generate a synthetic dataset with make_blobs
X, y = make_blobs(n_samples=1000, centers=4, cluster_std=0.60, random_state=0)
# Standardize the features (important for GMM)
X = StandardScaler().fit_transform(X)
# Apply Gaussian Mixture Model (GMM) clustering
gmm = GaussianMixture(n_components=4, random_state=0)
clusters = gmm.fit_predict(X)
# Visualize the results
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', edgecolors='k', alpha=0.7)
plt.title('Gaussian Mixture Model (GMM) Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
#We visualize the results by plotting the data points with different colors based on their cluster assignments.
Expectation-Maximization (EM) is often used in conjunction with Gaussian Mixture Models (GMMs), since the EM algorithm is what estimates the parameters of the GMM; a hand-rolled sketch of the loop follows.
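To make the connection concrete, here is a minimal sketch of the EM iterations for a two-component one-dimensional mixture (the data and starting values are illustrative assumptions, not from this notebook); scikit-learn's GaussianMixture runs essentially this loop in higher dimensions:
rng = np.random.default_rng(0)
x = np.concatenate([rng.normal(-2, 0.5, 200), rng.normal(2, 0.5, 200)])
mu, var, w = np.array([-1.0, 1.0]), np.array([1.0, 1.0]), np.array([0.5, 0.5])
for _ in range(50):
    # E-step: responsibility of each component for each point
    dens = w * np.exp(-(x[:, None] - mu) ** 2 / (2 * var)) / np.sqrt(2 * np.pi * var)
    resp = dens / dens.sum(axis=1, keepdims=True)
    # M-step: re-estimate weights, means, and variances from the responsibilities
    nk = resp.sum(axis=0)
    w, mu = nk / len(x), (resp * x[:, None]).sum(axis=0) / nk
    var = (resp * (x[:, None] - mu) ** 2).sum(axis=0) / nk
print(mu, var, w)  # should approach means (-2, 2), variances (0.25, 0.25), weights (0.5, 0.5)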
# Generate a synthetic dataset with make_blobs
X, y = make_blobs(n_samples=600, centers=4, cluster_std=0.60, random_state=0)
# Standardize the features (important for GMM)
X = StandardScaler().fit_transform(X)
# Apply Gaussian Mixture Model (GMM) clustering using EM
gmm = GaussianMixture(n_components=4, random_state=0)
gmm.fit(X)
# Get cluster assignments for each data point
clusters = gmm.predict(X)
# Visualize the results
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', edgecolors='k', alpha=0.7)
plt.scatter(gmm.means_[:, 0], gmm.means_[:, 1], c='red', marker='x', s=200, label='Cluster Centers')
plt.title('Expectation-Maximization (EM) Clustering with Gaussian Mixture Model (GMM)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.show()
#We visualize the results by plotting the data points with different colors based
#on their cluster assignments and marking the cluster centers with red 'x' markers.
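Unlike K-Means, a GMM produces soft assignments. As a brief illustration using the gmm model fitted above:
probs = gmm.predict_proba(X)  # shape (n_samples, n_components); each row sums to 1
print(probs[:5].round(3))     # membership probabilities for the first five points

4. Hierarchical Clustering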
libraries
from scipy.cluster.hierarchy import dendrogram, linkage
#Generate a synthetic dataset with make_blobs
X, y = make_blobs(n_samples=400, centers=4, cluster_std=0.60, random_state=0)
# Apply Agglomerative Hierarchical Clustering
agg_cluster = AgglomerativeClustering(n_clusters=4, linkage='single')
clusters = agg_cluster.fit_predict(X)
# Visualize the results
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', edgecolors='k', alpha=0.7)
plt.title('Agglomerative Hierarchical Clustering with Single Linkage')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
# Plot the dendrogram
linkage_matrix = linkage(X, method='single')
dendrogram(linkage_matrix)
plt.title('Hierarchical Clustering Dendrogram with Single Linkage')
plt.xlabel('Data Points')
plt.ylabel('Distance')
plt.show()
#We visualize the results by plotting the data points with different colors based on their cluster assignments.
#We also plot the dendrogram to visualize the hierarchy of cluster mergers.
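The dendrogram also supports extracting a flat clustering by cutting it at a chosen distance; a minimal sketch using the linkage_matrix computed above (the threshold of 0.7 is illustrative):
from scipy.cluster.hierarchy import fcluster
flat_labels = fcluster(linkage_matrix, t=0.7, criterion='distance')
print(np.unique(flat_labels))  # cluster ids produced by the cut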
# Generate a synthetic dataset with make_blobs
X, y = make_blobs(n_samples=450, centers=4, cluster_std=0.60, random_state=0)
# Standardize the features (important for BIRCH)
X = StandardScaler().fit_transform(X)
# Apply BIRCH clustering
birch_cluster = Birch(threshold=0.5, n_clusters=4)
clusters = birch_cluster.fit_predict(X)
# Visualize the results
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', edgecolors='k', alpha=0.7)
plt.title('BIRCH Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
#We visualize the results by plotting the data points with different colors based on their cluster assignments.
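Because BIRCH summarizes data in a CF-tree, it can also be trained incrementally; a minimal sketch using partial_fit (the three-way batch split is arbitrary):
birch_inc = Birch(threshold=0.5, n_clusters=4)
for batch in np.array_split(X, 3):
    birch_inc.partial_fit(batch)
print(birch_inc.predict(X)[:10])  # labels from the incrementally built model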
# Generate a synthetic dataset with make_blobs
X, y = make_blobs(n_samples=430, centers=4, cluster_std=0.60, random_state=0)
# Standardize the features (important for Spectral Clustering)
X = StandardScaler().fit_transform(X)
# Apply Spectral Clustering
spectral_cluster = SpectralClustering(n_clusters=4, affinity='nearest_neighbors', random_state=0)
clusters = spectral_cluster.fit_predict(X)
# Visualize the results
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', edgecolors='k', alpha=0.7)
plt.title('Spectral Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
#We visualize the results by plotting the data points with different colors based on their cluster assignments.
Finally, we compare the performance of some of these clustering algorithms using metrics such as the silhouette score, the Davies-Bouldin index, and runtime.
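For reference, the silhouette of a point i is s(i) = (b - a) / max(a, b), where a is its mean distance to members of its own cluster and b is its smallest mean distance to any other cluster. A minimal sketch of that computation (for illustration only; it ignores single-member clusters, and sklearn's silhouette_score averages s(i) over all points):
from sklearn.metrics import pairwise_distances
def silhouette_of_point(X, labels, i):
    d = pairwise_distances(X[i].reshape(1, -1), X).ravel()
    same = labels == labels[i]
    same[i] = False  # exclude the point itself
    a = d[same].mean()  # mean intra-cluster distance
    b = min(d[labels == c].mean() for c in np.unique(labels) if c != labels[i])
    return (b - a) / max(a, b)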
import time
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score
# Generate a synthetic dataset with make_blobs
X, y = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
# Standardize the features (important for some clustering algorithms)
X_standardized = StandardScaler().fit_transform(X)
# Define the clustering algorithms; more can be added to this dictionary
algorithms = {
    'K-Means': KMeans(n_clusters=4, n_init=10, random_state=0),
    'Agglomerative': AgglomerativeClustering(n_clusters=4),
    'Spectral': SpectralClustering(n_clusters=4, affinity='nearest_neighbors', random_state=0)
}
# Evaluate and compare the performance
for name, algorithm in algorithms.items():
    start_time = time.time()
    # Fit the algorithm (Spectral Clustering is fitted on the standardized features)
    if name == 'Spectral':
        clusters = algorithm.fit_predict(X_standardized)
    else:
        clusters = algorithm.fit_predict(X)
    # Evaluate performance metrics on the original features
    silhouette = silhouette_score(X, clusters)
    davies_bouldin = davies_bouldin_score(X, clusters)
    # Measure computational efficiency
    runtime = time.time() - start_time
    # Print results
    print(f"{name} clustering:")
    print(f"Silhouette Score: {silhouette:.4f}")
    print(f"Davies-Bouldin Index: {davies_bouldin:.4f}")
    print(f"Runtime: {runtime:.4f} seconds\n")
# Visualize the results (you can choose one algorithm for visualization)
algorithm = algorithms['K-Means']
clusters = algorithm.fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', edgecolors='k', alpha=0.7)
plt.title('K-Means Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
K-Means clustering:
Silhouette Score: 0.6820
Davies-Bouldin Index: 0.4376
Runtime: 1.6680 seconds

Agglomerative clustering:
Silhouette Score: 0.6820
Davies-Bouldin Index: 0.4376
Runtime: 0.0150 seconds

Spectral clustering:
Silhouette Score: 0.6820
Davies-Bouldin Index: 0.4376
Runtime: 1.6600 seconds
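All three algorithms recover essentially the same partition of this well-separated dataset, which is why the silhouette and Davies-Bouldin values coincide; the runtimes are what distinguish them here.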