K-Means Clustering¶
In this notebook, we're going to learn about the K-Means clustering algorithm. We'll implement it ourselves and look at various applications.
Clustering¶
Clustering is a form of unsupervised machine learning. Given a set $X$, the aim of clustering is to partition $X$ such that the elements of each partition are "close". I've put "close" in quotation marks because you can define "close" in any way that's convenient for the problem you're solving. Mathematically, we say that the metric we use to measure closeness is up to us.
A metric is a measure of closeness. Specifically, given a set $X$, we define a metric as a map $d: X \times X \rightarrow \mathbb{R}$ that satisfies the following for any $x, y, z \in X$:
- Positive Definiteness: $d(x,y) \geq 0$ with $d(x,y) = 0$ if and only if $x = y$.
- Symmetry: $d(x,y) = d(y,x)$
- Triangle Inequality: $d(x,y) \leq d(x,z) + d(z,y)$.
The pair $(X, d)$ is called a metric space.
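To make this concrete, here's a minimal sketch (my own illustration, not part of the definition) of two valid metrics in code: the usual Euclidean distance on vectors and the Hamming distance on equal-length strings. Both satisfy the three properties above.
import numpy as np

def euclidean(x, y):
    """Euclidean distance on R^n: the length of the vector x - y."""
    return np.linalg.norm(np.asarray(x) - np.asarray(y))

def hamming(s, t):
    """Hamming distance: the number of positions where two equal-length strings differ."""
    assert len(s) == len(t)
    return sum(a != b for a, b in zip(s, t))

print(euclidean([0, 0], [3, 4]))    # 5.0
print(hamming("kitten", "mitten"))  # 1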
We can reformulate the concept of clustering in terms of metric spaces. Given a metric space $(X, d)$, clustering aims to partition $X$ such that the elements of each partition are close according to the metric $d$.
It's useful to think of clustering in terms of metric spaces because they're so abstract. As long as you can represent elements of $X$ and the metric $d$ in code, you can run a clustering algorithm on that metric space.
Clustering Algorithms¶
A clustering algorithm attempts to find the optimal partition of $X$; that is, the partition where the elements of each set in the partition are closer than they would be in any other partition.
Now, if the optimal partition can consist of sets of any size, then the obvious answer to this problem is to partition $X$ into $n$ sets of size 1, where $n$ is the cardinality of $X$. That's not very interesting, so we typically choose a fixed number of sets $k$ that the optimal partition will have. $k$ is a hyperparameter of the clustering algorithm.
Mathematically, if $X = \{x_1, \ldots, x_n\}$, a $k$-partition of $X$ is a partition with exactly $k$ non-empty elements. We can write a $k$-partition as $S = \{S_1, \ldots, S_k\}$, where $X = \bigcup_{i=1}^k{S_i}$ and $S_i \cap S_j = \emptyset$ for any $i \neq j$.
So, given the space $(X, d)$ and $k$, let $\mathcal{S}$ be the set of all $k$-partitions of $X$. A clustering algorithm finds the $k$-partition $S^*$ that minimizes the metric $d$ within each partition, that is,
$$S^* = \text{argmin}_{S \in \mathcal{S}} \sum_{i=1}^k{\sum_{x_j \in S_i}{d(x_j, \mu_i)}}$$
Here, $\mu_i$ is the mean of $S_i$. In a general metric space, $\mu_i$ is the element of $S_i$ (sometimes called the medoid) that satisfies
$$\mu_i = \text{argmin}_{x_p \in S_i} \sum_{x_j \in S_i}{d(x_j, x_p)}$$
I should point out that $\mu_i$ doesn't have to be an element of $X$. If $X$ is a subset of $\mathbb{R}^n$ or $\mathbb{C}^n$, then $\mu_i$ can simply be calculated as the usual mean of the vectors in $S_i$, which may lie outside of $S_i$ (and of $X$) entirely.
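To see the difference, here's a small sketch (my own illustration, with made-up points) that computes both the medoid-style mean from the formula above and the ordinary vector mean for a tiny cluster in $\mathbb{R}^2$:
import numpy as np

cluster = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]])

# Medoid: the element of the cluster minimizing the total distance
# to the other elements (the argmin in the formula above).
total_dists = [np.sum(np.linalg.norm(cluster - x, axis=1)) for x in cluster]
medoid = cluster[np.argmin(total_dists)]

# Ordinary vector mean: may not be one of the cluster's points.
mean = cluster.mean(axis=0)

print(medoid)  # [0. 0.]
print(mean)    # [0.33333333 0.33333333]
The medoid is always one of the cluster's points, while the vector mean $[1/3, 1/3]$ is not.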
In pretty much all real-world scenarios, $X$ will be rather large and it would be expensive to calculate $S^*$ by brute force. For that reason, there are algorithms that give us a reasonably good estimate of $S^*$ in a much shorter time. One of those algorithms is called K-Means.
The K-Means Clustering Algorithm¶
The K-Means clustering algorithm works as follows:
- From $X$, choose $k$ initial cluster means, either randomly or by some heuristic.
- For $i = 1, \ldots, \text{maxiter}$:
    - Assign each data point in $X$ to the cluster mean it is closest to.
    - Recompute the cluster means as the means of the newly formed clusters.
    - If the new cluster means are sufficiently close ($\leq \text{tol}$) to the old cluster means, terminate early.
Let's walk through this algorithm in Python code. We'll use the iris flower dataset, reduced to two principal components, as $X$. The iris dataset is a well-known dataset and is often used for testing machine learning models. You can read more about it here.
The TLDR is that the iris dataset contains 150 samples classified into 3 types of flowers: setosa, versicolor, and virginica. There are 4 features in the dataset, but we want to display our results on 2-d graphs. So, we'll reduce the dataset to just 2 principal components. I won't explain what a principal component is in this notebook; that's a very interesting subject for another time.
Here's a plot of the iris dataset for reference.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
plt.rcParams["figure.dpi"] = 200
iris_dataset = load_iris()
pca = PCA(n_components=2)
data = pca.fit_transform(iris_dataset.data)
colors = np.array(['#7d4e88', '#42a19d', '#fde725'])
for k in np.unique(iris_dataset.target):
    mask = iris_dataset.target == k
    x = data[:,0][mask]
    y = data[:,1][mask]
    plt.scatter(x, y, s=5, c=colors[k], label=iris_dataset.target_names[k])
plt.title('The Iris Dataset')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.legend(loc='lower left')
plt.show()
The first step in implementing the K-Means algorithm is to define the hyperparameters and the set $X$.
We'll set $X$ to the $150 \times 2$ matrix containing the iris data. We'll also set the tolerance (tol) to 0.00001 and the maximum number of iterations (max_iter) to 100.
Since there are 3 types of flowers in the iris dataset, we'll set $k$ (n_clusters) to 3 and compare the clusters that the algorithm creates to the true clusters in the graph above.
X = data
max_iter = 100
tol = 1e-5
n_clusters = 3
print("First 5 rows\n", X[:5,:])
First 5 rows
 [[-2.68412563  0.31939725]
 [-2.71414169 -0.17700123]
 [-2.88899057 -0.14494943]
 [-2.74534286 -0.31829898]
 [-2.72871654  0.32675451]]
The next step is to randomly assign the initial cluster means and define a map where we'll store the clusters.
means = X[np.random.randint(X.shape[0], size=n_clusters),:]
clusters = {k: [] for k in range(n_clusters)}
Now for the main loop. On each iteration, we need to do 3 things.
First, we assign each sample in $X$ to the cluster mean that is closest. The distance between two vectors $x, y$ is the norm of their difference $\lVert x - y \rVert$. Since we're storing the means as a $3 \times 2$ NumPy array, we can quickly calculate the distances by taking the norm of the difference along axis 1.
Once we've built up the clusters, the second step is to recompute the cluster means as the means of the new clusters. If one of the clusters happens to be empty, we'll just use a random vector from $X$ instead.
Finally, we check to see if the newly computed means are within tol of the means from the previous iteration. Again, we can do this by taking the norm of the difference along axis 1 and grabbing the maximum norm from that list.
for i in range(max_iter):
    # Assign each sample to the cluster mean that is closest
    for sample in X:
        distances = np.linalg.norm(means - sample, axis=1)
        classification = np.argmin(distances)
        clusters[classification].append(sample)

    # Recompute cluster means as the means of the new clusters
    new_means = np.zeros_like(means)
    for k, cluster in clusters.items():
        if len(cluster) == 0:
            new_means[k] = X[np.random.randint(X.shape[0]),:]
        else:
            new_means[k] = np.mean(cluster, axis=0)

    # If old and new means are sufficiently close, terminate early
    if np.max(np.linalg.norm(means - new_means, axis=1)) <= tol:
        break

    # Otherwise, assign new means, reset clusters and continue
    means = new_means
    clusters = {k: [] for k in range(n_clusters)}
Let's see how it did. We'll see which cluster each data point belongs to by iterating over the data and comparing it to each cluster mean.
classifications = []
for row in X:
    classification = np.argmin(np.linalg.norm(means - row, axis=1))
    classifications.append(classification)
classifications = np.array(classifications)
plt.scatter(data[:,0], data[:,1], s=5, c=colors[classifications])
for k in range(means.shape[0]):
    plt.scatter(means[k,0], means[k,1], marker='+', c=colors[k], label='Cluster Mean' if k == 0 else None)
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('K-Means Clustering on the Iris Dataset')
plt.legend(loc='upper left')
plt.show()
It looks like the clusters it picked out line up pretty closely with the original classes!
Now you might be wondering, what was the point of that? We already know which group each point belongs to, so why run K-Means?
Well, first of all, imagine we were working with a dataset where we didn't know the correct groups. In that case, K-Means would help us make sense of the data by dividing it into groups of points that are close together. And remember, K-Means works on any metric space so even if you can't plot the data, K-Means can still give you groups that make sense according to the metric you're using.
Another important use of K-Means is that it gives us a model by which to label data points outside of the set $X$.
Clustering Algorithms as a Predictive Model¶
One use of a clustering algorithm is that we can use the resulting $k$-partition $S^*$ (or approximation thereof) to classify elements outside of $X$.
In this case, we extend the set $X$ with a set $W$ on which the metric $d$ is still defined. This gives us a new, larger metric space $(Y, d)$ where $Y = X \cup W$. We call $Y$ the domain of the clustering algorithm and we call $X$ the training data. When we run K-Means on $X$, we say that we're "fitting" a K-Means model to the training data.
Then, given $w \in W$, we predict that $w$ belongs to the cluster
$$\hat{S} = \text{argmin}_{S_i \in S^*} \, d(w, \mu_i)$$
Below, we group everything we've talked about so far into a class using sklearn's API.
from sklearn.base import BaseEstimator, ClusterMixin
class KMeans(BaseEstimator, ClusterMixin):
    """Basic k-means clustering class."""

    def __init__(self, n_clusters=8, max_iter=100, tol=1e-5, norm=2):
        """Store clustering algorithm parameters.

        Parameters:
            n_clusters (int): How many clusters to compute.
            max_iter (int): The maximum number of iterations to compute.
            tol (float): The convergence tolerance.
            norm (int): The order of the norm to use.
        """
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.norm = norm

    def fit(self, X, y=None):
        """Compute the cluster means from random initial conditions.

        Parameters:
            X ((n_samples, n_features) ndarray): the data to be clustered.
        """
        # Randomly assign initial means and set empty clusters
        means = X[np.random.randint(X.shape[0], size=self.n_clusters),:]
        clusters = {k: [] for k in range(self.n_clusters)}
        for i in range(self.max_iter):
            # Assign each sample to the cluster mean that is closest
            for sample in X:
                # The distance between two vectors is the norm of their
                # difference. So we take the difference and compute
                # the norm on each row.
                distances = np.linalg.norm(means - sample, axis=1, ord=self.norm)
                classification = np.argmin(distances)
                clusters[classification].append(sample)
            # Recompute cluster means as the means of the new clusters
            new_means = np.zeros_like(means)
            for k, cluster in clusters.items():
                if len(cluster) == 0:  # If the cluster is empty, use a random data point
                    new_means[k] = X[np.random.randint(X.shape[0]),:]
                else:
                    new_means[k] = np.mean(cluster, axis=0)
            # If old and new means are sufficiently close, terminate early
            max_diff = np.max(np.linalg.norm(means - new_means, axis=1, ord=self.norm))
            if max_diff <= self.tol:
                break
            # Otherwise, assign new means, reset clusters and continue
            means = new_means
            clusters = {k: [] for k in range(self.n_clusters)}
        self.means = means
        return self

    def predict(self, W):
        """Classify each entry of W based on which cluster mean it is closest to.

        Parameters:
            W ((n_samples, n_features) ndarray): the data to be classified.

        Returns:
            ((n_samples,) ndarray): Integer labels from 0 to n_clusters - 1 for each entry of W.
        """
        classifications = []
        for row in W:
            classification = np.argmin(np.linalg.norm(self.means - row, axis=1, ord=self.norm))
            classifications.append(classification)
        return np.array(classifications)

    def fit_predict(self, X, y=None):
        """Fit to the data and return the resulting labels.

        Parameters:
            X ((n_samples, n_features) ndarray): the data to be clustered.
        """
        return self.fit(X).predict(X)
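As a quick sanity check (a usage sketch of my own; the query point below is made up and the exact output depends on the random initialization), here's how the class is used on the iris data we loaded earlier:
model = KMeans(n_clusters=3)
labels = model.fit_predict(X)

print(model.means)  # the three cluster means found for the iris data
print(labels[:10])  # cluster label assigned to the first 10 samples

# Classify a point that wasn't in the training data: it gets the label
# of the nearest cluster mean. These coordinates are made up.
print(model.predict(np.array([[1.5, 0.3]])))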
You'll notice that I added an additional hyperparameter, norm. This allows us to change the metric we use from the regular norm (usually called the 2-norm and denoted $\lVert \cdot \rVert_2$) to other norms, like the 1-norm $\lVert \cdot \rVert_1$.
In case you're not familiar: for any $x \in \mathbb{R}^n$ and $p \in \mathbb{N}$, the $p$-norm is defined as
$$\lVert x \rVert_p = (\lvert x_1 \rvert^p + \ldots + \lvert x_n \rvert^p)^{1/p}$$
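For reference, here's a small check (my own snippet) that np.linalg.norm, which is where the norm hyperparameter above ends up via the ord argument, matches this formula:
import numpy as np

x = np.array([3.0, -4.0])

# 2-norm: sqrt(|3|^2 + |-4|^2) = 5
print(np.linalg.norm(x, ord=2), (abs(3.0)**2 + abs(-4.0)**2) ** 0.5)

# 1-norm: |3| + |-4| = 7
print(np.linalg.norm(x, ord=1), abs(3.0) + abs(-4.0))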
As we talked about before, different metrics can provide better results in different contexts. Let's do an example.
An Example with City Planning¶
Part of the city planning process is to figure out where fire stations should go. Each house needs to be within a reasonable distance of a fire station. K-Means can help with this.
Given the latitudes and longitudes of all the houses in Sacramento, K-Means can give us good locations for fire stations throughout the city, so that no house is too far from its nearest station.
The function below accepts the number of clusters and the norm to use, and it plots the housing data with the cluster means drawn as + signs, which represent where the fire stations should go. It also prints the maximum 2-norm distance from any house to its nearest fire station, so we can compare the two norms at the end.
def plot_fire_stations(n_clusters, norm):
    data = np.load('sacramento.npy')
    model = KMeans(n_clusters=n_clusters, norm=norm)
    labels = model.fit_predict(data)
    means = model.means
    # Report the maximum 2-norm distance from any house to its assigned station
    max_dist = np.max(np.linalg.norm(data - means[labels], axis=1))
    print(f'Max 2-norm distance from a house to its station: {max_dist:.2f}')
    plt.scatter(data[:,0], data[:,1], s=2, c=labels)
    plt.scatter(means[:,0], means[:,1], s=50, c=np.unique(labels), marker='+')
    plt.ylabel('Latitude')
    plt.xlabel('Longitude')
    plt.title(f'Optimal Sacramento Fire Station Placement: {norm}-norm')
    plt.show()
Let's try it out with the regular 2-norm first.
plot_fire_stations(16, 2)
It's a bit difficult to interpret the max distance number, since our data is in latitude and longitude. We could say that no house is more than 0.19 latitude/longitude units away from a fire station. The plot helps to interpret this number. You can see that most houses are quite close to a station (marked by +'s).
The issue with using the 2-norm is that the 2-norm represents distance "as the crow flies", or in other words, from one point directly to another. Fire trucks don't travel as the crow flies, they travel along the streets. What metric could we use to account for this?
First, notice that the streets in the plot above appear to form a grid. To get from one point to another in a grid system, you have to travel along one axis first, then along the other. The 1-norm measures the distance between two points as the sum of the absolute differences of their x and y coordinates, which is exactly the path a car drives on a grid, so it's a much more accurate measure of travel distance here. Let's try again with the 1-norm.
plot_fire_stations(16, 1)
With the 1-norm, notice how the clusters are more diamond shaped. This is a reflection of the fact that it's quick to get to a fire station if you live on either of the streets that intersect it.
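To see why the norm reshapes the clusters, here's a small made-up example in which the nearest station flips depending on the norm: a station roughly two blocks away diagonally is closer than one three blocks straight down the street under the 2-norm, but farther under the 1-norm.
import numpy as np

# A house at the origin and two candidate stations (made-up coordinates):
# one roughly two blocks east and two blocks north, one three blocks
# straight up the same street.
house = np.array([0.0, 0.0])
stations = np.array([[2.1, 2.1],
                     [0.0, 3.0]])

for p in (2, 1):
    dists = np.linalg.norm(stations - house, axis=1, ord=p)
    print(f"{p}-norm distances: {dists}, closest station: {np.argmin(dists)}")

# 2-norm: station 0 is closer (~2.97 vs 3.0), as the crow flies
# 1-norm: station 1 is closer (3.0 vs 4.2), along the grid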