# Machine learning algorithm 08 clustering exploration (04 exploration of K-means clustering algorithm, hierarchical clustering concept, density clustering DBSCAN exploration)

## Exploration of K-means clustering algorithm

```import  numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as ds
import matplotlib.colors
from sklearn.cluster import  KMeans,MiniBatchKMeans

def expand(a,b):
d=(b-a)*0.1
return a-d,b+d
if __name__ == '__main__':
N=400               #Create 400 samples
centers=4           #Create 4 sets of data
# n_features=2 samples contain 2 dimension centers, so the samples are distributed in 4 clusters
# The return value data, y, means that we created the data through ds. In fact, we know the data in that label
data,y=ds.make_blobs(N,n_features=2,centers=centers,random_state=2)
# Specify the variance of the four groups to create data
data2,y2 = ds.make_blobs(N, n_features=2, centers=centers,cluster_std=(1,2.5,0.5,2),random_state=2)
#All the data of the first group of data, the second group of data to the top 50, the top 20 of the third group of data and the top 5 of the fourth group of data
data3=np.vstack((data[y==0][:],data[y==1][:50],data[y==2][:20],data[y==3][:5]))
# Label the data3 data
y3=np.array([0]*100+[1]*50+[2]*20+[3]*5)
#  Create KMeans object
cls=KMeans(n_clusters=4,init='k-means++')  #initial
y_hat=cls.fit_predict(data)  #Training and prediction data
y2_hat = cls.fit_predict(data2) #Training and prediction data2
y3_hat = cls.fit_predict(data3) #Training and prediction data3

m=np.array(((1,1),(1,3)))  #Create a matrix
data_r=data.dot(m)         #Use the created matrix to linearly transform data
y_r_hat=cls.fit_predict(data_r)  #Training and prediction data_r
matplotlib.rcParams['font.sans-serif']=[u'SimHei'] #Can display Chinese
matplotlib.rcParams['axes.unicode_minus'] = False
cm=matplotlib.colors.ListedColormap(list('rgbm'))
plt.figure(figsize=(9,10),facecolor='w')
plt.subplot(421)
plt.title(u'raw data')
plt.scatter(data[:,0],data[:,1],c=y,s=30,cmap=cm,edgecolors='none')
x1_min, x2_min = np.min(data, axis=0)
x1_max, x2_max = np.max(data, axis=0)
x1_min, x1_max = expand(x1_min, x1_max)
x2_min, x2_max = expand(x2_min, x2_max)
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.grid(True)

plt.subplot(422)
plt.title(u'KMeans++clustering')
plt.scatter(data[:, 0], data[:, 1], c=y_hat, s=30, cmap=cm, edgecolors='none')
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.grid(True)

plt.subplot(423)
plt.title(u'Data after rotation')
plt.scatter(data_r[:, 0], data_r[:, 1], c=y, s=30, cmap=cm, edgecolors='none')
x1_min, x2_min = np.min(data_r, axis=0)
x1_max, x2_max = np.max(data_r, axis=0)
x1_min, x1_max = expand(x1_min, x1_max)
x2_min, x2_max = expand(x2_min, x2_max)
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.grid(True)

plt.subplot(424)
plt.title(u'After rotation KMeans++clustering')
plt.scatter(data_r[:, 0], data_r[:, 1], c=y_r_hat, s=30, cmap=cm, edgecolors='none')
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.grid(True)

plt.subplot(425)
plt.title(u'Unequal variance data')
plt.scatter(data2[:, 0], data2[:, 1], c=y2, s=30, cmap=cm, edgecolors='none')
x1_min, x2_min = np.min(data2, axis=0)
x1_max, x2_max = np.max(data2, axis=0)
x1_min, x1_max = expand(x1_min, x1_max)
x2_min, x2_max = expand(x2_min, x2_max)
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.grid(True)

plt.subplot(426)
plt.title(u'Unequal variance KMeans++clustering')
plt.scatter(data2[:, 0], data2[:, 1], c=y2_hat, s=30, cmap=cm, edgecolors='none')
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.grid(True)

plt.subplot(427)
plt.title(u'Unequal quantity data')
plt.scatter(data3[:, 0], data3[:, 1], s=30, c=y3, cmap=cm, edgecolors='none')
x1_min, x2_min = np.min(data3, axis=0)
x1_max, x2_max = np.max(data3, axis=0)
x1_min, x1_max = expand(x1_min, x1_max)
x2_min, x2_max = expand(x2_min, x2_max)
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.grid(True)

plt.subplot(428)
plt.title(u'Unequal quantity KMeans++clustering')
plt.scatter(data3[:, 0], data3[:, 1], c=y3_hat, s=30, cmap=cm, edgecolors='none')
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.grid(True)

plt.tight_layout(2, rect=(0, 0, 1, 0.97))
plt.suptitle(u'Data distribution pair KMeans Influence of clustering', fontsize=18)
# https://github.com/matplotlib/matplotlib/issues/829
plt.show()
# plt.savefig('cluster_kmeans')

```

## Exploration of relevant indicators

```# !/usr/bin/python
# -*- coding:utf-8 -*-

from sklearn import metrics

if __name__ == "__main__":
y = [0, 0, 0, 1, 1, 1]
y_hat = [0, 0, 1, 1, 2, 2]
h = metrics.homogeneity_score(y, y_hat)
c = metrics.completeness_score(y, y_hat)
print(u'identity(Homogeneity): ', h)
print(u'Integrity(Completeness): ', c)
v2 = 2 * c * h / (c + h)
v = metrics.v_measure_score(y, y_hat)
print(u'V-Measure: ', v2, v)

y = [0, 0, 0, 1, 1, 1]
y_hat = [0, 0, 1, 3, 3, 3]
h = metrics.homogeneity_score(y, y_hat)
c = metrics.completeness_score(y, y_hat)
v = metrics.v_measure_score(y, y_hat)
print(u'identity(Homogeneity): ', h)
print(u'Integrity(Completeness): ', c)
print(u'V-Measure: ', v)

# Allow different values
y = [0, 0, 0, 1, 1, 1]
y_hat = [1, 1, 1, 0, 0, 0]
h = metrics.homogeneity_score(y, y_hat)
c = metrics.completeness_score(y, y_hat)
v = metrics.v_measure_score(y, y_hat)
print(u'identity(Homogeneity): ', h)
print(u'Integrity(Completeness): ', c)
print(u'V-Measure: ', v)

y = [0, 0, 1, 1]
y_hat = [0, 1, 0, 1]
print(ari)

y = [0, 0, 0, 1, 1, 1]
y_hat = [0, 0, 1, 1, 2, 2]
print(ari)

```

result

```identity(Homogeneity):  0.6666666666666669
Integrity(Completeness):  0.420619835714305
V-Measure:  0.5158037429793889 0.5158037429793889
identity(Homogeneity):  1.0
Integrity(Completeness):  0.6853314789615865
V-Measure:  0.8132898335036762
identity(Homogeneity):  1.0
Integrity(Completeness):  1.0
V-Measure:  1.0
-0.5
0.24242424242424243

Process finished with exit code 0```

## hierarchical clustering

• the problem of k-means k value selection and initial center point selection is solved
• divided into splitting method and coagulation method

### Splitting method

• 1. Group all samples into one cluster
• While does not meet the condition (k number or distance threshold):
• calculate the distance between samples in the same cluster C, and select the two samples ab with the farthest distance (termination condition detection)
• classify samples a and b into C1 and C2 (termination condition test)
• calculate who the sample in cluster c is close to and into

### Coagulation method

1. Treat all points as an independent cluster
• While does not meet the conditions:
• calculate the distance between two clusters and find the cluster C1 and C2 with the minimum distance (multiple calculation methods)
• merge C1 and C2

• how to combine C1 and C2:
• 1. The sample distance with the smallest distance between two clusters
• 1 2The distance between the two farthest points of two clusters
• 3. Draw the distance between two clusters
• 4. The median of the distance between two thick
• 5. Find the center point of each set, and the distance of the center point represents the distance of the cluster

## Density clustering DBSCAN

• DBSCAN(Density-Based Spatial Clustering of Applications　with Noise)

• an algorithm based on density clustering, which is different from hierarchical clustering. It defines clusters as the largest set of points connected by density, can divide regions with high density into clusters, and can effectively resist noise
Q: define cluster as the largest set of points connected by density?
Concept 1: direct density up to:
• e neighborhood of object: the area of a given object within radius E;
• core object: given a number m, if there are at least m objects in the e neighborhood of the object,
This object is the core object
• direct density up to: given an object set D, If p is in the e neighborhood of q and q is a core object, p starts from q
The starting point is directly accessible

• density reachability: if there is an object chain p1p2... pn, so that p1=p,pn=q,pi+1 is directly density reachable with respect to e and m, then object P is density reachable with respect to e and m from object Q.

• density connection: if there is an object o in set D, the density of O - > P can reach, O - > Q If the density can be reached, then p and q are related to the density of e and m

## DB SCAN algorithm

• DB-SCAN looks for clusters by examining the e-neighborhood of each object in the dataset
If there is more than m in the e neighborhood of a point p Create a new object with p as the core Cluster, repeatedly search for density connected sets according to p (it is possible to merge the original generated clusters). When no new points can be added to the cluster, the search ends.

### DB_SCAN exploration

```# !/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as ds
import matplotlib.colors
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

def expand(a, b):
d = (b - a) * 0.1
return a-d, b+d

if __name__ == "__main__":
N = 1000
centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]]
data, y = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=[0.5, 0.25, 0.7, 0.5], random_state=0)
data = StandardScaler().fit_transform(data)
# Parameters of data:(epsilon, min_sample)  #epsilon radius min_ The minimum number of samples in the radius of sample is used to judge whether it is a core object
params = ((0.2, 5), (0.2, 10), (0.2, 15), (0.3, 5), (0.3, 10), (0.3, 15))

matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False

plt.figure(figsize=(12, 8), facecolor='w')
plt.suptitle(u'DBSCAN clustering', fontsize=20)

for i in range(6):
eps, min_samples = params[i]
model = DBSCAN(eps=eps, min_samples=min_samples)
model.fit(data)
y_hat = model.labels_

core_indices = np.zeros_like(y_hat, dtype=bool)
core_indices[model.core_sample_indices_] = True

y_unique = np.unique(y_hat)
n_clusters = y_unique.size - (1 if -1 in y_hat else 0)
print(y_unique, 'The number of clusters is:', n_clusters)

plt.subplot(2, 3, i+1)
clrs = plt.cm.Spectral(np.linspace(0, 0.8, y_unique.size))
print(clrs)
for k, clr in zip(y_unique, clrs):
cur = (y_hat == k)
if k == -1:
plt.scatter(data[cur, 0], data[cur, 1], s=20, c='k')
continue
plt.scatter(data[cur, 0], data[cur, 1], s=30, c=clr, edgecolors='k')
plt.scatter(data[cur & core_indices][:, 0], data[cur & core_indices][:, 1], s=60, c=clr, marker='o', edgecolors='k')
x1_min, x2_min = np.min(data, axis=0)
x1_max, x2_max = np.max(data, axis=0)
x1_min, x1_max = expand(x1_min, x1_max)
x2_min, x2_max = expand(x2_min, x2_max)
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.grid(True)
plt.title(u'epsilon = %.1f  m = %d，Number of clusters:%d' % (eps, min_samples, n_clusters), fontsize=16)
plt.tight_layout()