Machine learning algorithms 08: clustering exploration (04: the k-means clustering algorithm, hierarchical clustering concepts, and density clustering with DBSCAN)

Exploration of K-means clustering algorithm

import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as ds
import matplotlib.colors
from sklearn.cluster import KMeans, MiniBatchKMeans

def expand(a, b):
    d = (b - a) * 0.1   # Pad the axis range by 10% on each side
    return a - d, b + d
if __name__ == '__main__':
    N = 400             # Number of samples to create
    centers = 4         # Number of cluster centers to generate
    # n_features=2: each sample has 2 features, so the samples fall into 4 two-dimensional clusters
    # make_blobs returns the data along with the true labels y, so we know which cluster each sample belongs to
    data, y = ds.make_blobs(N, n_features=2, centers=centers, random_state=2)
    # Generate data with a different standard deviation for each of the four clusters
    data2, y2 = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=(1, 2.5, 0.5, 2), random_state=2)
    # Build an unbalanced dataset: all samples of cluster 0, the first 50 of cluster 1,
    # the first 20 of cluster 2 and the first 5 of cluster 3
    data3 = np.vstack((data[y == 0][:], data[y == 1][:50], data[y == 2][:20], data[y == 3][:5]))
    # Label the data3 samples accordingly
    y3 = np.array([0] * 100 + [1] * 50 + [2] * 20 + [3] * 5)
    # Create the KMeans object with k-means++ initialization
    cls = KMeans(n_clusters=4, init='k-means++')
    y_hat = cls.fit_predict(data)      # Fit and predict on data
    y2_hat = cls.fit_predict(data2)    # Fit and predict on data2
    y3_hat = cls.fit_predict(data3)    # Fit and predict on data3

    m = np.array(((1, 1), (1, 3)))     # Transformation matrix
    data_r = data.dot(m)               # Apply a linear transformation to the data
    y_r_hat = cls.fit_predict(data_r)  # Fit and predict on the transformed data

    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']  # Enable rendering of Chinese characters
    matplotlib.rcParams['axes.unicode_minus'] = False
    cm = matplotlib.colors.ListedColormap(list('rgbm'))
    plt.figure(figsize=(9, 10), facecolor='w')
    plt.subplot(421)
    plt.title(u'Raw data')
    plt.scatter(data[:, 0], data[:, 1], c=y, s=30, cmap=cm, edgecolors='none')
    x1_min, x2_min = np.min(data, axis=0)
    x1_max, x2_max = np.max(data, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True)

    plt.subplot(422)
    plt.title(u'KMeans++ clustering')
    plt.scatter(data[:, 0], data[:, 1], c=y_hat, s=30, cmap=cm, edgecolors='none')
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True)

    plt.subplot(423)
    plt.title(u'Data after rotation')
    plt.scatter(data_r[:, 0], data_r[:, 1], c=y, s=30, cmap=cm, edgecolors='none')
    x1_min, x2_min = np.min(data_r, axis=0)
    x1_max, x2_max = np.max(data_r, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True)

    plt.subplot(424)
    plt.title(u'KMeans++ clustering after rotation')
    plt.scatter(data_r[:, 0], data_r[:, 1], c=y_r_hat, s=30, cmap=cm, edgecolors='none')
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True)

    plt.subplot(425)
    plt.title(u'Unequal variance data')
    plt.scatter(data2[:, 0], data2[:, 1], c=y2, s=30, cmap=cm, edgecolors='none')
    x1_min, x2_min = np.min(data2, axis=0)
    x1_max, x2_max = np.max(data2, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True)

    plt.subplot(426)
    plt.title(u'KMeans++ clustering with unequal variance')
    plt.scatter(data2[:, 0], data2[:, 1], c=y2_hat, s=30, cmap=cm, edgecolors='none')
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True)

    plt.subplot(427)
    plt.title(u'Data with unequal cluster sizes')
    plt.scatter(data3[:, 0], data3[:, 1], s=30, c=y3, cmap=cm, edgecolors='none')
    x1_min, x2_min = np.min(data3, axis=0)
    x1_max, x2_max = np.max(data3, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True)

    plt.subplot(428)
    plt.title(u'KMeans++ clustering with unequal cluster sizes')
    plt.scatter(data3[:, 0], data3[:, 1], c=y3_hat, s=30, cmap=cm, edgecolors='none')
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True)

    plt.tight_layout(pad=2, rect=(0, 0, 1, 0.97))
    plt.suptitle(u'Influence of data distribution on KMeans clustering', fontsize=18)
    # https://github.com/matplotlib/matplotlib/issues/829
    # plt.subplots_adjust(top=0.92)
    plt.show()
    # plt.savefig('cluster_kmeans')
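
MiniBatchKMeans is imported above but never used. As a hedged aside, the following minimal sketch shows how it could stand in for KMeans on the same blob data; the batch_size value is an illustrative assumption, not taken from the original.

import numpy as np
import sklearn.datasets as ds
from sklearn.cluster import KMeans, MiniBatchKMeans

# Recreate the same blob data as above
data, y = ds.make_blobs(400, n_features=2, centers=4, random_state=2)

# Full-batch k-means++ for reference
km = KMeans(n_clusters=4, init='k-means++')
km.fit(data)

# Mini-batch variant: updates the centroids from small random batches,
# trading a little accuracy for much lower cost on large datasets
mbk = MiniBatchKMeans(n_clusters=4, init='k-means++', batch_size=100)
mbk.fit(data)

# Cluster ids are arbitrary, so compare the fitted centers and inertia rather than raw labels
print(km.cluster_centers_)
print(mbk.cluster_centers_)
print(km.inertia_, mbk.inertia_)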



Exploration of clustering evaluation metrics

#!/usr/bin/python
# -*- coding:utf-8 -*-

from sklearn import metrics


if __name__ == "__main__":
    y = [0, 0, 0, 1, 1, 1]
    y_hat = [0, 0, 1, 1, 2, 2]
    h = metrics.homogeneity_score(y, y_hat)
    c = metrics.completeness_score(y, y_hat)
    print(u'Homogeneity: ', h)
    print(u'Completeness: ', c)
    v2 = 2 * c * h / (c + h)  # V-Measure is the harmonic mean of homogeneity and completeness
    v = metrics.v_measure_score(y, y_hat)
    print(u'V-Measure: ', v2, v)

    y = [0, 0, 0, 1, 1, 1]
    y_hat = [0, 0, 1, 3, 3, 3]
    h = metrics.homogeneity_score(y, y_hat)
    c = metrics.completeness_score(y, y_hat)
    v = metrics.v_measure_score(y, y_hat)
    print(u'Homogeneity: ', h)
    print(u'Completeness: ', c)
    print(u'V-Measure: ', v)

    # The label values themselves do not matter: a permutation of the cluster ids scores perfectly
    y = [0, 0, 0, 1, 1, 1]
    y_hat = [1, 1, 1, 0, 0, 0]
    h = metrics.homogeneity_score(y, y_hat)
    c = metrics.completeness_score(y, y_hat)
    v = metrics.v_measure_score(y, y_hat)
    print(u'Homogeneity: ', h)
    print(u'Completeness: ', c)
    print(u'V-Measure: ', v)

    y = [0, 0, 1, 1]
    y_hat = [0, 1, 0, 1]
    ari = metrics.adjusted_rand_score(y, y_hat)
    print(ari)

    y = [0, 0, 0, 1, 1, 1]
    y_hat = [0, 0, 1, 1, 2, 2]
    ari = metrics.adjusted_rand_score(y, y_hat)
    print(ari)

Result

Homogeneity:  0.6666666666666669
Completeness:  0.420619835714305
V-Measure:  0.5158037429793889 0.5158037429793889
Homogeneity:  1.0
Completeness:  0.6853314789615865
V-Measure:  0.8132898335036762
Homogeneity:  1.0
Completeness:  1.0
V-Measure:  1.0
-0.5
0.24242424242424243

Process finished with exit code 0
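
As an aside on the -0.5 above: the unadjusted Rand index is the fraction of sample pairs on which the two labelings agree, and the adjusted version (ARI) rescales it so that random labelings score near 0 and can go negative. A minimal sketch computing the raw Rand index by hand for that example; the rand_index helper is our own, not part of sklearn:

from itertools import combinations

from sklearn import metrics


def rand_index(y, y_hat):
    # A pair of samples "agrees" if both labelings put it in the same cluster,
    # or both labelings put it in different clusters
    agree = sum((a1 == a2) == (b1 == b2)
                for (a1, b1), (a2, b2) in combinations(zip(y, y_hat), 2))
    n_pairs = len(y) * (len(y) - 1) // 2
    return agree / n_pairs


y = [0, 0, 1, 1]
y_hat = [0, 1, 0, 1]
print(rand_index(y, y_hat))                   # 0.333...: 2 of 6 pairs agree
print(metrics.adjusted_rand_score(y, y_hat))  # -0.5 after adjusting for chance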

Hierarchical clustering

• It avoids the k-value selection and initial-center selection problems of k-means
• It comes in two flavors: the divisive (top-down) method and the agglomerative (bottom-up) method

Divisive method

• 1. Put all samples into a single cluster
• 2. While the stopping condition (a target cluster count k, or a distance threshold) is not met:
    • Compute the pairwise distances between the samples in a cluster C and select the two samples a and b that are farthest apart (stopping-condition check)
    • Split a and b off into two new clusters C1 and C2 (stopping-condition check)
    • Assign each remaining sample in C to whichever of C1 and C2 it is closer to (a minimal sketch of this procedure follows)
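
A minimal NumPy sketch of the divisive procedure above, assuming Euclidean distance; the helper names and the widest-cluster selection heuristic are our own illustrative choices:

import numpy as np


def split_cluster(points):
    # Find the two samples a and b that are farthest apart ...
    dist = np.linalg.norm(points[:, None, :] - points[None, :, :], axis=-1)
    a, b = np.unravel_index(np.argmax(dist), dist.shape)
    # ... then assign every sample to whichever of the two seeds it is closer to
    to_a = np.linalg.norm(points - points[a], axis=1) <= np.linalg.norm(points - points[b], axis=1)
    return points[to_a], points[~to_a]


def divisive_clustering(points, k):
    clusters = [points]                # 1. start with all samples in one cluster
    while len(clusters) < k:           # 2. split until the target count k is reached
        # Heuristic: split the cluster with the largest coordinate range
        widest = max(range(len(clusters)), key=lambda i: np.ptp(clusters[i], axis=0).sum())
        clusters.extend(split_cluster(clusters.pop(widest)))
    return clusters


rng = np.random.RandomState(0)
pts = np.vstack((rng.randn(20, 2), rng.randn(20, 2) + 5))
for c in divisive_clustering(pts, 2):
    print(len(c), c.mean(axis=0))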


Agglomerative method

• 1. Treat every point as its own independent cluster
• 2. While the stopping condition is not met:
    • Compute the pairwise distances between clusters and find the pair C1 and C2 with the minimum distance (there are several ways to measure it)
    • Merge C1 and C2

• How to measure the distance between C1 and C2 (see the sketch below):
    • 1. Single linkage: the smallest distance between a sample in one cluster and a sample in the other
    • 2. Complete linkage: the distance between the two farthest points of the two clusters
    • 3. Average linkage: the average of all pairwise distances between the two clusters
    • 4. Median linkage: the median of the pairwise distances between the two clusters
    • 5. Centroid linkage: compute the center point of each cluster; the distance between the centroids represents the distance between the clusters
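
These linkage choices map onto scikit-learn's AgglomerativeClustering, which supports 'single', 'complete', 'average' and 'ward' linkage (median and centroid linkage are available in scipy.cluster.hierarchy instead). A minimal sketch on the same blob data as earlier:

import sklearn.datasets as ds
from sklearn.cluster import AgglomerativeClustering

data, y = ds.make_blobs(400, n_features=2, centers=4, random_state=2)

# Compare the linkage criteria supported by scikit-learn
for linkage in ('single', 'complete', 'average', 'ward'):
    model = AgglomerativeClustering(n_clusters=4, linkage=linkage)
    y_hat = model.fit_predict(data)
    print(linkage, 'cluster sizes:', sorted((y_hat == k).sum() for k in range(4)))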

Hierarchical clustering vs. k-means

• Hierarchical clustering needs neither k nor initial centers chosen up front and produces a full merge/split tree, but each merge or split is greedy and cannot be undone
• k-means is cheaper on large datasets, but it requires k in advance and is sensitive to the initial centers

Density clustering DBSCAN


• DBSCAN (Density-Based Spatial Clustering of Applications with Noise)


• DBSCAN is a density-based clustering algorithm that differs from hierarchical clustering: it defines a cluster as the largest set of density-connected points, can carve regions of high density into clusters, and is effectively robust to noise
Q: what does "a cluster is the largest set of density-connected points" mean?
Concept 1: directly density-reachable:
• ε-neighborhood of an object: the region within radius ε around the object
• Core object: given a count m, if the ε-neighborhood of an object contains at least m objects, that object is a core object
• Directly density-reachable: given an object set D, if p lies in the ε-neighborhood of q and q is a core object, then p is directly density-reachable from q

• Density-reachable: if there is a chain of objects p1, p2, ..., pn with p1 = q and pn = p such that each p(i+1) is directly density-reachable from p(i) with respect to ε and m, then object p is density-reachable from object q with respect to ε and m

• Density-connected: if there is an object o in D such that both p and q are density-reachable from o with respect to ε and m, then p and q are density-connected with respect to ε and m (a small NumPy sketch of these definitions follows)
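
A minimal NumPy sketch of these definitions, assuming Euclidean distance; the values of epsilon and m and the variable names are illustrative assumptions:

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(20, 2)
epsilon, m = 0.35, 4

# Pairwise Euclidean distance matrix
dist = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=-1)

# ε-neighborhood of each object: every point within radius epsilon (itself included)
neighborhood = dist <= epsilon

# Core objects: at least m objects in their ε-neighborhood
is_core = neighborhood.sum(axis=1) >= m

# Entry [q, p] is True iff p is directly density-reachable from q:
# p lies in q's ε-neighborhood and q is a core object
directly_reachable = neighborhood & is_core[:, None]

print('core objects:', np.flatnonzero(is_core))
print('directly reachable from object 0:', np.flatnonzero(directly_reachable[0]))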


DBSCAN algorithm

• DBSCAN looks for clusters by examining the ε-neighborhood of each object in the dataset. If the ε-neighborhood of a point p contains at least m objects, a new cluster is created with p as a core object; the algorithm then repeatedly collects the objects that are density-connected to p (possibly merging clusters generated earlier). The search ends when no new point can be added to any cluster.

DBSCAN exploration

#!/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as ds
import matplotlib.colors
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler


def expand(a, b):
    d = (b - a) * 0.1
    return a-d, b+d


if __name__ == "__main__":
    N = 1000
    centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]]
    data, y = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=[0.5, 0.25, 0.7, 0.5], random_state=0)
    data = StandardScaler().fit_transform(data)
    # Parameter grid (epsilon, min_samples): epsilon is the neighborhood radius and
    # min_samples is the minimum number of samples within that radius for a point to be a core object
    params = ((0.2, 5), (0.2, 10), (0.2, 15), (0.3, 5), (0.3, 10), (0.3, 15))

    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    plt.figure(figsize=(12, 8), facecolor='w')
    plt.suptitle(u'DBSCAN clustering', fontsize=20)

    for i in range(6):
        eps, min_samples = params[i]
        model = DBSCAN(eps=eps, min_samples=min_samples)
        model.fit(data)
        y_hat = model.labels_

        core_indices = np.zeros_like(y_hat, dtype=bool)
        core_indices[model.core_sample_indices_] = True

        y_unique = np.unique(y_hat)
        n_clusters = y_unique.size - (1 if -1 in y_hat else 0)  # the label -1 marks noise, not a cluster
        print(y_unique, 'The number of clusters is:', n_clusters)

        plt.subplot(2, 3, i+1)
        clrs = plt.cm.Spectral(np.linspace(0, 0.8, y_unique.size))
        for k, clr in zip(y_unique, clrs):
            cur = (y_hat == k)
            if k == -1:
                # Noise points are labeled -1 and drawn small in black
                plt.scatter(data[cur, 0], data[cur, 1], s=20, c='k')
                continue
            # Cluster members (non-core points)
            plt.scatter(data[cur, 0], data[cur, 1], s=30, color=clr, edgecolors='k')
            # Core objects of the same cluster, drawn larger
            plt.scatter(data[cur & core_indices][:, 0], data[cur & core_indices][:, 1], s=60, color=clr, marker='o', edgecolors='k')
        x1_min, x2_min = np.min(data, axis=0)
        x1_max, x2_max = np.max(data, axis=0)
        x1_min, x1_max = expand(x1_min, x1_max)
        x2_min, x2_max = expand(x2_min, x2_max)
        plt.xlim((x1_min, x1_max))
        plt.ylim((x2_min, x2_max))
        plt.grid(True)
        plt.title(u'epsilon = %.1f, m = %d, number of clusters: %d' % (eps, min_samples, n_clusters), fontsize=16)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()
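
The (epsilon, min_samples) grid above is hand-picked. A common heuristic for choosing epsilon, sketched below on the same standardized data, is to sort every point's distance to its k-th nearest neighbor and read epsilon off the elbow of the curve; taking k equal to min_samples is a conventional assumption, not something from the original.

import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as ds
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]]
data, y = ds.make_blobs(1000, n_features=2, centers=centers, cluster_std=[0.5, 0.25, 0.7, 0.5], random_state=0)
data = StandardScaler().fit_transform(data)

k = 10  # match min_samples
nn = NearestNeighbors(n_neighbors=k + 1).fit(data)  # +1 because each query point returns itself at distance 0
dist, _ = nn.kneighbors(data)
kth = np.sort(dist[:, -1])  # distance to the k-th true neighbor, in ascending order

plt.plot(kth)
plt.xlabel('points sorted by k-th neighbor distance')
plt.ylabel('k-th nearest neighbor distance')
plt.grid(True)
plt.show()  # read a candidate epsilon off the elbow of this curve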
