"""Tune HDBSCAN's ``min_samples`` on a 2-D point set.

Reads whitespace-separated x/y pairs from ``zgn.data``, sweeps
``min_samples`` over [2, 20) (upper bound exclusive), records the
silhouette, Davies-Bouldin and Calinski-Harabasz scores for each fit,
plots the score curves, then re-fits and scatter-plots the best
clustering according to each metric.
"""
from scipy.io import arff
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
import hdbscan

# NOTE(review): unused in this script — kept so any external importer
# relying on it keeps working.
n_clusters = 2

# Candidate min_samples values: 2 through 19 (range upper bound is exclusive).
MIN_SAMPLES_RANGE = range(2, 20)


def _load_points(path):
    """Load 2-column data from *path*; return (x_list, y_list, points).

    ``points`` is a list of [x, y] pairs as expected by fit_predict and
    the sklearn metric functions.
    """
    data = np.loadtxt(path)
    x_list = data[:, 0].tolist()
    y_list = data[:, 1].tolist()
    return x_list, y_list, data.tolist()


def _sweep_min_samples(points):
    """Fit HDBSCAN once per candidate min_samples; return the three score lists.

    Returns (silhouette, davies, calinski), each aligned with
    MIN_SAMPLES_RANGE.
    """
    silhouette, davies, calinski = [], [], []
    for n in MIN_SAMPLES_RANGE:
        labels = hdbscan.HDBSCAN(min_samples=n).fit_predict(points)
        # NOTE(review): HDBSCAN labels noise as -1; the sklearn scores then
        # treat all noise points as one extra cluster — confirm this is the
        # intended comparison.
        silhouette.append(metrics.silhouette_score(points, labels, metric='euclidean'))
        davies.append(metrics.davies_bouldin_score(points, labels))
        calinski.append(metrics.calinski_harabasz_score(points, labels))
    return silhouette, davies, calinski


def _plot_score_curves(silhouette, davies):
    """Plot the silhouette and Davies-Bouldin curves against min_samples."""
    plt.plot(MIN_SAMPLES_RANGE, silhouette, marker='o', label='Silhouette')
    plt.plot(MIN_SAMPLES_RANGE, davies, marker='o', label='Davies')
    plt.xlim(2, 20)
    plt.xlabel('Nb minimum de voisins')
    plt.legend()
    plt.show()


def _best_min_samples(scores, best):
    """Return the min_samples value whose score is optimal.

    *best* is ``max`` for silhouette/Calinski-Harabasz (higher is better)
    or ``min`` for Davies-Bouldin (lower is better). Ties resolve to the
    first (smallest) min_samples, matching the original behaviour.
    """
    return scores.index(best(scores)) + MIN_SAMPLES_RANGE.start


def _show_best_clustering(points, x_list, y_list, min_samples, subplot_index):
    """Re-fit HDBSCAN at the chosen *min_samples* and scatter-plot the labels."""
    plt.subplot(3, 1, subplot_index)
    labels = hdbscan.HDBSCAN(min_samples=min_samples).fit_predict(points)
    plt.scatter(x_list, y_list, c=labels, s=5)
    plt.show()


def main():
    """Run the full sweep / plot / report pipeline on zgn.data."""
    x_list, y_list, points = _load_points('zgn.data')
    silhouette, davies, calinski = _sweep_min_samples(points)
    _plot_score_curves(silhouette, davies)

    # One pass per metric: report the best min_samples and show the fit.
    reports = (
        ("Silhouette", silhouette, max, 1),
        ("Davies Bouldin", davies, min, 2),
        ("Calinski Harabasz", calinski, max, 3),
    )
    for name, scores, best, subplot_index in reports:
        winner = _best_min_samples(scores, best)
        print(name + " : ", winner)
        _show_best_clustering(points, x_list, y_list, winner, subplot_index)


if __name__ == "__main__":
    main()