from scipy.io import arff
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors as colors
import random
import hdbscan
import csv

# Shuffle the full matplotlib color map so successive cluster ids get
# visually distinct colors when plotted.
colors_list = list(colors._colors_full_map.values())
random.shuffle(colors_list)

# --- Previous experiment configurations, kept for reference ---
#datasets = ["R15", "2d-10c", "square1"]             # works with k-means
#datasets = ["donut1", "banana", "spiral", "long1"]  # fails with k-means

#datasets = ["donut1", "banana", "spiral", "long1"]  # works with Agglomerative
#datasets = ["dartboard1", "R15", "atom"]            # fails with Agglomerative

"""
datasets = ["dartboard1", "long1", "banana"] #Marche avec DBSCAN
eps_list = [0.05, 0.4, 0.04]
min_samples_list=[5,3,5]
"""

"""
datasets = ["diamond9", "R15", "atom"] #Ne marche pas avec DBSCAN
eps_list = [0.13, 0.35, 1.75]
min_samples_list=[5,3,5]
"""

# Active configuration: dataset file names, with per-dataset DBSCAN/HDBSCAN
# parameters (eps_list[i], min_samples_list[i] pair with datasets[i]).
datasets = ["a", "h", "t", "tr", "zgn", "zgo"]
eps_list = [1.75, 0.85, 0.4, 0.5, 0.36, 0.5]
min_samples_list = [5, 5, 3, 5, 3, 5]

#n_clu = [9, 4, 15]
-
# data[i] holds the point list of datasets[i]: each 'dataset/<name>.data'
# file is tab-separated numeric coordinates, one point per line.
data = []

for dataset in datasets:
    #data += [arff.loadarff(open('clustering-benchmark/src/main/resources/datasets/artificial/' + dataset + '.arff','r'))]

    # Use a context manager so the file handle is always closed
    # (the original opened the file and never closed it).
    with open('dataset/' + dataset + '.data', 'r') as csvfile:
        data.append([list(map(float, point))
                     for point in csv.reader(csvfile, delimiter='\t')])
#print(data)
-
for i, raw_points in enumerate(data):
    #points = np.array([[data[i][0][j][0], data[i][0][j][1]] for j in range(len(data[i][0]))])
    points = np.array(raw_points)

    # Model selection for k-means: try k = 2..15 and keep the labeling with the
    # highest silhouette score. Silhouette lies in [-1, 1], so guard with
    # `best_labels is None` rather than starting from 0 — the original
    # initialized best_coef = 0 and could leave best_labels empty when every
    # score was non-positive, crashing later on max([]).
    best_coef = -1.0
    best_labels = None
    for n in range(2, 16):
        labels = KMeans(n_clusters=n).fit_predict(points)
        #labels = AgglomerativeClustering(n_clusters=n, linkage='single').fit_predict(points)
        coef = metrics.silhouette_score(points, labels)
        #coef = metrics.calinski_harabasz_score(points, labels)
        if best_labels is None or coef > best_coef:
            best_coef = coef
            best_labels = labels

    #best_labels = DBSCAN(eps=eps_list[i], min_samples=min_samples_list[i]).fit_predict(points)
    #best_labels = hdbscan.HDBSCAN(min_samples=min_samples_list[i]).fit_predict(points)

    # Cluster labels are 0..k-1, so the cluster count is max label + 1.
    n_clu_best = max(best_labels) + 1

    fig = plt.figure()

    # Plot in 2-D or 3-D depending on point dimensionality; other
    # dimensionalities still get a (blank) titled figure, as before.
    dim = len(points[0])
    if dim == 2:
        ax = fig.add_subplot(111)
        ax.scatter(points[:, 0], points[:, 1], s=1, c=best_labels)
    elif dim == 3:
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(points[:, 0], points[:, 1], points[:, 2], s=1, c=best_labels)
    plt.title(datasets[i] + "(" + str(n_clu_best) + " clusters)")
    plt.show()