# TP_Clustering/test.py
from scipy.io import arff
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors as colors
import random
import hdbscan
import csv
# Pool of shuffled named colors (unused below: the scatter calls pass the
# integer labels directly and let matplotlib's colormap color the clusters).
colors_list = list(colors._colors_full_map.values())
random.shuffle(colors_list)
#datasets = ["R15", "2d-10c", "square1"] #Marche avec kmeans
#datasets = ["donut1", "banana", "spiral", "long1"] #Ne marche pas avec kmeans
#datasets = ["donut1", "banana", "spiral", "long1"] #Marche avec Agglomerative
#datasets = ["dartboard1", "R15", "atom"] #Ne marche pas avec Agglomerative
"""
datasets = ["dartboard1", "long1", "banana"] #Marche avec DBSCAN
eps_list = [0.05, 0.4, 0.04]
min_samples_list=[5,3,5]
"""
"""
datasets = ["diamond9", "R15", "atom"] #Ne marche pas avec DBSCAN
eps_list = [0.13, 0.35, 1.75]
min_samples_list=[5,3,5]
"""
datasets=["a", "h", "t", "tr", "zgn", "zgo"]
eps_list=[1.75, 0.85, 0.4, 0.5, 0.36, 0.5]
min_samples_list=[5,5,3,5,3,5]
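# Hedged sketch (not part of the original script): the eps values above look
# hand-tuned; a common way to choose eps for DBSCAN is the k-distance "elbow"
# plot below, assuming sklearn.neighbors.NearestNeighbors.
from sklearn.neighbors import NearestNeighbors

def k_distance_plot(points, k=5):
    # Sorted distance of every point to its k-th nearest neighbour; the
    # elbow of this curve is a reasonable eps for DBSCAN(min_samples=k).
    dist, _ = NearestNeighbors(n_neighbors=k).fit(points).kneighbors(points)
    plt.plot(np.sort(dist[:, -1]))
    plt.ylabel(str(k) + "-NN distance")
    plt.show()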
#n_clu = [9, 4, 15]
data = []
for dataset in datasets:
    #data += [arff.loadarff(open('clustering-benchmark/src/main/resources/datasets/artificial/' + dataset + '.arff','r'))]
    with open('dataset/' + dataset + '.data', 'r') as csvfile:
        data += [[list(map(float, point)) for point in csv.reader(csvfile, delimiter='\t')]]
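# Hedged sketch (assumption, mirroring the commented-out loadarff call in the
# loop above): loading one of the ARFF benchmark files into the same
# list-of-points shape as the CSV branch.
def load_arff_points(name):
    # scipy.io.arff.loadarff returns (records, metadata); keep the first
    # two (coordinate) fields of each record.
    records, _ = arff.loadarff('clustering-benchmark/src/main/resources/datasets/artificial/' + name + '.arff')
    return [[float(r[0]), float(r[1])] for r in records]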
#print(data)
for i in range(len(data)):
    #points = np.array([[data[i][0][j][0], data[i][0][j][1]] for j in range(len(data[i][0]))])
    points = np.array(data[i])
    # Silhouette scores lie in [-1, 1] and can be negative, so start at
    # -inf to guarantee best_labels is assigned on the first iteration.
    best_coef = -np.inf
    best_labels = []
    for n in range(2, 16):
        labels = KMeans(n_clusters=n).fit_predict(points)
        #labels = AgglomerativeClustering(n_clusters=n, linkage='single').fit_predict(points)
        coef = metrics.silhouette_score(points, labels)
        #coef = metrics.calinski_harabasz_score(points, labels)
        if coef > best_coef:
            best_coef = coef
            best_labels = labels
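    # Hedged alternative (assumption, in the spirit of the commented-out
    # calinski_harabasz_score line above): the Davies-Bouldin index,
    # metrics.davies_bouldin_score(points, labels), rates lower as better,
    # so the comparison would flip to coef < best_coef.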
    #best_labels = DBSCAN(eps=eps_list[i], min_samples=min_samples_list[i]).fit_predict(points)
    #best_labels = hdbscan.HDBSCAN(min_samples=min_samples_list[i]).fit_predict(points)
    # Labels run from 0 upward (DBSCAN/HDBSCAN mark noise as -1), so
    # max(best_labels) + 1 counts the clusters found.
    n_clu_best = max(best_labels) + 1
    fig = plt.figure()
    if len(points[0]) == 2:
        ax = fig.add_subplot(111)
        ax.scatter(points[:, 0], points[:, 1], s=1, c=best_labels)
    elif len(points[0]) == 3:
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(points[:, 0], points[:, 1], points[:, 2], s=1, c=best_labels)
    plt.title(datasets[i] + " (" + str(n_clu_best) + " clusters)")
    plt.show()