# Comparison of clustering algorithms (KMeans / Agglomerative / DBSCAN / HDBSCAN)
# on small 2-D/3-D benchmark datasets, picking the cluster count by silhouette score.
# Standard library
import csv
import random

# Third-party
import hdbscan
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import arff
from sklearn import metrics
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans
# Build a shuffled palette from every named matplotlib color so that
# consecutive cluster ids do not end up with near-identical hues.
colors_list = [*colors._colors_full_map.values()]
random.shuffle(colors_list)
# Dataset suites tried during experimentation (kept for reference):
#   KMeans        works: ["R15", "2d-10c", "square1"]
#                 fails: ["donut1", "banana", "spiral", "long1"]
#   Agglomerative works: ["donut1", "banana", "spiral", "long1"]
#                 fails: ["dartboard1", "R15", "atom"]
#   DBSCAN        works: ["dartboard1", "long1", "banana"]
#                        eps_list = [0.05, 0.4, 0.04], min_samples_list = [5, 3, 5]
#                 fails: ["diamond9", "R15", "atom"]
#                        eps_list = [0.13, 0.35, 1.75], min_samples_list = [5, 3, 5]

# Datasets actually processed below, with per-dataset DBSCAN parameters
# (eps_list / min_samples_list are indexed in lockstep with datasets).
datasets = ["a", "h", "t", "tr", "zgn", "zgo"]
eps_list = [1.75, 0.85, 0.4, 0.5, 0.36, 0.5]
min_samples_list = [5, 5, 3, 5, 3, 5]

# n_clu = [9, 4, 15]  # fixed cluster counts once used with an earlier suite
# Load each dataset as a list of points, one list of float coordinates per line.
# Files under dataset/ are tab-separated tables (2 or 3 columns per point).
data = []

for dataset in datasets:
    # ARFF variant from the clustering-benchmark repo (kept for reference):
    # data += [arff.loadarff(open('clustering-benchmark/src/main/resources/datasets/artificial/' + dataset + '.arff','r'))]

    # Use a context manager so every file handle is closed deterministically;
    # the original opened each file and never closed it.
    with open('dataset/' + dataset + '.data', 'r') as csvfile:
        data.append([list(map(float, point))
                     for point in csv.reader(csvfile, delimiter='\t')])
    # print(data)
|
# For each dataset: sweep the KMeans cluster count over [2, 15], keep the
# clustering with the best silhouette score, then plot the labelled points.
for i in range(len(data)):
    # Legacy indexing for the ARFF loader output (kept for reference):
    # points = np.array([[data[i][0][j][0], data[i][0][j][1]] for j in range(len(data[i][0]))])
    points = np.array(data[i])

    # Silhouette scores lie in [-1, 1]. Start from -inf so the first candidate
    # is always kept: the original started at 0, which left best_labels empty
    # whenever every score was negative and then crashed on max(best_labels).
    best_coef = float("-inf")
    best_labels = []
    for n in range(2, 16):
        labels = KMeans(n_clusters=n).fit_predict(points)
        # labels = AgglomerativeClustering(n_clusters=n, linkage='single').fit_predict(points)
        coef = metrics.silhouette_score(points, labels)
        # coef = metrics.calinski_harabasz_score(points, labels)
        if coef > best_coef:
            best_coef = coef
            best_labels = labels

    # Density-based alternatives (kept for reference):
    # best_labels = DBSCAN(eps=eps_list[i], min_samples=min_samples_list[i]).fit_predict(points)
    # best_labels = hdbscan.HDBSCAN(min_samples=min_samples_list[i]).fit_predict(points)

    # KMeans labels are 0..k-1 (DBSCAN noise would be -1), so max+1 = cluster count.
    n_clu_best = max(best_labels) + 1

    fig = plt.figure()

    # Scatter in 2-D or 3-D depending on the dataset's dimensionality;
    # other dimensionalities are silently skipped (title/show still run).
    if len(points[0]) == 2:
        ax = fig.add_subplot(111)
        ax.scatter(points[:, 0], points[:, 1], s=1, c=best_labels)
    elif len(points[0]) == 3:
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(points[:, 0], points[:, 1], points[:, 2], s=1, c=best_labels)

    plt.title(datasets[i] + "(" + str(n_clu_best) + " clusters)")
    plt.show()