
test.py (2.6 KB)

from scipy.io import arff
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors as colors
import random
import hdbscan
import csv
# Shuffled list of every named matplotlib color (note: _colors_full_map is a
# private matplotlib API). Currently unused: the scatter calls below color
# points via c=best_labels and the default colormap instead.
colors_list = list(colors._colors_full_map.values())
random.shuffle(colors_list)
#datasets = ["R15", "2d-10c", "square1"]  # Works with k-means
#datasets = ["donut1", "banana", "spiral", "long1"]  # Does not work with k-means
#datasets = ["donut1", "banana", "spiral", "long1"]  # Works with agglomerative clustering
#datasets = ["dartboard1", "R15", "atom"]  # Does not work with agglomerative clustering
  18. """
  19. datasets = ["dartboard1", "long1", "banana"] #Marche avec DBSCAN
  20. eps_list = [0.05, 0.4, 0.04]
  21. min_samples_list=[5,3,5]
  22. """
  23. """
  24. datasets = ["diamond9", "R15", "atom"] #Ne marche pas avec DBSCAN
  25. eps_list = [0.13, 0.35, 1.75]
  26. min_samples_list=[5,3,5]
  27. """
datasets = ["a", "h", "t", "tr", "zgn", "zgo"]
eps_list = [1.75, 0.85, 0.4, 0.5, 0.36, 0.5]
min_samples_list = [5, 5, 3, 5, 3, 5]
#n_clu = [9, 4, 15]
data = []
for dataset in datasets:
    #data += [arff.loadarff(open('clustering-benchmark/src/main/resources/datasets/artificial/' + dataset + '.arff', 'r'))]
    # Use a context manager so each file is closed once read.
    with open('dataset/' + dataset + '.data', 'r') as csvfile:
        data += [[list(map(float, point)) for point in csv.reader(csvfile, delimiter='\t')]]
#print(data)
for i in range(len(data)):
    #points = np.array([[data[i][0][j][0], data[i][0][j][1]] for j in range(len(data[i][0]))])
    points = np.array(data[i])
    # Silhouette scores lie in [-1, 1]; starting from -1 (rather than 0)
    # guarantees best_labels is always assigned on the first iteration.
    best_coef = -1
    best_labels = []
    for n in range(2, 16):
        labels = KMeans(n_clusters=n).fit_predict(points)
        #labels = AgglomerativeClustering(n_clusters=n, linkage='single').fit_predict(points)
        coef = metrics.silhouette_score(points, labels)
        #coef = metrics.calinski_harabasz_score(points, labels)
        if coef > best_coef:
            best_coef = coef
            best_labels = labels
    #best_labels = DBSCAN(eps=eps_list[i], min_samples=min_samples_list[i]).fit_predict(points)
    #best_labels = hdbscan.HDBSCAN(min_samples=min_samples_list[i]).fit_predict(points)
    n_clu_best = max(best_labels) + 1
    fig = plt.figure()
    if len(points[0]) == 2:
        ax = fig.add_subplot(111)
        ax.scatter(points[:, 0], points[:, 1], s=1, c=best_labels)
    elif len(points[0]) == 3:
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(points[:, 0], points[:, 1], points[:, 2], s=1, c=best_labels)
    plt.title(datasets[i] + " (" + str(n_clu_best) + " clusters)")
    plt.show()
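
The eps and min_samples values in eps_list and min_samples_list appear to be hand-tuned per dataset, since DBSCAN's eps is sensitive to the scale of each point set. As a minimal sketch of how such values could be found, reusing the DBSCAN, metrics, and np imports already in the script, one might grid-search the parameters and score each labelling with the silhouette coefficient on the non-noise points. The tune_dbscan helper and the grids below are hypothetical illustrations, not part of the original script.

def tune_dbscan(points, eps_grid, min_samples_grid):
    # Hypothetical helper: search DBSCAN parameters, scoring each labelling
    # by the silhouette coefficient computed on non-noise points only.
    best_params, best_score = None, -1.0
    for eps in eps_grid:
        for min_samples in min_samples_grid:
            labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(points)
            mask = labels != -1  # DBSCAN marks noise points with label -1
            # Require at least two clusters and that most points are kept,
            # otherwise the silhouette score is undefined or meaningless.
            if len(set(labels[mask])) < 2 or mask.sum() < len(points) // 2:
                continue
            score = metrics.silhouette_score(points[mask], labels[mask])
            if score > best_score:
                best_params, best_score = (eps, min_samples), score
    return best_params, best_score

# Example usage (grids are arbitrary):
# tune_dbscan(points, np.linspace(0.05, 2.0, 40), [3, 5])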