
synthese.py

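# Compare HDBSCAN, DBSCAN, agglomerative clustering and k-means on small 2D/3D
# datasets: each calcul_* function sweeps one hyper-parameter range, scores every
# run with a cluster-validity metric (silhouette, Davies-Bouldin or
# Calinski-Harabasz) and appends the best configuration to a per-dataset report.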
from sklearn import cluster
from sklearn import metrics
from hdbscan import HDBSCAN
from operator import itemgetter
import numpy as np
import time
import math

def write(f, title, label, elapsed, total_elapsed, score_value, nb_clusters, method, noise="No value"):
    # Append one result block (the best run of a clustering method) to the report file.
    f.write(title + "\n")
    f.write("Label = " + label + "\n")
    f.write("Time = " + elapsed + "\n")
    f.write("Total time = " + total_elapsed + "\n")
    f.write("Score = " + score_value + "\n")
    f.write("Number of clusters = " + nb_clusters + "\n")
    f.write("Noise = " + noise + "\n")
    f.write("Metric = " + method + "\n")
    f.write("\n")

############################################
##############    HDBSCAN     ##############
############################################
def calcul_hdbscan(data_tab, nb_min, nb_max, file, dim=2, method="silhouette"):
    # Sweep min_cluster_size over [nb_min, nb_max) and keep the best run
    # according to the chosen validity metric.
    grades = []
    times = []
    big_start = time.time()
    for nb in range(nb_min, nb_max):
        start = time.time()
        db = HDBSCAN(min_cluster_size=nb).fit(data_tab)
        end = time.time()
        labels = db.labels_
        grades.append(score(method, labels, data_tab))
        times.append(end - start)
    big_end = time.time()
    max_value = bestScore(method, grades)
    max_index = grades.index(max_value)
    clusters = HDBSCAN(min_cluster_size=max_index + nb_min).fit_predict(data_tab)
    title = "HDBSCAN Clustering, min_cluster_size=" + str(max_index + nb_min)
    if len(clusters) != 0:
        # Proportion of points labelled -1 (noise), in percent.
        noise = 100 * (clusters.tolist().count(-1) / len(clusters))
    else:
        noise = "Error: empty label list"
    write(file, title, str(clusters.tolist()), str(times[max_index]), str(big_end - big_start),
          str(max_value), str(max(clusters) + 1), method, str(noise) + "%")
    print("HDBSCAN done")

############################################
##############     DBSCAN     ##############
############################################
def calcul_dbscan(data_tab, dist_min, dist_max, dist_step, nb_min, nb_max, file, dim=2, method="silhouette"):
    # Grid-search eps over [dist_min, dist_max) and min_samples over [nb_min, nb_max),
    # then keep the (eps, min_samples) pair with the best validity score.
    grades = []
    times = []
    big_start = time.time()
    for dist in np.arange(dist_min, dist_max, dist_step):
        for nb in range(nb_min, nb_max):
            start = time.time()
            db = cluster.DBSCAN(eps=dist, min_samples=nb).fit(data_tab)
            end = time.time()
            labels = db.labels_
            grades.append((dist, nb, score(method, labels, data_tab)))
            times.append(end - start)
    big_end = time.time()
    max_tuple = bestScore(method, grades, True)
    max_index = grades.index(max_tuple)
    clusters = cluster.DBSCAN(eps=max_tuple[0], min_samples=max_tuple[1]).fit_predict(data_tab)
    if len(clusters) != 0:
        noise = 100 * (clusters.tolist().count(-1) / len(clusters))
    else:
        noise = "Error: empty label list"
    title = "DBSCAN Clustering, eps=" + str(max_tuple[0]) + ", min_samples=" + str(max_tuple[1]) + ", clusters=" + str(max(clusters) + 1)
    write(file, title, str(clusters.tolist()), str(times[max_index]), str(big_end - big_start),
          str(max_tuple), str(max(clusters) + 1), method, str(noise) + "%")
    print("DBSCAN done")

############################################
##############     AGGLO      ##############
############################################
def calcul_agglo(data_tab, linkage, deb, fin, file, method="silhouette", dim=2):
    # Sweep the number of clusters k over [deb, fin) for agglomerative clustering
    # with the given linkage, and keep the best k.
    grades = []
    times = []
    big_start = time.time()
    for k in range(deb, fin):
        start = time.time()
        agglo = cluster.AgglomerativeClustering(n_clusters=k, linkage=linkage).fit(data_tab)
        end = time.time()
        labels = agglo.labels_
        grades.append(score(method, labels, data_tab))
        times.append(end - start)
    big_end = time.time()
    max_value = bestScore(method, grades)
    max_index = grades.index(max_value)
    agglo = cluster.AgglomerativeClustering(n_clusters=max_index + deb, linkage=linkage).fit_predict(data_tab)
    title = "Agglomerative Clustering, k=" + str(max_index + deb) + ", " + method + ", " + linkage
    write(file, title, str(agglo.tolist()), str(times[max_index]), str(big_end - big_start),
          str(max_value), str(max(agglo) + 1), method)
    print("AGGLO done")

############################################
##############     KMEANS     ##############
############################################
def calcul_kmeans(data_tab, deb, fin, file, method="silhouette", dim=2):
    # Sweep the number of clusters k over [deb, fin) for k-means and keep the best k.
    times = []
    grades = []
    big_start = time.time()
    for k in range(deb, fin):
        start = time.time()
        kmeans = cluster.KMeans(n_clusters=k).fit(data_tab)
        end = time.time()
        labels = kmeans.labels_
        grades.append(score(method, labels, data_tab))
        times.append(end - start)
    max_value = bestScore(method, grades)
    big_end = time.time()
    max_index = grades.index(max_value)
    clusters = cluster.KMeans(n_clusters=max_index + deb).fit_predict(data_tab)
    title = "K-Means Clustering, k=" + str(max_index + deb) + ", " + method
    write(file, title, str(clusters.tolist()), str(times[max_index]), str(big_end - big_start),
          str(max_value), str(max(clusters) + 1), method)
    print("KMEANS done")

def score(metric, labels, data_tab):
    # Compute the requested cluster-validity score; on failure (e.g. a single
    # cluster or all-noise labelling), return the worst possible value for that metric.
    if metric == "silhouette":
        try:
            score = metrics.silhouette_score(data_tab, labels, metric='euclidean')
        except ValueError:
            score = -math.inf
    elif metric == "bouldin":
        try:
            score = metrics.davies_bouldin_score(data_tab, labels)
        except ValueError:
            score = math.inf
    elif metric == "calinski":
        try:
            score = metrics.calinski_harabasz_score(data_tab, labels)
        except ValueError:
            score = -math.inf
    else:
        print("Unknown metric")
        return -1
    return score

def bestScore(metric, scores, as_tuple=False):
    # Return the best entry of `scores` for the given metric: highest for silhouette
    # and Calinski-Harabasz, lowest for Davies-Bouldin. When as_tuple is True, each
    # entry is a (param1, param2, score) tuple and the score sits at index 2.
    if metric in ("silhouette", "calinski"):
        if as_tuple:
            value = max(scores, key=itemgetter(2))
        else:
            value = max(scores)
    elif metric == "bouldin":
        if as_tuple:
            value = min(scores, key=itemgetter(2))
        else:
            value = min(scores)
    else:
        print("Unknown metric")
        return -1
    return value

def process(path, name, dim, d="\t"):
    # Load a delimiter-separated data file, dump its coordinates to <name>.txt,
    # then run every clustering method with every validity metric.
    databrut = np.loadtxt(path + name + ".data", delimiter=d)
    f = open(name + ".txt", "w")  # overwrite any previous report
    if dim == 3:
        data = [[x[0], x[1], x[2]] for x in databrut]
        f0 = [p[0] for p in data]
        f1 = [p[1] for p in data]
        f2 = [p[2] for p in data]
        f.write("f0=" + str(f0) + "\n")
        f.write("f1=" + str(f1) + "\n")
        f.write("f2=" + str(f2) + "\n")
    elif dim == 2:
        data = [[x[0], x[1]] for x in databrut]
        f0 = [p[0] for p in data]
        f1 = [p[1] for p in data]
        f.write("f0=" + str(f0) + "\n")
        f.write("f1=" + str(f1) + "\n")
    else:
        raise ValueError("dim must be 2 or 3")
    methods = ["silhouette", "bouldin", "calinski"]
    for meth in methods:
        # A.DATA / H.DATA parameter ranges
        calcul_hdbscan(data, 2, 10, dim=dim, file=f, method=meth)
        calcul_dbscan(data, dist_min=1, dist_max=6, dist_step=1, nb_min=5, nb_max=15, dim=dim, file=f, method=meth)
        calcul_agglo(data, 'complete', 2, 3, dim=dim, file=f, method=meth)
        calcul_kmeans(data, 2, 20, f, method=meth)
        # T.DATA parameter ranges (agglomerative with average linkage)
        #calcul_hdbscan(data, 2, 10, dim=dim, file=f, method=meth)
        #calcul_dbscan(data, dist_min=0.2, dist_max=1.6, dist_step=0.2, nb_min=5, nb_max=30, dim=dim, file=f, method=meth)
        #calcul_agglo(data, 'average', 2, 20, dim=dim, file=f, method=meth)
        #calcul_kmeans(data, 2, 20, f, method=meth)
        # ZGN.DATA parameter ranges
        #calcul_hdbscan(data, 3, 4, dim=dim, file=f, method=meth)
        #calcul_dbscan(data, dist_min=0.1, dist_max=0.5, dist_step=0.05, nb_min=15, nb_max=30, dim=dim, file=f, method=meth)
        #calcul_agglo(data, 'average', 2, 20, dim=dim, file=f, method=meth)
        #calcul_kmeans(data, 2, 20, f, method=meth)
        # TR.DATA parameter ranges
        #calcul_hdbscan(data, 2, 6, dim=dim, file=f, method=meth)
        #calcul_dbscan(data, dist_min=0.1, dist_max=0.5, dist_step=0.05, nb_min=2, nb_max=10, dim=dim, file=f, method=meth)
        #calcul_agglo(data, 'single', 2, 20, dim=dim, file=f, method=meth)
        #calcul_kmeans(data, 2, 20, f, method=meth)
        # ZGO.DATA parameter ranges
        #calcul_hdbscan(data, 3, 15, dim=dim, file=f, method=meth)
        #calcul_dbscan(data, dist_min=0.1, dist_max=0.5, dist_step=0.05, nb_min=2, nb_max=10, dim=dim, file=f, method=meth)
        #calcul_agglo(data, 'single', 2, 20, dim=dim, file=f, method=meth)
        #calcul_kmeans(data, 2, 20, f, method=meth)
    f.close()

process('./custom/', 'a', 3)
#process('./custom/', 'h', 3)
#process('./custom/', 't', 3)
#process('./custom/', 'tr', 2, d=" ")
#process('./custom/', 'zgn', 2, d=" ")
#process('./custom/', 'zgo', 2, d=" ")
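
# --- Optional self-test on synthetic data (not part of the original runs) ---
# Minimal sketch, assuming scikit-learn's make_blobs is available and that the
# ./custom/ directory exists; the file name "blobs" is purely illustrative. It
# writes a tab-separated 2-column file in the format np.loadtxt expects above,
# then hands it to process() with the default delimiter.
#from sklearn.datasets import make_blobs
#pts, _ = make_blobs(n_samples=300, centers=3, random_state=0)    # toy 2D dataset
#np.savetxt('./custom/blobs.data', pts, delimiter="\t")           # same layout as the .data files
#process('./custom/', 'blobs', 2)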