123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231 |
- from numpy.lib.function_base import digitize
- from scipy.io import arff
- from sklearn import cluster
- import numpy as np
- import matplotlib.pyplot as plt
- import time
- import math
- from sklearn import metrics
- import numpy as np
- from hdbscan import HDBSCAN
- from operator import itemgetter
-
def write(f, title, label, time, timeTotal, score, nb_clusters, method, bruit="No value"):
    """Write one clustering report entry to an open text stream.

    All values are concatenated to string literals, so each one must already
    be a ``str``. The entry is terminated by an empty line.

    Args:
        f: Object exposing a ``write`` method (e.g. an open text file).
        title: Heading line of the entry.
        label: Formatted cluster labels.
        time: Formatted duration of the reported fit.
        timeTotal: Formatted duration of the whole search.
        score: Formatted score of the reported fit.
        nb_clusters: Formatted number of clusters.
        method: Name of the metric used for scoring.
        bruit: Formatted noise ratio ("bruit" is French for "noise").
    """
    f.write(title + '\n')
    # Fields are emitted in this exact order, one "<name> = <value>" per line.
    fields = (
        ("Label", label),
        ("Time", time),
        ("Total time", timeTotal),
        ("Score", score),
        ("Number of clusters", nb_clusters),
        ("Bruit", bruit),
        ("Metric", method),
    )
    for field_name, field_value in fields:
        f.write(field_name + " = " + field_value + "\n")
    f.write("\n")
-
- ############################################
- ############## HDBSCAN #############
- ############################################
-
-
def calcul_hdbscan(data_tab, nb_min, nb_max, file, dim=2, method="silhouette"):
    """Search HDBSCAN's ``min_cluster_size`` and report the best run.

    One model is fitted for every value in ``range(nb_min, nb_max)``. Each
    labelling is graded with ``score``, the winner is selected with
    ``bestScore`` and its report is written to ``file`` through ``write``.

    Args:
        data_tab: Samples to cluster.
        nb_min: First ``min_cluster_size`` tried (inclusive).
        nb_max: End of the tried range (exclusive).
        file: Open text stream passed to ``write``.
        dim: Unused; kept so existing callers keep working.
        method: Metric name understood by ``score`` and ``bestScore``.

    Raises:
        ValueError: If ``range(nb_min, nb_max)`` is empty.
    """
    grades = []
    times = []
    labelings = []  # labels of every fit, so the winner is not fitted twice
    big_start = time.time()
    for nb in range(nb_min, nb_max):
        start = time.time()
        db = HDBSCAN(min_cluster_size=nb).fit(data_tab)
        end = time.time()
        labels = db.labels_
        labelings.append(labels)
        grades.append(score(method, labels, data_tab))
        times.append(end - start)
    big_end = time.time()

    if not grades:
        raise ValueError(
            "empty search range: nb_min must be smaller than nb_max"
        )

    max_value = bestScore(method, grades)
    max_index = grades.index(max_value)
    # Reuse the labels of the graded fit instead of refitting the same model.
    clusters = labelings[max_index].tolist()
    title = "HDBSCAN Clustering, min_cluster_size=" + str(max_index + nb_min)
    if clusters:
        # Label -1 marks noise points.
        bruit = 100 * (clusters.count(-1) / len(clusters))
        nb_clusters = max(clusters) + 1
    else:
        bruit = "Problème, liste vide"
        nb_clusters = 0  # max() would raise on an empty labelling
    write(
        file,
        title,
        str(clusters),
        str(times[max_index]),
        str(big_end - big_start),
        str(max_value),
        str(nb_clusters),
        method,
        str(bruit) + "%",
    )
    print("FIN HDBDSCAN")
-
- ############################################
- ############## DBSCAN #############
- ############################################
-
-
def calcul_dbscan(data_tab, dist_min, dist_max, dist_step, nb_min, nb_max, file, dim=2, method="silhouette"):
    """Grid-search DBSCAN's ``eps`` and ``min_samples`` and report the best run.

    ``eps`` takes the values of ``np.arange(dist_min, dist_max, dist_step)``
    and ``min_samples`` those of ``range(nb_min, nb_max)``. Each labelling is
    graded with ``score`` and stored as ``(eps, min_samples, grade)``; the
    winner is selected with ``bestScore`` and written to ``file``.

    Args:
        data_tab: Samples to cluster.
        dist_min: First ``eps`` tried (inclusive).
        dist_max: End of the ``eps`` range (exclusive).
        dist_step: Step between two ``eps`` values.
        nb_min: First ``min_samples`` tried (inclusive).
        nb_max: End of the ``min_samples`` range (exclusive).
        file: Open text stream passed to ``write``.
        dim: Unused; kept so existing callers keep working.
        method: Metric name understood by ``score`` and ``bestScore``.

    Raises:
        ValueError: If the parameter grid is empty.
    """
    grades = []
    times = []
    labelings = []  # labels of every fit, so the winner is not fitted twice
    big_start = time.time()
    for dist in np.arange(dist_min, dist_max, dist_step):
        for nb in range(nb_min, nb_max):
            start = time.time()
            db = cluster.DBSCAN(eps=dist, min_samples=nb).fit(data_tab)
            end = time.time()
            labels = db.labels_
            labelings.append(labels)
            grades.append((dist, nb, score(method, labels, data_tab)))
            times.append(end - start)
    big_end = time.time()

    if not grades:
        raise ValueError(
            "empty search grid: check dist_min/dist_max/dist_step "
            "and nb_min/nb_max"
        )

    max_tuple = bestScore(method, grades, True)
    max_index = grades.index(max_tuple)
    # Reuse the labels of the graded fit instead of refitting the same model.
    clusters = labelings[max_index].tolist()
    if clusters:
        # Label -1 marks noise points.
        bruit = 100 * (clusters.count(-1) / len(clusters))
        nb_clusters = max(clusters) + 1
    else:
        bruit = "Problème, liste vide"
        nb_clusters = 0  # max() would raise on an empty labelling
    title = (
        "DBSCAN Clustering, eps=" + str(max_tuple[0])
        + ', min_samples=' + str(max_tuple[1])
        + ', clusters=' + str(nb_clusters)
    )
    # The whole (eps, min_samples, grade) tuple is reported as the score.
    write(
        file,
        title,
        str(clusters),
        str(times[max_index]),
        str(big_end - big_start),
        str(max_tuple),
        str(nb_clusters),
        method,
        str(bruit) + "%",
    )
    print("FIN DBSCAN")
-
- ############################################
- ############## AGGLO #############
- ############################################
-
def calcul_agglo(data_tab, linkage, deb, fin, file, method="silhouette", dim=2):
    """Search the cluster count of agglomerative clustering and report the best.

    One model is fitted for every ``k`` in ``range(deb, fin)``. Each labelling
    is graded with ``score``, the winner is selected with ``bestScore`` and
    its report is written to ``file`` through ``write``.

    Args:
        data_tab: Samples to cluster.
        linkage: Linkage criterion forwarded to ``AgglomerativeClustering``.
        deb: First number of clusters tried (inclusive).
        fin: End of the tried range (exclusive).
        file: Open text stream passed to ``write``.
        method: Metric name understood by ``score`` and ``bestScore``.
        dim: Unused; kept so existing callers keep working.

    Raises:
        ValueError: If ``range(deb, fin)`` is empty.
    """
    grades = []
    times = []
    labelings = []  # labels of every fit, so the winner is not fitted twice
    big_start = time.time()
    for k in range(deb, fin):
        start = time.time()
        model = cluster.AgglomerativeClustering(n_clusters=k, linkage=linkage).fit(data_tab)
        end = time.time()
        labels = model.labels_
        labelings.append(labels)
        grades.append(score(method, labels, data_tab))
        times.append(end - start)
    big_end = time.time()

    if not grades:
        raise ValueError("empty search range: deb must be smaller than fin")

    max_value = bestScore(method, grades)
    max_index = grades.index(max_value)
    # Reuse the labels of the graded fit instead of refitting the same model.
    clusters = labelings[max_index].tolist()
    title = "Agglomerative Clustering, k=" + str(max_index + deb) + ", " + method + ", " + linkage
    write(
        file,
        title,
        str(clusters),
        str(times[max_index]),
        str(big_end - big_start),
        str(max_value),
        str(max(clusters) + 1),
        method,
    )
    print("FIN AGGLO")
-
- ############################################
- ############## KMEANS #############
- ############################################
-
def calcul_kmeans(data_tab, deb, fin, file, method="silhouette", dim=2):
    """Search the cluster count of k-means and report the best run.

    One model is fitted for every ``k`` in ``range(deb, fin)``. Each labelling
    is graded with ``score``, the winner is selected with ``bestScore`` and
    its report is written to ``file`` through ``write``.

    The labels written are those of the fit that was actually graded and
    timed: KMeans is created without a fixed ``random_state``, so fitting the
    winning ``k`` a second time could yield a labelling that no longer
    matches the reported score.

    Args:
        data_tab: Samples to cluster.
        deb: First number of clusters tried (inclusive).
        fin: End of the tried range (exclusive).
        file: Open text stream passed to ``write``.
        method: Metric name understood by ``score`` and ``bestScore``.
        dim: Unused; kept for signature compatibility.

    Raises:
        ValueError: If ``range(deb, fin)`` is empty.
    """
    times = []
    grades = []
    labelings = []  # labels of every fit, so report and score stay consistent
    big_start = time.time()
    for k in range(deb, fin):
        start = time.time()
        kmeans = cluster.KMeans(n_clusters=k).fit(data_tab)
        end = time.time()
        labels = kmeans.labels_
        labelings.append(labels)
        grades.append(score(method, labels, data_tab))
        times.append(end - start)
    big_end = time.time()

    if not grades:
        raise ValueError("empty search range: deb must be smaller than fin")

    # The best grade only needs to be computed once, after the search.
    max_value = bestScore(method, grades)
    max_index = grades.index(max_value)
    clusters = labelings[max_index].tolist()
    title = "K-Means Clustering, k=" + str(max_index + deb) + ", " + method
    write(
        file,
        title,
        str(clusters),
        str(times[max_index]),
        str(big_end - big_start),
        str(max_value),
        str(max(clusters) + 1),
        method,
    )
    print("FIN KMEANS")
-
def score(metric, labels, data_tab):
    """Grade a labelling of ``data_tab`` with the requested metric.

    Supported names are ``"silhouette"`` (Euclidean silhouette score),
    ``"bouldin"`` (Davies-Bouldin score) and ``"calinski"``
    (Calinski-Harabasz score).

    Args:
        metric: Name of the metric.
        labels: Cluster label of every sample.
        data_tab: The clustered samples.

    Returns:
        The metric value. If the metric call raises ``ValueError``
        (presumably a labelling with too few clusters - verify against
        scikit-learn), the worst possible grade is returned instead:
        ``math.inf`` for ``"bouldin"``, ``-math.inf`` for the other two.
        An unknown name prints a message and returns ``-1``.
    """
    if metric not in ("silhouette", "bouldin", "calinski"):
        print("Methode pas reconnue")
        return -1

    try:
        if metric == "silhouette":
            return metrics.silhouette_score(data_tab, labels, metric='euclidean')
        if metric == "bouldin":
            return metrics.davies_bouldin_score(data_tab, labels)
        return metrics.calinski_harabasz_score(data_tab, labels)
    except ValueError:
        # "bouldin" is the only metric here whose failure grade is +inf.
        return math.inf if metric == "bouldin" else -math.inf
-
def bestScore(metric, scores, tuple=False):
    """Return the best entry of ``scores`` for the given metric.

    ``"silhouette"`` and ``"calinski"`` select the largest value,
    ``"bouldin"`` the smallest.

    Args:
        metric: Name of the metric the scores were computed with.
        scores: Plain grades, or sequences whose item at index 2 is the grade
            when ``tuple`` is true.
        tuple: Whether entries are compared on their item at index 2.

    Returns:
        The winning entry (the whole sequence when ``tuple`` is true).
        An unknown metric name prints a message and returns ``-1``.
    """
    if metric == "bouldin":
        pick = min
    elif metric in ("silhouette", "calinski"):
        pick = max
    else:
        print("Methode pas reconnue")
        return -1

    if tuple:
        return pick(scores, key=itemgetter(2))
    return pick(scores)
-
-
def process(path, name, dim, d="\t"):
    """Run every clustering benchmark on one dataset and write its report.

    Loads ``path + name + ".data"`` with ``np.loadtxt``, keeps the first two
    or three columns of every row and (re)creates ``name + ".txt"``. The
    report starts with one ``f<i>=[...]`` line per kept column, followed by
    the entries written by the four ``calcul_*`` searches for each metric.

    Args:
        path: Prefix (directory, with trailing separator) of the data file.
        name: Dataset name, used for both the input and the report file.
        dim: Number of columns to keep, 2 or 3.
        d: Column delimiter of the data file.

    Raises:
        ValueError: If ``dim`` is neither 2 nor 3. Raised before any file is
            read or truncated.
    """
    if dim == 3:
        nb_columns = 3
    elif dim == 2:
        nb_columns = 2
    else:
        raise ValueError("dim must be 2 or 3, got " + repr(dim))

    databrut = np.loadtxt(path + name + ".data", delimiter=d)
    data = [[row[i] for i in range(nb_columns)] for row in databrut]
    methods = ["silhouette", "bouldin", "calinski"]

    # "w" truncates any previous report; the with-block closes the file even
    # when one of the clustering steps raises.
    with open(name + ".txt", "w") as report:
        for axis in range(nb_columns):
            column = [point[axis] for point in data]
            report.write("f" + str(axis) + "=" + str(column) + "\n")

        for meth in methods:
            # A.DATA SILHOUETTE , H.DATA SILHOUETTE
            calcul_hdbscan(data, 2, 10, dim=dim, file=report, method=meth)
            calcul_dbscan(data, dist_min=1, dist_max=6, dist_step=1, nb_min=5, nb_max=15, dim=dim, file=report, method=meth)
            calcul_agglo(data, 'complete', 2, 3, dim=dim, file=report, method=meth)
            calcul_kmeans(data, 2, 20, report, method=meth)

            # Parameter presets for the other datasets, kept for reference;
            # swap one in for the calls above when processing that dataset.

            # T.DATA , Agglo Average SILHOUETTE
            #calcul_hdbscan(data,2,10,dim=dim,file=report,method=meth)
            #calcul_dbscan(data,dist_min=0.2,dist_max=1.6,dist_step=0.2,nb_min=5,nb_max=30,dim=dim,file=report,method=meth)
            #calcul_agglo(data,'average',2,20,dim=dim, file=report,method=meth)
            #calcul_kmeans(data,2,20,report,method=meth)

            # ZGN.DATA
            #calcul_hdbscan(data,3,4,dim=dim,file=report,method=meth)
            #calcul_dbscan(data,dist_min=0.1,dist_max=0.5,dist_step=0.05,nb_min=15,nb_max=30,dim=dim,file=report,method=meth)
            #calcul_agglo(data,'average',2,20,dim=dim, file=report,method=meth)
            #calcul_kmeans(data,2,20,report,method=meth)

            # TR.DATA
            #calcul_hdbscan(data,2,6,dim=dim,file=report,method=meth)
            #calcul_dbscan(data,dist_min=0.1,dist_max=0.5,dist_step=0.05,nb_min=2,nb_max=10,dim=dim,file=report,method=meth)
            #calcul_agglo(data,'single',2,20,dim=dim, file=report,method=meth)
            #calcul_kmeans(data,2,20,report,method=meth)

            # ZGO.DATA
            #calcul_hdbscan(data,3,15,dim=dim,file=report,method=meth)
            #calcul_dbscan(data,dist_min=0.1,dist_max=0.5,dist_step=0.05,nb_min=2,nb_max=10,dim=dim,file=report,method=meth)
            #calcul_agglo(data,'single',2,20,dim=dim, file=report,method=meth)
            #calcul_kmeans(data,2,20,report,method=meth)
-
# Dataset processed when the script runs. The commented calls below select
# the other datasets; the last three use a space as column delimiter.
process(path='./custom/', name='a', dim=3)
#process('./custom/','h',3)
#process('./custom/','t',3)
#process('./custom/','tr',2, d=" ")
#process('./custom/','zgn',2, d=" ")
#process('./custom/','zgo',2, d=" ")
|