# Clustering benchmark: sweeps hyper-parameters for HDBSCAN, DBSCAN,
# agglomerative clustering and k-means on 2D/3D point files, scores each run
# with a chosen internal metric, and writes the best run of each method to a
# per-dataset report file.

# NOTE(review): `digitize`, `arff` and `plt` are imported but never used in
# this file; kept untouched in case other chunks / interactive use rely on them.
from numpy.lib.function_base import digitize
from scipy.io import arff
from sklearn import cluster
import numpy as np
import matplotlib.pyplot as plt
import time
import math
from sklearn import metrics
from hdbscan import HDBSCAN
from operator import itemgetter


def write(f, title, label, time, timeTotal, score, nb_clusters, method, bruit="No value"):
    """Append one clustering-result report to the already-open file `f`.

    All values are expected pre-converted to strings. `bruit` (French for
    "noise") is the noise percentage; it defaults to "No value" for
    algorithms that have no noise concept (k-means, agglomerative).

    NOTE(review): the parameters `time` and `score` shadow the module /
    sibling function of the same name — harmless here (they are plain
    strings inside this scope) and kept for caller compatibility.
    """
    f.write(title + '\n')
    f.write("Label = " + label + "\n")
    f.write("Time = " + time + "\n")
    f.write("Total time = " + timeTotal + "\n")
    f.write("Score = " + score + "\n")
    f.write("Number of clusters = " + nb_clusters + "\n")
    f.write("Bruit = " + bruit + "\n")
    f.write("Metric = " + method + "\n")
    f.write("\n")


############################################
##############    HDBSCAN      #############
############################################
def calcul_hdbscan(data_tab, nb_min, nb_max, file, dim=2, method="silhouette"):
    """Sweep HDBSCAN over min_cluster_size in [nb_min, nb_max).

    The best run according to `method` (see score()/bestScore()) is refit
    and its labels, timing, score and noise percentage are written to `file`.
    `dim` is unused here; kept for signature symmetry with the callers.
    """
    grades = []
    times = []
    big_start = time.time()
    for nb in range(nb_min, nb_max):
        start = time.time()
        db = HDBSCAN(min_cluster_size=nb).fit(data_tab)
        end = time.time()
        labels = db.labels_
        grades.append(score(method, labels, data_tab))
        times.append(end - start)
    big_end = time.time()
    max_value = bestScore(method, grades)
    max_index = grades.index(max_value)
    # Refit with the winning parameter to obtain the final labelling.
    clusters = HDBSCAN(min_cluster_size=max_index + nb_min).fit_predict(data_tab)
    title = "HDBSCAN Clustering, min_cluster_size=" + str(max_index + nb_min)
    if len(clusters) != 0:
        # Points labelled -1 are noise.
        bruit = 100 * (clusters.tolist().count(-1) / len(clusters))
    else:
        bruit = "Problème, liste vide"
    write(file, title, str(clusters.tolist()),
          str(times[max_index]), str(big_end - big_start), str(max_value),
          str(max(clusters) + 1), method, str(bruit) + "%")
    # Fixed typo in the original message ("FIN HDBDSCAN").
    print("FIN HDBSCAN")


############################################
##############     DBSCAN      #############
############################################
def calcul_dbscan(data_tab, dist_min, dist_max, dist_step, nb_min, nb_max, file, dim=2, method="silhouette"):
    """Grid-search DBSCAN over eps in [dist_min, dist_max) step dist_step
    and min_samples in [nb_min, nb_max).

    Each grade is stored as the tuple (eps, min_samples, score) so the best
    parameter pair can be recovered; the winning pair is refit and reported
    to `file`. `dim` is unused; kept for signature symmetry.
    """
    grades = []
    times = []
    big_start = time.time()
    for dist in np.arange(dist_min, dist_max, dist_step):
        for nb in range(nb_min, nb_max):
            start = time.time()
            db = cluster.DBSCAN(eps=dist, min_samples=nb).fit(data_tab)
            end = time.time()
            labels = db.labels_
            grades.append((dist, nb, score(method, labels, data_tab)))
            times.append(end - start)
    big_end = time.time()
    # bestScore compares on element index 2 (the score) when tuple=True.
    max_tuple = bestScore(method, grades, True)
    max_index = grades.index(max_tuple)
    clusters = cluster.DBSCAN(eps=max_tuple[0], min_samples=max_tuple[1]).fit_predict(data_tab)
    if len(clusters) != 0:
        # Points labelled -1 are noise.
        bruit = 100 * (clusters.tolist().count(-1) / len(clusters))
    else:
        bruit = "Problème, liste vide"
    title = ("DBSCAN Clustering, eps=" + str(max_tuple[0])
             + ', min_samples=' + str(max_tuple[1])
             + ', clusters=' + str(max(clusters) + 1))
    write(file, title, str(clusters.tolist()),
          str(times[max_index]), str(big_end - big_start), str(max_tuple),
          str(max(clusters) + 1), method, str(bruit) + "%")
    print("FIN DBSCAN")


############################################
##############      AGGLO      #############
############################################
def calcul_agglo(data_tab, linkage, deb, fin, file, method="silhouette", dim=2):
    """Sweep agglomerative clustering over n_clusters in [deb, fin) with the
    given `linkage`, refit the best k and report it to `file`.

    `dim` is unused; kept for signature symmetry with the callers.
    """
    grades = []
    times = []
    big_start = time.time()
    for k in range(deb, fin):
        start = time.time()
        agglo = cluster.AgglomerativeClustering(n_clusters=k, linkage=linkage).fit(data_tab)
        end = time.time()
        labels = agglo.labels_
        grades.append(score(method, labels, data_tab))
        times.append(end - start)
    big_end = time.time()
    max_value = bestScore(method, grades)
    max_index = grades.index(max_value)
    # Refit with the winning k to obtain the final labelling.
    agglo = cluster.AgglomerativeClustering(n_clusters=max_index + deb, linkage=linkage).fit_predict(data_tab)
    title = ("Agglomerative Clustering, k=" + str(max_index + deb)
             + ", " + method + ", " + linkage)
    write(file, title, str(agglo.tolist()),
          str(times[max_index]), str(big_end - big_start), str(max_value),
          str(max(agglo) + 1), method)
    print("FIN AGGLO")


############################################
##############     KMEANS      #############
############################################
def calcul_kmeans(data_tab, deb, fin, file, method="silhouette", dim=2):
    """Sweep k-means over n_clusters in [deb, fin), refit the best k and
    report it to `file`.

    `dim` is unused; kept for signature symmetry with the callers.
    """
    times = []
    grades = []
    big_start = time.time()
    for k in range(deb, fin):
        start = time.time()
        kmeans = cluster.KMeans(n_clusters=k).fit(data_tab)
        end = time.time()
        labels = kmeans.labels_
        grades.append(score(method, labels, data_tab))
        times.append(end - start)
    big_end = time.time()
    # Hoisted out of the loop: the original recomputed bestScore on every
    # iteration but only the value computed from the full grade list matters.
    max_value = bestScore(method, grades)
    max_index = grades.index(max_value)
    clusters = cluster.KMeans(n_clusters=max_index + deb).fit_predict(data_tab)
    title = "K-Means Clustering, k=" + str(max_index + deb) + ", " + method
    write(file, title, str(clusters.tolist()),
          str(times[max_index]), str(big_end - big_start), str(max_value),
          str(max(clusters) + 1), method)
    print("FIN KMEANS")


def score(metric, labels, data_tab):
    """Score a labelling of `data_tab` with the chosen internal metric.

    Supported metrics: "silhouette" and "calinski" (higher is better) and
    "bouldin" (lower is better). A ValueError from sklearn (e.g. a single
    cluster) is mapped to the worst possible value for that metric so the
    sweep can continue. Returns -1 for an unrecognized metric name.
    """
    if metric == "silhouette":
        try:
            score = metrics.silhouette_score(data_tab, labels, metric='euclidean')
        except ValueError:
            score = -math.inf
    elif metric == "bouldin":
        try:
            score = metrics.davies_bouldin_score(data_tab, labels)
        except ValueError:
            score = math.inf
    elif metric == "calinski":
        try:
            score = metrics.calinski_harabasz_score(data_tab, labels)
        except ValueError:
            score = -math.inf
    else:
        print("Methode pas reconnue")
        return -1
    return score


def bestScore(metric, scores, tuple=False):
    """Pick the best grade from `scores` for the given metric.

    Silhouette and Calinski-Harabasz are maximized; Davies-Bouldin is
    minimized. When `tuple` is True, `scores` holds (eps, min_samples,
    score) triples and the comparison key is element 2.

    NOTE(review): the parameter name `tuple` shadows the builtin; kept for
    caller compatibility (the builtin is not needed in this scope).
    """
    if metric == "silhouette" or metric == "calinski":
        if tuple:
            value = max(scores, key=itemgetter(2))
        else:
            value = max(scores)
    elif metric == "bouldin":
        if tuple:
            value = min(scores, key=itemgetter(2))
        else:
            value = min(scores)
    else:
        print("Methode pas reconnue")
        return -1
    return value


def process(path, name, dim, d="\t"):
    """Load `path + name + ".data"` (delimiter `d`), dump the coordinate
    columns and the results of every clustering method / metric combination
    to `name + ".txt"`.

    `dim` must be 2 or 3 (number of coordinate columns to keep).
    """
    databrut = np.loadtxt(path + name + ".data", delimiter=d)
    # "w" truncates on open, replacing the original clear-then-append dance;
    # the context manager guarantees the handle is closed even on error.
    with open(name + ".txt", "w") as f:
        if dim == 3:
            data = [[x[0], x[1], x[2]] for x in databrut]
            # Renamed the comprehension variable: the original reused `f`,
            # shadowing the open file handle.
            f0 = [p[0] for p in data]
            f1 = [p[1] for p in data]
            f2 = [p[2] for p in data]
            f.write("f0=" + str(f0) + "\n")
            f.write("f1=" + str(f1) + "\n")
            f.write("f2=" + str(f2) + "\n")
        elif dim == 2:
            data = [[x[0], x[1]] for x in databrut]
            f0 = [p[0] for p in data]
            f1 = [p[1] for p in data]
            f.write("f0=" + str(f0) + "\n")
            f.write("f1=" + str(f1) + "\n")
        else:
            # The original fell through to a NameError on `data` below.
            raise ValueError("dim must be 2 or 3, got " + str(dim))

        methods = ["silhouette", "bouldin", "calinski"]
        for meth in methods:
            # A.DATA SILHOUETTE , H.DATA SILHOUETTE
            calcul_hdbscan(data, 2, 10, dim=dim, file=f, method=meth)
            calcul_dbscan(data, dist_min=1, dist_max=6, dist_step=1,
                          nb_min=5, nb_max=15, dim=dim, file=f, method=meth)
            calcul_agglo(data, 'complete', 2, 3, dim=dim, file=f, method=meth)
            calcul_kmeans(data, 2, 20, f, method=meth)

            # Alternative parameter sets used for the other datasets:
            # T.DATA , Agglo Average SILHOUETTE
            # calcul_hdbscan(data, 2, 10, dim=dim, file=f, method=meth)
            # calcul_dbscan(data, dist_min=0.2, dist_max=1.6, dist_step=0.2, nb_min=5, nb_max=30, dim=dim, file=f, method=meth)
            # calcul_agglo(data, 'average', 2, 20, dim=dim, file=f, method=meth)
            # calcul_kmeans(data, 2, 20, f, method=meth)
            # ZGN.DATA
            # calcul_hdbscan(data, 3, 4, dim=dim, file=f, method=meth)
            # calcul_dbscan(data, dist_min=0.1, dist_max=0.5, dist_step=0.05, nb_min=15, nb_max=30, dim=dim, file=f, method=meth)
            # calcul_agglo(data, 'average', 2, 20, dim=dim, file=f, method=meth)
            # calcul_kmeans(data, 2, 20, f, method=meth)
            # TR.DATA
            # calcul_hdbscan(data, 2, 6, dim=dim, file=f, method=meth)
            # calcul_dbscan(data, dist_min=0.1, dist_max=0.5, dist_step=0.05, nb_min=2, nb_max=10, dim=dim, file=f, method=meth)
            # calcul_agglo(data, 'single', 2, 20, dim=dim, file=f, method=meth)
            # calcul_kmeans(data, 2, 20, f, method=meth)
            # ZGO.DATA
            # calcul_hdbscan(data, 3, 15, dim=dim, file=f, method=meth)
            # calcul_dbscan(data, dist_min=0.1, dist_max=0.5, dist_step=0.05, nb_min=2, nb_max=10, dim=dim, file=f, method=meth)
            # calcul_agglo(data, 'single', 2, 20, dim=dim, file=f, method=meth)
            # calcul_kmeans(data, 2, 20, f, method=meth)


if __name__ == "__main__":
    # Guarded so importing this module no longer runs the whole benchmark.
    process('./custom/', 'a', 3)
    # process('./custom/', 'h', 3)
    # process('./custom/', 't', 3)
    # process('./custom/', 'tr', 2, d=" ")
    # process('./custom/', 'zgn', 2, d=" ")
    # process('./custom/', 'zgo', 2, d=" ")