TP_Clustering/synthese.py
2021-11-08 18:34:41 +01:00

231 lines
No EOL
8.6 KiB
Python

from numpy.lib.function_base import digitize
from scipy.io import arff
from sklearn import cluster
import numpy as np
import matplotlib.pyplot as plt
import time
import math
from sklearn import metrics
import numpy as np
from hdbscan import HDBSCAN
from operator import itemgetter
def write(f, title, label, time, timeTotal, score, nb_clusters, method, bruit="No value"):
    """Append one clustering-result record to the open text file *f*.

    Each record is a title line, seven "Field = value" lines and a blank
    separator line.  All values are expected to already be strings.
    *bruit* (noise percentage) defaults to "No value" for algorithms that
    do not produce noise points.
    """
    f.write(title + "\n")
    for field, value in (
        ("Label", label),
        ("Time", time),
        ("Total time", timeTotal),
        ("Score", score),
        ("Number of clusters", nb_clusters),
        ("Bruit", bruit),
        ("Metric", method),
    ):
        f.write(field + " = " + value + "\n")
    f.write("\n")
############################################
############## HDBSCAN #############
############################################
def calcul_hdbscan(data_tab, nb_min, nb_max, file, dim=2, method="silhouette"):
    """Sweep HDBSCAN's min_cluster_size over [nb_min, nb_max) and report the best run.

    For each candidate size the clustering is scored with *method* (see
    `score`); the best score (direction depends on the metric, see
    `bestScore`) selects the final min_cluster_size, which is re-fit and
    written to *file* via `write`, including the noise percentage
    (points labelled -1 by HDBSCAN).  *dim* is accepted for signature
    consistency with the other calcul_* helpers but is not used here.
    """
    grades = []
    times = []
    big_start = time.time()
    for nb in range(nb_min, nb_max):
        start = time.time()
        db = HDBSCAN(min_cluster_size=nb).fit(data_tab)
        end = time.time()
        labels = db.labels_
        grades.append(score(method, labels, data_tab))
        times.append(end - start)
    big_end = time.time()
    max_value = bestScore(method, grades)
    # grades is parallel to range(nb_min, nb_max), so index+nb_min is the size.
    max_index = grades.index(max_value)
    clusters = HDBSCAN(min_cluster_size=max_index + nb_min).fit_predict(data_tab)
    title = "HDBSCAN Clustering, min_cluster_size=" + str(max_index + nb_min)
    if len(clusters) != 0:
        # Percentage of points HDBSCAN labelled as noise (-1).
        bruit = 100 * (clusters.tolist().count(-1) / len(clusters))
    else:
        bruit = "Problème, liste vide"
    write(file, title, str(clusters.tolist()), str(times[max_index]),
          str(big_end - big_start), str(max_value), str(max(clusters) + 1),
          method, str(bruit) + "%")
    # Fixed typo in the progress message ("HDBDSCAN" -> "HDBSCAN").
    print("FIN HDBSCAN")
############################################
############## DBSCAN #############
############################################
def calcul_dbscan(data_tab, dist_min, dist_max, dist_step, nb_min, nb_max, file, dim=2, method="silhouette"):
    """Grid-search DBSCAN over eps in [dist_min, dist_max) (step dist_step)
    and min_samples in [nb_min, nb_max), keep the best-scoring pair.

    Each candidate is scored with *method*; `bestScore` picks the winner
    (entries are (eps, min_samples, score) triples, compared on index 2).
    The winning pair is re-fit and its labels, timings, score and noise
    percentage are appended to *file*.  *dim* is unused, kept for
    signature consistency with the sibling calcul_* helpers.
    """
    grades = []
    times = []
    big_start = time.time()
    for eps_val in np.arange(dist_min, dist_max, dist_step):
        for min_pts in range(nb_min, nb_max):
            t0 = time.time()
            model = cluster.DBSCAN(eps=eps_val, min_samples=min_pts).fit(data_tab)
            t1 = time.time()
            grades.append((eps_val, min_pts, score(method, model.labels_, data_tab)))
            times.append(t1 - t0)
    big_end = time.time()
    best = bestScore(method, grades, True)
    best_index = grades.index(best)
    clusters = cluster.DBSCAN(eps=best[0], min_samples=best[1]).fit_predict(data_tab)
    if len(clusters) != 0:
        # Fraction of points DBSCAN labelled as noise (-1), as a percentage.
        bruit = 100 * (clusters.tolist().count(-1) / len(clusters))
    else:
        bruit = "Problème, liste vide"
    title = ("DBSCAN Clustering, eps=" + str(best[0])
             + ', min_samples=' + str(best[1])
             + ', clusters=' + str(max(clusters) + 1))
    write(file, title, str(clusters.tolist()), str(times[best_index]),
          str(big_end - big_start), str(best), str(max(clusters) + 1),
          method, str(bruit) + "%")
    print("FIN DBSCAN")
############################################
############## AGGLO #############
############################################
def calcul_agglo(data_tab, linkage, deb, fin, file, method="silhouette", dim=2):
    """Sweep AgglomerativeClustering over k in [deb, fin) for the given
    *linkage* and report the best run (per *method*) to *file*.

    *dim* is unused, kept for signature consistency with the sibling
    calcul_* helpers.
    """
    grades = []
    times = []
    big_start = time.time()
    for n_clust in range(deb, fin):
        t0 = time.time()
        model = cluster.AgglomerativeClustering(n_clusters=n_clust, linkage=linkage).fit(data_tab)
        t1 = time.time()
        grades.append(score(method, model.labels_, data_tab))
        times.append(t1 - t0)
    big_end = time.time()
    best = bestScore(method, grades)
    # grades is parallel to range(deb, fin): index + deb recovers k.
    best_index = grades.index(best)
    labels_out = cluster.AgglomerativeClustering(
        n_clusters=best_index + deb, linkage=linkage).fit_predict(data_tab)
    title = ("Agglomerative Clustering, k=" + str(best_index + deb)
             + ", " + method + ", " + linkage)
    write(file, title, str(labels_out.tolist()), str(times[best_index]),
          str(big_end - big_start), str(best), str(max(labels_out) + 1), method)
    print("FIN AGGLO")
############################################
############## KMEANS #############
############################################
def calcul_kmeans(data_tab, deb, fin, file, method="silhouette", dim=2):
    """Sweep K-Means over k in [deb, fin) and report the best run to *file*.

    Each k is fit once and scored with *method*; `bestScore` selects the
    winning score, whose index recovers the winning k, which is re-fit
    with fit_predict for the final labels.  *dim* is unused, kept for
    signature consistency with the sibling calcul_* helpers.
    """
    times = []
    grades = []
    big_start = time.time()
    for k in range(deb, fin):
        start = time.time()
        kmeans = cluster.KMeans(n_clusters=k).fit(data_tab)
        end = time.time()
        grades.append(score(method, kmeans.labels_, data_tab))
        times.append(end - start)
    big_end = time.time()
    # Hoisted out of the loop: the original recomputed bestScore on every
    # iteration although only the value from the last pass was ever used.
    max_value = bestScore(method, grades)
    max_index = grades.index(max_value)
    clusters = cluster.KMeans(n_clusters=max_index + deb).fit_predict(data_tab)
    title = "K-Means Clustering, k=" + str(max_index + deb) + ", " + method
    write(file, title, str(clusters.tolist()), str(times[max_index]),
          str(big_end - big_start), str(max_value), str(max(clusters) + 1), method)
    print("FIN KMEANS")
def score(metric, labels, data_tab):
    """Score a clustering of *data_tab* given its *labels*.

    Supported metrics: "silhouette" (higher is better, fallback -inf on
    ValueError), "bouldin" (Davies-Bouldin, lower is better, fallback
    +inf), "calinski" (Calinski-Harabasz, higher is better, fallback
    -inf).  The fallback sign matches the optimization direction used by
    `bestScore`, so a failed run can never be selected as best.
    Returns -1 for an unknown metric name.
    """
    dispatch = {
        "silhouette": (lambda: metrics.silhouette_score(data_tab, labels, metric='euclidean'),
                       -math.inf),
        "bouldin": (lambda: metrics.davies_bouldin_score(data_tab, labels),
                    math.inf),
        "calinski": (lambda: metrics.calinski_harabasz_score(data_tab, labels),
                     -math.inf),
    }
    if metric not in dispatch:
        print("Methode pas reconnue")
        return -1
    evaluate, fallback = dispatch[metric]
    try:
        return evaluate()
    except ValueError:
        # Typically raised when there is a single cluster / all-noise labelling.
        return fallback
def bestScore(metric, scores, tuple=False):
    """Select the best entry of *scores* for the given *metric*.

    "silhouette" and "calinski" are maximized; "bouldin" (Davies-Bouldin)
    is minimized.  When *tuple* is True, entries are (eps, min_samples,
    score) triples and comparison uses the score at index 2.  Returns -1
    for an unknown metric name.

    NOTE(review): the parameter name `tuple` shadows the builtin; kept
    for backward compatibility with keyword callers.
    """
    selectors = {"silhouette": max, "bouldin": min, "calinski": max}
    pick = selectors.get(metric)
    if pick is None:
        print("Methode pas reconnue")
        return -1
    if tuple:
        return pick(scores, key=itemgetter(2))
    return pick(scores)
def process(path, name, dim, d="\t"):
    """Load <path><name>.data (delimiter *d*), run every clustering method
    with every metric, and write all results to <name>.txt.

    The first *dim* columns of the data file are used (dim must be 2 or
    3).  The raw coordinate columns f0..f{dim-1} are dumped first, then
    each metric ("silhouette", "bouldin", "calinski") drives one pass of
    HDBSCAN, DBSCAN, agglomerative clustering and K-Means.

    Raises ValueError for an unsupported *dim* (the original code left
    `data` undefined and crashed later with a NameError).
    """
    databrut = np.loadtxt(path + name + ".data", delimiter=d)
    if dim not in (2, 3):
        raise ValueError("dim must be 2 or 3, got " + str(dim))
    # Keep only the first `dim` columns, as plain lists of rows.
    data = [list(x[:dim]) for x in databrut]
    # "w" truncates on open, replacing the original clear-then-append dance;
    # the context manager guarantees the handle is closed (it never was before).
    with open(name + ".txt", "w") as f:
        for i in range(dim):
            col = [row[i] for row in data]
            f.write("f" + str(i) + "=" + str(col) + "\n")
        methods = ["silhouette", "bouldin", "calinski"]
        for meth in methods:
            # A.DATA SILHOUETTE , H.DATA SILHOUETTE
            calcul_hdbscan(data, 2, 10, dim=dim, file=f, method=meth)
            calcul_dbscan(data, dist_min=1, dist_max=6, dist_step=1, nb_min=5, nb_max=15, dim=dim, file=f, method=meth)
            calcul_agglo(data, 'complete', 2, 3, dim=dim, file=f, method=meth)
            calcul_kmeans(data, 2, 20, f, method=meth)
            # Alternative parameter presets used for the other datasets:
            # T.DATA , Agglo Average SILHOUETTE
            #calcul_hdbscan(data,2,10,dim=dim,file=f,method=meth)
            #calcul_dbscan(data,dist_min=0.2,dist_max=1.6,dist_step=0.2,nb_min=5,nb_max=30,dim=dim,file=f,method=meth)
            #calcul_agglo(data,'average',2,20,dim=dim, file=f,method=meth)
            #calcul_kmeans(data,2,20,f,method=meth)
            # ZGN.DATA
            #calcul_hdbscan(data,3,4,dim=dim,file=f,method=meth)
            #calcul_dbscan(data,dist_min=0.1,dist_max=0.5,dist_step=0.05,nb_min=15,nb_max=30,dim=dim,file=f,method=meth)
            #calcul_agglo(data,'average',2,20,dim=dim, file=f,method=meth)
            #calcul_kmeans(data,2,20,f,method=meth)
            # TR.DATA
            #calcul_hdbscan(data,2,6,dim=dim,file=f,method=meth)
            #calcul_dbscan(data,dist_min=0.1,dist_max=0.5,dist_step=0.05,nb_min=2,nb_max=10,dim=dim,file=f,method=meth)
            #calcul_agglo(data,'single',2,20,dim=dim, file=f,method=meth)
            #calcul_kmeans(data,2,20,f,method=meth)
            # ZGO.DATA
            #calcul_hdbscan(data,3,15,dim=dim,file=f,method=meth)
            #calcul_dbscan(data,dist_min=0.1,dist_max=0.5,dist_step=0.05,nb_min=2,nb_max=10,dim=dim,file=f,method=meth)
            #calcul_agglo(data,'single',2,20,dim=dim, file=f,method=meth)
            #calcul_kmeans(data,2,20,f,method=meth)
# Script entry point: guard so importing this module no longer triggers a
# full clustering run as a side effect.
if __name__ == "__main__":
    process('./custom/', 'a', 3)
    #process('./custom/','h',3)
    #process('./custom/','t',3)
    #process('./custom/','tr',2, d=" ")
    #process('./custom/','zgn',2, d=" ")
    #process('./custom/','zgo',2, d=" ")