# Listing metadata: 231 lines, 8.6 KiB, Python, no trailing newline.
from numpy.lib.function_base import digitize
|
|
from scipy.io import arff
|
|
from sklearn import cluster
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
import time
|
|
import math
|
|
from sklearn import metrics
|
|
import numpy as np
|
|
from hdbscan import HDBSCAN
|
|
from operator import itemgetter
|
|
|
|
def write(f, title, label,time,timeTotal,score,nb_clusters,method,bruit="No value"):
    """Append one result record to the open text stream ``f``.

    The record is the title line, then one ``"<caption> = <value>"`` line
    for each of label, time, total time, score, number of clusters, noise
    ("Bruit") and metric, in that order, followed by an empty line.

    Every value is concatenated to its caption as-is, so all arguments
    other than ``f`` must already be strings.
    """
    f.write(title + '\n')
    captioned_values = (
        ("Label = ", label),
        ("Time = ", time),
        ("Total time = ", timeTotal),
        ("Score = ", score),
        ("Number of clusters = ", nb_clusters),
        ("Bruit = ", bruit),
        ("Metric = ", method),
    )
    for caption, value in captioned_values:
        f.write(caption + value + "\n")
    # Blank line separating this record from the next one.
    f.write("\n")
|
|
|
|
############################################
|
|
############## HDBSCAN #############
|
|
############################################
|
|
|
|
|
|
def calcul_hdbscan(data_tab,nb_min,nb_max,file,dim=2,method="silhouette"):
    """Search HDBSCAN's ``min_cluster_size`` and log the best run to ``file``.

    One HDBSCAN model is fitted for every ``min_cluster_size`` in
    ``range(nb_min, nb_max)``. Each run is graded with ``score`` and the
    winner is chosen with ``bestScore``. The winner's labels, fit time,
    total search time, score, cluster count and noise percentage are then
    written through ``write``.

    Args:
        data_tab: samples to cluster, passed unchanged to ``HDBSCAN.fit``.
        nb_min: first ``min_cluster_size`` tried (inclusive).
        nb_max: end of the ``min_cluster_size`` range (exclusive).
        file: open text stream receiving the report.
        dim: unused; kept so existing callers keep working.
        method: metric name understood by ``score`` / ``bestScore``.

    Raises:
        ValueError: if ``range(nb_min, nb_max)`` is empty.
    """
    if nb_min >= nb_max:
        raise ValueError("empty search range: nb_min must be smaller than nb_max")

    grades = []
    times = []
    runs = []
    big_start = time.time()
    for nb in range(nb_min, nb_max):
        start = time.time()
        db = HDBSCAN(min_cluster_size=nb).fit(data_tab)
        end = time.time()
        labels = db.labels_
        grades.append(score(method, labels, data_tab))
        times.append(end - start)
        # Keep the labels so the winning run is reported as scored,
        # without fitting the same model a second time.
        runs.append(labels)
    big_end = time.time()

    max_value = bestScore(method, grades)
    max_index = grades.index(max_value)
    label_list = runs[max_index].tolist()
    title = "HDBSCAN Clustering, min_cluster_size=" + str(max_index + nb_min)

    if label_list:
        # Label -1 marks noise points.
        bruit = str(100 * (label_list.count(-1) / len(label_list))) + "%"
        nb_clusters = max(label_list) + 1
    else:
        bruit = "Problème, liste vide"
        nb_clusters = 0

    write(file, title, str(label_list), str(times[max_index]),
          str(big_end - big_start), str(max_value), str(nb_clusters),
          method, bruit)
    print("FIN HDBDSCAN")
|
|
|
|
############################################
|
|
############## DBSCAN #############
|
|
############################################
|
|
|
|
|
|
def calcul_dbscan(data_tab,dist_min,dist_max,dist_step,nb_min,nb_max,file,dim=2,method="silhouette"):
    """Grid-search DBSCAN's ``eps`` / ``min_samples`` and log the best run.

    A DBSCAN model is fitted for every ``eps`` in
    ``np.arange(dist_min, dist_max, dist_step)`` combined with every
    ``min_samples`` in ``range(nb_min, nb_max)``. Each run is graded with
    ``score`` and the winner is chosen with ``bestScore``. The winner's
    labels, fit time, total search time, score, cluster count and noise
    percentage are then written through ``write``.

    Args:
        data_tab: samples to cluster, passed unchanged to ``DBSCAN.fit``.
        dist_min: first ``eps`` tried (inclusive).
        dist_max: end of the ``eps`` range (exclusive).
        dist_step: increment between successive ``eps`` values.
        nb_min: first ``min_samples`` tried (inclusive).
        nb_max: end of the ``min_samples`` range (exclusive).
        file: open text stream receiving the report.
        dim: unused; kept so existing callers keep working.
        method: metric name understood by ``score`` / ``bestScore``.

    Raises:
        ValueError: if either parameter range is empty.
    """
    eps_values = np.arange(dist_min, dist_max, dist_step)
    if len(eps_values) == 0 or nb_min >= nb_max:
        raise ValueError(
            "empty DBSCAN search grid: check dist_min/dist_max/dist_step "
            "and nb_min/nb_max"
        )

    grades = []
    times = []
    runs = []
    big_start = time.time()
    for dist in eps_values:
        for nb in range(nb_min, nb_max):
            start = time.time()
            db = cluster.DBSCAN(eps=dist, min_samples=nb).fit(data_tab)
            end = time.time()
            labels = db.labels_
            grades.append((dist, nb, score(method, labels, data_tab)))
            times.append(end - start)
            # Keep the labels so the winning run is reported as scored,
            # without fitting the same model a second time.
            runs.append(labels)
    big_end = time.time()

    max_tuple = bestScore(method, grades, True)
    max_index = grades.index(max_tuple)
    best_eps, best_min_samples, best_score = max_tuple
    label_list = runs[max_index].tolist()

    if label_list:
        # Label -1 marks noise points.
        bruit = str(100 * (label_list.count(-1) / len(label_list))) + "%"
        nb_clusters = max(label_list) + 1
    else:
        bruit = "Problème, liste vide"
        nb_clusters = 0

    title = ("DBSCAN Clustering, eps=" + str(best_eps)
             + ', min_samples=' + str(best_min_samples)
             + ', clusters=' + str(nb_clusters))
    # Log only the score, like the other algorithms do; eps and min_samples
    # already appear in the title.
    write(file, title, str(label_list), str(times[max_index]),
          str(big_end - big_start), str(best_score), str(nb_clusters),
          method, bruit)
    print("FIN DBSCAN")
|
|
|
|
############################################
|
|
############## AGGLO #############
|
|
############################################
|
|
|
|
def calcul_agglo(data_tab,linkage,deb,fin,file,method="silhouette",dim=2):
    """Search the cluster count for agglomerative clustering and log the best.

    One ``AgglomerativeClustering`` model is fitted for every ``n_clusters``
    in ``range(deb, fin)`` with the given ``linkage``. Each run is graded
    with ``score`` and the winner is chosen with ``bestScore``. The winner's
    labels, fit time, total search time, score and cluster count are then
    written through ``write``.

    Args:
        data_tab: samples to cluster, passed unchanged to ``fit``.
        linkage: linkage criterion forwarded to ``AgglomerativeClustering``;
            it is also appended to the report title, so it must be a string.
        deb: first ``n_clusters`` tried (inclusive).
        fin: end of the ``n_clusters`` range (exclusive).
        file: open text stream receiving the report.
        method: metric name understood by ``score`` / ``bestScore``.
        dim: unused; kept so existing callers keep working.

    Raises:
        ValueError: if ``range(deb, fin)`` is empty.
    """
    if deb >= fin:
        raise ValueError("empty search range: deb must be smaller than fin")

    grades = []
    times = []
    runs = []
    big_start = time.time()
    for k in range(deb, fin):
        start = time.time()
        model = cluster.AgglomerativeClustering(n_clusters=k, linkage=linkage).fit(data_tab)
        end = time.time()
        labels = model.labels_
        grades.append(score(method, labels, data_tab))
        times.append(end - start)
        # Keep the labels so the winning run is reported as scored,
        # without fitting the same model a second time.
        runs.append(labels)
    big_end = time.time()

    max_value = bestScore(method, grades)
    max_index = grades.index(max_value)
    label_list = runs[max_index].tolist()
    title = "Agglomerative Clustering, k=" + str(max_index + deb) + ", " + method + ", " + linkage
    write(file, title, str(label_list), str(times[max_index]),
          str(big_end - big_start), str(max_value),
          str(max(label_list) + 1), method)
    print("FIN AGGLO")
|
|
|
|
############################################
|
|
############## KMEANS #############
|
|
############################################
|
|
|
|
def calcul_kmeans(data_tab,deb,fin,file,method="silhouette",dim=2):
    """Search the cluster count for K-Means and log the best run to ``file``.

    One ``KMeans`` model is fitted for every ``n_clusters`` in
    ``range(deb, fin)``. Each run is graded with ``score`` and the winner is
    chosen with ``bestScore``. The winner's labels, fit time, total search
    time, score and cluster count are then written through ``write``.

    The reported labels are the ones that were actually scored. No
    ``random_state`` is passed to ``KMeans``, so fitting the winning ``k`` a
    second time could yield a different partition than the one whose score
    is logged.

    Args:
        data_tab: samples to cluster, passed unchanged to ``KMeans.fit``.
        deb: first ``n_clusters`` tried (inclusive).
        fin: end of the ``n_clusters`` range (exclusive).
        file: open text stream receiving the report.
        method: metric name understood by ``score`` / ``bestScore``.
        dim: unused; kept so existing callers keep working.

    Raises:
        ValueError: if ``range(deb, fin)`` is empty.
    """
    if deb >= fin:
        raise ValueError("empty search range: deb must be smaller than fin")

    times = []
    grades = []
    runs = []
    big_start = time.time()
    for k in range(deb, fin):
        start = time.time()
        kmeans = cluster.KMeans(n_clusters=k).fit(data_tab)
        end = time.time()
        labels = kmeans.labels_
        grades.append(score(method, labels, data_tab))
        times.append(end - start)
        # Remember this run's labels so score and labels stay consistent.
        runs.append(labels)
    big_end = time.time()

    max_value = bestScore(method, grades)
    max_index = grades.index(max_value)
    label_list = runs[max_index].tolist()
    title = "K-Means Clustering, k=" + str(max_index + deb) + ", " + method
    write(file, title, str(label_list), str(times[max_index]),
          str(big_end - big_start), str(max_value),
          str(max(label_list) + 1), method)
    print("FIN KMEANS")
|
|
|
|
def score(metric,labels,data_tab):
    """Grade a clustering of ``data_tab`` with the requested quality metric.

    Supported ``metric`` names:
      * "silhouette" -> ``metrics.silhouette_score`` (euclidean distance)
      * "bouldin"    -> ``metrics.davies_bouldin_score``
      * "calinski"   -> ``metrics.calinski_harabasz_score``

    If the metric function raises ``ValueError``, the result is ``-inf`` for
    "silhouette" and "calinski" and ``+inf`` for "bouldin". For any other
    name, a message is printed and ``-1`` is returned.
    """
    if metric == "silhouette":
        try:
            return metrics.silhouette_score(data_tab, labels, metric='euclidean')
        except ValueError:
            return -math.inf

    if metric == "bouldin":
        try:
            return metrics.davies_bouldin_score(data_tab, labels)
        except ValueError:
            return math.inf

    if metric == "calinski":
        try:
            return metrics.calinski_harabasz_score(data_tab, labels)
        except ValueError:
            return -math.inf

    print("Methode pas reconnue")
    return -1
|
|
|
|
def bestScore(metric,scores,tuple=False):
    """Pick the best entry of ``scores`` according to ``metric``.

    "silhouette" and "calinski" select the largest entry; "bouldin" selects
    the smallest. When ``tuple`` is true, the entries are compared on their
    third element (index 2) and the whole winning entry is returned;
    otherwise the entries are compared directly.

    For any other metric name, a message is printed and ``-1`` is returned.
    """
    if metric in ("silhouette", "calinski"):
        pick = max
    elif metric == "bouldin":
        pick = min
    else:
        print("Methode pas reconnue")
        return -1

    if tuple:
        return pick(scores, key=itemgetter(2))
    return pick(scores)
|
|
|
|
|
|
def process(path,name,dim,d="\t"):
    """Run every clustering benchmark on one data set and log the results.

    Loads ``path + name + ".data"`` with ``np.loadtxt`` and keeps the first
    ``dim`` columns of each row. It then (re)creates ``name + ".txt"``. The
    report starts with one ``fN=[...]`` line per coordinate, followed by the
    records written by ``calcul_hdbscan``, ``calcul_dbscan``,
    ``calcul_agglo`` and ``calcul_kmeans`` for each of the metrics
    "silhouette", "bouldin" and "calinski".

    Args:
        path: prefix prepended to ``name`` to locate the ``.data`` file.
        name: data set name; also the stem of the ``.txt`` report file.
        dim: number of coordinates per sample, 2 or 3.
        d: column delimiter handed to ``np.loadtxt``.

    Raises:
        ValueError: if ``dim`` is neither 2 nor 3. This is raised before any
            file is read or truncated.
    """
    if dim == 3:
        n_cols = 3
    elif dim == 2:
        n_cols = 2
    else:
        raise ValueError("dim must be 2 or 3, got " + repr(dim))

    databrut = np.loadtxt(path + name + ".data", delimiter=d)
    data = [[row[i] for i in range(n_cols)] for row in databrut]

    methods = ["silhouette", "bouldin", "calinski"]

    # Mode "w" truncates any previous report; the context manager closes the
    # file even if one of the benchmarks raises.
    with open(name + ".txt", "w") as f:
        for axis in range(n_cols):
            column = [point[axis] for point in data]
            f.write("f" + str(axis) + "=" + str(column) + "\n")

        for meth in methods:
            # A.DATA SILHOUETTE , H.DATA SILHOUETTE
            calcul_hdbscan(data,2,10,dim=dim,file=f,method=meth)
            calcul_dbscan(data,dist_min=1,dist_max=6,dist_step=1,nb_min=5,nb_max=15,dim=dim,file=f,method=meth)
            calcul_agglo(data,'complete',2,3,dim=dim, file=f,method=meth)
            calcul_kmeans(data,2,20,f,method=meth)

            # Parameter presets used for the other data sets; swap them in
            # for the four calls above when processing those files.

            # T.DATA , Agglo Average SILHOUETTE
            #calcul_hdbscan(data,2,10,dim=dim,file=f,method=meth)
            #calcul_dbscan(data,dist_min=0.2,dist_max=1.6,dist_step=0.2,nb_min=5,nb_max=30,dim=dim,file=f,method=meth)
            #calcul_agglo(data,'average',2,20,dim=dim, file=f,method=meth)
            #calcul_kmeans(data,2,20,f,method=meth)

            # ZGN.DATA
            #calcul_hdbscan(data,3,4,dim=dim,file=f,method=meth)
            #calcul_dbscan(data,dist_min=0.1,dist_max=0.5,dist_step=0.05,nb_min=15,nb_max=30,dim=dim,file=f,method=meth)
            #calcul_agglo(data,'average',2,20,dim=dim, file=f,method=meth)
            #calcul_kmeans(data,2,20,f,method=meth)

            # TR.DATA
            #calcul_hdbscan(data,2,6,dim=dim,file=f,method=meth)
            #calcul_dbscan(data,dist_min=0.1,dist_max=0.5,dist_step=0.05,nb_min=2,nb_max=10,dim=dim,file=f,method=meth)
            #calcul_agglo(data,'single',2,20,dim=dim, file=f,method=meth)
            #calcul_kmeans(data,2,20,f,method=meth)

            # ZGO.DATA
            #calcul_hdbscan(data,3,15,dim=dim,file=f,method=meth)
            #calcul_dbscan(data,dist_min=0.1,dist_max=0.5,dist_step=0.05,nb_min=2,nb_max=10,dim=dim,file=f,method=meth)
            #calcul_agglo(data,'single',2,20,dim=dim, file=f,method=meth)
            #calcul_kmeans(data,2,20,f,method=meth)
|
|
|
|
# Data set processed when the script runs; the other data sets are kept
# below as ready-to-use alternatives.
process(path='./custom/', name='a', dim=3)
#process('./custom/','h',3)
#process('./custom/','t',3)
#process('./custom/','tr',2, d=" ")
#process('./custom/','zgn',2, d=" ")
#process('./custom/','zgo',2, d=" ")