# tp_apprentissage/divers/clustering_synthese_.py
# Last modified: 2020-11-10 21:38:27 +01:00
import os
import csv
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import davies_bouldin_score, silhouette_score
from hdbscan import HDBSCAN
import matplotlib.pyplot as plt
import time
from random import seed, uniform
from statistics import mean
# Seed the RNG so the random eps values drawn for DBSCAN/HDBSCAN are
# reproducible across runs.
seed(10)
def synthese(files_path, n_iteration=20, output_path=".", show=True):
    """Benchmark four clustering algorithms on every CSV file in *files_path*.

    For each file, the points are read from a ';'-separated numeric CSV, then
    KMeans, AgglomerativeClustering (four linkages), DBSCAN and HDBSCAN are
    fitted over a grid of parameters. Davies-Bouldin indexes, silhouette
    coefficients and per-fit CPU times (in minutes) are collected and handed
    to generate_graphs() for plotting.

    Parameters
    ----------
    files_path : str
        Directory holding the dataset CSV files.
    n_iteration : int, optional
        Grid size: K ranges over 2..n_iteration+2; the eps and min_samples
        grids for (H)DBSCAN each contain n_iteration values.
    output_path : str, optional
        Directory passed to generate_graphs() for the output .png files.
    show : bool, optional
        Forwarded to generate_graphs(); display each figure when True.
    """
    files_names = os.listdir(path=files_path)
    for file_name in files_names:
        index_db = {}
        coef_sil = {}
        times = {}
        # Points extraction from current file.
        # BUG FIX: the original opened "dataset/" + file_name, ignoring the
        # files_path argument entirely; use the parameter instead.
        with open(os.path.join(files_path, file_name), newline='') as f:
            reader = csv.reader(f, delimiter=";", quoting=csv.QUOTE_NONNUMERIC)
            points = [row for row in reader]
        # --- KMeans ---
        print("Starts with KMeans clustering... (file: %s)"%(file_name))
        index_db['kmeans'] = {}
        coef_sil['kmeans'] = {}
        times['kmeans'] = {}
        for K in range(2, n_iteration + 3):
            modele = KMeans(n_clusters=K, init='k-means++')
            tic = time.process_time()
            predictions = modele.fit_predict(X=points)
            # Times are stored as minutes of CPU time.
            times['kmeans'][K] = (time.process_time() - tic) / 60
            index_db['kmeans'][K] = davies_bouldin_score(X=points, labels=predictions)
            coef_sil['kmeans'][K] = silhouette_score(X=points, labels=predictions)
        print("Ends with KMeans clustering")
        # --- AgglomerativeClustering: every linkage x every K ---
        print("Starts with AgglomerativeClustering clustering... (file: %s)"%(file_name))
        index_db['agglclst'] = {}
        coef_sil['agglclst'] = {}
        times['agglclst'] = {}
        for linkage in ("single", "average", "complete", "ward"):
            index_db['agglclst'][linkage] = {}
            coef_sil['agglclst'][linkage] = {}
            times['agglclst'][linkage] = {}
            for K in range(2, n_iteration + 3):
                modele = AgglomerativeClustering(n_clusters=K, linkage=linkage, affinity="euclidean")
                tic = time.process_time()
                predictions = modele.fit_predict(X=points)
                times['agglclst'][linkage][K] = (time.process_time() - tic) / 60
                index_db['agglclst'][linkage][K] = davies_bouldin_score(X=points, labels=predictions)
                coef_sil['agglclst'][linkage][K] = silhouette_score(X=points, labels=predictions)
        print("Ends with AgglomerativeClustering clustering")
        # --- DBSCAN: random eps values x min_samples grid ---
        # NOTE(review): silhouette_score raises when DBSCAN yields a single
        # label (e.g. all noise); original had the same exposure — confirm
        # datasets always produce >= 2 labels.
        print("Starts with DBSCAN clustering... (file: %s)"%(file_name))
        eps_min_dbscan = 0.1
        eps_max_dbscan = 1.5
        index_db['dbscan'] = {}
        coef_sil['dbscan'] = {}
        times['dbscan'] = {}
        for eps in [uniform(eps_min_dbscan, eps_max_dbscan) for _ in range(n_iteration)]:
            for min_s in range(1, n_iteration + 1):
                modele = DBSCAN(eps=eps, min_samples=min_s)
                tic = time.process_time()
                predictions = modele.fit_predict(X=points)
                # Keyed by the (eps, min_samples) pair that produced the run.
                times['dbscan'][eps, min_s] = (time.process_time() - tic) / 60
                index_db['dbscan'][eps, min_s] = davies_bouldin_score(X=points, labels=predictions)
                coef_sil['dbscan'][eps, min_s] = silhouette_score(X=points, labels=predictions)
        print("Ends with DBSCAN clustering")
        # --- HDBSCAN: same parameter grid as DBSCAN ---
        print("Starts with HDBSCAN clustering... (file: %s)"%(file_name))
        eps_min_hdbscan = 0.1
        eps_max_hdbscan = 1.5
        index_db['hdbscan'] = {}
        coef_sil['hdbscan'] = {}
        times['hdbscan'] = {}
        for eps in [uniform(eps_min_hdbscan, eps_max_hdbscan) for _ in range(n_iteration)]:
            for min_s in range(1, n_iteration + 1):
                modele = HDBSCAN(cluster_selection_epsilon=eps, min_samples=min_s)
                tic = time.process_time()
                predictions = modele.fit_predict(X=points)
                times['hdbscan'][eps, min_s] = (time.process_time() - tic) / 60
                index_db['hdbscan'][eps, min_s] = davies_bouldin_score(X=points, labels=predictions)
                coef_sil['hdbscan'][eps, min_s] = silhouette_score(X=points, labels=predictions)
        print("Ends with HDBSCAN clustering")
        # Plotting process.
        generate_graphs(file_name, index_db, coef_sil, times, output_path=output_path, show=show)
def _plot_bars(labels, heights, ylabel, color, out_file, show):
    # Render one bar chart (one bar per algorithm), save it as a png,
    # optionally display it, then close the figure to free memory.
    plt.figure(figsize=(12.8, 9.6))
    plt.bar(x=labels, height=heights, width=0.25, color=color)
    plt.xlabel("Algorithms")
    plt.ylabel(ylabel)
    plt.savefig(out_file)
    if show:
        plt.show()
    plt.close()


def generate_graphs(file_name, index_db, coef_sil, times, output_path=".", show=False):
    """Generate the three summary bar charts for one dataset: best (smallest)
    Davies-Bouldin index, best (biggest) silhouette coefficient, and mean
    clustering time per algorithm.

    Parameters
    ----------
    file_name : str
        Dataset file name, used in output file names and log messages.
    index_db, coef_sil, times : dict
        Metric dictionaries as filled by synthese(): keyed by algorithm name
        ('kmeans', 'agglclst', 'dbscan', 'hdbscan'), with 'agglclst' further
        keyed by linkage.
    output_path : str, optional
        Directory where the .png files are written.
    show : bool, optional
        Also display each figure with plt.show() when True.
    """
    linkages = ("single", "average", "complete", "ward")
    # Extracting smallest Davies-Bouldin indexes (lower is better).
    print("Smallest DB extraction... (file: %s)"%(file_name))
    kmeans_db = get_min_or_max_key_value(index_db['kmeans'], get="min")
    aggl_db = {lk: get_min_or_max_key_value(index_db['agglclst'][lk], get="min") for lk in linkages}
    dbscan_db = get_min_or_max_key_value(index_db['dbscan'], get="min")
    hdbscan_db = get_min_or_max_key_value(index_db['hdbscan'], get="min")
    # Extracting biggest silhouette coefficients (higher is better).
    print("Biggest Silhouette extraction... (file: %s)"%(file_name))
    kmeans_sil = get_min_or_max_key_value(coef_sil['kmeans'], get="max")
    aggl_sil = {lk: get_min_or_max_key_value(coef_sil['agglclst'][lk], get="max") for lk in linkages}
    dbscan_sil = get_min_or_max_key_value(coef_sil['dbscan'], get="max")
    hdbscan_sil = get_min_or_max_key_value(coef_sil['hdbscan'], get="max")
    # Extracting mean clustering process time (minutes).
    print("Mean clustering time calculation... (file: %s)"%(file_name))
    kmeans_time = mean(times['kmeans'].values())
    aggl_time = {lk: mean(times['agglclst'][lk].values()) for lk in linkages}
    dbscan_time = mean(times['dbscan'].values())
    hdbscan_time = mean(times['hdbscan'].values())
    # Davies-Bouldin chart.
    # BUG FIX: the "_" separator between file_name and the suffix was missing
    # here and for the silhouette plot (the mean-times plot already had it);
    # paths are now built with os.path.join instead of "/" concatenation.
    print("Generating DB scores plot... (file: %s)"%(file_name))
    labels_db = ['KMeans\nK=%s'%(kmeans_db[0]),
                 'AgglClst\n(single/K=%s)'%(aggl_db['single'][0]),
                 'AgglClst\n(average/K=%s)'%(aggl_db['average'][0]),
                 'AgglClst\n(complete/K=%s)'%(aggl_db['complete'][0]),
                 'AgglClst\n(ward/K=%s)'%(aggl_db['ward'][0]),
                 'DBSCAN\n(eps=%s/min_s=%s)'%(dbscan_db[0][0], dbscan_db[0][1]),
                 'HDBSCAN\n(eps=%s/min_s=%s)'%(hdbscan_db[0][0], hdbscan_db[0][1])]
    values_db = [kmeans_db[1]] + [aggl_db[lk][1] for lk in linkages] + [dbscan_db[1], hdbscan_db[1]]
    _plot_bars(labels_db, values_db, "Davies Bouldin", "blue",
               os.path.join(output_path, file_name + "_smallest_db_values.png"), show)
    # Silhouette chart.
    print("Generating Silhouette coefficients plot... (file: %s)"%(file_name))
    labels_sil = ['KMeans\nK=%s' % (kmeans_sil[0]),
                  'AgglClst\n(single/K=%s)' % (aggl_sil['single'][0]),
                  'AgglClst\n(average/K=%s)' % (aggl_sil['average'][0]),
                  'AgglClst\n(complete/K=%s)' % (aggl_sil['complete'][0]),
                  'AgglClst\n(ward/K=%s)' % (aggl_sil['ward'][0]),
                  'DBSCAN\n(eps=%s/min_s=%s)' % (dbscan_sil[0][0], dbscan_sil[0][1]),
                  'HDBSCAN\n(eps=%s/min_s=%s)' % (hdbscan_sil[0][0], hdbscan_sil[0][1])]
    values_sil = [kmeans_sil[1]] + [aggl_sil[lk][1] for lk in linkages] + [dbscan_sil[1], hdbscan_sil[1]]
    _plot_bars(labels_sil, values_sil, "Coefficient silhouette", "red",
               os.path.join(output_path, file_name + "_biggest_sil_values.png"), show)
    # Mean-time chart. BUG FIX: stray ")" removed from the 'DBSCAN' label.
    print("Generating mean times plot... (file: %s)"%(file_name))
    labels_time = ['KMeans',
                   'AgglClst\n(single)',
                   'AgglClst\n(average)',
                   'AgglClst\n(complete)',
                   'AgglClst\n(ward)',
                   'DBSCAN',
                   'HDBSCAN']
    values_time = [kmeans_time] + [aggl_time[lk] for lk in linkages] + [dbscan_time, hdbscan_time]
    _plot_bars(labels_time, values_time, "Mean time", "green",
               os.path.join(output_path, file_name + "_mean_times.png"), show)
def get_min_or_max_key_value(dictionary, get="min"):
    """Return the (key, value) item with the extreme value of *dictionary*.

    Parameters
    ----------
    dictionary : dict
        Non-empty mapping whose values are mutually comparable.
    get : str, optional
        "min" for the smallest value, "max" for the biggest.

    Returns
    -------
    tuple
        The (key, value) pair with the smallest/biggest value.

    Raises
    ------
    ValueError
        If *get* is neither "min" nor "max" (the original implementation
        silently returned None in that case, hiding caller typos).
    """
    if get == "max":
        return max(dictionary.items(), key=lambda kv: kv[1])
    if get == "min":
        return min(dictionary.items(), key=lambda kv: kv[1])
    raise ValueError('get must be "min" or "max", got %r' % (get,))
synthese(files_path="dataset", output_path="synthese", n_iteration=3)