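"""Benchmark several clustering algorithms (KMeans, AgglomerativeClustering,
DBSCAN, HDBSCAN) on every CSV file in a dataset directory, recording the
Davies-Bouldin index (lower is better), the silhouette coefficient (higher is
better) and the fit time for each parameter setting, then save comparison bar
charts per file."""
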
import os
import csv
import time
from random import seed, uniform
from statistics import mean

import matplotlib.pyplot as plt
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import davies_bouldin_score, silhouette_score

# Make the random eps draws below reproducible. Note that seed() only affects
# Python's random module, so KMeans is given an explicit random_state as well.
seed(10)


def synthese(files_path, n_iteration=20, output_path=".", show=True):
    files_names = os.listdir(path=files_path)
    for file_name in files_names:
        index_db = {}
        coef_sil = {}
        times = {}

        # Extract the points from the current file (semicolon-delimited,
        # all-numeric CSV). Build the path from files_path instead of a
        # hardcoded "dataset/" prefix.
        points = []
        with open(os.path.join(files_path, file_name), newline='') as f:
            rows = csv.reader(f, delimiter=";", quoting=csv.QUOTE_NONNUMERIC)
            for row in rows:
                points.append(row)

        # Apply the KMeans model
        print("Starts with KMeans clustering... (file: %s)" % (file_name))
        index_db['kmeans'] = {}
        coef_sil['kmeans'] = {}
        times['kmeans'] = {}
        for K in range(2, n_iteration + 3):
            # random_state pins KMeans' centroid initialisation, which the
            # global seed() call above does not cover.
            modele = KMeans(n_clusters=K, init='k-means++', random_state=10)
            tic = time.process_time()
            predictions = modele.fit_predict(X=points)
            times['kmeans'][K] = (time.process_time() - tic) / 60  # minutes
            index_db['kmeans'][K] = davies_bouldin_score(X=points, labels=predictions)
            coef_sil['kmeans'][K] = silhouette_score(X=points, labels=predictions)
        print("Ends with KMeans clustering")

        # Apply the AgglomerativeClustering model
        print("Starts with AgglomerativeClustering clustering... (file: %s)" % (file_name))
        index_db['agglclst'] = {}
        coef_sil['agglclst'] = {}
        times['agglclst'] = {}
        linkages = ["single", "average", "complete", "ward"]
        for linkage in linkages:
            index_db['agglclst'][linkage] = {}
            coef_sil['agglclst'][linkage] = {}
            times['agglclst'][linkage] = {}
            for K in range(2, n_iteration + 3):
                # metric="euclidean" replaces the affinity parameter, which
                # was deprecated and then removed in recent scikit-learn.
                modele = AgglomerativeClustering(n_clusters=K, linkage=linkage, metric="euclidean")
                tic = time.process_time()
                predictions = modele.fit_predict(X=points)
                times['agglclst'][linkage][K] = (time.process_time() - tic) / 60
                index_db['agglclst'][linkage][K] = davies_bouldin_score(X=points, labels=predictions)
                coef_sil['agglclst'][linkage][K] = silhouette_score(X=points, labels=predictions)
        print("Ends with AgglomerativeClustering clustering")

        # Apply the DBSCAN model
        print("Starts with DBSCAN clustering... (file: %s)" % (file_name))
        eps_min_dbscan = 0.1
        eps_max_dbscan = 1.5
        index_db['dbscan'] = {}
        coef_sil['dbscan'] = {}
        times['dbscan'] = {}
        for eps in [uniform(eps_min_dbscan, eps_max_dbscan) for i in range(n_iteration)]:
            for min_s in range(1, n_iteration + 1):
                modele = DBSCAN(eps=eps, min_samples=min_s)
                tic = time.process_time()
                predictions = modele.fit_predict(X=points)
                times['dbscan'][eps, min_s] = (time.process_time() - tic) / 60
                # Both scores require between 2 and n_samples - 1 distinct
                # labels; skip parameter pairs where DBSCAN put everything in
                # one cluster or made every point its own cluster (which
                # min_samples=1 allows), since scoring them would raise.
                n_labels = len(set(predictions))
                if not 2 <= n_labels <= len(points) - 1:
                    continue
                index_db['dbscan'][eps, min_s] = davies_bouldin_score(X=points, labels=predictions)
                coef_sil['dbscan'][eps, min_s] = silhouette_score(X=points, labels=predictions)
        print("Ends with DBSCAN clustering")

        # Apply the HDBSCAN model
        print("Starts with HDBSCAN clustering... (file: %s)" % (file_name))
        eps_min_hdbscan = 0.1
        eps_max_hdbscan = 1.5
        index_db['hdbscan'] = {}
        coef_sil['hdbscan'] = {}
        times['hdbscan'] = {}
        for eps in [uniform(eps_min_hdbscan, eps_max_hdbscan) for i in range(n_iteration)]:
            for min_s in range(1, n_iteration + 1):
                modele = HDBSCAN(cluster_selection_epsilon=eps, min_samples=min_s)
                tic = time.process_time()
                predictions = modele.fit_predict(X=points)
                times['hdbscan'][eps, min_s] = (time.process_time() - tic) / 60
                # Same guard as for DBSCAN: the scores need at least two
                # clusters and at least one point outside its own cluster.
                n_labels = len(set(predictions))
                if not 2 <= n_labels <= len(points) - 1:
                    continue
                index_db['hdbscan'][eps, min_s] = davies_bouldin_score(X=points, labels=predictions)
                coef_sil['hdbscan'][eps, min_s] = silhouette_score(X=points, labels=predictions)
        print("Ends with HDBSCAN clustering")

        # Plotting process
        generate_graphs(file_name, index_db, coef_sil, times, output_path=output_path, show=show)


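# The helper below is NOT part of the original script: it is a minimal,
# hypothetical sketch of how to generate an input file in the format that
# synthese() expects (semicolon-delimited, all-numeric CSV). It assumes
# scikit-learn's make_blobs is acceptable as a data source; the path and
# n_samples/centers values are illustrative only.
def make_demo_csv(path="dataset/demo.csv", n_samples=300, centers=3):
    from sklearn.datasets import make_blobs
    X, _ = make_blobs(n_samples=n_samples, centers=centers, random_state=10)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", newline='') as f:
        # QUOTE_NONNUMERIC on the writer leaves floats unquoted, so the
        # reader in synthese() parses them back as floats.
        writer = csv.writer(f, delimiter=";", quoting=csv.QUOTE_NONNUMERIC)
        writer.writerows(X.tolist())

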
def generate_graphs(file_name, index_db, coef_sil, times, output_path=".", show=False):
    # For KMeans
    index_db_kmeans = index_db['kmeans']
    coef_sil_kmeans = coef_sil['kmeans']
    times_kmeans = times['kmeans']
    # For AgglomerativeClustering
    index_db_agglclst = index_db['agglclst']
    coef_sil_agglclst = coef_sil['agglclst']
    times_agglclst = times['agglclst']
    # For DBSCAN
    index_db_dbscan = index_db['dbscan']
    coef_sil_dbscan = coef_sil['dbscan']
    times_dbscan = times['dbscan']
    # For HDBSCAN
    index_db_hdbscan = index_db['hdbscan']
    coef_sil_hdbscan = coef_sil['hdbscan']
    times_hdbscan = times['hdbscan']

    # Extract the smallest Davies-Bouldin index per algorithm
    print("Smallest DB extraction... (file: %s)" % (file_name))
    kmeans_smallest_db = get_min_or_max_key_value(index_db_kmeans, get="min")
    single_smallest_db = get_min_or_max_key_value(index_db_agglclst['single'], get="min")
    average_smallest_db = get_min_or_max_key_value(index_db_agglclst['average'], get="min")
    complete_smallest_db = get_min_or_max_key_value(index_db_agglclst['complete'], get="min")
    ward_smallest_db = get_min_or_max_key_value(index_db_agglclst['ward'], get="min")
    dbscan_smallest_db = get_min_or_max_key_value(index_db_dbscan, get="min")
    hdbscan_smallest_db = get_min_or_max_key_value(index_db_hdbscan, get="min")
    # Extract the biggest silhouette coefficient per algorithm
    print("Biggest Silhouette extraction... (file: %s)" % (file_name))
    kmeans_biggest_sil = get_min_or_max_key_value(coef_sil_kmeans, get="max")
    single_biggest_sil = get_min_or_max_key_value(coef_sil_agglclst['single'], get="max")
    average_biggest_sil = get_min_or_max_key_value(coef_sil_agglclst['average'], get="max")
    complete_biggest_sil = get_min_or_max_key_value(coef_sil_agglclst['complete'], get="max")
    ward_biggest_sil = get_min_or_max_key_value(coef_sil_agglclst['ward'], get="max")
    dbscan_biggest_sil = get_min_or_max_key_value(coef_sil_dbscan, get="max")
    hdbscan_biggest_sil = get_min_or_max_key_value(coef_sil_hdbscan, get="max")
    # Compute the mean clustering time per algorithm
    print("Mean clustering time calculation... (file: %s)" % (file_name))
    kmeans_mean_time = mean(list(times_kmeans.values()))
    single_mean_time = mean(list(times_agglclst['single'].values()))
    average_mean_time = mean(list(times_agglclst['average'].values()))
    complete_mean_time = mean(list(times_agglclst['complete'].values()))
    ward_mean_time = mean(list(times_agglclst['ward'].values()))
    dbscan_mean_time = mean(list(times_dbscan.values()))
    hdbscan_mean_time = mean(list(times_hdbscan.values()))

    # Plot the Davies-Bouldin bars
    print("Generating DB scores plot... (file: %s)" % (file_name))
    plt.figure(figsize=(12.8, 9.6))
    # eps is a random float, so format it to two decimals to keep the
    # tick labels readable.
    algorithms_db = ['KMeans\nK=%s' % (kmeans_smallest_db[0]),
                     'AgglClst\n(single/K=%s)' % (single_smallest_db[0]),
                     'AgglClst\n(average/K=%s)' % (average_smallest_db[0]),
                     'AgglClst\n(complete/K=%s)' % (complete_smallest_db[0]),
                     'AgglClst\n(ward/K=%s)' % (ward_smallest_db[0]),
                     'DBSCAN\n(eps=%.2f/min_s=%s)' % (dbscan_smallest_db[0][0], dbscan_smallest_db[0][1]),
                     'HDBSCAN\n(eps=%.2f/min_s=%s)' % (hdbscan_smallest_db[0][0], hdbscan_smallest_db[0][1])]
    values_db = [kmeans_smallest_db[1],
                 single_smallest_db[1],
                 average_smallest_db[1],
                 complete_smallest_db[1],
                 ward_smallest_db[1],
                 dbscan_smallest_db[1],
                 hdbscan_smallest_db[1]]
    plt.bar(x=algorithms_db, height=values_db, width=0.25, color="blue")
    plt.xlabel("Algorithms")
    plt.ylabel("Davies-Bouldin index")
    # Underscore before the suffix, consistent with the mean-times plot.
    plt.savefig(output_path + "/" + file_name + "_smallest_db_values.png")
    if show:
        plt.show()
    plt.close()

    # Plot the silhouette bars
    print("Generating Silhouette coefficients plot... (file: %s)" % (file_name))
    plt.figure(figsize=(12.8, 9.6))
    algorithms_sil = ['KMeans\nK=%s' % (kmeans_biggest_sil[0]),
                      'AgglClst\n(single/K=%s)' % (single_biggest_sil[0]),
                      'AgglClst\n(average/K=%s)' % (average_biggest_sil[0]),
                      'AgglClst\n(complete/K=%s)' % (complete_biggest_sil[0]),
                      'AgglClst\n(ward/K=%s)' % (ward_biggest_sil[0]),
                      'DBSCAN\n(eps=%.2f/min_s=%s)' % (dbscan_biggest_sil[0][0], dbscan_biggest_sil[0][1]),
                      'HDBSCAN\n(eps=%.2f/min_s=%s)' % (hdbscan_biggest_sil[0][0], hdbscan_biggest_sil[0][1])]
    values_sil = [kmeans_biggest_sil[1],
                  single_biggest_sil[1],
                  average_biggest_sil[1],
                  complete_biggest_sil[1],
                  ward_biggest_sil[1],
                  dbscan_biggest_sil[1],
                  hdbscan_biggest_sil[1]]
    plt.bar(x=algorithms_sil, height=values_sil, width=0.25, color="red")
    plt.xlabel("Algorithms")
    plt.ylabel("Silhouette coefficient")
    plt.savefig(output_path + "/" + file_name + "_biggest_sil_values.png")
    if show:
        plt.show()
    plt.close()

    # Plot the mean time bars
    print("Generating mean times plot... (file: %s)" % (file_name))
    plt.figure(figsize=(12.8, 9.6))
    algorithms = ['KMeans',
                  'AgglClst\n(single)',
                  'AgglClst\n(average)',
                  'AgglClst\n(complete)',
                  'AgglClst\n(ward)',
                  'DBSCAN',
                  'HDBSCAN']
    values_time = [kmeans_mean_time,
                   single_mean_time,
                   average_mean_time,
                   complete_mean_time,
                   ward_mean_time,
                   dbscan_mean_time,
                   hdbscan_mean_time]
    plt.bar(x=algorithms, height=values_time, width=0.25, color="green")
    plt.xlabel("Algorithms")
    plt.ylabel("Mean time (minutes)")
    plt.savefig(output_path + "/" + file_name + "_mean_times.png")
    if show:
        plt.show()
    plt.close()


def get_min_or_max_key_value(dictionary, get="min"):
    """Return the (key, value) pair with the smallest value when get="min",
    or with the biggest value when get="max"."""
    if get == "max":
        return max(dictionary.items(), key=lambda x: x[1])
    if get == "min":
        return min(dictionary.items(), key=lambda x: x[1])
    raise ValueError("get must be 'min' or 'max'")
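
# For example, get_min_or_max_key_value({2: 0.41, 3: 0.35, 4: 0.58}) returns
# (3, 0.35), and the same call with get="max" returns (4, 0.58).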


if __name__ == "__main__":
    # savefig() fails if the output directory does not exist yet.
    os.makedirs("synthese", exist_ok=True)
    synthese(files_path="dataset", output_path="synthese", n_iteration=3)