import csv
import os
import time
from random import seed, uniform
from statistics import mean

import matplotlib.pyplot as plt
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import davies_bouldin_score, silhouette_score

# Seed the global RNG so the randomly drawn eps values used by the
# DBSCAN/HDBSCAN sweeps are reproducible from one run to the next.
seed(10)


def synthese(files_path, n_iteration=20, output_path=".", show=True):
    """Benchmark four clustering algorithms on every CSV file in *files_path*.

    For each dataset, KMeans, AgglomerativeClustering (single / average /
    complete / ward linkages), DBSCAN and HDBSCAN are fitted over a grid of
    hyper-parameters.  For every fit the Davies-Bouldin index, the silhouette
    coefficient and the fit time (CPU minutes) are recorded, then summarised
    as bar charts by generate_graphs().

    Parameters
    ----------
    files_path : str
        Directory holding ';'-separated, all-numeric CSV files
        (one data point per row, no header).
    n_iteration : int, optional
        Controls the size of the hyper-parameter grids
        (K sweep length, number of eps draws, min_samples sweep length).
    output_path : str, optional
        Directory where the PNG plots are written (created if missing).
    show : bool, optional
        If True, each plot is also displayed interactively.
    """
    os.makedirs(output_path, exist_ok=True)

    for file_name in os.listdir(path=files_path):
        index_db = {}
        coef_sil = {}
        times = {}

        # Point extraction from the current file.
        # Fixed: the directory used to be hard-coded to "dataset/" instead
        # of honouring the files_path argument.
        points = []
        with open(os.path.join(files_path, file_name), newline='') as f:
            rows = csv.reader(f, delimiter=";", quoting=csv.QUOTE_NONNUMERIC)
            for row in rows:
                points.append(row)

        # ---- KMeans: sweep the number of clusters K ----
        print("Starts with KMeans clustering... (file: %s)" % (file_name))
        index_db['kmeans'] = {}
        coef_sil['kmeans'] = {}
        times['kmeans'] = {}
        for K in range(2, n_iteration + 3):
            modele = KMeans(n_clusters=K, init='k-means++')
            tic = time.process_time()
            predictions = modele.fit_predict(X=points)
            times['kmeans'][K] = (time.process_time() - tic) / 60
            index_db['kmeans'][K] = davies_bouldin_score(X=points, labels=predictions)
            coef_sil['kmeans'][K] = silhouette_score(X=points, labels=predictions)
        print("Ends with KMeans clustering")

        # ---- AgglomerativeClustering: sweep K for each linkage ----
        print("Starts with AgglomerativeClustering clustering... (file: %s)" % (file_name))
        index_db['agglclst'] = {}
        coef_sil['agglclst'] = {}
        times['agglclst'] = {}
        for linkage in ("single", "average", "complete", "ward"):
            index_db['agglclst'][linkage] = {}
            coef_sil['agglclst'][linkage] = {}
            times['agglclst'][linkage] = {}
            for K in range(2, n_iteration + 3):
                # Euclidean distance is sklearn's default (and the only one
                # valid for 'ward'), so it is not passed explicitly; the
                # 'affinity' keyword is deprecated in recent sklearn.
                modele = AgglomerativeClustering(n_clusters=K, linkage=linkage)
                tic = time.process_time()
                predictions = modele.fit_predict(X=points)
                times['agglclst'][linkage][K] = (time.process_time() - tic) / 60
                index_db['agglclst'][linkage][K] = davies_bouldin_score(X=points, labels=predictions)
                coef_sil['agglclst'][linkage][K] = silhouette_score(X=points, labels=predictions)
        print("Ends with AgglomerativeClustering clustering")

        # ---- DBSCAN: random eps draws x min_samples sweep ----
        print("Starts with DBSCAN clustering... (file: %s)" % (file_name))
        index_db['dbscan'], coef_sil['dbscan'], times['dbscan'] = _density_scan(
            points, n_iteration,
            lambda eps, min_s: DBSCAN(eps=eps, min_samples=min_s))
        print("Ends with DBSCAN clustering")

        # ---- HDBSCAN: same grid, eps used as cluster_selection_epsilon ----
        print("Starts with HDBSCAN clustering... (file: %s)" % (file_name))
        index_db['hdbscan'], coef_sil['hdbscan'], times['hdbscan'] = _density_scan(
            points, n_iteration,
            lambda eps, min_s: HDBSCAN(cluster_selection_epsilon=eps,
                                       min_samples=min_s))
        print("Ends with HDBSCAN clustering")

        # Plotting process
        generate_graphs(file_name, index_db, coef_sil, times,
                        output_path=output_path, show=show)


def _density_scan(points, n_iteration, make_model, eps_min=0.1, eps_max=1.5):
    """Run a density-based model over a random eps x min_samples grid.

    *make_model* is a callable (eps, min_samples) -> estimator with a
    fit_predict method.  Returns three dicts keyed by (eps, min_samples):
    Davies-Bouldin index, silhouette coefficient, and fit time in minutes.

    Parameter combinations that yield a degenerate labelling (fewer than two
    distinct labels, or one label per point) are skipped because both scores
    are undefined there and would raise ValueError, killing the whole sweep.
    """
    index_db, coef_sil, durations = {}, {}, {}
    # The draw order (n_iteration uniforms per call) matches the original
    # code, so seed(10) reproducibility is preserved.
    for eps in [uniform(eps_min, eps_max) for _ in range(n_iteration)]:
        for min_s in range(1, n_iteration + 1):
            modele = make_model(eps, min_s)
            tic = time.process_time()
            predictions = modele.fit_predict(X=points)
            durations[eps, min_s] = (time.process_time() - tic) / 60
            n_labels = len(set(predictions))
            if n_labels < 2 or n_labels >= len(points):
                continue  # scores undefined; keep the timing, drop the scores
            index_db[eps, min_s] = davies_bouldin_score(X=points, labels=predictions)
            coef_sil[eps, min_s] = silhouette_score(X=points, labels=predictions)
    return index_db, coef_sil, durations


def generate_graphs(file_name, index_db, coef_sil, times, output_path=".", show=False):
    """Summarise the metrics gathered by synthese() as three bar charts.

    Produces, for one dataset: the smallest Davies-Bouldin index per
    algorithm, the biggest silhouette coefficient per algorithm, and the
    mean fit time per algorithm.  PNGs are written under *output_path*.
    """
    linkages = ("single", "average", "complete", "ward")

    def _labels(results):
        # One bar label per algorithm, annotated with the winning
        # hyper-parameters (K, or (eps, min_samples)).
        kmeans, single, average, complete, ward, dbscan, hdbscan = results
        return ['KMeans\nK=%s' % (kmeans[0]),
                'AgglClst\n(single/K=%s)' % (single[0]),
                'AgglClst\n(average/K=%s)' % (average[0]),
                'AgglClst\n(complete/K=%s)' % (complete[0]),
                'AgglClst\n(ward/K=%s)' % (ward[0]),
                'DBSCAN\n(eps=%s/min_s=%s)' % (dbscan[0][0], dbscan[0][1]),
                'HDBSCAN\n(eps=%s/min_s=%s)' % (hdbscan[0][0], hdbscan[0][1])]

    # Extracting smallest Davies-Bouldin indexes (lower is better).
    print("Smallest DB extraction... (file: %s)" % (file_name))
    best_db = [get_min_or_max_key_value(index_db['kmeans'], get="min")]
    best_db += [get_min_or_max_key_value(index_db['agglclst'][lk], get="min")
                for lk in linkages]
    best_db.append(get_min_or_max_key_value(index_db['dbscan'], get="min"))
    best_db.append(get_min_or_max_key_value(index_db['hdbscan'], get="min"))

    # Extracting biggest silhouette coefficients (higher is better).
    print("Biggest Silhouette extraction... (file: %s)" % (file_name))
    best_sil = [get_min_or_max_key_value(coef_sil['kmeans'], get="max")]
    best_sil += [get_min_or_max_key_value(coef_sil['agglclst'][lk], get="max")
                 for lk in linkages]
    best_sil.append(get_min_or_max_key_value(coef_sil['dbscan'], get="max"))
    best_sil.append(get_min_or_max_key_value(coef_sil['hdbscan'], get="max"))

    # Mean clustering time over the whole hyper-parameter grid.
    print("Mean clustering time calculation... (file: %s)" % (file_name))
    mean_times = [mean(times['kmeans'].values())]
    mean_times += [mean(times['agglclst'][lk].values()) for lk in linkages]
    mean_times.append(mean(times['dbscan'].values()))
    mean_times.append(mean(times['hdbscan'].values()))

    print("Generating DB scores plot... (file: %s)" % (file_name))
    # Filename fix: the '_' separator was missing (cf. "_mean_times.png").
    _plot_bars(_labels(best_db), [v for _, v in best_db],
               "Davies Bouldin", "blue",
               os.path.join(output_path, file_name + "_smallest_db_values.png"),
               show)

    print("Generating Silhouette coefficients plot... (file: %s)" % (file_name))
    _plot_bars(_labels(best_sil), [v for _, v in best_sil],
               "Coefficient silhouette", "red",
               os.path.join(output_path, file_name + "_biggest_sil_values.png"),
               show)

    print("Generating mean times plot... (file: %s)" % (file_name))
    # Label fix: the original had a stray ')' in 'DBSCAN)'.
    time_labels = ['KMeans', 'AgglClst\n(single)', 'AgglClst\n(average)',
                   'AgglClst\n(complete)', 'AgglClst\n(ward)',
                   'DBSCAN', 'HDBSCAN']
    _plot_bars(time_labels, mean_times, "Mean time", "green",
               os.path.join(output_path, file_name + "_mean_times.png"),
               show)


def _plot_bars(labels, values, ylabel, color, png_path, show):
    """Render one bar chart, save it to *png_path*, optionally display it."""
    plt.figure(figsize=(12.8, 9.6))
    plt.bar(x=labels, height=values, width=0.25, color=color)
    plt.xlabel("Algorithms")
    plt.ylabel(ylabel)
    plt.savefig(png_path)
    if show:
        plt.show()
    plt.close()


def get_min_or_max_key_value(dictionary, get="min"):
    """Return the (key, value) item with the smallest value if get == "min",
    or the biggest value if get == "max".

    Raises
    ------
    ValueError
        If *get* is neither "min" nor "max" (the original silently returned
        None here, which would hide a call-site typo).
    """
    if get == "max":
        return max(dictionary.items(), key=lambda kv: kv[1])
    if get == "min":
        return min(dictionary.items(), key=lambda kv: kv[1])
    raise ValueError('get must be "min" or "max", not %r' % (get,))


if __name__ == "__main__":
    synthese(files_path="dataset", output_path="synthese", n_iteration=3)