#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Clustering benchmark script for one CSV dataset.

Loads the dataset, rescales it, then runs several clustering methods over a
range of parameters and plots the evaluation metrics of every run.

Created on Wed Dec  8 16:07:28 2021

@author: pfaure
"""

from numpy import arange

from myplotlib import print_1d_data, print_2d_data
from mydatalib import (
    scale_data,
    apply_DBSCAN,
    evaluate,
    extract_data_csv,
    apply_kmeans,
    apply_agglomerative_clustering,
    apply_mean_shift,
)

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
path = './new-data/'
dataset_name = "pluie"
eps = 0.6          # eps value used by the DBSCAN sweep
first_column = 1   # column bounds handed to extract_data_csv
last_column = 12
save = False       # forwarded unchanged to every print_1d_data call

# ---------------------------------------------------------------------------
# Dataset loading and scaling
# ---------------------------------------------------------------------------
print("-----------------------------------------------------------")
print(" Chargement du dataset : " + dataset_name)
# `villes` is not used below; presumably the row labels (city names) -- confirm
# against extract_data_csv.
(villes, data) = extract_data_csv(path + dataset_name,
                                  first_column, last_column)
print(data)

print("-----------------------------------------------------------")
print(" Mise à l'échelle")
data_scaled = scale_data(data)

# ---------------------------------------------------------------------------
# k-means: one run per k in [2, k_max), k_max itself excluded
# ---------------------------------------------------------------------------
k_max = 20
print("-----------------------------------------------------------")
print(" Application de k-means")

k = []
durations = []
silouettes = []
daviess = []
calinskis = []
inerties = []
iterations = []
for n_clusters in range(2, k_max):
    # Fit the model
    (model, duration) = apply_kmeans(data_scaled, k=n_clusters,
                                     init="k-means++")
    # Score the resulting clustering
    (silouette, davies, calinski) = evaluate(data_scaled, model)
    # Record the values of this run
    k.append(n_clusters)
    durations.append(duration)
    silouettes.append(silouette)
    daviess.append(davies)
    calinskis.append(calinski)
    inerties.append(model.inertia_)
    iterations.append(model.n_iter_)

# One plot per recorded series, in the original order. The first entry plots
# k against itself, exactly as the original script did.
kmeans_plots = [
    (k, {"y_name": "k"}),
    (durations, {"y_name": "temps_de_calcul", "y_unit": "ms"}),
    (silouettes, {"y_name": "coeficient_de_silhouette"}),
    (daviess, {"y_name": "coeficient_de_Davies"}),
    (calinskis, {"y_name": "coeficient_de_Calinski"}),
    (inerties, {"y_name": "inertie"}),
    (iterations, {"y_name": "nombre_d_iterations"}),
]
last_plot = len(kmeans_plots) - 1
for plot_index, (values, labels) in enumerate(kmeans_plots):
    # Only the final k-means plot is drawn with stop=True.
    print_1d_data(k, values, x_name="k", dataset_name=dataset_name,
                  method_name="k-means", stop=(plot_index == last_plot),
                  save=save, **labels)

# ---------------------------------------------------------------------------
# Agglomerative clustering: one run per k in [2, k_max)
# ---------------------------------------------------------------------------
print("-----------------------------------------------------------")
print(" Création clusters : agglomerative ")

linkage = "ward"
agglomerative_name = "agglomerative_" + linkage
k = []
durations = []
silouettes = []
daviess = []
calinskis = []
for n_clusters in range(2, k_max):
    # Fit the model
    (model, duration) = apply_agglomerative_clustering(
        data_scaled, k=n_clusters, linkage=linkage)
    # Score the resulting clustering
    (silouette, davies, calinski) = evaluate(data_scaled, model)
    # Record the values of this run
    k.append(n_clusters)
    durations.append(duration)
    silouettes.append(silouette)
    daviess.append(davies)
    calinskis.append(calinski)

# Same plot table as for k-means, minus inertia and iteration count; every
# plot here uses stop=False.
agglomerative_plots = [
    (k, {"y_name": "k"}),
    (durations, {"y_name": "temps_de_calcul", "y_unit": "ms"}),
    (silouettes, {"y_name": "coeficient_de_silhouette"}),
    (daviess, {"y_name": "coeficient_de_Davies"}),
    (calinskis, {"y_name": "coeficient_de_Calinski"}),
]
for values, labels in agglomerative_plots:
    print_1d_data(k, values, x_name="k", dataset_name=dataset_name,
                  method_name=agglomerative_name, stop=False, save=save,
                  **labels)
# ---------------------------------------------------------------------------
# DBSCAN: fixed eps, one run per min_pts in [1, min_sample_max)
# ---------------------------------------------------------------------------
min_sample_max = 30
print("-----------------------------------------------------------")
print(" Création clusters : DBSCAN")

params = [(eps, min_pts) for min_pts in range(1, min_sample_max)]
durations = []
silouettes = []
daviess = []
calinskis = []
clusters = []
noise_points = []
for (distance, min_pts) in params:
    # Fit DBSCAN for this (eps, min_pts) pair
    (model, duration) = apply_DBSCAN(data_scaled, distance, min_pts)
    cl_pred = model.labels_
    # Score the resulting clustering
    (silouette, davies, calinski) = evaluate(data_scaled, model)
    # Record the values of this run
    durations.append(duration)
    silouettes.append(silouette)
    daviess.append(davies)
    calinskis.append(calinski)
    # Label -1 is tallied as noise below, so it is not counted as a cluster.
    clusters.append(len(set(cl_pred)) - (1 if -1 in cl_pred else 0))
    noise_points.append(list(cl_pred).count(-1))

# The x axis shows each (eps, min_pts) pair rendered as text.
params = [str(pair) for pair in params]
dbscan_plots = [
    (durations, {"y_name": "temps_de_calcul", "y_unit": "ms"}),
    (silouettes, {"y_name": "coeficient_de_silhouette"}),
    (daviess, {"y_name": "coeficient_de_Davies"}),
    (calinskis, {"y_name": "coeficient_de_Calinski"}),
    (clusters, {"y_name": "nombre_de_clusters"}),
    (noise_points, {"y_name": "points_de_bruit"}),
]
for values, labels in dbscan_plots:
    print_1d_data(params, values, x_name="(eps,min_pts)",
                  dataset_name=dataset_name, method_name="DBSCAN",
                  stop=False, save=save, **labels)

# ---------------------------------------------------------------------------
# Mean-shift: one run per bandwidth in arange(0.1, k_max, 0.2)
# ---------------------------------------------------------------------------
print("-----------------------------------------------------------")
print(" Création clusters : mean-shift")

# NOTE(review): `k_max` and `k` hold bandwidth values in this section, not
# cluster counts. The names and the "k" axis label are kept so that the
# script's output and module-level names stay exactly as before.
k_max = 2
k = []
durations = []
silouettes = []
daviess = []
calinskis = []
for bandwidth in arange(0.1, k_max, 0.2):
    # Fit mean-shift with this bandwidth
    (model, duration) = apply_mean_shift(data_scaled, bandwidth=bandwidth)
    # Score the resulting clustering
    (silouette, davies, calinski) = evaluate(data_scaled, model)
    # Record the values of this run
    k.append(bandwidth)
    durations.append(duration)
    silouettes.append(silouette)
    daviess.append(davies)
    calinskis.append(calinski)

# The first entry plots the bandwidth list against itself, as in the original.
mean_shift_plots = [
    (k, {"y_name": "k"}),
    (durations, {"y_name": "temps_de_calcul", "y_unit": "ms"}),
    (silouettes, {"y_name": "coeficient_de_silhouette"}),
    (daviess, {"y_name": "coeficient_de_Davies"}),
    (calinskis, {"y_name": "coeficient_de_Calinski"}),
]
for values, labels in mean_shift_plots:
    print_1d_data(k, values, x_name="k", dataset_name=dataset_name,
                  method_name="mean-shift", stop=False, save=save, **labels)