From 262b4cef0c1489c565f2da7475dcae1d741ce10d Mon Sep 17 00:00:00 2001
From: Faure Paul
Date: Thu, 6 Jan 2022 17:01:07 +0100
Subject: [PATCH] Add DBSCAN

---
 mydatalib.py                                  | 33 ++++---
 tp1-read-plot-5iss.py                         | 20 +++-
 ...d-standardization-dendrogram-agglo-1val.py | 17 +++-
 tp3-dbscan.py                                 | 94 +++++++++++++++++++
 4 files changed, 142 insertions(+), 22 deletions(-)
 create mode 100644 tp3-dbscan.py

diff --git a/mydatalib.py b/mydatalib.py
index 0792e35..e4f4f5c 100644
--- a/mydatalib.py
+++ b/mydatalib.py
@@ -28,27 +28,34 @@ def scale_data(data):
 
 def apply_kmeans(data, k: int = 3, init="k-means++"):
     tps1 = time.time()
-    model_km = cluster.KMeans(n_clusters=k, init=init)
-    model_km.fit(data)
+    model = cluster.KMeans(n_clusters=k, init=init)
+    model.fit(data)
     tps2 = time.time()
-    return (model_km, round((tps2 - tps1)*1000, 2))
+    return (model, round((tps2 - tps1)*1000, 2))
 
 
 def apply_agglomerative_clustering(data, k: int = 3, linkage="complete"):
     tps1 = time.time()
-    model_agg = cluster.AgglomerativeClustering(
+    model = cluster.AgglomerativeClustering(
         n_clusters=k, affinity='euclidean', linkage=linkage)
-    model_agg.fit(data)
+    model.fit(data)
     tps2 = time.time()
-    return (model_agg, round((tps2 - tps1)*1000, 2))
+    return (model, round((tps2 - tps1)*1000, 2))
 
 
-def evaluate_kmeans(data, model_km):
-    silh = metrics.silhouette_score(data, model_km.labels_, metric='euclidean')
-    return (silh, model_km.inertia_, model_km.n_iter_)
+def apply_DBSCAN(data, eps, min_pts):
+    tps1 = time.time()
+    model = cluster.DBSCAN(eps=eps, min_samples=min_pts)
+    model.fit(data)
+    tps2 = time.time()
+    return (model, round((tps2 - tps1)*1000, 2))
 
 
-def evaluate_agglomerative_clustering(data, model_agg):
-    silh = metrics.silhouette_score(
-        data, model_agg.labels_, metric='euclidean')
-    return silh
+def evaluate(data, model):
+    try:
+        silh = metrics.silhouette_score(data, model.labels_)
+        davies = metrics.davies_bouldin_score(data, model.labels_)
+        calinski = metrics.calinski_harabasz_score(data, model.labels_)
+        return (silh, davies, calinski)
+    except ValueError:  # the metrics are undefined with fewer than 2 clusters
+        return (None, None, None)

diff --git a/tp1-read-plot-5iss.py b/tp1-read-plot-5iss.py
index 8dd25bf..3486af4 100644
--- a/tp1-read-plot-5iss.py
+++ b/tp1-read-plot-5iss.py
@@ -7,8 +7,8 @@ Created on Fri Nov 19 23:08:23 2021
 
 from myplotlib import print_1d_data, print_2d_data, print_3d_data
-from mydatalib import extract_data_2d, extract_data_3d, scale_data
-from mydatalib import apply_kmeans, evaluate_kmeans
+from mydatalib import (extract_data_2d, extract_data_3d, scale_data,
+                       apply_kmeans, evaluate)
 
 path = './artificial/'
@@ -34,6 +34,8 @@ print_2d_data(data_scaled, dataset_name=dataset_name +
 k = []
 durations = []
 silouettes = []
+daviess = []
+calinskis = []
 inerties = []
 iterations = []
 for i in range(2, 50):
@@ -44,13 +46,15 @@ for i in range(2, 50):
                   method_name="k-means", k=i, c=model.labels_,
                   stop=False, save=save)
     # Evaluate the clustering solution
-    (silouette, inertie, iteration) = evaluate_kmeans(data_scaled, model)
+    (silouette, davies, calinski) = evaluate(data_scaled, model)
     # Record the values
     k += [i]
     durations += [duration]
     silouettes += [silouette]
-    inerties += [inertie]
-    iterations += [iteration]
+    daviess += [davies]
+    calinskis += [calinski]
+    inerties += [model.inertia_]
+    iterations += [model.n_iter_]
 
 # Plot the results
 print_1d_data(k, k, x_name="k", y_name="k", dataset_name=dataset_name,
@@ -61,6 +65,12 @@ print_1d_data(k, durations, x_name="k", y_name="temps_de_calcul", y_unit="ms",
 print_1d_data(k, silouettes,
x_name="k", y_name="coeficient_de_silhouette", dataset_name=dataset_name, method_name="k-means", stop=False, save=save) +print_1d_data(k, daviess, x_name="k", y_name="coeficient_de_Davies", + dataset_name=dataset_name, method_name="k-means", + stop=False, save=save) +print_1d_data(k, calinskis, x_name="k", y_name="coeficient_de_Calinski", + dataset_name=dataset_name, method_name="k-means", + stop=False, save=save) print_1d_data(k, inerties, x_name="k", y_name="inertie", dataset_name=dataset_name, method_name="k-means", stop=False, save=save) diff --git a/tp2-read-standardization-dendrogram-agglo-1val.py b/tp2-read-standardization-dendrogram-agglo-1val.py index ae047d9..3b0591d 100644 --- a/tp2-read-standardization-dendrogram-agglo-1val.py +++ b/tp2-read-standardization-dendrogram-agglo-1val.py @@ -6,9 +6,8 @@ Created on Sat Nov 20 21:28:40 2021 """ from myplotlib import print_1d_data, print_2d_data, print_dendrogramme -from mydatalib import extract_data_2d, scale_data -from mydatalib import apply_agglomerative_clustering -from mydatalib import evaluate_agglomerative_clustering +from mydatalib import (extract_data_2d, scale_data, + apply_agglomerative_clustering, evaluate) ################################################################## @@ -61,6 +60,8 @@ print(" Création clusters : linkage " + k = [] durations = [] silouettes = [] +daviess = [] +calinskis = [] for i in range(2, k_max): # Application du clustering agglomeratif (model, duration) = apply_agglomerative_clustering( @@ -70,11 +71,13 @@ for i in range(2, k_max): method_name="agglomerative_" + linkage, k=i, stop=False, save=save, c=model.labels_) # Evaluation de la solution de clustering - silouette = evaluate_agglomerative_clustering(data_scaled, model) + (silouette, davies, calinski) = evaluate(data_scaled, model) # Enregistrement des valeurs k += [i] durations += [duration] silouettes += [silouette] + daviess += [davies] + calinskis += [calinski] # Affichage des résultats print_1d_data(k, k, x_name="k", y_name="k", dataset_name=dataset_name, @@ -85,3 +88,9 @@ print_1d_data(k, durations, x_name="k", y_name="temps_de_calcul", y_unit="ms", print_1d_data(k, silouettes, x_name="k", y_name="coeficient_de_silhouette", dataset_name=dataset_name, method_name="agglomerative_" + linkage, stop=False, save=save) +print_1d_data(k, daviess, x_name="k", y_name="coeficient_de_Davies", + dataset_name=dataset_name, + method_name="agglomerative_" + linkage, stop=False, save=save) +print_1d_data(k, calinskis, x_name="k", y_name="coeficient_de_Calinski", + dataset_name=dataset_name, + method_name="agglomerative_" + linkage, stop=False, save=save) diff --git a/tp3-dbscan.py b/tp3-dbscan.py new file mode 100644 index 0000000..2d1b750 --- /dev/null +++ b/tp3-dbscan.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Dec 8 16:07:28 2021 + +@author: pfaure +""" + +from sklearn.neighbors import NearestNeighbors +import numpy as np + +from myplotlib import print_1d_data, print_2d_data +from mydatalib import extract_data_2d, scale_data, apply_DBSCAN, evaluate + +path = './artificial/' +dataset_name = "banana" +save = True + +print("-----------------------------------------------------------") +print(" Chargement du dataset : " + dataset_name) +data = extract_data_2d(path + dataset_name) +print_2d_data(data, dataset_name=dataset_name + + "_brutes", stop=False, save=save) + +print("-----------------------------------------------------------") +print(" Mise à l'échelle") +data_scaled = scale_data(data) 
+print_2d_data(data_scaled, dataset_name=dataset_name +
+              "_scaled", stop=False, save=save)
+
+print("-----------------------------------------------------------")
+print(" Calcul du voisinage")
+n = 50
+neighbors = NearestNeighbors(n_neighbors=n)
+neighbors.fit(data_scaled)
+distances, indices = neighbors.kneighbors(data_scaled)
+# Mean distance to the n-1 nearest neighbours (index 0 is the point itself)
+distances = list(map(lambda x: sum(x[1:]) / (len(x) - 1), distances))
+distances = np.sort(distances, axis=0)
+print_1d_data(distances, range(1, len(distances)+1),
+              x_name="distance_moyenne", y_name="nombre_de_points",
+              stop=False, save=False)
+
+
+print("-----------------------------------------------------------")
+print(" Création clusters : DBSCAN")
+params = []
+for i in range(1, 20):
+    params += [(i/100, 5)]
+durations = []
+silouettes = []
+daviess = []
+calinskis = []
+clusters = []
+noise_points = []
+for (distance, min_pts) in params:
+    # Apply DBSCAN
+    (model, duration) = apply_DBSCAN(data_scaled, distance, min_pts)
+    cl_pred = model.labels_
+    # Plot the clusters
+    print_2d_data(data_scaled, dataset_name=dataset_name,
+                  method_name="DBSCAN-Eps=" +
+                  str(distance)+"-Minpt="+str(min_pts),
+                  k=0, stop=False, save=save, c=cl_pred)
+    # Evaluate the clustering solution
+    (silouette, davies, calinski) = evaluate(data_scaled, model)
+    # Record the values
+    durations += [duration]
+    silouettes += [silouette]
+    daviess += [davies]
+    calinskis += [calinski]
+    clusters += [len(set(cl_pred)) - (1 if -1 in cl_pred else 0)]
+    noise_points += [list(cl_pred).count(-1)]
+
+# Plot the results
+params = [str(i) for i in params]
+print_1d_data(params, durations, x_name="(eps,min_pts)",
+              y_name="temps_de_calcul", y_unit="ms", dataset_name=dataset_name,
+              method_name="DBSCAN", stop=False, save=save)
+print_1d_data(params, silouettes, x_name="(eps,min_pts)",
+              y_name="coeficient_de_silhouette", dataset_name=dataset_name,
+              method_name="DBSCAN", stop=False, save=save)
+print_1d_data(params, daviess, x_name="(eps,min_pts)",
+              y_name="coeficient_de_Davies", dataset_name=dataset_name,
+              method_name="DBSCAN", stop=False, save=save)
+print_1d_data(params, calinskis, x_name="(eps,min_pts)",
+              y_name="coeficient_de_Calinski", dataset_name=dataset_name,
+              method_name="DBSCAN", stop=False, save=save)
+print_1d_data(params, clusters, x_name="(eps,min_pts)",
+              y_name="nombre_de_clusters", dataset_name=dataset_name,
+              method_name="DBSCAN", stop=False, save=save)
+print_1d_data(params, noise_points, x_name="(eps,min_pts)",
+              y_name="points_de_bruit", dataset_name=dataset_name,
+              method_name="DBSCAN", stop=False, save=save)
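
Note (not part of the patch): tp3-dbscan.py estimates candidate eps values by
averaging each point's distance to its 50 nearest neighbours. The more common
k-distance heuristic plots only the distance to the k-th neighbour, with
k equal to the min_samples you intend to give DBSCAN (the original DBSCAN
paper suggests minPts = 4 for 2-dimensional data), and reads eps off the
elbow of the sorted curve. A minimal, self-contained sketch follows; the
make_blobs dataset and all parameter values are illustrative assumptions,
not taken from the patch.

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.datasets import make_blobs
    from sklearn.neighbors import NearestNeighbors
    from sklearn.preprocessing import StandardScaler

    # Illustrative data; the patch loads real datasets via extract_data_2d
    data, _ = make_blobs(n_samples=500, centers=3, random_state=0)
    data = StandardScaler().fit_transform(data)

    k = 5  # match the min_samples you plan to pass to DBSCAN
    # Ask for k+1 neighbours: column 0 is each point's zero distance to itself
    dist, _ = NearestNeighbors(n_neighbors=k + 1).fit(data).kneighbors(data)
    k_dist = np.sort(dist[:, -1])  # distance to the k-th true neighbour

    plt.plot(k_dist)
    plt.xlabel("points, sorted by k-distance")
    plt.ylabel("distance to k-th nearest neighbour")
    plt.title("k-distance plot: choose eps near the elbow")
    plt.savefig("k_distance.png")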
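
Usage sketch for the helpers this patch adds to mydatalib.py (assumes the
patched module is importable; the dataset path and eps values are
illustrative):

    from mydatalib import extract_data_2d, scale_data, apply_DBSCAN, evaluate

    data_scaled = scale_data(extract_data_2d('./artificial/banana'))
    for eps in (0.05, 0.10, 0.15):
        model, ms = apply_DBSCAN(data_scaled, eps, min_pts=5)
        labels = model.labels_
        # DBSCAN marks noise as -1; do not count it as a cluster
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise = list(labels).count(-1)
        # evaluate() returns (None, None, None) if fewer than 2 clusters exist
        silh, davies, calinski = evaluate(data_scaled, model)
        print("eps=%.2f: %d clusters, %d noise points, silhouette=%s, %s ms"
              % (eps, n_clusters, n_noise, silh, ms))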