Ajout DBSCAN
This commit is contained in:
parent
e213447913
commit
262b4cef0c
4 changed files with 142 additions and 22 deletions
33
mydatalib.py
33
mydatalib.py
|
@ -28,27 +28,34 @@ def scale_data(data):
|
|||
|
||||
def apply_kmeans(data, k: int = 3, init="k-means++"):
|
||||
tps1 = time.time()
|
||||
model_km = cluster.KMeans(n_clusters=k, init=init)
|
||||
model_km.fit(data)
|
||||
model = cluster.KMeans(n_clusters=k, init=init)
|
||||
model.fit(data)
|
||||
tps2 = time.time()
|
||||
return (model_km, round((tps2 - tps1)*1000, 2))
|
||||
return (model, round((tps2 - tps1)*1000, 2))
|
||||
|
||||
|
||||
def apply_agglomerative_clustering(data, k: int = 3, linkage="complete"):
|
||||
tps1 = time.time()
|
||||
model_agg = cluster.AgglomerativeClustering(
|
||||
model = cluster.AgglomerativeClustering(
|
||||
n_clusters=k, affinity='euclidean', linkage=linkage)
|
||||
model_agg.fit(data)
|
||||
model.fit(data)
|
||||
tps2 = time.time()
|
||||
return (model_agg, round((tps2 - tps1)*1000, 2))
|
||||
return (model, round((tps2 - tps1)*1000, 2))
|
||||
|
||||
|
||||
def evaluate_kmeans(data, model_km):
    """Score a fitted k-means model.

    Returns a tuple ``(silhouette, inertia, n_iter)``: the silhouette score
    of ``model_km.labels_`` on ``data`` (Euclidean metric), followed by the
    model's ``inertia_`` and ``n_iter_`` attributes.
    """
    labels = model_km.labels_
    silhouette = metrics.silhouette_score(data, labels, metric='euclidean')
    return (silhouette, model_km.inertia_, model_km.n_iter_)
|
||||
def apply_DBSCAN(data, eps, min_pts):
    """Fit a DBSCAN model on ``data`` and time the operation.

    ``eps`` and ``min_pts`` are forwarded to ``cluster.DBSCAN`` as ``eps``
    and ``min_samples``.

    Returns a tuple ``(model, elapsed_ms)`` where ``elapsed_ms`` is the time
    spent building and fitting the model, in milliseconds, rounded to two
    decimals.
    """
    start = time.time()
    dbscan = cluster.DBSCAN(eps=eps, min_samples=min_pts)
    dbscan.fit(data)
    elapsed_ms = (time.time() - start) * 1000
    return (dbscan, round(elapsed_ms, 2))
|
||||
|
||||
|
||||
def evaluate_agglomerative_clustering(data, model_agg):
    """Return the Euclidean silhouette score of ``model_agg.labels_`` on ``data``."""
    labels = model_agg.labels_
    return metrics.silhouette_score(data, labels, metric='euclidean')
|
||||
def evaluate(data, model):
    """Compute three clustering quality scores for a fitted model.

    The scores are computed from ``model.labels_`` on ``data`` and returned
    as ``(silhouette, davies_bouldin, calinski_harabasz)``.

    If any of the metric functions raises ``ValueError``, the tuple
    ``(None, None, None)`` is returned instead.
    """
    labels = model.labels_
    try:
        silhouette = metrics.silhouette_score(data, labels)
        davies_bouldin = metrics.davies_bouldin_score(data, labels)
        calinski_harabasz = metrics.calinski_harabasz_score(data, labels)
    except ValueError:
        return (None, None, None)
    return (silhouette, davies_bouldin, calinski_harabasz)
|
||||
|
|
|
@ -7,8 +7,8 @@ Created on Fri Nov 19 23:08:23 2021
|
|||
|
||||
|
||||
from myplotlib import print_1d_data, print_2d_data, print_3d_data
|
||||
from mydatalib import extract_data_2d, extract_data_3d, scale_data
|
||||
from mydatalib import apply_kmeans, evaluate_kmeans
|
||||
from mydatalib import (extract_data_2d, extract_data_3d, scale_data,
|
||||
apply_kmeans, evaluate)
|
||||
|
||||
|
||||
path = './artificial/'
|
||||
|
@ -34,6 +34,8 @@ print_2d_data(data_scaled, dataset_name=dataset_name +
|
|||
k = []
|
||||
durations = []
|
||||
silouettes = []
|
||||
daviess = []
|
||||
calinskis = []
|
||||
inerties = []
|
||||
iterations = []
|
||||
for i in range(2, 50):
|
||||
|
@ -44,13 +46,15 @@ for i in range(2, 50):
|
|||
method_name="k-means", k=i, c=model.labels_,
|
||||
stop=False, save=save)
|
||||
# Evaluation de la solution de clustering
|
||||
(silouette, inertie, iteration) = evaluate_kmeans(data_scaled, model)
|
||||
(silouette, davies, calinski) = evaluate(data_scaled, model)
|
||||
# Enregistrement des valeurs
|
||||
k += [i]
|
||||
durations += [duration]
|
||||
silouettes += [silouette]
|
||||
inerties += [inertie]
|
||||
iterations += [iteration]
|
||||
daviess += [davies]
|
||||
calinskis += [calinski]
|
||||
inerties += [model.inertia_]
|
||||
iterations += [model.n_iter_]
|
||||
|
||||
# Affichage des résultats
|
||||
print_1d_data(k, k, x_name="k", y_name="k", dataset_name=dataset_name,
|
||||
|
@ -61,6 +65,12 @@ print_1d_data(k, durations, x_name="k", y_name="temps_de_calcul", y_unit="ms",
|
|||
print_1d_data(k, silouettes, x_name="k", y_name="coeficient_de_silhouette",
|
||||
dataset_name=dataset_name, method_name="k-means",
|
||||
stop=False, save=save)
|
||||
print_1d_data(k, daviess, x_name="k", y_name="coeficient_de_Davies",
|
||||
dataset_name=dataset_name, method_name="k-means",
|
||||
stop=False, save=save)
|
||||
print_1d_data(k, calinskis, x_name="k", y_name="coeficient_de_Calinski",
|
||||
dataset_name=dataset_name, method_name="k-means",
|
||||
stop=False, save=save)
|
||||
print_1d_data(k, inerties, x_name="k", y_name="inertie",
|
||||
dataset_name=dataset_name, method_name="k-means",
|
||||
stop=False, save=save)
|
||||
|
|
|
@ -6,9 +6,8 @@ Created on Sat Nov 20 21:28:40 2021
|
|||
"""
|
||||
|
||||
from myplotlib import print_1d_data, print_2d_data, print_dendrogramme
|
||||
from mydatalib import extract_data_2d, scale_data
|
||||
from mydatalib import apply_agglomerative_clustering
|
||||
from mydatalib import evaluate_agglomerative_clustering
|
||||
from mydatalib import (extract_data_2d, scale_data,
|
||||
apply_agglomerative_clustering, evaluate)
|
||||
|
||||
|
||||
##################################################################
|
||||
|
@ -61,6 +60,8 @@ print(" Création clusters : linkage " +
|
|||
k = []
|
||||
durations = []
|
||||
silouettes = []
|
||||
daviess = []
|
||||
calinskis = []
|
||||
for i in range(2, k_max):
|
||||
# Application du clustering agglomeratif
|
||||
(model, duration) = apply_agglomerative_clustering(
|
||||
|
@ -70,11 +71,13 @@ for i in range(2, k_max):
|
|||
method_name="agglomerative_" + linkage, k=i,
|
||||
stop=False, save=save, c=model.labels_)
|
||||
# Evaluation de la solution de clustering
|
||||
silouette = evaluate_agglomerative_clustering(data_scaled, model)
|
||||
(silouette, davies, calinski) = evaluate(data_scaled, model)
|
||||
# Enregistrement des valeurs
|
||||
k += [i]
|
||||
durations += [duration]
|
||||
silouettes += [silouette]
|
||||
daviess += [davies]
|
||||
calinskis += [calinski]
|
||||
|
||||
# Affichage des résultats
|
||||
print_1d_data(k, k, x_name="k", y_name="k", dataset_name=dataset_name,
|
||||
|
@ -85,3 +88,9 @@ print_1d_data(k, durations, x_name="k", y_name="temps_de_calcul", y_unit="ms",
|
|||
print_1d_data(k, silouettes, x_name="k", y_name="coeficient_de_silhouette",
|
||||
dataset_name=dataset_name,
|
||||
method_name="agglomerative_" + linkage, stop=False, save=save)
|
||||
print_1d_data(k, daviess, x_name="k", y_name="coeficient_de_Davies",
|
||||
dataset_name=dataset_name,
|
||||
method_name="agglomerative_" + linkage, stop=False, save=save)
|
||||
print_1d_data(k, calinskis, x_name="k", y_name="coeficient_de_Calinski",
|
||||
dataset_name=dataset_name,
|
||||
method_name="agglomerative_" + linkage, stop=False, save=save)
|
||||
|
|
94
tp3-dbscan.py
Normal file
94
tp3-dbscan.py
Normal file
|
@ -0,0 +1,94 @@
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 8 16:07:28 2021

@author: pfaure

DBSCAN experiment on a 2-D dataset.

Steps: load the dataset, scale it, plot the sorted mean distance of every
point to its nearest neighbours (to help choose eps), then run DBSCAN for
a range of eps values and plot, for each run, the computation time, three
quality scores, the number of clusters and the number of noise points.
"""

from sklearn.neighbors import NearestNeighbors
import numpy as np

from myplotlib import print_1d_data, print_2d_data
from mydatalib import extract_data_2d, scale_data, apply_DBSCAN, evaluate

path = './artificial/'
dataset_name = "banana"
save = True

print("-----------------------------------------------------------")
print(" Chargement du dataset : " + dataset_name)
data = extract_data_2d(path + dataset_name)
print_2d_data(data, dataset_name=dataset_name +
              "_brutes", stop=False, save=save)

print("-----------------------------------------------------------")
print(" Mise à l'échelle")
data_scaled = scale_data(data)
print_2d_data(data_scaled, dataset_name=dataset_name +
              "_scaled", stop=False, save=save)

print("-----------------------------------------------------------")
print(" Calcul du voisinage")
n = 50
# Work on the scaled data so that the distances read on this curve are in
# the same space as the one DBSCAN is run in, plotted in and evaluated in.
neighbors = NearestNeighbors(n_neighbors=n)
neighbors.fit(data_scaled)
distances, _ = neighbors.kneighbors(data_scaled)
# Column 0 is skipped, as in the original slice (it starts at index 1):
# when querying with the fitted points, it is each point's distance to
# itself. The mean is taken over all n-1 remaining neighbours; the previous
# expression summed only n-2 of them while dividing by n-1.
distances = list(distances[:, 1:].mean(axis=1))
print(distances)
distances = np.sort(distances, axis=0)
print(distances)
print_1d_data(distances, range(1, len(distances)+1), x_name="distance_moyenne",
              y_name="nombre_de_points", stop=False, save=False)

print("-----------------------------------------------------------")
print(" Création clusters : DBSCAN")
# (eps, min_pts) pairs to try: eps from 0.01 to 0.19, min_pts fixed at 5.
params = [(i/100, 5) for i in range(1, 20)]
durations = []
silouettes = []
daviess = []
calinskis = []
clusters = []
noise_points = []
for (distance, min_pts) in params:
    # Run DBSCAN on the scaled data, i.e. the same data that is plotted and
    # evaluated below.
    (model, duration) = apply_DBSCAN(data_scaled, distance, min_pts)
    cl_pred = model.labels_
    # Plot the clusters
    print_2d_data(data_scaled, dataset_name=dataset_name,
                  method_name="DBSCAN-Eps=" +
                  str(distance)+"-Minpt="+str(min_pts),
                  k=0, stop=False, save=save, c=cl_pred)
    # Evaluate the clustering
    # NOTE(review): noise points keep the label -1 and are passed to
    # evaluate() as-is; presumably the metrics count them as one more
    # cluster - confirm this is intended.
    (silouette, davies, calinski) = evaluate(data_scaled, model)
    # Record the values
    durations.append(duration)
    silouettes.append(silouette)
    daviess.append(davies)
    calinskis.append(calinski)
    # The noise label (-1) is not a cluster, so it is not counted as one.
    clusters.append(len(set(cl_pred)) - (1 if -1 in cl_pred else 0))
    noise_points.append(list(cl_pred).count(-1))

# Plot the results, using the string form of each (eps, min_pts) pair as
# the x value.
param_labels = [str(p) for p in params]
print_1d_data(param_labels, durations, x_name="(eps,min_pts)",
              y_name="temps_de_calcul", y_unit="ms", dataset_name=dataset_name,
              method_name="DBSCAN", stop=False, save=save)
print_1d_data(param_labels, silouettes, x_name="(eps,min_pts)",
              y_name="coeficient_de_silhouette", dataset_name=dataset_name,
              method_name="DBSCAN", stop=False, save=save)
print_1d_data(param_labels, daviess, x_name="(eps,min_pts)",
              y_name="coeficient_de_Davies", dataset_name=dataset_name,
              method_name="DBSCAN", stop=False, save=save)
print_1d_data(param_labels, calinskis, x_name="(eps,min_pts)",
              y_name="coeficient_de_Calinski", dataset_name=dataset_name,
              method_name="DBSCAN", stop=False, save=save)
print_1d_data(param_labels, clusters, x_name="(eps,min_pts)",
              y_name="nombre_de_clusters", dataset_name=dataset_name,
              method_name="DBSCAN", stop=False, save=save)
print_1d_data(param_labels, noise_points, x_name="(eps,min_pts)",
              y_name="points_de_bruit", dataset_name=dataset_name,
              method_name="DBSCAN", stop=False, save=save)
|
Loading…
Reference in a new issue