Use mean shift instead of affinity propagation

This commit is contained in:
Arnaud Vergnet 2022-01-08 21:07:47 +01:00
parent 2fa6908d64
commit a41d8033c5
4 changed files with 99 additions and 110 deletions

View file

@ -31,7 +31,7 @@ def apply_kmeans(data, k: int = 3, init="k-means++"):
model = cluster.KMeans(n_clusters=k, init=init)
model.fit(data)
tps2 = time.time()
return (model, round((tps2 - tps1)*1000, 2))
return model, round((tps2 - tps1) * 1000, 2)
def apply_agglomerative_clustering(data, k: int = 3, linkage="complete"):
@ -40,7 +40,7 @@ def apply_agglomerative_clustering(data, k: int = 3, linkage="complete"):
n_clusters=k, affinity='euclidean', linkage=linkage)
model.fit(data)
tps2 = time.time()
return (model, round((tps2 - tps1)*1000, 2))
return model, round((tps2 - tps1) * 1000, 2)
def apply_DBSCAN(data, eps, min_pts):
@ -48,7 +48,15 @@ def apply_DBSCAN(data, eps, min_pts):
model = cluster.DBSCAN(eps=eps, min_samples=min_pts)
model.fit(data)
tps2 = time.time()
return (model, round((tps2 - tps1)*1000, 2))
return model, round((tps2 - tps1) * 1000, 2)
def apply_mean_shift(data, bandwidth: float):
    """Fit a mean shift model on ``data`` and measure how long it takes.

    Args:
        data: samples handed unchanged to the model's ``fit`` method.
        bandwidth: value forwarded as the ``bandwidth`` argument of
            ``cluster.MeanShift``.

    Returns:
        A ``(model, duration)`` tuple: the fitted model and the wall-clock
        time spent building and fitting it, in milliseconds rounded to two
        decimals.
    """
    started_at = time.time()
    mean_shift = cluster.MeanShift(bandwidth=bandwidth)
    mean_shift.fit(data)
    finished_at = time.time()
    elapsed_ms = (finished_at - started_at) * 1000
    return mean_shift, round(elapsed_ms, 2)
def evaluate(data, model):
@ -56,6 +64,6 @@ def evaluate(data, model):
silh = metrics.silhouette_score(data, model.labels_)
davies = metrics.davies_bouldin_score(data, model.labels_)
calinski = metrics.calinski_harabasz_score(data, model.labels_)
return (silh, davies, calinski)
return silh, davies, calinski
except ValueError:
return (None, None, None)
return None, None, None

View file

@ -22,7 +22,7 @@ def print_3d_data(data,
f2 = data[:, 2] # tous les éléments de la troisième colonne
fig = plt.figure()
ax = fig.gca(projection='3d') # Affichage en 3D
if (c is None):
if c is None:
ax.scatter(f0, f1, f2, label='Courbe',
marker='d')
plt.title("Données initiales : " + dataset_name)
@ -35,8 +35,8 @@ def print_3d_data(data,
ax.set_ylabel('Y')
ax.set_zlabel('Z')
plt.tight_layout()
if (save):
if (c is None):
if save:
if c is None:
save_path = "IMG/DATA_VISUALISATION/"
if not os.path.exists(save_path):
os.makedirs(save_path)
@ -54,24 +54,26 @@ def print_3d_data(data,
def print_2d_data(data,
dataset_name: str = "",
method_name: str = "",
k: int = 0,
k = 0,
stop: bool = True,
save: bool = False,
c=None):
f0 = data[:, 0] # tous les élements de la première colonne
f1 = data[:, 1] # tous les éléments de la deuxième colonne
plt.figure()
# utilisation d'une décimale si float
k_str = str(round(k, 1)) if isinstance(k, float) else str(k)
# plt.hist2d(f0, f1)
if (c is None):
if c is None:
plt.scatter(f0, f1, s=8)
plt.title("Données initiales : " + dataset_name)
else:
plt.scatter(f0, f1, c=c, s=8)
plt.title("Graphique de " + str(k) + " clusters avec la méthode " +
plt.title("Graphique de " + k_str + " clusters avec la méthode " +
method_name + " sur le jeu de données " + dataset_name)
if (save):
if (c is None):
if save:
if c is None:
save_path = "IMG/DATA_VISUALISATION/"
if not os.path.exists(save_path):
os.makedirs(save_path)
@ -80,7 +82,7 @@ def print_2d_data(data,
save_path = "IMG/" + method_name + "/" + dataset_name + "/CLUSTERS/"
if not os.path.exists(save_path):
os.makedirs(save_path)
plt.savefig(save_path + "k=" + str(k) + ".png")
plt.savefig(save_path + "k=" + k_str + ".png")
plt.close()
else:
plt.show(block=stop)
@ -101,7 +103,7 @@ def print_1d_data(x, y,
method_name + " sur les données " + dataset_name)
plt.xlabel(x_name + " (" + x_unit + ")")
plt.ylabel(y_name + " (" + y_unit + ")")
if (save):
if save:
save_path = "IMG/" + method_name + "/" + dataset_name + "/EVALUATION/"
if not os.path.exists(save_path):
os.makedirs(save_path)
@ -126,7 +128,7 @@ def print_dendrogramme(data,
show_leaf_counts=False)
plt.title("Dendrogramme du jeu de données " +
dataset_name + " avec le linkage " + linkage)
if (save):
if save:
save_path = "IMG/DENDROGRAMME/" + linkage + "/"
if not os.path.exists(save_path):
os.makedirs(save_path)

View file

@ -1,94 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 8 16:07:28 2021
@author: pfaure
"""
from sklearn.neighbors import NearestNeighbors
import numpy as np
from myplotlib import print_1d_data, print_2d_data
from mydatalib import extract_data_2d, scale_data, apply_DBSCAN, evaluate
# DBSCAN experiment: load a 2D dataset, scale it, draw the mean-neighbour
# distance curve used to pick eps, then run DBSCAN over a range of eps values
# and plot timing / quality metrics for each run.
path = './artificial/'
dataset_name = "banana"
save = True

print("-----------------------------------------------------------")
print(" Chargement du dataset : " + dataset_name)
data = extract_data_2d(path + dataset_name)
print_2d_data(data, dataset_name=dataset_name +
              "_brutes", stop=False, save=save)

print("-----------------------------------------------------------")
print(" Mise à l'échelle")
data_scaled = scale_data(data)
print_2d_data(data_scaled, dataset_name=dataset_name +
              "_scaled", stop=False, save=save)

print("-----------------------------------------------------------")
print(" Calcul du voisinage")
n = 50
# The neighbourhood is computed on the scaled data so that the distances read
# off this curve live in the same space as the eps values given to DBSCAN.
neighbors = NearestNeighbors(n_neighbors=n)
neighbors.fit(data_scaled)
distances, _ = neighbors.kneighbors(data_scaled)
# kneighbors is queried with the fitted points, so column 0 is each point's
# zero distance to itself: average over the n - 1 remaining neighbours.
# (The previous slice x[1:n-1] also dropped the farthest neighbour while
# still dividing by n - 1, which underestimated the mean.)
distances = distances[:, 1:].mean(axis=1)
print(distances)
distances = np.sort(distances, axis=0)
print(distances)
print_1d_data(distances, range(1, len(distances)+1), x_name="distance_moyenne",
              y_name="nombre_de_points", stop=False, save=False)

print("-----------------------------------------------------------")
print(" Création clusters : DBSCAN")
# (eps, min_pts) pairs to try: eps from 0.01 to 0.19, min_pts fixed to 5.
params = [(i/100, 5) for i in range(1, 20)]
durations = []
silouettes = []
daviess = []
calinskis = []
clusters = []
noise_points = []
for (distance, min_pts) in params:
    # Run DBSCAN on the same scaled data that is plotted and evaluated below,
    # so labels, figures and metrics all refer to the same points.
    (model, duration) = apply_DBSCAN(data_scaled, distance, min_pts)
    cl_pred = model.labels_
    # Plot the clusters
    print_2d_data(data_scaled, dataset_name=dataset_name,
                  method_name="DBSCAN-Eps=" +
                  str(distance)+"-Minpt="+str(min_pts),
                  k=0, stop=False, save=save, c=cl_pred)
    # Evaluate the clustering solution
    (silouette, davies, calinski) = evaluate(data_scaled, model)
    # Record the values
    durations.append(duration)
    silouettes.append(silouette)
    daviess.append(davies)
    calinskis.append(calinski)
    # Label -1 marks noise points; it is not counted as a cluster.
    clusters.append(len(set(cl_pred)) - (1 if -1 in cl_pred else 0))
    noise_points.append(list(cl_pred).count(-1))

# Plot the results; parameter pairs are turned into strings to label the axis
params = [str(i) for i in params]
print_1d_data(params, durations, x_name="(eps,min_pts)",
              y_name="temps_de_calcul", y_unit="ms", dataset_name=dataset_name,
              method_name="DBSCAN", stop=False, save=save)
print_1d_data(params, silouettes, x_name="(eps,min_pts)",
              y_name="coeficient_de_silhouette", dataset_name=dataset_name,
              method_name="DBSCAN", stop=False, save=save)
print_1d_data(params, daviess, x_name="(eps,min_pts)",
              y_name="coeficient_de_Davies", dataset_name=dataset_name,
              method_name="DBSCAN", stop=False, save=save)
print_1d_data(params, calinskis, x_name="(eps,min_pts)",
              y_name="coeficient_de_Calinski", dataset_name=dataset_name,
              method_name="DBSCAN", stop=False, save=save)
print_1d_data(params, clusters, x_name="(eps,min_pts)",
              y_name="nombre_de_clusters", dataset_name=dataset_name,
              method_name="DBSCAN", stop=False, save=save)
print_1d_data(params, noise_points, x_name="(eps,min_pts)",
              y_name="points_de_bruit", dataset_name=dataset_name,
              method_name="DBSCAN", stop=False, save=save)

73
tp4-mean-shift.py Normal file
View file

@ -0,0 +1,73 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 8 16:07:28 2021
@author: pfaure
"""
from numpy import arange
from sklearn.neighbors import NearestNeighbors
import numpy as np
from myplotlib import print_1d_data, print_2d_data
from mydatalib import extract_data_2d, scale_data, apply_mean_shift, evaluate
# Mean shift experiment: load a 2D dataset, scale it, then run mean shift over
# a range of bandwidths and plot timing / quality metrics for each run.
path = './artificial/'
dataset_name = "xclara"
method_name = "mean-shift"
save = True

print("-----------------------------------------------------------")
print(" Chargement du dataset : " + dataset_name)
data = extract_data_2d(path + dataset_name)
print_2d_data(data, dataset_name=dataset_name +
              "_brutes", stop=False, save=save)

print("-----------------------------------------------------------")
print(" Mise à l'échelle")
data_scaled = scale_data(data)
print_2d_data(data_scaled, dataset_name=dataset_name +
              "_scaled", stop=False, save=save)

# Run mean shift for several bandwidth values and evaluate each solution.
# NOTE: `k` holds the bandwidths tried (the name and the "k" axis label are
# kept so existing output names do not change); `clusters` holds the number
# of clusters each bandwidth produced.
k_max = 2
k = []
clusters = []
durations = []
silouettes = []
daviess = []
calinskis = []
for bandwidth in arange(0.1, k_max, 0.1):
    # Run the clustering
    (model, duration) = apply_mean_shift(
        data_scaled, bandwidth=bandwidth)
    # Plot the clusters
    print_2d_data(data_scaled, dataset_name=dataset_name,
                  method_name=method_name, k=bandwidth,
                  stop=False, save=save, c=model.labels_)
    # Evaluate the clustering solution
    (silouette, davies, calinski) = evaluate(data_scaled, model)
    # Record the values
    k.append(bandwidth)
    clusters.append(len(set(model.labels_)))
    durations.append(duration)
    silouettes.append(silouette)
    daviess.append(davies)
    calinskis.append(calinski)

# Plot the results.
# The first curve used to plot the bandwidth list against itself, which
# carries no information; it now shows how many clusters each bandwidth gives.
print_1d_data(k, clusters, x_name="k", y_name="nombre_de_clusters",
              dataset_name=dataset_name,
              method_name=method_name, stop=False, save=save)
print_1d_data(k, durations, x_name="k", y_name="temps_de_calcul", y_unit="ms",
              dataset_name=dataset_name,
              method_name=method_name, stop=False, save=save)
print_1d_data(k, silouettes, x_name="k", y_name="coeficient_de_silhouette",
              dataset_name=dataset_name,
              method_name=method_name, stop=False, save=save)
print_1d_data(k, daviess, x_name="k", y_name="coeficient_de_Davies",
              dataset_name=dataset_name,
              method_name=method_name, stop=False, save=save)
print_1d_data(k, calinskis, x_name="k", y_name="coeficient_de_Calinski",
              dataset_name=dataset_name,
              method_name=method_name, stop=False, save=save)