Amélioration des librairies, fin de la partie clustering agglomératif

This commit is contained in:
Paul Faure 2021-12-03 18:22:15 +01:00
parent d29db6660c
commit 60992b033c
5 changed files with 158 additions and 62 deletions

1
.gitignore vendored
View file

@ -1 +1,2 @@
__pycache__/ __pycache__/
IMG/

View file

@ -12,12 +12,12 @@ from sklearn import cluster, metrics, preprocessing
def extract_data_2d(data_path): def extract_data_2d(data_path):
databrut = arff.loadarff(open(data_path, 'r')) databrut = arff.loadarff(open(data_path + ".arff", 'r'))
return np.array([[x[0], x[1]] for x in databrut[0]]) return np.array([[x[0], x[1]] for x in databrut[0]])
def extract_data_3d(data_path): def extract_data_3d(data_path):
databrut = arff.loadarff(open(data_path, 'r')) databrut = arff.loadarff(open(data_path + ".arff", 'r'))
return np.array([[x[0], x[1], x[2]] for x in databrut[0]]) return np.array([[x[0], x[1], x[2]] for x in databrut[0]])
@ -34,6 +34,21 @@ def apply_kmeans(data, k: int = 3, init="k-means++"):
return (model_km, round((tps2 - tps1)*1000, 2)) return (model_km, round((tps2 - tps1)*1000, 2))
def evaluate(data, model_km): def apply_agglomerative_clustering(data, k: int = 3, linkage="complete"):
tps1 = time.time()
model_agg = cluster.AgglomerativeClustering(
n_clusters=k, affinity='euclidean', linkage=linkage)
model_agg.fit(data)
tps2 = time.time()
return (model_agg, round((tps2 - tps1)*1000, 2))
def evaluate_kmeans(data, model_km):
silh = metrics.silhouette_score(data, model_km.labels_, metric='euclidean') silh = metrics.silhouette_score(data, model_km.labels_, metric='euclidean')
return (silh, model_km.inertia_, model_km.n_iter_) return (silh, model_km.inertia_, model_km.n_iter_)
def evaluate_agglomerative_clustering(data, model_agg):
silh = metrics.silhouette_score(
data, model_agg.labels_, metric='euclidean')
return silh

View file

@ -5,7 +5,7 @@ Created on Fri Dec 3 15:28:19 2021
@author: pfaure @author: pfaure
""" """
import os
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as shc import scipy.cluster.hierarchy as shc
@ -15,6 +15,7 @@ def print_3d_data(data,
method_name: str = "", method_name: str = "",
k: int = 0, k: int = 0,
stop: bool = True, stop: bool = True,
save: bool = False,
c=None): c=None):
f0 = data[:, 0] # tous les élements de la première colonne f0 = data[:, 0] # tous les élements de la première colonne
f1 = data[:, 1] # tous les éléments de la deuxième colonne f1 = data[:, 1] # tous les éléments de la deuxième colonne
@ -34,7 +35,20 @@ def print_3d_data(data,
ax.set_ylabel('Y') ax.set_ylabel('Y')
ax.set_zlabel('Z') ax.set_zlabel('Z')
plt.tight_layout() plt.tight_layout()
plt.show(block=stop) if (save):
if (c is None):
save_path = "IMG/DATA_VISUALISATION/"
if not os.path.exists(save_path):
os.makedirs(save_path)
plt.savefig(save_path + dataset_name + ".png")
else:
save_path = "IMG/" + method_name + "/" + dataset_name + "/CLUSTERS/"
if not os.path.exists(save_path):
os.makedirs(save_path)
plt.savefig(save_path + "k=" + str(k) + ".png")
plt.close()
else:
plt.show(block=stop)
def print_2d_data(data, def print_2d_data(data,
@ -42,6 +56,7 @@ def print_2d_data(data,
method_name: str = "", method_name: str = "",
k: int = 0, k: int = 0,
stop: bool = True, stop: bool = True,
save: bool = False,
c=None): c=None):
f0 = data[:, 0] # tous les élements de la première colonne f0 = data[:, 0] # tous les élements de la première colonne
f1 = data[:, 1] # tous les éléments de la deuxième colonne f1 = data[:, 1] # tous les éléments de la deuxième colonne
@ -54,22 +69,53 @@ def print_2d_data(data,
plt.scatter(f0, f1, c=c, s=8) plt.scatter(f0, f1, c=c, s=8)
plt.title("Graphique de " + str(k) + " clusters avec la méthode " + plt.title("Graphique de " + str(k) + " clusters avec la méthode " +
method_name + " sur le jeu de données " + dataset_name) method_name + " sur le jeu de données " + dataset_name)
plt.show(block=stop)
if (save):
if (c is None):
save_path = "IMG/DATA_VISUALISATION/"
if not os.path.exists(save_path):
os.makedirs(save_path)
plt.savefig(save_path + dataset_name + ".png")
else:
save_path = "IMG/" + method_name + "/" + dataset_name + "/CLUSTERS/"
if not os.path.exists(save_path):
os.makedirs(save_path)
plt.savefig(save_path + "k=" + str(k) + ".png")
plt.close()
else:
plt.show(block=stop)
def print_1d_data(x, y, x_name: str = "toto", def print_1d_data(x, y,
x_name: str = "toto",
y_name: str = "tata", y_name: str = "tata",
stop: bool = True): x_unit: str = "",
y_unit: str = "",
dataset_name: str = "",
method_name: str = "",
stop: bool = True,
save: bool = False):
plt.figure() plt.figure()
plt.plot(x, y) plt.plot(x, y)
plt.title(y_name + " = f(" + x_name + ")") plt.title(y_name + " = f(" + x_name + ") pour " +
plt.show(block=stop) method_name + " sur les données " + dataset_name)
plt.xlabel(x_name + " (" + x_unit + ")")
plt.ylabel(y_name + " (" + y_unit + ")")
if (save):
save_path = "IMG/" + method_name + "/" + dataset_name + "/EVALUATION/"
if not os.path.exists(save_path):
os.makedirs(save_path)
plt.savefig(save_path + y_name + ".png")
plt.close()
else:
plt.show(block=stop)
def print_dendrogramme(data, def print_dendrogramme(data,
dataset_name: str = "", dataset_name: str = "",
linkage: str = "", linkage: str = "",
stop: bool = True): stop: bool = True,
save: bool = False):
distance = shc.linkage(data, linkage) distance = shc.linkage(data, linkage)
@ -80,4 +126,11 @@ def print_dendrogramme(data,
show_leaf_counts=False) show_leaf_counts=False)
plt.title("Dendrogramme du jeu de données " + plt.title("Dendrogramme du jeu de données " +
dataset_name + " avec le linkage " + linkage) dataset_name + " avec le linkage " + linkage)
plt.show(block=stop) if (save):
save_path = "IMG/DENDROGRAMME/" + linkage + "/"
if not os.path.exists(save_path):
os.makedirs(save_path)
plt.savefig(save_path + dataset_name + ".png")
plt.close()
else:
plt.show(block=stop)

View file

@ -8,47 +8,62 @@ Created on Fri Nov 19 23:08:23 2021
from myplotlib import print_1d_data, print_2d_data, print_3d_data from myplotlib import print_1d_data, print_2d_data, print_3d_data
from mydatalib import extract_data_2d, extract_data_3d, scale_data from mydatalib import extract_data_2d, extract_data_3d, scale_data
from mydatalib import apply_kmeans, evaluate from mydatalib import apply_kmeans, evaluate_kmeans
path = './artificial/' path = './artificial/'
dataset_name = "xclara.arff" dataset_name = "xclara"
save = True
# Extraction et visualisation d'un dataset 2D # Extraction et visualisation d'un dataset 2D
data = extract_data_2d(path + dataset_name) data = extract_data_2d(path + dataset_name)
print_2d_data(data, dataset_name=dataset_name+" brute", stop=False) print_2d_data(data, dataset_name=dataset_name+"_brute", stop=False, save=save)
# Extraction et visualisation d'un dataset 3D # Extraction et visualisation d'un dataset 3D
data_golfball = extract_data_3d(path+"golfball.arff") data_golfball = extract_data_3d(path+"golfball")
print_3d_data(data_golfball, dataset_name="golfball.arff", stop=False) print_3d_data(data_golfball, dataset_name="golfball", stop=False, save=save)
# Scaling des data 2D et visualisation # Scaling des data 2D et visualisation
data_scaled = scale_data(data) data_scaled = scale_data(data)
print_2d_data(data_scaled, dataset_name=dataset_name+" scaled", stop=False) print_2d_data(data_scaled, dataset_name=dataset_name +
"_scaled", stop=False, save=save)
# Application de k-means pour plusieurs valeurs de k
# kmeans = KMeans(n_clusters=3000, random_state=0).fit_predict(data_scaled) # et evaluation de la solution
# print_2d_data(data_s_set2_scaled, True, kmeans)
k = [] k = []
durations = [] durations = []
silouettes = [] silouettes = []
inerties = [] inerties = []
iterations = [] iterations = []
for i in range(2, 5): for i in range(2, 50):
# Application de k-means
(model, duration) = apply_kmeans(data_scaled, k=i, init="k-means++") (model, duration) = apply_kmeans(data_scaled, k=i, init="k-means++")
# Affichage des clusters
print_2d_data(data_scaled, dataset_name=dataset_name, print_2d_data(data_scaled, dataset_name=dataset_name,
method_name="k-means", k=i, stop=False, c=model.labels_) method_name="k-means", k=i, c=model.labels_,
(silouette, inertie, iteration) = evaluate(data_scaled, model) stop=False, save=save)
# Evaluation de la solution de clustering
(silouette, inertie, iteration) = evaluate_kmeans(data_scaled, model)
# Enregistrement des valeurs
k += [i] k += [i]
durations += [duration] durations += [duration]
silouettes += [silouette] silouettes += [silouette]
inerties += [inertie] inerties += [inertie]
iterations += [iteration] iterations += [iteration]
print_1d_data(k, k, x_name="k", y_name="k", stop=False) # Affichage des résultats
print_1d_data(k, durations, x_name="k", y_name="temps de calcul", stop=False) print_1d_data(k, k, x_name="k", y_name="k", dataset_name=dataset_name,
print_1d_data(k, silouettes, x_name="k", method_name="k-means", stop=False, save=save)
y_name="coeficient de silhouette", stop=False) print_1d_data(k, durations, x_name="k", y_name="temps_de_calcul", y_unit="ms",
print_1d_data(k, inerties, x_name="k", y_name="inertie", stop=False) dataset_name=dataset_name, method_name="k-means",
print_1d_data(k, iterations, x_name="k", stop=False, save=save)
y_name="nombre d'itérations", stop=True) print_1d_data(k, silouettes, x_name="k", y_name="coeficient_de_silhouette",
dataset_name=dataset_name, method_name="k-means",
stop=False, save=save)
print_1d_data(k, inerties, x_name="k", y_name="inertie",
dataset_name=dataset_name, method_name="k-means",
stop=False, save=save)
print_1d_data(k, iterations, x_name="k", y_name="nombre_d_iterations",
dataset_name=dataset_name, method_name="k-means",
stop=True, save=save)

View file

@ -4,12 +4,11 @@ Created on Sat Nov 20 21:28:40 2021
@author: huguet @author: huguet
""" """
import numpy as np
import time
from scipy.io import arff
from sklearn import cluster, metrics, preprocessing
from myplotlib import print_1d_data, print_2d_data, print_dendrogramme from myplotlib import print_1d_data, print_2d_data, print_dendrogramme
from mydatalib import extract_data_2d, scale_data
from mydatalib import apply_agglomerative_clustering
from mydatalib import evaluate_agglomerative_clustering
################################################################## ##################################################################
@ -29,47 +28,60 @@ from myplotlib import print_1d_data, print_2d_data, print_dendrogramme
# 2d-4c-no9.arff xclara.arff # 2d-4c-no9.arff xclara.arff
path = './artificial/' path = './artificial/'
dataset_name = "xclara.arff" dataset_name = "xclara"
save = True
print("-----------------------------------------------------------") print("-----------------------------------------------------------")
print(" Chargement du dataset : " + dataset_name) print(" Chargement du dataset : " + dataset_name)
databrut = arff.loadarff(open(path + dataset_name, 'r')) data = extract_data_2d(path + dataset_name)
datanp = np.array([[x[0], x[1]] for x in databrut[0]]) print_2d_data(data, dataset_name=dataset_name +
print_2d_data(datanp, dataset_name=dataset_name + " brutes", stop=False) "_brutes", stop=False, save=save)
print("-----------------------------------------------------------") print("-----------------------------------------------------------")
print(" Mise à l'échelle") print(" Mise à l'échelle")
scaler = preprocessing.StandardScaler().fit(datanp) data_scaled = scale_data(data)
data_scaled = scaler.transform(datanp) print_2d_data(data_scaled, dataset_name=dataset_name +
print_2d_data(data_scaled, dataset_name=dataset_name + " scaled", stop=False) "_scaled", stop=False, save=save)
# Types de linkage : single, average, complete, ward linkage # Types de linkage : single, average, complete, ward linkage
linkage = "complete" linkage = "complete"
print("-----------------------------------------------------------") print("-----------------------------------------------------------")
print(" Création du dendrogramme : linkage " + linkage) print(" Création du dendrogramme : linkage " + linkage)
print_dendrogramme(data_scaled, dataset_name=dataset_name, print_dendrogramme(data_scaled, dataset_name=dataset_name,
linkage=linkage, stop=False) linkage=linkage, stop=False)
k = 10 k_max = 10
print("-----------------------------------------------------------") print("-----------------------------------------------------------")
print(" Création clusters : linkage " + linkage + ", k=" + str(k)) print(" Création clusters : linkage " +
tps3 = time.time() linkage + ", k=(0 to " + str(k_max) + ")")
model_scaled = cluster.AgglomerativeClustering( # Application du clustering agglomeratif pour plusieurs valeurs de k
n_clusters=k, affinity='euclidean', linkage=linkage) # et evaluation de la solution
model_scaled.fit(data_scaled) k = []
# cluster.fit_predict(X) durations = []
tps4 = time.time() silouettes = []
print_2d_data(data_scaled, dataset_name=dataset_name, for i in range(2, k_max):
method_name="Agglomératif " + linkage, k=k, # Application du clustering agglomeratif
stop=False, c=model_scaled.labels_) (model, duration) = apply_agglomerative_clustering(
data_scaled, k=i, linkage=linkage)
# Affichage des clusters
print_2d_data(data_scaled, dataset_name=dataset_name,
method_name="agglomerative_" + linkage, k=i,
stop=False, save=save, c=model.labels_)
# Evaluation de la solution de clustering
silouette = evaluate_agglomerative_clustering(data_scaled, model)
# Enregistrement des valeurs
k += [i]
durations += [duration]
silouettes += [silouette]
# Some evaluation metrics # Affichage des résultats
silh = metrics.silhouette_score( print_1d_data(k, k, x_name="k", y_name="k", dataset_name=dataset_name,
data_scaled, model_scaled.labels_, metric='euclidean') method_name="agglomerative_" + linkage, stop=False, save=save)
print("Coefficient de silhouette : ", silh) print_1d_data(k, durations, x_name="k", y_name="temps_de_calcul", y_unit="ms",
dataset_name=dataset_name,
######################################################################## method_name="agglomerative_" + linkage, stop=False, save=save)
# TRY : parameters for dendrogram and hierarchical clustering print_1d_data(k, silouettes, x_name="k", y_name="coeficient_de_silhouette",
# EVALUATION : with several metrics (for several number of clusters) dataset_name=dataset_name,
######################################################################## method_name="agglomerative_" + linkage, stop=False, save=save)