Amelioration des librairies, fin de la partie clustering agglomeratif
This commit is contained in:
parent
d29db6660c
commit
60992b033c
5 changed files with 158 additions and 62 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1 +1,2 @@
|
||||||
__pycache__/
|
__pycache__/
|
||||||
|
IMG/
|
||||||
|
|
21
mydatalib.py
21
mydatalib.py
|
@ -12,12 +12,12 @@ from sklearn import cluster, metrics, preprocessing
|
||||||
|
|
||||||
|
|
||||||
def extract_data_2d(data_path):
|
def extract_data_2d(data_path):
|
||||||
databrut = arff.loadarff(open(data_path, 'r'))
|
databrut = arff.loadarff(open(data_path + ".arff", 'r'))
|
||||||
return np.array([[x[0], x[1]] for x in databrut[0]])
|
return np.array([[x[0], x[1]] for x in databrut[0]])
|
||||||
|
|
||||||
|
|
||||||
def extract_data_3d(data_path):
|
def extract_data_3d(data_path):
|
||||||
databrut = arff.loadarff(open(data_path, 'r'))
|
databrut = arff.loadarff(open(data_path + ".arff", 'r'))
|
||||||
return np.array([[x[0], x[1], x[2]] for x in databrut[0]])
|
return np.array([[x[0], x[1], x[2]] for x in databrut[0]])
|
||||||
|
|
||||||
|
|
||||||
|
@ -34,6 +34,21 @@ def apply_kmeans(data, k: int = 3, init="k-means++"):
|
||||||
return (model_km, round((tps2 - tps1)*1000, 2))
|
return (model_km, round((tps2 - tps1)*1000, 2))
|
||||||
|
|
||||||
|
|
||||||
def evaluate(data, model_km):
|
def apply_agglomerative_clustering(data, k: int = 3, linkage="complete"):
|
||||||
|
tps1 = time.time()
|
||||||
|
model_agg = cluster.AgglomerativeClustering(
|
||||||
|
n_clusters=k, affinity='euclidean', linkage=linkage)
|
||||||
|
model_agg.fit(data)
|
||||||
|
tps2 = time.time()
|
||||||
|
return (model_agg, round((tps2 - tps1)*1000, 2))
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_kmeans(data, model_km):
|
||||||
silh = metrics.silhouette_score(data, model_km.labels_, metric='euclidean')
|
silh = metrics.silhouette_score(data, model_km.labels_, metric='euclidean')
|
||||||
return (silh, model_km.inertia_, model_km.n_iter_)
|
return (silh, model_km.inertia_, model_km.n_iter_)
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_agglomerative_clustering(data, model_agg):
|
||||||
|
silh = metrics.silhouette_score(
|
||||||
|
data, model_agg.labels_, metric='euclidean')
|
||||||
|
return silh
|
||||||
|
|
71
myplotlib.py
71
myplotlib.py
|
@ -5,7 +5,7 @@ Created on Fri Dec 3 15:28:19 2021
|
||||||
|
|
||||||
@author: pfaure
|
@author: pfaure
|
||||||
"""
|
"""
|
||||||
|
import os
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import scipy.cluster.hierarchy as shc
|
import scipy.cluster.hierarchy as shc
|
||||||
|
|
||||||
|
@ -15,6 +15,7 @@ def print_3d_data(data,
|
||||||
method_name: str = "",
|
method_name: str = "",
|
||||||
k: int = 0,
|
k: int = 0,
|
||||||
stop: bool = True,
|
stop: bool = True,
|
||||||
|
save: bool = False,
|
||||||
c=None):
|
c=None):
|
||||||
f0 = data[:, 0] # tous les élements de la première colonne
|
f0 = data[:, 0] # tous les élements de la première colonne
|
||||||
f1 = data[:, 1] # tous les éléments de la deuxième colonne
|
f1 = data[:, 1] # tous les éléments de la deuxième colonne
|
||||||
|
@ -34,7 +35,20 @@ def print_3d_data(data,
|
||||||
ax.set_ylabel('Y')
|
ax.set_ylabel('Y')
|
||||||
ax.set_zlabel('Z')
|
ax.set_zlabel('Z')
|
||||||
plt.tight_layout()
|
plt.tight_layout()
|
||||||
plt.show(block=stop)
|
if (save):
|
||||||
|
if (c is None):
|
||||||
|
save_path = "IMG/DATA_VISUALISATION/"
|
||||||
|
if not os.path.exists(save_path):
|
||||||
|
os.makedirs(save_path)
|
||||||
|
plt.savefig(save_path + dataset_name + ".png")
|
||||||
|
else:
|
||||||
|
save_path = "IMG/" + method_name + "/" + dataset_name + "/CLUSTERS/"
|
||||||
|
if not os.path.exists(save_path):
|
||||||
|
os.makedirs(save_path)
|
||||||
|
plt.savefig(save_path + "k=" + str(k) + ".png")
|
||||||
|
plt.close()
|
||||||
|
else:
|
||||||
|
plt.show(block=stop)
|
||||||
|
|
||||||
|
|
||||||
def print_2d_data(data,
|
def print_2d_data(data,
|
||||||
|
@ -42,6 +56,7 @@ def print_2d_data(data,
|
||||||
method_name: str = "",
|
method_name: str = "",
|
||||||
k: int = 0,
|
k: int = 0,
|
||||||
stop: bool = True,
|
stop: bool = True,
|
||||||
|
save: bool = False,
|
||||||
c=None):
|
c=None):
|
||||||
f0 = data[:, 0] # tous les élements de la première colonne
|
f0 = data[:, 0] # tous les élements de la première colonne
|
||||||
f1 = data[:, 1] # tous les éléments de la deuxième colonne
|
f1 = data[:, 1] # tous les éléments de la deuxième colonne
|
||||||
|
@ -54,22 +69,53 @@ def print_2d_data(data,
|
||||||
plt.scatter(f0, f1, c=c, s=8)
|
plt.scatter(f0, f1, c=c, s=8)
|
||||||
plt.title("Graphique de " + str(k) + " clusters avec la méthode " +
|
plt.title("Graphique de " + str(k) + " clusters avec la méthode " +
|
||||||
method_name + " sur le jeu de données " + dataset_name)
|
method_name + " sur le jeu de données " + dataset_name)
|
||||||
plt.show(block=stop)
|
|
||||||
|
if (save):
|
||||||
|
if (c is None):
|
||||||
|
save_path = "IMG/DATA_VISUALISATION/"
|
||||||
|
if not os.path.exists(save_path):
|
||||||
|
os.makedirs(save_path)
|
||||||
|
plt.savefig(save_path + dataset_name + ".png")
|
||||||
|
else:
|
||||||
|
save_path = "IMG/" + method_name + "/" + dataset_name + "/CLUSTERS/"
|
||||||
|
if not os.path.exists(save_path):
|
||||||
|
os.makedirs(save_path)
|
||||||
|
plt.savefig(save_path + "k=" + str(k) + ".png")
|
||||||
|
plt.close()
|
||||||
|
else:
|
||||||
|
plt.show(block=stop)
|
||||||
|
|
||||||
|
|
||||||
def print_1d_data(x, y, x_name: str = "toto",
|
def print_1d_data(x, y,
|
||||||
|
x_name: str = "toto",
|
||||||
y_name: str = "tata",
|
y_name: str = "tata",
|
||||||
stop: bool = True):
|
x_unit: str = "",
|
||||||
|
y_unit: str = "",
|
||||||
|
dataset_name: str = "",
|
||||||
|
method_name: str = "",
|
||||||
|
stop: bool = True,
|
||||||
|
save: bool = False):
|
||||||
plt.figure()
|
plt.figure()
|
||||||
plt.plot(x, y)
|
plt.plot(x, y)
|
||||||
plt.title(y_name + " = f(" + x_name + ")")
|
plt.title(y_name + " = f(" + x_name + ") pour " +
|
||||||
plt.show(block=stop)
|
method_name + " sur les données " + dataset_name)
|
||||||
|
plt.xlabel(x_name + " (" + x_unit + ")")
|
||||||
|
plt.ylabel(y_name + " (" + y_unit + ")")
|
||||||
|
if (save):
|
||||||
|
save_path = "IMG/" + method_name + "/" + dataset_name + "/EVALUATION/"
|
||||||
|
if not os.path.exists(save_path):
|
||||||
|
os.makedirs(save_path)
|
||||||
|
plt.savefig(save_path + y_name + ".png")
|
||||||
|
plt.close()
|
||||||
|
else:
|
||||||
|
plt.show(block=stop)
|
||||||
|
|
||||||
|
|
||||||
def print_dendrogramme(data,
|
def print_dendrogramme(data,
|
||||||
dataset_name: str = "",
|
dataset_name: str = "",
|
||||||
linkage: str = "",
|
linkage: str = "",
|
||||||
stop: bool = True):
|
stop: bool = True,
|
||||||
|
save: bool = False):
|
||||||
|
|
||||||
distance = shc.linkage(data, linkage)
|
distance = shc.linkage(data, linkage)
|
||||||
|
|
||||||
|
@ -80,4 +126,11 @@ def print_dendrogramme(data,
|
||||||
show_leaf_counts=False)
|
show_leaf_counts=False)
|
||||||
plt.title("Dendrogramme du jeu de données " +
|
plt.title("Dendrogramme du jeu de données " +
|
||||||
dataset_name + " avec le linkage " + linkage)
|
dataset_name + " avec le linkage " + linkage)
|
||||||
plt.show(block=stop)
|
if (save):
|
||||||
|
save_path = "IMG/DENDROGRAMME/" + linkage + "/"
|
||||||
|
if not os.path.exists(save_path):
|
||||||
|
os.makedirs(save_path)
|
||||||
|
plt.savefig(save_path + dataset_name + ".png")
|
||||||
|
plt.close()
|
||||||
|
else:
|
||||||
|
plt.show(block=stop)
|
||||||
|
|
|
@ -8,47 +8,62 @@ Created on Fri Nov 19 23:08:23 2021
|
||||||
|
|
||||||
from myplotlib import print_1d_data, print_2d_data, print_3d_data
|
from myplotlib import print_1d_data, print_2d_data, print_3d_data
|
||||||
from mydatalib import extract_data_2d, extract_data_3d, scale_data
|
from mydatalib import extract_data_2d, extract_data_3d, scale_data
|
||||||
from mydatalib import apply_kmeans, evaluate
|
from mydatalib import apply_kmeans, evaluate_kmeans
|
||||||
|
|
||||||
|
|
||||||
path = './artificial/'
|
path = './artificial/'
|
||||||
dataset_name = "xclara.arff"
|
dataset_name = "xclara"
|
||||||
|
|
||||||
|
save = True
|
||||||
|
|
||||||
# Extraction et visualisation d'un dataset 2D
|
# Extraction et visualisation d'un dataset 2D
|
||||||
data = extract_data_2d(path + dataset_name)
|
data = extract_data_2d(path + dataset_name)
|
||||||
print_2d_data(data, dataset_name=dataset_name+" brute", stop=False)
|
print_2d_data(data, dataset_name=dataset_name+"_brute", stop=False, save=save)
|
||||||
|
|
||||||
# Extraction et visualisation d'un dataset 3D
|
# Extraction et visualisation d'un dataset 3D
|
||||||
data_golfball = extract_data_3d(path+"golfball.arff")
|
data_golfball = extract_data_3d(path+"golfball")
|
||||||
print_3d_data(data_golfball, dataset_name="golfball.arff", stop=False)
|
print_3d_data(data_golfball, dataset_name="golfball", stop=False, save=save)
|
||||||
|
|
||||||
# Scaling des data 2D et visualisation
|
# Scaling des data 2D et visualisation
|
||||||
data_scaled = scale_data(data)
|
data_scaled = scale_data(data)
|
||||||
print_2d_data(data_scaled, dataset_name=dataset_name+" scaled", stop=False)
|
print_2d_data(data_scaled, dataset_name=dataset_name +
|
||||||
|
"_scaled", stop=False, save=save)
|
||||||
|
|
||||||
|
# Application de k-means pour plusieurs valeurs de k
|
||||||
# kmeans = KMeans(n_clusters=3000, random_state=0).fit_predict(data_scaled)
|
# et evaluation de la solution
|
||||||
# print_2d_data(data_s_set2_scaled, True, kmeans)
|
|
||||||
k = []
|
k = []
|
||||||
durations = []
|
durations = []
|
||||||
silouettes = []
|
silouettes = []
|
||||||
inerties = []
|
inerties = []
|
||||||
iterations = []
|
iterations = []
|
||||||
for i in range(2, 5):
|
for i in range(2, 50):
|
||||||
|
# Application de k-means
|
||||||
(model, duration) = apply_kmeans(data_scaled, k=i, init="k-means++")
|
(model, duration) = apply_kmeans(data_scaled, k=i, init="k-means++")
|
||||||
|
# Affichage des clusters
|
||||||
print_2d_data(data_scaled, dataset_name=dataset_name,
|
print_2d_data(data_scaled, dataset_name=dataset_name,
|
||||||
method_name="k-means", k=i, stop=False, c=model.labels_)
|
method_name="k-means", k=i, c=model.labels_,
|
||||||
(silouette, inertie, iteration) = evaluate(data_scaled, model)
|
stop=False, save=save)
|
||||||
|
# Evaluation de la solution de clustering
|
||||||
|
(silouette, inertie, iteration) = evaluate_kmeans(data_scaled, model)
|
||||||
|
# Enregistrement des valeurs
|
||||||
k += [i]
|
k += [i]
|
||||||
durations += [duration]
|
durations += [duration]
|
||||||
silouettes += [silouette]
|
silouettes += [silouette]
|
||||||
inerties += [inertie]
|
inerties += [inertie]
|
||||||
iterations += [iteration]
|
iterations += [iteration]
|
||||||
|
|
||||||
print_1d_data(k, k, x_name="k", y_name="k", stop=False)
|
# Affichage des résultats
|
||||||
print_1d_data(k, durations, x_name="k", y_name="temps de calcul", stop=False)
|
print_1d_data(k, k, x_name="k", y_name="k", dataset_name=dataset_name,
|
||||||
print_1d_data(k, silouettes, x_name="k",
|
method_name="k-means", stop=False, save=save)
|
||||||
y_name="coeficient de silhouette", stop=False)
|
print_1d_data(k, durations, x_name="k", y_name="temps_de_calcul", y_unit="ms",
|
||||||
print_1d_data(k, inerties, x_name="k", y_name="inertie", stop=False)
|
dataset_name=dataset_name, method_name="k-means",
|
||||||
print_1d_data(k, iterations, x_name="k",
|
stop=False, save=save)
|
||||||
y_name="nombre d'itérations", stop=True)
|
print_1d_data(k, silouettes, x_name="k", y_name="coeficient_de_silhouette",
|
||||||
|
dataset_name=dataset_name, method_name="k-means",
|
||||||
|
stop=False, save=save)
|
||||||
|
print_1d_data(k, inerties, x_name="k", y_name="inertie",
|
||||||
|
dataset_name=dataset_name, method_name="k-means",
|
||||||
|
stop=False, save=save)
|
||||||
|
print_1d_data(k, iterations, x_name="k", y_name="nombre_d_iterations",
|
||||||
|
dataset_name=dataset_name, method_name="k-means",
|
||||||
|
stop=True, save=save)
|
||||||
|
|
|
@ -4,12 +4,11 @@ Created on Sat Nov 20 21:28:40 2021
|
||||||
|
|
||||||
@author: huguet
|
@author: huguet
|
||||||
"""
|
"""
|
||||||
import numpy as np
|
|
||||||
import time
|
|
||||||
|
|
||||||
from scipy.io import arff
|
|
||||||
from sklearn import cluster, metrics, preprocessing
|
|
||||||
from myplotlib import print_1d_data, print_2d_data, print_dendrogramme
|
from myplotlib import print_1d_data, print_2d_data, print_dendrogramme
|
||||||
|
from mydatalib import extract_data_2d, scale_data
|
||||||
|
from mydatalib import apply_agglomerative_clustering
|
||||||
|
from mydatalib import evaluate_agglomerative_clustering
|
||||||
|
|
||||||
|
|
||||||
##################################################################
|
##################################################################
|
||||||
|
@ -29,47 +28,60 @@ from myplotlib import print_1d_data, print_2d_data, print_dendrogramme
|
||||||
# 2d-4c-no9.arff xclara.arff
|
# 2d-4c-no9.arff xclara.arff
|
||||||
|
|
||||||
path = './artificial/'
|
path = './artificial/'
|
||||||
dataset_name = "xclara.arff"
|
dataset_name = "xclara"
|
||||||
|
save = True
|
||||||
|
|
||||||
print("-----------------------------------------------------------")
|
print("-----------------------------------------------------------")
|
||||||
print(" Chargement du dataset : " + dataset_name)
|
print(" Chargement du dataset : " + dataset_name)
|
||||||
databrut = arff.loadarff(open(path + dataset_name, 'r'))
|
data = extract_data_2d(path + dataset_name)
|
||||||
datanp = np.array([[x[0], x[1]] for x in databrut[0]])
|
print_2d_data(data, dataset_name=dataset_name +
|
||||||
print_2d_data(datanp, dataset_name=dataset_name + " brutes", stop=False)
|
"_brutes", stop=False, save=save)
|
||||||
|
|
||||||
print("-----------------------------------------------------------")
|
print("-----------------------------------------------------------")
|
||||||
print(" Mise à l'échelle")
|
print(" Mise à l'échelle")
|
||||||
scaler = preprocessing.StandardScaler().fit(datanp)
|
data_scaled = scale_data(data)
|
||||||
data_scaled = scaler.transform(datanp)
|
print_2d_data(data_scaled, dataset_name=dataset_name +
|
||||||
print_2d_data(data_scaled, dataset_name=dataset_name + " scaled", stop=False)
|
"_scaled", stop=False, save=save)
|
||||||
|
|
||||||
# Types de linkage : single, average, complete, ward linkage
|
# Types de linkage : single, average, complete, ward linkage
|
||||||
linkage = "complete"
|
linkage = "complete"
|
||||||
|
|
||||||
print("-----------------------------------------------------------")
|
print("-----------------------------------------------------------")
|
||||||
print(" Création du dendrogramme : linkage " + linkage)
|
print(" Création du dendrogramme : linkage " + linkage)
|
||||||
print_dendrogramme(data_scaled, dataset_name=dataset_name,
|
print_dendrogramme(data_scaled, dataset_name=dataset_name,
|
||||||
linkage=linkage, stop=False)
|
linkage=linkage, stop=False)
|
||||||
|
|
||||||
|
|
||||||
k = 10
|
k_max = 10
|
||||||
print("-----------------------------------------------------------")
|
print("-----------------------------------------------------------")
|
||||||
print(" Création clusters : linkage " + linkage + ", k=" + str(k))
|
print(" Création clusters : linkage " +
|
||||||
tps3 = time.time()
|
linkage + ", k=(0 to " + str(k_max) + ")")
|
||||||
model_scaled = cluster.AgglomerativeClustering(
|
# Application du clustering agglomeratif pour plusieurs valeurs de k
|
||||||
n_clusters=k, affinity='euclidean', linkage=linkage)
|
# et evaluation de la solution
|
||||||
model_scaled.fit(data_scaled)
|
k = []
|
||||||
# cluster.fit_predict(X)
|
durations = []
|
||||||
tps4 = time.time()
|
silouettes = []
|
||||||
print_2d_data(data_scaled, dataset_name=dataset_name,
|
for i in range(2, k_max):
|
||||||
method_name="Agglomératif " + linkage, k=k,
|
# Application du clustering agglomeratif
|
||||||
stop=False, c=model_scaled.labels_)
|
(model, duration) = apply_agglomerative_clustering(
|
||||||
|
data_scaled, k=i, linkage=linkage)
|
||||||
|
# Affichage des clusters
|
||||||
|
print_2d_data(data_scaled, dataset_name=dataset_name,
|
||||||
|
method_name="agglomerative_" + linkage, k=i,
|
||||||
|
stop=False, save=save, c=model.labels_)
|
||||||
|
# Evaluation de la solution de clustering
|
||||||
|
silouette = evaluate_agglomerative_clustering(data_scaled, model)
|
||||||
|
# Enregistrement des valeurs
|
||||||
|
k += [i]
|
||||||
|
durations += [duration]
|
||||||
|
silouettes += [silouette]
|
||||||
|
|
||||||
# Some evaluation metrics
|
# Affichage des résultats
|
||||||
silh = metrics.silhouette_score(
|
print_1d_data(k, k, x_name="k", y_name="k", dataset_name=dataset_name,
|
||||||
data_scaled, model_scaled.labels_, metric='euclidean')
|
method_name="agglomerative_" + linkage, stop=False, save=save)
|
||||||
print("Coefficient de silhouette : ", silh)
|
print_1d_data(k, durations, x_name="k", y_name="temps_de_calcul", y_unit="ms",
|
||||||
|
dataset_name=dataset_name,
|
||||||
########################################################################
|
method_name="agglomerative_" + linkage, stop=False, save=save)
|
||||||
# TRY : parameters for dendrogram and hierarchical clustering
|
print_1d_data(k, silouettes, x_name="k", y_name="coeficient_de_silhouette",
|
||||||
# EVALUATION : with several metrics (for several number of clusters)
|
dataset_name=dataset_name,
|
||||||
########################################################################
|
method_name="agglomerative_" + linkage, stop=False, save=save)
|
||||||
|
|
Loading…
Reference in a new issue