Mise en place d'une librairie d'affichage et de gestion des data
This commit is contained in:
parent
1ac7ab4212
commit
d29db6660c
5 changed files with 239 additions and 128 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -0,0 +1 @@
|
||||||
|
__pycache__/
|
39
mydatalib.py
Normal file
39
mydatalib.py
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Fri Dec 3 16:29:12 2021
|
||||||
|
|
||||||
|
@author: pfaure
|
||||||
|
"""
|
||||||
|
from scipy.io import arff
|
||||||
|
import numpy as np
|
||||||
|
import time
|
||||||
|
from sklearn import cluster, metrics, preprocessing
|
||||||
|
|
||||||
|
|
||||||
|
def extract_data_2d(data_path):
    """Load an ARFF file and return its first two attributes as an array.

    Args:
        data_path: path of the ARFF file to read.

    Returns:
        A numpy array built from one ``[attribute 0, attribute 1]`` pair per
        record; any further attribute of the file is ignored.
    """
    # Use a context manager so the file handle is closed once parsing is
    # done instead of being left open until garbage collection.
    with open(data_path, 'r') as arff_file:
        databrut = arff.loadarff(arff_file)
    # loadarff returns (records, metadata); only the records are needed.
    return np.array([[x[0], x[1]] for x in databrut[0]])
|
||||||
|
|
||||||
|
|
||||||
|
def extract_data_3d(data_path):
    """Load an ARFF file and return its first three attributes as an array.

    Args:
        data_path: path of the ARFF file to read.

    Returns:
        A numpy array built from one ``[attribute 0, attribute 1,
        attribute 2]`` triple per record; any further attribute of the file
        is ignored.
    """
    # Use a context manager so the file handle is closed once parsing is
    # done instead of being left open until garbage collection.
    with open(data_path, 'r') as arff_file:
        databrut = arff.loadarff(arff_file)
    # loadarff returns (records, metadata); only the records are needed.
    return np.array([[x[0], x[1], x[2]] for x in databrut[0]])
|
||||||
|
|
||||||
|
|
||||||
|
def scale_data(data):
    """Standardize *data* with a freshly fitted ``StandardScaler``.

    Args:
        data: the samples to scale, passed unchanged to ``fit_transform``.

    Returns:
        The value returned by ``StandardScaler().fit_transform(data)``.
    """
    # A new scaler is fitted on every call; nothing is kept between calls.
    return preprocessing.StandardScaler().fit_transform(data)
|
||||||
|
|
||||||
|
|
||||||
|
def apply_kmeans(data, k: int = 3, init="k-means++", random_state=None):
    """Fit a scikit-learn KMeans model on *data* and time the operation.

    Args:
        data: the samples to cluster, passed unchanged to ``KMeans.fit``.
        k: number of clusters (``n_clusters``).
        init: initialisation strategy forwarded to ``KMeans``.
        random_state: optional seed forwarded to ``KMeans`` so that a run can
            be reproduced; ``None`` keeps the library's default behaviour.

    Returns:
        A ``(model, duration)`` tuple where ``model`` is the fitted KMeans
        instance and ``duration`` is the elapsed time in milliseconds,
        rounded to two decimals. The timing covers both the construction of
        the model and the call to ``fit``.
    """
    # perf_counter is monotonic and high resolution, unlike time.time()
    # which can jump when the system clock is adjusted.
    tps1 = time.perf_counter()
    model_km = cluster.KMeans(n_clusters=k, init=init,
                              random_state=random_state)
    model_km.fit(data)
    tps2 = time.perf_counter()
    return (model_km, round((tps2 - tps1)*1000, 2))
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate(data, model_km):
    """Collect quality indicators for a fitted k-means model.

    Args:
        data: the samples the model was fitted on.
        model_km: fitted model exposing ``labels_``, ``inertia_`` and
            ``n_iter_``.

    Returns:
        A ``(silhouette, inertia, iterations)`` tuple: the euclidean
        silhouette score of the labelling, followed by the model's
        ``inertia_`` and ``n_iter_`` attributes.
    """
    labels = model_km.labels_
    silhouette = metrics.silhouette_score(data, labels, metric='euclidean')
    return (silhouette, model_km.inertia_, model_km.n_iter_)
|
83
myplotlib.py
Normal file
83
myplotlib.py
Normal file
|
@ -0,0 +1,83 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Fri Dec 3 15:28:19 2021
|
||||||
|
|
||||||
|
@author: pfaure
|
||||||
|
"""
|
||||||
|
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import scipy.cluster.hierarchy as shc
|
||||||
|
|
||||||
|
|
||||||
|
def print_3d_data(data,
                  dataset_name: str = "",
                  method_name: str = "",
                  k: int = 0,
                  stop: bool = True,
                  c=None):
    """Show a 3D scatter plot of the first three columns of *data*.

    Args:
        data: array indexable as ``data[:, i]``; columns 0, 1 and 2 are used
            as the X, Y and Z coordinates.
        dataset_name: dataset label inserted in the figure title.
        method_name: clustering method label, only used when *c* is given.
        k: number of clusters shown in the title, only used when *c* is given.
        stop: forwarded to ``plt.show(block=...)``.
        c: optional per-point colours (for instance cluster labels). When it
            is ``None`` the points are drawn with the default colour and the
            "initial data" title is used.
    """
    f0 = data[:, 0]  # every element of the first column
    f1 = data[:, 1]  # every element of the second column
    f2 = data[:, 2]  # every element of the third column
    fig = plt.figure()
    # Passing keyword arguments to Figure.gca() was deprecated in
    # Matplotlib 3.4 and is rejected by recent releases; add_subplot is the
    # supported way to create the 3D axes.
    ax = fig.add_subplot(111, projection='3d')
    if c is None:
        ax.scatter(f0, f1, f2, label='Courbe',
                   marker='d')
        plt.title("Données initiales : " + dataset_name)
    else:
        ax.scatter(f0, f1, f2, c=c, label='Courbe',
                   marker='d')
        plt.title("Graphique de " + str(k) + " clusters avec la méthode " +
                  method_name + " sur le jeu de données " + dataset_name)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    plt.tight_layout()
    plt.show(block=stop)
|
||||||
|
|
||||||
|
|
||||||
|
def print_2d_data(data,
                  dataset_name: str = "",
                  method_name: str = "",
                  k: int = 0,
                  stop: bool = True,
                  c=None):
    """Show a 2D scatter plot of the first two columns of *data*.

    Args:
        data: array indexable as ``data[:, i]``; column 0 is drawn on the
            horizontal axis and column 1 on the vertical axis.
        dataset_name: dataset label inserted in the figure title.
        method_name: clustering method label, only used when *c* is given.
        k: number of clusters shown in the title, only used when *c* is given.
        stop: forwarded to ``plt.show(block=...)``.
        c: optional per-point colours (for instance cluster labels). When it
            is ``None`` the "initial data" title is used.
    """
    xs, ys = data[:, 0], data[:, 1]
    plt.figure()
    if c is not None:
        # Clustered view: colour each point and describe the clustering.
        plt.scatter(xs, ys, c=c, s=8)
        caption = ("Graphique de " + str(k) + " clusters avec la méthode " +
                   method_name + " sur le jeu de données " + dataset_name)
    else:
        # Raw view: uncoloured points.
        plt.scatter(xs, ys, s=8)
        caption = "Données initiales : " + dataset_name
    plt.title(caption)
    plt.show(block=stop)
|
||||||
|
|
||||||
|
|
||||||
|
def print_1d_data(x, y, x_name: str = "toto",
                  y_name: str = "tata",
                  stop: bool = True):
    """Plot *y* against *x* as a line in a new figure.

    Args:
        x: values for the horizontal axis.
        y: values for the vertical axis.
        x_name: name of the x quantity, used in the title.
        y_name: name of the y quantity, used in the title.
        stop: forwarded to ``plt.show(block=...)``.
    """
    plt.figure()
    plt.plot(x, y)
    # The title reads "<y_name> = f(<x_name>)".
    caption = y_name + " = f(" + x_name + ")"
    plt.title(caption)
    plt.show(block=stop)
|
||||||
|
|
||||||
|
|
||||||
|
def print_dendrogramme(data,
                       dataset_name: str = "",
                       linkage: str = "",
                       stop: bool = True):
    """Compute a hierarchical linkage of *data* and draw its dendrogram.

    Args:
        data: the observations, passed unchanged to
            ``scipy.cluster.hierarchy.linkage``.
        dataset_name: dataset label inserted in the figure title.
        linkage: name of the linkage method (for instance "single",
            "average", "complete" or "ward"). An empty string selects
            scipy's default method, "single".
        stop: forwarded to ``plt.show(block=...)``.
    """
    # scipy rejects an empty method name, so the declared default ("") made
    # the function fail when called without an explicit linkage.
    method = linkage if linkage else "single"

    distance = shc.linkage(data, method)

    plt.figure(figsize=(12, 12))
    shc.dendrogram(distance,
                   orientation='top',
                   distance_sort='descending',
                   show_leaf_counts=False)
    plt.title("Dendrogramme du jeu de données " +
              dataset_name + " avec le linkage " + method)
    plt.show(block=stop)
|
|
@ -2,140 +2,53 @@
|
||||||
"""
|
"""
|
||||||
Created on Fri Nov 19 23:08:23 2021
|
Created on Fri Nov 19 23:08:23 2021
|
||||||
|
|
||||||
@author: huguet
|
@author: pfaure
|
||||||
"""
|
"""
|
||||||
import numpy as np
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
from mpl_toolkits.mplot3d import axes3d # Fonction pour la 3D
|
|
||||||
import time
|
|
||||||
|
|
||||||
from scipy.io import arff
|
|
||||||
from sklearn import cluster
|
|
||||||
from sklearn import metrics
|
|
||||||
from sklearn import preprocessing
|
|
||||||
from sklearn.cluster import KMeans
|
|
||||||
|
|
||||||
##################################################################
|
from myplotlib import print_1d_data, print_2d_data, print_3d_data
|
||||||
# READ a data set (arff format)
|
from mydatalib import extract_data_2d, extract_data_3d, scale_data
|
||||||
|
from mydatalib import apply_kmeans, evaluate
|
||||||
|
|
||||||
# Parser un fichier de données au format arff
|
|
||||||
# datanp est un tableau (numpy) d'exemples avec pour chacun la liste
|
|
||||||
# des valeurs des features
|
|
||||||
|
|
||||||
# Note 1 :
|
|
||||||
# dans les jeux de données considérés : 2 features (dimension 2 seulement)
|
|
||||||
# t =np.array([[1,2], [3,4], [5,6], [7,8]])
|
|
||||||
#
|
|
||||||
# Note 2 :
|
|
||||||
# le jeu de données contient aussi un numéro de cluster pour chaque point
|
|
||||||
# --> IGNORER CETTE INFORMATION ....
|
|
||||||
# 2d-4c-no9.arff
|
|
||||||
|
|
||||||
def extract_data_2d(databrut):
    """Return the first two fields of every record as a numpy array.

    Args:
        databrut: sequence whose element 0 is the iterable of records (the
            calling script passes the result of ``arff.loadarff``).

    Returns:
        A numpy array built from one ``[field 0, field 1]`` pair per record.
    """
    points = []
    for record in databrut[0]:
        points.append([record[0], record[1]])
    return np.array(points)
|
|
||||||
|
|
||||||
def extract_data_3d(databrut):
    """Return the first three fields of every record as a numpy array.

    Args:
        databrut: sequence whose element 0 is the iterable of records (the
            calling script passes the result of ``arff.loadarff``).

    Returns:
        A numpy array built from one ``[field 0, field 1, field 2]`` triple
        per record.
    """
    points = []
    for record in databrut[0]:
        points.append([record[0], record[1], record[2]])
    return np.array(points)
|
|
||||||
|
|
||||||
def print_3d_data(data, stop: bool = True, c=None):
    """Show a 3D scatter plot of the first three columns of *data*.

    Args:
        data: array indexable as ``data[:, i]``; columns 0, 1 and 2 are used
            as the X, Y and Z coordinates.
        stop: forwarded to ``plt.show(block=...)``.
        c: optional per-point colours (for instance cluster labels).
    """
    print("---------------------------------------")
    print("Affichage données initiales ")
    f0 = data[:, 0]  # every element of the first column
    f1 = data[:, 1]  # every element of the second column
    f2 = data[:, 2]  # every element of the third column
    fig = plt.figure()
    # Passing keyword arguments to Figure.gca() was deprecated in
    # Matplotlib 3.4 and is rejected by recent releases; add_subplot is the
    # supported way to create the 3D axes.
    ax = fig.add_subplot(111, projection='3d')
    if c is None:
        ax.scatter(f0, f1, f2, label='Courbe', marker='d')  # plot 3D points
    else:
        ax.scatter(f0, f1, f2, c=c, label='Courbe', marker='d')  # plot 3D points
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    plt.tight_layout()
    plt.title("Donnees initiales")
    plt.show(block=stop)
|
|
||||||
|
|
||||||
def print_2d_data(data, stop: bool = True, c=None):
    """Show a 2D scatter plot of the first two columns of *data*.

    Args:
        data: array indexable as ``data[:, i]``; column 0 is drawn on the
            horizontal axis and column 1 on the vertical axis.
        stop: forwarded to ``plt.show(block=...)``.
        c: optional per-point colours (for instance cluster labels).
    """
    print("---------------------------------------")
    print("Affichage données initiales ")
    xs, ys = data[:, 0], data[:, 1]
    plt.figure()
    # Only forward the colours when the caller supplied some.
    scatter_options = {"s": 8}
    if c is not None:
        scatter_options["c"] = c
    plt.scatter(xs, ys, **scatter_options)
    plt.title("Donnees initiales")
    plt.show(block=stop)
|
|
||||||
|
|
||||||
def print_1d_data(x, y, stop: bool = True):
    """Plot *y* against *x* as a line in a new figure titled "Toto".

    Args:
        x: values for the horizontal axis.
        y: values for the vertical axis.
        stop: forwarded to ``plt.show(block=...)``.
    """
    # The figure handle is not needed afterwards, pyplot tracks the current
    # figure on its own.
    plt.figure()
    plt.plot(x, y)
    plt.title("Toto")
    plt.show(block=stop)
|
|
||||||
|
|
||||||
# (model, duration) = apply_kmeans(data, k=3)
|
|
||||||
def apply_kmeans(data, k: int = 3, init="k-means++", random_state=None):
    """Fit a scikit-learn KMeans model on *data* and time the operation.

    Typical use: ``(model, duration) = apply_kmeans(data, k=3)``.

    Args:
        data: the samples to cluster, passed unchanged to ``KMeans.fit``.
        k: number of clusters (``n_clusters``).
        init: initialisation strategy forwarded to ``KMeans``.
        random_state: optional seed forwarded to ``KMeans`` so that a run can
            be reproduced; ``None`` keeps the library's default behaviour.

    Returns:
        A ``(model, duration)`` tuple where ``model`` is the fitted KMeans
        instance and ``duration`` is the elapsed time in milliseconds,
        rounded to two decimals. The timing covers both the construction of
        the model and the call to ``fit``.
    """
    # perf_counter is monotonic and high resolution, unlike time.time()
    # which can jump when the system clock is adjusted.
    tps1 = time.perf_counter()
    model_km = cluster.KMeans(n_clusters=k, init=init,
                              random_state=random_state)
    model_km.fit(data)
    tps2 = time.perf_counter()

    return (model_km, round((tps2 - tps1)*1000, 2))
|
|
||||||
|
|
||||||
def evaluate(data, model_km):
    """Collect quality indicators for a fitted k-means model.

    Args:
        data: the samples the model was fitted on.
        model_km: fitted model exposing ``labels_``, ``inertia_`` and
            ``n_iter_``.

    Returns:
        A ``(silhouette, inertia, iterations)`` tuple: the euclidean
        silhouette score of the labelling, followed by the model's
        ``inertia_`` and ``n_iter_`` attributes.
    """
    labels = model_km.labels_
    silhouette = metrics.silhouette_score(data, labels, metric='euclidean')
    return (silhouette, model_km.inertia_, model_km.n_iter_)
|
|
||||||
|
|
||||||
path = './artificial/'
|
path = './artificial/'
|
||||||
databrut_s_set2 = arff.loadarff(open(path+"s-set2.arff", 'r'))
|
dataset_name = "xclara.arff"
|
||||||
data_s_set2 = extract_data_2d(databrut_s_set2)
|
|
||||||
print_2d_data(data_s_set2, False)
|
# Extraction et visualisation d'un dataset 2D
|
||||||
|
data = extract_data_2d(path + dataset_name)
|
||||||
|
print_2d_data(data, dataset_name=dataset_name+" brute", stop=False)
|
||||||
|
|
||||||
|
# Extraction et visualisation d'un dataset 3D
|
||||||
|
data_golfball = extract_data_3d(path+"golfball.arff")
|
||||||
|
print_3d_data(data_golfball, dataset_name="golfball.arff", stop=False)
|
||||||
|
|
||||||
|
# Scaling des data 2D et visualisation
|
||||||
|
data_scaled = scale_data(data)
|
||||||
|
print_2d_data(data_scaled, dataset_name=dataset_name+" scaled", stop=False)
|
||||||
|
|
||||||
|
|
||||||
########################################################################
|
# kmeans = KMeans(n_clusters=3000, random_state=0).fit_predict(data_scaled)
|
||||||
# AUTRES VISUALISATION DU JEU DE DONNEES
|
# print_2d_data(data_s_set2_scaled, True, kmeans)
|
||||||
# (histogrammes par exemple,)
|
k = []
|
||||||
# But : essayer d'autres types de plot
|
durations = []
|
||||||
########################################################################
|
silouettes = []
|
||||||
|
inerties = []
|
||||||
databrut_golfball = arff.loadarff(open(path+"golfball.arff", 'r'))
|
iterations = []
|
||||||
data_golfball = extract_data_3d(databrut_golfball)
|
for i in range(2, 5):
|
||||||
print_3d_data(data_golfball, False)
|
(model, duration) = apply_kmeans(data_scaled, k=i, init="k-means++")
|
||||||
|
print_2d_data(data_scaled, dataset_name=dataset_name,
|
||||||
########################################################################
|
method_name="k-means", k=i, stop=False, c=model.labels_)
|
||||||
# STANDARDISER ET VISUALISER
|
(silouette, inertie, iteration) = evaluate(data_scaled, model)
|
||||||
# But : comparer des méthodes de standardisation, ...
|
k += [i]
|
||||||
########################################################################
|
durations += [duration]
|
||||||
scaler = preprocessing.StandardScaler()
|
silouettes += [silouette]
|
||||||
data_s_set2_scaled = scaler.fit_transform(data_s_set2)
|
inerties += [inertie]
|
||||||
|
iterations += [iteration]
|
||||||
|
|
||||||
#kmeans = KMeans(n_clusters=3000, random_state=0).fit_predict(data_s_set2_scaled)
|
|
||||||
#print_2d_data(data_s_set2_scaled, True, kmeans)
|
|
||||||
k=[]
|
|
||||||
durations=[]
|
|
||||||
silouettes=[]
|
|
||||||
inerties=[]
|
|
||||||
iterations=[]
|
|
||||||
for i in range(2,50):
|
|
||||||
(model, duration) = apply_kmeans(data_s_set2_scaled, k=i, init="k-means++")
|
|
||||||
print_2d_data(data_s_set2_scaled, False, model.labels_)
|
|
||||||
(silouette, inertie, iteration) = evaluate(data_s_set2_scaled, model)
|
|
||||||
k+=[i]
|
|
||||||
durations+=[duration]
|
|
||||||
silouettes+=[silouette]
|
|
||||||
inerties+=[inertie]
|
|
||||||
iterations+=[iteration]
|
|
||||||
|
|
||||||
print_1d_data(k, k, False)
|
|
||||||
print_1d_data(k, durations, False)
|
|
||||||
print_1d_data(k, silouettes, False)
|
|
||||||
print_1d_data(k, inerties, False)
|
|
||||||
print_1d_data(k, iterations, True)
|
|
||||||
|
|
||||||
|
|
||||||
|
print_1d_data(k, k, x_name="k", y_name="k", stop=False)
|
||||||
|
print_1d_data(k, durations, x_name="k", y_name="temps de calcul", stop=False)
|
||||||
|
print_1d_data(k, silouettes, x_name="k",
|
||||||
|
y_name="coeficient de silhouette", stop=False)
|
||||||
|
print_1d_data(k, inerties, x_name="k", y_name="inertie", stop=False)
|
||||||
|
print_1d_data(k, iterations, x_name="k",
|
||||||
|
y_name="nombre d'itérations", stop=True)
|
||||||
|
|
75
tp2-read-standardization-dendrogram-agglo-1val.py
Normal file
75
tp2-read-standardization-dendrogram-agglo-1val.py
Normal file
|
@ -0,0 +1,75 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Sat Nov 20 21:28:40 2021
|
||||||
|
|
||||||
|
@author: huguet
|
||||||
|
"""
|
||||||
|
import numpy as np
|
||||||
|
import time
|
||||||
|
|
||||||
|
from scipy.io import arff
|
||||||
|
from sklearn import cluster, metrics, preprocessing
|
||||||
|
from myplotlib import print_1d_data, print_2d_data, print_dendrogramme
|
||||||
|
|
||||||
|
|
||||||
|
##################################################################
|
||||||
|
# READ a data set (arff format)
|
||||||
|
|
||||||
|
# Parser un fichier de données au format arff
|
||||||
|
# datanp est un tableau (numpy) d'exemples avec pour chacun la liste
|
||||||
|
# des valeurs des features
|
||||||
|
|
||||||
|
# Note 1 :
|
||||||
|
# dans les jeux de données considérés : 2 features (dimension 2 seulement)
|
||||||
|
# t =np.array([[1,2], [3,4], [5,6], [7,8]])
|
||||||
|
#
|
||||||
|
# Note 2 :
|
||||||
|
# le jeu de données contient aussi un numéro de cluster pour chaque point
|
||||||
|
# --> IGNORER CETTE INFORMATION ....
|
||||||
|
# 2d-4c-no9.arff xclara.arff
|
||||||
|
|
||||||
|
path = './artificial/'
dataset_name = "xclara.arff"

print("-----------------------------------------------------------")
print(" Chargement du dataset : " + dataset_name)
# Use a context manager so the ARFF file is closed once it has been parsed.
with open(path + dataset_name, 'r') as arff_file:
    databrut = arff.loadarff(arff_file)
# Keep only the first two attributes of every record (the 2D coordinates).
datanp = np.array([[x[0], x[1]] for x in databrut[0]])
print_2d_data(datanp, dataset_name=dataset_name + " brutes", stop=False)

print("-----------------------------------------------------------")
print(" Mise à l'échelle")
scaler = preprocessing.StandardScaler().fit(datanp)
data_scaled = scaler.transform(datanp)
print_2d_data(data_scaled, dataset_name=dataset_name + " scaled", stop=False)

# Linkage types: single, average, complete, ward
linkage = "complete"
print("-----------------------------------------------------------")
print(" Création du dendrogramme : linkage " + linkage)
print_dendrogramme(data_scaled, dataset_name=dataset_name,
                   linkage=linkage, stop=False)

k = 10
print("-----------------------------------------------------------")
print(" Création clusters : linkage " + linkage + ", k=" + str(k))
# perf_counter is monotonic, which makes it the right clock for durations.
tps3 = time.perf_counter()
# The distance is left to its default (euclidean): the `affinity` keyword is
# deprecated and then removed in recent scikit-learn releases, whereas
# omitting it behaves the same on old and new versions.
model_scaled = cluster.AgglomerativeClustering(
    n_clusters=k, linkage=linkage)
model_scaled.fit(data_scaled)
tps4 = time.perf_counter()
# Report the measured duration, which was previously computed but never used.
print("Temps de calcul (ms) : ", round((tps4 - tps3)*1000, 2))
print_2d_data(data_scaled, dataset_name=dataset_name,
              method_name="Agglomératif " + linkage, k=k,
              stop=False, c=model_scaled.labels_)

# Some evaluation metrics
silh = metrics.silhouette_score(
    data_scaled, model_scaled.labels_, metric='euclidean')
print("Coefficient de silhouette : ", silh)
|
||||||
|
|
||||||
|
########################################################################
|
||||||
|
# TRY : parameters for dendrogram and hierarchical clustering
|
||||||
|
# EVALUATION : with several metrics (for several number of clusters)
|
||||||
|
########################################################################
|
Loading…
Reference in a new issue