Mise en place d'une librairie d'affichage et de gestion des data
This commit is contained in:
parent
1ac7ab4212
commit
d29db6660c
5 changed files with 239 additions and 128 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -0,0 +1 @@
|
|||
__pycache__/
|
39
mydatalib.py
Normal file
39
mydatalib.py
Normal file
|
@ -0,0 +1,39 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Dec 3 16:29:12 2021
|
||||
|
||||
@author: pfaure
|
||||
"""
|
||||
from scipy.io import arff
|
||||
import numpy as np
|
||||
import time
|
||||
from sklearn import cluster, metrics, preprocessing
|
||||
|
||||
|
||||
def extract_data_2d(data_path):
    """Load an ARFF file and return its first two attributes as an array.

    Parameters
    ----------
    data_path
        Path of the ARFF file to read.

    Returns
    -------
    numpy.ndarray
        One row per record of the file, each row holding the record's
        first and second values. Any further attribute is ignored.
    """
    # loadarff parses the whole file eagerly, so the handle can be closed
    # as soon as the call returns instead of being leaked.
    with open(data_path, 'r') as arff_file:
        databrut = arff.loadarff(arff_file)
    return np.array([[x[0], x[1]] for x in databrut[0]])
|
||||
|
||||
|
||||
def extract_data_3d(data_path):
    """Load an ARFF file and return its first three attributes as an array.

    Parameters
    ----------
    data_path
        Path of the ARFF file to read.

    Returns
    -------
    numpy.ndarray
        One row per record of the file, each row holding the record's
        first, second and third values. Any further attribute is ignored.
    """
    # loadarff parses the whole file eagerly, so the handle can be closed
    # as soon as the call returns instead of being leaked.
    with open(data_path, 'r') as arff_file:
        databrut = arff.loadarff(arff_file)
    return np.array([[x[0], x[1], x[2]] for x in databrut[0]])
|
||||
|
||||
|
||||
def scale_data(data):
    """Return ``data`` transformed by a freshly fitted ``StandardScaler``.

    A new scaler is created on every call, fitted on ``data`` and applied
    to that same ``data``; the scaler itself is not returned.
    """
    return preprocessing.StandardScaler().fit_transform(data)
|
||||
|
||||
|
||||
def apply_kmeans(data, k: int = 3, init="k-means++", random_state=None):
    """Fit a k-means model on ``data`` and time the fit.

    Parameters
    ----------
    data
        Samples passed to ``KMeans.fit``.
    k
        Number of clusters (``n_clusters``).
    init
        Initialisation method forwarded to ``KMeans``.
    random_state
        Optional seed forwarded to ``KMeans`` to make runs reproducible.
        The default ``None`` keeps the previous, non-seeded behaviour.

    Returns
    -------
    tuple
        ``(model_km, duration)`` where ``model_km`` is the fitted model and
        ``duration`` is the elapsed time in milliseconds, rounded to two
        decimals. The timed span covers model construction and fitting.
    """
    # perf_counter is monotonic and high-resolution, unlike time.time()
    # which can jump when the system clock is adjusted.
    tps1 = time.perf_counter()
    model_km = cluster.KMeans(n_clusters=k, init=init,
                              random_state=random_state)
    model_km.fit(data)
    tps2 = time.perf_counter()
    return (model_km, round((tps2 - tps1) * 1000, 2))
|
||||
|
||||
|
||||
def evaluate(data, model_km):
    """Collect quality indicators for a fitted clustering model.

    Returns a tuple ``(silhouette, inertia, n_iter)``: the euclidean
    silhouette score of ``model_km.labels_`` on ``data``, followed by the
    model's ``inertia_`` and ``n_iter_`` attributes.
    """
    labels = model_km.labels_
    silhouette = metrics.silhouette_score(data, labels, metric='euclidean')
    return silhouette, model_km.inertia_, model_km.n_iter_
|
83
myplotlib.py
Normal file
83
myplotlib.py
Normal file
|
@ -0,0 +1,83 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Dec 3 15:28:19 2021
|
||||
|
||||
@author: pfaure
|
||||
"""
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import scipy.cluster.hierarchy as shc
|
||||
|
||||
|
||||
def print_3d_data(data,
                  dataset_name: str = "",
                  method_name: str = "",
                  k: int = 0,
                  stop: bool = True,
                  c=None):
    """Show a 3D scatter plot of the first three columns of ``data``.

    Parameters
    ----------
    data
        Two-dimensionally indexable array; columns 0, 1 and 2 are plotted
        on the X, Y and Z axes.
    dataset_name
        Name of the data set, used in the figure title.
    method_name
        Name of the clustering method, used in the title when ``c`` is set.
    k
        Number of clusters, used in the title when ``c`` is set.
    stop
        Forwarded to ``plt.show(block=...)``.
    c
        Optional point colours (for example cluster labels). When ``None``
        the raw data is plotted with the "initial data" title.
    """
    f0 = data[:, 0]  # every element of the first column
    f1 = data[:, 1]  # every element of the second column
    f2 = data[:, 2]  # every element of the third column
    fig = plt.figure()
    # Figure.gca(projection=...) was deprecated in Matplotlib 3.4 and later
    # removed; add_subplot is the supported way to create the 3D axes.
    ax = fig.add_subplot(projection='3d')
    if c is None:
        ax.scatter(f0, f1, f2, label='Courbe',
                   marker='d')
        plt.title("Données initiales : " + dataset_name)
    else:
        ax.scatter(f0, f1, f2, c=c, label='Courbe',
                   marker='d')
        plt.title("Graphique de " + str(k) + " clusters avec la méthode " +
                  method_name + " sur le jeu de données " + dataset_name)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    plt.tight_layout()
    plt.show(block=stop)
|
||||
|
||||
|
||||
def print_2d_data(data,
                  dataset_name: str = "",
                  method_name: str = "",
                  k: int = 0,
                  stop: bool = True,
                  c=None):
    """Show a 2D scatter plot of the first two columns of ``data``.

    Without ``c`` the points are drawn uncoloured under an "initial data"
    title built from ``dataset_name``. With ``c`` (for example cluster
    labels) the points are coloured by it and the title mentions ``k``,
    ``method_name`` and ``dataset_name``. ``stop`` is forwarded to
    ``plt.show(block=...)``.
    """
    xs, ys = data[:, 0], data[:, 1]
    plt.figure()
    if c is not None:
        plt.scatter(xs, ys, c=c, s=8)
        titre = ("Graphique de " + str(k) + " clusters avec la méthode " +
                 method_name + " sur le jeu de données " + dataset_name)
    else:
        plt.scatter(xs, ys, s=8)
        titre = "Données initiales : " + dataset_name
    plt.title(titre)
    plt.show(block=stop)
|
||||
|
||||
|
||||
def print_1d_data(x, y, x_name: str = "toto",
                  y_name: str = "tata",
                  stop: bool = True):
    """Plot ``y`` against ``x`` as a line in a new figure.

    The title reads ``<y_name> = f(<x_name>)``; ``stop`` is forwarded to
    ``plt.show(block=...)``.
    """
    plt.figure()
    plt.plot(x, y)
    titre = y_name + " = f(" + x_name + ")"
    plt.title(titre)
    plt.show(block=stop)
|
||||
|
||||
|
||||
def print_dendrogramme(data,
                       dataset_name: str = "",
                       linkage: str = "",
                       stop: bool = True):
    """Compute a hierarchical linkage of ``data`` and draw its dendrogram.

    ``linkage`` is passed to ``scipy.cluster.hierarchy.linkage`` as the
    method argument and also appears in the title next to ``dataset_name``.
    ``stop`` is forwarded to ``plt.show(block=...)``.
    """
    linkage_matrix = shc.linkage(data, linkage)

    plt.figure(figsize=(12, 12))
    dendrogram_options = {
        'orientation': 'top',
        'distance_sort': 'descending',
        'show_leaf_counts': False,
    }
    shc.dendrogram(linkage_matrix, **dendrogram_options)
    titre = ("Dendrogramme du jeu de données " +
             dataset_name + " avec le linkage " + linkage)
    plt.title(titre)
    plt.show(block=stop)
|
|
@ -2,140 +2,53 @@
|
|||
"""
|
||||
Created on Fri Nov 19 23:08:23 2021
|
||||
|
||||
@author: huguet
|
||||
@author: pfaure
|
||||
"""
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from mpl_toolkits.mplot3d import axes3d # Fonction pour la 3D
|
||||
import time
|
||||
|
||||
from scipy.io import arff
|
||||
from sklearn import cluster
|
||||
from sklearn import metrics
|
||||
from sklearn import preprocessing
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
##################################################################
|
||||
# READ a data set (arff format)
|
||||
from myplotlib import print_1d_data, print_2d_data, print_3d_data
|
||||
from mydatalib import extract_data_2d, extract_data_3d, scale_data
|
||||
from mydatalib import apply_kmeans, evaluate
|
||||
|
||||
# Parser un fichier de données au format arff
|
||||
# datanp est un tableau (numpy) d'exemples avec pour chacun la liste
|
||||
# des valeurs des features
|
||||
|
||||
# Note 1 :
|
||||
# dans les jeux de données considérés : 2 features (dimension 2 seulement)
|
||||
# t =np.array([[1,2], [3,4], [5,6], [7,8]])
|
||||
#
|
||||
# Note 2 :
|
||||
# le jeu de données contient aussi un numéro de cluster pour chaque point
|
||||
# --> IGNORER CETTE INFORMATION ....
|
||||
# 2d-4c-no9.arff
|
||||
|
||||
def extract_data_2d(databrut):
    """Return the first two values of every record as a numpy array.

    ``databrut`` is indexed with ``[0]`` to obtain the iterable of records
    (the shape returned by ``arff.loadarff``); each record contributes one
    row ``[record[0], record[1]]``.
    """
    rows = []
    for record in databrut[0]:
        rows.append([record[0], record[1]])
    return np.array(rows)
|
||||
|
||||
def extract_data_3d(databrut):
    """Return the first three values of every record as a numpy array.

    ``databrut`` is indexed with ``[0]`` to obtain the iterable of records
    (the shape returned by ``arff.loadarff``); each record contributes one
    row ``[record[0], record[1], record[2]]``.
    """
    rows = []
    for record in databrut[0]:
        rows.append([record[0], record[1], record[2]])
    return np.array(rows)
|
||||
|
||||
def print_3d_data(data, stop: bool = True, c=None):
    """Show a 3D scatter plot of the first three columns of ``data``.

    Parameters
    ----------
    data
        Two-dimensionally indexable array; columns 0, 1 and 2 are plotted
        on the X, Y and Z axes.
    stop
        Forwarded to ``plt.show(block=...)``.
    c
        Optional point colours (for example cluster labels).
    """
    print("---------------------------------------")
    print("Affichage données initiales            ")
    f0 = data[:, 0]  # every element of the first column
    f1 = data[:, 1]  # every element of the second column
    f2 = data[:, 2]  # every element of the third column
    fig = plt.figure()
    # Figure.gca(projection=...) was deprecated in Matplotlib 3.4 and later
    # removed; add_subplot is the supported way to create the 3D axes.
    ax = fig.add_subplot(projection='3d')
    if c is None:
        ax.scatter(f0, f1, f2, label='Courbe', marker='d')
    else:
        ax.scatter(f0, f1, f2, c=c, label='Courbe', marker='d')
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    plt.tight_layout()
    plt.title("Donnees initiales")
    plt.show(block=stop)
|
||||
|
||||
def print_2d_data(data, stop: bool = True, c=None):
    """Show a 2D scatter plot of the first two columns of ``data``.

    Prints a two-line banner, opens a new figure and draws the points,
    coloured by ``c`` when it is given. ``stop`` is forwarded to
    ``plt.show(block=...)``.
    """
    print("---------------------------------------")
    print("Affichage données initiales            ")
    xs, ys = data[:, 0], data[:, 1]
    plt.figure()
    if c is not None:
        plt.scatter(xs, ys, c=c, s=8)
    else:
        plt.scatter(xs, ys, s=8)
    plt.title("Donnees initiales")
    plt.show(block=stop)
|
||||
|
||||
def print_1d_data(x, y, stop: bool = True):
    """Plot ``y`` against ``x`` as a line in a new figure titled "Toto".

    ``stop`` is forwarded to ``plt.show(block=...)``.
    """
    plt.figure()
    plt.plot(x, y)
    plt.title("Toto")
    plt.show(block=stop)
|
||||
|
||||
# (model, duration) = apply_kmeans(data, k=3)
|
||||
def apply_kmeans(data, k: int = 3, init="k-means++", random_state=None):
    """Run k-means for a given number of clusters and time the fit.

    Parameters
    ----------
    data
        Samples passed to ``KMeans.fit``.
    k
        Number of clusters (``n_clusters``).
    init
        Initialisation method forwarded to ``KMeans``.
    random_state
        Optional seed forwarded to ``KMeans`` to make runs reproducible.
        The default ``None`` keeps the previous, non-seeded behaviour.

    Returns
    -------
    tuple
        ``(model_km, duration)`` where ``model_km`` is the fitted model and
        ``duration`` is the elapsed time in milliseconds, rounded to two
        decimals. The timed span covers model construction and fitting.
    """
    # perf_counter is monotonic and high-resolution, unlike time.time()
    # which can jump when the system clock is adjusted.
    tps1 = time.perf_counter()
    model_km = cluster.KMeans(n_clusters=k, init=init,
                              random_state=random_state)
    model_km.fit(data)
    tps2 = time.perf_counter()

    return (model_km, round((tps2 - tps1) * 1000, 2))
|
||||
|
||||
def evaluate(data, model_km):
    """Collect quality indicators for a fitted clustering model.

    Returns a tuple ``(silhouette, inertia, n_iter)``: the euclidean
    silhouette score of ``model_km.labels_`` on ``data``, followed by the
    model's ``inertia_`` and ``n_iter_`` attributes.
    """
    score = metrics.silhouette_score(
        data, model_km.labels_, metric='euclidean')
    inertia = model_km.inertia_
    n_iterations = model_km.n_iter_
    return (score, inertia, n_iterations)
|
||||
|
||||
path = './artificial/'
|
||||
databrut_s_set2 = arff.loadarff(open(path+"s-set2.arff", 'r'))
|
||||
data_s_set2 = extract_data_2d(databrut_s_set2)
|
||||
print_2d_data(data_s_set2, False)
|
||||
dataset_name = "xclara.arff"
|
||||
|
||||
# Extraction et visualisation d'un dataset 2D
|
||||
data = extract_data_2d(path + dataset_name)
|
||||
print_2d_data(data, dataset_name=dataset_name+" brute", stop=False)
|
||||
|
||||
# Extraction et visualisation d'un dataset 3D
|
||||
data_golfball = extract_data_3d(path+"golfball.arff")
|
||||
print_3d_data(data_golfball, dataset_name="golfball.arff", stop=False)
|
||||
|
||||
# Scaling des data 2D et visualisation
|
||||
data_scaled = scale_data(data)
|
||||
print_2d_data(data_scaled, dataset_name=dataset_name+" scaled", stop=False)
|
||||
|
||||
|
||||
########################################################################
|
||||
# AUTRES VISUALISATION DU JEU DE DONNEES
|
||||
# (histogrammes par exemple,)
|
||||
# But : essayer d'autres types de plot
|
||||
########################################################################
|
||||
|
||||
databrut_golfball = arff.loadarff(open(path+"golfball.arff", 'r'))
|
||||
data_golfball = extract_data_3d(databrut_golfball)
|
||||
print_3d_data(data_golfball, False)
|
||||
|
||||
########################################################################
|
||||
# STANDARDISER ET VISUALISER
|
||||
# But : comparer des méthodes de standardisation, ...
|
||||
########################################################################
|
||||
scaler = preprocessing.StandardScaler()
|
||||
data_s_set2_scaled = scaler.fit_transform(data_s_set2)
|
||||
|
||||
|
||||
#kmeans = KMeans(n_clusters=3000, random_state=0).fit_predict(data_s_set2_scaled)
|
||||
# kmeans = KMeans(n_clusters=3000, random_state=0).fit_predict(data_scaled)
|
||||
# print_2d_data(data_s_set2_scaled, True, kmeans)
|
||||
k = []
|
||||
durations = []
|
||||
silouettes = []
|
||||
inerties = []
|
||||
iterations = []
|
||||
for i in range(2,50):
|
||||
(model, duration) = apply_kmeans(data_s_set2_scaled, k=i, init="k-means++")
|
||||
print_2d_data(data_s_set2_scaled, False, model.labels_)
|
||||
(silouette, inertie, iteration) = evaluate(data_s_set2_scaled, model)
|
||||
for i in range(2, 5):
|
||||
(model, duration) = apply_kmeans(data_scaled, k=i, init="k-means++")
|
||||
print_2d_data(data_scaled, dataset_name=dataset_name,
|
||||
method_name="k-means", k=i, stop=False, c=model.labels_)
|
||||
(silouette, inertie, iteration) = evaluate(data_scaled, model)
|
||||
k += [i]
|
||||
durations += [duration]
|
||||
silouettes += [silouette]
|
||||
inerties += [inertie]
|
||||
iterations += [iteration]
|
||||
|
||||
print_1d_data(k, k, False)
|
||||
print_1d_data(k, durations, False)
|
||||
print_1d_data(k, silouettes, False)
|
||||
print_1d_data(k, inerties, False)
|
||||
print_1d_data(k, iterations, True)
|
||||
|
||||
|
||||
print_1d_data(k, k, x_name="k", y_name="k", stop=False)
|
||||
print_1d_data(k, durations, x_name="k", y_name="temps de calcul", stop=False)
|
||||
print_1d_data(k, silouettes, x_name="k",
|
||||
y_name="coeficient de silhouette", stop=False)
|
||||
print_1d_data(k, inerties, x_name="k", y_name="inertie", stop=False)
|
||||
print_1d_data(k, iterations, x_name="k",
|
||||
y_name="nombre d'itérations", stop=True)
|
||||
|
|
75
tp2-read-standardization-dendrogram-agglo-1val.py
Normal file
75
tp2-read-standardization-dendrogram-agglo-1val.py
Normal file
|
@ -0,0 +1,75 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat Nov 20 21:28:40 2021
|
||||
|
||||
@author: huguet
|
||||
"""
|
||||
import numpy as np
|
||||
import time
|
||||
|
||||
from scipy.io import arff
|
||||
from sklearn import cluster, metrics, preprocessing
|
||||
from myplotlib import print_1d_data, print_2d_data, print_dendrogramme
|
||||
|
||||
|
||||
##################################################################
|
||||
# READ a data set (arff format)
|
||||
|
||||
# Parser un fichier de données au format arff
|
||||
# datanp est un tableau (numpy) d'exemples avec pour chacun la liste
|
||||
# des valeurs des features
|
||||
|
||||
# Note 1 :
|
||||
# dans les jeux de données considérés : 2 features (dimension 2 seulement)
|
||||
# t =np.array([[1,2], [3,4], [5,6], [7,8]])
|
||||
#
|
||||
# Note 2 :
|
||||
# le jeu de données contient aussi un numéro de cluster pour chaque point
|
||||
# --> IGNORER CETTE INFORMATION ....
|
||||
# 2d-4c-no9.arff xclara.arff
|
||||
|
||||
# Location and name of the ARFF data set processed by this script.
path = './artificial/'
dataset_name = "xclara.arff"

print("-----------------------------------------------------------")
print(" Chargement du dataset : " + dataset_name)
# loadarff parses the file eagerly, so the handle is closed right after
# the call instead of being leaked.
with open(path + dataset_name, 'r') as arff_file:
    databrut = arff.loadarff(arff_file)
# Keep only the first two values of every record.
datanp = np.array([[x[0], x[1]] for x in databrut[0]])
print_2d_data(datanp, dataset_name=dataset_name + " brutes", stop=False)

print("-----------------------------------------------------------")
print(" Mise à l'échelle")
scaler = preprocessing.StandardScaler().fit(datanp)
data_scaled = scaler.transform(datanp)
print_2d_data(data_scaled, dataset_name=dataset_name + " scaled", stop=False)

# Linkage types: single, average, complete, ward
linkage = "complete"
print("-----------------------------------------------------------")
print(" Création du dendrogramme : linkage " + linkage)
print_dendrogramme(data_scaled, dataset_name=dataset_name,
                   linkage=linkage, stop=False)

k = 10
print("-----------------------------------------------------------")
print(" Création clusters : linkage " + linkage + ", k=" + str(k))
# perf_counter is monotonic, which makes it suitable for measuring a span.
tps3 = time.perf_counter()
# The distance is left at its default (euclidean): the `affinity` keyword
# was deprecated in scikit-learn 1.2 and removed in 1.4 in favour of
# `metric`, so omitting it works on both old and recent releases.
model_scaled = cluster.AgglomerativeClustering(
    n_clusters=k, linkage=linkage)
model_scaled.fit(data_scaled)
tps4 = time.perf_counter()
# Report the measured fit time, which was previously computed but unused.
print("Temps de calcul (ms) : ", round((tps4 - tps3) * 1000, 2))
print_2d_data(data_scaled, dataset_name=dataset_name,
              method_name="Agglomératif " + linkage, k=k,
              stop=False, c=model_scaled.labels_)

# Some evaluation metrics
silh = metrics.silhouette_score(
    data_scaled, model_scaled.labels_, metric='euclidean')
print("Coefficient de silhouette : ", silh)
|
||||
|
||||
########################################################################
|
||||
# TRY : parameters for dendrogram and hierarchical clustering
|
||||
# EVALUATION : with several metrics (for several number of clusters)
|
||||
########################################################################
|
Loading…
Reference in a new issue