start tp6
This commit is contained in:
parent
b9da01d3e0
commit
42c641d044
3 changed files with 146 additions and 30 deletions
|
@ -9,7 +9,7 @@ from scipy.io import arff
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import time
|
import time
|
||||||
from sklearn import cluster, metrics, preprocessing
|
from sklearn import cluster, metrics, preprocessing
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
def extract_data_2d(data_path):
|
def extract_data_2d(data_path):
|
||||||
databrut = arff.loadarff(open(data_path + ".arff", 'r'))
|
databrut = arff.loadarff(open(data_path + ".arff", 'r'))
|
||||||
|
@ -30,6 +30,11 @@ def extract_data_txt(data_path):
|
||||||
return np.array(ret)
|
return np.array(ret)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_data_csv(data_path: str, first_col: int, last_col: int):
    """Load ``<data_path>.csv`` and return a contiguous range of its columns.

    Parameters
    ----------
    data_path : str
        Path to the CSV file WITHOUT the ``.csv`` extension (it is appended).
    first_col : int
        Positional index of the first column to keep.
    last_col : int
        Positional index of the last column to keep (inclusive).

    Returns
    -------
    pandas.DataFrame
        All rows, columns ``first_col`` through ``last_col``. The result is
        always two-dimensional so it can be fed to scalers and clustering
        estimators that reject 1-D input.
    """
    data = pd.read_csv(data_path + ".csv")
    # The previous version ignored first_col and returned the single column
    # last_col as a 1-D Series. iloc slices exclude their stop index, so add
    # one to keep last_col; -1 + 1 would give 0 (an empty slice), so "up to
    # the last column" is expressed with an open-ended stop instead.
    stop = None if last_col == -1 else last_col + 1
    return data.iloc[:, first_col:stop]
|
||||||
|
|
||||||
|
|
||||||
def scale_data(data):
    """Standardize ``data`` with a freshly created ``StandardScaler``.

    The scaler is fitted on ``data`` and immediately applied to it; the
    transformed values returned by ``fit_transform`` are handed back to the
    caller. The fitted scaler itself is not kept.
    """
    return preprocessing.StandardScaler().fit_transform(data)
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
Ville,JANVIERp,FEVRIERp,MARSp,AVRILp,MAIp,JUINp,JUILLETp,AOUTp,SEPTEMBREp,OCTOBREp,NOVEMBREp,DECEMBREp,Précipitations annuelles,JANVIERnb.j.pl,FEVRIERnb.j.pl,MARSnb.j.pl,AVRILnb.j.pl,MAInb.j.pl,JUINnb.j.pl,JUILLETnb.j.pl,AOUTnb.j.pl,SEPTEMBREnb.j.pl,OCTOBREnb.j.pl,NOVEMBREnb.j.pl,DECEMBREnb.j.pl,Nombre annuel de jours de pluie,Température moyenne annuelle,Amplitude annuelle des températures,Insolation annuelle,Latitude,Longitude,Précipitations de mai à aout,Précipitations sept-oct,Géographie
|
Ville,JANVIERp,FEVRIERp,MARSp,AVRILp,MAIp,JUINp,JUILLETp,AOUTp,SEPTEMBREp,OCTOBREp,NOVEMBREp,DECEMBREp,Précipitations annuelles,JANVIERnb.j.pl,FEVRIERnb.j.pl,MARSnb.j.pl,AVRILnb.j.pl,MAInb.j.pl,JUINnb.j.pl,JUILLETnb.j.pl,AOUTnb.j.pl,SEPTEMBREnb.j.pl,OCTOBREnb.j.pl,NOVEMBREnb.j.pl,DECEMBREnb.j.pl,Nombre annuel de jours de pluie,Température moyenne annuelle,Amplitude annuelle des températures,Insolation annuelle,Latitude,Longitude,Précipitations de mai à aout,Précipitations sept-oct,Géographie
|
||||||
Ajaccio,78.00,69.00,51.00,39.00,43.00,23.00,10.00,15.00,43.00,81.00,105.00,96.00,653.00,12.00,10.00,9.00,9.00,8.00,4.00,1.00,2.00,6.00,10.00,11.00,13.00,95.00,14.71,14.50,2811.00,41.55,8.44,13.90,25.80,Sud
|
Ajaccio,78.00,69.00,51.00,39.00,43.00,23.00,10.00,15.00,43.00,81.00,105.00,96.00,653.00,12.00,10.00,9.00,9.00,8.00,4.00,1.00,2.00,6.00,10.00,11.00,13.00,95.00,14.71,14.50,2811.00,41.55,8.44,13.90,25.80,Sud
|
||||||
Angers,65.00,50.00,60.00,45.00,50.00,55.00,35.00,60.00,55.00,65.00,80.00,70.00,690.00,16.00,13.00,12.00,12.00,13.00,10.00,11.00,11.00,12.00,13.00,15.00,16.00,154.00,11.28,14.50,1899.00,47.28,-0.33,29.00,30.80,Ouest
|
Angers,65.00,50.00,60.00,45.00,50.00,55.00,35.00,60.00,55.00,65.00,80.00,70.00,690.00,16.00,13.00,12.00,12.00,13.00,10.00,11.00,11.00,12.00,13.00,15.00,16.00,154.00,11.28,14.50,1899.00,47.28,-0.33,29.00,30.80,Ouest
|
||||||
Angouleme,79.00,68.00,64.00,62.00,70.00,58.00,53.00,66.00,69.00,70.00,79.00,88.00,826.00,16.00,14.00,13.00,12.00,14.00,11.00,12.00,12.00,12.00,13.00,15.00,16.00,160.00,12.02,14.90,1989.00,45.39,0.09,29.90,31.00,Ouest
|
Angouleme,79.00,68.00,64.00,62.00,70.00,58.00,53.00,66.00,69.00,70.00,79.00,88.00,826.00,16.00,14.00,13.00,12.00,14.00,11.00,12.00,12.00,12.00,13.00,15.00,16.00,160.00,12.02,14.90,1989.00,45.39,0.09,29.90,31.00,Ouest
|
||||||
|
|
|
|
@ -5,48 +5,123 @@ Created on Wed Dec 8 16:07:28 2021
|
||||||
|
|
||||||
@author: pfaure
|
@author: pfaure
|
||||||
"""
|
"""
|
||||||
|
from numpy import arange
|
||||||
from sklearn.neighbors import NearestNeighbors
|
from sklearn.neighbors import NearestNeighbors
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from myplotlib import print_1d_data, print_2d_data
|
from myplotlib import print_1d_data, print_2d_data
|
||||||
from mydatalib import extract_data_2d, scale_data, apply_DBSCAN, evaluate
|
from mydatalib import scale_data, apply_DBSCAN, evaluate, extract_data_csv, apply_kmeans, \
|
||||||
|
apply_agglomerative_clustering, apply_mean_shift
|
||||||
|
|
||||||
path = './artificial/'
|
path = './new-data/'
|
||||||
dataset_name = "banana"
|
dataset_name = "pluie"
|
||||||
save = True
|
save = True
|
||||||
|
eps = 0.8
|
||||||
|
|
||||||
|
|
||||||
print("-----------------------------------------------------------")
|
print("-----------------------------------------------------------")
|
||||||
print(" Chargement du dataset : " + dataset_name)
|
print(" Chargement du dataset : " + dataset_name)
|
||||||
data = extract_data_2d(path + dataset_name)
|
data = extract_data_csv(path + dataset_name, 1, 5)
|
||||||
print_2d_data(data, dataset_name=dataset_name +
|
|
||||||
"_brutes", stop=False, save=save)
|
print(data)
|
||||||
|
|
||||||
print("-----------------------------------------------------------")
|
print("-----------------------------------------------------------")
|
||||||
print(" Mise à l'échelle")
|
print(" Mise à l'échelle")
|
||||||
data_scaled = scale_data(data)
|
data_scaled = scale_data(data)
|
||||||
print_2d_data(data_scaled, dataset_name=dataset_name +
|
|
||||||
"_scaled", stop=False, save=save)
|
k_max = 10
|
||||||
|
print("-----------------------------------------------------------")
|
||||||
|
print(" Application de k-means")
|
||||||
|
# Application de k-means pour plusieurs valeurs de k
|
||||||
|
# et evaluation de la solution
|
||||||
|
k = []
|
||||||
|
durations = []
|
||||||
|
silouettes = []
|
||||||
|
daviess = []
|
||||||
|
calinskis = []
|
||||||
|
inerties = []
|
||||||
|
iterations = []
|
||||||
|
for i in range(2, k_max):
|
||||||
|
# Application de k-means
|
||||||
|
(model, duration) = apply_kmeans(data_scaled, k=i, init="k-means++")
|
||||||
|
# Evaluation de la solution de clustering
|
||||||
|
(silouette, davies, calinski) = evaluate(data_scaled, model)
|
||||||
|
# Enregistrement des valeurs
|
||||||
|
k += [i]
|
||||||
|
durations += [duration]
|
||||||
|
silouettes += [silouette]
|
||||||
|
daviess += [davies]
|
||||||
|
calinskis += [calinski]
|
||||||
|
inerties += [model.inertia_]
|
||||||
|
iterations += [model.n_iter_]
|
||||||
|
|
||||||
|
# Affichage des résultats
|
||||||
|
print_1d_data(k, k, x_name="k", y_name="k", dataset_name=dataset_name,
|
||||||
|
method_name="k-means", stop=False, save=save)
|
||||||
|
print_1d_data(k, durations, x_name="k", y_name="temps_de_calcul", y_unit="ms",
|
||||||
|
dataset_name=dataset_name, method_name="k-means",
|
||||||
|
stop=False, save=save)
|
||||||
|
print_1d_data(k, silouettes, x_name="k", y_name="coeficient_de_silhouette",
|
||||||
|
dataset_name=dataset_name, method_name="k-means",
|
||||||
|
stop=False, save=save)
|
||||||
|
print_1d_data(k, daviess, x_name="k", y_name="coeficient_de_Davies",
|
||||||
|
dataset_name=dataset_name, method_name="k-means",
|
||||||
|
stop=False, save=save)
|
||||||
|
print_1d_data(k, calinskis, x_name="k", y_name="coeficient_de_Calinski",
|
||||||
|
dataset_name=dataset_name, method_name="k-means",
|
||||||
|
stop=False, save=save)
|
||||||
|
print_1d_data(k, inerties, x_name="k", y_name="inertie",
|
||||||
|
dataset_name=dataset_name, method_name="k-means",
|
||||||
|
stop=False, save=save)
|
||||||
|
print_1d_data(k, iterations, x_name="k", y_name="nombre_d_iterations",
|
||||||
|
dataset_name=dataset_name, method_name="k-means",
|
||||||
|
stop=True, save=save)
|
||||||
|
|
||||||
print("-----------------------------------------------------------")
|
print("-----------------------------------------------------------")
|
||||||
print(" Calcul du voisinage")
|
print(" Création clusters : agglomerative ")
|
||||||
n = 50
|
# Application du clustering agglomeratif pour plusieurs valeurs de k
|
||||||
neighbors = NearestNeighbors(n_neighbors=n)
|
# et evaluation de la solution
|
||||||
neighbors.fit(data)
|
linkage = "ward"
|
||||||
distances, indices = neighbors.kneighbors(data)
|
k = []
|
||||||
distances = list(map(lambda x: sum(x[1:n-1])/(len(x)-1), distances))
|
durations = []
|
||||||
print(distances)
|
silouettes = []
|
||||||
distances = np.sort(distances, axis=0)
|
daviess = []
|
||||||
print(distances)
|
calinskis = []
|
||||||
print_1d_data(distances, range(1, len(distances)+1), x_name="distance_moyenne",
|
for i in range(2, k_max):
|
||||||
y_name="nombre_de_points", stop=False, save=False)
|
# Application du clustering agglomeratif
|
||||||
|
(model, duration) = apply_agglomerative_clustering(
|
||||||
|
data_scaled, k=i, linkage=linkage)
|
||||||
|
# Evaluation de la solution de clustering
|
||||||
|
(silouette, davies, calinski) = evaluate(data_scaled, model)
|
||||||
|
# Enregistrement des valeurs
|
||||||
|
k += [i]
|
||||||
|
durations += [duration]
|
||||||
|
silouettes += [silouette]
|
||||||
|
daviess += [davies]
|
||||||
|
calinskis += [calinski]
|
||||||
|
|
||||||
|
# Affichage des résultats
|
||||||
|
print_1d_data(k, k, x_name="k", y_name="k", dataset_name=dataset_name,
|
||||||
|
method_name="agglomerative_" + linkage, stop=False, save=save)
|
||||||
|
print_1d_data(k, durations, x_name="k", y_name="temps_de_calcul", y_unit="ms",
|
||||||
|
dataset_name=dataset_name,
|
||||||
|
method_name="agglomerative_" + linkage, stop=False, save=save)
|
||||||
|
print_1d_data(k, silouettes, x_name="k", y_name="coeficient_de_silhouette",
|
||||||
|
dataset_name=dataset_name,
|
||||||
|
method_name="agglomerative_" + linkage, stop=False, save=save)
|
||||||
|
print_1d_data(k, daviess, x_name="k", y_name="coeficient_de_Davies",
|
||||||
|
dataset_name=dataset_name,
|
||||||
|
method_name="agglomerative_" + linkage, stop=False, save=save)
|
||||||
|
print_1d_data(k, calinskis, x_name="k", y_name="coeficient_de_Calinski",
|
||||||
|
dataset_name=dataset_name,
|
||||||
|
method_name="agglomerative_" + linkage, stop=False, save=save)
|
||||||
|
|
||||||
|
min_sample_max = 30
|
||||||
print("-----------------------------------------------------------")
|
print("-----------------------------------------------------------")
|
||||||
print(" Création clusters : DBSCAN")
|
print(" Création clusters : DBSCAN")
|
||||||
params = []
|
params = []
|
||||||
for i in range(1, 20):
|
for i in range(1, min_sample_max):
|
||||||
params += [(i/100, 5)]
|
params += [(eps, i)]
|
||||||
durations = []
|
durations = []
|
||||||
silouettes = []
|
silouettes = []
|
||||||
daviess = []
|
daviess = []
|
||||||
|
@ -55,13 +130,8 @@ clusters = []
|
||||||
noise_points = []
|
noise_points = []
|
||||||
for (distance, min_pts) in params:
|
for (distance, min_pts) in params:
|
||||||
# Apply DBSCAN with the current (eps, min_pts) pair
|
# Apply DBSCAN with the current (eps, min_pts) pair
|
||||||
(model, duration) = apply_DBSCAN(data, distance, min_pts)
|
(model, duration) = apply_DBSCAN(data_scaled, distance, min_pts)
|
||||||
cl_pred = model.labels_
|
cl_pred = model.labels_
|
||||||
# Affichage des clusters# Affichage des clusters
|
|
||||||
print_2d_data(data_scaled, dataset_name=dataset_name,
|
|
||||||
method_name="DBSCAN-Eps=" +
|
|
||||||
str(distance)+"-Minpt="+str(min_pts),
|
|
||||||
k=0, stop=False, save=save, c=cl_pred)
|
|
||||||
# Evaluation de la solution de clustering
|
# Evaluation de la solution de clustering
|
||||||
(silouette, davies, calinski) = evaluate(data_scaled, model)
|
(silouette, davies, calinski) = evaluate(data_scaled, model)
|
||||||
# Enregistrement des valeurs
|
# Enregistrement des valeurs
|
||||||
|
@ -92,3 +162,44 @@ print_1d_data(params, clusters, x_name="(eps,min_pts)",
|
||||||
print_1d_data(params, noise_points, x_name="(eps,min_pts)",
|
print_1d_data(params, noise_points, x_name="(eps,min_pts)",
|
||||||
y_name="points_de_bruit", dataset_name=dataset_name,
|
y_name="points_de_bruit", dataset_name=dataset_name,
|
||||||
method_name="DBSCAN", stop=False, save=save)
|
method_name="DBSCAN", stop=False, save=save)
|
||||||
|
|
||||||
|
print("-----------------------------------------------------------")
|
||||||
|
print(" Création clusters : mean-shift")
|
||||||
|
# Apply mean-shift for several bandwidth values
|
||||||
|
# and evaluate each solution
|
||||||
|
|
||||||
|
k_max = 2
|
||||||
|
|
||||||
|
k = []
|
||||||
|
durations = []
|
||||||
|
silouettes = []
|
||||||
|
daviess = []
|
||||||
|
calinskis = []
|
||||||
|
for bandwidth in arange(0.1, k_max, 0.2):
|
||||||
|
# Application du clustering
|
||||||
|
(model, duration) = apply_mean_shift(
|
||||||
|
data_scaled, bandwidth=bandwidth)
|
||||||
|
# Evaluation de la solution de clustering
|
||||||
|
(silouette, davies, calinski) = evaluate(data_scaled, model)
|
||||||
|
# Enregistrement des valeurs
|
||||||
|
k += [bandwidth]
|
||||||
|
durations += [duration]
|
||||||
|
silouettes += [silouette]
|
||||||
|
daviess += [davies]
|
||||||
|
calinskis += [calinski]
|
||||||
|
|
||||||
|
# Affichage des résultats
|
||||||
|
print_1d_data(k, k, x_name="k", y_name="k", dataset_name=dataset_name,
|
||||||
|
method_name="mean-shift", stop=False, save=save)
|
||||||
|
print_1d_data(k, durations, x_name="k", y_name="temps_de_calcul", y_unit="ms",
|
||||||
|
dataset_name=dataset_name,
|
||||||
|
method_name="mean-shift", stop=False, save=save)
|
||||||
|
print_1d_data(k, silouettes, x_name="k", y_name="coeficient_de_silhouette",
|
||||||
|
dataset_name=dataset_name,
|
||||||
|
method_name="mean-shift", stop=False, save=save)
|
||||||
|
print_1d_data(k, daviess, x_name="k", y_name="coeficient_de_Davies",
|
||||||
|
dataset_name=dataset_name,
|
||||||
|
method_name="mean-shift", stop=False, save=save)
|
||||||
|
print_1d_data(k, calinskis, x_name="k", y_name="coeficient_de_Calinski",
|
||||||
|
dataset_name=dataset_name,
|
||||||
|
method_name="mean-shift", stop=False, save=save)
|
||||||
|
|
Loading…
Reference in a new issue