start tp6
This commit is contained in:
parent
b9da01d3e0
commit
42c641d044
3 changed files with 146 additions and 30 deletions
|
@ -9,7 +9,7 @@ from scipy.io import arff
|
|||
import numpy as np
|
||||
import time
|
||||
from sklearn import cluster, metrics, preprocessing
|
||||
|
||||
import pandas as pd
|
||||
|
||||
def extract_data_2d(data_path):
|
||||
databrut = arff.loadarff(open(data_path + ".arff", 'r'))
|
||||
|
@ -30,6 +30,11 @@ def extract_data_txt(data_path):
|
|||
return np.array(ret)
|
||||
|
||||
|
||||
def extract_data_csv(data_path: str, first_col: int, last_col: int) -> pd.DataFrame:
    """Load ``<data_path>.csv`` and return a contiguous range of its columns.

    Args:
        data_path: Path of the CSV file without its ``.csv`` extension.
        first_col: Zero-based position of the first column to keep.
        last_col: Zero-based position one past the last column to keep
            (half-open range, like a Python slice).

    Returns:
        A two-dimensional ``DataFrame`` holding columns
        ``first_col`` .. ``last_col - 1`` for every row of the file.
    """
    data = pd.read_csv(data_path + ".csv")
    # Slice (rather than index with a scalar) so the result stays 2-D:
    # the previous `iloc[:, last_col]` ignored `first_col` and produced a
    # 1-D Series, which 2-D consumers such as StandardScaler reject.
    return data.iloc[:, first_col:last_col]
|
||||
|
||||
|
||||
def scale_data(data):
    """Return *data* transformed by a newly fitted ``StandardScaler``.

    A fresh scaler is created on every call, fitted on *data* and applied
    to that same *data*; the scaler itself is not kept.
    """
    return preprocessing.StandardScaler().fit_transform(data)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
Ville,JANVIERp,FEVRIERp,MARSp,AVRILp,MAIp,JUINp,JUILLETp,AOUTp,SEPTEMBREp,OCTOBREp,NOVEMBREp,DECEMBREp,Précipitations annuelles,JANVIERnb.j.pl,FEVRIERnb.j.pl,MARSnb.j.pl,AVRILnb.j.pl,MAInb.j.pl,JUINnb.j.pl,JUILLETnb.j.pl,AOUTnb.j.pl,SEPTEMBREnb.j.pl,OCTOBREnb.j.pl,NOVEMBREnb.j.pl,DECEMBREnb.j.pl,Nombre annuel de jours de pluie,Température moyenne annuelle,Amplitude annuelle des températures,Insolation annuelle,Latitude,Longitude,Précipitations de mai à aout,Précipitations sept-oct,Géographie
|
||||
Ville,JANVIERp,FEVRIERp,MARSp,AVRILp,MAIp,JUINp,JUILLETp,AOUTp,SEPTEMBREp,OCTOBREp,NOVEMBREp,DECEMBREp,Précipitations annuelles,JANVIERnb.j.pl,FEVRIERnb.j.pl,MARSnb.j.pl,AVRILnb.j.pl,MAInb.j.pl,JUINnb.j.pl,JUILLETnb.j.pl,AOUTnb.j.pl,SEPTEMBREnb.j.pl,OCTOBREnb.j.pl,NOVEMBREnb.j.pl,DECEMBREnb.j.pl,Nombre annuel de jours de pluie,Température moyenne annuelle,Amplitude annuelle des températures,Insolation annuelle,Latitude,Longitude,Précipitations de mai à aout,Précipitations sept-oct,Géographie
|
||||
Ajaccio,78.00,69.00,51.00,39.00,43.00,23.00,10.00,15.00,43.00,81.00,105.00,96.00,653.00,12.00,10.00,9.00,9.00,8.00,4.00,1.00,2.00,6.00,10.00,11.00,13.00,95.00,14.71,14.50,2811.00,41.55,8.44,13.90,25.80,Sud
|
||||
Angers,65.00,50.00,60.00,45.00,50.00,55.00,35.00,60.00,55.00,65.00,80.00,70.00,690.00,16.00,13.00,12.00,12.00,13.00,10.00,11.00,11.00,12.00,13.00,15.00,16.00,154.00,11.28,14.50,1899.00,47.28,-0.33,29.00,30.80,Ouest
|
||||
Angouleme,79.00,68.00,64.00,62.00,70.00,58.00,53.00,66.00,69.00,70.00,79.00,88.00,826.00,16.00,14.00,13.00,12.00,14.00,11.00,12.00,12.00,12.00,13.00,15.00,16.00,160.00,12.02,14.90,1989.00,45.39,0.09,29.90,31.00,Ouest
|
||||
|
|
|
|
@ -5,48 +5,123 @@ Created on Wed Dec 8 16:07:28 2021
|
|||
|
||||
@author: pfaure
|
||||
"""
|
||||
|
||||
from numpy import arange
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
import numpy as np
|
||||
|
||||
from myplotlib import print_1d_data, print_2d_data
|
||||
from mydatalib import extract_data_2d, scale_data, apply_DBSCAN, evaluate
|
||||
from mydatalib import scale_data, apply_DBSCAN, evaluate, extract_data_csv, apply_kmeans, \
|
||||
apply_agglomerative_clustering, apply_mean_shift
|
||||
|
||||
path = './artificial/'
|
||||
dataset_name = "banana"
|
||||
path = './new-data/'
|
||||
dataset_name = "pluie"
|
||||
save = True
|
||||
eps = 0.8
|
||||
|
||||
|
||||
print("-----------------------------------------------------------")
|
||||
print(" Chargement du dataset : " + dataset_name)
|
||||
data = extract_data_2d(path + dataset_name)
|
||||
print_2d_data(data, dataset_name=dataset_name +
|
||||
"_brutes", stop=False, save=save)
|
||||
data = extract_data_csv(path + dataset_name, 1, 5)
|
||||
|
||||
print(data)
|
||||
|
||||
print("-----------------------------------------------------------")
|
||||
print(" Mise à l'échelle")
|
||||
data_scaled = scale_data(data)
|
||||
print_2d_data(data_scaled, dataset_name=dataset_name +
|
||||
"_scaled", stop=False, save=save)
|
||||
|
||||
# Exclusive upper bound of the cluster counts tried below (2 .. k_max - 1).
k_max = 10
print("-----------------------------------------------------------")
print(" Application de k-means")
# Run k-means once per candidate k and record, for each run, the duration,
# the three scores returned by evaluate(), the inertia and the number of
# iterations reported by the fitted model.
k, durations = [], []
silouettes, daviess, calinskis = [], [], []
inerties, iterations = [], []
for i in range(2, k_max):
    model, duration = apply_kmeans(data_scaled, k=i, init="k-means++")
    silouette, davies, calinski = evaluate(data_scaled, model)
    k.append(i)
    durations.append(duration)
    silouettes.append(silouette)
    daviess.append(davies)
    calinskis.append(calinski)
    inerties.append(model.inertia_)
    iterations.append(model.n_iter_)

# Plot each recorded series against k. Arguments shared by every plot are
# gathered once; only the last call passes stop=True.
kmeans_plot_args = dict(x_name="k", dataset_name=dataset_name,
                        method_name="k-means", save=save)
print_1d_data(k, k, y_name="k", stop=False, **kmeans_plot_args)
print_1d_data(k, durations, y_name="temps_de_calcul", y_unit="ms",
              stop=False, **kmeans_plot_args)
print_1d_data(k, silouettes, y_name="coeficient_de_silhouette",
              stop=False, **kmeans_plot_args)
print_1d_data(k, daviess, y_name="coeficient_de_Davies",
              stop=False, **kmeans_plot_args)
print_1d_data(k, calinskis, y_name="coeficient_de_Calinski",
              stop=False, **kmeans_plot_args)
print_1d_data(k, inerties, y_name="inertie",
              stop=False, **kmeans_plot_args)
print_1d_data(k, iterations, y_name="nombre_d_iterations",
              stop=True, **kmeans_plot_args)
|
||||
|
||||
print("-----------------------------------------------------------")
|
||||
print(" Calcul du voisinage")
|
||||
# Mean distance from every point to its nearest neighbours, sorted
# ascending and plotted.
# NOTE(review): presumably used to pick the DBSCAN eps — confirm.
n = 50
# kneighbors() rejects n_neighbors larger than the number of samples, so
# cap it for small datasets.
n = min(n, len(data_scaled))
neighbors = NearestNeighbors(n_neighbors=n)
# Work on the scaled data: DBSCAN below is applied to data_scaled, so the
# distances must be measured in the same space.
neighbors.fit(data_scaled)
distances, indices = neighbors.kneighbors(data_scaled)
# The query set is the training set, so the first neighbour of each point
# is expected to be the point itself (distance 0); skip it and average the
# remaining len(row) - 1 distances. The previous slice `[1:n-1]` dropped
# the farthest neighbour while still dividing by n - 1.
distances = [sum(row[1:]) / (len(row) - 1) for row in distances]
print(distances)
distances = np.sort(distances, axis=0)
print(distances)
print_1d_data(distances, range(1, len(distances) + 1),
              x_name="distance_moyenne", y_name="nombre_de_points",
              stop=False, save=False)
|
||||
print(" Création clusters : agglomerative ")
|
||||
# Application du clustering agglomeratif pour plusieurs valeurs de k
|
||||
# et evaluation de la solution
|
||||
# Linkage criterion passed to every agglomerative run below; it is also
# appended to the method name given to the plots.
linkage = "ward"
k, durations = [], []
silouettes, daviess, calinskis = [], [], []
for i in range(2, k_max):
    # One agglomerative clustering per candidate cluster count.
    model, duration = apply_agglomerative_clustering(
        data_scaled, k=i, linkage=linkage)
    silouette, davies, calinski = evaluate(data_scaled, model)
    k.append(i)
    durations.append(duration)
    silouettes.append(silouette)
    daviess.append(davies)
    calinskis.append(calinski)

# Plot each recorded series against k; shared arguments are gathered once.
agglo_plot_args = dict(x_name="k", dataset_name=dataset_name,
                       method_name="agglomerative_" + linkage,
                       stop=False, save=save)
print_1d_data(k, k, y_name="k", **agglo_plot_args)
print_1d_data(k, durations, y_name="temps_de_calcul", y_unit="ms",
              **agglo_plot_args)
print_1d_data(k, silouettes, y_name="coeficient_de_silhouette",
              **agglo_plot_args)
print_1d_data(k, daviess, y_name="coeficient_de_Davies",
              **agglo_plot_args)
print_1d_data(k, calinskis, y_name="coeficient_de_Calinski",
              **agglo_plot_args)
|
||||
|
||||
min_sample_max = 30
|
||||
print("-----------------------------------------------------------")
|
||||
print(" Création clusters : DBSCAN")
|
||||
params = []
|
||||
for i in range(1, 20):
|
||||
params += [(i/100, 5)]
|
||||
for i in range(1, min_sample_max):
|
||||
params += [(eps, i)]
|
||||
durations = []
|
||||
silouettes = []
|
||||
daviess = []
|
||||
|
@ -55,13 +130,8 @@ clusters = []
|
|||
noise_points = []
|
||||
for (distance, min_pts) in params:
|
||||
# Apply DBSCAN with the current (eps, min_pts) pair
|
||||
(model, duration) = apply_DBSCAN(data, distance, min_pts)
|
||||
(model, duration) = apply_DBSCAN(data_scaled, distance, min_pts)
|
||||
cl_pred = model.labels_
|
||||
# Display the clusters
|
||||
print_2d_data(data_scaled, dataset_name=dataset_name,
|
||||
method_name="DBSCAN-Eps=" +
|
||||
str(distance)+"-Minpt="+str(min_pts),
|
||||
k=0, stop=False, save=save, c=cl_pred)
|
||||
# Evaluation de la solution de clustering
|
||||
(silouette, davies, calinski) = evaluate(data_scaled, model)
|
||||
# Enregistrement des valeurs
|
||||
|
@ -92,3 +162,44 @@ print_1d_data(params, clusters, x_name="(eps,min_pts)",
|
|||
print_1d_data(params, noise_points, x_name="(eps,min_pts)",
|
||||
y_name="points_de_bruit", dataset_name=dataset_name,
|
||||
method_name="DBSCAN", stop=False, save=save)
|
||||
|
||||
print("-----------------------------------------------------------")
|
||||
print(" Création clusters : mean-shift")
|
||||
# Apply mean-shift for several bandwidth values
# and evaluate each solution
|
||||
|
||||
# Here k_max is reused as the exclusive upper bound of the bandwidth sweep,
# and the list `k` holds the bandwidth values that were tried.
k_max = 2

k, durations = [], []
silouettes, daviess, calinskis = [], [], []
for bandwidth in arange(0.1, k_max, 0.2):
    # One mean-shift run per bandwidth value.
    model, duration = apply_mean_shift(data_scaled, bandwidth=bandwidth)
    silouette, davies, calinski = evaluate(data_scaled, model)
    k.append(bandwidth)
    durations.append(duration)
    silouettes.append(silouette)
    daviess.append(davies)
    calinskis.append(calinski)

# Plot each recorded series against the bandwidth (the axis is still
# labelled "k"); shared arguments are gathered once.
mean_shift_plot_args = dict(x_name="k", dataset_name=dataset_name,
                            method_name="mean-shift", stop=False, save=save)
print_1d_data(k, k, y_name="k", **mean_shift_plot_args)
print_1d_data(k, durations, y_name="temps_de_calcul", y_unit="ms",
              **mean_shift_plot_args)
print_1d_data(k, silouettes, y_name="coeficient_de_silhouette",
              **mean_shift_plot_args)
print_1d_data(k, daviess, y_name="coeficient_de_Davies",
              **mean_shift_plot_args)
print_1d_data(k, calinskis, y_name="coeficient_de_Calinski",
              **mean_shift_plot_args)
|
||||
|
|
Loading…
Reference in a new issue