Utilisation du jeu de données réel

This commit is contained in:
Paul Faure 2022-01-09 11:26:28 +01:00
parent 42c641d044
commit ce36d559f2
5 changed files with 94 additions and 9 deletions

View file

@ -11,6 +11,7 @@ import time
from sklearn import cluster, metrics, preprocessing
import pandas as pd
def extract_data_2d(data_path):
    """Load a 2-D dataset stored as "<data_path>.arff".

    Parameters
    ----------
    data_path : str
        Path to the dataset, without the ".arff" extension.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_samples, 2) holding the first two attributes
        of every record in the ARFF file.
    """
    # Context manager closes the file even if loadarff raises; the
    # original passed a bare open() and leaked the handle.
    with open(data_path + ".arff", 'r') as f:
        databrut = arff.loadarff(f)
    # databrut[0] is the structured record array; keep attributes 0 and 1.
    return np.array([[x[0], x[1]] for x in databrut[0]])
@ -31,8 +32,12 @@ def extract_data_txt(data_path):
def extract_data_csv(data_path: str, first_col: int, last_col: int):
    """Load labels and a range of feature columns from "<data_path>.csv".

    Parameters
    ----------
    data_path : str
        Path to the dataset, without the ".csv" extension.
    first_col, last_col : int
        Inclusive positional range (``iloc``) of the feature columns
        to extract.

    Returns
    -------
    tuple[list, numpy.ndarray]
        The first CSV column as a list (row labels, e.g. city names)
        and an array of shape (n_rows, last_col - first_col + 1) with
        the selected feature columns.
    """
    # NOTE(review): the stale early "return data.iloc[:, last_col]"
    # merge residue is removed — it made the real implementation
    # unreachable.
    databrut = pd.read_csv(data_path + ".csv")
    labels = list(databrut.iloc[:, 0])
    # iloc slicing + to_numpy() yields the same row-major array as the
    # original per-column loop followed by the zip(*...) transpose.
    features = databrut.iloc[:, first_col:last_col + 1].to_numpy()
    return (labels, features)
def scale_data(data):

40
tp-preprocessing Normal file
View file

@ -0,0 +1,40 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 9 11:12:30 2022

@author: pfaure

Plot the sorted mean k-nearest-neighbour distance of every point — the
usual "elbow" curve used to choose an eps value for DBSCAN.
"""
from sklearn.neighbors import NearestNeighbors
import numpy as np

from myplotlib import print_1d_data, print_2d_data
from mydatalib import extract_data_csv, scale_data

path = './new-data/'
dataset_name = "pluie"
save = False

print("-----------------------------------------------------------")
print(" Chargement du dataset : " + dataset_name)
# Single feature column (13); the first CSV column is returned as labels.
(villes, data) = extract_data_csv(path + dataset_name, 13, 13)
print(data)
# print_2d_data(data, dataset_name=dataset_name +
#               "_brutes", stop=False, save=save)

print("-----------------------------------------------------------")
print(" Mise à l'échelle")
data_scaled = scale_data(data)
# print_2d_data(data_scaled, dataset_name=dataset_name +
#               "_scaled", stop=False, save=save)

print("-----------------------------------------------------------")
print(" Calcul du voisinage")
n = 5
neighbors = NearestNeighbors(n_neighbors=n)
neighbors.fit(data_scaled)
distances, indices = neighbors.kneighbors(data_scaled)
# kneighbors() on the training data returns each point itself first
# (distance 0), so the mean over the n-1 real neighbours is
# sum(x[1:]) / (len(x) - 1).  The original sliced x[1:n-1], silently
# dropping the last neighbour while still dividing by n-1, which
# biased the mean low.
distances = list(map(lambda x: sum(x[1:]) / (len(x) - 1), distances))
distances = np.sort(distances, axis=0)
print_1d_data(distances, range(1, len(distances)+1), x_name="distance_moyenne",
              y_name="nombre_de_points", stop=False, save=save)

View file

@ -13,12 +13,13 @@ from myplotlib import print_1d_data, print_2d_data
from mydatalib import extract_data_txt, scale_data
path = './new-data/'
dataset_name = "d32"
dataset_name = "w2"
save = False
print("-----------------------------------------------------------")
print(" Chargement du dataset : " + dataset_name)
data = extract_data_txt(path + dataset_name)
print(data)
print_2d_data(data, dataset_name=dataset_name +
"_brutes", stop=False, save=save)

40
tp6-preprocessing Normal file
View file

@ -0,0 +1,40 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 9 11:12:30 2022

@author: pfaure

Plot the sorted mean k-nearest-neighbour distance of every point — the
usual "elbow" curve used to choose an eps value for DBSCAN.
"""
from sklearn.neighbors import NearestNeighbors
import numpy as np

from myplotlib import print_1d_data, print_2d_data
from mydatalib import extract_data_csv, scale_data

path = './new-data/'
dataset_name = "pluie"
save = False

print("-----------------------------------------------------------")
print(" Chargement du dataset : " + dataset_name)
# Single feature column (13); the first CSV column is returned as labels.
(villes, data) = extract_data_csv(path + dataset_name, 13, 13)
print(data)
# print_2d_data(data, dataset_name=dataset_name +
#               "_brutes", stop=False, save=save)

print("-----------------------------------------------------------")
print(" Mise à l'échelle")
data_scaled = scale_data(data)
# print_2d_data(data_scaled, dataset_name=dataset_name +
#               "_scaled", stop=False, save=save)

print("-----------------------------------------------------------")
print(" Calcul du voisinage")
n = 5
neighbors = NearestNeighbors(n_neighbors=n)
neighbors.fit(data_scaled)
distances, indices = neighbors.kneighbors(data_scaled)
# kneighbors() on the training data returns each point itself first
# (distance 0), so the mean over the n-1 real neighbours is
# sum(x[1:]) / (len(x) - 1).  The original sliced x[1:n-1], silently
# dropping the last neighbour while still dividing by n-1, which
# biased the mean low.
distances = list(map(lambda x: sum(x[1:]) / (len(x) - 1), distances))
distances = np.sort(distances, axis=0)
print_1d_data(distances, range(1, len(distances)+1), x_name="distance_moyenne",
              y_name="nombre_de_points", stop=False, save=save)

View file

@ -15,13 +15,12 @@ from mydatalib import scale_data, apply_DBSCAN, evaluate, extract_data_csv, app
path = './new-data/'
dataset_name = "pluie"
save = True
eps = 0.8
save = False
eps = 0.6
print("-----------------------------------------------------------")
print(" Chargement du dataset : " + dataset_name)
data = extract_data_csv(path + dataset_name, 1, 5)
(villes, data) = extract_data_csv(path + dataset_name, 1, 12)
print(data)
@ -29,7 +28,7 @@ print("-----------------------------------------------------------")
print(" Mise à l'échelle")
data_scaled = scale_data(data)
k_max = 10
k_max = 20
print("-----------------------------------------------------------")
print(" Application de k-means")
# Application de k-means pour plusieurs valeurs de k