tp-analyse-donnees/tp5-preprocessing.py

41 lines
1.3 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 8 16:07:28 2021
@author: pfaure
"""
from sklearn.neighbors import NearestNeighbors
import numpy as np
from myplotlib import print_1d_data, print_2d_data
from mydatalib import extract_data_txt, scale_data
# Run configuration: directory holding the datasets, name of the dataset to
# process, and the `save` flag forwarded to the plotting helpers.
path = './new-data/'
dataset_name = "w2"
save = False

# Step 1: load the raw dataset, dump it to stdout and plot it.
print("-----------------------------------------------------------")
print(f" Chargement du dataset : {dataset_name}")
data = extract_data_txt(f"{path}{dataset_name}")
print(data)
print_2d_data(data, dataset_name=f"{dataset_name}_brutes", stop=False, save=save)

# Step 2: rescale the data with the project helper and plot the result.
print("-----------------------------------------------------------")
print(" Mise à l'échelle")
data_scaled = scale_data(data)
print_2d_data(data_scaled, dataset_name=f"{dataset_name}_scaled", stop=False, save=save)
# Step 3: for every point, compute the mean distance to its nearest
# neighbours, then plot those means in ascending order.
print("-----------------------------------------------------------")
print(" Calcul du voisinage")
# Number of neighbours requested per point. The query set is the fitted set,
# so the first neighbour returned for each point is at distance 0 (the point
# itself, per the scikit-learn documentation).
n = 50
neighbors = NearestNeighbors(n_neighbors=n)
neighbors.fit(data_scaled)
distances, indices = neighbors.kneighbors(data_scaled)
# Average over the n-1 real neighbours: skip column 0 (the zero self-distance)
# and keep every remaining column. The previous slice `x[1:n-1]` dropped the
# farthest neighbour while still dividing by n-1, biasing every mean low.
distances = np.asarray(distances)[:, 1:].mean(axis=1)
distances = np.sort(distances, axis=0)
print_1d_data(distances, range(1, len(distances) + 1), x_name="distance_moyenne",
              y_name="nombre_de_points", stop=False, save=save)