Use the real dataset
This commit is contained in:
parent 42c641d044
commit ce36d559f2
5 changed files with 94 additions and 9 deletions
@@ -11,6 +11,7 @@ import time
 from sklearn import cluster, metrics, preprocessing
+import pandas as pd
 
 
 def extract_data_2d(data_path):
     databrut = arff.loadarff(open(data_path + ".arff", 'r'))
     return np.array([[x[0], x[1]] for x in databrut[0]])
@@ -31,8 +32,12 @@ def extract_data_txt(data_path):
 
 
 def extract_data_csv(data_path: str, first_col: int, last_col: int):
-    data = pd.read_csv(data_path + ".csv")
-    return data.iloc[:, last_col]
+    databrut = pd.read_csv(data_path + ".csv")
+    ret = []
+    for x in range(first_col, last_col+1):
+        ret += [databrut.iloc[:, x]]
+    ret = np.array(ret)
+    return (list(databrut.iloc[:, 0]), np.array(list(map(list, zip(*ret)))))
 
 
 def scale_data(data):
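The reworked extract_data_csv now returns a pair: the first CSV column as a list of labels (apparently city names, given the villes variable in the calling scripts) and the columns first_col..last_col transposed into one row per record. A minimal sketch of that behaviour on an in-memory pandas DataFrame; the frame below is a made-up stand-in for pluie.csv, not data from the repository:

import numpy as np
import pandas as pd

# Hypothetical stand-in for pluie.csv: column 0 holds labels, the rest hold features.
databrut = pd.DataFrame({
    "ville": ["ville_1", "ville_2", "ville_3"],
    "c1": [1.0, 4.0, 7.0],
    "c2": [2.0, 5.0, 8.0],
})

first_col, last_col = 1, 2
ret = [databrut.iloc[:, x] for x in range(first_col, last_col + 1)]
ret = np.array(ret)                          # shape (n_cols, n_rows)
villes = list(databrut.iloc[:, 0])           # labels from the first column
data = np.array(list(map(list, zip(*ret))))  # transpose to (n_rows, n_cols)

print(villes)      # ['ville_1', 'ville_2', 'ville_3']
print(data.shape)  # (3, 2)

Returning the label column separately is what lets the scripts below unpack (villes, data) in one call.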
tp-preprocessing (Normal file, 40 lines added)
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Jan 9 11:12:30 2022
+
+@author: pfaure
+"""
+from sklearn.neighbors import NearestNeighbors
+import numpy as np
+
+from myplotlib import print_1d_data, print_2d_data
+from mydatalib import extract_data_csv, scale_data
+
+path = './new-data/'
+dataset_name = "pluie"
+save = False
+
+print("-----------------------------------------------------------")
+print(" Chargement du dataset : " + dataset_name)
+(villes, data) = extract_data_csv(path + dataset_name, 13, 13)
+print(data)
+# print_2d_data(data, dataset_name=dataset_name +
+#               "_brutes", stop=False, save=save)
+
+print("-----------------------------------------------------------")
+print(" Mise à l'échelle")
+data_scaled = scale_data(data)
+# print_2d_data(data_scaled, dataset_name=dataset_name +
+#               "_scaled", stop=False, save=save)
+
+print("-----------------------------------------------------------")
+print(" Calcul du voisinage")
+n = 5
+neighbors = NearestNeighbors(n_neighbors=n)
+neighbors.fit(data_scaled)
+distances, indices = neighbors.kneighbors(data_scaled)
+distances = list(map(lambda x: sum(x[1:]) / (len(x) - 1), distances))
+distances = np.sort(distances, axis=0)
+print_1d_data(distances, range(1, len(distances)+1), x_name="distance_moyenne",
+              y_name="nombre_de_points", stop=False, save=save)
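The new tp-preprocessing script builds the classic k-distance curve: for every point, the mean distance to its nearest neighbours, sorted in increasing order; the elbow of that curve is a common heuristic for picking DBSCAN's eps. A self-contained sketch of the same computation, using random stand-in data and matplotlib instead of the project's print_1d_data helper (the random data and figure labels are illustrative assumptions; n=5 matches the script):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
data_scaled = rng.normal(size=(200, 2))   # stand-in for the scaled pluie features

n = 5
neighbors = NearestNeighbors(n_neighbors=n).fit(data_scaled)
distances, _ = neighbors.kneighbors(data_scaled)

# Column 0 is each point's distance to itself (0.0); average the real neighbours.
mean_dist = np.sort(distances[:, 1:].mean(axis=1))

plt.plot(range(1, len(mean_dist) + 1), mean_dist)
plt.xlabel("points (sorted)")
plt.ylabel("mean distance to the %d nearest neighbours" % (n - 1))
plt.show()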
@@ -13,12 +13,13 @@ from myplotlib import print_1d_data, print_2d_data
 from mydatalib import extract_data_txt, scale_data
 
 path = './new-data/'
 dataset_name = "d32"
+dataset_name = "w2"
 save = False
 
 print("-----------------------------------------------------------")
 print(" Chargement du dataset : " + dataset_name)
 data = extract_data_txt(path + dataset_name)
 print(data)
 print_2d_data(data, dataset_name=dataset_name +
               "_brutes", stop=False, save=save)
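extract_data_txt and print_2d_data are project helpers whose implementations are not part of this commit. Assuming the .txt datasets store one point per line as two whitespace-separated numeric columns (an assumption, not something the diff confirms), a rough stand-in for this loading-and-plotting step would be:

import numpy as np
import matplotlib.pyplot as plt

# Assumed layout of ./new-data/w2.txt: one 2-D point per line, whitespace-separated.
data = np.loadtxt("./new-data/w2.txt")

plt.scatter(data[:, 0], data[:, 1], s=5)
plt.title("w2_brutes")
plt.show()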
tp6-preprocessing (Normal file, 40 lines added)
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Jan 9 11:12:30 2022
+
+@author: pfaure
+"""
+from sklearn.neighbors import NearestNeighbors
+import numpy as np
+
+from myplotlib import print_1d_data, print_2d_data
+from mydatalib import extract_data_csv, scale_data
+
+path = './new-data/'
+dataset_name = "pluie"
+save = False
+
+print("-----------------------------------------------------------")
+print(" Chargement du dataset : " + dataset_name)
+(villes, data) = extract_data_csv(path + dataset_name, 13, 13)
+print(data)
+# print_2d_data(data, dataset_name=dataset_name +
+#               "_brutes", stop=False, save=save)
+
+print("-----------------------------------------------------------")
+print(" Mise à l'échelle")
+data_scaled = scale_data(data)
+# print_2d_data(data_scaled, dataset_name=dataset_name +
+#               "_scaled", stop=False, save=save)
+
+print("-----------------------------------------------------------")
+print(" Calcul du voisinage")
+n = 5
+neighbors = NearestNeighbors(n_neighbors=n)
+neighbors.fit(data_scaled)
+distances, indices = neighbors.kneighbors(data_scaled)
+distances = list(map(lambda x: sum(x[1:]) / (len(x) - 1), distances))
+distances = np.sort(distances, axis=0)
+print_1d_data(distances, range(1, len(distances)+1), x_name="distance_moyenne",
+              y_name="nombre_de_points", stop=False, save=save)
@@ -15,13 +15,12 @@ from mydatalib import scale_data, apply_DBSCAN, evaluate, extract_data_csv, app
 
 path = './new-data/'
 dataset_name = "pluie"
-save = True
-eps = 0.8
-
+save = False
+eps = 0.6
 
 print("-----------------------------------------------------------")
 print(" Chargement du dataset : " + dataset_name)
-data = extract_data_csv(path + dataset_name, 1, 5)
+(villes, data) = extract_data_csv(path + dataset_name, 1, 12)
 
 print(data)
 
@@ -29,7 +28,7 @@ print("-----------------------------------------------------------")
 print(" Mise à l'échelle")
 data_scaled = scale_data(data)
 
-k_max = 10
+k_max = 20
 print("-----------------------------------------------------------")
 print(" Application de k-means")
 # Application de k-means pour plusieurs valeurs de k
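With eps lowered from 0.8 to 0.6 (read off the k-distance plot above) and k_max raised to 20, this script reruns DBSCAN and k-means on columns 1 to 12 of the pluie CSV. The apply_DBSCAN, evaluate and scale_data helpers are not shown in the commit, so the sketch below goes through scikit-learn directly on random stand-in data; StandardScaler stands in for scale_data, and min_samples=5 is an assumption mirroring the n used for the neighbourhood computation:

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
data = rng.normal(size=(200, 12))   # stand-in for the 12 extracted columns

data_scaled = StandardScaler().fit_transform(data)

# eps comes from the elbow of the k-distance curve; min_samples is assumed.
labels = DBSCAN(eps=0.6, min_samples=5).fit(data_scaled).labels_   # -1 marks noise

n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print("clusters:", n_clusters, "noise points:", int(np.sum(labels == -1)))
if n_clusters >= 2:
    print("silhouette:", silhouette_score(data_scaled, labels))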