Browse Source

Utilisation du jeu de données réel

Paul Faure 2 years ago
parent
commit
ce36d559f2
5 changed files with 94 additions and 9 deletions
  1. mydatalib.py (+7 -2)
  2. tp-preprocessing (+40 -0)
  3. tp5-preprocessing.py (+2 -1)
  4. tp6-preprocessing (+40 -0)
  5. tp6-real-dataset.py (+5 -6)

+ 7
- 2
mydatalib.py View File

@@ -11,6 +11,7 @@ import time
11 11
 from sklearn import cluster, metrics, preprocessing
12 12
 import pandas as pd
13 13
 
14
+
14 15
 def extract_data_2d(data_path):
15 16
     databrut = arff.loadarff(open(data_path + ".arff", 'r'))
16 17
     return np.array([[x[0], x[1]] for x in databrut[0]])
@@ -31,8 +32,12 @@ def extract_data_txt(data_path):
31 32
 
32 33
 
33 34
 def extract_data_csv(data_path: str, first_col: int, last_col: int):
34
-    data = pd.read_csv(data_path + ".csv")
35
-    return data.iloc[:, last_col]
35
+    databrut = pd.read_csv(data_path + ".csv")
36
+    ret = []
37
+    for x in range(first_col, last_col+1):
38
+        ret += [databrut.iloc[:, x]]
39
+    ret = np.array(ret)
40
+    return (list(databrut.iloc[:, 0]), np.array(list(map(list, zip(*ret)))))
36 41
 
37 42
 
38 43
 def scale_data(data):

+ 40
- 0
tp-preprocessing View File

@@ -0,0 +1,40 @@
1
+#!/usr/bin/env python3
2
+# -*- coding: utf-8 -*-
3
+"""
4
+Created on Sun Jan  9 11:12:30 2022
5
+
6
+@author: pfaure
7
+"""
8
+from sklearn.neighbors import NearestNeighbors
9
+import numpy as np
10
+
11
+from myplotlib import print_1d_data, print_2d_data
12
+from mydatalib import extract_data_csv, scale_data
13
+
14
+path = './new-data/'
15
+dataset_name = "pluie"
16
+save = False
17
+
18
+print("-----------------------------------------------------------")
19
+print("     Chargement du dataset : " + dataset_name)
20
+(villes, data) = extract_data_csv(path + dataset_name, 13, 13)
21
+print(data)
22
+# print_2d_data(data, dataset_name=dataset_name +
23
+#              "_brutes", stop=False, save=save)
24
+
25
+print("-----------------------------------------------------------")
26
+print("     Mise à l'échelle")
27
+data_scaled = scale_data(data)
28
+# print_2d_data(data_scaled, dataset_name=dataset_name +
29
+#              "_scaled", stop=False, save=save)
30
+
31
+print("-----------------------------------------------------------")
32
+print("     Calcul du voisinage")
33
+n = 5
34
+neighbors = NearestNeighbors(n_neighbors=n)
35
+neighbors.fit(data_scaled)
36
+distances, indices = neighbors.kneighbors(data_scaled)
37
+distances = list(map(lambda x: sum(x[1:n-1])/(len(x)-1), distances))
38
+distances = np.sort(distances, axis=0)
39
+print_1d_data(distances, range(1, len(distances)+1), x_name="distance_moyenne",
40
+              y_name="nombre_de_points", stop=False, save=save)

+ 2
- 1
tp5-preprocessing.py View File

@@ -13,12 +13,13 @@ from myplotlib import print_1d_data, print_2d_data
13 13
 from mydatalib import extract_data_txt, scale_data
14 14
 
15 15
 path = './new-data/'
16
-dataset_name = "d32"
16
+dataset_name = "w2"
17 17
 save = False
18 18
 
19 19
 print("-----------------------------------------------------------")
20 20
 print("     Chargement du dataset : " + dataset_name)
21 21
 data = extract_data_txt(path + dataset_name)
22
+print(data)
22 23
 print_2d_data(data, dataset_name=dataset_name +
23 24
               "_brutes", stop=False, save=save)
24 25
 

+ 40
- 0
tp6-preprocessing View File

@@ -0,0 +1,40 @@
1
+#!/usr/bin/env python3
2
+# -*- coding: utf-8 -*-
3
+"""
4
+Created on Sun Jan  9 11:12:30 2022
5
+
6
+@author: pfaure
7
+"""
8
+from sklearn.neighbors import NearestNeighbors
9
+import numpy as np
10
+
11
+from myplotlib import print_1d_data, print_2d_data
12
+from mydatalib import extract_data_csv, scale_data
13
+
14
+path = './new-data/'
15
+dataset_name = "pluie"
16
+save = False
17
+
18
+print("-----------------------------------------------------------")
19
+print("     Chargement du dataset : " + dataset_name)
20
+(villes, data) = extract_data_csv(path + dataset_name, 13, 13)
21
+print(data)
22
+# print_2d_data(data, dataset_name=dataset_name +
23
+#              "_brutes", stop=False, save=save)
24
+
25
+print("-----------------------------------------------------------")
26
+print("     Mise à l'échelle")
27
+data_scaled = scale_data(data)
28
+# print_2d_data(data_scaled, dataset_name=dataset_name +
29
+#              "_scaled", stop=False, save=save)
30
+
31
+print("-----------------------------------------------------------")
32
+print("     Calcul du voisinage")
33
+n = 5
34
+neighbors = NearestNeighbors(n_neighbors=n)
35
+neighbors.fit(data_scaled)
36
+distances, indices = neighbors.kneighbors(data_scaled)
37
+distances = list(map(lambda x: sum(x[1:n-1])/(len(x)-1), distances))
38
+distances = np.sort(distances, axis=0)
39
+print_1d_data(distances, range(1, len(distances)+1), x_name="distance_moyenne",
40
+              y_name="nombre_de_points", stop=False, save=save)

+ 5
- 6
tp6-real-dataset.py View File

@@ -10,18 +10,17 @@ from sklearn.neighbors import NearestNeighbors
10 10
 import numpy as np
11 11
 
12 12
 from myplotlib import print_1d_data, print_2d_data
13
-from mydatalib import  scale_data, apply_DBSCAN, evaluate, extract_data_csv, apply_kmeans, \
13
+from mydatalib import scale_data, apply_DBSCAN, evaluate, extract_data_csv, apply_kmeans, \
14 14
     apply_agglomerative_clustering, apply_mean_shift
15 15
 
16 16
 path = './new-data/'
17 17
 dataset_name = "pluie"
18
-save = True
19
-eps = 0.8
20
-
18
+save = False
19
+eps = 0.6
21 20
 
22 21
 print("-----------------------------------------------------------")
23 22
 print("     Chargement du dataset : " + dataset_name)
24
-data = extract_data_csv(path + dataset_name, 1, 5)
23
+(villes, data) = extract_data_csv(path + dataset_name, 1, 12)
25 24
 
26 25
 print(data)
27 26
 
@@ -29,7 +28,7 @@ print("-----------------------------------------------------------")
29 28
 print("     Mise à l'échelle")
30 29
 data_scaled = scale_data(data)
31 30
 
32
-k_max = 10
31
+k_max = 20
33 32
 print("-----------------------------------------------------------")
34 33
 print("     Application de k-means")
35 34
 # Application de k-means pour plusieurs valeurs de k

Loading…
Cancel
Save