Contient les fichiers et dossiers liés au TP en Apprentissage portant sur les algorithmes de clustering.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

real_world_synthese.py 2.8KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061
  1. import os
  2. import csv
  3. from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
  4. from sklearn.metrics import davies_bouldin_score, silhouette_score
  5. from hdbscan import HDBSCAN
  6. from graphs import bars_plot, clustering_plot
  7. from scipy.io import arff
  8. from sklearn.decomposition import PCA
  9. from random import seed
  10. # For making all the process reproductible
  11. seed(10)
  12. # Chargement des données contenues dans le fichier
  13. data = arff.loadarff(f="real-world/iris.arff")[0]
  14. print(type(data))
  15. # Extraction des points contenus dans la donnée IRIS et les labels sous forme de texte
  16. iris_data = [[c[0],c[1],c[2],c[3]] for c in data]
  17. labels = [c[4].decode("utf-8") for c in data]
  18. # Transformation des labels textuels en entiers
  19. labels_as_integers = []
  20. for i in range(len(labels)):
  21. # Iris-setosa ==> 0
  22. # Iris-virginica ==> 1
  23. # Iris-versicolor ==> 2
  24. if labels[i]=="Iris-setosa": labels_as_integers +=[0]
  25. elif labels[i]=="Iris-virginica": labels_as_integers +=[1]
  26. else: labels_as_integers += [2]
  27. print(labels)
  28. print(labels_as_integers)
  29. # Réduction de dimension
  30. modele_pca = PCA(n_components=2)
  31. iris_dimension_reduced = modele_pca.fit_transform(X=iris_data)
  32. modele = KMeans(n_clusters=3, init="k-means++")
  33. predictions = modele.fit_predict(X=iris_dimension_reduced)
  34. clustering_plot(iris_dimension_reduced[:,0],iris_dimension_reduced[:,1],predictions,
  35. fig_title="Iris clustering with KMeans", xlabel="1ere composante IRIS",
  36. ylabel="2eme composante IRIS", output="real_synthese/iris_kmeans.png")
  37. linkages = ["single", "average", "complete", "ward"]
  38. for linkage in linkages:
  39. modele = AgglomerativeClustering(n_clusters=3, linkage=linkage)
  40. predictions = modele.fit_predict(X=iris_dimension_reduced)
  41. clustering_plot(iris_dimension_reduced[:, 0], iris_dimension_reduced[:, 1], predictions,
  42. fig_title="Iris clustering with AgglomerativeClustering and linkage=%s"%(linkage), xlabel="1ere composante IRIS",
  43. ylabel="2eme composante IRIS", output="real_synthese/iris_agglclst_with_linkage_%s.png"%(linkage))
  44. modele = DBSCAN(eps=0.19,min_samples=5)
  45. predictions = modele.fit_predict(X=iris_dimension_reduced)
  46. clustering_plot(iris_dimension_reduced[:,0],iris_dimension_reduced[:,1],predictions,
  47. fig_title="Iris clustering with DBSCANs", xlabel="1ere composante IRIS",
  48. ylabel="2eme composante IRIS", output="real_synthese/iris_dbscan.png")
  49. modele = HDBSCAN(cluster_selection_epsilon=0.19,min_samples=5)
  50. predictions = modele.fit_predict(X=iris_dimension_reduced)
  51. clustering_plot(iris_dimension_reduced[:,0],iris_dimension_reduced[:,1],predictions,
  52. fig_title="Iris clustering with HDBSCAN", xlabel="1ere composante IRIS",
  53. ylabel="2eme composante IRIS", output="real_synthese/iris_hdbscan.png")