12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061 |
- import os
- import csv
- from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
- from sklearn.metrics import davies_bouldin_score, silhouette_score
- from hdbscan import HDBSCAN
- from graphs import bars_plot, clustering_plot
- from scipy.io import arff
- from sklearn.decomposition import PCA
- from random import seed
-
# Seed Python's built-in `random` module so that the run is repeatable.
# NOTE(review): nothing visible in this script draws from `random` directly;
# scikit-learn estimators take their randomness from `random_state` / NumPy,
# so this call alone presumably does not pin their results -- confirm.
seed(10)

# Load the dataset stored in the ARFF file; only the first element of what
# `loadarff` returns (the records) is kept.
data = arff.loadarff(f="real-world/iris.arff")[0]
print(type(data))

# Split every record into its first four fields (the points to cluster) and
# its fifth field (the label, stored as bytes and decoded to text here).
iris_data = [[record[k] for k in range(4)] for record in data]
labels = [record[4].decode("utf-8") for record in data]
# Turn the textual labels into integers:
#   Iris-setosa    ==> 0
#   Iris-virginica ==> 1
#   anything else  ==> 2 (Iris-versicolor in this dataset)
label_codes = {"Iris-setosa": 0, "Iris-virginica": 1}
labels_as_integers = [label_codes.get(label, 2) for label in labels]

print(labels)
print(labels_as_integers)
-
# Dimensionality reduction: project the points onto their first two principal
# components so that every clustering below works on (and plots) 2-D data.
modele_pca = PCA(n_components=2)
iris_dimension_reduced = modele_pca.fit_transform(iris_data)
-
# K-means: partition the 2-D projection into 3 clusters, using k-means++
# centroid initialisation.
# `random_state` is pinned explicitly because scikit-learn takes its
# randomness from `random_state` / NumPy's generator, not from Python's
# `random` module; the `seed(10)` call at the top of the script therefore
# does not make the centroid initialisation repeatable on its own.
modele = KMeans(n_clusters=3, init="k-means++", random_state=10)
predictions = modele.fit_predict(X=iris_dimension_reduced)

# Plot the two components coloured by predicted cluster.
# NOTE(review): `clustering_plot` is a project helper from `graphs`; it
# presumably writes the figure to `output` -- confirm the target directory
# exists before running.
clustering_plot(
    iris_dimension_reduced[:, 0],
    iris_dimension_reduced[:, 1],
    predictions,
    fig_title="Iris clustering with KMeans",
    xlabel="1ere composante IRIS",
    ylabel="2eme composante IRIS",
    output="real_synthese/iris_kmeans.png",
)
-
# Agglomerative clustering: fit one 3-cluster model per linkage criterion and
# produce one figure per criterion (the linkage name is part of both the
# figure title and the output file name).
linkages = ["single", "average", "complete", "ward"]
for linkage in linkages:
    modele = AgglomerativeClustering(linkage=linkage, n_clusters=3)
    predictions = modele.fit_predict(iris_dimension_reduced)
    clustering_plot(
        iris_dimension_reduced[:, 0],
        iris_dimension_reduced[:, 1],
        predictions,
        fig_title=f"Iris clustering with AgglomerativeClustering and linkage={linkage}",
        xlabel="1ere composante IRIS",
        ylabel="2eme composante IRIS",
        output=f"real_synthese/iris_agglclst_with_linkage_{linkage}.png",
    )
-
# DBSCAN: density-based clustering of the 2-D projection with a neighbourhood
# radius of 0.19 and at least 5 samples per neighbourhood.
modele = DBSCAN(min_samples=5, eps=0.19)
predictions = modele.fit_predict(iris_dimension_reduced)

# NOTE(review): the title below reads "DBSCANs" (trailing "s"); it is kept
# unchanged because it is text emitted at runtime -- fix deliberately if it
# is a typo.
clustering_plot(
    iris_dimension_reduced[:, 0],
    iris_dimension_reduced[:, 1],
    predictions,
    fig_title="Iris clustering with DBSCANs",
    xlabel="1ere composante IRIS",
    ylabel="2eme composante IRIS",
    output="real_synthese/iris_dbscan.png",
)
-
# HDBSCAN: hierarchical density-based clustering of the 2-D projection, using
# the same numeric settings as the DBSCAN run (0.19 as the cluster selection
# epsilon, 5 as the minimum number of samples).
modele = HDBSCAN(min_samples=5, cluster_selection_epsilon=0.19)
predictions = modele.fit_predict(iris_dimension_reduced)
clustering_plot(
    iris_dimension_reduced[:, 0],
    iris_dimension_reduced[:, 1],
    predictions,
    fig_title="Iris clustering with HDBSCAN",
    xlabel="1ere composante IRIS",
    ylabel="2eme composante IRIS",
    output="real_synthese/iris_hdbscan.png",
)
|