tp-analyse-donnees/mydatalib.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 3 16:29:12 2021
@author: pfaure
"""
import time

import numpy as np
from scipy.io import arff
from sklearn import cluster, metrics, preprocessing


def extract_data_2d(data_path):
    """Load <data_path>.arff and return its first two attributes as an array."""
    databrut = arff.loadarff(open(data_path + ".arff", 'r'))
    return np.array([[x[0], x[1]] for x in databrut[0]])


def extract_data_3d(data_path):
    """Load <data_path>.arff and return its first three attributes as an array."""
    databrut = arff.loadarff(open(data_path + ".arff", 'r'))
    return np.array([[x[0], x[1], x[2]] for x in databrut[0]])


def scale_data(data):
    """Standardise features to zero mean and unit variance."""
    scaler = preprocessing.StandardScaler()
    return scaler.fit_transform(data)


def apply_kmeans(data, k: int = 3, init="k-means++"):
    """Fit k-means with k clusters; return (fitted model, runtime in ms)."""
    tps1 = time.time()
    model_km = cluster.KMeans(n_clusters=k, init=init)
    model_km.fit(data)
    tps2 = time.time()
    return (model_km, round((tps2 - tps1) * 1000, 2))


def apply_agglomerative_clustering(data, k: int = 3, linkage="complete"):
    """Fit agglomerative clustering; return (fitted model, runtime in ms)."""
    tps1 = time.time()
    # Note: scikit-learn >= 1.2 deprecates `affinity` in favour of `metric`.
    model_agg = cluster.AgglomerativeClustering(
        n_clusters=k, affinity='euclidean', linkage=linkage)
    model_agg.fit(data)
    tps2 = time.time()
    return (model_agg, round((tps2 - tps1) * 1000, 2))


def evaluate_kmeans(data, model_km):
    """Return (silhouette, inertia, n_iter) for a fitted k-means model."""
    silh = metrics.silhouette_score(data, model_km.labels_, metric='euclidean')
    return (silh, model_km.inertia_, model_km.n_iter_)


def evaluate_agglomerative_clustering(data, model_agg):
    """Return the silhouette score of a fitted agglomerative model."""
    silh = metrics.silhouette_score(
        data, model_agg.labels_, metric='euclidean')
    return silh
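

# Minimal usage sketch showing how the helpers above fit together. The dataset
# name "xclara" is a placeholder, not part of the original module: point it at
# any 2D ARFF file available locally (path given without the ".arff" extension).
if __name__ == "__main__":
    data = scale_data(extract_data_2d("xclara"))

    # k-means: report silhouette, inertia, iteration count and runtime.
    model_km, ms_km = apply_kmeans(data, k=3)
    silh_km, inertia, n_iter = evaluate_kmeans(data, model_km)
    print(f"k-means: silhouette={silh_km:.3f}, inertia={inertia:.2f}, "
          f"iterations={n_iter}, runtime={ms_km} ms")

    # Agglomerative clustering: report silhouette and runtime.
    model_agg, ms_agg = apply_agglomerative_clustering(data, k=3)
    silh_agg = evaluate_agglomerative_clustering(data, model_agg)
    print(f"agglomerative: silhouette={silh_agg:.3f}, runtime={ms_agg} ms")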