#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Clustering helpers: data loaders (ARFF, txt, CSV), feature scaling, and
timed wrappers around scikit-learn clustering algorithms with evaluation.

Created on Fri Dec 3 16:29:12 2021

@author: pfaure
"""

import time

import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn import cluster, metrics, preprocessing


def extract_data_2d(data_path):
    """Load an ARFF file and return its first two attributes as an array."""
    # loadarff accepts a path directly, so no file handle is left open.
    data, _meta = arff.loadarff(data_path + ".arff")
    return np.array([[x[0], x[1]] for x in data])


def extract_data_3d(data_path):
    """Load an ARFF file and return its first three attributes as an array."""
    data, _meta = arff.loadarff(data_path + ".arff")
    return np.array([[x[0], x[1], x[2]] for x in data])


def extract_data_txt(data_path):
    """Load a whitespace-separated text file as a 2D float array."""
    # A context manager ensures the file is closed after reading.
    with open(data_path + ".txt") as f:
        return np.array([list(map(float, line.split())) for line in f])


def extract_data_csv(data_path: str, first_col: int, last_col: int):
    """Load a CSV file and return (first-column labels, feature array).

    The feature array holds columns first_col..last_col (inclusive),
    one row per CSV line.
    """
    databrut = pd.read_csv(data_path + ".csv")
    labels = databrut.iloc[:, 0].tolist()
    features = databrut.iloc[:, first_col:last_col + 1].to_numpy()
    return labels, features


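# Hypothetical usage sketch for extract_data_csv (assumes a CSV whose first
# column holds row labels and whose columns 1..3 hold numeric features):
#     names, features = extract_data_csv("data/dataset", 1, 3)

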
def scale_data(data):
    """Standardize features to zero mean and unit variance."""
    scaler = preprocessing.StandardScaler()
    return scaler.fit_transform(data)


def apply_kmeans(data, k: int = 3, init="k-means++"):
    """Fit k-means and return (model, fit time in ms)."""
    tps1 = time.time()
    # n_init is set explicitly: its default changed in recent scikit-learn.
    model = cluster.KMeans(n_clusters=k, init=init, n_init=10)
    model.fit(data)
    tps2 = time.time()
    return model, round((tps2 - tps1) * 1000, 2)


def apply_agglomerative_clustering(data, k: int = 3, linkage="complete"):
    """Fit agglomerative clustering and return (model, fit time in ms)."""
    tps1 = time.time()
    # `affinity` was renamed to `metric` in scikit-learn 1.2 and removed in 1.4.
    model = cluster.AgglomerativeClustering(
        n_clusters=k, metric='euclidean', linkage=linkage)
    model.fit(data)
    tps2 = time.time()
    return model, round((tps2 - tps1) * 1000, 2)


def apply_DBSCAN(data, eps, min_pts):
    """Fit DBSCAN and return (model, fit time in ms)."""
    tps1 = time.time()
    model = cluster.DBSCAN(eps=eps, min_samples=min_pts)
    model.fit(data)
    tps2 = time.time()
    return model, round((tps2 - tps1) * 1000, 2)


def apply_mean_shift(data, bandwidth: float):
    """Fit mean shift and return (model, fit time in ms)."""
    tps1 = time.time()
    model = cluster.MeanShift(bandwidth=bandwidth)
    model.fit(data)
    tps2 = time.time()
    return model, round((tps2 - tps1) * 1000, 2)


def evaluate(data, model):
    """Return the (silhouette, Davies-Bouldin, Calinski-Harabasz) scores.

    All three metrics need at least two distinct labels; if the model
    produced a single cluster (e.g. DBSCAN labelling everything noise),
    scikit-learn raises ValueError and (None, None, None) is returned.
    """
    try:
        silh = metrics.silhouette_score(data, model.labels_)
        davies = metrics.davies_bouldin_score(data, model.labels_)
        calinski = metrics.calinski_harabasz_score(data, model.labels_)
        return silh, davies, calinski
    except ValueError:
        return None, None, None
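

# Minimal usage sketch. Assumptions: a 2D ARFF dataset at the hypothetical
# path "data/xclara", and parameter values (k, eps, bandwidth) that suit it.
if __name__ == "__main__":
    data = scale_data(extract_data_2d("data/xclara"))
    results = {
        "k-means": apply_kmeans(data, k=3),
        "agglomerative": apply_agglomerative_clustering(data, k=3),
        "DBSCAN": apply_DBSCAN(data, eps=0.3, min_pts=5),
        "mean shift": apply_mean_shift(data, bandwidth=1.0),
    }
    for name, (model, ms) in results.items():
        silh, davies, calinski = evaluate(data, model)
        print(f"{name}: {ms} ms, silhouette={silh}, "
              f"Davies-Bouldin={davies}, Calinski-Harabasz={calinski}")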