#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Helpers for loading datasets, fitting scikit-learn clustering models and
scoring the resulting partitions.

Created on Fri Dec  3 16:29:12 2021

@author: pfaure
"""

import time

import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn import cluster, metrics, preprocessing


def _load_arff_columns(data_path, n_cols):
    """Return the first *n_cols* attributes of ``<data_path>.arff`` as an array."""
    # The context manager closes the file even if parsing fails.
    with open(data_path + ".arff", 'r') as handle:
        records, _meta = arff.loadarff(handle)
    return np.array([[row[i] for i in range(n_cols)] for row in records])


def _timed_fit(model, data):
    """Fit *model* on *data* and return ``(model, fit time in ms)``."""
    # perf_counter is monotonic, so the measurement cannot be skewed (or go
    # negative) because the system clock was adjusted during the fit.
    start = time.perf_counter()
    model.fit(data)
    elapsed_ms = (time.perf_counter() - start) * 1000
    return model, round(elapsed_ms, 2)


def extract_data_2d(data_path):
    """Load the first two attributes of an ARFF file.

    Parameters
    ----------
    data_path : str
        Path of the file without its ``.arff`` extension.

    Returns
    -------
    numpy.ndarray
        One row per record, two columns.
    """
    return _load_arff_columns(data_path, 2)


def extract_data_3d(data_path):
    """Load the first three attributes of an ARFF file.

    Parameters
    ----------
    data_path : str
        Path of the file without its ``.arff`` extension.

    Returns
    -------
    numpy.ndarray
        One row per record, three columns.
    """
    return _load_arff_columns(data_path, 3)


def extract_data_txt(data_path):
    """Load whitespace-separated floats from a text file.

    Each non-blank line becomes one row. Blank (whitespace-only) lines are
    skipped so that a stray empty line does not produce a ragged array.

    Parameters
    ----------
    data_path : str
        Path of the file without its ``.txt`` extension.

    Returns
    -------
    numpy.ndarray
        One row per non-blank line of the file.

    Raises
    ------
    ValueError
        If a token cannot be converted to ``float``.
    """
    with open(data_path + ".txt") as handle:
        rows = [
            [float(token) for token in line.split()]
            for line in handle
            if line.strip()
        ]
    return np.array(rows)


def extract_data_csv(data_path: str, first_col: int, last_col: int):
    """Load a range of columns from a CSV file.

    Parameters
    ----------
    data_path : str
        Path of the file without its ``.csv`` extension.
    first_col, last_col : int
        Positional indices of the first and last (inclusive) data columns.

    Returns
    -------
    tuple
        ``(first_column, data)`` where ``first_column`` is the list of
        values of column 0 and ``data`` is an array with one row per CSV
        record and one column per selected CSV column.
    """
    databrut = pd.read_csv(data_path + ".csv")
    # Stack the selected columns (one array row per CSV column) ...
    columns = np.array(
        [databrut.iloc[:, col] for col in range(first_col, last_col + 1)]
    )
    # ... then transpose so that each array row is one CSV record.
    rows = np.array([list(record) for record in zip(*columns)])
    return list(databrut.iloc[:, 0]), rows


def scale_data(data):
    """Return *data* transformed by a freshly fitted ``StandardScaler``."""
    return preprocessing.StandardScaler().fit_transform(data)


def apply_kmeans(data, k: int = 3, init="k-means++"):
    """Fit a ``KMeans`` model.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    k : int
        Number of clusters.
    init : str or array-like
        Forwarded to ``KMeans(init=...)``.

    Returns
    -------
    tuple
        ``(fitted model, fit time in milliseconds rounded to 2 decimals)``.
    """
    return _timed_fit(cluster.KMeans(n_clusters=k, init=init), data)


def apply_agglomerative_clustering(data, k: int = 3, linkage="complete"):
    """Fit an ``AgglomerativeClustering`` model using Euclidean distances.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    k : int
        Number of clusters.
    linkage : str
        Forwarded to ``AgglomerativeClustering(linkage=...)``.

    Returns
    -------
    tuple
        ``(fitted model, fit time in milliseconds rounded to 2 decimals)``.
    """
    # The distance keyword was renamed from ``affinity`` to ``metric`` in
    # scikit-learn 1.2 and ``affinity`` was removed in 1.4. Euclidean is the
    # default under both names, so relying on the default works everywhere.
    model = cluster.AgglomerativeClustering(n_clusters=k, linkage=linkage)
    return _timed_fit(model, data)


def apply_DBSCAN(data, eps, min_pts):
    """Fit a ``DBSCAN`` model.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    eps : float
        Forwarded to ``DBSCAN(eps=...)``.
    min_pts : int
        Forwarded to ``DBSCAN(min_samples=...)``.

    Returns
    -------
    tuple
        ``(fitted model, fit time in milliseconds rounded to 2 decimals)``.
    """
    return _timed_fit(cluster.DBSCAN(eps=eps, min_samples=min_pts), data)


def apply_mean_shift(data, bandwidth: float):
    """Fit a ``MeanShift`` model.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    bandwidth : float
        Forwarded to ``MeanShift(bandwidth=...)``.

    Returns
    -------
    tuple
        ``(fitted model, fit time in milliseconds rounded to 2 decimals)``.
    """
    return _timed_fit(cluster.MeanShift(bandwidth=bandwidth), data)


def evaluate(data, model):
    """Score a fitted clustering model with three internal metrics.

    Parameters
    ----------
    data : array-like
        Samples the model was fitted on.
    model : object
        Any object exposing a ``labels_`` attribute.

    Returns
    -------
    tuple
        ``(silhouette, davies_bouldin, calinski_harabasz)``, or
        ``(None, None, None)`` when any metric raises ``ValueError``.
    """
    labels = model.labels_
    try:
        silh = metrics.silhouette_score(data, labels)
        davies = metrics.davies_bouldin_score(data, labels)
        calinski = metrics.calinski_harabasz_score(data, labels)
    except ValueError:
        # Deliberate best-effort: an unscorable partition yields no scores
        # instead of aborting the caller's experiment loop.
        return None, None, None
    return silh, davies, calinski