12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- Created on Fri Dec 3 16:29:12 2021
-
- @author: pfaure
- """
- from scipy.io import arff
- import numpy as np
- import time
- from sklearn import cluster, metrics, preprocessing
- import pandas as pd
-
-
def extract_data_2d(data_path):
    """Load a 2-D dataset from ``<data_path>.arff``.

    Parameters
    ----------
    data_path : str
        Path to the ARFF file, without the ``.arff`` extension.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_samples, 2) built from the first two
        attributes of each record.
    """
    # Pass the filename directly: loadarff opens and closes the file
    # itself, avoiding the leaked handle of open(...) without close().
    databrut = arff.loadarff(data_path + ".arff")
    return np.array([[row[0], row[1]] for row in databrut[0]])
-
-
def extract_data_3d(data_path):
    """Load a 3-D dataset from ``<data_path>.arff``.

    Parameters
    ----------
    data_path : str
        Path to the ARFF file, without the ``.arff`` extension.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_samples, 3) built from the first three
        attributes of each record.
    """
    # Pass the filename directly: loadarff opens and closes the file
    # itself, avoiding the leaked handle of open(...) without close().
    databrut = arff.loadarff(data_path + ".arff")
    return np.array([[row[0], row[1], row[2]] for row in databrut[0]])
-
-
def extract_data_txt(data_path):
    """Load whitespace-separated numeric rows from ``<data_path>.txt``.

    Parameters
    ----------
    data_path : str
        Path to the text file, without the ``.txt`` extension.

    Returns
    -------
    numpy.ndarray
        One row per line of the file, one column per whitespace-separated
        token, converted to float.
    """
    # 'with' guarantees the file is closed; the original open(...).readlines()
    # leaked the handle.
    with open(data_path + ".txt") as f:
        return np.array([[float(tok) for tok in line.split()] for line in f])
-
-
def extract_data_csv(data_path: str, first_col: int, last_col: int):
    """Load labels and a column range from ``<data_path>.csv``.

    Parameters
    ----------
    data_path : str
        Path to the CSV file, without the ``.csv`` extension.
    first_col, last_col : int
        Inclusive positional range of the columns to extract.

    Returns
    -------
    tuple
        ``(labels, values)`` where ``labels`` is the first column as a
        list and ``values`` is an (n_rows, n_cols) ndarray of the
        requested columns.
    """
    databrut = pd.read_csv(data_path + ".csv")
    # Single iloc slice + to_numpy replaces the original per-column
    # append loop and its double transpose (np.array -> zip(*) -> np.array).
    values = databrut.iloc[:, first_col:last_col + 1].to_numpy()
    return (list(databrut.iloc[:, 0]), values)
-
-
def scale_data(data):
    """Standardize features: remove the mean and scale to unit variance.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)

    Returns
    -------
    numpy.ndarray
        The standardized copy of ``data``.
    """
    return preprocessing.StandardScaler().fit_transform(data)
-
-
def apply_kmeans(data, k: int = 3, init="k-means++"):
    """Fit a k-means model and time the fit.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
    k : int, default 3
        Number of clusters.
    init : str or array-like, default "k-means++"
        Centroid initialization strategy, passed to ``cluster.KMeans``.

    Returns
    -------
    tuple
        ``(fitted model, elapsed milliseconds rounded to 2 decimals)``.
    """
    # perf_counter is monotonic; time.time() is wall-clock and can jump
    # (e.g. NTP adjustments), corrupting the measured duration.
    start = time.perf_counter()
    model = cluster.KMeans(n_clusters=k, init=init)
    model.fit(data)
    elapsed_ms = round((time.perf_counter() - start) * 1000, 2)
    return model, elapsed_ms
-
-
def apply_agglomerative_clustering(data, k: int = 3, linkage="complete"):
    """Fit an agglomerative (hierarchical) clustering model and time the fit.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
    k : int, default 3
        Number of clusters.
    linkage : str, default "complete"
        Linkage criterion passed to ``cluster.AgglomerativeClustering``.

    Returns
    -------
    tuple
        ``(fitted model, elapsed milliseconds rounded to 2 decimals)``.
    """
    # perf_counter is monotonic; time.time() can jump backwards/forwards.
    start = time.perf_counter()
    # The original passed affinity='euclidean'; that keyword was deprecated
    # in scikit-learn 1.2 and removed in 1.4. Euclidean is the default, so
    # omitting it keeps behavior identical on every version.
    model = cluster.AgglomerativeClustering(n_clusters=k, linkage=linkage)
    model.fit(data)
    elapsed_ms = round((time.perf_counter() - start) * 1000, 2)
    return model, elapsed_ms
-
-
def apply_DBSCAN(data, eps, min_pts):
    """Fit a DBSCAN density-based clustering model and time the fit.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
    eps : float
        Maximum distance between two samples for them to be neighbors.
    min_pts : int
        Minimum number of samples in a neighborhood for a core point.

    Returns
    -------
    tuple
        ``(fitted model, elapsed milliseconds rounded to 2 decimals)``.
    """
    # perf_counter is monotonic; time.time() is wall-clock and can jump.
    start = time.perf_counter()
    model = cluster.DBSCAN(eps=eps, min_samples=min_pts)
    model.fit(data)
    elapsed_ms = round((time.perf_counter() - start) * 1000, 2)
    return model, elapsed_ms
-
-
def apply_mean_shift(data, bandwidth: float):
    """Fit a mean-shift clustering model and time the fit.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
    bandwidth : float
        Bandwidth of the RBF kernel passed to ``cluster.MeanShift``.

    Returns
    -------
    tuple
        ``(fitted model, elapsed milliseconds rounded to 2 decimals)``.
    """
    # perf_counter is monotonic; time.time() is wall-clock and can jump.
    start = time.perf_counter()
    model = cluster.MeanShift(bandwidth=bandwidth)
    model.fit(data)
    elapsed_ms = round((time.perf_counter() - start) * 1000, 2)
    return model, elapsed_ms
-
-
def evaluate(data, model):
    """Score a fitted clustering model on three internal metrics.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        The data the model was fitted on.
    model : fitted clustering estimator exposing ``labels_``.

    Returns
    -------
    tuple
        ``(silhouette, davies_bouldin, calinski_harabasz)`` scores, or
        ``(None, None, None)`` when scoring is impossible (e.g. a single
        cluster raises ValueError).
    """
    labels = model.labels_
    try:
        return (
            metrics.silhouette_score(data, labels),
            metrics.davies_bouldin_score(data, labels),
            metrics.calinski_harabasz_score(data, labels),
        )
    except ValueError:
        return None, None, None
|