Python programs for supervised learning (practical sessions TP1 to TP3: k-NN, MLP and SVM on MNIST).
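The scripts below save their figures into tp1Output/, tp2Output/ and tp3Output/ (see the PNG files added at the end of this diff). A minimal sketch to create those directories before a first run, assuming the scripts are launched from the repository root:

import os  # create the output directories expected by plt.savefig / bars_plot2
for directory in ("tp1Output", "tp2Output", "tp3Output"):
    os.makedirs(directory, exist_ok=True)  # no-op if the directory already exists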

TP1_prog1.py  (new file)
@@ -0,0 +1,34 @@
from sklearn import datasets
import matplotlib.pyplot as plt


###################
### EXERCISE 1 ###
###################
# Load the MNIST database
# as_frame=False keeps the data as NumPy arrays, which the reshape/indexing below relies on
# (on recent scikit-learn versions the default returns a pandas DataFrame)
mnist = datasets.fetch_openml('mnist_784', as_frame=False)

# A few exploratory commands
affichage = True
if affichage:
    print(mnist)
    print(mnist.data)
    print(mnist.target)
    print(len(mnist.data))
    help(len)
    print(mnist.data.shape)
    print(mnist.target.shape)
    print(mnist.data[0])
    print(mnist.data[0][1])
    print(mnist.data[:, 1])
    print(len(mnist.data[:100]))

# Extract and display the first few images of the database
images = mnist.data.reshape((-1, 28, 28))
for i in range(5):
    plt.imshow(images[i], cmap="gray", interpolation="nearest")
    plt.show()

# Display the class of the first image of the database
targets = mnist.target
print("Classe de la première image: %s" % (targets[0]))

TP1_prog2.py  (new file)
@@ -0,0 +1,151 @@
from sklearn import datasets
from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import numpy as np
from random import seed
from graphs import bars_plot2, bars_plot
import time

np.random.seed(0)  # initialise the NumPy random generator with seed=0
seed(0)


###################
### EXERCISE 2 ###
###################
# Load the MNIST database (as_frame=False keeps the data as NumPy arrays)
mnist = datasets.fetch_openml('mnist_784', as_frame=False)

# Extract the images of the database
images = mnist.data.reshape((-1, 28, 28))

# Extract the classes of the images
targets = mnist.target

# Draw a sample of 5000 examples
indexes = np.random.randint(70000, size=5000)
data = images[indexes, :]
data_targets = targets[indexes]


# Split the database into training and test sets, train a 10-NN classifier and report its scores
def k_10_classifier_and_score():
    pourcentage_donnees = 0.8
    xtrain, xtest, ytrain, ytest = train_test_split(data, data_targets, train_size=pourcentage_donnees)
    # Reshape xtrain and xtest to the dimensions expected by the classifier
    xtrain = xtrain.reshape((-1, 784))
    xtest = xtest.reshape((-1, 784))
    # Instantiate and train the classifier
    k = 10
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X=xtrain, y=ytrain)
    predictions = clf.predict(X=xtest)
    # True class of image 4 and its predicted class
    print("VERDICT IMAGE N°4\nClasse réelle: %s\nClasse prédite: %s" % (ytest[3], predictions[3]))
    # Score on the test sample
    score_xtest = clf.score(X=xtest, y=ytest)
    print("Score xtest = %.2f%%" % (score_xtest*100))
    # Error rate on the training data
    score_xtrain = clf.score(X=xtrain, y=ytrain)
    print("Score d'erreur xtrain = %.2f%%" % (100*(1-score_xtrain)))


def k_variation_and_score():
    # Vary the number k of neighbours and compute the resulting cross-validated score
    n_folds = 10
    kf = KFold(n_splits=n_folds, shuffle=True)
    k_score = {}
    for k in range(2, 16):
        sum_score = 0
        for train_index, test_index in kf.split(X=data):
            x_train, x_test = data[train_index], data[test_index]
            y_train, y_test = data_targets[train_index], data_targets[test_index]
            x_train = x_train.reshape((-1, 784))
            x_test = x_test.reshape((-1, 784))
            clf = KNeighborsClassifier(n_neighbors=k)
            clf.fit(X=x_train, y=y_train)
            sum_score += clf.score(X=x_test, y=y_test)
        k_score[k] = (sum_score/n_folds)*100
    bars_plot2(bar_labels=list(k_score.keys()), bar_heights=list(k_score.values()), xlabel="Valeur de k",
               ylabel="Score", fig_title="Score par valeur de k (nombre de voisins)", show=True, percent_mark=True,
               output="tp1Output/mnist_k_variation_scores.png")


def train_test_percent_variation_and_score():
    # Vary the train/test split percentage and plot the resulting scores
    k_score_percent = {}
    for pourcentage in [0.05*i for i in range(1, 20)]:
        xtrain_percent, xtest_percent, ytrain_percent, ytest_percent = train_test_split(data, data_targets, train_size=pourcentage)
        xtrain_percent = xtrain_percent.reshape((-1, 784))
        xtest_percent = xtest_percent.reshape((-1, 784))
        clf = KNeighborsClassifier(n_neighbors=10)
        clf.fit(X=xtrain_percent, y=ytrain_percent)
        k_score_percent[pourcentage] = clf.score(X=xtest_percent, y=ytest_percent)*100
    bars_plot2(bar_labels=["%.2f" % (percent) for percent in list(k_score_percent.keys())],
               bar_heights=list(k_score_percent.values()), xlabel="Pourcentage d'échantillons train",
               ylabel="Score",
               fig_title="Score par pourcentage de découpage de DATA (5000 images) en échantillons train et test",
               show=True, percent_mark=True, bar_colors="red",
               output="tp1Output/mnist_split_data_percentage_scores.png")


def train_percent_variation_and_score():
    # Fix the test sample size (25% of DATA), vary the training sample size and plot the scores
    k_score_train_variation = {}
    xtrain_global, xtest_25_percent, ytrain_global, ytest_25_percent = train_test_split(data, data_targets, train_size=0.75)
    xtest_25_percent = xtest_25_percent.reshape((-1, 784))
    for pourcentage in [(10*j)/100.0 for j in range(1, 10)]:
        xtrain_echantillon = xtrain_global[:int(pourcentage*len(xtrain_global))+1]
        xtrain_echantillon = xtrain_echantillon.reshape((-1, 784))
        ytrain_echantillon = ytrain_global[:int(pourcentage*len(xtrain_global))+1]
        clf = KNeighborsClassifier(n_neighbors=10)
        clf.fit(X=xtrain_echantillon, y=ytrain_echantillon)
        k_score_train_variation[pourcentage] = clf.score(X=xtest_25_percent, y=ytest_25_percent)*100
    bars_plot2(bar_labels=["%.2f" % (percent) for percent in list(k_score_train_variation.keys())],
               bar_heights=list(k_score_train_variation.values()),
               xlabel="Pourcentage d'échantillons train", ylabel="Score",
               fig_title="Score par pourcentage de découpage de train (%s images initialement) et %s images comme test (25%% de DATA)" % (
                   len(xtrain_global), len(xtest_25_percent)),
               show=True, percent_mark=True, bar_colors="magenta",
               output="tp1Output/mnist_split_various_train_percentage_scores_with_fixed_test.png")


def distance_type_variation_and_score():
    # Vary the distance metric with train=75% of DATA and test=25% of DATA, then plot the scores
    xtrain_distance, xtest_distance, ytrain_distance, ytest_distance = train_test_split(data, data_targets, train_size=0.75)
    xtrain_distance = xtrain_distance.reshape((-1, 784))
    xtest_distance = xtest_distance.reshape((-1, 784))
    distances = ["minkowski", "euclidean", "hamming", "canberra", "braycurtis"]
    distance_score = {}
    for distance in distances:
        clf = KNeighborsClassifier(n_neighbors=10, metric=distance)
        clf.fit(X=xtrain_distance, y=ytrain_distance)
        distance_score[distance] = clf.score(X=xtest_distance, y=ytest_distance)*100
    bars_plot2(bar_labels=list(distance_score.keys()), bar_heights=list(distance_score.values()),
               xlabel="Types de distance", ylabel="Score",
               fig_title="Score pour différents types de distance (métrique) avec train=75% de DATA et test=25% de DATA",
               show=True, percent_mark=True, bar_colors="green",
               output="tp1Output/mnist_various_distance_metric_scores.png")


def parameter_njobs_variation_and_score():
    # Vary the n_jobs parameter and measure the training time for each distance metric
    xtrain_distance, xtest_distance, ytrain_distance, ytest_distance = train_test_split(data, data_targets, train_size=0.75)
    xtrain_distance = xtrain_distance.reshape((-1, 784))
    xtest_distance = xtest_distance.reshape((-1, 784))
    distances = ["minkowski", "euclidean", "hamming", "canberra", "braycurtis"]
    for n_jobs in [6, -1]:
        distance_based_training_time = {}
        for distance in distances:
            clf = KNeighborsClassifier(n_neighbors=10, metric=distance, n_jobs=n_jobs)
            tic = time.process_time()
            clf.fit(X=xtrain_distance, y=ytrain_distance)
            toc = time.process_time()
            distance_based_training_time[distance] = (toc-tic)
        bars_plot2(bar_labels=list(distance_based_training_time.keys()),
                   bar_heights=list(distance_based_training_time.values()),
                   xlabel="Types de distance", ylabel="Temps d'entraînement (seconde)", show=True, bar_colors="pink",
                   fig_title="Temps d'entraînement pour différents types de distance avec train=75%% de DATA et test=25%% de DATA et\nN_JOBS=%s" % (
                       n_jobs))


k_10_classifier_and_score()
k_variation_and_score()
train_test_percent_variation_and_score()
train_percent_variation_and_score()
distance_type_variation_and_score()
parameter_njobs_variation_and_score()

TP2_prog1.py  (new file)
@@ -0,0 +1,217 @@
from sklearn import datasets
from sklearn.model_selection import train_test_split, KFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, zero_one_loss, recall_score
import numpy as np
import matplotlib.pyplot as plt
from random import seed, randint
from graphs import bars_plot2, bars_plot
import time


np.random.seed(0)  # initialise the NumPy random generator with seed=0
seed(0)


#######################
###  TP2 PROGRAM   ###
#######################
# Load the MNIST database (as_frame=False keeps the data as NumPy arrays)
mnist = datasets.fetch_openml('mnist_784', as_frame=False)

# Split the database
x_train = mnist.data[:4900, :]  ## training
y_train = mnist.target[:4900]
x_test = mnist.data[4900:7000, :]  ## test
y_test = mnist.target[4900:7000]

# Reshape x_train and x_test to the dimensions expected by the classifier
x_train = x_train.reshape((-1, 784))
x_test = x_test.reshape((-1, 784))


def modele_hidden_layer_sizes_50():
    # Create the model, train it and compute its precision
    modele = MLPClassifier(hidden_layer_sizes=(50,))
    modele.fit(X=x_train, y=y_train)
    score = modele.score(X=x_test, y=y_test)
    print("MLPClassifier[hidden_layer_sizes=(50)] score = %.2f" % (100*score))  # 95.19% observed after one run
    # True class of image 4 and its predicted class
    y_pred = modele.predict(X=x_test)
    print("VERDICT IMAGE N°4\nClasse réelle: %s\nClasse prédite: %s" % (y_test[3], y_pred[3]))
    # Compute the precision with sklearn's precision_score
    score_package = precision_score(y_true=y_test, y_pred=y_pred, average="micro")
    print("MLPClassifier[hidden_layer_sizes=(50)] score[using precision_score] = %.2f" % (100*score_package))


def layers_number_variation_and_scores():
    # Vary the number of hidden layers and plot the resulting scores
    scores_dictionary = {}
    layers = []
    for iteration in range(1, 11):
        layers.append(50)
        hidden_layers = tuple(layers)
        print(hidden_layers)
        modele_iter = MLPClassifier(hidden_layer_sizes=hidden_layers)
        modele_iter.fit(X=x_train, y=y_train)
        scores_dictionary["%s couches" % (iteration)] = modele_iter.score(X=x_test, y=y_test) * 100
    # Plot the scores as a curve over the number of layers
    print(scores_dictionary)
    plt.figure(figsize=(12.8, 9.6))
    plt.plot(range(1, 11), list(scores_dictionary.values()))
    plt.xlabel("Nombre de couches de 50 neurones")
    plt.xlim(left=1)
    plt.ylabel("Score")
    plt.title("Courbe d'évolution du score de précision en fonction du nombre de couches cachées")
    plt.savefig("tp2Output/scores_with_hidden_layers_variation.png", dpi="figure")
    plt.show()
    plt.close()


def five_models_with_different_layers():
    # Train five models with increasingly many randomly sized hidden layers and compare time vs. precision
    scores = {}
    times = {}
    layers = {}
    for counter in range(5):
        hl = tuple([randint(10, 300) for i in range(counter*2 + 2)])
        modele = MLPClassifier(hidden_layer_sizes=hl)
        tic = time.process_time()
        modele.fit(X=x_train, y=y_train)
        toc = time.process_time()
        times[counter] = toc - tic
        scores[counter] = modele.score(X=x_test, y=y_test) * 100
        layers[counter] = hl
    barWidth = 0.4
    y1 = [times[c] for c in range(5)]
    y2 = [scores[c] for c in range(5)]
    r1 = [0, 4, 8, 12, 16]
    r2 = [x + barWidth for x in r1]
    plt.figure(figsize=(12.8, 9.6))
    bar1 = plt.bar(r1, y1, width=barWidth, color=['red' for i in y1], linewidth=2, label="temps (sec)")
    bar2 = plt.bar(r2, y2, width=barWidth, color=['green' for i in y2], linewidth=4, label="precision (%)")
    plt.xticks([r + barWidth for r in r1], ['%s couches\n%s' % (i*2+2, layers[i]) for i in range(5)], rotation=10)
    plt.xlabel("Modèles")
    plt.ylabel("Valeur")
    plt.title("Evaluation du temps d'apprentissage et de la précision de cinq modèles de différentes couches")
    for rect in bar1 + bar2:
        height = rect.get_height()
        plt.text(rect.get_x() + rect.get_width() / 2.0, height, '%.2f' % height, ha='center', va='bottom')
    plt.legend()
    plt.savefig("tp2Output/comparison_times_scores_of_methods_different_layers", dpi="figure")
    plt.show()


def optimization_algorithms_variation():
    # Compare the three solvers while varying the number of hidden layers
    solvers = ["lbfgs", "sgd", "adam"]
    scores = {}
    scores["lbfgs"] = {}
    scores["sgd"] = {}
    scores["adam"] = {}
    for counter in range(1, 11):
        hl = tuple([randint(10, 300) for i in range(counter)])
        for solver in solvers:
            modele = MLPClassifier(hidden_layer_sizes=hl, solver=solver)
            modele.fit(X=x_train, y=y_train)
            scores[solver][counter] = modele.score(X=x_test, y=y_test) * 100
    print(scores)
    plt.figure(figsize=(12.8, 9.6))
    plt.plot(range(1, 11), list(scores["lbfgs"].values()), label="L-BFGS")
    plt.plot(range(1, 11), list(scores["sgd"].values()), label="SGD")
    plt.plot(range(1, 11), list(scores["adam"].values()), label="ADAM")
    plt.xlabel("Nombre de couches cachées")
    plt.xlim(left=1)
    plt.ylabel("Score de précision")
    plt.title("Courbe d'évolution du score de précision en fonction du nombre de couches cachées")
    plt.legend()
    plt.savefig("tp2Output/scores_with_hidden_layers_variation_different_solvers.png", dpi="figure")
    plt.show()
    plt.close()


def activation_functions_variation():
    # Compare the four activation functions while varying the number of hidden layers
    activations = ["identity", "logistic", "tanh", "relu"]
    scores = {}
    scores["identity"] = {}
    scores["logistic"] = {}
    scores["tanh"] = {}
    scores["relu"] = {}
    for counter in range(1, 11):
        hl = tuple([randint(10, 300) for i in range(counter)])
        for activation in activations:
            modele = MLPClassifier(hidden_layer_sizes=hl, activation=activation)
            modele.fit(X=x_train, y=y_train)
            scores[activation][counter] = modele.score(X=x_test, y=y_test) * 100
    plt.figure(figsize=(12.8, 9.6))
    plt.plot(range(1, 11), list(scores["identity"].values()), label="identity")
    plt.plot(range(1, 11), list(scores["logistic"].values()), label="logistic")
    plt.plot(range(1, 11), list(scores["tanh"].values()), label="tanh")
    plt.plot(range(1, 11), list(scores["relu"].values()), label="relu")
    plt.xlabel("Nombre de couches cachées")
    plt.xlim(left=1)
    plt.ylabel("Score de précision")
    plt.title("Courbe d'évolution du score de précision en fonction du nombre de couches cachées")
    plt.legend()
    plt.savefig("tp2Output/scores_with_hidden_layers_variation_different_activations_with_adam.png", dpi="figure")
    plt.show()
    plt.close()


def alpha_parameter_variation():
    # Vary the L2 penalty parameter alpha for a fixed architecture and plot the scores
    scores = []
    alphas = [10**(-i) for i in range(1, 9)]
    alphas = alphas[::-1]
    for alpha in alphas:
        # hl = tuple([randint(10, 300) for i in range(counter)])
        modele = MLPClassifier(hidden_layer_sizes=(60, 47, 179, 251, 296, 61, 191, 232, 171, 114), alpha=alpha)
        modele.fit(X=x_train, y=y_train)
        scores.append(modele.score(X=x_test, y=y_test) * 100)
    plt.figure(figsize=(12.8, 9.6))
    plt.plot(alphas, scores)
    plt.xlabel("Valeurs d'alpha")
    plt.xlim(left=1e-8)
    plt.ylabel("Score de précision")
    plt.title("Courbe d'évolution du score de précision en fonction du paramètre alpha")
    plt.savefig("tp2Output/scores_with_alpha_variation_same_hidden_layers_sizes.png", dpi="figure")
    plt.show()
    plt.close()


def times_scores_recalls_errors_comparison():
    # Compare training time, precision, recall and error rate for five models with different layers
    scores = {}
    times = {}
    recalls = {}
    errors = {}
    layers = {}
    for counter in range(5):
        hl = tuple([randint(10, 300) for i in range(counter * 2 + 2)])
        modele = MLPClassifier(hidden_layer_sizes=hl)
        tic = time.process_time()
        modele.fit(X=x_train, y=y_train)
        toc = time.process_time()
        y_pred = modele.predict(X=x_test)
        times[counter] = toc - tic
        scores[counter] = modele.score(X=x_test, y=y_test) * 100
        recalls[counter] = recall_score(y_true=y_test, y_pred=y_pred, average="micro")
        errors[counter] = zero_one_loss(y_true=y_test, y_pred=y_pred) * 100
        layers[counter] = hl
    barWidth = 0.4
    y1 = [times[c] for c in range(5)]
    y2 = [scores[c] for c in range(5)]
    y3 = [recalls[c] for c in range(5)]
    y4 = [errors[c] for c in range(5)]
    r1 = [0, 4, 8, 12, 16]
    r2 = [x + barWidth for x in r1]
    r3 = [x + 2 * barWidth for x in r1]
    r4 = [x + 3 * barWidth for x in r1]
    plt.figure(figsize=(12.8, 9.6))
    bar1 = plt.bar(r1, y1, width=barWidth, color=['red' for i in y1], linewidth=2, label="temps (sec)")
    bar2 = plt.bar(r2, y2, width=barWidth, color=['green' for i in y2], linewidth=4, label="precision (%)")
    bar3 = plt.bar(r3, y3, width=barWidth, color=['red' for i in y3], linewidth=1, label="recall")
    bar4 = plt.bar(r4, y4, width=barWidth, color=['magenta' for i in y4], linewidth=3, label="error (%)")
    plt.xticks([r + 1.5*barWidth for r in r1], ['%s couches\n%s' % (i * 2 + 2, layers[i]) for i in range(5)], rotation=10)
    plt.xlabel("Modèles")
    plt.ylabel("Valeur")
    plt.title("Evaluation du temps d'apprentissage, de la précision, du recall et de l'erreur de cinq modèles de différentes couches")
    for rect in bar1 + bar2 + bar3 + bar4:
        height = rect.get_height()
        plt.text(rect.get_x() + rect.get_width() / 2.0, height, '%.2f' % height, ha='center', va='bottom')
    plt.legend()
    plt.savefig("tp2Output/comparison_times_scores_recalls_errors_of_methods_different_layers", dpi="figure")
    plt.show()
    plt.close()


modele_hidden_layer_sizes_50()
layers_number_variation_and_scores()
five_models_with_different_layers()
optimization_algorithms_variation()
activation_functions_variation()
alpha_parameter_variation()
times_scores_recalls_errors_comparison()

TP3_prog1.py  (new file)
@@ -0,0 +1,257 @@
from sklearn import datasets
from sklearn.model_selection import train_test_split, KFold
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
import numpy as np
import matplotlib.pyplot as plt
from random import seed, randint
from graphs import bars_plot2, curve_plot
from statistics import mean
import time


np.random.seed(0)  # initialise the NumPy random generator with seed=0
seed(0)


#######################
###  TP3 PROGRAM   ###
#######################
# Load the MNIST database (as_frame=False keeps the data as NumPy arrays)
mnist = datasets.fetch_openml('mnist_784', as_frame=False)

# Split the database
x_train = mnist.data[:3500, :]  ## training
y_train = mnist.target[:3500]
x_test = mnist.data[3500:5001, :]  ## test
y_test = mnist.target[3500:5001]

# print(x_train.shape, x_test.shape)

# Reshape x_train and x_test to the dimensions expected by the classifier
x_train = x_train.reshape((-1, 784))
x_test = x_test.reshape((-1, 784))


def kernel_variation():
    # Vary the SVM kernel and plot the resulting scores
    kernels = ["linear", "poly", "rbf", "sigmoid"]
    scores = {}
    for kernel in kernels:
        clsvm = SVC(kernel=kernel)
        clsvm.fit(X=x_train, y=y_train)
        scores[kernel] = clsvm.score(X=x_test, y=y_test) * 100
        print("Score avec kernel=%s: %.2f" % (kernel, scores[kernel]))
    bars_plot2(bar_labels=kernels, bar_heights=list(scores.values()), xlabel="Kernels utilisés",
               ylabel="Score", fig_title="Score par kernel utilisé", show=True, percent_mark=True,
               output="tp3Output/scores_for_kernel_variation.png")


def kernel_and_c_variation():
    # Vary both the kernel and the C parameter, then plot the scores as curves
    kernels = ["linear", "poly", "rbf", "sigmoid"]
    scores = {}
    c_values = [0.1 + 0.18 * x for x in range(6)]
    for kernel in kernels:
        scores[kernel] = {}
        for c_value in c_values:
            clsvm = SVC(kernel=kernel, C=c_value)
            clsvm.fit(X=x_train, y=y_train)
            scores[kernel][c_value] = clsvm.score(X=x_test, y=y_test) * 100
            print("Score avec kernel=%s: %.2f" % (kernel, scores[kernel][c_value]))
    plt.figure(figsize=(12.8, 9.6))
    plt.plot(c_values, list(scores["linear"].values()), label="linear")
    plt.plot(c_values, list(scores["poly"].values()), label="poly")
    plt.plot(c_values, list(scores["rbf"].values()), label="rbf")
    plt.plot(c_values, list(scores["sigmoid"].values()), label="sigmoid")
    plt.xlabel("Valeurs de C")
    plt.xlim(left=0.1)
    plt.ylabel("Score (précision)")
    plt.title("Courbe d'évolution de la précision en fonction de C")
    plt.legend()
    plt.savefig("tp3Output/scores_all_kernel_with_c_variation.png", dpi="figure")
    plt.show()
    plt.close()


def train_test_error_with_c_variation():
    # Compare the train and test error rates for each kernel while varying C
    kernels = ["linear", "poly", "rbf", "sigmoid"]
    scores = {}
    c_values = [0.1 + 0.18 * x for x in range(6)]
    data_types = ["train", "test"]
    for data_type in data_types:
        scores[data_type] = {}
        for kernel in kernels:
            scores[data_type][kernel] = {}
            for c_value in c_values:
                clsvm = SVC(kernel=kernel, C=c_value)
                clsvm.fit(X=x_train, y=y_train)
                if data_type == "train":
                    scores[data_type][kernel][c_value] = (1 - clsvm.score(X=x_train, y=y_train)) * 100
                else:
                    scores[data_type][kernel][c_value] = (1 - clsvm.score(X=x_test, y=y_test)) * 100
                print("Score d'erreur données=%s avec kernel=%s: %.2f" % (data_type, kernel, scores[data_type][kernel][c_value]))
    plt.figure(figsize=(12.8, 9.6))
    plt.plot(c_values, list(scores["train"]["linear"].values()), label="train linear", marker="o")
    plt.plot(c_values, list(scores["train"]["poly"].values()), label="train poly", marker="o")
    plt.plot(c_values, list(scores["train"]["rbf"].values()), label="train rbf", marker="o")
    plt.plot(c_values, list(scores["train"]["sigmoid"].values()), label="train sigmoid", marker="o")
    plt.plot(c_values, list(scores["test"]["linear"].values()), label="test linear", marker="*")
    plt.plot(c_values, list(scores["test"]["poly"].values()), label="test poly", marker="*")
    plt.plot(c_values, list(scores["test"]["rbf"].values()), label="test rbf", marker="*")
    plt.plot(c_values, list(scores["test"]["sigmoid"].values()), label="test sigmoid", marker="*")
    plt.xlabel("Valeurs de C")
    plt.xlim(left=0.1)
    plt.ylabel("Error score")
    plt.title("Courbe d'évolution du pourcentage d'erreur de précision en fonction de C")
    plt.legend()
    plt.savefig("tp3Output/error_scores_train_test_with_kernel_and_c_variation.png", dpi="figure")
    plt.show()
    plt.close()


def generate_confusion_matrix():
    # Display the confusion matrix on the test set for each kernel
    kernels = ["linear", "poly", "rbf", "sigmoid"]
    for kernel in kernels:
        clsvm = SVC(kernel=kernel)
        clsvm.fit(X=x_train, y=y_train)
        # ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, removed from recent scikit-learn
        ConfusionMatrixDisplay.from_estimator(clsvm, x_test, y_test, values_format=".1f")
        plt.show()


def time_precision_recall_error():
    # Measure training time, precision, recall and error for each kernel while varying C
    kernels = ["linear", "poly", "rbf", "sigmoid"]
    times = {}
    scores = {}
    recalls = {}
    errors = {}
    c_values = [0.1 + 0.18 * x for x in range(6)]
    for kernel in kernels:
        times[kernel] = {}
        scores[kernel] = {}
        recalls[kernel] = {}
        errors[kernel] = {}
        for c_value in c_values:
            clsvm = SVC(kernel=kernel, C=c_value)
            tic = time.process_time()
            clsvm.fit(X=x_train, y=y_train)
            toc = time.process_time()
            y_pred = clsvm.predict(X=x_test)
            times[kernel][c_value] = toc - tic
            scores[kernel][c_value] = clsvm.score(X=x_test, y=y_test) * 100
            recalls[kernel][c_value] = recall_score(y_true=y_test, y_pred=y_pred, average="micro")
            errors[kernel][c_value] = 100 - scores[kernel][c_value]
            print("Score avec kernel=%s: %.2f" % (kernel, scores[kernel][c_value]))
    plt.figure(figsize=(12.8, 9.6))
    plt.plot(c_values, list(times["linear"].values()), label="linear")
    plt.plot(c_values, list(times["poly"].values()), label="poly")
    plt.plot(c_values, list(times["rbf"].values()), label="rbf")
    plt.plot(c_values, list(times["sigmoid"].values()), label="sigmoid")
    plt.xlabel("Valeurs de C")
    plt.xlim(left=0.1)
    plt.ylabel("Temps d'apprentissage (secondes)")
    plt.title("Courbe d'évolution du temps d'apprentissage en fonction de C")
    plt.legend()
    plt.savefig("tp3Output/times_all_kernel_with_c_variation.png", dpi="figure")
    plt.show()
    plt.close()
    ###
    plt.figure(figsize=(12.8, 9.6))
    plt.plot(c_values, list(scores["linear"].values()), label="linear precision")
    plt.plot(c_values, list(scores["poly"].values()), label="poly precision")
    plt.plot(c_values, list(scores["rbf"].values()), label="rbf precision")
    plt.plot(c_values, list(scores["sigmoid"].values()), label="sigmoid precision")
    plt.plot(c_values, list(errors["linear"].values()), label="linear error", marker="*", linestyle="-.")
    plt.plot(c_values, list(errors["poly"].values()), label="poly error", marker="*", linestyle="-.")
    plt.plot(c_values, list(errors["rbf"].values()), label="rbf error", marker="*", linestyle="-.")
    plt.plot(c_values, list(errors["sigmoid"].values()), label="sigmoid error", marker="*", linestyle="-.")
    plt.xlabel("Valeurs de C")
    plt.xlim(left=0.1)
    plt.ylabel("Score")
    plt.title("Courbe d'évolution de la précision et de l'erreur de classification en fonction de C")
    plt.legend()
    plt.savefig("tp3Output/scores_and_errors_all_kernel_with_c_variation.png", dpi="figure")
    plt.show()
    plt.close()
    ###
    plt.figure(figsize=(12.8, 9.6))
    plt.plot(c_values, list(recalls["linear"].values()), label="linear")
    plt.plot(c_values, list(recalls["poly"].values()), label="poly")
    plt.plot(c_values, list(recalls["rbf"].values()), label="rbf")
    plt.plot(c_values, list(recalls["sigmoid"].values()), label="sigmoid")
    plt.xlabel("Valeurs de C")
    plt.xlim(left=0.1)
    plt.ylabel("Recall score")
    plt.title("Courbe d'évolution du recall en fonction de C")
    plt.legend()
    plt.savefig("tp3Output/recalls_all_kernel_with_c_variation.png", dpi="figure")
    plt.show()
    plt.close()


def methods_comparison():
    # Compare the three classifiers (k-NN, MLP, SVM) on time, precision, recall and error
    times = {}
    scores = {}
    recalls = {}
    errors = {}
    matrices = {}
    ### Method 1: k nearest neighbours
    knn = KNeighborsClassifier(n_neighbors=10)
    tic = time.process_time()
    knn.fit(X=x_train, y=y_train)
    toc = time.process_time()
    y_pred = knn.predict(X=x_test)
    times["knn"] = toc - tic
    scores["knn"] = knn.score(X=x_test, y=y_test) * 100
    recalls["knn"] = recall_score(y_true=y_test, y_pred=y_pred, average="micro")
    errors["knn"] = 100 - scores["knn"]
    # matrices["knn"] = confusion_matrix(y_true=y_test, y_pred=y_pred)
    ConfusionMatrixDisplay.from_estimator(knn, x_test, y_test, values_format=".1f")
    plt.show()
    ### Method 2: multi-layer perceptron
    mlp = MLPClassifier(hidden_layer_sizes=(50,))
    tic = time.process_time()
    mlp.fit(X=x_train, y=y_train)
    toc = time.process_time()
    y_pred = mlp.predict(X=x_test)
    times["mlp"] = toc - tic
    scores["mlp"] = mlp.score(X=x_test, y=y_test) * 100
    recalls["mlp"] = recall_score(y_true=y_test, y_pred=y_pred, average="micro")
    errors["mlp"] = 100 - scores["mlp"]
    # matrices["mlp"] = confusion_matrix(y_true=y_test, y_pred=y_pred)
    ConfusionMatrixDisplay.from_estimator(mlp, x_test, y_test, values_format=".1f")
    plt.show()
    ### Method 3: support vector machine
    svc = SVC(kernel="rbf", C=1)
    tic = time.process_time()
    svc.fit(X=x_train, y=y_train)
    toc = time.process_time()
    y_pred = svc.predict(X=x_test)
    times["svc"] = toc - tic
    scores["svc"] = svc.score(X=x_test, y=y_test) * 100
    recalls["svc"] = recall_score(y_true=y_test, y_pred=y_pred, average="micro")
    errors["svc"] = 100 - scores["svc"]
    # matrices["svc"] = confusion_matrix(y_true=y_test, y_pred=y_pred)
    ConfusionMatrixDisplay.from_estimator(svc, x_test, y_test, values_format=".1f")
    plt.show()
    ### Plotting
    barWidth = 0.5
    y1 = [times["knn"], times["mlp"], times["svc"]]
    y2 = [scores["knn"], scores["mlp"], scores["svc"]]
    y3 = [recalls["knn"], recalls["mlp"], recalls["svc"]]
    y4 = [errors["knn"], errors["mlp"], errors["svc"]]
    r1 = [0, 3, 6]
    r2 = [x + barWidth for x in r1]
    r3 = [x + 2 * barWidth for x in r1]
    r4 = [x + 3 * barWidth for x in r1]
    plt.figure(figsize=(12.8, 9.6))
    bar1 = plt.bar(r1, y1, width=barWidth, color=['green' for i in y1], linewidth=2, label="temps (sec)")
    bar2 = plt.bar(r2, y2, width=barWidth, color=['yellow' for i in y2], linewidth=4, label="precision (%)")
    bar3 = plt.bar(r3, y3, width=barWidth, color=['red' for i in y3], linewidth=1, label="recall")
    bar4 = plt.bar(r4, y4, width=barWidth, color=['magenta' for i in y4], linewidth=3, label="error (%)")
    plt.xticks([r + 1.5 * barWidth for r in r1], ['Modèle K-nn', 'Modèle MLP', 'Modèle SVM'])
    plt.xlabel("Méthodes")
    plt.ylabel("Valeur")
    plt.title("Evaluation de différentes métriques de performance pour chacune des méthodes vues")
    for rect in bar1 + bar2 + bar3 + bar4:
        height = rect.get_height()
        plt.text(rect.get_x() + rect.get_width() / 2.0, height, '%.2f' % height, ha='center', va='bottom')
    plt.legend()
    # Save before showing: once the window is closed, saving would write an empty figure
    plt.savefig("tp3Output/global_comparison_of_methods_different_metrics", dpi="figure")
    plt.show()


kernel_variation()
kernel_and_c_variation()
train_test_error_with_c_variation()
generate_confusion_matrix()
time_precision_recall_error()
methods_comparison()

graphs.py  (new file)
@@ -0,0 +1,54 @@
def bars_plot(bar_labels=None, bar_heights=None, bar_colors="blue", bar_width=0.25, xlabel="xlabel", ylabel="ylabel", fig_title="title", figsize=(12.8, 9.6), output="./bars.png", show=False):
    import matplotlib.pyplot as plt
    plt.figure(figsize=figsize)
    plt.bar(x=list(map(str, bar_labels)), height=bar_heights, width=bar_width, color=bar_colors)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(fig_title)
    plt.savefig(output, dpi="figure")
    if show:
        plt.show()
    plt.close()


def clustering_plot(abcissas=None, ordinates=None, predictions=None, marker="o", xlabel="xlabel", ylabel="ylabel", fig_title="title", figsize=(12.8, 9.6), output="./bars.png", show=False):
    import matplotlib.pyplot as plt
    plt.figure(figsize=figsize)
    plt.scatter(x=abcissas, y=ordinates, c=predictions, marker=marker)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(fig_title)
    plt.savefig(output, dpi="figure")
    if show:
        plt.show()
    plt.close()


def bars_plot2(bar_labels=None, bar_heights=None, bar_colors="blue", bar_width=0.25, xlabel="xlabel", ylabel="ylabel", fig_title="title", figsize=(12.8, 9.6), output="./bars.png", show=False, show_heights=True, percent_mark=False):
    import matplotlib.pyplot as plt
    plt.figure(figsize=figsize)
    bar = plt.bar(x=list(map(str, bar_labels)), height=bar_heights, width=bar_width, color=bar_colors)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(fig_title)
    if show_heights:
        for rect in bar:
            height = rect.get_height()
            if percent_mark:
                plt.text(rect.get_x() + rect.get_width() / 2.0, height, '%.2f%%' % height, ha='center', va='bottom')
            else:
                plt.text(rect.get_x() + rect.get_width() / 2.0, height, '%.2f' % height, ha='center', va='bottom')
    plt.savefig(output, dpi="figure")
    if show:
        plt.show()
    plt.close()


def curve_plot(abcissas=None, ordinates=None, xlabel="xlabel", ylabel="ylabel", fig_title="title", figsize=(12.8, 9.6), output="./bars.png", show=False):
    import matplotlib.pyplot as plt
    plt.figure(figsize=figsize)
    plt.plot(abcissas, ordinates)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(fig_title)
    plt.savefig(output, dpi="figure")
    if show:
        plt.show()
    plt.close()

New binary files (generated figures):

BIN  tp1Output/mnist_k_fold_scores.png  (30 KiB)
BIN  tp1Output/mnist_split_data_percentage_scores.png  (49 KiB)
BIN  tp1Output/mnist_split_data_percentage_scores1.png  (49 KiB)
BIN  tp1Output/mnist_various_distance_metric_njobs_at_1_timesX.png  (46 KiB)
BIN  tp1Output/mnist_various_distance_metric_scores.png  (36 KiB)
BIN  tp2Output/scores_with_hidden_layers_variation.png  (46 KiB)
BIN  tp3Output/comparison_knn_confusion_matrix.png  (41 KiB)
BIN  tp3Output/comparison_mlp_confusion_matrix.png  (48 KiB)
BIN  tp3Output/comparison_svc_confusion_matrix.png  (41 KiB)
BIN  tp3Output/confusion_matrix_with_linear_as_kernel.png  (47 KiB)
BIN  tp3Output/confusion_matrix_with_poly_as_kernel.png  (45 KiB)
BIN  tp3Output/confusion_matrix_with_rbf_as_kernel.png  (42 KiB)
BIN  tp3Output/confusion_matrix_with_sigmoid_as_kernel.png  (50 KiB)
BIN  tp3Output/global_comparison_of_methods_different_metrics.png  (38 KiB)
BIN  tp3Output/recalls_all_kernel_with_c_variation.png  (54 KiB)
BIN  tp3Output/scores_all_kernel_with_c_variation.png  (54 KiB)
BIN  tp3Output/scores_and_errors_all_kernel_with_c_variation.png  (61 KiB)
BIN  tp3Output/scores_for_kernel_variation.png  (24 KiB)
BIN  tp3Output/times_all_kernel_with_c_variation.png  (62 KiB)