TP-APP-SUPERVISE/TP1/KNN.py
2021-12-15 19:20:05 +01:00

110 lines
No EOL
3 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 15 19:07:59 2021
@author: chouiya
"""
from time import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
# ********** Build the working set "data": 5000 samples drawn from MNIST **********
mnist = fetch_openml('mnist_784', as_frame=False)
# Draw the row indices WITHOUT replacement: with replacement the same image can
# appear several times and end up on both sides of a later train/test split,
# which inflates the measured accuracy. The population size is read from the
# loaded array instead of being hard-coded.
index = np.random.choice(mnist.data.shape[0], size=5000, replace=False)
data = mnist.data[index]
target = mnist.target[index]
# ********** k-NN classifier with k=10, trained on 80% of the sample **********
# One 80/20 split is enough: splitting twice only discards the first result.
xtrain, xtest, ytrain, ytest = train_test_split(data, target, train_size=0.8, test_size=0.2)
clf = neighbors.KNeighborsClassifier(n_neighbors=10)
clf.fit(xtrain, ytrain)
prediction = clf.predict(xtest)
# Score the predictions already computed rather than calling clf.score(), which
# would run the (expensive) k-NN inference on xtest a second time.
score = accuracy_score(ytest, prediction)
# ********** Predicted class and true class of test image 4 **********
print("Prédiction : {}, Valeur : {}, Score : {}".format(prediction[4], ytest[4], score))
# ********** Score on the training data **********
# The value printed below is the accuracy on the training set, in percent;
# the training error rate is 100 minus that value.
xtrain, xtest, ytrain, ytest = train_test_split(data, target, train_size=0.8, test_size=0.2)
clf = neighbors.KNeighborsClassifier(n_neighbors=10)
clf.fit(xtrain, ytrain)
prediction = clf.predict(xtrain)
# Reuse the predictions instead of letting clf.score() recompute them.
score = accuracy_score(ytrain, prediction)
print("score: ", score*100)
# ********** Vary the number of neighbours k from 2 to 15 with a loop **********
# All values of k are compared on the same train/test split.
xtrain, xtest, ytrain, ytest = train_test_split(data, target, train_size=0.8, test_size=0.2)
# Single definition of the k range, shared by the loop and the plot's x axis.
range_tab = range(2, 16)
tab_scores = []
for i in range_tab:
    clf = neighbors.KNeighborsClassifier(n_neighbors=i)
    clf.fit(xtrain, ytrain)
    prediction = clf.predict(xtest)
    # Score the predictions already computed; clf.score() would predict again.
    score = accuracy_score(ytest, prediction)
    tab_scores.append(score)
    print("K : {}, Score: {}".format(i, score*100))
# Plot score = f(k)
plt.plot(range_tab, tab_scores)
plt.xlabel("valeurs de K pour KNN")
plt.ylabel("score")
# Without show() the figure is never displayed when the file is run as a script.
plt.show()
# ********** Vary the number of neighbours k from 2 to 15 using KFold **********
# One fold per value of k: the number of splits is derived from the k range so
# the two cannot drift apart. Each k is fitted and scored on a single,
# different fold, so the printed scores are per-fold values, not averages.
k_values = range(2, 16)
kf = KFold(n_splits=len(k_values), shuffle=True)
for k, (train_index, test_index) in zip(k_values, kf.split(data)):
    xtrain, xtest = data[train_index], data[test_index]
    ytrain, ytest = target[train_index], target[test_index]
    clf = neighbors.KNeighborsClassifier(n_neighbors=k)
    clf.fit(xtrain, ytrain)
    prediction = clf.predict(xtest)
    # Score the predictions already computed; clf.score() would predict again.
    score = accuracy_score(ytest, prediction)
    print("K : {}, Score : {}".format(k, score*100))
# ********** Vary the share of samples used for training vs. testing **********
# s runs over 2..9, i.e. training fractions 0.2 .. 0.9 of the sample; the
# remainder is used as the test set. k is fixed at 5.
change_percent = range(2, 10)
for s in change_percent:
    xtrain, xtest, ytrain, ytest = train_test_split(data, target, train_size=(s/10))
    clasifier = neighbors.KNeighborsClassifier(n_neighbors=5)
    clasifier.fit(xtrain, ytrain)
    prediction = clasifier.predict(xtest)
    # Score the predictions already computed; clasifier.score() would predict again.
    score = accuracy_score(ytest, prediction)
    # The message carries a "%" sign, so report the size as a percentage
    # (s * 10) rather than as the fraction s / 10.
    print("Training size = {} %, Score = {} ".format(s * 10, score))