#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
k-NN classification experiments on a random subsample of MNIST.

The script downloads MNIST, draws a random subset of images and then:
  1. trains a 10-NN classifier and prints one prediction with the test score;
  2. prints the accuracy of a 10-NN classifier on its own training data;
  3. varies k from 2 to 15 on a single hold-out split and plots score = f(k);
  4. varies k from 2 to 15 using K-fold cross-validation;
  5. varies the training fraction from 20 % to 90 % with a 5-NN classifier.

Created on Wed Dec 15 19:07:59 2021

@author: chouiya
"""

from time import time

import matplotlib.pyplot as plt
import numpy as np
from sklearn import neighbors
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, train_test_split

# Number of MNIST images kept for the experiments.
SAMPLE_SIZE = 5000
# Values of k (number of neighbours) explored in experiments 3 and 4.
K_VALUES = range(2, 16)
# Number of cross-validation folds; independent of how many k values are tested.
N_FOLDS = 14


def _knn_accuracy(x_fit, y_fit, x_eval, y_eval, n_neighbors):
    """Fit a k-NN classifier and return its mean accuracy.

    Args:
        x_fit: samples used to fit the classifier.
        y_fit: labels matching ``x_fit``.
        x_eval: samples the fitted classifier is scored on.
        y_eval: labels matching ``x_eval``.
        n_neighbors: number of neighbours (k) given to the classifier.

    Returns:
        The value of ``KNeighborsClassifier.score(x_eval, y_eval)``.
    """
    model = neighbors.KNeighborsClassifier(n_neighbors)
    model.fit(x_fit, y_fit)
    return model.score(x_eval, y_eval)


# ********** Data: random subsample of SAMPLE_SIZE images **********
mnist = fetch_openml('mnist_784', as_frame=False)
# Draw distinct indices: sampling with replacement would duplicate images,
# and a duplicated image could end up in both the training and the test set.
rng = np.random.default_rng()
index = rng.choice(mnist.data.shape[0], size=SAMPLE_SIZE, replace=False)
data = mnist.data[index]
target = mnist.target[index]

# ********** k-NN classifier with k=10, 80 % of the data for training **********
xtrain, xtest, ytrain, ytest = train_test_split(data, target, train_size=0.8, test_size=0.2)
clf = neighbors.KNeighborsClassifier(10)
clf.fit(xtrain, ytrain)
prediction = clf.predict(xtest)
score = clf.score(xtest, ytest)

# ********** Predicted and true class of test image 4 **********
print("Prédiction : {}, Valeur : {}, Score : {}".format(prediction[4], ytest[4], score))

# ********** Accuracy on the training data (error rate = 1 - score) **********
xtrain, xtest, ytrain, ytest = train_test_split(data, target, train_size=0.8, test_size=0.2)
score = _knn_accuracy(xtrain, ytrain, xtrain, ytrain, 10)
print("score: ", score*100)

# ********** Vary k from 2 to 15 on a single hold-out split **********
xtrain, xtest, ytrain, ytest = train_test_split(data, target, train_size=0.8, test_size=0.2)
tab_scores = []
for k in K_VALUES:
    score = _knn_accuracy(xtrain, ytrain, xtest, ytest, k)
    tab_scores.append(score)
    print("K : {}, Score: {}".format(k, score*100))

# Plot score = f(k).
plt.plot(K_VALUES, tab_scores)
plt.xlabel("valeurs de K pour KNN")
plt.ylabel("score")

# ********** Vary k from 2 to 15 using K-fold cross-validation **********
kf = KFold(N_FOLDS, shuffle=True)
# Materialise the folds once so that every k is scored on the same splits;
# otherwise differences between folds would be mistaken for the effect of k.
folds = list(kf.split(data))
for k in K_VALUES:
    fold_scores = [
        _knn_accuracy(data[train_index], target[train_index],
                      data[test_index], target[test_index], k)
        for train_index, test_index in folds
    ]
    score = np.mean(fold_scores)
    print("K : {}, Score : {}".format(k, score*100))

# ********** Vary the training fraction from 20 % to 90 % (k=5) **********
change_percent = range(2, 10)
for s in change_percent:
    xtrain, xtest, ytrain, ytest = train_test_split(data, target, train_size=(s/10))
    score = _knn_accuracy(xtrain, ytrain, xtest, ytest, 5)
    # The message labels the value with '%', so print the percentage (s*10)
    # rather than the fraction (s/10).
    print("Training size = {} %, Score = {} ".format(s*10, score))

# Display the score = f(k) figure; done last so it does not block the
# remaining experiments on interactive backends.
plt.show()