Implementing the knn algo
This commit is contained in:
commit
5ce154d774
1 changed files with 155 additions and 0 deletions
155
TP1/knn_vrai.py
Normal file
155
TP1/knn_vrai.py
Normal file
|
@ -0,0 +1,155 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat Dec 11 15:40:46 2021
|
||||
|
||||
@author: chouiya
|
||||
"""
|
||||
|
||||
from time import time
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn import neighbors
|
||||
|
||||
from sklearn.datasets import fetch_openml
|
||||
from sklearn.model_selection import train_test_split, KFold
|
||||
|
||||
from sklearn.metrics import accuracy_score
|
||||
|
||||
|
||||
|
||||
# ********** Sample 5000 examples from the MNIST dataset **********

# Fetch MNIST (70000 flattened 28x28 images, 784 features) as numpy arrays.
mnist = fetch_openml('mnist_784', as_frame=False)

# Draw 5000 random row indices — with replacement, so duplicates are possible.
index = np.random.randint(70000, size=5000)
data = mnist.data[index]
target = mnist.target[index]

# ************* Use 80% of the sampled data for training ***********

xtrain, xtest, ytrain, ytest = train_test_split(data, target, train_size=0.8)
|
||||
|
||||
# ********** k-NN classifier with k=10 ********

# NOTE(review): the original re-split data/target here, immediately discarding
# the split made just above; one split is enough, so the duplicate is removed.
clf = neighbors.KNeighborsClassifier(n_neighbors=10)
clf.fit(xtrain, ytrain)

# Predicted labels for the held-out set, plus overall test accuracy.
prediction = clf.predict(xtest)
score = clf.score(xtest, ytest)

# ********** True class of test image #4 and its predicted class ****************

print("Prédiction : {}, Valeur : {}, Score : {}".format(prediction[4], ytest[4], score))
|
||||
|
||||
# ********* Error rate on the training data *******

# Fresh split and fit, then score the model on the *training* samples
# themselves: this measures training accuracy (1 - training error).
xtrain, xtest, ytrain, ytest = train_test_split(data, target, train_size=0.8, test_size=0.2)
clf = neighbors.KNeighborsClassifier(n_neighbors=10)
clf.fit(xtrain, ytrain)
# score() runs the prediction internally, so the original's separate
# predict(xtrain) call was redundant work and its result was never used.
score = clf.score(xtrain, ytrain)
print("score: ", score*100)
|
||||
|
||||
|
||||
# ********** Vary the number of neighbours k from 2 to 15 using a loop *****

xtrain, xtest, ytrain, ytest = train_test_split(data, target, train_size=0.8, test_size=0.2)

tab_scores = []
for i in range(2, 16):
    clf = neighbors.KNeighborsClassifier(n_neighbors=i)
    clf.fit(xtrain, ytrain)
    # score() already performs the prediction; no separate predict() needed.
    score = clf.score(xtest, ytest)
    tab_scores.append(score)
    print("K : {}, Score: {}".format(i, score*100))

# Plot score = f(k).
range_tab = range(2, 16)
plt.plot(range_tab, tab_scores)
plt.xlabel("valeurs de K pour KNN")
plt.ylabel("score")
plt.show()  # the original never displayed the figure
|
||||
|
||||
# ******** Vary k from 2 to 15 using the KFold function ******

# 14 folds; fold f is evaluated with k = f + 2, so each value of k is tried
# on a different train/test partition (one fold per k, not a full grid).
kf = KFold(14, shuffle=True)
kf.get_n_splits(data)
for k, (train_index, test_index) in enumerate(kf.split(data), start=2):
    xtrain, xtest = data[train_index], data[test_index]
    ytrain, ytest = target[train_index], target[test_index]
    clf = neighbors.KNeighborsClassifier(n_neighbors=k)
    clf.fit(xtrain, ytrain)
    # The original computed predict(xtest) here but never used the result.
    score = clf.score(xtest, ytest)
    print("K : {}, Score : {}".format(k, score*100))
|
||||
|
||||
|
||||
|
||||
|
||||
# ********* Vary the training/test split percentage ************

for s in range(2, 10):
    # Training fraction grows from 20% to 90%.
    xtrain, xtest, ytrain, ytest = train_test_split(data, target, train_size=(s/10))
    classifier = neighbors.KNeighborsClassifier(n_neighbors=5)
    classifier.fit(xtrain, ytrain)
    # Print the actual percentage: the original printed the fraction (e.g. 0.2)
    # next to a "%" label, which was misleading.
    print("Training size = {} %, Score = {} ".format(s*10, classifier.score(xtest, ytest)))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# ******************* Vary the size of the training sample *****************

tab_sample = [5000, 6000, 8000, 10000, 20000, 50000, 70000]
for sample_size in tab_sample:
    # Re-sample the working set (with replacement) at the requested size.
    index = np.random.randint(70000, size=sample_size)
    data = mnist.data[index]
    target = mnist.target[index]
    clf = neighbors.KNeighborsClassifier(n_neighbors=10)
    xtrain, xtest, ytrain, ytest = train_test_split(data, target, train_size=0.8)
    clf.fit(xtrain, ytrain)
    # The original's predict(xtest) result was never used; score() suffices.
    score = clf.score(xtest, ytest)

    print("sample size= {} , accuracy = {} ".format(sample_size, score))
|
||||
|
||||
# ***************** Vary the distance metric (Minkowski order p) *******

xtrain, xtest, ytrain, ytest = train_test_split(data, target, train_size=0.8, test_size=0.2)

# p=1 -> manhattan, p=2 -> euclidean, p=3 -> order-3 minkowski.
# Hoisted out of the loop: the original rebuilt this list every iteration.
tab_dist = ["manhattan", "euclidean", "minkowski"]
for i in range(0, 3):
    clf = neighbors.KNeighborsClassifier(n_neighbors=10, p=(i+1))
    clf.fit(xtrain, ytrain)
    # NOTE(review): scored on the *training* set, as in the original —
    # scoring on xtest/ytest would measure generalization instead; confirm
    # which was intended.
    score = clf.score(xtrain, ytrain)
    print("type de distance : {}, Score: {}".format(tab_dist[i], score))
|
||||
|
||||
|
||||
# ************** Set n_jobs to -1 then 1 **********

# Compare single-core (n_jobs=1) against all-cores (n_jobs=-1) prediction
# time. Only predict() is timed; fitting is excluded from the measurement.
for i in [-1, 1]:
    clf = neighbors.KNeighborsClassifier(n_neighbors=5, n_jobs=i)
    clf.fit(xtrain, ytrain)
    time_start = time()
    prediction = clf.predict(xtest)
    time_stop = time()
    # The original also computed clf.score(xtest, ytest) here, outside the
    # timed region, and never used it — removed.
    print("n_jobs : {}, Temps total : {}".format(i, time_stop-time_start))
|
||||
|
||||
|
Loading…
Reference in a new issue