Solved – How to get 38.6% accuracy on CIFAR-10 dataset using Nearest Neighbor

Tags: classification, k-nearest-neighbour, machine learning, python

I just read these notes from a Stanford course. They say "you would see that this classifier [Nearest Neighbor] only achieves 38.6% on CIFAR-10". I wrote my own implementation, but I only got 24.9% accuracy with L1 and 25.3% accuracy with L2 on the test set. This is the code:

class NearestNeighbor(object):
    """1-nearest-neighbour classifier using L1 or L2 distance between rows.

    Training only stores the data; prediction compares every query row
    against every stored row and copies the label of the closest one.
    """

    def __init__(self):
        pass

    def train(self, X, y):
        """Remember the training set.

        X is a 2-D array with one example per row; y is a 1-D array where
        y[j] is the label of X[j].
        """
        self.X = X
        self.y = y

    def predict(self, X, l1=True):
        """Return the predicted label for every row of X.

        X is a 2-D array with the same number of columns as the training
        data.  With ``l1=True`` (default) the L1 distance is used,
        otherwise the L2 distance.  The result has the dtype of the
        training labels.
        """
        num_pred = X.shape[0]
        pred = np.zeros(num_pred, dtype=self.y.dtype)
        # Subtracting two unsigned-integer arrays (e.g. raw uint8 pixels)
        # wraps around instead of going negative, which silently corrupts
        # every distance.  Promote the query row to a signed type that is
        # at least int32 so the difference is computed correctly; float
        # inputs keep their float dtype.
        work_dtype = np.result_type(self.X.dtype, X.dtype, np.int32)
        for i in range(num_pred):
            diff = self.X - X[i, :].astype(work_dtype)
            if l1:  # L1 distance
                distances = np.sum(np.abs(diff), axis=1)
            else:  # L2 distance
                distances = np.sqrt(np.sum(np.square(diff), axis=1))
            min_index = np.argmin(distances)
            pred[i] = self.y[min_index]
        return pred

This is how I'm reading the dataset:

def load_pickle(f):
    """Open the file at path ``f`` in binary mode and return its unpickled content.

    The 'latin1' encoding is passed to ``pickle.load`` so that byte strings
    written by Python 2 can be decoded under Python 3.
    """
    with open(f, 'rb') as fo:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        return pickle.load(fo, encoding='latin1')

def load_data(directory='./cifar-10-batches-py/'):
    """Load the five training batches and the test batch found in ``directory``.

    Returns ``(X_train, y_train, X_test, y_test)``.  The training arrays are
    the per-batch 'data' / 'labels' entries concatenated along axis 0; the
    test arrays come from the single 'test_batch' file.  Labels are returned
    as numpy arrays for both splits.

    NOTE(review): the pixel arrays are returned with whatever dtype the
    pickles contain (presumably uint8 for the standard CIFAR-10 files --
    confirm).  Unsigned data must be cast to a signed or float type before
    computing differences, otherwise the subtraction wraps around.
    """
    import os  # local import: this snippet has no import block of its own

    # Reading training batches
    train_batches = []
    for i in range(1, 6):
        # os.path.join works whether or not `directory` ends with a separator.
        train_batch_file = os.path.join(directory, 'data_batch_' + str(i))
        train_batches.append(load_pickle(train_batch_file))
    X_train = np.concatenate([batch['data'] for batch in train_batches], 0)
    y_train = np.concatenate([batch['labels'] for batch in train_batches], 0)

    # Reading test batch
    test_batch_file = os.path.join(directory, 'test_batch')
    test_batch = load_pickle(test_batch_file)
    X_test = test_batch['data']
    # Convert to an array so the test labels have the same type as y_train.
    y_test = np.array(test_batch['labels'])
    return X_train, y_train, X_test, y_test

I don't know whether I'm doing something wrong. If I'm not, is it possible to get 38.6% accuracy with plain NN? Under which conditions?

Best Answer

I get 31% using only b in range(1,2) because I am low on memory. You can get the best result with this code by using b in range(1,6):

import numpy as np
import pickle
import os
#from settings import PROJECT_ROOT
import sys
import platform
class NearestNeighbor(object):
    """1-nearest-neighbour classifier based on the L1 distance between rows."""

    def __init__(self):
        pass

    def train(self, X, y):
        """ X is N x D where each row is an example. Y is 1-dimension of size N """
        # the nearest neighbor classifier simply remembers all the training data
        self.Xtr = X
        self.ytr = y

    def predict(self, X):
        """ X is N x D where each row is an example we wish to predict label for

        Returns a 1-D array with one predicted label per row of X, using the
        dtype of the training labels.  The number of rows and then the index
        of every processed row are printed as a progress indicator.
        """
        num_test = X.shape[0]

        print(num_test)
        # lets make sure that the output type matches the input type
        Ypred = np.zeros(num_test, dtype=self.ytr.dtype)
        # Unsigned-integer inputs (e.g. raw uint8 pixels) would wrap around on
        # subtraction and corrupt the distances, so promote each query row to
        # a signed type of at least int32; float inputs keep their dtype.
        work_dtype = np.result_type(self.Xtr.dtype, X.dtype, np.int32)
        # loop over all test rows
        for i in range(num_test):
            print(i)
            # find the nearest training image to the i'th test image
            # using the L1 distance (sum of absolute value differences)
            diff = self.Xtr - X[i, :].astype(work_dtype)
            distances = np.sum(np.abs(diff), axis=1)
            min_index = np.argmin(distances)  # get the index with smallest distance
            Ypred[i] = self.ytr[min_index]  # predict the label of the nearest example
        return Ypred
def load_pickle(f):
    """Unpickle and return one object from the open binary file ``f``.

    Under Python 3 the 'latin1' encoding is passed to ``pickle.load``;
    under Python 2 it is called without an encoding.  Any other major
    version raises ``ValueError``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    py_version = platform.python_version_tuple()
    major = py_version[0]
    if major == '3':
        return pickle.load(f, encoding='latin1')
    if major == '2':
        return pickle.load(f)
    raise ValueError("invalid python version: {}".format(py_version))

def load_CIFAR_batch(filename):
    """ load single batch of cifar

    Opens ``filename`` in binary mode, unpickles it with ``load_pickle`` and
    returns ``(X, Y)``: X is the 'data' entry reshaped to (N, 3, 32, 32),
    transposed to (N, 32, 32, 3) and converted to float; Y is the 'labels'
    entry as a numpy array.
    """
    with open(filename, 'rb') as f:
        datadict = load_pickle(f)
    X = datadict['data']
    Y = datadict['labels']
    # -1 lets numpy infer the number of images instead of hard-coding 10000,
    # so batches of any size load; a 10000-image batch behaves as before.
    # The transpose moves the size-3 axis (presumably the colour channels)
    # to the end.
    X = X.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
    Y = np.array(Y)
    return X, Y

def load_CIFAR10(ROOT, num_train_batches=2):
    """ load cifar training batches and the test batch from directory ROOT

    Training files 'data_batch_1' .. 'data_batch_<num_train_batches>' are
    loaded with ``load_CIFAR_batch`` and concatenated; the test set comes
    from 'test_batch'.  Returns ``(Xtr, Ytr, Xte, Yte)``.

    ``num_train_batches`` defaults to 2, which is what this function loaded
    before the count became a parameter.  Pass a larger value to use more
    of the training set when memory allows (presumably 5 covers the whole
    standard CIFAR-10 training set -- confirm against the files in ROOT).

    Raises ValueError if ``num_train_batches`` is smaller than 1.
    """
    if num_train_batches < 1:
        raise ValueError(
            "num_train_batches must be at least 1, got {}".format(num_train_batches)
        )
    batches = [
        load_CIFAR_batch(os.path.join(ROOT, 'data_batch_%d' % (b, )))
        for b in range(1, num_train_batches + 1)
    ]
    Xtr = np.concatenate([X for X, _ in batches])
    Ytr = np.concatenate([Y for _, Y in batches])
    # Drop the per-batch arrays before loading the test batch; the
    # concatenated copies are all that is needed from here on.
    del batches
    Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, 'test_batch'))
    return Xtr, Ytr, Xte, Yte



# Load the training and test arrays from the local 'cifar10/' directory.
Xtr, Ytr, Xte, Yte = load_CIFAR10('cifar10/')

# Flatten every 32 x 32 x 3 image into a single row of 3072 values; the
# number of rows is however many images the loader returned.
Xtr_rows = Xtr.reshape(len(Xtr), 32 * 32 * 3)
Xte_rows = Xte.reshape(len(Xte), 32 * 32 * 3)

# "Training" only stores the data; all the work happens in predict().
nn = NearestNeighbor()
nn.train(Xtr_rows, Ytr)
Yte_predict = nn.predict(Xte_rows)

# Accuracy is the fraction of test images whose predicted label matches
# the true label.
print ('accuracy: %f' % ( np.mean(Yte_predict == Yte) ))
Related Question