Introduction

This notebook presents a Multi-Layer Perceptron (MLP) solution to the MNIST dataset.

Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision import datasets

Pick GPU if available

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

MNIST Dataset

PyTorch will download the dataset to this location:

In [3]:
dataset_location = '~/.pytorch/MNIST_data/'

Download data

In [4]:
trainset = datasets.MNIST(dataset_location, download=True, train=True)
testset = datasets.MNIST(dataset_location, download=True, train=False)

The MNIST dataset is tiny, so we will load the whole thing into GPU memory. In my tests a DataLoader took approximately 5 s per epoch, while this approach takes about 0.17 s.

In [5]:
x_train_raw = trainset.data.float()      # .data / .targets replace the deprecated train_data / train_labels
y_train_raw = trainset.targets
x_test_raw = testset.data.float()
y_test_raw = testset.targets

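For comparison, the slower DataLoader route mentioned above would look roughly like this. This is only a sketch: the ToTensor transform and the batch size of 1000 are assumptions for illustration, not code from this notebook.

from torch.utils.data import DataLoader
from torchvision import transforms

# Re-create the training set with a transform so the loader yields tensors
trainset_dl = datasets.MNIST(dataset_location, download=True, train=True,
                             transform=transforms.ToTensor())
train_loader = DataLoader(trainset_dl, batch_size=1000, shuffle=True)

for images, labels in train_loader:
    # each mini-batch is assembled on the CPU and copied to the GPU every epoch,
    # which is where the extra per-epoch time comes from
    images, labels = images.to(device), labels.to(device)
    ...  # forward/backward pass would go here
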
Raw data shapes

In [6]:
print(x_train_raw.shape)
print(y_train_raw.shape)
torch.Size([60000, 28, 28])
torch.Size([60000])

Show example images

In [7]:
fig, axes = plt.subplots(nrows=1, ncols=6, figsize=[16, 9])
for i in range(len(axes)):
    axes[i].set_title('Label: ' + str(y_train_raw[i].item()))  # .item() prints 5 instead of tensor(5)
    axes[i].imshow(x_train_raw[i])

Example labels

In [8]:
y_train = y_train_raw  # no conversions
y_test = y_test_raw
print('y_train:')
print('shape', y_train.shape)
print('data')
print(y_train[:20])
y_train:
shape torch.Size([60000])
data
tensor([5, 0, 4, 1, 9, 2, 1, 3, 1, 4, 3, 5, 3, 6, 1, 7, 2, 8, 6, 9])

Normalize and flatten images

In [9]:
x_train = (x_train_raw - x_train_raw.mean()) / x_train_raw.std()
x_test = (x_test_raw - x_train_raw.mean()) / x_train_raw.std()    # reuse mean/std from train set
x_train = x_train.view([len(x_train), -1])
x_test = x_test.view([len(x_test), -1])
print('x_train:')
print('shape', x_train.shape)
print('data')
print(x_train[0, 300:400])
x_train:
shape torch.Size([60000, 784])
data
tensor([-0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240,
        -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240,
        -0.4240, -0.4240, -0.4240,  1.3452,  2.7962,  1.9943, -0.3985, -0.4240,
        -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240,
        -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240,
        -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.2840,
         1.9943,  2.7962,  0.4670, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240,
        -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240,
        -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240,
        -0.4240, -0.4240, -0.4240, -0.4240,  0.0215,  2.6435,  2.4398,  1.6125,
         0.9506, -0.4112, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240,
        -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240, -0.4240,
        -0.4240, -0.4240, -0.4240, -0.4240])
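
As a quick optional sanity check (not a cell from the original run), the standardized training set should now have mean close to 0 and standard deviation close to 1, while the test set is only approximately standardized because it reuses the train statistics:

print('train mean/std:', x_train.mean().item(), x_train.std().item())  # ~0 and ~1 by construction
print('test  mean/std:', x_test.mean().item(), x_test.std().item())    # only approximately 0 and 1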

Move dataset to GPU

In [10]:
x_train, y_train = x_train.to(device), y_train.to(device)
x_test, y_test = x_test.to(device), y_test.to(device)

PyTorch Model

Helper to calculate accuracy; it operates on tensors

In [11]:
def accuracy(logits, labels): 
    predictions = torch.argmax(logits, dim=1)
    return (predictions == labels).float().mean()  # returns a 0-dim tensor, not a Python float
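
A tiny usage illustration (the logits and labels below are made-up values, not notebook data): the helper returns a 0-dim tensor, so call .item() when a plain Python number is needed.

logits = torch.tensor([[ 2.0, 0.1, -1.0],   # argmax -> class 0
                       [ 0.3, 0.2,  4.0]])  # argmax -> class 2
labels = torch.tensor([0, 1])
print(accuracy(logits, labels))         # tensor(0.5000), one of two predictions correct
print(accuracy(logits, labels).item())  # 0.5 as a plain float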

Simple multi-layer perceptron

In [12]:
model = nn.Sequential(
    nn.Linear(in_features=784, out_features=512),
    nn.ELU(),
    nn.Dropout(0.2),
    nn.Linear(in_features=512, out_features=512),
    nn.ELU(),
    nn.Dropout(0.2),
    nn.Linear(in_features=512, out_features=10),  # return logits
)
model.to(device)
criterion = nn.CrossEntropyLoss()                 # applies log-softmax internally, so the model outputs raw logits
optimizer = torch.optim.Adam(model.parameters())
print(model)
Sequential(
  (0): Linear(in_features=784, out_features=512, bias=True)
  (1): ELU(alpha=1.0)
  (2): Dropout(p=0.2)
  (3): Linear(in_features=512, out_features=512, bias=True)
  (4): ELU(alpha=1.0)
  (5): Dropout(p=0.2)
  (6): Linear(in_features=512, out_features=10, bias=True)
)
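
Since the final layer returns raw logits, nn.CrossEntropyLoss applies the log-softmax internally. Purely as an illustration (not a cell that was run here), the equivalent explicit LogSoftmax + NLLLoss formulation would be:

import torch.nn.functional as F

logits = model(x_train[:4])
loss_a = criterion(logits, y_train[:4])                            # CrossEntropyLoss on raw logits
loss_b = nn.NLLLoss()(F.log_softmax(logits, dim=1), y_train[:4])   # explicit log-softmax + NLLLoss
# loss_a and loss_b agree up to floating point precision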

Train model

In [13]:
batch_size = 1000
hist = { 'loss':[], 'acc':[] }

model.train()                                                      # set model for training (dropout etc)

for epoch in range(5):
    indices = torch.randperm(len(x_train), device=device)          # randomly shuffled indices [0..59999]
    for i in range(0, len(x_train), batch_size):                   # i = 0, 1000, 2000, 3000, ...
        
        # Pick mini-batch
        x = x_train[indices[i:i+batch_size]]
        y = y_train[indices[i:i+batch_size]]

        # Zero gradients
        optimizer.zero_grad()

        # Train
        outputs = model(x)                                         # outputs is un-normalized, i.e. no softmax
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            acc = accuracy(outputs, y)
            hist['loss'].append( loss.item() )
            hist['acc'].append( acc.item() )

Final result

In [14]:
model.eval()                                                       # set model for evaluation, disable dropout etc
with torch.no_grad():
    outputs = model(x_train)
    acc = accuracy(outputs, y_train).item() 
print(f'Accuracy on train set: {acc:.2f}')
Accuracy on train set: 0.98
In [15]:
model.eval()  
with torch.no_grad():
    outputs = model(x_test)
    acc = accuracy(outputs, y_test).item() 
print(f'Accuracy on test set: {acc:.2f}')
Accuracy on test set: 0.97

Plot loss and accuracy over training period

In [16]:
plt.plot(hist['loss'], label='loss')
plt.plot(hist['acc'], label='acc', color='red')
plt.legend();
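
The curves above are recorded per mini-batch and are therefore quite noisy. A simple moving average makes the trend easier to read; this is a sketch, and the window size of 20 is an arbitrary choice.

window = 20
smooth = lambda v: np.convolve(v, np.ones(window) / window, mode='valid')
plt.plot(smooth(hist['loss']), label='loss (smoothed)')
plt.plot(smooth(hist['acc']), label='acc (smoothed)', color='red')
plt.legend();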