This notebook presents a Multi-Layer Perceptron solution for the MNIST dataset.
Contents
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision import datasets
Pick GPU if available
# Run on the first CUDA GPU when one is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
PyTorch will download dataset to this location:
# Root directory handed to torchvision as the MNIST download/cache location.
dataset_location = "~/.pytorch/MNIST_data/"
Download data
# Build the training split first, then the test split; each is downloaded
# into dataset_location if it is not already present.
trainset, testset = (
    datasets.MNIST(dataset_location, train=is_train, download=True)
    for is_train in (True, False)
)
The MNIST dataset is tiny, so we will load the whole thing into GPU memory. In my tests, DataLoader
took approx. 5 s per epoch, while this approach takes 0.17 s.
# Pull the full image and label tensors out of the datasets.
# `.data` / `.targets` replace the deprecated `train_data` / `train_labels` /
# `test_data` / `test_labels` aliases, which emit a warning on access.
x_train_raw = trainset.data.float()   # images converted to float for normalisation
y_train_raw = trainset.targets        # labels left as-is
x_test_raw = testset.data.float()
y_test_raw = testset.targets
Raw data shapes
# One shape per line: images first, then labels.
print(x_train_raw.shape, y_train_raw.shape, sep='\n')
Show example images
# Show the first six training images side by side, each titled with its label.
fig, axes = plt.subplots(nrows=1, ncols=6, figsize=[16, 9])
for i, ax in enumerate(axes):
    # .item() unwraps the 0-d label tensor so the title reads "Label: 5"
    # rather than "Label: tensor(5)".
    ax.set_title(f'Label: {y_train_raw[i].item()}')
    ax.imshow(x_train_raw[i])
Example labels
# Labels are used exactly as loaded; no conversion is applied.
y_train, y_test = y_train_raw, y_test_raw

# Summary of the training labels: header, shape, then the first 20 values.
print(
    'y_train:',
    f'shape {y_train.shape}',
    'data',
    y_train[:20],
    sep='\n',
)
Normalize and flatten images
# Standardise both splits with statistics from the training set only.
# The mean and std are computed once and reused, instead of re-reducing the
# whole training tensor for every expression.
train_mean = x_train_raw.mean()
train_std = x_train_raw.std()
x_train = (x_train_raw - train_mean) / train_std
x_test = (x_test_raw - train_mean) / train_std  # reuse mean/std from train set

# Flatten each image into one feature vector per sample.
x_train = x_train.view(len(x_train), -1)
x_test = x_test.view(len(x_test), -1)

print('x_train:')
print('shape', x_train.shape)
print('data')
print(x_train[0, 300:400])  # a 100-value slice of the first flattened image
Move dataset to GPU
# Place all four tensors on the selected device up front, so mini-batches can
# later be taken by indexing these tensors directly.
x_train = x_train.to(device)
y_train = y_train.to(device)
x_test = x_test.to(device)
y_test = y_test.to(device)
Helper to calculate accuracy; operates on tensors
def accuracy(logits, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        logits: 2-D tensor of per-class scores, one row per sample.
        labels: 1-D tensor of class indices, one per row of ``logits``.

    Returns:
        A 0-dimensional float tensor (not a Python float); call ``.item()``
        on it to get a plain number.
    """
    predictions = torch.argmax(logits, dim=1)  # predicted class per row
    return (predictions == labels).float().mean()
Simple multi-layer perceptron
# 784 inputs -> two hidden layers of 512 ELU units, each followed by 20%
# dropout -> 10 outputs. The last layer has no activation: it emits logits.
model = nn.Sequential(
    nn.Linear(784, 512),
    nn.ELU(),
    nn.Dropout(p=0.2),
    nn.Linear(512, 512),
    nn.ELU(),
    nn.Dropout(p=0.2),
    nn.Linear(512, 10),
).to(device)

# CrossEntropyLoss applies the softmax itself, so the model outputs raw logits.
criterion = nn.CrossEntropyLoss()
# Adam with its default hyper-parameters.
optimizer = torch.optim.Adam(params=model.parameters())

print(model)
Train model
# Mini-batch training over the device-resident tensors.
# NOTE(review): nesting was reconstructed from a source with no indentation;
# loss/accuracy are taken to be recorded once per mini-batch -- confirm.
batch_size = 1000
num_epochs = 5
hist = {'loss': [], 'acc': []}

model.train()  # training mode (dropout active)
for epoch in range(num_epochs):
    # Fresh random permutation of the sample indices for every epoch.
    indices = torch.randperm(len(x_train), device=device)
    for i in range(0, len(x_train), batch_size):  # i = 0, 1000, 2000, ...
        # Slice the permutation once and reuse it for inputs and labels.
        batch_idx = indices[i:i + batch_size]
        x = x_train[batch_idx]
        y = y_train[batch_idx]

        optimizer.zero_grad()
        outputs = model(x)  # raw logits, no softmax applied
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        # Accuracy is measured on the same forward pass as the loss, i.e.
        # with dropout active and before this step's weight update.
        with torch.no_grad():
            acc = accuracy(outputs, y)
        hist['loss'].append(loss.item())
        hist['acc'].append(acc.item())
Final result
# Final accuracy on the full training set, computed in one forward pass.
model.eval()  # evaluation mode: disables dropout
with torch.no_grad():  # inference only, no gradient tracking
    outputs = model(x_train)
acc = accuracy(outputs, y_train).item()
print(f'Accuracy on train set: {acc:.2f}')

# Final accuracy on the held-out test set. eval() is repeated so this half
# stays correct if it is run on its own after further training.
model.eval()
with torch.no_grad():
    outputs = model(x_test)
acc = accuracy(outputs, y_test).item()
print(f'Accuracy on test set: {acc:.2f}')
Plot loss and accuracy over training period
# Draw the recorded training loss and accuracy on shared axes; accuracy is
# drawn in red so it stands apart from the loss curve.
plt.plot(hist['loss'], label='loss')
plt.plot(hist['acc'], color='red', label='acc')
# The trailing semicolon keeps the notebook from echoing the Legend object.
plt.legend();