import numpy as np
import matplotlib.pyplot as plt
Leaky ReLU
def lrelu(x):
    return np.where(x > 0, x, x * 0.01)
def lrelu_der(x):
    dx = np.ones_like(x)
    dx[x <= 0] = 0.01
    return dx
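A quick sanity check (a minimal added sketch): away from zero, the analytic derivative should match a central finite difference of lrelu.
x_check = np.array([-2.0, -0.5, 0.5, 2.0])
eps = 1e-6
numeric = (lrelu(x_check + eps) - lrelu(x_check - eps)) / (2 * eps)  # central difference
print(np.allclose(numeric, lrelu_der(x_check)))  # expected: True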
Softmax
# see e.g. here: https://deepnotes.io/softmax-crossentropy
def softmax(x):
    """Numerically stable softmax"""
    max_ = np.max(x, axis=-1, keepdims=True)     # shape: (n_batch, 1)
    ex = np.exp(x - max_)                        # shape: (n_batch, n_out)
    ex_sum = np.sum(ex, axis=-1, keepdims=True)  # shape: (n_batch, 1)
    return ex / ex_sum                           # probabilities, shape: (n_batch, n_out)
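A small added example: subtracting the row maximum keeps the exponentials finite even for large logits, and each row of probabilities sums to 1.
logits_demo = np.array([[1000.0, 1001.0, 1002.0]])  # naive exp() would overflow here
print(softmax(logits_demo))                 # roughly [[0.09, 0.245, 0.665]]
print(softmax(logits_demo).sum(axis=-1))    # rows sum to 1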
Cross Entropy
Cross entropy for a single training example
$$ CE_t = -\sum_{k=1}^{n_\text{out}}y_{tk} \log \hat{y}_{tk} $$
The loss is the average CE over the mini-batch
$$ J(y, \hat{y}) \ = \ \frac{1}{n_\text{batch}} \sum_{t=1}^{n_{\text{batch}}} CE_t \ = \ - \frac{1}{n_\text{batch}} \sum_{t=1}^{n_{\text{batch}}} \sum_{k=1}^{n_\text{out}}y_{tk} \log \hat{y}_{tk} $$
def cross_entropy(y, y_hat):
    """CE for any two probability distributions, averages over batch."""
    result = -np.sum(y * np.log(y_hat), axis=-1)  # cross entropy, shape: (n_batch,)
    return np.average(result)                     # average over batch, shape: scalar
def cross_entropy(y, y_hat):
    """CE for one-hot targets y, averages over batch. Overrides the generic version above."""
    assert np.all(y.sum(axis=-1) == 1)  # make sure y is one-hot encoded
    assert np.all(y.max(axis=-1) == 1)
    prob_correct = y_hat[range(len(y_hat)), np.argmax(y, axis=-1)]  # pick y_hat for the correct class, shape: (n_batch,)
    return np.average(-np.log(prob_correct))
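A quick added check that the two definitions agree on one-hot targets (the generic formula is inlined here, since the second definition replaces the first):
y_demo = np.eye(3)[[0, 2]]                                # two one-hot rows
y_hat_demo = np.array([[0.7, 0.2, 0.1], [0.1, 0.3, 0.6]])
generic = np.average(-np.sum(y_demo * np.log(y_hat_demo), axis=-1))  # generic CE
print(np.isclose(generic, cross_entropy(y_demo, y_hat_demo)))        # expected: True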
Accuracy
def accuracy(y, y_hat):
    result = np.argmax(y, axis=-1) == np.argmax(y_hat, axis=-1)  # shape: (len(y),)
    return np.mean(result)                                       # shape: scalar
Forward Pass
def forward(x, Wh, bh, Wo, bo):
    z_hid = x @ Wh + bh       # (n_batch, n_hid)
    a_hid = lrelu(z_hid)      # (n_batch, n_hid)
    logits = a_hid @ Wo + bo  # (n_batch, n_out)
    return softmax(logits)    # (n_batch, n_out)
def train_classifier(x_train, y_train, nb_epochs, n_batch, learning_rate, Wh, bh, Wo, bo):
    indices = list(range(len(x_train)))
    losses = []
    accuracies = []
    for epoch in range(nb_epochs):
        np.random.shuffle(indices)
        for i in range(0, len(x_train), n_batch):
            # Pick mini-batch
            i_train = indices[i:i+n_batch]
            x = x_train[i_train]                       # (n_batch, n_in)
            y = y_train[i_train]                       # (n_batch, n_out)
            # Forward
            z_hid = x @ Wh + bh                        # (n_batch, n_hid)
            a_hid = lrelu(z_hid)                       # (n_batch, n_hid)
            logits = a_hid @ Wo + bo                   # (n_batch, n_out)
            y_hat = softmax(logits)                    # (n_batch, n_out)
            # Backward
            dJdy = (y_hat - y) / len(x)                # backprop through CE and softmax (n_batch, n_out)
            dWo = a_hid.T @ dJdy                       # (same shape as Wo)
            dbo = np.sum(dJdy, axis=0, keepdims=True)  # (same shape as bo)
            dJdz = dJdy @ Wo.T * lrelu_der(z_hid)      # backprop into hidden (n_batch, n_hid)
            dWh = x.T @ dJdz                           # (same shape as Wh)
            dbh = np.sum(dJdz, axis=0, keepdims=True)  # (same shape as bh)
            # Gradient descent step (parameters are updated in place)
            Wh += -learning_rate * dWh
            bh += -learning_rate * dbh
            Wo += -learning_rate * dWo
            bo += -learning_rate * dbo
        # Track loss/accuracy of the last mini-batch in each epoch
        loss_train = cross_entropy(y, y_hat)
        acc_train = accuracy(y, y_hat)
        losses.append(loss_train)
        accuracies.append(acc_train)
        print(f'epoch {epoch} loss: {loss_train:.3f} acc: {acc_train:.3f}')
    return losses, accuracies
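The line dJdy = (y_hat - y) / len(x) combines the softmax and cross-entropy derivatives into one step. For one-hot targets the standard result (see the softmax/cross-entropy reference linked above) is
$$ \frac{\partial J}{\partial \text{logits}_{tk}} \ = \ \frac{1}{n_\text{batch}} \left( \hat{y}_{tk} - y_{tk} \right) $$
which is what the code computes before backpropagating further into the hidden layer.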
Fashion-MNIST dataset, originally taken from here: https://github.com/zalandoresearch/fashion-mnist
Download and Load
def download(url, path, md5sum):
    import os
    import urllib.request
    import hashlib
    folder, file = os.path.split(path)
    if folder != '':
        os.makedirs(folder, exist_ok=True)
    if not os.path.isfile(path):
        print('Downloading', path, '...')
        urllib.request.urlretrieve(url, path)
        assert hashlib.md5(open(path, 'rb').read()).hexdigest() == md5sum
    else:
        print('Already Exists:', file)
download('https://github.com/zalandoresearch/fashion-mnist/raw/master/data/fashion/t10k-images-idx3-ubyte.gz',
path='fashion-mnist/t10k-images-idx3-ubyte.gz', md5sum='bef4ecab320f06d8554ea6380940ec79')
download('https://github.com/zalandoresearch/fashion-mnist/raw/master/data/fashion/t10k-labels-idx1-ubyte.gz',
path='fashion-mnist/t10k-labels-idx1-ubyte.gz', md5sum='bb300cfdad3c16e7a12a480ee83cd310')
download('https://github.com/zalandoresearch/fashion-mnist/raw/master/data/fashion/train-images-idx3-ubyte.gz',
path='fashion-mnist/train-images-idx3-ubyte.gz', md5sum='8d4fb7e6c68d591d4c3dfef9ec88bf0d')
download('https://github.com/zalandoresearch/fashion-mnist/raw/master/data/fashion/train-labels-idx1-ubyte.gz',
path='fashion-mnist/train-labels-idx1-ubyte.gz', md5sum='25c81989df183df01b3e8a0aad5dffbe')
print('All Done')
# This function was obtained from: https://github.com/zalandoresearch/fashion-mnist
def load_mnist(path, kind='train'):
    """Load MNIST data from `path`"""
    import os
    import gzip
    import numpy as np
    labels_path = os.path.join(path, '%s-labels-idx1-ubyte.gz' % kind)
    images_path = os.path.join(path, '%s-images-idx3-ubyte.gz' % kind)
    with gzip.open(labels_path, 'rb') as lbpath:
        labels = np.frombuffer(lbpath.read(), dtype=np.uint8, offset=8)
    with gzip.open(images_path, 'rb') as imgpath:
        images = np.frombuffer(imgpath.read(), dtype=np.uint8, offset=16).reshape(len(labels), 784)
    return images, labels
x_train_raw, y_train_raw = load_mnist('fashion-mnist', kind='train')
x_test_raw, y_test_raw = load_mnist('fashion-mnist', kind='t10k')
Preprocess
# Preprocess image dataset
x_train_raw_mean = x_train_raw.mean() # reuse train mean/std for both train/test
x_train_raw_std = x_train_raw.std() # (i.e. pretend test dataset doesn't exist)
x_train = (x_train_raw - x_train_raw_mean) / x_train_raw_std
x_test = (x_test_raw - x_train_raw_mean) / x_train_raw_std
print('x_train')
print('mean:', x_train.mean().round(2))
print('std:', x_train.std().round(2))
print('min:', x_train.min().round(2))
print('max:', x_train.max().round(2))
print('shape:', x_train.shape)
print('data:')
print(x_train.round(3))
plt.imshow(x_train_raw[0].reshape([28,28]))
plt.title('x_train sample');
plt.hist(x_train.ravel(), bins=100)
plt.title('x_train histogram');
# Convert labels to one-hot
# https://stackoverflow.com/questions/38592324/one-hot-encoding-using-numpy
y_train = np.eye(N=10)[y_train_raw]
y_test = np.eye(N=10)[y_test_raw]
print('y_train')
print('shape:', y_train.shape)
print(y_train)
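As a quick added check, the integer labels can be recovered from the one-hot rows with argmax:
print(np.all(np.argmax(y_train, axis=-1) == y_train_raw))  # expected: True
print(y_train[0], '->', y_train_raw[0])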
Initialize Neural Network
n_in = x_train.shape[1]
n_hid = 64
n_out = 10
n_batch = 125
nb_epochs = 20
learning_rate = 0.001
np.random.seed(0)
Wh = np.random.normal(scale=1/(n_in)**.5, size=[n_in, n_hid])
bh = np.zeros(shape=[1, n_hid])
Wo = np.random.normal(scale=1/(n_hid)**.5, size=[n_hid, n_out])
bo = np.zeros(shape=[1, n_out])
losses, accuracies = train_classifier(x_train, y_train, nb_epochs, n_batch, learning_rate, Wh=Wh, bh=bh, Wo=Wo, bo=bo)
plt.plot(losses, label='train loss')
plt.plot(accuracies, label='train acc')
plt.title('Train loss and accuracy')
plt.legend();
y_hat = forward(x_test, Wh, bh, Wo, bo)
accuracy(y_test, y_hat)
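The same forward pass can also report the test loss, reusing the functions defined above (an added snippet):
print('test loss:', round(cross_entropy(y_test, y_hat), 3))
print('test accuracy:', round(accuracy(y_test, y_hat), 3))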
def numerical_gradient(x, y, Wh, bh, Wo, bo):
    """Calculate gradient numerically (central finite differences)"""
    assert Wh.ndim == 2
    assert bh.ndim == 2
    assert bh.shape[0] == 1
    assert Wo.ndim == 2
    assert bo.ndim == 2
    assert bo.shape[0] == 1
    eps = 1e-6
    # Weights Hidden
    dWh = np.zeros_like(Wh)
    for r in range(Wh.shape[0]):
        for c in range(Wh.shape[1]):
            Wh_min = Wh.copy()
            Wh_pls = Wh.copy()
            Wh_min[r, c] -= eps
            Wh_pls[r, c] += eps
            y_hat_pls = forward(x, Wh_pls, bh, Wo, bo)
            y_hat_min = forward(x, Wh_min, bh, Wo, bo)
            l_pls = cross_entropy(y, y_hat_pls)
            l_min = cross_entropy(y, y_hat_min)
            dWh[r, c] = (l_pls - l_min) / (eps * 2)
    # Biases Hidden
    dbh = np.zeros_like(bh)
    for c in range(bh.shape[1]):
        bh_min = bh.copy()
        bh_pls = bh.copy()
        bh_min[0, c] -= eps
        bh_pls[0, c] += eps
        y_hat_pls = forward(x, Wh, bh_pls, Wo, bo)
        y_hat_min = forward(x, Wh, bh_min, Wo, bo)
        l_pls = cross_entropy(y, y_hat_pls)
        l_min = cross_entropy(y, y_hat_min)
        dbh[0, c] = (l_pls - l_min) / (eps * 2)
    # Weights Output
    dWo = np.zeros_like(Wo)
    for r in range(Wo.shape[0]):
        for c in range(Wo.shape[1]):
            Wo_min = Wo.copy()
            Wo_pls = Wo.copy()
            Wo_min[r, c] -= eps
            Wo_pls[r, c] += eps
            y_hat_pls = forward(x, Wh, bh, Wo_pls, bo)
            y_hat_min = forward(x, Wh, bh, Wo_min, bo)
            l_pls = cross_entropy(y, y_hat_pls)
            l_min = cross_entropy(y, y_hat_min)
            dWo[r, c] = (l_pls - l_min) / (eps * 2)
    # Biases Output
    dbo = np.zeros_like(bo)
    for c in range(bo.shape[1]):
        bo_min = bo.copy()
        bo_pls = bo.copy()
        bo_min[0, c] -= eps
        bo_pls[0, c] += eps
        y_hat_pls = forward(x, Wh, bh, Wo, bo_pls)
        y_hat_min = forward(x, Wh, bh, Wo, bo_min)
        l_pls = cross_entropy(y, y_hat_pls)
        l_min = cross_entropy(y, y_hat_min)
        dbo[0, c] = (l_pls - l_min) / (eps * 2)
    return dWh, dbh, dWo, dbo
def test_gradients():
    n_batch = 100
    n_in = 10
    n_hid = 20
    n_out = 4
    for i in range(100):
        x = np.random.randn(n_batch, n_in)
        y = np.zeros(shape=[n_batch, n_out])
        idx = np.random.randint(0, n_out, size=[n_batch])
        y[range(len(y)), idx] = 1                      # random one-hot encoded matrix
        Wh = np.random.randn(n_in, n_hid)
        bh = np.random.randn(1, n_hid)
        Wo = np.random.randn(n_hid, n_out)
        bo = np.random.randn(1, n_out)
        # Forward
        z_hid = x @ Wh + bh                            # (n_batch, n_hid)
        a_hid = lrelu(z_hid)                           # (n_batch, n_hid)
        logits = a_hid @ Wo + bo                       # (n_batch, n_out)
        y_hat = softmax(logits)                        # (n_batch, n_out)
        # Backward
        dJdy = (y_hat - y) / len(x)                    # backprop through CE and softmax (n_batch, n_out)
        dWo = a_hid.T @ dJdy                           # (same shape as Wo)
        dbo = np.sum(dJdy, axis=0, keepdims=True)      # (same shape as bo)
        dJdz = dJdy @ Wo.T * lrelu_der(z_hid)          # backprop into hidden (n_batch, n_hid)
        dWh = x.T @ dJdz                               # (same shape as Wh)
        dbh = np.sum(dJdz, axis=0, keepdims=True)      # (same shape as bh)
        # Compare analytic gradients with numerical ones
        ngWh, ngbh, ngWo, ngbo = numerical_gradient(x, y, Wh, bh, Wo, bo)
        assert np.allclose(dbo, ngbo)
        assert np.allclose(dWo, ngWo)
        assert np.allclose(dWh, ngWh)
        assert np.allclose(dbh, ngbh)
test_gradients()