This notebook demonstrates the use of Batch Normalization in a simple ConvNet applied to the CIFAR-10 dataset.
Note: the original batch norm paper explains its effectiveness with "internal covariate shift", but this more recent paper shows the benefit actually comes from batch norm making the "optimization landscape significantly smoother". Both are worth reading.
Contents
import numpy as np
import matplotlib.pyplot as plt
Limit TensorFlow GPU memory usage
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config):
    pass  # init session with allow_growth so TensorFlow doesn't grab all GPU memory up front
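If running under TensorFlow 2.x (where ConfigProto and Session are gone), a rough equivalent, assuming a GPU is visible, is the following sketch:
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)  # allocate GPU memory on demand instead of all at once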
Load dataset and show example images
(x_train_raw, y_train_raw), (x_test_raw, y_test_raw) = tf.keras.datasets.cifar10.load_data()
class2txt = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
Show example images
fig, axes = plt.subplots(nrows=1, ncols=6, figsize=[16, 9])
for i in range(len(axes)):
    axes[i].set_title(class2txt[y_train_raw[i, 0]])
    axes[i].imshow(x_train_raw[i])
Normalize features (using the training set mean and std for both splits, so the test data is transformed consistently)
x_train = (x_train_raw - x_train_raw.mean()) / x_train_raw.std()
x_test = (x_test_raw - x_train_raw.mean()) / x_train_raw.std()
print('x_train.shape', x_train.shape)
print('x_test.shape', x_test.shape)
One-hot encode labels
y_train = tf.keras.utils.to_categorical(y_train_raw, num_classes=10)
y_test = tf.keras.utils.to_categorical(y_test_raw, num_classes=10)
print('y_train.shape', y_train.shape)
print(y_train[:3])
Create model (batch norm before activation)
There is a bit of confusion around the BatchNormalization axis=-1 parameter. As per the original batch norm paper, in convolutional layers we want to apply batch norm per channel (as opposed to per-feature in dense layers). Batch norm creates 4 parameters for each feature it normalizes (gamma, beta, moving mean, moving variance), so a good sanity check is to ensure that the Param # reported by model.summary() equals 4x the number of filters.
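A minimal standalone check of that rule (a sketch; the toy input shape [32, 32, 16] is only for illustration): a BN layer normalizing the channel axis of a 16-channel feature map should report 4 * 16 = 64 parameters, 32 trainable (gamma, beta) and 32 non-trainable (moving statistics).
bn_check = tf.keras.Sequential()
bn_check.add(tf.keras.layers.InputLayer(input_shape=[32, 32, 16]))
bn_check.add(tf.keras.layers.BatchNormalization(axis=-1))  # normalize per channel (last axis)
bn_check.summary()  # expect Param #: 64 = 4 x 16 channels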
from tensorflow.keras.layers import InputLayer, Conv2D, BatchNormalization, MaxPooling2D, Activation
from tensorflow.keras.layers import Flatten, Dense, Dropout
model = tf.keras.Sequential()
model.add(InputLayer(input_shape=[32, 32, 3]))
model.add(Conv2D(filters=16, kernel_size=3, padding='same', activation=None, use_bias=False))
model.add(BatchNormalization()) # leave default axis=-1 in all BN layers
model.add(Activation('elu'))
model.add(MaxPooling2D(pool_size=[2,2], strides=[2, 2], padding='same'))
model.add(Conv2D(filters=32, kernel_size=3, padding='same', activation=None, use_bias=False))
model.add(BatchNormalization())
model.add(Activation('elu'))
model.add(MaxPooling2D(pool_size=[2,2], strides=[2, 2], padding='same'))
model.add(Conv2D(filters=64, kernel_size=3, padding='same', activation=None, use_bias=False))
model.add(BatchNormalization())
model.add(Activation('elu'))
model.add(MaxPooling2D(pool_size=[2,2], strides=[2, 2], padding='same'))
model.add(Dropout(0.3))
model.add(Flatten())
model.add(Dense(512, activation='elu'))
model.add(Dropout(0.3))
model.add(Dense(10, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
Train model
hist = model.fit(x=x_train, y=y_train, batch_size=250, epochs=10, validation_data=(x_test, y_test), verbose=2)
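The returned hist object records per-epoch metrics; a minimal sketch for plotting them (the accuracy key is named 'acc' or 'accuracy' depending on the Keras version):
acc_key = 'acc' if 'acc' in hist.history else 'accuracy'  # key name varies across TF/Keras versions
plt.plot(hist.history[acc_key], label='train')
plt.plot(hist.history['val_' + acc_key], label='validation')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()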
Final results
loss, acc = model.evaluate(x_train, y_train, batch_size=250, verbose=0)
print(f'Accuracy on train set: {acc:.3f}')
loss, acc = model.evaluate(x_test, y_test, batch_size=250, verbose=0)
print(f'Accuracy on test set: {acc:.3f}')
Create model (batch norm after activation)
model = tf.keras.Sequential()
model.add(InputLayer(input_shape=[32, 32, 3]))
model.add(Conv2D(filters=16, kernel_size=3, padding='same', activation='elu', use_bias=False))
#model.add(Activation('elu'))
model.add(BatchNormalization()) # leave default axis=-1 in all BN layers
model.add(MaxPooling2D(pool_size=[2,2], strides=[2, 2], padding='same'))
model.add(Conv2D(filters=32, kernel_size=3, padding='same', activation='elu', use_bias=False))
#model.add(Activation('elu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=[2,2], strides=[2, 2], padding='same'))
model.add(Conv2D(filters=64, kernel_size=3, padding='same', activation='elu', use_bias=False))
#model.add(Activation('elu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=[2,2], strides=[2, 2], padding='same'))
model.add(Dropout(0.3))
model.add(Flatten())
model.add(Dense(512, activation='elu'))
model.add(Dropout(0.3))
model.add(Dense(10, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
Train model
hist = model.fit(x=x_train, y=y_train, batch_size=250, epochs=10, validation_data=(x_test, y_test), verbose=2)
Final results
loss, acc = model.evaluate(x_train, y_train, batch_size=250, verbose=0)
print(f'Accuracy on train set: {acc:.3f}')
loss, acc = model.evaluate(x_test, y_test, batch_size=250, verbose=0)
print(f'Accuracy on test set: {acc:.3f}')