This notebook presents bag-of-words sentiment analysis on the IMDB movie reviews dataset in Keras.
import numpy as np
import matplotlib.pyplot as plt
Limit TensorFlow GPU memory usage
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config):
    pass  # init session with allow_growth
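The snippet above uses the TensorFlow 1.x session API. If you run the notebook under TensorFlow 2.x instead (an assumption, not part of the original setup), a rough equivalent is to enable memory growth on the visible GPUs:

# TF 2.x sketch (assumption): enable memory growth instead of a Session config
import tensorflow as tf

for gpu in tf.config.experimental.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)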
Load the IMDB movie review dataset using the Keras API. Most parameters are set to their default values.
(x_train_raw, y_train), (x_test_raw, y_test) = tf.keras.datasets.imdb.load_data(
    path='imdb.npz',  # download to '~/.keras/datasets/' + path
num_words=10000, # top most frequent words to consider
skip_top=0, # top most frequent words to ignore ('the', 'a', 'at', ...)
maxlen=None, # truncate reviews longer than this
seed=113, # data shuffling seed
start_char=1, # start-of-sequence token
    oov_char=2,       # words cut by num_words/skip_top are replaced with this token
index_from=3) # actual word tokens start here
Data shapes
print(x_train_raw.shape)
print(x_test_raw.shape)
Example data sample
print('Label:', y_train[0])
print('Review:', x_train_raw[0])
As a sanity check, recreate the word dictionary.
w2i = tf.keras.datasets.imdb.get_word_index()
w2i = {k: (v + 3) for k, v in w2i.items()}  # shift by index_from=3 to match load_data token ids
w2i['<PAD>'] = 0
w2i['<ST>'] = 1
w2i['<UNK>'] = 2
w2i['<UNUSED>'] = 3  # id 3 never occurs in the data (real words start at 4)
i2w = {v: k for k, v in w2i.items()}
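For convenience, a small helper (hypothetical, not part of the original notebook) to turn a tokenized review back into text, plus a quick check that the most frequent word 'the' maps to id 4 (its rank 1 plus index_from=3):

# Hypothetical helper: decode a list of token ids back into words
def decode_review(token_ids):
    return ' '.join(i2w.get(t, '<UNK>') for t in token_ids)

print(w2i['the'])                           # expect 4: rank 1 + index_from 3
print(decode_review(x_train_raw[0][:10]))   # first 10 tokens of the first review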
Print subset of vocabulary
vocab_10000 = [i2w[i] for i in range(10000)]  # the 10,000 most frequent words plus special tokens (indices are sorted by frequency)
print(sorted(vocab_10000)[:100])  # sort alphabetically and show the first 100 words
Print sample review as actual text
# Review 16965 is the "Bromwell High is a cartoon comedy..." review,
# which is the first training review in the original dataset
print(' '.join(i2w[i] for i in x_train_raw[16965]))
Convert the movie reviews to multi-hot vectors of length 10,000, where each position corresponds to one word in the vocabulary.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
x_train = tokenizer.sequences_to_matrix(x_train_raw, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test_raw, mode='binary')
print('shape:', x_train.shape)
print('data:')
print(x_train[0, :100])
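For intuition, the 'binary' mode of sequences_to_matrix simply marks which token ids occur in each review; a minimal NumPy sketch of the same idea (not the Keras implementation) could look like this:

# Minimal multi-hot encoding sketch (keeps only token ids below num_words)
def multi_hot(sequences, num_words=10000):
    out = np.zeros((len(sequences), num_words), dtype=np.float32)
    for row, seq in enumerate(sequences):
        out[row, [t for t in seq if t < num_words]] = 1.0  # set a 1 for every word that appears
    return out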
Simple multi-layer perceptron
from tensorflow.keras.layers import Dense
model = tf.keras.Sequential()
model.add(Dense(units=50, input_dim=10000, activation='sigmoid'))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()
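For reference, the summary should report 10,000 × 50 + 50 = 500,050 parameters for the hidden layer and 50 + 1 = 51 for the output layer, roughly 500K parameters in total.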
Keras logs metrics only once per epoch; to get more data points we implement our own callback.
class Callback(tf.keras.callbacks.Callback):
def on_train_begin(self, logs={}):
self.losses = []
self.accs = []
def on_batch_end(self, batch, logs={}):
self.losses.append(logs.get('loss'))
self.accs.append(logs.get('acc'))
Train model
cback = Callback()
history = model.fit(x=x_train, y=y_train, batch_size=250, epochs=2, callbacks=[cback])
Final results
loss, acc = model.evaluate(x_train, y_train, verbose=0)
print(f'Accuracy on train set: {acc:.2f}')
loss, acc = model.evaluate(x_test, y_test, verbose=0)
print(f'Accuracy on test set: {acc:.2f}')
Plot the loss and accuracy recorded during training.
plt.plot(cback.losses, label='loss')
plt.plot(cback.accs, label='acc', color='red')
plt.legend();