This notebook presents a sequence-to-sequence encoder-decoder architecture based on LSTM cells. The network is trained on an English-to-French translation task using a small corpus of sentence pairs.
Dataset
Code
Resources
import os
import numpy as np
import matplotlib.pyplot as plt
Limit TensorFlow GPU memory usage
import tensorflow as tf
gpu_options = tf.GPUOptions(allow_growth=True)    # initialise TF
config = tf.ConfigProto(gpu_options=gpu_options)  # without grabbing
with tf.Session(config=config): pass              # all GPU memory up front
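If you are on TensorFlow 2.x instead, the tf.GPUOptions/tf.ConfigProto/tf.Session API above is gone; a minimal equivalent sketch (an addition here, assuming TF 2.1+, not part of the original notebook) is:
# TF 2.x equivalent of the memory-growth setting above (leave commented out on TF 1.x)
# for gpu in tf.config.list_physical_devices('GPU'):
#     tf.config.experimental.set_memory_growth(gpu, True)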
Download the dataset from the link in the introduction and point the path below to the folder containing small_vocab_en and small_vocab_fr.
dataset_location = '/home/marcin/Dropbox/Courses/Udacity/NLPND/aind2-nlp-capstone/data/'
small_vocab_en contains approximately 137,860 short sentences in English. small_vocab_fr contains the corresponding sentences in French.
with open(os.path.join(dataset_location, 'small_vocab_en')) as f:
    # line below: 1) reads lines from the file,
    #             2) strips the '\n' char and converts to lowercase,
    #             3) adds special start/end words
data_en_raw = list(map(lambda x: 'ST '+x.strip().lower()+' EN', f.readlines()))
print('len:', len(data_en_raw))
print('example sentences:')
data_en_raw[4:7]
with open(os.path.join(dataset_location, 'small_vocab_fr')) as f:
    # line below: 1) reads lines from the file,
    #             2) strips the '\n' char and converts to lowercase,
    #             3) adds special start/end words
data_fr_raw = list(map(lambda x: 'ST '+x.strip().lower()+' EN', f.readlines()))
print('len:', len(data_fr_raw))
print('example sentences:')
data_fr_raw[4:7]
Use the Keras tokenizer to convert text sentences to tokens. Each word gets its own unique integer token. The special words ST/EN also get their own tokens.
tok_en = tf.keras.preprocessing.text.Tokenizer(lower=False)
tok_en.fit_on_texts(data_en_raw)
data_en_tok = tok_en.texts_to_sequences(data_en_raw)
print('example tokens for English:')
print('is:', tok_en.word_index['is'], ' ',
'ST:', tok_en.word_index['ST'], ' ',
'EN:', tok_en.word_index['EN'], ' ',
'in:', tok_en.word_index['in'], ' ',
'it:', tok_en.word_index['it'])
print('example sentences after tokenization:')
data_en_tok[4:7]
tok_fr = tf.keras.preprocessing.text.Tokenizer(lower=False)
tok_fr.fit_on_texts(data_fr_raw)
data_fr_tok = tok_fr.texts_to_sequences(data_fr_raw)
print('example tokens for French:')
print('est:', tok_fr.word_index['est'], ' ',
'ST:', tok_fr.word_index['ST'], ' ',
'EN:', tok_fr.word_index['EN'], ' ',
'en:', tok_fr.word_index['en'], ' ',
'il:', tok_fr.word_index['il'])
print('example sentences after tokenization:')
data_fr_tok[4:7]
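Note that the Keras Tokenizer assigns indices starting at 1; index 0 never appears in word_index and is what pad_sequences will use as the padding value below. A quick check (added here, not in the original notebook):
print('smallest English token index:', min(tok_en.word_index.values()))
print('largest English token index: ', max(tok_en.word_index.values()))
print('smallest French token index: ', min(tok_fr.word_index.values()))
print('largest French token index:  ', max(tok_fr.word_index.values()))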
Calculate maximum sentence lengths
max_len_en = len(max(data_en_tok, key=len))
max_len_fr = len(max(data_fr_tok, key=len))
max_len_both = max(max_len_en, max_len_fr)
print('Max length English sentence (tokens): ', max_len_en)
print('Max length French sentence (tokens): ', max_len_fr)
print('Max length in either English or French:', max_len_both, 'tokens (including EN/ST)')
Pad both corpora to the longest sentence in each language
data_en = tf.keras.preprocessing.sequence.pad_sequences(
data_en_tok, maxlen=max_len_en, padding='post')
data_fr = tf.keras.preprocessing.sequence.pad_sequences(
data_fr_tok, maxlen=max_len_fr, padding='post')
Print some statistics
n_en_seq = data_en.shape[1]
n_fr_seq = data_fr.shape[1]
n_en_vocab = len(tok_en.word_index) + 1  # +1: token indices start at 1, 0 is reserved for padding
n_fr_vocab = len(tok_fr.word_index) + 1  # so embedding/softmax sizes must cover index len(word_index)
max_seq_len = max(n_en_seq, n_fr_seq)
print('Max length English sentence (tokens): ', n_en_seq)
print('Max length French sentence (tokens): ', n_fr_seq)
print('English vocabulary size (incl. padding index 0):', n_en_vocab)
print('French vocabulary size (incl. padding index 0): ', n_fr_vocab)
print('English train data')
print('shape:', data_en.shape)
print(data_en[4:7])
print('French train data (decoder inputs)')
print('shape:', data_fr.shape)
print(data_fr[4:7])
We will use a technique called 'teacher forcing' to train the decoder. That is, instead of having the decoder generate one word at a time and feeding each prediction into the next decoder step, we pretend the decoder generated the correct sequence and simply feed in the correct inputs. Because we know the correct French translation, we don't have to sample one word at a time during training.
To do this we need two versions of the French dataset: the original padded sequences (starting with ST) as decoder inputs, and a copy shifted left by one token as the training targets:
data_fr_noST = np.roll(data_fr, shift=-1, axis=-1) # shift left by one and pad 0 on right
data_fr_noST[:,-1] = 0
print('French train targets data')
print('shape:', data_fr_noST.shape)
print(data_fr_noST[4:7])
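To make the alignment concrete, here is a toy sketch (the token values are made up, not taken from the dataset) showing how the left shift lines up decoder inputs with targets:
demo_input = np.array([[1, 7, 9, 4, 2, 0, 0]])        # hypothetical: ST w1 w2 w3 EN pad pad
demo_target = np.roll(demo_input, shift=-1, axis=-1)  # shift left by one
demo_target[:, -1] = 0                                # pad the freed last position
print(demo_input)   # decoder sees:      [[1 7 9 4 2 0 0]]
print(demo_target)  # decoder must emit: [[7 9 4 2 0 0 0]]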
Create the following parts of the graph: the encoder, the decoder layer definitions (reused later for sampling), and the decoder in training mode.
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Activation
# Encoder
E_in = Input(shape=(n_en_seq,), name='Enc_In') # (?, 17)
E_emb = Embedding(input_dim=n_en_vocab, output_dim=50, name='Enc_Emb')(E_in) # (?, 17, 50)
_, Eh, Ec = LSTM(units=512, return_state=True, name='Enc_LSTM')(E_emb) # (?, 512)
# Decoder layer definitions - we will need to reuse these in sampling code later on
decoder_embedding = Embedding(input_dim=n_fr_vocab, output_dim=50, name='Dec_Emb')
decoder_lstm = LSTM(512, return_sequences=True, return_state=True, name='Dec_LSTM')
decoder_dense = Dense(n_fr_vocab, activation='softmax', name='Dec_Out')
# Decoder in train mode
D_in = Input(shape=(n_fr_seq,), name='Dec_Target') # (?, 23)
D_emb = decoder_embedding(D_in) # (?, 23, 50)
D_lstm, _, _ = decoder_lstm(D_emb, initial_state=[Eh, Ec]) # (?, 23, 512)
D_out = decoder_dense(D_lstm) # (?, 23, 346)
Create the end-to-end Keras model for training. It contains both the encoder and the decoder.
# full seq-2-seq model
model = tf.keras.Model(inputs=[E_in, D_in], outputs=D_out)
model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.001),
loss=tf.keras.losses.sparse_categorical_crossentropy,
metrics=[tf.keras.metrics.sparse_categorical_accuracy])
model.summary()
Optional: plot a nice diagram and save it to a file. This requires graphviz and pydot to be installed.
# from tensorflow.keras.utils import plot_model
# plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)
Train model
model.fit(x=[data_en, data_fr], y=np.expand_dims(data_fr_noST, axis=-1),
batch_size=1024, epochs=10, validation_split=0.2)
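Optionally, save the trained weights so the sampling section below can be re-run without retraining (model.save_weights is standard Keras; the file name here is just an example, not from the original notebook):
# model.save_weights('seq2seq_en_fr_weights.h5')
# model.load_weights('seq2seq_en_fr_weights.h5')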
Model for sampling translations
Create Keras model for encoder as separate unit
encoder = tf.keras.Model(inputs=E_in, outputs=[Eh, Ec])
encoder.summary()
Create the decoder in sampling mode, reusing the layer definitions from the previous section
Sh_init = Input(shape=(512,)) # (?, 512)
Sc_init = Input(shape=(512,)) # (?, 512)
S_input = Input(shape=(1,), name='Sam_Input') # (?, 1)
S_emb = decoder_embedding(S_input) # (?, 1, 50)
S_lstm, Sh, Sc = decoder_lstm(S_emb, initial_state=[Sh_init, Sc_init]) # (?, 1, 512)
S_output = decoder_dense(S_lstm) # (?, 1, 346)
Create Keras model for decoder-sampler (one word at a time)
sampler = tf.keras.Model(inputs=[S_input, Sh_init, Sc_init], outputs=[S_output, Sh, Sc])
sampler.summary()
Optional: plot nice diagram
# plot_model(sampler, to_file='sampler.png', show_shapes=True, show_layer_names=True)
index = 777
english_sentence = data_en_raw[index]
french_sentence = data_fr_raw[index]
print('english: ', english_sentence)
print('french (original): ', french_sentence)
Actually Sample
Run input sentence through encoder
st_h, st_c = encoder.predict(data_en[index:index+1])
assert st_h.shape == (1, 512) and st_c.shape == (1, 512)
Create input variables - these will be fed into the decoder at the first decoding time step
st_input = tok_fr.word_index['ST']
st_input = np.array([[st_input]]) # batch size = 1, seq len = 1
assert st_input.shape == (1, 1)
Generate output words one at a time and feed each one back in at the next time step
prediction_tok = [] # list of output tokens, generated one at a time
for i in range(n_fr_seq):
    # feed one word (st_input) into the decoder
probs, st_h, st_c = sampler.predict([st_input, st_h, st_c])
assert st_h.shape == (1, 512) and st_c.shape == (1, 512)
# pick maximum probability prediction as next word
# (but keep shape so we can feed in next step)
st_input = probs.argmax(axis=-1)
assert st_input.shape == (1, 1)
    # pick maximum probability prediction and append to the generated list
    # (this does the same as the line above, but discards the shape)
token = probs.argmax()
prediction_tok.append(token)
# if decoder generated special end-word, break
if token == tok_fr.word_index['EN']:
break
Print output sentence tokens
prediction_tok
Helper to convert tokenized sequence back to words
def sequence_to_french(seq):
words = [tok_fr.index_word[x] for x in seq if x in tok_fr.index_word]
return ' '.join(words)
Print the input English sentence, the target French sentence, and the generated French sentence
print('english: ', english_sentence)
print('french (original): ', french_sentence)
predicted_sentence = sequence_to_french(prediction_tok)
print('french (predicted): ', predicted_sentence)
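Finally, the whole pipeline can be wrapped into a single helper for translating other English sentences built from the corpus vocabulary. This is a convenience sketch assembled from the steps above; translate_sentence is a name introduced here, not part of the original notebook.
def translate_sentence(sentence):
    # preprocess exactly like the training data: lowercase, add ST/EN, tokenize, pad
    tokens = tok_en.texts_to_sequences(['ST ' + sentence.strip().lower() + ' EN'])
    tokens = tf.keras.preprocessing.sequence.pad_sequences(
        tokens, maxlen=max_len_en, padding='post')
    # encode the input sentence into the initial decoder state
    h, c = encoder.predict(tokens)
    word = np.array([[tok_fr.word_index['ST']]])
    prediction = []
    for _ in range(n_fr_seq):
        probs, h, c = sampler.predict([word, h, c])
        word = probs.argmax(axis=-1)            # keep the (1, 1) shape for the next step
        token = int(probs.argmax())
        if token == tok_fr.word_index['EN']:    # stop at the special end word
            break
        prediction.append(token)
    return sequence_to_french(prediction)
# example usage on a corpus sentence (strip the 'ST ' / ' EN' markers first)
print(translate_sentence(data_en_raw[777][3:-3]))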