This notebook presents bag-of-words sentiment analysis on the IMDB movie reviews dataset.
Contents
import os
import re
import collections
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
Pick GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
Download the dataset from here and extract it. Point the path below at the extracted location.
dataset_location = '/home/marcin/Datasets/imdb'
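In case you don't have the dataset yet, below is a minimal download-and-extract sketch. It assumes the standard Stanford URL for the Large Movie Review Dataset and extracts into a sub-folder so that the aclImdb_v1/aclImdb/... paths used further down resolve; adjust if your mirror or directory layout differs.
import tarfile
import urllib.request

url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'  # assumed mirror URL
archive_path = os.path.join(dataset_location, 'aclImdb_v1.tar.gz')

os.makedirs(dataset_location, exist_ok=True)
if not os.path.exists(archive_path):
    urllib.request.urlretrieve(url, archive_path)  # download the archive (~80MB)

with tarfile.open(archive_path) as tar:
    tar.extractall(os.path.join(dataset_location, 'aclImdb_v1'))  # creates .../aclImdb_v1/aclImdb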
Helper to load the dataset
def load_imdb_dataset(dataset_loc):
    def read_reviews(path, label, reviews, labels):
        for filename in sorted(os.listdir(path)):
            with open(os.path.join(path, filename)) as f:
                reviews.append(f.read())
                labels.append(label)
        return reviews, labels

    path_train_pos = os.path.join(dataset_loc, 'aclImdb_v1/aclImdb/train/pos')
    path_train_neg = os.path.join(dataset_loc, 'aclImdb_v1/aclImdb/train/neg')
    path_test_pos = os.path.join(dataset_loc, 'aclImdb_v1/aclImdb/test/pos')
    path_test_neg = os.path.join(dataset_loc, 'aclImdb_v1/aclImdb/test/neg')

    train_revs, train_labels = [], []
    train_revs, train_labels = read_reviews(path_train_pos, 1, train_revs, train_labels)
    train_revs, train_labels = read_reviews(path_train_neg, 0, train_revs, train_labels)

    test_revs, test_labels = [], []
    test_revs, test_labels = read_reviews(path_test_pos, 1, test_revs, test_labels)
    test_revs, test_labels = read_reviews(path_test_neg, 0, test_revs, test_labels)

    return (train_revs, train_labels), (test_revs, test_labels)
Load dataset
train_data, test_data = load_imdb_dataset(dataset_location)
train_reviews_raw, train_labels_raw = train_data
test_reviews_raw, test_labels_raw = test_data
Look at the Data
Let's see a sample review
print(train_reviews_raw[0])
Count words in the dataset
def count_words(list_of_examples):
    if isinstance(list_of_examples[0], str):
        split = True   # got a list of strings, need to split into words
    if isinstance(list_of_examples[0], list):
        split = False  # got a list of lists, already split into words
    words_counter = collections.Counter()
    for example in list_of_examples:
        if split:
            words_counter.update(example.split())
        else:
            words_counter.update(example)
    total_words = sum(words_counter.values())
    unique_words = len(words_counter)
    return total_words, unique_words, words_counter
total_words, unique_words, words_counter = count_words(train_reviews_raw)
print('Total words: ', total_words)
print('Unique words: ', unique_words)
We have 5.8M words (as separated by spaces) and 280k unique words
And let's have a look at the word count distributions
def plot_counts(words_counter, title):
    sorted_all = np.array(sorted(list(words_counter.values()), reverse=True))
    fig, [ax1, ax2] = plt.subplots(1, 2, figsize=[16, 6])
    ax1.plot(sorted_all); ax1.set_title(title + ' Counts (linear scale)')
    ax2.plot(sorted_all); ax2.set_title(title + ' Counts (log scale)')
    ax2.set_yscale('log')
plot_counts(words_counter, title='Word')
Some words appear 300k times (left plot), while there are over 150k "words" that appear only once (right plot)
We are going to perform the following pre-processing steps: text cleanup, stopword removal, vocabulary reduction, tokenization, and multi-hot encoding.
Text Cleanup
We are going to perform the following clean-up: lowercase all text and remove every character that is not a letter a-z or a space. This will leave us with a dataset built of just the 26 lowercase letters (plus spaces separating words).
Note that words like "didn't" will be converted to "didn t", but that's OK. The words "did" and "didn" will still be encoded as different tokens, and the leftover word "t" can be dropped when removing stopwords.
def text_cleanup(list_of_texts):
    """Perform text cleanup, reduce to a-z and space."""
    regex = re.compile('[^a-z ]+')  # removes anything that is not a-z or space, alternative: '[^a-z\' ]+'

    def cleanup(text):
        res = text.lower()
        res = regex.sub(' ', res)
        return res.split()

    result_cleaned = []
    for text in list_of_texts:
        result_cleaned.append(cleanup(text))
    return result_cleaned  # doubly nested list of words
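As a quick sanity check, here is what the cleanup does to a made-up sentence with a contraction, punctuation, digits and an HTML tag:
sample = "Didn't like it... 5/10, <br /> would NOT watch again!"  # made-up example, not from the dataset
print(text_cleanup([sample])[0])
# ['didn', 't', 'like', 'it', 'br', 'would', 'not', 'watch', 'again']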
train_reviews = text_cleanup(train_reviews_raw)
test_reviews = text_cleanup(test_reviews_raw)
print(train_reviews[0])
Let's see how it looks now
total_words, unique_words, words_counter = count_words(train_reviews)
print('Total words: ', total_words)
print('Unique words: ', unique_words)
Remove Stopwords
Check the most common words; they don't contribute much to the overall meaning of the sentences
display(words_counter.most_common()[:10])
List of stopwords from NLTK
# import nltk
# nltk.download('stopwords')
# en_stopwords = nltk.corpus.stopwords.words('english')
# stopwords = {sw for sw in en_stopwords}
# print(stopwords)
stopwords = {'down', 'then', 'of', 'but', 'only', 'yours', 'himself', 'again',
'very', 'or', 'once', 'until', 'have', "doesn't", 'what', 'during',
"that'll", 'some', 'was', 'be', 'he', "should've", 'between',
"shouldn't", 'further', 'no', 'yourself', 'm', 've', "you'll",
'ain', 't', 'our', 'his', 'o', 'wouldn', 'below', 'any', 'under',
'you', 'isn', 'theirs', 'why', 'that', 'mightn', 'ourselves', 'on',
'haven', 'while', 'to', 'than', 'your', 'she', 'is', 'just',
"mightn't", 'with', "you've", 'mustn', 'needn', 'same', 'me',
'such', 'myself', 'there', 'own', 'this', 're', 'ma', 'from',
'did', 'couldn', 'hasn', 'for', 'won', "won't", "mustn't", 'her',
'can', 'doesn', "wouldn't", 'when', "you're", 'who', 'which', 'll',
'itself', 'against', 'out', 'up', "it's", 'a', 'here', 'being',
'they', 'as', 'didn', 'weren', 'aren', 'herself', 'the', 'if',
"didn't", 'should', 'doing', 'other', 'has', 'so', "you'd",
'above', 'do', 'before', 'at', 'had', 'each', "aren't", 'their',
'now', 'an', 'through', 'how', 'those', 'nor', "hasn't", 'over',
'by', 'into', 'themselves', 'most', 'shan', 'been', "she's",
"haven't", "isn't", "wasn't", 'where', 'about', 'in', "hadn't",
'because', 'too', 'whom', 'ours', 'him', 'yourselves', 'after',
'and', 'were', 'both', 'will', 'it', 'my', 'few', 'having', 'them',
'hadn', 'shouldn', 'does', 's', "couldn't", 'y', 'all', 'don',
'off', 'more', 'am', 'd', 'hers', 'its', 'are', "shan't",
"weren't", 'we', "needn't", 'i', 'these', "don't", 'wasn', 'not'}
stopwords.add('br') # <br /> tag in a lot of reviews
Remove stopwords
def remove_stopwords(list_of_examples, stopwords):
    result_no_stop = []
    for list_of_words in list_of_examples:
        result_no_stop.append([w for w in list_of_words if w not in stopwords])
    return result_no_stop
train_reviews_no_stop = remove_stopwords(train_reviews, stopwords)
test_reviews_no_stop = remove_stopwords(test_reviews, stopwords)
Show sample review
print(train_reviews_no_stop[0])
And word counts
total_words, unique_words, words_counter = count_words(train_reviews_no_stop)
print('Total words: ', total_words)
print('Unique words: ', unique_words)
Reduce Vocabulary
Likewise, check the rarest words. They also don't provide much meaning (what is a "lagomorph" anyway?)
display(words_counter.most_common()[-10:])
We will reduce the vocabulary to 9,998 words plus <PAD> and <UNK> tokens, for a total of 10,000 words
def get_most_common_words(list_of_examples, num_words):
    words_ctr = collections.Counter()
    for example in list_of_examples:
        words_ctr.update(example)
    keep_words = {w for w, n in words_ctr.most_common(num_words)}
    return keep_words
allowed_words = get_most_common_words(train_reviews_no_stop, 9998)
Print some of the allowed words
print([w for w in allowed_words][:20])
And reduce vocabulary
def reduce_vocabulary(list_of_examples, allowed_words, unk_tok='<UNK>'):
    result_reduced = []
    for example in list_of_examples:
        result_reduced.append([w if w in allowed_words else unk_tok for w in example])
    return result_reduced
train_reviews_reduced = reduce_vocabulary(train_reviews_no_stop, allowed_words)
test_reviews_reduced = reduce_vocabulary(test_reviews_no_stop, allowed_words)
Show example after reduction
print(train_reviews_reduced[0])
And count words
total_words, unique_words, words_counter = count_words(train_reviews_reduced)
print('Total words: ', total_words)
print('Unique words: ', unique_words)
Create dictionaries
Technically we don't do any padding in this notebook, but I'm leaving "<PAD>" in anyway
i2w = {i : w for i, (w, c) in enumerate(words_counter.most_common(), 1)}
w2i = {w : i for i, w in i2w.items()}
i2w[0] = '<PAD>' # use zero index for padding
w2i[i2w[0]] = 0
print('Number of words in dictionaries:', len(i2w))
And confirm the dictionaries are built correctly
for i in range(10):
    word = i2w[i]
    print(i, ':', word, ':', w2i[word])
Print subset of vocabulary
print(sorted(list(i2w.values()))[:100])
Tokenize
Convert words into integer tokens
def tokenize(list_of_examples, word2idx):
    result_tokenized = []
    for list_of_words in list_of_examples:
        result_tokenized.append([word2idx[w] for w in list_of_words])
    return result_tokenized
train_reviews_tok = tokenize(train_reviews_reduced, w2i)
test_reviews_tok = tokenize(test_reviews_reduced, w2i)
Show example
print(train_reviews_tok[0])
Multi-Hot Encode
Encode each tokenized review as a fixed-size multi-hot vector over the vocabulary: position i is set to 1 if token i appears anywhere in the review.
def encode_multihot(arr_of_tokens, encode_size):
    result_encoded = np.zeros(shape=(len(arr_of_tokens), encode_size))
    for i, tokens in enumerate(arr_of_tokens):
        result_encoded[i, tokens] = 1  # set positions of all tokens present in this example
    return result_encoded  # numpy array
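For instance, a toy call with made-up token ids and a vocabulary of size 6:
print(encode_multihot([[1, 3, 3, 5]], encode_size=6))
# [[0. 1. 0. 1. 0. 1.]]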
train_reviews_enc = encode_multihot(train_reviews_tok, encode_size=len(i2w))
test_reviews_enc = encode_multihot(test_reviews_tok, encode_size=len(i2w))
Show example
print(train_reviews_enc[0, :100])
Convert Labels
train_labels = np.array(train_labels_raw).reshape(-1, 1)
test_labels = np.array(test_labels_raw).reshape(-1, 1)
print(train_labels)
Final Sanity Check
Get the words in the first review back from the multi-hot encoded feature vector
words = np.nonzero(train_reviews_enc[0])[0]
Print them in alphabetical order
print(' '.join(sorted(i2w[i] for i in words)))
For comparison, print the same review in its cleaned-up form, with words sorted for easier comparison
print(sorted(train_reviews_reduced[0]))
Check shapes
print(train_reviews_enc.shape)
print(train_labels.shape)
print(test_reviews_enc.shape)
print(test_labels.shape)
Helper function for accuracy
def accuracy(pred, tar):
    return (pred == tar).float().mean()  # note: returns a tensor, not a Python float
Model with one hidden layer and one output layer
model = nn.Sequential(
    nn.Linear(in_features=10000, out_features=50),
    nn.Sigmoid(),
    nn.Linear(in_features=50, out_features=1))  # no sigmoid at the output
model.to(device)
criterion = nn.BCEWithLogitsLoss() # because this expects logits
optimizer = torch.optim.Adam(model.parameters())
print(model)
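A quick way to convince yourself why the sigmoid can be dropped from the output layer: BCEWithLogitsLoss applies the sigmoid internally, so on some made-up logits it should match BCELoss computed on manually squashed outputs (up to floating point error).
z = torch.tensor([[0.5], [-1.2], [2.0]])  # made-up logits
y = torch.tensor([[1.0], [0.0], [1.0]])   # made-up targets
print(nn.BCEWithLogitsLoss()(z, y).item(), nn.BCELoss()(torch.sigmoid(z), y).item())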
Convert dataset to tensors
train_features = torch.tensor(train_reviews_enc, dtype=torch.float32, device=device)
train_targets = torch.tensor(train_labels, dtype=torch.float32, device=device)
test_features = torch.tensor(test_reviews_enc, dtype=torch.float32, device=device)
test_targets = torch.tensor(test_labels, dtype=torch.float32, device=device)
Train model
batch_size = 250
hist = {'loss': [], 'acc': []}

model.train()
for epoch in range(2):  # loop over the dataset multiple times
    indices = torch.randperm(len(train_features), device=device)
    for i in range(0, len(train_features), batch_size):
        # Pick mini-batch
        inputs = train_features[indices[i:i+batch_size]]
        targets = train_targets[indices[i:i+batch_size]]

        # Optimize
        optimizer.zero_grad()
        logits = model(inputs)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

        # Record
        with torch.no_grad():
            probabilities = torch.sigmoid(logits)
            predictions = probabilities.round()
            acc = accuracy(predictions, targets)
        hist['loss'].append(loss.item())
        hist['acc'].append(acc.item())
Final results
model.eval()  # set model for evaluation, disable dropout etc.
with torch.no_grad():
    outputs = model(train_features)
    probabilities = torch.sigmoid(outputs)
    predictions = probabilities.round()
    acc = accuracy(predictions, train_targets).item()
print(f'Accuracy on train set: {acc:.2f}')

model.eval()  # set model for evaluation, disable dropout etc.
with torch.no_grad():
    outputs = model(test_features)
    probabilities = torch.sigmoid(outputs)
    predictions = probabilities.round()
    acc = accuracy(predictions, test_targets).item()
print(f'Accuracy on test set: {acc:.2f}')
Plot the loss and accuracy over the training period
plt.plot(hist['loss'], label='loss')
plt.plot(hist['acc'], label='acc', color='red')
plt.legend();
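Finally, as a usage sketch (not part of the original pipeline), the trained model can score a brand-new review by pushing it through the same preprocessing steps. The predict_sentiment helper and the sample text below are made up for illustration.
def predict_sentiment(text):
    cleaned = text_cleanup([text])                       # lowercase, strip non-letters
    no_stop = remove_stopwords(cleaned, stopwords)       # drop stopwords
    reduced = reduce_vocabulary(no_stop, allowed_words)  # map rare words to <UNK>
    tokens = tokenize(reduced, w2i)                      # words -> integer ids
    features = torch.tensor(encode_multihot(tokens, encode_size=len(i2w)),
                            dtype=torch.float32, device=device)
    with torch.no_grad():
        return torch.sigmoid(model(features)).item()     # probability of a positive review

print(predict_sentiment('One of the best movies I have seen in years, great acting!'))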