
This notebook presents a DCGAN architecture trained on the SVHN and CelebA datasets.

Depending on which dataset you want to use, comment out the cells for the other one. There is no need to change the DCGAN code; it works with both datasets.



import os
import time
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

SVHN Dataset

# from torchvision import datasets

Download dataset

# dataset = datasets.SVHN(root='~/.pytorch/svhn/', split='train', download=True, transform=None)

Because this is a very small dataset, we load it fully onto the GPU. Otherwise the dataloader would slow training considerably.

# x_train = (dataset.data.astype(np.float32) / 127.5) - 1   # scale to range [-1..1]
# x_train = x_train[:-41]  # make divisible by 512
# print(x_train.shape)

Visualize the data

# fig, axes = plt.subplots(nrows=1, ncols=8, figsize=[16,6])
# for i in range(len(axes)):
#     idx = np.random.randint(0, len(x_train))
#     img = x_train[idx]
#     img = img/2 + .5         # -1..1 -> 0..1
#     img = img.transpose(1, 2, 0)
#     axes[i].imshow(img); axes[i].axis('off')

Move to GPU if available

# x_train = torch.tensor(x_train, device=device)

CelebA Dataset

Point this to folder with CelebA unzipped

dataset_location = '/home/marcin/Datasets/img_align_celeba'
datafile = os.path.join(dataset_location, 'img_align_celeba_32x32.npz')
import PIL

Get names of all images in CelebA data folder

all_files = os.listdir(os.path.join(dataset_location, 'img_align_celeba'))

Images in CelebA are pre-aligned, so the eyes and mouth sit at almost identical locations in every image. We use this fact to hardcode a crop to 128x128, followed by a rescale to 32x32.

def crop_celeba(img, size):
    """Cut the fixed face region out of a raw, pre-aligned CelebA image.

    Args:
        img: image object exposing ``size``, ``width``, ``height`` and
            ``crop(box)``; its ``size`` must be exactly ``(178, 218)``.
        size: not used by this function; kept for existing callers.

    Returns:
        The result of ``img.crop`` for a box that trims 25/65/25/25 pixels
        from the left/top/right/bottom edges (128x128 for a 178x218 input).
    """
    assert img.size == (178, 218)
    trim_left, trim_top, trim_right, trim_bottom = 25, 65, 25, 25
    right_edge = img.width - trim_right
    lower_edge = img.height - trim_bottom
    return img.crop([trim_left, trim_top, right_edge, lower_edge])

Show a couple of images with and without cropping

# Preview grid: the top row shows the raw aligned images, the bottom row the
# same images after the fixed crop and a downscale to 64x64.
import PIL.Image  # `import PIL` alone does not guarantee the Image submodule is loaded

fig, (axes1, axes2) = plt.subplots(nrows=2, ncols=6, figsize=[16,6])
for i, (ax_full, ax_crop) in enumerate(zip(axes1, axes2)):
    idx = i  # np.random.randint(0, len(all_files))
    img_full_path = os.path.join(dataset_location, 'img_align_celeba', all_files[idx])
    img = PIL.Image.open(img_full_path)
    img_128x128 = crop_celeba(img, None)
    img_64x64 = img_128x128.resize([64, 64], PIL.Image.BICUBIC)
    ax_full.imshow(img)
    ax_full.axis('off')
    ax_crop.imshow(img_64x64)
    ax_crop.axis('off')

This function will read, crop, scale all images and put them into one large numpy array

def load_all_images(dataset_location, all_files, size=32):
    """Read, crop and downscale every listed CelebA image into one numpy array.

    Args:
        dataset_location: root folder that contains the ``img_align_celeba``
            image directory.
        all_files: file names (not full paths) inside that directory.
        size: edge length, in pixels, of the square output images.

    Returns:
        ``np.array`` built from the per-image arrays, i.e. shape
        ``(len(all_files), size, size, 3)`` when ``all_files`` is non-empty.

    Raises:
        AssertionError: if an image does not decode to ``(size, size, 3)``
            (e.g. it is not a 3-channel image) or is not 178x218 on disk.
    """
    import PIL.Image  # `import PIL` alone does not guarantee the Image submodule is loaded

    image_dir = os.path.join(dataset_location, 'img_align_celeba')
    all_images = []
    for i, file_name in enumerate(all_files):
        img_full_path = os.path.join(image_dir, file_name)
        # Crop and resize inside the `with` so the pixel data is read before
        # the underlying file handle is closed.
        with PIL.Image.open(img_full_path) as img:
            img_128x128 = crop_celeba(img, None)
            img_small = img_128x128.resize([size, size], PIL.Image.BICUBIC)
        arr = np.array(img_small)
        assert arr.shape == (size, size, 3)
        all_images.append(arr)  # without this the function returns an empty array
        if i % 10000 == 0:
            print(f'Image {i} of {len(all_files)}')
    return np.array(all_images)

Check if dataset was processed earlier, if not then process and save to .npz, if yes then just load from file

# Build the downscaled dataset once and cache it as .npz; on later runs the
# cache is simply read back. Either branch leaves `all_images` defined.
if not os.path.isfile(datafile):
    print('Creating datafile...')
    all_images = load_all_images(dataset_location, all_files, size=32)
    np.savez(datafile, all_images=all_images)
else:
    print('Loading from file...')
    with np.load(datafile) as npzfile:   # close the archive once the array is read
        all_images = npzfile['all_images']
Loading from file...

Convert to float32, scale to -1..1 range and re-order dimensions to match pytorch convention

# float32 conversion, rescale to [-1..1], then NHWC -> NCHW (pytorch ordering)
x_train = all_images.astype(np.float32)
x_train = x_train / 127.5 - 1
x_train = np.transpose(x_train, (0, 3, 1, 2))
x_train = x_train[:-103]   # drop the last 103 images so the count is divisible by 128
(202496, 3, 32, 32)
del all_images  # save memory

Move everything to GPU if available (approx 2.5GB)

x_train = torch.tensor(x_train, device=device)



class Discriminator(nn.Module):
    """Convolutional classifier that maps a 3-channel image to one logit.

    Three 4x4, stride-2, padding-1 convolutions each halve the spatial size;
    the final linear layer expects a 4x4 feature map, so inputs must be 32x32.
    Batch norm is applied after the second and third convolutions only.

    Args:
        conv_dim: channel count of the first convolution; later layers use
            2x and 4x this value.
    """

    def __init__(self, conv_dim=32):
        super().__init__()
        ch1, ch2, ch3 = conv_dim, 2 * conv_dim, 4 * conv_dim
        down = dict(kernel_size=4, stride=2, padding=1, bias=False)

        # Layer 1: no batch norm on the input layer
        self.L1_conv = nn.Conv2d(3, ch1, **down)
        self.L1_act = nn.LeakyReLU(0.2)

        # Layer 2
        self.L2_conv = nn.Conv2d(ch1, ch2, **down)
        self.L2_bn = nn.BatchNorm2d(ch2)
        self.L2_act = nn.LeakyReLU(0.2)

        # Layer 3
        self.L3_conv = nn.Conv2d(ch2, ch3, **down)
        self.L3_bn = nn.BatchNorm2d(ch3)
        self.L3_act = nn.LeakyReLU(0.2)

        # Output head: flattened 4x4 feature map -> single logit
        self.L4_fc = nn.Linear(in_features=ch3 * 4 * 4, out_features=1)

    def forward(self, x):
        """Return raw logits of shape ``[batch, 1]`` (no sigmoid applied)."""
        h = self.L1_act(self.L1_conv(x))
        h = self.L2_act(self.L2_bn(self.L2_conv(h)))
        h = self.L3_act(self.L3_bn(self.L3_conv(h)))
        flat = h.view(h.size(0), -1)
        return self.L4_fc(flat)


class Generator(nn.Module):
    """Transposed-convolution network that turns a latent vector into an image.

    A linear layer produces a ``4*conv_dim`` x 4 x 4 feature map; three 4x4,
    stride-2, padding-1 transposed convolutions then double the spatial size
    each time, giving a 3-channel 32x32 output squashed by tanh.

    Args:
        z_size: length of the latent input vector.
        conv_dim: channel count of the last hidden layer; earlier layers use
            2x and 4x this value.
    """

    def __init__(self, z_size, conv_dim=32):
        super().__init__()
        self.conv_dim = conv_dim
        ch_in, ch_mid, ch_out = 4 * conv_dim, 2 * conv_dim, conv_dim
        up = dict(kernel_size=4, stride=2, padding=1, bias=False)

        # Projection of the latent vector onto the initial 4x4 feature map
        self.L0_fc = nn.Linear(in_features=z_size, out_features=ch_in * 4 * 4)

        # Upsampling layer 1
        self.L1_convT = nn.ConvTranspose2d(ch_in, ch_mid, **up)
        self.L1_bn = nn.BatchNorm2d(ch_mid)
        self.L1_act = nn.ReLU()

        # Upsampling layer 2
        self.L2_convT = nn.ConvTranspose2d(ch_mid, ch_out, **up)
        self.L2_bn = nn.BatchNorm2d(ch_out)
        self.L2_act = nn.ReLU()

        # Upsampling layer 3: straight to RGB, no batch norm
        self.L3_convT = nn.ConvTranspose2d(ch_out, 3, **up)
        self.L3_act = nn.Tanh()

    def forward(self, x):
        """Map latent vectors ``[batch, z_size]`` to pixels in range -1..1."""
        h = self.L0_fc(x)
        h = h.view(h.size(0), 4 * self.conv_dim, 4, 4)
        h = self.L1_act(self.L1_bn(self.L1_convT(h)))
        h = self.L2_act(self.L2_bn(self.L2_convT(h)))
        return self.L3_act(self.L3_convT(h))

Build models, loss and optimizers

conv_dim = 32
z_size = 100

# The models must live on the same device as x_train and the noise tensors,
# otherwise the first forward pass fails on a CUDA machine. Move them before
# the optimizers are created so the optimizers hold the on-device parameters.
generator = Generator(z_size=z_size, conv_dim=conv_dim).to(device)
discriminator = Discriminator(conv_dim).to(device)

# The discriminator outputs raw logits, so the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss()

lr, betas = 0.0002, (0.5, 0.999)
d_optimizer = optim.Adam(discriminator.parameters(), lr=lr, betas=betas)
g_optimizer = optim.Adam(generator.parameters(), lr=lr, betas=betas)


def plot_images(x_fake):
    """Show up to the first 10 images of a batch in a single row.

    Args:
        x_fake: tensor of shape ``[batch, 3, height, width]`` with pixel
            values in range -1..1 (the generator's tanh output range).
            Batches with fewer than 10 images leave the remaining axes empty.
    """
    fig, axes = plt.subplots(nrows=1, ncols=10, figsize=[16,9])
    for ax in axes:
        ax.axis('off')
    for ax, sample in zip(axes, x_fake):
        img = sample.detach().cpu().permute(1, 2, 0).numpy()   # CHW -> HWC
        img = np.clip(img / 2 + .5, 0, 1)                      # -1..1 -> 0..1
        ax.imshow(img)
    plt.show()
def plot_loss(losses):
    """Plot discriminator and generator loss histories on one axis.

    Args:
        losses: mapping with keys ``'disc'`` and ``'gen'``, each holding a
            sequence of loss values.
    """
    fig, ax = plt.subplots(nrows=1, ncols=1)
    ax.plot(losses['disc'], label='disc')
    ax.plot(losses['gen'], label='gen')
    ax.legend()   # the labels above are invisible without a legend
    plt.show()

A few more hyperparameters. `fixed_noise` is used to generate consistent sample images every epoch.

# Number of real images taken from x_train per training step.
n_batch = 128
# Number of full passes over x_train.
n_epochs = 1     # even 1 epoch will show some face-ish results, recommended 10-100 train epochs

# 16 latent vectors drawn once and reused, so sample images stay comparable.
fixed_noise = torch.rand(16, z_size, device=device)*2-1   # uniform -1..1

Train the model

# Alternating GAN training: each step first updates the discriminator on one
# real and one fake batch, then updates the generator on a fresh fake batch.
losses = {'gen':[], 'disc':[]}          # per-iteration loss history
indices = np.array(range(len(x_train)))

generator.train()
discriminator.train()

iteration = 0
for e in range(n_epochs):
    time_start = time.time()
    np.random.shuffle(indices)          # new random image order every epoch
    for i in range(0, len(x_train), n_batch):
        # Pick next batch of real images
        i_batch = indices[i:i+n_batch]
        x_real = x_train[i_batch]

        # Generate fake images
        noise = torch.rand(len(x_real), z_size, device=device)*2-1  # uniform -1..1
        x_fake = generator(noise)

        # Train the discriminator
        d_optimizer.zero_grad()
        outputs_real = discriminator(x_real)  # logits
        y_real = torch.ones(len(x_real), 1, device=device) * .9   # smoothed "real" target
        d_real_loss = criterion(outputs_real, y_real)
        # detach: the discriminator step must not back-propagate into the generator
        outputs_fake = discriminator(x_fake.detach())  # logits
        y_fake = torch.zeros(len(x_fake), 1, device=device)
        d_fake_loss = criterion(outputs_fake, y_fake)
        d_loss = d_real_loss + d_fake_loss
        d_loss.backward()
        d_optimizer.step()

        # Train the generator
        g_optimizer.zero_grad()
        noise = torch.rand(len(x_real), z_size, device=device)*2-1  # uniform -1..1
        x_fake = generator(noise)
        y_fake = torch.ones(len(x_real), 1, device=device)   # flip labels
        outputs_fake = discriminator(x_fake)
        g_loss = criterion(outputs_fake, y_fake)
        g_loss.backward()
        g_optimizer.step()

        # Record history
        losses['disc'].append(d_loss.item())
        losses['gen'].append(g_loss.item())

        # Print some loss stats
        if iteration % 100 == 0:
            generator.eval()                   # switch batchnorm to eval mode
            with torch.no_grad():              # sampling needs no autograd graph
                samples = generator(fixed_noise)
            generator.train()                  # back to train mode for the next step
            time_epoch = time.time() - time_start
            print(f'Epoch ({time_epoch:4.1f}s): {e:3}     gloss: {g_loss:4.2f}     dloss: {d_loss:4.2f}')
        iteration += 1

    # End-of-epoch samples from the fixed latent vectors
    generator.eval()
    with torch.no_grad():
        samples = generator(fixed_noise)
    generator.train()
    time_epoch = time.time() - time_start
    print(f'Epoch ({time_epoch:4.1f}s): {e:3}     gloss: {g_loss:4.2f}     dloss: {d_loss:4.2f}')
Epoch ( 0.0s:   0     gloss: 0.95     dloss: 1.34
Epoch ( 1.9s:   0     gloss: 3.77     dloss: 0.40
Epoch ( 3.7s:   0     gloss: 2.11     dloss: 0.60
Epoch ( 5.6s:   0     gloss: 2.21     dloss: 0.71
Epoch ( 7.5s:   0     gloss: 2.36     dloss: 0.77
Epoch ( 9.3s:   0     gloss: 1.50     dloss: 0.85
Epoch (11.1s:   0     gloss: 2.08     dloss: 0.82
Epoch (12.9s:   0     gloss: 1.59     dloss: 0.91
Epoch (14.8s:   0     gloss: 1.99     dloss: 0.97
Epoch (16.5s:   0     gloss: 1.54     dloss: 0.91
Epoch (18.3s:   0     gloss: 1.52     dloss: 0.83
Epoch (20.2s:   0     gloss: 1.57     dloss: 1.00
Epoch (22.0s:   0     gloss: 1.19     dloss: 0.96
Epoch (23.9s:   0     gloss: 1.32     dloss: 0.86
Epoch (25.8s:   0     gloss: 1.32     dloss: 0.87
Epoch (27.6s:   0     gloss: 1.25     dloss: 1.00
epoch 29.094604s:   0     gloss: 1.48     dloss: 0.96

