We propose a deep learning framework for modeling complex high-dimensional densities called Non-linear Independent Component Estimation (NICE). It is based on the idea that a good representation is one in which the data has a distribution that is easy to model. For this purpose, a non-linear deterministic transformation of the data is learned that maps it to a latent space so as to make the transformed data conform to a factorized distribution, i.e., resulting in independent latent variables. We parametrize this transformation so that computing the Jacobian determinant and inverse transform is trivial, yet we maintain the ability to learn complex non-linear transformations, via a composition of simple building blocks, each based on a deep neural network. The training criterion is simply the exact log-likelihood, which is tractable. Unbiased ancestral sampling is also easy. We show that this approach yields good generative models on four image datasets and can be used for inpainting.
Source: NICE: Non-linear Independent Components Estimation, Dinh, Krueger and Bengio, 2014-10-30 (arXiv:1410.8516).
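Before the full script, here is a minimal, self-contained sketch (not part of the original code) of the core building block: an additive coupling layer leaves half of the dimensions unchanged and shifts the other half by an arbitrary function of the fixed half, so the inverse is exact and the Jacobian is triangular with unit diagonal. The names coupling_forward, coupling_inverse and the toy network toy_m are illustrative only.

import torch

def coupling_forward(x1, x2, m):
    # y1 = x1, y2 = x2 + m(x1): the Jacobian is triangular with ones on the
    # diagonal, so log|det J| = 0 no matter how complex m is
    return x1, x2 + m(x1)

def coupling_inverse(y1, y2, m):
    # exact inverse: x1 = y1, x2 = y2 - m(y1)
    return y1, y2 - m(y1)

toy_m = torch.nn.Sequential(torch.nn.Linear(2, 8), torch.nn.ReLU(), torch.nn.Linear(8, 2))
x1, x2 = torch.randn(5, 2), torch.randn(5, 2)
y1, y2 = coupling_forward(x1, x2, toy_m)
x1_rec, x2_rec = coupling_inverse(y1, y2, toy_m)
assert torch.allclose(x2, x2_rec, atol=1e-6)

The implementation below composes four such coupling layers (alternating which half is left unchanged) with a final learned diagonal scaling, and trains by maximizing the exact log-likelihood under a factorized standard logistic prior.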
import torch
import torch.nn as nn
import numpy as np
from keras.datasets.mnist import load_data
from torch.utils.data import DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.distributions.transformed_distribution import TransformedDistribution
from torch.distributions.uniform import Uniform
from torch.distributions.transforms import SigmoidTransform
from torch.distributions.transforms import AffineTransform
torch.manual_seed(0)
# load MNIST, dequantize with uniform noise, and scale pixel values to [0, 1]
(trainX, trainY), (testX, testY) = load_data()
trainX = (np.float32(trainX) + torch.rand(trainX.shape).numpy()) / 255.
trainX = trainX.clip(0, 1)
trainX = torch.tensor(trainX.reshape(-1, 28 * 28))
# standard logistic prior, built from a uniform distribution pushed through
# the inverse sigmoid (logit); its log-density factorizes over dimensions
class StandardLogisticDistribution:

    def __init__(self, data_dim=28 * 28, device='cpu'):
        self.m = TransformedDistribution(
            Uniform(torch.zeros(data_dim, device=device),
                    torch.ones(data_dim, device=device)),
            [SigmoidTransform().inv, AffineTransform(torch.zeros(data_dim, device=device),
                                                     torch.ones(data_dim, device=device))]
        )

    def log_pdf(self, z):
        # sum the independent per-dimension log-densities
        return self.m.log_prob(z).sum(dim=1)

    def sample(self):
        return self.m.sample()
class NICE(nn.Module):

    def __init__(self, data_dim=28 * 28, hidden_dim=1000):
        super().__init__()
        # one MLP per additive coupling layer
        self.m = torch.nn.ModuleList([nn.Sequential(
            nn.Linear(data_dim // 2, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, data_dim // 2), ) for i in range(4)])
        # log-scales of the final diagonal scaling layer
        self.s = torch.nn.Parameter(torch.randn(data_dim))

    def forward(self, x):
        x = x.clone()
        for i in range(len(self.m)):
            # alternate which half of the dimensions is left unchanged
            x_i1 = x[:, ::2] if (i % 2) == 0 else x[:, 1::2]
            x_i2 = x[:, 1::2] if (i % 2) == 0 else x[:, ::2]
            # additive coupling: identity on one half, shift the other half
            h_i1 = x_i1
            h_i2 = x_i2 + self.m[i](x_i1)
            x = torch.empty(x.shape, device=x.device)
            x[:, ::2] = h_i1
            x[:, 1::2] = h_i2
        # diagonal scaling layer; the coupling layers contribute zero to the
        # log-determinant, so only the scales enter the Jacobian term
        z = torch.exp(self.s) * x
        log_jacobian = torch.sum(self.s)
        return z, log_jacobian

    def invert(self, z):
        x = z.clone() / torch.exp(self.s)
        for i in range(len(self.m) - 1, -1, -1):
            h_i1 = x[:, ::2]
            h_i2 = x[:, 1::2]
            # exact inverse of the additive coupling
            x_i1 = h_i1
            x_i2 = h_i2 - self.m[i](x_i1)
            x = torch.empty(x.shape, device=x.device)
            x[:, ::2] = x_i1 if (i % 2) == 0 else x_i2
            x[:, 1::2] = x_i2 if (i % 2) == 0 else x_i1
        return x
def training(normalizing_flow, optimizer, dataloader, distribution, nb_epochs=1500, device='cpu'):
    training_loss = []
    for _ in tqdm(range(nb_epochs)):
        for batch in dataloader:
            z, log_jacobian = normalizing_flow(batch.to(device))
            # change of variables: log p(x) = log p(z) + log|det dz/dx|
            log_likelihood = distribution.log_pdf(z) + log_jacobian
            loss = -log_likelihood.sum()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            training_loss.append(loss.item())
    return training_loss
if __name__ == '__main__':
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    normalizing_flow = NICE().to(device)
    logistic_distribution = StandardLogisticDistribution(device=device)
    # sanity check: the flow must be exactly invertible
    x = torch.randn(10, 28 * 28, device=device)
    assert torch.allclose(normalizing_flow.invert(normalizing_flow(x)[0]), x, rtol=1e-04, atol=1e-06)
    optimizer = torch.optim.Adam(normalizing_flow.parameters(), lr=0.0002, weight_decay=0.9)
    dataloader = DataLoader(trainX, batch_size=32, shuffle=True)
    training_loss = training(normalizing_flow, optimizer, dataloader, logistic_distribution, nb_epochs=500,
                             device=device)
    # sample from the logistic prior and invert the flow to generate images
    nb_data = 10
    fig, axs = plt.subplots(nb_data, nb_data, figsize=(10, 10))
    for i in range(nb_data):
        for j in range(nb_data):
            x = normalizing_flow.invert(logistic_distribution.sample().unsqueeze(0)).detach().cpu().numpy()
            axs[i, j].imshow(x.reshape(28, 28).clip(0, 1), cmap='gray')
            axs[i, j].set_xticks([])
            axs[i, j].set_yticks([])
    plt.savefig('Imgs/Generated_MNIST_data.png')
    plt.show()