cornellius-gp / gpytorch

A highly efficient implementation of Gaussian Processes in PyTorch

[Bug?] Natural gradient descent and OrthogonallyDecoupledVariationalStrategy #1267

Open jrojsel opened 4 years ago

jrojsel commented 4 years ago

πŸ› Bug

I'd like to use natural gradient descent (NGD) together with the orthogonally decoupled variational strategy, so I ran the NGD example notebook, but with the OrthDecoupledApproximateGP model from the variational strategy example notebook.

Perhaps I am doing something wrong (the natural variational distribution and the orthogonally decoupled variational strategy might not even be compatible with each other), but the optimization is unstable and crashes with the Cholesky error below.

To reproduce

Code snippet to reproduce

import tqdm
import math
import torch
import gpytorch
from matplotlib import pyplot as plt
import urllib.request
import os
from scipy.io import loadmat
from math import floor

# this is for running the notebook in our testing framework
smoke_test = ('CI' in os.environ)

if not smoke_test and not os.path.isfile('../elevators.mat'):
    print('Downloading \'elevators\' UCI dataset...')
    urllib.request.urlretrieve('https://drive.google.com/uc?export=download&id=1jhWL3YUHvXIaftia4qeAyDwVxo6j1alk', '../elevators.mat')

if smoke_test:  # this is for running the notebook in our testing framework
    X, y = torch.randn(1000, 3), torch.randn(1000)
else:
    data = torch.Tensor(loadmat('../elevators.mat')['data'])
    X = data[:, :-1]
    X = X - X.min(0)[0]
    X = 2 * (X / X.max(0)[0]) - 1
    y = data[:, -1]

train_n = int(floor(0.8 * len(X)))
train_x = X[:train_n, :].contiguous()
train_y = y[:train_n].contiguous()

test_x = X[train_n:, :].contiguous()
test_y = y[train_n:].contiguous()

if torch.cuda.is_available():
    train_x, train_y, test_x, test_y = train_x.cuda(), train_y.cuda(), test_x.cuda(), test_y.cuda()

from torch.utils.data import TensorDataset, DataLoader
train_dataset = TensorDataset(train_x, train_y)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)

test_dataset = TensorDataset(test_x, test_y)
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

def make_orthogonal_vs(model, train_x):
    mean_inducing_points = torch.randn(1000, train_x.size(-1), dtype=train_x.dtype, device=train_x.device)
    covar_inducing_points = torch.randn(100, train_x.size(-1), dtype=train_x.dtype, device=train_x.device)

    covar_variational_strategy = gpytorch.variational.VariationalStrategy(
        model, covar_inducing_points,
        gpytorch.variational.CholeskyVariationalDistribution(covar_inducing_points.size(-2)),
        learn_inducing_locations=True
    )

    variational_strategy = gpytorch.variational.OrthogonallyDecoupledVariationalStrategy(
        covar_variational_strategy, mean_inducing_points,
        gpytorch.variational.DeltaVariationalDistribution(mean_inducing_points.size(-2)),
    )
    return variational_strategy

class GPModel(gpytorch.models.ApproximateGP):
    def __init__(self, inducing_points):
        variational_distribution = gpytorch.variational.NaturalVariationalDistribution(inducing_points.size(-2))
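        # NOTE: this NaturalVariationalDistribution is never used; make_orthogonal_vs
        # below builds its own CholeskyVariationalDistribution and DeltaVariationalDistribution.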
        variational_strategy = make_orthogonal_vs(self, train_x)
        super().__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

inducing_points = train_x[:500, :]
model = GPModel(inducing_points=inducing_points)
likelihood = gpytorch.likelihoods.GaussianLikelihood()

if torch.cuda.is_available():
    model = model.cuda()
    likelihood = likelihood.cuda()

variational_ngd_optimizer = gpytorch.optim.NGD(model.variational_parameters(), num_data=train_y.size(0), lr=0.1)
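# NOTE: with the orthogonal strategy above, model.variational_parameters() returns the
# parameters of the Cholesky and delta distributions built in make_orthogonal_vs; per the
# maintainer's reply below, NGD only supports (Tril)NaturalVariationalDistribution.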

hyperparameter_optimizer = torch.optim.Adam([
    {'params': model.hyperparameters()},
    {'params': likelihood.parameters()},
], lr=0.01)

model.train()
likelihood.train()
mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0))

num_epochs = 1 if smoke_test else 4
epochs_iter = tqdm.notebook.tqdm(range(num_epochs), desc="Epoch")
for i in epochs_iter:
    minibatch_iter = tqdm.notebook.tqdm(train_loader, desc="Minibatch", leave=False)

    for x_batch, y_batch in minibatch_iter:
        ### Perform an NGD step on the variational parameters and an Adam step on the hyperparameters
        variational_ngd_optimizer.zero_grad()
        hyperparameter_optimizer.zero_grad()
        output = model(x_batch)
        loss = -mll(output, y_batch)
        minibatch_iter.set_postfix(loss=loss.item())
        loss.backward()
        variational_ngd_optimizer.step()
        hyperparameter_optimizer.step()

model.eval()
likelihood.eval()
means = torch.tensor([0.])
with torch.no_grad():
    for x_batch, y_batch in test_loader:
        preds = model(x_batch)
        means = torch.cat([means, preds.mean.cpu()])
means = means[1:]
print('Test MAE: {}'.format(torch.mean(torch.abs(means - test_y.cpu()))))

Stack trace/error message

Traceback (most recent call last):

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\utils\cholesky.py", line 27, in psd_safe_cholesky
    L = torch.cholesky(A, upper=upper, out=out)

RuntimeError: cholesky_cpu: U(1,1) is zero, singular U.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):

  File "C:\Users\fasx\Desktop\jrojsel\natural_gradients_test.py", line 109, in <module>
    output = model(x_batch)

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\models\approximate_gp.py", line 81, in __call__
    return self.variational_strategy(inputs, prior=prior)

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\variational\_variational_strategy.py", line 135, in __call__
    x, inducing_points, inducing_values=variational_dist_u.mean, variational_inducing_covar=None

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\module.py", line 28, in __call__
    outputs = self.forward(*inputs, **kwargs)

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\variational\orthogonally_decoupled_variational_strategy.py", line 74, in forward
    full_output = self.model(torch.cat([x, inducing_points], dim=-2))

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\variational\variational_strategy.py", line 166, in __call__
    return super().__call__(x, prior=prior)

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\variational\_variational_strategy.py", line 131, in __call__
    variational_inducing_covar=variational_dist_u.lazy_covariance_matrix,

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\module.py", line 28, in __call__
    outputs = self.forward(*inputs, **kwargs)

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\variational\variational_strategy.py", line 99, in forward
    L = self._cholesky_factor(induc_induc_covar)

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\utils\memoize.py", line 68, in g
    return _add_to_cache_ignore_args(self, cache_name, method(self, *args, **kwargs))

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\variational\variational_strategy.py", line 72, in _cholesky_factor
    L = psd_safe_cholesky(delazify(induc_induc_covar).double(), jitter=settings.cholesky_jitter.value())

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\utils\cholesky.py", line 33, in psd_safe_cholesky
    f"cholesky_cpu: {isnan.sum().item()} of {A.numel()} elements of the {A.shape} tensor are NaN."

NanError: cholesky_cpu: 10000 of 10000 elements of the torch.Size([100, 100]) tensor are NaN.

Expected Behavior

Optimization should not crash.


gpleiss commented 4 years ago

At the moment this is not supported. NGD only works with NaturalVariationalDistribution and TrilNaturalVariationalDistribution.
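For reference, here is a minimal sketch of a configuration NGD does support: a plain VariationalStrategy over a NaturalVariationalDistribution, as in the NGD tutorial. The NGDModel name is illustrative, and the sketch reuses the train_x/train_y tensors from the snippet above.

class NGDModel(gpytorch.models.ApproximateGP):
    def __init__(self, inducing_points):
        # NGD requires the natural parameterization of the variational distribution
        variational_distribution = gpytorch.variational.NaturalVariationalDistribution(inducing_points.size(-2))
        variational_strategy = gpytorch.variational.VariationalStrategy(
            self, inducing_points, variational_distribution, learn_inducing_locations=True
        )
        super().__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

model = NGDModel(inducing_points=train_x[:500, :])
# NGD steps only the variational parameters; the hyperparameters keep a separate Adam optimizer.
variational_ngd_optimizer = gpytorch.optim.NGD(model.variational_parameters(), num_data=train_y.size(0), lr=0.1)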

mrlj-hash commented 1 week ago

Has there been any more thought on this, @gpleiss, since the issue was opened? There have been recent developments on this orthogonal decoupling approach (https://arxiv.org/abs/1910.10596) that allow greater flexibility, and it would be great to see them implemented, especially for deep GPs.