cornellius-gp / gpytorch

A highly efficient implementation of Gaussian Processes in PyTorch
MIT License

[Docs] Should ScaleKernel always be nested on the outside? #1370

Open jeffwillette opened 3 years ago

jeffwillette commented 3 years ago

📚 Documentation/Examples

Is documentation wrong?

This page says to use GridInterpolationKernel(ScaleKernel(RBF()))

https://docs.gpytorch.ai/en/v1.1.1/examples/06_PyTorch_NN_Integration_DKL/KISSGP_Deep_Kernel_Regression_CUDA.html

But in GitHub issue https://github.com/cornellius-gp/gpytorch/issues/716 I read that the ScaleKernel should be on the outside. Which one is correct?
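
In code, the two orderings in question look like this (a sketch; the num_dims and grid_size values are placeholders, not ones prescribed anywhere in particular):

from gpytorch.kernels import GridInterpolationKernel, RBFKernel, ScaleKernel

# What the linked docs page shows: ScaleKernel nested inside GridInterpolationKernel
covar_docs = GridInterpolationKernel(ScaleKernel(RBFKernel()), num_dims=2, grid_size=100)

# What issue #716 suggests: ScaleKernel on the outside
covar_issue = ScaleKernel(GridInterpolationKernel(RBFKernel(), num_dims=2, grid_size=100))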

Beyond that, I am getting stability issues with DKL in training (sometimes the loss goes down and then blows up regardless of the LR), and it sometimes gives Cholesky errors, which is what prompted me to search the docs in the first place. Is DKL known to be unstable like this, or is something wrong?

I am not using the exact data in the docs, but the simple boston-housing UCI dataset.

jacobrgardner commented 3 years ago

ScaleKernel should ideally go on the outside just because it's a more exact way to handle that.

Do you have specific code samples that are failing? It definitely shouldn't be unstable on a standard UCI dataset.

jeffwillette commented 3 years ago

I made a standalone example here. If you just swap out the location where the boston-housing dataset is loaded from (lines 35-38), it should work.

from __future__ import annotations

import os
from typing import Any, Tuple, Union

import gpytorch  # type: ignore
import numpy as np  # type: ignore
import torch
from gpytorch.likelihoods import GaussianLikelihood  # type: ignore
from torch import nn
from tqdm import tqdm  # type: ignore

T = torch.Tensor
M = nn.Module
MVN = gpytorch.distributions.MultivariateNormal
_MVN = Union[gpytorch.distributions.MultivariateNormal]

pbp_sets = [
    "boston-housing",
    "concrete",
    "energy",
    "kin8nm",
    "naval-propulsion-plant",
    "power-plant",
    "wine-quality-red",
    "yacht",
    "protein-tertiary-structure",
]

def get_pbp_data(name: str, datadir: str) -> Tuple[T, ...]:
    if name not in pbp_sets:
        raise ValueError(f"{name} is an unknown pbp dataset")

    data_path = os.path.join(datadir, "UCI_Datasets", name, "data")
    # load data
    path = os.path.join(data_path, "data.txt")
    data = torch.from_numpy(np.loadtxt(path)).float()

    train_n = int(0.8 * data.size(0))
    train_x, train_y, test_x, test_y = data[:train_n, :-1], data[:train_n, -1], data[train_n:, :-1], data[train_n:, -1]

    mu, std = train_x.mean(dim=0), train_x.std(dim=0)
    ymu, ystd = train_y.mean(), train_y.std()

    train_x, train_y = (train_x - mu) / std, (train_y - ymu) / ystd
    test_x, test_y = (test_x - mu) / std, (test_y - ymu) / ystd

    return train_x, train_y, test_x, test_y, ymu, ystd

class ExactDKLGPRegression(gpytorch.models.ExactGP):
    def __init__(self, in_dim: int, h_dim: int, x: T, y: T, likelihood: GaussianLikelihood) -> None:
        super(ExactDKLGPRegression, self).__init__(x, y, likelihood)

        self.in_dim = in_dim
        self.h_dim = h_dim

        self.mean_module = gpytorch.means.ConstantMean()
        # ScaleKernel wrapping GridInterpolationKernel (ScaleKernel on the outside)
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.GridInterpolationKernel(
            gpytorch.kernels.RBFKernel(),
            num_dims=2,
            grid_size=100
        ))

        self.layers = nn.Sequential(
            nn.Linear(in_dim, h_dim),
            nn.BatchNorm1d(h_dim),
            nn.ReLU(inplace=True),
            nn.Linear(h_dim, h_dim),
            nn.BatchNorm1d(h_dim),
            nn.ReLU(inplace=True),
            nn.Linear(h_dim, 2),
        )

    def forward(self, x: T) -> _MVN:
        x = self.layers(x)
        mean = self.mean_module(x)
        covar = self.covar_module(x)
        return MVN(mean, covar)

class GPTrainer:
    def __init__(self, model: M, likelihood: M, mll: M, optimizer: torch.optim.Optimizer) -> None:
        self.model = model
        self.likelihood = likelihood
        self.mll = mll
        self.opt = optimizer

    def fit(self, x: T, y: T, epochs: int) -> None:
        self.model.train()
        self.likelihood.train()
        with tqdm(range(int(epochs))) as t:
            for epoch in t:
                self.opt.zero_grad()

                yhat = self.model(x)
                loss = -self.mll(yhat, y)
                t.set_postfix(loss=f"{loss.item():.4f}")
                loss.backward()
                self.opt.step()

    def eval(self, x: T, y: T) -> T:
        self.model.eval()
        self.likelihood.eval()

        with torch.no_grad(), gpytorch.settings.use_toeplitz(False), gpytorch.settings.fast_pred_var():
            yhat = self.model(x)

        return self.mll(yhat, y)  # type: ignore

def get_exact_dkl_gp_model(config: Any, x: T, y: T) -> Tuple[M, M, M, torch.optim.Optimizer]:
    likelihood = gpytorch.likelihoods.GaussianLikelihood().cuda()
    model = ExactDKLGPRegression(x.size(1), config["h_dim"], x, y, likelihood).cuda()
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    optimizer = torch.optim.Adam([
        {'params': model.layers.parameters(), "weight_decay": config["dnn_wd"], "lr": config["dnn_lr"]},
        {'params': model.covar_module.parameters()},
        {'params': model.mean_module.parameters()},
        {'params': likelihood.parameters()}
    ], lr=config["gp_lr"])

    return model, likelihood, mll, optimizer

def exact_dkl_gp_regression() -> None:
    config = {"h_dim": 50, "dnn_wd": 1e-4, "dnn_lr": 1e-2, "gp_lr": 0.1}

    x, y, xt, yt, ymu, ystd = get_pbp_data("boston-housing", "/home/jeff/datasets")
    x, y, xt, yt = x.cuda(), y.cuda(), xt.cuda(), yt.cuda()

    model, likelihood, mll, opt = get_exact_dkl_gp_model(config, x, y)
    trainer = GPTrainer(model, likelihood, mll, opt)

    trainer.fit(x, y, 100)
    ll = trainer.eval(xt, yt)
    ll -= np.log(ystd)

    print(f"ll: {ll.item()}")

if __name__ == "__main__":
    exact_dkl_gp_regression()

This is giving me the following stacktrace. It doesn't happen every time, but it should happen if you run it a few times:

Traceback (most recent call last):
  File "/home/jeff/.venv/env/lib/python3.8/site-packages/gpytorch/utils/cholesky.py", line 27, in psd_safe_cholesky
    L = torch.cholesky(A, upper=upper, out=out)
RuntimeError: cholesky_cuda: U(1,1) is zero, singular U.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "trainers.py", line 145, in <module>
    exact_dkl_gp_regression()
  File "trainers.py", line 138, in exact_dkl_gp_regression
    ll = trainer.eval(xt, yt)
  File "trainers.py", line 110, in eval
    return self.mll(yhat, y)  # type: ignore
  File "/home/jeff/.venv/env/lib/python3.8/site-packages/gpytorch/module.py", line 28, in __call__
    outputs = self.forward(*inputs, **kwargs)
  File "/home/jeff/.venv/env/lib/python3.8/site-packages/gpytorch/mlls/exact_marginal_log_likelihood.py", line 62, in forward
    res = output.log_prob(target)
  File "/home/jeff/.venv/env/lib/python3.8/site-packages/gpytorch/distributions/multivariate_normal.py", line 140, in log_prob
    inv_quad, logdet = covar.inv_quad_logdet(inv_quad_rhs=diff.unsqueeze(-1), logdet=True)
  File "/home/jeff/.venv/env/lib/python3.8/site-packages/gpytorch/lazy/lazy_tensor.py", line 1022, in inv_quad_logdet
    cholesky = CholLazyTensor(TriangularLazyTensor(self.cholesky()))
  File "/home/jeff/.venv/env/lib/python3.8/site-packages/gpytorch/lazy/lazy_tensor.py", line 750, in cholesky
    chol = self._cholesky(upper=False)
  File "/home/jeff/.venv/env/lib/python3.8/site-packages/gpytorch/utils/memoize.py", line 59, in g
    return _add_to_cache(self, cache_name, method(self, *args, **kwargs), *args, kwargs_pkl=kwargs_pkl)
  File "/home/jeff/.venv/env/lib/python3.8/site-packages/gpytorch/lazy/lazy_tensor.py", line 419, in _cholesky
    cholesky = psd_safe_cholesky(evaluated_mat, jitter=settings.cholesky_jitter.value(), upper=upper).contiguous()
  File "/home/jeff/.venv/env/lib/python3.8/site-packages/gpytorch/utils/cholesky.py", line 50, in psd_safe_cholesky
    raise NotPSDError(
gpytorch.utils.errors.NotPSDError: Matrix not positive definite after repeatedly adding jitter up to 1.0e-04. Original error on first attempt: cholesky_cuda: U(1,1) is zero, singular U.

gpleiss commented 3 years ago

What's the dataset size that you're using? We shouldn't be getting Cholesky errors with GridInterpolationKernel - we should probably be using CG in that case.
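
One way to steer the solve toward CG rather than Cholesky is to lower the Cholesky size threshold; this is just a sketch, reusing the trainer, xt, yt names from the script above:

import gpytorch

# a threshold of 0 routes solves of any size through CG instead of Cholesky
with gpytorch.settings.max_cholesky_size(0):
    ll = trainer.eval(xt, yt)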

@jeffwillette - do you see these errors when ScaleKernel is not on the outside?

jeffwillette commented 3 years ago

@gpleiss the dataset is the Boston housing dataset from here; I couldn't remember the exact name on the UCI website. It only has ~500 instances.

I have tried putting ScaleKernel both on the inside and the outside, and it didn't seem to make a difference; I got the error in both cases.

gpleiss commented 3 years ago

Okay - I'm not familiar with Boston, but it's not uncommon to see Cholesky errors on some particularly ill-conditioned datasets. I would try reducing gp_lr, or adding a SmoothedBox prior on the likelihood noise to set the minimum noise to something like 0.01.
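
A sketch of the noise-floor suggestion, assuming GaussianLikelihood's standard noise_prior / noise_constraint keyword arguments (the exact bounds here are illustrative):

import gpytorch

# keep the learned noise away from zero: a SmoothedBox prior over roughly [0.01, 10]
# plus a hard lower bound of 0.01 on the noise itself
likelihood = gpytorch.likelihoods.GaussianLikelihood(
    noise_prior=gpytorch.priors.SmoothedBoxPrior(1e-2, 1e1, sigma=0.01),
    noise_constraint=gpytorch.constraints.GreaterThan(1e-2),
)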