[Open] jeffwillette opened this issue 3 years ago
ScaleKernel should ideally go on the outside just because it's a more exact way to handle that.
Do you have specific code samples that are failing? It definitely shouldn't be unstable on a standard UCI dataset.
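Concretely, the two orderings being discussed look something like this (just a sketch to make the comparison explicit; the num_dims/grid_size values are arbitrary):

import gpytorch

# ScaleKernel on the outside: the outputscale multiplies the interpolated
# kernel directly, so the scaling itself is handled exactly
outside = gpytorch.kernels.ScaleKernel(
    gpytorch.kernels.GridInterpolationKernel(
        gpytorch.kernels.RBFKernel(), num_dims=2, grid_size=100,
    )
)

# ScaleKernel on the inside (as in the older docs example): the outputscale is
# folded into the base kernel that gets approximated on the grid
inside = gpytorch.kernels.GridInterpolationKernel(
    gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel()),
    num_dims=2, grid_size=100,
)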
I made a standalone example here. If you just swap out the location where the boston-housing dataset is loaded from (the datadir passed to get_pbp_data), it should work.
from __future__ import annotations

import os
from typing import Any, Tuple

import gpytorch  # type: ignore
import numpy as np  # type: ignore
import torch
from gpytorch.likelihoods import GaussianLikelihood  # type: ignore
from torch import nn
from tqdm import tqdm  # type: ignore

T = torch.Tensor
M = nn.Module
MVN = gpytorch.distributions.MultivariateNormal

pbp_sets = [
    "boston-housing",
    "concrete",
    "energy",
    "kin8nm",
    "naval-propulsion-plant",
    "power-plant",
    "wine-quality-red",
    "yacht",
    "protein-tertiary-structure",
]


def get_pbp_data(name: str, datadir: str) -> Tuple[T, ...]:
    if name not in pbp_sets:
        raise ValueError(f"{name} is an unknown pbp dataset")

    data_path = os.path.join(datadir, "UCI_Datasets", name, "data")

    # load data and make an 80/20 train/test split
    path = os.path.join(data_path, "data.txt")
    data = torch.from_numpy(np.loadtxt(path)).float()
    train_n = int(0.8 * data.size(0))
    train_x, train_y, test_x, test_y = data[:train_n, :-1], data[:train_n, -1], data[train_n:, :-1], data[train_n:, -1]

    # standardize inputs and targets using the training statistics
    mu, std = train_x.mean(dim=0), train_x.std(dim=0)
    ymu, ystd = train_y.mean(), train_y.std()
    train_x, train_y = (train_x - mu) / std, (train_y - ymu) / ystd
    test_x, test_y = (test_x - mu) / std, (test_y - ymu) / ystd
    return train_x, train_y, test_x, test_y, ymu, ystd


class ExactDKLGPRegression(gpytorch.models.ExactGP):
    def __init__(self, in_dim: int, h_dim: int, x: T, y: T, likelihood: GaussianLikelihood) -> None:
        super(ExactDKLGPRegression, self).__init__(x, y, likelihood)
        self.in_dim = in_dim
        self.h_dim = h_dim

        self.mean_module = gpytorch.means.ConstantMean()
        # ScaleKernel on the outside, wrapping the KISS-GP grid interpolation kernel
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.GridInterpolationKernel(
            gpytorch.kernels.RBFKernel(),
            num_dims=2,
            grid_size=100,
        ))
        # feature extractor mapping inputs to the 2D space the grid kernel operates on
        self.layers = nn.Sequential(
            nn.Linear(in_dim, h_dim),
            nn.BatchNorm1d(h_dim),
            nn.ReLU(inplace=True),
            nn.Linear(h_dim, h_dim),
            nn.BatchNorm1d(h_dim),
            nn.ReLU(inplace=True),
            nn.Linear(h_dim, 2),
        )

    def forward(self, x: T) -> MVN:
        x = self.layers(x)
        mean = self.mean_module(x)
        covar = self.covar_module(x)
        return MVN(mean, covar)


class GPTrainer:
    def __init__(self, model: M, likelihood: M, mll: M, optimizer: torch.optim.Optimizer) -> None:
        self.model = model
        self.likelihood = likelihood
        self.mll = mll
        self.opt = optimizer

    def fit(self, x: T, y: T, epochs: int) -> None:
        self.model.train()
        self.likelihood.train()
        with tqdm(range(int(epochs))) as t:
            for epoch in t:
                self.opt.zero_grad()
                yhat = self.model(x)
                loss = -self.mll(yhat, y)
                t.set_postfix(loss=f"{loss.item():.4f}")
                loss.backward()
                self.opt.step()

    def eval(self, x: T, y: T) -> T:
        self.model.eval()
        self.likelihood.eval()
        with torch.no_grad(), gpytorch.settings.use_toeplitz(False), gpytorch.settings.fast_pred_var():
            yhat = self.model(x)
            return self.mll(yhat, y)  # type: ignore


def get_exact_dkl_gp_model(config: Any, x: T, y: T) -> Tuple[M, M, M, torch.optim.Optimizer]:
    likelihood = gpytorch.likelihoods.GaussianLikelihood().cuda()
    model = ExactDKLGPRegression(x.size(1), config["h_dim"], x, y, likelihood).cuda()
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
    # separate param groups: weight decay and a smaller LR for the NN, gp_lr for the GP parameters
    optimizer = torch.optim.Adam([
        {'params': model.layers.parameters(), "weight_decay": config["dnn_wd"], "lr": config["dnn_lr"]},
        {'params': model.covar_module.parameters()},
        {'params': model.mean_module.parameters()},
        {'params': likelihood.parameters()},
    ], lr=config["gp_lr"])
    return model, likelihood, mll, optimizer


def exact_dkl_gp_regression() -> None:
    config = {"h_dim": 50, "dnn_wd": 1e-4, "dnn_lr": 1e-2, "gp_lr": 0.1}
    x, y, xt, yt, ymu, ystd = get_pbp_data("boston-housing", "/home/jeff/datasets")
    x, y, xt, yt = x.cuda(), y.cuda(), xt.cuda(), yt.cuda()

    model, likelihood, mll, opt = get_exact_dkl_gp_model(config, x, y)
    trainer = GPTrainer(model, likelihood, mll, opt)
    trainer.fit(x, y, 100)

    ll = trainer.eval(xt, yt)
    ll -= np.log(ystd)  # undo the target standardization in the log likelihood
    print(f"ll: {ll.item()}")


if __name__ == "__main__":
    exact_dkl_gp_regression()
This gives me the following stack trace. It doesn't happen every time, but it should show up if you run it a few times:
Traceback (most recent call last):
  File "/home/jeff/.venv/env/lib/python3.8/site-packages/gpytorch/utils/cholesky.py", line 27, in psd_safe_cholesky
    L = torch.cholesky(A, upper=upper, out=out)
RuntimeError: cholesky_cuda: U(1,1) is zero, singular U.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "trainers.py", line 145, in <module>
    exact_dkl_gp_regression()
  File "trainers.py", line 138, in exact_dkl_gp_regression
    ll = trainer.eval(xt, yt)
  File "trainers.py", line 110, in eval
    return self.mll(yhat, y) # type: ignore
  File "/home/jeff/.venv/env/lib/python3.8/site-packages/gpytorch/module.py", line 28, in __call__
    outputs = self.forward(*inputs, **kwargs)
  File "/home/jeff/.venv/env/lib/python3.8/site-packages/gpytorch/mlls/exact_marginal_log_likelihood.py", line 62, in forward
    res = output.log_prob(target)
  File "/home/jeff/.venv/env/lib/python3.8/site-packages/gpytorch/distributions/multivariate_normal.py", line 140, in log_prob
    inv_quad, logdet = covar.inv_quad_logdet(inv_quad_rhs=diff.unsqueeze(-1), logdet=True)
  File "/home/jeff/.venv/env/lib/python3.8/site-packages/gpytorch/lazy/lazy_tensor.py", line 1022, in inv_quad_logdet
    cholesky = CholLazyTensor(TriangularLazyTensor(self.cholesky()))
  File "/home/jeff/.venv/env/lib/python3.8/site-packages/gpytorch/lazy/lazy_tensor.py", line 750, in cholesky
    chol = self._cholesky(upper=False)
  File "/home/jeff/.venv/env/lib/python3.8/site-packages/gpytorch/utils/memoize.py", line 59, in g
    return _add_to_cache(self, cache_name, method(self, *args, **kwargs), *args, kwargs_pkl=kwargs_pkl)
  File "/home/jeff/.venv/env/lib/python3.8/site-packages/gpytorch/lazy/lazy_tensor.py", line 419, in _cholesky
    cholesky = psd_safe_cholesky(evaluated_mat, jitter=settings.cholesky_jitter.value(), upper=upper).contiguous()
  File "/home/jeff/.venv/env/lib/python3.8/site-packages/gpytorch/utils/cholesky.py", line 50, in psd_safe_cholesky
    raise NotPSDError(
gpytorch.utils.errors.NotPSDError: Matrix not positive definite after repeatedly adding jitter up to 1.0e-04. Original error on first attempt: cholesky_cuda: U(1,1) is zero, singular U.
What's the dataset size that you're using? We shouldn't be getting Cholesky errors with GridInterpolationKernel - we should probably be using CG in that case.
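One way to check whether CG avoids the failure (a rough, untested sketch using the names from the script above):

import gpytorch

# a max_cholesky_size of 0 tells GPyTorch to skip dense Cholesky factorizations
# and fall back to CG/Lanczos-based inference instead
with gpytorch.settings.max_cholesky_size(0):
    ll = trainer.eval(xt, yt)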
@jeffwillette - do you see these errors when ScaleKernel is not on the outside?
@gpleiss the dataset is the boston housing dataset from here; I couldn't remember the exact name on the UCI website. It only has ~500 instances.
I have tried putting ScaleKernel both on the inside and the outside, and it didn't seem to make a difference; I got the error in both cases.
Okay - I'm not familiar with Boston, but it's not uncommon to see Cholesky errors on some particularly ill-conditioned datasets. I would try reducing gp_lr, or adding a SmoothedBoxPrior on the likelihood noise to set the minimum noise to something like 0.01.
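For reference, pinning the noise away from zero might look roughly like this (a sketch; the prior bounds and the 1e-2 floor are illustrative values, to be swapped in for the plain GaussianLikelihood() in the script above):

import gpytorch

# a SmoothedBoxPrior keeps the noise in a soft box, and the constraint
# enforces a hard minimum noise of 0.01
likelihood = gpytorch.likelihoods.GaussianLikelihood(
    noise_prior=gpytorch.priors.SmoothedBoxPrior(1e-2, 1.0, sigma=0.01),
    noise_constraint=gpytorch.constraints.GreaterThan(1e-2),
).cuda()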
📚 Documentation/Examples
Is documentation wrong?
This page says to use GridInterpolationKernel(ScaleKernel(RBF())): https://docs.gpytorch.ai/en/v1.1.1/examples/06_PyTorch_NN_Integration_DKL/KISSGP_Deep_Kernel_Regression_CUDA.html
But in the GitHub issues (https://github.com/cornellius-gp/gpytorch/issues/716) I read that the ScaleKernel should be on the outside. Which one is correct? Beyond that, I am running into stability issues: DKL training is very unstable (sometimes the loss goes down and then blows up regardless of the LR) and it sometimes gives Cholesky errors, which is what prompted me to search the docs in the first place. Is DKL known to be unstable like this, or is something wrong?
I am not using the exact data in the docs, but the simple boston-housing UCI dataset.