cornellius-gp / gpytorch


[Bug] Regression on classification labels fails if I use KeOps without `ScaleKernel` #2430

Open Gattocrucco opened 11 months ago

Gattocrucco commented 11 months ago

🐛 Bug

(Low-priority bug: it's easy to bypass by always using ScaleKernel, or by setting gpytorch.settings.max_cholesky_size(0).)
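For concreteness, the two bypasses look like this (a minimal sketch; covar_module, model, mll, and num_classes refer to the names used in the reproduction script below):

# Bypass 1: wrap the KeOps kernel in ScaleKernel
covar_module = gpytorch.kernels.ScaleKernel(
    gpytorch.kernels.keops.RBFKernel(batch_shape=torch.Size((num_classes,)))
)

# Bypass 2: set the Cholesky threshold to 0, so that linear_operator
# always uses its iterative solvers instead of attempting a Cholesky
with gpytorch.settings.max_cholesky_size(0):
    loss = -mll(model(train_x), likelihood.transformed_targets).sum()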

If I run the GP_Regression_on_Classification_Labels example, but using KeOps and without wrapping the kernel in ScaleKernel, I get an error. I checked that the other combinations work fine: the non-KeOps kernel (with or without ScaleKernel), and the KeOps kernel wrapped in ScaleKernel.

I also checked that the error triggers even if I skip the training phase and just run eval.

Caveat: I had to pin gpytorch and linear-operator to old versions to bypass issue #2363.

To reproduce

Code snippet to reproduce

# Copied from
#
# gpytorch/examples/01_Exact_GPs/GP_Regression_on_Classification_Labels.ipynb
#
# Modifications:
#   - hacks to make keops work on my M1 mac
#   - use kernels.keops.RBFKernel instead of kernels.RBFKernel
#   - do not use ScaleKernel
#       - accordingly, do not log the lengthscale parameter
#
# Run with gpytorch==1.10, linear-operator==0.4.0 to bypass issue #2363

import os
os.environ['CPATH'] = '/opt/homebrew/opt/libomp/include'
    # to have pykeops find the omp.h header, see keops issue #332

import math
import torch
import numpy as np
import sklearn # keep before gpytorch/pykeops, see keops issue #332
import gpytorch
from matplotlib import pyplot as plt

from gpytorch.models import ExactGP
from gpytorch.likelihoods import DirichletClassificationLikelihood
from gpytorch.means import ConstantMean
from gpytorch.kernels.keops import RBFKernel

def gen_data(num_data, seed = 2019):
    torch.random.manual_seed(seed)

    x = torch.randn(num_data,1)
    y = torch.randn(num_data,1)

    u = torch.rand(1)
    data_fn = lambda x, y: 1 * torch.sin(0.15 * u * 3.1415 * (x + y)) + 1
    latent_fn = data_fn(x, y)
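    # sin(...) + 1 lies in [0, 2], so rounding yields the three
    # integer class labels {0, 1, 2}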
    z = torch.round(latent_fn).long().squeeze()
    return torch.cat((x,y),dim=1), z, data_fn

train_x, train_y, genfn = gen_data(500)

fig, ax = plt.subplots()
ax.scatter(train_x[:,0].numpy(), train_x[:,1].numpy(), c = train_y)

test_d1 = np.linspace(-3, 3, 20)
test_d2 = np.linspace(-3, 3, 20)

test_x_mat, test_y_mat = np.meshgrid(test_d1, test_d2)
test_x_mat, test_y_mat = torch.Tensor(test_x_mat), torch.Tensor(test_y_mat)

test_x = torch.cat((test_x_mat.view(-1,1), test_y_mat.view(-1,1)),dim=1)
test_labels = torch.round(genfn(test_x_mat, test_y_mat))
test_y = test_labels.view(-1)

fig, ax = plt.subplots()
ax.contourf(test_x_mat.numpy(), test_y_mat.numpy(), test_labels.numpy())

# We will use the simplest form of GP model, exact inference
class DirichletGPModel(ExactGP):
    def __init__(self, train_x, train_y, likelihood, num_classes):
        super(DirichletGPModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = ConstantMean(batch_shape=torch.Size((num_classes,)))
        self.covar_module = RBFKernel(batch_shape=torch.Size((num_classes,)))

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

# initialize likelihood and model
# we let the DirichletClassificationLikelihood compute the targets for us
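# (transformed_targets has shape (num_classes, n): one regression target
# vector per class, modeled by a batch of independent GPs)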
likelihood = DirichletClassificationLikelihood(train_y, learn_additional_noise=True)
model = DirichletGPModel(train_x, likelihood.transformed_targets, likelihood, num_classes=likelihood.num_classes)

# Find optimal model hyperparameters
model.train()
likelihood.train()

# Use the adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)  # Includes GaussianLikelihood parameters

# "Loss" for GPs - the marginal log likelihood
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

training_iter = 50

for i in range(training_iter):
    # Zero gradients from previous iteration
    optimizer.zero_grad()
    # Output from model
    output = model(train_x)
    # Calc loss and backprop gradients
    loss = -mll(output, likelihood.transformed_targets).sum()
    loss.backward()
    if i % 5 == 0:
        print('Iter %d/%d - Loss: %.3f   noise: %.3f' % (
            i + 1, training_iter, loss.item(),
            model.likelihood.second_noise_covar.noise.mean().item()
        ))
    optimizer.step()

model.eval()
likelihood.eval()

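# Predict on the test grid; fast_pred_var() uses LOVE for fast
# predictive (co)variance estimates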
with gpytorch.settings.fast_pred_var(), torch.no_grad():
    test_dist = model(test_x)

    pred_means = test_dist.loc

fig, ax = plt.subplots(1, 3)

for i in range(3):
    im = ax[i].contourf(
        test_x_mat.numpy(), test_y_mat.numpy(), pred_means[i].numpy().reshape((20,20))
    )
    fig.colorbar(im, ax=ax[i])
    ax[i].set_title("Logits: Class " + str(i))

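# Monte Carlo estimate of the class probabilities: sample logits from the
# posterior, exponentiate, and normalize over the class dimension (-2)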
pred_samples = test_dist.sample(torch.Size((256,))).exp()
probabilities = (pred_samples / pred_samples.sum(-2, keepdim=True)).mean(0)

fig, ax = plt.subplots(1, 3)

levels = np.linspace(0, 1.05, 20)
for i in range(3):
    im = ax[i].contourf(
        test_x_mat.numpy(), test_y_mat.numpy(), probabilities[i].numpy().reshape((20,20)), levels=levels
    )
    fig.colorbar(im, ax=ax[i])
    ax[i].set_title("Probabilities: Class " + str(i))

fig, ax = plt.subplots(1, 2)

ax[0].contourf(test_x_mat.numpy(), test_y_mat.numpy(), test_labels.numpy())
ax[0].set_title('True Response')

ax[1].contourf(test_x_mat.numpy(), test_y_mat.numpy(), pred_means.max(0)[1].reshape((20,20)))
ax[1].set_title('Estimated Response')

Stack trace/error message

[KeOps] Warning : Cuda libraries were not detected on the system ; using cpu only mode
[KeOps] Generating code for formula Sum_Reduction(Exp(-Sum((Var(0,1,0)-Var(1,1,1))**2)/2)*Var(2,1,1),0) ... OK
[pyKeOps] Compiling pykeops cpp 9f16c0f49a module ... OK
[KeOps] Generating code for formula Sum_Reduction(Exp(-Sum((Var(0,1,0)-Var(1,1,1))**2)/2)*Var(2,1000,1),0) ... OK
[pyKeOps] Compiling pykeops cpp 4e08c6566c module ... OK

Traceback (most recent call last):
  File "/Users/giacomo/Documents/Scuola/Robe_prof/Linero_co/Bayesian_double_robustness/code/gpclass-keops.py", line 94, in <module>
    loss = -mll(output, likelihood.transformed_targets).sum()
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/giacomo/Documents/Scuola/Robe_prof/Linero_co/Bayesian_double_robustness/code/pyenv/lib/python3.11/site-packages/gpytorch/module.py", line 31, in __call__
    outputs = self.forward(*inputs, **kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/giacomo/Documents/Scuola/Robe_prof/Linero_co/Bayesian_double_robustness/code/pyenv/lib/python3.11/site-packages/gpytorch/mlls/exact_marginal_log_likelihood.py", line 64, in forward
    res = output.log_prob(target)
          ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/giacomo/Documents/Scuola/Robe_prof/Linero_co/Bayesian_double_robustness/code/pyenv/lib/python3.11/site-packages/gpytorch/distributions/multivariate_normal.py", line 193, in log_prob
    inv_quad, logdet = covar.inv_quad_logdet(inv_quad_rhs=diff.unsqueeze(-1), logdet=True)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/giacomo/Documents/Scuola/Robe_prof/Linero_co/Bayesian_double_robustness/code/pyenv/lib/python3.11/site-packages/linear_operator/operators/_linear_operator.py", line 1615, in inv_quad_logdet
    cholesky = CholLinearOperator(TriangularLinearOperator(self.cholesky()))
                                                           ^^^^^^^^^^^^^^^
  File "/Users/giacomo/Documents/Scuola/Robe_prof/Linero_co/Bayesian_double_robustness/code/pyenv/lib/python3.11/site-packages/linear_operator/operators/_linear_operator.py", line 1244, in cholesky
    chol = self._cholesky(upper=False)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/giacomo/Documents/Scuola/Robe_prof/Linero_co/Bayesian_double_robustness/code/pyenv/lib/python3.11/site-packages/linear_operator/utils/memoize.py", line 59, in g
    return _add_to_cache(self, cache_name, method(self, *args, **kwargs), *args, kwargs_pkl=kwargs_pkl)
                                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/giacomo/Documents/Scuola/Robe_prof/Linero_co/Bayesian_double_robustness/code/pyenv/lib/python3.11/site-packages/linear_operator/operators/_linear_operator.py", line 485, in _cholesky
    raise RuntimeError("Cannot run Cholesky with KeOps: it will either be really slow or not work.")
RuntimeError: Cannot run Cholesky with KeOps: it will either be really slow or not work.

Expected Behavior

It should run the inference without complaints, as it does with ScaleKernel or with the non-KeOps kernel. Presumably linear_operator should fall back to its iterative solvers automatically for KeOps-backed operators, instead of attempting a Cholesky decomposition.

System information

Please complete the following information: