pytorch / botorch

Bayesian optimization in PyTorch
https://botorch.org/
MIT License

[Bug] optimize_acqf fails when Standardize outcome transform applied to KroneckerMultiTaskGP #982

Closed. nohara-nc closed this issue 2 years ago.

nohara-nc commented 3 years ago

🐛 Bug

I am using KroneckerMultiTaskGP as a surrogate for multi-objective BO with correlated outputs, following the composite BO example. Passing the Standardize outcome transform as an argument to KroneckerMultiTaskGP leads to an opaque shape-mismatch error when computing gradients of the acquisition function inside optimize_acqf.

To reproduce

Here is code adapted from the BoTorch tutorial on composite BO using KroneckerMultiTaskGP. The only functional change is passing Standardize as an outcome_transform to the surrogate.

Code snippet to reproduce

import torch
import os
import time

from botorch.test_functions import Hartmann
from botorch.models import SingleTaskGP, KroneckerMultiTaskGP
from gpytorch.mlls import ExactMarginalLogLikelihood
from botorch.sampling import IIDNormalSampler
from botorch.optim import optimize_acqf
from botorch.optim.fit import fit_gpytorch_torch
from botorch.acquisition.objective import GenericMCObjective
from botorch.acquisition.monte_carlo import qExpectedImprovement
from botorch.models.transforms.outcome import Standardize
from botorch.models.transforms.input import Normalize

torch.random.manual_seed(2010)

from torch import Tensor

class ContextualHartmann6(Hartmann):
    """Hartmann-6 variant: the first 5 dimensions are free inputs and the 6th is a
    context/task value taken from a fixed grid of `num_tasks` points, so each
    evaluation returns one output per task."""

    def __init__(self, num_tasks: int = 20, noise_std=None, negate=False):
        super().__init__(dim=6, noise_std=noise_std, negate=negate)
        self.task_range = torch.linspace(0, 1, num_tasks).unsqueeze(-1)
        self._bounds = [(0.0, 1.0) for _ in range(self.dim - 1)]
        self.bounds = torch.tensor(self._bounds).t()

    def evaluate_true(self, X: Tensor) -> Tensor:
        batch_X = X.unsqueeze(-2)
        batch_dims = X.ndim - 1

        # broadcast the task grid across all batch dimensions of X
        expanded_task_range = self.task_range
        for _ in range(batch_dims):
            expanded_task_range = expanded_task_range.unsqueeze(0)
        task_range = expanded_task_range.repeat(*X.shape[:-1], 1, 1).to(X)
        # append each task value to each input point and evaluate Hartmann-6
        concatenated_X = torch.cat(
            (batch_X.repeat(*[1] * batch_dims, self.task_range.shape[0], 1), task_range),
            dim=-1,
        )
        return super().evaluate_true(concatenated_X)

problem = ContextualHartmann6(noise_std=0.001, negate=True)

# we choose 20 random weights, one per task/context
weights = torch.randn(20)

# composite objective: maps the 20 correlated outputs to a single scalar
def callable_func(samples, X=None):
    res = -torch.cos((samples**2) + samples * weights)
    return res.sum(dim=-1)

objective = GenericMCObjective(callable_func)

bounds = problem.bounds

def optimize_acqf_and_get_candidate(acq_func, bounds, batch_size):
    """Optimizes the acquisition function and returns a new candidate."""
    # optimize
    candidates, _ = optimize_acqf(
        acq_function=acq_func,
        bounds=bounds,
        q=batch_size,
        num_restarts=10,
        raw_samples=512,  # used for initialization heuristic
        options={"batch_limit": 5, "maxiter": 200, "init_batch_limit": 5},
    )
    # return the new candidate points (detached from the graph)
    new_x = candidates.detach()
    return new_x

def construct_acqf(model, objective, num_samples, best_f):
    sampler = IIDNormalSampler(num_samples=num_samples)
    qEI = qExpectedImprovement(
        model=model, 
        best_f=best_f,
        sampler=sampler, 
        objective=objective,
    )
    return qEI

n_init = 20
n_steps = 20
batch_size = 3
num_samples = 128
n_trials = 3
verbose = True

mtgp_train_x = (bounds[1] - bounds[0]) * torch.rand(n_init, bounds.shape[1]) + bounds[0]

mtgp_train_y = problem(mtgp_train_x)

best_value_mtgp = objective(mtgp_train_y).max()

torch.cuda.empty_cache()

mtgp = KroneckerMultiTaskGP(
    mtgp_train_x,
    mtgp_train_y,
    outcome_transform=Standardize(m=mtgp_train_y.shape[-1]),
)
mtgp_mll = ExactMarginalLogLikelihood(mtgp.likelihood, mtgp)
fit_gpytorch_torch(mtgp_mll, options={"maxiter": 3000, "lr": 0.01, "disp": False})
mtgp_acqf = construct_acqf(mtgp, objective, num_samples, best_value_mtgp)
new_mtgp_x = optimize_acqf_and_get_candidate(mtgp_acqf, bounds, batch_size)

Stack trace/error message

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
/var/folders/83/snpkj24x1_j5sfmlhcrgqpm00000gn/T/ipykernel_67834/636811502.py in <module>
    100 fit_gpytorch_torch(mtgp_mll, options={"maxiter": 3000, "lr": 0.01, "disp": False})
    101 mtgp_acqf = construct_acqf(mtgp, objective, num_samples, best_value_mtgp)
--> 102 new_mtgp_x = optimize_acqf_and_get_candidate(mtgp_acqf, bounds, batch_size)
    103 
    104 mtgp_train_y.shape

/var/folders/83/snpkj24x1_j5sfmlhcrgqpm00000gn/T/ipykernel_67834/636811502.py in optimize_acqf_and_get_candidate(acq_func, bounds, batch_size)
     55     """Optimizes the acquisition function, and returns a new candidate and a noisy observation."""
     56     # optimize
---> 57     candidates, _ = optimize_acqf(
     58         acq_function=acq_func,
     59         bounds=bounds,

~/Documents/.../.venv/lib/python3.9/site-packages/botorch/optim/optimize.py in optimize_acqf(acq_function, bounds, q, num_restarts, raw_samples, options, inequality_constraints, equality_constraints, fixed_features, post_processing_func, batch_initial_conditions, return_best_only, sequential, **kwargs)
    184         # optimize using random restart optimization
    185         print("BATCH INITIAL CONDITIONS SHAPE:", batched_ics_.shape)
--> 186         batch_candidates_curr, batch_acq_values_curr = gen_candidates_scipy(
    187             initial_conditions=batched_ics_,
    188             acquisition_function=acq_function,

~/Documents/.../.venv/lib/python3.9/site-packages/botorch/generation/gen.py in gen_candidates_scipy(initial_conditions, acquisition_function, lower_bounds, upper_bounds, inequality_constraints, equality_constraints, options, fixed_features)
    164         return fval, gradf
    165 
--> 166     res = minimize(
    167         f,
    168         x0,

~/Documents/.../.venv/lib/python3.9/site-packages/scipy/optimize/_minimize.py in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
    617                                   **options)
    618     elif meth == 'l-bfgs-b':
--> 619         return _minimize_lbfgsb(fun, x0, args, jac, bounds,
    620                                 callback=callback, **options)
    621     elif meth == 'tnc':

~/Documents/.../.venv/lib/python3.9/site-packages/scipy/optimize/lbfgsb.py in _minimize_lbfgsb(fun, x0, args, jac, bounds, disp, maxcor, ftol, gtol, eps, maxfun, maxiter, iprint, callback, maxls, finite_diff_rel_step, **unknown_options)
    304             iprint = disp
    305 
--> 306     sf = _prepare_scalar_function(fun, x0, jac=jac, args=args, epsilon=eps,
    307                                   bounds=new_bounds,
    308                                   finite_diff_rel_step=finite_diff_rel_step)

~/Documents/.../.venv/lib/python3.9/site-packages/scipy/optimize/optimize.py in _prepare_scalar_function(fun, x0, jac, args, bounds, epsilon, finite_diff_rel_step, hess)
    259     # ScalarFunction caches. Reuse of fun(x) during grad
    260     # calculation reduces overall function evaluations.
--> 261     sf = ScalarFunction(fun, x0, args, grad, hess,
    262                         finite_diff_rel_step, bounds, epsilon=epsilon)
    263 

~/Documents/.../.venv/lib/python3.9/site-packages/scipy/optimize/_differentiable_functions.py in __init__(self, fun, x0, args, grad, hess, finite_diff_rel_step, finite_diff_bounds, epsilon)
    133 
    134         self._update_fun_impl = update_fun
--> 135         self._update_fun()
    136 
    137         # Gradient evaluation

~/Documents/.../.venv/lib/python3.9/site-packages/scipy/optimize/_differentiable_functions.py in _update_fun(self)
    223     def _update_fun(self):
    224         if not self.f_updated:
--> 225             self._update_fun_impl()
    226             self.f_updated = True
    227 

~/Documents/.../.venv/lib/python3.9/site-packages/scipy/optimize/_differentiable_functions.py in update_fun()
    130 
    131         def update_fun():
--> 132             self.f = fun_wrapped(self.x)
    133 
    134         self._update_fun_impl = update_fun

~/Documents/.../.venv/lib/python3.9/site-packages/scipy/optimize/_differentiable_functions.py in fun_wrapped(x)
    127         def fun_wrapped(x):
    128             self.nfev += 1
--> 129             return fun(x, *args)
    130 
    131         def update_fun():

~/Documents/.../.venv/lib/python3.9/site-packages/scipy/optimize/optimize.py in __call__(self, x, *args)
     72     def __call__(self, x, *args):
     73         """ returns the the function value """
---> 74         self._compute_if_needed(x, *args)
     75         return self._value
     76 

~/Documents/.../.venv/lib/python3.9/site-packages/scipy/optimize/optimize.py in _compute_if_needed(self, x, *args)
     66         if not np.all(x == self.x) or self._value is None or self.jac is None:
     67             self.x = np.asarray(x).copy()
---> 68             fg = self.fun(x, *args)
     69             self.jac = fg[1]
     70             self._value = fg[0]

~/Documents/.../.venv/lib/python3.9/site-packages/botorch/generation/gen.py in f(x)
    152         loss = -acquisition_function(X_fix).sum()
    153         # compute gradient w.r.t. the inputs (does not accumulate in leaves)
--> 154         gradf = _arrayify(torch.autograd.grad(loss, X)[0].contiguous().view(-1))
    155         if np.isnan(gradf).any():
    156             msg = (

~/Documents/.../.venv/lib/python3.9/site-packages/torch/autograd/__init__.py in grad(outputs, inputs, grad_outputs, retain_graph, create_graph, only_inputs, allow_unused)
    224         retain_graph = create_graph
    225 
--> 226     return Variable._execution_engine.run_backward(
    227         outputs, grad_outputs_, retain_graph, create_graph,
    228         inputs, allow_unused, accumulate_grad=False)

~/Documents/.../.venv/lib/python3.9/site-packages/torch/autograd/function.py in apply(self, *args)
     85     def apply(self, *args):
     86         # _forward_cls is defined by derived class
---> 87         return self._forward_cls.backward(self, *args)  # type: ignore[attr-defined]
     88 
     89 

~/Documents/.../.venv/lib/python3.9/site-packages/gpytorch/functions/_matmul.py in backward(ctx, grad_output)
     44             rhs = rhs.unsqueeze(-1) if (rhs.ndimension() == 1) else rhs
     45             grad_output_matrix = grad_output.unsqueeze(-1) if grad_output.ndimension() == 1 else grad_output
---> 46             arg_grads = ctx.representation_tree(*matrix_args)._quad_form_derivative(grad_output_matrix, rhs)
     47 
     48         # input_2 gradient

~/Documents/.../.venv/lib/python3.9/site-packages/gpytorch/lazy/lazy_tensor.py in _quad_form_derivative(self, left_vecs, right_vecs)
--> 358             loss = (left_vecs * self._matmul(right_vecs)).sum()
    359             loss.requires_grad_(True)
    360             actual_grads = deque(torch.autograd.grad(loss, args_with_grads, allow_unused=True))

RuntimeError: The size of tensor a (5) must match the size of tensor b (60) at non-singleton dimension 1

Expected Behavior

I expected optimize_acqf to generate candidates, just as it does when the outcome transform is removed or when KroneckerMultiTaskGP is replaced with SingleTaskGP.
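
For comparison, here is a minimal sketch of the workaround I would expect to behave equivalently: standardize the training targets by hand and fit KroneckerMultiTaskGP without an outcome_transform. Variable names reuse the snippet above; the corresponding un-standardization of predictions is omitted.

# hypothetical workaround sketch: standardize targets manually instead of
# passing an outcome transform to the model
train_y_mean = mtgp_train_y.mean(dim=0, keepdim=True)
train_y_std = mtgp_train_y.std(dim=0, keepdim=True)
mtgp_train_y_standardized = (mtgp_train_y - train_y_mean) / train_y_std

mtgp = KroneckerMultiTaskGP(mtgp_train_x, mtgp_train_y_standardized)  # no outcome_transform
mtgp_mll = ExactMarginalLogLikelihood(mtgp.likelihood, mtgp)
fit_gpytorch_torch(mtgp_mll, options={"maxiter": 3000, "lr": 0.01, "disp": False})
# note: best_f and the model's predicted values are then on the standardized scale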

System information

wjmaddox commented 3 years ago

I tracked down the cause of the problem: the Standardize transform returns a GPyTorchPosterior object rather than the MultitaskGPPosterior that should be returned.

From this, I should be able to put up a PR fixing the issue.
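
To make the type mismatch visible, a small diagnostic sketch (import paths assumed for the BoTorch version in the repro above; mtgp and mtgp_train_x come from that snippet). With Standardize attached, the MultitaskGPPosterior check fails; without the transform, it passes.

from botorch.posteriors.gpytorch import GPyTorchPosterior
from botorch.posteriors.multitask import MultitaskGPPosterior

# diagnostic sketch: inspect the posterior type returned by the fitted model
post = mtgp.posterior(mtgp_train_x[:3])
print(type(post))
print(isinstance(post, GPyTorchPosterior))     # True either way
print(isinstance(post, MultitaskGPPosterior))  # expected True, but False with Standardize attached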