cornellius-gp / gpytorch

A highly efficient implementation of Gaussian Processes in PyTorch
MIT License

[Bug] MultiDeviceKernel not supported in Botorch #1132

Open Bibyutatsu opened 4 years ago

Bibyutatsu commented 4 years ago

🐛 Bug

Hi, I want to train a SingleTaskGP on multiple GPUs, since I have 8 cards on my node. Searching around, I found GPyTorch's MultiDeviceKernel, which can be used for exactly this, but I couldn't find anything similar in the BoTorch modules. So I replaced the covar_module of the SingleTaskGP with this kernel, but now I am getting this error:

RuntimeError: graph_task->future_result_->completed() INTERNAL ASSERT FAILED at /opt/conda/conda-bld/pytorch_1587428398394/work/torch/csrc/autograd/engine.cpp:800, please report a bug to PyTorch.

I am unable to train BoTorch's SingleTaskGP on multiple GPUs with GPyTorch's MultiDeviceKernel.

To reproduce

Code snippet to reproduce

import torch
import gpytorch
from botorch import fit_gpytorch_model
from botorch.acquisition.monte_carlo import qExpectedImprovement, qNoisyExpectedImprovement
from botorch.sampling.samplers import SobolQMCNormalSampler
from botorch.exceptions import BadInitialCandidatesWarning
from botorch.models import SingleTaskGP
from botorch.optim import optimize_acqf
from gpytorch.mlls import ExactMarginalLogLikelihood

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# bounds must be 2 x d, where d matches the input dimension of train_x_ei (here 3)
bounds = torch.stack([torch.zeros(3), torch.ones(3)]).to(device)
BATCH_SIZE = 20
MC_SAMPLES = 256

# synthetic training data: 500 points in 3 dimensions
train_x_ei = torch.randn((500, 3)).to(device)
train_obj_ei = (train_x_ei ** 2).sum(dim=-1).unsqueeze(-1).to(device)
best_observed_value_ei = train_obj_ei.max().item()

# build the model, then swap its covariance module for a MultiDeviceKernel
# so the kernel computation is distributed across the 8 GPUs
model_ei = SingleTaskGP(train_x_ei, train_obj_ei)
base_covar_module = model_ei.covar_module
covar_module_multi = gpytorch.kernels.MultiDeviceKernel(
    base_covar_module, device_ids=range(8),
    output_device=device)
model_ei.covar_module = covar_module_multi
mll_ei = ExactMarginalLogLikelihood(model_ei.likelihood, model_ei)

for iteration in range(5):
    print(iteration)
    fit_gpytorch_model(mll_ei)

    qmc_sampler = SobolQMCNormalSampler(num_samples=MC_SAMPLES)

    # for best_f, we use the best observed noisy values as an approximation
    qEI = qExpectedImprovement(
        model=model_ei, 
        best_f=train_obj_ei.max(),
        sampler=qmc_sampler
    )

    candidates, _ = optimize_acqf(
        acq_function=qEI,
        bounds=bounds,
        q=BATCH_SIZE,
        num_restarts=10,
        raw_samples=512,  # used for initialization heuristic
        options={"batch_limit": 5, "maxiter": 200},
    )

Stack trace/error message

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-1-c510035d82af> in <module>
     28 for iteration in range(5):
     29     print(iteration)
---> 30     fit_gpytorch_model(mll_ei)
     31 
     32     qmc_sampler = SobolQMCNormalSampler(num_samples=MC_SAMPLES)

/opt/conda/envs/torchenv/lib/python3.7/site-packages/botorch/fit.py in fit_gpytorch_model(mll, optimizer, **kwargs)
     99                 mll.model.load_state_dict(original_state_dict)
    100                 sample_all_priors(mll.model)
--> 101             mll, _ = optimizer(mll, track_iterations=False, **kwargs)
    102             if not any(issubclass(w.category, OptimizationWarning) for w in ws):
    103                 mll.eval()

/opt/conda/envs/torchenv/lib/python3.7/site-packages/botorch/optim/fit.py in fit_gpytorch_scipy(mll, bounds, method, options, track_iterations, approx_mll)
    224             jac=True,
    225             options=options,
--> 226             callback=cb,
    227         )
    228         iterations = []

/opt/conda/envs/torchenv/lib/python3.7/site-packages/scipy/optimize/_minimize.py in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
    608     elif meth == 'l-bfgs-b':
    609         return _minimize_lbfgsb(fun, x0, args, jac, bounds,
--> 610                                 callback=callback, **options)
    611     elif meth == 'tnc':
    612         return _minimize_tnc(fun, x0, args, jac, bounds, callback=callback,

/opt/conda/envs/torchenv/lib/python3.7/site-packages/scipy/optimize/lbfgsb.py in _minimize_lbfgsb(fun, x0, args, jac, bounds, disp, maxcor, ftol, gtol, eps, maxfun, maxiter, iprint, callback, maxls, **unknown_options)
    343             # until the completion of the current minimization iteration.
    344             # Overwrite f and g:
--> 345             f, g = func_and_grad(x)
    346         elif task_str.startswith(b'NEW_X'):
    347             # new iteration

/opt/conda/envs/torchenv/lib/python3.7/site-packages/scipy/optimize/lbfgsb.py in func_and_grad(x)
    293     else:
    294         def func_and_grad(x):
--> 295             f = fun(x, *args)
    296             g = jac(x, *args)
    297             return f, g

/opt/conda/envs/torchenv/lib/python3.7/site-packages/scipy/optimize/optimize.py in function_wrapper(*wrapper_args)
    325     def function_wrapper(*wrapper_args):
    326         ncalls[0] += 1
--> 327         return function(*(wrapper_args + args))
    328 
    329     return ncalls, function_wrapper

/opt/conda/envs/torchenv/lib/python3.7/site-packages/scipy/optimize/optimize.py in __call__(self, x, *args)
     63     def __call__(self, x, *args):
     64         self.x = numpy.asarray(x).copy()
---> 65         fg = self.fun(x, *args)
     66         self.jac = fg[1]
     67         return fg[0]

/opt/conda/envs/torchenv/lib/python3.7/site-packages/botorch/optim/fit.py in _scipy_objective_and_grad(x, mll, property_dict)
    283         else:
    284             raise e  # pragma: nocover
--> 285     loss.backward()
    286     param_dict = OrderedDict(mll.named_parameters())
    287     grad = []

/opt/conda/envs/torchenv/lib/python3.7/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    196                 products. Defaults to ``False``.
    197         """
--> 198         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    199 
    200     def register_hook(self, hook):

/opt/conda/envs/torchenv/lib/python3.7/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
     98     Variable._execution_engine.run_backward(
     99         tensors, grad_tensors, retain_graph, create_graph,
--> 100         allow_unreachable=True)  # allow_unreachable flag
    101 
    102 

RuntimeError: graph_task->future_result_->completed() INTERNAL ASSERT FAILED at /opt/conda/conda-bld/pytorch_1587428398394/work/torch/csrc/autograd/engine.cpp:800, please report a bug to PyTorch.

Expected Behavior

I expected training to use all of the GPUs, so that the model scales across multiple GPUs for faster fitting and sampling.
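
For what it's worth, the same mll_ei could also be fit with a plain Adam training loop (a minimal sketch below, reusing model_ei and mll_ei from the snippet above; this is the standard GPyTorch training loop rather than BoTorch's fitting routine), which would show whether the failure is specific to fit_gpytorch_model's scipy/L-BFGS path or happens for any loss.backward() with MultiDeviceKernel:

import torch

# standard GPyTorch training loop on the same marginal log likelihood
mll_ei.train()
optimizer = torch.optim.Adam(model_ei.parameters(), lr=0.1)

for i in range(50):
    optimizer.zero_grad()
    output = model_ei(*model_ei.train_inputs)
    loss = -mll_ei(output, model_ei.train_targets)
    loss.backward()  # if this also hits the autograd assert, the issue is not specific to BoTorch's optimizer
    optimizer.step()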

System information


Bibyutatsu commented 4 years ago

I am also getting a RuntimeError while running the multi-GPU tutorial, but only when I set checkpoint_size=10000.

With checkpoint_size=0, training completes without any issues.
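
(For context, checkpoint_size and preconditioner_size come from the tutorial's train function, which wraps the forward/backward pass roughly as in the sketch below; the helper name train_step is mine and the details may differ from the tutorial, but the two context managers are the relevant part.)

import gpytorch

def train_step(model, mll, optimizer, train_x, train_y,
               checkpoint_size=10000, preconditioner_size=100):
    # checkpoint_kernel is GPyTorch's kernel-partitioning beta feature: the kernel
    # matrix is evaluated in chunks of `checkpoint_size` rows (with gradient
    # checkpointing), trading recomputation for lower per-GPU memory;
    # checkpoint_size=0 disables partitioning entirely
    with gpytorch.beta_features.checkpoint_kernel(checkpoint_size), \
         gpytorch.settings.max_preconditioner_size(preconditioner_size):
        optimizer.zero_grad()
        output = model(train_x)
        loss = -mll(output, train_y)
        loss.backward()  # this is the backward call that fails with checkpoint_size=10000 on 8 GPUs
        optimizer.step()
    return loss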

This is the exact traceback in my case:

model, likelihood = train(train_x, train_y,
                          n_devices=n_devices, output_device=output_device,
                          checkpoint_size=10000,
                          preconditioner_size=100,
                          n_training_iter=20)

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-16-dba074377e42> in <module>
      3                           checkpoint_size=10000,
      4                           preconditioner_size=100,
----> 5                           n_training_iter=20)

<ipython-input-14-4da03f9af78d> in train(train_x, train_y, n_devices, output_device, checkpoint_size, preconditioner_size, n_training_iter)
     43 
     44         loss = closure()
---> 45         loss.backward()
     46 
     47         for i in range(n_training_iter):

/opt/conda/envs/torchenv/lib/python3.7/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    196                 products. Defaults to ``False``.
    197         """
--> 198         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    199 
    200     def register_hook(self, hook):

/opt/conda/envs/torchenv/lib/python3.7/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
     98     Variable._execution_engine.run_backward(
     99         tensors, grad_tensors, retain_graph, create_graph,
--> 100         allow_unreachable=True)  # allow_unreachable flag
    101 
    102 

RuntimeError: start (1250) + length (1250) exceeds dimension size (1250).

KeAWang commented 4 years ago

Are you also using 8 devices in the gpytorch tutorial?

Bibyutatsu commented 4 years ago

Hi KeAWang, yes, I am using 8 devices.

KeAWang commented 4 years ago

It's a bit hard for me to reproduce this issue at the moment, as I don't have access to 8 GPUs. Are you able to reproduce this on, say, 1 GPU with checkpointing?

Bibyutatsu commented 4 years ago

Hi KeAWang, no, with 1 GPU it works correctly; the error only occurs when I use multiple GPUs.
