cornellius-gp / gpytorch

A highly efficient implementation of Gaussian Processes in PyTorch

[Bug] Error handling on model predictions #1033

Closed: khameedk closed this issue 3 years ago

khameedk commented 4 years ago

🐛 Bug

When running the code below to train a Gaussian process, I get the error shown in the stack trace. I haven't encountered an error thrown this way before, so I'm unable to debug it post-mortem. I've attached two input CSV files for the program: one throws the error (trained_agent.csv) and the other doesn't. Files are linked below. data_files.zip

To reproduce

Code snippet to reproduce

import pandas as pd
import numpy as np
import torch
import gpytorch

class MultitaskGPModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super(MultitaskGPModel, self).__init__(train_x, train_y, likelihood)
        num_tasks = train_y.shape[-1]
        self.mean_module = gpytorch.means.MultitaskMean(
            gpytorch.means.ConstantMean(), num_tasks=num_tasks
        )
        self.covar_module = gpytorch.kernels.MultitaskKernel(
            gpytorch.kernels.RBFKernel(), num_tasks=num_tasks, rank=1
        )

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultitaskMultivariateNormal(
            mean_x, covar_x
        )

def check_for_tensor_and_convert(x):
    """Checks if the input is a torch.Tensor. If not, convert it. Also, returns as a float to ensure conformity between tensors.

    Args:
        x (iterable): input data.

    Returns:
        torch.Tensor: converted data.
    """
    if not isinstance(x, torch.Tensor):
        x = np.array(x)
        x = torch.from_numpy(x)
    return x.float()

def convert_array_from_string(array_as_string):
    """Converts a string that appears as a certain kind of list to a numerical array. For example, "[1 2 3]" will be converted to the array [1, 2, 3].

    Args:
        array_as_string (str): The array as a string.

    Returns:
        numpy.ndarray: The converted array.
    """
    l = array_as_string[1:-1].split(' ')
    array = [float(x) for x in l if x != '']
    return np.array(array)

def import_episode_data_from_file(file_name):
    """Imports a file that can be used in a regresssion problem.

    Args:
        file_name (str): The file containing the data.

    Returns:
        np.array, np.array: Matrices corresponding to the predictors and
        responses to be used for regression.
    """
    df = pd.read_csv(file_name)
    # convert string columns
    df['image'] = df['image'].apply(convert_array_from_string)
    df['action'] = df['action'].apply(convert_array_from_string)
    df['action'] = np.roll(df['action'], -1)

    def concat(row):
        return np.concatenate( [row['action'], row['image']] )
    predictors = df.apply(concat, axis=1)
    predictors = np.vstack(predictors)
    predictors = predictors[:-1, :]
    responses = np.vstack(df['image'][1:])
    return predictors, responses

def train_gp(model, likelihood, train_x, train_y, n_iter=2):
    """Trains a Gaussian process with the given training data.

    Credit to https://gpytorch.readthedocs.io/en/latest/examples/01_Exact_GPs/Simple_GP_Regression.html#Training-the-model

    Args:
        model (gpytorch.models.ExactGP): the model to train.
        likelihood (gpytorch.likelihoods.Likelihood): A likelihood that is
            compatible with the model.
        train_x (torch.Tensor): The training covariates, as a matrix.
        train_y (torch.Tensor): The training responses, may be a vector or matrix.
        n_iter (int, optional): The number of iterations to train the Gaussian process.

    Returns:
        (gpytorch.models.ExactGP, gpytorch.likelihoods.Likelihood): The trained
        model and the likelihood used.
    """
    # Find optimal model hyperparameters
    model.train()
    likelihood.train()
    # Use the adam optimizer
    optimizer = torch.optim.Adam([
        {'params': model.parameters()}, # Includes GaussianLikelihood parameters
    ], lr=0.01)

    # "Loss" for GPs - the marginal log likelihood
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    for i in range(n_iter):
        optimizer.zero_grad()
        output = model(train_x)
        loss = -mll(output, train_y)
        loss.backward()
        print('Iter %d/%d - Loss: %.3f' % (i + 1, n_iter, loss.item()))
        optimizer.step()

    return model, likelihood

if __name__ == '__main__':
    X, y = import_episode_data_from_file('logs/test_runs/trained_agent.csv')
    X = check_for_tensor_and_convert(X)
    y = check_for_tensor_and_convert(y)

    likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(
        num_tasks=y.shape[-1]
    )
    model = MultitaskGPModel(X, y, likelihood)

    model, likelihood = train_gp(
        model, likelihood, X, y, n_iter=2
    )

    test_x = check_for_tensor_and_convert(np.ones((10, 34)))
    model.training = False
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        predictions = likelihood(model(test_x))

Stack trace/error message

Traceback (most recent call last):
  File "test.py", line 138, in <module>
    predictions = likelihood(model(test_x))
  File "C:\Users\user\Anaconda3\lib\site-packages\gpytorch\models\exact_gp.py", line 326, in __call__
    predictive_mean, predictive_covar = self.prediction_strategy.exact_prediction(full_mean, full_covar)
  File "C:\Users\user\Anaconda3\lib\site-packages\gpytorch\models\exact_prediction_strategies.py", line 302, in exact_prediction
    self.exact_predictive_mean(test_mean, test_train_covar),
  File "C:\Users\user\Anaconda3\lib\site-packages\gpytorch\models\exact_prediction_strategies.py", line 320, in exact_predictive_mean
    res = (test_train_covar @ self.mean_cache.unsqueeze(-1)).squeeze(-1)
  File "C:\Users\user\Anaconda3\lib\site-packages\gpytorch\utils\memoize.py", line 34, in g
    add_to_cache(self, cache_name, method(self, *args, **kwargs))
  File "C:\Users\user\Anaconda3\lib\site-packages\gpytorch\models\exact_prediction_strategies.py", line 269, in mean_cache
    mean_cache = train_train_covar.inv_matmul(train_labels_offset).squeeze(-1)
  File "C:\Users\user\Anaconda3\lib\site-packages\gpytorch\lazy\lazy_tensor.py", line 939, in inv_matmul
    return func.apply(self.representation_tree(), False, right_tensor, *self.representation())
  File "C:\Users\user\Anaconda3\lib\site-packages\gpytorch\functions\_inv_matmul.py", line 47, in forward
    solves = _solve(lazy_tsr, right_tensor)
  File "C:\Users\user\Anaconda3\lib\site-packages\gpytorch\functions\_inv_matmul.py", line 15, in _solve
    return lazy_tsr._solve(rhs, preconditioner)
  File "C:\Users\user\Anaconda3\lib\site-packages\gpytorch\lazy\lazy_tensor.py", line 655, in _solve
    preconditioner=preconditioner,
  File "C:\Users\user\Anaconda3\lib\site-packages\gpytorch\utils\linear_cg.py", line 271, in linear_cg
    curr_conjugate_vec,
RuntimeError: Expected object of type Variable but found type CPUFloatType for argument #0 'self'
The above operation failed in interpreter, with the following stack trace:
at C:\Users\user\Anaconda3\lib\site-packages\gpytorch\utils\linear_cg.py:55:4
    eps,
    beta,
    residual,
    precond_residual,
    mul_storage,
    is_zero,
    curr_conjugate_vec,
):
    torch.mul(curr_conjugate_vec, mvms, out=mul_storage)
    torch.sum(mul_storage, dim=-2, keepdim=True, out=alpha)
    ~~~~~~~~~ <--- HERE

    # Do a safe division here
    torch.lt(alpha, eps, out=is_zero)
    alpha.masked_fill_(is_zero, 1)
    torch.div(residual_inner_prod, alpha, out=alpha)
    alpha.masked_fill_(is_zero, 0)

    # We'll cancel out any updates by setting alpha=0 for any vector that has already converged
    alpha.masked_fill_(has_converged, 0)

Expected

With the given input data files, the program should simply print the training loss for each iteration and complete without errors. For example,

Iter 1/2 - Loss: 12250.880
Iter 2/2 - Loss: 12203.549

System information


Additional context

The code happens to work on macOS Mojave 10.14.6 (GPyTorch 0.3.6, PyTorch 1.3.0) with both input files.

jacobrgardner commented 4 years ago

Hi @khameedk ,

Glancing at the error (I haven't had a chance to run your code yet), it looks like your issue may just be that you need to account for the fact that numpy's default dtype is float64 / double precision, while torch's is float32 / single precision. Depending on whether you are using a GPU and have lots of data, you'll either need to convert your GP model objects and the like to double precision (e.g., via model.double()), or convert your data to single precision (e.g., via torch.from_numpy(x).float()).
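
A minimal sketch of the two options (synthetic stand-in data; the model/likelihood casts are commented out because they assume objects that already exist):

import numpy as np
import torch

x_np = np.ones((5, 3))                # numpy defaults to float64
assert x_np.dtype == np.float64

# Option 1: cast the data down to torch's default single precision.
x32 = torch.from_numpy(x_np).float()
assert x32.dtype == torch.float32

# Option 2: keep the data in double precision and cast the model up instead.
# For an existing GPyTorch model and likelihood this would be:
#   model = model.double()
#   likelihood = likelihood.double()
x64 = torch.from_numpy(x_np)          # stays float64
assert x64.dtype == torch.float64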

Also, looking at the losses in your output: those values are gigantic, which suggests to me that you haven't normalized your data. In general, the initial hyperparameter settings we use in GPyTorch roughly assume standardized / normalized training features and labels, so you'll likely need to either normalize your data or change the hyperparameter initializations to get good performance.

ahlusar1989 commented 4 years ago

Thanks @jacobrgardner for the detailed explanation. We will experiment with the strategies that you have suggested and report back as soon as possible.

khameedk commented 4 years ago

Thank you for the suggestions, Jake! The check_for_tensor_and_convert function takes care of the float32 conversion. I have tried converting the model to double, which predictably yields

*** RuntimeError: Expected object of scalar type Double but got scalar type Float for argument #2 'other' in call to _th_equal

Making both the data and the model double produces the original error.
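
For reference, the all-double configuration was along these lines (a sketch using the names from the original script; not the exact code we ran):

X, y = import_episode_data_from_file('logs/test_runs/trained_agent.csv')
X64 = torch.from_numpy(X)   # numpy arrays come out of the loader as float64
y64 = torch.from_numpy(y)
likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(
    num_tasks=y64.shape[-1]
).double()
model = MultitaskGPModel(X64, y64, likelihood).double()
test_x = torch.ones(10, 34, dtype=torch.float64)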

In terms of performance, standardizing produces larger losses, which suggests we should tune the hyperparameter initializations as you suggested.

gpleiss commented 4 years ago

@khameedk - can you modify your __name__ == '__main__' block to be:

if __name__ == '__main__':
    X, y = import_episode_data_from_file('logs/test_runs/trained_agent.csv')
    X = check_for_tensor_and_convert(X)
    y = check_for_tensor_and_convert(y)

    likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(
        num_tasks=y.shape[-1]
    )
    print(X.dtype, y.dtype)
    model = MultitaskGPModel(X, y, likelihood)

    model, likelihood = train_gp(
        model, likelihood, X, y, n_iter=2
    )

Your code looks fine to me at a glance, but something seems to be wrong.

How many tasks do you have? That's the other thing that might be associated with the large loss. And by "standardizing", did you z-score the inputs and targets?

khameedk commented 4 years ago

@gpleiss I get the output

torch.float32 torch.float32
Iter 1/2 - Loss: 12340.818
Iter 2/2 - Loss: 12291.732

There are 32 tasks on 34 inputs. For standardizing we used sklearn.preprocessing.StandardScaler on the inputs and outputs.

gpleiss commented 4 years ago

(Sorry for the slow reply - I've been swamped with ICML and KDD)

Sorry - I just realized from your original issue that the error occurred during testing. Try:

if __name__ == '__main__':
    likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(
        num_tasks=y.shape[-1]
    )
    model = MultitaskGPModel(X, y, likelihood)
    test_x = check_for_tensor_and_convert(np.ones((10, 34)))
    print(test_x.dtype)
    model.training = False
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        predictions = likelihood(model(test_x))

Can you also check the mean and standard deviation coming out of your scaled data? Those loss values are WAY too large for z-scored data. On z-scored data, a loss of ~1-2 would correspond to random guessing. The losses you're posting mean that the GP is doing roughly 10^12000 times worse than random guessing.
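
Something like the following is a quick sanity check (a sketch with synthetic stand-in data; X_raw is hypothetical):

import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X_raw = rng.normal(loc=50.0, scale=7.0, size=(200, 34))  # stand-in features

X_scaled = StandardScaler().fit_transform(X_raw)

# Column-wise mean should be ~0 and std ~1 after z-scoring.
print(X_scaled.mean(axis=0))
print(X_scaled.std(axis=0))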

khameedk commented 4 years ago

Just for completeness, I ran this:

if __name__ == '__main__':
    X, y = import_episode_data_from_file('logs/test_runs/trained_agent.csv')
    from sklearn import preprocessing
    scaler = preprocessing.StandardScaler()
    X, y = scaler.fit_transform(X), scaler.fit_transform(y)

    X = check_for_tensor_and_convert(X)
    y = check_for_tensor_and_convert(y)

    likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(
        num_tasks=y.shape[-1]
    )
    model = MultitaskGPModel(X, y, likelihood)
    test_x = check_for_tensor_and_convert(np.ones((10, 34)))
    print(test_x.dtype)
    model.training = False
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        predictions = likelihood(model(test_x))

With the output:

torch.float32
Traceback (most recent call last):
  File "transfer_learning_module\test.py", line 137, in <module>
    predictions = likelihood(model(test_x))
  File "C:\Users\hamee\Anaconda3\lib\site-packages\gpytorch\models\exact_gp.py", line 291, in __call__
    predictive_mean, predictive_covar = self.prediction_strategy.exact_prediction(full_mean, full_covar)
  File "C:\Users\hamee\Anaconda3\lib\site-packages\gpytorch\models\exact_prediction_strategies.py", line 289, in exact_prediction   
    self.exact_predictive_mean(test_mean, test_train_covar),
  File "C:\Users\hamee\Anaconda3\lib\site-packages\gpytorch\models\exact_prediction_strategies.py", line 307, in exact_predictive_mean
    res = (test_train_covar @ self.mean_cache.unsqueeze(-1)).squeeze(-1)
  File "C:\Users\hamee\Anaconda3\lib\site-packages\gpytorch\utils\memoize.py", line 34, in g
    add_to_cache(self, cache_name, method(self, *args, **kwargs))
  File "C:\Users\hamee\Anaconda3\lib\site-packages\gpytorch\models\exact_prediction_strategies.py", line 261, in mean_cache
    mean_cache = train_train_covar.inv_matmul(train_labels_offset).squeeze(-1)
  File "C:\Users\hamee\Anaconda3\lib\site-packages\gpytorch\lazy\lazy_tensor.py", line 928, in inv_matmul
    *self.representation(),
  File "C:\Users\hamee\Anaconda3\lib\site-packages\gpytorch\functions\_inv_matmul.py", line 46, in forward
    solves = _solve(lazy_tsr, right_tensor)
  File "C:\Users\hamee\Anaconda3\lib\site-packages\gpytorch\functions\_inv_matmul.py", line 14, in _solve
    return lazy_tsr._solve(rhs, preconditioner)
  File "C:\Users\hamee\Anaconda3\lib\site-packages\gpytorch\lazy\lazy_tensor.py", line 641, in _solve
    preconditioner=preconditioner,
  File "C:\Users\hamee\Anaconda3\lib\site-packages\gpytorch\utils\linear_cg.py", line 252, in linear_cg
    curr_conjugate_vec,
RuntimeError: Expected object of type Variable but found type CPUFloatType for argument #0 'self'
The above operation failed in interpreter, with the following stack trace:
at C:\Users\hamee\Anaconda3\lib\site-packages\gpytorch\utils\linear_cg.py:44:4
@torch.jit.script
def _jit_linear_cg_updates_no_precond(
    mvms, result, has_converged, alpha, residual_inner_prod, eps, beta, residual, precond_residual,
    mul_storage, is_zero, curr_conjugate_vec
):
    torch.mul(curr_conjugate_vec, mvms, out=mul_storage)
    torch.sum(mul_storage, dim=-2, keepdim=True, out=alpha)
    ~~~~~~~~~ <--- HERE

    torch.lt(alpha, eps, out=is_zero)
    torch.div(residual_inner_prod, alpha, out=alpha)

    alpha.masked_fill_(has_converged, 0)

Here are the column-wise means and standard deviations:

>>> X.mean(axis=0)
tensor([ 6.6943e-09,  4.1310e-09, -1.0384e-08,  1.7502e-08, -1.8694e-08,
         3.4216e-09, -9.7857e-08, -3.8747e-11,  9.0441e-08,  4.8213e-08,
        -4.4827e-09, -1.2161e-08, -1.7672e-08, -1.8229e-08, -2.7817e-08,
        -2.7838e-09,  3.8175e-08,  2.0299e-08,  6.4976e-09, -4.1915e-08,
         3.3883e-08,  4.0768e-08, -2.8244e-08,  1.0272e-07, -3.9581e-09,
        -4.6162e-08, -4.5149e-08,  1.4014e-08, -1.6047e-08, -6.7002e-09,
         1.9063e-08,  1.6453e-08,  8.6435e-09, -5.9289e-08])
>>> X.std(axis=0)
tensor([1.0001, 1.0000, 1.0000, 1.0000, 1.0001, 1.0001, 1.0001, 1.0001, 1.0001,
        1.0001, 1.0001, 1.0001, 1.0001, 1.0001, 1.0001, 1.0001, 1.0001, 1.0001,
        1.0001, 1.0001, 1.0001, 1.0001, 1.0001, 1.0001, 1.0000, 1.0001, 1.0001,
        1.0001, 1.0001, 1.0001, 1.0000, 1.0001, 1.0001, 1.0001])
>>> y.mean(axis=0)
tensor([ 9.0513e-08, -1.9779e-08,  1.8169e-08, -1.9373e-10, -4.6812e-08,
        -3.0558e-09, -2.1126e-08, -8.6018e-08,  4.4350e-09,  1.6453e-08,
        -1.0694e-08, -2.1263e-08, -2.2968e-08, -1.4480e-07, -2.7087e-08,
         3.2149e-08, -9.0608e-09, -9.3471e-08, -6.1780e-08, -1.3949e-09,
         5.6380e-08,  4.1000e-08, -2.3451e-08, -2.0196e-08,  2.4321e-09,
         3.7793e-09, -3.0044e-08, -1.3937e-08, -8.7031e-09, -1.5223e-07,
        -6.9382e-08,  2.0756e-08])
>>> y.std(axis=0)
tensor([1.0001, 1.0000, 1.0001, 1.0001, 1.0001, 1.0001, 1.0001, 1.0001, 1.0001,
        1.0000, 1.0001, 1.0000, 1.0001, 1.0001, 1.0001, 1.0000, 1.0001, 1.0001,
        1.0001, 1.0001, 1.0001, 1.0001, 1.0001, 1.0000, 1.0001, 1.0000, 1.0001,
        1.0001, 1.0000, 1.0001, 1.0001, 1.0001])

We'll try tuning the hyperparameter initializations shortly to address the large losses.
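
As a first step, we'll probably initialize the hyperparameters by hand before training, along these lines (a sketch: the attribute path assumes gpytorch's MultitaskKernel stores its data kernel as data_covar_module, and the numbers are placeholder guesses, not recommendations):

# Manually initialize hyperparameters before calling train_gp.
# Values here are placeholder guesses to be tuned.
model.covar_module.data_covar_module.lengthscale = 1.0  # RBF lengthscale
likelihood.noise = 0.1  # global observation noise term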