cornellius-gp / gpytorch

A highly efficient implementation of Gaussian Processes in PyTorch
MIT License

lanczos_tridiag error #410

Closed h-jia closed 5 years ago

h-jia commented 5 years ago

I'm using a DKL model for regression, but I ran into this error during testing. Any suggestions?

Iter 1/0 - Loss: 0.970
Iter 1/1 - Loss: 0.965
Iter 1/2 - Loss: 0.962
Iter 1/3 - Loss: 0.960
Iter 1/4 - Loss: 0.956
Iter 1/5 - Loss: 0.952
Iter 1/6 - Loss: 0.949
Iter 1/7 - Loss: 0.946
Iter 1/8 - Loss: 0.942
Iter 1/9 - Loss: 0.939
Iter 1/10 - Loss: 0.935
Traceback (most recent call last):
  File "rrgp.py", line 165, in <module>
    preds = test(epoch)
  File "rrgp.py", line 155, in test
    preds = model(test_x)
  File "/anaconda3-server/lib/python3.6/site-packages/gpytorch/models/exact_gp.py", line 166, in __call__
    non_batch_train=non_batch_train,
  File "/anaconda3-server/lib/python3.6/site-packages/gpytorch/functions/__init__.py", line 128, in exact_predictive_covar
    return full_covar.exact_predictive_covar(train_inputs, num_train, likelihood, precomputed_cache, non_batch_train)
  File "/anaconda3-server/lib/python3.6/site-packages/gpytorch/lazy/lazy_evaluated_kernel_tensor.py", line 242, in exact_predictive_covar
    train_inputs, num_train, likelihood, precomputed_cache, non_batch_train
  File "/anaconda3-server/lib/python3.6/site-packages/gpytorch/lazy/interpolated_lazy_tensor.py", line 477, in exact_predictive_covar
    train_train_covar_inv_root = train_train_covar.root_inv_decomposition(probe_vectors, test_vectors).root
  File "/anaconda3-server/lib/python3.6/site-packages/gpytorch/lazy/lazy_tensor.py", line 1124, in root_inv_decomposition
    )(*self.representation())
  File "/anaconda3-server/lib/python3.6/site-packages/gpytorch/functions/_root_decomposition.py", line 51, in forward
    init_vecs=self.initial_vectors,
  File "/anaconda3-server/lib/python3.6/site-packages/gpytorch/utils/lanczos.py", line 81, in lanczos_tridiag
    t_mat[0, 0].copy_(alpha_0)
RuntimeError: expand(torch.cuda.FloatTensor{[1, 1]}, size=[1]): the number of sizes provided (1) must be greater or equal to the number of dimensions in the tensor (2)
gpleiss commented 5 years ago

@h-jia is there any chance you could provide example code, or an example notebook? My guess is that you're probably encountering a NaN issue - but that's hard to debug without a script that we can run.
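For reference, one quick way to check for that is to assert that the extracted features are finite before they reach the kernel (a minimal sketch; `feature_extractor` and `x` here simply stand in for whatever deep network and input you are using):

    import torch

    # Hypothetical sanity check inside the GP model's forward():
    # NaN features will propagate into the kernel matrix and break
    # the Lanczos decomposition downstream.
    features = feature_extractor(x)
    assert not torch.isnan(features).any(), "NaNs in extracted features"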

h-jia commented 5 years ago

Thanks for the reply. The code is a little rough, but I hope it gives a general idea of where the problem happens. Thanks!

import torch.nn as nn
import torch
import gpytorch

nums = 60
sequence_length = nums
input_size = 2
hidden_size = 64
num_layers = 4
num_classes = nums
batch_size = 1
learning_rate = 0.01
epochs = 20

train_x = torch.randn(1, 60, 2)
train_y = torch.randn(60)

class Down(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, init_func=nn.init.normal_):
        super(Down, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=False)
        self.fc = nn.Linear(hidden_size, num_classes)
        if init_func is not None:
            init_func(self.lstm.weight_hh_l0)
            init_func(self.lstm.weight_ih_l0)
            init_func(self.fc.weight)

    def forward(self, x):
        # Set initial hidden and cell states
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)

        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :]).view(train_x.shape[0] * train_x.shape[1], -1)
        return out

feature_extractor = Down(input_size, hidden_size, num_layers, num_classes)

# We will use the simplest form of GP model, exact inference
class GPRegressionModel(gpytorch.models.ExactGP):
        def __init__(self, train_x, train_y, likelihood):
            super(GPRegressionModel, self).__init__(train_x, train_y, likelihood)
            self.feature_extractor = feature_extractor
            self.mean_module = gpytorch.means.ConstantMean()
            self.covar_module = gpytorch.kernels.GridInterpolationKernel(
                gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel(ard_num_dims=3)),
                num_dims=3, grid_size=5
            )

        def forward(self, x):
            projected_x = self.feature_extractor(x)
            projected_x = projected_x - projected_x.min(0)[0]
            projected_x = 2 * (projected_x / projected_x.max(0)[0]) - 1

            mean_x = self.mean_module(projected_x)
            covar_x = self.covar_module(projected_x)
            return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = GPRegressionModel(train_x, train_y, likelihood)
train_loss = []

optimizer = torch.optim.Adam([
    {'params': model.feature_extractor.parameters()},
    {'params': model.covar_module.parameters()},
    {'params': model.mean_module.parameters()},
    {'params': model.likelihood.parameters()},
], lr=learning_rate)

# "Loss" for GPs - the marginal log likelihood
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

train_loss = []

def train(epoch):
    with gpytorch.settings.use_toeplitz(True):

        model.train()
        likelihood.train()

        # for i, data in enumerate(train_loader):
        optimizer.zero_grad()
        prediction = model(train_x)
        loss = -mll(prediction, train_y)
        loss.backward()
        print('Iter %d - Loss: %.3f' % (epoch, loss.item()))
        train_loss.append(loss.cpu().data.numpy().tolist())
        optimizer.step()
        return loss

def test(epoch):

        model.eval()
        likelihood.eval()

        test_x = torch.randn(1, 60, 2)
        test_y = torch.randn(60)

        # for batch_size, (x_all, y_all) in enumerate(test_loader):
        test_x = test_x
        test_y = test_y.squeeze()
        with torch.no_grad(), gpytorch.settings.use_toeplitz(False), gpytorch.fast_pred_var():
            preds = model(test_x)
        print('Test MAE: {}'.format(torch.mean(torch.abs(preds.mean - test_y))))
        if torch.mean(torch.abs(preds.mean - test_y)) < 0.02:
            print("we can save all parameters")
        return preds

for epoch in range(epochs):
    loss = train(epoch)
    if epoch % 10 == 0 and epoch > 0:
        preds = test(epoch)
        print("test finished")
h-jia commented 5 years ago

I'd also like to know whether this is a good way to do it, since the loss in a simple GP decreases without a lower bound (it can go negative, e.g. -x.xxx). Is the best model the one with the lowest loss at test time?

gpleiss commented 5 years ago

@h-jia Hmm, I'm not seeing this error. What version of gpytorch are you using? We have added a couple of stability fixes recently, so this might actually be a solved problem.

Also, with regard to the loss: once it levels out, that's usually a good time to stop and test.
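As a rough illustration of "levels out" (purely a sketch, not a GPyTorch utility), one could stop once the average loss over the last few iterations stops improving:

    # Illustrative plateau check: stop training once the mean loss over the
    # last `window` iterations improves by less than `tol` compared with the
    # window before it. `train_loss` is the list kept in the script above.
    def has_plateaued(train_loss, window=10, tol=1e-3):
        if len(train_loss) < 2 * window:
            return False
        recent = sum(train_loss[-window:]) / window
        previous = sum(train_loss[-2 * window:-window]) / window
        return previous - recent < tol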

h-jia commented 5 years ago

@gpleiss GPyTorch: 0.1.0rc5, PyTorch: torch-nightly-1.0.0.dev20181206

Could you post the GPyTorch and PyTorch versions you're using, please? After uninstalling and reinstalling I still get the error below. Thanks!

Iter 0 - Loss: 1.513
Iter 1 - Loss: 1.499
Iter 2 - Loss: 1.505
Iter 3 - Loss: 1.506
Iter 4 - Loss: 1.502
Iter 5 - Loss: 1.495
Iter 6 - Loss: 1.489
Iter 7 - Loss: 1.488
Iter 8 - Loss: 1.489
Iter 9 - Loss: 1.485
Iter 10 - Loss: 1.481
Traceback (most recent call last):
  File "newtest.py", line 127, in <module>
    preds = test(epoch)
  File "newtest.py", line 117, in test
    preds = model(test_x)
  File "/anaconda3-server/lib/python3.6/site-packages/gpytorch/models/exact_gp.py", line 158, in __call__
    non_batch_train=non_batch_train,
  File "/anaconda3-server/lib/python3.6/site-packages/gpytorch/functions/__init__.py", line 99, in exact_predictive_mean
    full_mean, train_inputs, train_labels, num_train, likelihood, precomputed_cache, non_batch_train
  File "/anaconda3-server/lib/python3.6/site-packages/gpytorch/lazy/lazy_evaluated_kernel_tensor.py", line 233, in exact_predictive_mean
    full_mean, train_inputs, train_labels, num_train, likelihood, precomputed_cache, non_batch_train
  File "/anaconda3-server/lib/python3.6/site-packages/gpytorch/lazy/interpolated_lazy_tensor.py", line 389, in exact_predictive_mean
    res = left_interp(test_interp_indices, test_interp_values, precomputed_cache).squeeze(-1) + test_mean
  File "/anaconda3-server/lib/python3.6/site-packages/gpytorch/utils/interpolation.py", line 207, in left_interp
    res = rhs.unsqueeze(-2).expand(*rhs_size).gather(-3, interp_indices_expanded)
RuntimeError: expand(torch.FloatTensor{[1, 125, 1, 1]}, size=[125, 4, 1]): the number of sizes provided (3) must be greater or equal to the number of dimensions in the tensor (4)
jacobrgardner commented 5 years ago

@h-jia I was able to fix your problem by addressing a few issues in your GP model specification.

  1. Your train_x is batch mode (e.g., b x n x d), but after projecting in forward, projected_x is not (e.g. it is n x d). You need to unsqueeze projected_x to be 1 x n x d so that it matches the batch shape of x coming in. This was causing the mean module to not recognize that we were in batch mode, leading to the issue.
  2. In addition, your kernel specification has a few issues:
    1. You are specifying “num_dims=3” and “ard_num_dims=3”, but in your code projected_x has d=1 (e.g., it is 1 x n x 1 after unsqueezing). This is a silent failure we should fix, but these are invalid values for this data. I changed them to 1.
    2. Your grid size is very small (5). This is too few grid points to get a particularly accurate kernel matrix, and it results in numerical instability. I’d recommend increasing it, or else using the utility we provide for setting a grid size automatically (sketched below).
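Roughly, the relevant changes look like this (a sketch of points 1 and 2 above applied to the posted model; only the affected lines are shown, and `train_x`, `feature_extractor`, and `x` are the names from the script in this thread):

    import gpytorch

    # (2.2) pick a grid size automatically instead of grid_size=5
    grid_size = gpytorch.utils.grid.choose_grid_size(train_x)

    # (2.1) the projected features are 1-dimensional, so num_dims / ard_num_dims = 1
    covar_module = gpytorch.kernels.GridInterpolationKernel(
        gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel(ard_num_dims=1)),
        num_dims=1, grid_size=grid_size,
    )

    # (1) inside forward(): restore the batch dimension after projecting
    projected_x = feature_extractor(x)                   # n x 1
    projected_x = projected_x - projected_x.min(0)[0]
    projected_x = 2 * (projected_x / projected_x.max(0)[0]) - 1
    projected_x = projected_x.unsqueeze(0)               # 1 x n x 1, matches the batch shape of x
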
jacobrgardner commented 5 years ago

I’m removing the bug label for now, because I was able to fix this by solving some sizing issues with the GP model specification.

h-jia commented 5 years ago

@jacobrgardner Hi Jacob, thank you very much for your help! Yes, I had found those issues and changed them following your advice.

However, I'm still a bit confused about the test.

I switched to a very small learning rate and a tiny feature-extraction network, but I still get this error. For training, train_x is 1 x n x 2, and to keep the code simple I set test_x to the same shape, i.e. 1 x n x 2. (I also tried unsqueezing test_x to 1 x 1 x n x 2 for batch mode, but then I get the error "input must have 3 dimensions, got 4".)

When I use preds.mean I get the error "The size of tensor a (0) must match the size of tensor b (60) at non-singleton dimension 0", and preds.mean prints as tensor([]). It seems that preds is empty/NaN during testing.

Does my batch-mode setup have a problem?

I hope you can give me some advice when convenient. You have already saved me a lot of time, thanks!

The code below is what I changed it to; it should run directly.

import torch.nn as nn
import torch
import gpytorch

nums = 60
sequence_length = nums
input_size = 2
hidden_size = 64
num_layers = 4
num_classes = nums
batch_size = 1
learning_rate = 0.0000001
epochs = 20

train_x = torch.randn(1, 60, 2)
train_y = torch.randn(60)

class Down(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, init_func=nn.init.normal_):
        super(Down, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=False)
        self.fc = nn.Linear(hidden_size, num_classes)
        if init_func is not None:
            init_func(self.lstm.weight_hh_l0)
            init_func(self.lstm.weight_ih_l0)
            init_func(self.fc.weight)

    def forward(self, x):
        # Set initial hidden and cell states
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)

        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :]).view(train_x.shape[0] * train_x.shape[1], -1)
        return out

feature_extractor = Down(input_size, hidden_size, num_layers, num_classes)

# We will use the simplest form of GP model, exact inference
class GPRegressionModel(gpytorch.models.ExactGP):
        def __init__(self, train_x, train_y, likelihood):
            super(GPRegressionModel, self).__init__(train_x, train_y, likelihood)
            self.feature_extractor = feature_extractor
            self.mean_module = gpytorch.means.ConstantMean()
            grid_size = gpytorch.utils.grid.choose_grid_size(train_x)  # Fix 1
            self.covar_module = gpytorch.kernels.GridInterpolationKernel(
                gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel(ard_num_dims=1)),
                num_dims=1, grid_size=grid_size  # Fix 2
            )

        def forward(self, x):
            projected_x = self.feature_extractor(x)
            projected_x = projected_x - projected_x.min(0)[0]
            projected_x = 2 * (projected_x / projected_x.max(0)[0]) - 1
            projected_x = torch.unsqueeze(projected_x, 0)  # Fix 3
            mean_x = self.mean_module(projected_x)
            covar_x = self.covar_module(projected_x)
            return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = GPRegressionModel(train_x, train_y, likelihood)
train_loss = []

optimizer = torch.optim.Adam([
    {'params': model.feature_extractor.parameters()},
    {'params': model.covar_module.parameters()},
    {'params': model.mean_module.parameters()},
    {'params': model.likelihood.parameters()},
], lr=learning_rate)

# "Loss" for GPs - the marginal log likelihood
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

train_loss = []

def train(epoch):
    with gpytorch.settings.use_toeplitz(True):

        model.train()
        likelihood.train()

        # for i, data in enumerate(train_loader):
        optimizer.zero_grad()
        prediction = model(train_x)
        loss = -mll(prediction, train_y)
        loss.backward()
        print('Iter %d - Loss: %.3f' % (epoch, loss.item()))
        train_loss.append(loss.cpu().data.numpy().tolist())
        optimizer.step()
        return loss

def test(epoch):

        model.eval()
        likelihood.eval()

        test_x = torch.randn(1, 60, 2)
        test_y = torch.randn(60)

        # for batch_size, (x_all, y_all) in enumerate(test_loader):
        # torch.unsqueeze(test_x, 0)
        with torch.no_grad(), gpytorch.settings.use_toeplitz(False), gpytorch.fast_pred_var():
            preds = model(test_x)
        print(preds.mean)
        print('Test MAE: {}'.format(torch.mean(torch.abs(preds.mean - test_y))))
        if torch.mean(torch.abs(preds.mean - test_y)) < 0.02:
            print("we can save all parameters")
        return preds

for epoch in range(epochs):
    loss = train(epoch)
    if epoch % 10 == 0 and epoch > 0:
        preds = test(epoch)
        print("test finished")
jacobrgardner commented 5 years ago

@h-jia I'll try to look into this more tomorrow!

jacobrgardner commented 5 years ago

@h-jia Sorry for the delay. Finally had a chance to look at this. It looks like the issue is that your covariance function is always returning a 60 x 60 covariance matrix. For example:

        test_x = torch.randn(1, 60, 2)
        print('size', model.forward(test_x).covariance_matrix.shape)
        # size torch.Size([1, 60, 60])
        test_x = torch.randn(1, 120, 2)
        print('size 2', model.forward(test_x).covariance_matrix.shape)
        # size 2 torch.Size([1, 60, 60])

Since the training data is also 60 points, when you slice into this you get empty tensors.
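One way to make the number of projected points follow the number of inputs (an illustrative sketch, not necessarily the fix the reporter ended up with; the class and layer names are hypothetical) is to apply the linear layer to every timestep's hidden state instead of only the last one, so an n-point input yields n features:

    import torch
    import torch.nn as nn

    class PerStepDown(nn.Module):
        """Emit one feature per input point, so the GP covariance is n x n."""
        def __init__(self, input_size, hidden_size, num_layers):
            super().__init__()
            self.hidden_size = hidden_size
            self.num_layers = num_layers
            self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
            self.fc = nn.Linear(hidden_size, 1)  # applied per timestep, not just the last

        def forward(self, x):                             # x: b x n x input_size
            h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
            c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
            out, _ = self.lstm(x, (h0, c0))               # b x n x hidden_size
            out = self.fc(out)                            # b x n x 1
            return out.reshape(-1, 1)                     # (b * n) x 1 features

With a per-point projection like this, a 60-point input yields 60 projected points and a 120-point input yields 120, so slicing out the test portion no longer produces empty tensors.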

h-jia commented 5 years ago

@jacobrgardner Thanks for your kind and patient help. Yes, it seems my model was poorly designed. Once I fix the model I will share here how I modified it.

THX! 👍 :)

adam-rysanek commented 5 years ago

Hi @h-jia. Were you able to fix your model? I'd be interested in how you resolved it. Many thanks.