cornellius-gp / gpytorch

A highly efficient implementation of Gaussian Processes in PyTorch
MIT License

[Docs] Deep Sigma Point Processes #1538

Closed. EvanBaker57 closed this issue 3 years ago.

EvanBaker57 commented 3 years ago

Hi,

The DSPP documentation makes it seem like they are coded up very similarly to the standard Deep GPs, and this does seem to be true for two layers.

However, if I try to add a third layer (a second hidden layer), update the input and output dims, and make sure the forward call is correct, the loss function returns a vector rather than a single value (which the optimiser obviously doesn't like).

I'm guessing these losses are meant to be weighted according to the quadrature points. Is this true? Perhaps this could be made clearer in the documentation (or, alternatively, should this be done automatically in the backend?)
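
For what it's worth, here's a toy sketch of what I imagine the manual weighting would look like (just a guess: it assumes the returned vector has one entry per quadrature site and that the weights live in log space, which is what model.quad_weights in my example below suggests):

import math
import torch

Q = 8                                                # number of quadrature sites
log_quad_weights = torch.full((Q,), -math.log(Q))    # placeholder uniform log weights
per_site_ll = torch.randn(Q)                         # stand-in for the per-site objective values
scalar_loss = -torch.logsumexp(log_quad_weights + per_site_ll, dim=0)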

gpleiss commented 3 years ago

Hi @EvanBaker57 sorry for the slow reply. Would you mind posting a small reproducible code example?

EvanBaker57 commented 3 years ago

No worries,

Below is a basic extension of the DSPP in the documentation (it's necessarily a bit long). Note that this all works fine when dropped back to a two-layer DSPP (because it's then essentially the tutorial example).

import numpy as np
from matplotlib import pyplot as plt
import gpytorch
import torch
from gpytorch.likelihoods import GaussianLikelihood
from gpytorch.means import ConstantMean, LinearMean
from gpytorch.kernels import ScaleKernel, MaternKernel
from gpytorch.variational import VariationalStrategy, BatchDecoupledVariationalStrategy
from gpytorch.variational import MeanFieldVariationalDistribution
from gpytorch.models.deep_gps.dspp import DSPPLayer, DSPP
import gpytorch.settings as settings

# Generate some fake data
train_n = 100
x = np.linspace(0, 1, train_n)
y = np.sin(2 * x) + np.random.normal(size=train_n, loc=0, scale=0.1)
train_x = torch.from_numpy(x.reshape(-1,1)).float() 
train_y = torch.from_numpy(y).float() 

batch_size = 1000                 # Size of minibatch
num_inducing_pts = 300            # Number of inducing points in each hidden layer
num_epochs = 1000                # Number of epochs to train for
initial_lr = 0.01                 # Initial learning rate
hidden_dim = 3                    # Number of GPs (i.e., the width) in the hidden layer.
num_quadrature_sites = 8          # Number of quadrature sites (see paper for a description of this. 5-10 generally works well).

milestones = (np.linspace(0, num_epochs, 5)[1:-1]).astype(int).tolist()      # Epochs at which we will lower the learning rate by a factor of 0.1

# Minibatch data loading
from torch.utils.data import TensorDataset, DataLoader
train_dataset = TensorDataset(train_x, train_y)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

from scipy.cluster.vq import kmeans2

# Use k-means to initialize inducing points (only helpful for the first layer)
inducing_points = (train_x[torch.randperm(min(1000 * 100, train_n))[0:num_inducing_pts], :])
inducing_points = inducing_points.clone().data.cpu().numpy()
inducing_points = torch.tensor(kmeans2(train_x.data.cpu().numpy(),
                               inducing_points, minit='matrix')[0])

class DSPPHiddenLayer(DSPPLayer):
    def __init__(self, input_dims, output_dims, num_inducing=300, inducing_points=None, mean_type='constant', Q=8):
        if inducing_points is not None and output_dims is not None and inducing_points.dim() == 2:
            # The inducing points were passed in, but the shape doesn't match the number of GPs in this layer.
            # Let's assume we wanted to use the same inducing point initialization for each GP in the layer,
            # and expand the inducing points to match this.
            inducing_points = inducing_points.unsqueeze(0).expand((output_dims,) + inducing_points.shape)
            inducing_points = inducing_points.clone() + 0.01 * torch.randn_like(inducing_points)
        if inducing_points is None:
            # No inducing points were specified, let's just initialize them randomly.
            if output_dims is None:
                # An output_dims of None implies there is only one GP in this layer
                # (e.g., the last layer for univariate regression).
                inducing_points = torch.randn(num_inducing, input_dims)
            else:
                inducing_points = torch.randn(output_dims, num_inducing, input_dims)
        else:
            # Get the number of inducing points from the ones passed in.
            num_inducing = inducing_points.size(-2)

        # Let's use mean field / diagonal covariance structure.
        variational_distribution = MeanFieldVariationalDistribution(
            num_inducing_points=num_inducing,
            batch_shape=torch.Size([output_dims]) if output_dims is not None else torch.Size([])
        )

        # Standard variational inference.
        variational_strategy = VariationalStrategy(
            self,
            inducing_points,
            variational_distribution,
            learn_inducing_locations=True
        )

        batch_shape = torch.Size([]) if output_dims is None else torch.Size([output_dims])

        super(DSPPHiddenLayer, self).__init__(variational_strategy, input_dims, output_dims, Q)

        if mean_type == 'constant':
            # We'll use a constant mean for the final output layer.
            self.mean_module = ConstantMean(batch_shape=batch_shape)
        elif mean_type == 'linear':
            # As in Salimbeni et al. 2017, we find that using a linear mean for the hidden layer improves performance.
            self.mean_module = LinearMean(input_dims, batch_shape=batch_shape)

        self.covar_module = ScaleKernel(MaternKernel(batch_shape=batch_shape, ard_num_dims=input_dims),
                                        batch_shape=batch_shape, ard_num_dims=None)

    def forward(self, x, mean_input=None, **kwargs):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

class ThreeLayerDSPP(DSPP):
    def __init__(self, train_x_shape, inducing_points, num_inducing, hidden_dim=3, Q=3):
        hidden_layer1 = DSPPHiddenLayer(
            input_dims=train_x_shape[-1],
            output_dims=hidden_dim,
            mean_type='linear',
            inducing_points=inducing_points,
            Q=Q,
        )
        hidden_layer2 = DSPPHiddenLayer(
            input_dims=hidden_layer1.output_dims,
            output_dims=hidden_dim,
            mean_type='linear',
            inducing_points=None,
            num_inducing=num_inducing,
            Q=Q,
        )
        last_layer = DSPPHiddenLayer(
            input_dims=hidden_layer2.output_dims,
            output_dims=None,
            mean_type='constant',
            inducing_points=None,
            num_inducing=num_inducing,
            Q=Q,
        )

        likelihood = GaussianLikelihood()

        super().__init__(Q)
        self.likelihood = likelihood
        self.last_layer = last_layer
        self.hidden_layer2 = hidden_layer2
        self.hidden_layer1 = hidden_layer1

    def forward(self, inputs, **kwargs):
        hidden_rep1 = self.hidden_layer1(inputs, **kwargs)
        hidden_rep2 = self.hidden_layer2(hidden_rep1, **kwargs)
        output = self.last_layer(hidden_rep2, **kwargs)
        return output

    def predict(self, loader):
        with settings.fast_computations(log_prob=False, solves=False), torch.no_grad():
            mus, variances = [], []
            for x_batch in loader:
                preds = self.likelihood(self(x_batch[0], mean_input=x_batch[0]))
                mus.append(preds.mean.cpu())
                variances.append(preds.variance.cpu())

        return torch.cat(mus, dim=-1), torch.cat(variances, dim=-1)

model = ThreeLayerDSPP(
    train_x.shape,
    inducing_points,
    num_inducing=num_inducing_pts,
    hidden_dim=hidden_dim,
    Q=num_quadrature_sites
)

model.train()

from gpytorch.mlls import DeepPredictiveLogLikelihood
adam = torch.optim.Adam([{'params': model.parameters()}], lr=initial_lr, betas=(0.9, 0.999))
sched = torch.optim.lr_scheduler.MultiStepLR(adam, milestones=milestones, gamma=0.1)

# The "beta" parameter here corresponds to \beta_{reg} from the paper, and represents a scaling factor on the KL divergence
# portion of the loss.
objective = DeepPredictiveLogLikelihood(model.likelihood, model, num_data=train_n, beta=0.05)

epochs_iter = range(num_epochs)
losses = []

for i in epochs_iter:
    minibatch_iter = train_loader
    for x_batch, y_batch in minibatch_iter:
        adam.zero_grad()
        output = model(x_batch)
        loss = -objective(output, y_batch)
        loss.backward()
        adam.step()
    sched.step()
    if i % 10 == 0:
        print('Iter %d/%d - Loss: %.3f      Noise: %.5f' % (
            i + 1, num_epochs, loss.item(), model.likelihood.noise_covar.noise))
    losses.append(loss.item())

plt.plot(range(0, len(losses)), losses[0:])
plt.show()

n_test = 1000
x_test = np.linspace(-0.2,1.2, num = n_test)
test_x = torch.from_numpy(x_test.reshape(-1,1)).float()

#do preds
test_dataset = TensorDataset(test_x)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)    

model.eval()
means, vars = model.predict(test_loader)
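# model.quad_weights are log mixture weights, so exponentiate them before use.
# The predictive mean/variance below are the moments of the Gaussian mixture over
# quadrature sites (the variance follows from the law of total variance:
# E[var] + E[mean^2] - (E[mean])^2).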
weights = model.quad_weights.unsqueeze(-1).exp().cpu()
mean = (weights * means).sum(0)
var = (weights * vars).sum(0) + (weights * means**2).sum(0) - (weights * means).sum(0)**2

with torch.no_grad(): 
    plt.plot(test_x[:,0], mean, zorder = 5)
    plt.fill_between(test_x[:,0], mean - 2*np.sqrt(var), mean + 2*np.sqrt(var), alpha = 0.5, zorder = 4)
    plt.scatter(train_x, train_y, color = "black", zorder = 1)  # zorder puts the points on top
    plt.show()

gpleiss commented 3 years ago

@EvanBaker57 try now - this should be fixed.

EvanBaker57 commented 3 years ago

Sorry for the slow reply. I can indeed get 3-layer DSPPs working now, thank you!