cornellius-gp / gpytorch

A highly efficient implementation of Gaussian Processes in PyTorch

GPyTorch vs. Sklearn Classification Model #1006

Closed quadrupole closed 4 years ago

quadrupole commented 4 years ago

Hello,

I am new to GPyTorch. In particular, I am trying to perform classification, and to that end I am comparing against the Sklearn implementation of GP classification. I have attached some code that illustrates the differences between GPyTorch and Sklearn on a synthetic 2D feature space, where I find that the GPyTorch model is overfitting the data. Hopefully someone can suggest what can be done to get the GPyTorch model closer to the Sklearn implementation.

import torch
import gpytorch
import math
import numpy as np
from sklearn.datasets import make_moons, make_circles
from sklearn.datasets import make_classification
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
default_seed = 10000
from gpytorch.models import ApproximateGP
from gpytorch.variational import CholeskyVariationalDistribution
from gpytorch.variational import UnwhitenedVariationalStrategy
from gpytorch.variational import VariationalStrategy

# Create dataset 
X, y = make_moons(noise=0.3, random_state=0)
Xt = torch.from_numpy(X).float()
yt = torch.from_numpy(y).float()

# Create evaluation grid
h = 0.05
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
X_eval = np.vstack((xx.reshape(-1), 
                    yy.reshape(-1))).T
X_eval = torch.from_numpy(X_eval).float()

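# Approximate GP classifier with one inducing point fixed at each training input;
# the variational distribution over the inducing values uses a full Cholesky factor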
class GPClassificationModel(ApproximateGP):
    def __init__(self, train_x):
        variational_distribution = CholeskyVariationalDistribution(train_x.size(0))
        variational_strategy = UnwhitenedVariationalStrategy(
            self, train_x, variational_distribution, learn_inducing_locations=False
        )
        super(GPClassificationModel, self).__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        latent_pred = gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
        return latent_pred

# Initialize model and likelihood
model = GPClassificationModel(Xt)
likelihood = gpytorch.likelihoods.BernoulliLikelihood()
training_iterations = 300
# Find optimal model hyperparameters
model.train()
likelihood.train()

# Use the adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

# "Loss" for GPs - the marginal log likelihood
# num_data refers to the number of training datapoints
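# combine_terms=False makes the mll call return the log-likelihood, KL divergence,
# and log-prior terms separately rather than a single combined scalar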
mll = gpytorch.mlls.VariationalELBO(likelihood, 
                                    model, 
                                    yt.numel(), 
                                    combine_terms=False)

for i in range(training_iterations):
    # Zero backpropped gradients from previous iteration
    optimizer.zero_grad()
    # Get predictive output
    output = model(Xt)
    # Calc loss and backprop gradients
    log_lik, kl_div, log_prior = mll(output, yt)
    loss = -(log_lik - kl_div + log_prior)
    #loss = -mll(output, yt)
    loss.backward()

    print('Iter %d/%d - Loss: %.3f lengthscale: %.3f outputscale: %.3f' % (
        i + 1, training_iterations, loss.item(),
        model.covar_module.base_kernel.lengthscale.item(),
        model.covar_module.outputscale.item() # There is no noise in the Bernoulli likelihood
    ))

    optimizer.step()

# Go into eval mode
model.eval()
likelihood.eval()

with torch.no_grad():    
    # Get classification predictions (the Bernoulli likelihood's mean is the predicted P(y = 1 | x))
    observed_pred = likelihood(model(X_eval))

    p = observed_pred.mean.numpy()
    Z_gpy = p.reshape(xx.shape)

optimizer = 'fmin_l_bfgs_b'
gp_skl = GaussianProcessClassifier(kernel=1.0**2*RBF(length_scale=1.),
                                  optimizer=optimizer).fit(X, y)

# plot the decision function for each datapoint on the grid
Z_skl = gp_skl.predict_proba(np.vstack((xx.ravel(), yy.ravel())).T)[:, 1]
Z_skl = Z_skl.reshape(xx.shape)

print("Kernel: {}".format(gp_skl.kernel_))
print("Log Marginal Likelihood: {}, {}".format(gp_skl.log_marginal_likelihood(gp_skl.kernel_.theta),
                                               gp_skl.log_marginal_likelihood_value_))

# Initialize fig and axes for plot
f, ax = plt.subplots(1, 2, figsize=(10, 3))
ax[0].contourf(xx,yy,Z_gpy, levels=16)
ax[1].contourf(xx,yy,Z_skl, levels=16)
ax[0].scatter(X[y == 0,0], X[y == 0,1])
ax[0].scatter(X[y == 1,0], X[y == 1,1])
ax[1].scatter(X[y == 0,0], X[y == 0,1])
ax[1].scatter(X[y == 1,0], X[y == 1,1])
ax[0].set_title('GPyTorch')
ax[1].set_title('Sklearn')
plt.show()

[attached image "compare": contour plots of the predicted probabilities with the training points overlaid, GPyTorch on the left vs Sklearn on the right]

gpleiss commented 4 years ago

GPyTorch and Sklearn use different approximations for classification: Sklearn (according to its documentation) uses the Laplace approximation, whereas GPyTorch uses variational inference, so the results are going to be different.
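For context, the variational model is trained by maximizing the ELBO, which (up to the prior term your loss already includes) roughly has the form

E_{q(f)}[ log p(y | f) ] - KL( q(u) || p(u) )

i.e. the log_lik and kl_div terms you get back from combine_terms=False, while Sklearn's Laplace approximation instead fits a Gaussian at the mode of the posterior and maximizes the resulting approximate marginal likelihood.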

That being said, the overfitting you are seeing in this example comes from the model not being fully optimized.

1) Set learn_inducing_locations=True. This lets the model learn the locations of the inducing points, which gives it more freedom to find a better fit, and it will optimize faster.
2) Use VariationalStrategy rather than UnwhitenedVariationalStrategy. This will also accelerate optimization.
3) If you don't want to make the other changes, just increase the number of training iterations.

A sketch of the model with the first two changes applied is below.

Hope this helps!
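Something like this, as a rough sketch of 1) and 2) applied to your model class (the rest of your script can stay the same, and Xt can still be used to initialize the inducing point locations):

class GPClassificationModel(ApproximateGP):
    def __init__(self, inducing_points):
        variational_distribution = CholeskyVariationalDistribution(inducing_points.size(0))
        # Whitened variational strategy with learned inducing locations
        variational_strategy = VariationalStrategy(
            self, inducing_points, variational_distribution, learn_inducing_locations=True
        )
        super().__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

model = GPClassificationModel(Xt)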