cornellius-gp / gpytorch

A highly efficient implementation of Gaussian Processes in PyTorch

Help with fitting a multidimensional GP regression with IndexKernel (and maybe AR1?) #323

Open sadatnfs opened 5 years ago

sadatnfs commented 5 years ago

Hi everyone,

I'm trying to fit a multidimensional regression where I have GPs on 3 dimensions:

Y_{l,a,t} = b1*X1 + b2*X2 + (GP by L) + (GP by A) + (GP by T)

where I'm treating X1 and X2 as fixed effects (and hence entering with a linear kernel), and L, A and T are all integers denoting location, age and time.

I'd like to have, let's say, a Matern and an RBF on T and A respectively (planning to have an AR1 on A), and an IID-like kernel on L. I found that the IndexKernel seems to fit the IID nature the best. I think I've implemented everything except the IndexKernel part in my code below (I know that the simulation is wrong, I'm just trying to learn how to fit the model first!), but I get a type error afterwards:

import torch
import pandas as pd
import numpy as np
import gpytorch
import gpytorch.kernels as kern

def cartesian(*arrays):
    mesh = np.meshgrid(*arrays)  # standard numpy meshgrid
    dim = len(mesh)  # number of dimensions
    elements = mesh[0].size  # number of elements, any index will do
    flat = np.concatenate(mesh).ravel()  # flatten the whole meshgrid
    reshape = np.reshape(flat, (dim, elements)).T  # reshape and transpose
    return reshape

def prep_data(name_list, *arrays):
    cartprod = cartesian(*arrays)
    dout = pd.DataFrame(cartprod)
    dout.columns = name_list
    return dout

## Simulate data placeholders
L = [i for i in range(20)]
A = [i for i in range(5)]
T = [i for i in range(30)]
dtt = prep_data(['L', 'A', 'T'], L,A,T)

## Simulate random values
beta_x1 = 0.4
beta_x2 = 0.3
LL_noise = 0.25

dtt['x1'] = np.random.uniform(size = dtt.shape[0])
dtt['x2'] = np.random.normal(size = dtt.shape[0])
dtt['y'] = beta_x1*dtt['x1'] + beta_x2*dtt['x2'] + np.random.normal(scale=LL_noise, size = dtt.shape[0])

## Create train x and y tensors
train_x = torch.from_numpy(dtt[['x1', 'x2', 'A', 'T', 'L']].values.astype(np.double)).to(torch.float)
train_y = torch.from_numpy(dtt[['y']].values[:,0].astype(np.double)).to(torch.float)

class GPRegressionModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super(GPRegressionModel, self).__init__(train_x, train_y, likelihood)

        # SKI requires a grid size hyperparameter. This util can help with that
#         grid_size = gpytorch.utils.grid.choose_grid_size(train_x)

        # Mean function
        self.mean_module = gpytorch.means.ConstantMean()

        # x1 and x2 (enter as linear)
        covar_module_zero = kern.LinearKernel(num_dimensions=2, active_dims=[0,1])

        # location
        covar_module_one = kern.GridInterpolationKernel(kern.IndexKernel(num_tasks=1), 
                                                   grid_size=7, active_dims=[4], num_dims=1)

        # age
        covar_module_two = kern.GridInterpolationKernel(kern.ScaleKernel(kern.RBFKernel()), 
                                                   grid_size=7, active_dims=[2], num_dims=1) 

        # time
        covar_module_three = kern.GridInterpolationKernel(kern.ScaleKernel(kern.MaternKernel()), 
                                                   grid_size=7, active_dims=[3], num_dims=1) 

        self.covar_module = covar_module_zero + covar_module_one + \
            covar_module_two + covar_module_three

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = GPRegressionModel(train_x, train_y, likelihood)

# Find optimal model hyperparameters
model.train()
likelihood.train()

# Use the adam optimizer
optimizer = torch.optim.Adam([
    {'params': model.parameters()},  # Includes GaussianLikelihood parameters
], lr=0.1)

# "Loss" for GPs - the marginal log likelihood
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

def train():
    training_iterations = 30
    for i in range(training_iterations):
        optimizer.zero_grad()
        output = model(train_x)
        loss = -mll(output, train_y)
        loss.backward()
        print('Iter %d/%d - Loss: %.3f' % (i + 1, training_iterations, loss.item()))
        optimizer.step()

and the error trace I get (which goes away if I comment out the IndexKernel part) is as follows:



<ipython-input-182-a138dc3fa672> in train()
     16         optimizer.zero_grad()
     17         output = model(train_x)
---> 18         loss = -mll(output, train_y)
     19         loss.backward()
     20         print('Iter %d/%d - Loss: %.3f' % (i + 1, training_iterations, loss.item()))

/opt/conda/lib/python3.6/site-packages/gpytorch/module.py in __call__(self, *inputs, **kwargs)
    179 
    180     def __call__(self, *inputs, **kwargs):
--> 181         outputs = self.forward(*inputs, **kwargs)
    182 
    183         if isinstance(outputs, tuple):

/opt/conda/lib/python3.6/site-packages/gpytorch/mlls/exact_marginal_log_likelihood.py in forward(self, output, target)
     47 
     48         # Get log determininat and first part of quadratic form
---> 49         inv_quad, log_det = covar.inv_quad_log_det(inv_quad_rhs=(target - mean).unsqueeze(-1), log_det=True)
     50 
     51         # Add terms for SGPR / when inducing points are learned

/opt/conda/lib/python3.6/site-packages/gpytorch/lazy/lazy_tensor.py in inv_quad_log_det(self, inv_quad_rhs, log_det)
    621         batch_size = self.size(0) if self.ndimension() == 3 else None
    622 
--> 623         args = lazy_tsr.representation()
    624         if inv_quad_rhs is not None:
    625             args = [inv_quad_rhs] + list(args)

/opt/conda/lib/python3.6/site-packages/gpytorch/lazy/lazy_tensor.py in representation(self)
    801                 representation.append(arg)
    802             elif isinstance(arg, LazyTensor):
--> 803                 representation += list(arg.representation())
    804             else:
    805                 raise RuntimeError("Representation of a LazyTensor should consist only of Tensors")

/opt/conda/lib/python3.6/site-packages/gpytorch/lazy/lazy_evaluated_kernel_tensor.py in representation(self)
    161 
    162     def representation(self):
--> 163         return self.evaluate_kernel().representation()
    164 
    165     def representation_tree(self):

/opt/conda/lib/python3.6/site-packages/gpytorch/lazy/lazy_evaluated_kernel_tensor.py in evaluate_kernel(self)
    143 
    144             self._cached_kernel_eval = super(Kernel, self.kernel).__call__(
--> 145                 x1, x2, diag=False, batch_dims=self.batch_dims, **self.params
    146             )
    147             if self.squeeze_row:

/opt/conda/lib/python3.6/site-packages/gpytorch/module.py in __call__(self, *inputs, **kwargs)
    179 
    180     def __call__(self, *inputs, **kwargs):
--> 181         outputs = self.forward(*inputs, **kwargs)
    182 
    183         if isinstance(outputs, tuple):

/opt/conda/lib/python3.6/site-packages/gpytorch/kernels/kernel.py in forward(self, x1, x2, **params)
    325             next_term = kern(x1, x2, **params)
    326             if isinstance(next_term, LazyEvaluatedKernelTensor):
--> 327                 next_term = next_term.evaluate_kernel()
    328             res = res + next_term
    329         return res

/opt/conda/lib/python3.6/site-packages/gpytorch/lazy/lazy_evaluated_kernel_tensor.py in evaluate_kernel(self)
    143 
    144             self._cached_kernel_eval = super(Kernel, self.kernel).__call__(
--> 145                 x1, x2, diag=False, batch_dims=self.batch_dims, **self.params
    146             )
    147             if self.squeeze_row:

/opt/conda/lib/python3.6/site-packages/gpytorch/module.py in __call__(self, *inputs, **kwargs)
    179 
    180     def __call__(self, *inputs, **kwargs):
--> 181         outputs = self.forward(*inputs, **kwargs)
    182 
    183         if isinstance(outputs, tuple):

/opt/conda/lib/python3.6/site-packages/gpytorch/kernels/kernel.py in forward(self, x1, x2, **params)
    325             next_term = kern(x1, x2, **params)
    326             if isinstance(next_term, LazyEvaluatedKernelTensor):
--> 327                 next_term = next_term.evaluate_kernel()
    328             res = res + next_term
    329         return res

/opt/conda/lib/python3.6/site-packages/gpytorch/lazy/lazy_evaluated_kernel_tensor.py in evaluate_kernel(self)
    143 
    144             self._cached_kernel_eval = super(Kernel, self.kernel).__call__(
--> 145                 x1, x2, diag=False, batch_dims=self.batch_dims, **self.params
    146             )
    147             if self.squeeze_row:

/opt/conda/lib/python3.6/site-packages/gpytorch/module.py in __call__(self, *inputs, **kwargs)
    179 
    180     def __call__(self, *inputs, **kwargs):
--> 181         outputs = self.forward(*inputs, **kwargs)
    182 
    183         if isinstance(outputs, tuple):

/opt/conda/lib/python3.6/site-packages/gpytorch/kernels/kernel.py in forward(self, x1, x2, **params)
    325             next_term = kern(x1, x2, **params)
    326             if isinstance(next_term, LazyEvaluatedKernelTensor):
--> 327                 next_term = next_term.evaluate_kernel()
    328             res = res + next_term
    329         return res

/opt/conda/lib/python3.6/site-packages/gpytorch/lazy/lazy_evaluated_kernel_tensor.py in evaluate_kernel(self)
    143 
    144             self._cached_kernel_eval = super(Kernel, self.kernel).__call__(
--> 145                 x1, x2, diag=False, batch_dims=self.batch_dims, **self.params
    146             )
    147             if self.squeeze_row:

/opt/conda/lib/python3.6/site-packages/gpytorch/module.py in __call__(self, *inputs, **kwargs)
    179 
    180     def __call__(self, *inputs, **kwargs):
--> 181         outputs = self.forward(*inputs, **kwargs)
    182 
    183         if isinstance(outputs, tuple):

/opt/conda/lib/python3.6/site-packages/gpytorch/kernels/grid_interpolation_kernel.py in forward(self, x1, x2, batch_dims, **params)
    177                 self.update_inducing_points_and_grid(inducing_points, grid)
    178 
--> 179         base_lazy_tsr = self._inducing_forward(batch_dims=batch_dims, **params)
    180         if x1.size(0) > 1:
    181             base_lazy_tsr = base_lazy_tsr.repeat(x1.size(0), 1, 1)

/opt/conda/lib/python3.6/site-packages/gpytorch/kernels/grid_interpolation_kernel.py in _inducing_forward(self, batch_dims, **params)
    146     def _inducing_forward(self, batch_dims, **params):
    147         return super(GridInterpolationKernel, self).forward(
--> 148             self.inducing_points, self.inducing_points, batch_dims=batch_dims, **params
    149         )
    150 

/opt/conda/lib/python3.6/site-packages/gpytorch/kernels/grid_kernel.py in forward(self, x1, x2, diag, batch_dims, **params)
     74                 first_item = grid[:, 0:1]
     75                 covar_columns = self.base_kernel(first_item, grid, diag=False, batch_dims=(0, 2), **params)
---> 76                 covar_columns = covar_columns.evaluate().squeeze(-2)
     77                 if batch_dims == (0, 2):
     78                     covars = [ToeplitzLazyTensor(covar_columns)]

/opt/conda/lib/python3.6/site-packages/gpytorch/lazy/lazy_evaluated_kernel_tensor.py in evaluate(self)
    167 
    168     def evaluate(self):
--> 169         return self.evaluate_kernel().evaluate()
    170 
    171     def exact_predictive_mean(self, full_mean, train_labels, num_train, likelihood, precomputed_cache=None):

/opt/conda/lib/python3.6/site-packages/gpytorch/lazy/lazy_tensor.py in evaluate(self)
    416             if batch_mode:
    417                 eye = eye.unsqueeze(0).expand(batch_size, num_rows, num_rows)
--> 418                 return self.transpose(1, 2).matmul(eye).transpose(1, 2).contiguous()
    419             else:
    420                 return self.t().matmul(eye).t().contiguous()

/opt/conda/lib/python3.6/site-packages/gpytorch/lazy/interpolated_lazy_tensor.py in matmul(self, tensor)
    531         # right_interp^T * tensor
    532         base_size = self.base_lazy_tensor.size(-1)
--> 533         right_interp_res = left_t_interp(self.right_interp_indices, self.right_interp_values, tensor, base_size)
    534 
    535         # base_lazy_tensor * right_interp^T * tensor

/opt/conda/lib/python3.6/site-packages/gpytorch/utils/interpolation.py in left_t_interp(interp_indices, interp_values, rhs, output_dim)
    237     column_indices = column_indices.repeat(batch_size, 1).view(1, -1)
    238 
--> 239     summing_matrix_indices = torch.cat([batch_indices, flat_interp_indices, column_indices])
    240     summing_matrix_values = torch.ones(
    241         batch_size * n_data * n_interp, dtype=interp_values.dtype, device=interp_values.device

RuntimeError: Expected object of scalar type Long but got scalar type Float for sequence elment 1 in sequence argument at position #1 'tensors'

I feel like I am not coercing something I should for the IndexKernel. I basically have two questions:

1) Could someone help me with this particular error, if it's clear what's going wrong?
2) Has anyone implemented a first-order autoregressive kernel in GPyTorch yet? I feel like that'd be an amazing addition to have if you guys have done it.
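(As a hedged aside on the AR1 point: on an evenly spaced time grid, an AR(1) covariance sigma^2 * rho^|t - t'| is just an exponential kernel exp(-|t - t'| / lengthscale) with lengthscale = -1 / log(rho), so something very close to it is already expressible with existing kernels. A minimal sketch, not a built-in AR1 implementation, using the time column index from the code above:)

import gpytorch

# Hedged sketch: a Matern-1/2 (exponential) kernel on the time column behaves
# like an AR(1) covariance with rho = exp(-1 / lengthscale).
ar1_like_time_kernel = gpytorch.kernels.ScaleKernel(
    gpytorch.kernels.MaternKernel(nu=0.5, active_dims=[3])
)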

Thanks!!
sumitsk commented 5 years ago

IndexKernel only accepts discrete indices since it is represented as a simple lookup table. You have set num_tasks=1 while defining it, so you should verify what index is propagating in the forward pass. Also, why do you want to use a GridInterpolationKernel wrapped around an IndexKernel? If you only have a pre-known discrete set of locations L, can't you work with just an IndexKernel and set num_tasks=L?
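(A hedged sketch of that suggestion, assuming the 20 simulated locations and the column order from the code above, with location in column 4. The .long() cast is the important part: a lookup-table kernel needs integer indices, which is also what the Float-vs-Long RuntimeError above is complaining about.)

import torch
import gpytorch

# Hedged sketch: one IndexKernel over the full set of discrete locations,
# evaluated directly on integer (Long) location indices.
location_kernel = gpytorch.kernels.IndexKernel(num_tasks=20)

loc_idx = train_x[:, 4].long().unsqueeze(-1)    # location column as n x 1 Long indices
loc_covar = location_kernel(loc_idx, loc_idx)   # lookup-table covariance between locations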

sadatnfs commented 5 years ago

Ah, I didn't know that I had to additionally define it in the forward pass. I had the interpolation because I just copied it from the other kernels... I'll remove that.

On that note, I'm super new to using PyTorch in general. Do you mind pointing me to exactly what I need to add to make the location kernel work? (I know you told me it's the forward pass, but I'm sure it'd be easier with your help rather than me blindly adding things!)

jacobrgardner commented 5 years ago

@sadatnfs What exactly is the output you'd like to get out of the location kernel? You mentioned they are iid, do you just want a learned diagonal matrix out of it?

sadatnfs commented 5 years ago

Thanks @jacobrgardner. (I think one of my biggest issues is that I'm thinking of this in econometric terms rather than statistical ones!) Yeah, I'd like to have a diagonal covariance matrix. I'm trying to model the kernel equivalent of a distribution like:

intercept_by_location ~ Normal(0, location_sigma^2)

and so the location intercepts will be fit like a classical mixed effects model (which is what I'm more familiar with), and that will allow me to condition out the location effect while having GPs by age and time.

Oh, also @sumitsk: I realized that the IndexKernel does not seem to take an active_dims argument. How would you (or @jacobrgardner) suggest that I implement this kind of 'random effect' in my model?

(BTW, I'd be happy to write up a mixed effects model example once I have figured out all my questions! I'm working on a large-scale model that a lot of people in my institute will be able to use, and I think it'd be super cool to have a mixed effects model example which others could also learn from!)

jacobrgardner commented 5 years ago

Sorry, two more short questions! Is the set of possible locations you will find in your data discrete, and would you like to have different noise values for each possible location, or a single location_sigma as in your equation above?

sadatnfs commented 5 years ago

Thanks @jacobrgardner ! Here's some answers (and one question!):

1) Yes the location indicators are discrete (ints let's say).

2) For now, I am just trying to fit a single noise component that the location intercepts are drawn from (that Normal distribution).

3) How would you suggest I add data variance to a general model? What I have in mind is to go into https://github.com/cornellius-gp/gpytorch/blob/master/gpytorch/likelihoods/gaussian_likelihood.py, copy the class out, and have the noise be a vector over data points which gets added to the covariance (unless self.noise can already take a vector of noise...)

jacobrgardner commented 5 years ago

Given that you are trying to add a single learned noise component that is independent of the actual location, I might just initialize this as a model parameter and add it in forward. Something like:

class GPRegressionModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        ...
        self.register_parameter(name="log_location_noise", parameter=torch.nn.Parameter(torch.zeros(1)))

    def forward(self, x):
        ...
        # add_diag adds a constant diagonal component, see GaussianLikelihood.
        covar_x = covar_x.add_diag(self.log_location_noise.exp())
        return MultivariateNormal(mean_x, covar_x)

If you want different noises for the different discrete locations, you can make the parameter a vector, and then do something like:

    location_noises = self.log_location_noise[x_locs].exp()
    covar_x = covar_x + gpytorch.lazy.DiagLazyTensor(location_noises)

Do you think this is a reasonable solution for your use case?
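(To make the vector version above concrete, a hedged, standalone illustration; it assumes 20 locations and that the location ids live in column 4 of the train_x built earlier, so x_locs is just the integer location column.)

import torch
import gpytorch

# Hedged illustration of per-location noise: a learned log-noise per location,
# gathered per data point and exposed as a lazy diagonal to add to covar_x.
log_location_noise = torch.nn.Parameter(torch.zeros(20))

x_locs = train_x[:, 4].long()                        # Long indices for the lookup
location_noises = log_location_noise[x_locs].exp()   # one positive noise per row
noise_diag = gpytorch.lazy.DiagLazyTensor(location_noises)  # to be added to covar_x in forward()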

sadatnfs commented 5 years ago

I guess the way to describe my problem would be: I'm trying to fit a hierarchical mixed effects model with GP errors, where the hierarchy component comes from the locations, such that we have all our global coefficients and can then further decompose the intercepts by country. Starting with that, I'll continue to build on other fun stuff, like coefficients by location, etc.

Balandat commented 5 years ago

Relevant also to #235

jacobrgardner commented 5 years ago

As one last addendum, if you want to do something more complicated like add location_noise to the covariance matrix entries wherever x1_loc == x2_loc, you should be able to accomplish that with a custom model parameter as well.

Basically, you should have pretty wide freedom to define the parameters you need and modify the covariance matrix you create directly in forward, which might be an easier solution than hacking into one of our internal kernels or likelihoods.
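(A hedged sketch of that last idea, under the same assumptions as above: locations in column 4 and a hypothetical scalar log_location_noise parameter. The "same location" indicator is built with broadcasting.)

import torch

# Hedged sketch: add a learned variance to every covariance entry whose two
# inputs share a location.  Builds a dense n x n indicator, so it is only
# sensible for moderate n.
log_location_noise = torch.nn.Parameter(torch.zeros(1))

loc = train_x[:, 4].long()
same_loc = (loc.unsqueeze(-1) == loc.unsqueeze(-2)).float()  # n x n, 1.0 where locations match
extra_covar = same_loc * log_location_noise.exp()            # would be added to covar_x inside forward()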

sadatnfs commented 5 years ago

Oh shoot sorry Jake, I completely missed your reply before posting my last reply. I will give your code a shot later on in the day, thank you so so much for the detailed help!

sadatnfs commented 5 years ago

@jacobrgardner ah, I think I might have misspecified my question. What I'm trying to estimate is a location-specific intercept, and those intercepts happen to be drawn from a normal distribution. With your implementation, it looks like I can't tie the location groups to the noise value, right? (Amusingly, your example told me exactly how to implement noise on the covariance for my other question!)

sadatnfs commented 5 years ago

(Apologies for all these qs!)

jacobrgardner commented 5 years ago

@sadatnfs No problem!

Are you sure a GP is actually the thing you want to be using for that specific component of the model? I'm not sure I fully understand the model you're trying to implement, so it's hard to give specific feedback.

Maybe a clarifying question would be: what would your model look like if location was your only input?

sadatnfs commented 5 years ago

Hah @jacobrgardner, funny that you mention that, because I was actually trying to code up the location part as a non-GP component right now (frantically going through the documentation).

Yes, if we were to think of this setup in the world of mixed effects, I have written this horrible C++ pseudocode to try to explain my model (I've been fitting large models in an R package called TMB, which uses C++ model templates to do fast MLEs, hence it's easier to explain using for loops).

Please let me know if something doesn't make sense here. Ideally what I would do is to layer the GPs on top of this shell of a model.


// Data
Y = an array of location by time [L by T]
X = an array of location by time by feature [L by T by F]
Yhat = prediction; location by time [L by T]

// Parameters to fit
beta = a vector of fixed effects
zeta = a vector of location intercepts
zeta_sigma = the variance across the locations
data_variance = the variance of data

// New placeholders
neg_log_likelihood = negative log likelihood function to be optimized

// Model

    // Add fixed effects to Yhat

    // Loop over locations
    for(int loc = 0; loc < Y.rows(); loc++) {

        // Loop over time periods
        for(int time = 0; time < Y.cols(); time++) {

            // Loop over different features in X
            for(int feature = 0; feature < X.dim(2); feature++) {

                Yhat(loc, time) += beta[feature] * X(loc, time, feature)

            }
        }
    }

    // Add the random effect component (intercept)

    // Loop over number of locations
    for(int loc = 0; loc < Y.rows(); loc++) {

        // Given that each row in Yhat represents a different location,
        // I add the location intercept for all time periods per row:

        Yhat.row(loc) += zeta[loc]

    }

    // Add zero mean prior of the random effect (Normal ~ (0., zeta_sigma))
    neg_log_likelihood -= ( loop over each of the zeta components and add that prior here)

    // Finally, I would link the data in my likelihood with a Gaussian likelihood
    neg_log_likelihood -= Yhat ~ Normal(Y, data_variance)
sadatnfs commented 5 years ago

One thing I was trying was to roll the fixed effects and the location intercept part into the forward pass like so (which would feed in as the mean), but I struck out on finding a way of applying the location parameter without a for loop (a vectorized sketch follows the code below):


class GPRegressionModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super(GPRegressionModel, self).__init__(train_x, train_y, likelihood)

        # SKI requires a grid size hyperparameter. This util can help with that
        grid_size = gpytorch.utils.grid.choose_grid_size(train_x)         

        self.register_parameter(name="b1", parameter=torch.nn.Parameter(torch.zeros(1)))
        self.register_parameter(name="b2", parameter=torch.nn.Parameter(torch.zeros(1)))

        self.register_parameter(name="loc_beta", parameter=torch.nn.Parameter(torch.zeros(20)))
        self.register_parameter(name="loc_sigma", parameter=torch.nn.Parameter(torch.zeros(1)) )

        # age
        covar_module_two = kern.GridInterpolationKernel(kern.ScaleKernel(kern.RBFKernel()), 
                                                   grid_size=grid_size, active_dims=[3], num_dims=1)

        # time
        covar_module_three = kern.GridInterpolationKernel(kern.ScaleKernel(kern.MaternKernel()), 
                                                   grid_size=grid_size, active_dims=[4], num_dims=1) 

        self.covar_module = covar_module_two + covar_module_three        

    def forward(self, x):
        mean_x = x[:,0]*self.b1 + x[:,1]*self.b2 + INSERT_LOC_INTERCEPT_HERE
        covar_x = self.covar_module(x)

        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
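(One way to avoid the loop, shown as a hedged standalone sketch: it assumes the column order ['x1', 'x2', 'L', 'A', 'T'], 20 locations, and the same parameter names as the model above, and simply indexes the parameter vector with the integer location column.)

import torch

# Hedged sketch of a vectorized location intercept: index the per-location
# parameter with the Long location column instead of looping over rows.
b1 = torch.nn.Parameter(torch.zeros(1))
b2 = torch.nn.Parameter(torch.zeros(1))
loc_beta = torch.nn.Parameter(torch.zeros(20))

x = train_x                                   # inside forward(self, x) this would just be x
loc_idx = x[:, 2].long()                      # location ids as Long indices
mean_x = x[:, 0] * b1 + x[:, 1] * b2 + loc_beta[loc_idx]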
sadatnfs commented 5 years ago

EDIT: @jacobrgardner I have almost everything here, except the part in the forward() function where I'd like to add the random effects' prior distribution, which I'm stuck on (one possible sketch follows the code below):


import math
import torch
import pandas as pd
import numpy as np
import gpytorch
import gpytorch.kernels as kern
from matplotlib import pyplot as plt
from math import exp

get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

def cartesian(*arrays):
    mesh = np.meshgrid(*arrays)  # standard numpy meshgrid
    dim = len(mesh)  # number of dimensions
    elements = mesh[0].size  # number of elements, any index will do
    flat = np.concatenate(mesh).ravel()  # flatten the whole meshgrid
    reshape = np.reshape(flat, (dim, elements)).T  # reshape and transpose
    return reshape

def prep_data(name_list, *arrays):
    cartprod = cartesian(*arrays)
    dout = pd.DataFrame(cartprod)
    dout.columns = name_list
    return dout

## Simulate data placeholders
L = [i for i in range(20)]
A = [i for i in range(5)]
T = [i for i in range(30)]
dtt = prep_data(['L', 'A', 'T'], L,A,T)

## Simulate random values
beta_x1 = 0.4
beta_x2 = 0.3
LL_noise = 0.25

dtt['x1'] = np.random.uniform(size = dtt.shape[0])
dtt['x2'] = np.random.normal(size = dtt.shape[0])
dtt['y'] = beta_x1*dtt['x1'] + beta_x2*dtt['x2'] + np.random.normal(scale=LL_noise, size = dtt.shape[0])

## Create train x and y tensors
train_x = torch.from_numpy(dtt[['x1', 'x2', 'L', 'A', 'T' ]].values.astype(np.double)).to(torch.float)
train_y = torch.from_numpy(dtt[['y']].values[:,0].astype(np.double)).to(torch.float)
test_x = torch.from_numpy(dtt[['x1', 'x2', 'L', 'A', 'T' ]].values.astype(np.double)).to(torch.float)

class GPRegressionModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super(GPRegressionModel, self).__init__(train_x, train_y, likelihood)

        # SKI requires a grid size hyperparameter. This util can help with that
        grid_size = gpytorch.utils.grid.choose_grid_size(train_x)

        self.register_parameter(name="alpha", parameter=torch.nn.Parameter(torch.zeros(1)))

        self.register_parameter(name="b1", parameter=torch.nn.Parameter(torch.zeros(1)))
        self.register_parameter(name="b2", parameter=torch.nn.Parameter(torch.zeros(1)))

        self.register_parameter(name="loc_sigma", parameter=torch.nn.Parameter(torch.ones(1)) )
        self.register_parameter(name="loc_beta", parameter=torch.nn.Parameter(torch.zeros(20)))

#         self.register_prior(name = 'loc_beta_prior', 
#                             prior = gpytorch.priors.NormalPrior(loc = 0., scale = 0.1),
#                             param_or_closure ='loc_beta') 

        # age
        covar_module_two = kern.GridInterpolationKernel(kern.ScaleKernel(kern.RBFKernel()), 
                                                   grid_size=grid_size, active_dims=[3], num_dims=1)

        # time
        covar_module_three = kern.GridInterpolationKernel(kern.ScaleKernel(kern.MaternKernel()), 
                                                   grid_size=grid_size, active_dims=[4], num_dims=1) 

        self.covar_module = covar_module_two + covar_module_three        

    def forward(self, x):

        ## x can be the full dimension of training data
        ## because we only use specific columns as subsetted

        mean_x = self.alpha + x[:,0]*self.b1 + x[:,1]*self.b2
        covar_x = self.covar_module(x)

        ## Convert location indices to int
        beta_train_df = x[:,2].type(torch.int)

        ## Add the random effects in a for loop corresponding to the integer location value
        ## I love a good hack ##

        ## Create a NxN eye matrix for location variance
        loc_covar_matx = torch.eye(mean_x.shape[0])    
        loc_mean = torch.zeros(mean_x.shape[0])

        for lox in range(mean_x.shape[0]):            
            ## Add location int prior to the loc tensors
            loc_covar_matx[lox, lox] = self.loc_sigma
            loc_mean[lox] = self.loc_beta[beta_train_df[lox]]

            ## Add location intercept to mean function
            mean_x[lox] = mean_x[lox] + loc_mean[lox]

        ## Add prior for the location random effect  
        #### IDEALLY: I'd like to add a prior on the self.loc_beta with self.loc_sigma as the hyperparameter to train on
        covar_x = covar_x.add_diag(self.loc_sigma)
        data_distro = gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

        return data_distro

likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = GPRegressionModel(train_x, train_y, likelihood)

# Find optimal model hyperparameters
model.train()
likelihood.train()

# Use the adam optimizer
optimizer = torch.optim.Adam([
    {'params': model.parameters()},  # Includes GaussianLikelihood parameters
], lr=0.05)

# "Loss" for GPs - the marginal log likelihood
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

def train():
    training_iterations = 52
    for i in range(training_iterations):
        optimizer.zero_grad()
        output = model(train_x)
        loss = -mll(output, train_y)
        loss.backward()
        if i % 3 == 0:
            print('Iter %d/%d - Loss: %.3f' % (i + 1, training_iterations, loss.item()))
        optimizer.step()

get_ipython().run_line_magic('time', 'train()')

## Pred

# Set model and likelihood into evaluation mode
model.eval()
likelihood.eval()

with torch.no_grad(), gpytorch.fast_pred_var():
    model(test_x)
    observed_pred = likelihood(model(test_x))
    pred_y_full = observed_pred.mean
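(For what it's worth, one way to get that Normal(0, loc_sigma) random-effect prior into training without touching the kernels is to fold its log density into the loss by hand. A hedged sketch follows; the helper name and the 1e-6 jitter are mine, it reuses the mll and model objects defined above, and the per-data-point scaling of the prior term is an arbitrary but common choice.)

import torch

def loss_with_location_prior(output, target, model):
    # Exact-GP marginal log likelihood term from gpytorch.
    mll_term = mll(output, target)
    # Hand-rolled random-effect prior: loc_beta ~ Normal(0, loc_sigma), with
    # loc_sigma kept positive via abs() plus a small jitter.
    loc_scale = model.loc_sigma.abs() + 1e-6
    prior_term = torch.distributions.Normal(0.0, loc_scale).log_prob(model.loc_beta).sum()
    # Scale the prior term per data point so it stays comparable to the mll.
    return -(mll_term + prior_term / target.numel())

# In the training loop, `loss = -mll(output, train_y)` would then become:
# loss = loss_with_location_prior(output, train_y, model)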
jacobrgardner commented 5 years ago

@sadatnfs I'll take a look once I'm back from traveling and see if I can help out!

sadatnfs commented 5 years ago

I've actually been able to run all the way through prediction, so the VERY last thing I want to implement here is adding the distribution of the random effects. I'll edit my post above. Thanks again!!