cvxgrp / cvxpylayers

Differentiable convex optimization layers
Apache License 2.0

Should the gradient be inverted? #147

Open · rogerwwww opened this issue 1 year ago

rogerwwww commented 1 year ago

Thanks a lot for such wonderful papers and this user-friendly library!

I am wondering if there is something wrong with the gradient. I am working with cvxpy==1.3.1, cvxpylayers==0.1.6.

Here is an example to reproduce my failure case:

import cvxpy as cp
from cvxpylayers.torch import CvxpyLayer
import torch

# We try to encode the constraint Ax <= b with a CVXPY layer
A = torch.tensor(
    [[1, 1, 0, 0],
     [0, 0, 1, 1]], dtype=torch.float32
)
b = torch.tensor([1, 1], dtype=torch.float32)
s = torch.tensor([0.1, 0.9, 0.4, 0.5], dtype=torch.float32)  # s could be the output of a neural network
s = s.requires_grad_(True) # optimize over s
opt = torch.optim.SGD([s], lr=.1, momentum=0.) # use SGD optimizer

x_gt = torch.tensor(
    [1, 0, 1, 0], dtype=torch.float32
) # after SGD steps, the output is expected to approach x_gt

def get_opt_layer(num_var, num_constr, tau):
    """
    Get a CVXPY differentiable optimization layer with entropic regularization
    """
    varX = cp.Variable(num_var)
    paramW = cp.Parameter(num_var)
    constrA = cp.Parameter((num_constr, num_var))
    constrb = cp.Parameter(num_constr)
    obj = cp.Maximize(cp.sum(cp.multiply(varX, paramW) + tau * cp.entr(varX)))
    cons = [constrA @ varX <= constrb, varX >= 0, varX <= 1]
    prob = cp.Problem(obj, cons)
    opt_layer = CvxpyLayer(prob, parameters=[paramW, constrA, constrb], variables=[varX])
    return opt_layer

cvxpylayer = get_opt_layer(4, 2, 0.1)

# Test gradient-based optimization, but the loss gradually grows
niters = 1001
with torch.autograd.set_detect_anomaly(True):
    for i in range(niters):
        x, = cvxpylayer(s, A, b)
        cv = torch.matmul(A, x.t()).t() - b.unsqueeze(0)  # constraint violation Ax - b (computed but not used below)
        loss = ((x - x_gt) ** 2).sum()
        loss.backward()
        opt.step()
        opt.zero_grad()
        if i % 100 == 0:
            print(f'{i}/{niters}\n'
                  f'  loss={loss},\n'
                  f'  x={x}')

With the code above, the loss grows from 3.068 to 3.968 after 1000 gradient steps.
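
As a sanity check on the sign, here is a minimal sketch that compares the gradient reported through the layer with a central finite-difference estimate of the same loss. It reuses cvxpylayer, A, b and x_gt from above; the helper loss_at, the fresh point s_check and the step size eps are arbitrary choices of mine, and eps is only a rough value given solver tolerances.

def loss_at(s_val):
    # Solve the layer at s_val and return the squared distance to x_gt
    x, = cvxpylayer(s_val, A, b)
    return ((x - x_gt) ** 2).sum()

s_check = torch.tensor([0.1, 0.9, 0.4, 0.5], dtype=torch.float32, requires_grad=True)
loss_at(s_check).backward()
analytic = s_check.grad.clone()  # gradient reported through the CvxpyLayer

eps = 1e-3
fd = torch.zeros(4)
s0 = s_check.detach()
for j in range(4):
    e = torch.zeros(4)
    e[j] = eps
    # central finite difference along coordinate j
    fd[j] = (loss_at(s0 + e) - loss_at(s0 - e)) / (2 * eps)

print('layer gradient     :', analytic)
print('finite differences :', fd)

If the two vectors disagree in sign, that would support the suspicion that the gradient is flipped; if they agree, the growing loss in the loop above would have to come from something else (step size, solver noise, etc.).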

However, if we invert the gradient by negating the loss:

# Interestingly, if we invert the gradient, the loss decreases
s2 = torch.tensor([0.2, 0.6, 0.4, 0.5], dtype=torch.float32)  # s2 could be the output of a neural network
s2 = s2.requires_grad_(True) # optimize over s2
opt = torch.optim.SGD([s2], lr=.1, momentum=0.)
with torch.autograd.set_detect_anomaly(True):
    for i in range(niters):
        x, = cvxpylayer(s2, A, b)
        cv = torch.matmul(A, x.t()).t() - b.unsqueeze(0)  # constraint violation (unused, as above)
        loss = ((x - x_gt) ** 2).sum()
        (-loss).backward() # gradient inverted here!
        opt.step()
        opt.zero_grad()
        if i % 100 == 0:
            print(f'{i}/{niters}\n'
                  f'  loss={loss},\n'
                  f'  x={x}')

The loss decreases from 2.997 to 0.005 after 1000 gradient steps.
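
To narrow it down, another check (a sketch under the assumption that A and b really are fixed in this experiment; get_opt_layer_fixed, layer_fixed and s3 are names I made up) is to bake A and b into the problem as constants, so that paramW is the only parameter being differentiated through, and compare the sign of the resulting gradient. It reuses the imports and tensors from the first snippet.

def get_opt_layer_fixed(A_np, b_np, tau):
    """
    Same problem as get_opt_layer, but with A and b fixed as constants;
    only paramW remains a CVXPY parameter.
    """
    num_var = A_np.shape[1]
    varX = cp.Variable(num_var)
    paramW = cp.Parameter(num_var)
    obj = cp.Maximize(cp.sum(cp.multiply(varX, paramW) + tau * cp.entr(varX)))
    cons = [A_np @ varX <= b_np, varX >= 0, varX <= 1]
    prob = cp.Problem(obj, cons)
    return CvxpyLayer(prob, parameters=[paramW], variables=[varX])

layer_fixed = get_opt_layer_fixed(A.numpy(), b.numpy(), 0.1)
s3 = torch.tensor([0.1, 0.9, 0.4, 0.5], dtype=torch.float32, requires_grad=True)
x, = layer_fixed(s3)
((x - x_gt) ** 2).sum().backward()
print(s3.grad)  # compare the sign of this gradient with s.grad from the three-parameter layer

If the single-parameter version gives the opposite sign from the three-parameter one, the extra constraint parameters would be implicated; if both agree, the question is about the gradient of the solution map itself.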

Any clues? Thanks!

rogerwwww commented 1 year ago

Update: I also observed some jitter when plotting the curves in https://github.com/cvxgrp/cvxpylayers/blob/master/examples/torch/tutorial.ipynb

Here are the plots I got:

[two plot images attached to the comment]