mit-han-lab / dlg

[NeurIPS 2019] Deep Leakage From Gradients
https://dlg.mit.edu/
MIT License

DLG works for non-twice-differentiable functions #10

Open zzc-1024 opened 10 months ago

zzc-1024 commented 10 months ago

Hi. I tried to use DLG to recover the data from a non-twice-differentiable function, and the algorithm successfully recovered the data. Here is the code:

import torch
import torch.nn as nn
import torch.optim as optim

torch.manual_seed(12345)

# Simple MLP with one ReLU activation (ReLU is not twice differentiable).
class predictor(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(predictor, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        return x

if __name__ == '__main__':
    # Ground-truth data and label that the attack tries to recover.
    ipt = torch.randn((1, 14)).requires_grad_(True)
    lbl = torch.randn((1, 1)).requires_grad_(True)
    model = predictor(input_size=14, hidden_size=32, output_size=1)
    criterion = nn.MSELoss()

    # "Leaked" gradients of the loss w.r.t. the model parameters.
    opt = model(ipt)
    loss = criterion(opt, lbl)
    print(loss)
    dy_dx = torch.autograd.grad(loss, model.parameters())
    original_dy_dx = list((_.detach().clone() for _ in dy_dx))
    print(dy_dx)

    # Leftover from the original DLG demo; unused in the reconstruction below.
    cal_loss = dy_dx[-1].detach().clone()[0]
    cal_loss.requires_grad_(True)
    print(cal_loss)

    # Randomly initialized dummy data and label to be optimized.
    dummy_data = torch.randn(ipt.size()).requires_grad_(True)
    dummy_label = torch.randn(lbl.size()).requires_grad_(True)
    optimizer = optim.LBFGS([dummy_data, dummy_label], lr=0.1)

    for iters in range(1500):
        def closure():
            optimizer.zero_grad()
            # Gradients produced by the dummy pair, kept differentiable
            # w.r.t. dummy_data and dummy_label via create_graph=True.
            dummy_pred = model(dummy_data)
            dummy_loss = criterion(dummy_pred, dummy_label)
            dummy_dy_dx = torch.autograd.grad(dummy_loss, model.parameters(), create_graph=True)
            # Gradient-matching objective: squared distance to the leaked gradients.
            grad_diff = 0
            for i in range(len(dummy_dy_dx)):
                grad_diff += ((dummy_dy_dx[i] - original_dy_dx[i]) ** 2).sum()
            grad_diff.backward()
            return grad_diff
        optimizer.step(closure)
        if iters % 10 == 0:
            # Re-evaluate the closure to report the current matching loss.
            current_loss = closure()
            print(iters, "%.4f" % current_loss.item())

    # Compare the recovered tensors with the originals.
    print(ipt)
    print(dummy_data)
    print(lbl)
    print(dummy_label)
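To quantify the recovery beyond eyeballing the printed tensors, a reconstruction-error check could be appended after the final prints (a minimal sketch of my own, reusing the variables from the script above):

    # Hypothetical addition: mean squared error of the recovered tensors.
    with torch.no_grad():
        data_mse = ((dummy_data - ipt) ** 2).mean().item()
        label_mse = ((dummy_label - lbl) ** 2).mean().item()
    print("data MSE: %.6f  label MSE: %.6f" % (data_mse, label_mse))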

The result is as follows: [screenshot of the output] The data was almost fully recovered. The model code is:

class predictor(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(predictor, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
    def forward(self, x): 
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        return x
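Since each experiment below only swaps the model class while the attack loop stays the same, the reconstruction can also be factored into a small helper (a sketch with a hypothetical name run_dlg, assuming the imports from the script above):

def run_dlg(model, ipt, lbl, iters=1500, lr=0.1):
    criterion = nn.MSELoss()
    # Leaked gradients from the ground-truth pair.
    target = [g.detach() for g in
              torch.autograd.grad(criterion(model(ipt), lbl), list(model.parameters()))]
    # Dummy pair to be optimized by gradient matching.
    dummy_data = torch.randn_like(ipt, requires_grad=True)
    dummy_label = torch.randn_like(lbl, requires_grad=True)
    optimizer = optim.LBFGS([dummy_data, dummy_label], lr=lr)
    for _ in range(iters):
        def closure():
            optimizer.zero_grad()
            grads = torch.autograd.grad(criterion(model(dummy_data), dummy_label),
                                        list(model.parameters()), create_graph=True)
            diff = sum(((g - t) ** 2).sum() for g, t in zip(grads, target))
            diff.backward()
            return diff
        optimizer.step(closure)
    return dummy_data.detach(), dummy_label.detach()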

I tested a model with two ReLU layers:

class predictor(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(predictor, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size, output_size)
    def forward(self, x): 
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x

Here is the result: [screenshot of the output] The result was only slightly worse. The paper replaces the ReLU activation with the sigmoid function and gets a good result, so I tried the sigmoid function to see whether it improves the result. Here is the code:

class predictor(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(predictor, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.sigmoid1 = nn.Sigmoid()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.sigmoid2 = nn.Sigmoid()
        self.fc3 = nn.Linear(hidden_size, output_size)
    def forward(self, x): 
        x = self.fc1(x)
        x = self.sigmoid1(x)
        x = self.fc2(x)
        x = self.sigmoid2(x)
        x = self.fc3(x)
        return x

and here is the result: [screenshot of the output] This time the result got worse. So I don't think a non-twice-differentiable function is what leads to a worse result. When the DLG algorithm is optimizing, it is not optimizing the weights; it is optimizing dummy_data and dummy_label. So the second-order derivative involved is d(dL/dW)/d(dummy_data) and d(dL/dW)/d(dummy_label), not d(dL/dW)/dW. Looking forward to your reply. :-)
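P.S. As a quick sanity check of that argument, here is a minimal standalone sketch (my own addition, not from the repo) showing that the gradient-matching loss remains differentiable with respect to dummy_data and dummy_label even with ReLU, i.e. the mixed derivatives d(dL/dW)/d(dummy_data) and d(dL/dW)/d(dummy_label) are generally non-zero although ReLU'' is zero almost everywhere:

import torch
import torch.nn as nn

torch.manual_seed(0)

# ReLU MLP similar to my first model (hypothetical standalone example).
model = nn.Sequential(nn.Linear(14, 32), nn.ReLU(), nn.Linear(32, 1))
criterion = nn.MSELoss()

# Leaked gradients from the ground-truth data and label.
ipt, lbl = torch.randn(1, 14), torch.randn(1, 1)
target_grads = torch.autograd.grad(criterion(model(ipt), lbl), list(model.parameters()))

# Dummy pair; create_graph=True keeps the gradients differentiable w.r.t. them.
dummy_data = torch.randn(1, 14, requires_grad=True)
dummy_label = torch.randn(1, 1, requires_grad=True)
dummy_grads = torch.autograd.grad(criterion(model(dummy_data), dummy_label),
                                  list(model.parameters()), create_graph=True)
grad_diff = sum(((dg - tg) ** 2).sum() for dg, tg in zip(dummy_grads, target_grads))

# The derivatives of grad_diff w.r.t. the dummy variables exist and are
# typically non-zero, so LBFGS has a signal to follow despite ReLU.
g_data, g_label = torch.autograd.grad(grad_diff, [dummy_data, dummy_label])
print(g_data.abs().sum().item(), g_label.abs().sum().item())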