qiboteam / qibo

A full-stack framework for quantum computing.
https://qibo.science
Apache License 2.0

`PyTorchBackend` backpropagation compatibility with `torch.nn.Module` and `torch.optim` #1449

Closed: BrunoLiegiBastonLiegi closed this issue 1 month ago

BrunoLiegiBastonLiegi commented 1 month ago

I am trying to train a VQC through the `PyTorchBackend` using the PyTorch API. Here is a small test example.

import torch
import numpy as np
from qibo.backends import PyTorchBackend
from qibo import Circuit, gates
from qibo import hamiltonians
from qibo.symbols import Z

BACKEND = PyTorchBackend()

class QuantumCircuit(torch.nn.Module):

    def __init__(self, encoding: Circuit, circuit: Circuit):
        super().__init__()
        self.encoding = encoding
        self.circuit = circuit
        self.circuit_parameters = torch.nn.Parameter(BACKEND.cast(circuit.get_parameters(), dtype=torch.float64))
        self.circuit.set_parameters(self.circuit_parameters)
        self.observable = hamiltonians.SymbolicHamiltonian(
            sum([Z(int(i)) for i in range(circuit.nqubits)]),
            nqubits=circuit.nqubits,
            backend=BACKEND,
        )
        # self.linear = torch.nn.Linear(2 ** circuit.nqubits, 1).double()

    def forward(self, x):
        # encode the input features as rotation angles
        self.encoding.set_parameters(x * BACKEND.np.pi)
        circuit = self.encoding + self.circuit
        state = BACKEND.execute_circuit(circuit).state()
        return self.observable.expectation(state)
        #return self.linear(BACKEND.execute_circuit(circuit).probabilities())

if __name__ == "__main__":

    nqubits = 3
    encoding = Circuit(nqubits)
    for i in range(nqubits):
        encoding.add(gates.H(i))
    for i in range(nqubits):
        encoding.add(gates.RZ(i, theta=0.))
    circuit = Circuit(nqubits)
    for q in range(nqubits):
        circuit.add(gates.RY(q, theta=torch.randn(1) * BACKEND.np.pi))
        circuit.add(gates.RZ(q, theta=torch.randn(1) * BACKEND.np.pi))
    for q in range(nqubits - 2):
        circuit.add(gates.CNOT(q0=q, q1=q + 1))
    circuit.add(gates.CNOT(q0=nqubits-1, q1=0))
    circuit.add(gates.M(*range(nqubits)))

    model = QuantumCircuit(encoding, circuit)
    data = torch.randn(100, 3)
    target = (data[:, 0] * BACKEND.np.pi) ** 2
    loss_f = lambda x, y: (x - y)**2
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
    for x, y in zip(data, target):
        optimizer.zero_grad()
        out = model(x)
        loss = loss_f(out, y)
        loss.backward()
        optimizer.step()
        print(f"Loss: {loss}")
        print(f"Parameters: {list(model.parameters())}")

You first wrap the circuit inside a `torch.nn.Module`, register its parameters, and then, in the `forward`, execute the circuit and compute the desired output, for instance an expectation value. Everything is trained with a `torch.optim` optimizer. Now, if you run the training, it is easy to see that the parameters never get updated. If you switch from the expectation value to the probabilities, for instance by adding a linear layer that maps to the correct shape (2**nqubits --> 1), you can also see that the parameters of the linear layer get updated, but not those of the circuit.
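A quick way to see this (a diagnostic sketch, reusing the model, data, target and loss_f defined above) is to inspect each parameter's .grad right after backward(): a leaf that was cut off from the autograd graph keeps grad = None.

# diagnostic sketch: check which parameters actually receive a gradient
out = model(data[0])
loss = loss_f(out, target[0])
loss.backward()
for name, param in model.named_parameters():
    # `None` here means the graph was cut upstream of this leaf
    print(name, "grad:", param.grad)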

Therefore, I would say that there is some operation, either in setting/getting the parameters of the circuit or in the execution + post-processing (expectation value or probabilities calculation), that breaks the tracing of the tensors. @Simone-Bordoni in the tests, did we actually try to run a gradient-based optimization anywhere?
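For illustration, and purely as a guess at the mechanism (this is plain torch, not qibo code): any re-wrapping of a parameter, e.g. a cast through item(), numpy or torch.tensor, silently severs the graph.

import torch

theta = torch.nn.Parameter(torch.tensor(0.3, dtype=torch.float64))

# as long as every operation stays on the tensor, the graph survives
out = torch.cos(theta)
out.backward()
print(theta.grad)  # -sin(0.3)

# re-wrapping the value produces a fresh, disconnected tensor
detached = torch.tensor(theta.item(), dtype=torch.float64)
out = torch.cos(detached)
print(out.requires_grad)  # False: the path back to theta is gone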

Simone-Bordoni commented 1 month ago

Last week I found out that there is a problem in the backpropagation due to circuit execution. I am currently working on it.

BrunoLiegiBastonLiegi commented 1 month ago

Ok @Simone-Bordoni, by using #1450 and defining two custom functions to get and set the parameters of a circuit:

def get_parameters(circuit):
    # yield the raw parameter tensor of every parametrized gate, in order
    return (param for gate in circuit.parametrized_gates for param in gate.parameters)

def set_parameters(circuit, params):
    # write the tensors in `params` back into the gates, in the same order
    index = 0
    for gate in circuit.parametrized_gates:
        nparams = len(gate.parameters)
        gate.parameters = params[index:index + nparams]
        index += nparams
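These helpers are meant to be combined with torch.nn.Parameter, pushing the wrapped leaf tensors back into the gates so that circuit execution operates directly on them (essentially the pattern used in QuantumCircuit.__init__ in the full script below):

# wrap every circuit parameter as a trainable leaf tensor and
# write the wrapped tensors back into the gates
params = [torch.nn.Parameter(p) for p in get_parameters(circuit)]
set_parameters(circuit, params)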

I was able to make this run and I finally see the parameters changing. However, the loss doesn't really decrease; the problem seems to be much harder than I expected. Using 1000 data points and a classical linear FFNN with ReLU activations (~5000 parameters in total) I get an average MSE loss of ~1, and the fit looks like this:

[image: fit obtained with the classical FFNN baseline]

Thus I would probably need a much bigger circuit to have any hope of fitting this. Most likely I'll make the problem simpler instead...

BrunoLiegiBastonLiegi commented 1 month ago

Ok, I am now testing a simpler problem: I generate the labels by running the model with manually chosen parameters, and then try to recover them through training. It seems to be working, even though ~3000 data points are needed:

Before Training --> avg loss: 0.74

[image: targets vs. model predictions before training]

After Training --> avg loss: 0.004

[image: targets vs. model predictions after training]

This is the script, in case someone wants to try it out:

import torch
import numpy as np
import matplotlib.pyplot as plt
from qibo.backends import PyTorchBackend
from qibo import Circuit, gates
from qibo import hamiltonians
from qibo.symbols import Z

BACKEND = PyTorchBackend()

def observable(nqubits, backend):
    return hamiltonians.SymbolicHamiltonian(
        sum([Z(int(i)) for i in range(nqubits)]),
        nqubits=nqubits,
        backend=backend,
    )

def get_parameters(circuit):
    # yield the raw parameter tensor of every parametrized gate, in order
    return (param for gate in circuit.parametrized_gates for param in gate.parameters)

def set_parameters(circuit, params):
    # write the tensors in `params` back into the gates, in the same order
    index = 0
    for gate in circuit.parametrized_gates:
        nparams = len(gate.parameters)
        gate.parameters = params[index:index + nparams]
        index += nparams

def eval_model(model, data, target, loss_f):
    with torch.no_grad():
        loss = 0.
        for x, y in zip(data, target):
            out = model(x)
            loss += loss_f(out.ravel(), y.ravel())
        print(f"Avg loss: {loss/data.shape[0]}")
        plt.scatter(data[:, 0], target, alpha=0.5)
        plt.scatter(data[:, 0], [model(x) for x in data], alpha=0.5)
        plt.show()

class QuantumCircuit(torch.nn.Module):

    def __init__(self, encoding: Circuit, circuit: Circuit):
        super().__init__()
        self.encoding = encoding
        self.circuit = circuit
        for i, param in enumerate(get_parameters(circuit)):
            setattr(self, f"param_{i}", torch.nn.Parameter(param))
        set_parameters(self.circuit, list(self.parameters()))
        self.observable = observable(circuit.nqubits, BACKEND)
        #self.linear = torch.nn.Linear(2 ** circuit.nqubits, 1).double()

    def forward(self, x):
        self.encoding.set_parameters(x * BACKEND.np.pi)
        circuit = self.encoding + self.circuit
        state = BACKEND.execute_circuit(circuit).state()
        return self.observable.expectation(state)
        #return self.linear(BACKEND.execute_circuit(circuit).probabilities())

if __name__ == "__main__":

    nqubits = 3
    encoding = Circuit(nqubits)
    for i in range(nqubits):
        encoding.add(gates.H(i))
    for i in range(nqubits):
        encoding.add(gates.RZ(i, theta=torch.tensor(0.).double()))
    circuit = Circuit(nqubits)
    for q in range(nqubits):
        circuit.add(gates.RY(q, theta=torch.randn(1) * BACKEND.np.pi))
        circuit.add(gates.RZ(q, theta=torch.randn(1) * BACKEND.np.pi))
        circuit.add(gates.U3(q, theta=torch.randn(1), phi=torch.randn(1), lam=torch.randn(1)))
    for q in range(nqubits - 2):
        circuit.add(gates.CNOT(q0=q, q1=q + 1))
    circuit.add(gates.CNOT(q0=nqubits-1, q1=0))
    circuit.add(gates.M(*range(nqubits)))

    model = QuantumCircuit(encoding, circuit)
    data = torch.randn(3000, 3).double()
    # back up the initial parameters
    initial_params = {k: v.clone() for k,v in model.state_dict().items()}
    # generate the labels by using a specific set of parameters
    true_params = torch.randn(len(list(model.parameters()))).double()
    true_params = {k: val.view(-1) for k, val in zip(model.state_dict().keys(), true_params)}
    # set the true parameters
    model.load_state_dict(true_params)
    # generate the targets
    with torch.no_grad():
        target = [model(x) for x in data]
    # restore the old parameters
    model.load_state_dict(initial_params)
    loss_f = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters())

    eval_model(model, data, target, loss_f)

    cum_loss = 0.
    for i, (x, y) in enumerate(zip(data, target)):
        optimizer.zero_grad()
        out = model(x)
        loss = loss_f(out.ravel(), y.ravel())
        loss.backward()
        optimizer.step()
        cum_loss += loss.item()  # .item() detaches the loss, avoiding graph buildup
        if i % 50 == 0 and i != 0:
            print(f"Loss: {cum_loss / 50}")
            cum_loss = 0.

    eval_model(model, data, target, loss_f)
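Since the targets were generated from a known parameter set, a natural extra check (hypothetical, reusing true_params from the script above) is to compare the trained parameters against the generating ones, keeping in mind that rotation angles are only identified up to their periodicity:

# hypothetical sanity check, to run after training
for name, value in model.state_dict().items():
    print(f"{name}: trained={float(value):.4f}  true={float(true_params[name]):.4f}")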