silvandeleemput / memcnn

PyTorch Framework for Developing Memory Efficient Deep Invertible Networks
MIT License

Make compatible with PyTorch 2 #75

Open silvandeleemput opened 1 year ago

silvandeleemput commented 1 year ago

Description

There have been some major changes in PyTorch 2.0.x that break some of the reversible operations. I should investigate how to fix these.
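
For context, a minimal usage sketch of the kind of reversible operation involved, based on memcnn's documented `AdditiveCoupling` and `InvertibleModuleWrapper` API; the revnet models in the failing test below are built from blocks of this kind, and the failure occurs in their backward pass during training. The `ConvBlock` module, the shapes, and the `keep_input=False` setting are illustrative assumptions, not the CI test code itself.

```python
# Illustrative sketch (not the failing CI test): an additive coupling wrapped
# for memory-efficient training. Names and sizes here are assumptions.
import torch
import torch.nn as nn
import memcnn


class ConvBlock(nn.Module):
    """Small conv block used for both halves of the coupling."""
    def __init__(self, channels):
        super().__init__()
        self.seq = nn.Sequential(
            nn.Conv2d(channels, channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(channels),
            nn.ReLU(),
        )

    def forward(self, x):
        return self.seq(x)


# The additive coupling splits the 32 input channels in half, so Fm/Gm see 16.
coupling = memcnn.AdditiveCoupling(Fm=ConvBlock(16), Gm=ConvBlock(16))
wrapper = memcnn.InvertibleModuleWrapper(fn=coupling, keep_input=False,
                                         keep_input_inverse=False)

x = nn.Conv2d(3, 32, kernel_size=1)(torch.randn(2, 3, 8, 8))  # non-leaf input
y = wrapper(x)          # with keep_input=False the input activation is freed
y.mean().backward()     # the input is reconstructed via the inverse on backward
```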

What I Did

Ran the CI tests for PyTorch 2.0.x; the revnet38 CPU training test fails during the backward pass.
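
The failing case can also be selected directly instead of running the whole tox matrix; a sketch is shown below. The `-k` expression is an assumption about the generated test ids (something like `test_train_networks[False-revnet38]`), so adjust it if the ids differ.

```python
# Sketch: run only the failing test from memcnn/trainers/tests/test_train.py.
import pytest

pytest.main([
    "memcnn/trainers/tests/test_train.py",
    "-k", "revnet38",   # the only network enabled by default without FULL_NETWORK_TESTS
    "-x",               # stop at the first failure
])
```

Either way, the run ends with the failure below.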

```
tmp_path = PosixPath('/tmp/pytest-of-circleci/pytest-0/test_train_networks_False_revn0')
network = 'revnet38', use_cuda = False

    @pytest.mark.parametrize("network", [
        pytest.param(network,
                     marks=pytest.mark.skipif(
                         condition=("FULL_NETWORK_TESTS" not in os.environ) and ("revnet38" != network),
                         reason="Too memory intensive for CI so these tests are disabled by default. "
                                "Set FULL_NETWORK_TESTS environment variable to enable the tests.")
                     )
        for network in ["resnet32", "resnet110", "resnet164", "revnet38", "revnet110", "revnet164"]
    ])
    @pytest.mark.parametrize("use_cuda", [
        False,
        pytest.param(True, marks=pytest.mark.skipif(condition=not torch.cuda.is_available(), reason="No GPU available"))
    ])
    def test_train_networks(tmp_path, network, use_cuda):
        exptags = ["cifar10", network, "epoch5"]
        exp_file = str(Path(__file__).parent / "resources" / "experiments.json")
        data_dir = str(tmp_path / "tmpdata")
        results_dir = str(tmp_path / "resdir")
        os.makedirs(data_dir)
        os.makedirs(results_dir)
>       run_experiment(experiment_tags=exptags, data_dir=data_dir, results_dir=results_dir,
                       start_fresh=True, use_cuda=use_cuda, workers=None, experiments_file=exp_file,
                       disp_iter=1,
                       save_iter=5,
                       valid_iter=5,)

memcnn/trainers/tests/test_train.py:89: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
memcnn/train.py:48: in run_experiment
    trainer(manager, start_iter=last_iter, use_cuda=use_cuda, *args, **trainer_params)
memcnn/trainers/classification.py:111: in train
    loss.backward()
.tox/py38-torchlatest/lib/python3.8/site-packages/torch/_tensor.py:487: in backward
    torch.autograd.backward(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

tensors = (tensor(2.3590, grad_fn=<MulBackward0>),), grad_tensors = None
retain_graph = False, create_graph = False, grad_variables = None, inputs = ()

    def backward(
        tensors: _TensorOrTensors,
        grad_tensors: Optional[_TensorOrTensors] = None,
        retain_graph: Optional[bool] = None,
        create_graph: bool = False,
        grad_variables: Optional[_TensorOrTensors] = None,
        inputs: Optional[_TensorOrTensors] = None,
    ) -> None:
        r"""Computes the sum of gradients of given tensors with respect to graph
        leaves.

        The graph is differentiated using the chain rule. If any of ``tensors``
        are non-scalar (i.e. their data has more than one element) and require
        gradient, then the Jacobian-vector product would be computed, in this
        case the function additionally requires specifying ``grad_tensors``.
        It should be a sequence of matching length, that contains the "vector"
        in the Jacobian-vector product, usually the gradient of the differentiated
        function w.r.t. corresponding tensors (``None`` is an acceptable value for
        all tensors that don't need gradient tensors).

        This function accumulates gradients in the leaves - you might need to zero
        ``.grad`` attributes or set them to ``None`` before calling it.
        See :ref:`Default gradient layouts<default-grad-layouts>`
        for details on the memory layout of accumulated gradients.

        .. note::
            Using this method with ``create_graph=True`` will create a reference cycle
            between the parameter and its gradient which can cause a memory leak.
            We recommend using ``autograd.grad`` when creating the graph to avoid this.
            If you have to use this function, make sure to reset the ``.grad`` fields of your
            parameters to ``None`` after use to break the cycle and avoid the leak.

        .. note::

            If you run any forward ops, create ``grad_tensors``, and/or call ``backward``
            in a user-specified CUDA stream context, see
            :ref:`Stream semantics of backward passes<bwd-cuda-stream-semantics>`.

        .. note::

            When ``inputs`` are provided and a given input is not a leaf,
            the current implementation will call its grad_fn (even though it is not strictly needed to get this gradients).
            It is an implementation detail on which the user should not rely.
            See https://github.com/pytorch/pytorch/pull/60521#issuecomment-867061780 for more details.

        Args:
            tensors (Sequence[Tensor] or Tensor): Tensors of which the derivative will be
                computed.
            grad_tensors (Sequence[Tensor or None] or Tensor, optional): The "vector" in
                the Jacobian-vector product, usually gradients w.r.t. each element of
                corresponding tensors. None values can be specified for scalar Tensors or
                ones that don't require grad. If a None value would be acceptable for all
                grad_tensors, then this argument is optional.
            retain_graph (bool, optional): If ``False``, the graph used to compute the grad
                will be freed. Note that in nearly all cases setting this option to ``True``
                is not needed and often can be worked around in a much more efficient
                way. Defaults to the value of ``create_graph``.
            create_graph (bool, optional): If ``True``, graph of the derivative will
                be constructed, allowing to compute higher order derivative products.
                Defaults to ``False``.
            inputs (Sequence[Tensor] or Tensor, optional): Inputs w.r.t. which the gradient
                be will accumulated into ``.grad``. All other Tensors will be ignored. If
                not provided, the gradient is accumulated into all the leaf Tensors that
                were used to compute the attr::tensors.
        """
        if torch._C._are_functorch_transforms_active():
            raise RuntimeError(
                "backward() called inside a functorch transform. This is not "
                "supported, please use functorch.grad or functorch.vjp instead "
                "or call backward() outside of functorch transforms.")

        if grad_variables is not None:
            warnings.warn("'grad_variables' is deprecated. Use 'grad_tensors' instead.")
            if grad_tensors is None:
                grad_tensors = grad_variables
            else:
                raise RuntimeError("'grad_tensors' and 'grad_variables' (deprecated) "
                                   "arguments both passed to backward(). Please only "
                                   "use 'grad_tensors'.")
        if inputs is not None and len(inputs) == 0:
            raise RuntimeError("'inputs' argument to backward() cannot be empty.")

        tensors = (tensors,) if isinstance(tensors, torch.Tensor) else tuple(tensors)
        inputs = (inputs,) if isinstance(inputs, torch.Tensor) else \
            tuple(inputs) if inputs is not None else tuple()

        grad_tensors_ = _tensor_or_tensors_to_tuple(grad_tensors, len(tensors))
        grad_tensors_ = _make_grads(tensors, grad_tensors_, is_grads_batched=False)
        if retain_graph is None:
            retain_graph = create_graph

        # The reason we repeat same the comment below is that
        # some Python versions print out the first line of a multi-line function
        # calls in the traceback and some print out the last line
>       Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
            tensors, grad_tensors_, retain_graph, create_graph, inputs,
            allow_unreachable=True, accumulate_grad=True)  # Calls into the C++ engine to run the backward pass
E       RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [100, 32, 32, 32]], which is output 0 of ReluBackward0, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
```
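
The error is PyTorch's saved-tensor version check: a tensor saved for backward (here the output of an inplace-friendly ReLU) was modified in place before `backward()` ran. A standalone sketch of the same error class, independent of memcnn, also enabling the anomaly detection suggested by the error message:

```python
# Standalone repro of the same class of error: a tensor saved for backward is
# modified in place, so its version counter no longer matches what autograd saved.
import torch

torch.autograd.set_detect_anomaly(True)   # the hint from the error message

x = torch.randn(4, requires_grad=True)
y = torch.relu(x)        # y is "output 0 of ReluBackward0"
loss = (y ** 2).sum()    # PowBackward0 saves y to compute its gradient
y.add_(1.0)              # in-place op bumps y's version from 0 to 1
loss.backward()          # RuntimeError: ... modified by an inplace operation
```

In memcnn's case the saved tensor that changes version is presumably touched by the memory-saving machinery that frees and later restores activation storage; rerunning the failing test with anomaly detection enabled, as the hint suggests, should point at the exact operation that no longer passes under PyTorch 2.0.x.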