Description
There have been some major changes in PyTorch 2.0.x that break some of the reversible operations. I should investigate how to fix these.
What I Did
Ran the CI tests against PyTorch 2.0.x. The test_train_networks case for revnet38 on CPU fails during loss.backward(); the full pytest output is below.
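Presumably the same failure can be reproduced locally with something along these lines. This is only a sketch: the test path and the revnet38 parameter are taken from the traceback below, while the assumption is a local memcnn checkout with a PyTorch 2.0.x build installed and FULL_NETWORK_TESTS left unset so the heavier networks stay skipped, as in CI.

# Hedged local reproduction of the failing CI case (assumptions noted above).
import pytest

pytest.main([
    "memcnn/trainers/tests/test_train.py",
    "-k", "revnet38",  # selects both CUDA variants; the GPU one is skipped without a GPU
    "-x",              # stop at the first failure
])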
tmp_path = PosixPath('/tmp/pytest-of-circleci/pytest-0/test_train_networks_False_revn0')
network = 'revnet38', use_cuda = False
@pytest.mark.parametrize("network", [
pytest.param(network,
marks=pytest.mark.skipif(
condition=("FULL_NETWORK_TESTS" not in os.environ) and ("revnet38" != network),
reason="Too memory intensive for CI so these tests are disabled by default. "
"Set FULL_NETWORK_TESTS environment variable to enable the tests.")
)
for network in ["resnet32", "resnet110", "resnet164", "revnet38", "revnet110", "revnet164"]
])
@pytest.mark.parametrize("use_cuda", [
False,
pytest.param(True, marks=pytest.mark.skipif(condition=not torch.cuda.is_available(), reason="No GPU available"))
])
def test_train_networks(tmp_path, network, use_cuda):
exptags = ["cifar10", network, "epoch5"]
exp_file = str(Path(__file__).parent / "resources" / "experiments.json")
data_dir = str(tmp_path / "tmpdata")
results_dir = str(tmp_path / "resdir")
os.makedirs(data_dir)
os.makedirs(results_dir)
> run_experiment(experiment_tags=exptags, data_dir=data_dir, results_dir=results_dir,
start_fresh=True, use_cuda=use_cuda, workers=None, experiments_file=exp_file,
disp_iter=1,
save_iter=5,
valid_iter=5,)
memcnn/trainers/tests/test_train.py:89:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
memcnn/train.py:48: in run_experiment
trainer(manager, start_iter=last_iter, use_cuda=use_cuda, *args, **trainer_params)
memcnn/trainers/classification.py:111: in train
loss.backward()
.tox/py38-torchlatest/lib/python3.8/site-packages/torch/_tensor.py:487: in backward
torch.autograd.backward(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tensors = (tensor(2.3590, grad_fn=<MulBackward0>),), grad_tensors = None
retain_graph = False, create_graph = False, grad_variables = None, inputs = ()
def backward(
tensors: _TensorOrTensors,
grad_tensors: Optional[_TensorOrTensors] = None,
retain_graph: Optional[bool] = None,
create_graph: bool = False,
grad_variables: Optional[_TensorOrTensors] = None,
inputs: Optional[_TensorOrTensors] = None,
) -> None:
r"""Computes the sum of gradients of given tensors with respect to graph
leaves.
The graph is differentiated using the chain rule. If any of ``tensors``
are non-scalar (i.e. their data has more than one element) and require
gradient, then the Jacobian-vector product would be computed, in this
case the function additionally requires specifying ``grad_tensors``.
It should be a sequence of matching length, that contains the "vector"
in the Jacobian-vector product, usually the gradient of the differentiated
function w.r.t. corresponding tensors (``None`` is an acceptable value for
all tensors that don't need gradient tensors).
This function accumulates gradients in the leaves - you might need to zero
``.grad`` attributes or set them to ``None`` before calling it.
See :ref:`Default gradient layouts<default-grad-layouts>`
for details on the memory layout of accumulated gradients.
.. note::
Using this method with ``create_graph=True`` will create a reference cycle
between the parameter and its gradient which can cause a memory leak.
We recommend using ``autograd.grad`` when creating the graph to avoid this.
If you have to use this function, make sure to reset the ``.grad`` fields of your
parameters to ``None`` after use to break the cycle and avoid the leak.
.. note::
If you run any forward ops, create ``grad_tensors``, and/or call ``backward``
in a user-specified CUDA stream context, see
:ref:`Stream semantics of backward passes<bwd-cuda-stream-semantics>`.
.. note::
When ``inputs`` are provided and a given input is not a leaf,
the current implementation will call its grad_fn (even though it is not strictly needed to get this gradients).
It is an implementation detail on which the user should not rely.
See https://github.com/pytorch/pytorch/pull/60521#issuecomment-867061780 for more details.
Args:
tensors (Sequence[Tensor] or Tensor): Tensors of which the derivative will be
computed.
grad_tensors (Sequence[Tensor or None] or Tensor, optional): The "vector" in
the Jacobian-vector product, usually gradients w.r.t. each element of
corresponding tensors. None values can be specified for scalar Tensors or
ones that don't require grad. If a None value would be acceptable for all
grad_tensors, then this argument is optional.
retain_graph (bool, optional): If ``False``, the graph used to compute the grad
will be freed. Note that in nearly all cases setting this option to ``True``
is not needed and often can be worked around in a much more efficient
way. Defaults to the value of ``create_graph``.
create_graph (bool, optional): If ``True``, graph of the derivative will
be constructed, allowing to compute higher order derivative products.
Defaults to ``False``.
inputs (Sequence[Tensor] or Tensor, optional): Inputs w.r.t. which the gradient
be will accumulated into ``.grad``. All other Tensors will be ignored. If
not provided, the gradient is accumulated into all the leaf Tensors that
were used to compute the attr::tensors.
"""
if torch._C._are_functorch_transforms_active():
raise RuntimeError(
"backward() called inside a functorch transform. This is not "
"supported, please use functorch.grad or functorch.vjp instead "
"or call backward() outside of functorch transforms.")
if grad_variables is not None:
warnings.warn("'grad_variables' is deprecated. Use 'grad_tensors' instead.")
if grad_tensors is None:
grad_tensors = grad_variables
else:
raise RuntimeError("'grad_tensors' and 'grad_variables' (deprecated) "
"arguments both passed to backward(). Please only "
"use 'grad_tensors'.")
if inputs is not None and len(inputs) == 0:
raise RuntimeError("'inputs' argument to backward() cannot be empty.")
tensors = (tensors,) if isinstance(tensors, torch.Tensor) else tuple(tensors)
inputs = (inputs,) if isinstance(inputs, torch.Tensor) else \
tuple(inputs) if inputs is not None else tuple()
grad_tensors_ = _tensor_or_tensors_to_tuple(grad_tensors, len(tensors))
grad_tensors_ = _make_grads(tensors, grad_tensors_, is_grads_batched=False)
if retain_graph is None:
retain_graph = create_graph
# The reason we repeat same the comment below is that
# some Python versions print out the first line of a multi-line function
# calls in the traceback and some print out the last line
> Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
tensors, grad_tensors_, retain_graph, create_graph, inputs,
allow_unreachable=True, accumulate_grad=True) # Calls into the C++ engine to run the backward pass
E RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [100, 32, 32, 32]], which is output 0 of ReluBackward0, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
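The hint at the end of the traceback suggests the first debugging step: enable anomaly detection before the failing loss.backward() so autograd also reports the forward-pass stack trace of the operation whose saved tensor was modified in place. torch.autograd.set_detect_anomaly is a standard PyTorch API; placing it ahead of the training loop (rather than anywhere specific inside memcnn) is my assumption.

import torch

# Turn on anomaly detection globally while debugging. It slows training down
# considerably, so remove it once the offending in-place operation is located.
torch.autograd.set_detect_anomaly(True)

# Alternatively, scope it to the failing backward call only:
# with torch.autograd.detect_anomaly():
#     loss.backward()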
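For context, the error class itself is easy to reproduce outside memcnn. The standalone sketch below is not memcnn code; it only shows the same version-counter failure ("output 0 of ReluBackward0 ... is at version 1; expected version 0") that the reversible blocks appear to trigger under PyTorch 2.0.x.

import torch

x = torch.randn(4, requires_grad=True)
y = torch.relu(x)   # ReluBackward0 saves its output for the backward pass
y.add_(1.0)         # in-place update bumps y's version counter from 0 to 1
y.sum().backward()  # RuntimeError: ... output 0 of ReluBackward0 is at version 1; expected version 0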