mila-iqia / milabench

Repository of machine learning benchmarks
https://milabench.readthedocs.io
MIT License
19 stars · 23 forks · source link

reformer on HPU #301

Status: Open. Delaunay opened this issue 6 days ago.

Delaunay commented 6 days ago

Eager Mode

    * 1 x RuntimeError: mat1 and mat2 must have the same dtype, but got Half and Float
        | Traceback (most recent call last):
        |   File "/homes/delaunap/hpu/results/venv/torch/bin/voir", line 8, in <module>
        |     sys.exit(main())
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/voir/cli.py", line 128, in main
        |     ov(sys.argv[1:] if argv is None else argv)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/voir/phase.py", line 331, in __call__
        |     self._run(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/voir/overseer.py", line 242, in _run
        |     set_value(func())
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/voir/scriptutils.py", line 37, in <lambda>
        |     return lambda: exec(mainsection, glb, glb)
        |   File "/homes/delaunap/milabench/benchmarks/huggingface/bench/__main__.py", line 208, in <module>
        |     main()
        |   File "/homes/delaunap/milabench/benchmarks/huggingface/bench/__main__.py", line 204, in main
        |     runner.train()
        |   File "/homes/delaunap/milabench/benchmarks/huggingface/bench/__main__.py", line 120, in train
        |     loss = self.step(data)
        |   File "/homes/delaunap/milabench/benchmarks/huggingface/bench/__main__.py", line 92, in step
        |     self.amp_scaler.scale(loss).backward()
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/_tensor.py", line 535, in backward
        |     torch.autograd.backward(
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
        |     _engine_run_backward(
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
        |     return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/autograd/function.py", line 301, in apply
        |     return user_fn(self, *args)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/models/reformer/modeling_reformer.py", line 1671, in backward
        |     output = layer.backward_pass(
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/models/reformer/modeling_reformer.py", line 1534, in backward_pass
        |     res_hidden_states = self.feed_forward(next_attn_output)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1535, in _wrapped_call_impl
        |     return self._call_impl(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1544, in _call_impl
        |     return forward_call(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/models/reformer/modeling_reformer.py", line 1396, in forward
        |     return apply_chunking_to_forward(
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/pytorch_utils.py", line 239, in apply_chunking_to_forward
        |     return forward_fn(*input_tensors)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/models/reformer/modeling_reformer.py", line 1405, in forward_chunk
        |     hidden_states = self.dense(hidden_states)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1535, in _wrapped_call_impl
        |     return self._call_impl(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1544, in _call_impl
        |     return forward_call(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/models/reformer/modeling_reformer.py", line 1366, in forward
        |     hidden_states = self.dense(hidden_states)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1535, in _wrapped_call_impl
        |     return self._call_impl(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1544, in _call_impl
        |     return forward_call(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 116, in forward
        |     return F.linear(input, self.weight, self.bias)
        | RuntimeError: mat1 and mat2 must have the same dtype, but got Half and Float

Delaunay commented 6 days ago

Lazy Mode

 * no training rate retrieved
  * Error codes = 1, 1
  * 1 exceptions found
    * 1 x RuntimeError: [Rank:0] FATAL ERROR :: MODULE:PT_DEVMEM Allocation failed for size::268435456 (256)MB
        | Traceback (most recent call last):
        |   File "/homes/delaunap/hpu/results/venv/torch/bin/voir", line 8, in <module>
        |   sys.exit(main())
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/voir/cli.py", line 128, in main
        |   ov(sys.argv[1:] if argv is None else argv)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/voir/phase.py", line 331, in __call__
        |   self._run(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/voir/overseer.py", line 242, in _run
        |   set_value(func())
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/voir/scriptutils.py", line 37, in <lambda>
        |   return lambda: exec(mainsection, glb, glb)
        |   File "/homes/delaunap/milabench/benchmarks/huggingface/bench/__main__.py", line 208, in <module>
        |   main()
        |   File "/homes/delaunap/milabench/benchmarks/huggingface/bench/__main__.py", line 204, in main
        |   runner.train()
        |   File "/homes/delaunap/milabench/benchmarks/huggingface/bench/__main__.py", line 120, in train
        |   loss = self.step(data)
        |   File "/homes/delaunap/milabench/benchmarks/huggingface/bench/__main__.py", line 96, in step
        |   accelerator.mark_step()
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/habana_frameworks/torch/utils/internal.py", line 27, in wrapper
        |   func(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/habana_frameworks/torch/core/step_closure.py", line 66, in mark_step
        |   htcore._mark_step(device_str, sync)
        | RuntimeError: [Rank:0] FATAL ERROR :: MODULE:PT_DEVMEM Allocation failed for size::268435456 (256)MB