mila-iqia / milabench

Repository of machine learning benchmarks
https://milabench.readthedocs.io
MIT License

rlhf HPU #303

Open · Delaunay opened this issue 2 days ago

Delaunay commented 2 days ago

Lazy Mode

habana_frameworks/torch/gpu_migration/torch/cuda/random.py

Changed:

            pyhlml.hlmlInit()
            count = pyhlml.hlmlDeviceGetCount()
            pyhlml.hlmlShutdown()

to:

            try:
                pyhlml.hlmlInit()
                count = pyhlml.hlmlDeviceGetCount()
                pyhlml.hlmlShutdown()
            except Exception:
                count = pyhlml.hlmlDeviceGetCount()
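
For context, a minimal sketch of the failure mode this guard works around, assuming pyhlml raises the same already-initialized error referenced in the voir fix below:

    import pyhlml

    pyhlml.hlmlInit()
    try:
        # A second init, e.g. from a monitoring thread, fails because
        # the library is already initialized.
        pyhlml.hlmlInit()
    except pyhlml.hlml_error.HLMLError_AlreadyInitialized:
        # The existing session is still usable, so device queries keep working.
        count = pyhlml.hlmlDeviceGetCount()
    finally:
        pyhlml.hlmlShutdown()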

Latest error

rlhf-single.D5
==============
  * no training rate retrieved
  * Error codes = 1
  * 1 exceptions found
  0%|          | 0/254 [00:02<?, ?it/s]
        | Traceback (most recent call last):
        |   File "/homes/delaunap/milabench/benchmarks/rlhf/main.py", line 137, in <module>
        |     main()
        |   File "/homes/delaunap/milabench/benchmarks/rlhf/main.py", line 128, in main
        |     trainer.train()
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/trl/trainer/ppov2_trainer.py", line 309, in train
        |     query_responses, logitss = batch_generation(
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
        |     return func(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/trl/trainer/utils.py", line 1176, in batch_generation
        |     query_response, logits = generate(
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/trl/trainer/utils.py", line 1151, in generate
        |     output = lm_backbone.generate(
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
        |     return func(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/generation/utils.py", line 2024, in generate
        |     result = self._sample(
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/generation/utils.py", line 3038, in _sample
        |     unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/generation/stopping_criteria.py", line 509, in __call__
        |     is_done = torch.full((input_ids.shape[0],), False, device=input_ids.device, dtype=torch.bool)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/habana_frameworks/torch/gpu_migration/torch/_C/_VariableFunctions.py", line 244, in full
        |     return TorchModuleRegister.full(*args, **kwargs)
        | RuntimeError: [Rank:0] FATAL ERROR :: MODULE:PT_BRIDGE Exception in acc thread pool task has been thrown: Got a non-HPU tensor, expecting an HPU tensor
        | [Rank:0] Habana exception raised from GetHbLazyTensor at aten_lazy_bridge.cpp:290
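
The lazy-mode failure happens when torch.full infers the device from input_ids.device through the gpu_migration shim. A hypothetical workaround sketch, not a verified fix, assuming the standard "hpu" device string from Habana's PyTorch integration:

    import torch
    import habana_frameworks.torch.core as htcore  # noqa: F401, loads the HPU backend

    def bool_mask(batch_size):
        # Allocate the stopping-criteria mask directly on the HPU device
        # rather than via input_ids.device, which the lazy bridge rejects
        # above as a non-HPU tensor.
        return torch.full((batch_size,), False, device="hpu", dtype=torch.bool)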
Delaunay commented 1 day ago

Changed voir so that hlml init and shutdown go through a context manager:

@contextmanager
def hlmlinit():
    try:
        pyhlml.hlmlInit()
    except pyhlml.hlml_error.HLMLError_AlreadyInitialized:
        # hlml is already initialized by another caller: still yield so
        # the `with` body runs, and leave shutdown to whoever owns it.
        yield
    else:
        yield
        pyhlml.hlmlShutdown()

...

        with hlmlinit():
            deviceCount = pyhlml.hlmlDeviceGetCount()
            for i in range(0, deviceCount):
                self.handles[i] = pyhlml.hlmlDeviceGetHandleByIndex(i)

...

    def get_gpus_info(self, selection=None):
        with hlmlinit():
            return make_gpu_infos(self.handles, selection)
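
Pairing init and shutdown per query keeps voir's monitoring self-contained, and deferring to an already-open session is meant to stop the monitor and the benchmark process from fighting over hlml initialization.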
The Lazy Mode traceback after this change is identical to the one above, ending in:

        | RuntimeError: [Rank:0] FATAL ERROR :: MODULE:PT_BRIDGE Exception in acc thread pool task has been thrown: Got a non-HPU tensor, expecting an HPU tensor
        | [Rank:0] Habana exception raised from GetHbLazyTensor at aten_lazy_bridge.cpp:290
Delaunay commented 1 day ago

Eager Mode

        | Traceback (most recent call last):
        |   File "/homes/delaunap/milabench/benchmarks/rlhf/main.py", line 137, in <module>
        |     main()
        |   File "/homes/delaunap/milabench/benchmarks/rlhf/main.py", line 128, in main
        |     trainer.train()
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/trl/trainer/ppov2_trainer.py", line 309, in train
        |     query_responses, logitss = batch_generation(
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
        |     return func(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/trl/trainer/utils.py", line 1176, in batch_generation
        |     query_response, logits = generate(
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/trl/trainer/utils.py", line 1151, in generate
        |     output = lm_backbone.generate(
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
        |     return func(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/generation/utils.py", line 2024, in generate
        |     result = self._sample(
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/generation/utils.py", line 2982, in _sample
        |     outputs = self(**model_inputs, return_dict=True)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1535, in _wrapped_call_impl
        |     return self._call_impl(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1544, in _call_impl
        |     return forward_call(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py", line 989, in forward
        |     outputs = self.gpt_neox(
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1535, in _wrapped_call_impl
        |     return self._call_impl(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1544, in _call_impl
        |     return forward_call(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py", line 880, in forward
        |     outputs = layer(
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1535, in _wrapped_call_impl
        |     return self._call_impl(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1544, in _call_impl
        |     return forward_call(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py", line 648, in forward
        |     self.input_layernorm(hidden_states),
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1535, in _wrapped_call_impl
        |     return self._call_impl(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1544, in _call_impl
        |     return forward_call(*args, **kwargs)
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/normalization.py", line 201, in forward
        |     return F.layer_norm(
        |   File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/functional.py", line 2573, in layer_norm
        |     return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled)
        | RuntimeError: Expected all tensors to be on the HPU device, but found at least one input[idx=2] on cpu (details: torch.FloatTensor)
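
Here F.layer_norm receives input[idx=2], likely the layer-norm weight, on CPU while the activations are on HPU, which points at part of the model never being moved to the device. A quick diagnostic sketch; `model` is a stand-in for the policy handed to the trainer, not a name from the trace:

    def report_non_hpu_tensors(model):
        # List every parameter or buffer left off the HPU device; a
        # layer-norm weight showing up here would explain the error above.
        for name, param in model.named_parameters():
            if param.device.type != "hpu":
                print(f"parameter {name} on {param.device}")
        for name, buf in model.named_buffers():
            if buf.device.type != "hpu":
                print(f"buffer {name} on {buf.device}")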