Open Delaunay opened 2 days ago
Changed voir so that HLML initialization/shutdown wraps each device query:
@contextmanager
def hlmlinit():
    """Context manager that guarantees the HLML library is usable inside the block.

    Initializes HLML on entry.  If HLML was already initialized by another
    owner, we still proceed to ``yield`` — the original version returned
    without yielding on that path, which makes ``with hlmlinit():`` raise
    ``RuntimeError: generator didn't yield``.

    On exit, HLML is shut down only when this context performed the init,
    and the shutdown runs in a ``finally`` so an exception raised inside
    the ``with`` body cannot leak the initialization (the original only
    shut down on the success path).
    """
    we_initialized = False
    try:
        pyhlml.hlmlInit()
        we_initialized = True
    except pyhlml.hlml_error.HLMLError_AlreadyInitialized:
        # Another owner holds the init/shutdown pair; just use the library.
        pass
    try:
        yield
    finally:
        if we_initialized:
            pyhlml.hlmlShutdown()
...
# Fragment from inside a method (indentation was lost in the paste):
# inside the HLML context, query the device count once and cache one
# handle per index in self.handles for later info lookups.
# NOTE(review): assumes self.handles supports index assignment (dict or
# pre-sized list) — confirm against the enclosing class.
with hlmlinit():
deviceCount = pyhlml.hlmlDeviceGetCount()
for i in range(0, deviceCount):
self.handles[i] = pyhlml.hlmlDeviceGetHandleByIndex(i)
...
def get_gpus_info(self, selection=None):
    """Collect info for the cached device handles.

    ``selection`` optionally narrows which devices are reported; ``None``
    reports them all (the exact semantics live in ``make_gpu_infos``).
    The HLML library is kept initialized for the duration of the query.
    """
    with hlmlinit():
        infos = make_gpu_infos(self.handles, selection)
    return infos
| Traceback (most recent call last):
| File "/homes/delaunap/milabench/benchmarks/rlhf/main.py", line 137, in <module>
| main()
| File "/homes/delaunap/milabench/benchmarks/rlhf/main.py", line 128, in main
| trainer.train()
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/trl/trainer/ppov2_trainer.py", line 309, in train
| query_responses, logitss = batch_generation(
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
| return func(*args, **kwargs)
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/trl/trainer/utils.py", line 1176, in batch_generation
| query_response, logits = generate(
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/trl/trainer/utils.py", line 1151, in generate
| output = lm_backbone.generate(
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
| return func(*args, **kwargs)
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/generation/utils.py", line 2024, in generate
| result = self._sample(
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/generation/utils.py", line 3038, in _sample
| unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/generation/stopping_criteria.py", line 509, in __call__
| is_done = torch.full((input_ids.shape[0],), False, device=input_ids.device, dtype=torch.bool)
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/habana_frameworks/torch/gpu_migration/torch/_C/_VariableFunctions.py", line 244, in full
| return TorchModuleRegister.full(*args, **kwargs)
| RuntimeError: [Rank:0] FATAL ERROR :: MODULE:PT_BRIDGE Exception in acc thread pool task has been thrown: Got a non-HPU tensor, expecting an HPU tensor
| [Rank:0] Habana exception raised from GetHbLazyTensor at aten_lazy_bridge.cpp:290
| Traceback (most recent call last):
| File "/homes/delaunap/milabench/benchmarks/rlhf/main.py", line 137, in <module>
| main()
| File "/homes/delaunap/milabench/benchmarks/rlhf/main.py", line 128, in main
| trainer.train()
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/trl/trainer/ppov2_trainer.py", line 309, in train
| query_responses, logitss = batch_generation(
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
| return func(*args, **kwargs)
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/trl/trainer/utils.py", line 1176, in batch_generation
| query_response, logits = generate(
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/trl/trainer/utils.py", line 1151, in generate
| output = lm_backbone.generate(
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
| return func(*args, **kwargs)
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/generation/utils.py", line 2024, in generate
| result = self._sample(
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/generation/utils.py", line 2982, in _sample
| outputs = self(**model_inputs, return_dict=True)
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1535, in _wrapped_call_impl
| return self._call_impl(*args, **kwargs)
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1544, in _call_impl
| return forward_call(*args, **kwargs)
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py", line 989, in forward
| outputs = self.gpt_neox(
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1535, in _wrapped_call_impl
| return self._call_impl(*args, **kwargs)
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1544, in _call_impl
| return forward_call(*args, **kwargs)
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py", line 880, in forward
| outputs = layer(
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1535, in _wrapped_call_impl
| return self._call_impl(*args, **kwargs)
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1544, in _call_impl
| return forward_call(*args, **kwargs)
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py", line 648, in forward
| self.input_layernorm(hidden_states),
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1535, in _wrapped_call_impl
| return self._call_impl(*args, **kwargs)
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1544, in _call_impl
| return forward_call(*args, **kwargs)
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/modules/normalization.py", line 201, in forward
| return F.layer_norm(
| File "/homes/delaunap/hpu/results/venv/torch/lib/python3.10/site-packages/torch/nn/functional.py", line 2573, in layer_norm
| return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled)
| RuntimeError: Expected all tensors to be on the HPU device, but found at least one input[idx=2] on cpu (details: torch.FloatTensor)
Lazy Mode
habana_frameworks/torch/gpu_migration/torch/cuda/random.py
Changed the seeding shim in that file (the before/after code snippets are missing from this capture).

Latest error: