Open · BaideBear opened 4 days ago
System Info

```text
(habanalabs-venv) (habanalabs-venv) root@vmInstancetmhmacpr:~/lier_workload/test# python inference_test.py
/usr/lib/python3.10/inspect.py:288: FutureWarning: `torch.distributed.reduce_op` is deprecated, please use `torch.distributed.ReduceOp` instead
  return isinstance(object, types.FunctionType)
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 17.61it/s]
============================= HABANA PT BRIDGE CONFIGURATION ===========================
 PT_HPU_LAZY_MODE = 1
 PT_RECIPE_CACHE_PATH =
 PT_CACHE_FOLDER_DELETE = 0
 PT_HPU_RECIPE_CACHE_CONFIG =
 PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
 PT_HPU_LAZY_ACC_PAR_MODE = 1
 PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
 PT_HPU_EAGER_PIPELINE_ENABLE = 1
 PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE = 1
---------------------------: System Configuration :---------------------------
Num CPU Cores : 28
CPU RAM       : 123576844 KB
------------------------------------------------------------------------------
Traceback (most recent call last):
  File "/root/lier_workload/test/inference_test.py", line 16, in <module>
    response, history = model.chat(tokenizer, "你好", history=[])
  File "/root/habanalabs-venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/chatglm3-6b/modeling_chatglm.py", line 1195, in chat
    outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id)
  File "/root/habanalabs-venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "/root/habanalabs-venv/lib/python3.10/site-packages/transformers/generation/utils.py", line 2047, in generate
    result = self._sample(
  File "/root/habanalabs-venv/lib/python3.10/site-packages/transformers/generation/utils.py", line 3007, in _sample
    outputs = self(**model_inputs, return_dict=True)
  File "/root/habanalabs-venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1556, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/habanalabs-venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1565, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/chatglm3-6b/modeling_chatglm.py", line 1094, in forward
    transformer_outputs = self.transformer(
  File "/root/habanalabs-venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1556, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/habanalabs-venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1606, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/chatglm3-6b/modeling_chatglm.py", line 987, in forward
    hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
  File "/root/habanalabs-venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1556, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/habanalabs-venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1606, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/chatglm3-6b/modeling_chatglm.py", line 797, in forward
    layer_ret = layer(
  File "/root/habanalabs-venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1556, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/habanalabs-venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1606, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/chatglm3-6b/modeling_chatglm.py", line 701, in forward
    attention_output, kv_cache = self.self_attention(
  File "/root/habanalabs-venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1556, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/habanalabs-venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1606, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/chatglm3-6b/modeling_chatglm.py", line 565, in forward
    query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
NotImplementedError: Unknown device for graph fuser
```
Information

Tasks

One of the examples folder (such as GLUE/SQuAD, ...)

Reproduction
```python
import os
import platform
import torch
from transformers import AutoTokenizer, AutoModel
import habana_frameworks.torch.core as htcore

MODEL_PATH = os.environ.get('MODEL_PATH', '/data/chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).eval()
model = model.to("hpu")

response, history = model.chat(tokenizer, "你好", history=[])  # "Hello"
htcore.mark_step()
print(response)
response, history = model.chat(tokenizer, "晚上睡不着应该怎么办", history=history)  # "What should I do if I can't sleep at night?"
htcore.mark_step()
print(response)
```
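Independent of the fuser error, it can help to confirm that the HPU runtime itself is healthy before debugging model-level failures. A small sanity check using standard `habana_frameworks.torch.hpu` calls (a sketch, not part of the original report):

```python
# Verify the Gaudi/HPU runtime is visible and can execute a trivial op.
# is_available() and device_count() are standard habana_frameworks APIs.
import torch
import habana_frameworks.torch.hpu as hthpu

print("HPU available:", hthpu.is_available())
print("HPU device count:", hthpu.device_count())

# A tiny op round-tripped through the device confirms basic kernel dispatch.
x = torch.ones(2, 2, device="hpu")
print((x + x).to("cpu"))
```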
Expected behavior

Inference on the model completes successfully.
Could you try https://github.com/huggingface/optimum-habana/pull/1478? Thank you.
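For anyone who wants to try that PR before it is merged, one common pattern (a sketch; the refspec is GitHub's standard pull-request head, and the usage below assumes the stock optimum-habana workflow rather than anything stated in this thread) is to install from the PR branch with `pip install "git+https://github.com/huggingface/optimum-habana.git@refs/pull/1478/head"` and let optimum-habana patch transformers before loading the model:

```python
# Sketch of testing the PR: adapt_transformers_to_gaudi() applies
# optimum-habana's Gaudi-specific patches to transformers; whether it
# covers ChatGLM3's remote code depends on the PR under test.
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
from transformers import AutoModel, AutoTokenizer

adapt_transformers_to_gaudi()

tokenizer = AutoTokenizer.from_pretrained("/data/chatglm3-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("/data/chatglm3-6b", trust_remote_code=True).eval().to("hpu")
```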