Accelerate local LLM inference and finetuning (LLaMA, Mistral, ChatGLM, Qwen, Mixtral, Gemma, Phi, MiniCPM, Qwen-VL, MiniCPM-V, etc.) on Intel XPU (e.g., local PC with iGPU and NPU, discrete GPU such as Arc, Flex and Max); seamlessly integrate with llama.cpp, Ollama, HuggingFace, LangChain, LlamaIndex, vLLM, GraphRAG, DeepSpeed, Axolotl, etc.
"can NOT allocate memory block with size larger than 4GB" on Arc A770 GPU when inference #11248
before:
model = AutoModel.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.bfloat16).eval()
after:
model = AutoModel.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    load_in_4bit=True).eval().to("xpu:1")
You: How does TiO2 work?
GLM-4: ... (the model answered the first question without issue)
You: How does TiO2 work?
GLM-4: Exception in thread Thread-3 (generate):
Traceback (most recent call last):
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\site-packages\torch\utils\_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\site-packages\ipex_llm\transformers\lookup.py", line 88, in generate
    return original_generate(self,
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\site-packages\torch\utils\_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\site-packages\ipex_llm\transformers\speculative.py", line 109, in generate
    return original_generate(self,
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\site-packages\torch\utils\_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\site-packages\transformers\generation\utils.py", line 1764, in generate
    return self.sample(
           ^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\site-packages\transformers\generation\utils.py", line 2861, in sample
    outputs = self(
              ^^^^^
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\.cache\huggingface\modules\transformers_modules\glm-4-9b-chat\modeling_chatglm.py", line 881, in forward
    transformer_outputs = self.transformer(
                          ^^^^^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\.cache\huggingface\modules\transformers_modules\glm-4-9b-chat\modeling_chatglm.py", line 777, in forward
    hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
                                                                      ^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\.cache\huggingface\modules\transformers_modules\glm-4-9b-chat\modeling_chatglm.py", line 610, in forward
    layer_ret = layer(
                ^^^^^^
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\.cache\huggingface\modules\transformers_modules\glm-4-9b-chat\modeling_chatglm.py", line 513, in forward
    attention_output, kv_cache = self.self_attention(
                                 ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\.cache\huggingface\modules\transformers_modules\glm-4-9b-chat\modeling_chatglm.py", line 410, in forward
    context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\anaconda3\envs\ipex_llm_env\Lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Invisible_Light\.cache\huggingface\modules\transformers_modules\glm-4-9b-chat\modeling_chatglm.py", line 189, in forward
    context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Current platform can NOT allocate memory block with size larger than 4GB! Tried to allocate 4.84 GiB (GPU 1; 15.56 GiB total capacity; 7.61 GiB already allocated; 19.70 GiB reserved in total by PyTorch)
I used the glm-4-9b-chat model to process an input of about 4k tokens and got the RuntimeError above. The code is copied directly from https://github.com/THUDM/GLM-4/blob/main/basic_demo/trans_cli_demo.py, with some modifications to run on my Arc A770.
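
A back-of-envelope estimate (an editor's sketch, not from the issue) suggests why the second turn trips the limit: when no fused kernel is available, scaled_dot_product_attention can fall back to materializing the full attention-score tensor of shape [batch, heads, q_len, k_len], whose size grows quadratically with sequence length. The head count (32, per glm-4-9b's config) and fp32 accumulation below are assumptions:

# Rough size of the attention-score tensor that the SDPA math fallback
# materializes: batch * heads * q_len * k_len * bytes_per_element.
# heads=32 and fp32 (4 bytes per element) are assumptions for glm-4-9b.
def attn_scores_gib(seq_len, heads=32, batch=1, dtype_bytes=4):
    return batch * heads * seq_len * seq_len * dtype_bytes / 2**30

for n in (4096, 6144, 6400, 8192):
    print(f"{n:5d} tokens -> {attn_scores_gib(n):.2f} GiB")
# 4096 tokens -> 2.00 GiB  (first turn: under the 4 GB limit)
# 6144 tokens -> 4.50 GiB
# 6400 tokens -> 4.88 GiB  (roughly the 4.84 GiB allocation in the traceback)
# 8192 tokens -> 8.00 GiB

Because the CLI demo re-sends the whole conversation history on each turn, the effective sequence length keeps growing, which would explain why the first "How does TiO2 work?" succeeds and the repeat fails. Trimming or summarizing the history between turns keeps the score tensor under the 4 GB per-allocation limit the error message reports.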