Accelerate local LLM inference and finetuning (LLaMA, Mistral, ChatGLM, Qwen, Baichuan, Mixtral, Gemma, Phi, MiniCPM, etc.) on Intel XPU (e.g., local PC with iGPU and NPU, discrete GPU such as Arc, Flex and Max); seamlessly integrate with llama.cpp, Ollama, HuggingFace, LangChain, LlamaIndex, GraphRAG, DeepSpeed, vLLM, FastChat, Axolotl, etc.
Traceback (most recent call last):
File "D:\ultra_test_code_and_data\benchmark_test2intel\speed_test_ultra-qwen27b.py", line 270, in <module>
infer_test(model, tokenizer, input_token_num, output_token_num, total_speed_file)
File "D:\ultra_test_code_and_data\benchmark_test2intel\speed_test_ultra-qwen27b.py", line 108, in infer_test
prefill_output = model(model_inputs)
File "C:\Users\seewo\miniforge3\envs\qwen2\lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "C:\Users\seewo\miniforge3\envs\qwen2\lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "C:\Users\seewo\miniforge3\envs\qwen2\lib\site-packages\transformers\models\qwen2\modeling_qwen2.py", line 1186, in forward
logits = self.lm_head(hidden_states)
File "C:\Users\seewo\miniforge3\envs\qwen2\lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "C:\Users\seewo\miniforge3\envs\qwen2\lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "C:\Users\seewo\miniforge3\envs\qwen2\lib\site-packages\ipex_llm\transformers\low_bit_linear.py", line 737, in forward
result = xe_linear.forward_new(x_2d, self.weight.data, self.weight.qtype,
RuntimeError: Native API failed. Native API returns: -2 (PI_ERROR_DEVICE_NOT_AVAILABLE) -2 (PI_ERROR_DEVICE_NOT_AVAILABLE)
Qwen2 runs out of memory (OOM) with 6800 input tokens and 512 output tokens.
The following operation has already been removed from C:\ProgramData\miniconda3\envs\qwen1.5\Lib\site-packages\transformers\models\qwen2\modeling_qwen2.py:
logits = logits.float()
The error is:
Traceback (most recent call last):
File "D:\ultra_test_code_and_data\benchmark_test2intel\speed_test_ultra-qwen27b.py", line 270, in <module>
infer_test(model, tokenizer, input_token_num, output_token_num, total_speed_file)
File "D:\ultra_test_code_and_data\benchmark_test2intel\speed_test_ultra-qwen27b.py", line 108, in infer_test
prefill_output = model(model_inputs)
File "C:\Users\seewo\miniforge3\envs\qwen2\lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "C:\Users\seewo\miniforge3\envs\qwen2\lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "C:\Users\seewo\miniforge3\envs\qwen2\lib\site-packages\transformers\models\qwen2\modeling_qwen2.py", line 1186, in forward
logits = self.lm_head(hidden_states)
File "C:\Users\seewo\miniforge3\envs\qwen2\lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "C:\Users\seewo\miniforge3\envs\qwen2\lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "C:\Users\seewo\miniforge3\envs\qwen2\lib\site-packages\ipex_llm\transformers\low_bit_linear.py", line 737, in forward
result = xe_linear.forward_new(x_2d, self.weight.data, self.weight.qtype,
RuntimeError: Native API failed. Native API returns: -2 (PI_ERROR_DEVICE_NOT_AVAILABLE) -2 (PI_ERROR_DEVICE_NOT_AVAILABLE)