File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/lmdeploy/vl/engine.py", line 22, in _raise_exception_on_finish
task.result()
File "/opt/conda/envs/python3.10.13/lib/python3.10/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/lmdeploy/vl/engine.py", line 151, in forward
outputs = self.model.forward(inputs)
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/lmdeploy/vl/model/internvl.py", line 172, in forward
return self._forward_func(images)
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/lmdeploy/vl/model/internvl.py", line 153, in _forward_v1_5
outputs = self.model.extract_feature(outputs)
File "/root/.cache/huggingface/modules/transformers_modules/modeling_internvl_chat.py", line 168, in extract_feature
vit_embeds = self.vision_model(
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/modeling_intern_vit.py", line 418, in forward
encoder_outputs = self.encoder(
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/modeling_intern_vit.py", line 354, in forward
layer_outputs = encoder_layer(
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.local/lib/python3.10/site-packages/accelerate/hooks.py", line 169, in new_forward
output = module._old_forward(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/modeling_intern_vit.py", line 296, in forward
hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states)) * self.ls1)
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.local/lib/python3.10/site-packages/accelerate/hooks.py", line 169, in new_forward
output = module._old_forward(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/modeling_intern_vit.py", line 252, in forward
x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
File "/root/.cache/huggingface/modules/transformers_modules/modeling_intern_vit.py", line 244, in _flash_attn
context, _ = self.inner_attn(
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/modeling_intern_vit.py", line 77, in forward
output = flash_attn_unpadded_qkvpacked_func(
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/flash_attn/flash_attn_interface.py", line 887, in flash_attn_varlen_qkvpacked_func
return FlashAttnVarlenQKVPackedFunc.apply(
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/autograd/function.py", line 553, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/flash_attn/flash_attn_interface.py", line 288, in forward
out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_varlen_forward(
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/flash_attn/flash_attn_interface.py", line 85, in _flash_attn_varlen_forward
out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = flash_attn_cuda.varlen_fwd(
RuntimeError: FlashAttention only supports Ampere GPUs or newer.
You could try uninstalling flash_attn.
FlashAttention requires the Ampere GPU architecture or newer.
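For reference, a minimal sketch (plain PyTorch, not LMDeploy-specific) for checking whether the local GPU meets that requirement; Ampere corresponds to CUDA compute capability 8.0, while V100 is 7.0:

import torch

major, minor = torch.cuda.get_device_capability(0)  # V100 reports (7, 0)
if major >= 8:
    print('Ampere or newer: flash-attn kernels are supported')
else:
    print(f'sm_{major}{minor}: flash-attn will fail at runtime')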
When LMDeploy deploys a VLM, inference for the vision part reuses the upstream model library.
From the log above, the ViT module is using flash attn. I checked the upstream library's code:
try:
    try:  # v1
        from flash_attn.flash_attn_interface import \
            flash_attn_unpadded_qkvpacked_func
    except:  # v2
        from flash_attn.flash_attn_interface import \
            flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
    from flash_attn.bert_padding import pad_input, unpad_input
    has_flash_attn = True
except:
    print('FlashAttention is not installed.')
    has_flash_attn = False
It decides whether to use flash attn based on whether the package is present in the environment.
So, on V100, don't install flash-attn; flash-attn doesn't support the V100 architecture anyway. You can uninstall flash-attn, as @irexyc suggested, and the ViT will stop using flash attention.
The LLM part, on the other hand, is served by the LMDeploy engine, whose own flash attention implementation does support the V100 architecture.
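If you want to confirm the fallback after uninstalling, a small standalone check that mirrors the upstream probe above is:

try:
    # the v2 import name the upstream library probes for
    from flash_attn.flash_attn_interface import \
        flash_attn_varlen_qkvpacked_func  # noqa: F401
    print('flash-attn is still installed; the ViT will take the _flash_attn path')
except ImportError:
    print('flash-attn not found; the ViT falls back to _naive_attn')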
Thanks. After uninstalling flash_attn, the server failed to start; the error pointed to transformer_engine, and after uninstalling transformer_engine as well, it was fine.
The service now starts normally, but calls still fail with:
Exception in callback <function _raise_exception_on_finish at 0x7f9ff2b05fc0>
handle: <Handle _raise_exception_on_finish>
Traceback (most recent call last):
le "uvloop/cbhandles.pyx", line 63, in uvloop.loop.Handle._run
Fi "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/lmdeploy/vl/engine.py", line 26, in _raise_exception_on_finish
ise e
Fi "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/lmdeploy/vl/engine.py", line 22, in _raise_exception_on_finish
sk.result()
File "/opt/conda/envs/python3.10.13/lib/python3.10/concurrent/futures/thread.py", line 58, in run
sult = self.fn(*self.args, **self.kwargs)
Fi "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/lmdeploy/vl/engine.py", line 151, in forward
tputs = self.model.forward(inputs)
Fi "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
turn func(*args, **kwargs)
Fi "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/lmdeploy/vl/model/internvl.py", line 172, in forward
turn self._forward_func(images)
Fi "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/lmdeploy/vl/model/internvl.py", line 153, in _forward_v1_5
tputs = self.model.extract_feature(outputs)
Fi "/root/.cache/huggingface/modules/transformers_modules/modeling_internvl_chat.py", line 168, in extract_feature
t_embeds = self.vision_model(
Fi "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
turn self._call_impl(*args, **kwargs)
Fi "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
turn forward_call(*args, **kwargs)
Fi "/root/.cache/huggingface/modules/transformers_modules/modeling_intern_vit.py", line 418, in forward
coder_outputs = self.encoder(
Fi "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
turn self._call_impl(*args, **kwargs)
Fi "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/modeling_intern_vit.py", line 354, in forward
layer_outputs = encoder_layer(
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.local/lib/python3.10/site-packages/accelerate/hooks.py", line 169, in new_forward
output = module._old_forward(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/modeling_intern_vit.py", line 296, in forward
hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states)) * self.ls1)
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.local/lib/python3.10/site-packages/accelerate/hooks.py", line 169, in new_forward
output = module._old_forward(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/modeling_intern_vit.py", line 252, in forward
x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
File "/root/.cache/huggingface/modules/transformers_modules/modeling_intern_vit.py", line 217, in _naive_attn
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.local/lib/python3.10/site-packages/accelerate/hooks.py", line 169, in new_forward
output = module._old_forward(*args, **kwargs)
File "/opt/conda/envs/python3.10.13/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 116, in forward
return F.linear(input, self.weight, self.bias)
RuntimeError: CUDA error: CUBLAS_STATUS_NOT_SUPPORTED when calling `cublasGemmEx( handle, opa, opb, m, n, k, &falpha, a, CUDA_R_16F, lda, b, CUDA_R_16F, ldb, &fbeta, c, CUDA_R_16F, ldc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)`
How much GPU memory is left after the service starts? Please paste the nvidia-smi output taken after startup.
If only a little memory remains, try adding --cache-max-entry-count 0.2 when launching.
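For example, combined with the reproduction command from this issue (0.2 is just a starting ratio for the k/v cache that should leave more free memory; tune it as needed):

python -m lmdeploy serve api_server InternVL2-Llama3-76B --model-name internvl2-internlm2 --tp 8 --quant-policy 8 --cache-max-entry-count 0.2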
Recreating a fresh Python environment fixed it.
Checklist
Describe the bug
I launched InternVL2-Llama3-76B on 8x V100 GPUs and got an error at runtime.
Reproduction
python -m lmdeploy serve api_server InternVL2-Llama3-76B --model-name internvl2-internlm2 --tp 8 --quant-policy 8
Environment
Error traceback