Assertion fail via lmdeploy - internlm-xcomposer2d5-7b-4bit

I'm trying to run your model on a GCP L4 instance (Vertex AI Workbench defaults), but it always crashes with an assertion failure that gives no additional context. Is there something I'm doing wrong?

Python / CUDA versions:

$ python --version
Python 3.10.14

$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0

Install script:

export LMDEPLOY_VERSION=0.5.0
export PYTHON_VERSION=310
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
pip install decord

Running:

import nest_asyncio
nest_asyncio.apply()

from lmdeploy import TurbomindEngineConfig, pipeline
from lmdeploy.vl import load_image
engine_config = TurbomindEngineConfig(model_format='awq')
pipe = pipeline('internlm/internlm-xcomposer2d5-7b-4bit', backend_config=engine_config)
image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
response = pipe(('describe this image', image))
print(response)

Open to see the error logs

```log --------------------------------------------------------------------------- RuntimeError Traceback (most recent call last) Cell In[2], line 4 2 from lmdeploy.vl import load_image 3 engine_config = TurbomindEngineConfig(model_format='awq') ----> 4 pipe = pipeline('internlm/internlm-xcomposer2d5-7b-4bit', backend_config=engine_config) 5 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') 6 response = pipe(('describe this image', image)) File /opt/conda/lib/python3.10/site-packages/lmdeploy/api.py:89, in pipeline(model_path, model_name, backend_config, chat_template_config, log_level, **kwargs) 86 else: 87 tp = 1 if backend_config is None else backend_config.tp ---> 89 return pipeline_class(model_path, 90 model_name=model_name, 91 backend=backend, 92 backend_config=backend_config, 93 chat_template_config=chat_template_config, 94 tp=tp, 95 **kwargs) File /opt/conda/lib/python3.10/site-packages/lmdeploy/serve/vl_async_engine.py:24, in VLAsyncEngine.__init__(self, model_path, **kwargs) 20 backend_config = kwargs.get('backend_config', None) 21 self.vl_encoder = ImageEncoder(model_path, 22 vision_config, 23 backend_config=backend_config) ---> 24 super().__init__(model_path, **kwargs) 25 if self.model_name == 'base': 26 raise RuntimeError( 27 'please specify chat template as guided in https://lmdeploy.readthedocs.io/en/latest/inference/vl_pipeline.html#set-chat-template' # noqa: E501 28 ) File /opt/conda/lib/python3.10/site-packages/lmdeploy/serve/async_engine.py:189, in AsyncEngine.__init__(self, model_path, model_name, backend, backend_config, chat_template_config, tp, **kwargs) 187 # build backend engine 188 if backend == 'turbomind': --> 189 self._build_turbomind(model_path=model_path, 190 backend_config=backend_config, 191 tp=tp, 192 **kwargs) 193 elif backend == 'pytorch': 194 self._build_pytorch(model_path=model_path, 195 backend_config=backend_config, 196 **kwargs) File 
/opt/conda/lib/python3.10/site-packages/lmdeploy/serve/async_engine.py:234, in AsyncEngine._build_turbomind(self, model_path, backend_config, tp, **kwargs) 230 assert isinstance(backend_config, TurbomindEngineConfig), 'Please'\ 231 ' use TurbomindEngineConfig imported from lmdeploy.messages for ' \ 232 'turbomind backend' 233 from lmdeploy import turbomind as tm --> 234 self.engine = tm.TurboMind.from_pretrained( 235 model_path, engine_config=backend_config, **kwargs) 236 self.backend_config = backend_config 237 self.hf_tm_cfg = self.engine.config File /opt/conda/lib/python3.10/site-packages/lmdeploy/turbomind/turbomind.py:342, in TurboMind.from_pretrained(cls, pretrained_model_name_or_path, engine_config, model_format, group_size, tp, **kwargs) 340 model_source = get_model_source(pretrained_model_name_or_path) 341 logger.info(f'model_source: {model_source}') --> 342 return cls(model_path=pretrained_model_name_or_path, 343 engine_config=engine_config, 344 model_source=model_source, 345 model_format=model_format, 346 group_size=group_size, 347 tp=tp, 348 **kwargs) File /opt/conda/lib/python3.10/site-packages/lmdeploy/turbomind/turbomind.py:144, in TurboMind.__init__(self, model_path, engine_config, model_source, model_name, model_format, group_size, tp, **kwargs) 141 model_path = get_model(model_path, engine_config.download_dir, 142 engine_config.revision) 143 self.tokenizer = Tokenizer(model_path) --> 144 self.model_comm = self._from_hf(model_source=model_source, 145 model_path=model_path, 146 engine_config=engine_config) 148 self.session_len = self.config.session_len 149 self.eos_id = self.tokenizer.eos_token_id File /opt/conda/lib/python3.10/site-packages/lmdeploy/turbomind/turbomind.py:259, in TurboMind._from_hf(self, model_source, model_path, engine_config) 257 self._get_model_params(model_comm, tm_params) 258 logger.warning(f'get {len(tm_params)} model params') --> 259 output_model.export() 260 # there should be no left turbomind params. 
261 if len(tm_params) > 0: File /opt/conda/lib/python3.10/site-packages/lmdeploy/turbomind/deploy/target_model/base.py:285, in BaseOutputModel.export(self) 283 self.export_misc(bin) 284 for i in range(bin.start_layer_id, bin.end_layer_id): --> 285 self.export_transformer_block(bin, i) 286 pbar.update(1) 287 pbar.close() File /opt/conda/lib/python3.10/site-packages/lmdeploy/turbomind/deploy/target_model/plora_w4.py:98, in TurbomindPloraW4Model.export_transformer_block(self, bin, i) 96 lora_a_qkv, lora_a_o = transpose_tensor([lora_a_qkv, lora_a_o]) 97 # print(lora_a_qkv.shape, lora_a_o.shape) ---> 98 self.save_split(lora_a_qkv, 99 f'layers.{i}.attention.w_qkv.lora_a.weight', 100 copy=True) 101 self.save_split(lora_a_o, f'layers.{i}.attention.wo.lora_a.weight', 0) 102 # attn lora_b File /opt/conda/lib/python3.10/site-packages/lmdeploy/turbomind/deploy/target_model/base.py:270, in BaseOutputModel.save_split(self, tensor, name, split_dim, copy) 268 for i, copy in enumerate(copies): 269 prefix, ext = osp.splitext(name) --> 270 self.export_weight(copy, f'{prefix}.{i}{ext}') 271 else: 272 self.export_weight(tensor, name) File /opt/conda/lib/python3.10/site-packages/lmdeploy/turbomind/deploy/target_model/base.py:241, in BaseOutputModel.export_weight(self, param, name) 239 torch_tensor = torch_tensor.float() 240 for tm_tensor in tm_params[name]: --> 241 tm_tensor.copy_from(torch_tensor) 242 tm_params.pop(name) 243 else: RuntimeError: [TM][ERROR] Assertion fail: /lmdeploy/src/turbomind/python/bind.cpp:294 ```

InternLM / InternLM-XComposer

Assertion fail via lmdeploy - internlm-xcomposer2d5-7b-4bit #382