Your current environment

Python 3.8, 8x A10 GPUs, model: InternVL2-26B, vLLM branch: main, torch 2.4.0, torchvision 0.19.0

🐛 Describe the bug

ref: https://github.com/vllm-project/vllm/pull/8055#issuecomment-2324237046 https://github.com/vllm-project/vllm/issues/7996

The referenced PR fixes some of the InternVL2 inference problems, but inference still fails when running with multi-GPU tensor parallelism:
(VllmWorkerProcess pid=29708) ERROR 09-03 12:08:10 multiproc_worker_utils.py:226]
[rank0]: Traceback (most recent call last):
[rank0]: File "offline_inference_vision_language.py", line 240, in <module>
[rank0]: main(args)
[rank0]: File "offline_inference_vision_language.py", line 190, in main
[rank0]: llm, prompt, stop_token_ids = model_example_map[model](question)
[rank0]: File "offline_inference_vision_language.py", line 135, in run_internvl
[rank0]: llm = LLM(
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/entrypoints/llm.py", line 177, in __init__
[rank0]: self.llm_engine = LLMEngine.from_engine_args(
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/engine/llm_engine.py", line 541, in from_engine_args
[rank0]: engine = cls(
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/engine/llm_engine.py", line 302, in __init__
[rank0]: self.model_executor = executor_class(
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/executor/distributed_gpu_executor.py", line 26, in __init__
[rank0]: super().__init__(*args, **kwargs)
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/executor/executor_base.py", line 47, in __init__
[rank0]: self._init_executor()
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/executor/multiproc_gpu_executor.py", line 125, in _init_executor
[rank0]: self._run_workers("load_model",
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/executor/multiproc_gpu_executor.py", line 199, in _run_workers
[rank0]: driver_worker_output = driver_worker_method(*args, **kwargs)
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/worker/worker.py", line 182, in load_model
[rank0]: self.model_runner.load_model()
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/worker/model_runner.py", line 915, in load_model
[rank0]: self.model = get_model(model_config=self.model_config,
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/model_executor/model_loader/__init__.py", line 19, in get_model
[rank0]: return loader.load_model(model_config=model_config,
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/model_executor/model_loader/loader.py", line 341, in load_model
[rank0]: model = _initialize_model(model_config, self.load_config,
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/model_executor/model_loader/loader.py", line 170, in _initialize_model
[rank0]: return build_model(
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/model_executor/model_loader/loader.py", line 155, in build_model
[rank0]: return model_class(config=hf_config,
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/model_executor/models/internvl.py", line 328, in __init__
[rank0]: self.vision_model = InternVisionModel(
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/model_executor/models/intern_vit.py", line 252, in __init__
[rank0]: self.encoder = InternVisionEncoder(
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/model_executor/models/intern_vit.py", line 228, in __init__
[rank0]: self.layers = nn.ModuleList([
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/model_executor/models/intern_vit.py", line 229, in <listcomp>
[rank0]: InternVisionEncoderLayer(config=config, quant_config=quant_config)
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/model_executor/models/intern_vit.py", line 190, in __init__
[rank0]: self.attn = InternAttention(config, quant_config=quant_config)
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/model_executor/models/intern_vit.py", line 104, in __init__
[rank0]: self.qkv = QKVParallelLinear(
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/model_executor/layers/linear.py", line 659, in __init__
[rank0]: self.num_heads = divide(self.total_num_heads, tp_size)
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/distributed/utils.py", line 24, in divide
[rank0]: ensure_divisibility(numerator, denominator)
[rank0]: File "/home/tdj/intervl/temp_vllm/vllm/vllm/distributed/utils.py", line 17, in ensure_divisibility
[rank0]: assert numerator % denominator == 0, "{} is not divisible by {}".format(
[rank0]: AssertionError: 25 is not divisible by 4
ERROR 09-03 12:08:10 multiproc_worker_utils.py:120] Worker VllmWorkerProcess pid 29708 died, exit code: -15
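For context, the assertion comes from vLLM's tensor-parallel QKV layer, which splits attention heads evenly across ranks. The vision tower of InternVL2-26B (InternViT) has 25 attention heads, and 25 has no even divisor, so any power-of-two tensor_parallel_size hits this check. A minimal sketch of the failing logic (same idea as the divide/ensure_divisibility calls in the traceback, not the exact vLLM code):

# Sketch of the divisibility check that QKVParallelLinear runs when sharding heads.
def ensure_divisibility(numerator: int, denominator: int) -> None:
    assert numerator % denominator == 0, "{} is not divisible by {}".format(
        numerator, denominator)

def divide(numerator: int, denominator: int) -> int:
    ensure_divisibility(numerator, denominator)
    return numerator // denominator

vision_num_heads = 25        # head count of the vision tower, from the traceback above
tensor_parallel_size = 4     # value used in run_internvl() below
heads_per_rank = divide(vision_num_heads, tensor_parallel_size)
# AssertionError: 25 is not divisible by 4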
My code:

# InternVL
def run_internvl(question):
    model_name = "/home/tdj/model/InternVL2-26B"

    llm = LLM(
        model=model_name,
        # dtype="half",
        trust_remote_code=True,
        gpu_memory_utilization=0.9,
        tensor_parallel_size=4,
        max_num_batched_tokens=8192,
        max_model_len=4096,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    messages = [{'role': 'user', 'content': f"<image>\n{question}"}]
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)

    # Stop tokens for InternVL
    # model variants may have different stop tokens;
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B#service
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
    return llm, prompt, stop_token_ids
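For what it's worth, a quick check of the model config suggests that no tensor_parallel_size larger than 1 divides both the language-model and vision-tower head counts, so this seems to need a fix on the vLLM side rather than a different tp setting. A small sketch (the llm_config / vision_config attribute names assume the InternVL2 remote-code config layout):

# Sketch: list which tensor_parallel_size values both towers could be sharded across.
from transformers import AutoConfig

model_name = "/home/tdj/model/InternVL2-26B"
cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True)

llm_heads = cfg.llm_config.num_attention_heads       # language-model head count
vit_heads = cfg.vision_config.num_attention_heads    # vision-tower head count (25 here)

for tp in (1, 2, 4, 5, 8):
    ok = (llm_heads % tp == 0) and (vit_heads % tp == 0)
    print(f"tp={tp}: llm_heads={llm_heads} vit_heads={vit_heads} usable={ok}")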
Before submitting a new issue...
[X] Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the documentation page, which can answer lots of frequently asked questions.