THUDM / GLM-4

GLM-4 series: Open Multilingual Multimodal Chat LMs | 开源多语言多模态对话模型
Apache License 2.0

GLM4V raises an error when run in batch mode #105

Closed wciq1208 closed 4 months ago

wciq1208 commented 5 months ago

System Info / 系統信息

Container: pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime, GPU: a single RTX 3090, Python: 3.10, transformers: 4.41.2

Who can help? / 谁可以帮助到您?

No response

Information / 问题信息

Reproduction / 复现过程

Code:

import os

import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer, StoppingCriteria, StoppingCriteriaList


class StopOnTokens(StoppingCriteria):
    """Stop generation once the last generated token is one of the model's EOS token ids."""

    def __init__(self, m) -> None:
        self.model = m
        super().__init__()

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        stop_ids = self.model.config.eos_token_id
        for stop_id in stop_ids:
            if input_ids[0][-1] == stop_id:
                return True
        return False

def test():
    # ROOT_DIR is defined elsewhere in the reporter's project
    model_name_or_path = os.path.join(ROOT_DIR, "model", "glm-4v-9b")
    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path,
        trust_remote_code=True,
        encode_special_tokens=True
    )
    # 4-bit load on a single GPU (load_in_4bit is deprecated; see the warning in the output below)
    model = AutoModel.from_pretrained(
        model_name_or_path,
        trust_remote_code=True,
        device_map="cuda:1",
        torch_dtype=torch.bfloat16,
        load_in_4bit=True
    ).eval()
    # Two prompts of different lengths: "Is there any text in the image?" / "Summarize the image"
    txt_list = ["图片里有什么文字信息吗", "给图片做个总结"]
    # One single-turn conversation per prompt, each attaching the same image
    messages = [[{"role": "user", "content": txt, "image": Image.open("123.jpg").convert("RGB")}] for txt in txt_list]
    print(messages)
    # Tokenize both conversations as one batch; padding=True pads the shorter prompt up to the longer one
    model_inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
        return_dict=True,
        padding=True
    ).to("cuda:1")
    generate_kwargs = {
        **model_inputs,
        "do_sample": True,
        "top_p": .8,
        "temperature": .6,
        "stopping_criteria": StoppingCriteriaList([StopOnTokens(model)]),
        "repetition_penalty": 1.2,
        "eos_token_id": [151329, 151336, 151338],
        "max_new_tokens": 4096
    }
    generated_ids = model.generate(**generate_kwargs)
    # Strip the prompt tokens so only the newly generated continuation is decoded
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    print(response)

if __name__ == "__main__":
    test()
    exit()

Output and error:

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The load_in_4bit and load_in_8bit arguments are deprecated and will be removed in the future versions. Please, pass a BitsAndBytesConfig object in quantization_config argument instead.
Loading checkpoint shards: 100%|██████████| 15/15 [00:17<00:00, 1.19s/it]
[[{'role': 'user', 'content': '图片里有什么文字信息吗', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=2543x1308 at 0x7FBF1F948CA0>}], [{'role': 'user', 'content': '给图片做个总结', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=2543x1308 at 0x7FBF1698EA10>}]]
Traceback (most recent call last):
  File "/hestia/src/adapter/chatglm.py", line 233, in <module>
    test()
  File "/hestia/src/adapter/chatglm.py", line 224, in test
    generated_ids = model.generate(**generate_kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/transformers/generation/utils.py", line 1758, in generate
    result = self._sample(
  File "/opt/conda/lib/python3.10/site-packages/transformers/generation/utils.py", line 2397, in _sample
    outputs = self(
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/glm-4v-9b/modeling_chatglm.py", line 1017, in forward
    transformer_outputs = self.transformer(
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/glm-4v-9b/modeling_chatglm.py", line 906, in forward
    hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/glm-4v-9b/modeling_chatglm.py", line 664, in forward
    layer_ret = layer(
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/glm-4v-9b/modeling_chatglm.py", line 567, in forward
    attention_output, kv_cache = self.self_attention(
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/glm-4v-9b/modeling_chatglm.py", line 464, in forward
    context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/glm-4v-9b/modeling_chatglm.py", line 250, in forward
    context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
RuntimeError: The expanded size of the tensor (1613) must match the existing size (14) at non-singleton dimension 3. Target sizes: [2, 32, 1613, 1613]. Tensor sizes: [2, 1, 14, 14]
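As a side note, the deprecation warning in the log suggests passing the 4-bit setting through quantization_config rather than load_in_4bit. A minimal sketch of that change, assuming the same model_name_or_path as in the reproduction and that bitsandbytes is installed:

from transformers import AutoModel, BitsAndBytesConfig
import torch

# Sketch only: replaces the deprecated load_in_4bit=True flag with an explicit
# BitsAndBytesConfig, as suggested by the warning above.
quant_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModel.from_pretrained(
    model_name_or_path,          # same local path as in the reproduction code
    trust_remote_code=True,
    device_map="cuda:1",
    torch_dtype=torch.bfloat16,
    quantization_config=quant_config,
).eval()

This only silences the deprecation warning; it does not change the shape error above.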

Expected behavior / 期待表现

Batched inference should work correctly.

woskii commented 4 months ago

I ran into the same problem. When the prompts are the same length, inference works fine; when the lengths differ, inference after padding fails with: RuntimeError: The expanded size of the tensor (1626) must match the existing size (27) at non-singleton dimension 3. Target sizes: [2, 32, 1626, 1626]. Tensor sizes: [2, 1, 27, 27]
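Until the model code is fixed, one way around the crash is to avoid the padded batch entirely and generate one conversation at a time. A rough, untested sketch reusing tokenizer, model and messages from the reproduction above:

# Workaround sketch: no batching, so no padding and no mismatched attention mask.
responses = []
for msgs in messages:
    inputs = tokenizer.apply_chat_template(
        msgs,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
        return_dict=True
    ).to("cuda:1")
    out = model.generate(
        **inputs,
        max_new_tokens=4096,
        eos_token_id=[151329, 151336, 151338]
    )
    # Decode only the tokens generated after the prompt
    responses.append(tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
print(responses)

This trades throughput for correctness, since each prompt is processed in its own forward pass.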

zRzRzRzRzRzRzR commented 4 months ago

Fixed.

duzx16 commented 4 months ago

Fixed: https://huggingface.co/THUDM/glm-4v-9b/commit/f81daa39cae87b8373ae28f52851b2c2e94d7e4e
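Note that the fix lives in the remote code (modeling_chatglm.py loaded via trust_remote_code), so a stale local copy or cache keeps the old behavior. A hedged sketch of pulling the patched revision with huggingface_hub; the local directory used in the reproduction would need the same updated file:

# Sketch: fetch the repository at the fix commit linked above so the updated
# modeling_chatglm.py is picked up (downloads into the Hugging Face cache).
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="THUDM/glm-4v-9b",
    revision="f81daa39cae87b8373ae28f52851b2c2e94d7e4e",
)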