ChatGLM3-6b 回复中内容问题

Bannerli commented 5 months ago

背景：采用fastchat加载了ChatGLM3-6b，用openai-api的方式调用。效果：调用过程正常，但是数据后处理似乎有问题

我排查过应该不是prompt的问题，然后langchain-chatchat的本地部署也有类似的问题。

Bannerli commented 5 months ago

有大量的 <|user|> <|assistant|>

Bannerli commented 5 months ago

截屏2024-01-30 17 20 19

hanbingmew commented 5 months ago

一种可行的改法：(使用vllm worker启动chatglm3-6b-32k) conversation.py里面chatglm3的template直接返回messages: fastchat/conversation.py

        elif self.sep_style == SeparatorStyle.CHATGLM3:
            # ret = ""
            # if self.system_message:
            #     ret += system_prompt
            # for role, message in self.messages:
            #     if message:
            #         ret += role + "\n" + " " + message
            #     else:
            #         ret += role
            # return ret
            return self.messages

vllm_worker.py里面根据messages生成history，query，调用tokenizer.build_chat_input方法生成正确的input_id，然后在调用engine.generate时使用转换后的input_id而不是原始的prompt字符串。最后需要处理一下输出。具体代码如下： fastchat/serve/vllm_worker.py

class VLLMWorker(BaseModelWorker):
    def __init__(
        self,
        controller_addr: str,
        worker_addr: str,
        worker_id: str,
        model_path: str,
        model_names: List[str],
        limit_worker_concurrency: int,
        no_register: bool,
        llm_engine: AsyncLLMEngine,
        conv_template: str,
    ):
        super().__init__(
            controller_addr,
            worker_addr,
            worker_id,
            model_path,
            model_names,
            limit_worker_concurrency,
            conv_template,
        )

        logger.info(
            f"Loading the model {self.model_names} on worker {worker_id}, worker type: vLLM worker..."
        )
        self.tokenizer = llm_engine.engine.tokenizer
        self.context_len = get_context_length(llm_engine.engine.model_config.hf_config)
        #针对chatglm3特殊处理
        self.is_chatglm3 = 'chatglm3' in model_path

        if not no_register:
            self.init_heart_beat()

    async def generate_stream(self, params):
        self.call_ct += 1

        context = params.pop("prompt")
        #根据messages构建history和query，调用build_chat_input方法获取input_id
        if self.is_chatglm3:
            messages = context
            hist = []
            for i in range(0, len(messages), 2):
                hist.append({"role":"user", "content": messages[i][1]})
                hist.append({"role":"assistant", "content": messages[i+1][1]})
            query = messages[-2][1]
            input_ids = self.tokenizer.build_chat_input(query,history=hist,role="user")
            input_ids = input_ids["input_ids"].tolist()[0]
        request_id = params.pop("request_id")
        temperature = float(params.get("temperature", 1.0))
        top_p = float(params.get("top_p", 1.0))
        top_k = params.get("top_k", -1.0)
        presence_penalty = float(params.get("presence_penalty", 0.0))
        frequency_penalty = float(params.get("frequency_penalty", 0.0))
        max_new_tokens = params.get("max_new_tokens", 256)
        stop_str = params.get("stop", None)
        stop_token_ids = params.get("stop_token_ids", None) or []
        if self.tokenizer.eos_token_id is not None:
            stop_token_ids.append(self.tokenizer.eos_token_id)
        echo = params.get("echo", True)
        use_beam_search = params.get("use_beam_search", False)
        best_of = params.get("best_of", None)

        # Handle stop_str
        stop = set()
        if isinstance(stop_str, str) and stop_str != "":
            stop.add(stop_str)
        elif isinstance(stop_str, list) and stop_str != []:
            stop.update(stop_str)

        for tid in stop_token_ids:
            if tid is not None:
                stop.add(self.tokenizer.decode(tid))

        # make sampling params in vllm
        top_p = max(top_p, 1e-5)
        if temperature <= 1e-5:
            top_p = 1.0

        sampling_params = SamplingParams(
            n=1,
            temperature=temperature,
            top_p=top_p,
            use_beam_search=use_beam_search,
            stop=list(stop),
            stop_token_ids=stop_token_ids,
            max_tokens=max_new_tokens,
            top_k=top_k,
            presence_penalty=presence_penalty,
            frequency_penalty=frequency_penalty,
            best_of=best_of,
        )
        #chatglm3不输入字符串prompt，输入转换后的input_ids
        if self.is_chatglm3:
            results_generator = engine.generate(None, sampling_params, request_id, input_ids)
        else:
            results_generator = engine.generate(context, sampling_params, request_id)

        async for request_output in results_generator:
            prompt = request_output.prompt
            if echo:
                text_outputs = [
                    prompt + output.text for output in request_output.outputs
                ]
            else:
                text_outputs = [output.text for output in request_output.outputs]
            text_outputs = " ".join(text_outputs)

            partial_stop = any(is_partial_stop(text_outputs, i) for i in stop)
            # prevent yielding partial stop sequence
            if partial_stop:
                continue

            prompt_tokens = len(request_output.prompt_token_ids)
            completion_tokens = sum(
                len(output.token_ids) for output in request_output.outputs
            )
            #后处理生成的结果
            if self.is_chatglm3:
                temp = text_outputs.split("\n",maxsplit=1)
                text_outputs = temp[-1].strip().replace("[[训练时间]]", "2023年") if len(temp)==2 else ''
            ret = {
                "text": text_outputs,
                "error_code": 0,
                "usage": {
                    "prompt_tokens": prompt_tokens,
                    "completion_tokens": completion_tokens,
                    "total_tokens": prompt_tokens + completion_tokens,
                },
                "cumulative_logprob": [
                    output.cumulative_logprob for output in request_output.outputs
                ],
                "finish_reason": request_output.outputs[0].finish_reason
                if len(request_output.outputs) == 1
                else [output.finish_reason for output in request_output.outputs],
            }
            # Emit twice here to ensure a 'finish_reason' with empty content in the OpenAI API response.
            # This aligns with the behavior of model_worker.
            if request_output.finished:
                yield (json.dumps(ret | {"finish_reason": None}) + "\0").encode()
            yield (json.dumps(ret) + "\0").encode()

    async def generate(self, params):
        async for x in self.generate_stream(params):
            pass
        return json.loads(x[:-1].decode())

这样修改完之后就和原版chatglm3的前后处理方式保持一致了，输出的内容也是正常的。

不使用vllm worker: 如果你不使用vllm worker，那么可以直接根据messages得到query和history，调用tokenizer的build_chat_input方法生成inputs。需要修改fastchat/model/model_chatglm.py:

@torch.inference_mode()
def generate_stream_chatglm(
    model,
    tokenizer,
    params,
    device,
    context_len=2048,
    stream_interval=2,
    judge_sent_end=False,
):
    prompt = params["prompt"]
    temperature = float(params.get("temperature", 1.0))
    repetition_penalty = float(params.get("repetition_penalty", 1.0))
    top_p = float(params.get("top_p", 1.0))
    max_new_tokens = int(params.get("max_new_tokens", 256))
    echo = params.get("echo", True)

    # 针对chatglm3使用tokenizer的build_chat_input方法生成inputs
    is_chatglm3 = "chatglm3" in params["model"]
    if is_chatglm3:
        messages = prompt
        hist = []
        for i in range(0, len(messages), 2):
            hist.append({"role": "user", "content": messages[i][1]})
            hist.append({"role": "assistant", "content": messages[i + 1][1]})
        query = messages[-2][1]
        inputs = tokenizer.build_chat_input(query, history=hist, role="user").to(model.device)
    else:
        inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    input_echo_len = len(inputs["input_ids"][0])

    gen_kwargs = {
        "max_length": max_new_tokens + input_echo_len,
        "do_sample": True if temperature > 1e-5 else False,
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "logits_processor": [invalid_score_processor],
    }
    if temperature > 1e-5:
        gen_kwargs["temperature"] = temperature

    total_len = 0
    for total_ids in model.stream_generate(**inputs, **gen_kwargs):
        total_ids = total_ids.tolist()[0]
        total_len = len(total_ids)
        if echo:
            output_ids = total_ids
        else:
            output_ids = total_ids[input_echo_len:]
        response = tokenizer.decode(output_ids)
        response = process_response(response)

        yield {
            "text": response,
            "usage": {
                "prompt_tokens": input_echo_len,
                "completion_tokens": total_len - input_echo_len,
                "total_tokens": total_len,
            },
            "finish_reason": None,
        }

参考资料： https://huggingface.co/THUDM/chatglm3-6b-32k/blob/main/modeling_chatglm.py https://huggingface.co/THUDM/chatglm3-6b-32k/blob/main/tokenization_chatglm.py

hanbingmew commented 5 months ago

根据上面的方案修改完之后模型输出内容正常：微信图片_20240205021929

p208p2002 commented 4 months ago

我嘗試對chatglm3發出PR修復了 tokenizer.encode 無法識別 specail token 問題，可以再看看問題還是否存在 https://huggingface.co/THUDM/chatglm3-6b/discussions/36

lm-sys / FastChat

ChatGLM3-6b 回复中内容问题 #2982