meta-llama / llama3

The official Meta Llama 3 GitHub site

Response begins with .MixedReality! SGlobal! urdu! #246

Closed. YueChenkkk closed this issue 1 month ago.

YueChenkkk commented 1 month ago

I tried Llama-3-70B-instruct, and sometimes it returned a string that starts with meaningless tokens such as ".MixedReality!", "SGlobal!", or "urdu!".

Has anyone encountered the same problem?
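
A minimal repro sketch, roughly following the repo's example_chat_completion.py (the paths, prompt, and sampling settings below are placeholders, not the exact setup used here):

from typing import List

from llama import Dialog, Llama

# Launch with torchrun (e.g. --nproc_per_node 8 for the 70B checkpoints), as in the repo README.
generator = Llama.build(
    ckpt_dir="Meta-Llama-3-70B-Instruct/",                        # placeholder path
    tokenizer_path="Meta-Llama-3-70B-Instruct/tokenizer.model",   # placeholder path
    max_seq_len=2048,
    max_batch_size=1,
)

dialogs: List[Dialog] = [
    [{"role": "user", "content": "Give me a short summary of the plot of Hamlet."}],
]

results = generator.chat_completion(
    dialogs,
    max_gen_len=512,
    temperature=0.6,
    top_p=0.9,
)

# Occasionally the returned content begins with junk such as ".MixedReality!" or "SGlobal!".
print(results[0]["generation"]["content"])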

YueChenkkk commented 1 month ago

I removed tokens.extend(self.encode_header({"role": "assistant", "content": ""})) from ChatFormat.encode_dialog_prompt in llama.tokenizer, and the problem seems to be solved. For reference, the original class is:

class ChatFormat:
    def __init__(self, tokenizer: Tokenizer):
        self.tokenizer = tokenizer

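    # Header = <|start_header_id|> + role + <|end_header_id|> + "\n\n".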
    def encode_header(self, message: Message) -> List[int]:
        tokens = []
        tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"])
        tokens.extend(self.tokenizer.encode(message["role"], bos=False, eos=False))
        tokens.append(self.tokenizer.special_tokens["<|end_header_id|>"])
        tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False))
        return tokens

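    # Full message = header + stripped content + <|eot_id|>.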
    def encode_message(self, message: Message) -> List[int]:
        tokens = self.encode_header(message)
        tokens.extend(
            self.tokenizer.encode(message["content"].strip(), bos=False, eos=False)
        )
        tokens.append(self.tokenizer.special_tokens["<|eot_id|>"])
        return tokens

    def encode_dialog_prompt(self, dialog: Dialog) -> List[int]:
        tokens = []
        tokens.append(self.tokenizer.special_tokens["<|begin_of_text|>"])
        for message in dialog:
            tokens.extend(self.encode_message(message))
        # Add the start of an assistant message for the model to complete.
        tokens.extend(self.encode_header({"role": "assistant", "content": ""}))
        return tokens