deepseek-ai / DeepSeek-Coder

DeepSeek Coder: Let the Code Write Itself
https://coder.deepseek.com/
MIT License

Potential bug: EOS token mismatch #74

Closed rookielxy closed 6 months ago

rookielxy commented 6 months ago

While executing the following code:

import os

os.environ['CUDA_VISIBLE_DEVICES'] = "6,7"  # restrict to two GPUs for tensor parallelism

from vllm import LLM, SamplingParams
from datasets import load_dataset
from transformers import AutoTokenizer
N_SHOTS = 0

def build_dataset_deepseek_coder_33b():
    # Build MBPP test prompts with the DeepSeek Coder chat template.
    dataset = load_dataset("mbpp", split="test")
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-33b-instruct")

    prompt_template = "You are an expert Python programmer, and here is your task: {task}. Your code should pass these tests: \n\n{tests}\n"
    examples = [{"role": "system", "content": "You are a helpful AI programming assistant."}]
    if N_SHOTS > 0:
        # Prepend few-shot examples taken from the MBPP prompt split.
        example_dataset = load_dataset("mbpp", split="prompt")
        for x in example_dataset.select(range(N_SHOTS)):
            examples += [
                {"role": "user", "content": prompt_template.format(task=x["text"], tests="\n".join(x["test_list"]))},
                {"role": "assistant", "content": "```python\n" + x["code"] + "\n```"}
            ]

    def map_func(example):
        task_id = f'MBPP/{example["task_id"]}'
        content = prompt_template.format(task=example["text"], tests="\n".join(example["test_list"]))
        if N_SHOTS == 0:
            content += "\nCode should be written in a markdown codeblock and NO explanation is required. Talk is easy, show me the code!"
        message = [{
            "role": "user", "content": content
        }]
        prompt = tokenizer.apply_chat_template(examples + message, tokenize=False)
        return {
            "task_id": task_id,
            "prompt": prompt
        }

    dataset = dataset.map(map_func, remove_columns=dataset.column_names)
    return dataset

dataset = build_dataset_deepseek_coder_33b()

llm = LLM(model="deepseek-ai/deepseek-coder-33b-instruct", 
          tensor_parallel_size=2, 
          gpu_memory_utilization=0.8,
          max_model_len=8192)

sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1024)

prompts = dataset['prompt'][:10]
outputs = llm.generate(prompts, sampling_params)
print(outputs[0].outputs[0].text)

I got responses with trailing <|EOT|> tokens:

def remove_Occ(s, c):
    first_index = s.find(c)
    last_index = s.rfind(c)

    if first_index != -1 and last_index != -1:
        if first_index == last_index:
            return s[:first_index] + s[first_index + 1:]
        else:
            return s[:first_index] + s[first_index + 1:last_index] + s[last_index + 1:]
    return s

assert remove_Occ("hello","l") == "heo"
assert remove_Occ("abcda","a") == "bcd"
assert remove_Occ("PHP","P") == "H"
<|EOT|>
<|EOT|>
<|EOT|>
<|EOT|>
<|EOT|>
<|EOT|>
<|EOT|>
<|EOT|>
<|EOT|>
...
<|EOT|>
<|EOT|>
<|EOT|>

I found that the model emits the <|EOT|> token to mark the end of its sequence, but the tokenizer's EOS token is set to <|end▁of▁sentence|>, so generation never stops and keeps producing <|EOT|> until max_tokens is reached. This behavior may be due to a recent update to the Hugging Face repository that changed the EOS token.
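
Until the repository config is corrected, a possible workaround is to pass <|EOT|> as an explicit stop condition to vLLM. This is only a sketch based on the output above; it assumes <|EOT|> is a single token in the vocabulary:

# Workaround sketch: stop generation on <|EOT|> explicitly until the
# tokenizer's eos_token is fixed upstream (assumes <|EOT|> is one vocab token).
from transformers import AutoTokenizer
from vllm import SamplingParams

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-33b-instruct")
eot_id = tokenizer.convert_tokens_to_ids("<|EOT|>")

sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=1024,
    stop=["<|EOT|>"],          # stop on the literal string
    stop_token_ids=[eot_id],   # also stop on the token id, if present
)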

pkuzqh commented 6 months ago

Thanks! We have fixed it.

txy6666yr commented 6 months ago

Thanks! We have fixed it.

How can this issue be resolved?

pkuzqh commented 6 months ago

I have changed the tokenizer_config.json.
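
For anyone still seeing the trailing tokens from a stale local copy, a quick check (a sketch, assuming the fix sets eos_token to <|EOT|> in tokenizer_config.json) is to re-download the tokenizer and inspect its EOS token:

# Sketch: re-fetch the tokenizer and confirm its EOS token now matches what the
# model emits; assumes the updated tokenizer_config.json sets eos_token to <|EOT|>.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "deepseek-ai/deepseek-coder-33b-instruct",
    force_download=True,  # bypass a stale cached config
)
print(tokenizer.eos_token)     # expected after the fix: <|EOT|>
print(tokenizer.eos_token_id)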