While executing the following code:
```python
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "6,7"

from vllm import LLM, SamplingParams
from datasets import load_dataset
from transformers import AutoTokenizer

N_SHOTS = 0

def build_dataset_deepseek_coder_33b():
    dataset = load_dataset("mbpp", split="test")
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-33b-instruct")
    prompt_template = "You are an expert Python programmer, and here is your task: {task}. Your code should pass these tests: \n\n{tests}\n"
    examples = [{"role": "system", "content": "You are a helpful AI programming assistant."}]
    if N_SHOTS > 0:
        example_dataset = load_dataset("mbpp", split="prompt")
        # select() iterates over rows; slicing a Dataset would yield column names
        for x in example_dataset.select(range(N_SHOTS)):
            examples += [
                {"role": "user", "content": prompt_template.format(task=x["text"], tests="\n".join(x["test_list"]))},
                {"role": "assistant", "content": "```python\n" + x["code"] + "\n```"},
            ]

    def map_func(example):
        task_id = f'MBPP/{example["task_id"]}'
        content = prompt_template.format(task=example["text"], tests="\n".join(example["test_list"]))
        if N_SHOTS == 0:
            content += "\nCode should be written in a markdown codeblock and NO explanation is required. Talk is easy, show me the code!"
        message = [{"role": "user", "content": content}]
        prompt = tokenizer.apply_chat_template(examples + message, tokenize=False)
        return {"task_id": task_id, "prompt": prompt}

    dataset = dataset.map(map_func, remove_columns=dataset.column_names)
    return dataset

dataset = build_dataset_deepseek_coder_33b()
llm = LLM(
    model="deepseek-ai/deepseek-coder-33b-instruct",
    tensor_parallel_size=2,
    gpu_memory_utilization=0.8,
    max_model_len=8192,
)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1024)
prompts = dataset['prompt'][:10]
outputs = llm.generate(prompts, sampling_params)
print(outputs[0].outputs[0].text)
```
I got the following response, with trailing `<EOT>` tokens:
```
<EOT>
def remove_Occ(s, c):
    first_index = s.find(c)
    last_index = s.rfind(c)
    if first_index != -1 and last_index != -1:
        if first_index == last_index:
            return s[:first_index] + s[first_index + 1:]
        else:
            return s[:first_index] + s[first_index + 1:last_index] + s[last_index + 1:]
    return s

assert remove_Occ("hello","l") == "heo"
assert remove_Occ("abcda","a") == "bcd"
assert remove_Occ("PHP","P") == "H"
<|EOT|> <|EOT|> <|EOT|> <|EOT|> <|EOT|> <|EOT|> <|EOT|> <|EOT|> <|EOT|> ... <|EOT|> <|EOT|> <|EOT|>
```
I found that the model generates the `<EOT>` token to mark the end of the sequence, but the eos token is set to `<|end▁of▁sentence|>`. This behavior may be due to the latest update to the Hugging Face repository, which changed the eos token.
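Until the upstream config is fixed, one possible workaround is to tell vLLM to stop on the string the model actually emits. This is a minimal sketch using the `stop` argument of `SamplingParams`, assuming `<|EOT|>` is the terminator seen in the output above:

```python
from vllm import SamplingParams

# Workaround sketch: stop generation on the <|EOT|> marker the model emits,
# since the configured eos token (<|end▁of▁sentence|>) is never produced.
sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=1024,
    stop=["<|EOT|>"],  # stop strings are excluded from the returned text by default
)
```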
Thanks! We have fixed it.
How can this issue be resolved?
I changed `tokenizer_config.json`.
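For reference, a local patch along these lines should have the same effect without editing the cached file by hand. This is an illustrative sketch, not an official recipe; the local path is hypothetical and `<|EOT|>` is assumed to be the intended terminator, per the discussion above:

```python
from transformers import AutoTokenizer

# Sketch: repoint the eos token at the marker the model actually emits,
# then save a patched copy of the tokenizer locally.
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-33b-instruct")
tokenizer.eos_token = "<|EOT|>"  # assumption: <|EOT|> is the intended eos token
tokenizer.save_pretrained("./deepseek-coder-33b-instruct-patched")  # hypothetical path
```

Afterwards, passing `tokenizer="./deepseek-coder-33b-instruct-patched"` to `LLM(...)` should make vLLM pick up the corrected eos token.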