PaddlePaddle / PaddleNLP

👑 Easy-to-use and powerful NLP and LLM library with 🤗 Awesome model zoo, supporting a wide range of NLP tasks from research to industrial applications, including 🗂 Text Classification, 🔍 Neural Search, ❓ Question Answering, ℹ️ Information Extraction, 📄 Document Intelligence, 💌 Sentiment Analysis, etc.
https://paddlenlp.readthedocs.io
Apache License 2.0

[Bug]: AssertionError: All tokenizer files should be in the same directory #9156

Open wangzy0327 opened 1 month ago

wangzy0327 commented 1 month ago

Software Environment


I am trying to run inference for a GPT model with paddlenlp, using the model gpt-cpm-small-cn-distill. Below is the script I ran:

benchmark_gpt.py

import os
import time
import paddle  
from paddle import nn  
from paddlenlp.transformers import BertModel, BertTokenizer, ErnieModel, ErnieTokenizer
from paddlenlp.transformers import GPTTokenizer, GPTLMHeadModel
from paddlenlp.transformers import Llama3Tokenizer, LlamaModel
import numpy as np
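
# NOTE: set_flags and to_cinn_net are called below but were not defined in the
# snippet as posted; they come from the reporter's full benchmark file. The
# stubs below are a hedged reconstruction based on Paddle's documented CINN
# switches (paddle.set_flags and BuildStrategy.build_cinn_pass), not the
# reporter's original code.
def set_flags(use_cinn):
    paddle.set_flags({"FLAGS_use_cinn": use_cinn})

def to_cinn_net(net):
    build_strategy = paddle.static.BuildStrategy()
    build_strategy.build_cinn_pass = True
    return paddle.jit.to_static(net, build_strategy=build_strategy)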

def benchmark(net, input_ids, token_type_ids, repeat=5, warmup=3):
    # warm up
    for _ in range(warmup):
        net(input_ids, token_type_ids)
        paddle.device.synchronize()
    # time
    t = []
    for _ in range(repeat):
        t1 = time.time()
        net(input_ids, token_type_ids)
        paddle.device.synchronize()
        t2 = time.time()
        t.append((t2 - t1)*1000)
    print("--[benchmark] Run for %d times, the average latency is: %f ms" % (repeat, np.mean(t)))    

class TestBase:
    def __init__(self):
        device_info = paddle.get_device()
        print("Current Paddle device : %s"%(device_info))
        self.net = None
        self.input = None
        self.cinn_net = None

    def to_eval(self, use_cinn):
        set_flags(use_cinn)
        if use_cinn:
            if not self.cinn_net:
                self.cinn_net = to_cinn_net(self.net)
            net = self.cinn_net
        else:
            net = self.net
        net.eval()
        return net

    def eval(self, use_cinn):
        net = self.to_eval(use_cinn)
        out = net(self.input)
        return out

    def check_cinn_output(self):
        pd_out = self.eval(use_cinn=False)
        cinn_out = self.eval(use_cinn=True)
        np.testing.assert_allclose(
            cinn_out.numpy(), pd_out.numpy(), atol=1e-3, rtol=1e-3
        )
        print("--[check_cinn_output] cinn result right.")

    def benchmark(self, use_cinn):
        print("--[benchmark] benchmark %s" % ("cinn" if use_cinn else "nocinn"))
        net = self.to_eval(use_cinn)
        benchmark(net, self.input)

class TestGPT(TestBase):
    def __init__(self, batch_size=1):
        super().__init__()
        max_seq_length = 1024  # maximum sequence length
        model_name = 'gpt-cpm-small-cn-distill'
        self.net = GPTLMHeadModel.from_pretrained(model_name)
        self.tokenizer = GPTTokenizer.from_pretrained(model_name)

        # Build input data by tokenizing a sample sentence
        encoded_text = self.tokenizer(text="请输入测试样例")
        self.input_ids = paddle.to_tensor([encoded_text['input_ids']])
        self.token_type_ids = paddle.to_tensor([encoded_text['token_type_ids']])        

    def to_eval(self, use_cinn):
        set_flags(use_cinn)
        if use_cinn:
            if not self.cinn_net:
                self.cinn_net = to_cinn_net(self.net)
            net = self.cinn_net
        else:
            net = self.net
        net.eval()
        return net

    def eval(self, use_cinn):
        net = self.to_eval(use_cinn)
        out = net(self.input_ids, self.token_type_ids)
        return out

    def check_cinn_output(self):
        pd_out = self.eval(use_cinn=False)
        cinn_out = self.eval(use_cinn=True)
        np.testing.assert_allclose(
            cinn_out.last_hidden_state.numpy(), pd_out.last_hidden_state.numpy(), atol=1e-3, rtol=1e-3
        )
        print("--[check_cinn_output] cinn result right.")

    def benchmark(self, use_cinn):
        print("--[benchmark] benchmark %s" % ("cinn" if use_cinn else "nocinn"))
        net = self.to_eval(use_cinn)
        benchmark(net, self.input_ids, self.token_type_ids)

if __name__ == "__main__":
    print("Test GPT Model gpt-cpm-small ........")
    model = TestGPT()       
    model.benchmark(use_cinn=False)

The output after execution is:

/home/wzy/.local/lib/python3.8/site-packages/_distutils_hack/__init__.py:26: UserWarning: Setuptools is replacing distutils.
  warnings.warn("Setuptools is replacing distutils.")
Test GPT Model gpt-cpm-small ........
Current Paddle device : gpu:0
[2024-09-20 01:37:50,268] [    INFO] - Loading weights file from cache at /home/wzy/.paddlenlp/models/gpt-cpm-large-cn/model_state.pdparams
[2024-09-20 01:38:03,948] [    INFO] - Loaded weights file from disk, setting weights to model.
W0920 01:38:03.950428 12651 gpu_resources.cc:119] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2
W0920 01:38:03.953485 12651 gpu_resources.cc:164] device: 0, cuDNN Version: 8.1.
[2024-09-20 01:38:30,694] [    INFO] - All model checkpoint weights were used when initializing GPTForCausalLM.

[2024-09-20 01:38:30,695] [ WARNING] - Some weights of GPTForCausalLM were not initialized from the model checkpoint at gpt-cpm-large-cn and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2024-09-20 01:38:30,995] [    INFO] - Generation config file not found, using a generation config created from the model config.
Traceback (most recent call last):
  File "benchmark_lstm.py", line 297, in <module>
    model = TestGPT()
  File "benchmark_lstm.py", line 241, in __init__
    self.tokenizer = GPTTokenizer.from_pretrained(model_name)
  File "/home/wzy/.local/lib/python3.8/site-packages/paddlenlp/transformers/tokenizer_utils.py", line 709, in from_pretrained
    tokenizer, tokenizer_config_file_dir = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
  File "/home/wzy/.local/lib/python3.8/site-packages/paddlenlp/transformers/tokenizer_utils_base.py", line 1515, in from_pretrained
    assert len(tokenizer_config_file_dir_list) > 0, "All tokenizer files should be in the same directory."
AssertionError: All tokenizer files should be in the same directory.
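
For context, this assertion fires when from_pretrained cannot resolve any tokenizer resource files for the checkpoint: GPTTokenizer is the BPE tokenizer for the English GPT checkpoints and looks for vocab.json / merges.txt, which the Chinese CPM checkpoints apparently do not ship, so the list of resolved directories ends up empty. A quick diagnostic sketch (assuming AutoTokenizer's built-in name mapping covers this checkpoint) to see which tokenizer class the model actually expects:

from paddlenlp.transformers import AutoTokenizer

# Let AutoTokenizer resolve the tokenizer class registered for this checkpoint.
tok = AutoTokenizer.from_pretrained("gpt-cpm-small-cn-distill")
print(type(tok))  # expected: GPTChineseTokenizer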

Duplicate Issue

Error Description

Same error output as shown above.

Steps to Reproduce & Code

Run the benchmark_gpt.py script listed in full at the top of this report.
DrownFish19 commented 1 week ago

gpt-cpm-small-cn and gpt-cpm-small-cn-distill need to use GPTChineseTokenizer.
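
A minimal sketch of that fix, swapping in GPTChineseTokenizer (the sentencepiece-based tokenizer used by the CPM checkpoints); whether its output includes token_type_ids is not verified here, so only input_ids is passed to the model:

import paddle
from paddlenlp.transformers import GPTChineseTokenizer, GPTLMHeadModel

model_name = "gpt-cpm-small-cn-distill"
tokenizer = GPTChineseTokenizer.from_pretrained(model_name)
model = GPTLMHeadModel.from_pretrained(model_name)
model.eval()

# Tokenize the same sample sentence as in the reporter's script.
encoded = tokenizer("请输入测试样例")
input_ids = paddle.to_tensor([encoded["input_ids"]])
with paddle.no_grad():
    logits = model(input_ids)
print(logits.shape)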