chatchat-space / Langchain-Chatchat

Langchain-Chatchat (formerly Langchain-ChatGLM): a local-knowledge-based RAG and Agent application built with Langchain and LLMs such as ChatGLM, Qwen, and Llama.
Apache License 2.0

[BUG] Docker deployment of Tongyi Qianwen Qwen/Qwen1.5-14B-Chat fails on startup #3562

Closed · xuzhenjun130 closed this issue 5 months ago

xuzhenjun130 commented 5 months ago

Problem Description

docker run -v /home/ubuntu/custom_models/qwen-7b-chat:/Qwen/Qwen-7B-Chat -d --gpus all -p 80:8501 registry.cn-beijing.aliyuncs.com/chatchat/chatchat:0.2.7

The model was downloaded in advance from https://huggingface.co/Qwen/Qwen1.5-7B-Chat
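Note: Qwen1.5 checkpoints declare "model_type": "qwen2" in their config.json, which is what matters for the error below. A quick way to confirm what the downloaded checkpoint declares (a minimal sketch; the path is the host directory from the docker run command above):

import json

# Inspect the downloaded checkpoint's declared model type; Qwen1.5 checkpoints
# report "qwen2", a type only transformers >= 4.37.0 recognizes.
with open("/home/ubuntu/custom_models/qwen-7b-chat/config.json") as f:
    print(json.load(f)["model_type"])  # prints "qwen2" for a Qwen1.5 download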

Configuration file:

import os

# You can specify an absolute path here to store all Embedding and LLM models in one place.
# Each model can be a standalone directory, or a second-level subdirectory under one.
# If a model's directory name matches a key or value in MODEL_PATH, it is detected and
# loaded automatically, with no need to modify the paths in MODEL_PATH.
MODEL_ROOT_PATH = ""

# Name of the Embedding model to use
EMBEDDING_MODEL = "text2vec-bge-large-chinese" # bge-large-zh

# Device for the Embedding model. "auto" detects automatically; it can also be set manually to "cuda", "mps", or "cpu".
EMBEDDING_DEVICE = "auto"

# Configure these if you need to add custom keywords to EMBEDDING_MODEL
EMBEDDING_KEYWORD_FILE = "keywords.txt"
EMBEDDING_MODEL_OUTPUT_PATH = "output"

# Names of the LLMs to run; both local and online models may be listed.
# The first one is the default model for the API and the Web UI.
LLM_MODELS = ["chatglm2-6b", "zhipu-api", "openai-api", "Qwen-7B-Chat"]

# Name of the Agent model (optional; if set, it pins the model used by the Chain once inside the Agent; if unset, LLM_MODELS[0] is used)
Agent_MODEL = None

# Device for the LLM. "auto" detects automatically; it can also be set manually to "cuda", "mps", or "cpu".
LLM_DEVICE = "auto"

# Number of history dialogue turns
HISTORY_LEN = 3

# Maximum length supported by the LLM. If left unset, the model's default maximum is used; if set, this user-defined maximum applies.
MAX_TOKENS = None

# General LLM conversation parameters
TEMPERATURE = 0.7
# TOP_P = 0.95  # not yet supported by ChatOpenAI

ONLINE_LLM_MODEL = {
    # Online models. Set a different port for each online API in server_config.

    "openai-api": {
        "model_name": "gpt-35-turbo",
        "api_base_url": "https://api.openai.com/v1",
        "api_key": "",
        "openai_proxy": "",
    },

    # For registration and API-key setup, visit http://open.bigmodel.cn
    "zhipu-api": {
        "api_key": "",
        "version": "chatglm_turbo",  # 可选包括 "chatglm_turbo"
        "provider": "ChatGLMWorker",
    },

    # For registration and API-key setup, visit https://api.minimax.chat/
    "minimax-api": {
        "group_id": "",
        "api_key": "",
        "is_pro": False,
        "provider": "MiniMaxWorker",
    },

    # For registration and API-key setup, visit https://xinghuo.xfyun.cn/
    "xinghuo-api": {
        "APPID": "",
        "APISecret": "",
        "api_key": "",
        "version": "v1.5",  # 你使用的讯飞星火大模型版本,可选包括 "v3.0", "v1.5", "v2.0"
        "provider": "XingHuoWorker",
    },

    # Baidu Qianfan API; for how to apply, see https://cloud.baidu.com/doc/WENXINWORKSHOP/s/4lilb2lpf
    "qianfan-api": {
        "version": "ERNIE-Bot",  # 注意大小写。当前支持 "ERNIE-Bot" 或 "ERNIE-Bot-turbo", 更多的见官方文档。
        "version_url": "",  # 也可以不填写version,直接填写在千帆申请模型发布的API地址
        "api_key": "",
        "secret_key": "",
        "provider": "QianFanWorker",
    },

    # Volcano Ark API; see https://www.volcengine.com/docs/82379
    "fangzhou-api": {
        "version": "chatglm-6b-model",  # 当前支持 "chatglm-6b-model", 更多的见文档模型支持列表中方舟部分。
        "version_url": "",  # 可以不填写version,直接填写在方舟申请模型发布的API地址
        "api_key": "",
        "secret_key": "",
        "provider": "FangZhouWorker",
    },

    # Alibaba Cloud Tongyi Qianwen API; see https://help.aliyun.com/zh/dashscope/developer-reference/api-details
    "qwen-api": {
        "version": "qwen-turbo",  # 可选包括 "qwen-turbo", "qwen-plus"
        "api_key": "",  # 请在阿里云控制台模型服务灵积API-KEY管理页面创建
        "provider": "QwenWorker",
    },

    # Baichuan API; for how to apply, see https://www.baichuan-ai.com/home#api-enter
    "baichuan-api": {
        "version": "Baichuan2-53B",  # 当前支持 "Baichuan2-53B", 见官方文档。
        "api_key": "",
        "secret_key": "",
        "provider": "BaiChuanWorker",
    },

    # Azure API
    "azure-api": {
        "deployment_name": "",  # 部署容器的名字
        "resource_name": "",  # https://{resource_name}.openai.azure.com/openai/ 填写resource_name的部分,其他部分不要填写
        "api_version": "",  # API的版本,不是模型版本
        "api_key": "",
        "provider": "AzureWorker",
    },

}

# Modify the values in the dict below to specify where local embedding models are stored.
# Three ways to set this up:
# 1. Change a value to the model's absolute path.
# 2. Leave the value unchanged (taking text2vec as an example):
#       2.1 If any of the following subdirectories exists under {MODEL_ROOT_PATH}, it is used:
#           - text2vec
#           - GanymedeNil/text2vec-large-chinese
#           - text2vec-large-chinese
#       2.2 If none of those local paths exists, the Hugging Face model is used.
# (A simplified sketch of how these rules combine appears after this config.)
MODEL_PATH = {
    "embed_model": {
        "ernie-tiny": "nghuyong/ernie-3.0-nano-zh",
        "ernie-base": "nghuyong/ernie-3.0-base-zh",
        "text2vec-base": "shibing624/text2vec-base-chinese",
        "text2vec": "GanymedeNil/text2vec-large-chinese",
        "text2vec-paraphrase": "shibing624/text2vec-base-chinese-paraphrase",
        "text2vec-sentence": "shibing624/text2vec-base-chinese-sentence",
        "text2vec-multilingual": "shibing624/text2vec-base-multilingual",
        "text2vec-bge-large-chinese": "/text2vec-bge-large-chinese",
        "m3e-small": "moka-ai/m3e-small",
        "m3e-base": "moka-ai/m3e-base",
        "m3e-large": "moka-ai/m3e-large",
        "bge-small-zh": "BAAI/bge-small-zh",
        "bge-base-zh": "BAAI/bge-base-zh",
        "bge-large-zh": "BAAI/bge-large-zh",
        "bge-large-zh-noinstruct": "BAAI/bge-large-zh-noinstruct",
        "bge-base-zh-v1.5": "BAAI/bge-base-zh-v1.5",
        "bge-large-zh-v1.5": "BAAI/bge-large-zh-v1.5",
        "piccolo-base-zh": "sensenova/piccolo-base-zh",
        "piccolo-large-zh": "sensenova/piccolo-large-zh",
        "text-embedding-ada-002": "your OPENAI_API_KEY",
    },

    "llm_model": {
        # Some of the models below have not been fully tested; support is inferred from the fastchat and vllm model lists
        "chatglm2-6b": "/chatglm2-6b",
        "chatglm2-6b-32k": "THUDM/chatglm2-6b-32k",

        "baichuan2-13b": "baichuan-inc/Baichuan2-13B-Chat",
        "baichuan2-7b": "baichuan-inc/Baichuan2-7B-Chat",

        "baichuan-7b": "baichuan-inc/Baichuan-7B",
        "baichuan-13b": "baichuan-inc/Baichuan-13B",
        'baichuan-13b-chat': 'baichuan-inc/Baichuan-13B-Chat',

        "aquila-7b": "BAAI/Aquila-7B",
        "aquilachat-7b": "BAAI/AquilaChat-7B",

        "internlm-7b": "internlm/internlm-7b",
        "internlm-chat-7b": "internlm/internlm-chat-7b",

        "falcon-7b": "tiiuae/falcon-7b",
        "falcon-40b": "tiiuae/falcon-40b",
        "falcon-rw-7b": "tiiuae/falcon-rw-7b",

        "gpt2": "gpt2",
        "gpt2-xl": "gpt2-xl",

        "gpt-j-6b": "EleutherAI/gpt-j-6b",
        "gpt4all-j": "nomic-ai/gpt4all-j",
        "gpt-neox-20b": "EleutherAI/gpt-neox-20b",
        "pythia-12b": "EleutherAI/pythia-12b",
        "oasst-sft-4-pythia-12b-epoch-3.5": "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5",
        "dolly-v2-12b": "databricks/dolly-v2-12b",
        "stablelm-tuned-alpha-7b": "stabilityai/stablelm-tuned-alpha-7b",

        "Llama-2-13b-hf": "meta-llama/Llama-2-13b-hf",
        "Llama-2-70b-hf": "meta-llama/Llama-2-70b-hf",
        "open_llama_13b": "openlm-research/open_llama_13b",
        "vicuna-13b-v1.3": "lmsys/vicuna-13b-v1.3",
        "koala": "young-geng/koala",

        "mpt-7b": "mosaicml/mpt-7b",
        "mpt-7b-storywriter": "mosaicml/mpt-7b-storywriter",
        "mpt-30b": "mosaicml/mpt-30b",
        "opt-66b": "facebook/opt-66b",
        "opt-iml-max-30b": "facebook/opt-iml-max-30b",

        "Qwen-7B": "Qwen/Qwen-7B",
        "Qwen-14B": "Qwen/Qwen-14B",
        "Qwen-7B-Chat": "/Qwen/Qwen-7B-Chat",
        "Qwen-14B-Chat": "Qwen/Qwen-14B-Chat",
        "Qwen-14B-Chat-Int8": "Qwen/Qwen-14B-Chat-Int8",  # 确保已经安装了auto-gptq optimum flash-attn
        "Qwen-14B-Chat-Int4": "Qwen/Qwen-14B-Chat-Int4",  # 确保已经安装了auto-gptq optimum flash-attn
    },
}
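For reference, the resolution rules described in the config comments combine roughly like this (a simplified, hypothetical sketch, not the project's actual code; resolve_model_path is an invented helper):

import os

# Hypothetical sketch of the MODEL_PATH / MODEL_ROOT_PATH resolution rules
# described in the config comments above; not Langchain-Chatchat's actual code.
def resolve_model_path(name: str, model_root: str, model_path: dict) -> str:
    value = model_path.get(name, name)
    if os.path.isdir(value):                       # rule 1: the value is already a usable local path
        return value
    if model_root:
        # rule 2.1: try the key, the full repo id, and its last path component
        for candidate in (name, value, os.path.basename(value)):
            local = os.path.join(model_root, candidate)
            if os.path.isdir(local):
                return local
    return value                                   # rule 2.2: fall back to the Hugging Face repo id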

Only two places were changed:

Steps to Reproduce: save the configuration and restart the Docker container.

The logs then show the following error:

2024-03-28 09:23:08 | INFO | model_worker | Loading the model ['Qwen-7B-Chat'] on worker 58bb1911 ...
2024-03-28 09:23:08 | ERROR | stderr | Process model_worker - Qwen-7B-Chat:
2024-03-28 09:23:08 | ERROR | stderr | Traceback (most recent call last):
2024-03-28 09:23:08 | ERROR | stderr |   File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
2024-03-28 09:23:08 | ERROR | stderr |     self.run()
2024-03-28 09:23:08 | ERROR | stderr |   File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
2024-03-28 09:23:08 | ERROR | stderr |     self._target(*self._args, **self._kwargs)
2024-03-28 09:23:08 | ERROR | stderr |   File "/langchain-chatchat/startup.py", line 383, in run_model_worker
2024-03-28 09:23:08 | ERROR | stderr |     app = create_model_worker_app(log_level=log_level, **kwargs)
2024-03-28 09:23:08 | ERROR | stderr |   File "/langchain-chatchat/startup.py", line 211, in create_model_worker_app
2024-03-28 09:23:08 | ERROR | stderr |     worker = ModelWorker(
2024-03-28 09:23:08 | ERROR | stderr |   File "/usr/local/lib/python3.10/dist-packages/fastchat/serve/model_worker.py", line 74, in __init__
2024-03-28 09:23:08 | ERROR | stderr |     self.model, self.tokenizer = load_model(
2024-03-28 09:23:08 | ERROR | stderr |   File "/usr/local/lib/python3.10/dist-packages/fastchat/model/model_adapter.py", line 306, in load_model
2024-03-28 09:23:08 | ERROR | stderr |     model, tokenizer = adapter.load_model(model_path, kwargs)
2024-03-28 09:23:08 | ERROR | stderr |   File "/usr/local/lib/python3.10/dist-packages/fastchat/model/model_adapter.py", line 1441, in load_model
2024-03-28 09:23:08 | ERROR | stderr |     config = AutoConfig.from_pretrained(
2024-03-28 09:23:08 | ERROR | stderr |   File "/usr/local/lib/python3.10/dist-packages/transformers/models/auto/configuration_auto.py", line 1064, in from_pretrained
2024-03-28 09:23:08 | ERROR | stderr |     config_class = CONFIG_MAPPING[config_dict["model_type"]]
2024-03-28 09:23:08 | ERROR | stderr |   File "/usr/local/lib/python3.10/dist-packages/transformers/models/auto/configuration_auto.py", line 761, in __getitem__
2024-03-28 09:23:08 | ERROR | stderr |     raise KeyError(key)
2024-03-28 09:23:08 | ERROR | stderr | KeyError: 'qwen2'
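The traceback bottoms out in transformers' model-type registry: AutoConfig reads "model_type" from the checkpoint's config.json ("qwen2" for Qwen1.5) and looks it up in CONFIG_MAPPING, which has no "qwen2" entry in transformers 4.35.0. A minimal reproduction of that failure path (a sketch; the path is the container-side mount from the config above):

from transformers import AutoConfig

# On transformers < 4.37 this raises KeyError: 'qwen2' for a Qwen1.5 checkpoint,
# exactly as in the traceback above.
config = AutoConfig.from_pretrained("/Qwen/Qwen-7B-Chat")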

Expected Result: the Tongyi Qianwen (Qwen) model loads normally.

Actual Result

The model worker fails to start.

Environment Information: Ubuntu, RTX 4090, 32 GB RAM

dosubot[bot] commented 5 months ago

Hello @xuzhenjun130! I'm Dosu, a bot here to lend a hand with solving bugs, answering questions, and guiding you on your journey to becoming a contributor. While you're waiting for a human maintainer, I'm here to assist. I'm diving into your issue now and will get back to you with a full answer shortly.

KenwayCJZX commented 5 months ago

What's your transformers version?

dosubot[bot] commented 5 months ago

Hey there @xuzhenjun130! Good to see you diving into new challenges. Hope this one hasn't been too much of a headache for you yet.

The KeyError: 'qwen2' error you're encountering is likely due to the Langchain-Chatchat application trying to access a key named 'qwen2' that does not exist in a dictionary (or similar data structure). This error is not directly visible in the provided code snippet, which means the issue might be related to the changes made in the configuration file or how those changes are being accessed or applied within the application.

In Python, a KeyError is raised when trying to access a dictionary key that does not exist. If you've modified the configuration file to include or modify the handling of a model or parameter named 'qwen2', ensure that:

  1. The key 'qwen2' is correctly spelled and exactly matches what is being accessed in the code.
  2. The configuration file is correctly loaded and parsed before any attempt is made to access the 'qwen2' key.
  3. Any code that accesses the 'qwen2' key is wrapped in a try-except block to handle the possibility that the key does not exist, or a check is performed to confirm the key exists before accessing it (see the sketch below).
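Illustrating point 3, a guarded registry lookup fails with a clear message instead of a bare KeyError (a standalone sketch mirroring the names in the traceback, with toy stand-in values rather than transformers internals):

# A stand-in registry lacking "qwen2", and the model type a Qwen1.5 config.json declares.
CONFIG_MAPPING = {"qwen": object}
config_dict = {"model_type": "qwen2"}

try:
    config_class = CONFIG_MAPPING[config_dict["model_type"]]
except KeyError as err:
    print(f"Unsupported model_type {err}; upgrade transformers")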

Since the provided code does not directly reference 'qwen2', the error is likely occurring in a part of the code not shown or in the interaction with the configuration file. Double-check the changes made to the configuration file and how those changes are reflected in the code.

The Langchain-Chatchat application handles the loading of models, including the Qwen/Qwen1.5-7B-Chat model, through a dynamic and configurable approach that supports both online API-based models and offline models, including those supported directly by Langchain. The process is managed by the create_model_worker_app function, which takes various parameters to configure the model worker appropriately based on the model type (Langchain model, online API, or offline model).

For offline models, the application supports loading models from a Hugging Face repository ID or a local path, and it can run on specified devices. The configuration includes specifying the model path, device, and other parameters like the number of GPUs, memory utilization, and concurrency limits. The application uses the VLLMWorker for models that are compatible with the vLLM inference framework, allowing for advanced configurations such as tensor parallel size, block size, and GPU memory utilization.

For models not directly supported by Langchain or those requiring online APIs, the application allows specifying a worker class that handles the interaction with the model or API. This flexibility enables the Langchain-Chatchat application to support a wide range of models and APIs by abstracting the model loading and interaction logic into configurable worker classes.

The application also includes provisions for model quantization and optimization, such as support for 8-bit loading and activation-aware weight quantization (AWQ), through the GptqConfig and AWQConfig configurations. These features are aimed at improving performance and reducing memory consumption, which is crucial for deploying large models like Qwen/Qwen1.5-7B-Chat efficiently.

In summary, the Langchain-Chatchat application's approach to model loading is highly flexible and configurable, supporting a wide range of model types and configurations, including the Qwen/Qwen1.5-7B-Chat model, through a combination of offline model support, online API integration, and advanced optimization techniques.

You can find more details in the startup.py file in the Langchain-Chatchat repository.

Regarding the 'qwen2' key, it does not appear directly in the provided code snippets from the Langchain-Chatchat codebase. Therefore, it's not possible to determine its specific role based on the given context. It might be helpful to search the entire codebase for references to 'qwen2' to understand its role and why it might be causing a KeyError.

Finally, it's not clear from the provided context how the Langchain-Chatchat application handles KeyError exceptions during the model loading process. It's possible that the application has a general error handling mechanism that catches and logs exceptions, but without more specific information or context, it's difficult to say for sure. It might be helpful to review the application's error handling and logging code to understand how it handles exceptions like KeyError during the model loading process.

I hope this helps! If you have any more questions or need further clarification, feel free to ask.

xuzhenjun130 commented 5 months ago

What's your transformers version?

pip show transformers
Name: transformers
Version: 4.35.0

zRzRzRzRzRzRzR commented 5 months ago

Update your transformers version; the Docker image probably hasn't been updated.

KenwayCJZX commented 5 months ago

What's your transformers version?

pip show transformers
Name: transformers
Version: 4.35.0

Try transformers==4.37.2.
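For the published Docker image, one way to apply this is docker exec -it <container> pip install -U "transformers>=4.37.0" (substitute your container name), then verify inside the container. A minimal check, assuming the packaging package is available (it ships alongside pip):

import transformers
from packaging import version

# Qwen1.5 ("qwen2" model type) requires transformers >= 4.37.0.
ok = version.parse(transformers.__version__) >= version.parse("4.37.0")
print(transformers.__version__, "OK" if ok else "too old for Qwen1.5 (qwen2)")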

danny-zhu commented 4 months ago

Requirements: the code for Qwen1.5 has been merged into the latest Hugging Face transformers, and we advise you to install transformers>=4.37.0, or you might encounter the following error:

KeyError: 'qwen2'

For details, see https://huggingface.co/Qwen/Qwen1.5-7B-Chat/blob/main/README.md