wenda-LLM / wenda

Wenda (闻达): an LLM invocation platform. Its goal is efficient content generation for specific environments, while accounting for the limited computing resources of individuals and small-to-medium enterprises as well as knowledge security and privacy concerns.

Error when loading Chatglm3-6b on two GPU cards; how can this be fixed? #514

Open jamesjiangxing opened 8 months ago

jamesjiangxing commented 8 months ago

Describe the bug: loading Chatglm3-6b on two GPU cards fails.

To Reproduce: steps to reproduce the behavior:

  1. In the config file, set strategy: "cuda:0 fp16 *14 -> cuda:1 fp16"
  2. Start the GLM6B service
  3. Loading the model fails

Expected behavior: Chatglm3-6b loads and runs normally across both GPU cards.

Screenshots

(Screenshot: Wenda-chatglm3 error, 2023-12-02 19:13:20)
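The traceback itself is not legible here, but one plausible failure point, given how the glm6b loader shown below parses its strategy, is the strategy string itself: load_model() unpacks settings.Strategy with a plain .split() into exactly two tokens (device and precision), so a multi-card value like the one in step 1 cannot be parsed. A minimal sketch of that parse (strategy value taken from the report, everything else illustrative):

strategy = "cuda:0 fp16 *14 -> cuda:1 fp16"  # value from step 1 of the report
device, precision = strategy.split()         # same unpacking as in load_model() below
# ValueError: too many values to unpack (expected 2)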

hekang26 commented 6 months ago

from plugins.settings import settings

import os
from typing import Dict, Tuple, Union, Optional

from torch.nn import Module
from transformers import AutoModel

def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
    # transformer.word_embeddings counts as 1 layer
    # transformer.final_layernorm and lm_head count as 1 layer
    # transformer.layers contributes 28 layers
    # 30 layers in total, distributed across num_gpus cards
    num_trans_layers = 28
    per_gpu_layers = 30 / num_gpus

    # bugfix: on Linux, torch.embedding can be called with weight and input on different devices, raising a RuntimeError
    # on Windows, model.device is set to transformer.word_embeddings.device
    # on Linux, model.device is set to lm_head.device
    # when chat or stream_chat is called, input_ids is placed on model.device
    # if transformer.word_embeddings.device and model.device differ, a RuntimeError is raised
    # so transformer.word_embeddings, transformer.final_layernorm and lm_head are all kept on the first card
    device_map = {'transformer.word_embeddings': 0,
                  'transformer.final_layernorm': 0, 'lm_head': 0}

    used = 2
    gpu_target = 0
    for i in range(num_trans_layers):
        if used >= per_gpu_layers:
            gpu_target += 1
            used = 0
        assert gpu_target < num_gpus
        device_map[f'transformer.layers.{i}'] = gpu_target
        used += 1

    return device_map
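# For num_gpus=2, tracing the loop above gives the following placement:
#   GPU 0: transformer.word_embeddings, transformer.final_layernorm, lm_head, transformer.layers.0-12
#   GPU 1: transformer.layers.13-27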

def load_model_on_gpus(checkpoint_path: Union[str, os.PathLike], num_gpus: int = 2,
                       device_map: Optional[Dict[str, int]] = None, **kwargs) -> Module:
    if num_gpus < 2 and device_map is None:
        # single card: load as usual
        model = AutoModel.from_pretrained(
            checkpoint_path, trust_remote_code=True, **kwargs).half().cuda()
    else:
        from accelerate import dispatch_model

        model = AutoModel.from_pretrained(
            checkpoint_path, trust_remote_code=True, **kwargs).half()

        if device_map is None:
            device_map = auto_configure_device_map(num_gpus)

        # dispatch_model (from accelerate) moves each submodule to the device assigned in device_map
        model = dispatch_model(model, device_map=device_map)

    return model
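# A quick way to verify the resulting placement (hypothetical check, run once after loading;
# named_parameters() is standard PyTorch and works regardless of the model's internal layout):
# model = load_model_on_gpus(settings.Path, num_gpus=2)
# for name, p in model.named_parameters():
#     print(name, p.device)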

def chat_init(history):
    history_formatted = None
    if history is not None:
        history_formatted = []
        tmp = []
        for i, old_chat in enumerate(history):
            if len(tmp) == 0 and old_chat['role'] == "user":
                tmp.append(old_chat['content'])
            elif old_chat['role'] == "AI" or old_chat['role'] == 'assistant':
                tmp.append(old_chat['content'])
                history_formatted.append(tuple(tmp))
                tmp = []
            else:
                continue
    return history_formatted
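# Example (illustrative values): a wenda-style history such as
#   [{'role': 'user', 'content': 'hi'}, {'role': 'assistant', 'content': 'hello'}]
# is collapsed into (question, answer) tuples before being handed to stream_chat:
#   [('hi', 'hello')]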

def chat_one(prompt, history_formatted, max_length, top_p, temperature, zhishiku=False):
    for response, history in model.stream_chat(tokenizer, prompt, history_formatted,
                                               max_length=max_length, top_p=top_p, temperature=temperature):
        yield response

def load_model():
    global model, tokenizer
    from transformers import AutoModel, AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        settings.Path, local_files_only=True, trust_remote_code=True)
    # the original single-device load is replaced by the multi-GPU loader above
    # model = AutoModel.from_pretrained(
    #     settings.Path, local_files_only=True, trust_remote_code=True)
    model = load_model_on_gpus(settings.Path, num_gpus=2)
    if not (settings.Lora == '' or settings.Lora is None):
        print('Lora model path', settings.Lora)
        from peft import PeftModel
        model = PeftModel.from_pretrained(model, settings.Lora)
    device, precision = settings.Strategy.split()
    # handle the device part of the strategy
    if device == 'cpu':
        # cpu: nothing to do
        pass
    elif device == 'cuda':
        # gpu: move the model onto the card
        import torch
        if not (precision.startswith('fp16i') and torch.cuda.get_device_properties(0).total_memory < 1.4e+10):
            model = model.cuda()
    else:
        # any other device: report the error and exit
        print('Error: unsupported device')
        exit()
    # handle the precision part of the strategy
    if precision == 'fp16':
        # fp16: convert the model to half precision
        model = model.half()
    elif precision == 'fp32':
        # fp32: convert the model to full precision
        model = model.float()
    elif precision.startswith('fp16i'):
        # fp16iN: quantize to the requested bit width, then run in half precision
        # extract the number of bits from the strategy string
        bits = int(precision[5:])
        # quantize() is provided by the ChatGLM model code
        model = model.quantize(bits)
        if device == 'cuda':
            model = model.cuda()
        model = model.half()
    elif precision.startswith('fp32i'):
        # fp32iN: quantize to the requested bit width, then run in full precision
        # extract the number of bits from the strategy string
        bits = int(precision[5:])
        # quantize() is provided by the ChatGLM model code
        model = model.quantize(bits)
        if device == 'cuda':
            model = model.cuda()
        model = model.float()
    else:
        # any other precision: report the error and exit
        print('Error: unsupported precision')
        exit()
    # model = model.eval()
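With this patched load_model(), the two-card split is decided in code (num_gpus=2 in the load_model_on_gpus call) rather than through the strategy string. settings.Strategy is still unpacked into exactly two tokens, so the config presumably still needs the plugin's plain two-token form, for example:

strategy: "cuda fp16"

(The exact config keys may differ between wenda versions; the point is the device/precision pair that Strategy.split() expects.)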