Oneflow-Inc / libai

LiBai(李白): A Toolbox for Large-Scale Distributed Parallel Training
https://libai.readthedocs.io
Apache License 2.0
391 stars 55 forks source link

Optimize glm #484

Closed xiezipeng-ML closed 1 year ago

xiezipeng-ML commented 1 year ago
# ONEFLOW_LINEAR_EMBEDDING_SKIP_INIT=1 python demo.py 
# Benchmark script: run GLM-10B-Chinese [MASK]-infilling inference with
# LiBai/OneFlow and print per-iteration generation latency on the main process.
import time

import oneflow as flow
import torch  # NOTE(review): torch is never referenced below — presumably kept for side-by-side comparison runs; confirm before removing.

from libai.utils import distributed as dist
from projects.GLM.configs.glm_inference import cfg
from projects.GLM.modeling_glm import GLMForConditionalGeneration
from projects.GLM.tokenizer.glm_tokenizer import GLMChineseTokenzier
from projects.GLM.utils.glm_loader import GLMLoaderHuggerFace

# Load the tokenizer from a local checkpoint and encode a [MASK] infilling prompt.
tokenizer = GLMChineseTokenzier.from_pretrained("/data/home/xiezipeng/glm-10b-chinese")
input_ids = tokenizer.encode(
    ["橘子的颜色是[MASK]。"],  # "The color of an orange is [MASK]."
    return_tensors="of",  # "of" → return OneFlow tensors
)
inputs = {"input_ids": input_ids, "attention_mask": flow.ones(input_ids.size(), dtype=flow.bool)}
# Adds the generation-specific fields consumed below
# ("position_ids", "generation_attention_mask") for up to 512 generated tokens.
inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=512)

# Broadcast SBP on both mesh dimensions: every rank holds the full input tensors.
sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
placement = dist.get_layer_placement(0)

# Load the checkpoint with the device type set to CPU, then move to GPU in fp16.
# NOTE(review): presumably loading on CPU first avoids peak GPU memory during
# weight conversion — confirm against GLMLoaderHuggerFace's implementation.
dist.set_device_type("cpu")
loader = GLMLoaderHuggerFace(
    GLMForConditionalGeneration,
    cfg,
    "/data/home/xiezipeng/glm-10b-chinese",
    # All dropout probabilities are zeroed for deterministic inference.
    embedding_dropout_prob=0,
    attention_dropout_prob=0,
    output_dropout_prob=0,
)
model = loader.load()
model = model.half().cuda()  # fp16 weights on GPU
model.eval()

# Switch the distributed helper back to GPU so placements used by generate()
# resolve to CUDA devices.
dist.set_device_type("cuda")

# Benchmark loop: generate the same completion forever, timing each iteration.
while True:
    t0 = time.time()
    outputs = model.generate(
        inputs=inputs["input_ids"].to_global(sbp=sbp, placement=placement),
        position_ids=inputs["position_ids"].to_global(sbp=sbp, placement=placement),
        generation_attention_mask=inputs["generation_attention_mask"].to_global(
            sbp=sbp, placement=placement
        ),
        max_length=512,
    )
    flow.cuda.synchronize()  # block until GPU work finishes so the timing is accurate
    if dist.is_main_process():
        print("cost time", time.time() - t0)

    res = tokenizer.decode(outputs[0])
    if dist.is_main_process():
        print(res)
xiezipeng-ML commented 1 year ago

目前单卡GLM,torch推理0.4s,oneflow推理0.26s

BasicCoder commented 1 year ago

目前单卡GLM,torch推理0.4s,oneflow推理0.26s

您好,可以提供一下测试的设备信息么? 采用A100 80G PCIE 测试出来的结果是0.30s~0.31s. 按照提供的测试代码,结果输出是: [CLS] 橘子的颜色是 [MASK] 。 <|endoftext|> <|startofpiece|> 橘黄色的,所以叫 橘子 <|endofpiece|>