The following model configurations has been modified according to `config.json` or kwargs:
{'num_layers', 'num_attention_heads', 'hidden_size', 'vocab_size'}
W20230627 17:50:38.624037 18121 cuda_stream.cpp:48] Runtime version 11.2 of cuBLAS incompatible with compiletime version 11.10.
W20230627 17:50:39.510000 18121 cuda_stream.cpp:48] Runtime version 8.4 of cuDNN incompatible with compiletime version 8.5.
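These warnings indicate that the cuBLAS/cuDNN libraries found at runtime are older than the versions OneFlow was compiled against; they point to a mismatched CUDA environment, but are likely unrelated to the Python-level AttributeError below.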
Traceback (most recent call last):
File "/home/ubuntu/newspace/deploy/oneflow/libai/demo_glm.py", line 124, in <module>
outputs = model.generate(
File "/home/ubuntu/anaconda3/envs/libai/lib/python3.9/site-packages/oneflow/autograd/autograd_mode.py", line 154, in wrapper
return func(*args, **kwargs)
File "/home/ubuntu/newspace/deploy/oneflow/libai/libai/inference/generator/generation_utils.py", line 1002, in generate
return self.greedy_search(
File "/home/ubuntu/newspace/deploy/oneflow/libai/libai/inference/generator/generation_utils.py", line 490, in greedy_search
outputs = self(**model_inputs)
File "/home/ubuntu/anaconda3/envs/libai/lib/python3.9/site-packages/oneflow/nn/modules/module.py", line 163, in __call__
res = self.forward(*args, **kwargs)
File "/home/ubuntu/newspace/deploy/oneflow/libai/projects/GLM/modeling_glm.py", line 397, in forward
lm_logits, mems = self.glm(
File "/home/ubuntu/anaconda3/envs/libai/lib/python3.9/site-packages/oneflow/nn/modules/module.py", line 163, in __call__
res = self.forward(*args, **kwargs)
File "/home/ubuntu/newspace/deploy/oneflow/libai/projects/GLM/modeling_glm.py", line 204, in forward
logits, mem_layers = self.transformer(
File "/home/ubuntu/anaconda3/envs/libai/lib/python3.9/site-packages/oneflow/nn/modules/module.py", line 163, in __call__
res = self.forward(*args, **kwargs)
File "/home/ubuntu/newspace/deploy/oneflow/libai/projects/GLM/modeling_glm.py", line 75, in forward
hidden_states = layer(hidden_states, attention_mask, mem=mem_i)
File "/home/ubuntu/anaconda3/envs/libai/lib/python3.9/site-packages/oneflow/nn/modules/module.py", line 163, in __call__
res = self.forward(*args, **kwargs)
File "/home/ubuntu/newspace/deploy/oneflow/libai/projects/GLM/layers/transformer_layer.py", line 103, in forward
attention_output = self.attention(
File "/home/ubuntu/anaconda3/envs/libai/lib/python3.9/site-packages/oneflow/nn/modules/module.py", line 163, in __call__
res = self.forward(*args, **kwargs)
File "/home/ubuntu/newspace/deploy/oneflow/libai/projects/GLM/layers/attention_layer.py", line 111, in forward
context = flow._C.fused_multi_head_attention_inference_v2(
AttributeError: module 'oneflow._C' has no attribute 'fused_multi_head_attention_inference_v2'
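The traceback shows projects/GLM/layers/attention_layer.py calling flow._C.fused_multi_head_attention_inference_v2, an op the installed OneFlow build does not export. As a quick diagnostic (an illustrative sketch, not part of the original report), the presence of the op can be probed directly:

import oneflow as flow

# Print the installed OneFlow version and whether this build exports the
# fused attention op that the GLM attention layer calls.
op_name = "fused_multi_head_attention_inference_v2"
print(flow.__version__, hasattr(flow._C, op_name))

If this prints False, the installed wheel predates (or was built without) the fused kernel that the GLM project expects.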
Code to reproduce bug
import oneflow as flow
from omegaconf import DictConfig

from libai.utils import distributed as dist
from projects.GLM.configs.glm_inference import cfg
from projects.GLM.modeling_glm import GLMForConditionalGeneration
from projects.GLM.tokenizer.glm_tokenizer import GLMChineseTokenzier
from projects.GLM.utils.glm_loader import GLMLoaderHuggerFace

# Single-process run: no data/tensor/pipeline parallelism.
parallel_config = DictConfig(
    dict(
        data_parallel_size=1,
        tensor_parallel_size=1,
        pipeline_parallel_size=1,
    )
)
dist.setup_dist_util(parallel_config)

model_path = "/home/ubuntu/workspace/models/glm-large-chinese"
tokenizer = GLMChineseTokenzier.from_pretrained(model_path)

# Chinese prompt containing a [MASK] span for GLM blank filling.
input_ids = tokenizer.encode(
    [
        "凯旋门位于意大利米兰市古城堡旁。1807年为纪念[MASK]而建,门高25米,顶上矗立两武士青铜古兵车铸像。"
    ],
    return_tensors="of",
)
inputs = {"input_ids": input_ids, "attention_mask": flow.ones(input_ids.size())}
inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=512)

# Place all inputs as broadcast global tensors on the first pipeline stage.
sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
placement = dist.get_layer_placement(0)

# Load the HuggingFace GLM checkpoint into the LiBai model; dropout is
# disabled for inference.
loader = GLMLoaderHuggerFace(
    GLMForConditionalGeneration,
    cfg,
    model_path,
    embedding_dropout_prob=0,
    attention_dropout_prob=0,
    output_dropout_prob=0,
)
model = loader.load()

outputs = model.generate(
    inputs=inputs["input_ids"].to_global(sbp=sbp, placement=placement),
    position_ids=inputs["position_ids"].to_global(sbp=sbp, placement=placement),
    generation_attention_mask=inputs["generation_attention_mask"].to_global(
        sbp=sbp, placement=placement
    ),
    max_length=512,
)
res = tokenizer.decode(outputs[0])
if dist.is_main_process():
    print(res)
Summary
Running the GLM example under the projects directory fails with the error shown above. The script was launched with:
python -m oneflow.distributed.launch --nproc_per_node 1 demo_glm.py
System Information
python3 -m oneflow --doctor reports (output truncated):
version: 0.9.0
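A plausible cause (an assumption, not confirmed in this report) is that the 0.9.0 release predates the fused_multi_head_attention_inference_v2 binding, in which case a newer OneFlow build would be needed. OneFlow's install docs describe nightly wheels along these lines, where the cu117 platform tag is an assumption and should match the local CUDA toolkit:

python3 -m pip install --pre oneflow -f https://staging.oneflow.info/branch/master/cu117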