Closed yFhope closed 1 year ago
补充:您在Issues中提到的这份代码 https://github.com/Tongjilibo/bert4torch/issues/62 我是可以直接跑的,不会有任何问题
主要的几点修改如下:
# Place this at the very top of the script, before any model or data setup.
import os

parser = argparse.ArgumentParser()
# torch.distributed.launch passes --local_rank (PyTorch < 2.0) or --local-rank
# (PyTorch >= 2.0); torchrun passes neither and exports LOCAL_RANK instead.
# Accept both spellings and fall back to the environment variable so the
# default is a usable device index instead of -1 under torchrun.
parser.add_argument(
    "--local_rank",
    "--local-rank",
    type=int,
    default=int(os.environ.get("LOCAL_RANK", -1)),
)
args = parser.parse_args()
# Bind this process to its own GPU before the process group is created.
torch.cuda.set_device(args.local_rank)
device = torch.device('cuda', args.local_rank)
torch.distributed.init_process_group(backend='nccl')
# Model structure built on top of BERT
class Model(nn.Module):
    """BERT encoder with a linear tagging head and a CRF layer.

    ``forward`` only produces the emission scores and the attention mask;
    the CRF layer is applied separately (by the loss during training and by
    ``predict`` during inference).
    """

    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, segment_vocab_size=0)
        # Output size covers head and tail as well (translated from the original note).
        self.fc = nn.Linear(768, len(categories))
        self.crf = CRF(len(categories))

    def forward(self, token_ids):
        """Return ``(emission_score, attention_mask)`` for a batch of token ids."""
        sequence_output = self.bert([token_ids])  # [btz, seq_len, hdsz]
        emission_score = self.fc(sequence_output)  # [btz, seq_len, tag_size]
        # Positions whose token id is > 0 are marked 1, all others 0
        # (presumably id 0 is the padding token -- confirm against the vocab).
        attention_mask = token_ids.gt(0).long()
        return emission_score, attention_mask

    def predict(self, token_ids):
        """Decode the best tag path with the CRF, without tracking gradients.

        Note that the module is switched to eval mode and is not switched
        back afterwards.
        """
        self.eval()
        with torch.no_grad():
            emission_score, attention_mask = self.forward(token_ids)
            best_path = self.crf.decode(emission_score, attention_mask)  # [btz, seq_len]
        return best_path
# Move the model to this process's device and wrap it in DDP for multi-GPU
# training; master_rank is the local_rank designated to print the training
# progress.
model = BaseModelDDP(
    Model().to(device),
    master_rank=0,
    device_ids=[args.local_rank],
    output_device=args.local_rank,
    find_unused_parameters=False,
)
class Loss(nn.Module):
    """Loss that feeds the model outputs and the labels to the CRF layer."""

    def forward(self, outputs, labels):
        """Return the CRF result for ``outputs`` unpacked, followed by ``labels``."""
        # `model` is the BaseModelDDP wrapper, which does not expose `crf`
        # directly (`model.crf` raises AttributeError); the CRF layer is
        # reached on the wrapped network through `.module`.
        return model.module.crf(*outputs, labels)
按如上所示修改后,错误信息提示如下。我之前是有尝试过上述修改的,只是在issues中没有贴出全部修改过的代码。
2023-05-24 10:17:46 - Start Training
2023-05-24 10:17:46 - Epoch: 1/20
Traceback (most recent call last):
File "mlu-2-task_sequence_labeling_ner_crf.py", line 202, in <module>
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
File "/torch/venv3/pytorch/lib/python3.7/site-packages/torch4keras/model.py", line 225, in fit
self.output, self.loss, self.loss_detail = self.train_step(self.train_X, self.train_y)
File "/torch/venv3/pytorch/lib/python3.7/site-packages/torch4keras/model.py", line 112, in train_step
loss_detail = self.criterion(output, train_y)
File "/torch/venv3/pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
Traceback (most recent call last):
File "mlu-2-task_sequence_labeling_ner_crf.py", line 202, in <module>
return forward_call(*input, **kwargs)
File "mlu-2-task_sequence_labeling_ner_crf.py", line 127, in forward
return model.crf(*outputs, labels)
File "/torch/venv3/pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1131, in __getattr__
model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])
File "/torch/venv3/pytorch/lib/python3.7/site-packages/torch4keras/model.py", line 225, in fit
type(self).__name__, name))
AttributeError: 'BaseModelDDP' object has no attribute 'crf'
self.output, self.loss, self.loss_detail = self.train_step(self.train_X, self.train_y)
File "/torch/venv3/pytorch/lib/python3.7/site-packages/torch4keras/model.py", line 112, in train_step
loss_detail = self.criterion(output, train_y)
File "/torch/venv3/pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "mlu-2-task_sequence_labeling_ner_crf.py", line 127, in forward
return model.crf(*outputs, labels)
File "/torch/venv3/pytorch/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1131, in __getattr__
type(self).__name__, name))
AttributeError: 'BaseModelDDP' object has no attribute 'crf'
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 1477) of binary: /torch/venv3/pytorch/bin/python
ERROR:torch.distributed.elastic.agent.server.local_elastic_agent:[default] Worker group failed
你好,我的脚本我昨天是跑过的,在Loss计算那里要改成model.module.crf
您好! 我想要验证在单机多卡环境下跑这份代码 task_sequence_labeling_ner_crf.py
我参考了您在其他Issues中提到的这份代码做修改 task_distributed_data_parallel.py
但是总是有各种各样的问题,您那边方便提供一份通用的修改单机多卡的教程吗?或者帮我把上述需要修改的代码 task_sequence_labeling_ner_crf.py 改成单机多卡形式,以便于我总结发现规律。谢谢您!
基本信息
核心代码
输出信息
自我尝试
只做了简单修改,把模型的继承换了一下,改成了BaseModelDDP,也尝试了BaseModelDP、BaseModel、nn.Module。应该就是我的写法不对:“定义bert上的模型结构”这里的代码,应该不是简单地更换继承即可。