`Traceback (most recent call last):
File "/home/admin/zihe.zhu/20200902-CLUENER2020/pytorch_version/run_ner_crf.py", line 499, in <module>
main()
File "/home/admin/zihe.zhu/20200902-CLUENER2020/pytorch_version/run_ner_crf.py", line 440, in main
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
File "/home/admin/zihe.zhu/20200902-CLUENER2020/pytorch_version/run_ner_crf.py", line 134, in train
outputs = model(**inputs)
File "/home/admin/anaconda3/envs/zzh_dp/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/admin/anaconda3/envs/zzh_dp/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 155, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/home/admin/anaconda3/envs/zzh_dp/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 165, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/home/admin/anaconda3/envs/zzh_dp/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in parallel_apply
output.reraise()
File "/home/admin/anaconda3/envs/zzh_dp/lib/python3.7/site-packages/torch/_utils.py", line 395, in reraise
raise self.exc_type(msg)
StopIteration: Caught StopIteration in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/admin/anaconda3/envs/zzh_dp/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
output = module(*input, **kwargs)
File "/home/admin/anaconda3/envs/zzh_dp/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/admin/zihe.zhu/20200902-CLUENER2020/pytorch_version/models/bert_for_ner.py", line 58, in forward
outputs =self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids)
File "/home/admin/anaconda3/envs/zzh_dp/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/admin/zihe.zhu/20200902-CLUENER2020/pytorch_version/models/transformers/modeling_bert.py", line 606, in forward
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
StopIteration
大哥,按照你的说法注释完后出现
warnings.warn('Was asked to gather along dimension 0, but all '
Traceback (most recent call last):
File "run_ner_span.py", line 541, in <module>
main()
File "run_ner_span.py", line 483, in main
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
File "run_ner_span.py", line 128, in train
loss.backward()
File "/data/jtf/Minicoda3/envs/python3_8/lib/python3.8/site-packages/torch/_tensor.py", line 255, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/data/jtf/Minicoda3/envs/python3_8/lib/python3.8/site-packages/torch/autograd/__init__.py", line 147, in backward
Variable._execution_engine.run_backward(
File "/data/jtf/Minicoda3/envs/python3_8/lib/python3.8/site-packages/torch/autograd/function.py", line 87, in apply
return self._forward_cls.backward(self, *args) # type: ignore[attr-defined]
File "/data/jtf/Minicoda3/envs/python3_8/lib/python3.8/site-packages/torch/nn/parallel/_functions.py", line 34, in backward
return (None,) + ReduceAddCoalesced.apply(ctx.input_device, ctx.num_inputs, grad_outputs)
File "/data/jtf/Minicoda3/envs/python3_8/lib/python3.8/site-packages/torch/nn/parallel/_functions.py", line 45, in forward
return comm.reduce_add_coalesced(grads, destination)
File "/data/jtf/Minicoda3/envs/python3_8/lib/python3.8/site-packages/torch/nn/parallel/comm.py", line 143, in reduce_add_coalesced
flat_result = reduce_add(flat_tensors, destination)
File "/data/jtf/Minicoda3/envs/python3_8/lib/python3.8/site-packages/torch/nn/parallel/comm.py", line 96, in reduce_add
nccl.reduce(inputs, output=result, root=root_index)
File "/data/jtf/Minicoda3/envs/python3_8/lib/python3.8/site-packages/torch/cuda/nccl.py", line 89, in reduce
torch._C._nccl_reduce(inputs, output, root, op, streams, comms)
RuntimeError: NCCL Error 2: unhandled system error
https://github.com/CLUEbenchmark/CLUENER2020/blob/b6597268c000e06aa95bcdc59ef122805254cab6/pytorch_version/models/transformers/modeling_bert.py#L606
pytorch==1.6.0 需要注释extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)才能运行 可能是和pytorch版本有关 否则会报错StopIteration
`Traceback (most recent call last): File "/home/admin/zihe.zhu/20200902-CLUENER2020/pytorch_version/run_ner_crf.py", line 499, in <module>
main()
File "/home/admin/zihe.zhu/20200902-CLUENER2020/pytorch_version/run_ner_crf.py", line 440, in main
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
File "/home/admin/zihe.zhu/20200902-CLUENER2020/pytorch_version/run_ner_crf.py", line 134, in train
outputs = model(**inputs)
File "/home/admin/anaconda3/envs/zzh_dp/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/admin/anaconda3/envs/zzh_dp/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 155, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/home/admin/anaconda3/envs/zzh_dp/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 165, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/home/admin/anaconda3/envs/zzh_dp/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in parallel_apply
output.reraise()
File "/home/admin/anaconda3/envs/zzh_dp/lib/python3.7/site-packages/torch/_utils.py", line 395, in reraise
raise self.exc_type(msg)
StopIteration: Caught StopIteration in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/admin/anaconda3/envs/zzh_dp/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
output = module(*input, **kwargs)
File "/home/admin/anaconda3/envs/zzh_dp/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/admin/zihe.zhu/20200902-CLUENER2020/pytorch_version/models/bert_for_ner.py", line 58, in forward
outputs =self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids)
File "/home/admin/anaconda3/envs/zzh_dp/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/admin/zihe.zhu/20200902-CLUENER2020/pytorch_version/models/transformers/modeling_bert.py", line 606, in forward
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
StopIteration
Process finished with exit code 1 `