Train-Batch Progress: 0%| | 0/5000 [00:03<?, ?it/s]
Epoch: 0%| | 0/1 [00:03<?, ?it/s]
Traceback (most recent call last):
File "main.py", line 218, in <module>
main()
File "main.py", line 164, in main
training_model, train_features, opt.warmup_epoch)
File "/home/cike/zetaolian/FewShotTagging/utils/trainer.py", line 117, in do_train
loss = self.do_forward(batch, model, epoch_id, step)
File "/home/cike/zetaolian/FewShotTagging/utils/trainer.py", line 575, in do_forward
label_output_mask,
File "/home/cike/anaconda3/envs/zetaolian2/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/cike/anaconda3/envs/zetaolian2/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 155, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/home/cike/anaconda3/envs/zetaolian2/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 165, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/home/cike/anaconda3/envs/zetaolian2/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in parallel_apply
output.reraise()
File "/home/cike/anaconda3/envs/zetaolian2/lib/python3.7/site-packages/torch/_utils.py", line 395, in reraise
raise self.exc_type(msg)
StopIteration: Caught StopIteration in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/cike/anaconda3/envs/zetaolian2/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
output = module(*input, **kwargs)
File "/home/cike/anaconda3/envs/zetaolian2/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/cike/zetaolian/FewShotTagging/models/few_shot_seq_labeler.py", line 221, in forward
support_token_ids, support_segment_ids, support_nwp_index, support_input_mask
File "/home/cike/zetaolian/FewShotTagging/models/few_shot_seq_labeler.py", line 138, in get_context_reps
support_nwp_index, support_input_mask
File "/home/cike/anaconda3/envs/zetaolian2/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/cike/zetaolian/FewShotTagging/models/fewshot_seqlabel/context_embedder_base.py", line 211, in forward
support_token_ids, support_segment_ids, support_nwp_index, support_input_mask,
File "/home/cike/zetaolian/FewShotTagging/models/fewshot_seqlabel/context_embedder_base.py", line 96, in concatenating_reps
sequence_output, _ = self.bert(input_ids, segment_ids, input_mask, output_all_encoded_layers=False)
File "/home/cike/anaconda3/envs/zetaolian2/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/cike/anaconda3/envs/zetaolian2/lib/python3.7/site-packages/pytorch_pretrained_bert/modeling.py", line 708, in forward
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
StopIteration
Hello,我运行了一下多gpu的命令,报错信息如下,请问是什么原因呢?
Train-Batch Progress: 0%| | 0/5000 [00:03<?, ?it/s]
Epoch: 0%| | 0/1 [00:03<?, ?it/s]
Traceback (most recent call last):
File "main.py", line 218, in <module>
main()
File "main.py", line 164, in main
training_model, train_features, opt.warmup_epoch)
File "/home/cike/zetaolian/FewShotTagging/utils/trainer.py", line 117, in do_train
loss = self.do_forward(batch, model, epoch_id, step)
File "/home/cike/zetaolian/FewShotTagging/utils/trainer.py", line 575, in do_forward
label_output_mask,
File "/home/cike/anaconda3/envs/zetaolian2/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/cike/anaconda3/envs/zetaolian2/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 155, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/home/cike/anaconda3/envs/zetaolian2/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 165, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/home/cike/anaconda3/envs/zetaolian2/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in parallel_apply
output.reraise()
File "/home/cike/anaconda3/envs/zetaolian2/lib/python3.7/site-packages/torch/_utils.py", line 395, in reraise
raise self.exc_type(msg)
StopIteration: Caught StopIteration in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/cike/anaconda3/envs/zetaolian2/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
output = module(*input, **kwargs)
File "/home/cike/anaconda3/envs/zetaolian2/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/cike/zetaolian/FewShotTagging/models/few_shot_seq_labeler.py", line 221, in forward
support_token_ids, support_segment_ids, support_nwp_index, support_input_mask
File "/home/cike/zetaolian/FewShotTagging/models/few_shot_seq_labeler.py", line 138, in get_context_reps
support_nwp_index, support_input_mask
File "/home/cike/anaconda3/envs/zetaolian2/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/cike/zetaolian/FewShotTagging/models/fewshot_seqlabel/context_embedder_base.py", line 211, in forward
support_token_ids, support_segment_ids, support_nwp_index, support_input_mask,
File "/home/cike/zetaolian/FewShotTagging/models/fewshot_seqlabel/context_embedder_base.py", line 96, in concatenating_reps
sequence_output, _ = self.bert(input_ids, segment_ids, input_mask, output_all_encoded_layers=False)
File "/home/cike/anaconda3/envs/zetaolian2/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/cike/anaconda3/envs/zetaolian2/lib/python3.7/site-packages/pytorch_pretrained_bert/modeling.py", line 708, in forward
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
StopIteration