huangxianliang opened this issue 5 years ago
This is the detail of the gradients error:

-- Process 1 terminated with the following error:
Traceback (most recent call last):
  File "/home/slurm/job/tmp/job-139545/env/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
    fn(i, *args)
  File "/home/slurm/job/tmp/job-139545/env/lib/python3.6/site-packages/fairseq-0.7.1-py3.6-linux-x86_64.egg/fairseq_cli/train.py", line 265, in distributed_main
    main(args, init_distributed=True)
  File "/home/slurm/job/tmp/job-139545/env/lib/python3.6/site-packages/fairseq-0.7.1-py3.6-linux-x86_64.egg/fairseq_cli/train.py", line 80, in main
    train(args, trainer, task, epoch_itr)
  File "/home/slurm/job/tmp/job-139545/env/lib/python3.6/site-packages/fairseq-0.7.1-py3.6-linux-x86_64.egg/fairseq_cli/train.py", line 121, in train
    log_output = trainer.train_step(samples)
  File "/home/slurm/job/tmp/job-139545/env/lib/python3.6/site-packages/fairseq-0.7.1-py3.6-linux-x86_64.egg/fairseq/trainer.py", line 283, in train_step
    raise e
  File "/home/slurm/job/tmp/job-139545/env/lib/python3.6/site-packages/fairseq-0.7.1-py3.6-linux-x86_64.egg/fairseq/trainer.py", line 260, in train_step
    ignore_grad
  File "/home/slurm/job/tmp/job-139545/mass/xmasked_seq2seq.py", line 398, in train_step
    forward_backward(model, sample[sample_key], sample_key, lang_pair)
  File "/home/slurm/job/tmp/job-139545/mass/xmasked_seq2seq.py", line 383, in forward_backward
    loss, sample_size, logging_output = criterion(model, samples)
  File "/home/slurm/job/tmp/job-139545/env/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/slurm/job/tmp/job-139545/env/lib/python3.6/site-packages/fairseq-0.7.1-py3.6-linux-x86_64.egg/fairseq/criterions/label_smoothed_cross_entropy.py", line 38, in forward
    net_output = model(**sample['net_input'])
  File "/home/slurm/job/tmp/job-139545/env/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/slurm/job/tmp/job-139545/env/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 353, in forward
    self._check_previous_reduction()
  File "/home/slurm/job/tmp/job-139545/env/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 343, in _check_previous_reduction
    raise RuntimeError("Not all gradients have been reduced from "
RuntimeError: Not all gradients have been reduced from the backward of the previous iteration. This is unexpected and fatal error. Please check and ensure that the model's parameters are not changed after you wrap up the model with DistributedDataParallel.
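To illustrate what this check seems to guard against, here is a small standalone sketch. It is not the MASS/fairseq code; the `TwoBranch` module and its branch names are made up. It shows a model wrapped in DistributedDataParallel where one training step only uses part of the parameters, so some gradient buckets are never reduced, which is the condition the c10d backend complains about on the next forward:

```python
# Minimal sketch (hypothetical model, not the MASS code): each iteration only
# uses one branch, so the other branch's parameters never receive gradients in
# that backward pass. Under the PyTorch version in the traceback above, c10d
# DistributedDataParallel checks this in _check_previous_reduction() and raises
# "Not all gradients have been reduced ..."; newer PyTorch versions raise a
# similar "Expected to have finished reduction" error unless the model is
# wrapped with find_unused_parameters=True.
import os

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP


class TwoBranch(nn.Module):
    def __init__(self):
        super().__init__()
        self.branch_a = nn.Linear(8, 8)   # e.g. used by one step (src-src)
        self.branch_b = nn.Linear(8, 8)   # e.g. used by another step (tgt-tgt)

    def forward(self, x, use_a=True):
        return self.branch_a(x) if use_a else self.branch_b(x)


def main():
    # Single-node, single-process group, just to construct DDP for the sketch.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=0, world_size=1)

    model = DDP(TwoBranch())
    x = torch.randn(4, 8)

    # Iteration 1: only branch_a gets gradients; the bucket holding branch_b's
    # parameters is never marked ready, so the reduction never finishes.
    model(x, use_a=True).sum().backward()

    # Iteration 2: this forward fails because the previous reduction never
    # completed -- the same class of error as in the traceback above.
    model(x, use_a=False).sum().backward()

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```

If the alternating mass_steps / memt_steps in my setup only exercise a subset of the wrapped model's parameters in a given iteration, the same condition would arise.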
I am trying to pre-train supNMT, but this error occurs: "Not all gradients have been reduced from the backward of the previous iteration."
All params (src and tgt are monolingual data for two languages, like en and zh):
Namespace(activation_dropout=0.1, activation_fn='relu', adam_betas='(0.9, 0.98)', adam_eps=1e-08, adaptive_input=False, adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, arch='xtransformer', attention_dropout=0.1, bucket_cap_mb=25, clip_norm=0.0, cpu=False, criterion='label_smoothed_cross_entropy', curriculum=0, data='/home/slurm/job/tmp/job-139665/data/processed', dataset_impl='cached', ddp_backend='c10d', decoder_attention_heads=8, decoder_embed_dim=128, decoder_embed_path=None, decoder_ffn_embed_dim=512, decoder_input_dim=128, decoder_layers=6, decoder_learned_pos=False, decoder_normalize_before=False, decoder_output_dim=128, device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method='tcp://localhost:16747', distributed_no_spawn=False, distributed_port=-1, distributed_rank=0, distributed_world_size=2, dropout=0.1, encoder_attention_heads=8, encoder_embed_dim=128, encoder_embed_path=None, encoder_ffn_embed_dim=512, encoder_layers=6, encoder_learned_pos=False, encoder_normalize_before=False, find_unused_parameters=False, fix_batches_to_gpus=False, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, global_sync_iter=10, keep_interval_updates=100, keep_last_epochs=-1, label_smoothing=0.0, langs='src,tgt', lazy_load=True, left_pad_source='True', left_pad_target='False', lm_bias=True, log_format='simple', log_interval=50, lr=[5e-05], lr_scheduler='inverse_sqrt', mass_steps='src-src,tgt-tgt', max_epoch=20, max_sentences=512, max_sentences_valid=512, max_source_positions=20, max_target_positions=20, max_tokens=4096, max_update=100000000, memory_efficient_fp16=False, memt_steps='src-tgt,tgt-src', min_loss_scale=0.0001, min_lr=1e-09, mt_steps='', no_epoch_checkpoints=False, no_progress_bar=False, no_save=False, no_token_positional_embeddings=False, num_workers=0, optimizer='adam', optimizer_overrides='{}', raw_text=False, reload_checkpoint=None, required_batch_size_multiple=8, reset_dataloader=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, restore_file='checkpoint_last.pt', save_dir='output/checkpoints/mass/pre-training/', save_interval=1, save_interval_updates=3000, seed=1, sentence_avg=False, share_all_embeddings=True, share_decoder_input_output_embed=True, skip_invalid_size_inputs_valid_test=True, source_lang=None, source_langs='src,tgt', target_lang=None, target_langs='src,tgt', task='xmasked_seq2seq', tbmf_wrapper=False, tensorboard_logdir='', threshold_loss_scale=None, train_subset='train', update_freq=[1], use_bmuf=False, user_dir='mass', valid_lang_pairs='src-tgt', valid_subset='valid', validate_interval=1, warmup_init_lr=-1, warmup_updates=4000, weight_decay=0.0, word_mask=0.3, word_mask_keep_rand='0.1,0.1,0.8')
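The two options in this dump that interact with the error above are ddp_backend='c10d' and find_unused_parameters=False. The sketch below is only my assumption of how they plausibly reach the DDP constructor, not fairseq's actual wrapping code:

```python
# Rough sketch (assumed, NOT fairseq's actual code): ddp_backend='c10d' selects
# torch.nn.parallel.DistributedDataParallel, and find_unused_parameters=False
# tells it not to tolerate parameters that receive no gradient in a step.
import inspect

from torch.nn.parallel import DistributedDataParallel as DDP


def wrap_for_c10d(args, model):
    # `args` stands for the Namespace printed above; `wrap_for_c10d` is a
    # hypothetical helper name used only for this illustration.
    kwargs = dict(
        module=model,
        bucket_cap_mb=args.bucket_cap_mb,  # 25 in the dump above
    )
    # find_unused_parameters only exists in newer torch releases, so pass it
    # only when the installed DDP supports it (False in the dump above).
    if "find_unused_parameters" in inspect.signature(DDP.__init__).parameters:
        kwargs["find_unused_parameters"] = args.find_unused_parameters
    return DDP(**kwargs)
```

With find_unused_parameters=False, this wrapper expects every parameter to receive a gradient in every backward pass, which is the expectation the traceback above says was violated.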
Why do so many errors occur in my training process? I am crying......