An error during finetuning for the TVR task

linjieli222 / HERO

Research code for EMNLP 2020 paper "HERO: Hierarchical Encoder for Video+Language Omni-representation Pre-training"

MIT License

230 stars 34 forks source link

# inside the container CUDA_VISIBLE_DEVICES = 0 horovodrun -np 1 python train_vcmr.py --config config/train-tvr-8gpu.json ... ... [1,0]<stderr>:12/13/2021 09:08:05 - INFO - model.model - Decoder Transformer config: None [1,0]<stderr>:12/13/2021 09:08:08 - INFO - model.modeling_utils - Weights of HeroForVcmr not initialized from pretrained model: ['v_encoder.fom_output.linear_1.weight', 'v_encoder.fom_output.linear_1.bias', 'v_encoder.fom_output.LayerNorm.weight', 'v_encoder.fom_output.LayerNorm.bias', 'v_encoder.fom_output.linear_2.weight', 'v_encoder.fom_output.linear_2.bias'] [1,0]<stderr>:12/13/2021 09:08:08 - INFO - model.modeling_utils - Weights from pretrained model not used in HeroForVcmr: ['vocab_padded', 'v_encoder.fr_output.linear_1.weight', 'v_encoder.fr_output.linear_1.bias', 'v_encoder.fr_output.LayerNorm.weight', 'v_encoder.fr_output.LayerNorm.bias', 'v_encoder.fr_output.linear_2.weight', 'v_encoder.fr_output.linear_2.bias', 'v_encoder.itm_clip_transform.linear_1.weight', 'v_encoder.itm_clip_transform.linear_1.bias', 'v_encoder.itm_clip_transform.LayerNorm.weight', 'v_encoder.itm_clip_transform.LayerNorm.bias', 'v_encoder.itm_clip_transform.linear_2.weight', 'v_encoder.itm_clip_transform.linear_2.bias', 'v_encoder.itm_sub_transform.linear_1.weight', 'v_encoder.itm_sub_transform.linear_1.bias', 'v_encoder.itm_sub_transform.LayerNorm.weight', 'v_encoder.itm_sub_transform.LayerNorm.bias', 'v_encoder.itm_sub_transform.linear_2.weight', 'v_encoder.itm_sub_transform.linear_2.bias'] [1,0]<stdout>:Selected optimization level O2: FP16 training with FP32 batchnorm and FP32 master weights. 
[1,0]<stdout>: [1,0]<stdout>:Defaults for this optimization level are: [1,0]<stdout>:enabled : True [1,0]<stdout>:opt_level : O2 [1,0]<stdout>:cast_model_type : torch.float16 [1,0]<stdout>:patch_torch_functions : False [1,0]<stdout>:keep_batchnorm_fp32 : True [1,0]<stdout>:master_weights : True [1,0]<stdout>:loss_scale : dynamic [1,0]<stdout>:Processing user overrides (additional kwargs that are not None)... [1,0]<stdout>:After processing overrides, optimization options are: [1,0]<stdout>:enabled : True [1,0]<stdout>:opt_level : O2 [1,0]<stdout>:cast_model_type : torch.float16 [1,0]<stdout>:patch_torch_functions : False [1,0]<stdout>:keep_batchnorm_fp32 : True [1,0]<stdout>:master_weights : True [1,0]<stdout>:loss_scale : dynamic [1,0]<stderr>:Traceback (most recent call last): [1,0]<stderr>: File "train_vcmr.py", line 399, in <module> [1,0]<stderr>: main(args) [1,0]<stderr>: File "train_vcmr.py", line 161, in main [1,0]<stderr>: restorer = TrainingRestorer(opts, model, optimizer) [1,0]<stderr>: File "/src/utils/save.py", line 141, in __init__ [1,0]<stderr>: assert vars(opts) == restore_opts [1,0]<stderr>:AssertionError -------------------------------------------------------------------------- Primary job terminated normally, but 1 process returned a non-zero exit code. Per user-direction, the job has been aborted. -------------------------------------------------------------------------- -------------------------------------------------------------------------- mpirun detected that one or more processes exited with non-zero status, thus causing the job to be terminated. The first process to do so was: Process name: [[30056,1],0] Exit code: 1 --------------------------------------------------------------------------

vars(opts)= {'model_config': 'config/hero_finetune.json', 'checkpoint': '/pretrain/hero-tv-ht100.pt', 'train_batch_size': 32, 'val_batch_size': 20, 'gradient_accumulation_steps': 2, 'learning_rate': 0.0001, 'valid_steps': 200, 'save_steps': 200, 'optim': 'adamw', 'betas': [ 0.9, 0.98 ], 'dropout': 0.1, 'weight_decay': 0.01, 'grad_norm': 1.0, 'warmup_steps': 500, 'lr_mul': 1.0, 'num_train_steps': 5000, 'output_dir': '/storage/tvr_default', 'sub_ctx_len': 0, 'max_clip_len': 100, 'max_txt_len': 60, 'vfeat_version': 'resnet_slowfast', 'vfeat_interval': 1.5, 'compressed_db': False, 'seed': 77, 'n_workers': 4, 'pin_mem': True, 'fp16': True, 'task': 'tvr', 'vcmr_eval_video_batch_size': 50, 'vcmr_eval_q_batch_size': 80, 'drop_svmr_prob': 0.8, 'lw_neg_q': 8.0, 'lw_neg_ctx': 8.0, 'lw_st_ed': 0.01, 'ranking_loss_type': 'hinge', 'margin': 0.1, 'hard_pool_size': [ 20 ], 'hard_neg_weights': [ 10 ], 'hard_negtiave_start_step': [ 2000 ], 'train_span_start_step': 0, 'use_all_neg': True, 'eval_with_query_type': True, 'max_before_nms': 200, 'max_after_nms': 100, 'distributed_eval': True, 'nms_thd': 0.5, 'q2c_alpha': 20, 'max_vcmr_video': 100, 'full_eval_tasks': ['VCMR', 'SVMR', 'VR' ], 'min_pred_l': 2, 'max_pred_l': 16, 'sub_txt_db': '/txt/tv_subtitles.db', 'vfeat_db': '/video/tv', 'train_query_txt_db': '/txt/tvr_train.db', 'val_query_txt_db': '/txt/tvr_val.db', 'test_query_txt_db': None, 'vcmr_eval_batch_size': 80, 'rank': 0, 'n_gpu': 1 } restore_opts= {'model_config': 'config/hero.json', 'checkpoint': '/pretrain/hero-tv-ht100.pt', 'train_batch_size': 32, 'val_batch_size': 20, 'gradient_accumulation_steps': 2, 'learning_rate': 0.0001, 'valid_steps': 200, 'save_steps': 200, 'optim': 'adamw', 'betas': [ 0.9, 0.98 ], 'dropout': 0.1, 'weight_decay': 0.01, 'grad_norm': 1.0, 'warmup_steps': 500, 'lr_mul': 1.0, 'num_train_steps': 5000, 'output_dir': '/storage/linjie_saved_results/release_debug/tvr_default', 'sub_ctx_len': 0, 'max_clip_len': 100, 'max_txt_len': 60, 'vfeat_version': 
'resnet_slowfast', 'vfeat_interval': 1.5, 'compressed_db': False, 'seed': 77, 'n_workers': 4, 'pin_mem': True, 'fp16': True, 'task': 'tvr', 'vcmr_eval_video_batch_size': 50, 'vcmr_eval_q_batch_size': 80, 'drop_svmr_prob': 0.8, 'lw_neg_q': 8.0, 'lw_neg_ctx': 8.0, 'lw_st_ed': 0.01, 'ranking_loss_type': 'hinge', 'margin': 0.1, 'hard_pool_size': [ 20 ], 'hard_neg_weights': [ 10 ], 'hard_negtiave_start_step': [ 2000 ], 'train_span_start_step': 0, 'use_all_neg': True, 'eval_with_query_type': True, 'max_before_nms': 200, 'max_after_nms': 100, 'distributed_eval': True, 'nms_thd': 0.5, 'q2c_alpha': 20, 'max_vcmr_video': 100, 'full_eval_tasks': ['VCMR', 'SVMR', 'VR' ], 'min_pred_l': 2, 'max_pred_l': 16, 'tasks': 'tvr', 'sub_txt_db': '/txt/tv_subtitles.db', 'vfeat_db': '/video/tv', 'train_query_txt_db': '/txt/tvr_train.db', 'val_query_txt_db': '/txt/tvr_val.db', 'drop_sub_prob': 0, 'vcmr_eval_batch_size': 80, 'rank': 0, 'n_gpu': 8 }

# store_temp/finetune/tvr_default/log/hps.json { "model_config": "config/hero_finetune.json", "checkpoint": "/pretrain/hero-tv-ht100.pt", "train_batch_size": 32, "val_batch_size": 20, "gradient_accumulation_steps": 2, "learning_rate": 0.0001, "valid_steps": 200, "save_steps": 200, "optim": "adamw", "betas": [ 0.9, 0.98 ], "dropout": 0.1, "weight_decay": 0.01, "grad_norm": 1.0, "warmup_steps": 500, "lr_mul": 1.0, "num_train_steps": 5000, "output_dir": "/storage/tvr_default", "sub_ctx_len": 0, "max_clip_len": 100, "max_txt_len": 60, "vfeat_version": "resnet_slowfast", "vfeat_interval": 1.5, "compressed_db": false, "seed": 77, "n_workers": 4, "pin_mem": true, "fp16": true, "task": "tvr", "vcmr_eval_video_batch_size": 50, "vcmr_eval_q_batch_size": 80, "drop_svmr_prob": 0.8, "lw_neg_q": 8.0, "lw_neg_ctx": 8.0, "lw_st_ed": 0.01, "ranking_loss_type": "hinge", "margin": 0.1, "hard_pool_size": [ 20 ], "hard_neg_weights": [ 10 ], "hard_negtiave_start_step": [ 2000 ], "train_span_start_step": 0, "use_all_neg": true, "eval_with_query_type": true, "max_before_nms": 200, "max_after_nms": 100, "distributed_eval": true, "nms_thd": 0.5, "q2c_alpha": 20, "max_vcmr_video": 100, "full_eval_tasks": [ "VCMR", "SVMR", "VR" ], "min_pred_l": 2, "max_pred_l": 16, "sub_txt_db": "/txt/tv_subtitles.db", "vfeat_db": "/video/tv", "train_query_txt_db": "/txt/tvr_train.db", "val_query_txt_db": "/txt/tvr_val.db", "test_query_txt_db": null, "vcmr_eval_batch_size": 80, "rank": 0, "tasks": "tvr", "drop_sub_prob": 0, "n_gpu": 1 }

# config/train-tvr-8gpu.json { "task": "tvr", "sub_txt_db": "/txt/tv_subtitles.db", "vfeat_db": "/video/tv", "train_query_txt_db": "/txt/tvr_train.db", "val_query_txt_db": "/txt/tvr_val.db", "test_query_txt_db": null, "compressed_db": false, "model_config": "config/hero_finetune.json", "checkpoint": "/pretrain/hero-tv-ht100.pt", "output_dir": "/storage/tvr_default", "eval_with_query_type": true, "max_before_nms": 200, "max_after_nms": 100, "distributed_eval": true, "nms_thd": 0.5, "q2c_alpha": 20, "max_vcmr_video": 100, "full_eval_tasks": [ "VCMR", "SVMR", "VR" ], "max_clip_len": 100, "max_txt_len": 60, "vfeat_version": "resnet_slowfast", "vfeat_interval": 1.5, "min_pred_l": 2, "max_pred_l": 16, "drop_svmr_prob": 0.8, "train_batch_size": 32, "val_batch_size": 20, "vcmr_eval_video_batch_size": 50, "vcmr_eval_batch_size": 80, "gradient_accumulation_steps":2, "learning_rate": 1e-04, "valid_steps": 200, "save_steps": 200, "num_train_steps": 5000, "optim": "adamw", "betas": [ 0.9, 0.98 ], "dropout": 0.1, "weight_decay": 0.01, "grad_norm": 1.0, "warmup_steps": 500, "lw_neg_q": 8.0, "lw_neg_ctx": 8.0, "lw_st_ed": 0.01, "ranking_loss_type": "hinge", "margin": 0.1, "hard_pool_size": [ 20 ], "hard_neg_weights": [ 10 ], "hard_negtiave_start_step": [ 2000 ], "train_span_start_step": 0, "sub_ctx_len": 0, "use_all_neg": true, "seed": 77, "fp16": true, "n_workers": 4, "pin_mem": true, "rank": 0, "tasks": "tvr", "drop_sub_prob": 0 }

[1,0]<stderr>:12/13/2021 10:13:52 - INFO - model.model - Decoder Transformer config: None [1,0]<stderr>:12/13/2021 10:13:55 - INFO - model.modeling_utils - Weights of HeroForVcmr not initialized from pretrained model: ['v_encoder.fom_output.linear_1.weight', 'v_encoder.fom_output.linear_1.bias', 'v_encoder.fom_output.LayerNorm.weight', 'v_encoder.fom_output.LayerNorm.bias', 'v_encoder.fom_output.linear_2.weight', 'v_encoder.fom_output.linear_2.bias'] [1,0]<stderr>:12/13/2021 10:13:55 - INFO - model.modeling_utils - Weights from pretrained model not used in HeroForVcmr: ['vocab_padded', 'v_encoder.fr_output.linear_1.weight', 'v_encoder.fr_output.linear_1.bias', 'v_encoder.fr_output.LayerNorm.weight', 'v_encoder.fr_output.LayerNorm.bias', 'v_encoder.fr_output.linear_2.weight', 'v_encoder.fr_output.linear_2.bias', 'v_encoder.itm_clip_transform.linear_1.weight', 'v_encoder.itm_clip_transform.linear_1.bias', 'v_encoder.itm_clip_transform.LayerNorm.weight', 'v_encoder.itm_clip_transform.LayerNorm.bias', 'v_encoder.itm_clip_transform.linear_2.weight', 'v_encoder.itm_clip_transform.linear_2.bias', 'v_encoder.itm_sub_transform.linear_1.weight', 'v_encoder.itm_sub_transform.linear_1.bias', 'v_encoder.itm_sub_transform.LayerNorm.weight', 'v_encoder.itm_sub_transform.LayerNorm.bias', 'v_encoder.itm_sub_transform.linear_2.weight', 'v_encoder.itm_sub_transform.linear_2.bias'] [1,0]<stdout>:Selected optimization level O2: FP16 training with FP32 batchnorm and FP32 master weights. [1,0]<stdout>: [1,0]<stdout>:Defaults for this optimization level are: [1,0]<stdout>:enabled : True [1,0]<stdout>:opt_level : O2 [1,0]<stdout>:cast_model_type : torch.float16 [1,0]<stdout>:patch_torch_functions : False [1,0]<stdout>:keep_batchnorm_fp32 : True [1,0]<stdout>:master_weights : True [1,0]<stdout>:loss_scale : dynamic [1,0]<stdout>:Processing user overrides (additional kwargs that are not None)... 
[1,0]<stdout>:After processing overrides, optimization options are: [1,0]<stdout>:enabled : True [1,0]<stdout>:opt_level : O2 [1,0]<stdout>:cast_model_type : torch.float16 [1,0]<stdout>:patch_torch_functions : False [1,0]<stdout>:keep_batchnorm_fp32 : True [1,0]<stdout>:master_weights : True [1,0]<stdout>:loss_scale : dynamic [1,0]<stdout>: [1,0]<stdout>: [1,0]<stdout>:vars(opts)= {'model_config': 'config/hero_finetune.json', 'checkpoint': '/pretrain/hero-tv-ht100.pt', 'train_batch_size': 32, 'val_batch_size': 20, 'gradient_accumulation_steps': 2, 'learning_rate': 0.0001, 'valid_steps': 200, 'save_steps': 200, 'optim': 'adamw', 'betas': [0.9, 0.98], 'dropout': 0.1, 'weight_decay': 0.01, 'grad_norm': 1.0, 'warmup_steps': 500, 'lr_mul': 1.0, 'num_train_steps': 5000, 'output_dir': '/storage/tvr_default', 'sub_ctx_len': 0, 'max_clip_len': 100, 'max_txt_len': 60, 'vfeat_version': 'resnet_slowfast', 'vfeat_interval': 1.5, 'compressed_db': False, 'seed': 77, 'n_workers': 4, 'pin_mem': True, 'fp16': True, 'task': 'tvr', 'vcmr_eval_video_batch_size': 50, 'vcmr_eval_q_batch_size': 80, 'drop_svmr_prob': 0.8, 'lw_neg_q': 8.0, 'lw_neg_ctx': 8.0, 'lw_st_ed': 0.01, 'ranking_loss_type': 'hinge', 'margin': 0.1, 'hard_pool_size': [20], 'hard_neg_weights': [10], 'hard_negtiave_start_step': [2000], 'train_span_start_step': 0, 'use_all_neg': True, 'eval_with_query_type': True, 'max_before_nms': 200, 'max_after_nms': 100, 'distributed_eval': True, 'nms_thd': 0.5, 'q2c_alpha': 20, 'max_vcmr_video': 100, 'full_eval_tasks': ['VCMR', 'SVMR', 'VR'], 'min_pred_l': 2, 'max_pred_l': 16, 'sub_txt_db': '/txt/tv_subtitles.db', 'vfeat_db': '/video/tv', 'train_query_txt_db': '/txt/tvr_train.db', 'val_query_txt_db': '/txt/tvr_val.db', 'test_query_txt_db': None, 'vcmr_eval_batch_size': 80, 'rank': 0, 'tasks': 'tvr', 'drop_sub_prob': 0, 'n_gpu': 1} [1,0]<stdout>: [1,0]<stdout>: [1,0]<stdout>:restore_opts= {'model_config': 'config/hero_finetune.json', 'checkpoint': '/pretrain/hero-tv-ht100.pt', 
'train_batch_size': 32, 'val_batch_size': 20, 'gradient_accumulation_steps': 2, 'learning_rate': 0.0001, 'valid_steps': 200, 'save_steps': 200, 'optim': 'adamw', 'betas': [0.9, 0.98], 'dropout': 0.1, 'weight_decay': 0.01, 'grad_norm': 1.0, 'warmup_steps': 500, 'lr_mul': 1.0, 'num_train_steps': 5000, 'output_dir': '/storage/tvr_default', 'sub_ctx_len': 0, 'max_clip_len': 100, 'max_txt_len': 60, 'vfeat_version': 'resnet_slowfast', 'vfeat_interval': 1.5, 'compressed_db': False, 'seed': 77, 'n_workers': 4, 'pin_mem': True, 'fp16': True, 'task': 'tvr', 'vcmr_eval_video_batch_size': 50, 'vcmr_eval_q_batch_size': 80, 'drop_svmr_prob': 0.8, 'lw_neg_q': 8.0, 'lw_neg_ctx': 8.0, 'lw_st_ed': 0.01, 'ranking_loss_type': 'hinge', 'margin': 0.1, 'hard_pool_size': [20], 'hard_neg_weights': [10], 'hard_negtiave_start_step': [2000], 'train_span_start_step': 0, 'use_all_neg': True, 'eval_with_query_type': True, 'max_before_nms': 200, 'max_after_nms': 100, 'distributed_eval': True, 'nms_thd': 0.5, 'q2c_alpha': 20, 'max_vcmr_video': 100, 'full_eval_tasks': ['VCMR', 'SVMR', 'VR'], 'min_pred_l': 2, 'max_pred_l': 16, 'tasks': 'tvr', 'sub_txt_db': '/txt/tv_subtitles.db', 'vfeat_db': '/video/tv', 'train_query_txt_db': '/txt/tvr_train.db', 'val_query_txt_db': '/txt/tvr_val.db', 'drop_sub_prob': 0, 'vcmr_eval_batch_size': 80, 'rank': 0, 'n_gpu': 1, 'test_query_txt_db': None} [1,0]<stdout>: [1,0]<stdout>: [1,0]<stderr>:12/13/2021 10:13:56 - INFO - __main__ - Waiting on git info.... 
[1,0]<stderr>:12/13/2021 10:13:56 - INFO - __main__ - Git branch: master [1,0]<stderr>:12/13/2021 10:13:56 - INFO - __main__ - Git SHA: 32c1c523c7a9f547a29f14c8e33dec24ebd14156 0%| | 0/5000 [00:00<?, ?it/s][1,0]<stderr>:12/13/2021 10:13:56 - INFO - __main__ - ***** Running training with 1 GPUs ***** [1,0]<stderr>:12/13/2021 10:13:56 - INFO - __main__ - Batch size = 32 [1,0]<stderr>:12/13/2021 10:13:56 - INFO - __main__ - Accumulate steps = 2 [1,0]<stderr>:12/13/2021 10:13:56 - INFO - __main__ - Num steps = 5000 [1,0]<stderr>:Traceback (most recent call last): [1,0]<stderr>: File "train_vcmr.py", line 399, in <module> [1,0]<stderr>: main(args) [1,0]<stderr>: File "train_vcmr.py", line 214, in main [1,0]<stderr>: loss = model(batch, task=task, compute_loss=True) [1,0]<stderr>: File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 545, in __call__ [1,0]<stderr>: result = self.forward(*input, **kwargs) [1,0]<stderr>: File "/opt/conda/lib/python3.6/site-packages/apex/amp/_initialize.py", line 194, in new_fwd [1,0]<stderr>: **applier(kwargs, input_caster)) [1,0]<stderr>: File "/src/model/vcmr.py", line 33, in forward [1,0]<stderr>: batch, task='vsm', compute_loss=compute_loss) [1,0]<stderr>: File "/src/model/pretrain.py", line 65, in forward [1,0]<stderr>: frame_embeddings = self.v_encoder(batch, 'repr') [1,0]<stderr>: File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 545, in __call__ [1,0]<stderr>: result = self.forward(*input, **kwargs) [1,0]<stderr>: File "/src/model/model.py", line 144, in forward [1,0]<stderr>: return self.forward_repr(batch) [1,0]<stderr>: File "/src/model/model.py", line 196, in forward_repr [1,0]<stderr>: frame_outputs = self.f_encoder(batch, 'repr') [1,0]<stderr>: File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 545, in __call__ [1,0]<stderr>: result = self.forward(*input, **kwargs) [1,0]<stderr>: File "/src/model/encoder.py", line 311, in forward [1,0]<stderr>: 
img_masks=f_v_mask) [1,0]<stderr>: File "/src/model/encoder.py", line 344, in forward_repr [1,0]<stderr>: encoder_outputs = self.encoder(embedding_output, attention_mask) [1,0]<stderr>: File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 545, in __call__ [1,0]<stderr>: result = self.forward(*input, **kwargs) [1,0]<stderr>: File "/src/model/layers.py", line 311, in forward [1,0]<stderr>: hidden_states, extended_attention_mask, None) [1,0]<stderr>: File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 545, in __call__ [1,0]<stderr>: result = self.forward(*input, **kwargs) [1,0]<stderr>: File "/src/model/layers.py", line 266, in forward [1,0]<stderr>: hidden_states, attention_mask, head_mask) [1,0]<stderr>: File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 545, in __call__ [1,0]<stderr>: result = self.forward(*input, **kwargs) [1,0]<stderr>: File "/src/model/layers.py", line 218, in forward [1,0]<stderr>: self_outputs = self.self(input_tensor, attention_mask, head_mask) [1,0]<stderr>: File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 545, in __call__ [1,0]<stderr>: result = self.forward(*input, **kwargs) [1,0]<stderr>: File "/src/model/layers.py", line 127, in forward [1,0]<stderr>: mixed_value_layer = self.value(hidden_states) [1,0]<stderr>: File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 545, in __call__ [1,0]<stderr>: result = self.forward(*input, **kwargs) [1,0]<stderr>: File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/linear.py", line 87, in forward [1,0]<stderr>: return F.linear(input, self.weight, self.bias) [1,0]<stderr>: File "/opt/conda/lib/python3.6/site-packages/torch/nn/functional.py", line 1372, in linear [1,0]<stderr>: output = input.matmul(weight.t()) [1,0]<stderr>:RuntimeError: CUDA out of memory. 
Tried to allocate 40.00 MiB (GPU 0; 5.80 GiB total capacity; 4.45 GiB already allocated; 64.62 MiB free; 120.02 MiB cached) -------------------------------------------------------------------------- Primary job terminated normally, but 1 process returned a non-zero exit code. Per user-direction, the job has been aborted. -------------------------------------------------------------------------- -------------------------------------------------------------------------- mpirun detected that one or more processes exited with non-zero status, thus causing the job to be terminated. The first process to do so was: Process name: [[28779,1],0] Exit code: 1 --------------------------------------------------------------------------

@linjieli222 Hi, I just encountered an error in Quick Start Step 3 when running with 1 GPU:

# inside the container
CUDA_VISIBLE_DEVICES = 0
horovodrun -np 1 python train_vcmr.py --config config/train-tvr-8gpu.json
...
...
[1,0]<stderr>:12/13/2021 09:08:05 - INFO - model.model -        Decoder Transformer config: None
[1,0]<stderr>:12/13/2021 09:08:08 - INFO - model.modeling_utils -   Weights of HeroForVcmr not initialized from pretrained model: ['v_encoder.fom_output.linear_1.weight', 'v_encoder.fom_output.linear_1.bias', 'v_encoder.fom_output.LayerNorm.weight', 'v_encoder.fom_output.LayerNorm.bias', 'v_encoder.fom_output.linear_2.weight', 'v_encoder.fom_output.linear_2.bias']
[1,0]<stderr>:12/13/2021 09:08:08 - INFO - model.modeling_utils -   Weights from pretrained model not used in HeroForVcmr: ['vocab_padded', 'v_encoder.fr_output.linear_1.weight', 'v_encoder.fr_output.linear_1.bias', 'v_encoder.fr_output.LayerNorm.weight', 'v_encoder.fr_output.LayerNorm.bias', 'v_encoder.fr_output.linear_2.weight', 'v_encoder.fr_output.linear_2.bias', 'v_encoder.itm_clip_transform.linear_1.weight', 'v_encoder.itm_clip_transform.linear_1.bias', 'v_encoder.itm_clip_transform.LayerNorm.weight', 'v_encoder.itm_clip_transform.LayerNorm.bias', 'v_encoder.itm_clip_transform.linear_2.weight', 'v_encoder.itm_clip_transform.linear_2.bias', 'v_encoder.itm_sub_transform.linear_1.weight', 'v_encoder.itm_sub_transform.linear_1.bias', 'v_encoder.itm_sub_transform.LayerNorm.weight', 'v_encoder.itm_sub_transform.LayerNorm.bias', 'v_encoder.itm_sub_transform.linear_2.weight', 'v_encoder.itm_sub_transform.linear_2.bias']
[1,0]<stdout>:Selected optimization level O2:  FP16 training with FP32 batchnorm and FP32 master weights.
[1,0]<stdout>:
[1,0]<stdout>:Defaults for this optimization level are:
[1,0]<stdout>:enabled                : True
[1,0]<stdout>:opt_level              : O2
[1,0]<stdout>:cast_model_type        : torch.float16
[1,0]<stdout>:patch_torch_functions  : False
[1,0]<stdout>:keep_batchnorm_fp32    : True
[1,0]<stdout>:master_weights         : True
[1,0]<stdout>:loss_scale             : dynamic
[1,0]<stdout>:Processing user overrides (additional kwargs that are not None)...
[1,0]<stdout>:After processing overrides, optimization options are:
[1,0]<stdout>:enabled                : True
[1,0]<stdout>:opt_level              : O2
[1,0]<stdout>:cast_model_type        : torch.float16
[1,0]<stdout>:patch_torch_functions  : False
[1,0]<stdout>:keep_batchnorm_fp32    : True
[1,0]<stdout>:master_weights         : True
[1,0]<stdout>:loss_scale             : dynamic
[1,0]<stderr>:Traceback (most recent call last):
[1,0]<stderr>:  File "train_vcmr.py", line 399, in <module>
[1,0]<stderr>:    main(args)
[1,0]<stderr>:  File "train_vcmr.py", line 161, in main
[1,0]<stderr>:    restorer = TrainingRestorer(opts, model, optimizer)
[1,0]<stderr>:  File "/src/utils/save.py", line 141, in __init__
[1,0]<stderr>:    assert vars(opts) == restore_opts
[1,0]<stderr>:AssertionError
--------------------------------------------------------------------------
Primary job  terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:

  Process name: [[30056,1],0]
  Exit code:    1
--------------------------------------------------------------------------

The assertion seems to fail because vars(opts) and restore_opts differ:

vars(opts)= {'model_config': 'config/hero_finetune.json', 'checkpoint': '/pretrain/hero-tv-ht100.pt', 'train_batch_size': 32, 'val_batch_size': 20, 'gradient_accumulation_steps': 2, 'learning_rate': 0.0001, 'valid_steps': 200, 'save_steps': 200, 'optim': 'adamw', 'betas': [
        0.9,
        0.98
    ], 'dropout': 0.1, 'weight_decay': 0.01, 'grad_norm': 1.0, 'warmup_steps': 500, 'lr_mul': 1.0, 'num_train_steps': 5000, 'output_dir': '/storage/tvr_default', 'sub_ctx_len': 0, 'max_clip_len': 100, 'max_txt_len': 60, 'vfeat_version': 'resnet_slowfast', 'vfeat_interval': 1.5, 'compressed_db': False, 'seed': 77, 'n_workers': 4, 'pin_mem': True, 'fp16': True, 'task': 'tvr', 'vcmr_eval_video_batch_size': 50, 'vcmr_eval_q_batch_size': 80, 'drop_svmr_prob': 0.8, 'lw_neg_q': 8.0, 'lw_neg_ctx': 8.0, 'lw_st_ed': 0.01, 'ranking_loss_type': 'hinge', 'margin': 0.1, 'hard_pool_size': [
        20
    ], 'hard_neg_weights': [
        10
    ], 'hard_negtiave_start_step': [
        2000
    ], 'train_span_start_step': 0, 'use_all_neg': True, 'eval_with_query_type': True, 'max_before_nms': 200, 'max_after_nms': 100, 'distributed_eval': True, 'nms_thd': 0.5, 'q2c_alpha': 20, 'max_vcmr_video': 100, 'full_eval_tasks': ['VCMR', 'SVMR', 'VR'
    ], 'min_pred_l': 2, 'max_pred_l': 16, 'sub_txt_db': '/txt/tv_subtitles.db', 'vfeat_db': '/video/tv', 'train_query_txt_db': '/txt/tvr_train.db', 'val_query_txt_db': '/txt/tvr_val.db', 'test_query_txt_db': None, 'vcmr_eval_batch_size': 80, 'rank': 0, 'n_gpu': 1
}

restore_opts= {'model_config': 'config/hero.json', 'checkpoint': '/pretrain/hero-tv-ht100.pt', 'train_batch_size': 32, 'val_batch_size': 20, 'gradient_accumulation_steps': 2, 'learning_rate': 0.0001, 'valid_steps': 200, 'save_steps': 200, 'optim': 'adamw', 'betas': [
        0.9,
        0.98
    ], 'dropout': 0.1, 'weight_decay': 0.01, 'grad_norm': 1.0, 'warmup_steps': 500, 'lr_mul': 1.0, 'num_train_steps': 5000, 'output_dir': '/storage/linjie_saved_results/release_debug/tvr_default', 'sub_ctx_len': 0, 'max_clip_len': 100, 'max_txt_len': 60, 'vfeat_version': 'resnet_slowfast', 'vfeat_interval': 1.5, 'compressed_db': False, 'seed': 77, 'n_workers': 4, 'pin_mem': True, 'fp16': True, 'task': 'tvr', 'vcmr_eval_video_batch_size': 50, 'vcmr_eval_q_batch_size': 80, 'drop_svmr_prob': 0.8, 'lw_neg_q': 8.0, 'lw_neg_ctx': 8.0, 'lw_st_ed': 0.01, 'ranking_loss_type': 'hinge', 'margin': 0.1, 'hard_pool_size': [
        20
    ], 'hard_neg_weights': [
        10
    ], 'hard_negtiave_start_step': [
        2000
    ], 'train_span_start_step': 0, 'use_all_neg': True, 'eval_with_query_type': True, 'max_before_nms': 200, 'max_after_nms': 100, 'distributed_eval': True, 'nms_thd': 0.5, 'q2c_alpha': 20, 'max_vcmr_video': 100, 'full_eval_tasks': ['VCMR', 'SVMR', 'VR'
    ], 'min_pred_l': 2, 'max_pred_l': 16, 'tasks': 'tvr', 'sub_txt_db': '/txt/tv_subtitles.db', 'vfeat_db': '/video/tv', 'train_query_txt_db': '/txt/tvr_train.db', 'val_query_txt_db': '/txt/tvr_val.db', 'drop_sub_prob': 0, 'vcmr_eval_batch_size': 80, 'rank': 0, 'n_gpu': 8
}

I then edited two files, $PATH_TO_STORAGE/finetune/tvr_default/log/hps.json and config/train-tvr-8gpu.json, so that the two sets of options match and the assertion passes:

# store_temp/finetune/tvr_default/log/hps.json
{
    "model_config": "config/hero_finetune.json",
    "checkpoint": "/pretrain/hero-tv-ht100.pt",
    "train_batch_size": 32,
    "val_batch_size": 20,
    "gradient_accumulation_steps": 2,
    "learning_rate": 0.0001,
    "valid_steps": 200,
    "save_steps": 200,
    "optim": "adamw",
    "betas": [
        0.9,
        0.98
    ],
    "dropout": 0.1,
    "weight_decay": 0.01,
    "grad_norm": 1.0,
    "warmup_steps": 500,
    "lr_mul": 1.0,
    "num_train_steps": 5000,
    "output_dir": "/storage/tvr_default",
    "sub_ctx_len": 0,
    "max_clip_len": 100,
    "max_txt_len": 60,
    "vfeat_version": "resnet_slowfast",
    "vfeat_interval": 1.5,
    "compressed_db": false,
    "seed": 77,
    "n_workers": 4,
    "pin_mem": true,
    "fp16": true,
    "task": "tvr",
    "vcmr_eval_video_batch_size": 50,
    "vcmr_eval_q_batch_size": 80,
    "drop_svmr_prob": 0.8,
    "lw_neg_q": 8.0,
    "lw_neg_ctx": 8.0,
    "lw_st_ed": 0.01,
    "ranking_loss_type": "hinge",
    "margin": 0.1,
    "hard_pool_size": [
        20
    ],
    "hard_neg_weights": [
        10
    ],
    "hard_negtiave_start_step": [
        2000
    ],
    "train_span_start_step": 0,
    "use_all_neg": true,
    "eval_with_query_type": true,
    "max_before_nms": 200,
    "max_after_nms": 100,
    "distributed_eval": true,
    "nms_thd": 0.5,
    "q2c_alpha": 20,
    "max_vcmr_video": 100,
    "full_eval_tasks": [
        "VCMR",
        "SVMR",
        "VR"
    ],
    "min_pred_l": 2,
    "max_pred_l": 16,
    "sub_txt_db": "/txt/tv_subtitles.db",
    "vfeat_db": "/video/tv",
    "train_query_txt_db": "/txt/tvr_train.db",
    "val_query_txt_db": "/txt/tvr_val.db",
    "test_query_txt_db": null,
    "vcmr_eval_batch_size": 80,
    "rank": 0,
    "tasks": "tvr",
    "drop_sub_prob": 0,
    "n_gpu": 1
}

# config/train-tvr-8gpu.json
{
    "task": "tvr",
    "sub_txt_db": "/txt/tv_subtitles.db",
    "vfeat_db": "/video/tv",
    "train_query_txt_db": "/txt/tvr_train.db",
    "val_query_txt_db": "/txt/tvr_val.db",
    "test_query_txt_db": null,
    "compressed_db": false,
    "model_config": "config/hero_finetune.json",
    "checkpoint": "/pretrain/hero-tv-ht100.pt",
    "output_dir": "/storage/tvr_default",
    "eval_with_query_type": true,
    "max_before_nms": 200,
    "max_after_nms": 100,
    "distributed_eval": true,
    "nms_thd": 0.5,
    "q2c_alpha": 20,
    "max_vcmr_video": 100,
    "full_eval_tasks": [
        "VCMR",
        "SVMR",
        "VR"
    ],
    "max_clip_len": 100,
    "max_txt_len": 60,
    "vfeat_version": "resnet_slowfast",
    "vfeat_interval": 1.5,
    "min_pred_l": 2,
    "max_pred_l": 16,
    "drop_svmr_prob": 0.8,
    "train_batch_size": 32,
    "val_batch_size": 20,
    "vcmr_eval_video_batch_size": 50,
    "vcmr_eval_batch_size": 80,
    "gradient_accumulation_steps":2,
    "learning_rate": 1e-04,
    "valid_steps": 200,
    "save_steps": 200,
    "num_train_steps": 5000,
    "optim": "adamw",
    "betas": [
        0.9,
        0.98
    ],
    "dropout": 0.1,
    "weight_decay": 0.01,
    "grad_norm": 1.0,
    "warmup_steps": 500,
    "lw_neg_q": 8.0,
    "lw_neg_ctx": 8.0,
    "lw_st_ed": 0.01,
    "ranking_loss_type": "hinge",
    "margin": 0.1,
    "hard_pool_size": [
        20
    ],
    "hard_neg_weights": [
        10
    ],
    "hard_negtiave_start_step": [
        2000
    ],
    "train_span_start_step": 0,
    "sub_ctx_len": 0,
    "use_all_neg": true,
    "seed": 77,
    "fp16": true,
    "n_workers": 4,
    "pin_mem": true,
    "rank": 0,
    "tasks": "tvr",
    "drop_sub_prob": 0
}

Is this the correct way to fix this error?

Can you highlight the changes?

linjieli222 / HERO

An error during finetuning for the TVR task #37