Closed jaeyun95 closed 4 years ago
I solved the torchvision problem, but this error remained!
The torchvision problem was solved like this: (1) Anaconda env: `conda install torchvision -c pytorch`; (2) pip env: `pip install torchvision`.
(vl-bert) ailab@ailab:~/vl-bert/VL-BERT$ ./scripts/dist_run_single.sh 2 vcr/train_end2end.py ./cfgs/vcr/base_q2a_4x16G_fp32.yaml ./vcr/saves/q2a/
/home/ailab/vl-bert/VL-BERT/vcr/../vcr/function/config.py:176: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.
exp_config = edict(yaml.load(f))
/home/ailab/vl-bert/VL-BERT/vcr/../vcr/function/config.py:176: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.
exp_config = edict(yaml.load(f))
Directory not created.
Directory not created.
Directory not created.
Namespace(cfg='./cfgs/vcr/base_q2a_4x16G_fp32.yaml', cudnn_off=False, dist=True, do_test=False, log_dir='./vcr/saves/q2a/./output/vl-bert/vcr/base_q2a_4x16G_fp32/vcr1images_train/tensorboard_logs', model_dir='./vcr/saves/q2a/', partial_pretrain=None, slurm=False)
Namespace(cfg='./cfgs/vcr/base_q2a_4x16G_fp32.yaml', cudnn_off=False, dist=True, do_test=False, log_dir='./vcr/saves/q2a/./output/vl-bert/vcr/base_q2a_4x16G_fp32/vcr1images_train/tensorboard_logs', model_dir='./vcr/saves/q2a/', partial_pretrain=None, slurm=False)
{'CHECKPOINT_FREQUENT': 1,
{'CHECKPOINT_FREQUENT': 1,
'DATASET': {'ADD_IMAGE_AS_A_BOX': True,
'DATASET': {'ADD_IMAGE_AS_A_BOX': True,
'APPEND_INDEX': False,
'APPEND_INDEX': False,
'BASIC_ALIGN': False,
'BASIC_ALIGN': False,
'CACHE_MODE': False,
'CACHE_MODE': False,
'DATASET': 'vcr',
'DATASET': 'vcr',
'DATASET_PATH': '/media/ailab/songyoungtak/vcr1',
'DATASET_PATH': '/media/ailab/songyoungtak/vcr1',
'IGNORE_DB_CACHE': True,
'IGNORE_DB_CACHE': True,
'LABEL_INDEX_IN_BATCH': 7,
'LABEL_INDEX_IN_BATCH': 7,
'MASK_SIZE': 14,
'MASK_SIZE': 14,
'ONLY_USE_RELEVANT_DETS': False,
'ONLY_USE_RELEVANT_DETS': False,
'QA2R_AUG': False,
'QA2R_AUG': False,
'QA2R_NOQ': False,
'QA2R_NOQ': False,
'ROOT_PATH': './',
'ROOT_PATH': './',
'TASK': 'Q2A',
'TASK': 'Q2A',
'TEST_ANNOTATION_FILE': 'test.jsonl',
'TEST_ANNOTATION_FILE': 'test.jsonl',
'TEST_IMAGE_SET': 'vcr1images',
'TEST_IMAGE_SET': 'vcr1images',
'TRAIN_ANNOTATION_FILE': 'train.jsonl',
'TRAIN_ANNOTATION_FILE': 'train.jsonl',
'TRAIN_IMAGE_SET': 'vcr1images',
'TRAIN_IMAGE_SET': 'vcr1images',
'VAL_ANNOTATION_FILE': 'val.jsonl',
'VAL_ANNOTATION_FILE': 'val.jsonl',
'VAL_IMAGE_SET': 'vcr1images',
'VAL_IMAGE_SET': 'vcr1images',
'ZIP_MODE': False},
'ZIP_MODE': False},
'GPUS': '0,1,2,3',
'GPUS': '0,1,2,3',
'LOG_FREQUENT': 100,
'LOG_FREQUENT': 100,
'MODEL_PREFIX': 'vl-bert_base_a_res101',
'MODEL_PREFIX': 'vl-bert_base_a_res101',
'MODULE': 'ResNetVLBERT',
'MODULE': 'ResNetVLBERT',
'NETWORK': {'ANSWER_FIRST': False,
'NETWORK': {'ANSWER_FIRST': False,
'ANS_LOSS_WEIGHT': 1.0,
'ANS_LOSS_WEIGHT': 1.0,
'BERT_ALIGN_ANSWER': True,
'BERT_ALIGN_ANSWER': True,
'BERT_ALIGN_QUESTION': True,
'BERT_ALIGN_QUESTION': True,
'BERT_FROZEN': False,
'BERT_FROZEN': False,
'BERT_MODEL_NAME': './model/pretrained_model/bert-base-uncased',
'BERT_MODEL_NAME': './model/pretrained_model/bert-base-uncased',
'BERT_PRETRAINED': '',
'BERT_PRETRAINED': '',
'BERT_PRETRAINED_EPOCH': 0,
'BERT_PRETRAINED_EPOCH': 0,
'BERT_USE_LAYER': -2,
'BERT_USE_LAYER': -2,
'BERT_WITH_MLM_LOSS': False,
'BERT_WITH_MLM_LOSS': False,
'BERT_WITH_NSP_LOSS': False,
'BERT_WITH_NSP_LOSS': False,
'BLIND': False,
'BLIND': False,
'CLASSIFIER_DROPOUT': 0.1,
'CLASSIFIER_DROPOUT': 0.1,
'CLASSIFIER_HIDDEN_SIZE': 1024,
'CLASSIFIER_HIDDEN_SIZE': 1024,
'CLASSIFIER_SIGMOID': True,
'CLASSIFIER_SIGMOID': True,
'CLASSIFIER_SIGMOID_LOSS_POSITIVE_WEIGHT': 1.0,
'CLASSIFIER_SIGMOID_LOSS_POSITIVE_WEIGHT': 1.0,
'CLASSIFIER_TYPE': '1fc',
'CLASSIFIER_TYPE': '1fc',
'CNN_LOSS_TOP': True,
'CNN_LOSS_TOP': True,
'CNN_LOSS_WEIGHT': 1.0,
'CNN_LOSS_WEIGHT': 1.0,
'CNN_REG_DROPOUT': 0.0,
'CNN_REG_DROPOUT': 0.0,
'ENABLE_CNN_REG_LOSS': True,
'ENABLE_CNN_REG_LOSS': True,
'FOR_MASK_VL_MODELING_PRETRAIN': False,
'FOR_MASK_VL_MODELING_PRETRAIN': False,
'IMAGE_C5_DILATED': True,
'IMAGE_C5_DILATED': True,
'IMAGE_FEAT_PRECOMPUTED': False,
'IMAGE_FEAT_PRECOMPUTED': False,
'IMAGE_FINAL_DIM': 768,
'IMAGE_FINAL_DIM': 768,
'IMAGE_FROZEN_BACKBONE_STAGES': [1, 2],
'IMAGE_FROZEN_BACKBONE_STAGES': [1, 2],
'IMAGE_FROZEN_BN': True,
'IMAGE_FROZEN_BN': True,
'IMAGE_NUM_LAYERS': 101,
'IMAGE_NUM_LAYERS': 101,
'IMAGE_PRETRAINED': './model/pretrained_model/resnet101-pt-vgbua',
'IMAGE_PRETRAINED': './model/pretrained_model/resnet101-pt-vgbua',
'IMAGE_PRETRAINED_EPOCH': 0,
'IMAGE_PRETRAINED_EPOCH': 0,
'IMAGE_SEMANTIC': False,
'IMAGE_SEMANTIC': False,
'IMAGE_STRIDE_IN_1x1': True,
'IMAGE_STRIDE_IN_1x1': True,
'LOAD_REL_HEAD': True,
'LOAD_REL_HEAD': True,
'NO_GROUNDING': False,
'NO_GROUNDING': False,
'NO_OBJ_ATTENTION': False,
'NO_OBJ_ATTENTION': False,
'OUTPUT_CONV5': False,
'OUTPUT_CONV5': False,
'PARTIAL_PRETRAIN': './model/pretrained_model/vl-bert-base-e2e.model',
'PARTIAL_PRETRAIN': './model/pretrained_model/vl-bert-base-e2e.model',
'PARTIAL_PRETRAIN_PREFIX_CHANGES': ['vlbert.mvrc_head.transform->cnn_loss_reg.0',
'PARTIAL_PRETRAIN_PREFIX_CHANGES': ['vlbert.mvrc_head.transform->cnn_loss_reg.0',
'module.vlbert.mvrc_head.transform->module.cnn_loss_reg.0',
'module.vlbert.mvrc_head.transform->module.cnn_loss_reg.0',
'module.vlbert->module.vlbert._module',
'module.vlbert->module.vlbert._module',
'vlbert->vlbert._module'],
'vlbert->vlbert._module'],
'PARTIAL_PRETRAIN_SEGMB_INIT': True,
'PARTIAL_PRETRAIN_SEGMB_INIT': True,
'PIXEL_MEANS': [102.9801, 115.9465, 122.7717],
'PIXEL_MEANS': [102.9801, 115.9465, 122.7717],
'PIXEL_STDS': [1.0, 1.0, 1.0],
'PIXEL_STDS': [1.0, 1.0, 1.0],
'QA_ONE_SENT': False,
'QA_ONE_SENT': False,
'VLBERT': {'attention_probs_dropout_prob': 0.1,
'VLBERT': {'attention_probs_dropout_prob': 0.1,
'hidden_act': 'gelu',
'hidden_act': 'gelu',
'hidden_dropout_prob': 0.1,
'hidden_dropout_prob': 0.1,
'hidden_size': 768,
'hidden_size': 768,
'initializer_range': 0.02,
'initializer_range': 0.02,
'input_size': 1280,
'input_size': 1280,
'input_transform_type': 1,
'input_transform_type': 1,
'intermediate_size': 3072,
'intermediate_size': 3072,
'max_position_embeddings': 512,
'max_position_embeddings': 512,
'num_attention_heads': 12,
'num_attention_heads': 12,
'num_hidden_layers': 12,
'num_hidden_layers': 12,
'obj_pos_id_relative': True,
'obj_pos_id_relative': True,
'object_word_embed_mode': 2,
'object_word_embed_mode': 2,
'position_padding_idx': -1,
'position_padding_idx': -1,
'type_vocab_size': 3,
'type_vocab_size': 3,
'visual_ln': True,
'visual_ln': True,
'visual_scale_object_init': 0.0,
'visual_scale_object_init': 0.0,
'visual_scale_text_init': 0.0,
'visual_scale_text_init': 0.0,
'visual_size': 768,
'visual_size': 768,
'vocab_size': 30522,
'vocab_size': 30522,
'with_pooler': True,
'with_pooler': True,
'word_embedding_frozen': False}},
'word_embedding_frozen': False}},
'NUM_WORKERS_PER_GPU': 4,
'NUM_WORKERS_PER_GPU': 4,
'OUTPUT_PATH': './vcr/saves/q2a/./output/vl-bert/vcr',
'OUTPUT_PATH': './vcr/saves/q2a/./output/vl-bert/vcr',
'RNG_SEED': 12345,
'RNG_SEED': 12345,
'SCALES': [600, 1200],
'SCALES': [600, 1200],
'TEST': {'BATCH_IMAGES': 4, 'FLIP_PROB': 0, 'SHUFFLE': False, 'TEST_EPOCH': 0},
'TEST': {'BATCH_IMAGES': 4, 'FLIP_PROB': 0, 'SHUFFLE': False, 'TEST_EPOCH': 0},
'TRAIN': {'ASPECT_GROUPING': False,
'TRAIN': {'ASPECT_GROUPING': False,
'AUTO_RESUME': True,
'AUTO_RESUME': True,
'BATCH_IMAGES': 4,
'BATCH_IMAGES': 4,
'BEGIN_EPOCH': 0,
'BEGIN_EPOCH': 0,
'CLIP_GRAD_NORM': 10,
'CLIP_GRAD_NORM': 10,
'END_EPOCH': 20,
'END_EPOCH': 20,
'FLIP_PROB': 0.5,
'FLIP_PROB': 0.5,
'FP16': False,
'FP16': False,
'FP16_LOSS_SCALE': 128.0,
'FP16_LOSS_SCALE': 128.0,
'GRAD_ACCUMULATE_STEPS': 4,
'GRAD_ACCUMULATE_STEPS': 4,
'LOSS_LOGGERS': [('ans_loss', 'AnsLoss'),
'LOSS_LOGGERS': [('ans_loss', 'AnsLoss'),
('cnn_regularization_loss', 'CNNRegLoss')],
('cnn_regularization_loss', 'CNNRegLoss')],
'LR': 7e-05,
'LR': 7e-05,
'LR_FACTOR': 0.1,
'LR_FACTOR': 0.1,
'LR_MULT': [],
'LR_MULT': [],
'LR_SCHEDULE': 'step',
'LR_SCHEDULE': 'step',
'LR_STEP': [14.0, 18.0],
'LR_STEP': [14.0, 18.0],
'MOMENTUM': 0.9,
'MOMENTUM': 0.9,
'OPTIMIZER': 'SGD',
'OPTIMIZER': 'SGD',
'RESUME': False,
'RESUME': False,
'SHUFFLE': True,
'SHUFFLE': True,
'VISUAL_SCALE_CLIP_GRAD_NORM': -1,
'VISUAL_SCALE_CLIP_GRAD_NORM': -1,
'VISUAL_SCALE_OBJECT_LR_MULT': 1.0,
'VISUAL_SCALE_OBJECT_LR_MULT': 1.0,
'VISUAL_SCALE_TEXT_LR_MULT': 1.0,
'VISUAL_SCALE_TEXT_LR_MULT': 1.0,
'WARMUP': True,
'WARMUP': True,
'WARMUP_FACTOR': 0.0,
'WARMUP_FACTOR': 0.0,
'WARMUP_METHOD': 'linear',
'WARMUP_METHOD': 'linear',
'WARMUP_STEPS': 1000,
'WARMUP_STEPS': 1000,
'WD': 0.0001},
'WD': 0.0001},
'VAL': {'BATCH_IMAGES': 4, 'FLIP_PROB': 0, 'SHUFFLE': False},
'VAL': {'BATCH_IMAGES': 4, 'FLIP_PROB': 0, 'SHUFFLE': False},
'VAL_FREQUENT': 1}
'VAL_FREQUENT': 1}
Traceback (most recent call last):
File "vcr/train_end2end.py", line 59, in <module>
main()
File "vcr/train_end2end.py", line 53, in main
rank, model = train_net(args, config)
File "/home/ailab/vl-bert/VL-BERT/vcr/../vcr/function/train.py", line 64, in train_net
model = eval(config.MODULE)(config)
File "/home/ailab/vl-bert/VL-BERT/vcr/../vcr/modules/resnet_vlbert_for_vcr.py", line 26, in __init__
enable_cnn_reg_loss=(self.enable_cnn_reg_loss and not self.cnn_loss_top))
File "/home/ailab/vl-bert/VL-BERT/vcr/../common/fast_rcnn.py", line 56, in __init__
expose_stages=[4], stride_in_1x1=self.stride_in_1x1)
File "/home/ailab/vl-bert/VL-BERT/vcr/../common/backbone/resnet/resnet.py", line 376, in resnet101
state_dict = torch.load(pretrained_model_path, map_location=lambda storage, loc: storage)
File "/home/ailab/anaconda3/envs/vl-bert/lib/python3.6/site-packages/torch/serialization.py", line 382, in load
Traceback (most recent call last):
File "vcr/train_end2end.py", line 59, in <module>
main()
File "vcr/train_end2end.py", line 53, in main
rank, model = train_net(args, config)
File "/home/ailab/vl-bert/VL-BERT/vcr/../vcr/function/train.py", line 64, in train_net
model = eval(config.MODULE)(config)
File "/home/ailab/vl-bert/VL-BERT/vcr/../vcr/modules/resnet_vlbert_for_vcr.py", line 26, in __init__
enable_cnn_reg_loss=(self.enable_cnn_reg_loss and not self.cnn_loss_top))
File "/home/ailab/vl-bert/VL-BERT/vcr/../common/fast_rcnn.py", line 56, in __init__
expose_stages=[4], stride_in_1x1=self.stride_in_1x1)
File "/home/ailab/vl-bert/VL-BERT/vcr/../common/backbone/resnet/resnet.py", line 376, in resnet101
state_dict = torch.load(pretrained_model_path, map_location=lambda storage, loc: storage)
File "/home/ailab/anaconda3/envs/vl-bert/lib/python3.6/site-packages/torch/serialization.py", line 382, in load
f = open(f, 'rb')
FileNotFoundError: [Errno 2] No such file or directory: './model/pretrained_model/resnet101-pt-vgbua-0000.model'
f = open(f, 'rb')
FileNotFoundError: [Errno 2] No such file or directory: './model/pretrained_model/resnet101-pt-vgbua-0000.model'
Traceback (most recent call last):
File "./scripts/launch.py", line 200, in <module>
main()
File "./scripts/launch.py", line 196, in main
cmd=process.args)
subprocess.CalledProcessError: Command '['/home/ailab/anaconda3/envs/vl-bert/bin/python', '-u', 'vcr/train_end2end.py', '--cfg', './cfgs/vcr/base_q2a_4x16G_fp32.yaml', '--model-dir', './vcr/saves/q2a/', '--dist']' returned non-zero exit status 1.
I solved it!!
But I have a new error about the tokenizer T^T
Traceback (most recent call last):
File "vcr/train_end2end.py", line 59, in <module>
main()
File "vcr/train_end2end.py", line 53, in main
rank, model = train_net(args, config)
File "/home/ailab/vl-bert/VL-BERT/vcr/../vcr/function/train.py", line 337, in train_net
gradient_accumulate_steps=config.TRAIN.GRAD_ACCUMULATE_STEPS)
File "/home/ailab/vl-bert/VL-BERT/vcr/../common/trainer.py", line 101, in train
for nbatch, batch in enumerate(train_loader):
File "/home/ailab/anaconda3/envs/vl-bert/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 582, in __next__
return self._process_next_batch(batch)
File "/home/ailab/anaconda3/envs/vl-bert/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 608, in _process_next_batch
raise batch.exc_type(batch.exc_msg)
AttributeError: Traceback (most recent call last):
File "/home/ailab/anaconda3/envs/vl-bert/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 99, in _worker_loop
samples = collate_fn([dataset[i] for i in batch_indices])
File "/home/ailab/anaconda3/envs/vl-bert/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 99, in <listcomp>
samples = collate_fn([dataset[i] for i in batch_indices])
File "/home/ailab/vl-bert/VL-BERT/vcr/../vcr/data/datasets/vcr.py", line 309, in __getitem__
non_obj_tag=non_obj_tag)
File "/home/ailab/vl-bert/VL-BERT/vcr/../vcr/data/datasets/vcr.py", line 243, in retokenize_and_convert_to_ids_with_tag
retokenized_tokens = self.tokenizer.tokenize(mixed_token)
AttributeError: 'NoneType' object has no attribute 'tokenize'
Traceback (most recent call last):
File "vcr/train_end2end.py", line 59, in <module>
main()
File "vcr/train_end2end.py", line 53, in main
rank, model = train_net(args, config)
File "/home/ailab/vl-bert/VL-BERT/vcr/../vcr/function/train.py", line 337, in train_net
gradient_accumulate_steps=config.TRAIN.GRAD_ACCUMULATE_STEPS)
File "/home/ailab/vl-bert/VL-BERT/vcr/../common/trainer.py", line 101, in train
for nbatch, batch in enumerate(train_loader):
File "/home/ailab/anaconda3/envs/vl-bert/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 582, in __next__
return self._process_next_batch(batch)
File "/home/ailab/anaconda3/envs/vl-bert/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 608, in _process_next_batch
raise batch.exc_type(batch.exc_msg)
AttributeError: Traceback (most recent call last):
File "/home/ailab/anaconda3/envs/vl-bert/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 99, in _worker_loop
samples = collate_fn([dataset[i] for i in batch_indices])
File "/home/ailab/anaconda3/envs/vl-bert/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 99, in <listcomp>
samples = collate_fn([dataset[i] for i in batch_indices])
File "/home/ailab/vl-bert/VL-BERT/vcr/../vcr/data/datasets/vcr.py", line 309, in __getitem__
non_obj_tag=non_obj_tag)
File "/home/ailab/vl-bert/VL-BERT/vcr/../vcr/data/datasets/vcr.py", line 243, in retokenize_and_convert_to_ids_with_tag
retokenized_tokens = self.tokenizer.tokenize(mixed_token)
AttributeError: 'NoneType' object has no attribute 'tokenize'
Traceback (most recent call last):
File "./scripts/launch.py", line 199, in <module>
main()
File "./scripts/launch.py", line 195, in main
raise subprocess.CalledProcessError(returncode=process.returncode,cmd=process.args)
subprocess.CalledProcessError: Command '['/home/ailab/anaconda3/envs/vl-bert/bin/python', '-u', 'vcr/train_end2end.py', '--cfg', './cfgs/vcr/base_q2a_4x16G_fp32.yaml', '--model-dir', './vcr/saves/q2a/', '--dist']' returned non-zero exit status 1.
Thank you!
Oh!! I solved all the problems!
Thanks!
I am wondering how you fixed the tokenizer error — could you please share the details?
Hi! I want to run VCR training,
but I got this error!
What should I do? Thank you!