open-mmlab / mmaction2

OpenMMLab's Next Generation Video Understanding Toolbox and Benchmark
https://mmaction2.readthedocs.io
Apache License 2.0

When I train the r2plus1d model, the following problem has been bothering me #1684

Closed: Strontia closed this issue 2 years ago

Strontia commented 2 years ago

When I set up r2plus1d_r34.py as below:

model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='ResNet2Plus1d',
        depth=34,
        pretrained=None,
        pretrained2d=False,
        norm_eval=True,
        conv_cfg=dict(type='Conv2plus1d'),
        # norm_cfg=dict(type='SyncBN', requires_grad=True, eps=1e-3),
        norm_cfg=dict(type='BN', requires_grad=False),
        conv1_kernel=(3, 7, 7),
        conv1_stride_t=1,
        pool1_stride_t=1,
        inflate=(1, 1, 1, 1),
        spatial_strides=(1, 2, 2, 2),
        temporal_strides=(1, 2, 2, 2),
        zero_init_residual=False),
    cls_head=dict(
        type='I3DHead',
        num_classes=400,
        in_channels=512,
        spatial_type='avg',
        dropout_ratio=0.5,
        init_std=0.01),
    # model training and testing settings
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))

the following error occurred:

Traceback (most recent call last):
  File "/home/r/code/mmacion/mmaction2/mycode/R2plus1_frame.py", line 57, in <module>
    train_model(model, datasets, cfg, distributed=False, validate=True)
  File "/home/r/code/mmacion/mmaction2/mmaction/apis/train.py", line 233, in train_model
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs, **runner_kwargs)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/mmcv/runner/epoch_based_runner.py", line 130, in run
    epoch_runner(data_loaders[i], **kwargs)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/mmcv/runner/epoch_based_runner.py", line 51, in train
    self.run_iter(data_batch, train_mode=True, **kwargs)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/mmcv/runner/epoch_based_runner.py", line 29, in run_iter
    outputs = self.model.train_step(data_batch, self.optimizer,
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/mmcv/parallel/data_parallel.py", line 75, in train_step
    return self.module.train_step(*inputs[0], **kwargs[0])
  File "/home/r/code/mmacion/mmaction2/mmaction/models/recognizers/base.py", line 301, in train_step
    losses = self(imgs, label, return_loss=True, **aux_info)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/r/code/mmacion/mmaction2/mmaction/models/recognizers/base.py", line 263, in forward
    return self.forward_train(imgs, label, **kwargs)
  File "/home/r/code/mmacion/mmaction2/mmaction/models/recognizers/recognizer3d.py", line 20, in forward_train
    x = self.extract_feat(imgs)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/mmcv/runner/fp16_utils.py", line 116, in new_func
    return old_func(*args, **kwargs)
  File "/home/r/code/mmacion/mmaction2/mmaction/models/recognizers/base.py", line 164, in extract_feat
    x = self.backbone(imgs)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/r/code/mmacion/mmaction2/mmaction/models/backbones/resnet2plus1d.py", line 43, in forward
    x = self.conv1(x)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/mmcv/cnn/bricks/conv_module.py", line 203, in forward
    x = self.norm(x)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/torch/nn/modules/batchnorm.py", line 135, in forward
    self._check_input_dim(input)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/torch/nn/modules/batchnorm.py", line 407, in _check_input_dim
    raise ValueError("expected 4D input (got {}D input)".format(input.dim()))
ValueError: expected 4D input (got 5D input)

Process finished with exit code 1
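
For what it's worth, this first error can be reproduced outside mmaction2: with norm_cfg=dict(type='BN'), mmcv's ConvModule builds an nn.BatchNorm2d, which expects 4D NCHW input, while a Recognizer3D backbone passes 5D NCTHW features. A minimal sketch (channel and spatial sizes are illustrative):

import torch
import torch.nn as nn

x = torch.randn(2, 64, 8, 56, 56)  # 5D NCTHW video features

bn2d = nn.BatchNorm2d(64)  # what norm_cfg=dict(type='BN') builds
try:
    bn2d(x)
except ValueError as e:
    print(e)  # expected 4D input (got 5D input)

bn3d = nn.BatchNorm3d(64)  # the 3D variant accepts 5D input
print(bn3d(x).shape)  # torch.Size([2, 64, 8, 56, 56])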

When I changed r2plus1d_r34.py to:

norm_eval=False,
norm_cfg=dict(type='SyncBN', requires_grad=True, eps=1e-3),

the error changed as well:

Traceback (most recent call last):
  File "/home/r/code/mmacion/mmaction2/mycode/R2plus1_frame.py", line 57, in <module>
    train_model(model, datasets, cfg, distributed=False, validate=True)
  File "/home/r/code/mmacion/mmaction2/mmaction/apis/train.py", line 233, in train_model
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs, **runner_kwargs)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/mmcv/runner/epoch_based_runner.py", line 130, in run
    epoch_runner(data_loaders[i], **kwargs)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/mmcv/runner/epoch_based_runner.py", line 51, in train
    self.run_iter(data_batch, train_mode=True, **kwargs)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/mmcv/runner/epoch_based_runner.py", line 29, in run_iter
    outputs = self.model.train_step(data_batch, self.optimizer,
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/mmcv/parallel/data_parallel.py", line 75, in train_step
    return self.module.train_step(*inputs[0], **kwargs[0])
  File "/home/r/code/mmacion/mmaction2/mmaction/models/recognizers/base.py", line 301, in train_step
    losses = self(imgs, label, return_loss=True, **aux_info)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/r/code/mmacion/mmaction2/mmaction/models/recognizers/base.py", line 263, in forward
    return self.forward_train(imgs, label, **kwargs)
  File "/home/r/code/mmacion/mmaction2/mmaction/models/recognizers/recognizer3d.py", line 20, in forward_train
    x = self.extract_feat(imgs)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/mmcv/runner/fp16_utils.py", line 116, in new_func
    return old_func(*args, **kwargs)
  File "/home/r/code/mmacion/mmaction2/mmaction/models/recognizers/base.py", line 164, in extract_feat
    x = self.backbone(imgs)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/r/code/mmacion/mmaction2/mmaction/models/backbones/resnet2plus1d.py", line 43, in forward
    x = self.conv1(x)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/mmcv/cnn/bricks/conv_module.py", line 203, in forward
    x = self.norm(x)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/torch/nn/modules/batchnorm.py", line 731, in forward
    world_size = torch.distributed.get_world_size(process_group)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 867, in get_world_size
    return _get_group_size(group)
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 325, in _get_group_size
    default_pg = _get_default_group()
  File "/home/r/miniconda3/envs/torch/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 429, in _get_default_group
    raise RuntimeError(
RuntimeError: Default process group has not been initialized, please make sure to call init_process_group.

Process finished with exit code 1
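
The second traceback comes from SyncBatchNorm querying torch.distributed for the world size in its forward pass, which only works after init_process_group has been called. A minimal sketch of the failure (requires a CUDA device; shapes are illustrative):

import torch
import torch.nn as nn

x = torch.randn(2, 64, 8, 56, 56, device='cuda')
sync_bn = nn.SyncBatchNorm(64).cuda()  # a fresh module is in training mode

try:
    sync_bn(x)  # asks the default process group for its world size
except RuntimeError as e:
    print(e)  # Default process group has not been initialized ...

# mmaction2 sets up the process group when launched through the distributed
# entry point, e.g. for one node with 8 GPUs:
#   bash tools/dist_train.sh CONFIG_FILE 8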

How can I fix this problem?

dreamerlin commented 2 years ago

It seems that your tensor shape is wrong. Check this
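
A quick way to verify what the dataloader actually emits (a hypothetical snippet; data_loader is whatever the training script builds):

batch = next(iter(data_loader))
print(batch['imgs'].shape)  # NCTHW collates to (N, num_clips, C, T, H, W)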

Strontia commented 2 years ago

My tensor shape is not wrong.

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
    dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='RandomResizedCrop'),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=32,
        frame_interval=2,
        num_clips=1,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=32,
        frame_interval=2,
        num_clips=10,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='ThreeCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
data = dict(
    videos_per_gpu=6,
    workers_per_gpu=2,
    test_dataloader=dict(videos_per_gpu=1),
    train=dict(
        type=dataset_type,
        ann_file=ann_file_train,
        data_prefix=data_root,
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=data_root_val,
        pipeline=val_pipeline,
        test_mode=True),
    test=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=data_root_val,
        pipeline=test_pipeline,
        test_mode=True))
# optimizer
optimizer = dict(
    type='SGD', lr=0.075, momentum=0.9,
    weight_decay=0.0001)  # this lr is used for 8 gpus

# runtime settings
work_dir = './work_dirs/r2plus1d_r34_3d_32x2x1_180e_kinetics400_rgb/'
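
For reference, a worked example of the batch shape this pipeline yields (values derived from the config above, not printed in the thread; the reshape mirrors what Recognizer3D does before calling the backbone):

import torch

batch_imgs = torch.empty(6, 1, 3, 32, 224, 224)  # collated imgs: (videos_per_gpu, num_clips, C, T, H, W)
backbone_in = batch_imgs.reshape((-1,) + batch_imgs.shape[2:])
print(backbone_in.shape)  # torch.Size([6, 3, 32, 224, 224]) -- 5D, so the norm layers must be 3D variants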

dreamerlin commented 2 years ago

  1. Try to use BN3d (see the sketch below)
  2. SyncBN needs distributed training
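
A minimal sketch of suggestion 1, mirroring the two-line fragment posted above (only the changed lines of the backbone config):

norm_eval=False,
norm_cfg=dict(type='BN3d', requires_grad=True),  # 3D BatchNorm for 5D NCTHW input
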
dralmadani commented 2 years ago

I tried changing SyncBN to BN3d, but it doesn't work for me when using cfg.model.backbone.norm_cfg.type = 'BN3d'.
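
In case it is useful, a sketch of applying the override to a loaded config object (Config is from mmcv; the config file path is hypothetical):

from mmcv import Config

cfg = Config.fromfile('my_r2plus1d_config.py')  # hypothetical path
# Replace the whole dict rather than mutating a single field of it:
cfg.model.backbone.norm_cfg = dict(type='BN3d', requires_grad=True)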