Input dimensionality error in training on flow images with ResNet3dCSN (Recognizer3D)

androbaza commented 3 years ago

I am training on a custom dataset from which I extracted optical-flow frames. I trained on those frames with TANet (which uses Recognizer2D), and it worked well. Now I am using the config from ./configs/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py, and the full config is as follows:

Config:
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='ResNet3dCSN',
        pretrained2d=False,
        pretrained=
        'https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r152_ig65m_20200807-771c4135.pth',
        depth=152,
        with_pool2=False,
        bottleneck_mode='ir',
        norm_eval=True,
        zero_init_residual=False,
        bn_frozen=True),
    cls_head=dict(
        type='I3DHead',
        num_classes=7,
        in_channels=2048,
        spatial_type='avg',
        dropout_ratio=0.5,
        init_std=0.01),
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
dataset_type = 'RawframeDataset'
data_root = 'data/childact_rawframe/train/'
data_root_val = 'data/childact_rawframe/val/'
ann_file_train = 'data/childact_rawframe/childact_train_rawframe.txt'
ann_file_val = 'data/childact_rawframe/childact_val_rawframe.txt'
ann_file_test = 'data/childact_rawframe/childact_test_rawframe.txt'
img_norm_cfg = dict(mean=[128, 128], std=[128, 128])
train_pipeline = [
    dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='RandomResizedCrop'),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Normalize', mean=[128, 128], std=[128, 128]),
    dict(type='FormatShape', input_format='NCHW_Flow'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=32,
        frame_interval=2,
        num_clips=1,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Flip', flip_ratio=0),
    dict(type='Normalize', mean=[128, 128], std=[128, 128]),
    dict(type='FormatShape', input_format='NCHW_Flow'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=32,
        frame_interval=2,
        num_clips=10,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='ThreeCrop', crop_size=256),
    dict(type='Flip', flip_ratio=0),
    dict(type='Normalize', mean=[128, 128], std=[128, 128]),
    dict(type='FormatShape', input_format='NCHW_Flow'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
data = dict(
    videos_per_gpu=3,
    workers_per_gpu=4,
    train=dict(
        type='RawframeDataset',
        ann_file='data/childact_rawframe/childact_train_rawframe.txt',
        data_prefix='data/childact_rawframe/train/',
        pipeline=[
            dict(
                type='SampleFrames',
                clip_len=32,
                frame_interval=2,
                num_clips=1),
            dict(type='RawFrameDecode'),
            dict(type='Resize', scale=(-1, 256)),
            dict(type='RandomResizedCrop'),
            dict(type='Resize', scale=(224, 224), keep_ratio=False),
            dict(type='Normalize', mean=[128, 128], std=[128, 128]),
            dict(type='FormatShape', input_format='NCHW_Flow'),
            dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
            dict(type='ToTensor', keys=['imgs', 'label'])
        ],
        modality='Flow',
        start_index=0,
        filename_tmpl='flow_{}_{:05d}.jpg'),
    val=dict(
        type='RawframeDataset',
        ann_file='data/childact_rawframe/childact_val_rawframe.txt',
        data_prefix='data/childact_rawframe/val/',
        pipeline=[
            dict(
                type='SampleFrames',
                clip_len=32,
                frame_interval=2,
                num_clips=1,
                test_mode=True),
            dict(type='RawFrameDecode'),
            dict(type='Resize', scale=(-1, 256)),
            dict(type='CenterCrop', crop_size=224),
            dict(type='Flip', flip_ratio=0),
            dict(type='Normalize', mean=[128, 128], std=[128, 128]),
            dict(type='FormatShape', input_format='NCHW_Flow'),
            dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
            dict(type='ToTensor', keys=['imgs'])
        ],
        modality='Flow',
        start_index=0,
        filename_tmpl='flow_{}_{:05d}.jpg'),
    test=dict(
        type='RawframeDataset',
        ann_file='data/childact_rawframe/childact_test_rawframe.txt',
        data_prefix='data/childact_rawframe/test/',
        pipeline=[
            dict(
                type='SampleFrames',
                clip_len=32,
                frame_interval=2,
                num_clips=10,
                test_mode=True),
            dict(type='RawFrameDecode'),
            dict(type='Resize', scale=(-1, 256)),
            dict(type='ThreeCrop', crop_size=256),
            dict(type='Flip', flip_ratio=0),
            dict(type='Normalize', mean=[128, 128], std=[128, 128]),
            dict(type='FormatShape', input_format='NCHW_Flow'),
            dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
            dict(type='ToTensor', keys=['imgs'])
        ],
        modality='Flow',
        start_index=0,
        filename_tmpl='flow_{}_{:05d}.jpg'))
optimizer = dict(type='SGD', lr=0.000125, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
lr_config = dict(
    policy='cyclic',
    target_ratio=(10, 1e-05),
    cyclic_times=1,
    step_ratio_up=0.4)
total_epochs = 51
checkpoint_config = dict(interval=12)
evaluation = dict(
    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
log_config = dict(
    interval=100,
    hooks=[dict(type='TextLoggerHook'),
           dict(type='TensorboardLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './childact-checkpoints/childact-CSN'
load_from = 'checkpoints/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb_20200812-9037a758.pth'
resume_from = None
workflow = [('train', 1)]
find_unused_parameters = True
omnisource = False
momentum_config = dict(
    policy='cyclic',
    target_ratio=(0.8947368421052632, 1),
    cyclic_times=1,
    step_ratio_up=0.4)
seed = 42
gpu_ids = range(0, 1)
output_config = dict(out='./childact-checkpoints/childact-CSN/results.json')

This results in the following error when I call train_model():

2021-03-29 21:27:53,517 - mmaction - INFO - load model from: https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r152_ig65m_20200807-771c4135.pth
2021-03-29 21:27:53,518 - mmaction - INFO - Use load_from_http loader
2021-03-29 21:27:53,751 - mmaction - INFO - load checkpoint from checkpoints/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb_20200812-9037a758.pth
2021-03-29 21:27:53,752 - mmaction - INFO - Use load_from_local loader
2021-03-29 21:27:53,913 - mmaction - WARNING - The model and loaded state dict do not match exactly

size mismatch for cls_head.fc_cls.weight: copying a param with shape torch.Size([400, 2048]) from checkpoint, the shape in current model is torch.Size([7, 2048]).
size mismatch for cls_head.fc_cls.bias: copying a param with shape torch.Size([400]) from checkpoint, the shape in current model is torch.Size([7]).
2021-03-29 21:27:53,915 - mmaction - INFO - Start running, host: actrec@actrec-HP-Z4-G4-Workstation, work_dir: /home/actrec/.virtualenvs/mmaction/mmaction2/childact-checkpoints/childact-CSN
2021-03-29 21:27:53,916 - mmaction - INFO - workflow: [('train', 1)], max: 51 epochs

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-103-be92e24cff32> in <module>
     18 # Create work_dir
     19 mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
---> 20 train_model(model, datasets, cfg, distributed=False, validate=True)

~/.virtualenvs/mmaction/mmaction2/mmaction/apis/train.py in train_model(model, dataset, cfg, distributed, validate, timestamp, meta)
    154     if cfg.omnisource:
    155         runner_kwargs = dict(train_ratio=train_ratio)
--> 156     runner.run(data_loaders, cfg.workflow, cfg.total_epochs, **runner_kwargs)

~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/runner/epoch_based_runner.py in run(self, data_loaders, workflow, max_epochs, **kwargs)
    123                     if mode == 'train' and self.epoch >= self._max_epochs:
    124                         break
--> 125                     epoch_runner(data_loaders[i], **kwargs)
    126 
    127         time.sleep(1)  # wait for some hooks like loggers to finish

~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/runner/epoch_based_runner.py in train(self, data_loader, **kwargs)
     48             self._inner_iter = i
     49             self.call_hook('before_train_iter')
---> 50             self.run_iter(data_batch, train_mode=True)
     51             self.call_hook('after_train_iter')
     52             self._iter += 1

~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/runner/epoch_based_runner.py in run_iter(self, data_batch, train_mode, **kwargs)
     28         elif train_mode:
     29             outputs = self.model.train_step(data_batch, self.optimizer,
---> 30                                             **kwargs)
     31         else:
     32             outputs = self.model.val_step(data_batch, self.optimizer, **kwargs)

~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/parallel/data_parallel.py in train_step(self, *inputs, **kwargs)
     65 
     66         inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
---> 67         return self.module.train_step(*inputs[0], **kwargs[0])
     68 
     69     def val_step(self, *inputs, **kwargs):

~/.virtualenvs/mmaction/mmaction2/mmaction/models/recognizers/base.py in train_step(self, data_batch, optimizer, **kwargs)
    220             aux_info[item] = data_batch[item]
    221 
--> 222         losses = self(imgs, label, return_loss=True, **aux_info)
    223 
    224         loss, log_vars = self._parse_losses(losses)

~/.virtualenvs/mmaction/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

~/.virtualenvs/mmaction/mmaction2/mmaction/models/recognizers/base.py in forward(self, imgs, label, return_loss, **kwargs)
    182             if label is None:
    183                 raise ValueError('Label should not be None.')
--> 184             return self.forward_train(imgs, label, **kwargs)
    185 
    186         return self.forward_test(imgs, **kwargs)

~/.virtualenvs/mmaction/mmaction2/mmaction/models/recognizers/recognizer3d.py in forward_train(self, imgs, labels, **kwargs)
     14         losses = dict()
     15 
---> 16         x = self.extract_feat(imgs)
     17         if hasattr(self, 'neck'):
     18             x, loss_aux = self.neck(x, labels.squeeze())

~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/runner/fp16_utils.py in new_func(*args, **kwargs)
     82                                 'method of nn.Module')
     83             if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled):
---> 84                 return old_func(*args, **kwargs)
     85             # get the arg spec of the decorated method
     86             args_info = getfullargspec(old_func)

~/.virtualenvs/mmaction/mmaction2/mmaction/models/recognizers/base.py in extract_feat(self, imgs)
     85             torch.tensor: The extracted features.
     86         """
---> 87         x = self.backbone(imgs)
     88         return x
     89 

~/.virtualenvs/mmaction/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

~/.virtualenvs/mmaction/mmaction2/mmaction/models/backbones/resnet3d.py in forward(self, x)
    816             samples extracted by the backbone.
    817         """
--> 818         x = self.conv1(x)
    819         x = self.maxpool(x)
    820         outs = []

~/.virtualenvs/mmaction/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/cnn/bricks/conv_module.py in forward(self, x, activate, norm)
    191                 if self.with_explicit_padding:
    192                     x = self.padding_layer(x)
--> 193                 x = self.conv(x)
    194             elif layer == 'norm' and norm and self.with_norm:
    195                 x = self.norm(x)

~/.virtualenvs/mmaction/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/cnn/bricks/wrappers.py in forward(self, x)
     77                 return empty
     78 
---> 79         return super().forward(x)
     80 
     81 

~/.virtualenvs/mmaction/lib/python3.6/site-packages/torch/nn/modules/conv.py in forward(self, input)
    571                             self.dilation, self.groups)
    572         return F.conv3d(input, self.weight, self.bias, self.stride,
--> 573                         self.padding, self.dilation, self.groups)
    574 
    575 

RuntimeError: Expected 5-dimensional input for 5-dimensional weight [64, 3, 3, 7, 7], but got 4-dimensional input of size [3, 64, 224, 224] instead

I searched for the corresponding RuntimeError but have not found a solution that helps. I understand that the issue is related to the model's backbone, but I could not find what I have to add to the config to make it work with the flow files.

androbaza commented 3 years ago

I got a similar issue with ResNet3dSlowOnly, also under Recognizer3D. Config:

Config:
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='ResNet3dSlowOnly',
        depth=50,
        pretrained='torchvision://resnet50',
        lateral=False,
        out_indices=(2, 3),
        conv1_kernel=(1, 7, 7),
        conv1_stride_t=1,
        pool1_stride_t=1,
        inflate=(0, 0, 1, 1),
        norm_eval=False),
    neck=dict(
        type='TPN',
        in_channels=(1024, 2048),
        out_channels=1024,
        spatial_modulation_cfg=dict(
            in_channels=(1024, 2048), out_channels=2048),
        temporal_modulation_cfg=dict(downsample_scales=(8, 8)),
        upsample_cfg=dict(scale_factor=(1, 1, 1)),
        downsample_cfg=dict(downsample_scale=(1, 1, 1)),
        level_fusion_cfg=dict(
            in_channels=(1024, 1024),
            mid_channels=(1024, 1024),
            out_channels=2048,
            downsample_scales=((1, 1, 1), (1, 1, 1))),
        aux_head_cfg=dict(out_channels=400, loss_weight=0.5)),
    cls_head=dict(
        type='TPNHead',
        num_classes=7,
        in_channels=2048,
        spatial_type='avg',
        consensus=dict(type='AvgConsensus', dim=1),
        dropout_ratio=0.5,
        init_std=0.01),
    train_cfg=None,
    test_cfg=dict(average_clips='prob'))
checkpoint_config = dict(interval=12)
log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = 'checkpoints/tpn_imagenet_pretrained_slowonly_r50_8x8x1_150e_kinetics_rgb_20200923-52629684.pth'
resume_from = None
workflow = [('train', 1)]
dataset_type = 'RawframeDataset'
data_root = 'data/childact_rawframe/train/'
data_root_val = 'data/childact_rawframe/val/'
ann_file_train = 'data/childact_rawframe/childact_train_rawframe.txt'
ann_file_val = 'data/childact_rawframe/childact_val_rawframe.txt'
ann_file_test = 'data/childact_rawframe/childact_test_rawframe.txt'
img_norm_cfg = dict(mean=[128, 128], std=[128, 128])
train_pipeline = [
    dict(type='SampleFrames', clip_len=32, frame_interval=8, num_clips=1),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='RandomCrop', size=224),
    dict(type='Normalize', mean=[128, 128], std=[128, 128]),
    dict(type='FormatShape', input_format='NCHW_Flow'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=8,
        frame_interval=8,
        num_clips=1,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', mean=[128, 128], std=[128, 128]),
    dict(type='FormatShape', input_format='NCHW_Flow'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
    dict(
        type='SampleFrames',
        clip_len=8,
        frame_interval=8,
        num_clips=10,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='ThreeCrop', crop_size=256),
    dict(type='Normalize', mean=[128, 128], std=[128, 128]),
    dict(type='FormatShape', input_format='NCHW_Flow'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
data = dict(
    videos_per_gpu=8,
    workers_per_gpu=8,
    train=dict(
        type='RawframeDataset',
        ann_file='data/childact_rawframe/childact_train_rawframe.txt',
        data_prefix='data/childact_rawframe/train/',
        pipeline=[
            dict(
                type='SampleFrames',
                clip_len=32,
                frame_interval=8,
                num_clips=1),
            dict(type='RawFrameDecode'),
            dict(type='Resize', scale=(-1, 256)),
            dict(type='RandomCrop', size=224),
            dict(type='Normalize', mean=[128, 128], std=[128, 128]),
            dict(type='FormatShape', input_format='NCHW_Flow'),
            dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
            dict(type='ToTensor', keys=['imgs', 'label'])
        ],
        modality='Flow',
        start_index=0,
        filename_tmpl='flow_{}_{:05d}.jpg'),
    val=dict(
        type='RawframeDataset',
        ann_file='data/childact_rawframe/childact_val_rawframe.txt',
        data_prefix='data/childact_rawframe/val/',
        pipeline=[
            dict(
                type='SampleFrames',
                clip_len=8,
                frame_interval=8,
                num_clips=1,
                test_mode=True),
            dict(type='RawFrameDecode'),
            dict(type='Resize', scale=(-1, 256)),
            dict(type='CenterCrop', crop_size=224),
            dict(type='Normalize', mean=[128, 128], std=[128, 128]),
            dict(type='FormatShape', input_format='NCHW_Flow'),
            dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
            dict(type='ToTensor', keys=['imgs'])
        ],
        modality='Flow',
        start_index=0,
        filename_tmpl='flow_{}_{:05d}.jpg'),
    test=dict(
        type='RawframeDataset',
        ann_file='data/childact_rawframe/childact_test_rawframe.txt',
        data_prefix='data/childact_rawframe/test/',
        pipeline=[
            dict(
                type='SampleFrames',
                clip_len=8,
                frame_interval=8,
                num_clips=10,
                test_mode=True),
            dict(type='RawFrameDecode'),
            dict(type='Resize', scale=(-1, 256)),
            dict(type='ThreeCrop', crop_size=256),
            dict(type='Normalize', mean=[128, 128], std=[128, 128]),
            dict(type='FormatShape', input_format='NCHW_Flow'),
            dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
            dict(type='ToTensor', keys=['imgs'])
        ],
        modality='Flow',
        start_index=0,
        filename_tmpl='flow_{}_{:05d}.jpg'))
evaluation = dict(
    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
optimizer = dict(
    type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True)
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
lr_config = dict(
    policy='cyclic',
    target_ratio=(10, 1e-05),
    cyclic_times=1,
    step_ratio_up=0.4)
total_epochs = 51
work_dir = './childact-checkpoints/childact-TPN'
omnisource = False
momentum_config = dict(
    policy='cyclic',
    target_ratio=(0.8947368421052632, 1),
    cyclic_times=1,
    step_ratio_up=0.4)
seed = 42
gpu_ids = range(0, 1)
output_config = dict(out='./childact-checkpoints/childact-TPN/results.json')

error:

2021-03-29 22:48:30,211 - mmaction - INFO - load model from: torchvision://resnet50
2021-03-29 22:48:30,398 - mmaction - INFO - These parameters in the 2d checkpoint are not loaded: {'fc.weight', 'fc.bias'}

Use load_from_torchvision loader

2021-03-29 22:48:33,227 - mmaction - INFO - load checkpoint from checkpoints/tpn_imagenet_pretrained_slowonly_r50_8x8x1_150e_kinetics_rgb_20200923-52629684.pth
2021-03-29 22:48:33,228 - mmaction - INFO - Use load_from_local loader
2021-03-29 22:48:33,475 - mmaction - WARNING - The model and loaded state dict do not match exactly

size mismatch for cls_head.fc_cls.weight: copying a param with shape torch.Size([400, 2048]) from checkpoint, the shape in current model is torch.Size([7, 2048]).
size mismatch for cls_head.fc_cls.bias: copying a param with shape torch.Size([400]) from checkpoint, the shape in current model is torch.Size([7]).
2021-03-29 22:48:33,486 - mmaction - INFO - Start running, host: actrec@actrec-HP-Z4-G4-Workstation, work_dir: /home/actrec/.virtualenvs/mmaction/mmaction2/childact-checkpoints/childact-TPN
2021-03-29 22:48:33,486 - mmaction - INFO - workflow: [('train', 1)], max: 51 epochs
/home/actrec/.virtualenvs/mmaction/mmaction2/mmaction/core/evaluation/eval_hooks.py:131: UserWarning: runner.meta is None. Creating a empty one.
  warnings.warn('runner.meta is None. Creating a empty one.')

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-4-be92e24cff32> in <module>
     18 # Create work_dir
     19 mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
---> 20 train_model(model, datasets, cfg, distributed=False, validate=True)

~/.virtualenvs/mmaction/mmaction2/mmaction/apis/train.py in train_model(model, dataset, cfg, distributed, validate, timestamp, meta)
    154     if cfg.omnisource:
    155         runner_kwargs = dict(train_ratio=train_ratio)
--> 156     runner.run(data_loaders, cfg.workflow, cfg.total_epochs, **runner_kwargs)

~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/runner/epoch_based_runner.py in run(self, data_loaders, workflow, max_epochs, **kwargs)
    123                     if mode == 'train' and self.epoch >= self._max_epochs:
    124                         break
--> 125                     epoch_runner(data_loaders[i], **kwargs)
    126 
    127         time.sleep(1)  # wait for some hooks like loggers to finish

~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/runner/epoch_based_runner.py in train(self, data_loader, **kwargs)
     48             self._inner_iter = i
     49             self.call_hook('before_train_iter')
---> 50             self.run_iter(data_batch, train_mode=True)
     51             self.call_hook('after_train_iter')
     52             self._iter += 1

~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/runner/epoch_based_runner.py in run_iter(self, data_batch, train_mode, **kwargs)
     28         elif train_mode:
     29             outputs = self.model.train_step(data_batch, self.optimizer,
---> 30                                             **kwargs)
     31         else:
     32             outputs = self.model.val_step(data_batch, self.optimizer, **kwargs)

~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/parallel/data_parallel.py in train_step(self, *inputs, **kwargs)
     65 
     66         inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
---> 67         return self.module.train_step(*inputs[0], **kwargs[0])
     68 
     69     def val_step(self, *inputs, **kwargs):

~/.virtualenvs/mmaction/mmaction2/mmaction/models/recognizers/base.py in train_step(self, data_batch, optimizer, **kwargs)
    220             aux_info[item] = data_batch[item]
    221 
--> 222         losses = self(imgs, label, return_loss=True, **aux_info)
    223 
    224         loss, log_vars = self._parse_losses(losses)

~/.virtualenvs/mmaction/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

~/.virtualenvs/mmaction/mmaction2/mmaction/models/recognizers/base.py in forward(self, imgs, label, return_loss, **kwargs)
    182             if label is None:
    183                 raise ValueError('Label should not be None.')
--> 184             return self.forward_train(imgs, label, **kwargs)
    185 
    186         return self.forward_test(imgs, **kwargs)

~/.virtualenvs/mmaction/mmaction2/mmaction/models/recognizers/recognizer3d.py in forward_train(self, imgs, labels, **kwargs)
     14         losses = dict()
     15 
---> 16         x = self.extract_feat(imgs)
     17         if hasattr(self, 'neck'):
     18             x, loss_aux = self.neck(x, labels.squeeze())

~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/runner/fp16_utils.py in new_func(*args, **kwargs)
     82                                 'method of nn.Module')
     83             if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled):
---> 84                 return old_func(*args, **kwargs)
     85             # get the arg spec of the decorated method
     86             args_info = getfullargspec(old_func)

~/.virtualenvs/mmaction/mmaction2/mmaction/models/recognizers/base.py in extract_feat(self, imgs)
     85             torch.tensor: The extracted features.
     86         """
---> 87         x = self.backbone(imgs)
     88         return x
     89 

~/.virtualenvs/mmaction/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

~/.virtualenvs/mmaction/mmaction2/mmaction/models/backbones/resnet3d.py in forward(self, x)
    816             samples extracted by the backbone.
    817         """
--> 818         x = self.conv1(x)
    819         x = self.maxpool(x)
    820         outs = []

~/.virtualenvs/mmaction/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/cnn/bricks/conv_module.py in forward(self, x, activate, norm)
    191                 if self.with_explicit_padding:
    192                     x = self.padding_layer(x)
--> 193                 x = self.conv(x)
    194             elif layer == 'norm' and norm and self.with_norm:
    195                 x = self.norm(x)

~/.virtualenvs/mmaction/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/cnn/bricks/wrappers.py in forward(self, x)
     77                 return empty
     78 
---> 79         return super().forward(x)
     80 
     81 

~/.virtualenvs/mmaction/lib/python3.6/site-packages/torch/nn/modules/conv.py in forward(self, input)
    571                             self.dilation, self.groups)
    572         return F.conv3d(input, self.weight, self.bias, self.stride,
--> 573                         self.padding, self.dilation, self.groups)
    574 
    575 

RuntimeError: Expected 5-dimensional input for 5-dimensional weight [64, 3, 1, 7, 7], but got 4-dimensional input of size [40, 64, 224, 224] instead

dreamerlin commented 3 years ago

The 'NCHW_Flow' input format is not compatible with ResNet3dCSN.

androbaza commented 3 years ago

The 'NCHW_Flow' input format is not compatible with ResNet3dCSN.

So there is no way to train Recognizer3D models on flow images?

irvingzhang0512 commented 3 years ago

slowonly_r50_4x16x1_256e_kinetics400_flow.py may help

androbaza commented 3 years ago

slowonly_r50_4x16x1_256e_kinetics400_flow.py may help

Couldn't believe that I overlooked that file! Thank you, will study it and close the issue after I solve the problem.

androbaza commented 3 years ago

The error was solved by changing the data pipeline according to the defaults in the config file of the network I wanted to use.

open-mmlab / mmaction2

Input dimensionality error in training on flow images with ResNet3dCSN (Recognizer3D) #770