Closed androbaza closed 3 years ago
I got a similar issue with ResNet3dSlowOnly, also using Recognizer3D.
Config:
model = dict(
type='Recognizer3D',
backbone=dict(
type='ResNet3dSlowOnly',
depth=50,
pretrained='torchvision://resnet50',
lateral=False,
out_indices=(2, 3),
conv1_kernel=(1, 7, 7),
conv1_stride_t=1,
pool1_stride_t=1,
inflate=(0, 0, 1, 1),
norm_eval=False),
neck=dict(
type='TPN',
in_channels=(1024, 2048),
out_channels=1024,
spatial_modulation_cfg=dict(
in_channels=(1024, 2048), out_channels=2048),
temporal_modulation_cfg=dict(downsample_scales=(8, 8)),
upsample_cfg=dict(scale_factor=(1, 1, 1)),
downsample_cfg=dict(downsample_scale=(1, 1, 1)),
level_fusion_cfg=dict(
in_channels=(1024, 1024),
mid_channels=(1024, 1024),
out_channels=2048,
downsample_scales=((1, 1, 1), (1, 1, 1))),
aux_head_cfg=dict(out_channels=400, loss_weight=0.5)),
cls_head=dict(
type='TPNHead',
num_classes=7,
in_channels=2048,
spatial_type='avg',
consensus=dict(type='AvgConsensus', dim=1),
dropout_ratio=0.5,
init_std=0.01),
train_cfg=None,
test_cfg=dict(average_clips='prob'))
checkpoint_config = dict(interval=12)
log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = 'checkpoints/tpn_imagenet_pretrained_slowonly_r50_8x8x1_150e_kinetics_rgb_20200923-52629684.pth'
resume_from = None
workflow = [('train', 1)]
dataset_type = 'RawframeDataset'
data_root = 'data/childact_rawframe/train/'
data_root_val = 'data/childact_rawframe/val/'
ann_file_train = 'data/childact_rawframe/childact_train_rawframe.txt'
ann_file_val = 'data/childact_rawframe/childact_val_rawframe.txt'
ann_file_test = 'data/childact_rawframe/childact_test_rawframe.txt'
img_norm_cfg = dict(mean=[128, 128], std=[128, 128])
train_pipeline = [
dict(type='SampleFrames', clip_len=32, frame_interval=8, num_clips=1),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomCrop', size=224),
dict(type='Normalize', mean=[128, 128], std=[128, 128]),
dict(type='FormatShape', input_format='NCHW_Flow'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=1,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', mean=[128, 128], std=[128, 128]),
dict(type='FormatShape', input_format='NCHW_Flow'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=10,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', mean=[128, 128], std=[128, 128]),
dict(type='FormatShape', input_format='NCHW_Flow'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='RawframeDataset',
ann_file='data/childact_rawframe/childact_train_rawframe.txt',
data_prefix='data/childact_rawframe/train/',
pipeline=[
dict(
type='SampleFrames',
clip_len=32,
frame_interval=8,
num_clips=1),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='RandomCrop', size=224),
dict(type='Normalize', mean=[128, 128], std=[128, 128]),
dict(type='FormatShape', input_format='NCHW_Flow'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
],
modality='Flow',
start_index=0,
filename_tmpl='flow_{}_{:05d}.jpg'),
val=dict(
type='RawframeDataset',
ann_file='data/childact_rawframe/childact_val_rawframe.txt',
data_prefix='data/childact_rawframe/val/',
pipeline=[
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=1,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', mean=[128, 128], std=[128, 128]),
dict(type='FormatShape', input_format='NCHW_Flow'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
],
modality='Flow',
start_index=0,
filename_tmpl='flow_{}_{:05d}.jpg'),
test=dict(
type='RawframeDataset',
ann_file='data/childact_rawframe/childact_test_rawframe.txt',
data_prefix='data/childact_rawframe/test/',
pipeline=[
dict(
type='SampleFrames',
clip_len=8,
frame_interval=8,
num_clips=10,
test_mode=True),
dict(type='RawFrameDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='ThreeCrop', crop_size=256),
dict(type='Normalize', mean=[128, 128], std=[128, 128]),
dict(type='FormatShape', input_format='NCHW_Flow'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
],
modality='Flow',
start_index=0,
filename_tmpl='flow_{}_{:05d}.jpg'))
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
optimizer = dict(
type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True)
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
lr_config = dict(
policy='cyclic',
target_ratio=(10, 1e-05),
cyclic_times=1,
step_ratio_up=0.4)
total_epochs = 51
work_dir = './childact-checkpoints/childact-TPN'
omnisource = False
momentum_config = dict(
policy='cyclic',
target_ratio=(0.8947368421052632, 1),
cyclic_times=1,
step_ratio_up=0.4)
seed = 42
gpu_ids = range(0, 1)
output_config = dict(out='./childact-checkpoints/childact-TPN/results.json')
error:
2021-03-29 22:48:30,211 - mmaction - INFO - load model from: torchvision://resnet50
2021-03-29 22:48:30,398 - mmaction - INFO - These parameters in the 2d checkpoint are not loaded: {'fc.weight', 'fc.bias'}
Use load_from_torchvision loader
2021-03-29 22:48:33,227 - mmaction - INFO - load checkpoint from checkpoints/tpn_imagenet_pretrained_slowonly_r50_8x8x1_150e_kinetics_rgb_20200923-52629684.pth
2021-03-29 22:48:33,228 - mmaction - INFO - Use load_from_local loader
2021-03-29 22:48:33,475 - mmaction - WARNING - The model and loaded state dict do not match exactly
size mismatch for cls_head.fc_cls.weight: copying a param with shape torch.Size([400, 2048]) from checkpoint, the shape in current model is torch.Size([7, 2048]).
size mismatch for cls_head.fc_cls.bias: copying a param with shape torch.Size([400]) from checkpoint, the shape in current model is torch.Size([7]).
2021-03-29 22:48:33,486 - mmaction - INFO - Start running, host: actrec@actrec-HP-Z4-G4-Workstation, work_dir: /home/actrec/.virtualenvs/mmaction/mmaction2/childact-checkpoints/childact-TPN
2021-03-29 22:48:33,486 - mmaction - INFO - workflow: [('train', 1)], max: 51 epochs
/home/actrec/.virtualenvs/mmaction/mmaction2/mmaction/core/evaluation/eval_hooks.py:131: UserWarning: runner.meta is None. Creating a empty one.
warnings.warn('runner.meta is None. Creating a empty one.')
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-4-be92e24cff32> in <module>
18 # Create work_dir
19 mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
---> 20 train_model(model, datasets, cfg, distributed=False, validate=True)
~/.virtualenvs/mmaction/mmaction2/mmaction/apis/train.py in train_model(model, dataset, cfg, distributed, validate, timestamp, meta)
154 if cfg.omnisource:
155 runner_kwargs = dict(train_ratio=train_ratio)
--> 156 runner.run(data_loaders, cfg.workflow, cfg.total_epochs, **runner_kwargs)
~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/runner/epoch_based_runner.py in run(self, data_loaders, workflow, max_epochs, **kwargs)
123 if mode == 'train' and self.epoch >= self._max_epochs:
124 break
--> 125 epoch_runner(data_loaders[i], **kwargs)
126
127 time.sleep(1) # wait for some hooks like loggers to finish
~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/runner/epoch_based_runner.py in train(self, data_loader, **kwargs)
48 self._inner_iter = i
49 self.call_hook('before_train_iter')
---> 50 self.run_iter(data_batch, train_mode=True)
51 self.call_hook('after_train_iter')
52 self._iter += 1
~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/runner/epoch_based_runner.py in run_iter(self, data_batch, train_mode, **kwargs)
28 elif train_mode:
29 outputs = self.model.train_step(data_batch, self.optimizer,
---> 30 **kwargs)
31 else:
32 outputs = self.model.val_step(data_batch, self.optimizer, **kwargs)
~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/parallel/data_parallel.py in train_step(self, *inputs, **kwargs)
65
66 inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
---> 67 return self.module.train_step(*inputs[0], **kwargs[0])
68
69 def val_step(self, *inputs, **kwargs):
~/.virtualenvs/mmaction/mmaction2/mmaction/models/recognizers/base.py in train_step(self, data_batch, optimizer, **kwargs)
220 aux_info[item] = data_batch[item]
221
--> 222 losses = self(imgs, label, return_loss=True, **aux_info)
223
224 loss, log_vars = self._parse_losses(losses)
~/.virtualenvs/mmaction/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
~/.virtualenvs/mmaction/mmaction2/mmaction/models/recognizers/base.py in forward(self, imgs, label, return_loss, **kwargs)
182 if label is None:
183 raise ValueError('Label should not be None.')
--> 184 return self.forward_train(imgs, label, **kwargs)
185
186 return self.forward_test(imgs, **kwargs)
~/.virtualenvs/mmaction/mmaction2/mmaction/models/recognizers/recognizer3d.py in forward_train(self, imgs, labels, **kwargs)
14 losses = dict()
15
---> 16 x = self.extract_feat(imgs)
17 if hasattr(self, 'neck'):
18 x, loss_aux = self.neck(x, labels.squeeze())
~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/runner/fp16_utils.py in new_func(*args, **kwargs)
82 'method of nn.Module')
83 if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled):
---> 84 return old_func(*args, **kwargs)
85 # get the arg spec of the decorated method
86 args_info = getfullargspec(old_func)
~/.virtualenvs/mmaction/mmaction2/mmaction/models/recognizers/base.py in extract_feat(self, imgs)
85 torch.tensor: The extracted features.
86 """
---> 87 x = self.backbone(imgs)
88 return x
89
~/.virtualenvs/mmaction/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
~/.virtualenvs/mmaction/mmaction2/mmaction/models/backbones/resnet3d.py in forward(self, x)
816 samples extracted by the backbone.
817 """
--> 818 x = self.conv1(x)
819 x = self.maxpool(x)
820 outs = []
~/.virtualenvs/mmaction/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/cnn/bricks/conv_module.py in forward(self, x, activate, norm)
191 if self.with_explicit_padding:
192 x = self.padding_layer(x)
--> 193 x = self.conv(x)
194 elif layer == 'norm' and norm and self.with_norm:
195 x = self.norm(x)
~/.virtualenvs/mmaction/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
~/.virtualenvs/mmaction/lib/python3.6/site-packages/mmcv/cnn/bricks/wrappers.py in forward(self, x)
77 return empty
78
---> 79 return super().forward(x)
80
81
~/.virtualenvs/mmaction/lib/python3.6/site-packages/torch/nn/modules/conv.py in forward(self, input)
571 self.dilation, self.groups)
572 return F.conv3d(input, self.weight, self.bias, self.stride,
--> 573 self.padding, self.dilation, self.groups)
574
575
RuntimeError: Expected 5-dimensional input for 5-dimensional weight [64, 3, 1, 7, 7], but got 4-dimensional input of size [40, 64, 224, 224] instead
The 'NCHW_Flow' input format does not correspond to ResNet3dCSN: it yields a 4-dimensional input, while the 3D backbone expects a 5-dimensional one (see the RuntimeError above).
So is there no way to train Recognizer3D models on flow images?
Couldn't believe that I overlooked that file! Thank you, will study it and close the issue after I solve the problem.
The error was solved by changing the data pipeline to match the defaults in the config file of the network I wanted to use.
I am training on a custom dataset, from which I extracted flow frames. I trained on those frames with TANet (which uses Recognizer2D), and it worked well. Now I am using the config from
./configs/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py
and the full config is as follows. This results in the following when I call
train_model():
I searched for the corresponding RuntimeError, but have not found a solution that would help me. I understand that the issue is related to the model's backbone, but I could not find what I have to add to the config to make it work with the flow files.