open-mmlab / mmaction2

OpenMMLab's Next Generation Video Understanding Toolbox and Benchmark
https://mmaction2.readthedocs.io
Apache License 2.0
4.31k stars 1.25k forks source link

How to replace the backbone and head? I replaced the cls_head and then got an error #2744

Open luciferasura opened 1 year ago

luciferasura commented 1 year ago

Branch

main branch (1.x version, such as v1.0.0, or dev-1.x branch)

Prerequisite

Environment

System environment: sys.platform: linux Python: 3.8.17 (default, Jul 5 2023, 21:04:15) [GCC 11.2.0] CUDA available: True numpy_random_seed: 812247719 GPU 0: NVIDIA GeForce RTX 3090 CUDA_HOME: /usr/local/cuda NVCC: Cuda compilation tools, release 11.0, V11.0.221 GCC: gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0 PyTorch: 1.7.0 PyTorch compiling details: PyTorch built with:

Runtime environment: cudnn_benchmark: False mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0} dist_cfg: {'backend': 'nccl'} seed: 812247719 diff_rank_seed: False deterministic: False Distributed launcher: none Distributed training: False GPU number: 1

Describe the bug

I replaced the cls_head in the swin_transformer.py config file, swapping the original I3DHead for TimeSformerHead, and then got this error: RuntimeError: mat1 dim 1 must match mat2 dim 0

Reproduces the problem - code sample

1.code ann_file_test = '/mmaction2/classvideo/val1.txt' ann_file_train = '/mmaction2/classvideo/train1.txt' ann_file_val = '/mmaction2/classvideo/val1.txt' auto_scale_lr = dict(base_batch_size=8, enable=False) data_root = '/mmaction2/classvideo/train' data_root_val = '/mmaction2/classvideo/val' dataset_type = 'VideoDataset' default_hooks = dict( checkpoint=dict( interval=3, max_keep_ckpts=3, save_best='auto', type='CheckpointHook'), logger=dict(ignore_last=False, interval=20, type='LoggerHook'), param_scheduler=dict(type='ParamSchedulerHook'), runtime_info=dict(type='RuntimeInfoHook'), sampler_seed=dict(type='DistSamplerSeedHook'), sync_buffers=dict(type='SyncBuffersHook'), timer=dict(type='IterTimerHook')) default_scope = 'mmaction' env_cfg = dict( cudnn_benchmark=False, dist_cfg=dict(backend='nccl'), mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) file_client_args = dict(io_backend='disk') launcher = 'none' load_from = None log_level = 'INFO' log_processor = dict(by_epoch=True, type='LogProcessor', window_size=20) model = dict( backbone=dict( arch='base', attn_drop_rate=0.0, drop_path_rate=0.3, drop_rate=0.0, mlp_ratio=4.0, patch_norm=True, patch_size=( 2, 4, 4, ), pretrained= 'https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin_base_patch4_window7_224.pth', pretrained2d=True, qk_scale=None, qkv_bias=True, type='SwinTransformer3D', window_size=( 8, 7, 7, )), cls_head=dict( average_clips='prob', in_channels=1024, num_classes=7, type='TimeSformerHead'), data_preprocessor=dict( format_shape='NCTHW', mean=[ 123.675, 116.28, 103.53, ], std=[ 58.395, 57.12, 57.375, ], type='ActionDataPreprocessor'), type='Recognizer3D') optim_wrapper = dict( constructor='SwinOptimWrapperConstructor', optimizer=dict( betas=( 0.9, 0.999, ), lr=0.0001, type='AdamW', weight_decay=0.05), paramwise_cfg=dict( absolute_pos_embed=dict(decay_mult=0.0), backbone=dict(lr_mult=0.1), norm=dict(decay_mult=0.0), relative_position_bias_table=dict(decay_mult=0.0)), 
type='AmpOptimWrapper') param_scheduler = [ dict( begin=0, by_epoch=True, convert_to_iter_based=True, end=2.5, start_factor=0.1, type='LinearLR'), dict( T_max=30, begin=0, by_epoch=True, end=30, eta_min=0, type='CosineAnnealingLR'), ] randomness = dict(deterministic=False, diff_rank_seed=False, seed=None) resume = False test_cfg = dict(type='TestLoop') test_dataloader = dict( batch_size=1, dataset=dict( ann_file='/mmaction2/classvideo/val1.txt', data_prefix=dict(video='/mmaction2/classvideo/val'), pipeline=[ dict(io_backend='disk', type='DecordInit'), dict( clip_len=32, frame_interval=2, num_clips=4, test_mode=True, type='SampleFrames'), dict(type='DecordDecode'), dict(scale=( -1, 224, ), type='Resize'), dict(crop_size=224, type='ThreeCrop'), dict(input_format='NCTHW', type='FormatShape'), dict(type='PackActionInputs'), ], test_mode=True, type='VideoDataset'), num_workers=8, persistent_workers=True, sampler=dict(shuffle=False, type='DefaultSampler')) test_evaluator = dict(type='AccMetric') test_pipeline = [ dict(io_backend='disk', type='DecordInit'), dict( clip_len=32, frame_interval=2, num_clips=4, test_mode=True, type='SampleFrames'), dict(type='DecordDecode'), dict(scale=( -1, 224, ), type='Resize'), dict(crop_size=224, type='ThreeCrop'), dict(input_format='NCTHW', type='FormatShape'), dict(type='PackActionInputs'), ] train_cfg = dict( max_epochs=30, type='EpochBasedTrainLoop', val_begin=1, val_interval=1) train_dataloader = dict( batch_size=2, dataset=dict( ann_file='/mmaction2/classvideo/train1.txt', data_prefix=dict(video='/mmaction2/classvideo/train'), pipeline=[ dict(io_backend='disk', type='DecordInit'), dict( clip_len=32, frame_interval=2, num_clips=1, type='SampleFrames'), dict(type='DecordDecode'), dict(scale=( -1, 256, ), type='Resize'), dict(type='RandomResizedCrop'), dict(keep_ratio=False, scale=( 224, 224, ), type='Resize'), dict(flip_ratio=0.5, type='Flip'), dict(input_format='NCTHW', type='FormatShape'), dict(type='PackActionInputs'), ], 
type='VideoDataset'), num_workers=8, persistent_workers=True, sampler=dict(shuffle=True, type='DefaultSampler')) train_pipeline = [ dict(io_backend='disk', type='DecordInit'), dict(clip_len=32, frame_interval=2, num_clips=1, type='SampleFrames'), dict(type='DecordDecode'), dict(scale=( -1, 256, ), type='Resize'), dict(type='RandomResizedCrop'), dict(keep_ratio=False, scale=( 224, 224, ), type='Resize'), dict(flip_ratio=0.5, type='Flip'), dict(input_format='NCTHW', type='FormatShape'), dict(type='PackActionInputs'), ] val_cfg = dict(type='ValLoop') val_dataloader = dict( batch_size=2, dataset=dict( ann_file='/mmaction2/classvideo/val1.txt', data_prefix=dict(video='/mmaction2/classvideo/val'), pipeline=[ dict(io_backend='disk', type='DecordInit'), dict( clip_len=32, frame_interval=2, num_clips=1, test_mode=True, type='SampleFrames'), dict(type='DecordDecode'), dict(scale=( -1, 256, ), type='Resize'), dict(crop_size=224, type='CenterCrop'), dict(input_format='NCTHW', type='FormatShape'), dict(type='PackActionInputs'), ], test_mode=True, type='VideoDataset'), num_workers=8, persistent_workers=True, sampler=dict(shuffle=False, type='DefaultSampler')) val_evaluator = dict(type='AccMetric') val_pipeline = [ dict(io_backend='disk', type='DecordInit'), dict( clip_len=32, frame_interval=2, num_clips=1, test_mode=True, type='SampleFrames'), dict(type='DecordDecode'), dict(scale=( -1, 256, ), type='Resize'), dict(crop_size=224, type='CenterCrop'), dict(input_format='NCTHW', type='FormatShape'), dict(type='PackActionInputs'), ] vis_backends = [ dict(type='LocalVisBackend'), ] visualizer = dict( type='ActionVisualizer', vis_backends=[ dict(type='LocalVisBackend'), ]) work_dir = './work_dirs/swin-base-test'

2. What I changed: I replaced the cls_head in the swin_transformer.py config file, swapping the original I3DHead for TimeSformerHead, and then got an error. The original head and the replaced head are shown below. Original head: cls_head=dict( type='I3DHead', in_channels=768, num_classes=400, spatial_type='avg', dropout_ratio=0.5, average_clips='prob') Replaced head: cls_head=dict( type='TimeSformerHead', num_classes=7, in_channels=1024, average_clips='prob')

Reproduces the problem - command or script

python tools/train.py checkpiont/swin-base-test.py

Reproduces the problem - error message

Traceback (most recent call last): File "tools/train.py", line 135, in main() File "tools/train.py", line 131, in main runner.train() File "/usr/local/miniconda3/envs/mmlab/lib/python3.8/site-packages/mmengine/runner/runner.py", line 1745, in train model = self.train_loop.run() # type: ignore File "/usr/local/miniconda3/envs/mmlab/lib/python3.8/site-packages/mmengine/runner/loops.py", line 96, in run self.run_epoch() File "/usr/local/miniconda3/envs/mmlab/lib/python3.8/site-packages/mmengine/runner/loops.py", line 112, in run_epoch self.run_iter(idx, data_batch) File "/usr/local/miniconda3/envs/mmlab/lib/python3.8/site-packages/mmengine/runner/loops.py", line 128, in run_iter outputs = self.runner.model.train_step( File "/usr/local/miniconda3/envs/mmlab/lib/python3.8/site-packages/mmengine/model/base_model/base_model.py", line 114, in train_step losses = self._run_forward(data, mode='loss') # type: ignore File "/usr/local/miniconda3/envs/mmlab/lib/python3.8/site-packages/mmengine/model/base_model/base_model.py", line 340, in _run_forward results = self(data, mode=mode) File "/usr/local/miniconda3/envs/mmlab/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl result = self.forward(input, kwargs) File "/mmaction2/mmaction/models/recognizers/base.py", line 262, in forward return self.loss(inputs, data_samples, kwargs) File "/mmaction2/mmaction/models/recognizers/base.py", line 176, in loss loss_cls = self.cls_head.loss(feats, data_samples, loss_kwargs) File "/mmaction2/mmaction/models/heads/base.py", line 99, in loss cls_scores = self(feats, kwargs) File "/usr/local/miniconda3/envs/mmlab/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl result = self.forward(input, kwargs) File "/mmaction2/mmaction/models/heads/timesformer_head.py", line 60, in forward cls_score = self.fc_cls(x) File "/usr/local/miniconda3/envs/mmlab/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl result = 
self.forward(*input, **kwargs) File "/usr/local/miniconda3/envs/mmlab/lib/python3.8/site-packages/torch/nn/modules/linear.py", line 93, in forward return F.linear(input, self.weight, self.bias) File "/usr/local/miniconda3/envs/mmlab/lib/python3.8/site-packages/torch/nn/functional.py", line 1692, in linear output = input.matmul(weight.t()) RuntimeError: mat1 dim 1 must match mat2 dim 0

Additional information

I use VideoDataset with the swin.py config for action recognition, and I want to build a working model out of different components by replacing the backbone, head, etc. So I would like to know how to replace components such as the backbone and head in the model, and what needs to be paid attention to when doing so.