Closed Egrt closed 6 months ago
Hi @Egrt , RTMO 完全基于 MMPose 进行构建和训练,因此按照 MMPose 文档进行训练即可。我推荐的阅读顺序是:
感谢您的指导。我自定义了训练设置,但是在验证过程中遇到了一些问题,请问应该如何解决?以下是我的训练设置:
_base_ = ['mmpose::_base_/default_runtime.py']

# runtime
# Total number of training epochs, base learning rate and validation interval.
max_epochs = 100
base_lr = 4e-4
val_interval = 10

train_cfg = dict(max_epochs=max_epochs, val_interval=val_interval)

auto_scale_lr = dict(base_batch_size=256)

default_hooks = dict(
    checkpoint=dict(type='CheckpointHook', interval=40, max_keep_ckpts=3))

# AdamW optimizer. The decay multipliers for norm and bias parameters are 0,
# and parameters matching `neck.encoder` use a 0.05x learning-rate multiplier.
optim_wrapper = dict(
    type='OptimWrapper',
    constructor='ForceDefaultOptimWrapperConstructor',
    optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
    paramwise_cfg=dict(
        norm_decay_mult=0,
        bias_decay_mult=0,
        bypass_duplicate=True,
        force_default_settings=True,
        custom_keys={'neck.encoder': dict(lr_mult=0.05)}),
    clip_grad=dict(max_norm=0.1, norm_type=2))

# Learning-rate schedule (windows are given in epochs):
#   epochs 0 .. 5                           quadratic warm-up
#   epochs max_epochs // 2 .. max_epochs    cosine decay to 5% of base_lr
# Every window ends at or before `max_epochs`. The former entries that
# started at epochs 280, 281 and 580 could never take effect in a
# 100-epoch run, so they are not listed.
param_scheduler = [
    dict(
        type='QuadraticWarmupLR',
        by_epoch=True,
        begin=0,
        end=5,
        convert_to_iter_based=True),
    dict(
        type='CosineAnnealingLR',
        eta_min=base_lr * 0.05,
        begin=max_epochs // 2,
        end=max_epochs,
        T_max=max_epochs // 2,
        by_epoch=True,
        convert_to_iter_based=True),
]
# data
input_size = (416, 416)
metafile = 'configs/_base_/datasets/uav.py'
codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)

# Stage-1 training pipeline: mosaic followed by a random affine transform.
train_pipeline_stage1 = [
    dict(type='LoadImage', backend_args=None),
    dict(
        type='Mosaic',
        img_scale=input_size,
        pad_val=114.0,
        pre_transform=[dict(type='LoadImage', backend_args=None)]),
    dict(
        type='BottomupRandomAffine',
        input_size=input_size,
        shift_factor=0.1,
        rotate_factor=10,
        scale_factor=(0.75, 1.0),
        pad_val=114,
        distribution='uniform',
        transform_mode='perspective',
        bbox_keep_corner=False,
        clip_border=True,
    ),
    dict(type='YOLOXHSVRandomAug'),
    dict(type='RandomFlip'),
    dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
    dict(type='GenerateTarget', encoder=codec),
    dict(type='PackPoseInputs'),
]

# Stage-2 training pipeline: no mosaic; the shift, rotate and scale
# probabilities of the affine transform are all 0.
train_pipeline_stage2 = [
    dict(type='LoadImage'),
    dict(
        type='BottomupRandomAffine',
        input_size=input_size,
        shift_prob=0,
        rotate_prob=0,
        scale_prob=0,
        scale_type='long',
        pad_val=(114, 114, 114),
        bbox_keep_corner=False,
        clip_border=True,
    ),
    dict(type='YOLOXHSVRandomAug'),
    dict(type='RandomFlip'),
    dict(type='BottomupGetHeatmapMask', get_invalid=True),
    dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
    dict(type='GenerateTarget', encoder=codec),
    dict(type='PackPoseInputs'),
]

# data settings
data_mode = 'bottomup'
data_root = 'data/UAV-baban/'

# Identity mapping over the three keypoint indices.
uav_coco = [
    (0, 0),
    (1, 1),
    (2, 2),
]

# Dataset passed to `YOLOXPoseModeSwitchHook` as `new_train_dataset`.
# `metainfo` is set explicitly so this dataset uses the same 3-keypoint
# definition as the stage-1 training dataset.
dataset_uav = dict(
    type='CocoDataset',
    data_root=data_root,
    metainfo=dict(from_file=metafile),
    data_mode=data_mode,
    ann_file='train_coco.json',
    data_prefix=dict(img='images/'),
    pipeline=[
        dict(type='KeypointConverter', num_keypoints=3, mapping=uav_coco)
    ],
)

train_dataloader = dict(
    batch_size=8,
    num_workers=1,
    persistent_workers=True,
    pin_memory=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type='CocoDataset',
        data_root=data_root,
        metainfo=dict(from_file=metafile),
        data_mode=data_mode,
        ann_file='train_coco.json',
        data_prefix=dict(img='images/'),
        pipeline=train_pipeline_stage1,
    ))

# val datasets
val_pipeline = [
    dict(type='LoadImage'),
    dict(
        type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
    dict(
        type='PackPoseInputs',
        meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
                   'input_size', 'input_center', 'input_scale'))
]

# `metainfo` must be given here as well. Without it the evaluator assumed
# 17 keypoints and validation failed with
# "cannot reshape array of size 126 into shape (51)".
val_dataloader = dict(
    batch_size=1,
    num_workers=2,
    persistent_workers=True,
    pin_memory=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
    dataset=dict(
        type='CocoDataset',
        data_root=data_root,
        metainfo=dict(from_file=metafile),
        data_mode=data_mode,
        ann_file='val_coco.json',
        data_prefix=dict(img='images/'),
        test_mode=True,
        pipeline=val_pipeline,
    ))
test_dataloader = val_dataloader

# evaluators
val_evaluator = dict(
    type='CocoMetric',
    ann_file=data_root + 'val_coco.json',
    score_mode='bbox',
    nms_mode='none',
)
test_evaluator = val_evaluator

# hooks
custom_hooks = [
    # Swaps in `dataset_uav` and the stage-2 pipeline for the last 20 epochs.
    dict(
        type='YOLOXPoseModeSwitchHook',
        num_last_epochs=20,
        new_train_dataset=dataset_uav,
        new_train_pipeline=train_pipeline_stage2,
        priority=48),
    # NOTE(review): the trigger epoch (280) is larger than `max_epochs`
    # (100) set in the runtime section, so these attribute changes never
    # apply - confirm whether the epoch should be rescaled or the hook dropped.
    dict(
        type='RTMOModeSwitchHook',
        epoch_attributes={
            280: {
                'proxy_target_cc': True,
                'loss_mle.loss_weight': 5.0,
                'loss_oks.loss_weight': 10.0
            },
        },
        priority=48),
    dict(type='SyncNormHook', priority=48),
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0002,
        update_buffers=True,
        strict_load=False,
        priority=49),
]
# model
# Width/depth multipliers shared by the backbone, neck and head.
widen_factor = 0.375
deepen_factor = 0.33

model = dict(
    type='BottomupPoseEstimator',
    # Kaiming-uniform initialisation for Conv2d layers; `a` equals sqrt(5).
    init_cfg=dict(
        type='Kaiming',
        layer='Conv2d',
        a=2.23606797749979,
        distribution='uniform',
        mode='fan_in',
        nonlinearity='leaky_relu'),
    # mean 0 / std 1: pixel values are passed through unnormalised.
    data_preprocessor=dict(
        type='PoseDataPreprocessor',
        pad_size_divisor=32,
        mean=[0, 0, 0],
        std=[1, 1, 1],
        batch_augments=[
            dict(
                type='BatchSyncRandomResize',
                random_size_range=(320, 640),
                size_divisor=32,
                interval=1),
        ]),
    # Backbone initialised from the `backbone.` weights of a YOLOX-tiny
    # checkpoint.
    backbone=dict(
        type='CSPDarknet',
        deepen_factor=deepen_factor,
        widen_factor=widen_factor,
        out_indices=(2, 3, 4),
        spp_kernal_sizes=(5, 9, 13),
        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
        act_cfg=dict(type='Swish'),
        init_cfg=dict(
            type='Pretrained',
            checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
            'yolox/yolox_tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_'
            '20211124_171234-b4047906.pth',
            prefix='backbone.',
        )),
    neck=dict(
        type='HybridEncoder',
        in_channels=[96, 192, 384],
        deepen_factor=deepen_factor,
        widen_factor=widen_factor,
        hidden_dim=256,
        output_indices=[1, 2],
        encoder_cfg=dict(
            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
            ffn_cfg=dict(
                embed_dims=256,
                feedforward_channels=1024,
                ffn_drop=0.0,
                act_cfg=dict(type='GELU'))),
        projector=dict(
            type='ChannelMapper',
            in_channels=[256, 256],
            kernel_size=1,
            out_channels=192,
            act_cfg=None,
            norm_cfg=dict(type='BN'),
            num_outs=2)),
    head=dict(
        type='RTMOHead',
        # NOTE(review): presumably has to equal the keypoint count defined
        # in `metafile` - confirm.
        num_keypoints=3,
        featmap_strides=(16, 32),
        head_module_cfg=dict(
            num_classes=1,
            in_channels=256,
            cls_feat_channels=256,
            channels_per_group=36,
            pose_vec_channels=192,
            widen_factor=widen_factor,
            stacked_convs=2,
            norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg=dict(type='Swish')),
        assigner=dict(
            type='SimOTAAssigner',
            dynamic_k_indicator='oks',
            oks_calculator=dict(type='PoseOKS', metainfo=metafile),
            use_keypoints_for_center=True),
        prior_generator=dict(
            type='MlvlPointGenerator',
            centralize_points=True,
            strides=[16, 32]),
        dcc_cfg=dict(
            in_channels=192,
            feat_channels=128,
            num_bins=(192, 256),
            spe_channels=128,
            gau_cfg=dict(
                s=128,
                expansion_factor=2,
                dropout_rate=0.0,
                drop_path=0.0,
                act_fn='SiLU',
                pos_enc='add')),
        overlaps_power=0.5,
        # Loss terms and their weights.
        loss_cls=dict(
            type='VariFocalLoss',
            reduction='sum',
            use_target_weight=True,
            loss_weight=1.0),
        loss_bbox=dict(
            type='IoULoss',
            mode='square',
            eps=1e-16,
            reduction='sum',
            loss_weight=5.0),
        loss_oks=dict(
            type='OKSLoss',
            reduction='none',
            metainfo=metafile,
            loss_weight=30.0),
        loss_vis=dict(
            type='BCELoss',
            use_target_weight=True,
            reduction='mean',
            loss_weight=1.0),
        loss_mle=dict(
            type='MLECCLoss',
            use_target_weight=True,
            loss_weight=1.0,
        ),
        loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
    ),
    # Inference settings: same input size as the validation pipeline.
    test_cfg=dict(
        input_size=input_size,
        score_thr=0.1,
        nms_thr=0.65,
    ))
以下是报错内容:
03/01 13:35:20 - mmengine - INFO - Epoch(train) [10][150/228] base_lr: 4.000000e-04 lr: 4.000000e-04 eta: 1:41:44 time: 0.277194 data_time: 0.001964 memory: 3107 grad_norm: 96.415793 loss: 16.194411 loss_bbox: 2.556790 loss_vis: 0.322454 loss_mle: -2.623663 loss_oks: 15.137849 loss_cls: 0.800981 num_samples: 36.000000 overlaps: 0.582449
03/01 13:35:34 - mmengine - INFO - Epoch(train) [10][200/228] base_lr: 4.000000e-04 lr: 4.000000e-04 eta: 1:41:23 time: 0.281353 data_time: 0.001872 memory: 3116 grad_norm: 95.948228 loss: 16.678265 loss_bbox: 2.717645 loss_vis: 0.348905 loss_mle: -2.611413 loss_oks: 15.399887 loss_cls: 0.823240 num_samples: 59.000000 overlaps: 0.667207
03/01 13:35:42 - mmengine - INFO - Exp name: rtmo-t-uav_20240301_132415
03/01 13:35:57 - mmengine - INFO - Epoch(val) [10][ 50/202] eta: 0:00:45 time: 0.297385 data_time: 0.247945 memory: 3116
03/01 13:36:00 - mmengine - INFO - Epoch(val) [10][100/202] eta: 0:00:17 time: 0.047142 data_time: 0.000320 memory: 152
03/01 13:36:02 - mmengine - INFO - Epoch(val) [10][150/202] eta: 0:00:06 time: 0.047503 data_time: 0.000120 memory: 152
03/01 13:36:04 - mmengine - INFO - Epoch(val) [10][200/202] eta: 0:00:00 time: 0.047237 data_time: 0.000260 memory: 152
17
17
Traceback (most recent call last):
File "tools/train.py", line 162, in <module>
main()
File "tools/train.py", line 158, in main
runner.train()
File "D:\anaconda3\lib\site-packages\mmengine\runner\runner.py", line 1745, in train
model = self.train_loop.run() # type: ignore
File "D:\anaconda3\lib\site-packages\mmengine\runner\loops.py", line 102, in run
self.runner.val_loop.run()
File "D:\anaconda3\lib\site-packages\mmengine\runner\loops.py", line 366, in run
metrics = self.evaluator.evaluate(len(self.dataloader.dataset))
File "D:\anaconda3\lib\site-packages\mmengine\evaluator\evaluator.py", line 79, in evaluate
_results = metric.evaluate(size)
File "D:\anaconda3\lib\site-packages\mmengine\evaluator\metric.py", line 133, in evaluate
_metrics = self.compute_metrics(results) # type: ignore
File "d:\notebook\mmpose\mmpose\mmpose\evaluation\metrics\coco_metric.py", line 488, in compute_metrics
self.results2json(valid_kpts, outfile_prefix=outfile_prefix)
File "d:\notebook\mmpose\mmpose\mmpose\evaluation\metrics\coco_metric.py", line 530, in results2json
_keypoints = _keypoints.reshape(-1, num_keypoints * 3)
ValueError: cannot reshape array of size 126 into shape (51)
上述报错已经解决了,但是训练过程中val的结果始终为0,我的配置文件为:
_base_ = ['mmpose::_base_/default_runtime.py']

# runtime
# Total number of training epochs, base learning rate and validation interval.
max_epochs = 100
base_lr = 4e-4
val_interval = 10

train_cfg = dict(max_epochs=max_epochs, val_interval=val_interval)

auto_scale_lr = dict(base_batch_size=256)

# The checkpoint interval reuses the validation interval.
default_hooks = dict(
    checkpoint=dict(
        type='CheckpointHook', interval=val_interval, max_keep_ckpts=3))

# AdamW optimizer. The decay multipliers for norm and bias parameters are 0,
# and parameters matching `neck.encoder` use a 0.05x learning-rate multiplier.
optim_wrapper = dict(
    type='OptimWrapper',
    constructor='ForceDefaultOptimWrapperConstructor',
    optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
    paramwise_cfg=dict(
        norm_decay_mult=0,
        bias_decay_mult=0,
        bypass_duplicate=True,
        force_default_settings=True,
        custom_keys={'neck.encoder': dict(lr_mult=0.05)}),
    clip_grad=dict(max_norm=0.1, norm_type=2))

# Learning-rate schedule (windows are given in epochs):
#   epochs 0 .. 5                           quadratic warm-up
#   epochs max_epochs // 2 .. max_epochs    cosine decay to 5% of base_lr
# Every window ends at or before `max_epochs`. The former entries that
# started at epochs 280, 281 and 580 could never take effect in a
# 100-epoch run, so they are not listed.
param_scheduler = [
    dict(
        type='QuadraticWarmupLR',
        by_epoch=True,
        begin=0,
        end=5,
        convert_to_iter_based=True),
    dict(
        type='CosineAnnealingLR',
        eta_min=base_lr * 0.05,
        begin=max_epochs // 2,
        end=max_epochs,
        T_max=max_epochs // 2,
        by_epoch=True,
        convert_to_iter_based=True),
]
# data
input_size = (416, 416)
metafile = 'configs/_base_/datasets/uav.py'
codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)

# Stage-1 training pipeline: mosaic followed by a random affine transform.
train_pipeline_stage1 = [
    dict(type='LoadImage', backend_args=None),
    dict(
        type='Mosaic',
        img_scale=input_size,
        pad_val=114.0,
        pre_transform=[dict(type='LoadImage', backend_args=None)]),
    dict(
        type='BottomupRandomAffine',
        input_size=input_size,
        shift_factor=0.1,
        rotate_factor=10,
        scale_factor=(0.75, 1.0),
        pad_val=114,
        distribution='uniform',
        transform_mode='perspective',
        bbox_keep_corner=False,
        clip_border=True,
    ),
    dict(type='YOLOXHSVRandomAug'),
    dict(type='RandomFlip'),
    dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
    dict(type='GenerateTarget', encoder=codec),
    dict(type='PackPoseInputs'),
]

# Stage-2 training pipeline: no mosaic; the shift, rotate and scale
# probabilities of the affine transform are all 0.
train_pipeline_stage2 = [
    dict(type='LoadImage'),
    dict(
        type='BottomupRandomAffine',
        input_size=input_size,
        shift_prob=0,
        rotate_prob=0,
        scale_prob=0,
        scale_type='long',
        pad_val=(114, 114, 114),
        bbox_keep_corner=False,
        clip_border=True,
    ),
    dict(type='YOLOXHSVRandomAug'),
    dict(type='RandomFlip'),
    dict(type='BottomupGetHeatmapMask', get_invalid=True),
    dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
    dict(type='GenerateTarget', encoder=codec),
    dict(type='PackPoseInputs'),
]

# data settings
data_mode = 'bottomup'
data_root = 'data/UAV-baban/'

# Identity mapping over the three keypoint indices.
uav_coco = [
    (0, 0),
    (1, 1),
    (2, 2),
]

# Dataset passed to `YOLOXPoseModeSwitchHook` as `new_train_dataset`.
# `metainfo` is set explicitly so this dataset uses the same 3-keypoint
# definition as the training and validation datasets below.
dataset_uav = dict(
    type='CocoDataset',
    data_root=data_root,
    metainfo=dict(from_file=metafile),
    data_mode=data_mode,
    ann_file='train_coco.json',
    data_prefix=dict(img='images/'),
    pipeline=[
        dict(type='KeypointConverter', num_keypoints=3, mapping=uav_coco)
    ],
)

train_dataloader = dict(
    batch_size=8,
    num_workers=1,
    persistent_workers=True,
    pin_memory=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type='CocoDataset',
        data_root=data_root,
        metainfo=dict(from_file=metafile),
        data_mode=data_mode,
        ann_file='train_coco.json',
        data_prefix=dict(img='images/'),
        pipeline=train_pipeline_stage1,
    ))

# val datasets
val_pipeline = [
    dict(type='LoadImage'),
    dict(
        type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
    dict(
        type='PackPoseInputs',
        meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
                   'input_size', 'input_center', 'input_scale'))
]

val_dataloader = dict(
    batch_size=1,
    num_workers=2,
    persistent_workers=True,
    pin_memory=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
    dataset=dict(
        type='CocoDataset',
        data_root=data_root,
        metainfo=dict(from_file=metafile),
        data_mode=data_mode,
        ann_file='val_coco.json',
        data_prefix=dict(img='images/'),
        test_mode=True,
        pipeline=val_pipeline,
    ))
test_dataloader = val_dataloader

# evaluators
val_evaluator = dict(
    type='CocoMetric',
    ann_file=data_root + 'val_coco.json',
    score_mode='bbox',
    nms_mode='none',
)
test_evaluator = val_evaluator

# hooks
custom_hooks = [
    # Swaps in `dataset_uav` and the stage-2 pipeline for the last 20 epochs.
    dict(
        type='YOLOXPoseModeSwitchHook',
        num_last_epochs=20,
        new_train_dataset=dataset_uav,
        new_train_pipeline=train_pipeline_stage2,
        priority=48),
    # NOTE(review): the trigger epoch (280) is larger than `max_epochs`
    # (100) set in the runtime section, so these attribute changes never
    # apply - confirm whether the epoch should be rescaled or the hook dropped.
    dict(
        type='RTMOModeSwitchHook',
        epoch_attributes={
            280: {
                'proxy_target_cc': True,
                'loss_mle.loss_weight': 5.0,
                'loss_oks.loss_weight': 10.0
            },
        },
        priority=48),
    dict(type='SyncNormHook', priority=48),
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0002,
        update_buffers=True,
        strict_load=False,
        priority=49),
]
# model
# Width/depth multipliers shared by the backbone, neck and head.
widen_factor = 0.375
deepen_factor = 0.33

model = dict(
    type='BottomupPoseEstimator',
    # Kaiming-uniform initialisation for Conv2d layers; `a` equals sqrt(5).
    init_cfg=dict(
        type='Kaiming',
        layer='Conv2d',
        a=2.23606797749979,
        distribution='uniform',
        mode='fan_in',
        nonlinearity='leaky_relu'),
    # mean 0 / std 1: pixel values are passed through unnormalised.
    data_preprocessor=dict(
        type='PoseDataPreprocessor',
        pad_size_divisor=32,
        mean=[0, 0, 0],
        std=[1, 1, 1],
        batch_augments=[
            dict(
                type='BatchSyncRandomResize',
                random_size_range=(320, 640),
                size_divisor=32,
                interval=1),
        ]),
    # Backbone initialised from the `backbone.` weights of a YOLOX-tiny
    # checkpoint.
    backbone=dict(
        type='CSPDarknet',
        deepen_factor=deepen_factor,
        widen_factor=widen_factor,
        out_indices=(2, 3, 4),
        spp_kernal_sizes=(5, 9, 13),
        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
        act_cfg=dict(type='Swish'),
        init_cfg=dict(
            type='Pretrained',
            checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
            'yolox/yolox_tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_'
            '20211124_171234-b4047906.pth',
            prefix='backbone.',
        )),
    neck=dict(
        type='HybridEncoder',
        in_channels=[96, 192, 384],
        deepen_factor=deepen_factor,
        widen_factor=widen_factor,
        hidden_dim=256,
        output_indices=[1, 2],
        encoder_cfg=dict(
            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
            ffn_cfg=dict(
                embed_dims=256,
                feedforward_channels=1024,
                ffn_drop=0.0,
                act_cfg=dict(type='GELU'))),
        projector=dict(
            type='ChannelMapper',
            in_channels=[256, 256],
            kernel_size=1,
            out_channels=192,
            act_cfg=None,
            norm_cfg=dict(type='BN'),
            num_outs=2)),
    head=dict(
        type='RTMOHead',
        # NOTE(review): presumably has to equal the keypoint count defined
        # in `metafile` - confirm.
        num_keypoints=3,
        featmap_strides=(16, 32),
        head_module_cfg=dict(
            num_classes=1,
            in_channels=256,
            cls_feat_channels=256,
            channels_per_group=36,
            pose_vec_channels=192,
            widen_factor=widen_factor,
            stacked_convs=2,
            norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg=dict(type='Swish')),
        assigner=dict(
            type='SimOTAAssigner',
            dynamic_k_indicator='oks',
            oks_calculator=dict(type='PoseOKS', metainfo=metafile),
            use_keypoints_for_center=True),
        prior_generator=dict(
            type='MlvlPointGenerator',
            centralize_points=True,
            strides=[16, 32]),
        dcc_cfg=dict(
            in_channels=192,
            feat_channels=128,
            num_bins=(192, 256),
            spe_channels=128,
            gau_cfg=dict(
                s=128,
                expansion_factor=2,
                dropout_rate=0.0,
                drop_path=0.0,
                act_fn='SiLU',
                pos_enc='add')),
        overlaps_power=0.5,
        # Loss terms and their weights.
        loss_cls=dict(
            type='VariFocalLoss',
            reduction='sum',
            use_target_weight=True,
            loss_weight=1.0),
        loss_bbox=dict(
            type='IoULoss',
            mode='square',
            eps=1e-16,
            reduction='sum',
            loss_weight=5.0),
        loss_oks=dict(
            type='OKSLoss',
            reduction='none',
            metainfo=metafile,
            loss_weight=30.0),
        loss_vis=dict(
            type='BCELoss',
            use_target_weight=True,
            reduction='mean',
            loss_weight=1.0),
        loss_mle=dict(
            type='MLECCLoss',
            use_target_weight=True,
            loss_weight=1.0,
        ),
        loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
    ),
    # Inference settings: same input size as the validation pipeline.
    test_cfg=dict(
        input_size=input_size,
        score_thr=0.1,
        nms_thr=0.65,
    ))
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets= 20 ] = 0.000
Average Precision (AP) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.000
Average Precision (AP) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.000
Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = -1.000
Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.000
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 20 ] = 0.000
Average Recall (AR) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.000
Average Recall (AR) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.000
Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = -1.000
Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.000
03/01 15:27:13 - mmengine - INFO - Epoch(val) [1][202/202] coco/AP: 0.000000 coco/AP .5: 0.000000 coco/AP .75: 0.000000 coco/AP (M): -1.000000 coco/AP (L): 0.000000 coco/AR: 0.000000 coco/AR .5: 0.000000 coco/AR .75: 0.000000 coco/AR (M): -1.000000 coco/AR (L): 0.000000 data_time: 0.062827 time: 0.090564
可以把val的batch_size设成1试试
可以把val的batch_size设成1试试
已经设置成1了,还是这样
请问是否有RTMO训练自定义数据集的教程?