aim-uofa / Poseur

[ECCV 2022] The official repo for the paper "Poseur: Direct Human Pose Regression with Transformers".
Other
179 stars 13 forks source link

How to train on my own dataset? #12

Closed trapqueenxx closed 1 year ago

trapqueenxx commented 1 year ago

Hello, I have trained on my own dataset with 29 keypoints, and there is an error in poseur_head.py: the size of enc_outputs is [32, 17, 2] (32 is the batch size, 17 is the number of keypoints), which doesn't match my 29 keypoints. Then I found that in the PoseurTransformer_v3 class in transformer.py, num_joints is initialized to 17, so I changed it to 29 and started training. But the resulting epoch AP is 0, and the loss and accuracy in the training log are unusual, as shown below:

INFO - Epoch [1][50/1446] lr: 9.890e-05, eta: 4 days, 5:02:52, time: 1.198, data_time: 0.115, memory: 23176, enc_rle_loss: 319.5232, dec_rle_loss_0: 1515.8930, dec_rle_loss_1: 1094.4419, dec_rle_loss_2: 917.2394, dec_rle_loss_3: 1053.5126, dec_rle_loss_4: 977.9670, dec_rle_loss_5: 1089.1097, enc_coord_acc: 0.0002, dec_coord_acc: 0.0027, loss: 6967.6868

I don't know how to train on my own dataset.

YongtaoGe commented 1 year ago

Hi, @trapqueenxx can you provide the whole config file here?

trapqueenxx commented 1 year ago

Hi, @trapqueenxx can you provide the whole config file here?

Thank you for your response. My config is below.

# Training config for Poseur on a custom 29-keypoint top-down dataset.
# Restored to one statement per line; all keys and values are unchanged.

# ---------------------------------------------------------------------------
# Runtime
# ---------------------------------------------------------------------------
log_level = 'INFO'
# NOTE(review): the checkpoint name suggests COCO weights at 256x192, while
# this config uses 29 joints -- confirm the weights are compatible.
load_from = '../models/poseur_256x192_w32_6dec_coco.pth'
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=10)
evaluation = dict(
    interval=10,
    metric='mAP',
    key_indicator='AP',
    rle_score=True)

# ---------------------------------------------------------------------------
# Optimizer and schedule
# ---------------------------------------------------------------------------
optimizer = dict(
    type='AdamW',
    lr=1e-3,
    weight_decay=1e-4,
    paramwise_cfg=dict(
        custom_keys={
            'backbone': dict(lr_mult=0.1),
            'sampling_offsets': dict(lr_mult=0.1),
            'reference_points': dict(lr_mult=0.1),
            # 'query_embed': dict(lr_mult=0.5, decay_mult=1.0),
        },
    )
)

optimizer_config = dict(grad_clip=None)
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[170, 200])
total_epochs = 210

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook'),
    ])

# ---------------------------------------------------------------------------
# Keypoint layout: 29 joints, all of them used for training and inference
# ---------------------------------------------------------------------------
channel_cfg = dict(
    num_output_channels=29,
    dataset_joints=29,
    dataset_channel=[
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
         19, 20, 21, 22, 23, 24, 25, 26, 27, 28],
    ],
    inference_channel=[
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28
    ])

# Embedding width shared by the neck, positional encoding and transformer.
emb_dim = 256

# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
norm_cfg = dict(type='BN', requires_grad=True)
model = dict(
    type='Poseur',
    pretrained=load_from,
    backbone=dict(
        type='HRNet',
        norm_cfg=norm_cfg,
        in_channels=3,
        extra=dict(
            stage1=dict(
                num_modules=1,
                num_branches=1,
                block='BOTTLENECK',
                num_blocks=(4, ),
                num_channels=(64, )),
            stage2=dict(
                num_modules=1,
                num_branches=2,
                block='BASIC',
                num_blocks=(4, 4),
                num_channels=(32, 64)),
            stage3=dict(
                num_modules=4,
                num_branches=3,
                block='BASIC',
                num_blocks=(4, 4, 4),
                num_channels=(32, 64, 128)),
            stage4=dict(
                num_modules=3,
                num_branches=4,
                block='BASIC',
                num_blocks=(4, 4, 4, 4),
                num_channels=(32, 64, 128, 256),
                multiscale_output=True,
            )),
    ),
    neck=dict(
        type='ChannelMapper',
        in_channels=[32, 64, 128, 256],
        kernel_size=1,
        out_channels=emb_dim,
        act_cfg=None,
        norm_cfg=dict(type='GN', num_groups=32),
    ),
    keypoint_head=dict(
        type='Poseur_noise_sample',
        in_channels=512,
        num_queries=29,
        num_reg_fcs=2,
        num_joints=channel_cfg['num_output_channels'],
        with_box_refine=True,
        loss_coord_enc=dict(type='RLELoss_poseur', use_target_weight=True),
        loss_coord_dec=dict(type='RLELoss_poseur', use_target_weight=True),
        loss_hp_keypoint=dict(
            type='JointsMSELoss', use_target_weight=True, loss_weight=10),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=emb_dim // 2,
            normalize=True,
            offset=-0.5),
        transformer=dict(
            type='PoseurTransformer_v3',
            query_pose_emb=True,
            embed_dims=emb_dim,
            encoder=dict(
                type='DetrTransformerEncoder_zero_layer',
                num_layers=0,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    ffn_cfgs=dict(
                        embed_dims=emb_dim,
                    ),
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        num_levels=4,
                        num_points=4,
                        embed_dims=emb_dim),
                    feedforward_channels=1024,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DeformableDetrTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer_grouped',
                    ffn_cfgs=dict(
                        embed_dims=emb_dim,
                    ),
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=emb_dim,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='MultiScaleDeformableAttention_post_value',
                            num_levels=4,
                            num_points=4,
                            embed_dims=emb_dim)],
                    feedforward_channels=1024,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        as_two_stage=True,
        use_heatmap_loss=False,
    ),
    train_cfg=dict(image_size=[192, 256]),
    test_cfg=dict(
        image_size=[192, 256],
        flip_test=True,
        post_process='default',
        shift_heatmap=True,
        modulate_kernel=11)
)

# ---------------------------------------------------------------------------
# Data
# ---------------------------------------------------------------------------
# NOTE(review): image_size here is [288, 384] while train_cfg/test_cfg above
# use [192, 256] -- confirm which resolution is intended.
data_cfg = dict(
    image_size=[288, 384],
    heatmap_size=[72, 96],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    use_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    det_bbox_thr=0.0,
    use_gt_bbox=True,
    bbox_file='',
)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownGetBboxCenterScale', padding=1.25),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3),
    dict(
        type='TopDownGetRandomScaleRotation',
        rot_factor=40,
        scale_factor=0.5),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        target_type='wo_mask',
        type='TopDownGenerateCoordAndHeatMapTarget',
        encoding='MSRA',
        sigma=2),
    dict(
        type='Collect',
        keys=['img', 'coord_target', 'coord_target_weight', 'hp_target',
              'hp_target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center',
            'scale', 'rotation', 'bbox_score', 'flip_pairs'
        ]),
]

val_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownGetBboxCenterScale', padding=1.25),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='Collect',
        keys=[
            'img',
        ],
        meta_keys=[
            'image_file', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs'
        ]),
]

# Testing reuses the validation pipeline object unchanged.
test_pipeline = val_pipeline

data = dict(
    samples_per_gpu=32,
    workers_per_gpu=4,
    val_dataloader=dict(samples_per_gpu=32),
    test_dataloader=dict(samples_per_gpu=32),
    train=dict(
        type='TopDownHikposeDataset',
        ann_file='../labels/del_train.json',
        img_prefix='',
        data_cfg=data_cfg,
        pipeline=train_pipeline,
    ),
    val=dict(
        type='TopDownHikposeDataset',
        ann_file='../labels/del_test_1.json',
        img_prefix='',
        data_cfg=data_cfg,
        pipeline=val_pipeline,
    ),
    test=dict(
        type='TopDownHikposeDataset',
        ann_file='../labels/del_test_1.json',
        img_prefix='',
        data_cfg=data_cfg,
        pipeline=test_pipeline,
    ),
)

fp16 = dict(loss_scale='dynamic')

YongtaoGe commented 1 year ago

@trapqueenxx I have added a config file for the MPII dataset, so you can use it as a reference for your own dataset.