Closed trapqueenxx closed 1 year ago
Hi, @trapqueenxx can you provide the whole config file here?
Hi, @trapqueenxx can you provide the whole config file here?
Thank you for your response. My config is below.
# Training config for a `Poseur` top-down pose model on a custom 29-keypoint
# dataset (`TopDownHikposeDataset`).
#
# This is the config exactly as posted, reconstructed from a whitespace-collapsed
# paste into one statement per line. All values are unchanged. The `type=...`
# strings are names that must be resolved by the consuming framework; none of
# those classes are defined in this file.

# ---------------------------------------------------------------------------
# Runtime
# ---------------------------------------------------------------------------
log_level = 'INFO'
# NOTE(review): the checkpoint file name suggests 256x192 COCO weights, while
# `data_cfg` below uses 288x384 inputs and 29 joints -- confirm this is intended.
load_from = '../models/poseur_256x192_w32_6dec_coco.pth'
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=10)
evaluation = dict(
    interval=10,
    metric='mAP',
    key_indicator='AP',
    rle_score=True,
)

# ---------------------------------------------------------------------------
# Optimizer and schedule
# ---------------------------------------------------------------------------
optimizer = dict(
    type='AdamW',
    lr=1e-3,
    weight_decay=1e-4,
    paramwise_cfg=dict(
        custom_keys={
            'sampling_offsets': dict(lr_mult=0.1),
            'reference_points': dict(lr_mult=0.1),
            # 'query_embed': dict(lr_mult=0.5, decay_mult=1.0),
        },
    ),
)

optimizer_config = dict(grad_clip=None)
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[170, 200],
)
total_epochs = 210

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook'),
    ],
)

# ---------------------------------------------------------------------------
# Keypoint layout: 29 joints, indices 0..28
# ---------------------------------------------------------------------------
channel_cfg = dict(
    num_output_channels=29,
    dataset_joints=29,
    dataset_channel=[
        [
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
            19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
        ],
    ],
    inference_channel=[
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26, 27, 28,
    ],
)

# Shared embedding width used by the neck, positional encoding and transformer.
emb_dim = 256

# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
norm_cfg = dict(type='BN', requires_grad=True)
model = dict(
    type='Poseur',
    pretrained=load_from,
    backbone=dict(
        type='HRNet',
        norm_cfg=norm_cfg,
        in_channels=3,
        extra=dict(
            stage1=dict(
                num_modules=1,
                num_branches=1,
                block='BOTTLENECK',
                num_blocks=(4, ),
                num_channels=(64, ),
            ),
            stage2=dict(
                num_modules=1,
                num_branches=2,
                block='BASIC',
                num_blocks=(4, 4),
                num_channels=(32, 64),
            ),
            stage3=dict(
                num_modules=4,
                num_branches=3,
                block='BASIC',
                num_blocks=(4, 4, 4),
                num_channels=(32, 64, 128),
            ),
            stage4=dict(
                num_modules=3,
                num_branches=4,
                block='BASIC',
                num_blocks=(4, 4, 4, 4),
                num_channels=(32, 64, 128, 256),
                multiscale_output=True,
            ),
        ),
    ),
    neck=dict(
        type='ChannelMapper',
        # Matches the four `stage4` branch widths of the backbone above.
        in_channels=[32, 64, 128, 256],
        kernel_size=1,
        out_channels=emb_dim,
        act_cfg=None,
        norm_cfg=dict(type='GN', num_groups=32),
    ),
    keypoint_head=dict(
        type='Poseur_noise_sample',
        in_channels=512,
        num_queries=29,
        num_reg_fcs=2,
        num_joints=channel_cfg['num_output_channels'],
        with_box_refine=True,
        loss_coord_enc=dict(type='RLELoss_poseur', use_target_weight=True),
        loss_coord_dec=dict(type='RLELoss_poseur', use_target_weight=True),
        loss_hp_keypoint=dict(
            type='JointsMSELoss',
            use_target_weight=True,
            loss_weight=10,
        ),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=emb_dim // 2,
            normalize=True,
            offset=-0.5,
        ),
        # NOTE(review): no joint count is passed to the transformer here. The
        # accompanying issue text reports that `PoseurTransformer_v3`
        # initializes `num_joints` to 17 -- confirm whether it accepts a
        # `num_joints` argument that should be set from `channel_cfg`.
        transformer=dict(
            type='PoseurTransformer_v3',
            query_pose_emb=True,
            embed_dims=emb_dim,
            encoder=dict(
                type='DetrTransformerEncoder_zero_layer',
                num_layers=0,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    ffn_cfgs=dict(
                        embed_dims=emb_dim,
                    ),
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        num_levels=4,
                        num_points=4,
                        embed_dims=emb_dim,
                    ),
                    feedforward_channels=1024,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'),
                ),
            ),
            decoder=dict(
                type='DeformableDetrTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer_grouped',
                    ffn_cfgs=dict(
                        embed_dims=emb_dim,
                    ),
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=emb_dim,
                            num_heads=8,
                            dropout=0.1,
                        ),
                        dict(
                            type='MultiScaleDeformableAttention_post_value',
                            num_levels=4,
                            num_points=4,
                            embed_dims=emb_dim,
                        ),
                    ],
                    feedforward_channels=1024,
                    ffn_dropout=0.1,
                    operation_order=(
                        'self_attn', 'norm', 'cross_attn', 'norm',
                        'ffn', 'norm',
                    ),
                ),
            ),
        ),
        as_two_stage=True,
        use_heatmap_loss=False,
    ),
    # NOTE(review): `image_size` here is [192, 256] but `data_cfg['image_size']`
    # below is [288, 384] -- confirm which resolution is intended and whether
    # the two must agree.
    train_cfg=dict(image_size=[192, 256]),
    test_cfg=dict(
        image_size=[192, 256],
        flip_test=True,
        post_process='default',
        shift_heatmap=True,
        modulate_kernel=11,
    ),
)

# ---------------------------------------------------------------------------
# Data
# ---------------------------------------------------------------------------
data_cfg = dict(
    image_size=[288, 384],
    heatmap_size=[72, 96],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    soft_nms=False,
    nms_thr=1.0,
    oks_thr=0.9,
    vis_thr=0.2,
    det_bbox_thr=0.0,
    use_gt_bbox=True,
    bbox_file='',
)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownGetBboxCenterScale', padding=1.25),
    dict(type='TopDownRandomFlip', flip_prob=0.5),
    dict(
        type='TopDownHalfBodyTransform',
        num_joints_half_body=8,
        prob_half_body=0.3,
    ),
    dict(
        type='TopDownGetRandomScaleRotation',
        rot_factor=40,
        scale_factor=0.5,
    ),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
    dict(
        target_type='wo_mask',
        type='TopDownGenerateCoordAndHeatMapTarget',
        encoding='MSRA',
        sigma=2,
    ),
    dict(
        type='Collect',
        keys=[
            'img', 'coord_target', 'coord_target_weight', 'hp_target',
            'hp_target_weight',
        ],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs',
        ],
    ),
]

val_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownGetBboxCenterScale', padding=1.25),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
    dict(
        type='Collect',
        keys=[
            'img',
        ],
        meta_keys=[
            'image_file', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs',
        ],
    ),
]

# The test pipeline is the very same list object as the validation pipeline.
test_pipeline = val_pipeline

data = dict(
    samples_per_gpu=32,
    workers_per_gpu=4,
    val_dataloader=dict(samples_per_gpu=32),
    test_dataloader=dict(samples_per_gpu=32),
    train=dict(
        type='TopDownHikposeDataset',
        ann_file='../labels/del_train.json',
        img_prefix='',
        data_cfg=data_cfg,
        pipeline=train_pipeline,
    ),
    # `val` and `test` read the same annotation file.
    val=dict(
        type='TopDownHikposeDataset',
        ann_file='../labels/del_test_1.json',
        img_prefix='',
        data_cfg=data_cfg,
        pipeline=val_pipeline,
    ),
    test=dict(
        type='TopDownHikposeDataset',
        ann_file='../labels/del_test_1.json',
        img_prefix='',
        data_cfg=data_cfg,
        pipeline=test_pipeline,
    ),
)

fp16 = dict(loss_scale='dynamic')
@trapqueenxx I have added a config file for the MPII dataset, which you can use as a reference for your own dataset.
Hello, I have trained on my own dataset with 29 keypoints, and there is an error in poseur_head.py: the size of enc_outputs is [32, 17, 2] (32 is the batch size, 17 is the number of keypoints), which doesn't match my 29 keypoints. I then found that in the PoseurTransformer_v3 class in transformer.py, num_joints is initialized to 17, so I changed it to 29 and started training. But the resulting epoch AP is 0, and the loss and accuracy in the training log are unusual, as shown below:
INFO - Epoch [1][50/1446] lr: 9.890e-05, eta: 4 days, 5:02:52, time: 1.198, data_time: 0.115, memory: 23176, enc_rle_loss: 319.5232, dec_rle_loss_0: 1515.8930, dec_rle_loss_1: 1094.4419, dec_rle_loss_2: 917.2394, dec_rle_loss_3: 1053.5126, dec_rle_loss_4: 977.9670, dec_rle_loss_5: 1089.1097, enc_coord_acc: 0.0002, dec_coord_acc: 0.0027, loss: 6967.6868
I don't know how to train on my own dataset.