Closed LeBron-Jian closed 3 weeks ago
贴一下config吧
贴一下config吧
# RTMO bottom-up hand-keypoint training config (21 keypoints, 640x640 input).
# NOTE(review): the pasted original lost its underscores to markdown italics;
# `_base_` (and presumably the `_base_` path segments) are restored here —
# mmengine only recognizes the `_base_` name for config inheritance. Confirm
# the paths against the actual repo layout.
_base_ = ['../configs/_base_/default_runtime.py']

# ---------------------------------------------------------------------------
# schedule
# ---------------------------------------------------------------------------
train_cfg = dict(max_epochs=700, val_interval=1, dynamic_intervals=[(670, 1)])
auto_scale_lr = dict(base_batch_size=256)

default_hooks = dict(
    checkpoint=dict(type='CheckpointHook', interval=50, max_keep_ckpts=3))

optim_wrapper = dict(
    type='OptimWrapper',
    constructor='ForceDefaultOptimWrapperConstructor',
    optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
    paramwise_cfg=dict(
        norm_decay_mult=0,
        bias_decay_mult=0,
        bypass_duplicate=True,
        force_default_settings=True,
        # lower LR for the transformer encoder in the neck
        custom_keys=dict({'neck.encoder': dict(lr_mult=0.05)})),
    clip_grad=dict(max_norm=0.1, norm_type=2))

param_scheduler = [
    dict(
        type='QuadraticWarmupLR',
        by_epoch=True,
        begin=0,
        end=5,
        convert_to_iter_based=True),
    dict(
        type='CosineAnnealingLR',
        eta_min=0.0002,
        begin=5,
        T_max=350,
        end=349,
        by_epoch=True,
        convert_to_iter_based=True),
    dict(type='ConstantLR', by_epoch=True, factor=2.5, begin=349, end=350),
    dict(
        type='CosineAnnealingLR',
        eta_min=0.0002,
        begin=350,
        T_max=320,
        end=670,
        by_epoch=True,
        convert_to_iter_based=True),
    dict(type='ConstantLR', by_epoch=True, factor=1, begin=670, end=700),
]

# ---------------------------------------------------------------------------
# data
# ---------------------------------------------------------------------------
input_size = (640, 640)
metafile = 'configs/_base_/datasets/hand.py'
codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)

# Stage-1 training pipeline: heavy augmentation (Mosaic / MixUp / affine).
# BottomupRandomAffine maps GT bboxes and keypoints into input space.
train_pipeline_stage1 = [
    dict(type='LoadImage', backend_args=None),
    dict(
        type='Mosaic',
        img_scale=(640, 640),
        pad_val=114.0,
        pre_transform=[dict(type='LoadImage', backend_args=None)]),
    dict(
        type='BottomupRandomAffine',
        input_size=(640, 640),
        shift_factor=0.2,
        rotate_factor=30,
        scale_factor=(0.5, 1.5),
        pad_val=114,
        distribution='uniform',
        transform_mode='perspective',
        bbox_keep_corner=False,
        clip_border=True,
    ),
    dict(
        type='YOLOXMixUp',
        img_scale=(640, 640),
        ratio_range=(0.6, 1.6),
        pad_val=114.0,
        pre_transform=[dict(type='LoadImage', backend_args=None)]),
    dict(type='YOLOXHSVRandomAug'),
    dict(type='RandomFlip'),
    dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
    dict(type='GenerateTarget', encoder=codec),
    dict(type='PackPoseInputs'),
]

# Stage-2 training pipeline: augmentation switched off for the last epochs
# (installed by YOLOXPoseModeSwitchHook below).
train_pipeline_stage2 = [
    dict(type='LoadImage'),
    dict(
        type='BottomupRandomAffine',
        input_size=(640, 640),
        shift_prob=0,
        rotate_prob=0,
        scale_prob=0,
        scale_type='long',
        pad_val=(114, 114, 114),
        bbox_keep_corner=False,
        clip_border=True,
    ),
    dict(type='YOLOXHSVRandomAug'),
    dict(type='RandomFlip'),
    dict(type='BottomupGetHeatmapMask', get_invalid=True),
    dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
    dict(type='GenerateTarget', encoder=codec),
    dict(type='PackPoseInputs'),
]

# Evaluation pipeline: BottomupResize resizes the image only; GT stays in
# original-image coordinates (metrics are computed in original space).
val_pipeline = [
    dict(type='LoadImage'),
    dict(
        type='BottomupResize', input_size=input_size, pad_val=(114, 114, 114)),
    dict(type='BottomupGetHeatmapMask', get_invalid=True),
    dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
    dict(type='GenerateTarget', encoder=codec),
    dict(
        type='PackPoseInputs',
        meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape',
                   'input_size', 'input_center', 'input_scale'))
]

data_mode = 'bottomup'
data_root = 'D:/Hand/data/BonesOfHand_keypoint/COCODataset1/'

# FIX: the training dataset was built with `pipeline=val_pipeline`, so images
# were resized by BottomupResize while the GT bboxes/keypoints handed to the
# SimOTA assigner stayed in original-image coordinates (the bug reported in
# this issue). Training must use the augmenting stage-1 pipeline.
dataset_crowdpose = dict(
    type='HandDataset',
    data_root=data_root,
    data_mode=data_mode,
    ann_file='train_coco.json',
    data_prefix=dict(img='images/'),
    pipeline=train_pipeline_stage1,
)

train_dataloader = dict(
    batch_size=1,
    num_workers=1,
    persistent_workers=True,
    pin_memory=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dataset_crowdpose)

val_dataloader = dict(
    batch_size=1,
    num_workers=1,
    persistent_workers=True,
    pin_memory=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
    dataset=dict(
        type='HandDataset',
        data_root=data_root,
        data_mode=data_mode,
        ann_file='val_coco.json',
        data_prefix=dict(img='images/'),
        test_mode=True,
        pipeline=val_pipeline,
    ))
test_dataloader = val_dataloader

val_evaluator = dict(
    type='CocoMetric',
    ann_file=data_root + 'val_coco.json',
    score_mode='bbox',
    nms_mode='none',
)
test_evaluator = val_evaluator

custom_hooks = [
    # switch to the no-augmentation stage-2 pipeline for the last 30 epochs
    dict(
        type='YOLOXPoseModeSwitchHook',
        num_last_epochs=30,
        new_train_pipeline=train_pipeline_stage2,
        priority=48),
    # re-weight losses / enable proxy targets from epoch 350
    dict(
        type='RTMOModeSwitchHook',
        epoch_attributes={
            350: {
                'proxy_target_cc': True,
                'overlaps_power': 1.0,
                'loss_cls.loss_weight': 2.0,
                'loss_mle.loss_weight': 5.0,
                'loss_oks.loss_weight': 10.0
            },
        },
        priority=48),
    dict(type='SyncNormHook', priority=48),
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0002,
        update_buffers=True,
        strict_load=False,
        priority=49),
]

# ---------------------------------------------------------------------------
# model (RTMO-s sized: widen 0.5 / deepen 0.33)
# ---------------------------------------------------------------------------
widen_factor = 0.5
deepen_factor = 0.33

model = dict(
    type='BottomupPoseEstimator',
    init_cfg=dict(
        type='Kaiming',
        layer='Conv2d',
        a=2.23606797749979,
        distribution='uniform',
        mode='fan_in',
        nonlinearity='leaky_relu'),
    data_preprocessor=dict(
        type='PoseDataPreprocessor',
        pad_size_divisor=32,
        mean=[0, 0, 0],
        std=[1, 1, 1],
        # multi-scale batch augmentation kept disabled, as in the original:
        # batch_augments=[
        #     dict(
        #         type='BatchSyncRandomResize',
        #         random_size_range=(480, 800),
        #         size_divisor=32,
        #         interval=1),
        # ]
    ),
    backbone=dict(
        type='CSPDarknet',
        deepen_factor=deepen_factor,
        widen_factor=widen_factor,
        out_indices=(2, 3, 4),
        spp_kernal_sizes=(5, 9, 13),
        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
        act_cfg=dict(type='Swish'),
        init_cfg=dict(
            type='Pretrained',
            checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
            'yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_'
            '20211121_095711-4592a793.pth',
            prefix='backbone.',
        )),
    neck=dict(
        type='HybridEncoder',
        in_channels=[128, 256, 512],
        deepen_factor=deepen_factor,
        widen_factor=widen_factor,
        hidden_dim=256,
        output_indices=[1, 2],
        encoder_cfg=dict(
            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
            ffn_cfg=dict(
                embed_dims=256,
                feedforward_channels=1024,
                ffn_drop=0.0,
                act_cfg=dict(type='GELU'))),
        projector=dict(
            type='ChannelMapper',
            in_channels=[256, 256],
            kernel_size=1,
            out_channels=256,
            act_cfg=None,
            norm_cfg=dict(type='BN'),
            num_outs=2)),
    head=dict(
        type='RTMOHead',
        num_keypoints=21,
        featmap_strides=(16, 32),
        head_module_cfg=dict(
            num_classes=1,
            in_channels=256,
            cls_feat_channels=256,
            channels_per_group=36,
            pose_vec_channels=256,
            widen_factor=widen_factor,
            stacked_convs=2,
            norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg=dict(type='Swish')),
        assigner=dict(
            type='SimOTAAssigner',
            dynamic_k_indicator='oks',
            oks_calculator=dict(type='PoseOKS', metainfo=metafile)),
        prior_generator=dict(
            type='MlvlPointGenerator',
            centralize_points=True,
            strides=[16, 32]),
        dcc_cfg=dict(
            in_channels=256,
            feat_channels=128,
            num_bins=(192, 256),
            spe_channels=128,
            gau_cfg=dict(
                s=128,
                expansion_factor=2,
                dropout_rate=0.0,
                drop_path=0.0,
                act_fn='SiLU',
                pos_enc='add')),
        overlaps_power=0.5,
        loss_cls=dict(
            type='VariFocalLoss',
            reduction='sum',
            use_target_weight=True,
            loss_weight=1.0),
        loss_bbox=dict(
            type='IoULoss',
            mode='square',
            eps=1e-16,
            reduction='sum',
            loss_weight=5.0),
        loss_oks=dict(
            type='OKSLoss',
            reduction='none',
            metainfo=metafile,
            loss_weight=30.0),
        loss_vis=dict(
            type='BCELoss',
            use_target_weight=True,
            reduction='mean',
            loss_weight=1.0),
        loss_mle=dict(
            type='MLECCLoss',
            use_target_weight=True,
            loss_weight=1e-3,
        ),
        loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
    ),
    test_cfg=dict(
        input_size=input_size,
        score_thr=0.1,
        nms_thr=0.65,
    ))
我也打印了输入图像的size,并且show了图像。图像就是resize到640*640的,并且保持宽高比的。但是bbox和keypoints还是原始的,你是说config不对吗?
dataset_crowdpose = dict(
type='HandDataset',
data_root=data_root,
data_mode=data_mode,
ann_file='train_coco.json',
data_prefix=dict(img='images/'),
pipeline=val_pipeline, # val_pipeline # train_pipeline_stage1
)
训练数据集别用val_pipeline
dataset_crowdpose = dict( type='HandDataset', data_root=data_root, data_mode=data_mode, ann_file='train_coco.json', data_prefix=dict(img='images/'), pipeline=val_pipeline, # val_pipeline # train_pipeline_stage1 )
训练数据集别用val_pipeline
看到了BottomupRandomAffine和BottomupResize的区别了。没细读代码,感谢
dataset_crowdpose = dict( type='HandDataset', data_root=data_root, data_mode=data_mode, ann_file='train_coco.json', data_prefix=dict(img='images/'), pipeline=val_pipeline, # val_pipeline # train_pipeline_stage1 )
训练数据集别用val_pipeline
看大了BottomupRandomAffine和BottomupResize的区别了。没细读代码,感谢
你好 我在训练RTMO时发现他的loss_mle会训练成负值 为什么会这样是因为1e−4嘛 09/01 10:07:00 - mmengine - INFO - Epoch(train) [1][ 9000/16029] base_lr: 5.044198e-05 lr: 5.044198e-05 eta: 26 days, 17:51:34 time: 0.248321 data_time: 0.002521 memory: 4903 grad_norm: 75.045101 loss: 24.547811 loss_bbox: 2.699222 loss_vis: 0.458490 loss_mle: -0.017420 loss_oks: 20.619778 loss_cls: 0.787740 num_samples: 67.000000 overlaps: 0.486408
dataset_crowdpose = dict( type='HandDataset', data_root=data_root, data_mode=data_mode, ann_file='train_coco.json', data_prefix=dict(img='images/'), pipeline=val_pipeline, # val_pipeline # train_pipeline_stage1 )
训练数据集别用val_pipeline
看大了BottomupRandomAffine和BottomupResize的区别了。没细读代码,感谢
你好 我在训练RTMO时发现他的loss_mle会训练成负值 为什么会这样是因为1e−4嘛 09/01 10:07:00 - mmengine - INFO - Epoch(train) [1][ 9000/16029] base_lr: 5.044198e-05 lr: 5.044198e-05 eta: 26 days, 17:51:34 time: 0.248321 data_time: 0.002521 memory: 4903 grad_norm: 75.045101 loss: 24.547811 loss_bbox: 2.699222 loss_vis: 0.458490 loss_mle: -0.017420 loss_oks: 20.619778 loss_cls: 0.787740 num_samples: 67.000000 overlaps: 0.486408
我训练自己的模型,mleloss也出现了负值。(我自己的数据集)。我训练的mlecc的loss权重是1,你的是1e-4吗
@Ben-Louis 大佬再请教一个问题。我使用rtmo里面的coco数据集及其coco下面的rtmo-s_8xb32-600e_coco-640x640.py配置文件进行训练,得到的最终mAP才达到0.607? 而与实际的0.677还是有很大的差异的,请问是什么原因?
我只是将batch_size由32改为了16(因为电脑配置的问题,32跑不动)。
然后我对比了backbone为s和m,l的区别。我发现就MLECCLoss的权重是1,而不是1e-3, 而且crowdpose的rtmo-s的MLECCLOSS的权重也是1e-3,所以这个是其中一个原因吗?
@Ben-Louis 最终我的训练,得到的结果如下 而使用rtmo下载的s模型的权重跑的结果如下:
感觉差异还是蛮大的呢,七个点。。所以公开的rtmo-s的训练配置问题参数有问题吗?
dataset_crowdpose = dict( type='HandDataset', data_root=data_root, data_mode=data_mode, ann_file='train_coco.json', data_prefix=dict(img='images/'), pipeline=val_pipeline, # val_pipeline # train_pipeline_stage1 )
训练数据集别用val_pipeline
看大了BottomupRandomAffine和BottomupResize的区别了。没细读代码,感谢
你好 我在训练RTMO时发现他的loss_mle会训练成负值 为什么会这样是因为1e−4嘛 09/01 10:07:00 - mmengine - INFO - Epoch(train) [1][ 9000/16029] base_lr: 5.044198e-05 lr: 5.044198e-05 eta: 26 days, 17:51:34 time: 0.248321 data_time: 0.002521 memory: 4903 grad_norm: 75.045101 loss: 24.547811 loss_bbox: 2.699222 loss_vis: 0.458490 loss_mle: -0.017420 loss_oks: 20.619778 loss_cls: 0.787740 num_samples: 67.000000 overlaps: 0.486408
我训练自己的模型,mleloss也出现了负值。(我自己的数据集)。我训练的mlecc的loss权重是1,你的是1e-4吗
没有唉 我用的是正常的rtmo-l的config配置
@Ben-Louis 大佬再请教一个问题。我使用rtmo里面的coco数据集及其coco下面的rtmo-s_8xb32-600e_coco-640x640.py配置文件进行训练,得到的最终mAP才达到0.607? 而与实际的0.677还是有很大的差异的,请问是什么原因?
我只是将batch_size由32改为了16(因为电脑配置的问题,32跑不动)。
然后我对比了backbone为s和m,l的区别。我发现就MLECCLoss的权重是1,而不是1e-3, 而且crowdpose的rtmo-s的MLECCLOSS的权重也是1e-3,所以这个是其中一个原因吗?
batchsize是 $8 \times 32 = 256$,不是 32。你要复现的话 batchsize 得一致
@Ben-Louis 大佬再请教一个问题。我使用rtmo里面的coco数据集及其coco下面的rtmo-s_8xb32-600e_coco-640x640.py配置文件进行训练,得到的最终mAP才达到0.607? 而与实际的0.677还是有很大的差异的,请问是什么原因? 我只是将batch_size由32改为了16(因为电脑配置的问题,32跑不动)。 然后我对比了backbone为s和m,l的区别。我发现就MLECCLoss的权重是1,而不是1e-3, 而且crowdpose的rtmo-s的MLECCLOSS的权重也是1e-3,所以这个是其中一个原因吗?
batchsize是 8 × 32 = 256 ,不是 32。你要复现的话 batchsize 得一致
@Ben-Louis 所以你这8xb32其实是8个GPU,每个gpu的batch_size是32。因此总的batch_size是256。而实际上我复现的是1xb16,是不是对应的我的学习率应该设置为lr/16,对应到config文件,应该设置auto_scale_lr = dict(base_batch_size=256/16),这样就可以达到起码接近的效果,是这样吗?
@Ben-Louis 还有就是对于mmpose里面的 configs\body_2d_keypoint\rtmo\coco\rtmo_coco.yml Dataset还写的是 CrowdPose,这个可以修改一下呢。容易误导
dataset_crowdpose = dict( type='HandDataset', data_root=data_root, data_mode=data_mode, ann_file='train_coco.json', data_prefix=dict(img='images/'), pipeline=val_pipeline, # val_pipeline # train_pipeline_stage1 )
训练数据集别用val_pipeline
看大了BottomupRandomAffine和BottomupResize的区别了。没细读代码,感谢
你好 我在训练RTMO时发现他的loss_mle会训练成负值 为什么会这样是因为1e−4嘛 09/01 10:07:00 - mmengine - INFO - Epoch(train) [1][ 9000/16029] base_lr: 5.044198e-05 lr: 5.044198e-05 eta: 26 days, 17:51:34 time: 0.248321 data_time: 0.002521 memory: 4903 grad_norm: 75.045101 loss: 24.547811 loss_bbox: 2.699222 loss_vis: 0.458490 loss_mle: -0.017420 loss_oks: 20.619778 loss_cls: 0.787740 num_samples: 67.000000 overlaps: 0.486408
我训练自己的模型,mleloss也出现了负值。(我自己的数据集)。我训练的mlecc的loss权重是1,你的是1e-4吗
没有唉 我用的是正常的rtmo-l的config配置
这个应该是正常的吧。rtmo-l的config中MLECCLoss的 loss_weight=1e-2。我个人理解:对于MLECCLoss来说,函数使用了log类型,即 loss = -torch.log(prob + 1e-4),假设prob的取值范围是[0, 1],那么此对数函数的取值范围就是[-0.0001, 9.2103],所以说,为负值应该是正常的吧。而且 prob = 1.0; for o, t in zip(outputs, targets): prob *= (o * t).sum(dim=-1) 逐元素相乘求和,再乘到prob中,我感觉会超过1,甚至更大,然后对于大数求ln,再取负数,会出现负值吧。你是如何理解的呢?
dataset_crowdpose = dict( type='HandDataset', data_root=data_root, data_mode=data_mode, ann_file='train_coco.json', data_prefix=dict(img='images/'), pipeline=val_pipeline, # val_pipeline # train_pipeline_stage1 )
训练数据集别用val_pipeline
看大了BottomupRandomAffine和BottomupResize的区别了。没细读代码,感谢
你好 我在训练RTMO时发现他的loss_mle会训练成负值 为什么会这样是因为1e−4嘛 09/01 10:07:00 - mmengine - INFO - Epoch(train) [1][ 9000/16029] base_lr: 5.044198e-05 lr: 5.044198e-05 eta: 26 days, 17:51:34 time: 0.248321 data_time: 0.002521 memory: 4903 grad_norm: 75.045101 loss: 24.547811 loss_bbox: 2.699222 loss_vis: 0.458490 loss_mle: -0.017420 loss_oks: 20.619778 loss_cls: 0.787740 num_samples: 67.000000 overlaps: 0.486408
我训练自己的模型,mleloss也出现了负值。(我自己的数据集)。我训练的mlecc的loss权重是1,你的是1e-4吗
没有唉 我用的是正常的rtmo-l的config配置
这个应该是正常的吧。rtmo-l的config中MLECCLoss的 loss_weight=1e-2。我个人理解:对于MLECCLoss来说,函数使用了log类型,即 loss = -torch.log(prob + 1e-4),假设prob的取值范围是[0, 1],那么此对数函数的取值范围就是[-0.0001, 9.2103],所以说,为负值应该是正常的吧。而且 prob = 1.0; for o, t in zip(outputs, targets): prob *= (o * t).sum(dim=-1) 逐元素相乘求和,再乘到prob中,我感觉会超过1,甚至更大,然后对于大数求ln,再取负数,会出现负值吧。你是如何理解的呢?
确实唉 十分感谢!
Prerequisite
Environment
OrderedDict([('sys.platform', 'win32'), ('Python', '3.9.13 (main, Oct 13 2022, 21:23:06) [MSC v.1916 64 bit (AMD64)]'), ('CUDA available', True), ('MUSA available', False), ('numpy_random_seed', 2147483648), ('GPU 0', 'NVIDIA GeForce RTX 3090'), ('GPU 1', 'NVIDIA GeForce RTX 3060'), ('CUDA_HOME', 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8'), ('NVCC', 'Cuda compilation tools, release 11.8, V11.8.89'), ('MSVC', '用于 x64 的 Microsoft (R) C/C++ 优化编译器 19.34.31937 版'), ('GCC', 'n/a'), ('PyTorch', '2.0.1+cu117'), ('PyTorch compiling details', 'PyTorch built with:\n - C++ Version: 199711\n - MSVC 193431937\n - Intel(R) Math Kernel Library Version 2020.0.2 Product Build 20200624 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v2.7.3 (Git Hash 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)\n - OpenMP 2019\n - LAPACK is enabled (usually provided by MKL)\n - CPU capability usage: AVX2\n - CUDA Runtime 11.7\n - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_37,code=compute_37\n - CuDNN 8.5\n - Magma 2.5.4\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.7, CUDNN_VERSION=8.5.0, CXX_COMPILER=C:/actions-runner/_work/pytorch/pytorch/builder/windows/tmp_bin/sccache-cl.exe, CXX_FLAGS=/DWIN32 /D_WINDOWS /GR /EHsc /w /bigobj /FS -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOCUPTI -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_DISABLE_GPU_ASSERTS=OFF, TORCH_VERSION=2.0.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=OFF, USE_NNPACK=OFF, USE_OPENMP=ON, 
USE_ROCM=OFF, \n'), ('TorchVision', '0.15.2+cu117'), ('OpenCV', '4.9.0'), ('MMEngine', '0.10.3'), ('MMPose', '1.3.1+5a3be94')])
Reproduces the problem - code sample
当我使用RTMO算法进行训练自己的数据集的时候,我发现在计算loss的时候,也就是在Hybrid_heads/rtmo_head.py里面。当使用generate_targets函数的时候。传入的gt信息是原图的信息。而不是resize到640*640的关键点和bbox坐标。请问这是为什么,算错了吗?还是我理解有问题?
Reproduces the problem - command or script
python tools/train.py .\data\rtmo-s_8xb32-700e_hand-640x640.py
Reproduces the problem - error message
我的coco文件夹第一张图的关键点信息如下 当我进入Hybrid_heads/rtmo_head.py里面,也就是下面函数
然后进入yolopose_head.py里面的self._get_targets_single()函数,通过Sim_ota进行正负样本分配,即下面位置:
我以为是在计算前进行转换,但是我分析sim_ota_assigner.py里面的assign函数拿到gt直接进行计算。 即下面位置我进行了打印
打印结果如下:
所以说是使用原图的信息进行正负样本匹配吗?
按照正常的理解不应该是resize后的图像的gt吗?
Additional information
按照正常的理解不应该是resize后的图像的gt吗?