Open innerlee opened 3 years ago
Motivation: Verify the effect of bottom-up models (such as HigherHRNet) on the AIC dataset. I have tried it, but the result is not good: the mAP is only 0.2. Please help me, thanks.
Configs: higher_hrnet32_aic_512x512.py
Datasets: bottom_up_aic.py
Details:
higher_hrnet32_aic_512x512.py
# ---------------------------------------------------------------------------
# Runtime settings
# ---------------------------------------------------------------------------
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=10)
evaluation = dict(interval=20, metric='mAP', key_indicator='AP')

# ---------------------------------------------------------------------------
# Optimisation
# ---------------------------------------------------------------------------
optimizer = dict(type='Adam', lr=0.0015)
optimizer_config = dict(grad_clip=None)

# Step learning-rate policy preceded by a linear warm-up.
# NOTE(review): both step milestones (200, 260) are larger than
# ``total_epochs`` (100), so presumably no step decay happens in this
# run -- confirm whether that is intended.
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[200, 260])
total_epochs = 100  # 300

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
    ])

# ---------------------------------------------------------------------------
# Keypoint channels and data geometry
# ---------------------------------------------------------------------------
# 14 keypoint channels, indexed 0..13, all used for training and inference.
channel_cfg = dict(
    num_output_channels=14,
    dataset_joints=14,
    dataset_channel=[list(range(14))],
    inference_channel=list(range(14)))

# Two heatmap sizes are listed, in line with ``num_scales=2``.
data_cfg = dict(
    image_size=512,
    base_size=256,
    base_sigma=2,
    heatmap_size=[128, 256],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'],
    num_scales=2,
    scale_aware_sigma=False,
)

# ---------------------------------------------------------------------------
# Model: HRNet backbone + higher-resolution bottom-up head
# ---------------------------------------------------------------------------
model = dict(
    type='BottomUp',
    pretrained='https://download.openmmlab.com/mmpose/'
    'pretrain_models/hrnet_w32-36af842e.pth',
    backbone=dict(
        type='HRNet',
        in_channels=3,
        extra=dict(
            stage1=dict(
                num_modules=1,
                num_branches=1,
                block='BOTTLENECK',
                num_blocks=(4, ),
                num_channels=(64, )),
            stage2=dict(
                num_modules=1,
                num_branches=2,
                block='BASIC',
                num_blocks=(4, 4),
                num_channels=(32, 64)),
            stage3=dict(
                num_modules=4,
                num_branches=3,
                block='BASIC',
                num_blocks=(4, 4, 4),
                num_channels=(32, 64, 128)),
            stage4=dict(
                num_modules=3,
                num_branches=4,
                block='BASIC',
                num_blocks=(4, 4, 4, 4),
                num_channels=(32, 64, 128, 256))),
    ),
    keypoint_head=dict(
        type='BottomUpHigherResolutionHead',
        in_channels=32,
        num_joints=14,
        tag_per_joint=True,
        extra=dict(final_conv_kernel=1, ),
        num_deconv_layers=1,
        num_deconv_filters=[32],
        num_deconv_kernels=[4],
        num_basic_blocks=4,
        cat_output=[True],
        with_ae_loss=[True, False]),
    train_cfg=dict(
        num_joints=channel_cfg['dataset_joints'],
        img_size=data_cfg['image_size']),
    test_cfg=dict(
        num_joints=channel_cfg['dataset_joints'],
        max_num_people=30,
        scale_factor=[1],
        with_heatmaps=[True, True],
        with_ae=[True, False],
        project2image=True,
        nms_kernel=5,
        nms_padding=2,
        tag_per_joint=True,
        detection_threshold=0.1,
        tag_threshold=1,
        use_detection_val=True,
        ignore_too_much=False,
        adjust=True,
        refine=True,
        flip_test=True),
    # One entry per stage (``num_stages=2``) in every per-stage list below.
    loss_pose=dict(
        type='MultiLossFactory',
        num_joints=14,
        num_stages=2,
        ae_loss_type='exp',
        with_ae_loss=[True, False],
        push_loss_factor=[0.001, 0.001],
        pull_loss_factor=[0.001, 0.001],
        with_heatmaps_loss=[True, True],
        heatmaps_loss_factor=[1.0, 1.0],
    ),
)

# ---------------------------------------------------------------------------
# Data pipelines
# ---------------------------------------------------------------------------
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='BottomUpRandomAffine',
        rot_factor=30,
        scale_factor=[0.75, 1.5],
        scale_type='short',
        trans_factor=40),
    dict(type='BottomUpRandomFlip', flip_prob=0.5),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='BottomUpGenerateTarget',
        sigma=2,
        max_num_people=30,
    ),
    dict(
        type='Collect',
        keys=['img', 'joints', 'targets', 'masks'],
        meta_keys=[]),
]

val_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='BottomUpGetImgSize', test_scale_factor=[1]),
    dict(
        type='BottomUpResizeAlign',
        transforms=[
            dict(type='ToTensor'),
            dict(
                type='NormalizeTensor',
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]),
        ]),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'image_file', 'aug_data', 'test_scale_factor', 'base_size',
            'center', 'scale', 'flip_index'
        ]),
]

# Testing reuses the very same pipeline object as validation.
test_pipeline = val_pipeline

# ---------------------------------------------------------------------------
# Datasets
# ---------------------------------------------------------------------------
data_root = 'data/aic'
data = dict(
    samples_per_gpu=4,  # 24
    workers_per_gpu=2,
    train=dict(
        type='BottomUpAicDataset',
        ann_file=f'{data_root}/annotations/aic_train.json',
        img_prefix=f'{data_root}/ai_challenger_keypoint_train_20170902/'
        'keypoint_train_images_20170902/',
        data_cfg=data_cfg,
        pipeline=train_pipeline),
    val=dict(
        type='BottomUpAicDataset',
        ann_file=f'{data_root}/annotations/aic_val.json',
        img_prefix=f'{data_root}/ai_challenger_keypoint_validation_20170911/'
        'keypoint_validation_images_20170911/',
        data_cfg=data_cfg,
        pipeline=val_pipeline),
    # The test split points at the validation annotations and images.
    test=dict(
        type='BottomUpAicDataset',
        ann_file=f'{data_root}/annotations/aic_val.json',
        img_prefix=f'{data_root}/ai_challenger_keypoint_validation_20170911/'
        'keypoint_validation_images_20170911/',
        data_cfg=data_cfg,
        pipeline=val_pipeline),
)
bottom_up_aic.py
import numpy as np
from xtcocotools.coco import COCO
from xtcocotools.cocoeval import COCOeval
import xtcocotools
from mmpose.datasets.builder import DATASETS
from .bottom_up_coco import BottomUpCocoDataset
@DATASETS.register_module()
class BottomUpAicDataset(BottomUpCocoDataset):
    """AI Challenger (AIC) keypoint dataset for bottom-up pose estimation.

    `AI Challenger : A Large-scale Dataset for Going Deeper
    in Image Understanding <https://arxiv.org/abs/1711.06475>`__

    The dataset loads raw features and applies the specified transforms
    to return a dict containing the image tensors and other information.

    AIC keypoint indexes::

        0: "right_shoulder",
        1: "right_elbow",
        2: "right_wrist",
        3: "left_shoulder",
        4: "left_elbow",
        5: "left_wrist",
        6: "right_hip",
        7: "right_knee",
        8: "right_ankle",
        9: "left_hip",
        10: "left_knee",
        11: "left_ankle",
        12: "head_top",
        13: "neck"

    Args:
        ann_file (str): Path to the annotation file.
        img_prefix (str): Path to a directory where images are held.
            Default: None.
        data_cfg (dict): config
        pipeline (list[dict | callable]): A sequence of data transforms.
        test_mode (bool): Store True when building test or
            validation dataset. Default: False.
    """

    def __init__(self,
                 ann_file,
                 img_prefix,
                 data_cfg,
                 pipeline,
                 test_mode=False):
        # super() is anchored at BottomUpCocoDataset, so that class's own
        # __init__ is skipped and the next __init__ in the MRO runs instead.
        super(BottomUpCocoDataset, self).__init__(
            ann_file, img_prefix, data_cfg, pipeline, test_mode=test_mode)

        # Left/right counterpart of every keypoint index; head_top (12)
        # and neck (13) map to themselves.
        self.ann_info['flip_index'] = [
            3, 4, 5, 0, 1, 2, 9, 10, 11, 6, 7, 8, 12, 13
        ]

        self.ann_info['use_different_joint_weights'] = False
        per_joint_weights = np.array(
            [1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.],
            dtype=np.float32)
        # Stored as a column vector with one row per joint.
        self.ann_info['joint_weights'] = per_joint_weights.reshape(
            (self.ann_info['num_joints'], 1))

        # Per-keypoint sigmas handed to COCOeval; note the values are
        # doubled here.
        # NOTE(review): whether COCOeval expects the doubled or the raw
        # values is not visible from this file -- confirm.
        base_sigmas = np.array([
            0.01388152, 0.01515228, 0.01057665, 0.01417709, 0.01497891,
            0.01402144, 0.03909642, 0.03686941, 0.01981803, 0.03843971,
            0.03412318, 0.02415081, 0.01291456, 0.01236173
        ])
        self.sigmas = base_sigmas * 2

        # Category bookkeeping: index 0 is the synthetic background class.
        self.coco = COCO(ann_file)
        cat_names = [
            entry['name']
            for entry in self.coco.loadCats(self.coco.getCatIds())
        ]
        self.classes = ['__background__'] + cat_names
        self.num_classes = len(self.classes)
        self._class_to_ind = dict(zip(self.classes, range(self.num_classes)))
        self._class_to_coco_ind = dict(zip(cat_names, self.coco.getCatIds()))
        self._coco_ind_to_class_ind = {
            self._class_to_coco_ind[name]: self._class_to_ind[name]
            for name in self.classes[1:]
        }

        # Outside test mode, images without any annotation are dropped.
        self.img_ids = self.coco.getImgIds()
        if not test_mode:
            self.img_ids = [
                image_id for image_id in self.img_ids
                if len(self.coco.getAnnIds(imgIds=image_id, iscrowd=None)) > 0
            ]
        self.num_images = len(self.img_ids)

        self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs)
        self.dataset_name = 'aic'

        print(f'=> num_images: {self.num_images}')

    def _do_python_keypoint_eval(self, res_file):
        """Run COCO-API keypoint evaluation on a result file.

        Args:
            res_file (str): Result file passed to ``self.coco.loadRes``.

        Returns:
            list[tuple]: ``(stat_name, value)`` pairs, pairing the names
            below with ``coco_eval.stats`` in order.
        """
        detections = self.coco.loadRes(res_file)
        coco_eval = COCOeval(
            self.coco, detections, 'keypoints', self.sigmas, use_area=False)
        coco_eval.params.useSegm = None

        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()

        stats_names = [
            'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5',
            'AR .75', 'AR (M)', 'AR (L)'
        ]
        return list(zip(stats_names, coco_eval.stats))
Motivation: Have a smaller model better suited to real-time inference on the InterHand 3D dataset. Configs: mobilenetv2_interhand3d_all_256x256.py Datasets: Interhand3D Details:
It would also be great to see input sizes smaller than 256x256, such as 96x96 or 128x128 (like MegaTrack) or 224x224 (like MediaPipe BlazeHand).
# ---------------------------------------------------------------------------
# Runtime settings
# ---------------------------------------------------------------------------
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=1)
evaluation = dict(
    interval=1,
    metric=['MRRPE', 'MPJPE', 'Handedness_acc'],
    key_indicator='MPJPE_all')

# ---------------------------------------------------------------------------
# Optimisation
# ---------------------------------------------------------------------------
optimizer = dict(type='Adam', lr=2e-4)
optimizer_config = dict(grad_clip=None)

# Step learning-rate policy with milestones at 15 and 17 of 20 epochs.
lr_config = dict(policy='step', step=[15, 17])
total_epochs = 20

log_config = dict(
    interval=20,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
    ])

# 42 keypoint channels, indexed 0..41, all used for training and inference.
channel_cfg = dict(
    num_output_channels=42,
    dataset_joints=42,
    dataset_channel=[list(range(42))],
    inference_channel=list(range(42)))

# ---------------------------------------------------------------------------
# Model: MobileNetV2 backbone + InterHand3D head
# ---------------------------------------------------------------------------
# All three sub-heads are fed 1280 input channels.
model = dict(
    type='Interhand3D',
    pretrained='mmcls://mobilenet_v2',
    backbone=dict(type='MobileNetV2', widen_factor=1., out_indices=(7, )),
    keypoint_head=dict(
        type='Interhand3DHead',
        keypoint_head_cfg=dict(
            in_channels=1280,
            out_channels=21 * 64,
            depth_size=64,
            num_deconv_layers=3,
            num_deconv_filters=(256, 256, 256),
            num_deconv_kernels=(4, 4, 4),
        ),
        root_head_cfg=dict(
            in_channels=1280,
            heatmap_size=64,
            hidden_dims=(512, ),
        ),
        hand_type_head_cfg=dict(
            in_channels=1280,
            num_labels=2,
            hidden_dims=(512, ),
        ),
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True),
        loss_root_depth=dict(type='L1Loss'),
        loss_hand_type=dict(type='BCELoss', use_target_weight=True),
    ),
    train_cfg=dict(),
    test_cfg=dict(flip_test=False))

# ---------------------------------------------------------------------------
# Data geometry
# ---------------------------------------------------------------------------
data_cfg = dict(
    image_size=[256, 256],
    heatmap_size=[64, 64, 64],
    heatmap3d_depth_bound=400.0,
    heatmap_size_root=64,
    root_depth_bound=400.0,
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'])

# ---------------------------------------------------------------------------
# Data pipelines
# ---------------------------------------------------------------------------
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='HandRandomFlip', flip_prob=0.5),
    dict(type='TopDownRandomTranslation', trans_factor=0.15),
    dict(
        type='TopDownGetRandomScaleRotation',
        rot_factor=45,
        scale_factor=0.25,
        rot_prob=0.6),
    # dict(type='MeshRandomChannelNoise', noise_factor=0.2),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    # Three sub-pipelines are listed and all three indices are selected.
    dict(
        type='MultitaskGatherTarget',
        pipeline_list=[
            [
                dict(
                    type='Generate3DHeatmapTarget',
                    sigma=2.5,
                    max_bound=255,
                )
            ],
            [dict(type='HandGenerateRelDepthTarget')],
            [
                dict(
                    type='RenameKeys',
                    key_pairs=[('hand_type', 'target'),
                               ('hand_type_valid', 'target_weight')])
            ],
        ],
        pipeline_indices=[0, 1, 2],
    ),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'flip_pairs'
        ]),
]

val_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownAffine'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'image_file', 'center', 'scale', 'rotation', 'flip_pairs',
            'heatmap3d_depth_bound', 'root_depth_bound'
        ]),
]

# Testing reuses the very same pipeline object as validation.
test_pipeline = val_pipeline

# ---------------------------------------------------------------------------
# Datasets
# ---------------------------------------------------------------------------
# Train and test read from ``annotations/all``; validation reads from
# ``annotations/machine_annot``.
data_root = 'data/interhand2.6m'
data = dict(
    samples_per_gpu=16,
    workers_per_gpu=2,
    train=dict(
        type='InterHand3DDataset',
        ann_file=f'{data_root}/annotations/all/'
        'InterHand2.6M_train_data.json',
        camera_file=f'{data_root}/annotations/all/'
        'InterHand2.6M_train_camera.json',
        joint_file=f'{data_root}/annotations/all/'
        'InterHand2.6M_train_joint_3d.json',
        img_prefix=f'{data_root}/images/train/',
        data_cfg=data_cfg,
        use_gt_root_depth=True,
        rootnet_result_file=None,
        pipeline=train_pipeline),
    val=dict(
        type='InterHand3DDataset',
        ann_file=f'{data_root}/annotations/machine_annot/'
        'InterHand2.6M_val_data.json',
        camera_file=f'{data_root}/annotations/machine_annot/'
        'InterHand2.6M_val_camera.json',
        joint_file=f'{data_root}/annotations/machine_annot/'
        'InterHand2.6M_val_joint_3d.json',
        img_prefix=f'{data_root}/images/val/',
        data_cfg=data_cfg,
        use_gt_root_depth=True,
        rootnet_result_file=None,
        pipeline=val_pipeline),
    test=dict(
        type='InterHand3DDataset',
        ann_file=f'{data_root}/annotations/all/'
        'InterHand2.6M_test_data.json',
        camera_file=f'{data_root}/annotations/all/'
        'InterHand2.6M_test_camera.json',
        joint_file=f'{data_root}/annotations/all/'
        'InterHand2.6M_test_joint_3d.json',
        img_prefix=f'{data_root}/images/test/',
        data_cfg=data_cfg,
        use_gt_root_depth=True,
        rootnet_result_file=None,
        pipeline=val_pipeline),
)
Could a Body25 keypoint model be supported? It could be built by combining COCO and MPII, so that users can use the foot keypoints. COCO's 16-keypoint model no longer meets these requirements.
Thanks! BTW, we have already provided 133-kpt COCO-Wholebody models. You can run this model to obtain foot keypoints.
@jin-s13 For some real-time situations, 133 keypoints is too many (and probably not optimal for certain points). AlphaPose provides Body25 with the HALPE dataset, as does OpenPose. If MMPose could offer such an option, that would be great for users who want a simple yet useful human pose model.
+1
From time to time, there are requests for more checkpoints. Considering that enriching the model zoo is a good thing, we collect requests for checkpoints here. Please post the detailed configs, settings, backgrounds, motivations etc. below, and others may thumbs-up 👍 the request items. We will periodically assess them and train & release them if the needs are high.
Happy Research!