open-mmlab / mmdetection

OpenMMLab Detection Toolbox and Benchmark
https://mmdetection.readthedocs.io
Apache License 2.0

Anchor box configuration #3736

Closed PawelFaron closed 4 years ago

PawelFaron commented 4 years ago

I have my own dataset in COCO format and tried to train a model on it. I'm using Res2Net: https://github.com/open-mmlab/mmdetection/blob/master/configs/res2net/htc_r2_101_fpn_20e_coco.py. I have 4 GPUs with 4 images per GPU.
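For context, the overrides for a custom COCO-format dataset presumably looked something like the sketch below (the config name, paths, and class names are placeholders, not taken from the issue):

# my_htc_r2_101.py -- hypothetical config name, extending the Res2Net HTC config
_base_ = './htc_r2_101_fpn_20e_coco.py'

classes = ('class_a', 'class_b')  # placeholder class names
data = dict(
    samples_per_gpu=4,  # 4 images per GPU, as described above
    workers_per_gpu=2,
    train=dict(
        classes=classes,
        ann_file='data/my_dataset/annotations/train.json',  # placeholder path
        img_prefix='data/my_dataset/train/'),
    val=dict(
        classes=classes,
        ann_file='data/my_dataset/annotations/val.json',
        img_prefix='data/my_dataset/val/'),
    test=dict(
        classes=classes,
        ann_file='data/my_dataset/annotations/val.json',
        img_prefix='data/my_dataset/val/'))
# (num_classes in HTC's bbox/mask heads would also need overriding; omitted here.)

Running with the default anchor box configuration, the progress looks like the following (first epoch):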

{"mode": "train", "epoch": 1, "iter": 50, "lr": 0.00198, "memory": 8410, "data_time": 0.26623, "loss_rpn_cls": 0.06792, "loss_rpn_bbox": 0.01594, "s0.loss_cls": 0.22635, "s0.acc": 92.29858, "s0.loss_bbox": 0.0439, "s0.loss_mask": 1.46783, "s1.loss_cls": 0.10369, "s1.acc": 95.17383, "s1.loss_bbox": 0.01499, "s1.loss_mask": 0.77722, "s2.loss_cls": 0.06181, "s2.acc": 89.31689, "s2.loss_bbox": 0.00287, "s2.loss_mask": 0.62522, "loss": 3.40774, "time": 0.96131}
{"mode": "train", "epoch": 1, "iter": 100, "lr": 0.00398, "memory": 8410, "data_time": 0.02473, "loss_rpn_cls": 0.04785, "loss_rpn_bbox": 0.01569, "s0.loss_cls": 0.08799, "s0.acc": 97.92725, "s0.loss_bbox": 0.04771, "s0.loss_mask": 0.6216, "s1.loss_cls": 0.03017, "s1.acc": 99.0083, "s1.loss_bbox": 0.01585, "s1.loss_mask": 0.31075, "s2.loss_cls": 0.01117, "s2.acc": 99.49219, "s2.loss_bbox": 0.00251, "s2.loss_mask": 0.1535, "loss": 1.34478, "time": 0.70048}
{"mode": "train", "epoch": 1, "iter": 200, "lr": 0.00797, "memory": 8592, "data_time": 0.02552, "loss_rpn_cls": 0.03526, "loss_rpn_bbox": 0.01392, "s0.loss_cls": 0.09784, "s0.acc": 97.02905, "s0.loss_bbox": 0.05718, "s0.loss_mask": 0.54236, "s1.loss_cls": 0.04148, "s1.acc": 97.84448, "s1.loss_bbox": 0.03451, "s1.loss_mask": 0.25958, "s2.loss_cls": 0.01616, "s2.acc": 98.69604, "s2.loss_bbox": 0.00987, "s2.loss_mask": 0.1304, "loss": 1.23858, "time": 0.7133}
{"mode": "val", "epoch": 1, "iter": 436, "lr": 0.0174, "bbox_mAP": 0.091, "bbox_mAP_50": 0.225, "bbox_mAP_75": 0.06, "bbox_mAP_s": 0.024, "bbox_mAP_m": 0.047, "bbox_mAP_l": 0.116, "bbox_mAP_copypaste": "0.091 0.225 0.060 0.024 0.047 0.116", "segm_mAP": 0.042, "segm_mAP_50": 0.141, "segm_mAP_75": 0.014, "segm_mAP_s": 0.007, "segm_mAP_m": 0.033, "segm_mAP_l": 0.053, "segm_mAP_copypaste": "0.042 0.141 0.014 0.007 0.033 0.053"}

Subsequent epochs look like this (last training iteration and the validation result):

{"mode": "train", "epoch": 2, "iter": 400, "lr": 0.02, "memory": 8932, "data_time": 0.02542, "loss_rpn_cls": 0.02285, "loss_rpn_bbox": 0.01138, "s0.loss_cls": 0.09057, "s0.acc": 96.91748, "s0.loss_bbox": 0.05284, "s0.loss_mask": 0.44854, "s1.loss_cls": 0.04344, "s1.acc": 97.00684, "s1.loss_bbox": 0.04615, "s1.loss_mask": 0.21286, "s2.loss_cls": 0.01833, "s2.acc": 97.55786, "s2.loss_bbox": 0.01879, "s2.loss_mask": 0.10257, "loss": 1.06833, "time": 0.72228}
{"mode": "val", "epoch": 2, "iter": 436, "lr": 0.02, "bbox_mAP": 0.134, "bbox_mAP_50": 0.3, "bbox_mAP_75": 0.105, "bbox_mAP_s": 0.039, "bbox_mAP_m": 0.052, "bbox_mAP_l": 0.167, "bbox_mAP_copypaste": "0.134 0.300 0.105 0.039 0.052 0.167", "segm_mAP": 0.074, "segm_mAP_50": 0.198, "segm_mAP_75": 0.043, "segm_mAP_s": 0.013, "segm_mAP_m": 0.035, "segm_mAP_l": 0.092, "segm_mAP_copypaste": "0.074 0.198 0.043 0.013 0.035 0.092"}

{"mode": "train", "epoch": 3, "iter": 400, "lr": 0.02, "memory": 8967, "data_time": 0.02623, "loss_rpn_cls": 0.02226, "loss_rpn_bbox": 0.01061, "s0.loss_cls": 0.08915, "s0.acc": 96.99316, "s0.loss_bbox": 0.05066, "s0.loss_mask": 0.44269, "s1.loss_cls": 0.04355, "s1.acc": 96.95215, "s1.loss_bbox": 0.04651, "s1.loss_mask": 0.20904, "s2.loss_cls": 0.01887, "s2.acc": 97.39014, "s2.loss_bbox": 0.02064, "s2.loss_mask": 0.0992, "loss": 1.05319, "time": 0.72466}
{"mode": "val", "epoch": 3, "iter": 436, "lr": 0.02, "bbox_mAP": 0.136, "bbox_mAP_50": 0.296, "bbox_mAP_75": 0.108, "bbox_mAP_s": 0.035, "bbox_mAP_m": 0.058, "bbox_mAP_l": 0.172, "bbox_mAP_copypaste": "0.136 0.296 0.108 0.035 0.058 0.172", "segm_mAP": 0.073, "segm_mAP_50": 0.202, "segm_mAP_75": 0.042, "segm_mAP_s": 0.019, "segm_mAP_m": 0.039, "segm_mAP_l": 0.091, "segm_mAP_copypaste": "0.073 0.202 0.042 0.019 0.039 0.091"}

Then I ran my dataset through this tool to calculate an optimal anchor box configuration: https://github.com/martinzlocha/anchor-optimization. It gave me these results:

Final best anchor configuration
State: 0.17333
Ratios: [0.25, 0.546, 1.0, 1.832, 4.0]
Scales: [0.41, 0.651, 0.954, 1.15, 1.518]
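
Plugged into MMDetection's AnchorGenerator, that output corresponds roughly to the following override (a sketch assuming the stock HTC RPN head; only the ratios and scales change):

# Sketch: RPN anchors replaced with the tool's ratios/scales
# (MMDetection v2.x field names; other fields keep the HTC defaults)
model = dict(
    rpn_head=dict(
        anchor_generator=dict(
            type='AnchorGenerator',
            ratios=[0.25, 0.546, 1.0, 1.832, 4.0],
            scales=[0.41, 0.651, 0.954, 1.15, 1.518],  # the HTC default is scales=[8]
            strides=[4, 8, 16, 32, 64])))
# Caveat: AnchorGenerator multiplies each scale by the level's stride, so
# sub-1.0 scales produce anchors smaller than a single feature-map cell.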

So I put this into the configuration, and this was the only change. The results were a bit surprising: the loss was dropping very quickly, but the validation performance was very poor.

{"mode": "train", "epoch": 1, "iter": 50, "lr": 0.00198, "memory": 9620, "data_time": 0.26598, "loss_rpn_cls": 0.41754, "loss_rpn_bbox": 0.00218, "s0.loss_cls": 0.15733, "s0.acc": 97.07788, "s0.loss_bbox": 0.00079, "s0.loss_mask": 0.85893, "s1.loss_cls": 0.10082, "s1.acc": 93.70435, "s1.loss_bbox": 0.00051, "s1.loss_mask": 0.81001, "s2.loss_cls": 0.06517, "s2.acc": 87.38965, "s2.loss_bbox": 0.0002, "s2.loss_mask": 0.34433, "loss": 2.75782, "time": 0.94976}
{"mode": "train", "epoch": 1, "iter": 100, "lr": 0.00398, "memory": 9620, "data_time": 0.02615, "loss_rpn_cls": 0.02018, "loss_rpn_bbox": 0.00237, "s0.loss_cls": 0.02943, "s0.acc": 99.67944, "s0.loss_bbox": 0.00057, "s0.loss_mask": 0.50567, "s1.loss_cls": 0.02075, "s1.acc": 99.68555, "s1.loss_bbox": 0.00024, "s1.loss_mask": 0.2546, "s2.loss_cls": 0.01459, "s2.acc": 99.68994, "s2.loss_bbox": 3e-05, "s2.loss_mask": 0.13277, "loss": 0.9812, "time": 0.68861}
{"mode": "train", "epoch": 1, "iter": 400, "lr": 0.01596, "memory": 9620, "data_time": 0.02512, "loss_rpn_cls": 0.00792, "loss_rpn_bbox": 0.00192, "s0.loss_cls": 0.01148, "s0.acc": 99.73364, "s0.loss_bbox": 0.00276, "s0.loss_mask": 0.46378, "s1.loss_cls": 0.00659, "s1.acc": 99.60767, "s1.loss_bbox": 0.00177, "s1.loss_mask": 0.22066, "s2.loss_cls": 0.00366, "s2.acc": 99.64478, "s2.loss_bbox": 0.00063, "s2.loss_mask": 0.11087, "loss": 0.83204, "time": 0.69931}
{"mode": "val", "epoch": 1, "iter": 431, "lr": 0.0172, "bbox_mAP": 0.011, "bbox_mAP_50": 0.027, "bbox_mAP_75": 0.011, "bbox_mAP_s": 0.0, "bbox_mAP_m": 0.022, "bbox_mAP_l": 0.011, "bbox_mAP_copypaste": "0.011 0.027 0.011 0.000 0.022 0.011", "segm_mAP": 0.009, "segm_mAP_50": 0.026, "segm_mAP_75": 0.001, "segm_mAP_s": 0.002, "segm_mAP_m": 0.017, "segm_mAP_l": 0.008, "segm_mAP_copypaste": "0.009 0.026 0.001 0.002 0.017 0.008"}

{"mode": "train", "epoch": 2, "iter": 50, "lr": 0.0192, "memory": 9620, "data_time": 0.25106, "loss_rpn_cls": 0.00864, "loss_rpn_bbox": 0.00185, "s0.loss_cls": 0.0127, "s0.acc": 99.72412, "s0.loss_bbox": 0.00291, "s0.loss_mask": 0.46616, "s1.loss_cls": 0.00717, "s1.acc": 99.58911, "s1.loss_bbox": 0.00202, "s1.loss_mask": 0.22399, "s2.loss_cls": 0.0036, "s2.acc": 99.62866, "s2.loss_bbox": 0.00072, "s2.loss_mask": 0.11312, "loss": 0.84288, "time": 0.9406}
{"mode": "train", "epoch": 2, "iter": 400, "lr": 0.02, "memory": 9620, "data_time": 0.02526, "loss_rpn_cls": 0.00655, "loss_rpn_bbox": 0.00189, "s0.loss_cls": 0.01515, "s0.acc": 99.64502, "s0.loss_bbox": 0.00449, "s0.loss_mask": 0.45183, "s1.loss_cls": 0.0081, "s1.acc": 99.65137, "s1.loss_bbox": 0.00342, "s1.loss_mask": 0.21701, "s2.loss_cls": 0.00376, "s2.acc": 99.66235, "s2.loss_bbox": 0.00127, "s2.loss_mask": 0.10733, "loss": 0.82082, "time": 0.7335}
{"mode": "val", "epoch": 2, "iter": 431, "lr": 0.02, "bbox_mAP": 0.018, "bbox_mAP_50": 0.044, "bbox_mAP_75": 0.014, "bbox_mAP_s": 0.016, "bbox_mAP_m": 0.041, "bbox_mAP_l": 0.017, "bbox_mAP_copypaste": "0.018 0.044 0.014 0.016 0.041 0.017", "segm_mAP": 0.014, "segm_mAP_50": 0.042, "segm_mAP_75": 0.002, "segm_mAP_s": 0.019, "segm_mAP_m": 0.03, "segm_mAP_l": 0.013, "segm_mAP_copypaste": "0.014 0.042 0.002 0.019 0.030 0.013"}

Do you have any idea what the reason for this could be?

ZwwWayne commented 4 years ago

When the anchor hyper-parameters are changed, e.g., the scales, the ratios, and their numbers, some other hyper-parameters should also be tuned: for example, the IoU threshold in the assigner, the sample numbers, or perhaps the pos/neg ratio. Only the users know the exact and complete situation they are facing, so they should tune these hyper-parameters on their own; we are afraid we cannot be of much help here.
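
For reference, those knobs live under train_cfg in the config. A sketch showing the usual RPN defaults (HTC's three R-CNN stages carry their own assigner/sampler dicts, with pos_iou_thr stepping through 0.5/0.6/0.7):

# Sketch of the tunable assignment/sampling hyper-parameters mentioned above
# (values shown are the stock RPN defaults, not recommendations)
train_cfg = dict(
    rpn=dict(
        assigner=dict(
            type='MaxIoUAssigner',
            pos_iou_thr=0.7,   # the IoU threshold in the assigner
            neg_iou_thr=0.3,
            min_pos_iou=0.3,
            ignore_iof_thr=-1),
        sampler=dict(
            type='RandomSampler',
            num=256,           # the sample number
            pos_fraction=0.5,  # the pos/neg ratio
            neg_pos_ub=-1,
            add_gt_as_proposals=False)))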