Open yuzhobotianzhiyu opened 3 days ago
Could you paste what error you get when training crashes?
Also, the fact that the loss reaches 0 (over a few steps rather than in a single step) instead of becoming NaN suggests that this may be an issue with the codebase. Is a loss of 0 possible for your task?
{"mode": "train", "epoch": 1, "iter": 11900, "lr": 0.00057, "memory": 58903, "data_time": 0.2401, "frame_0_loss_cls": 0.0, "frame_0_loss_bbox": 0.0, "frame_0_loss_reprojection": 0.0, "frame_0_loss_iou3d": 0.0, "frame_0_d0.loss_cls": 0.0, "frame_0_d0.loss_bbox": 0.0, "frame_0_d0.loss_reprojection": 0.0, "frame_0_d0.loss_iou3d": 0.0, "frame_0_d1.loss_cls": 0.0, "frame_0_d1.loss_bbox": 0.0, "frame_0_d1.loss_reprojection": 0.0, "frame_0_d1.loss_iou3d": 0.0, "frame_0_d2.loss_cls": 0.0, "frame_0_d2.loss_bbox": 0.0, "frame_0_d2.loss_reprojection": 0.0, "frame_0_d2.loss_iou3d": 0.0, "frame_0_loss_map_cls": 0.08528, "frame_0_loss_map_pts": 0.57237, "frame_0_loss_map_dir": 0.09235, "frame_0_loss_map_geo2": 0.0, "frame_0_d0.loss_map_cls": 0.12016, "frame_0_d0.loss_map_pts": 0.76768, "frame_0_d0.loss_map_dir": 0.19397, "frame_0_d0.map_losses_geo2": 0.0, "frame_0_d1.loss_map_cls": 0.09383, "frame_0_d1.loss_map_pts": 0.64106, "frame_0_d1.loss_map_dir": 0.13004, "frame_0_d1.map_losses_geo2": 0.0, "frame_0_d2.loss_map_cls": 0.08647, "frame_0_d2.loss_map_pts": 0.58496, "frame_0_d2.loss_map_dir": 0.10149, "frame_0_d2.map_losses_geo2": 0.0, "loss": 3.46966, "grad_norm": 17.46814, "time": 2.89447} {"mode": "train", "epoch": 1, "iter": 11950, "lr": 0.00057, "memory": 58903, "data_time": 0.23606, "frame_0_loss_cls": 0.0, "frame_0_loss_bbox": 0.0, "frame_0_loss_reprojection": 0.0, "frame_0_loss_iou3d": 0.0, "frame_0_d0.loss_cls": 0.0, "frame_0_d0.loss_bbox": 0.0, "frame_0_d0.loss_reprojection": 0.0, "frame_0_d0.loss_iou3d": 0.0, "frame_0_d1.loss_cls": 0.0, "frame_0_d1.loss_bbox": 0.0, "frame_0_d1.loss_reprojection": 0.0, "frame_0_d1.loss_iou3d": 0.0, "frame_0_d2.loss_cls": 0.0, "frame_0_d2.loss_bbox": 0.0, "frame_0_d2.loss_reprojection": 0.0, "frame_0_d2.loss_iou3d": 0.0, "frame_0_loss_map_cls": 0.04241, "frame_0_loss_map_pts": 0.25875, "frame_0_loss_map_dir": 0.04166, "frame_0_loss_map_geo2": 0.0, "frame_0_d0.loss_map_cls": 0.05639, "frame_0_d0.loss_map_pts": 0.3376, 
"frame_0_d0.loss_map_dir": 0.08466, "frame_0_d0.map_losses_geo2": 0.0, "frame_0_d1.loss_map_cls": 0.04617, "frame_0_d1.loss_map_pts": 0.28839, "frame_0_d1.loss_map_dir": 0.05794, "frame_0_d1.map_losses_geo2": 0.0, "frame_0_d2.loss_map_cls": 0.04303, "frame_0_d2.loss_map_pts": 0.26442, "frame_0_d2.loss_map_dir": 0.04538, "frame_0_d2.map_losses_geo2": 0.0, "loss": 1.56681, "grad_norm": NaN, "time": 2.38594} {"mode": "train", "epoch": 1, "iter": 12000, "lr": 0.00057, "memory": 58903, "data_time": 0.27183, "frame_0_loss_cls": 0.0, "frame_0_loss_bbox": 0.0, "frame_0_loss_reprojection": 0.0, "frame_0_loss_iou3d": 0.0, "frame_0_d0.loss_cls": 0.0, "frame_0_d0.loss_bbox": 0.0, "frame_0_d0.loss_reprojection": 0.0, "frame_0_d0.loss_iou3d": 0.0, "frame_0_d1.loss_cls": 0.0, "frame_0_d1.loss_bbox": 0.0, "frame_0_d1.loss_reprojection": 0.0, "frame_0_d1.loss_iou3d": 0.0, "frame_0_d2.loss_cls": 0.0, "frame_0_d2.loss_bbox": 0.0, "frame_0_d2.loss_reprojection": 0.0, "frame_0_d2.loss_iou3d": 0.0, "frame_0_loss_map_cls": 0.01042, "frame_0_loss_map_pts": 0.04873, "frame_0_loss_map_dir": 0.00605, "frame_0_loss_map_geo2": 0.0, "frame_0_d0.loss_map_cls": 0.01104, "frame_0_d0.loss_map_pts": 0.05637, "frame_0_d0.loss_map_dir": 0.01167, "frame_0_d0.map_losses_geo2": 0.0, "frame_0_d1.loss_map_cls": 0.01046, "frame_0_d1.loss_map_pts": 0.05167, "frame_0_d1.loss_map_dir": 0.0082, "frame_0_d1.map_losses_geo2": 0.0, "frame_0_d2.loss_map_cls": 0.01035, "frame_0_d2.loss_map_pts": 0.04938, "frame_0_d2.loss_map_dir": 0.0066, "frame_0_d2.map_losses_geo2": 0.0, "loss": 0.28095, "grad_norm": NaN, "time": 2.08962} {"mode": "train", "epoch": 1, "iter": 12050, "lr": 0.00057, "memory": 58903, "data_time": 0.25106, "frame_0_loss_cls": 0.0, "frame_0_loss_bbox": 0.0, "frame_0_loss_reprojection": 0.0, "frame_0_loss_iou3d": 0.0, "frame_0_d0.loss_cls": 0.0, "frame_0_d0.loss_bbox": 0.0, "frame_0_d0.loss_reprojection": 0.0, "frame_0_d0.loss_iou3d": 0.0, "frame_0_d1.loss_cls": 0.0, "frame_0_d1.loss_bbox": 0.0, 
"frame_0_d1.loss_reprojection": 0.0, "frame_0_d1.loss_iou3d": 0.0, "frame_0_d2.loss_cls": 0.0, "frame_0_d2.loss_bbox": 0.0, "frame_0_d2.loss_reprojection": 0.0, "frame_0_d2.loss_iou3d": 0.0, "frame_0_loss_map_cls": 0.00424, "frame_0_loss_map_pts": 0.01957, "frame_0_loss_map_dir": 0.00227, "frame_0_loss_map_geo2": 0.0, "frame_0_d0.loss_map_cls": 0.00439, "frame_0_d0.loss_map_pts": 0.02203, "frame_0_d0.loss_map_dir": 0.00418, "frame_0_d0.map_losses_geo2": 0.0, "frame_0_d1.loss_map_cls": 0.00431, "frame_0_d1.loss_map_pts": 0.02037, "frame_0_d1.loss_map_dir": 0.00301, "frame_0_d1.map_losses_geo2": 0.0, "frame_0_d2.loss_map_cls": 0.00433, "frame_0_d2.loss_map_pts": 0.01963, "frame_0_d2.loss_map_dir": 0.00244, "frame_0_d2.map_losses_geo2": 0.0, "loss": 0.11077, "grad_norm": NaN, "time": 2.01282} {"mode": "train", "epoch": 1, "iter": 12100, "lr": 0.00057, "memory": 58903, "data_time": 0.22773, "frame_0_loss_cls": 0.0, "frame_0_loss_bbox": 0.0, "frame_0_loss_reprojection": 0.0, "frame_0_loss_iou3d": 0.0, "frame_0_d0.loss_cls": 0.0, "frame_0_d0.loss_bbox": 0.0, "frame_0_d0.loss_reprojection": 0.0, "frame_0_d0.loss_iou3d": 0.0, "frame_0_d1.loss_cls": 0.0, "frame_0_d1.loss_bbox": 0.0, "frame_0_d1.loss_reprojection": 0.0, "frame_0_d1.loss_iou3d": 0.0, "frame_0_d2.loss_cls": 0.0, "frame_0_d2.loss_bbox": 0.0, "frame_0_d2.loss_reprojection": 0.0, "frame_0_d2.loss_iou3d": 0.0, "frame_0_loss_map_cls": 0.0088, "frame_0_loss_map_pts": 0.03994, "frame_0_loss_map_dir": 0.00459, "frame_0_loss_map_geo2": 0.0, "frame_0_d0.loss_map_cls": 0.00911, "frame_0_d0.loss_map_pts": 0.04477, "frame_0_d0.loss_map_dir": 0.00829, "frame_0_d0.map_losses_geo2": 0.0, "frame_0_d1.loss_map_cls": 0.00905, "frame_0_d1.loss_map_pts": 0.04148, "frame_0_d1.loss_map_dir": 0.00606, "frame_0_d1.map_losses_geo2": 0.0, "frame_0_d2.loss_map_cls": 0.00879, "frame_0_d2.loss_map_pts": 0.04035, "frame_0_d2.loss_map_dir": 0.00489, "frame_0_d2.map_losses_geo2": 0.0, "loss": 0.22612, "grad_norm": NaN, "time": 2.0291} 
{"mode": "train", "epoch": 1, "iter": 12150, "lr": 0.00057, "memory": 58903, "data_time": 0.25477, "frame_0_loss_cls": 0.0, "frame_0_loss_bbox": 0.0, "frame_0_loss_reprojection": 0.0, "frame_0_loss_iou3d": 0.0, "frame_0_d0.loss_cls": 0.0, "frame_0_d0.loss_bbox": 0.0, "frame_0_d0.loss_reprojection": 0.0, "frame_0_d0.loss_iou3d": 0.0, "frame_0_d1.loss_cls": 0.0, "frame_0_d1.loss_bbox": 0.0, "frame_0_d1.loss_reprojection": 0.0, "frame_0_d1.loss_iou3d": 0.0, "frame_0_d2.loss_cls": 0.0, "frame_0_d2.loss_bbox": 0.0, "frame_0_d2.loss_reprojection": 0.0, "frame_0_d2.loss_iou3d": 0.0, "frame_0_loss_map_cls": 0.0, "frame_0_loss_map_pts": 0.0, "frame_0_loss_map_dir": 0.0, "frame_0_loss_map_geo2": 0.0, "frame_0_d0.loss_map_cls": 0.0, "frame_0_d0.loss_map_pts": 0.0, "frame_0_d0.loss_map_dir": 0.0, "frame_0_d0.map_losses_geo2": 0.0, "frame_0_d1.loss_map_cls": 0.0, "frame_0_d1.loss_map_pts": 0.0, "frame_0_d1.loss_map_dir": 0.0, "frame_0_d1.map_losses_geo2": 0.0, "frame_0_d2.loss_map_cls": 0.0, "frame_0_d2.loss_map_pts": 0.0, "frame_0_d2.loss_map_dir": 0.0, "frame_0_d2.map_losses_geo2": 0.0, "loss": 0.0, "grad_norm": NaN, "time": 2.01777} {"mode": "train", "epoch": 1, "iter": 12200, "lr": 0.00057, "memory": 58903, "data_time": 0.22977, "frame_0_loss_cls": 0.0, "frame_0_loss_bbox": 0.0, "frame_0_loss_reprojection": 0.0, "frame_0_loss_iou3d": 0.0, "frame_0_d0.loss_cls": 0.0, "frame_0_d0.loss_bbox": 0.0, "frame_0_d0.loss_reprojection": 0.0, "frame_0_d0.loss_iou3d": 0.0, "frame_0_d1.loss_cls": 0.0, "frame_0_d1.loss_bbox": 0.0, "frame_0_d1.loss_reprojection": 0.0, "frame_0_d1.loss_iou3d": 0.0, "frame_0_d2.loss_cls": 0.0, "frame_0_d2.loss_bbox": 0.0, "frame_0_d2.loss_reprojection": 0.0, "frame_0_d2.loss_iou3d": 0.0, "frame_0_loss_map_cls": 0.0, "frame_0_loss_map_pts": 0.0, "frame_0_loss_map_dir": 0.0, "frame_0_loss_map_geo2": 0.0, "frame_0_d0.loss_map_cls": 0.0, "frame_0_d0.loss_map_pts": 0.0, "frame_0_d0.loss_map_dir": 0.0, "frame_0_d0.map_losses_geo2": 0.0, "frame_0_d1.loss_map_cls": 
0.0, "frame_0_d1.loss_map_pts": 0.0, "frame_0_d1.loss_map_dir": 0.0, "frame_0_d1.map_losses_geo2": 0.0, "frame_0_d2.loss_map_cls": 0.0, "frame_0_d2.loss_map_pts": 0.0, "frame_0_d2.loss_map_dir": 0.0, "frame_0_d2.map_losses_geo2": 0.0, "loss": 0.0, "grad_norm": NaN, "time": 2.01131}
The above is my training log. The model gradually collapses during training until the loss reaches 0. My dataset is for an autonomous-driving detection task, and a normally converging run cannot reach a loss of 0. Currently, on my task, training with SOAP runs without collapsing only when the learning rate is below 3e-4. I am not sure whether the problem could be caused by float16 being used for training.
First of all, thanks for your work. I ran the following experiment on my computer vision task: I used SOAP in place of the AdamW and LAMB optimizers, and with a high learning rate the convergence speed improved significantly. However, training then fails, and the loss gradually drops to 0.