同时训练最后会报错如下:
[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 365/365, 13.9 task/s, elapsed: 26s, ETA: 0s
Traceback (most recent call last):
File "/home/haida_huanglei/mjt/Co-DETR-main/tools/train.py", line 245, in <module>
main()
File "/home/haida_huanglei/mjt/Co-DETR-main/tools/train.py", line 234, in main
train_detector(
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmdet/apis/train.py", line 245, in train_detector
runner.run(data_loaders, cfg.workflow)
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmcv/runner/epoch_based_runner.py", line 127, in run
epoch_runner(data_loaders[i], **kwargs)
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmcv/runner/epoch_based_runner.py", line 54, in train
self.call_hook('after_train_epoch')
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmcv/runner/base_runner.py", line 309, in call_hook
getattr(hook, fn_name)(self)
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmcv/runner/hooks/evaluation.py", line 267, in after_train_epoch
self._do_evaluate(runner)
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmdet/core/evaluation/eval_hooks.py", line 63, in _do_evaluate
key_score = self.evaluate(runner, results)
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmcv/runner/hooks/evaluation.py", line 363, in evaluate
eval_res = self.dataloader.dataset.evaluate(
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmdet/datasets/coco.py", line 641, in evaluate
result_files, tmp_dir = self.format_results(results, jsonfile_prefix)
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmdet/datasets/coco.py", line 383, in format_results
result_files = self.results2json(results, jsonfile_prefix)
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmdet/datasets/coco.py", line 315, in results2json
json_results = self._det2json(results)
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmdet/datasets/coco.py", line 252, in _det2json
data['category_id'] = self.cat_ids[label]
IndexError: list index out of range
尊敬的作者大大,我之前用 8 卡 P40 训练都没有问题,loss 下降正常。但是换成单块 A800 之后,使用同样的工程代码、同样的 docker 镜像,训练时所有 loss 都变成了 0,日志如下:
2023-08-08 21:53:37,574 - mmdet - INFO - Epoch [1][800/818] lr: 2.000e-04, eta: 1:15:35, time: 0.496, data_time: 0.002, memory: 8800, enc_loss_cls: 0.0000, enc_loss_bbox: 0.0000, enc_loss_iou: 0.0000, loss_cls: 0.0000, loss_bbox: 0.0000, loss_iou: 0.0000, d0.loss_cls: 0.0000, d0.loss_bbox: 0.0000, d0.loss_iou: 0.0000, d1.loss_cls: 0.0000, d1.loss_bbox: 0.0000, d1.loss_iou: 0.0000, d2.loss_cls: 0.0000, d2.loss_bbox: 0.0000, d2.loss_iou: 0.0000, d3.loss_cls: 0.0000, d3.loss_bbox: 0.0000, d3.loss_iou: 0.0000, d4.loss_cls: 0.0000, d4.loss_bbox: 0.0000, d4.loss_iou: 0.0000, loss_rpn_cls: 0.0000, loss_rpn_bbox: 0.0000, loss_cls0: 0.0000, acc0: 100.0000, loss_bbox0: 0.0000, loss_cls1: 0.0000, loss_bbox1: 0.0000, loss_centerness1: 0.0000, loss_cls_aux0: 0.0000, loss_bbox_aux0: 0.0000, loss_iou_aux0: 0.0000, d0.loss_cls_aux0: 0.0000, d0.loss_bbox_aux0: 0.0000, d0.loss_iou_aux0: 0.0000, d1.loss_cls_aux0: 0.0000, d1.loss_bbox_aux0: 0.0000, d1.loss_iou_aux0: 0.0000, d2.loss_cls_aux0: 0.0000, d2.loss_bbox_aux0: 0.0000, d2.loss_iou_aux0: 0.0000, d3.loss_cls_aux0: 0.0000, d3.loss_bbox_aux0: 0.0000, d3.loss_iou_aux0: 0.0000, d4.loss_cls_aux0: 0.0000, d4.loss_bbox_aux0: 0.0000, d4.loss_iou_aux0: 0.0000, loss_cls_aux1: 0.0000, loss_bbox_aux1: 0.0000, loss_iou_aux1: 0.0000, d0.loss_cls_aux1: 0.0000, d0.loss_bbox_aux1: 0.0000, d0.loss_iou_aux1: 0.0000, d1.loss_cls_aux1: 0.0000, d1.loss_bbox_aux1: 0.0000, d1.loss_iou_aux1: 0.0000, d2.loss_cls_aux1: 0.0000, d2.loss_bbox_aux1: 0.0000, d2.loss_iou_aux1: 0.0000, d3.loss_cls_aux1: 0.0000, d3.loss_bbox_aux1: 0.0000, d3.loss_iou_aux1: 0.0000, d4.loss_cls_aux1: 0.0000, d4.loss_bbox_aux1: 0.0000, d4.loss_iou_aux1: 0.0000, loss: 0.0000, grad_norm: 0.0001
同时训练最后会报错如下:
[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 365/365, 13.9 task/s, elapsed: 26s, ETA: 0s
Traceback (most recent call last):
File "/home/haida_huanglei/mjt/Co-DETR-main/tools/train.py", line 245, in <module>
main()
File "/home/haida_huanglei/mjt/Co-DETR-main/tools/train.py", line 234, in main
train_detector(
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmdet/apis/train.py", line 245, in train_detector
runner.run(data_loaders, cfg.workflow)
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmcv/runner/epoch_based_runner.py", line 127, in run
epoch_runner(data_loaders[i], **kwargs)
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmcv/runner/epoch_based_runner.py", line 54, in train
self.call_hook('after_train_epoch')
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmcv/runner/base_runner.py", line 309, in call_hook
getattr(hook, fn_name)(self)
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmcv/runner/hooks/evaluation.py", line 267, in after_train_epoch
self._do_evaluate(runner)
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmdet/core/evaluation/eval_hooks.py", line 63, in _do_evaluate
key_score = self.evaluate(runner, results)
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmcv/runner/hooks/evaluation.py", line 363, in evaluate
eval_res = self.dataloader.dataset.evaluate(
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmdet/datasets/coco.py", line 641, in evaluate
result_files, tmp_dir = self.format_results(results, jsonfile_prefix)
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmdet/datasets/coco.py", line 383, in format_results
result_files = self.results2json(results, jsonfile_prefix)
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmdet/datasets/coco.py", line 315, in results2json
json_results = self._det2json(results)
File "/home/haida_huanglei/anaconda3/envs/mjt/lib/python3.9/site-packages/mmdet/datasets/coco.py", line 252, in _det2json
data['category_id'] = self.cat_ids[label]
IndexError: list index out of range
楼主大大,我看之前也有人提过相同的问题,说是与类别(class)设置有关,但是类别我都已经改过了。而且 P40 和 A800 上的工程代码和镜像完全一样,只是在 A800 上训练不起来,不知道跟 GPU 卡的数量有没有关系。这几个参数我是这样设置的:samples_per_gpu=1, workers_per_gpu=2, lr=2e-4。
不知道是什么原因,烦请作者大大指导一下~