Closed onexmaster closed 4 years ago
I am trying to evaluate (mask AP and bbox AP) my trained model on the LVIS dataset using the validation images, but I am getting an error.
Checklist
Describe the bug When I try to test my dataset using the validation data, I get a CUDA out-of-memory error. I have 2 Tesla K80 GPUs.
Reproduction
./tools/dist_test.sh configs/htc/htc_x101_32x4d_fpn_20e_16gpu.py checkpoints/epoch_1.pth 2 --out results.pkl --eval bbox segm
# model settings model = dict( type='HybridTaskCascade', num_stages=3, pretrained='open-mmlab://resnext101_32x4d', interleaved=True, mask_info_flow=True, backbone=dict( type='ResNeXt', depth=101, groups=32, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=1231, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=1231, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=1231, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) ], 
mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=[ dict( type='HTCMaskHead', with_conv_res=False, num_convs=4, in_channels=256, conv_out_channels=256, num_classes=1231, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)), dict( type='HTCMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=1231, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)), dict( type='HTCMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=1231, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)) ],) # model training and testing settings train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) 
test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.00001, nms=dict(type='nms', iou_thr=0.5), max_per_img=300, mask_thr_binary=0.5)) # dataset settings dataset_type = 'LVISDataset' data_root = 'data/LVIS/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFile'), dict( type='LoadAnnotations', with_bbox=True, with_mask=True), dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='SegRescale', scale_factor=1 / 8), dict(type='DefaultFormatBundle'), dict( type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1333, 800), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( imgs_per_gpu=1, workers_per_gpu=1, train=dict( type=dataset_type, ann_file=data_root + 'lvis_v0.5_train.json', img_prefix=data_root + 'images/train2017/', pipeline=train_pipeline), val=dict( type=dataset_type, ann_file=data_root + 'lvis_v0.5_val.json', img_prefix=data_root + 'images/val2017/', pipeline=test_pipeline), test=dict( type=dataset_type, ann_file=data_root + 'lvis_v0.5_val.json', img_prefix=data_root + 'images/val2017/', pipeline=test_pipeline)) evaluation = dict(interval=4, metric=['bbox', 'segm']) # optimizer optimizer = dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, 
step=[16, 19]) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable # runtime settings total_epochs = 2 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/htc_x101_32x4d_fpn_20e' load_from = None resume_from = './work_dirs/htc_x101_32x4d_fpn_20e/latest.pth' workflow = [('train', 1)]
Environment
python mmdet/utils/collect_env.py
Python: 3.7.7 (default, Mar 26 2020, 15:48:22) [GCC 7.3.0] CUDA available: True CUDA_HOME: /usr/local/cuda NVCC: Cuda compilation tools, release 10.1, V10.1.243 GPU 0,1: Tesla K80 GCC: gcc (Debian 6.3.0-18+deb9u1) 6.3.0 20170516 PyTorch: 1.4.0 PyTorch compiling details: PyTorch built with: - GCC 7.3 - Intel(R) Math Kernel Library Version 2019.0.4 Product Build 20190411 for Intel(R) 64 architecture applications - Intel(R) MKL-DNN v0.21.1 (Git Hash 7d2fd500bc78936d1d648ca713b901012f470dbc) - OpenMP 201511 (a.k.a. OpenMP 4.5) - NNPACK is enabled - CUDA Runtime 10.1 - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_7 0,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_37,code=compute_37 - CuDNN 7.6.3 - Magma 2.5.1 - Build settings: BLAS=MKL, BUILD_NAMEDTENSOR=OFF, BUILD_TYPE=Release, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -fopenmp -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -O2 -fPIC -Wno-narrowing -Wall -Wextra -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-functi on -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast - fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Wno-stringop-overflow, DISABLE_NUMA=1, PERF_WITH_AVX=1, PERF_WITH_ AVX2=1, PERF_WITH_AVX512=1, USE_CUDA=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, USE_STATIC_DISPATCH=O FF, TorchVision: 0.5.0 OpenCV: 4.1.0 MMCV: 0.4.3 MMDetection: 1.1.0+a1c3aa4 MMDetection Compiler: GCC 6.3 MMDetection CUDA Compiler: 
10.1
Error traceback If applicable, paste the error traceback here.
pu.py checkpoints/epoch_1.pth 2 --out results.pkl --eval bbox segm ***************************************** Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. ***************************************** loading annotations into memory... loading annotations into memory... Done (t=0.45s) creating index... Done (t=0.45s) creating index... index created! index created! [ ] 0/5000, elapsed: 0s, ETA:Traceback (most recent call last): File "./tools/test.py", line 170, in <module> main() File "./tools/test.py", line 155, in main args.gpu_collect) File "/home/onexmaster2447/mmdetection/mmdet/apis/test.py", line 58, in multi_gpu_test result = model(return_loss=False, rescale=True, **data) File "/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__ result = self.forward(*input, **kwargs) File "/opt/anaconda3/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 447, in forward output = self.module(*inputs[0], **kwargs[0]) File "/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__ result = self.forward(*input, **kwargs) File "/home/onexmaster2447/mmdetection/mmdet/core/fp16/decorators.py", line 49, in new_func return old_func(*args, **kwargs) File "/home/onexmaster2447/mmdetection/mmdet/models/detectors/base.py", line 149, in forward return self.forward_test(img, img_metas, **kwargs) File "/home/onexmaster2447/mmdetection/mmdet/models/detectors/base.py", line 130, in forward_test return self.simple_test(imgs[0], img_metas[0], **kwargs) File "/home/onexmaster2447/mmdetection/mmdet/models/detectors/htc.py", line 351, in simple_test cfg=rcnn_test_cfg) File "/home/onexmaster2447/mmdetection/mmdet/core/fp16/decorators.py", line 127, in new_func return old_func(*args, **kwargs) File 
"/home/onexmaster2447/mmdetection/mmdet/models/bbox_heads/bbox_head.py", line 173, in get_det_bboxes cfg.max_per_img) File "/home/onexmaster2447/mmdetection/mmdet/core/post_processing/bbox_nms.py", line 64, in multiclass_nms torch.cat([bboxes_for_nms, scores[:, None]], 1), **nms_cfg_) File "/home/onexmaster2447/mmdetection/mmdet/ops/nms/nms_wrapper.py", line 54, in nms inds = nms_cuda.nms(dets_th, iou_thr) RuntimeError: CUDA out of memory. Tried to allocate 17179869177.21 GiB (GPU 0; 11.17 GiB total capacity; 951.35 MiB already allocated; 9.71 GiB free; 1.15 GiB reserved in total by PyTorch) (malloc at /opt/conda/conda-bld/pytorch_1 579022060824/work/c10/cuda/CUDACachingAllocator.cpp:289) frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x47 (0x7f3b89e9f627 in /opt/anaconda3/lib/p ython3.7/site-packages/torch/lib/libc10.so) frame #1: <unknown function> + 0x1ed15 (0x7f3b8a0e3d15 in /opt/anaconda3/lib/python3.7/site-packages/torch/lib/libc 10_cuda.so) frame #2: <unknown function> + 0x1ff2e (0x7f3b8a0e4f2e in /opt/anaconda3/lib/python3.7/site-packages/torch/lib/libc 10_cuda.so) frame #3: <unknown function> + 0x1a76dd8 (0x7f3b92407dd8 in /opt/anaconda3/lib/python3.7/site-packages/torch/lib/li btorch.so) frame #4: nms_cuda(at::Tensor, float) + 0x289 (0x7f3b60005540 in /home/onexmaster2447/mmdetection/mmdet/ops/nms/nms _cuda.cpython-37m-x86_64-linux-gnu.so) frame #5: nms(at::Tensor const&, float) + 0x43d (0x7f3b5fff559d in /home/onexmaster2447/mmdetection/mmdet/ops/nms/n ms_cuda.cpython-37m-x86_64-linux-gnu.so) frame #6: <unknown function> + 0x3b7cd (0x7f3b600037cd in /home/onexmaster2447/mmdetection/mmdet/ops/nms/nms_cuda.c python-37m-x86_64-linux-gnu.so) frame #7: <unknown function> + 0x39b83 (0x7f3b60001b83 in /home/onexmaster2447/mmdetection/mmdet/ops/nms/nms_cuda.c python-37m-x86_64-linux-gnu.so) frame #8: _PyMethodDef_RawFastCallKeywords + 0x264 (0x55e4ac23fab4 in /home/onexmaster2447/.conda/envs/open-mmlab/b in/python) frame #9: 
_PyCFunction_FastCallKeywords + 0x21 (0x55e4ac23fbd1 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/p ython) frame #10: _PyEval_EvalFrameDefault + 0x4ecb (0x55e4ac2a657b in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyt hon) frame #11: _PyEval_EvalCodeWithName + 0x2f9 (0x55e4ac1eb389 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyth on) frame #12: _PyFunction_FastCallDict + 0x3ff (0x55e4ac1ec6ef in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyth on) frame #13: _PyEval_EvalFrameDefault + 0x1e9d (0x55e4ac2a354d in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyt hon) frame #14: _PyEval_EvalCodeWithName + 0x2f9 (0x55e4ac1eb389 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyth on) frame #15: _PyFunction_FastCallKeywords + 0x325 (0x55e4ac23f255 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/ python) frame #16: _PyEval_EvalFrameDefault + 0x416 (0x55e4ac2a1ac6 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyth on) frame #17: _PyEval_EvalCodeWithName + 0x2f9 (0x55e4ac1eb389 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyth on) frame #18: _PyFunction_FastCallDict + 0x3ff (0x55e4ac1ec6ef in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyth on) frame #19: _PyEval_EvalFrameDefault + 0x1e9d (0x55e4ac2a354d in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyt hon) frame #20: _PyEval_EvalCodeWithName + 0xab8 (0x55e4ac1ebb48 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyth on) frame #21: _PyFunction_FastCallKeywords + 0x387 (0x55e4ac23f2b7 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/ python) frame #22: _PyEval_EvalFrameDefault + 0x14d4 (0x55e4ac2a2b84 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyt hon) frame #23: _PyEval_EvalCodeWithName + 0x2f9 (0x55e4ac1eb389 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyth on) frame #25: _PyObject_Call_Prepend + 0x63 (0x55e4ac20ba73 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #26: PyObject_Call + 0x6e (0x55e4ac1fdfde in 
/home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #27: _PyEval_EvalFrameDefault + 0x1e9d (0x55e4ac2a354d in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #28: _PyEval_EvalCodeWithName + 0x2f9 (0x55e4ac1eb389 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #29: _PyFunction_FastCallDict + 0x3ff (0x55e4ac1ec6ef in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #30: _PyObject_Call_Prepend + 0x63 (0x55e4ac20ba73 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #31: PyObject_Call + 0x6e (0x55e4ac1fdfde in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #32: _PyEval_EvalFrameDefault + 0x1e9d (0x55e4ac2a354d in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #33: _PyEval_EvalCodeWithName + 0x2f9 (0x55e4ac1eb389 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #34: _PyFunction_FastCallDict + 0x3ff (0x55e4ac1ec6ef in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #35: _PyEval_EvalFrameDefault + 0x1e9d (0x55e4ac2a354d in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #36: _PyEval_EvalCodeWithName + 0xab8 (0x55e4ac1ebb48 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #37: _PyFunction_FastCallDict + 0x3ff (0x55e4ac1ec6ef in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #38: _PyObject_Call_Prepend + 0x63 (0x55e4ac20ba73 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #39: PyObject_Call + 0x6e (0x55e4ac1fdfde in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #40: _PyEval_EvalFrameDefault + 0x1e9d (0x55e4ac2a354d in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #41: _PyEval_EvalCodeWithName + 0x2f9 (0x55e4ac1eb389 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #42: _PyFunction_FastCallDict + 0x3ff (0x55e4ac1ec6ef in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #43: _PyObject_Call_Prepend + 0x63 (0x55e4ac20ba73 in 
/home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #44: <unknown function> + 0x17d27a (0x55e4ac25327a in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #45: PyObject_Call + 0x6e (0x55e4ac1fdfde in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #46: _PyEval_EvalFrameDefault + 0x1e9d (0x55e4ac2a354d in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyt hon) frame #47: _PyEval_EvalCodeWithName + 0x2f9 (0x55e4ac1eb389 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyth on) frame #48: _PyFunction_FastCallDict + 0x3ff (0x55e4ac1ec6ef in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyth on) frame #49: _PyObject_Call_Prepend + 0x63 (0x55e4ac20ba73 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #50: PyObject_Call + 0x6e (0x55e4ac1fdfde in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #51: _PyEval_EvalFrameDefault + 0x1e9d (0x55e4ac2a354d in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyt hon) frame #52: _PyEval_EvalCodeWithName + 0x2f9 (0x55e4ac1eb389 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyth on) frame #53: _PyFunction_FastCallDict + 0x3ff (0x55e4ac1ec6ef in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyth on) frame #54: _PyObject_Call_Prepend + 0x63 (0x55e4ac20ba73 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #55: <unknown function> + 0x17d27a (0x55e4ac25327a in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #56: PyObject_Call + 0x6e (0x55e4ac1fdfde in /home/onexmaster2447/.conda/envs/open-mmlab/bin/python) frame #57: _PyEval_EvalFrameDefault + 0x1e9d (0x55e4ac2a354d in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyt hon) frame #58: _PyEval_EvalCodeWithName + 0x2f9 (0x55e4ac1eb389 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyth on) frame #59: _PyFunction_FastCallKeywords + 0x325 (0x55e4ac23f255 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/ python) frame #60: _PyEval_EvalFrameDefault + 0x416 (0x55e4ac2a1ac6 in 
/home/onexmaster2447/.conda/envs/open-mmlab/bin/pyth on) frame #61: _PyFunction_FastCallKeywords + 0xfb (0x55e4ac23f02b in /home/onexmaster2447/.conda/envs/open-mmlab/bin/p ython) frame #62: _PyEval_EvalFrameDefault + 0x416 (0x55e4ac2a1ac6 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyth on) frame #63: _PyEval_EvalCodeWithName + 0x2f9 (0x55e4ac1eb389 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyth on) Traceback (most recent call last): File "./tools/test.py", line 170, in <module> main() File "./tools/test.py", line 155, in main args.gpu_collect) File "/home/onexmaster2447/mmdetection/mmdet/apis/test.py", line 58, in multi_gpu_test result = model(return_loss=False, rescale=True, **data) File "/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__ result = self.forward(*input, **kwargs) File "/opt/anaconda3/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 447, in forward output = self.module(*inputs[0], **kwargs[0]) File "/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__ result = self.forward(*input, **kwargs) File "/home/onexmaster2447/mmdetection/mmdet/core/fp16/decorators.py", line 49, in new_func return old_func(*args, **kwargs) File "/home/onexmaster2447/mmdetection/mmdet/models/detectors/base.py", line 149, in forward return self.forward_test(img, img_metas, **kwargs) File "/home/onexmaster2447/mmdetection/mmdet/models/detectors/base.py", line 130, in forward_test return self.simple_test(imgs[0], img_metas[0], **kwargs) File "/home/onexmaster2447/mmdetection/mmdet/models/detectors/htc.py", line 351, in simple_test cfg=rcnn_test_cfg) File "/home/onexmaster2447/mmdetection/mmdet/core/fp16/decorators.py", line 127, in new_func return old_func(*args, **kwargs) File "/home/onexmaster2447/mmdetection/mmdet/models/bbox_heads/bbox_head.py", line 173, in get_det_bboxes cfg.max_per_img) File 
"/home/onexmaster2447/mmdetection/mmdet/core/post_processing/bbox_nms.py", line 64, in multiclass_nms torch.cat([bboxes_for_nms, scores[:, None]], 1), **nms_cfg_) File "/home/onexmaster2447/mmdetection/mmdet/ops/nms/nms_wrapper.py", line 54, in nms inds = nms_cuda.nms(dets_th, iou_thr) RuntimeError: CUDA out of memory. Tried to allocate 13.31 GiB (GPU 1; 11.17 GiB total capacity; 942.92 MiB already allocated; 9.71 GiB free; 1.15 GiB reserved in total by PyTorch) (malloc at /opt/conda/conda-bld/pytorch_1579022060 824/work/c10/cuda/CUDACachingAllocator.cpp:289) frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x47 (0x7ffb19002627 in /opt/anaconda3/lib/p ython3.7/site-packages/torch/lib/libc10.so) frame #1: <unknown function> + 0x1ed15 (0x7ffb19246d15 in /opt/anaconda3/lib/python3.7/site-packages/torch/lib/libc 10_cuda.so) frame #2: <unknown function> + 0x1ff2e (0x7ffb19247f2e in /opt/anaconda3/lib/python3.7/site-packages/torch/lib/libc 10_cuda.so) frame #3: <unknown function> + 0x1a76dd8 (0x7ffb2156add8 in /opt/anaconda3/lib/python3.7/site-packages/torch/lib/li btorch.so) frame #4: nms_cuda(at::Tensor, float) + 0x289 (0x7ffaef168540 in /home/onexmaster2447/mmdetection/mmdet/ops/nms/nms _cuda.cpython-37m-x86_64-linux-gnu.so) frame #5: nms(at::Tensor const&, float) + 0x43d (0x7ffaef15859d in /home/onexmaster2447/mmdetection/mmdet/ops/nms/n ms_cuda.cpython-37m-x86_64-linux-gnu.so) frame #6: <unknown function> + 0x3b7cd (0x7ffaef1667cd in /home/onexmaster2447/mmdetection/mmdet/ops/nms/nms_cuda.c python-37m-x86_64-linux-gnu.so) frame #7: <unknown function> + 0x39b83 (0x7ffaef164b83 in /home/onexmaster2447/mmdetection/mmdet/ops/nms/nms_cuda.c python-37m-x86_64-linux-gnu.so) frame #8: _PyMethodDef_RawFastCallKeywords + 0x264 (0x55c39a155ab4 in /home/onexmaster2447/.conda/envs/open-mmlab/b in/python) frame #9: _PyCFunction_FastCallKeywords + 0x21 (0x55c39a155bd1 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/p ython) frame #10: 
_PyEval_EvalFrameDefault + 0x4ecb (0x55c39a1bc57b in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyt hon) frame #11: _PyEval_EvalCodeWithName + 0x2f9 (0x55c39a101389 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyth on) frame #12: _PyFunction_FastCallDict + 0x3ff (0x55c39a1026ef in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyth on) frame #13: _PyEval_EvalFrameDefault + 0x1e9d (0x55c39a1b954d in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyt hon) frame #14: _PyEval_EvalCodeWithName + 0x2f9 (0x55c39a101389 in /home/onexmaster2447/.conda/envs/open-mmlab/bin/pyth on) frame #15: _PyFunction_FastCallKeywords + 0x325 (0x55c39a155255 in /home/onexmaster2447/.conda/envs/open-mmlab/bin Traceback (most recent call last): File "/home/onexmaster2447/.conda/envs/open-mmlab/lib/python3.7/runpy.py", line 193, in _run_module_as_main "__main__", mod_spec) File "/home/onexmaster2447/.conda/envs/open-mmlab/lib/python3.7/runpy.py", line 85, in _run_code exec(code, run_globals) File "/opt/anaconda3/lib/python3.7/site-packages/torch/distributed/launch.py", line 263, in <module> main() File "/opt/anaconda3/lib/python3.7/site-packages/torch/distributed/launch.py", line 259, in main cmd=cmd) subprocess.CalledProcessError: Command '['/home/onexmaster2447/.conda/envs/open-mmlab/bin/python', '-u', './tools/t est.py', '--local_rank=1', 'configs/htc/htc_x101_32x4d_fpn_20e_16gpu.py', 'checkpoints/epoch_1.pth', '--launcher', 'pytorch', '--out', 'results.pkl', '--eval', 'bbox', 'segm']' returned non-zero exit status 1.
LVIS eval is very different from COCO eval; you may refer to this GitHub repo for some hints.
https://github.com/KaihuaTang/LVIS-for-mmdetection/blob/master/LVIS.py
This error is simply caused by GPU OOM.
I am trying to evaluate (mask AP and bbox AP) my trained model on the LVIS dataset using the validation images, but I am getting an error.
Checklist
Describe the bug When I try to test my dataset using the validation data, I get a CUDA out-of-memory error. I have 2 Tesla K80 GPUs.
Reproduction
Environment
python mmdet/utils/collect_env.py
to collect the necessary environment information and paste it here. Error traceback If applicable, paste the error traceback here.