❓ Questions and Help

As the title says, I'm hitting `RuntimeError: CUDA error: out of memory` during the evaluation that runs in between training on multiple GPUs. What confuses me is that inference appears to complete successfully on every device, but the run fails immediately afterwards. Any suggestions?
```
2019-12-30 10:37:52,761 maskrcnn_benchmark.inference INFO: Start evaluation on isaid_val dataset(9446 images).
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1181/1181 [03:38<00:00, 5.41it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1181/1181 [03:41<00:00, 5.34it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1181/1181 [03:42<00:00, 5.31it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1181/1181 [03:43<00:00, 5.29it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1181/1181 [03:41<00:00, 5.33it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1181/1181 [03:44<00:00, 5.25it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1181/1181 [03:43<00:00, 5.28it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1181/1181 [03:46<00:00, 5.21it/s]
2019-12-30 10:41:37,149 maskrcnn_benchmark.inference INFO: Total run time: 0:03:44.387373 (0.19003800378622152 s / img per device, on 8 devices)
2019-12-30 10:41:37,152 maskrcnn_benchmark.inference INFO: Model inference time: 0:03:26.711526 (0.1750679870240395 s / img per device, on 8 devices)
```
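All eight per-device loops finish and the timing summary is printed, so inference itself completes; the failure comes afterwards, while the per-GPU predictions are being gathered for evaluation.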
Each failing rank prints the same traceback (three copies arrive interleaved in the raw output; shown once here):

```
Traceback (most recent call last):
  File "tools/train_net.py", line 196, in <module>
    main()
  File "tools/train_net.py", line 192, in main
    run_test(cfg, model, args.distributed)
  File "tools/train_net.py", line 122, in run_test
    output_folder=output_folder,
  File "/home/an1/rotated_maskrcnn/maskrcnn_benchmark/engine/inference.py", line 104, in inference
    predictions = _accumulate_predictions_from_multiple_gpus(predictions)
  File "/home/an1/rotated_maskrcnn/maskrcnn_benchmark/engine/inference.py", line 43, in _accumulate_predictions_from_multiple_gpus
    all_predictions = all_gather(predictions_per_gpu)
  File "/home/an1/rotated_maskrcnn/maskrcnn_benchmark/utils/comm.py", line 86, in all_gather
    data_list.append(pickle.loads(buffer))
  File "/home/an1/miniconda3/envs/rotated_maskrcnn/lib/python3.6/site-packages/torch/storage.py", line 134, in _load_from_bytes
    return torch.load(io.BytesIO(b))
  File "/home/an1/miniconda3/envs/rotated_maskrcnn/lib/python3.6/site-packages/torch/serialization.py", line 386, in load
    return _load(f, map_location, pickle_module, **pickle_load_args)
  File "/home/an1/miniconda3/envs/rotated_maskrcnn/lib/python3.6/site-packages/torch/serialization.py", line 573, in _load
    result = unpickler.load()
  File "/home/an1/miniconda3/envs/rotated_maskrcnn/lib/python3.6/site-packages/torch/serialization.py", line 536, in persistent_load
    deserialized_objects[root_key] = restore_location(obj, location)
  File "/home/an1/miniconda3/envs/rotated_maskrcnn/lib/python3.6/site-packages/torch/serialization.py", line 119, in default_restore_location
    result = fn(storage, location)
  File "/home/an1/miniconda3/envs/rotated_maskrcnn/lib/python3.6/site-packages/torch/serialization.py", line 99, in _cuda_deserialize
    return storage_type(obj.size())
RuntimeError: CUDA error: out of memory
```
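If I'm reading the traceback right, the OOM is not in the model's forward pass at all: it happens in `_accumulate_predictions_from_multiple_gpus`, where `all_gather` pickles each rank's predictions, exchanges the bytes, and then every rank unpickles all eight payloads. Since the failure goes through `_cuda_deserialize`, my predictions apparently still contain CUDA tensors, so `pickle.loads` ends up in `torch.load` allocating fresh GPU storage for every rank's predictions at once, on top of the model that is presumably still resident from training. That would explain why all 9,446 images evaluate fine but the run dies right after the timing summary.

My current idea for a workaround is to force everything onto the CPU before the gather. The helper below is only a sketch of what I have in mind, not code that exists in the repo (`move_to_cpu` is my own name, and I'm assuming it would be called on `predictions_per_gpu` just before the `all_gather` call in `engine/inference.py`):

```python
import torch

def move_to_cpu(obj):
    # Hypothetical helper: recursively detach every tensor and move it to
    # the CPU, so that unpickling on the other ranks never goes through
    # _cuda_deserialize and therefore never allocates GPU memory.
    if torch.is_tensor(obj):
        return obj.detach().cpu()
    if isinstance(obj, dict):
        return {k: move_to_cpu(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return type(obj)(move_to_cpu(v) for v in obj)
    # Non-tensor leaves (ints, strings, and any BoxList internals this
    # doesn't reach) are returned unchanged.
    return obj
```

Does that sound like the right direction, or is there a supported way to make the prediction gathering CPU-only?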