facebookresearch / detectron2

Detectron2 is a platform for object detection, segmentation and other visual recognition tasks.
https://detectron2.readthedocs.io/en/latest/
Apache License 2.0

RuntimeError: CUDA out of memory #1563

Closed Ilbotre closed 4 years ago

Ilbotre commented 4 years ago

Instructions To Reproduce the Issue:

  1. what code you wrote or what changes you made (git diff)

    
    def main():
    
    dataset_dicts_rgb_improved_test, dataset_dicts_lwir_improved_test = register_kaist_dataset()
    
    config_path = "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"
    
    train_set_rgb = "kaist_rgb_train"
    train_set_rgb_sanitized = "kaist_rgb_train_sanitized"
    val_set_rgb = "kaist_rgb_test"
    val_set_rgb_improved = "kaist_rgb_improved_test"
    
    train_set_lwir = "kaist_lwir_train"
    train_set_lwir_sanitized = "kaist_lwir_train_sanitized"
    val_set_lwir = "kaist_lwir_test"
    val_set_lwir_improved = "kaist_lwir_improved_test"
    
    learning_rates = [.001] # .01 .001 .0001 .00025 .000025 
    roi_batch_sizes = [128] # 32, 64, 128, 256
    freezing_layers = [0] # 0 1 2 3 4 5
    max_iters = [600]
    batch_sizes = [30]
    
    results_list = []
    
    run_number = 303
    
    torch.cuda.empty_cache()
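    # Note: empty_cache() only releases PyTorch's cached, unused blocks back to the driver;
    # it does not free tensors that are still referenced (e.g. a previous trainer's model).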
    
    # Pick and set up a GPU. This server has 8 GPUs [0, ..., 7]
    #torch.cuda.set_device(7)
    
    # Tune hyperparams with val set.
    for freezing_layer in freezing_layers:
        for learning_rate in learning_rates:    
            for batch_size in batch_sizes:
                for max_iter in max_iters:
                    for roi_batch_size in roi_batch_sizes:
                        # Get default config node.
                        cfg = get_cfg()
    
                        cfg.merge_from_file(model_zoo.get_config_file(config_path))
    
                        cfg.DATASETS.TRAIN = (train_set_lwir_sanitized, )
                        # If a test set is specified here, trainer.test() runs automatically after training
                        cfg.DATASETS.TEST = ()
    
                        # Number of data loading threads (default: 4)
                        cfg.DATALOADER.NUM_WORKERS = 2
    
                        # Path to a checkpoint file to be loaded into the model.
                        # You can find available models in the model zoo.
                        # Here I'm loading the weights obtained by training on the COCO dataset
                        cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(config_path)  # Let the weight be initialized from model zoo
    
                        # Number of images per batch across all machines.
                        cfg.SOLVER.IMS_PER_BATCH = batch_size
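                        # Note: the COCO baseline configs use IMS_PER_BATCH = 16 spread over 8 GPUs
                        # (2 images per GPU); 30 images on a single GPU needs far more memory than that.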
    
                        #cfg.SEED = 3
    
                        cfg.SOLVER.MAX_ITER = max_iter
    
                        # Perform only bounding-box detection, not segmentation
                        cfg.MODEL.MASK_ON = False
    
                        # `True` if cropping is used for data augmentation during training
                        cfg.INPUT.CROP.ENABLED = True
    
                        cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1  # only has one class (person)
    
                        # Filter out images with no annotations (True filters them out; this is the default)
                        cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS = True
    
                        #cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.5, 1.0, 2.0, 3.0, 4.0, 5.0]]
    
                        #cfg.MODEL.ANCHOR_GENERATOR.OFFSET = 0.5
    
                        cfg.MODEL.PIXEL_MEAN = [44.2694227416269, 44.2694227416269, 44.2694227416269]
                        #cfg.MODEL.PIXEL_STD = [57.375, 57.375, 57.375]
    
                        # Specify where to write the trained weights (default: _C.OUTPUT_DIR = "./output").
                        # I'm in the home dir (=> tilde dir).
                        # exist_ok=True => no OSError is raised if the dir already exists (it is reused)
                        my_relative_path = [os.path.abspath("../../"), "results/kaist/training_outputs",
                                            config_path, "lwir", "train_"+str(run_number)]
                        cfg.OUTPUT_DIR = "/".join(my_relative_path)
                        os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
    
                        cfg.MODEL.BACKBONE.FREEZE_AT = freezing_layer
    
                        cfg.SOLVER.BASE_LR = learning_rate
    
                        # RoI minibatch size *per image* (number of regions of interest [ROIs])
                        # Total number of RoIs per training minibatch =
                        # ROI_HEADS.BATCH_SIZE_PER_IMAGE * SOLVER.IMS_PER_BATCH
                        cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = roi_batch_size  # (default: 512)
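                        # e.g. with the values above: 128 RoIs/image * 30 images = 3840 RoIs per training minibatch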
    
                        # Build a trainer from the specified config
                        trainer = Trainer(cfg, run_number)
    
                        # If resume==True, and last checkpoint exists, resume from it.
                        # Otherwise, load a model specified by the config.
                        trainer.resume_or_load(resume=False)
    
                        # Start training
                        trainer.train()
    
                        results = evaluate_model_on_dataset(cfg, config_path,
                                                            val_set_lwir_improved,
                                                            dataset_dicts_lwir_improved_test)
    
                        results_list.append(results)
                        run_number += 1    
    
    return results_list
2. what exact command you run:
python transfer_learning_kaist.py

3. what you observed (including __full logs__):

RuntimeError: CUDA out of memory. Tried to allocate 400.00 MiB (GPU 0; 31.72 GiB total capacity; 29.81 GiB already allocated; 27.94 MiB free; 30.64 GiB reserved in total by PyTorch)


## Expected behavior:

How is it possible that PyTorch requires this huge amount of memory? Is this normal, or did I do something wrong?
It seems strange to me that only ~28 MiB out of 32 GiB is free.

## Environment:

Provide your environment information using the following command:

wget -nc -q https://github.com/facebookresearch/detectron2/raw/master/detectron2/utils/collect_env.py && python collect_env.py


------------------------  -----------------------------------------------------------------------------------------------------------------------
sys.platform              linux
Python                    3.7.6 (default, Jan  8 2020, 19:59:22) [GCC 7.3.0]
numpy                     1.18.1
detectron2                0.1.1 @/home/lab/nbogliol/miniconda2/envs/detectron2/lib/python3.7/site-packages/detectron2
detectron2 compiler       GCC 7.3
detectron2 CUDA compiler  10.0
detectron2 arch flags     /home/lab/nbogliol/miniconda2/envs/detectron2/lib/python3.7/site-packages/detectron2/_C.cpython-37m-x86_64-linux-gnu.so
DETECTRON2_ENV_MODULE     <not set>
PyTorch                   1.4.0+cu100 @/home/lab/nbogliol/miniconda2/envs/detectron2/lib/python3.7/site-packages/torch
PyTorch debug build       False
CUDA available            True
GPU 0,1,2,3,4,5,6,7       Tesla V100-SXM2-32GB
CUDA_HOME                 None
Pillow                    7.0.0
torchvision               0.5.0+cu100 @/home/lab/nbogliol/miniconda2/envs/detectron2/lib/python3.7/site-packages/torchvision
torchvision arch flags    /home/lab/nbogliol/miniconda2/envs/detectron2/lib/python3.7/site-packages/torchvision/_C.so
cv2                       3.4.2
------------------------  -----------------------------------------------------------------------------------------------------------------------
PyTorch built with:
  - GCC 7.3
  - Intel(R) Math Kernel Library Version 2019.0.4 Product Build 20190411 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v0.21.1 (Git Hash 7d2fd500bc78936d1d648ca713b901012f470dbc)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - NNPACK is enabled
  - CUDA Runtime 10.0
  - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_37,code=compute_37
  - CuDNN 7.6.3
  - Magma 2.5.1
  - Build settings: BLAS=MKL, BUILD_NAMEDTENSOR=OFF, BUILD_TYPE=Release, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -fopenmp -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -O2 -fPIC -Wno-narrowing -Wall -Wextra -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Wno-stringop-overflow, DISABLE_NUMA=1, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, USE_CUDA=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, USE_STATIC_DISPATCH=OFF, 
ppwwyyxx commented 4 years ago

what you observed (including full logs):

please include full logs.

karanjakhar commented 4 years ago

Please try decreasing your batch_size.
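
For reference, a minimal sketch of that change against the config in the original post. The value 2 is only an illustration (it matches the 2 images per GPU used by the model-zoo baselines), and rescaling the learning rate in proportion to the batch size is the usual linear-scaling heuristic, not something Detectron2 enforces:

    # Illustrative values only: 2 images per GPU matches the model-zoo baselines.
    cfg.SOLVER.IMS_PER_BATCH = 2
    # Optionally rescale the base LR in proportion to the batch-size change (linear-scaling heuristic).
    cfg.SOLVER.BASE_LR = learning_rate * 2 / 30

Keeping a total batch of 30 would instead require splitting it across several of the 8 GPUs with distributed training rather than running everything on GPU 0.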

Ilbotre commented 4 years ago

what you observed (including full logs):

please include full logs.

ERROR [06/09 11:05:25 d2.engine.train_loop]: Exception during training:
Traceback (most recent call last):
  File "/home/lab/nbogliol/miniconda2/envs/detectron2/lib/python3.7/site-packages/detectron2/engine/train_loop.py", line 132, in train
    self.run_step()
  File "/home/lab/nbogliol/miniconda2/envs/detectron2/lib/python3.7/site-packages/detectron2/engine/train_loop.py", line 228, in run_step
    losses.backward()
  File "/home/lab/nbogliol/miniconda2/envs/detectron2/lib/python3.7/site-packages/torch/tensor.py", line 195, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/lab/nbogliol/miniconda2/envs/detectron2/lib/python3.7/site-packages/torch/autograd/__init__.py", line 99, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: CUDA out of memory. Tried to allocate 1.71 GiB (GPU 0; 31.72 GiB total capacity; 23.58 GiB already allocated; 1.58 GiB free; 29.08 GiB reserved in total by PyTorch)
[06/09 11:05:25 d2.engine.hooks]: Overall training speed: 22 iterations in 0:01:02 (2.8448 s / it)
[06/09 11:05:25 d2.engine.hooks]: Total training time: 0:01:02 (0:00:00 on hooks)
Traceback (most recent call last):
  File "transfer_learning_kaist.py", line 418, in <module>
    results = main()
  File "transfer_learning_kaist.py", line 357, in main
    trainer.train()
  File "/home/lab/nbogliol/miniconda2/envs/detectron2/lib/python3.7/site-packages/detectron2/engine/defaults.py", line 381, in train
    super().train(self.start_iter, self.max_iter)
  File "/home/lab/nbogliol/miniconda2/envs/detectron2/lib/python3.7/site-packages/detectron2/engine/train_loop.py", line 132, in train
    self.run_step()
  File "/home/lab/nbogliol/miniconda2/envs/detectron2/lib/python3.7/site-packages/detectron2/engine/train_loop.py", line 228, in run_step
    losses.backward()
  File "/home/lab/nbogliol/miniconda2/envs/detectron2/lib/python3.7/site-packages/torch/tensor.py", line 195, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/lab/nbogliol/miniconda2/envs/detectron2/lib/python3.7/site-packages/torch/autograd/__init__.py", line 99, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: CUDA out of memory. Tried to allocate 1.71 GiB (GPU 0; 31.72 GiB total capacity; 23.58 GiB already allocated; 1.58 GiB free; 29.08 GiB reserved in total by PyTorch)

Here you can find the full logs. Thank you in advance

ppwwyyxx commented 4 years ago

It seems there should be other logs before the error, but it looks like it's just because the batch size 30 is too large.
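
If it helps to see where the memory goes, a quick sanity check with plain PyTorch counters is sketched below (the numbers are per-process and will differ from what nvidia-smi reports):

    import torch

    dev = torch.cuda.current_device()
    total = torch.cuda.get_device_properties(dev).total_memory  # physical GPU memory
    allocated = torch.cuda.memory_allocated(dev)                # memory held by live tensors
    peak = torch.cuda.max_memory_allocated(dev)                 # peak allocation so far
    print("total %.1f GiB, allocated %.1f GiB, peak %.1f GiB"
          % (total / 2**30, allocated / 2**30, peak / 2**30))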