facebookresearch / detectron2

Detectron2 is a platform for object detection, segmentation and other visual recognition tasks.
https://detectron2.readthedocs.io/en/latest/
Apache License 2.0

CUDA out of memory error - 1920*1080 image - batch size is 1 #1902

Closed muito93 closed 4 years ago

muito93 commented 4 years ago

I got the following error:

Exception has occurred: RuntimeError
CUDA out of memory. Tried to allocate 190.00 MiB (GPU 0; 3.94 GiB total capacity; 2.12 GiB already allocated; 171.06 MiB free; 2.22 GiB reserved in total by PyTorch)
File "/media/thanhvt/HDD_Data/UbuntuWork/LenseProject/Lenses_Detectron2/train.py", line 121, in
    trainer.train()

I used a custom dataloader. If I apply T.RandomApply(transform=T.Resize(shape=(800, 800)), prob=1), training runs fine. But if I comment out that line and keep the original image size (1920*1080), the CUDA out of memory error appears, even though the batch size is 1 (cfg.SOLVER.IMS_PER_BATCH = 1).
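As a rough back-of-envelope check (assuming the backbone's activation memory scales roughly with the number of input pixels, which the thread does not state explicitly), the full-resolution frame has about three times as many pixels as the 800x800 resize, so the jump in memory use is expected:

    # Rough estimate only: activation memory of a convolutional backbone is
    # assumed to scale roughly linearly with the number of input pixels.
    pixels_full    = 1920 * 1080   # 2,073,600 pixels at the original size
    pixels_resized = 800 * 800     #   640,000 pixels after T.Resize(shape=(800, 800))
    print(pixels_full / pixels_resized)  # ~3.24, i.e. roughly 3x the activation memory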

Instructions To Reproduce the Issue:

  1. full code you wrote or full changes you made (git diff)
    
    import torch, torchvision
    import detectron2
    # Setup detectron2 logger
    from detectron2.utils.logger import setup_logger
    setup_logger()
    # import some common libraries
    import numpy as np
    import os, json, cv2, random
    # import some common detectron2 utilities
    from detectron2 import model_zoo
    from detectron2.engine import DefaultPredictor
    from detectron2.config import get_cfg
    from detectron2.utils.visualizer import Visualizer
    from detectron2.data import MetadataCatalog, DatasetCatalog,build_detection_test_loader,build_detection_train_loader

from detectron2.structures import BoxMode
from detectron2.engine import DefaultTrainer
from detectron2.data import DatasetMapper

def get_scratch_lense_dicts(img_dir):
    json_file = os.path.join(img_dir, "via_project_9Aug2020_9h36m_json.json")
    with open(json_file) as f:
        imgs_anns = json.load(f)

    dataset_dicts = []
    for idx, v in enumerate(imgs_anns.values()):
        record = {}

        filename = os.path.join(img_dir, v["filename"])
        height, width = cv2.imread(filename).shape[:2]

        record["file_name"] = filename
        record["image_id"] = idx
        record["height"] = height
        record["width"] = width

        annos = v["regions"]
        objs = []
        for anno in annos:
            assert not anno["region_attributes"]
            anno = anno["shape_attributes"]
            px = anno["x"]
            py = anno["y"]
            width = anno["width"]
            height = anno["height"]

            obj = {
                "bbox": [px, py, width, height],
                "bbox_mode": BoxMode.XYWH_ABS,
                "category_id": 0,
            }
            objs.append(obj)
        record["annotations"] = objs
        dataset_dicts.append(record)
    return dataset_dicts

for d in ["train", "val"]:
    DatasetCatalog.register("scratch_lense_" + d, lambda d=d: get_scratch_lense_dicts("scratch_lense/" + d))
    MetadataCatalog.get("scratch_lense_" + d).set(thing_classes=["scratch_lense"])
scratch_lense_metadata = MetadataCatalog.get("scratch_lense_train")

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"))
cfg.DATASETS.TRAIN = ("scratch_lense_train",)
cfg.DATASETS.TEST = ()
cfg.DATALOADER.NUM_WORKERS = 8
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml")  # Let training initialize from model zoo
cfg.SOLVER.IMS_PER_BATCH = 1
cfg.SOLVER.BASE_LR = 0.00025  # pick a good LR
cfg.SOLVER.MAX_ITER = 300  # 300 iterations seems good enough for this toy dataset; you may need to train longer for a practical dataset
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 56  # faster, and good enough for this toy dataset (default: 512)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1  # only has one class (ballon)
cfg.SOLVER.CHECKPOINT_PERIOD = 50
cfg.MAX_SIZE_TRAIN = 2000
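A side note on the last line above: cfg.MAX_SIZE_TRAIN is not a standard detectron2 config key; the stock resize limits live under cfg.INPUT, and they only take effect through the default DatasetMapper, so they would not apply to the custom mapper below anyway. A minimal sketch of the standard keys (the values are illustrative, not from this issue):

    # Standard detectron2 input-size keys (used only by the default DatasetMapper):
    cfg.INPUT.MIN_SIZE_TRAIN = (800,)   # shortest edge is resized to 800
    cfg.INPUT.MAX_SIZE_TRAIN = 1333     # longest edge is capped at 1333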

os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

import detectron2.data.transforms as T
from detectron2.data import detection_utils as utils
import copy

def custom_mapper(dataset_dict):
    # Implement a mapper, similar to the default DatasetMapper, but with your own customizations
    dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
    image = utils.read_image(dataset_dict["file_name"], format="BGR")

    image, transforms = T.apply_transform_gens([
        T.RandomFlip(prob=0.5, horizontal=True, vertical=False),
        T.RandomFlip(prob=0.5, horizontal=False, vertical=True),
        # T.RandomApply(transform=T.Resize(shape=(800, 800)),
        #               prob=1),  ### If I comment out this line (keeping the original 1920*1080 size), the CUDA out of memory error appears
        T.RandomApply(transform=T.RandomRotation(angle=[-45, 45], expand=True, center=None, sample_style="range", interp=None),
                      prob=0.5)
    ], image)

    dataset_dict["image"] = torch.as_tensor(image.transpose(2, 0, 1).astype("float32"))

    annos = [
        utils.transform_instance_annotations(obj, transforms, image.shape[:2])
        for obj in dataset_dict.pop("annotations")
        if obj.get("iscrowd", 0) == 0
    ]
    instances = utils.annotations_to_instances(annos, image.shape[:2])
    dataset_dict["instances"] = utils.filter_empty_instances(instances)
    return dataset_dict

class CS_Trainer(DefaultTrainer):
    @classmethod
    def build_test_loader(cls, cfg, dataset_name):
        return build_detection_test_loader(cfg, dataset_name, mapper=DatasetMapper(cfg, False))

    @classmethod
    def build_train_loader(cls, cfg):
        return build_detection_train_loader(cfg, mapper=custom_mapper)

trainer = CS_Trainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()

from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader

evaluator = COCOEvaluator("scratch_lense_val", cfg, False, output_dir="./output/")
val_loader = build_detection_test_loader(cfg, "scratch_lense_val")
print(inference_on_dataset(trainer.model, val_loader, evaluator))

2. what exact command you run:
3. __full logs__ you observed:
        (norm): FrozenBatchNorm2d(num_features=128, eps=1e-05)
      )
      (conv3): Conv2d(
        128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=512, eps=1e-05)
      )
    )
    (2): BottleneckBlock(
      (conv1): Conv2d(
        512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=128, eps=1e-05)
      )
      (conv2): Conv2d(
        128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=128, eps=1e-05)
      )
      (conv3): Conv2d(
        128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=512, eps=1e-05)
      )
    )
    (3): BottleneckBlock(
      (conv1): Conv2d(
        512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=128, eps=1e-05)
      )
      (conv2): Conv2d(
        128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=128, eps=1e-05)
      )
      (conv3): Conv2d(
        128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=512, eps=1e-05)
      )
    )
  )
  (res4): Sequential(
    (0): BottleneckBlock(
      (shortcut): Conv2d(
        512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False
        (norm): FrozenBatchNorm2d(num_features=1024, eps=1e-05)
      )
      (conv1): Conv2d(
        512, 256, kernel_size=(1, 1), stride=(2, 2), bias=False
        (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05)
      )
      (conv2): Conv2d(
        256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05)
      )
      (conv3): Conv2d(
        256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=1024, eps=1e-05)
      )
    )
    (1): BottleneckBlock(
      (conv1): Conv2d(
        1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05)
      )
      (conv2): Conv2d(
        256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05)
      )
      (conv3): Conv2d(
        256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=1024, eps=1e-05)
      )
    )
    (2): BottleneckBlock(
      (conv1): Conv2d(
        1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05)
      )
      (conv2): Conv2d(
        256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05)
      )
      (conv3): Conv2d(
        256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=1024, eps=1e-05)
      )
    )
    (3): BottleneckBlock(
      (conv1): Conv2d(
        1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05)
      )
      (conv2): Conv2d(
        256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05)
      )
      (conv3): Conv2d(
        256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=1024, eps=1e-05)
      )
    )
    (4): BottleneckBlock(
      (conv1): Conv2d(
        1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05)
      )
      (conv2): Conv2d(
        256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05)
      )
      (conv3): Conv2d(
        256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=1024, eps=1e-05)
      )
    )
    (5): BottleneckBlock(
      (conv1): Conv2d(
        1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05)
      )
      (conv2): Conv2d(
        256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05)
      )
      (conv3): Conv2d(
        256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=1024, eps=1e-05)
      )
    )
  )
  (res5): Sequential(
    (0): BottleneckBlock(
      (shortcut): Conv2d(
        1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False
        (norm): FrozenBatchNorm2d(num_features=2048, eps=1e-05)
      )
      (conv1): Conv2d(
        1024, 512, kernel_size=(1, 1), stride=(2, 2), bias=False
        (norm): FrozenBatchNorm2d(num_features=512, eps=1e-05)
      )
      (conv2): Conv2d(
        512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=512, eps=1e-05)
      )
      (conv3): Conv2d(
        512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=2048, eps=1e-05)
      )
    )
    (1): BottleneckBlock(
      (conv1): Conv2d(
        2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=512, eps=1e-05)
      )
      (conv2): Conv2d(
        512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=512, eps=1e-05)
      )
      (conv3): Conv2d(
        512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=2048, eps=1e-05)
      )
    )
    (2): BottleneckBlock(
      (conv1): Conv2d(
        2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=512, eps=1e-05)
      )
      (conv2): Conv2d(
        512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=512, eps=1e-05)
      )
      (conv3): Conv2d(
        512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False
        (norm): FrozenBatchNorm2d(num_features=2048, eps=1e-05)
      )
    )
  )
)
)
(proposal_generator): RPN(
  (rpn_head): StandardRPNHead(
    (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (objectness_logits): Conv2d(256, 3, kernel_size=(1, 1), stride=(1, 1))
    (anchor_deltas): Conv2d(256, 12, kernel_size=(1, 1), stride=(1, 1))
  )
  (anchor_generator): DefaultAnchorGenerator(
    (cell_anchors): BufferList()
  )
)
(roi_heads): StandardROIHeads(
  (box_pooler): ROIPooler(
    (level_poolers): ModuleList(
      (0): ROIAlign(output_size=(7, 7), spatial_scale=0.25, sampling_ratio=0, aligned=True)
      (1): ROIAlign(output_size=(7, 7), spatial_scale=0.125, sampling_ratio=0, aligned=True)
      (2): ROIAlign(output_size=(7, 7), spatial_scale=0.0625, sampling_ratio=0, aligned=True)
      (3): ROIAlign(output_size=(7, 7), spatial_scale=0.03125, sampling_ratio=0, aligned=True)
    )
  )
  (box_head): FastRCNNConvFCHead(
    (fc1): Linear(in_features=12544, out_features=1024, bias=True)
    (fc2): Linear(in_features=1024, out_features=1024, bias=True)
  )
  (box_predictor): FastRCNNOutputLayers(
    (cls_score): Linear(in_features=1024, out_features=2, bias=True)
    (bbox_pred): Linear(in_features=1024, out_features=4, bias=True)
  )
)
[08/14 07:32:20 d2.data.build]: Removed 0 images with no usable annotations. 17 images left.
[08/14 07:32:20 d2.data.build]: Distribution of instances among all 1 categories:
  category       #instances
  scratch_lense  17

[08/14 07:32:20 d2.data.common]: Serializing 17 elements to byte tensors and concatenating them all ...
[08/14 07:32:20 d2.data.common]: Serialized dataset takes 0.00 MiB
[08/14 07:32:20 d2.data.build]: Using training sampler TrainingSampler
2020-08-14 07:32:20.970958: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
Skip loading parameter 'roi_heads.box_predictor.cls_score.weight' to the model due to incompatible shapes: (81, 1024) in the checkpoint but (2, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.cls_score.bias' to the model due to incompatible shapes: (81,) in the checkpoint but (2,) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.weight' to the model due to incompatible shapes: (320, 1024) in the checkpoint but (4, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.bias' to the model due to incompatible shapes: (320,) in the checkpoint but (4,) in the model! You might want to double check if this is expected.
[08/14 07:32:23 d2.engine.train_loop]: Starting training from iteration 0
/home/thanhvt/anaconda3/envs/keras/lib/python3.8/site-packages/detectron2/layers/wrappers.py:226: UserWarning: This overload of nonzero is deprecated:
    nonzero()
Consider using one of the following signatures instead:
    nonzero(*, bool as_tuple) (Triggered internally at /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)
  return x.nonzero().unbind(1)
ERROR [08/14 07:32:25 d2.engine.train_loop]: Exception during training:
Traceback (most recent call last):
  File "/home/thanhvt/anaconda3/envs/keras/lib/python3.8/site-packages/detectron2/engine/train_loop.py", line 140, in train
    self.run_step()
  File "/home/thanhvt/anaconda3/envs/keras/lib/python3.8/site-packages/detectron2/engine/train_loop.py", line 226, in run_step
    loss_dict = self.model(data)
  File "/home/thanhvt/anaconda3/envs/keras/lib/python3.8/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/thanhvt/anaconda3/envs/keras/lib/python3.8/site-packages/detectron2/modeling/meta_arch/rcnn.py", line 157, in forward
    features = self.backbone(images.tensor)
  File "/home/thanhvt/anaconda3/envs/keras/lib/python3.8/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/thanhvt/anaconda3/envs/keras/lib/python3.8/site-packages/detectron2/modeling/backbone/fpn.py", line 132, in forward
    lateral_features = lateral_conv(features)
  File "/home/thanhvt/anaconda3/envs/keras/lib/python3.8/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/thanhvt/anaconda3/envs/keras/lib/python3.8/site-packages/detectron2/layers/wrappers.py", line 94, in forward
    x = super().forward(x)
  File "/home/thanhvt/anaconda3/envs/keras/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 419, in forward
    return self._conv_forward(input, self.weight)
  File "/home/thanhvt/anaconda3/envs/keras/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 415, in _conv_forward
    return F.conv2d(input, weight, self.bias, self.stride,
RuntimeError: CUDA out of memory. Tried to allocate 190.00 MiB (GPU 0; 3.94 GiB total capacity; 2.12 GiB already allocated; 171.06 MiB free; 2.22 GiB reserved in total by PyTorch)
[08/14 07:32:25 d2.engine.hooks]: Total training time: 0:00:01 (0:00:00 on hooks)


## Expected behavior:

If there are no obvious error in "what you observed" provided above,
please tell us the expected behavior.

If you expect the model to converge / work better, note that we do not give suggestions
on how to train a new model.
Only in one of the two conditions we will help with it:
(1) You're unable to reproduce the results in detectron2 model zoo.
(2) It indicates a detectron2 bug.

## Environment:

Provide your environment information using the following command:

sys.platform              linux
Python                    3.8.3 (default, Jul 2 2020, 16:21:59) [GCC 7.3.0]
numpy                     1.18.5
detectron2                0.2.1 @/home/thanhvt/anaconda3/envs/keras/lib/python3.8/site-packages/detectron2
Compiler                  GCC 7.3
CUDA compiler             CUDA 10.1
detectron2 arch flags     sm_35, sm_37, sm_50, sm_52, sm_60, sm_61, sm_70, sm_75
DETECTRON2_ENV_MODULE
PyTorch                   1.6.0+cu101 @/home/thanhvt/anaconda3/envs/keras/lib/python3.8/site-packages/torch
PyTorch debug build       False
GPU available             True
GPU 0                     GeForce GTX 1050 Ti
CUDA_HOME                 /usr/local/cuda-10.1
Pillow                    7.2.0
torchvision               0.7.0+cu101 @/home/thanhvt/anaconda3/envs/keras/lib/python3.8/site-packages/torchvision
torchvision arch flags    sm_35, sm_50, sm_60, sm_70, sm_75
fvcore                    0.1.1.post20200716
cv2                       4.3.0


PyTorch built with:

ppwwyyxx commented 4 years ago

The error is expected given the small total memory size.

pvti commented 4 years ago

This also happens on another machine with an Nvidia GTX 1080 Ti (11 GB GPU memory). So how should we handle this typical case? We really need to use the custom dataloader image transformations (without resizing the original image to 800x800 as in the default dataloader) because of the information loss. @ppwwyyxx Any suggestion would be appreciated. Thanks.
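One pattern that keeps the native 1920x1080 resolution while bounding memory (an editorial sketch under the assumptions of the custom_mapper above, not something proposed in the thread) is to train on fixed-size random crops of the full frame; boxes that fall outside the crop are clipped by utils.transform_instance_annotations and empty instances are dropped by utils.filter_empty_instances:

    import detectron2.data.transforms as T

    # Train on fixed-size crops instead of the whole 1920x1080 frame, so the tensor
    # fed to the backbone has a bounded size regardless of the input resolution.
    crop_gens = [
        T.RandomCrop("absolute", (720, 720)),   # crop size is illustrative
        T.RandomFlip(prob=0.5, horizontal=True, vertical=False),
    ]
    # These gens would replace the list passed to T.apply_transform_gens in custom_mapper.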

ppwwyyxx commented 4 years ago

Suggestions are to use smaller images, smaller models or larger GPUs.
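A concrete way to apply the "smaller images" suggestion without the square resize that loses aspect ratio is an aspect-preserving resize that caps the longest side; this is a sketch only, and the exact sizes would need tuning for a 4 GB card:

    import detectron2.data.transforms as T

    # Aspect-preserving resize: the short edge goes to 720 and the long edge is
    # capped at 1280, so a 1920x1080 frame becomes ~1280x720 instead of 800x800.
    resize_gen = T.ResizeShortestEdge(short_edge_length=720, max_size=1280)

This could be dropped into the transform list in custom_mapper in place of the commented-out T.Resize line.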

alexriedel1 commented 4 years ago

Your random rotation augmentation might also be a problem, as it increases memory allocation.
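If the rotation augmentation is kept, one way to limit its memory impact (an assumption about the cause, consistent with the comment above rather than confirmed in it) is to disable canvas expansion, so the rotated image keeps the input size instead of growing to fit the rotated corners:

    # expand=False keeps the output the same size as the input; the rotated corners
    # are cut off instead of enlarging the image (and its memory footprint).
    T.RandomApply(transform=T.RandomRotation(angle=[-45, 45], expand=False, sample_style="range"),
                  prob=0.5)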