Luo-Z13 / pointobb

[CVPR2024] PointOBB: Learning Oriented Object Detection via Single Point Supervision

RuntimeError: CUDA out of memory. #7

Closed ToneZe closed 7 months ago

ToneZe commented 7 months ago

2024-04-03 18:51:39,493 - mmdet - INFO - Epoch [15][300/2909] lr: 5.000e-03, eta: 4:27:30, time: 0.606, data_time: 0.007, memory: 12809, stage0_loss_symmetry_ss: 0.0311, stage0_loss_instance_mil: 0.0333, stage0_bag_acc: 97.7123, stage0_mean_ious: 0.4440, stage0_s: 0.1233, stage0_m: 0.2246, stage0_l: 0.2410, stage0_h: 0.4768, stage1_loss_instance_mil: 0.0107, stage1_bag_acc: 98.7738, stage1_neg_loss: 0.0040, stage1_mean_ious: 0.4720, stage1_s: 0.1410, stage1_m: 0.2396, stage1_l: 0.2557, stage1_h: 0.4850, loss: 0.0791, grad_norm: 1.5265
Traceback (most recent call last):
  File "tools/train_dist.py", line 192, in <module>
    main()
  File "tools/train_dist.py", line 181, in main
    train_detector(
  File "/hy-tmp/pointobb/PointOBB/mmdet/apis/train.py", line 172, in train_detector
    runner.run(data_loaders, cfg.workflow)
  File "/usr/local/lib/python3.8/dist-packages/mmcv/runner/epoch_based_runner.py", line 127, in run
    epoch_runner(data_loaders[i], **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/mmcv/runner/epoch_based_runner.py", line 51, in train
    self.call_hook('after_train_iter')
  File "/usr/local/lib/python3.8/dist-packages/mmcv/runner/base_runner.py", line 309, in call_hook
    getattr(hook, fn_name)(self)
  File "/usr/local/lib/python3.8/dist-packages/mmcv/runner/hooks/optimizer.py", line 56, in after_train_iter
    runner.outputs['loss'].backward()
  File "/usr/local/lib/python3.8/dist-packages/torch/_tensor.py", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/usr/local/lib/python3.8/dist-packages/torch/autograd/__init__.py", line 147, in backward
    Variable._execution_engine.run_backward(
RuntimeError: CUDA out of memory. Tried to allocate 3.27 GiB (GPU 1; 15.77 GiB total capacity; 10.10 GiB already allocated; 1.15 GiB free; 13.10 GiB reserved in total by PyTorch)
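The allocation fails in the backward pass on a 16 GiB card, so the generic mmdet-style memory levers are worth trying first. A minimal sketch of the two most common ones, assuming PointOBB tolerates them (mixed precision in particular is not verified against this repo):

```python
# Sketch of generic memory reductions (assumptions, not repo guidance):

# 1) Halve the per-GPU batch in the `data` dict (currently samples_per_gpu=2):
data = dict(samples_per_gpu=1, workers_per_gpu=2)

# 2) Mixed-precision training via mmcv's Fp16OptimizerHook; compatibility with
#    PointOBB's custom heads has not been checked here:
fp16 = dict(loss_scale=512.)
```

The full config that produced the error follows.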

_base_ = [
    '../../configs/_base_/default_runtime.py'
]
norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)  # add
debug = False

num_stages = 2
model = dict(
    type='PointOBB',
    pretrained='torchvision://resnet50',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch'),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=0,
        add_extra_convs='on_input',
        num_outs=4,
        norm_cfg=norm_cfg),

    loss_diff_view=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0),  # SSC loss
    crop_size=(800, 800),
    construct_view=True,    # rot/flp view
    construct_resize=True,  # resized view

roi_head=dict(
    type='PointOBBHead',
    num_stages=num_stages,
    top_k=7,
    with_atten=False,

    loss_symmetry_ss=dict(
        type='SmoothL1Loss', loss_weight=0.5, beta=0.1),
    angle_coder=dict(
                type='PSCCoder',
                angle_version='le90',
                dual_freq=False,
                num_step=3,
                thr_mod=0),
    angle_version = 'le90',
    rotation_agnostic_classes=[5, 9, 15, 19],
    agnostic_resize_classes = [13, 18],
    use_angle_loss = False,
    add_angle_pred_begin = False,
    not_use_rot_mil = False, 
    detach_angle_head = False,
    stacked_convs = 2,

    bbox_roi_extractor=dict(
        type='RotatedSingleRoIExtractor',
        roi_layer=dict(
            type='RoIAlignRotated',
            out_size=7,
            sample_num=2,
            clockwise=True),
        out_channels=256,
        featmap_strides=[4, 8, 16, 32]),
    bbox_head=dict(
        type='Shared2FCInstanceMILHead',
        num_stages=num_stages,
        with_loss_pseudo=False,
        in_channels=256,
        fc_out_channels=1024,
        roi_feat_size=7,
        num_classes=20,
        num_ref_fcs=0,
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[0., 0., 0., 0.],
            target_stds=[0.1, 0.1, 0.2, 0.2]),
        reg_class_agnostic=True,
        loss_type='MIL',
        loss_mil1=dict(
            type='MILLoss',
            binary_ins=False,
            loss_weight=0.25,
            loss_type='binary_cross_entropy'), 
        loss_mil2=dict(
            type='MILLoss',
            binary_ins=False,
            loss_weight=0.25,
            loss_type='gfocal_loss'),),
),
# model training and testing settings
train_cfg=dict(
    base_proposal=dict(
        base_scales=[4, 8, 16, 24, 32, 48, 64, 72, 80, 96],
        base_ratios=[1 / 3, 1 / 2, 1 / 1.5, 1.0, 1.5, 2.0, 3.0],
        shake_ratio=None,
        cut_mode='symmetry', 
        gen_num_neg=0),
    fine_proposal=dict(
        gen_proposal_mode='fix_gen',
        cut_mode=None,
        shake_ratio=[0.1],
        base_ratios=[1, 1.2, 1.3, 0.8, 0.7],
        iou_thr=0.3,
        gen_num_neg=500,
    ),
    rcnn=None,
    iter_count = 0,
    burn_in_steps1 = 16000, 
    burn_in_steps2 = 22000
),
test_cfg=dict(
    rpn=None,
    rcnn=None,
))

# dataset settings

dataset_type = 'CocoFmtObbDataset'
angle_version = 'le90'

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='Resize', img_scale=(800, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5, version=angle_version)
    if not debug else dict(type='RandomFlip', flip_ratio=0.),
    # dict(
    #     type='RandomFlip',
    #     flip_ratio=[0.25, 0.25, 0.25],
    #     direction=['horizontal', 'vertical', 'diagonal'],
    #     version=angle_version),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', 'gt_true_bboxes']),
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(800, 800),
        flip=False,
        transforms=[
            dict(type='Resize', img_scale=(800, 800), keep_ratio=True),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='DefaultFormatBundle'),
            dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore', 'gt_anns_id', 'gt_true_bboxes']),
        ])
]

data_root_trainval = '../dior-r/'
data_root_test = '../dior-r/'

data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    shuffle=False if debug else None,
    train=dict(
        type=dataset_type,
        version=angle_version,
        ann_file=data_root_trainval + "trainval_rbox_pt_P2Bfmt.json",
        img_prefix=data_root_trainval + 'JPEGImages-trainval/',
        pipeline=train_pipeline,
        filter_empty_gt=True),
    val=dict(
        samples_per_gpu=2,
        type=dataset_type,
        ann_file=data_root_trainval + "trainval_rbox_pt_P2Bfmt.json",
        img_prefix=data_root_trainval + 'JPEGImages-trainval/',
        pipeline=test_pipeline,
        test_mode=False),
    test=dict(
        type=dataset_type,
        img_prefix=data_root_test + 'JPEGImages-testfordebug/',
        ann_file=data_root_test + "Annotations/testfordebug_rbox_pt_P2Bfmt.json",
        pipeline=test_pipeline))

check = dict(stop_while_nan=False)

# optimizer

optimizer = dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001)

optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

# learning policy

training_time = 2
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[8 * training_time, 11 * training_time])
runner = dict(type='EpochBasedRunner', max_epochs=12 * training_time)
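Spelling out what the schedule resolves to with training_time = 2 (plain substitution, nothing repo-specific):

```python
# With training_time = 2 the schedule above is a standard "2x" setting:
#   lr decay steps: [8 * 2, 11 * 2] = [16, 22]
#   max_epochs:      12 * 2         = 24
```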

work_dir = 'xxx/work_dir/pointobb_r50_fpn_2x_dota10_dist/'

evaluation = dict(
    interval=1,
    metric='bbox',
    save_result_file=work_dir + 'pseudo_obb_result.json',
    do_first_eval=False,  # test
    do_final_eval=True,
)

# Inference

load_from = 'xxx/work_dir/epoch_12.pth'

evaluation = dict(
    save_result_file='xxx/work_dir/test/test_debug_result.json',
    do_first_eval=True
)

runner = dict(type='EpochBasedRunner',max_epochs=0)

ToneZe commented 7 months ago

This happened at epoch 15, training on 2 V100 GPUs.

Luo-Z13 commented 7 months ago

This happened at epoch 15, training on 2 V100 GPUs.

Hello, you can refer to https://github.com/Luo-Z13/pointobb/issues/4#issuecomment-1980399695. If your environment is set up correctly and you are still hitting this error, I suggest resuming training from a checkpoint. Note that you need to set 'iter_count' manually to match the checkpoint; otherwise it resets to zero.
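A minimal sketch of that suggestion against the config posted above, assuming you resume from the epoch-14 checkpoint and take the 2909 iterations per epoch reported in the training log (the checkpoint path is an example; this is not verified against the repo's checkpoint handling):

```python
# Hypothetical resume settings, appended to the training config above.
resume_from = 'xxx/work_dir/pointobb_r50_fpn_2x_dota10_dist/epoch_14.pth'

# iter_count is not restored from the checkpoint, so set it to the number of
# iterations already completed: 14 finished epochs * 2909 iters/epoch (from the log).
model['train_cfg']['iter_count'] = 14 * 2909  # = 40726
```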

ToneZe commented 7 months ago

OK, thanks!