CUDA out of memory at epoch 15 on 2 V100 GPUs
Hello, you can refer to https://github.com/Luo-Z13/pointobb/issues/4#issuecomment-1980399695. If your environment is set up correctly and you are still hitting this issue, I suggest resuming training from a checkpoint (checkpoint continuation). Please note that you need to set 'iter_count' manually from the checkpoint; otherwise it resets to zero.
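For reference, a minimal sketch of what that could look like with MMCV's standard resume mechanism; the checkpoint filename and the exact attribute that holds 'iter_count' are assumptions for illustration, not taken from this issue:

    # Training config: resume from the last checkpoint written before the OOM
    # (epoch_14.pth is an assumed filename; use whichever epoch_*.pth you actually have).
    resume_from = 'xxx/work_dir/pointobb_r50_fpn_2x_dota10_dist/epoch_14.pth'

    # 'iter_count' is not restored by resume_from, so set it by hand to the number of
    # iterations already completed; the log below shows 2909 iterations per epoch, so
    # after 14 finished epochs that would be 2909 * 14. Where iter_count lives in the
    # model is an assumption; adapt it to wherever the PointOBB head defines it, e.g.:
    #     model.roi_head.bbox_head.iter_count = 2909 * 14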
OK, thanks!
2024-04-03 18:51:39,493 - mmdet - INFO - Epoch [15][300/2909] lr: 5.000e-03, eta: 4:27:30, time: 0.606, data_time: 0.007, memory: 12809, stage0_loss_symmetry_ss: 0.0311, stage0_loss_instance_mil: 0.0333, stage0_bag_acc: 97.7123, stage0_mean_ious: 0.4440, stage0_s: 0.1233, stage0_m: 0.2246, stage0_l: 0.2410, stage0_h: 0.4768, stage1_loss_instance_mil: 0.0107, stage1_bag_acc: 98.7738, stage1_neg_loss: 0.0040, stage1_mean_ious: 0.4720, stage1_s: 0.1410, stage1_m: 0.2396, stage1_l: 0.2557, stage1_h: 0.4850, loss: 0.0791, grad_norm: 1.5265
Traceback (most recent call last):
  File "tools/train_dist.py", line 192, in <module>
    main()
  File "tools/train_dist.py", line 181, in main
    train_detector(
  File "/hy-tmp/pointobb/PointOBB/mmdet/apis/train.py", line 172, in train_detector
    runner.run(data_loaders, cfg.workflow)
  File "/usr/local/lib/python3.8/dist-packages/mmcv/runner/epoch_based_runner.py", line 127, in run
    epoch_runner(data_loaders[i], **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/mmcv/runner/epoch_based_runner.py", line 51, in train
    self.call_hook('after_train_iter')
  File "/usr/local/lib/python3.8/dist-packages/mmcv/runner/base_runner.py", line 309, in call_hook
    getattr(hook, fn_name)(self)
  File "/usr/local/lib/python3.8/dist-packages/mmcv/runner/hooks/optimizer.py", line 56, in after_train_iter
    runner.outputs['loss'].backward()
  File "/usr/local/lib/python3.8/dist-packages/torch/_tensor.py", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/usr/local/lib/python3.8/dist-packages/torch/autograd/__init__.py", line 147, in backward
    Variable._execution_engine.run_backward(
RuntimeError: CUDA out of memory. Tried to allocate 3.27 GiB (GPU 1; 15.77 GiB total capacity; 10.10 GiB already allocated; 1.15 GiB free; 13.10 GiB reserved in total by PyTorch)
_base_ = ['../../configs/_base_/default_runtime.py']
norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)  # add
debug = False

num_stages = 2
model = dict(
    type='PointOBB',
    pretrained='torchvision://resnet50',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch'),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=0,
        add_extra_convs='on_input',
        num_outs=4,
        norm_cfg=norm_cfg),
# dataset settings
dataset_type = 'CocoFmtObbDataset'
angle_version = 'le90'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='Resize', img_scale=(800, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5, version=angle_version)
    if not debug else dict(type='RandomFlip', flip_ratio=0.),
    dict(
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(800, 800),
        flip=False,
        transforms=[
            dict(type='Resize', img_scale=(800, 800), keep_ratio=True),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='DefaultFormatBundle'),
            dict(type='Collect',
                 keys=['img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_ignore',
                       'gt_anns_id', 'gt_true_bboxes']),
        ])
]
data_root_trainval = '../dior-r/'
data_root_test = '../dior-r/'
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    shuffle=False if debug else None,
    train=dict(
        type=dataset_type,
        version=angle_version,
        ann_file=data_root_trainval + "trainval_rbox_pt_P2Bfmt.json",
        img_prefix=data_root_trainval + 'JPEGImages-trainval/',
        pipeline=train_pipeline,
        filter_empty_gt=True),
    val=dict(
        samples_per_gpu=2,
        type=dataset_type,
        ann_file=data_root_trainval + "trainval_rbox_pt_P2Bfmt.json",
        img_prefix=data_root_trainval + 'JPEGImages-trainval/',
        pipeline=test_pipeline,
        test_mode=False),
    test=dict(
        type=dataset_type,
        img_prefix=data_root_test + 'JPEGImages-testfordebug/',
        ann_file=data_root_test + "Annotations/testfordebug_rbox_pt_P2Bfmt.json",
        pipeline=test_pipeline))
check = dict(stop_while_nan=False)
# optimizer
optimizer = dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
training_time = 2
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[8 * training_time, 11 * training_time])
runner = dict(type='EpochBasedRunner', max_epochs=12 * training_time)
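# With training_time = 2, this gives step=[16, 22] and max_epochs = 24, i.e. a 2x (24-epoch) schedule.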
work_dir = 'xxx/work_dir/pointobb_r50_fpn_2x_dota10_dist/'
evaluation = dict(
    interval=1,
    metric='bbox',
    save_result_file=work_dir + 'pseudo_obb_result.json',
    do_first_eval=False,  # test
    do_final_eval=True,
)
# Inference
load_from = 'xxx/work_dir/epoch_12.pth'
evaluation = dict(
    save_result_file='xxx/work_dir/test/test_debug_result.json',
    do_first_eval=True,
)
runner = dict(type='EpochBasedRunner', max_epochs=0)
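# Note: with max_epochs=0 the runner performs no training epochs, so together with
# load_from and do_first_eval=True this override appears to only run evaluation on the
# loaded checkpoint.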