training time - Githubissues

Qjizhi commented 1 year ago

Hi,

There is another question: I tried to train on TartanAir with 335k images, and it will take 70 days on a machine with 8 3090 GPUs. Do you think there are any methods to make it faster? Thanks!

mli0603 commented 1 year ago

Hi @Qjizhi

Thank you for your interest in the project.

70 days is indeed very long... I didn't run into this when I trained the model. It finished in a few days maximum.

Have you checked what is the bottleneck? I would look into data loading to make sure this is not blocking you. For example, is the disk reading speed fast enough? Do you use enough workers to load the images? The default configurations in this repo may not be the most optimal for your machines.

Qjizhi commented 1 year ago

Hi @Qjizhi

Thank you for your interest in the project.

70 days is indeed very long... I didn't run into this when I trained the model. It finished in a few days maximum.

Have you checked what is the bottleneck? I would look into data loading to make sure this is not blocking you. For example, is the disk reading speed fast enough? Do you use enough workers to load the images? The default configurations in this repo may not be the most optimal for your machines.

@mli0603 Oh, yes! The `iters` in codd.py should be 1 for TartanAir instead of 16. Thanks!

Qjizhi commented 1 year ago

Hi @Qjizhi Thank you for your interest in the project. 70 days is indeed very long... I didn't run into this when I trained the model. It finished in a few days maximum. Have you checked what is the bottleneck? I would look into data loading to make sure this is not blocking you. For example, is the disk reading speed fast enough? Do you use enough workers to load the images? The default configurations in this repo may not be the most optimal for your machines.

@mli0603 Oh, yes! The `iters` in codd.py should be 1 for TartanAir instead of 16. Thanks!

@mli0603 Hi, after I changed iters to 1, the training time is still approximately 50 days. This is an example of the log:

2023-02-17 03:22:07,589 - mmseg - INFO - Epoch [1][1000/10445]  lr: 4.000e-04, eta: 50 days, 1:15:26, time: 1.185, data_time: 0.022, memory: 22145, loss_disp0: 0.2184, init_loss0: 0.9902, prop_loss0: 0.5112, slant_loss0: 0.0249, w_loss0: 0.0101, epe0: nan, thres3: nan, loss_disp1: 0.2184, init_loss1: 0.9902, prop_loss1: 0.5111, slant_loss1: 0.0249, w_loss1: 0.0101, epe1: nan, loss_warp1: 0.5051, epe2d_warp1: nan, epedz_warp1: nan, 1px_warp1: nan, 3px_warp1: nan, 5px_warp1: nan, loss_temporal1: nan, loss: nan

and this is the training config I am using:

# Base config fragments this experiment is composed from; the sections that
# follow are labelled with these same file names.
_base_ = [
    'models/codd.py',
    'datasets/tartanair.py',
    'default_runtime.py',
    'schedules/schedule_stereo.py',
]

# 'models/codd.py'
# Copyright (c) Meta Platforms, Inc. and affiliates.

# Model hyper-parameters referenced by the `model` dict below.
# max_disp is passed to the stereo initialization and the stereo loss.
max_disp = 320

iters = 1  # 16 for scene flow/KITTI, 1 for Sintel/TartanAir
motion_loss_weight = 0.5  # 0.5 for joint training tartan/KITTI, 1.0 for pretrain
fusion_loss_weight = 1.0
wr_weight = 1.0
wf_weight = 1.0

# Per-stage freeze switches; they are forwarded to the model via train_cfg.
freeze_stereo = False
freeze_motion = False
freeze_fusion = False
# find_unused_parameters is only defined when at least one stage is frozen.
if any((freeze_stereo, freeze_motion, freeze_fusion)):
    find_unused_parameters = True

# Full model description: stereo, motion and fusion sub-configs, each with
# its own loss, plus the train/test settings.
model = {
    'type': 'ConsistentOnlineDynamicDepth',
    'stereo': {
        'type': 'HITNetMF',
        'backbone': {'type': 'HITUNet'},
        'initialization': {'type': 'TileInitialization', 'max_disp': max_disp},
        'propagation': {'type': 'TilePropagation'},
        'loss': {
            'type': 'HITLoss',
            'max_disp': max_disp,
            'alpha': 0.9,
            'c': 0.1,
        },
    },
    'motion': {
        'type': 'Motion',
        'iters': iters,
        'raft3d': {
            'type': 'RAFT3D',
            'cnet_cfg': {
                'type': 'HRNet',
                'norm_cfg': {'type': 'SyncBN', 'requires_grad': False},
                'norm_eval': True,
                # One entry per HRNet stage; the tuple lengths match
                # num_branches for that stage.
                'extra': {
                    'stage1': {
                        'num_modules': 1,
                        'num_branches': 1,
                        'block': 'BOTTLENECK',
                        'num_blocks': (2,),
                        'num_channels': (64,),
                    },
                    'stage2': {
                        'num_modules': 1,
                        'num_branches': 2,
                        'block': 'BASIC',
                        'num_blocks': (2, 2),
                        'num_channels': (18, 36),
                    },
                    'stage3': {
                        'num_modules': 3,
                        'num_branches': 3,
                        'block': 'BASIC',
                        'num_blocks': (2, 2, 2),
                        'num_channels': (18, 36, 72),
                    },
                    'stage4': {
                        'num_modules': 2,
                        'num_branches': 4,
                        'block': 'BASIC',
                        'num_blocks': (2, 2, 2, 2),
                        'num_channels': (18, 36, 72, 144),
                    },
                },
            },
        },
        'loss': {'type': 'MotionLoss', 'loss_weight': motion_loss_weight},
    },
    'fusion': {
        'type': 'Fusion',
        'in_channels': 24,
        'fusion_channel': 32,
        'corr_cfg': {'type': 'px2patch', 'patch_size': 3},
        'loss': {
            'type': 'FusionLoss',
            'loss_weight': fusion_loss_weight,
            'min_disp': 1,
            # NOTE(review): literal 320 equals max_disp above but is not
            # tied to it - confirm whether the two should move together.
            'max_disp': 320,
            'wr_weight': wr_weight,
            'wf_weight': wf_weight,
        },
    },
    'train_cfg': {
        'freeze_stereo': freeze_stereo,
        'freeze_motion': freeze_motion,
        'freeze_fusion': freeze_fusion,
    },
    'test_cfg': {'mode': 'whole'},
}

# 'datasets/tartanair.py'
# Copyright (c) Meta Platforms, Inc. and affiliates.

# Dataset class, data root and the per-split list files.
dataset_type = 'TartanAirMultiFrameDataset'
data_root = '/xxx/TartanAir'
train_split = '/xxx/TartanAir_train.txt'
val_split = '/xxx/TartanAir_val.txt'
test_split = '/xxx/TartanAir_test.txt'

calib = 320 * 0.25  # from https://github.com/castacks/tartanair_tools/blob/master/data_type.md
disp_range = (1.0, 210.0)
# Depth bounds are calib divided by the disparity bounds, taken in swapped
# order so the largest disparity yields the first (smallest) depth bound.
depth_range = tuple(calib / bound for bound in reversed(disp_range))
intrinsics = [320, 320, 320, 240]  # https://github.com/castacks/tartanair_tools/blob/master/data_type.md

img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}
batch_size = 4
crop_size = (448, 640)

# Training pipeline: load inputs and annotations, crop/augment, normalize,
# pad to crop_size, then format and collect.
train_pipeline = [
    {'type': 'LoadImagesFromFile'},
    {'type': 'LoadRImagesFromFile'},
    {
        'type': 'LoadDispAnnotations',
        'imdecode_backend': 'tartanair',
        'key': 'disp',
        'is_reciprocal': True,
        'calib': calib,
    },
    {'type': 'LoadOpticalFlowAnnotations', 'imdecode_backend': 'tartanair', 'key': 'flow'},
    {'type': 'LoadOcclusionAnnotations', 'imdecode_backend': 'tartanair', 'key': 'flow_occ'},
    {'type': 'RandomCrop', 'crop_size': crop_size},
    {'type': 'PhotoMetricDistortion'},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'Pad', 'size': crop_size, 'pad_val': 0, 'seg_pad_val': 255, 'disp_pad_val': 0},
    {'type': 'DefaultFormatBundleList'},
    {
        'type': 'Collect',
        'keys': ['img', 'r_img', 'gt_disp', 'gt_flow', 'gt_flow_occ'],
        'meta_keys': [
            'filename',
            'ori_filename',
            'ori_shape',
            'img_shape',
            'pad_shape',
            'img_norm_cfg',
            'calib',
            'disp_range',
            'depth_range',
            'intrinsics',
        ],
    },
]

# Validation/test pipeline: same loading steps, then a single-scale
# (img_ratios=[1.0]) MultiScaleFlipAug wrapper without cropping or
# photometric distortion. Its meta_keys list has no 'img_norm_cfg' entry.
test_pipeline = [
    {'type': 'LoadImagesFromFile'},
    {'type': 'LoadRImagesFromFile'},
    {
        'type': 'LoadDispAnnotations',
        'imdecode_backend': 'tartanair',
        'key': 'disp',
        'is_reciprocal': True,
        'calib': calib,
    },
    {'type': 'LoadOpticalFlowAnnotations', 'imdecode_backend': 'tartanair', 'key': 'flow'},
    {'type': 'LoadOcclusionAnnotations', 'imdecode_backend': 'tartanair', 'key': 'flow_occ'},
    {
        'type': 'MultiScaleFlipAug',
        'img_ratios': [1.0],
        'img_scale': None,
        'transforms': [
            {'type': 'Resize', 'keep_ratio': True},
            {'type': 'Normalize', **img_norm_cfg},
            {'type': 'Pad', 'size_divisor': 64},
            {'type': 'DefaultFormatBundleList'},
            {
                'type': 'Collect',
                'keys': ['img', 'r_img', 'gt_disp', 'gt_flow', 'gt_flow_occ'],
                'meta_keys': [
                    'filename',
                    'ori_filename',
                    'ori_shape',
                    'img_shape',
                    'pad_shape',
                    'calib',
                    'disp_range',
                    'depth_range',
                    'intrinsics',
                ],
            },
        ],
    },
]

# The three splits differ only in num_frames, split file and pipeline, so
# they are generated from one template. The comprehension keeps its loop
# variables out of the module namespace, so no extra config keys appear.
data = {
    'samples_per_gpu': batch_size,
    'workers_per_gpu': batch_size,
    **{
        stage: {
            'type': dataset_type,
            'disp_range': disp_range,
            'calib': calib,
            'depth_range': depth_range,
            'img_dir': data_root,
            'r_img_dir': data_root,
            'disp_dir': data_root,
            'flow_dir': data_root,
            'flow_occ_dir': data_root,
            'num_frames': frame_count,
            'intrinsics': intrinsics,
            'split': split_file,
            'pipeline': stage_pipeline,
        }
        for stage, frame_count, split_file, stage_pipeline in (
            ('train', 2, train_split, train_pipeline),
            ('val', -1, val_split, test_pipeline),
            ('test', -1, test_split, test_pipeline),
        )
    },
}

# 'default_runtime.py'
# Copyright (c) Meta Platforms, Inc. and affiliates.

# Logging: interval plus the text and TensorBoard logger hooks.
# yapf:disable
log_config = {
    'interval': 50,
    'hooks': [
        {'type': 'TextLoggerHook'},
        {'type': 'TensorboardLoggerHook'},
    ],
}
# yapf:enable
# Distributed backend, log verbosity and checkpoint loading/resuming
# (both disabled here), followed by the workflow and cuDNN benchmark flag.
dist_params = {'backend': 'nccl'}
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True

# 'schedules/schedule_stereo.py'
# Copyright (c) Meta Platforms, Inc. and affiliates.

# Optimizer: Adam with the learning rate that also appears in the log line
# quoted in the issue (lr: 4.000e-04).
optimizer = {'type': 'Adam', 'lr': 4e-4, 'betas': (0.9, 0.999)}
optimizer_config = {}
# Learning-rate policy: one gamma entry per step milestone.
lr_config = {
    'policy': 'MultiGamma',
    'step': [225, 293, 315],
    'gamma': [0.25, 0.4, 0.25],
}

# Runtime settings.
# NOTE(review): the quoted log shows 10445 iterations per epoch at about
# 1.185 s each; 340 epochs at that rate is roughly 48.7 days, so the
# reported ~50 day ETA appears to follow from the epoch count - confirm
# whether 340 epochs is intended for this dataset.
runner = {'type': 'EpochBasedRunner', 'max_epochs': 340}  # Following HITNet
checkpoint_config = {'by_epoch': True, 'interval': 20}
evaluation = {'interval': 10, 'metric': 'default'}

facebookresearch / CODD

training time #3