Open Qjizhi opened 1 year ago
Hi @Qjizhi
Thank you for your interest in the project.
70 days is indeed very long... I didn't run into this when I trained the model. It finished in a few days maximum.
Have you checked what the bottleneck is? I would look into data loading to make sure it is not blocking you. For example, is the disk read speed fast enough? Do you use enough workers to load the images? The default configurations in this repo may not be optimal for your machines.
Hi @Qjizhi
Thank you for your interest in the project.
70 days is indeed very long... I didn't run into this when I trained the model. It finished in a few days maximum.
Have you checked what the bottleneck is? I would look into data loading to make sure it is not blocking you. For example, is the disk read speed fast enough? Do you use enough workers to load the images? The default configurations in this repo may not be optimal for your machines.
@mli0603 Oh, yes! The `iters` in `codd.py` should be 1 for TartanAir instead of 16. Thanks!
Hi @Qjizhi Thank you for your interest in the project. 70 days is indeed very long... I didn't run into this when I trained the model. It finished in a few days at most. Have you checked what the bottleneck is? I would look into data loading to make sure it is not blocking you. For example, is the disk read speed fast enough? Do you use enough workers to load the images? The default configurations in this repo may not be optimal for your machines.
@mli0603 Oh, yes! The `iters` in `codd.py` should be 1 for TartanAir instead of 16. Thanks!
@mli0603 Hi, after I changed `iters` to 1, the training time is still approximately 50 days. This is an example of the log:
2023-02-17 03:22:07,589 - mmseg - INFO - Epoch [1][1000/10445] lr: 4.000e-04, eta: 50 days, 1:15:26, time: 1.185, data_time: 0.022, memory: 22145, loss_disp0: 0.2184, init_loss0: 0.9902, prop_loss0: 0.5112, slant_loss0: 0.0249, w_loss0: 0.0101, epe0: nan, thres3: nan, loss_disp1: 0.2184, init_loss1: 0.9902, prop_loss1: 0.5111, slant_loss1: 0.0249, w_loss1: 0.0101, epe1: nan, loss_warp1: 0.5051, epe2d_warp1: nan, epedz_warp1: nan, 1px_warp1: nan, 3px_warp1: nan, 5px_warp1: nan, loss_temporal1: nan, loss: nan
and this is the training config I am using:
# Config fragments listed as the bases of this training config:
# model, dataset, runtime and schedule, in that order.
_base_ = [
    "models/codd.py",
    "datasets/tartanair.py",
    "default_runtime.py",
    "schedules/schedule_stereo.py",
]
# 'models/codd.py'
# Copyright (c) Meta Platforms, Inc. and affiliates.

# model settings
max_disp = 320
iters = 1  # 16 for scene flow/KITTI, 1 for Sintel/TartanAir
motion_loss_weight = 0.5  # 0.5 for joint training tartan/KITTI, 1.0 for pretrain
fusion_loss_weight = 1.0
wr_weight = 1.0
wf_weight = 1.0

# Per-sub-network freeze switches; they are forwarded to the model through
# `train_cfg` at the bottom of the `model` dict.
freeze_stereo = False
freeze_motion = False
freeze_fusion = False

# `find_unused_parameters` is only defined when at least one sub-network is
# frozen; with all three switches False the name does not exist in this config.
if freeze_stereo or freeze_motion or freeze_fusion:
    find_unused_parameters = True

model = dict(
    type='ConsistentOnlineDynamicDepth',
    # Stereo sub-network: backbone, tile initialization/propagation and loss.
    stereo=dict(
        type='HITNetMF',
        backbone=dict(
            type='HITUNet',
        ),
        initialization=dict(
            type='TileInitialization',
            max_disp=max_disp,
        ),
        propagation=dict(
            type='TilePropagation',
        ),
        loss=dict(
            type='HITLoss',
            max_disp=max_disp,
            alpha=0.9,
            c=0.1,
        ),
    ),
    # Motion sub-network; `iters` is the value set at the top of this section.
    motion=dict(
        type="Motion",
        iters=iters,
        raft3d=dict(
            type="RAFT3D",
            cnet_cfg=dict(
                type='HRNet',
                norm_cfg=dict(type='SyncBN', requires_grad=False),
                norm_eval=True,
                extra=dict(
                    stage1=dict(
                        num_modules=1,
                        num_branches=1,
                        block='BOTTLENECK',
                        num_blocks=(2,),
                        num_channels=(64,)),
                    stage2=dict(
                        num_modules=1,
                        num_branches=2,
                        block='BASIC',
                        num_blocks=(2, 2),
                        num_channels=(18, 36)),
                    stage3=dict(
                        num_modules=3,
                        num_branches=3,
                        block='BASIC',
                        num_blocks=(2, 2, 2),
                        num_channels=(18, 36, 72)),
                    stage4=dict(
                        num_modules=2,
                        num_branches=4,
                        block='BASIC',
                        num_blocks=(2, 2, 2, 2),
                        num_channels=(18, 36, 72, 144))
                )
            )
        ),
        loss=dict(
            type='MotionLoss',
            loss_weight=motion_loss_weight
        ),
    ),
    # Fusion sub-network.
    fusion=dict(
        type="Fusion",
        in_channels=24,
        fusion_channel=32,
        corr_cfg=dict(type='px2patch', patch_size=3),
        loss=dict(
            type='FusionLoss',
            loss_weight=fusion_loss_weight,
            min_disp=1,
            # NOTE(review): literal 320 equals `max_disp` above but is not tied
            # to it -- confirm whether the two are meant to change together.
            max_disp=320,
            wr_weight=wr_weight,
            wf_weight=wf_weight
        ),
    ),
    train_cfg=dict(
        freeze_stereo=freeze_stereo,
        freeze_motion=freeze_motion,
        freeze_fusion=freeze_fusion,
    ),
    test_cfg=dict(mode='whole')
)
# 'datasets/tartanair.py'
# Copyright (c) Meta Platforms, Inc. and affiliates.

# dataset settings
dataset_type = "TartanAirMultiFrameDataset"
data_root = "/xxx/TartanAir"
train_split = "/xxx/TartanAir_train.txt"
val_split = "/xxx/TartanAir_val.txt"
test_split = "/xxx/TartanAir_test.txt"

# from https://github.com/castacks/tartanair_tools/blob/master/data_type.md
calib = 320 * 0.25
disp_range = (1.0, 210.0)
# Derived from disp_range: the largest disparity gives the first entry and the
# smallest disparity gives the second.
depth_range = (calib / disp_range[1], calib / disp_range[0])
# https://github.com/castacks/tartanair_tools/blob/master/data_type.md
intrinsics = [320, 320, 320, 240]
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

batch_size = 4
crop_size = (448, 640)

# Training pipeline: load inputs and annotations, crop, distort, normalize,
# pad to the crop size, then bundle and collect.
train_pipeline = [
    dict(type="LoadImagesFromFile"),
    dict(type="LoadRImagesFromFile"),
    dict(
        type="LoadDispAnnotations",
        imdecode_backend="tartanair",
        key="disp",
        is_reciprocal=True,
        calib=calib,
    ),
    dict(
        type="LoadOpticalFlowAnnotations",
        imdecode_backend="tartanair",
        key="flow",
    ),
    dict(
        type="LoadOcclusionAnnotations",
        imdecode_backend="tartanair",
        key="flow_occ",
    ),
    dict(type="RandomCrop", crop_size=crop_size),
    dict(type="PhotoMetricDistortion"),
    dict(type="Normalize", **img_norm_cfg),
    dict(
        type="Pad",
        size=crop_size,
        pad_val=0,
        seg_pad_val=255,
        disp_pad_val=0,
    ),
    dict(type="DefaultFormatBundleList"),
    dict(
        type="Collect",
        keys=["img", "r_img", "gt_disp", "gt_flow", "gt_flow_occ"],
        meta_keys=[
            "filename",
            "ori_filename",
            "ori_shape",
            "img_shape",
            "pad_shape",
            "img_norm_cfg",
            "calib",
            "disp_range",
            "depth_range",
            "intrinsics",
        ],
    ),
]

# Test pipeline: same loaders as training, then a single-ratio
# MultiScaleFlipAug wrapper around resize / normalize / pad / bundle / collect.
test_pipeline = [
    dict(type="LoadImagesFromFile"),
    dict(type="LoadRImagesFromFile"),
    dict(
        type="LoadDispAnnotations",
        imdecode_backend="tartanair",
        key="disp",
        is_reciprocal=True,
        calib=calib,
    ),
    dict(
        type="LoadOpticalFlowAnnotations",
        imdecode_backend="tartanair",
        key="flow",
    ),
    dict(
        type="LoadOcclusionAnnotations",
        imdecode_backend="tartanair",
        key="flow_occ",
    ),
    dict(
        type="MultiScaleFlipAug",
        img_ratios=[1.0],
        img_scale=None,
        transforms=[
            dict(type="Resize", keep_ratio=True),
            dict(type="Normalize", **img_norm_cfg),
            dict(type="Pad", size_divisor=64),
            dict(type="DefaultFormatBundleList"),
            dict(
                type="Collect",
                keys=["img", "r_img", "gt_disp", "gt_flow", "gt_flow_occ"],
                meta_keys=[
                    "filename",
                    "ori_filename",
                    "ori_shape",
                    "img_shape",
                    "pad_shape",
                    "calib",
                    "disp_range",
                    "depth_range",
                    "intrinsics",
                ],
            ),
        ],
    ),
]

# Every split reads all modalities from the same data_root; train uses
# num_frames=2 while val/test use num_frames=-1 with the test pipeline.
data = dict(
    samples_per_gpu=batch_size,
    workers_per_gpu=batch_size,
    train=dict(
        type=dataset_type,
        disp_range=disp_range,
        calib=calib,
        depth_range=depth_range,
        img_dir=data_root,
        r_img_dir=data_root,
        disp_dir=data_root,
        flow_dir=data_root,
        flow_occ_dir=data_root,
        num_frames=2,
        intrinsics=intrinsics,
        split=train_split,
        pipeline=train_pipeline,
    ),
    val=dict(
        type=dataset_type,
        disp_range=disp_range,
        calib=calib,
        depth_range=depth_range,
        img_dir=data_root,
        r_img_dir=data_root,
        disp_dir=data_root,
        flow_dir=data_root,
        flow_occ_dir=data_root,
        num_frames=-1,
        intrinsics=intrinsics,
        split=val_split,
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        disp_range=disp_range,
        calib=calib,
        depth_range=depth_range,
        img_dir=data_root,
        r_img_dir=data_root,
        disp_dir=data_root,
        flow_dir=data_root,
        flow_occ_dir=data_root,
        num_frames=-1,
        intrinsics=intrinsics,
        split=test_split,
        pipeline=test_pipeline,
    ),
)
# 'default_runtime.py'
# Copyright (c) Meta Platforms, Inc. and affiliates.

# Logging: text and TensorBoard logger hooks, with a logging interval of 50.
log_config = dict(
    interval=50,
    hooks=[
        dict(type="TextLoggerHook"),
        dict(type="TensorboardLoggerHook"),
    ],
)
# yapf:enable

# Distributed backend and log verbosity.
dist_params = dict(backend="nccl")
log_level = "INFO"

# No checkpoint is loaded or resumed by this config.
load_from = None
resume_from = None

workflow = [("train", 1)]
cudnn_benchmark = True
# 'schedules/schedule_stereo.py'
# Copyright (c) Meta Platforms, Inc. and affiliates.

# optimizer
optimizer = dict(
    type="Adam",
    lr=4e-4,
    betas=(0.9, 0.999),
)
optimizer_config = dict()

# learning policy: three steps, each paired with its own gamma
lr_config = dict(
    policy="MultiGamma",
    step=[225, 293, 315],
    gamma=[0.25, 0.4, 0.25],
)

# runtime settings
runner = dict(
    type="EpochBasedRunner",
    max_epochs=340,  # Following HITNet
)
checkpoint_config = dict(by_epoch=True, interval=20)
evaluation = dict(interval=10, metric="default")
Hi,
There is another question: I tried to train on TartanAir with 335k images, and it will take 70 days on a machine with 8 3090 GPUs. Do you think there is any way to make it faster? Thanks!