megvii-research / TransMVSNet

(CVPR 2022) TransMVSNet: Global Context-aware Multi-view Stereo Network with Transformers.
MIT License

How to train with multiple GPUs #24

Open nuclear-missile opened 1 year ago

nuclear-missile commented 1 year ago

We ran the training code successfully on a single GPU, but ran into problems with multiple GPUs. We are using two RTX 3070 cards: we changed NGPU to 2 and set nviews to 2 to avoid running out of GPU memory. We get many different errors (example below), and every modification we try changes the error.

```
root@I10ed7f43820050143b:/hy-tmp/TransMVSNet# bash scripts/train.sh
/usr/local/lib/python3.8/dist-packages/torch/distributed/launch.py:178: FutureWarning: The module torch.distributed.launch is deprecated
and will be removed in future. Use torchrun.
Note that --use_env is set by default in torchrun.
If your script expects `--local_rank` argument to be set, please change it to read from `os.environ['LOCAL_RANK']` instead.
See https://pytorch.org/docs/stable/distributed.html#launch-utility for further instructions

  warnings.warn(
current time 20230220_230705
creating new summary file
argv: ['--local_rank=0', '--logdir=./outputs/dtu_training', '--dataset=dtu_yao', '--batch_size=2', '--epochs=16', '--trainpath=/hy-tmp/dtu_training', '--trainlist=lists/dtu/train.txt', '--testlist=lists/dtu/val.txt', '--numdepth=192', '--ndepths=48,32,16', '--nviews=2', '--wd=0.0001', '--depth_inter_r=4.0,1.0,0.5', '--lrepochs=6,8,12:2', '--dlossw=1.0,1.0,1.0']
################################ args ################################
mode train <class 'str'>
model mvsnet <class 'str'>
device cuda <class 'str'>
dataset dtu_yao <class 'str'>
trainpath /hy-tmp/dtu_training <class 'str'>
testpath /hy-tmp/dtu_training <class 'str'>
trainlist lists/dtu/train.txt <class 'str'>
testlist lists/dtu/val.txt <class 'str'>
epochs 16 <class 'int'>
lr 0.001 <class 'float'>
lrepochs 6,8,12:2 <class 'str'>
wd 0.0001 <class 'float'>
nviews 2 <class 'int'>
batch_size 2 <class 'int'>
numdepth 192 <class 'int'>
interval_scale 1.06 <class 'float'>
loadckpt None <class 'NoneType'>
logdir ./outputs/dtu_training <class 'str'>
resume False <class 'bool'>
summary_freq 10 <class 'int'>
save_freq 1 <class 'int'>
eval_freq 1 <class 'int'>
seed 1 <class 'int'>
pin_m False <class 'bool'>
local_rank 0 <class 'int'>
share_cr False <class 'bool'>
ndepths 48,32,16 <class 'str'>
depth_inter_r 4.0,1.0,0.5 <class 'str'>
dlossw 1.0,1.0,1.0 <class 'str'>
cr_base_chs 8,8,8 <class 'str'>
grad_method detach <class 'str'>
using_apex False <class 'bool'>
sync_bn False <class 'bool'>
opt_level O0 <class 'str'>
keep_batchnorm_fp32 None <class 'NoneType'>
loss_scale None <class 'NoneType'>
########################################################################
**netphs:[48, 32, 16], depth_intervals_ratio:[4.0, 1.0, 0.5], grad:detach, chs:[8, 8, 8]****
start at epoch 0
Number of model parameters: 1148924
Let's use 2 GPUs!
mvsdataset kwargs {}
dataset train metas: 27097
mvsdataset kwargs {}
dataset test metas: 6174
/usr/local/lib/python3.8/dist-packages/torch/functional.py:478: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:2894.)
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
Traceback (most recent call last):
  File "train.py", line 404, in <module>
    train(model, model_loss, optimizer, TrainImgLoader, TestImgLoader, start_epoch, args)
  File "train.py", line 80, in train
    loss, scalar_outputs, image_outputs = train_sample(model, model_loss, optimizer, sample, args)
  File "train.py", line 161, in train_sample
    outputs = model(sample_cuda["imgs"], sample_cuda["proj_matrices"], sample_cuda["depth_values"], advance_search_ratio=[0.25])
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/data_parallel.py", line 168, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/data_parallel.py", line 178, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
    output.reraise()
  File "/usr/local/lib/python3.8/dist-packages/torch/_utils.py", line 461, in reraise
    raise exception
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/hy-tmp/TransMVSNet/models/TransMVSNet.py", line 233, in forward
    outputs_stage = self.DepthNet(
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/hy-tmp/TransMVSNet/models/TransMVSNet.py", line 92, in forward
    cost_reg = cost_regularization(similarity)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/hy-tmp/TransMVSNet/models/module.py", line 453, in forward
    x = conv0 + self.conv11(x)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/hy-tmp/TransMVSNet/models/module.py", line 180, in forward
    y = self.conv(x)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py", line 1102, in forward
    return F.conv_transpose3d(
RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR
You can try to repro this exception using the following code snippet. If that doesn't trigger the error, please include your original repro script when reporting this issue.

import torch
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.allow_tf32 = False
data = torch.randn([1, 8, 16, 512, 640], dtype=torch.float, device='cuda', requires_grad=True)
net = torch.nn.Conv3d(8, 16, kernel_size=[3, 3, 3], padding=[1, 1, 1], stride=[2, 2, 2], dilation=[1, 1, 1], groups=1)
net = net.cuda().float()
out = net(data)
out.backward(torch.randn_like(out))
torch.cuda.synchronize()

ConvolutionParams
    data_type = CUDNN_DATA_FLOAT
    padding = [1, 1, 1]
    stride = [2, 2, 2]
    dilation = [1, 1, 1]
    groups = 1
    deterministic = false
    allow_tf32 = false
input: TensorDescriptor 0x7f85e8fd4670
    type = CUDNN_DATA_FLOAT
    nbDims = 5
    dimA = 1, 8, 16, 512, 640,
    strideA = 41943040, 5242880, 327680, 640, 1,
output: TensorDescriptor 0x7f85e8fd5490
    type = CUDNN_DATA_FLOAT
    nbDims = 5
    dimA = 1, 16, 8, 256, 320,
    strideA = 10485760, 655360, 81920, 320, 1,
weight: FilterDescriptor 0x7f87139fd870
    type = CUDNN_DATA_FLOAT
    tensor_format = CUDNN_TENSOR_NCHW
    nbDims = 5
    dimA = 16, 8, 3, 3, 3,
Pointer addresses:
    input: 0x7f85a8000000
    output: 0x7f85b8e00000
    weight: 0x7f88d0de0400

ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 8936) of binary: /usr/bin/python
Traceback (most recent call last):
  File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launch.py", line 193, in <module>
    main()
  File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launch.py", line 189, in main
    launch(args)
  File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launch.py", line 174, in launch
    run(args)
  File "/usr/local/lib/python3.8/dist-packages/torch/distributed/run.py", line 752, in run
    elastic_launch(
  File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launcher/api.py", line 131, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launcher/api.py", line 245, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

============================================================
train.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2023-02-20_23:07:19
  host      : I10ed7f43820050143b
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 8936)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
```

Do we need to make any changes to the code when training with multiple GPUs?
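For reference, the traceback fails inside `F.conv_transpose3d` within a `DataParallel` replica, so a minimal per-GPU smoke test like the sketch below (hypothetical, not code from the repo) can show whether a 3D transposed convolution works on each card by itself:

```python
# Hypothetical per-GPU smoke test (not part of TransMVSNet): run a small
# ConvTranspose3d forward/backward on every visible GPU to check whether
# the failing op works on each card outside of multi-GPU training.
import torch

for i in range(torch.cuda.device_count()):
    dev = torch.device(f"cuda:{i}")
    net = torch.nn.ConvTranspose3d(16, 8, kernel_size=3, stride=2,
                                   padding=1, output_padding=1).to(dev)
    x = torch.randn(1, 16, 8, 64, 80, device=dev, requires_grad=True)
    y = net(x)
    y.sum().backward()
    torch.cuda.synchronize(dev)
    print(f"cuda:{i}: ConvTranspose3d forward/backward OK, output {tuple(y.shape)}")
```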
DingYikang commented 1 year ago

Hi, for multi-GPU training you only need to change NGPU; no other modification is required. Judging from the error you posted, I suggest checking your CUDA/PyTorch versions and whether other jobs are occupying GPU memory. The default code trains normally on multiple GPUs, so the problem is most likely caused by your machine or environment.
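A minimal sketch of that kind of check (illustrative only, assuming a PyTorch build recent enough to provide `torch.cuda.mem_get_info`):

```python
# Quick environment sanity check (hypothetical snippet, not part of TransMVSNet):
# print the PyTorch / CUDA / cuDNN versions PyTorch was built with and how much
# memory is free on each visible GPU before launching training.
import torch

print("PyTorch:", torch.__version__)
print("CUDA (build):", torch.version.cuda)
print("cuDNN:", torch.backends.cudnn.version())
print("Visible GPUs:", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    free, total = torch.cuda.mem_get_info(i)  # bytes free / total on device i
    print(f"cuda:{i} {torch.cuda.get_device_name(i)}: "
          f"{free / 1024**3:.1f} GiB free of {total / 1024**3:.1f} GiB")
```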

limiao766 commented 1 year ago

How did you manage to train on a single GPU? I only have one RTX 3060. I set NGPU to 1, but I get the following error:

```
/home/lph/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/distributed/launch.py:186: FutureWarning: The module torch.distributed.launch is deprecated
and will be removed in future. Use torchrun.
Note that --use_env is set by default in torchrun.
If your script expects `--local_rank` argument to be set, please change it to read from `os.environ['LOCAL_RANK']` instead.
See https://pytorch.org/docs/stable/distributed.html#launch-utility for further instructions
  FutureWarning,
/home/lph/anaconda3/envs/pytorch/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 8487) of binary: /home/lph/anaconda3/envs/pytorch/bin/python
Traceback (most recent call last):
  File "/home/lph/anaconda3/envs/pytorch/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/lph/anaconda3/envs/pytorch/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/lph/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/distributed/launch.py", line 193, in <module>
    main()
  File "/home/lph/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/distributed/launch.py", line 189, in main
    launch(args)
  File "/home/lph/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/distributed/launch.py", line 174, in launch
    run(args)
  File "/home/lph/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/distributed/run.py", line 755, in run
    )(*cmd_args)
  File "/home/lph/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/lph/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 247, in launch_agent
    failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

============================================================
train.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2023-05-04_09:24:38
  host      : lph-LEGION-REN7000K-26IOB
  rank      : 0 (local_rank: 0)
  exitcode  : 2 (pid: 8487)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
```
wtyuan96 commented 1 year ago

> How did you manage to train on a single GPU? I only have one RTX 3060. I set NGPU to 1, but I get the following error:
>
> ```
> /home/lph/anaconda3/envs/pytorch/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
> ...
> train.py FAILED
> ```

Your error says:

```
/home/lph/anaconda3/envs/pytorch/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
```

Please check that the directory you run the command from matches the location of train.py, i.e. launch the training script from the directory that actually contains train.py.
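A tiny sketch of the same check in Python (hypothetical, assuming the launch command is issued from the TransMVSNet root):

```python
# Hypothetical helper: verify train.py is reachable from the current working
# directory before invoking scripts/train.sh / torch.distributed.launch.
import os

print("cwd:", os.getcwd())
print("train.py found:", os.path.isfile("train.py"))  # should be True from the repo root
```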