When I try to run ./tools/pixpro_base_r50_100ep.sh, it crashes with the following error messages. I was wondering if anyone knows how to fix it. Thanks.
File "main_pretrain.py", line 241, in <module>
return forward_call(*input, **kwargs)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 1040, in forward
main(opt)
File "main_pretrain.py", line 141, in main
train(epoch, train_loader, model, optimizer, scheduler, args, summary_writer)
File "main_pretrain.py", line 165, in train
loss = model(data[0], data[1], data[2], data[3])
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 1000, in _run_ddp_forward
return module_to_run(*inputs[0], **kwargs[0])
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 1040, in forward
return forward_call(*input, **kwargs)
File "/home/test/Desktop/PixPro/contrast/models/PixPro.py", line 226, in forward
feat_2 = self.encoder(im_2)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 1000, in _run_ddp_forward
return forward_call(*input, **kwargs)
File "/home/test/Desktop/PixPro/contrast/resnet.py", line 200, in forward
x = self.conv1(x)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return module_to_run(*inputs[0], **kwargs[0])
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 463, in forward
return forward_call(*input, **kwargs)
File "/home/test/Desktop/PixPro/contrast/models/PixPro.py", line 226, in forward
feat_2 = self.encoder(im_2)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return self._conv_forward(input, self.weight, self.bias)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 460, in _conv_forward
return forward_call(*input, **kwargs)
File "/home/test/Desktop/PixPro/contrast/resnet.py", line 200, in forward
self.padding, self.dilation, self.groups)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/apex/amp/wrap.py", line 21, in wrapper
args[i] = utils.cached_cast(cast_fn, args[i], handle.cache)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/apex/amp/utils.py", line 97, in cached_cast
x = self.conv1(x)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
if cached_x.grad_fn.next_functions[1][0].variable is not x:
IndexError: tuple index out of range
return forward_call(*input, **kwargs)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 463, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 460, in _conv_forward
self.padding, self.dilation, self.groups)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/apex/amp/wrap.py", line 21, in wrapper
args[i] = utils.cached_cast(cast_fn, args[i], handle.cache)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/apex/amp/utils.py", line 97, in cached_cast
if cached_x.grad_fn.next_functions[1][0].variable is not x:
IndexError: tuple index out of range
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2554351) of binary: /home/test/.conda/envs/test/bin/python
Traceback (most recent call last):
File "/home/test/.conda/envs/test/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/home/test/.conda/envs/test/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/distributed/launch.py", line 195, in <module>
main()
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/distributed/launch.py", line 191, in main
launch(args)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/distributed/launch.py", line 176, in launch
run(args)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/distributed/run.py", line 756, in run
)(*cmd_args)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 248, in launch_agent
failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
Hello,
When I try to run ./tools/pixpro_base_r50_100ep.sh, it crashes with the following error messages. I was wondering if anyone knows how to fix it. Thanks.
File "main_pretrain.py", line 241, in <module>
return forward_call(*input, **kwargs)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 1040, in forward
main(opt)
File "main_pretrain.py", line 141, in main
train(epoch, train_loader, model, optimizer, scheduler, args, summary_writer)
File "main_pretrain.py", line 165, in train
loss = model(data[0], data[1], data[2], data[3])
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 1000, in _run_ddp_forward
return module_to_run(*inputs[0], **kwargs[0])
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 1040, in forward
return forward_call(*input, **kwargs)
File "/home/test/Desktop/PixPro/contrast/models/PixPro.py", line 226, in forward
feat_2 = self.encoder(im_2)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 1000, in _run_ddp_forward
return forward_call(*input, **kwargs)
File "/home/test/Desktop/PixPro/contrast/resnet.py", line 200, in forward
x = self.conv1(x)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return module_to_run(*inputs[0], **kwargs[0])
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 463, in forward
return forward_call(*input, **kwargs)
File "/home/test/Desktop/PixPro/contrast/models/PixPro.py", line 226, in forward
feat_2 = self.encoder(im_2)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return self._conv_forward(input, self.weight, self.bias)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 460, in _conv_forward
return forward_call(*input, **kwargs)
File "/home/test/Desktop/PixPro/contrast/resnet.py", line 200, in forward
self.padding, self.dilation, self.groups)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/apex/amp/wrap.py", line 21, in wrapper
args[i] = utils.cached_cast(cast_fn, args[i], handle.cache)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/apex/amp/utils.py", line 97, in cached_cast
x = self.conv1(x)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
if cached_x.grad_fn.next_functions[1][0].variable is not x:
IndexError: tuple index out of range
return forward_call(*input, **kwargs)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 463, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 460, in _conv_forward
self.padding, self.dilation, self.groups)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/apex/amp/wrap.py", line 21, in wrapper
args[i] = utils.cached_cast(cast_fn, args[i], handle.cache)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/apex/amp/utils.py", line 97, in cached_cast
if cached_x.grad_fn.next_functions[1][0].variable is not x:
IndexError: tuple index out of range
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2554351) of binary: /home/test/.conda/envs/test/bin/python
Traceback (most recent call last):
File "/home/test/.conda/envs/test/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/home/test/.conda/envs/test/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/distributed/launch.py", line 195, in <module>
main()
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/distributed/launch.py", line 191, in main
launch(args)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/distributed/launch.py", line 176, in launch
run(args)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/distributed/run.py", line 756, in run
)(*cmd_args)
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/test/.conda/envs/test/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 248, in launch_agent
failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
main_pretrain.py FAILED