Start training
Traceback (most recent call last):
File "pretrain.py", line 173, in
main(args, config)
File "pretrain.py", line 131, in main
train_stats = train(model, data_loader, optimizer, epoch, device, config)
File "pretrain.py", line 48, in train
for i, (image, caption) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
File "/content/BLIP/utils.py", line 179, in log_every
header, total_time_str, total_time / len(iterable)))
ZeroDivisionError: float division by zero
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 1426) of binary: /usr/bin/python3
Traceback (most recent call last):
File "/usr/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"main", mod_spec)
File "/usr/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.7/dist-packages/torch/distributed/run.py", line 765, in
main()
File "/usr/local/lib/python3.7/dist-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 345, in wrapper
return f(*args, *kwargs)
File "/usr/local/lib/python3.7/dist-packages/torch/distributed/run.py", line 761, in main
run(args)
File "/usr/local/lib/python3.7/dist-packages/torch/distributed/run.py", line 755, in run
)(cmd_args)
File "/usr/local/lib/python3.7/dist-packages/torch/distributed/launcher/api.py", line 131, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/usr/local/lib/python3.7/dist-packages/torch/distributed/launcher/api.py", line 247, in launch_agent
failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
pretrain.py FAILED
Failures:
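
For context, the ZeroDivisionError comes from the summary line at the end of BLIP's MetricLogger.log_every in utils.py, which divides the total elapsed time by len(iterable). That division can only fail when the data loader has length zero, i.e. the pretraining dataset produced no batches (for example, if the image paths listed in the pretraining JSON do not resolve on disk). Below is a minimal sketch of that failure mode; the log_every here is a simplified stand-in for the real method (per-iteration logging omitted), not BLIP's actual implementation.

import time
import datetime

def log_every(iterable, print_freq, header=''):
    """Simplified stand-in for BLIP's utils.MetricLogger.log_every:
    per-iteration logging is omitted, only the final summary print is kept."""
    start_time = time.time()
    for i, obj in enumerate(iterable):
        yield obj
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    # With an empty iterable, len(iterable) == 0 and this division raises
    # ZeroDivisionError, which propagates to the training loop consuming the
    # generator -- the same failure shown in the child traceback above.
    print('{} Total time: {} ({:.4f} s / it)'.format(
        header, total_time_str, total_time / len(iterable)))

# Hypothetical reproduction: a data loader with zero batches, e.g. because the
# pretraining JSON points at image files that do not exist.
empty_loader = []
try:
    for image, caption in log_every(empty_loader, print_freq=50, header='Train Epoch: [0]'):
        pass  # never reached: the loader yields nothing
except ZeroDivisionError as e:
    print('Reproduced:', e)  # -> Reproduced: float division by zero

Running the sketch prints "Reproduced: float division by zero", matching the error above; checking that len(data_loader) > 0 before training (or fixing the dataset/image paths so the dataset is non-empty) avoids it.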