...
Epoch: 509, total time: 3.209872.
Epoch: 510, total time: 3.206095.
Epoch: 511, total time: 3.207372.
Epoch: 512, total time: 3.192731.
Evaluating with 4 samples.
Traceback (most recent call last):
File "train.py", line 104, in <module>
main()
File "train.py", line 93, in main
trainer.train(cfg,
File "/home/piyush/data/neuralangelo/projects/neuralangelo/trainer.py", line 107, in train
super().train(cfg, data_loader, single_gpu, profile, show_pbar)
File "/home/piyush/data/neuralangelo/projects/nerf/trainers/base.py", line 115, in train
super().train(cfg, data_loader, single_gpu, profile, show_pbar)
File "/home/piyush/data/neuralangelo/imaginaire/trainers/base.py", line 512, in train
self.end_of_iteration(data, current_epoch, current_iteration)
File "/home/piyush/data/neuralangelo/imaginaire/trainers/base.py", line 329, in end_of_iteration
self.checkpointer.save(current_epoch, current_iteration)
File "/home/piyush/data/neuralangelo/imaginaire/trainers/base.py", line 578, in save
save_dict = to_cpu(self._collect_state_dicts())
File "/home/piyush/data/neuralangelo/imaginaire/utils/misc.py", line 120, in to_cpu
return to_device(data, 'cpu')
File "/home/piyush/data/neuralangelo/imaginaire/utils/misc.py", line 98, in to_device
return type(data)({key: to_device(data[key], device) for key in data})
File "/home/piyush/data/neuralangelo/imaginaire/utils/misc.py", line 98, in <dictcomp>
return type(data)({key: to_device(data[key], device) for key in data})
File "/home/piyush/data/neuralangelo/imaginaire/utils/misc.py", line 98, in to_device
return type(data)({key: to_device(data[key], device) for key in data})
File "/home/piyush/data/neuralangelo/imaginaire/utils/misc.py", line 98, in <dictcomp>
return type(data)({key: to_device(data[key], device) for key in data})
File "/home/piyush/data/neuralangelo/imaginaire/utils/misc.py", line 98, in to_device
return type(data)({key: to_device(data[key], device) for key in data})
File "/home/piyush/data/neuralangelo/imaginaire/utils/misc.py", line 98, in <dictcomp>
return type(data)({key: to_device(data[key], device) for key in data})
File "/home/piyush/data/neuralangelo/imaginaire/utils/misc.py", line 98, in to_device
return type(data)({key: to_device(data[key], device) for key in data})
File "/home/piyush/data/neuralangelo/imaginaire/utils/misc.py", line 98, in <dictcomp>
return type(data)({key: to_device(data[key], device) for key in data})
File "/home/piyush/data/neuralangelo/imaginaire/utils/misc.py", line 95, in to_device
data = data.to(device, non_blocking=True)
RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 23073) of binary: /home/piyush/anaconda3/envs/neuralangelo/bin/python
Traceback (most recent call last):
File "/home/piyush/anaconda3/envs/neuralangelo/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==2.0.0.post200', 'console_scripts', 'torchrun')())
File "/home/piyush/anaconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/piyush/anaconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/run.py", line 794, in main
run(args)
File "/home/piyush/anaconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/home/piyush/anaconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/piyush/anaconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
...
Epoch: 509, total time: 3.209872.
Epoch: 510, total time: 3.206095.
Epoch: 511, total time: 3.207372.
Epoch: 512, total time: 3.192731.
Evaluating with 4 samples.
Traceback (most recent call last):
File "train.py", line 104, in <module>
main()
File "train.py", line 93, in main
trainer.train(cfg,
File "/home/piyush/data/neuralangelo/projects/neuralangelo/trainer.py", line 107, in train
super().train(cfg, data_loader, single_gpu, profile, show_pbar)
File "/home/piyush/data/neuralangelo/projects/nerf/trainers/base.py", line 115, in train
super().train(cfg, data_loader, single_gpu, profile, show_pbar)
File "/home/piyush/data/neuralangelo/imaginaire/trainers/base.py", line 512, in train
self.end_of_iteration(data, current_epoch, current_iteration)
File "/home/piyush/data/neuralangelo/imaginaire/trainers/base.py", line 329, in end_of_iteration
self.checkpointer.save(current_epoch, current_iteration)
File "/home/piyush/data/neuralangelo/imaginaire/trainers/base.py", line 578, in save
save_dict = to_cpu(self._collect_state_dicts())
File "/home/piyush/data/neuralangelo/imaginaire/utils/misc.py", line 120, in to_cpu
return to_device(data, 'cpu')
File "/home/piyush/data/neuralangelo/imaginaire/utils/misc.py", line 98, in to_device
return type(data)({key: to_device(data[key], device) for key in data})
File "/home/piyush/data/neuralangelo/imaginaire/utils/misc.py", line 98, in <dictcomp>
return type(data)({key: to_device(data[key], device) for key in data})
File "/home/piyush/data/neuralangelo/imaginaire/utils/misc.py", line 98, in to_device
return type(data)({key: to_device(data[key], device) for key in data})
File "/home/piyush/data/neuralangelo/imaginaire/utils/misc.py", line 98, in <dictcomp>
return type(data)({key: to_device(data[key], device) for key in data})
File "/home/piyush/data/neuralangelo/imaginaire/utils/misc.py", line 98, in to_device
return type(data)({key: to_device(data[key], device) for key in data})
File "/home/piyush/data/neuralangelo/imaginaire/utils/misc.py", line 98, in <dictcomp>
return type(data)({key: to_device(data[key], device) for key in data})
File "/home/piyush/data/neuralangelo/imaginaire/utils/misc.py", line 98, in to_device
return type(data)({key: to_device(data[key], device) for key in data})
File "/home/piyush/data/neuralangelo/imaginaire/utils/misc.py", line 98, in <dictcomp>
return type(data)({key: to_device(data[key], device) for key in data})
File "/home/piyush/data/neuralangelo/imaginaire/utils/misc.py", line 95, in to_device
data = data.to(device, non_blocking=True)
RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 23073) of binary: /home/piyush/anaconda3/envs/neuralangelo/bin/python
Traceback (most recent call last):
File "/home/piyush/anaconda3/envs/neuralangelo/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==2.0.0.post200', 'console_scripts', 'torchrun')())
File "/home/piyush/anaconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/piyush/anaconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/run.py", line 794, in main
run(args)
File "/home/piyush/anaconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/home/piyush/anaconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/piyush/anaconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
train.py FAILED
Failures: