Could someone help me look into the error I have encountered? Here is the full error output.
Training with 1 GPUs.
Using random seed 0
Make folder logs/example_group/example_name
wandb_scalar_iter: 100
cudnn benchmark: True
cudnn deterministic: False
Setup trainer.
Using random seed 0
Traceback (most recent call last):
File "train.py", line 104, in <module>
main()
File "train.py", line 79, in main
trainer = get_trainer(cfg, is_inference=False, seed=args.seed)
File "/home/intel/neuralangelo/imaginaire/trainers/utils/get_trainer.py", line 32, in get_trainer
trainer = trainer_lib.Trainer(cfg, is_inference=is_inference, seed=seed)
File "/home/intel/neuralangelo/projects/neuralangelo/trainer.py", line 26, in __init__
super().__init__(cfg, is_inference=is_inference, seed=seed)
File "/home/intel/neuralangelo/projects/nerf/trainers/base.py", line 28, in __init__
super().__init__(cfg, is_inference=is_inference, seed=seed)
File "/home/intel/neuralangelo/imaginaire/trainers/base.py", line 50, in __init__
self.model = self.setup_model(cfg, seed=seed)
File "/home/intel/neuralangelo/imaginaire/trainers/base.py", line 116, in setup_model
lib_model = importlib.import_module(cfg.model.type)
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/importlib/__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1014, in _gcd_import
File "<frozen importlib._bootstrap>", line 991, in _find_and_load
File "<frozen importlib._bootstrap>", line 975, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 671, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 843, in exec_module
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "/home/intel/neuralangelo/projects/neuralangelo/model.py", line 21, in <module>
from projects.neuralangelo.utils.modules import NeuralSDF, NeuralRGB, BackgroundNeRF
File "/home/intel/neuralangelo/projects/neuralangelo/utils/modules.py", line 16, in <module>
import tinycudann as tcnn
ModuleNotFoundError: No module named 'tinycudann'
[2024-03-25 20:11:48,544] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 37238) of binary: /home/intel/miniconda3/envs/neuralangelo/bin/python
Traceback (most recent call last):
File "/home/intel/miniconda3/envs/neuralangelo/bin/torchrun", line 10, in <module>
sys.exit(main())
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
Could someone help me look into the error I have encountered? Here is the full error output.
Training with 1 GPUs.
Using random seed 0
Make folder logs/example_group/example_name
wandb_scalar_iter: 100
cudnn benchmark: True
cudnn deterministic: False
Setup trainer.
Using random seed 0
Traceback (most recent call last):
File "train.py", line 104, in <module>
main()
File "train.py", line 79, in main
trainer = get_trainer(cfg, is_inference=False, seed=args.seed)
File "/home/intel/neuralangelo/imaginaire/trainers/utils/get_trainer.py", line 32, in get_trainer
trainer = trainer_lib.Trainer(cfg, is_inference=is_inference, seed=seed)
File "/home/intel/neuralangelo/projects/neuralangelo/trainer.py", line 26, in __init__
super().__init__(cfg, is_inference=is_inference, seed=seed)
File "/home/intel/neuralangelo/projects/nerf/trainers/base.py", line 28, in __init__
super().__init__(cfg, is_inference=is_inference, seed=seed)
File "/home/intel/neuralangelo/imaginaire/trainers/base.py", line 50, in __init__
self.model = self.setup_model(cfg, seed=seed)
File "/home/intel/neuralangelo/imaginaire/trainers/base.py", line 116, in setup_model
lib_model = importlib.import_module(cfg.model.type)
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/importlib/__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1014, in _gcd_import
File "<frozen importlib._bootstrap>", line 991, in _find_and_load
File "<frozen importlib._bootstrap>", line 975, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 671, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 843, in exec_module
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "/home/intel/neuralangelo/projects/neuralangelo/model.py", line 21, in <module>
from projects.neuralangelo.utils.modules import NeuralSDF, NeuralRGB, BackgroundNeRF
File "/home/intel/neuralangelo/projects/neuralangelo/utils/modules.py", line 16, in <module>
import tinycudann as tcnn
ModuleNotFoundError: No module named 'tinycudann'
[2024-03-25 20:11:48,544] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 37238) of binary: /home/intel/miniconda3/envs/neuralangelo/bin/python
Traceback (most recent call last):
File "/home/intel/miniconda3/envs/neuralangelo/bin/torchrun", line 10, in <module>
sys.exit(main())
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/intel/miniconda3/envs/neuralangelo/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
train.py FAILED
Failures: