Open boxbox2 opened 4 months ago
Whenever I run main.py, it reports the following error.
FileNotFoundError: [Errno 2] No such file or directory: '/workspace/anomalydiffusion/logs/anomaly-checkpoints/configs/2024-03-15T10-01-56-project.yaml'
def on_pretrain_routine_start(self, trainer, pl_module): if trainer.global_rank == 0: # Create logdirs and save configs os.makedirs(self.logdir, exist_ok=True) os.makedirs(self.ckptdir, exist_ok=True) os.makedirs(self.cfgdir, exist_ok=True) if "callbacks" in self.lightning_config: if 'metrics_over_trainsteps_checkpoint' in self.lightning_config['callbacks']: os.makedirs(os.path.join(self.ckptdir, 'trainstep_checkpoints'), exist_ok=True) print("Project config") print(OmegaConf.to_yaml(self.config)) OmegaConf.save(self.config, os.path.join(self.cfgdir, "{}-project.yaml".format(self.now))) print("Lightning config") print(OmegaConf.to_yaml(self.lightning_config)) OmegaConf.save(OmegaConf.create({"lightning": self.lightning_config}), os.path.join(self.cfgdir, "{}-lightning.yaml".format(self.now))) else: # ModelCheckpoint callback created log directory --- remove it if not self.resume and os.path.exists(self.logdir): dst, name = os.path.split(self.logdir) dst = os.path.join(dst, "child_runs", name) os.makedirs(os.path.split(dst)[0], exist_ok=True) try: os.rename(self.logdir, dst) except FileNotFoundError: pass
When I manually create anomaly-checkpoints/configs/, it then reports:
OSError: [Errno 39] Directory not empty: 'logs/anomaly-checkpoints' -> 'logs/child_runs/anomaly-checkpoints'
Could you help me?
Where is 2024-03-15T10-01-56-project.yaml from?
Whenever I run main.py, it reports the following error.
FileNotFoundError: [Errno 2] No such file or directory: '/workspace/anomalydiffusion/logs/anomaly-checkpoints/configs/2024-03-15T10-01-56-project.yaml'
def on_pretrain_routine_start(self, trainer, pl_module): if trainer.global_rank == 0: # Create logdirs and save configs os.makedirs(self.logdir, exist_ok=True) os.makedirs(self.ckptdir, exist_ok=True) os.makedirs(self.cfgdir, exist_ok=True) if "callbacks" in self.lightning_config: if 'metrics_over_trainsteps_checkpoint' in self.lightning_config['callbacks']: os.makedirs(os.path.join(self.ckptdir, 'trainstep_checkpoints'), exist_ok=True) print("Project config") print(OmegaConf.to_yaml(self.config)) OmegaConf.save(self.config, os.path.join(self.cfgdir, "{}-project.yaml".format(self.now))) print("Lightning config") print(OmegaConf.to_yaml(self.lightning_config)) OmegaConf.save(OmegaConf.create({"lightning": self.lightning_config}), os.path.join(self.cfgdir, "{}-lightning.yaml".format(self.now))) else: # ModelCheckpoint callback created log directory --- remove it if not self.resume and os.path.exists(self.logdir): dst, name = os.path.split(self.logdir) dst = os.path.join(dst, "child_runs", name) os.makedirs(os.path.split(dst)[0], exist_ok=True) try: os.rename(self.logdir, dst) except FileNotFoundError: pass
When I manually create anomaly-checkpoints/configs/, it then reports:
OSError: [Errno 39] Directory not empty: 'logs/anomaly-checkpoints' -> 'logs/child_runs/anomaly-checkpoints'
Could you help me?
Where is 2024-03-15T10-01-56-project.yaml from?
In my opinion, it comes from this function:
OmegaConf.save(self.config,
os.path.join(self.cfgdir, "{}-project.yaml".format(self.now)))
print("Lightning config")
print(OmegaConf.to_yaml(self.lightning_config))
OmegaConf.save(OmegaConf.create({"lightning": self.lightning_config}),
os.path.join(self.cfgdir, "{}-lightning.yaml".format(self.now)))
It creates this file (2024-03-15T10-01-56-project.yaml) and stores it in logs/anomaly-checkpoints/configs the first time it runs, and moves the previously saved yaml files to the child_runs folder the second time this function runs. Is my understanding correct? Looking forward to hearing from you.
Could you show the error log when encountering 'FileNotFoundError: [Errno 2] No such file or directory: '/workspace/anomalydiffusion/logs/anomaly-checkpoints/configs/2024-03-15T10-01-56-project.yaml''
Could you show the error log when encountering 'FileNotFoundError: [Errno 2] No such file or directory: '/workspace/anomalydiffusion/logs/anomaly-checkpoints/configs/2024-03-15T10-01-56-project.yaml''
Hi, thanks for your reply. The error log is:
Traceback (most recent call last):
File "main.py", line 868, in <module>
trainer.fit(model, data)
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 735, in fit
self._call_and_handle_interrupt(
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 695, in _call_and_handle_interrupt
self.training_type_plugin.reconciliate_processes(traceback.format_exc())
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/ddp.py", line 535, in reconciliate_processes
raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}")
pytorch_lightning.utilities.exceptions.DeadlockDetectedException: DeadLock detected from rank: 0
Traceback (most recent call last):
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 682, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 770, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1193, in _run
self._dispatch()
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1272, in _dispatch
self.training_type_plugin.start_training(self)
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 202, in start_training
self._results = trainer.run_stage()
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1282, in run_stage
return self._run_train()
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1299, in _run_train
self._pre_training_routine()
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1294, in _pre_training_routine
self.call_hook("on_pretrain_routine_start")
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1477, in call_hook
callback_fx(*args, **kwargs)
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/callback_hook.py", line 148, in on_pretrain_routine_start
callback.on_pretrain_routine_start(self, self.lightning_module)
File "/workspace/anomalydiffusion/main.py", line 380, in on_pretrain_routine_start
OmegaConf.save(self.config,
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/omegaconf/omegaconf.py", line 216, in save
with io.open(os.path.abspath(f), "w", encoding="utf-8") as file:
FileNotFoundError: [Errno 2] No such file or directory: '/workspace/anomalydiffusion/logs/anomaly-checkpoints/configs/2024-03-18T07-11-01-project.yaml'
Do you run main.py in '/anomalydiffusion'? I see '/workspace' in your error log. Maybe you should switch the directory.
Do you run main.py in '/anomalydiffusion'? I see '/workspace' in your error log. Maybe you should switch the directory.
Dear author, I changed my project path to /root. Now it reports this error:
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/anomalydiffusion/main.py", line 866, in <module>
trainer.fit(model, data)
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 735, in fit
self._call_and_handle_interrupt(
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 695, in _call_and_handle_interrupt
self.training_type_plugin.reconciliate_processes(traceback.format_exc())
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/ddp.py", line 535, in reconciliate_processes
raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}")
pytorch_lightning.utilities.exceptions.DeadlockDetectedException: DeadLock detected from rank: 2
Traceback (most recent call last):
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 682, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 770, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1193, in _run
self._dispatch()
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1272, in _dispatch
self.training_type_plugin.start_training(self)
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 202, in start_training
self._results = trainer.run_stage()
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1282, in run_stage
return self._run_train()
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1299, in _run_train
self._pre_training_routine()
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1294, in _pre_training_routine
self.call_hook("on_pretrain_routine_start")
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1477, in call_hook
callback_fx(*args, **kwargs)
File "/opt/conda/envs/Anomalydiffusion/lib/python3.8/site-packages/pytorch_lightning/trainer/callback_hook.py", line 148, in on_pretrain_routine_start
callback.on_pretrain_routine_start(self, self.lightning_module)
File "/anomalydiffusion/main.py", line 394, in on_pretrain_routine_start
os.rename(self.logdir, dst)
OSError: [Errno 39] Directory not empty: 'logs/anomaly-checkpoints' -> 'logs/child_runs/anomaly-checkpoints'
删除logs即可,重新开始训练,但是我再使用两张卡进行训练的时候,会遇到以下问题: warning_cache.warn( Epoch 0: 4%|███▊ | 49/1166 [00:44<16:47, 1.11it/s, loss=0.00905, v_num=0, train/loss_simple_step=0.0082, train/loss_vlb_step=3.72e-5, train/loss_step=0.0082, global_step=48.00]Exception in thread Thread-1: Traceback (most recent call last): File "/usr/local/python38/lib/python3.8/threading.py", line 932, in _bootstrap_inner self.run() File "/usr/local/python38/lib/python3.8/site-packages/tensorboard/summary/writer/event_file_writer.py", line 233, in run self._record_writer.write(data) File "/usr/local/python38/lib/python3.8/site-packages/tensorboard/summary/writer/record_writer.py", line 40, in write self._writer.write(header + header_crc + data + footer_crc) File "/usr/local/python38/lib/python3.8/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 766, in write self.fs.append(self.filename, file_content, self.binary_mode) File "/usr/local/python38/lib/python3.8/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 160, in append self._write(filename, file_content, "ab" if binary_mode else "a") File "/usr/local/python38/lib/python3.8/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 164, in _write with io.open(filename, mode, encoding=encoding) as f: FileNotFoundError: [Errno 2] No such file or directory: b'logs/anomaly-checkpoints/testtube/version_0/tf/events.out.tfevents.1711004184.ts-2c336b4eb29c4382a74300083a490f8c-launcher.52988.0' Epoch 0: 4%|███▊ | 50/1166 [00:44<16:40, 1.12it/s, loss=0.00838, v_num=0, train/loss_simple_step=0.000813, train/loss_vlb_step=3.18e-6, train/loss_step=0.000813, global_step=49.00]Summoning checkpoint. ...... FileNotFoundError: [Errno 2] No such file or directory: 'logs/anomaly-checkpoints/testtube/version_0/meta.experiment.tmp' 我查看了logs目录,发现这个文件的实际路径为'logs/child_run/anomaly-checkpoints/testtube/version_0/meta.experiment.tmp' 应该是代码哪个位置读文件写错了?还是什么其他问题,请问这个代码可以多卡跑吗?
删除logs即可,重新开始训练,但是我再使用两张卡进行训练的时候,会遇到以下问题: warning_cache.warn( Epoch 0: 4%|███▊ | 49/1166 [00:44<16:47, 1.11it/s, loss=0.00905, v_num=0, train/loss_simple_step=0.0082, train/loss_vlb_step=3.72e-5, train/loss_step=0.0082, global_step=48.00]Exception in thread Thread-1: Traceback (most recent call last): File "/usr/local/python38/lib/python3.8/threading.py", line 932, in _bootstrap_inner self.run() File "/usr/local/python38/lib/python3.8/site-packages/tensorboard/summary/writer/event_file_writer.py", line 233, in run self._record_writer.write(data) File "/usr/local/python38/lib/python3.8/site-packages/tensorboard/summary/writer/record_writer.py", line 40, in write self._writer.write(header + header_crc + data + footer_crc) File "/usr/local/python38/lib/python3.8/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 766, in write self.fs.append(self.filename, file_content, self.binary_mode) File "/usr/local/python38/lib/python3.8/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 160, in append self._write(filename, file_content, "ab" if binary_mode else "a") File "/usr/local/python38/lib/python3.8/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 164, in _write with io.open(filename, mode, encoding=encoding) as f: FileNotFoundError: [Errno 2] No such file or directory: b'logs/anomaly-checkpoints/testtube/version_0/tf/events.out.tfevents.1711004184.ts-2c336b4eb29c4382a74300083a490f8c-launcher.52988.0' Epoch 0: 4%|███▊ | 50/1166 [00:44<16:40, 1.12it/s, loss=0.00838, v_num=0, train/loss_simple_step=0.000813, train/loss_vlb_step=3.18e-6, train/loss_step=0.000813, global_step=49.00]Summoning checkpoint. ...... FileNotFoundError: [Errno 2] No such file or directory: 'logs/anomaly-checkpoints/testtube/version_0/meta.experiment.tmp' 我查看了logs目录,发现这个文件的实际路径为'logs/child_run/anomaly-checkpoints/testtube/version_0/meta.experiment.tmp' 应该是代码哪个位置读文件写错了?还是什么其他问题,请问这个代码可以多卡跑吗?
There may be a problem when running on multiple GPUs. It is suggested to run on a single GPU
删除logs即可,重新开始训练,但是我再使用两张卡进行训练的时候,会遇到以下问题: warning_cache.warn( Epoch 0: 4%|███▊ | 49/1166 [00:44<16:47, 1.11it/s, loss=0.00905, v_num=0, train/loss_simple_step=0.0082, train/loss_vlb_step=3.72e-5, train/loss_step=0.0082, global_step=48.00]Exception in thread Thread-1: Traceback (most recent call last): File "/usr/local/python38/lib/python3.8/threading.py", line 932, in _bootstrap_inner self.run() File "/usr/local/python38/lib/python3.8/site-packages/tensorboard/summary/writer/event_file_writer.py", line 233, in run self._record_writer.write(data) File "/usr/local/python38/lib/python3.8/site-packages/tensorboard/summary/writer/record_writer.py", line 40, in write self._writer.write(header + header_crc + data + footer_crc) File "/usr/local/python38/lib/python3.8/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 766, in write self.fs.append(self.filename, file_content, self.binary_mode) File "/usr/local/python38/lib/python3.8/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 160, in append self._write(filename, file_content, "ab" if binary_mode else "a") File "/usr/local/python38/lib/python3.8/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 164, in _write with io.open(filename, mode, encoding=encoding) as f: FileNotFoundError: [Errno 2] No such file or directory: b'logs/anomaly-checkpoints/testtube/version_0/tf/events.out.tfevents.1711004184.ts-2c336b4eb29c4382a74300083a490f8c-launcher.52988.0' Epoch 0: 4%|███▊ | 50/1166 [00:44<16:40, 1.12it/s, loss=0.00838, v_num=0, train/loss_simple_step=0.000813, train/loss_vlb_step=3.18e-6, train/loss_step=0.000813, global_step=49.00]Summoning checkpoint. ...... FileNotFoundError: [Errno 2] No such file or directory: 'logs/anomaly-checkpoints/testtube/version_0/meta.experiment.tmp' 我查看了logs目录,发现这个文件的实际路径为'logs/child_run/anomaly-checkpoints/testtube/version_0/meta.experiment.tmp' 应该是代码哪个位置读文件写错了?还是什么其他问题,请问这个代码可以多卡跑吗?
你好,你的问题解决了嘛?能使用多卡运行吗?我单卡16G在batch_size:2,nums_works:0时还是会爆内存
Whenever I run main.py, it reports the following error.
When I manually create anomaly-checkpoints/configs/, it then reports:
Could you help me?