Thank you very much, I have solved the issue. However, I have run into another problem with the following:
"pytorch_lightning.utilities.exceptions.MisconfigurationException: ModelCheckpoint(monitor='val_mean_ap_50') could not find the monitored key in the returned metrics: ['train_loss_ce', 'train_loss_mask', 'train_loss_dice', 'train_loss_ce_0', 'train_loss_mask_0', 'train_loss_dice_0', 'train_loss_ce_1', 'train_loss_mask_1', 'train_loss_dice_1', 'train_loss_ce_2', 'train_loss_mask_2', 'train_loss_dice_2', 'train_loss_ce_3', 'train_loss_mask_3', 'train_loss_dice_3', 'train_loss_ce_4', 'train_loss_mask_4', 'train_loss_dice_4', 'train_loss_ce_5', 'train_loss_mask_5', 'train_loss_dice_5', 'train_loss_ce_6', 'train_loss_mask_6', 'train_loss_dice_6', 'train_loss_ce_7', 'train_loss_mask_7', 'train_loss_dice_7', 'train_loss_ce_8', 'train_loss_mask_8', 'train_loss_dice_8', 'train_loss_ce_9', 'train_loss_mask_9', 'train_loss_dice_9', 'train_loss_ce_10', 'train_loss_mask_10', 'train_loss_dice_10', 'train_loss_ce_11', 'train_loss_mask_11', 'train_loss_dice_11', 'train_mean_loss_ce', 'train_mean_loss_mask', 'train_mean_loss_dice', 'epoch', 'step']. HINT: Did you call log('val_mean_ap_50', value) in the LightningModule?
"
Epoch 49: 100%|████| 1513/1513 [1:56:39<00:00, 4.63s/it, loss=43.9, v_num=TION]
Traceback (most recent call last):
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/hydra/_internal/utils.py", line 198, in run_and_report
return func()
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/hydra/_internal/utils.py", line 347, in
lambda: hydra.run(
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/hydra/_internal/hydra.py", line 107, in run
return run_job(
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/hydra/core/utils.py", line 128, in run_job
ret.return_value = task_function(task_cfg)
File "/home/mylabs/Mask3D/main_instance_segmentation.py", line 108, in main
train(cfg)
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/hydra/main.py", line 27, in decorated_main
return task_function(cfg_passthrough)
File "/home/mylabs/Mask3D/main_instance_segmentation.py", line 84, in train
runner.fit(model)
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 696, in fit
self._call_and_handle_interrupt(
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 650, in _call_and_handle_interrupt
return trainer_fn(*args, kwargs)
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 737, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1168, in _run
results = self._run_stage()
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1254, in _run_stage
return self._run_train()
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1285, in _run_train
self.fit_loop.run()
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
self.advance(*args, *kwargs)
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 270, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 201, in run
self.on_advance_end()
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 241, in on_advance_end
self._run_validation()
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 299, in _run_validation
self.val_loop.run()
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 207, in run
output = self.on_run_end()
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py", line 201, in on_run_end
self._on_evaluation_end()
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py", line 265, in _on_evaluation_end
self.trainer._call_callback_hooks(hook_name, args, kwargs)
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1599, in _call_callback_hooks
fn(self, self.lightning_module, *args, **kwargs)
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py", line 319, in on_validation_end
self._save_topk_checkpoint(trainer, monitor_candidates)
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py", line 380, in _save_topk_checkpoint
raise MisconfigurationException(m)
pytorch_lightning.utilities.exceptions.MisconfigurationException: ModelCheckpoint(monitor='val_mean_ap_50') could not find the monitored key in the returned metrics: ['train_loss_ce', 'train_loss_mask', 'train_loss_dice', 'train_loss_ce_0', 'train_loss_mask_0', 'train_loss_dice_0', 'train_loss_ce_1', 'train_loss_mask_1', 'train_loss_dice_1', 'train_loss_ce_2', 'train_loss_mask_2', 'train_loss_dice_2', 'train_loss_ce_3', 'train_loss_mask_3', 'train_loss_dice_3', 'train_loss_ce_4', 'train_loss_mask_4', 'train_loss_dice_4', 'train_loss_ce_5', 'train_loss_mask_5', 'train_loss_dice_5', 'train_loss_ce_6', 'train_loss_mask_6', 'train_loss_dice_6', 'train_loss_ce_7', 'train_loss_mask_7', 'train_loss_dice_7', 'train_loss_ce_8', 'train_loss_mask_8', 'train_loss_dice_8', 'train_loss_ce_9', 'train_loss_mask_9', 'train_loss_dice_9', 'train_loss_ce_10', 'train_loss_mask_10', 'train_loss_dice_10', 'train_loss_ce_11', 'train_loss_mask_11', 'train_loss_dice_11', 'train_mean_loss_ce', 'train_mean_loss_mask', 'train_mean_loss_dice', 'epoch', 'step']. HINT: Did you call log('val_mean_ap_50', value) in the LightningModule?
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/mylabs/Mask3D/main_instance_segmentation.py", line 114, in
main()
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/hydra/main.py", line 32, in decorated_main
_run_hydra(
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/hydra/_internal/utils.py", line 346, in _run_hydra
run_and_report(
File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/hydra/_internal/utils.py", line 267, in run_and_report
print_exception(etype=None, value=ex, tb=final_tb) # type: ignore
TypeError: print_exception() got an unexpected keyword argument 'etype'
wandb: Waiting for W&B process to finish... (failed 1). Press Control-C to abort syncing.
I do not fully understand the problem above. Could you please help me? I would greatly appreciate it.
Hi!
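The codebase monitors val_mean_ap_50 (validation mean AP at 50%) to save the current best checkpoint. This key is missing from your logged metrics, which contain only training losses. You can instead change the monitored key to a metric you do track, e.g. train_loss_ce here (when monitoring a loss, also check that the checkpoint's mode is set to min, since lower is better).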
Thank you very much, I have solved the issue. However, I have run into another problem with the following: "pytorch_lightning.utilities.exceptions.MisconfigurationException: ModelCheckpoint(monitor='val_mean_ap_50') could not find the monitored key in the returned metrics: ['train_loss_ce', 'train_loss_mask', 'train_loss_dice', 'train_loss_ce_0', 'train_loss_mask_0', 'train_loss_dice_0', 'train_loss_ce_1', 'train_loss_mask_1', 'train_loss_dice_1', 'train_loss_ce_2', 'train_loss_mask_2', 'train_loss_dice_2', 'train_loss_ce_3', 'train_loss_mask_3', 'train_loss_dice_3', 'train_loss_ce_4', 'train_loss_mask_4', 'train_loss_dice_4', 'train_loss_ce_5', 'train_loss_mask_5', 'train_loss_dice_5', 'train_loss_ce_6', 'train_loss_mask_6', 'train_loss_dice_6', 'train_loss_ce_7', 'train_loss_mask_7', 'train_loss_dice_7', 'train_loss_ce_8', 'train_loss_mask_8', 'train_loss_dice_8', 'train_loss_ce_9', 'train_loss_mask_9', 'train_loss_dice_9', 'train_loss_ce_10', 'train_loss_mask_10', 'train_loss_dice_10', 'train_loss_ce_11', 'train_loss_mask_11', 'train_loss_dice_11', 'train_mean_loss_ce', 'train_mean_loss_mask', 'train_mean_loss_dice', 'epoch', 'step']. HINT: Did you call log('val_mean_ap_50', value) in the LightningModule?"
Epoch 49: 100%|████| 1513/1513 [1:56:39<00:00, 4.63s/it, loss=43.9, v_num=TION] Traceback (most recent call last): File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/hydra/_internal/utils.py", line 198, in run_and_report return func() File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/hydra/_internal/utils.py", line 347, in lambda: hydra.run( File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/hydra/_internal/hydra.py", line 107, in run return run_job( File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/hydra/core/utils.py", line 128, in run_job ret.return_value = task_function(task_cfg) File "/home/mylabs/Mask3D/main_instance_segmentation.py", line 108, in main train(cfg) File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/hydra/main.py", line 27, in decorated_main return task_function(cfg_passthrough) File "/home/mylabs/Mask3D/main_instance_segmentation.py", line 84, in train runner.fit(model) File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 696, in fit self._call_and_handle_interrupt( File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 650, in _call_and_handle_interrupt return trainer_fn(*args, kwargs) File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 737, in _fit_impl results = self._run(model, ckpt_path=self.ckpt_path) File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1168, in _run results = self._run_stage() File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1254, in _run_stage return self._run_train() File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1285, in _run_train 
self.fit_loop.run() File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 200, in run self.advance(*args, *kwargs) File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 270, in advance self._outputs = self.epoch_loop.run(self._data_fetcher) File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 201, in run self.on_advance_end() File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 241, in on_advance_end self._run_validation() File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 299, in _run_validation self.val_loop.run() File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 207, in run output = self.on_run_end() File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py", line 201, in on_run_end self._on_evaluation_end() File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py", line 265, in _on_evaluation_end self.trainer._call_callback_hooks(hook_name, args, kwargs) File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1599, in _call_callback_hooks fn(self, self.lightning_module, *args, **kwargs) File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py", line 319, in on_validation_end self._save_topk_checkpoint(trainer, monitor_candidates) File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py", line 380, in _save_topk_checkpoint raise MisconfigurationException(m) 
pytorch_lightning.utilities.exceptions.MisconfigurationException: ModelCheckpoint(monitor='val_mean_ap_50') could not find the monitored key in the returned metrics: ['train_loss_ce', 'train_loss_mask', 'train_loss_dice', 'train_loss_ce_0', 'train_loss_mask_0', 'train_loss_dice_0', 'train_loss_ce_1', 'train_loss_mask_1', 'train_loss_dice_1', 'train_loss_ce_2', 'train_loss_mask_2', 'train_loss_dice_2', 'train_loss_ce_3', 'train_loss_mask_3', 'train_loss_dice_3', 'train_loss_ce_4', 'train_loss_mask_4', 'train_loss_dice_4', 'train_loss_ce_5', 'train_loss_mask_5', 'train_loss_dice_5', 'train_loss_ce_6', 'train_loss_mask_6', 'train_loss_dice_6', 'train_loss_ce_7', 'train_loss_mask_7', 'train_loss_dice_7', 'train_loss_ce_8', 'train_loss_mask_8', 'train_loss_dice_8', 'train_loss_ce_9', 'train_loss_mask_9', 'train_loss_dice_9', 'train_loss_ce_10', 'train_loss_mask_10', 'train_loss_dice_10', 'train_loss_ce_11', 'train_loss_mask_11', 'train_loss_dice_11', 'train_mean_loss_ce', 'train_mean_loss_mask', 'train_mean_loss_dice', 'epoch', 'step']. HINT: Did you call log('val_mean_ap_50', value) in the LightningModule?
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "/home/mylabs/Mask3D/main_instance_segmentation.py", line 114, in main() File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/hydra/main.py", line 32, in decorated_main _run_hydra( File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/hydra/_internal/utils.py", line 346, in _run_hydra run_and_report( File "/root/anaconda3/envs/mask3d_cuda113/lib/python3.10/site-packages/hydra/_internal/utils.py", line 267, in run_and_report print_exception(etype=None, value=ex, tb=final_tb) # type: ignore TypeError: print_exception() got an unexpected keyword argument 'etype' wandb: Waiting for W&B process to finish... (failed 1). Press Control-C to abort syncing.
I do not fully understand the problem above. Could you please help me? I would greatly appreciate it.