Uploading non-builtin objects in the config fails when using the Aim Logger with PyTorch Lightning and logging to an Aim server.
However, when logging locally, the objects are stringified appropriately.
...
Traceback (most recent call last):
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 36, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 90, in launch
return function(*args, **kwargs)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 645, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1086, in _run
self._log_hyperparams()
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1154, in _log_hyperparams
logger.log_hyperparams(hparams_initial)
File "/.venv/lib/python3.8/site-packages/lightning_utilities/core/rank_zero.py", line 24, in wrapped_fn
return fn(*args, **kwargs)
File "/.venv/lib/python3.8/site-packages/aim/sdk/adapters/pytorch_lightning.py", line 105, in log_hyperparams
self.experiment.set(('hparams', key), value, strict=False)
File "/.venv/lib/python3.8/site-packages/aim/sdk/run.py", line 401, in set
self.meta_run_attrs_tree.set(key, val, strict)
File "aim/storage/treeview.py", line 75, in aim.storage.treeview.TreeView.set
File "/.venv/lib/python3.8/site-packages/aim/storage/treeviewproxy.py", line 210, in __setitem__
self.tree[self.absolute_path(path)] = value
File "/.venv/lib/python3.8/site-packages/aim/storage/treeviewproxy.py", line 90, in __setitem__
self._rpc_client.run_instruction(self._hash, self._handler, '__setitem__', (path, value), is_write_only=True)
File "/.venv/lib/python3.8/site-packages/aim/ext/transport/client.py", line 241, in run_instruction
self._run_write_instructions, list(encode_tree([(resource, method, args)])))
File "aim/storage/treeutils.pyx", line 179, in encode_paths_vals
File "aim/storage/treeutils.pyx", line 44, in unfold_tree
File "aim/storage/treeutils.pyx", line 44, in unfold_tree
File "aim/storage/treeutils.pyx", line 44, in unfold_tree
File "aim/storage/treeutils.pyx", line 60, in unfold_tree
File "/.venv/lib/python3.8/site-packages/aim/storage/treeutils_non_native.py", line 31, in convert_to_native_object
raise TypeError(f'Unhandled non-native value `{obj}` of type `{type(obj)}`.')
TypeError: Unhandled non-native value `Compose([
LongestMaxSize(always_apply=False, p=1, max_size=2000, interpolation=1),
PadIfNeeded(always_apply=False, p=1.0, min_height=2000, min_width=2000, pad_height_divisor=None, pad_width_divisor=None, border_mode=0, value=(255, 255, 255), mask_value=0),
Resize(always_apply=False, p=1, height=2000, width=2000, interpolation=1),
HorizontalFlip(always_apply=False, p=0.5),
ToTensorV2(always_apply=True, p=1.0, transpose_mask=True),
], p=1.0, bbox_params={'format': 'pascal_voc', 'label_fields': ['class_labels'], 'min_area': 0.0, 'min_visibility': 0.0, 'check_each_transform': True}, keypoint_params=None, additional_targets={})` of type `<class 'albumentations.core.composition.Compose'>`.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "train.py", line 218, in <module>
main(args)
File "train.py", line 187, in main
trainer.fit(model, datamodule=wsi_datamodule)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 603, in fit
call._call_and_handle_interrupt(
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 59, in _call_and_handle_interrupt
trainer.strategy.reconciliate_processes(traceback.format_exc())
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 461, in reconciliate_processes
raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}")
pytorch_lightning.utilities.exceptions.DeadlockDetectedException: DeadLock detected from rank: 0
Traceback (most recent call last):
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 36, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 90, in launch
return function(*args, **kwargs)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 645, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1086, in _run
self._log_hyperparams()
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1154, in _log_hyperparams
logger.log_hyperparams(hparams_initial)
File "/.venv/lib/python3.8/site-packages/lightning_utilities/core/rank_zero.py", line 24, in wrapped_fn
return fn(*args, **kwargs)
File "/.venv/lib/python3.8/site-packages/aim/sdk/adapters/pytorch_lightning.py", line 105, in log_hyperparams
self.experiment.set(('hparams', key), value, strict=False)
File "/.venv/lib/python3.8/site-packages/aim/sdk/run.py", line 401, in set
self.meta_run_attrs_tree.set(key, val, strict)
File "aim/storage/treeview.py", line 75, in aim.storage.treeview.TreeView.set
File "/.venv/lib/python3.8/site-packages/aim/storage/treeviewproxy.py", line 210, in __setitem__
self.tree[self.absolute_path(path)] = value
File "/.venv/lib/python3.8/site-packages/aim/storage/treeviewproxy.py", line 90, in __setitem__
self._rpc_client.run_instruction(self._hash, self._handler, '__setitem__', (path, value), is_write_only=True)
File "/.venv/lib/python3.8/site-packages/aim/ext/transport/client.py", line 241, in run_instruction
self._run_write_instructions, list(encode_tree([(resource, method, args)])))
File "aim/storage/treeutils.pyx", line 179, in encode_paths_vals
File "aim/storage/treeutils.pyx", line 44, in unfold_tree
File "aim/storage/treeutils.pyx", line 44, in unfold_tree
File "aim/storage/treeutils.pyx", line 44, in unfold_tree
File "aim/storage/treeutils.pyx", line 60, in unfold_tree
File "/.venv/lib/python3.8/site-packages/aim/storage/treeutils_non_native.py", line 31, in convert_to_native_object
raise TypeError(f'Unhandled non-native value `{obj}` of type `{type(obj)}`.')
TypeError: Unhandled non-native value `Compose([
LongestMaxSize(always_apply=False, p=1, max_size=2000, interpolation=1),
PadIfNeeded(always_apply=False, p=1.0, min_height=2000, min_width=2000, pad_height_divisor=None, pad_width_divisor=None, border_mode=0, value=(255, 255, 255), mask_value=0),
Resize(always_apply=False, p=1, height=2000, width=2000, interpolation=1),
HorizontalFlip(always_apply=False, p=0.5),
ToTensorV2(always_apply=True, p=1.0, transpose_mask=True),
], p=1.0, bbox_params={'format': 'pascal_voc', 'label_fields': ['class_labels'], 'min_area': 0.0, 'min_visibility': 0.0, 'check_each_transform': True}, keypoint_params=None, additional_targets={})` of type `<class 'albumentations.core.composition.Compose'>`.
However, when we remove the datamodule hyperparameters from the config logging, the Trainer complains that it cannot reach the collate_fn from the datamodule.
...
/.venv/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py:438: UserWarning: Error handling mechanism for deadlock detection is uninitialized. Skipping check.
rank_zero_warn("Error handling mechanism for deadlock detection is uninitialized. Skipping check.")
Traceback (most recent call last):
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/utilities/parsing.py", line 289, in __getattr__
return self[key]
KeyError: 'collate_fn'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/train.py", line 206, in <module>
main(args)
File "/train.py", line 174, in main
Traceback (most recent call last):
trainer.fit(model, datamodule=wsi_datamodule)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/utilities/parsing.py", line 289, in __getattr__
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 603, in fit
call._call_and_handle_interrupt(
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 38, in _call_and_handle_interrupt
return self[key]
KeyError: 'collate_fn'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/train.py", line 206, in <module>
return trainer_fn(*args, **kwargs)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 645, in _fit_impl
Traceback (most recent call last):
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/utilities/parsing.py", line 289, in __getattr__
main(args)
File "/train.py", line 174, in main
self._run(model, ckpt_path=self.ckpt_path)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1037, in _run
trainer.fit(model, datamodule=wsi_datamodule)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 603, in fit
return self[key]
KeyError: 'collate_fn'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "train.py", line 206, in <module>
Traceback (most recent call last):
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/utilities/parsing.py", line 289, in __getattr__
self._call_setup_hook() # allow user to setup lightning_module in accelerator environment
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1284, in _call_setup_hook
call._call_and_handle_interrupt(
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 38, in _call_and_handle_interrupt
main(args)
File "train.py", line 174, in main
return self[key]
KeyError: 'collate_fn'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/train.py", line 206, in <module>
return trainer_fn(*args, **kwargs)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 645, in _fit_impl
self._call_lightning_datamodule_hook("setup", stage=fn)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1361, in _call_lightning_datamodule_hook
trainer.fit(model, datamodule=wsi_datamodule)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 603, in fit
main(args)
File "/train.py", line 174, in main
self._run(model, ckpt_path=self.ckpt_path)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1037, in _run
call._call_and_handle_interrupt(
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 36, in _call_and_handle_interrupt
return fn(*args, **kwargs)
File "/pl_model/dataset.py", line 228, in setup
trainer.fit(model, datamodule=wsi_datamodule)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 603, in fit
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 90, in launch
self._call_setup_hook() # allow user to setup lightning_module in accelerator environment
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1284, in _call_setup_hook
if self.hparams.collate_fn is None:
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/utilities/parsing.py", line 291, in __getattr__
call._call_and_handle_interrupt(
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 38, in _call_and_handle_interrupt
return function(*args, **kwargs)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 645, in _fit_impl
return trainer_fn(*args, **kwargs)
self._call_lightning_datamodule_hook("setup", stage=fn)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 645, in _fit_impl
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1361, in _call_lightning_datamodule_hook
raise AttributeError(f'Missing attribute "{key}"') from exp
AttributeError: Missing attribute "collate_fn"
self._run(model, ckpt_path=self.ckpt_path)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1037, in _run
self._run(model, ckpt_path=self.ckpt_path)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1037, in _run
return fn(*args, **kwargs)
File "/pl_model/dataset.py", line 228, in setup
self._call_setup_hook() # allow user to setup lightning_module in accelerator environment
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1284, in _call_setup_hook
self._call_setup_hook() # allow user to setup lightning_module in accelerator environment
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1284, in _call_setup_hook
if self.hparams.collate_fn is None:
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/utilities/parsing.py", line 291, in __getattr__
self._call_lightning_datamodule_hook("setup", stage=fn)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1361, in _call_lightning_datamodule_hook
raise AttributeError(f'Missing attribute "{key}"') from exp
AttributeError: Missing attribute "collate_fn"
self._call_lightning_datamodule_hook("setup", stage=fn)
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1361, in _call_lightning_datamodule_hook
return fn(*args, **kwargs)
File "/pl_model/dataset.py", line 228, in setup
return fn(*args, **kwargs)
File "/pl_model/dataset.py", line 228, in setup
if self.hparams.collate_fn is None:
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/utilities/parsing.py", line 291, in __getattr__
if self.hparams.collate_fn is None:
File "/.venv/lib/python3.8/site-packages/pytorch_lightning/utilities/parsing.py", line 291, in __getattr__
raise AttributeError(f'Missing attribute "{key}"') from exp
AttributeError: Missing attribute "collate_fn"
raise AttributeError(f'Missing attribute "{key}"') from exp
AttributeError: Missing attribute "collate_fn"
To reproduce
Use a setup similar to the above, where the Datamodule calls self.save_hyperparameters() and its class init also contains non-native Python objects, such as a transform declaration, for logging.
🐛 Bug
Uploading non-builtin objects in the config fails when using the Aim Logger with PyTorch Lightning and logging to an Aim server. However, when logging locally, the objects are stringified appropriately.
E.g. with the Aim Logger set up:
However, when we remove the datamodule hyperparameters from the config logging, the `Trainer` complains that it cannot reach the `collate_fn` from the datamodule.

To reproduce
Use a setup similar to the above, where the `Datamodule` calls `self.save_hyperparameters()` and its class init also contains non-native Python objects, such as a transform declaration, for logging.

Expected behavior
Non-native Python objects should be stringified in the config, as we see in local mode.
Environment
Additional context
I am running training with the PyTorch Lightning framework in DDP mode.