First of all, thank you so much for this great project. You have done a truly tremendous job, and I am really looking forward to exploring and using this model.
I was following the instructions provided in the README to train the model, including the environment setup and dataset acquisition, but I encountered an error during the process. Below are the details:
Error:
After building the environment and acquiring the dataset, I executed the following command:
PYTHONPATH=. python train.py exp=train_msdm
Then I got the following error:
Traceback (most recent call last):
File "/raid/m236866/multi-source-diffusion-models/train.py", line 4, in <module>
import pytorch_lightning as pl
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/pytorch_lightning/__init__.py", line 34, in <module>
from pytorch_lightning.callbacks import Callback # noqa: E402
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/pytorch_lightning/callbacks/__init__.py", line 25, in <module>
from pytorch_lightning.callbacks.progress import ProgressBarBase, RichProgressBar, TQDMProgressBar
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/pytorch_lightning/callbacks/progress/__init__.py", line 22, in <module>
from pytorch_lightning.callbacks.progress.rich_progress import RichProgressBar # noqa: F401
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/pytorch_lightning/callbacks/progress/rich_progress.py", line 20, in <module>
from torchmetrics.utilities.imports import _compare_version
ImportError: cannot import name '_compare_version' from 'torchmetrics.utilities.imports' (/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/torchmetrics/utilities/imports.py)
Then, I installed torchmetrics and executed it again, and the following error occurred:
/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
warn(
Enter run name: first-test-of-train-0525
[2024-05-25 21:11:39,620][main.utils][INFO] - Disabling python warnings! <config.ignore_warnings=True>
Global seed set to 12345
[2024-05-25 21:11:39,622][__main__][INFO] - Instantiating datamodule <main.module_base.DatamoduleWithValidation>.
Error executing job with overrides: ['exp=train_msdm']
Error locating target 'main.data.MultiSourceDataset', see chained exception above.
full_key: datamodule.train_dataset
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
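4. Finally, I tried the following commands and then executed the training command again:
conda install -c conda-forge cudatoolkit=11.3 cudnn=8.2
conda install -c conda-forge libjpeg libpng
pip uninstall torchvision
pip install torchvision
export HYDRA_FULL_ERROR=1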
/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from torchvision.io, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have libjpeg or libpng installed before building torchvision from source?
warn(
Enter run name: test
[2024-05-25 21:13:40,990][main.utils][INFO] - Disabling python warnings! <config.ignore_warnings=True>
Global seed set to 12345
[2024-05-25 21:13:40,992][__main__][INFO] - Instantiating datamodule <main.module_base.DatamoduleWithValidation>.
Error executing job with overrides: ['exp=train_msdm']
Traceback (most recent call last):
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/utils.py", line 639, in _locate
obj = getattr(obj, part)
AttributeError: module 'main' has no attribute 'data'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/utils.py", line 645, in _locate
obj = import_module(mod)
File "/home/m236866/.conda/envs/msdm/lib/python3.9/importlib/__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1030, in _gcd_import
File "<frozen importlib._bootstrap>", line 1007, in _find_and_load
File "<frozen importlib._bootstrap>", line 986, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 680, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 850, in exec_module
File "<frozen importlib._bootstrap>", line 228, in _call_with_frames_removed
File "/raid/m236866/multi-source-diffusion-models/main/data.py", line 11, in <module>
import av
ModuleNotFoundError: No module named 'av'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 134, in _resolve_target
target = _locate(target)
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/utils.py", line 648, in _locate
raise ImportError(
ImportError: Error loading 'main.data.MultiSourceDataset':
ModuleNotFoundError("No module named 'av'")
Are you sure that 'data' is importable from module 'main'?
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/raid/m236866/multi-source-diffusion-models/train.py", line 99, in <module>
main()
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/main.py", line 90, in decorated_main
_run_hydra(
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/utils.py", line 389, in _run_hydra
_run_app(
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/utils.py", line 452, in _run_app
run_and_report(
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/utils.py", line 216, in run_and_report
raise ex
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/utils.py", line 213, in run_and_report
return func()
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/utils.py", line 453, in <lambda>
lambda: hydra.run(
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/hydra.py", line 132, in run
_ = ret.return_value
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/core/utils.py", line 260, in return_value
raise self._return_value
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/core/utils.py", line 186, in run_job
ret.return_value = task_function(task_cfg)
File "/raid/m236866/multi-source-diffusion-models/train.py", line 28, in main
datamodule = hydra.utils.instantiate(config.datamodule, convert="partial")
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 222, in instantiate
return instantiate_node(
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 334, in instantiate_node
value = instantiate_node(
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 325, in instantiate_node
target = _resolve_target(node.get(_Keys.TARGET), full_key)
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 139, in _resolve_target
raise InstantiationException(msg) from e
hydra.errors.InstantiationException: Error locating target 'main.data.MultiSourceDataset', see chained exception above.
full_key: datamodule.train_dataset
I would greatly appreciate any guidance or solutions you could provide to resolve this issue.
Thank you once again for your incredible work.
First of all, thank you so much for this great project. You have done a truly tremendous job, and I am really looking forward to exploring and using this model.
I was following the instructions provided in the README to train the model, including the environment setup and dataset acquisition, but I encountered an error during the process. Below are the details:
Error:
After building the environment and acquiring the dataset, I executed the following command:
PYTHONPATH=. python train.py exp=train_msdm
Then I got the following error:
Then, I installed `torchmetrics` and executed it again, and the following error occurred:
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
conda install -c conda-forge cudatoolkit=11.3 cudnn=8.2
conda install -c conda-forge libjpeg libpng
pip uninstall torchvision
pip install torchvision
export HYDRA_FULL_ERROR=1
/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'libc10_cuda.so: cannot open shared object file: No such file or directory'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
warn(
Enter run name: test
[2024-05-25 21:13:40,990][main.utils][INFO] - Disabling python warnings! <config.ignore_warnings=True>
Global seed set to 12345
[2024-05-25 21:13:40,992][__main__][INFO] - Instantiating datamodule <main.module_base.DatamoduleWithValidation>.
Error executing job with overrides: ['exp=train_msdm']
Traceback (most recent call last):
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/utils.py", line 639, in _locate
obj = getattr(obj, part)
AttributeError: module 'main' has no attribute 'data'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/utils.py", line 645, in _locate
obj = import_module(mod)
File "/home/m236866/.conda/envs/msdm/lib/python3.9/importlib/__init__.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 1030, in _gcd_import
File "<frozen importlib._bootstrap>", line 1007, in _find_and_load
File "<frozen importlib._bootstrap>", line 986, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 680, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 850, in exec_module
File "<frozen importlib._bootstrap>", line 228, in _call_with_frames_removed
File "/raid/m236866/multi-source-diffusion-models/main/data.py", line 11, in <module>
import av
ModuleNotFoundError: No module named 'av'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 134, in _resolve_target
target = _locate(target)
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/utils.py", line 648, in _locate
raise ImportError(
ImportError: Error loading 'main.data.MultiSourceDataset':
ModuleNotFoundError("No module named 'av'")
Are you sure that 'data' is importable from module 'main'?
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/raid/m236866/multi-source-diffusion-models/train.py", line 99, in <module>
main()
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/main.py", line 90, in decorated_main
_run_hydra(
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/utils.py", line 389, in _run_hydra
_run_app(
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/utils.py", line 452, in _run_app
run_and_report(
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/utils.py", line 216, in run_and_report
raise ex
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/utils.py", line 213, in run_and_report
return func()
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/utils.py", line 453, in <lambda>
lambda: hydra.run(
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/hydra.py", line 132, in run
_ = ret.return_value
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/core/utils.py", line 260, in return_value
raise self._return_value
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/core/utils.py", line 186, in run_job
ret.return_value = task_function(task_cfg)
File "/raid/m236866/multi-source-diffusion-models/train.py", line 28, in main
datamodule = hydra.utils.instantiate(config.datamodule, convert="partial")
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 222, in instantiate
return instantiate_node(
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 334, in instantiate_node
value = instantiate_node(
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 325, in instantiate_node
target = _resolve_target(node.get(_Keys.TARGET), full_key)
File "/home/m236866/.conda/envs/msdm/lib/python3.9/site-packages/hydra/_internal/instantiate/_instantiate2.py", line 139, in _resolve_target
raise InstantiationException(msg) from e
hydra.errors.InstantiationException: Error locating target 'main.data.MultiSourceDataset', see chained exception above.
full_key: datamodule.train_dataset