I downloaded sa_000000.tar from SA-1B to try training MASA. After following the tutorial to convert the dataset format, I started training and got this error:
loading annotations into memory...
Done (t=2.93s)
creating index...
index created!
Loading data list...
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11186/11186 [00:02<00:00, 4080.73it/s]
Traceback (most recent call last):
File "/home/fuc/code/mot/masa/tools/train.py", line 151, in <module>
main()
File "/home/fuc/code/mot/masa/tools/train.py", line 147, in main
runner.train()
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/runner/runner.py", line 1728, in train
self._train_loop = self.build_train_loop(
^^^^^^^^^^^^^^^^^^^^^^
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/runner/runner.py", line 1520, in build_train_loop
loop = LOOPS.build(
^^^^^^^^^^^^
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/registry/registry.py", line 570, in build
return self.build_func(cfg, *args, **kwargs, registry=self)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
obj = obj_cls(**args) # type: ignore
^^^^^^^^^^^^^^^
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/runner/loops.py", line 46, in __init__
super().__init__(runner, dataloader)
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/runner/base_loop.py", line 26, in __init__
self.dataloader = runner.build_dataloader(
^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/runner/runner.py", line 1370, in build_dataloader
dataset = DATASETS.build(dataset_cfg)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/registry/registry.py", line 570, in build
return self.build_func(cfg, *args, **kwargs, registry=self)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
obj = obj_cls(**args) # type: ignore
^^^^^^^^^^^^^^^
File "/home/fuc/code/mot/masa/masa/datasets/dataset_wrappers.py", line 67, in __init__
self.dataset = DATASETS.build(dataset)
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/registry/registry.py", line 570, in build
return self.build_func(cfg, *args, **kwargs, registry=self)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
obj = obj_cls(**args) # type: ignore
^^^^^^^^^^^^^^^
File "/home/fuc/code/mot/masa/masa/datasets/rsconcat_dataset.py", line 45, in __init__
self.fixed_length <= total_datasets_length
AssertionError: the length of the concatenated dataset must be less than the sum of the lengths of the individual datasets
[2024-10-28 14:53:58,640] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 921912) of binary: /home/fuc/anaconda3/envs/masaenv/bin/python
Traceback (most recent call last):
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/torch/distributed/launch.py", line 196, in <module>
main()
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/torch/distributed/launch.py", line 192, in main
launch(args)
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/torch/distributed/launch.py", line 177, in launch
run(args)
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
tools/train.py FAILED
Failures:
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2024-10-28_14:53:58
host : fuc-System-Product-Name
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 921912)
error_file:
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
The training command I used is: `tools/dist_train.sh configs/masa-gdino/masa_gdino_swinb_train.py 1 --work-dir saved_models/masa_gdino/`
I downloaded sa_000000.tar from SA-1b to try to train masa. When I followed the tutorial to complete the dataset format conversion and train it, I got this error: loading annotations into memory... Done (t=2.93s) creating index... index created! Loading data list... 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11186/11186 [00:02<00:00, 4080.73it/s] Traceback (most recent call last): File "/home/fuc/code/mot/masa/tools/train.py", line 151, in
main()
File "/home/fuc/code/mot/masa/tools/train.py", line 147, in main
runner.train()
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/runner/runner.py", line 1728, in train
self._train_loop = self.build_train_loop(
^^^^^^^^^^^^^^^^^^^^^^
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/runner/runner.py", line 1520, in build_train_loop
loop = LOOPS.build(
^^^^^^^^^^^^
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/registry/registry.py", line 570, in build
return self.build_func(cfg, *args, **kwargs, registry=self)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
obj = obj_cls(**args) # type: ignore
^^^^^^^^^^^^^^^
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/runner/loops.py", line 46, in __init__
super().__init__(runner, dataloader)
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/runner/base_loop.py", line 26, in __init__
self.dataloader = runner.build_dataloader(
^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/runner/runner.py", line 1370, in build_dataloader
dataset = DATASETS.build(dataset_cfg)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/registry/registry.py", line 570, in build
return self.build_func(cfg, *args, **kwargs, registry=self)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
obj = obj_cls(**args) # type: ignore
^^^^^^^^^^^^^^^
File "/home/fuc/code/mot/masa/masa/datasets/dataset_wrappers.py", line 67, in __init__
self.dataset = DATASETS.build(dataset)
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/registry/registry.py", line 570, in build
return self.build_func(cfg, *args, **kwargs, registry=self)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
obj = obj_cls(**args) # type: ignore
^^^^^^^^^^^^^^^
File "/home/fuc/code/mot/masa/masa/datasets/rsconcat_dataset.py", line 45, in __init__
self.fixed_length <= total_datasets_length
AssertionError: the length of the concatenated dataset must be less than the sum of the lengths of the individual datasets
[2024-10-28 14:53:58,640] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 921912) of binary: /home/fuc/anaconda3/envs/masaenv/bin/python
Traceback (most recent call last):
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/torch/distributed/launch.py", line 196, in <module>
main()
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/torch/distributed/launch.py", line 192, in main
launch(args)
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/torch/distributed/launch.py", line 177, in launch
run(args)
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/fuc/anaconda3/envs/masaenv/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
tools/train.py FAILED
Failures: