Thank you for your excellent work! However, I could not run the code because of the following error:
(slt) muhiddin@server:~/SLRT/TwoStreamNetwork$ python -m torch.distributed.launch --nproc_per_node 8 --use_env training.py --config experiments/configs/TwoStream/phoenix_keypoint.yaml
/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/distributed/launch.py:186: FutureWarning: The module torch.distributed.launch is deprecated
and will be removed in future. Use torchrun.
Note that --use_env is set by default in torchrun.
If your script expects --local_rank argument to be set, please
change it to read from os.environ['LOCAL_RANK'] instead. See https://pytorch.org/docs/stable/distributed.html#launch-utility for
further instructions
FutureWarning,
WARNING:torch.distributed.run:
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
Traceback (most recent call last):
File "training.py", line 6, in
Traceback (most recent call last):
File "training.py", line 6, in
from torch.utils.tensorboard import SummaryWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/init.py", line 13, in
from torch.utils.tensorboard import SummaryWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/init.py", line 13, in
from .writer import FileWriter, SummaryWriter # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/writer.py", line 13, in
from .writer import FileWriter, SummaryWriter # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/writer.py", line 13, in
from tensorboard.summary.writer.event_file_writer import EventFileWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/init.py", line 22, in
from tensorboard.summary.writer.event_file_writer import EventFileWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/init.py", line 22, in
from tensorboard.summary import v1 # noqa: F401
from tensorboard.summary import v1 # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/v1.py", line 23, in
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/v1.py", line 23, in
from tensorboard.plugins.histogram import summary as _histogram_summary
from tensorboard.plugins.histogram import summary as _histogram_summary
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary.py", line 35, in
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary.py", line 35, in
from tensorboard.plugins.histogram import summary_v2
from tensorboard.plugins.histogram import summary_v2
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary_v2.py", line 35, in
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary_v2.py", line 35, in
from tensorboard.util import tensor_util
from tensorboard.util import tensor_util
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/util/tensor_util.py", line 20, in
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/util/tensor_util.py", line 20, in
from tensorboard.compat.tensorflow_stub import dtypes, compat, tensor_shape
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/init.py", line 25, in
from tensorboard.compat.tensorflow_stub import dtypes, compat, tensor_shape
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/init.py", line 25, in
from . import app # noqa
from . import app # noqa
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/app.py", line 21, in
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/app.py", line 21, in
from . import flags
from . import flags
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/flags.py", line 25, in
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/flags.py", line 25, in
from absl.flags import * # pylint: disable=wildcard-import
from absl.flags import * # pylint: disable=wildcard-import
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/init.py", line 35, in
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/init.py", line 35, in
from absl.flags import _argument_parser
from absl.flags import _argument_parser
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/_argument_parser.py", line 82, in
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/_argument_parser.py", line 82, in
class ArgumentParser(Generic[_T], metaclass=_ArgumentParserCache):
class ArgumentParser(Generic[_T], metaclass=_ArgumentParserCache):
TypeError: metaclass conflict: the metaclass of a derived class must be a (non-strict) subclass of the metaclasses of all its bases
TypeError: metaclass conflict: the metaclass of a derived class must be a (non-strict) subclass of the metaclasses of all its bases
Traceback (most recent call last):
File "training.py", line 6, in
from torch.utils.tensorboard import SummaryWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/init.py", line 13, in
from .writer import FileWriter, SummaryWriter # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/writer.py", line 13, in
from tensorboard.summary.writer.event_file_writer import EventFileWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/init.py", line 22, in
from tensorboard.summary import v1 # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/v1.py", line 23, in
from tensorboard.plugins.histogram import summary as _histogram_summary
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary.py", line 35, in
from tensorboard.plugins.histogram import summary_v2
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary_v2.py", line 35, in
from tensorboard.util import tensor_util
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/util/tensor_util.py", line 20, in
from tensorboard.compat.tensorflow_stub import dtypes, compat, tensor_shape
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/init.py", line 25, in
from . import app # noqa
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/app.py", line 21, in
from . import flags
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/flags.py", line 25, in
from absl.flags import * # pylint: disable=wildcard-import
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/init.py", line 35, in
from absl.flags import _argument_parser
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/_argument_parser.py", line 82, in
class ArgumentParser(Generic[_T], metaclass=_ArgumentParserCache):
TypeError: metaclass conflict: the metaclass of a derived class must be a (non-strict) subclass of the metaclasses of all its bases
Traceback (most recent call last):
File "training.py", line 6, in
from torch.utils.tensorboard import SummaryWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/init.py", line 13, in
from .writer import FileWriter, SummaryWriter # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/writer.py", line 13, in
from tensorboard.summary.writer.event_file_writer import EventFileWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/init.py", line 22, in
from tensorboard.summary import v1 # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/v1.py", line 23, in
from tensorboard.plugins.histogram import summary as _histogram_summary
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary.py", line 35, in
from tensorboard.plugins.histogram import summary_v2
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary_v2.py", line 35, in
from tensorboard.util import tensor_util
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/util/tensor_util.py", line 20, in
from tensorboard.compat.tensorflow_stub import dtypes, compat, tensor_shape
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/init.py", line 25, in
from . import app # noqa
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/app.py", line 21, in
from . import flags
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/flags.py", line 25, in
from absl.flags import * # pylint: disable=wildcard-import
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/init.py", line 35, in
from absl.flags import _argument_parser
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/_argument_parser.py", line 82, in
class ArgumentParser(Generic[_T], metaclass=_ArgumentParserCache):
TypeError: metaclass conflict: the metaclass of a derived class must be a (non-strict) subclass of the metaclasses of all its bases
Traceback (most recent call last):
File "training.py", line 6, in
Traceback (most recent call last):
File "training.py", line 6, in
from torch.utils.tensorboard import SummaryWriter
from torch.utils.tensorboard import SummaryWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/init.py", line 13, in
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/init.py", line 13, in
from .writer import FileWriter, SummaryWriter # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/writer.py", line 13, in
from tensorboard.summary.writer.event_file_writer import EventFileWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/init.py", line 22, in
from .writer import FileWriter, SummaryWriter # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/writer.py", line 13, in
from tensorboard.summary import v1 # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/v1.py", line 23, in
from tensorboard.plugins.histogram import summary as _histogram_summary
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary.py", line 35, in
from tensorboard.summary.writer.event_file_writer import EventFileWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/init.py", line 22, in
from tensorboard.plugins.histogram import summary_v2
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary_v2.py", line 35, in
from tensorboard.util import tensor_util
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/util/tensor_util.py", line 20, in
from tensorboard.summary import v1 # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/v1.py", line 23, in
from tensorboard.compat.tensorflow_stub import dtypes, compat, tensor_shape
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/init.py", line 25, in
from . import app # noqa
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/app.py", line 21, in
from . import flags
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/flags.py", line 25, in
from tensorboard.plugins.histogram import summary as _histogram_summary
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary.py", line 35, in
from absl.flags import * # pylint: disable=wildcard-import
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/init.py", line 35, in
from absl.flags import _argument_parser
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/_argument_parser.py", line 82, in
from tensorboard.plugins.histogram import summary_v2
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary_v2.py", line 35, in
class ArgumentParser(Generic[_T], metaclass=_ArgumentParserCache):
TypeError: metaclass conflict: the metaclass of a derived class must be a (non-strict) subclass of the metaclasses of all its bases
from tensorboard.util import tensor_util
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/util/tensor_util.py", line 20, in
from tensorboard.compat.tensorflow_stub import dtypes, compat, tensor_shape
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/init.py", line 25, in
from . import app # noqa
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/app.py", line 21, in
from . import flags
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/flags.py", line 25, in
from absl.flags import * # pylint: disable=wildcard-import
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/init.py", line 35, in
from absl.flags import _argument_parser
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/_argument_parser.py", line 82, in
class ArgumentParser(Generic[_T], metaclass=_ArgumentParserCache):
TypeError: metaclass conflict: the metaclass of a derived class must be a (non-strict) subclass of the metaclasses of all its bases
Traceback (most recent call last):
File "training.py", line 6, in
from torch.utils.tensorboard import SummaryWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/init.py", line 13, in
from .writer import FileWriter, SummaryWriter # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/writer.py", line 13, in
from tensorboard.summary.writer.event_file_writer import EventFileWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/init.py", line 22, in
from tensorboard.summary import v1 # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/v1.py", line 23, in
from tensorboard.plugins.histogram import summary as _histogram_summary
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary.py", line 35, in
from tensorboard.plugins.histogram import summary_v2
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary_v2.py", line 35, in
from tensorboard.util import tensor_util
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/util/tensor_util.py", line 20, in
from tensorboard.compat.tensorflow_stub import dtypes, compat, tensor_shape
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/init.py", line 25, in
from . import app # noqa
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/app.py", line 21, in
from . import flags
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/flags.py", line 25, in
from absl.flags import * # pylint: disable=wildcard-import
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/init.py", line 35, in
from absl.flags import _argument_parser
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/_argument_parser.py", line 82, in
class ArgumentParser(Generic[_T], metaclass=_ArgumentParserCache):
TypeError: metaclass conflict: the metaclass of a derived class must be a (non-strict) subclass of the metaclasses of all its bases
Traceback (most recent call last):
File "training.py", line 6, in
from torch.utils.tensorboard import SummaryWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/init.py", line 13, in
from .writer import FileWriter, SummaryWriter # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/writer.py", line 13, in
from tensorboard.summary.writer.event_file_writer import EventFileWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/init.py", line 22, in
from tensorboard.summary import v1 # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/v1.py", line 23, in
from tensorboard.plugins.histogram import summary as _histogram_summary
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary.py", line 35, in
from tensorboard.plugins.histogram import summary_v2
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary_v2.py", line 35, in
from tensorboard.util import tensor_util
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/util/tensor_util.py", line 20, in
from tensorboard.compat.tensorflow_stub import dtypes, compat, tensor_shape
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/init.py", line 25, in
from . import app # noqa
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/app.py", line 21, in
from . import flags
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/flags.py", line 25, in
from absl.flags import * # pylint: disable=wildcard-import
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/init.py", line 35, in
from absl.flags import _argument_parser
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/_argument_parser.py", line 82, in
class ArgumentParser(Generic[_T], metaclass=_ArgumentParserCache):
TypeError: metaclass conflict: the metaclass of a derived class must be a (non-strict) subclass of the metaclasses of all its bases
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2680830) of binary: /home/muhiddin/miniconda3/envs/slt/bin/python
Traceback (most recent call last):
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/distributed/launch.py", line 193, in
main()
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/distributed/launch.py", line 189, in main
launch(args)
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/distributed/launch.py", line 174, in launch
run(args)
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/distributed/run.py", line 713, in run
)(*cmd_args)
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
training.py FAILED
Failures:
[1]:
time : 2024-08-17_13:24:04
host : server
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 2680831)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2024-08-17_13:24:04
host : server
rank : 2 (local_rank: 2)
exitcode : 1 (pid: 2680832)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2024-08-17_13:24:04
host : server
rank : 3 (local_rank: 3)
exitcode : 1 (pid: 2680833)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[4]:
time : 2024-08-17_13:24:04
host : server
rank : 4 (local_rank: 4)
exitcode : 1 (pid: 2680834)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[5]:
time : 2024-08-17_13:24:04
host : server
rank : 5 (local_rank: 5)
exitcode : 1 (pid: 2680835)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[6]:
time : 2024-08-17_13:24:04
host : server
rank : 6 (local_rank: 6)
exitcode : 1 (pid: 2680836)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[7]:
time : 2024-08-17_13:24:04
host : server
rank : 7 (local_rank: 7)
exitcode : 1 (pid: 2680837)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Root Cause (first observed failure):
[0]:
time : 2024-08-17_13:24:04
host : server
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 2680830)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Could you please guide me in resolving this issue so that I can obtain the expected results?
Thank you for your excellent work! However, I could not run the code because of the following error:
(slt) muhiddin@server:~/SLRT/TwoStreamNetwork$ python -m torch.distributed.launch --nproc_per_node 8 --use_env training.py --config experiments/configs/TwoStream/phoenix_keypoint.yaml
/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/distributed/launch.py:186: FutureWarning: The module torch.distributed.launch is deprecated
and will be removed in future. Use torchrun.
Note that --use_env is set by default in torchrun.
If your script expects --local_rank argument to be set, please
change it to read from os.environ['LOCAL_RANK'] instead. See https://pytorch.org/docs/stable/distributed.html#launch-utility for
further instructions
FutureWarning,
WARNING:torch.distributed.run:
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
Traceback (most recent call last):
from torch.utils.tensorboard import SummaryWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/init.py", line 13, in
from torch.utils.tensorboard import SummaryWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/init.py", line 13, in
from .writer import FileWriter, SummaryWriter # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/writer.py", line 13, in
from .writer import FileWriter, SummaryWriter # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/writer.py", line 13, in
from tensorboard.summary.writer.event_file_writer import EventFileWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/init.py", line 22, in
from tensorboard.summary.writer.event_file_writer import EventFileWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/init.py", line 22, in
from tensorboard.summary import v1 # noqa: F401
from tensorboard.summary import v1 # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/v1.py", line 23, in
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/v1.py", line 23, in
from tensorboard.plugins.histogram import summary as _histogram_summary
from tensorboard.plugins.histogram import summary as _histogram_summary
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary.py", line 35, in
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary.py", line 35, in
from tensorboard.plugins.histogram import summary_v2
from tensorboard.plugins.histogram import summary_v2
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary_v2.py", line 35, in
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary_v2.py", line 35, in
from tensorboard.util import tensor_util
from tensorboard.util import tensor_util
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/util/tensor_util.py", line 20, in
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/util/tensor_util.py", line 20, in
from tensorboard.compat.tensorflow_stub import dtypes, compat, tensor_shape
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/init.py", line 25, in
from tensorboard.compat.tensorflow_stub import dtypes, compat, tensor_shape
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/init.py", line 25, in
from . import app # noqa
from . import app # noqa
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/app.py", line 21, in
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/app.py", line 21, in
from . import flags
from . import flags
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/flags.py", line 25, in
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/flags.py", line 25, in
from absl.flags import * # pylint: disable=wildcard-import
from absl.flags import * # pylint: disable=wildcard-import
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/init.py", line 35, in
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/init.py", line 35, in
from absl.flags import _argument_parser
from absl.flags import _argument_parser
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/_argument_parser.py", line 82, in
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/_argument_parser.py", line 82, in
class ArgumentParser(Generic[_T], metaclass=_ArgumentParserCache):
class ArgumentParser(Generic[_T], metaclass=_ArgumentParserCache):
TypeError: metaclass conflict: the metaclass of a derived class must be a (non-strict) subclass of the metaclasses of all its bases
TypeError: metaclass conflict: the metaclass of a derived class must be a (non-strict) subclass of the metaclasses of all its bases
Traceback (most recent call last):
File "training.py", line 6, in
from torch.utils.tensorboard import SummaryWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/init.py", line 13, in
from .writer import FileWriter, SummaryWriter # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/writer.py", line 13, in
from tensorboard.summary.writer.event_file_writer import EventFileWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/init.py", line 22, in
from tensorboard.summary import v1 # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/v1.py", line 23, in
from tensorboard.plugins.histogram import summary as _histogram_summary
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary.py", line 35, in
from tensorboard.plugins.histogram import summary_v2
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary_v2.py", line 35, in
from tensorboard.util import tensor_util
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/util/tensor_util.py", line 20, in
from tensorboard.compat.tensorflow_stub import dtypes, compat, tensor_shape
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/init.py", line 25, in
from . import app # noqa
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/app.py", line 21, in
from . import flags
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/flags.py", line 25, in
from absl.flags import * # pylint: disable=wildcard-import
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/init.py", line 35, in
from absl.flags import _argument_parser
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/_argument_parser.py", line 82, in
class ArgumentParser(Generic[_T], metaclass=_ArgumentParserCache):
TypeError: metaclass conflict: the metaclass of a derived class must be a (non-strict) subclass of the metaclasses of all its bases
Traceback (most recent call last):
File "training.py", line 6, in
from torch.utils.tensorboard import SummaryWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/init.py", line 13, in
from .writer import FileWriter, SummaryWriter # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/writer.py", line 13, in
from tensorboard.summary.writer.event_file_writer import EventFileWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/init.py", line 22, in
from tensorboard.summary import v1 # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/v1.py", line 23, in
from tensorboard.plugins.histogram import summary as _histogram_summary
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary.py", line 35, in
from tensorboard.plugins.histogram import summary_v2
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary_v2.py", line 35, in
from tensorboard.util import tensor_util
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/util/tensor_util.py", line 20, in
from tensorboard.compat.tensorflow_stub import dtypes, compat, tensor_shape
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/init.py", line 25, in
from . import app # noqa
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/app.py", line 21, in
from . import flags
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/flags.py", line 25, in
from absl.flags import * # pylint: disable=wildcard-import
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/init.py", line 35, in
from absl.flags import _argument_parser
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/_argument_parser.py", line 82, in
class ArgumentParser(Generic[_T], metaclass=_ArgumentParserCache):
TypeError: metaclass conflict: the metaclass of a derived class must be a (non-strict) subclass of the metaclasses of all its bases
Traceback (most recent call last):
File "training.py", line 6, in
Traceback (most recent call last):
File "training.py", line 6, in
from torch.utils.tensorboard import SummaryWriter
from torch.utils.tensorboard import SummaryWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/init.py", line 13, in
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/init.py", line 13, in
from .writer import FileWriter, SummaryWriter # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/writer.py", line 13, in
from tensorboard.summary.writer.event_file_writer import EventFileWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/init.py", line 22, in
from .writer import FileWriter, SummaryWriter # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/writer.py", line 13, in
from tensorboard.summary import v1 # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/v1.py", line 23, in
from tensorboard.plugins.histogram import summary as _histogram_summary
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary.py", line 35, in
from tensorboard.summary.writer.event_file_writer import EventFileWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/init.py", line 22, in
from tensorboard.plugins.histogram import summary_v2
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary_v2.py", line 35, in
from tensorboard.util import tensor_util
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/util/tensor_util.py", line 20, in
from tensorboard.summary import v1 # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/v1.py", line 23, in
from tensorboard.compat.tensorflow_stub import dtypes, compat, tensor_shape
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/init.py", line 25, in
from . import app # noqa
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/app.py", line 21, in
from . import flags
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/flags.py", line 25, in
from tensorboard.plugins.histogram import summary as _histogram_summary
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary.py", line 35, in
from absl.flags import * # pylint: disable=wildcard-import
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/init.py", line 35, in
from absl.flags import _argument_parser
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/_argument_parser.py", line 82, in
from tensorboard.plugins.histogram import summary_v2
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary_v2.py", line 35, in
class ArgumentParser(Generic[_T], metaclass=_ArgumentParserCache):
TypeError: metaclass conflict: the metaclass of a derived class must be a (non-strict) subclass of the metaclasses of all its bases
from tensorboard.util import tensor_util
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/util/tensor_util.py", line 20, in
from tensorboard.compat.tensorflow_stub import dtypes, compat, tensor_shape
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/init.py", line 25, in
from . import app # noqa
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/app.py", line 21, in
from . import flags
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/flags.py", line 25, in
from absl.flags import * # pylint: disable=wildcard-import
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/init.py", line 35, in
from absl.flags import _argument_parser
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/_argument_parser.py", line 82, in
class ArgumentParser(Generic[_T], metaclass=_ArgumentParserCache):
TypeError: metaclass conflict: the metaclass of a derived class must be a (non-strict) subclass of the metaclasses of all its bases
Traceback (most recent call last):
File "training.py", line 6, in
from torch.utils.tensorboard import SummaryWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/init.py", line 13, in
from .writer import FileWriter, SummaryWriter # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/writer.py", line 13, in
from tensorboard.summary.writer.event_file_writer import EventFileWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/init.py", line 22, in
from tensorboard.summary import v1 # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/v1.py", line 23, in
from tensorboard.plugins.histogram import summary as _histogram_summary
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary.py", line 35, in
from tensorboard.plugins.histogram import summary_v2
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary_v2.py", line 35, in
from tensorboard.util import tensor_util
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/util/tensor_util.py", line 20, in
from tensorboard.compat.tensorflow_stub import dtypes, compat, tensor_shape
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/init.py", line 25, in
from . import app # noqa
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/app.py", line 21, in
from . import flags
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/flags.py", line 25, in
from absl.flags import * # pylint: disable=wildcard-import
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/init.py", line 35, in
from absl.flags import _argument_parser
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/_argument_parser.py", line 82, in
class ArgumentParser(Generic[_T], metaclass=_ArgumentParserCache):
TypeError: metaclass conflict: the metaclass of a derived class must be a (non-strict) subclass of the metaclasses of all its bases
Traceback (most recent call last):
File "training.py", line 6, in
from torch.utils.tensorboard import SummaryWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/init.py", line 13, in
from .writer import FileWriter, SummaryWriter # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/utils/tensorboard/writer.py", line 13, in
from tensorboard.summary.writer.event_file_writer import EventFileWriter
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/init.py", line 22, in
from tensorboard.summary import v1 # noqa: F401
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/summary/v1.py", line 23, in
from tensorboard.plugins.histogram import summary as _histogram_summary
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary.py", line 35, in
from tensorboard.plugins.histogram import summary_v2
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/plugins/histogram/summary_v2.py", line 35, in
from tensorboard.util import tensor_util
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/util/tensor_util.py", line 20, in
from tensorboard.compat.tensorflow_stub import dtypes, compat, tensor_shape
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/init.py", line 25, in
from . import app # noqa
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/app.py", line 21, in
from . import flags
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/flags.py", line 25, in
from absl.flags import * # pylint: disable=wildcard-import
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/init.py", line 35, in
from absl.flags import _argument_parser
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/absl/flags/_argument_parser.py", line 82, in
class ArgumentParser(Generic[_T], metaclass=_ArgumentParserCache):
TypeError: metaclass conflict: the metaclass of a derived class must be a (non-strict) subclass of the metaclasses of all its bases
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2680830) of binary: /home/muhiddin/miniconda3/envs/slt/bin/python
Traceback (most recent call last):
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"main", mod_spec)
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/distributed/launch.py", line 193, in
main()
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/distributed/launch.py", line 189, in main
launch(args)
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/distributed/launch.py", line 174, in launch
run(args)
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/distributed/run.py", line 713, in run
)(*cmd_args)
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/distributed/launcher/api.py", line 131, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/muhiddin/miniconda3/envs/slt/lib/python3.6/site-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
File "training.py", line 6, in
Traceback (most recent call last): File "training.py", line 6, in
training.py FAILED
Failures: [1]: time : 2024-08-17_13:24:04 host : server rank : 1 (local_rank: 1) exitcode : 1 (pid: 2680831) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html [2]: time : 2024-08-17_13:24:04 host : server rank : 2 (local_rank: 2) exitcode : 1 (pid: 2680832) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html [3]: time : 2024-08-17_13:24:04 host : server rank : 3 (local_rank: 3) exitcode : 1 (pid: 2680833) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html [4]: time : 2024-08-17_13:24:04 host : server rank : 4 (local_rank: 4) exitcode : 1 (pid: 2680834) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html [5]: time : 2024-08-17_13:24:04 host : server rank : 5 (local_rank: 5) exitcode : 1 (pid: 2680835) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html [6]: time : 2024-08-17_13:24:04 host : server rank : 6 (local_rank: 6) exitcode : 1 (pid: 2680836) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html [7]: time : 2024-08-17_13:24:04 host : server rank : 7 (local_rank: 7) exitcode : 1 (pid: 2680837) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Root Cause (first observed failure): [0]: time : 2024-08-17_13:24:04 host : server rank : 0 (local_rank: 0) exitcode : 1 (pid: 2680830) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Could you please guide me in resolving this issue so that I can obtain the expected results?