Traceback (most recent call last):
File "/root/.local/lib/python3.9/site-packages/accelerate/utils/deepspeed.py", line 51, in __init__
config_decoded = base64.urlsafe_b64decode(config_file_or_dict).decode("utf-8")
File "/opt/conda/envs/pytorch/lib/python3.9/base64.py", line 133, in urlsafe_b64decode
return b64decode(s)
File "/opt/conda/envs/pytorch/lib/python3.9/base64.py", line 87, in b64decode
return binascii.a2b_base64(s)
binascii.Error: Incorrect padding
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in <module>
launch()
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_exp()
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/train/tuner.py", line 45, in run_exp
model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 162, in get_train_args
model_args, data_args, training_args, finetuning_args, generating_args = _parse_train_args(args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 148, in _parse_train_args
return _parse_args(parser, args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 61, in _parse_args
return parser.parse_yaml_file(os.path.abspath(sys.argv[1]))
File "/root/.local/lib/python3.9/site-packages/transformers/hf_argparser.py", line 423, in parse_yaml_file
outputs = self.parse_dict(yaml.safe_load(Path(yaml_file).read_text()), allow_extra_keys=allow_extra_keys)
File "/root/.local/lib/python3.9/site-packages/transformers/hf_argparser.py", line 374, in parse_dict
obj = dtype(**inputs)
File "<string>", line 134, in __init__
File "/root/.local/lib/python3.9/site-packages/transformers/training_args.py", line 1930, in __post_init__
self.hf_deepspeed_config = HfTrainerDeepSpeedConfig(self.deepspeed)
File "/root/.local/lib/python3.9/site-packages/transformers/integrations/deepspeed.py", line 91, in __init__
super().__init__(config_file_or_dict)
File "/root/.local/lib/python3.9/site-packages/transformers/integrations/deepspeed.py", line 81, in __init__
super().__init__(config_file_or_dict)
File "/root/.local/lib/python3.9/site-packages/accelerate/utils/deepspeed.py", line 54, in __init__
raise ValueError(
ValueError: Expected a string path to an existing deepspeed config, or a dictionary, or a base64 encoded string. Received: examples/deepspeed/ds_z3_config.json
Traceback (most recent call last):
File "/root/.local/lib/python3.9/site-packages/accelerate/utils/deepspeed.py", line 51, in __init__
config_decoded = base64.urlsafe_b64decode(config_file_or_dict).decode("utf-8")
File "/opt/conda/envs/pytorch/lib/python3.9/base64.py", line 133, in urlsafe_b64decode
return b64decode(s)
File "/opt/conda/envs/pytorch/lib/python3.9/base64.py", line 87, in b64decode
return binascii.a2b_base64(s)
binascii.Error: Incorrect padding
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in <module>
launch()
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_exp()
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/train/tuner.py", line 45, in run_exp
model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 162, in get_train_args
model_args, data_args, training_args, finetuning_args, generating_args = _parse_train_args(args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 148, in _parse_train_args
return _parse_args(parser, args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 61, in _parse_args
return parser.parse_yaml_file(os.path.abspath(sys.argv[1]))
File "/root/.local/lib/python3.9/site-packages/transformers/hf_argparser.py", line 423, in parse_yaml_file
outputs = self.parse_dict(yaml.safe_load(Path(yaml_file).read_text()), allow_extra_keys=allow_extra_keys)
File "/root/.local/lib/python3.9/site-packages/transformers/hf_argparser.py", line 374, in parse_dict
obj = dtype(**inputs)
File "<string>", line 134, in __init__
File "/root/.local/lib/python3.9/site-packages/transformers/training_args.py", line 1930, in __post_init__
self.hf_deepspeed_config = HfTrainerDeepSpeedConfig(self.deepspeed)
File "/root/.local/lib/python3.9/site-packages/transformers/integrations/deepspeed.py", line 91, in __init__
super().__init__(config_file_or_dict)
File "/root/.local/lib/python3.9/site-packages/transformers/integrations/deepspeed.py", line 81, in __init__
super().__init__(config_file_or_dict)
File "/root/.local/lib/python3.9/site-packages/accelerate/utils/deepspeed.py", line 54, in __init__
raise ValueError(
ValueError: Expected a string path to an existing deepspeed config, or a dictionary, or a base64 encoded string. Received: examples/deepspeed/ds_z3_config.json
Traceback (most recent call last):
File "/root/.local/lib/python3.9/site-packages/accelerate/utils/deepspeed.py", line 51, in __init__
config_decoded = base64.urlsafe_b64decode(config_file_or_dict).decode("utf-8")
File "/opt/conda/envs/pytorch/lib/python3.9/base64.py", line 133, in urlsafe_b64decode
return b64decode(s)
File "/opt/conda/envs/pytorch/lib/python3.9/base64.py", line 87, in b64decode
return binascii.a2b_base64(s)
binascii.Error: Incorrect padding
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in <module>
launch()
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_exp()
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/train/tuner.py", line 45, in run_exp
model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 162, in get_train_args
model_args, data_args, training_args, finetuning_args, generating_args = _parse_train_args(args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 148, in _parse_train_args
return _parse_args(parser, args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 61, in _parse_args
return parser.parse_yaml_file(os.path.abspath(sys.argv[1]))
File "/root/.local/lib/python3.9/site-packages/transformers/hf_argparser.py", line 423, in parse_yaml_file
outputs = self.parse_dict(yaml.safe_load(Path(yaml_file).read_text()), allow_extra_keys=allow_extra_keys)
File "/root/.local/lib/python3.9/site-packages/transformers/hf_argparser.py", line 374, in parse_dict
obj = dtype(**inputs)
File "<string>", line 134, in __init__
File "/root/.local/lib/python3.9/site-packages/transformers/training_args.py", line 1930, in __post_init__
self.hf_deepspeed_config = HfTrainerDeepSpeedConfig(self.deepspeed)
File "/root/.local/lib/python3.9/site-packages/transformers/integrations/deepspeed.py", line 91, in __init__
super().__init__(config_file_or_dict)
File "/root/.local/lib/python3.9/site-packages/transformers/integrations/deepspeed.py", line 81, in __init__
super().__init__(config_file_or_dict)
File "/root/.local/lib/python3.9/site-packages/accelerate/utils/deepspeed.py", line 54, in __init__
raise ValueError(
ValueError: Expected a string path to an existing deepspeed config, or a dictionary, or a base64 encoded string. Received: examples/deepspeed/ds_z3_config.json
Traceback (most recent call last):
File "/root/.local/lib/python3.9/site-packages/accelerate/utils/deepspeed.py", line 51, in __init__
config_decoded = base64.urlsafe_b64decode(config_file_or_dict).decode("utf-8")
File "/opt/conda/envs/pytorch/lib/python3.9/base64.py", line 133, in urlsafe_b64decode
return b64decode(s)
File "/opt/conda/envs/pytorch/lib/python3.9/base64.py", line 87, in b64decode
return binascii.a2b_base64(s)
binascii.Error: Incorrect padding
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in <module>
launch()
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_exp()
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/train/tuner.py", line 45, in run_exp
model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 162, in get_train_args
model_args, data_args, training_args, finetuning_args, generating_args = _parse_train_args(args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 148, in _parse_train_args
return _parse_args(parser, args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 61, in _parse_args
return parser.parse_yaml_file(os.path.abspath(sys.argv[1]))
File "/root/.local/lib/python3.9/site-packages/transformers/hf_argparser.py", line 423, in parse_yaml_file
outputs = self.parse_dict(yaml.safe_load(Path(yaml_file).read_text()), allow_extra_keys=allow_extra_keys)
File "/root/.local/lib/python3.9/site-packages/transformers/hf_argparser.py", line 374, in parse_dict
obj = dtype(**inputs)
File "<string>", line 134, in __init__
File "/root/.local/lib/python3.9/site-packages/transformers/training_args.py", line 1930, in __post_init__
self.hf_deepspeed_config = HfTrainerDeepSpeedConfig(self.deepspeed)
File "/root/.local/lib/python3.9/site-packages/transformers/integrations/deepspeed.py", line 91, in __init__
super().__init__(config_file_or_dict)
File "/root/.local/lib/python3.9/site-packages/transformers/integrations/deepspeed.py", line 81, in __init__
super().__init__(config_file_or_dict)
File "/root/.local/lib/python3.9/site-packages/accelerate/utils/deepspeed.py", line 54, in __init__
raise ValueError(
ValueError: Expected a string path to an existing deepspeed config, or a dictionary, or a base64 encoded string. Received: examples/deepspeed/ds_z3_config.json
[2024-09-21 10:31:37,514] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 1555) of binary: /opt/conda/envs/pytorch/bin/python
Traceback (most recent call last):
File "/opt/conda/envs/pytorch/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==2.1.2', 'console_scripts', 'torchrun')())
File "/opt/conda/envs/pytorch/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/opt/conda/envs/pytorch/lib/python3.9/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/opt/conda/envs/pytorch/lib/python3.9/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/opt/conda/envs/pytorch/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/envs/pytorch/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
/root/dataDisk/LLaMA-Factory/src/llamafactory/launcher.py FAILED
------------------------------------------------------------
Failures:
[1]:
time : 2024-09-21_10:31:37
host : notebook-254a295b-9350-4e5e-9778-95c19feddabb-0.notebook-254a295b-9350-4e5e-9778-95c19feddabb.colossal-ai.svc.cluster.local
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 1556)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2024-09-21_10:31:37
host : notebook-254a295b-9350-4e5e-9778-95c19feddabb-0.notebook-254a295b-9350-4e5e-9778-95c19feddabb.colossal-ai.svc.cluster.local
rank : 2 (local_rank: 2)
exitcode : 1 (pid: 1557)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2024-09-21_10:31:37
host : notebook-254a295b-9350-4e5e-9778-95c19feddabb-0.notebook-254a295b-9350-4e5e-9778-95c19feddabb.colossal-ai.svc.cluster.local
rank : 3 (local_rank: 3)
exitcode : 1 (pid: 1558)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2024-09-21_10:31:37
host : notebook-254a295b-9350-4e5e-9778-95c19feddabb-0.notebook-254a295b-9350-4e5e-9778-95c19feddabb.colossal-ai.svc.cluster.local
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 1555)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
(pytorch) root@notebook-254a295b-9350-4e5e-9778-95c19feddabb-0:~/dataDisk# CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train /root/dataDisk/LLaMA-Factory/examples/train_full/llama3_full_sft_ds3.yaml
[2024-09-21 10:32:54,531] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
[WARNING] async_io: please install the libaio-dev package with apt
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
[WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
[WARNING] using untested triton version (2.1.0), only 1.0.0 is known to be compatible
09/21/2024 10:32:57 - INFO - llamafactory.cli - Initializing distributed tasks at: 127.0.0.1:28176
[2024-09-21 10:32:58,199] torch.distributed.run: [WARNING]
[2024-09-21 10:32:58,199] torch.distributed.run: [WARNING] *****************************************
[2024-09-21 10:32:58,199] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
[2024-09-21 10:32:58,199] torch.distributed.run: [WARNING] *****************************************
[2024-09-21 10:33:01,588] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-09-21 10:33:01,589] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-09-21 10:33:01,598] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-09-21 10:33:01,633] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
[WARNING] async_io: please install the libaio-dev package with apt
[WARNING] async_io: please install the libaio-dev package with apt
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
[WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
[WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
[WARNING] async_io: please install the libaio-dev package with apt
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
[WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
[WARNING] async_io: please install the libaio-dev package with apt
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
[WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
[WARNING] using untested triton version (2.1.0), only 1.0.0 is known to be compatible
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
[WARNING] using untested triton version (2.1.0), only 1.0.0 is known to be compatible
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
[WARNING] using untested triton version (2.1.0), only 1.0.0 is known to be compatible
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
[WARNING] using untested triton version (2.1.0), only 1.0.0 is known to be compatible
[2024-09-21 10:33:02,655] [INFO] [comm.py:637:init_distributed] cdb=None
[2024-09-21 10:33:02,655] [INFO] [comm.py:637:init_distributed] cdb=None
[2024-09-21 10:33:02,655] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2024-09-21 10:33:02,669] [INFO] [comm.py:637:init_distributed] cdb=None
[2024-09-21 10:33:02,703] [INFO] [comm.py:637:init_distributed] cdb=None
Traceback (most recent call last):
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in <module>
launch()
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_exp()
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/train/tuner.py", line 45, in run_exp
model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 162, in get_train_args
model_args, data_args, training_args, finetuning_args, generating_args = _parse_train_args(args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 148, in _parse_train_args
return _parse_args(parser, args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 61, in _parse_args
return parser.parse_yaml_file(os.path.abspath(sys.argv[1]))
File "/root/.local/lib/python3.9/site-packages/transformers/hf_argparser.py", line 423, in parse_yaml_file
outputs = self.parse_dict(yaml.safe_load(Path(yaml_file).read_text()), allow_extra_keys=allow_extra_keys)
File "/root/.local/lib/python3.9/site-packages/transformers/hf_argparser.py", line 377, in parse_dict
raise ValueError(f"Some keys are not used by the HfArgumentParser: {sorted(unused_keys)}")
ValueError: Some keys are not used by the HfArgumentParser: ['visual_inputs']
Traceback (most recent call last):
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in <module>
launch()
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_exp()
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/train/tuner.py", line 45, in run_exp
model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 162, in get_train_args
model_args, data_args, training_args, finetuning_args, generating_args = _parse_train_args(args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 148, in _parse_train_args
return _parse_args(parser, args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 61, in _parse_args
return parser.parse_yaml_file(os.path.abspath(sys.argv[1]))
File "/root/.local/lib/python3.9/site-packages/transformers/hf_argparser.py", line 423, in parse_yaml_file
outputs = self.parse_dict(yaml.safe_load(Path(yaml_file).read_text()), allow_extra_keys=allow_extra_keys)
File "/root/.local/lib/python3.9/site-packages/transformers/hf_argparser.py", line 377, in parse_dict
raise ValueError(f"Some keys are not used by the HfArgumentParser: {sorted(unused_keys)}")
ValueError: Some keys are not used by the HfArgumentParser: ['visual_inputs']
Traceback (most recent call last):
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in <module>
launch()
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_exp()
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/train/tuner.py", line 45, in run_exp
model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 162, in get_train_args
model_args, data_args, training_args, finetuning_args, generating_args = _parse_train_args(args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 148, in _parse_train_args
return _parse_args(parser, args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 61, in _parse_args
return parser.parse_yaml_file(os.path.abspath(sys.argv[1]))
File "/root/.local/lib/python3.9/site-packages/transformers/hf_argparser.py", line 423, in parse_yaml_file
outputs = self.parse_dict(yaml.safe_load(Path(yaml_file).read_text()), allow_extra_keys=allow_extra_keys)
File "/root/.local/lib/python3.9/site-packages/transformers/hf_argparser.py", line 377, in parse_dict
raise ValueError(f"Some keys are not used by the HfArgumentParser: {sorted(unused_keys)}")
ValueError: Some keys are not used by the HfArgumentParser: ['visual_inputs']
Traceback (most recent call last):
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in <module>
launch()
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_exp()
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/train/tuner.py", line 45, in run_exp
model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 162, in get_train_args
model_args, data_args, training_args, finetuning_args, generating_args = _parse_train_args(args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 148, in _parse_train_args
return _parse_args(parser, args)
File "/root/dataDisk/LLaMA-Factory/src/llamafactory/hparams/parser.py", line 61, in _parse_args
return parser.parse_yaml_file(os.path.abspath(sys.argv[1]))
File "/root/.local/lib/python3.9/site-packages/transformers/hf_argparser.py", line 423, in parse_yaml_file
outputs = self.parse_dict(yaml.safe_load(Path(yaml_file).read_text()), allow_extra_keys=allow_extra_keys)
File "/root/.local/lib/python3.9/site-packages/transformers/hf_argparser.py", line 377, in parse_dict
raise ValueError(f"Some keys are not used by the HfArgumentParser: {sorted(unused_keys)}")
ValueError: Some keys are not used by the HfArgumentParser: ['visual_inputs']
[2024-09-21 10:33:08,232] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 1816) of binary: /opt/conda/envs/pytorch/bin/python
Traceback (most recent call last):
File "/opt/conda/envs/pytorch/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==2.1.2', 'console_scripts', 'torchrun')())
File "/opt/conda/envs/pytorch/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/opt/conda/envs/pytorch/lib/python3.9/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/opt/conda/envs/pytorch/lib/python3.9/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/opt/conda/envs/pytorch/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/envs/pytorch/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
/root/dataDisk/LLaMA-Factory/src/llamafactory/launcher.py FAILED
------------------------------------------------------------
Failures:
[1]:
time : 2024-09-21_10:33:08
host : notebook-254a295b-9350-4e5e-9778-95c19feddabb-0.notebook-254a295b-9350-4e5e-9778-95c19feddabb.colossal-ai.svc.cluster.local
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 1817)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2024-09-21_10:33:08
host : notebook-254a295b-9350-4e5e-9778-95c19feddabb-0.notebook-254a295b-9350-4e5e-9778-95c19feddabb.colossal-ai.svc.cluster.local
rank : 2 (local_rank: 2)
exitcode : 1 (pid: 1818)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2024-09-21_10:33:08
host : notebook-254a295b-9350-4e5e-9778-95c19feddabb-0.notebook-254a295b-9350-4e5e-9778-95c19feddabb.colossal-ai.svc.cluster.local
rank : 3 (local_rank: 3)
exitcode : 1 (pid: 1819)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2024-09-21_10:33:08
host : notebook-254a295b-9350-4e5e-9778-95c19feddabb-0.notebook-254a295b-9350-4e5e-9778-95c19feddabb.colossal-ai.svc.cluster.local
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 1816)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
Expected behavior
No response
Others
这个参数我以前用来微调 llava 的图片摘要能力,现在不能用了,请问我要怎么做呢?
No response
Reminder
System Info
Reproduction
配置信息:
报错:
Expected behavior
No response
Others
这个参数我以前用来微调 llava 的图片摘要能力,现在不能用了,请问我要怎么做呢?
No response