The following error is raised when running MoD (llama3-8b) with DeepSpeed:
`Traceback (most recent call last):
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in
launch()
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_exp()
File "/home/code/LLaMA-Factory/src/llamafactory/train/tuner.py", line 50, in run_exp
run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "/home/code/LLaMA-Factory/src/llamafactory/train/sft/workflow.py", line 89, in run_sft
train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 1932, in train
Traceback (most recent call last):
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in
Traceback (most recent call last):
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in
launch()
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_exp()
File "/home/code/LLaMA-Factory/src/llamafactory/train/tuner.py", line 50, in run_exp
launch()
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "/home/code/LLaMA-Factory/src/llamafactory/train/sft/workflow.py", line 89, in run_sft
run_exp()
File "/home/code/LLaMA-Factory/src/llamafactory/train/tuner.py", line 50, in run_exp
return inner_training_loop(train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks) File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 2268, in _inner_training_loop
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 1932, in train
File "/home/code/LLaMA-Factory/src/llamafactory/train/sft/workflow.py", line 89, in run_sft
train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 1932, in train
tr_loss_step = self.training_step(model, inputs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 3324, in training_step
return inner_training_loop(
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 2268, in _inner_training_loop
return inner_training_loop(
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 2268, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 3324, in training_step
tr_loss_step = self.training_step(model, inputs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 3324, in training_step
self.accelerator.backward(loss, kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2126, in backward
self.deepspeed_engine_wrapped.backward(loss, kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 175, in backward
self.engine.step()
self.accelerator.backward(loss, **kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2160, in step
File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2126, in backward
self.accelerator.backward(loss, kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2126, in backward
self._take_model_step(lr_kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2066, in _take_model_step
self.deepspeed_engine_wrapped.backward(loss, kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 175, in backward
self.deepspeed_engine_wrapped.backward(loss, kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 175, in backward
self.engine.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2160, in step
self.engine.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2160, in step
self.optimizer.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, *kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2050, in step
self._take_model_step(lr_kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2066, in _take_model_step
self._take_model_step(lr_kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2066, in _take_model_step
self._optimizer_step(sub_group_id)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 947, in _optimizer_step
self.optimizer.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
self.optimizer.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(args, kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2050, in step
ret_val = func(*args, kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2050, in step
self.optimizer.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
return wrapped(*args, *kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/optimizer.py", line 373, in wrapper
out = func(args, kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, *kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 266, in step
self._optimizer_step(sub_group_id)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 947, in _optimizer_step
self.init_state(group, p, gindex, pindex)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
self._optimizer_step(sub_group_id)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 947, in _optimizer_step
return func(args, kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 414, in init_state
state["state1"] = self.get_state_buffer(p, dtype=torch.uint8)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 309, in get_state_buffer
self.optimizer.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
self.optimizer.step()
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
buff = F.get_paged(p.shape, dtype=dtype, device=p.device)
return wrapped(args, kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/functional.py", line 171, in get_paged
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/optimizer.py", line 373, in wrapper
return wrapped(*args, kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/optimizer.py", line 373, in wrapper
cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes))
AttributeError: 'NoneType' object has no attribute 'cget_managed_ptr'
out = func(*args, *kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
out = func(args, kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, *kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 266, in step
return func(args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 266, in step
self.init_state(group, p, gindex, pindex)self.init_state(group, p, gindex, pindex)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, *kwargs)return func(args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 414, in init_state
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 414, in init_state
state["state1"] = self.get_state_buffer(p, dtype=torch.uint8)state["state1"] = self.get_state_buffer(p, dtype=torch.uint8)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 309, in get_state_buffer
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 309, in get_state_buffer
buff = F.get_paged(p.shape, dtype=dtype, device=p.device)buff = F.get_paged(p.shape, dtype=dtype, device=p.device)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/functional.py", line 171, in get_paged
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/functional.py", line 171, in get_paged
cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes))cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes))
AttributeError: 'NoneType' object has no attribute 'cget_managed_ptr'
AttributeError: 'NoneType' object has no attribute 'cget_managed_ptr'
0%| | 0/120 [00:29<?, ?it/s]
[2024-07-04 06:19:40,168] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 1412100) of binary: /usr/local/python3.10.2/bin/python3.10
Traceback (most recent call last):
File "/usr/local/python3.10.2/bin/torchrun", line 8, in
sys.exit(main())
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
The following error is raised when running MoD (llama3-8b) with DeepSpeed:
`Traceback (most recent call last): File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in
launch()
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_exp()
File "/home/code/LLaMA-Factory/src/llamafactory/train/tuner.py", line 50, in run_exp
run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "/home/code/LLaMA-Factory/src/llamafactory/train/sft/workflow.py", line 89, in run_sft
train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 1932, in train
Traceback (most recent call last):
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in
Traceback (most recent call last):
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in
launch()
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_exp()
File "/home/code/LLaMA-Factory/src/llamafactory/train/tuner.py", line 50, in run_exp
launch()
File "/home/code/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch
run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "/home/code/LLaMA-Factory/src/llamafactory/train/sft/workflow.py", line 89, in run_sft
run_exp()
File "/home/code/LLaMA-Factory/src/llamafactory/train/tuner.py", line 50, in run_exp
return inner_training_loop(train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks) File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 2268, in _inner_training_loop
File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 1932, in train File "/home/code/LLaMA-Factory/src/llamafactory/train/sft/workflow.py", line 89, in run_sft train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 1932, in train tr_loss_step = self.training_step(model, inputs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 3324, in training_step return inner_training_loop( File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 2268, in _inner_training_loop return inner_training_loop( File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 2268, in _inner_training_loop tr_loss_step = self.training_step(model, inputs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 3324, in training_step tr_loss_step = self.training_step(model, inputs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/transformers/trainer.py", line 3324, in training_step self.accelerator.backward(loss, kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2126, in backward self.deepspeed_engine_wrapped.backward(loss, kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 175, in backward self.engine.step() self.accelerator.backward(loss, **kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2160, in step
File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2126, in backward self.accelerator.backward(loss, kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/accelerator.py", line 2126, in backward self._take_model_step(lr_kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2066, in _take_model_step self.deepspeed_engine_wrapped.backward(loss, kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 175, in backward self.deepspeed_engine_wrapped.backward(loss, kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 175, in backward self.engine.step() File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2160, in step self.engine.step() File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2160, in step self.optimizer.step() File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn ret_val = func(*args, *kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2050, in step self._take_model_step(lr_kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2066, in _take_model_step self._take_model_step(lr_kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2066, in _take_model_step self._optimizer_step(sub_group_id) File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 947, in _optimizer_step self.optimizer.step() File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn self.optimizer.step() File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn 
ret_val = func(args, kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2050, in step ret_val = func(*args, kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 2050, in step self.optimizer.step() File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper return wrapped(*args, *kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/optimizer.py", line 373, in wrapper out = func(args, kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context return func(*args, *kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 266, in step self._optimizer_step(sub_group_id) File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 947, in _optimizer_step self.init_state(group, p, gindex, pindex) File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context self._optimizer_step(sub_group_id) File "/usr/local/python3.10.2/lib/python3.10/site-packages/deepspeed/runtime/zero/stage3.py", line 947, in _optimizer_step return func(args, kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 414, in init_state state["state1"] = self.get_state_buffer(p, dtype=torch.uint8) File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 309, in get_state_buffer self.optimizer.step() File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper self.optimizer.step() File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper buff = F.get_paged(p.shape, dtype=dtype, device=p.device)
return wrapped(args, kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/functional.py", line 171, in get_paged
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/optimizer.py", line 373, in wrapper return wrapped(*args, kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/optim/optimizer.py", line 373, in wrapper cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes)) AttributeError: 'NoneType' object has no attribute 'cget_managed_ptr' out = func(*args, *kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context out = func(args, kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context return func(*args, *kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 266, in step return func(args, **kwargs) File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 266, in step self.init_state(group, p, gindex, pindex)self.init_state(group, p, gindex, pindex)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context return func(*args, *kwargs)return func(args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 414, in init_state File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 414, in init_state state["state1"] = self.get_state_buffer(p, dtype=torch.uint8)state["state1"] = self.get_state_buffer(p, dtype=torch.uint8)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 309, in get_state_buffer File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py", line 309, in get_state_buffer buff = F.get_paged(p.shape, dtype=dtype, device=p.device)buff = F.get_paged(p.shape, dtype=dtype, device=p.device)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/functional.py", line 171, in get_paged File "/usr/local/python3.10.2/lib/python3.10/site-packages/bitsandbytes/functional.py", line 171, in get_paged cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes))cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes))
AttributeError: 'NoneType' object has no attribute 'cget_managed_ptr'
AttributeError: 'NoneType' object has no attribute 'cget_managed_ptr'
0%| | 0/120 [00:29<?, ?it/s] [2024-07-04 06:19:40,168] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 1412100) of binary: /usr/local/python3.10.2/bin/python3.10 Traceback (most recent call last): File "/usr/local/python3.10.2/bin/torchrun", line 8, in
sys.exit(main())
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/usr/local/python3.10.2/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
/home/code/LLaMA-Factory/src/llamafactory/launcher.py FAILED
Failures: [1]: time : 2024-07-04_06:19:40 host : localhost.localdomain rank : 1 (local_rank: 1) exitcode : 1 (pid: 1412101) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html [2]: time : 2024-07-04_06:19:40 host : localhost.localdomain rank : 2 (local_rank: 2) exitcode : 1 (pid: 1412102) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Root Cause (first observed failure): [0]: time : 2024-07-04_06:19:40 host : localhost.localdomain rank : 0 (local_rank: 0) exitcode : 1 (pid: 1412100) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html`