Closed: tanghui315 closed this issue 4 months ago.
It should have been fixed.
@hiyouga I get the same error when fine-tuning the glm-4-9b model with the latest code:
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/torch/distributed/fsdp/_flat_param.py", line 573, in __init__
self._init_flat_param_and_metadata(
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/torch/distributed/fsdp/_flat_param.py", line 623, in _init_flat_param_and_metadata
) = self._validate_tensors_to_flatten(params)
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/torch/distributed/fsdp/_flat_param.py", line 761, in _validate_tensors_to_flatten
raise ValueError(
ValueError: Must flatten tensors with uniform dtype but got torch.bfloat16 and torch.float16
Traceback (most recent call last):
File "/data/disk2/ybZhang/LLaMA-Factory/src/train.py", line 28, in <module>
main()
File "/data/disk2/ybZhang/LLaMA-Factory/src/train.py", line 19, in main
run_exp()
File "/data/disk2/ybZhang/LLaMA-Factory/src/llamafactory/train/tuner.py", line 50, in run_exp
run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "/data/disk2/ybZhang/LLaMA-Factory/src/llamafactory/train/sft/workflow.py", line 94, in run_sft
train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/transformers/trainer.py", line 1885, in train
return inner_training_loop(
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/transformers/trainer.py", line 2032, in _inner_training_loop
self.model = self.accelerator.prepare(self.model)
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/accelerate/accelerator.py", line 1292, in prepare
result = tuple(
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/accelerate/accelerator.py", line 1293, in <genexpr>
self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/accelerate/accelerator.py", line 1169, in _prepare_one
return self.prepare_model(obj, device_placement=device_placement)
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/accelerate/accelerator.py", line 1459, in prepare_model
model = FSDP(model, **kwargs)
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 477, in __init__
_auto_wrap(
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/torch/distributed/fsdp/_wrap_utils.py", line 101, in _auto_wrap
_recursive_wrap(**recursive_wrap_kwargs, **root_kwargs) # type: ignore[arg-type]
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/torch/distributed/fsdp/wrap.py", line 543, in _recursive_wrap
wrapped_child, num_wrapped_params = _recursive_wrap(
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/torch/distributed/fsdp/wrap.py", line 543, in _recursive_wrap
wrapped_child, num_wrapped_params = _recursive_wrap(
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/torch/distributed/fsdp/wrap.py", line 543, in _recursive_wrap
wrapped_child, num_wrapped_params = _recursive_wrap(
[Previous line repeated 3 more times]
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/torch/distributed/fsdp/wrap.py", line 561, in _recursive_wrap
return _wrap(module, wrapper_cls, **kwargs), nonwrapped_numel
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/torch/distributed/fsdp/wrap.py", line 490, in _wrap
return wrapper_cls(module, **kwargs)
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 503, in __init__
_init_param_handle_from_module(
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/torch/distributed/fsdp/_init_utils.py", line 590, in _init_param_handle_from_module
_init_param_handle_from_params(state, managed_params, fully_sharded_module)
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/torch/distributed/fsdp/_init_utils.py", line 602, in _init_param_handle_from_params
handle = FlatParamHandle(
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/torch/distributed/fsdp/_flat_param.py", line 573, in __init__
self._init_flat_param_and_metadata(
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/torch/distributed/fsdp/_flat_param.py", line 623, in _init_flat_param_and_metadata
) = self._validate_tensors_to_flatten(params)
File "/home/ybZhang/miniconda3/envs/glm-fs/lib/python3.10/site-packages/torch/distributed/fsdp/_flat_param.py", line 761, in _validate_tensors_to_flatten
raise ValueError(
ValueError: Must flatten tensors with uniform dtype but got torch.bfloat16 and torch.float16
[2024-08-22 15:12:04,849] torch.distributed.elastic.multiprocessing.api: [ERROR] f
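For context, this ValueError means FSDP found more than one dtype among the parameters it tried to flatten into a single FlatParameter; in a QLoRA run this usually means the quantized weights are stored in a different dtype (here torch.float16) than the unquantized bf16 weights. A quick way to see which parameters are off is to group them by dtype. The sketch below assumes `model` is the already-loaded model from the failing run; `dtype_report` is a hypothetical helper, not part of LLaMA-Factory or torch:

```python
from collections import defaultdict

import torch


def dtype_report(model: torch.nn.Module) -> None:
    """Print how many parameters the model holds in each dtype."""
    groups = defaultdict(list)
    for name, param in model.named_parameters():
        groups[param.dtype].append(name)
    for dtype, names in groups.items():
        # A healthy FSDP setup should show a single dtype here.
        print(f"{dtype}: {len(names)} parameters, e.g. {names[:3]}")
```

Running this on the model right before `accelerator.prepare` should show both a torch.bfloat16 and a torch.float16 group if the traceback above is accurate, and the parameter names will point at the offending modules.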
Reproduction
I pulled the latest code and ran Accelerate + FSDP + QLoRA training, but encountered an error: ValueError: Must flatten tensors with uniform dtype but got torch.bfloat16 and torch.float16
The old code works fine with the same setup, so I suspect the new code introduced a bug.
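For reference, a minimal sketch of the FSDP + QLoRA loading recipe that avoids this mixed-dtype error, assuming a recent transformers and bitsandbytes >= 0.43.0; this is illustrative, not the exact LLaMA-Factory code path. The key is `bnb_4bit_quant_storage`: the packed 4-bit weights must be stored in the same dtype as the rest of the model so that FSDP only ever flattens tensors of one dtype.

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    # Store the packed 4-bit weights as bf16; a storage dtype that differs
    # from the model dtype triggers the uniform-dtype ValueError above.
    bnb_4bit_quant_storage=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    "THUDM/glm-4-9b",            # model name taken from the report above
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,  # load the non-quantized weights in bf16 too
    trust_remote_code=True,      # GLM-4 repos ship custom modeling code
)
```

If the regression is in how the new code sets (or stops setting) the quant storage dtype, comparing the BitsAndBytesConfig built by the old and new versions would confirm it.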
Expected behavior
No response
System Info
No response
Others
No response