Closed hhtao closed 5 months ago
我觉得关键信息应该是这个
ValueError: Target modules {'q_proj', 'v_proj'} not found in the base model. Please check the target modules and try again.
不知道这个错误是什么原因引起的,该怎么解决?
--lora_target query_key_value
--lora_target query_key_value
我不知道这个参数改成什么合适。我参考了一些视频教程,把这个参数改成了 --lora_target all,程序可以跑通了。大神能不能解读一下 --lora_target q_proj,v_proj 这两个参数的意义?
Reminder
Reproduction
#!/bin/bash
pip install "transformers>=4.39.1" pip install "accelerate>=0.28.0" pip install "bitsandbytes>=0.43.0"
CUDA_VISIBLE_DEVICES=0,1 accelerate launch \ --config_file /mnt/part1/LLaMA-Factory/examples/accelerate/fsdp_config.yaml \ /mnt/part1/LLaMA-Factory/src/train_bash.py \ --stage sft \ --do_train \ --model_name_or_path /mnt/part2/chatglm3-6b \ --dataset DISC-Law-SFT \ --dataset_dir /mnt/part1/LLaMA-Factory/data \ --template default \ --finetuning_type lora \ --lora_target q_proj,v_proj \ --output_dir /mnt/part1/LLaMA-Factory/saves/chatglm3/lora/sft \ --overwrite_cache \ --overwrite_output_dir \ --cutoff_len 1024 \ --preprocessing_num_workers 16 \ --per_device_train_batch_size 1 \ --per_device_eval_batch_size 1 \ --gradient_accumulation_steps 4 \ --lr_scheduler_type cosine \ --logging_steps 10 \ --warmup_steps 20 \ --save_steps 200 \ --eval_steps 200 \ --evaluation_strategy steps \ --load_best_model_at_end \ --learning_rate 3e-3 \ --num_train_epochs 10.0 \ --max_samples 5000 \ --val_size 0.1 \ --ddp_timeout 180000000 \ --quantization_bit 4 \ --plot_loss \ --fp16
Expected behavior
[INFO|modeling_utils.py:4032] 2024-04-06 13:08:15,667 >> All the weights of ChatGLMForConditionalGeneration were initialized from the model checkpoint at /mnt/part2/chatglm3-6b. If your task is similar to the task the model of the checkpoint was trained on, you can already use ChatGLMForConditionalGeneration for predictions without further training. [INFO|modeling_utils.py:3573] 2024-04-06 13:08:15,673 >> Generation config file not found, using a generation config created from the model config. 04/06/2024 13:08:15 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled. 04/06/2024 13:08:15 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA Traceback (most recent call last): File "/mnt/part1/LLaMA-Factory/src/train_bash.py", line 14, in
main()
File "/mnt/part1/LLaMA-Factory/src/train_bash.py", line 5, in main
run_exp()
File "/mnt/part1/LLaMA-Factory/src/llmtuner/train/tuner.py", line 33, in run_exp
run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "/mnt/part1/LLaMA-Factory/src/llmtuner/train/sft/workflow.py", line 33, in run_sft
model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
File "/mnt/part1/LLaMA-Factory/src/llmtuner/model/loader.py", line 93, in load_model
model = init_adapter(model, model_args, finetuning_args, is_trainable)
File "/mnt/part1/LLaMA-Factory/src/llmtuner/model/adapter.py", line 157, in init_adapter
model = get_peft_model(model, lora_config)
File "/root/anaconda3/envs/llm/lib/python3.10/site-packages/peft/mapping.py", line 136, in get_peft_model
return MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type](model, peft_config, adapter_name=adapter_name)
File "/root/anaconda3/envs/llm/lib/python3.10/site-packages/peft/peft_model.py", line 1094, in init
super().init(model, peft_config, adapter_name)
File "/root/anaconda3/envs/llm/lib/python3.10/site-packages/peft/peft_model.py", line 129, in init
self.base_model = cls(model, {adapter_name: peft_config}, adapter_name)
File "/root/anaconda3/envs/llm/lib/python3.10/site-packages/peft/tuners/lora/model.py", line 136, in init
super().init(model, config, adapter_name)
File "/root/anaconda3/envs/llm/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 148, in init
self.inject_adapter(self.model, adapter_name)
File "/root/anaconda3/envs/llm/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 328, in inject_adapter
raise ValueError(
ValueError: Target modules {'v_proj', 'q_proj'} not found in the base model. Please check the target modules and try again.
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████| 7/7 [00:06<00:00, 1.05it/s]
04/06/2024 13:08:18 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
04/06/2024 13:08:18 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA
Traceback (most recent call last):
File "/mnt/part1/LLaMA-Factory/src/train_bash.py", line 14, in
main()
File "/mnt/part1/LLaMA-Factory/src/train_bash.py", line 5, in main
run_exp()
File "/mnt/part1/LLaMA-Factory/src/llmtuner/train/tuner.py", line 33, in run_exp
run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "/mnt/part1/LLaMA-Factory/src/llmtuner/train/sft/workflow.py", line 33, in run_sft
model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
File "/mnt/part1/LLaMA-Factory/src/llmtuner/model/loader.py", line 93, in load_model
model = init_adapter(model, model_args, finetuning_args, is_trainable)
File "/mnt/part1/LLaMA-Factory/src/llmtuner/model/adapter.py", line 157, in init_adapter
model = get_peft_model(model, lora_config)
File "/root/anaconda3/envs/llm/lib/python3.10/site-packages/peft/mapping.py", line 136, in get_peft_model
return MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type](model, peft_config, adapter_name=adapter_name)
File "/root/anaconda3/envs/llm/lib/python3.10/site-packages/peft/peft_model.py", line 1094, in init
super().init(model, peft_config, adapter_name)
File "/root/anaconda3/envs/llm/lib/python3.10/site-packages/peft/peft_model.py", line 129, in init
self.base_model = cls(model, {adapter_name: peft_config}, adapter_name)
File "/root/anaconda3/envs/llm/lib/python3.10/site-packages/peft/tuners/lora/model.py", line 136, in init
super().init(model, config, adapter_name)
File "/root/anaconda3/envs/llm/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 148, in init
self.inject_adapter(self.model, adapter_name)
File "/root/anaconda3/envs/llm/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 328, in inject_adapter
raise ValueError(
ValueError: Target modules {'v_proj', 'q_proj'} not found in the base model. Please check the target modules and try again.
[2024-04-06 13:08:21,065] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 216659) of binary: /root/anaconda3/envs/llm/bin/python
Traceback (most recent call last):
File "/root/anaconda3/envs/llm/bin/accelerate", line 8, in
sys.exit(main())
File "/root/anaconda3/envs/llm/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 46, in main
args.func(args)
File "/root/anaconda3/envs/llm/lib/python3.10/site-packages/accelerate/commands/launch.py", line 1044, in launch_command
multi_gpu_launcher(args)
File "/root/anaconda3/envs/llm/lib/python3.10/site-packages/accelerate/commands/launch.py", line 702, in multi_gpu_launcher
distrib_run.run(args)
File "/root/anaconda3/envs/llm/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run
elastic_launch(
File "/root/anaconda3/envs/llm/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/root/anaconda3/envs/llm/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
/mnt/part1/LLaMA-Factory/src/train_bash.py FAILED
Failures: [1]: time : 2024-04-06_13:08:21 host : hht3090 rank : 1 (local_rank: 1) exitcode : 1 (pid: 216660) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Root Cause (first observed failure): [0]: time : 2024-04-06_13:08:21 host : hht3090 rank : 0 (local_rank: 0) exitcode : 1 (pid: 216659) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
System Info
Ubuntu20.0 3090*2(24G)
Others
No response