1. With
--model_name_or_path '/root/.cache/modelscope/hub/ZJUNLP/OneKE' \
--checkpoint_dir '/root/kg-llm/oneke/DeepKE/example/llm/InstructKGC/checkpoint/llama2-13b-IEPile-lora' \
there is no need to also pass the --checkpoint_dir parameter here; remove it.
2. Add --bits 4 to quantize the model (a sketch of the adjusted command follows).
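For illustration, a minimal sketch of the adjusted invocation (an editor's sketch, assuming your other training flags are appended unchanged):

CUDA_VISIBLE_DEVICES="0,1,2" torchrun --nproc_per_node=3 --master_port=1287 src/finetune.py \
    --model_name_or_path '/root/.cache/modelscope/hub/ZJUNLP/OneKE' \
    --stage 'sft' \
    --model_name 'llama' \
    --template 'llama2_zh' \
    --train_file 'data/train.json' \
    --output_dir 'oneke-continue' \
    --bits 4
    # --checkpoint_dir is omitted, per point 1 above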
Thanks for the reply. 1. After removing the --checkpoint_dir parameter, the same error occurs:
Traceback (most recent call last):
File "/root/kg-llm/oneke/DeepKE/example/llm/InstructKGC/src/finetune.py", line 116, in <module>
main()
File "/root/kg-llm/oneke/DeepKE/example/llm/InstructKGC/src/finetune.py", line 111, in main
train(model_args, data_args, training_args, finetuning_args, generating_args)
File "/root/kg-llm/oneke/DeepKE/example/llm/InstructKGC/src/finetune.py", line 66, in train
trainer = trainer_class(
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/transformers/trainer.py", line 528, in __init__
self._move_model_to_device(model, args.device)
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/transformers/trainer.py", line 775, in _move_model_to_device
model = model.to(device)
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1145, in to
return self._apply(convert)
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 797, in _apply
module._apply(fn)
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 797, in _apply
module._apply(fn)
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 797, in _apply
module._apply(fn)
[Previous line repeated 5 more times]
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 820, in _apply
param_applied = fn(param)
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1143, in convert
return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 136.00 MiB (GPU 1; 23.64 GiB total capacity; 23.07 GiB already allocated; 54.69 MiB free; 23.07 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
2. Adding --bits 4 produces an error:
06/03/2024 11:33:07 - INFO - model.loader - Quantizing model to 4 bit.
Traceback (most recent call last):
ValueError: You can't pass `load_in_4bit`or `load_in_8bit` as a kwarg when passing `quantization_config` argument at the same time.
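(Editor's note: in transformers, from_pretrained accepts either the load_in_4bit shortcut kwarg or an explicit quantization_config, never both at once, which is what this ValueError guards against. A minimal sketch of the non-conflicting pattern, with an illustrative compute dtype:)

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Route all 4-bit settings through quantization_config; passing
# load_in_4bit=True as a separate kwarg on top of this triggers the
# ValueError above.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # illustrative choice
)
model = AutoModelForCausalLM.from_pretrained(
    "/root/.cache/modelscope/hub/ZJUNLP/OneKE",
    quantization_config=bnb_config,
    device_map="auto",
)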
My current command is as follows:
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:1024
CUDA_VISIBLE_DEVICES="0,1,2" torchrun --nproc_per_node=3 --master_port=1287 src/finetune.py \
--do_train --do_eval \
--overwrite_output_dir \
--model_name_or_path '/root/.cache/modelscope/hub/ZJUNLP/OneKE' \
--stage 'sft' \
--model_name 'llama' \
--template 'llama2_zh' \
--train_file 'data/train.json' \
--output_dir='/root/kg-llm/oneke/DeepKE/example/llm/InstructKGC/oneke-continue' \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 16 \
--preprocessing_num_workers 2 \
--num_train_epochs 5 \
--learning_rate 5e-5 \
--max_grad_norm 0.5 \
--optim "adamw_torch" \
--max_source_length 10 \
--cutoff_len 10 \
--max_target_length 5 \
--evaluation_strategy "epoch" \
--save_strategy "epoch" \
--save_total_limit 2 \
--lora_r 64 \
--lora_alpha 64 \
--max_length 256 \
--max_new_tokens 512 \
--lora_dropout 0.05 \
--bf16 \
--bits 4
Please use the latest version of DeepKE.
I have already done a git pull to the latest version; same problem.
Sorry, there was a bug in the code; we have updated it. Please pull the latest DeepKE.
Thanks for the fix, but there is still a problem:
RuntimeError: "triu_tril_cuda_template" not implemented for 'BFloat16'
Traceback (most recent call last):
File "/root/kg-llm/oneke/DeepKE/example/llm/InstructKGC/src/finetune.py", line 116, in <module>
main()
File "/root/kg-llm/oneke/DeepKE/example/llm/InstructKGC/src/finetune.py", line 111, in main
train(model_args, data_args, training_args, finetuning_args, generating_args)
File "/root/kg-llm/oneke/DeepKE/example/llm/InstructKGC/src/finetune.py", line 80, in train
train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/transformers/trainer.py", line 1885, in train
return inner_training_loop(
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/transformers/trainer.py", line 2216, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/transformers/trainer.py", line 3238, in training_step
loss = self.compute_loss(model, inputs)
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/transformers/trainer.py", line 3264, in compute_loss
outputs = model(**inputs)
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1156, in forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1110, in _run_ddp_forward
return module_to_run(*inputs[0], **kwargs[0]) # type: ignore[index]
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/peft/peft_model.py", line 1430, in forward
return self.base_model(
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/peft/tuners/tuners_utils.py", line 179, in forward
return self.model.forward(*args, **kwargs)
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 1164, in forward
outputs = self.model(
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 940, in forward
causal_mask = self._update_causal_mask(
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 1061, in _update_causal_mask
causal_mask = torch.triu(causal_mask, diagonal=1)
RuntimeError: "triu_tril_cuda_template" not implemented for 'BFloat16'
Downgrade the transformers version to 4.33.0.
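(Editor's note: the crash comes from torch's CUDA triu/tril kernel, which has no BFloat16 specialization before torch 2.1; transformers 4.33 builds the llama causal mask without this torch.triu call, which is why downgrading helps. A minimal repro, assuming torch 2.0.x on a CUDA device:)

import torch

# On torch 2.0.x this line raises:
#   RuntimeError: "triu_tril_cuda_template" not implemented for 'BFloat16'
# torch >= 2.1 added the BFloat16 kernel, so upgrading torch is an alternative fix.
mask = torch.full((4, 4), float("-inf"), dtype=torch.bfloat16, device="cuda")
print(torch.triu(mask, diagonal=1))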
Thanks, I have downgraded to 4.33.0.
pip install -U transformers==4.33.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
Thanks, it runs now.
I have already set the parameters very small, but it still reports running out of memory:
File "/root/anaconda3/envs/deepke-llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1143, in convert return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking) torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 2; 23.64 GiB total capacity; 23.08 GiB already allocated; 704.00 KiB free; 23.12 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 25598) of binary: /root/anaconda3/envs/deepke-llm/bin/python Traceback (most recent call last): File "/root/anaconda3/envs/deepke-llm/bin/torchrun", line 8, in
sys.exit(main())
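(Editor's note, a rough back-of-the-envelope assuming the llama2-13b-based OneKE weights are loaded unquantized in bf16 on every DDP rank: 13e9 parameters × 2 bytes ≈ 26 GB of weights per GPU, which already exceeds a 23.64 GiB card before any activations, gradients, or optimizer state are allocated, so shrinking the batch size and sequence lengths alone cannot avoid the OOM. With --bits 4, the resident weights drop to roughly 13e9 × 0.5 bytes ≈ 6.5 GB.)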
Thanks!