tamanna-mostafa closed this issue 8 months ago.
It appears that the "gate_proj", "up_proj", and "down_proj" layers are causing trouble in your case, whereas the other layers work. This is strange, since these layers seem to be normal nn.Linear layers. When you first create the PEFT model (after calling get_peft_model), could you please inspect whether these layers are initialized correctly? In particular, their lora_A.default.weight and lora_B.default.weight should have the correct shape. If they do, could you please do the same check after training (before saving) and then after loading? It would be interesting to figure out at what point those weights are seemingly reduced to size 0.
One area where this could happen is when training with DeepSpeed, but it's not obvious to me why that would only affect those weights.
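A minimal sketch of that check (here, peft_model stands for whatever PEFT-wrapped model you end up with; adjust the name to your setup):

# Print the shape of every LoRA A/B matrix; anything showing torch.Size([0])
# is a weight that has effectively been reduced to size 0.
for name, param in peft_model.named_parameters():
    if "lora_A" in name or "lora_B" in name:
        print(name, tuple(param.shape))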
@BenjaminBossan thanks for your reply.

Regarding get_peft_model: you mean when I train the DPO model? (The base SFT model was not a PEFT model.) I didn't use the get_peft_model function. I used the script below to train the DPO model, in which I hard-coded the target modules.

Regarding "could you please inspect if these layers are initialized correctly?": how can I do this check?
# 0. imports
import json
import os
import re
from dataclasses import dataclass, field
from typing import Dict, Optional

import torch
from datasets import Dataset, load_dataset, load_from_disk
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, TrainingArguments
from trl import DPOTrainer
from peft import AutoPeftModelForCausalLM, LoraConfig
# Define and parse arguments.
@dataclass
class ScriptArguments:
    """
    The arguments for the DPO training script.
    """

    # data parameters
    beta: Optional[float] = field(default=0.1, metadata={"help": "the beta parameter for DPO loss"})

    # training parameters
    model_name_or_path: Optional[str] = field(
        default="../sft/results/final_checkpoint",
        metadata={"help": "the location of the SFT model name or path"},
    )
    data_path: Optional[str] = field(
        default="./anthropic_helpful_model_lamma7b-oasst-topk-best_vs_worst_DPO.json",
        metadata={"help": "the location of the data"},
    )
    use_lamma2_peft_config: Optional[bool] = field(
        default=True, metadata={"help": "Is Lamma2 model?"}
    )
    learning_rate: Optional[float] = field(default=5e-4, metadata={"help": "optimizer learning rate"})
    lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "the lr scheduler type"})
    warmup_steps: Optional[int] = field(default=100, metadata={"help": "the number of warmup steps"})
    weight_decay: Optional[float] = field(default=0.05, metadata={"help": "the weight decay"})
    optimizer_type: Optional[str] = field(default="paged_adamw_32bit", metadata={"help": "the optimizer type"})
    per_device_train_batch_size: Optional[int] = field(default=4, metadata={"help": "train batch size per device"})
    per_device_eval_batch_size: Optional[int] = field(default=1, metadata={"help": "eval batch size per device"})
    gradient_accumulation_steps: Optional[int] = field(
        default=4, metadata={"help": "the number of gradient accumulation steps"}
    )
    gradient_checkpointing: Optional[bool] = field(
        default=True, metadata={"help": "whether to use gradient checkpointing"}
    )
    lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"})
    lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"})
    lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"})
    num_train_epochs: Optional[int] = field(default=2, metadata={"help": "num epochs"})
    max_prompt_length: Optional[int] = field(default=512, metadata={"help": "the maximum prompt length"})
    max_length: Optional[int] = field(default=1024, metadata={"help": "the maximum sequence length"})
    max_steps: Optional[int] = field(default=1000, metadata={"help": "max number of training steps"})
    logging_steps: Optional[int] = field(default=10, metadata={"help": "the logging frequency"})
    save_steps: Optional[int] = field(default=100, metadata={"help": "the saving frequency"})
    eval_steps: Optional[int] = field(default=100, metadata={"help": "the evaluation frequency"})
    save_total_limit: Optional[int] = field(default=4, metadata={"help": "the maximum number of checkpoints to keep"})
    output_dir: Optional[str] = field(default="./results", metadata={"help": "the output directory"})
    log_freq: Optional[int] = field(default=1, metadata={"help": "the logging frequency"})
    # max_length: Optional[int] = field(default=512, metadata={"help": "max length of each sample"})
    # max_prompt_length: Optional[int] = field(default=128, metadata={"help": "max length of each sample's prompt"})
    max_target_length: Optional[int] = field(
        default=128, metadata={"help": "Only used for encoder decoder model. Max target of each sample's prompt"}
    )

    # instrumentation
    sanity_check: Optional[bool] = field(default=False, metadata={"help": "only train on 1000 samples"})
    report_to: Optional[str] = field(
        default="wandb",
        metadata={
            "help": 'The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,'
            '`"comet_ml"`, `"mlflow"`, `"neptune"`, `"tensorboard"`, `"clearml"` and `"wandb"`. '
            'Use `"all"` to report to all integrations installed, `"none"` for no integrations.'
        },
    )
    # debug argument for distributed training
    ignore_bias_buffers: Optional[bool] = field(
        default=False,
        metadata={
            "help": "fix for DDP issues with LM bias/mask buffers - invalid scalar type, inplace operation. See"
            " https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992"
        },
    )
def find_all_linear_names(model):
    cls = torch.nn.Linear  # if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)


def create_peft_config(modules, r, alpha, dropout):
    """
    Create Parameter-Efficient Fine-Tuning config for your model

    :param modules: Names of the modules to apply Lora to
    """
    config = LoraConfig(
        r=r,  # dimension of the updated matrices
        lora_alpha=alpha,  # parameter for scaling
        target_modules=modules,
        # target_modules=['query_key_value'],
        lora_dropout=dropout,  # dropout probability for layers
        bias="none",
        task_type="CAUSAL_LM",
    )
    return config
if __name__ == "__main__":
    parser = HfArgumentParser(ScriptArguments)
    script_args = parser.parse_args_into_dataclasses()[0]

    # 1. load a pretrained model
    model = AutoModelForCausalLM.from_pretrained(
        script_args.model_name_or_path,
        # low_cpu_mem_usage=True,
        torch_dtype=torch.bfloat16,
        # load_in_4bit=True,
        # load_in_4bit=True,
        trust_remote_code=True,
    )
    # model.config.use_cache = False

    if script_args.ignore_bias_buffers:
        # torch distributed hack
        model._ddp_params_and_buffers_to_ignore = [
            name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
        ]
    model.enable_input_require_grads()

    model_ref = AutoModelForCausalLM.from_pretrained(
        script_args.model_name_or_path,
        torch_dtype=torch.bfloat16,
        # low_cpu_mem_usage=True,
        # load_in_4bit=True,
        # load_in_4bit=True,
        trust_remote_code=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(script_args.model_name_or_path)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    with open(script_args.data_path) as f:
        dataset = Dataset.from_list(json.load(f))
    print(dataset)
    print(dataset[0])
    dataset = dataset.train_test_split(test_size=0.05, shuffle=True, seed=42)
    train_dataset = dataset['train'].filter(
        lambda x: len(x["prompt"]) + len(x["chosen"]) <= script_args.max_length
        and len(x["prompt"]) + len(x["rejected"]) <= script_args.max_length
    )

    # 3. Load evaluation dataset
    eval_dataset = dataset['test'].filter(
        lambda x: len(x["prompt"]) + len(x["chosen"]) <= script_args.max_length
        and len(x["prompt"]) + len(x["rejected"]) <= script_args.max_length
    )
    print(f'length train: {len(train_dataset)}, length eval: {len(eval_dataset)}')
"""
peft_config = LoraConfig(
r=script_args.lora_r,
lora_alpha=script_args.lora_alpha,
lora_dropout=script_args.lora_dropout,
target_modules=[
"dense_4h_to_h",
"dense",
"query_key_value",
"dense_h_to_4h"
],
bias="none",
task_type="CAUSAL_LM",
)
"""
if not script_args.use_lamma2_peft_config:
#modules = find_all_linear_names(model)
modules=[
"k_proj",
"gate_proj",
"v_proj",
"up_proj",
"q_proj",
"o_proj",
"down_proj",
]
#print(modules)
peft_config=create_peft_config(modules,script_args.lora_r,script_args.lora_alpha,script_args.lora_dropout)
print(f"peft_config: {peft_config}")
else:
print("****** USING LAMMA2 PEFT CONFIG ******")
peft_config = LoraConfig(
r=script_args.lora_r,
lora_alpha=script_args.lora_alpha,
lora_dropout=script_args.lora_dropout,
target_modules=[
"q_proj",
"v_proj",
"k_proj",
"out_proj",
"fc_in",
"fc_out",
"wte",
],
bias="none",
task_type="CAUSAL_LM",
)
print(f"peft_config: {peft_config}")
    wandb_run_name = script_args.output_dir.split('/')[-1] + "_" + script_args.data_path.split('/')[-1][:-4]

    # 4. initialize training arguments:
    training_args = TrainingArguments(
        per_device_train_batch_size=script_args.per_device_train_batch_size,
        # max_steps=script_args.max_steps,
        num_train_epochs=script_args.num_train_epochs,
        logging_steps=script_args.logging_steps,
        save_steps=script_args.save_steps,
        gradient_accumulation_steps=script_args.gradient_accumulation_steps,
        save_total_limit=script_args.save_total_limit,
        gradient_checkpointing=script_args.gradient_checkpointing,
        learning_rate=script_args.learning_rate,
        evaluation_strategy="steps",
        eval_steps=script_args.eval_steps,
        output_dir=script_args.output_dir,
        report_to=script_args.report_to,
        lr_scheduler_type=script_args.lr_scheduler_type,
        warmup_steps=script_args.warmup_steps,
        optim=script_args.optimizer_type,
        logging_first_step=True,
        bf16=True,
        remove_unused_columns=False,
        run_name="dpo_" + wandb_run_name,
    )

    # 5. initialize the DPO trainer
    dpo_trainer = DPOTrainer(
        model,
        model_ref,
        args=training_args,
        beta=script_args.beta,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        peft_config=peft_config,
        max_prompt_length=script_args.max_prompt_length,
        max_length=script_args.max_length,
    )

    # 6. train
    dpo_trainer.train()
    dpo_trainer.save_model(script_args.output_dir)

    # 7. save
    output_dir = os.path.join(script_args.output_dir, "final_checkpoint")
    dpo_trainer.model.save_pretrained(output_dir)
Command used to run the script above:
accelerate launch --config_file ./accelerate_configs/ds_zero3.yaml rlhf_dpo.py \
--model_name_or_path="/mnt/efs/data/tammosta/files_t/output_sft_32k" \
--output_dir="/mnt/efs/data/tammosta/files_t/DPO_output_mistral_debug" \
--data_path="/mnt/efs/data/tammosta/files_t/DPO_data_rbs_clean_AIF.json" \
--use_lamma2_peft_config True \
--beta 0.1 \
--optimizer_type adamw_hf \
--learning_rate 1e-6 \
--warmup_steps 50 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 8 \
--lora_alpha 16 \
--lora_dropout 0.05 \
--lora_r 8 \
--max_prompt_length 2048 \
--max_length 4096 \
--num_train_epochs 4 \
--logging_steps 20 \
--save_steps 100 \
--save_total_limit 8 \
--eval_steps 50 \
--gradient_checkpointing True \
--report_to "wandb"
I didn't use the get_peft_model function. I used the script below to train the DPO model, in which I hard-coded the target modules.
I see, so you're delegating that work to trl, which creates the PEFT model for you.
How can I do this check?
I don't know trl really well, maybe @younesbelkada can share some insights.
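One generic way to run that check in this setup might be through the trainer object itself, since trl creates the PEFT model for you (a sketch; dpo_trainer refers to the trainer constructed in the script above):

# After DPOTrainer is constructed with a peft_config, trl wraps the model,
# so dpo_trainer.model should already be a PeftModel that can be inspected:
print(type(dpo_trainer.model))
dpo_trainer.model.print_trainable_parameters()  # summary of LoRA parameter counts

The same lora_A / lora_B shape loop shown earlier can then be run over dpo_trainer.model.named_parameters() before training, before saving, and again after loading.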
If I use the same target modules (see below) that I used for DPO-training the LLaMA-2-7B model to DPO-train the Mistral 7B model, what would the implications be? That way, I don't get this error.
"q_proj",
"v_proj",
"k_proj",
"out_proj",
"fc_in",
"fc_out",
"wte",
This way, you would not apply LoRA to a few layers. Maybe those are not necessary and you can still get good results. I'd definitely try this out.
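If you want to check up front which of those names actually exist in the Mistral model (and therefore which layers would receive a LoRA adapter), a quick inspection could look like this (a sketch; model is the loaded base model):

import torch

# Collect the leaf names of all nn.Linear modules in the base model. Entries of
# target_modules that never show up in this set will not receive a LoRA adapter.
linear_leaf_names = {
    name.split(".")[-1]
    for name, module in model.named_modules()
    if isinstance(module, torch.nn.Linear)
}
print(sorted(linear_leaf_names))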
This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.
Hi, I am facing the same error after training Phi3 with a Lora adapter. I saved the adapter but now am not able to load the model and the adapter again. Did you find a solution to this?
@MaggieK410 Could you provide more details? Are you using DeepSpeed? What is the size of the checkpoints on disk, do they appear to be empty?
Hi Benjamin, thank you for the quick reply. I am using deepspeed with the following config:
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  gradient_accumulation_steps: 1
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: true
  zero3_save_16bit_model: true
  zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
enable_cpu_affinity: false
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 1
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
I am also using quantization:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
I load my model and LoraConfig and prepare everything for training like this:
peft_config = LoraConfig(
    inference_mode=False,
    r=16,
    lora_alpha=64,
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,
    target_modules="all-linear",
)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, trust_remote_code=True)
model = get_peft_model(model, peft_config)
model, train_dataloader, val_dataloader, test_dataloader, optimizer, lr_scheduler = accelerator.prepare(
    model, train_dataloader, val_dataloader, test_dataloader, optimizer, lr_scheduler
)
After training, I save my model like this:
unwrapped_model=accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(save_path, is_main_process=accelerator.is_main_process, save_function=accelerator.save)
I checked the file sizes (with ls -lh): adapter_model.safetensors is 29K and adapter_config.json is 698 bytes. I am not entirely sure how big they should be.
Let me know if I can provide any other information that could be useful :) Thank you very much for your help!
Okay, thanks, so far this looks correct. The adapter file does not appear to be empty, though it is very small. You didn't say exactly what model you used, but when I try microsoft/Phi-3-mini-128k-instruct with the same LoRA config as you mentioned, my checkpoint is much bigger, 97M.
You could try loading the checkpoint directly and check if it contains the expected items and if the values have the right shape:
from safetensors.torch import load_file

sd = load_file("adapter_model.safetensors")
for key, val in sd.items():
    print(key, val.shape)
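For a healthy LoRA checkpoint, you would expect to see pairs of lora_A / lora_B entries with shapes of roughly (r, in_features) and (out_features, r) respectively; empty tensors here would confirm that the weights were already lost when the file was written.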
Hi, thank you for the quick update! Yes, I used microsoft/Phi-3-mini-128k-instruct as my model, but I am running into the same issue with klyang/MentaLLaMA-chat-7B. I checked, and you're right, all the tensors are empty (tensor([], dtype=torch.bfloat16)), which is definitely weird...
I add my training loop here, in case something weird is happening during training:
model=AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, trust_remote_code=True)
model=get_peft_model(model, peft_config)

lr=2e-5
num_epochs=10
optimizer=torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler=get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader)*num_epochs))

model=model.to("cuda")
model, train_dataloader, val_dataloader, test_dataloader, optimizer, lr_scheduler=accelerator.prepare(
    model, train_dataloader, val_dataloader, test_dataloader, optimizer, lr_scheduler
)

pattern=re.compile(r"^\w+: (yes|no)$")
accelerator.print(model)

mismatch=0
pre_train_preds=[]
pre_train_true_labels=[]
full_preds=[]
full_true=[]
for step, batch in enumerate(tqdm(test_dataloader)):
    with torch.no_grad():
        label=[el for el in batch["labels"][0] if el != -100]
        outputs=accelerator.unwrap_model(model).generate(
            input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], max_new_tokens=30
        )
        preds=[
            el
            for el in tokenizer.decode(outputs[0][len(batch["input_ids"][0]):], skip_special_tokens=True).split("\n")
            if pattern.match(el)
        ]
        true_label=tokenizer.decode(
            [el for el in batch["labels"][0] if el != -100], skip_special_tokens=True
        ).strip().split("\n")
        full_preds.append(preds)
        full_true.append(true_label)
        if len(preds) != len(true_label):
            mismatch+=1
        else:
            pre_train_true_labels+=[0 if el.split(": ")[1] == "no" else 1 for el in true_label]
            pre_train_preds+=[0 if el.split(": ")[1] == "no" else 1 for el in preds]

for epoch in range(num_epochs):
    model.train()
    total_loss=0
    for step, batch in enumerate(tqdm(train_dataloader)):
        outputs=model(**batch)
        loss=outputs.loss
        total_loss+=loss.detach().float()
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    pattern=re.compile(r"^\w+ : (yes|no)$")
    model.eval()
    eval_loss=0
    eval_preds=[]
    true_labels=[]
    mismatch=0
    for step, batch in enumerate(tqdm(test_dataloader)):
        with torch.no_grad():
            labels=[el for el in batch["labels"][0] if el != -100]
            outputs=accelerator.unwrap_model(model).generate(
                input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], max_new_tokens=30
            )
            preds=[
                el
                for el in tokenizer.decode(outputs[0][len(batch["input_ids"][0]):], skip_special_tokens=True).strip().split("\n")
                if pattern.match(el)
            ]
            true_label=tokenizer.decode(
                [el for el in batch["labels"][0] if el != -100], skip_special_tokens=True
            ).strip().split("\n")
            if len(preds) != len(true_label):
                mismatch+=1
            else:
                eval_preds+=[0 if el.split(" : ")[1] == "no" else 1 for el in preds]
                true_labels+=[0 if el.split(" : ")[1] == "no" else 1 for el in true_label]

save_path=filename[:-4]+"_checkpoint/"
unwrapped_model=accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(save_path, is_main_process=accelerator.is_main_process, save_function=accelerator.save)
Sorry for the sloppiness of the code; I am in the middle of a research project and haven't cleaned everything up yet. I will look deeper into whether the issue could come from the accelerator saving function or from the model being put on the GPU before training.
Thanks again for all your help!
Something must be wrong when saving the model; probably the parameters are not correctly gathered from the shards, resulting in empty params. I checked the accelerate docs for DeepSpeed, and the saving call there is a little different from what you use, as it also passes state_dict=accelerator.get_state_dict(model). Could you try whether that helps?
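In other words, something along these lines (a sketch based on your snippet above):

# Gather the full, un-sharded state dict from DeepSpeed ZeRO-3 before writing to disk;
# without this, the saved adapter can end up with empty tensors.
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(
    save_path,
    is_main_process=accelerator.is_main_process,
    save_function=accelerator.save,
    state_dict=accelerator.get_state_dict(model),
)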
Perfect, you're totally right, this works now! For anyone else coming across this issue, this is how I reload my adapter now, after saving it the way Benjamin described, which works:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model_name = "microsoft/Phi-3-mini-128k-instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, trust_remote_code=True)
inference_model = PeftModel.from_pretrained(model, "./path_to_checkpoint/")
Thanks again!
Glad that it worked, thanks for reporting back.
Using the above code, I still encountered an error and am seeking help!
from safetensors.torch import load_file

sd = load_file("/content/drive/MyDrive/python-code-llama/checkpoint-20/adapter_model.safetensors")
for key, val in sd.items():
    print(key, val.shape)
SafetensorError: Error while deserializing header: InvalidHeaderDeserialization
#test model loading LORA
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from peft import PeftModel
base_model = "codellama/CodeLlama-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
base_model,
load_in_8bit=True,
torch_dtype=torch.float16,
device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf",trust_remote_code=True)
output_dir = "/content/drive/MyDrive/python-code-llama/checkpoint-20"
model = PeftModel.from_pretrained(model, output_dir)
eval_prompt = """ Below is a task for the coding fix. Complete the task by writing python code, and explaining it.
#Your response must contain only the python code to solve this problem.
### bad_solution:
### Response:
"""
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=500, pad_token_id=tokenizer.eos_token_id)[0], skip_special_tokens=True))
SafetensorError: Error while deserializing header: InvalidHeaderDeserialization. The adapter_model.safetensors file is only 48 bytes.
@BilingIris It looks like your checkpoint file is broken somehow. I can't really help you unless you can show how the file was created. Also, I'm assuming that you're not using DeepSpeed, since it looks like you're using Colab.
System Info
Who can help?
@younesbelkada @sayakpaul
peft_config = LoraConfig(
    r=script_args.lora_r,
    lora_alpha=script_args.lora_alpha,
    lora_dropout=script_args.lora_dropout,
    target_modules=[
        "q_proj",
        "v_proj",
        "k_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "o_proj",
    ],
    bias="none",
    task_type="CAUSAL_LM",
)
accelerate launch --config_file ./accelerate_configs/ds_zero3.yaml rlhf_dpo_working_tammosta.py \
--model_name_or_path="/mnt/efs/data/tammosta/files_t/output_sft_32k" \
--output_dir="/mnt/efs/data/tammosta/files_t/DPO_output_mistral_AIF12k_T32k" \
--data_path="/mnt/efs/data/tammosta/files_t/DPO_data_rbs_clean_AIF.json" \
--use_lamma2_peft_config False \
--beta 0.1 \
--optimizer_type "paged_adamw_32bit" \
--learning_rate 5e-5 \
--warmup_steps 100 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 4 \
--lora_alpha 16 \
--lora_dropout 0.05 \
--lora_r 16 \
--max_prompt_length 2048 \
--max_length 4096 \
--num_train_epochs 4 \
--logging_steps 1 \
--save_steps 100 \
--save_total_limit 8 \
--eval_steps 50 \
--gradient_checkpointing True \
--lr_scheduler_type "cosine" \
--report_to "wandb"
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

import os
import argparse

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_model_name_or_path", type=str)
    parser.add_argument("--peft_model_path", type=str)
    parser.add_argument("--output_dir", type=str)
    parser.add_argument("--device", type=str, default="auto")
    parser.add_argument("--safe_serialization", action="store_true")

def main():
    args = get_args()

if __name__ == "__main__":
    main()
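Based on the traceback below, main() presumably loads the base model, loads the PEFT adapter, merges, and saves. A rough sketch of that flow (hypothetical, not the exact script):

# Hypothetical reconstruction of the truncated merge flow, inferred from the traceback below
base_model = AutoModelForCausalLM.from_pretrained(args.base_model_name_or_path, torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(base_model, args.peft_model_path)  # this is the call that fails below
model = model.merge_and_unload()
model.save_pretrained(args.output_dir, safe_serialization=args.safe_serialization)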
/mnt/efs/data/tammosta/files_t/DPO_output_mistral_AIF12k_T32k_merged --safe_serialization
Loading base model: /mnt/efs/data/tammosta/files_t/output_sft_32k
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.67s/it]
Loading PEFT: /mnt/efs/data/tammosta/files_t/DPO_output_mistral_AIF12k_T32k/final_checkpoint
Traceback (most recent call last):
  File "/mnt/efs/data/tammosta/scripts_hb/merge_peft_adaptors_gpu.py", line 51, in <module>
    main()
  File "/mnt/efs/data/tammosta/scripts_hb/merge_peft_adaptors_gpu.py", line 38, in main
    model = PeftModel.from_pretrained(base_model, args.peft_model_path)
  File "/opt/conda/envs/ml_v2/lib/python3.10/site-packages/peft/peft_model.py", line 354, in from_pretrained
    model.load_adapter(model_id, adapter_name, is_trainable=is_trainable, **kwargs)
  File "/opt/conda/envs/ml_v2/lib/python3.10/site-packages/peft/peft_model.py", line 698, in load_adapter
    load_result = set_peft_model_state_dict(self, adapters_weights, adapter_name=adapter_name)
  File "/opt/conda/envs/ml_v2/lib/python3.10/site-packages/peft/utils/save_and_load.py", line 241, in set_peft_model_state_dict
    load_result = model.load_state_dict(peft_model_state_dict, strict=False)
  File "/opt/conda/envs/ml_v2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2153, in load_state_dict
    raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
RuntimeError: Error(s) in loading state_dict for PeftModelForCausalLM:
size mismatch for base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([14336, 16]).
size mismatch for base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([14336, 16]).
size mismatch for base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([16, 14336]).
size mismatch for base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([14336, 16]).
size mismatch for base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([14336, 16]).
size mismatch for base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([16, 14336]).
size mismatch for base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([14336, 16]).
size mismatch for base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([14336, 16]).
size mismatch for base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([16, 14336]).
size mismatch for base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([14336, 16]).
size mismatch for base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([14336, 16]).
size mismatch for base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([16, 14336]).
size mismatch for base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([14336, 16]).
size mismatch for base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([14336, 16]).
size mismatch for base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([16, 14336]).
size mismatch for base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([14336, 16]).
size mismatch for base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([14336, 16]).
size mismatch for base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([16, 14336]).
size mismatch for base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([14336, 16]).
size mismatch for base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([14336, 16]).
size mismatch for base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([16, 14336]).
size mismatch for base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([14336, 16]).
size mismatch for base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([14336, 16]).
size mismatch for base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([16, 14336]).
size mismatch for base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([14336, 16]).
size mismatch for base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight: copying a param with shape torch.Size([0]) from checkpoint, the shape in current model is torch.Size([14336, 16]).
peft_config = LoraConfig(
    r=script_args.lora_r,
    lora_alpha=script_args.lora_alpha,
    lora_dropout=script_args.lora_dropout,
    target_modules=[
        "q_proj",
        "v_proj",
        "k_proj",
        "out_proj",
        "fc_in",
        "fc_out",
        "wte",
    ],
    bias="none",
    task_type="CAUSAL_LM",
)
accelerate launch --config_file ./accelerate_configs/ds_zero3.yaml rlhf_dpo_working_tammosta.py \
--model_name_or_path="/mnt/efs/data/tammosta/files_t/output_sft_32k" \
--output_dir="/mnt/efs/data/tammosta/files_t/DPO_output_mistral_AIF12k_T32k" \
--data_path="/mnt/efs/data/tammosta/files_t/DPO_data_rbs_clean_AIF.json" \
--use_lamma2_peft_config False \
--beta 0.1 \
--optimizer_type "paged_adamw_32bit" \
--learning_rate 5e-5 \
--warmup_steps 100 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 4 \
--lora_alpha 16 \
--lora_dropout 0.05 \
--lora_r 16 \
--max_prompt_length 2048 \
--max_length 4096 \
--num_train_epochs 4 \
--logging_steps 1 \
--save_steps 100 \
--save_total_limit 8 \
--eval_steps 50 \
--gradient_checkpointing True \
--lr_scheduler_type "cosine" \
--report_to "wandb"