We faced a problem when finetuning a large model using Deepspeed Zero3.
To fine-tune an LLM, we first have to load it into memory; only then can the DeepSpeed engine shard and train it. The usual way to initialize a model for training is to load it into CPU memory in every process (one process per GPU). That is fine for small models, but it runs out of memory (OOM) for a large model such as 65B or 80B.
DeepSpeed offers a way around this problem: initialize the model inside the deepspeed.zero.Init() context, so that the engine shards the model while it is being built and distributes the state_dict afterwards. We ran into a problem when adding LoRA to GPT-2 within that context. The error seems to be caused by the "base_model" attribute being automatically replaced by
the DeepSpeed engine, so PEFT cannot find this attribute and recurses endlessly until the stack overflows.
The environment uses the latest versions:
pytorch == 1.13.1
transformers == 4.31.0
peft == 0.4.0
hardware is:
nvidia-a100-80G*8
Who can help?
No response
Information
[ ] The official example scripts
[X] My own modified scripts
Tasks
[ ] An officially supported task in the examples folder
[X] My own task or dataset (give details below)
Reproduction
from typing import Any
import torch
from transformers import GPT2Config
import transformers
from deepspeed.ops.adam import FusedAdam
from transformers import GPT2Tokenizer, GPT2LMHeadModel,GPT2Config
import pytorch_lightning as pl
import os
from datasets import load_dataset
import sys
if "." not in sys.path:
sys.path.append(".")
from tqdm import tqdm
from transformers import GPT2LMHeadModel
# Settings for the reproduction run: the run name plus the local GPT-2
# checkpoint and tokenizer locations.
test_config = {
    "name": "test_lora_gpt2",
    "model": {
        "gpt": {
            "name": "gpt2",
            "ckpt_path": "./gpt2-local",
            "tokenizer_path": "./gpt2-local/tokenizer",
        },
    },
}
# Short English sentences used as the raw training texts for the reproduction.
prompt_list = [
    "The sun rises in the east.",
    "The cat sat lazily on the windowsill.",
    "The ocean waves crashed against the shore.",
    "He studied diligently for his exams.",
    "The majestic mountains stood tall in the distance.",
    "She danced gracefully across the stage.",
    "The aroma of fresh-baked bread filled the air.",
    "The children giggled as they played in the park.",
    "The painting displayed vivid colors and intricate details.",
    "The old man walked slowly with a cane.",
    "The spaceship soared through the vast expanse of space.",
    "The smell of rain lingered after the storm passed.",
    "She smiled warmly at her friend.",
    "The recipe called for a pinch of salt and a dash of pepper.",
    "The athlete ran swiftly toward the finish line.",
    "The ancient ruins held secrets of a forgotten civilization.",
    "The butterfly fluttered gracefully from flower to flower.",
    "The detective carefully examined the crime scene for clues.",
    "The symphony orchestra played a beautiful melody.",
]
from peft import LoraConfig, get_peft_model
def is_local_rank_zero():
    """Report whether this process is local rank 0.

    A missing ``LOCAL_RANK`` environment variable is treated the same as
    ``LOCAL_RANK`` being the string ``"0"``.

    Returns:
        bool: True when ``LOCAL_RANK`` is unset or equals ``"0"``.
    """
    return os.environ.get("LOCAL_RANK", "0") == "0"
def find_all_linear_names(model):
    """Collect the leaf names of every Linear / Conv1D sub-module of ``model``.

    Every sub-module yielded by ``model.named_modules()`` is printed. For
    each one that is a ``torch.nn.Linear`` or a transformers ``Conv1D``, only
    the last dot-separated component of its qualified name is kept.
    ``lm_head`` is dropped from the result.

    Returns:
        list: the unique leaf names, in set iteration order.
    """
    target_types = (torch.nn.Linear, transformers.pytorch_utils.Conv1D)
    leaf_names = set()
    for qualified_name, submodule in model.named_modules():
        print(qualified_name, type(submodule))
        if isinstance(submodule, target_types):
            # For an undotted name the last component is the name itself.
            leaf_names.add(qualified_name.rsplit('.', 1)[-1])
    leaf_names.discard('lm_head')  # needed for 16-bit
    print("check replaced model name", list(leaf_names))
    return list(leaf_names)
class LLMSystem(pl.LightningModule):
    """Lightning module that trains a GPT-2 language model with LoRA adapters.

    Args:
        gpt: the causal-LM model; it is replaced by its PEFT-wrapped version
            in ``configure_sharded_model``.
        tokenizer: tokenizer used to encode the raw text of each batch.
    """

    def __init__(self, gpt, tokenizer):
        super().__init__()
        self.gpt = gpt
        self.tokenizer = tokenizer

    def common_step(self, batch, batch_idx):
        """Tokenize ``batch['input']`` and return the language-modeling loss.

        Args:
            batch: mapping with an ``'input'`` entry holding the raw text(s)
                (presumably a list of strings from the default collate;
                verify against the DataLoader).
            batch_idx: index of the batch (unused).

        Returns:
            The ``loss`` field of the model output.
        """
        # GPT-2 has no dedicated pad token, so the EOS token is reused.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "left"
        # padding=True lets prompts of different lengths be stacked into one
        # tensor; with a single prompt per batch nothing is padded.
        encoded = self.tokenizer(batch['input'], return_tensors='pt', padding=True)
        # Move the inputs to the device that holds the token-embedding weight.
        embedding_device = self.gpt.transformer.wte.weight.device
        input_ids = encoded.input_ids.to(embedding_device)
        attention_mask = encoded.attention_mask.to(embedding_device)
        # Labels of -100 are ignored by the Hugging Face LM loss, so padded
        # positions do not contribute to the loss.
        labels = input_ids.masked_fill(attention_mask == 0, -100)
        ret = self.gpt(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            return_dict=True,
        )
        if is_local_rank_zero():
            print(ret.loss)
        return ret.loss

    def training_step(self, batch, batch_idx):
        """Return the loss of ``common_step`` for this batch."""
        return self.common_step(batch, batch_idx)

    def configure_optimizers(self) -> Any:
        """Build the DeepSpeed ``FusedAdam`` optimizer over all parameters."""
        # Alternatives that were tried here: DeepSpeedCPUAdam, torch.optim.Adam.
        return FusedAdam(self.parameters(), lr=1e-4)

    def configure_sharded_model(self):
        """Wrap ``self.gpt`` with LoRA adapters on all Linear/Conv1D modules."""
        lora_config = LoraConfig(
            r=64,
            lora_alpha=16,
            target_modules=find_all_linear_names(self.gpt),
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )
        self.gpt = get_peft_model(self.gpt, lora_config)
class test_dataset(torch.utils.data.Dataset):
    """Dataset that serves the given prompts repeated 1000 times.

    Each item is a dict of the form ``{"input": <prompt text>}``.
    """

    def __init__(self, prompt_list):
        # Walk the prompts 1000 times so that one epoch has enough samples.
        self.prompt_list = [
            prompt for _ in range(1000) for prompt in prompt_list
        ]

    def __len__(self):
        return len(self.prompt_list)

    def __getitem__(self, idx):
        return {"input": self.prompt_list[idx]}
def use_pure_gpt_for_lora():
    """Run the LoRA + DeepSpeed ZeRO-3 reproduction on a plain GPT-2 model.

    Reads the paths from ``test_config``, builds a half-precision GPT-2 from
    its config with every parameter frozen, wraps it in ``LLMSystem`` and
    fits it on ``prompt_list`` with the ``deepspeed_stage_3`` strategy on
    8 devices. TensorBoard logs go to ``all_models/<name>/logs/``.
    """
    config = test_config
    wrapper_config = config['model']['gpt']
    tokenizer = GPT2Tokenizer.from_pretrained(
        wrapper_config['ckpt_path'] + "/tokenizer", use_fast=False
    )
    gpt_config = GPT2Config.from_pretrained(wrapper_config['ckpt_path'] + "/model")
    # The model is built from its config only; no pretrained weights are loaded.
    gpt = GPT2LMHeadModel(gpt_config).half()
    # Freeze the base model.
    for params in gpt.parameters():
        params.requires_grad = False
    train_sys = LLMSystem(gpt, tokenizer)

    output_dir = "{}/{}".format("all_models", config['name'])
    # makedirs also creates the missing parent "all_models", and exist_ok
    # avoids a FileExistsError when several ranks reach this line together.
    os.makedirs(output_dir, exist_ok=True)
    tensor_board = pl.loggers.TensorBoardLogger("{}/logs/".format(output_dir))

    trainer = pl.Trainer(
        num_nodes=1,
        strategy="deepspeed_stage_3",
        # strategy="ddp",
        accelerator="auto",
        precision="16-mixed",
        logger=tensor_board,
        devices=8,
        # callbacks=[checkpoint_callback],
    )
    testset = test_dataset(prompt_list)
    test_loader = torch.utils.data.DataLoader(testset, batch_size=1, shuffle=False)
    trainer.fit(train_sys, test_loader)
# Script entry point: run the reproduction only when executed directly.
if __name__ == "__main__":
    use_pure_gpt_for_lora()
Expected behavior
LoRA works fine in:
DDP mode,
DeepSpeed ZeRO-1/2 (without model parallelism),
DeepSpeed ZeRO-3, but only when the entire model is initialized outside of the context.
LoRA does not work when deepspeed.zero.Init() is used to initialize the model.
We expect that LoRA can be initialized within the deepspeed.zero.Init() context.
The error is an endless loop, therefore I post only a part of the error messages below:
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
return super().__getattr__(name) # defer to nn.Module's logic
File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
return super().__getattr__(name) # defer to nn.Module's logic
File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
return super().__getattr__(name) # defer to nn.Module's logic
File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
return super().__getattr__(name) # defer to nn.Module's logic
File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
return super().__getattr__(name) # defer to nn.Module's logic
File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
return super().__getattr__(name) # defer to nn.Module's logic
File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
return super().__getattr__(name) # defer to nn.Module's logic
File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
return super().__getattr__(name) # defer to nn.Module's logic
File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
return super().__getattr__(name) # defer to nn.Module's logic
File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
return super().__getattr__(name) # defer to nn.Module's logic
File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
return super().__getattr__(name) # defer to nn.Module's logic
File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
return super().__getattr__(name) # defer to nn.Module's logic
File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
return super().__getattr__(name) # defer to nn.Module's logic
File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
return super().__getattr__(name) # defer to nn.Module's logic
File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
return super().__getattr__(name) # defer to nn.Module's logic
File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
return super().__getattr__(name) # defer to nn.Module's logic
File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
return super().__getattr__(name) # defer to nn.Module's logic
File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 434, in __getattr__
return getattr(self.base_model, name)
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 434, in __getattr__
return getattr(self.base_model, name)
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 434, in __getattr__
return getattr(self.base_model, name)
[Previous line repeated 995 more times]
File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
return super().__getattr__(name) # defer to nn.Module's logic
File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1270, in __getattr__
type(self).__name__, name))
RecursionError: maximum recursion depth exceeded while calling a Python object
This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.
System Info
Hello guys,
We faced a problem when finetuning a large model using Deepspeed Zero3.
To fine-tune an LLM, we first have to load it into memory; only then can the DeepSpeed engine shard and train it. The usual way to initialize a model for training is to load it into CPU memory in every process (one process per GPU). That is fine for small models, but it runs out of memory (OOM) for a large model such as 65B or 80B.
DeepSpeed offers a way around this problem: initialize the model inside the deepspeed.zero.Init() context, so that the engine shards the model while it is being built and distributes the state_dict afterwards. We ran into a problem when adding LoRA to GPT-2 within that context. The error seems to be caused by the "base_model" attribute being automatically replaced by the DeepSpeed engine, so PEFT cannot find this attribute and recurses endlessly until the stack overflows.
The environment is latest, pytorch == 1.13.1 transformers == 4.31.0 peft == 0.4.0
hardware is: nvidia-a100-80G*8
Who can help?
No response
Information
Tasks
examples
folder
Reproduction
Expected behavior
The Lora can work fine in :
LoRA does not work when deepspeed.zero.Init() is used to initialize the model.
We expect that LoRA can be initialized within the deepspeed.zero.Init() context.
The error is an endless loop, therefore I post only a part of the error messages below: