Error when sharding model within deepspeed.zero.Init context

System Info

Hello guys,

We faced a problem when finetuning a large model using Deepspeed Zero3.

To finetune an LLM, we first have to load it into memory; only then can the DeepSpeed engine shard and train it. The usual approach is to load the full model on the CPU in every process (one process per GPU). That is fine for small models, but it runs out of memory (OOM) for a large model such as 65B or 80B.

DeepSpeed gives us a way to solve the above problem: initialize the model inside the deepspeed.zero.Init() context, so that the engine shards the model while it is being built and distributes the state_dict afterwards. We hit a problem when adding LoRA to GPT-2 inside that context. The main error may be caused by the "base_model" attribute being automatically replaced by the DeepSpeed engine, so that PEFT cannot find it; the attribute lookup then recurses endlessly until the stack overflows.

The environment is the latest: pytorch == 1.13.1, transformers == 4.31.0, peft == 0.4.0

hardware is: nvidia-a100-80G*8

Who can help?

No response

Information

[ ] The official example scripts
[X] My own modified scripts

Tasks

[ ] An officially supported task in the examples folder
[X] My own task or dataset (give details below)

Reproduction

from typing import Any
import torch
from transformers import GPT2Config
import transformers
from deepspeed.ops.adam import FusedAdam

from transformers import GPT2Tokenizer, GPT2LMHeadModel,GPT2Config

import pytorch_lightning as pl
import os

from datasets import load_dataset

import sys
if "." not in sys.path:
    sys.path.append(".")

from tqdm import tqdm
from transformers import GPT2LMHeadModel 

# Settings for the reproduction run.
# - "name" is used to build the output/log directory under "all_models".
# - "ckpt_path" is the local GPT-2 directory; the script appends "/tokenizer"
#   and "/model" to it when loading the tokenizer and the model config.
# - "tokenizer_path" is not read by use_pure_gpt_for_lora(), which derives the
#   tokenizer location from "ckpt_path" instead.
test_config = {
    "name": "test_lora_gpt2",
    "model":{
        "gpt":{
            "name": "gpt2",
            "ckpt_path": "./gpt2-local",
            "tokenizer_path": "./gpt2-local/tokenizer",
        }
    }
}

# Sample sentences used as the training text for the reproduction;
# test_dataset repeats this list to build a larger dataset.
prompt_list = [
    "The sun rises in the east.",
    "The cat sat lazily on the windowsill.",
    "The ocean waves crashed against the shore.",
    "He studied diligently for his exams.",
    "The majestic mountains stood tall in the distance.",
    "She danced gracefully across the stage.",
    "The aroma of fresh-baked bread filled the air.",
    "The children giggled as they played in the park.",
    "The painting displayed vivid colors and intricate details.",
    "The old man walked slowly with a cane.",
    "The spaceship soared through the vast expanse of space.",
    "The smell of rain lingered after the storm passed.",
    "She smiled warmly at her friend.",
    "The recipe called for a pinch of salt and a dash of pepper.",
    "The athlete ran swiftly toward the finish line.",
    "The ancient ruins held secrets of a forgotten civilization.",
    "The butterfly fluttered gracefully from flower to flower.",
    "The detective carefully examined the crime scene for clues.",
    "The symphony orchestra played a beautiful melody.",
]

from peft import LoraConfig, get_peft_model

def is_local_rank_zero():
    """Return True when this process is local rank 0.

    The rank is read from the ``LOCAL_RANK`` environment variable; when the
    variable is not set at all, the process is treated as rank 0.
    """
    return os.environ.get("LOCAL_RANK", "0") == "0"

def find_all_linear_names(model):
    """Collect the leaf names of all Linear / Conv1D sub-modules of *model*.

    Every sub-module returned by ``model.named_modules()`` is printed for
    debugging. For each ``torch.nn.Linear`` or transformers ``Conv1D`` module,
    the last component of its dotted name is recorded. ``'lm_head'`` is
    excluded from the result.

    Returns:
        A list of unique leaf module names (order is that of the underlying
        set, so it is not guaranteed).
    """
    target_types = (torch.nn.Linear, transformers.pytorch_utils.Conv1D)

    leaf_names = set()
    for qualified_name, submodule in model.named_modules():
        print(qualified_name, type(submodule))
        if isinstance(submodule, target_types):
            # Only the final dotted component is kept, e.g. "a.b.c" -> "c";
            # a name without dots is kept whole.
            leaf_names.add(qualified_name.rsplit('.', 1)[-1])

    leaf_names.discard('lm_head')  # needed for 16-bit

    result = list(leaf_names)
    print("check replaced model name", result)
    return result

class LLMSystem(pl.LightningModule):
    """LightningModule that trains a GPT model on raw text with LoRA adapters.

    The module holds the language model and its tokenizer; the LoRA wrapping
    is applied in ``configure_sharded_model``.
    """

    def __init__(self, gpt, tokenizer):
        """Store the language model and the tokenizer used to encode batches."""
        super().__init__()
        self.gpt = gpt
        self.tokenizer = tokenizer

    def common_step(self, batch, batch_idx):
        """Tokenize ``batch['input']`` and return the language-model loss.

        The input ids double as the labels. On local rank 0 the loss is also
        printed.
        """
        tok = self.tokenizer
        tok.pad_token = tok.eos_token
        tok.padding_side = "left"

        encoded = tok(batch['input'], return_tensors='pt')

        # Move the encoded batch to whichever device holds the token
        # embedding weights.
        target_device = self.gpt.transformer.wte.weight.device
        input_ids = encoded.input_ids.to(target_device)
        attention_mask = encoded.attention_mask.to(target_device)

        loss = self.gpt(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=input_ids,
            return_dict=True,
        ).loss

        if is_local_rank_zero():
            print(loss)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for one batch (delegates to common_step)."""
        loss = self.common_step(batch, batch_idx)
        return loss

    def configure_optimizers(self) -> Any:
        """Return a FusedAdam optimizer over all module parameters (lr=1e-4)."""
        optimizer = FusedAdam(self.parameters(), lr=1e-4)
        return optimizer

    def configure_sharded_model(self):
        """Wrap ``self.gpt`` with LoRA adapters on every Linear/Conv1D name."""
        target_names = find_all_linear_names(self.gpt)
        lora_config = LoraConfig(
            r=64,
            lora_alpha=16,
            target_modules=target_names,
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )
        self.gpt = get_peft_model(self.gpt, lora_config)

class test_dataset(torch.utils.data.Dataset):
    """Map-style dataset serving the given prompts, repeated 1000 times."""

    def __init__(self, prompt_list):
        """Build the sample list by concatenating *prompt_list* 1000 times."""
        self.prompt_list = [
            prompt for _ in range(1000) for prompt in prompt_list
        ]

    def __len__(self):
        """Return the total number of samples (1000 x number of prompts)."""
        return len(self.prompt_list)

    def __getitem__(self, idx):
        """Return the sample at *idx* as a dict with a single ``"input"`` key."""
        return {"input": self.prompt_list[idx]}

def use_pure_gpt_for_lora():
    """Fit a frozen fp16 GPT-2 with LoRA adapters under DeepSpeed ZeRO stage 3.

    Steps:
      1. Load the tokenizer and the GPT-2 config from the local paths in
         ``test_config``; the model itself is instantiated from the config
         and cast to half precision.
      2. Freeze every base-model parameter (the LoRA adapters are added later
         by ``LLMSystem.configure_sharded_model``).
      3. Create the output directory and a TensorBoard logger.
      4. Run ``trainer.fit`` on the repeated prompt dataset with the
         ``deepspeed_stage_3`` strategy on 8 devices.
    """
    config = test_config
    wrapper_config = config['model']['gpt']

    # "tokenizer_path" holds the same location the script previously built as
    # ckpt_path + "/tokenizer".
    tokenizer = GPT2Tokenizer.from_pretrained(wrapper_config['tokenizer_path'], use_fast=False)

    gpt_config = GPT2Config.from_pretrained(wrapper_config['ckpt_path'] + "/model")
    gpt = GPT2LMHeadModel(gpt_config).half()

    # Freeze the base model; only parameters added afterwards stay trainable.
    for params in gpt.parameters():
        params.requires_grad = False

    train_sys = LLMSystem(gpt, tokenizer)

    # makedirs(exist_ok=True) creates the missing "all_models" parent and does
    # not fail when several processes run this script and race to create the
    # same directory.
    run_dir = "{}/{}".format("all_models", config['name'])
    os.makedirs(run_dir, exist_ok=True)

    tensor_board = pl.loggers.TensorBoardLogger("{}/logs/".format(run_dir))

    trainer = pl.Trainer(
        num_nodes=1,
        strategy="deepspeed_stage_3",
        accelerator="auto",
        precision="16-mixed",
        logger=tensor_board,
        devices=8,
    )

    testset = test_dataset(prompt_list)
    test_loader = torch.utils.data.DataLoader(testset, batch_size=1, shuffle=False)

    trainer.fit(train_sys, test_loader)

if __name__ == "__main__":
    # Script entry point: run the LoRA reproduction.
    use_pure_gpt_for_lora()

Expected behavior

LoRA works fine in:

ddp mode,
DeepSpeed ZeRO-1/2 (without model parallelism),
DeepSpeed ZeRO-3, but only when the entire model is initialized outside of the context.

LoRA does not work when deepspeed.zero.Init() is used to initialize the model.

We expect that LoRA can be initialized within the deepspeed.zero.Init() context.

The error is an endless loop, therefore I am posting only a part of the error messages below:

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
    return super().__getattr__(name)  # defer to nn.Module's logic
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
    raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
    return super().__getattr__(name)  # defer to nn.Module's logic
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
    raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
    return super().__getattr__(name)  # defer to nn.Module's logic
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
    raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
    return super().__getattr__(name)  # defer to nn.Module's logic
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
    raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
    return super().__getattr__(name)  # defer to nn.Module's logic
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
    raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
    return super().__getattr__(name)  # defer to nn.Module's logic
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
    raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
    return super().__getattr__(name)  # defer to nn.Module's logic
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
    raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
    return super().__getattr__(name)  # defer to nn.Module's logic
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
    raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
    return super().__getattr__(name)  # defer to nn.Module's logic
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
    raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
    return super().__getattr__(name)  # defer to nn.Module's logic
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
    raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
    return super().__getattr__(name)  # defer to nn.Module's logic
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
    raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
    return super().__getattr__(name)  # defer to nn.Module's logic
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
    raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
    return super().__getattr__(name)  # defer to nn.Module's logic
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
    raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
    return super().__getattr__(name)  # defer to nn.Module's logic
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
    raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
    return super().__getattr__(name)  # defer to nn.Module's logic
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
    raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
    return super().__getattr__(name)  # defer to nn.Module's logic
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
    raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
    return super().__getattr__(name)  # defer to nn.Module's logic
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1269, in __getattr__
    raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'PeftModelForCausalLM' object has no attribute 'base_model'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 434, in __getattr__
    return getattr(self.base_model, name)
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 434, in __getattr__
    return getattr(self.base_model, name)
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 434, in __getattr__
    return getattr(self.base_model, name)
  [Previous line repeated 995 more times]
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/peft/peft_model.py", line 408, in __getattr__
    return super().__getattr__(name)  # defer to nn.Module's logic
  File "/opt/conda/envs/fake/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1270, in __getattr__
    type(self).__name__, name))
RecursionError: maximum recursion depth exceeded while calling a Python object

huggingface / peft