huggingface / peft

🤗 PEFT: State-of-the-art Parameter-Efficient Fine-Tuning.
https://huggingface.co/docs/peft
Apache License 2.0

When I use peft to finetune llama2, the gpu memory keeps growing #2141

Open xuanzhangyang opened 1 day ago

xuanzhangyang commented 1 day ago

System Info

torch 2.4.1
transformers 4.46.0.dev0
trl 0.11.2
peft 0.13.1
GPU: Tesla V100-PCIE-32GB
CUDA 12.4
NVIDIA driver 550.54.15

Who can help?

No response

Information

Tasks

Reproduction

from datasets import Dataset 
import json
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, TrainingArguments, AutoModelForCausalLM
from trl import SFTTrainer

def prepare_dataset(jsonpath):
    # Split the Alpaca-style JSON file into ~95% train / ~5% eval.
    ann = json.load(open(jsonpath))
    eval_length = int(len(ann)/20)
    train_ann = ann[eval_length:]
    eval_ann = ann[:eval_length]
    train_list=[]
    eval_list=[]
    # Wrap each training sample in the Llama-2 [INST]/<<SYS>> prompt format.
    for dic in train_ann:
        train_list.append({'text':'<s>[INST] <<SYS>>'+dic['instruction']+'<</SYS>>'+dic['input']+'[/INST]'+dic['output']+'</s>'})
    for dic in eval_ann:
        eval_list.append({'text':dic['input']+dic['output']+'</s>'})
    train_dataset = Dataset.from_dict({key: [dic[key] for dic in train_list] for key in train_list[0]})
    eval_dataset = Dataset.from_dict({key: [dic[key] for dic in eval_list] for key in eval_list[0]})
    print("train dataset: ", train_dataset)
    print("eval dataset: ", eval_dataset)
    return train_dataset, eval_dataset

peft_config = LoraConfig(
        r=8,
        lora_alpha=8,
        target_modules=['q_proj', 'v_proj'],
        lora_dropout=0.05,
        bias='none',
        task_type='CAUSAL_LM'
        )
training_arguments = TrainingArguments(
        output_dir="./output_peft",
        overwrite_output_dir=True,
        per_device_train_batch_size=16,
        optim='adamw_torch',
        learning_rate=10e-4,
        eval_steps=50,
        save_steps=100,
        logging_steps=20,
        eval_strategy='steps',
        group_by_length=False,
        #num_train_epochs=20,
        max_steps=200,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        max_grad_norm=0.3,
        bf16=True,
        lr_scheduler_type='cosine',
        warmup_steps=100
        )

model_name = "llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map='balanced_low_0'
        )

model.enable_input_require_grads()  # needed so gradient checkpointing works with frozen base weights
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
model.config.use_cache = False  # the KV cache is not needed for training and conflicts with gradient checkpointing

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token_id = 0
tokenizer.padding_side = 'right'

train_dataset, eval_dataset = prepare_dataset("alpaca_data.json")
trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        dataset_text_field='text',
        peft_config=peft_config,
        max_seq_length=1024,
        tokenizer=tokenizer,
        args=training_arguments
        )

trainer.train()
trainer.model.save_pretrained("./output_peft")  # save the LoRA adapter

Expected behavior

Traceback (most recent call last):
  File "finetuning-llama.py", line 81, in <module>
    trainer.train()
  File "/home/xuan/anaconda3/envs/pytorch2.4/lib/python3.8/site-packages/trl/trainer/sft_trainer.py", line 434, in train
    output = super().train(*args, **kwargs)
  File "/home/xuan/code/pytorch/huggingface/transformers/src/transformers/trainer.py", line 2084, in train
    return inner_training_loop(
  File "/home/xuan/code/pytorch/huggingface/transformers/src/transformers/trainer.py", line 2420, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/home/xuan/code/pytorch/huggingface/transformers/src/transformers/trainer.py", line 3523, in training_step
    loss = self.compute_loss(model, inputs)
  File "/home/xuan/code/pytorch/huggingface/transformers/src/transformers/trainer.py", line 3570, in compute_loss
    outputs = model(**inputs)
  File "/home/xuan/anaconda3/envs/pytorch2.4/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/xuan/anaconda3/envs/pytorch2.4/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/xuan/anaconda3/envs/pytorch2.4/lib/python3.8/site-packages/accelerate/utils/operations.py", line 820, in forward
    return model_forward(*args, **kwargs)
  File "/home/xuan/anaconda3/envs/pytorch2.4/lib/python3.8/site-packages/accelerate/utils/operations.py", line 808, in __call__
    return convert_to_fp32(self.model_forward(*args, **kwargs))
  File "/home/xuan/anaconda3/envs/pytorch2.4/lib/python3.8/site-packages/torch/amp/autocast_mode.py", line 43, in decorate_autocast
    return func(*args, **kwargs)
  File "/home/xuan/anaconda3/envs/pytorch2.4/lib/python3.8/site-packages/peft/peft_model.py", line 1644, in forward
    return self.base_model(
  File "/home/xuan/anaconda3/envs/pytorch2.4/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/xuan/anaconda3/envs/pytorch2.4/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/xuan/anaconda3/envs/pytorch2.4/lib/python3.8/site-packages/peft/tuners/tuners_utils.py", line 197, in forward
    return self.model.forward(*args, **kwargs)
  File "/home/xuan/anaconda3/envs/pytorch2.4/lib/python3.8/site-packages/accelerate/hooks.py", line 170, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/home/xuan/code/pytorch/huggingface/transformers/src/transformers/models/llama/modeling_llama.py", line 1212, in forward
    logits = logits.float()
RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

The above error is reported in the middle of fine-tuning Llama. I also found that every few steps the GPU memory grows a lot, until it reaches the maximum GPU memory. There is another phenomenon: during fine-tuning, only one GPU is at 100% utilization while the other GPUs sit at 0%. Moreover, the GPU that is at 100% is not fixed and changes each time.

+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  Tesla V100-PCIE-32GB           Off |   00000000:5A:00.0 Off |                    0 |
| N/A   38C    P0             38W /  250W |    6404MiB /  32768MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla V100-PCIE-32GB           Off |   00000000:5E:00.0 Off |                    0 |
| N/A   52C    P0             48W /  250W |   13882MiB /  32768MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   2  Tesla V100-PCIE-32GB           Off |   00000000:62:00.0 Off |                    0 |
| N/A   58C    P0            238W /  250W |   15312MiB /  32768MiB |    100%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   3  Tesla V100-PCIE-32GB           Off |   00000000:66:00.0 Off |                    0 |
| N/A   51C    P0             49W /  250W |   15314MiB /  32768MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   4  Tesla V100-PCIE-32GB           Off |   00000000:B5:00.0 Off |                    0 |
| N/A   50C    P0             47W /  250W |   15314MiB /  32768MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   5  Tesla V100-PCIE-32GB           Off |   00000000:B9:00.0 Off |                    0 |
| N/A   52C    P0             46W /  250W |   15312MiB /  32768MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   6  Tesla V100-PCIE-32GB           Off |   00000000:BD:00.0 Off |                    0 |
| N/A   45C    P0             40W /  250W |   22068MiB /  32768MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   7  Tesla V100-PCIE-32GB           Off |   00000000:C1:00.0 Off |                    0 |
| N/A   41C    P0             39W /  250W |     314MiB /  32768MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
BenjaminBossan commented 1 day ago

The above error is reported in the middle of fine-tuning Llama. I also found that every few steps the GPU memory grows a lot, until it reaches the maximum GPU memory.

There can be a few reasons for that. For instance, hidden states can grow quite fast with sequence length, so if you have batches with very uneven length, it's not surprising to see memory fluctuate. We can't rule out some form of memory leak but that would normally manifest as a slow and steady growth in memory.
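To see where in training the jumps happen, here is a minimal sketch of a memory-logging callback (not part of the original reply; GpuMemoryLogger is a hypothetical name) that could be attached to the trainer:

import torch
from transformers import TrainerCallback

class GpuMemoryLogger(TrainerCallback):
    """Hypothetical helper: print per-GPU allocated/reserved memory every N steps."""

    def __init__(self, every_n_steps=20):
        self.every_n_steps = every_n_steps

    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % self.every_n_steps == 0:
            for i in range(torch.cuda.device_count()):
                allocated = torch.cuda.memory_allocated(i) / 2**30
                reserved = torch.cuda.memory_reserved(i) / 2**30
                print(f"step {state.global_step} | cuda:{i} "
                      f"allocated={allocated:.1f} GiB reserved={reserved:.1f} GiB")

# usage, assuming the trainer from the reproduction script:
# trainer.add_callback(GpuMemoryLogger())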

There is another phenomenon: during fine-tuning, only one GPU is at 100% utilization while the other GPUs sit at 0%. Moreover, the GPU that is at 100% is not fixed and changes each time.

This is also hard to diagnose. How do you run the model? I assume DeepSpeed or FSDP; how are they configured?

One thing you could try to help with diagnosing is to remove PEFT from the equation, i.e. do full fine-tuning. Perhaps you can fit that into memory by reducing the batch size and then check if the same issues as previously occur.
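As a rough sketch of that diagnostic (the concrete numbers are guesses, not values from this thread), the reproduction script would drop the PEFT wrapping and shrink the batch:

import torch
from transformers import AutoModelForCausalLM, TrainingArguments

model_name = "llama-2-7b-hf"
# Diagnostic variant: skip get_peft_model() entirely and reduce the batch size.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map='balanced_low_0',
)

training_arguments = TrainingArguments(
    output_dir="./output_full_ft",
    per_device_train_batch_size=1,    # reduced from 16
    gradient_accumulation_steps=16,   # keeps a similar effective batch size
    gradient_checkpointing=True,
    max_steps=200,
    logging_steps=20,
)
# ...then build the SFTTrainer exactly as before, but without peft_config.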

xuanzhangyang commented 1 day ago

There can be a few reasons for that. For instance, hidden states can grow quite fast with sequence length, so if you have batches with very uneven length, it's not surprising to see memory fluctuate. We can't rule out some form of memory leak but that would normally manifest as a slow and steady growth in memory.

GPU memory grows by about 4 to 6 GB at a time.

This is also hard to diagnose. How do you run the model? I assume DeepSpeed or FSDP; how are they configured?

just run: python demo.py. I've tried using torchrun and accelerate but this conflicts with device_map='auto'

BenjaminBossan commented 1 day ago

GPU memory grows by about 4 to 6 GB at a time.

As mentioned, it's really hard to say what the reason is, but I don't think it's a memory leak. Can you maybe check if your training data (after all processing steps) has very unequal sequence lengths?
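For example, a quick sketch of such a check (assuming the tokenizer and train_dataset from the reproduction script; not part of the original reply):

import numpy as np

# Rough look at post-tokenization sequence lengths.
lengths = np.array(
    [len(tokenizer(example["text"]).input_ids) for example in train_dataset]
)
print(
    f"min={lengths.min()} median={int(np.median(lengths))} "
    f"p95={int(np.percentile(lengths, 95))} max={lengths.max()}"
)
# A max far above the median means some batches carry much longer sequences,
# which makes activation memory jump from one step to the next.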

just run: python demo.py. I've tried using torchrun and accelerate but this conflicts with device_map='auto'

Okay, so what this means is that you're not using model parallel training (which would require DeepSpeed or FSDP, which can be used via accelerate). Instead, by default, the Trainer (which SFTTrainer is based on) uses torch DataParallel under the hood when multiple GPUs are detected. This is a bit of an outdated method. You could look into DistributedDataParallel, which is recommended by PyTorch for data parallel training (and also supported by accelerate). For bigger models, I would, however, recommend DeepSpeed or FSDP for model parallel training -- check this PEFT example.
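For illustration, a minimal sketch of the DDP route (an assumption about how the script could be adapted, not a drop-in fix): each process loads a full copy of the model on its own GPU instead of sharding it with device_map, and the script is started with a multi-process launcher.

import os
import torch
from transformers import AutoModelForCausalLM

# Replace device_map='balanced_low_0' with a per-process placement, then launch with e.g.
#   accelerate launch --multi_gpu demo.py
# or
#   torchrun --nproc_per_node=8 demo.py
local_rank = int(os.environ.get("LOCAL_RANK", "0"))
model = AutoModelForCausalLM.from_pretrained(
    "llama-2-7b-hf",
    torch_dtype=torch.float16,
    device_map={"": local_rank},  # the whole model lives on this process's GPU
)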

xuanzhangyang commented 23 hours ago

GPU memory grows by about 4 to 6 GB at a time.

As mentioned, it's really hard to say what the reason is, but I don't think it's a memory leak. Can you maybe check if your training data (after all processing steps) has very unequal sequence lengths?

just run: python demo.py. I've tried using torchrun and accelerate but this conflicts with device_map='auto'

Okay, so what this means is that you're not using model parallel training (which would require DeepSpeed or FSDP, which can be used via accelerate). Instead, by default, the Trainer (which SFTTrainer is based on) uses torch DataParallel under the hood when multiple GPUs are detected. This is a bit of an outdated method. You could look into DistributedDataParallel, which is recommended by PyTorch for data parallel training (and also supported by accelerate). For bigger models, I would, however, recommend DeepSpeed or FSDP for model parallel training -- check this PEFT example.

I commented out device_map='auto' and used accelerate launch --config_file "deepspeed_config.yaml" demo.py, and then got this error:

W1010 19:41:59.845001 140390055487296 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3931897 closing signal SIGTERM
W1010 19:41:59.846389 140390055487296 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3931898 closing signal SIGTERM
W1010 19:41:59.847245 140390055487296 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3931914 closing signal SIGTERM
W1010 19:41:59.848147 140390055487296 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3931942 closing signal SIGTERM
W1010 19:41:59.849349 140390055487296 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3931966 closing signal SIGTERM
W1010 19:41:59.850257 140390055487296 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3931967 closing signal SIGTERM
W1010 19:41:59.850677 140390055487296 torch/distributed/elastic/multiprocessing/api.py:858] Sending process 3931989 closing signal SIGTERM
E1010 19:42:00.996399 140390055487296 torch/distributed/elastic/multiprocessing/api.py:833] failed (exitcode: 1) local_rank: 0 (pid: 3931875) of binary: /home/xuan/anaconda3/envs/pytorch2.4/bin/python
Traceback (most recent call last):
  File "/home/xuan/anaconda3/envs/pytorch2.4/bin/accelerate", line 8, in <module>
    sys.exit(main())
  File "/home/xuan/anaconda3/envs/pytorch2.4/lib/python3.8/site-packages/accelerate/commands/accelerate_cli.py", line 48, in main
    args.func(args)
  File "/home/xuan/anaconda3/envs/pytorch2.4/lib/python3.8/site-packages/accelerate/commands/launch.py", line 1159, in launch_command
    deepspeed_launcher(args)
  File "/home/xuan/anaconda3/envs/pytorch2.4/lib/python3.8/site-packages/accelerate/commands/launch.py", line 852, in deepspeed_launcher
    distrib_run.run(args)
  File "/home/xuan/anaconda3/envs/pytorch2.4/lib/python3.8/site-packages/torch/distributed/run.py", line 892, in run
    elastic_launch(
  File "/home/xuan/anaconda3/envs/pytorch2.4/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/xuan/anaconda3/envs/pytorch2.4/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
finetuning-llama.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2024-10-10_19:41:59
  host      : mdc-G5500
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 3931875)
  error_file: <N/A>

my deepspeed_config.yaml:

compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  deepspeed_multinode_launcher: standard
  gradient_accumulation_steps: 4
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: true
  zero3_save_16bit_model: true
  zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
xuanzhangyang commented 22 hours ago

I found the reason: the gradient_accumulation_steps value in deepspeed_config.yaml and the one in demo.py were not the same.
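For reference, a sketch of keeping the two in sync (illustrative only; the value 4 mirrors the deepspeed_config.yaml above):

from transformers import TrainingArguments

# The accelerate/DeepSpeed config sets gradient_accumulation_steps: 4, so
# TrainingArguments in demo.py should pass the same value instead of 1.
training_arguments = TrainingArguments(
    output_dir="./output_peft",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,  # keep in sync with deepspeed_config.yaml
    bf16=True,
    max_steps=200,
)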

BenjaminBossan commented 22 hours ago

Okay, so if you set the same value in both places, does that resolve the issue and does the model train successfully?