huggingface / accelerate

🚀 A simple way to launch, train, and use PyTorch models on almost any device and distributed configuration, automatic mixed precision (including fp8), and easy-to-configure FSDP and DeepSpeed support
https://huggingface.co/docs/accelerate
Apache License 2.0

Accelerate doubles my VRAM usage. #2949

Open petergaoshan opened 1 month ago

petergaoshan commented 1 month ago

System Info

Name: accelerate
Version: 0.33.0.dev0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: /home/ubuntu/anaconda3/envs/commander/lib/python3.12/site-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: llamafactory, peft, trl
---
Name: deepspeed
Version: 0.14.4
Summary: DeepSpeed library
Home-page: http://deepspeed.ai
Author: DeepSpeed Team
Author-email: deepspeed-info@microsoft.com
License: Apache Software License 2.0
Location: /home/ubuntu/anaconda3/envs/commander/lib/python3.12/site-packages
Requires: hjson, ninja, numpy, nvidia-ml-py, packaging, psutil, py-cpuinfo, pydantic, torch, tqdm
Required-by:
---
Name: torch
Version: 2.3.1
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /home/ubuntu/anaconda3/envs/commander/lib/python3.12/site-packages
Requires: filelock, fsspec, jinja2, networkx, sympy, typing-extensions
Required-by: accelerate, deepspeed, flash-attn, peft, torchaudio, torchvision, trl

Reproduction

The accelerate config used:

compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  gradient_accumulation_steps: 256
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: false
  zero3_save_16bit_model: false
  zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_config:
  dynamo_backend: INDUCTOR
enable_cpu_affinity: false
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
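
For reference, the same ZeRO-3 settings can also be passed to Accelerator programmatically through a DeepSpeedPlugin instead of the YAML file; a minimal sketch, assuming accelerate ~0.33, with values mirroring the config above:

# hedged sketch: ZeRO-3 configured in code rather than via `accelerate config`
from accelerate import Accelerator, DeepSpeedPlugin

ds_plugin = DeepSpeedPlugin(
    zero_stage=3,                        # shard params, grads and optimizer states
    gradient_accumulation_steps=256,
    offload_optimizer_device="none",
    offload_param_device="none",
    zero3_init_flag=False,
    zero3_save_16bit_model=False,
)
accelerator = Accelerator(deepspeed_plugin=ds_plugin, mixed_precision="bf16")

The training script: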
import os
import pandas as pd

import torch

from tqdm import tqdm

from torch.utils.data import DataLoader

from PretrainDataset import PretrainDataset

from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW, get_linear_schedule_with_warmup
from peft import LoraConfig, get_peft_model
from accelerate import Accelerator

domain_data = pd.read_csv("data/domain_content_merged.csv")
domain_data = domain_data.drop(columns = ["length"])

general_data = pd.read_csv("data/general_content.csv")

data = pd.concat([domain_data, general_data], axis = 0)
data = data.reset_index(drop = True)
data = data.drop(columns = ["source"])

######################

def init_model(model_name, hyper_param):

    model = AutoModelForCausalLM.from_pretrained(model_name,  torch_dtype = torch.float16, attn_implementation = "flash_attention_2")

    peft_config = LoraConfig(r = 1024,
                             lora_alpha = 32,
                             lora_dropout = 0.05,
                             bias = "none",
                             task_type = "CAUSAL_LM",
                             target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"])

    model = get_peft_model(model, peft_config)

    total = 0
    train = 0

    for _, parameters in model.named_parameters():

        total += parameters.numel()

        if parameters.requires_grad:
            train += parameters.numel()

    print(total, " parameters in total")
    print(train, " parameters trainable")

    return model

######################

def init_dataloader(model_name, data, hyper_param):

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    dataset_train = PretrainDataset(1024, pad_id = 151643)

    contents = data["content"].to_list()[:10]

    for t in tqdm(contents, desc = "Crafting Dataset"):

        max_length = len(t) * 3

        tokenized_text = tokenizer(t, return_tensors = "pt", max_length = max_length, truncation = True, padding = False)
        tokenized_text = tokenized_text["input_ids"]
        dataset_train.add_data(tokenized_text)

    print(len(dataset_train), " of data points")

    train_loader = DataLoader(dataset_train, batch_size = hyper_param["batch_size"], shuffle = True)

    return train_loader

def train(model, train_loader, optimizer, accelerator, lr_scheduler, recorder):

    model.train()

    for step, batch in enumerate(train_loader):

        with accelerator.accumulate(model):

            optimizer.zero_grad()
            outputs = model(**batch)

            loss = outputs.loss
            accelerator.backward(loss)
            optimizer.step()

            lr_scheduler.step()

            if accelerator.sync_gradients:
                print(loss)

                del loss

def validate():
    pass

def training_loop(model_name, data, hyper_param):

    model = init_model(model_name, hyper_param)
    optimizer = torch.optim.AdamW(model.parameters(), lr = 5e-5)
    train_loader = init_dataloader(model_name, data, hyper_param)
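    # note: when the script is started with `accelerate launch`, the DeepSpeed/ZeRO-3
    # settings from the config file above are picked up by Accelerator() through
    # environment variables set by the launcher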
    accelerator = Accelerator(gradient_accumulation_steps = hyper_param["gradient accumulation"])

    model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)

    lr_scheduler = get_linear_schedule_with_warmup(optimizer = optimizer, num_warmup_steps = hyper_param["warmup"], num_training_steps = len(train_loader) * hyper_param["epochs"])

    recorder = {}

    for _ in tqdm(range(hyper_param["epochs"]), desc = "training progress"):

        train(model, train_loader, optimizer, accelerator, lr_scheduler, recorder)
        validate()

        break

if __name__ == "__main__":

    hyperparameters = {"gradient accumulation" : 256,
                       "batch_size" : 1,
                       "warmup" : 10,
                       "epochs" : 1}

    training_loop("model/Qwen2-1.5B-Instruct", data, hyperparameters)

Expected behavior

I'm training a model with accelerate and DeepSpeed ZeRO-3. As I understand it, accelerate does data parallelism: it copies the model onto each GPU and trains on all of them, so some extra VRAM use is expected. However, ZeRO stage 3 is supposed to partition a single model across multiple GPUs, and I am using stage 3, so it should use less VRAM per GPU. When I run it, it looks like the full model is still being copied onto each GPU and roughly double the total VRAM is used.
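
For what it's worth, DeepSpeed ships a helper that estimates the expected per-GPU memory for model states under ZeRO-3; a minimal sketch (model path as in the script above, 2 GPUs on 1 node):

# hedged sketch: estimate ZeRO-3 model-state memory for this model on 2 GPUs
from transformers import AutoModelForCausalLM
from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live

model = AutoModelForCausalLM.from_pretrained("model/Qwen2-1.5B-Instruct")
estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=2, num_nodes=1)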

When I run without accelerate, it shows only about 15 GB of VRAM used:

python train.py

+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.06              Driver Version: 555.42.06      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 4090 D      Off |   00000000:01:00.0 Off |                  Off |
| 30%   50C    P2            270W /  425W |   15779MiB /  24564MiB |     98%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090 D      Off |   00000000:05:00.0 Off |                  Off |
|  0%   38C    P8             17W /  425W |      18MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

When I run with accelerate, it shows about 45 GB of VRAM used in total (roughly 23 GB on each GPU):

accelerate launch train.py

+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.06              Driver Version: 555.42.06      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 4090 D      Off |   00000000:01:00.0 Off |                  Off |
|  0%   43C    P2            108W /  425W |   23799MiB /  24564MiB |    100%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090 D      Off |   00000000:05:00.0 Off |                  Off |
|  0%   41C    P2            104W /  425W |   22973MiB /  24564MiB |    100%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

Is there anything I set wrong? I understand that there needs to be one model per process, but it won't let me set 1 process with 2 GPUs.
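
For context on "one model per process": an accelerate data-parallel launch pins each process to a single GPU, so num_processes: 2 means two processes each holding a replica unless ZeRO/FSDP sharding takes effect. A small sketch to confirm the per-process device mapping:

# hedged sketch: each launched process reports its own rank and device
from accelerate import Accelerator

accelerator = Accelerator()
print(f"process {accelerator.process_index} of {accelerator.num_processes} on {accelerator.device}")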

github-actions[bot] commented 3 weeks ago

This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.

Please note that issues that do not follow the contributing guidelines are likely to be ignored.

muellerzr commented 3 weeks ago

Correct, there is some more overhead when using DDP. You can use something like DeepSpeed or FSDP instead to shard the weights across GPUs.
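
A minimal sketch of the FSDP route mentioned above (illustration only, assuming accelerate ~0.33 and PyTorch's FULL_SHARD strategy; not a confirmed fix for this issue):

# hedged sketch: shard weights across GPUs with FSDP instead of replicating them
from torch.distributed.fsdp import ShardingStrategy
from accelerate import Accelerator, FullyShardedDataParallelPlugin

fsdp_plugin = FullyShardedDataParallelPlugin(sharding_strategy=ShardingStrategy.FULL_SHARD)
accelerator = Accelerator(fsdp_plugin=fsdp_plugin, mixed_precision="bf16")
# model, optimizer and dataloader are then passed through accelerator.prepare(...) as usual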