huggingface / transformers

🤗 Transformers: State-of-the-art Machine Learning for Pytorch, TensorFlow, and JAX.
https://huggingface.co/transformers
Apache License 2.0

Cuda Memory leak (OOM) when using HF Trainer DDP mode #16705

Closed Smu-Tan closed 2 years ago

Smu-Tan commented 2 years ago

Environment info

Description:

@patrickvonplaten @patil-suraj @sgugger Hi! In a nutshell, I'm trying to train mBART for a seq2seq generation task using the Hugging Face Transformers Trainer in Distributed Data Parallel (DDP) mode, but I keep hitting a CUDA OOM error.

Specifically, with the same settings (batch size, input lengths, etc.) I can train successfully on a single GPU, but I always get a CUDA OOM error in DDP mode. I also tried decreasing the batch size to 1 and the input length to 50 tokens (it was 256 for the encoder and 100 for the decoder), but the issue persists.

I launch the script below with: %sh OMP_NUM_THREADS=10 python -m torch.distributed.launch --nproc_per_node=4 Train_MBart_DDP.py
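(For reference, a per-rank memory snapshot like the minimal sketch below, which is not part of the original script, can confirm that each of the four launched processes is pinned to its own GPU rather than piling onto GPU 0, a common cause of DDP-only OOMs. It assumes the launcher exports LOCAL_RANK as the script already relies on.)

import os
import torch

local_rank = int(os.environ.get("LOCAL_RANK", 0))
torch.cuda.set_device(local_rank)              # pin this process to its own GPU
device = torch.device("cuda", local_rank)
print(f"rank {local_rank}: "
      f"allocated={torch.cuda.memory_allocated(device) / 2**30:.2f} GiB, "
      f"reserved={torch.cuda.memory_reserved(device) / 2**30:.2f} GiB")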

Code:

Libraries

from transformers import MBartForConditionalGeneration, MBartTokenizer
from transformers import Trainer, TrainingArguments
from transformers.models.bart.modeling_bart import shift_tokens_right
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset
from datasets import load_dataset, load_metric
import torch
import numpy as np
import nltk
import os
import json
import pandas as pd
import mlflow
from mlflow.tracking import MlflowClient  # needed for MlflowClient() below
import logging

logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(message)s')
os.environ["WANDB_DISABLED"] = "true"
metric = load_metric('/metrics/rouge/rouge.py')

set up MLflow and get the local rank

os.environ["DATABRICKS_HOST"] = "[MASKED]"
os.environ["DATABRICKS_TOKEN"] = "[MASKED]"
os.environ["WANDB_WATCH"] = "false"
os.environ["NCCL_DEBUG"] = "INFO"
local_rank = int(os.environ["LOCAL_RANK"])
client = MlflowClient()
experiment = client.get_experiment([MASKED])
remote_server_uri = mlflow.tracking.get_tracking_uri()
mlflow.set_tracking_uri(remote_server_uri)
mlflow.set_experiment('[MASKED]/mBart_DDP')

get data

def apply_process(row):
    # Build the encoder input (question + top-5 unique passages) and the target answer.
    question, answers, ctxs = row
    answer = answers[0]
    candidates = np.array([d['text'] for d in row['ctxs']])
    candidates = pd.unique(candidates)
    candidates = ' '.join(candidates[:5].tolist())
    question_passage = question + ' ' + candidates
    return question_passage, answer

df_path = '/tmp/top200_output.json'
f = open(df_path)
df = json.load(f)
f.close()
dff = pd.DataFrame(df)
dff[['question_passage', 'answer']] = dff.apply(apply_process, axis = 1, result_type="expand")
dff = dff[['question_passage', 'answer']]
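(For illustration, here is a toy record in the shape apply_process expects; the field names come from the function above, the values are invented.)

toy_row = {
    'question': 'Who wrote Hamlet?',
    'answers': ['William Shakespeare'],
    'ctxs': [{'text': 'Hamlet is a tragedy by William Shakespeare.'},
             {'text': 'Shakespeare wrote Hamlet around 1600.'}],
}
toy_df = pd.DataFrame([toy_row])
# Expected: one row with the concatenated question+passages and the answer string.
print(toy_df.apply(apply_process, axis=1, result_type='expand'))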

get dataset and model

def convert_to_features(dataset):
    input_encodings = tokenizer.batch_encode_plus(dataset['question_passage'], padding='max_length', max_length=256, truncation=True)
    target_encodings = tokenizer.batch_encode_plus(dataset['answer'], padding='max_length', max_length=100, truncation=True)
    labels = torch.tensor(target_encodings['input_ids'])
    # Build decoder inputs by shifting the labels right, then mask padding in the labels with -100.
    decoder_input_ids = np.array(shift_tokens_right(labels, model.config.pad_token_id, 0))
    labels[labels[:, :] == model.config.pad_token_id] = -100
    labels = np.array(labels)
    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'decoder_input_ids': decoder_input_ids,
        'labels': labels,
    }
    return encodings

tokenizer = MBartTokenizer.from_pretrained('/tmp/mbart-large-cc25', src_lang="en_XX",  local_files_only=True)
model = MBartForConditionalGeneration.from_pretrained('/tmp/mbart-large-cc25', local_files_only=True)
model.config.decoder_start_token_id = tokenizer.lang_code_to_id["en_XX"]

dataset = Dataset.from_pandas(dff)
test = Dataset.from_dict(dataset[:10])
train = Dataset.from_dict(dataset[500:])

columns = ['input_ids', 'labels', 'decoder_input_ids', 'attention_mask']

test = test.map(convert_to_features, batched=True)
test.set_format(type='torch', columns=columns)
test = test.remove_columns(['question_passage', 'answer'])

train = train.map(convert_to_features, batched=True)
train.set_format(type='torch', columns=columns)
train = train.remove_columns(['question_passage', 'answer'])
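(As a quick sanity check, which is my addition and not in the original script, one preprocessed example should come out as 256 encoder tokens and 100 decoder tokens, matching the max_length values above.)

sample = train[0]
for name in ('input_ids', 'attention_mask', 'decoder_input_ids', 'labels'):
    print(name, sample[name].shape)   # expect [256], [256], [100], [100]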

set trainer

args = Seq2SeqTrainingArguments(
    "/tmp/bart_training",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=3,
    predict_with_generate=True,
    gradient_accumulation_steps=4,
    disable_tqdm=False,
    dataloader_num_workers=10,
    fp16=True,
    local_rank=local_rank,  # already parsed as int(os.environ["LOCAL_RANK"]) above
    do_train=True,
    do_eval=True,
    overwrite_output_dir=True,
    sharded_ddp='simple',
    dataloader_pin_memory=True,
    adafactor=True,
    skip_memory_metrics=True,
    ddp_find_unused_parameters=True,
    sortish_sampler=True,
    generation_max_length=50,
    gradient_checkpointing=False,
)
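(For context on memory pressure, a back-of-the-envelope calculation of the effective batch per optimizer step under DDP, using the values above; this is only arithmetic, not a fix.)

world_size = 4            # --nproc_per_node=4
per_device_batch = 4      # per_device_train_batch_size
grad_accum = 4            # gradient_accumulation_steps
print(world_size * per_device_batch * grad_accum)   # 64 sequences per optimizer update
# Per-GPU activation memory is driven by per_device_batch alone; DDP mainly adds
# gradient-synchronization buffers on top of the single-GPU footprint.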

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    logging.info(decoded_preds)
    logging.info('\n\n')
    logging.info(decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train,
    eval_dataset=test,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
    )

trainer.train()

The error:

0220-221927-imgswodk-10-232-244-83:9106:9106 [2] NCCL INFO Bootstrap : Using eth0:10.232.244.83<0>
0220-221927-imgswodk-10-232-244-83:9106:9106 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
0220-221927-imgswodk-10-232-244-83:9106:9106 [2] NCCL INFO NET/IB : No device found.
0220-221927-imgswodk-10-232-244-83:9106:9106 [2] NCCL INFO NET/Socket : Using [0]eth0:10.232.244.83<0>
0220-221927-imgswodk-10-232-244-83:9106:9106 [2] NCCL INFO Using network Socket
0220-221927-imgswodk-10-232-244-83:9104:9263 [0] NCCL INFO Channel 00/02 :    0   1   2   3
0220-221927-imgswodk-10-232-244-83:9104:9263 [0] NCCL INFO Channel 01/02 :    0   1   2   3
0220-221927-imgswodk-10-232-244-83:9104:9263 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
0220-221927-imgswodk-10-232-244-83:9104:9263 [0] NCCL INFO Setting affinity for GPU 0 to 0fff
0220-221927-imgswodk-10-232-244-83:9105:9265 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0
0220-221927-imgswodk-10-232-244-83:9106:9266 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1
0220-221927-imgswodk-10-232-244-83:9105:9265 [1] NCCL INFO Setting affinity for GPU 1 to 0fff
0220-221927-imgswodk-10-232-244-83:9106:9266 [2] NCCL INFO Setting affinity for GPU 2 to 0fff
0220-221927-imgswodk-10-232-244-83:9107:9264 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2
0220-221927-imgswodk-10-232-244-83:9107:9264 [3] NCCL INFO Setting affinity for GPU 3 to 0fff
0220-221927-imgswodk-10-232-244-83:9104:9263 [0] NCCL INFO Channel 00 : 0[100000] -> 1[200000] via direct shared memory
0220-221927-imgswodk-10-232-244-83:9107:9264 [3] NCCL INFO Channel 00 : 3[400000] -> 0[100000] via direct shared memory
0220-221927-imgswodk-10-232-244-83:9104:9263 [0] NCCL INFO Channel 01 : 0[100000] -> 1[200000] via direct shared memory
0220-221927-imgswodk-10-232-244-83:9107:9264 [3] NCCL INFO Channel 01 : 3[400000] -> 0[100000] via direct shared memory
0220-221927-imgswodk-10-232-244-83:9106:9266 [2] NCCL INFO Channel 00 : 2[300000] -> 3[400000] via direct shared memory
0220-221927-imgswodk-10-232-244-83:9105:9265 [1] NCCL INFO Channel 00 : 1[200000] -> 2[300000] via direct shared memory
0220-221927-imgswodk-10-232-244-83:9106:9266 [2] NCCL INFO Channel 01 : 2[300000] -> 3[400000] via direct shared memory
0220-221927-imgswodk-10-232-244-83:9105:9265 [1] NCCL INFO Channel 01 : 1[200000] -> 2[300000] via direct shared memory
0220-221927-imgswodk-10-232-244-83:9107:9264 [3] NCCL INFO Connected all rings
0220-221927-imgswodk-10-232-244-83:9106:9266 [2] NCCL INFO Connected all rings
0220-221927-imgswodk-10-232-244-83:9107:9264 [3] NCCL INFO Channel 00 : 3[400000] -> 2[300000] via direct shared memory
0220-221927-imgswodk-10-232-244-83:9107:9264 [3] NCCL INFO Channel 01 : 3[400000] -> 2[300000] via direct shared memory
0220-221927-imgswodk-10-232-244-83:9106:9266 [2] NCCL INFO Channel 00 : 2[300000] -> 1[200000] via direct shared memory
0220-221927-imgswodk-10-232-244-83:9106:9266 [2] NCCL INFO Channel 01 : 2[300000] -> 1[200000] via direct shared memory
0220-221927-imgswodk-10-232-244-83:9107:9264 [3] NCCL INFO Connected all trees
0220-221927-imgswodk-10-232-244-83:9107:9264 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
0220-221927-imgswodk-10-232-244-83:9107:9264 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
0220-221927-imgswodk-10-232-244-83:9105:9265 [1] NCCL INFO Connected all rings
0220-221927-imgswodk-10-232-244-83:9104:9263 [0] NCCL INFO Connected all rings
0220-221927-imgswodk-10-232-244-83:9105:9265 [1] NCCL INFO Channel 00 : 1[200000] -> 0[100000] via direct shared memory
0220-221927-imgswodk-10-232-244-83:9105:9265 [1] NCCL INFO Channel 01 : 1[200000] -> 0[100000] via direct shared memory
0220-221927-imgswodk-10-232-244-83:9104:9263 [0] NCCL INFO Connected all trees
0220-221927-imgswodk-10-232-244-83:9104:9263 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
0220-221927-imgswodk-10-232-244-83:9104:9263 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
0220-221927-imgswodk-10-232-244-83:9106:9266 [2] NCCL INFO Connected all trees
0220-221927-imgswodk-10-232-244-83:9106:9266 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
0220-221927-imgswodk-10-232-244-83:9106:9266 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
0220-221927-imgswodk-10-232-244-83:9105:9265 [1] NCCL INFO Connected all trees
0220-221927-imgswodk-10-232-244-83:9105:9265 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
0220-221927-imgswodk-10-232-244-83:9105:9265 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
0220-221927-imgswodk-10-232-244-83:9105:9265 [1] NCCL INFO comm 0x7fba7c001240 rank 1 nranks 4 cudaDev 1 busId 200000 - Init COMPLETE
0220-221927-imgswodk-10-232-244-83:9106:9266 [2] NCCL INFO comm 0x7f6a70001240 rank 2 nranks 4 cudaDev 2 busId 300000 - Init COMPLETE
0220-221927-imgswodk-10-232-244-83:9104:9263 [0] NCCL INFO comm 0x7f4f40001240 rank 0 nranks 4 cudaDev 0 busId 100000 - Init COMPLETE
0220-221927-imgswodk-10-232-244-83:9107:9264 [3] NCCL INFO comm 0x7f0918001240 rank 3 nranks 4 cudaDev 3 busId 400000 - Init COMPLETE
0220-221927-imgswodk-10-232-244-83:9104:9104 [0] NCCL INFO Launch mode Parallel
***** Running training *****
  Num examples = 500
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 96

  0%|          | 0/96 [00:00<?, ?it/s][W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
[W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
[W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
[W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())

  1%|          | 1/96 [00:01<01:53,  1.19s/it]Traceback (most recent call last):
  File "Train_MBart_reader_DDP.py", line 197, in <module>
Traceback (most recent call last):
  File "Train_MBart_reader_DDP.py", line 197, in <module>
Traceback (most recent call last):
  File "Train_MBart_reader_DDP.py", line 197, in <module>
Traceback (most recent call last):
  File "Train_MBart_reader_DDP.py", line 197, in <module>
    trainer.train()
  File "/databricks/python/lib/python3.8/site-packages/transformers/trainer.py", line 1316, in train
    trainer.train()
  File "/databricks/python/lib/python3.8/site-packages/transformers/trainer.py", line 1316, in train
        tr_loss_step = self.training_step(model, inputs)trainer.train()

  File "/databricks/python/lib/python3.8/site-packages/transformers/trainer.py", line 1867, in training_step
  File "/databricks/python/lib/python3.8/site-packages/transformers/trainer.py", line 1316, in train
    tr_loss_step = self.training_step(model, inputs)
  File "/databricks/python/lib/python3.8/site-packages/transformers/trainer.py", line 1867, in training_step
    loss.backward()
  File "/databricks/python/lib/python3.8/site-packages/torch/_tensor.py", line 307, in backward
    tr_loss_step = self.training_step(model, inputs)
  File "/databricks/python/lib/python3.8/site-packages/transformers/trainer.py", line 1867, in training_step
        loss.backward()torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)

  File "/databricks/python/lib/python3.8/site-packages/torch/_tensor.py", line 307, in backward
  File "/databricks/python/lib/python3.8/site-packages/torch/autograd/__init__.py", line 154, in backward
    Variable._execution_engine.run_backward(
  RuntimeError    : torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
 CUDA out of memory. Tried to allocate 382.00 MiB (GPU 0; 15.78 GiB total capacity; 14.02 GiB already allocated; 339.50 MiB free; 14.05 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
  File "/databricks/python/lib/python3.8/site-packages/torch/autograd/__init__.py", line 154, in backward
    Variable._execution_engine.run_backward(
RuntimeError: CUDA out of memory. Tried to allocate 382.00 MiB (GPU 2; 15.78 GiB total capacity; 14.02 GiB already allocated; 339.50 MiB free; 14.05 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
    loss.backward()
  File "/databricks/python/lib/python3.8/site-packages/torch/_tensor.py", line 307, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/databricks/python/lib/python3.8/site-packages/torch/autograd/__init__.py", line 154, in backward
    Variable._execution_engine.run_backward(
RuntimeError: CUDA out of memory. Tried to allocate 382.00 MiB (GPU 3; 15.78 GiB total capacity; 14.02 GiB already allocated; 339.50 MiB free; 14.05 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
trainer.train()
File "/databricks/python/lib/python3.8/site-packages/transformers/trainer.py", line 1316, in train
  tr_loss_step = self.training_step(model, inputs)
File "/databricks/python/lib/python3.8/site-packages/transformers/trainer.py", line 1867, in training_step
  loss.backward()
File "/databricks/python/lib/python3.8/site-packages/torch/_tensor.py", line 307, in backward
 torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/databricks/python/lib/python3.8/site-packages/torch/autograd/__init__.py", line 154, in backward
   Variable._execution_engine.run_backward(
RuntimeError: CUDA out of memory. Tried to allocate 382.00 MiB (GPU 1; 15.78 GiB total capacity; 14.02 GiB already allocated; 339.50 MiB free; 14.05 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
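(The allocator hint at the end of the message can be tried by setting PYTORCH_CUDA_ALLOC_CONF before CUDA is initialized. A sketch follows; the 128 MiB value is an arbitrary assumption and this is not a confirmed fix for this issue.)

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"   # must be set before any torch.cuda call
import torch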

To reproduce

You can reproduce it with other data, e.g. the CNN/Daily Mail summarization dataset.
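(For example, a minimal stand-in for the private QA data, assuming the public cnn_dailymail dataset with its 'article' and 'highlights' columns, mapped onto the same two columns used in the script above.)

from datasets import load_dataset
import pandas as pd

# Hypothetical substitute: take a small slice of CNN/DailyMail and rename its columns.
cnn = load_dataset("cnn_dailymail", "3.0.0", split="train[:1000]")
dff = cnn.to_pandas().rename(columns={"article": "question_passage",
                                      "highlights": "answer"})
dff = dff[["question_passage", "answer"]]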

kmehant commented 10 months ago

@Smu-Tan

How were you able to get around this?