Azure / azure-sdk-for-python

This repository is for active development of the Azure SDK for Python. For consumers of the SDK we recommend visiting our public developer docs at https://docs.microsoft.com/python/azure/ or our versioned developer docs at https://azure.github.io/azure-sdk-for-python.

Performing Distributed training with PyTorch and Azure ML? #31135

Open rishavmandal771 opened 12 months ago

rishavmandal771 commented 12 months ago

Operating System

Linux

Version Information

channels:

Steps to reproduce

  1. Reference link - https://github.com/Azure/azureml-examples/blob/main/sdk/python/jobs/single-step/pytorch/distributed-training/distributed-cifar10.ipynb
  2. Trained the model successfully on a single GPU
  3. Scaled the training to a distributed multi-GPU setup

Expected behavior

The model should train successfully in the distributed multi-GPU setting, just as it did on a single GPU.

Actual behavior

I am trying to run the train.py below to train the model on a GPU cluster with one instance node and 4 GPUs, but every time it fails with a CUDA out-of-memory error.
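For reference, here is a minimal sketch of how a job like this is typically submitted with the azure-ai-ml v2 SDK (the compute name, environment, and data path below are placeholders, not my exact configuration):

from azure.ai.ml import MLClient, command, Input
from azure.identity import DefaultAzureCredential

# Connect to the workspace (assumes a config.json is available)
ml_client = MLClient.from_config(credential=DefaultAzureCredential())

job = command(
    code="./src",  # folder containing train.py
    command=(
        "python train.py --data_dir ${{inputs.data_dir}} "
        "--epochs 5 --batch_size 8 --learning_rate 0.01"
    ),
    inputs={
        # placeholder path to the training CSV
        "data_dir": Input(type="uri_file", path="azureml://datastores/workspaceblobstore/paths/data/train.csv"),
    },
    environment="AzureML-pytorch-1.13-ubuntu20.04-py38-cuda11.7-gpu@latest",  # placeholder curated environment
    compute="gpu-cluster",  # placeholder compute target
    instance_count=1,  # one node
    distribution={"type": "PyTorch", "process_count_per_instance": 4},  # one process per GPU
)
ml_client.jobs.create_or_update(job)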

Here's my train.py code:

# A. Specify GPU To Use
import os
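# LOCAL_RANK is set by the distributed launcher; restricting CUDA_VISIBLE_DEVICES pins each process to its own GPU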
os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['LOCAL_RANK']
import torch
import argparse

import torch.nn as nn
import torch.nn.functional as F

import pandas as pd
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.optim import SGD
import torch.multiprocessing as mp

parser = argparse.ArgumentParser()
# add arguments
parser.add_argument("--data_dir", type=str, help="directory containing CIFAR-10 dataset")
parser.add_argument('--epochs', type=int)
parser.add_argument('--batch_size', type=int)
parser.add_argument('--learning_rate', type=float)
args = parser.parse_args()

df = pd.read_csv(args.data_dir, names=["text", "labels"], skiprows=1)

class BertModel(torch.nn.Module):

    def __init__(self):

        super(BertModel, self).__init__()

        # NOTE: unique_labels (the set of token labels) is assumed to be defined elsewhere in the original script
        self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_labels))

    def forward(self, input_id, mask, label):

        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
label_all_tokens = False

class DataSequence(torch.utils.data.Dataset):

    def __init__(self, df):

        lb = [i.split() for i in df['labels'].values.tolist()]
        txt = df['text'].values.tolist()
        self.texts = [tokenizer(str(i),
                               padding='max_length', max_length = 512, truncation=True, return_tensors="pt") for i in txt]
        # align_label (maps word-level labels to token-level labels) is assumed to be defined elsewhere in the original script
        self.labels = [align_label(i, j) for i, j in zip(txt, lb)]

    def __len__(self):

        return len(self.labels)

    def get_batch_data(self, idx):

        return self.texts[idx]

    def get_batch_labels(self, idx):

        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):

        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)

        return batch_data, batch_labels

# define functions
def train_model(model, train_loader, criterion, optimizer, device, rank, epoch, print_freq=100):
    model.train()
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        train_label = data[1]
        train_data = data[0]
        train_label = train_label.to(device)
        mask = train_data['attention_mask'].squeeze(1).to(device)
        input_id = train_data['input_ids'].squeeze(1).to(device)

        optimizer.zero_grad()
        loss, logits = model(input_id, mask, train_label)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % print_freq == 0:  # print every print_freq mini-batches
            print(
                "Rank %d: [%d, %5d] loss: %.3f"
                % (rank, epoch + 1, i + 1, running_loss / print_freq)
            )
            running_loss = 0.0

def save_model(model):
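    # Note: under DDP, model here is the wrapped module, so the state_dict keys carry a "module." prefix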
    torch.save(model.state_dict(), 'model.pt')

def main():
    df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                                [int(.8 * len(df)), int(.9 * len(df))])
    # B. Prepare For Distributed Training
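    # WORLD_SIZE, RANK and LOCAL_RANK are environment variables injected for each worker process by the Azure ML PyTorch launcher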
    world_size = int(os.environ['WORLD_SIZE'])
    rank = int(os.environ['RANK'])
    local_rank = int(os.environ['LOCAL_RANK'])
    is_distributed = world_size > 1

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    if is_distributed:
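        # NCCL backend for GPU collectives; with the default env:// init, MASTER_ADDR/MASTER_PORT, RANK and WORLD_SIZE are read from the environment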
        torch.distributed.init_process_group(backend="nccl")

    # C. Perform Certain Tasks Only In Specific Processes
    if local_rank == 0:
        train_set = DataSequence(df_train)
    if is_distributed:
        torch.distributed.barrier()
    if local_rank != 0:
        train_set = DataSequence(df_train)

    # D. Create Distributed Sampler and Data Loader
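    # DistributedSampler gives each rank a disjoint shard of the dataset; DataLoader shuffling is disabled whenever a sampler is supplied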
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_set) if is_distributed else None
    train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=args.batch_size,
        shuffle=(train_sampler is None),
        sampler=train_sampler)

    # E. Initialize Model Using DistributedDataParallel
    model = BertModel().to(device)
    if is_distributed:
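        # DDP replicates the model on each GPU and all-reduces gradients across ranks during backward()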
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[device], output_device=device)

    # F. Set Learning Rate and Optimizer
    optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)
    criterion = torch.nn.CrossEntropyLoss()

    # G. Update Distributed Sampler On Each Epoch
    for epoch in range(args.epochs):
        if is_distributed:
            train_sampler.set_epoch(epoch)
        train_model(model, train_loader, criterion, optimizer, device, rank, epoch)

    # C. Perform Certain Tasks Only In Specific Processes
    # Evaluate and save the model only in the main process (with rank 0)
    # Note that it is also possible to perform evaluation using multiple processes in parallel if needed
    if rank == 0:
        test_loader = torch.utils.data.DataLoader(DataSequence(df_test), batch_size=args.batch_size, shuffle=False)
        #evaluate(model, test_loader, device)
        save_model(model)

if __name__ == '__main__':
    main()

Use this code as reference - https://github.com/Azure/azureml-examples/blob/main/sdk/python/jobs/single-step/pytorch/distributed-training/distributed-cifar10.ipynb

This is the error that I am getting:

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
39e6f54faaf2439b992bd4f67fb0aa37000001:61:138 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
39e6f54faaf2439b992bd4f67fb0aa37000001:61:138 [0] NCCL INFO NET/Socket : Using [0]eth0:10.0.0.5<0>
39e6f54faaf2439b992bd4f67fb0aa37000001:61:138 [0] NCCL INFO Using network Socket
39e6f54faaf2439b992bd4f67fb0aa37000001:61:138 [0] NCCL INFO Setting affinity for GPU 0 to ffff,ff000000
39e6f54faaf2439b992bd4f67fb0aa37000001:61:138 [0] NCCL INFO Channel 00/02 :    0   1   2   3
39e6f54faaf2439b992bd4f67fb0aa37000001:61:138 [0] NCCL INFO Channel 01/02 :    0   1   2   3
39e6f54faaf2439b992bd4f67fb0aa37000001:61:138 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
39e6f54faaf2439b992bd4f67fb0aa37000001:61:138 [0] NCCL INFO Channel 00 : 0[100000] -> 1[200000] via SHM/direct/direct
39e6f54faaf2439b992bd4f67fb0aa37000001:61:138 [0] NCCL INFO Channel 01 : 0[100000] -> 1[200000] via SHM/direct/direct
39e6f54faaf2439b992bd4f67fb0aa37000001:61:138 [0] NCCL INFO Connected all rings
39e6f54faaf2439b992bd4f67fb0aa37000001:61:138 [0] NCCL INFO Connected all trees
39e6f54faaf2439b992bd4f67fb0aa37000001:61:138 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
39e6f54faaf2439b992bd4f67fb0aa37000001:61:138 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
39e6f54faaf2439b992bd4f67fb0aa37000001:61:138 [0] NCCL INFO comm 0x56340dd87000 rank 0 nranks 4 cudaDev 0 busId 100000 - Init COMPLETE
Traceback (most recent call last):
  File "demo.py", line 197, in <module>
    main()
  File "demo.py", line 186, in main
    train_model(model, train_loader, criterion, optimizer, device)
  File "demo.py", line 127, in train_model
    loss, logits = model(input_id, mask, train_label)
  File "/azureml-envs/azureml_fd02e2c27414f14cbd6b08cf24bea779/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/azureml-envs/azureml_fd02e2c27414f14cbd6b08cf24bea779/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1156, in forward
    output = self._run_ddp_forward(*inputs, **kwargs)
  File "/azureml-envs/azureml_fd02e2c27414f14cbd6b08cf24bea779/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1110, in _run_ddp_forward
    return module_to_run(*inputs[0], **kwargs[0])  # type: ignore[index]
  File "/azureml-envs/azureml_fd02e2c27414f14cbd6b08cf24bea779/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "demo.py", line 52, in forward
    output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)
  File "/azureml-envs/azureml_fd02e2c27414f14cbd6b08cf24bea779/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/azureml-envs/azureml_fd02e2c27414f14cbd6b08cf24bea779/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py", line 1758, in forward
    outputs = self.bert(
  File "/azureml-envs/azureml_fd02e2c27414f14cbd6b08cf24bea779/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/azureml-envs/azureml_fd02e2c27414f14cbd6b08cf24bea779/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py", line 1020, in forward
    encoder_outputs = self.encoder(
  File "/azureml-envs/azureml_fd02e2c27414f14cbd6b08cf24bea779/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/azureml-envs/azureml_fd02e2c27414f14cbd6b08cf24bea779/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py", line 610, in forward
    layer_outputs = layer_module(
  File "/azureml-envs/azureml_fd02e2c27414f14cbd6b08cf24bea779/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/azureml-envs/azureml_fd02e2c27414f14cbd6b08cf24bea779/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py", line 495, in forward
    self_attention_outputs = self.attention(
  File "/azureml-envs/azureml_fd02e2c27414f14cbd6b08cf24bea779/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/azureml-envs/azureml_fd02e2c27414f14cbd6b08cf24bea779/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py", line 425, in forward
    self_outputs = self.self(
  File "/azureml-envs/azureml_fd02e2c27414f14cbd6b08cf24bea779/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/azureml-envs/azureml_fd02e2c27414f14cbd6b08cf24bea779/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py", line 365, in forward
    context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB (GPU 0; 7.94 GiB total capacity; 7.32 GiB already allocated; 44.75 MiB free; 7.39 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

I am using this GPU cluster: 4 x NVIDIA Tesla M60 on a Standard_NV48s_v3 (48 cores, 448 GB RAM, 2948 GB disk).

Can someone please help me address this issue?

Additional information

No response

xiangyan99 commented 11 months ago

Thanks for the feedback, we’ll investigate asap.

github-actions[bot] commented 11 months ago

Thanks for the feedback! We are routing this to the appropriate team for follow-up. cc @azureml-github @Azure/azure-ml-sdk.