hpcaitech / ColossalAI


[BUG]: Error caused by multiple `param_groups` in optimizer #3903

Open · HAOCHENYE opened this issue 1 year ago

HAOCHENYE commented 1 year ago

πŸ› Describe the bug

If the parameters in one of the `param_groups` cannot be divided into `world_size` parts, an error is raised here:

https://github.com/hpcaitech/ColossalAI/blob/57a6d7685cf05b0763eeb65eb62e7d8cce2f6955/colossalai/zero/low_level/low_level_optim.py#L183

since an empty list cannot be flattened. The error can be triggered simply by making a small modification to

https://github.com/hpcaitech/ColossalAI/blob/main/examples/tutorial/new_api/cifar_resnet/train.py

like this:

    param_groups = []
    for name, param in model.named_parameters():
        if name == 'conv1.weight':
            param_groups.append({'params': [param], 'lr': 0.01})
        else:
            param_groups.append({'params': [param], 'lr': 0.1})
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = FusedAdam(param_groups, lr=LEARNING_RATE)

Execute the modified script:

import argparse
import os
from pathlib import Path

import colossalai
import torch
import torch.distributed as dist
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from colossalai.booster import Booster
from colossalai.booster.plugin import (GeminiPlugin, LowLevelZeroPlugin,
                                       TorchDDPPlugin)
from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
from colossalai.cluster import DistCoordinator
from colossalai.nn.optimizer import FusedAdam, HybridAdam
from colossalai.utils import get_current_device
from torch.optim import Optimizer
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.data import DataLoader
from tqdm import tqdm

# ==============================
# Prepare Hyperparameters
# ==============================
NUM_EPOCHS = 80
LEARNING_RATE = 1e-3

def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPluginBase):
    # transform
    transform_train = transforms.Compose(
        [transforms.Pad(4),
         transforms.RandomHorizontalFlip(),
         transforms.RandomCrop(32),
         transforms.ToTensor()])
    transform_test = transforms.ToTensor()

    # CIFAR-10 dataset
    data_path = os.environ.get('DATA', './data')
    with coordinator.priority_execution():
        train_dataset = torchvision.datasets.CIFAR10(root=data_path,
                                                     train=True,
                                                     transform=transform_train,
                                                     download=True)
        test_dataset = torchvision.datasets.CIFAR10(root=data_path,
                                                    train=False,
                                                    transform=transform_test,
                                                    download=True)

    # Data loader
    train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    test_dataloader = plugin.prepare_dataloader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=False)
    return train_dataloader, test_dataloader

@torch.no_grad()
def evaluate(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator) -> float:
    model.eval()
    correct = torch.zeros(1, dtype=torch.int64, device=get_current_device())
    total = torch.zeros(1, dtype=torch.int64, device=get_current_device())
    for images, labels in test_dataloader:
        images = images.cuda()
        labels = labels.cuda()
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    dist.all_reduce(correct)
    dist.all_reduce(total)
    accuracy = correct.item() / total.item()
    if coordinator.is_master():
        print(f'Accuracy of the model on the test images: {accuracy * 100:.2f} %')
    return accuracy

def train_epoch(epoch: int, model: nn.Module, optimizer: Optimizer, criterion: nn.Module, train_dataloader: DataLoader,
                booster: Booster, coordinator: DistCoordinator):
    model.train()
    with tqdm(train_dataloader, desc=f'Epoch [{epoch + 1}/{NUM_EPOCHS}]', disable=not coordinator.is_master()) as pbar:
        for images, labels in pbar:
            images = images.cuda()
            labels = labels.cuda()
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward and optimize
            booster.backward(loss, optimizer)
            optimizer.step()
            optimizer.zero_grad()

            # Print log info
            pbar.set_postfix({'loss': loss.item()})

def main():
    # ==============================
    # Parse Arguments
    # ==============================
    parser = argparse.ArgumentParser()
    # FIXME(ver217): gemini is not supported resnet now
    parser.add_argument('-p',
                        '--plugin',
                        type=str,
                        default='torch_ddp',
                        choices=['torch_ddp', 'torch_ddp_fp16', 'low_level_zero'],
                        help="plugin to use")
    parser.add_argument('-r', '--resume', type=int, default=-1, help="resume from the epoch's checkpoint")
    parser.add_argument('-c', '--checkpoint', type=str, default='./checkpoint', help="checkpoint directory")
    parser.add_argument('-i', '--interval', type=int, default=5, help="interval of saving checkpoint")
    parser.add_argument('--target_acc',
                        type=float,
                        default=None,
                        help="target accuracy. Raise exception if not reached")
    args = parser.parse_args()

    # ==============================
    # Prepare Checkpoint Directory
    # ==============================
    if args.interval > 0:
        Path(args.checkpoint).mkdir(parents=True, exist_ok=True)

    # ==============================
    # Launch Distributed Environment
    # ==============================
    colossalai.launch_from_torch(config={})
    coordinator = DistCoordinator()

    # update the learning rate with linear scaling
    # old_gpu_num / old_lr = new_gpu_num / new_lr
    global LEARNING_RATE
    LEARNING_RATE *= coordinator.world_size

    # ==============================
    # Instantiate Plugin and Booster
    # ==============================
    booster_kwargs = {}
    if args.plugin == 'torch_ddp_fp16':
        booster_kwargs['mixed_precision'] = 'fp16'
    if args.plugin.startswith('torch_ddp'):
        plugin = TorchDDPPlugin()
    elif args.plugin == 'gemini':
        plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2**5)
    elif args.plugin == 'low_level_zero':
        plugin = LowLevelZeroPlugin(initial_scale=2**5)

    booster = Booster(plugin=plugin, **booster_kwargs)

    # ==============================
    # Prepare Dataloader
    # ==============================
    train_dataloader, test_dataloader = build_dataloader(100, coordinator, plugin)

    # ====================================
    # Prepare model, optimizer, criterion
    # ====================================
    # resnet18
    model = torchvision.models.resnet18(num_classes=10)
    param_groups = []
    for name, param in model.named_parameters():
        if name == 'conv1.weight':
            param_groups.append({'params': [param], 'lr': 0.01})
        else:
            param_groups.append({'params': [param], 'lr': 0.1})
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = FusedAdam(param_groups, lr=LEARNING_RATE)

    # lr scheduler
    lr_scheduler = MultiStepLR(optimizer, milestones=[20, 40, 60, 80], gamma=1 / 3)

    # ==============================
    # Boost with ColossalAI
    # ==============================
    model, optimizer, criterion, _, lr_scheduler = booster.boost(model,
                                                                 optimizer,
                                                                 criterion=criterion,
                                                                 lr_scheduler=lr_scheduler)

    # ==============================
    # Resume from checkpoint
    # ==============================
    if args.resume >= 0:
        booster.load_model(model, f'{args.checkpoint}/model_{args.resume}.pth')
        booster.load_optimizer(optimizer, f'{args.checkpoint}/optimizer_{args.resume}.pth')
        booster.load_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{args.resume}.pth')

    # ==============================
    # Train model
    # ==============================
    start_epoch = args.resume if args.resume >= 0 else 0
    for epoch in range(start_epoch, NUM_EPOCHS):
        train_epoch(epoch, model, optimizer, criterion, train_dataloader, booster, coordinator)
        lr_scheduler.step()

        # save checkpoint
        if args.interval > 0 and (epoch + 1) % args.interval == 0:
            booster.save_model(model, f'{args.checkpoint}/model_{epoch + 1}.pth')
            booster.save_optimizer(optimizer, f'{args.checkpoint}/optimizer_{epoch + 1}.pth')
            booster.save_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{epoch + 1}.pth')

    accuracy = evaluate(model, test_dataloader, coordinator)
    if args.target_acc is not None:
        assert accuracy >= args.target_acc, f'Accuracy {accuracy} is lower than target accuracy {args.target_acc}'

if __name__ == '__main__':
    main()

You'll get an error like this:

Traceback (most recent call last):
  File "/home/yehaochen/codebase/mmengine.worktrees/flexible_runner_colosalai/examples/test_colossalai.py", line 209, in <module>
    main()
  File "/home/yehaochen/codebase/mmengine.worktrees/flexible_runner_colosalai/examples/test_colossalai.py", line 176, in main
    model, optimizer, criterion, _, lr_scheduler = booster.boost(model,
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/colossalai/booster/booster.py", line 119, in boost
    model, optimizer, criterion, dataloader, lr_scheduler = self.plugin.configure(
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/colossalai/booster/plugin/low_level_zero_plugin.py", line 199, in configure
    optimizer = LowLevelZeroOptimizer(model.unwrap(), optimizer, self.zero_optim_config, self.optim_kwargs,
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/colossalai/booster/plugin/low_level_zero_plugin.py", line 74, in __init__
    optimizer = zero_optim_wrapper(module,
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/colossalai/zero/wrapper.py", line 110, in zero_optim_wrapper
    return LowLevelZeroOptimizer(optimizer, **config_dict, verbose=verbose)
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/colossalai/zero/low_level/low_level_optim.py", line 165, in __init__
    flat_tensor = flatten(tensor_list)
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/colossalai/zero/low_level/_utils.py", line 14, in flatten
    return _flatten_dense_tensors(input_)
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/torch/_utils.py", line 459, in _flatten_dense_tensors
    return torch._C._nn.flatten_dense_tensors(tensors)
RuntimeError: torch.cat(): expected a non-empty list of Tensors
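
For reference, the failure is easy to reproduce in isolation. The sketch below is illustrative only: the `split_group` helper is hypothetical and merely mimics, conceptually, the per-rank partitioning that the ZeRO-style optimizer performs. A group that holds a single parameter leaves one rank with an empty shard, and flattening that shard raises the same error as in the traceback:

    import torch
    from torch._utils import _flatten_dense_tensors

    def split_group(params, world_size):
        # Illustrative only: deal the group's tensors out to the ranks round-robin.
        return [params[rank::world_size] for rank in range(world_size)]

    # A param group that holds a single tensor, like the per-parameter groups above.
    group = [torch.randn(64, 3, 7, 7)]
    shards = split_group(group, world_size=2)

    print([len(s) for s in shards])    # [1, 0] -> rank 1 is left with an empty shard
    _flatten_dense_tensors(shards[1])  # RuntimeError: torch.cat(): expected a non-empty list of Tensors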

Besides, I found that the gemini mode cannot be enabled here:

https://github.com/hpcaitech/ColossalAI/blob/57a6d7685cf05b0763eeb65eb62e7d8cce2f6955/examples/tutorial/new_api/cifar_resnet/train.py#L143

If I force-enable it, another error is raised:

RuntimeError: Output 0 of AliasBackward0 is a view and is being modified inplace. This view was created inside a custom Function (or because an input was returned as-is) and the autograd logic to handle view+inplace would override the custom backward associated with the custom Function, leading to incorrect gradients. This behavior is forbidden. You can fix this by cloning the output of the custom Function.
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 190766) of binary: /home/yehaochen/anaconda3/envs/py310torch20/bin/python

I guess this is caused by the in-place residual addition in torchvision's ResNet blocks, and gemini mode works after a small modification.

Original:

        out += identity
        out = self.relu(out)

        return out

After:

        res = out + identity
        res = self.relu(res)

        return res
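
For anyone who wants to try this fix without editing the installed torchvision sources, the same change can be applied as a monkey patch. A minimal sketch, assuming torchvision's standard `BasicBlock` layout (verify against your torchvision version; `Bottleneck` would need the same treatment for deeper ResNets):

    import torch
    from torchvision.models.resnet import BasicBlock

    def _forward_no_inplace(self, x: torch.Tensor) -> torch.Tensor:
        # Same computation as BasicBlock.forward, but the residual addition is
        # done out of place so autograd never sees a view modified in place.
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out = out + identity  # was: out += identity
        out = self.relu(out)
        return out

    # Apply the patch before the model is constructed.
    BasicBlock.forward = _forward_no_inplace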

Although the error message is clear enough in this case, I run into a more intricate situation with my own model (ViT-Large): its parameters are divided into multiple `param_groups` with varying learning rates, and training is interrupted by a system signal.

WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 167965 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -11) local_rank: 0 (pid: 167964) of binary: /home/yehaochen/anaconda3/envs/py310torch20/bin/python
Traceback (most recent call last):
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/debugpy/__main__.py", line 45, in <module>
    cli.main()
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/debugpy/server/cli.py", line 444, in main
    run()
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/debugpy/server/cli.py", line 331, in run_module
    run_module_as_main(target_as_str, alter_argv=True)
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/torch/distributed/launch.py", line 196, in <module>
    main()
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/torch/distributed/launch.py", line 192, in main
    launch(args)
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/torch/distributed/launch.py", line 177, in launch
    run(args)
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
    elastic_launch(
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
========================================================
./tools/train.py FAILED
--------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
--------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2023-06-06_02:43:25
  host      : SH-IDC1-10-140-1-184
  rank      : 0 (local_rank: 0)
  exitcode  : -11 (pid: 167964)
  error_file: <N/A>
  traceback : Signal 11 (SIGSEGV) received by PID 167964

I'm not sure what happened; could you give me some advice?

Environment

No response

ver217 commented 1 year ago

It's a known bug in low-level zero, and we are fixing it.

For gemini example, could you debug via the core dump file first?

HAOCHENYE commented 1 year ago

It's a known bug in low-level zero, and we are fixing it.

For gemini example, could you debug via the core dump file first?

Hello, is there a tutorial available on debugging core dump files? Should I install ColossalAI from source with debug symbols retained and use a Python interpreter compiled with debug symbols?
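
One lightweight first step, before full core-dump debugging, is Python's built-in `faulthandler` module: it dumps the Python-level stack of every thread when the process receives a fatal signal such as SIGSEGV. It will not show C-level frames, so gdb on the core file is still needed for those. A minimal sketch (the file naming is illustrative):

    import faulthandler
    import os

    # Enable as early as possible in the training script, before model setup.
    # Each rank writes to its own file so the dumps do not interleave.
    rank = os.environ.get('RANK', '0')
    faulthandler.enable(file=open(f'faulthandler_rank{rank}.log', 'w'), all_threads=True)

    # Alternatives that need no code change:
    #   python -X faulthandler tools/train.py ...
    #   PYTHONFAULTHANDLER=1 torchrun ... tools/train.py ...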

flybird11111 commented 1 year ago

πŸ› Describe the bug

If the params in one param_groups cannot be divided into world_size parts, an error will be raised here:

https://github.com/hpcaitech/ColossalAI/blob/57a6d7685cf05b0763eeb65eb62e7d8cce2f6955/colossalai/zero/low_level/low_level_optim.py#L183

since an empty list cannot be flattened. The error can be simply triggered by doing small modification in

https://github.com/hpcaitech/ColossalAI/blob/main/examples/tutorial/new_api/cifar_resnet/train.py

like this:

    param_groups = []
    for name, param in model.named_parameters():
        if name == 'conv1.weight':
            param_groups.append({'params': [param], 'lr': 0.01})
        else:
            param_groups.append({'params': [param], 'lr': 0.1})
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = FusedAdam(param_groups, lr=LEARNING_RATE)

execute the modified script:

import argparse
import os
from pathlib import Path

import colossalai
import torch
import torch.distributed as dist
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from colossalai.booster import Booster
from colossalai.booster.plugin import (GeminiPlugin, LowLevelZeroPlugin,
                                       TorchDDPPlugin)
from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
from colossalai.cluster import DistCoordinator
from colossalai.nn.optimizer import FusedAdam, HybridAdam
from colossalai.utils import get_current_device
from torch.optim import Optimizer
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.data import DataLoader
from tqdm import tqdm

# ==============================
# Prepare Hyperparameters
# ==============================
NUM_EPOCHS = 80
LEARNING_RATE = 1e-3

def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPluginBase):
    # transform
    transform_train = transforms.Compose(
        [transforms.Pad(4),
         transforms.RandomHorizontalFlip(),
         transforms.RandomCrop(32),
         transforms.ToTensor()])
    transform_test = transforms.ToTensor()

    # CIFAR-10 dataset
    data_path = os.environ.get('DATA', './data')
    with coordinator.priority_execution():
        train_dataset = torchvision.datasets.CIFAR10(root=data_path,
                                                     train=True,
                                                     transform=transform_train,
                                                     download=True)
        test_dataset = torchvision.datasets.CIFAR10(root=data_path,
                                                    train=False,
                                                    transform=transform_test,
                                                    download=True)

    # Data loader
    train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    test_dataloader = plugin.prepare_dataloader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=False)
    return train_dataloader, test_dataloader

@torch.no_grad()
def evaluate(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator) -> float:
    model.eval()
    correct = torch.zeros(1, dtype=torch.int64, device=get_current_device())
    total = torch.zeros(1, dtype=torch.int64, device=get_current_device())
    for images, labels in test_dataloader:
        images = images.cuda()
        labels = labels.cuda()
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    dist.all_reduce(correct)
    dist.all_reduce(total)
    accuracy = correct.item() / total.item()
    if coordinator.is_master():
        print(f'Accuracy of the model on the test images: {accuracy * 100:.2f} %')
    return accuracy

def train_epoch(epoch: int, model: nn.Module, optimizer: Optimizer, criterion: nn.Module, train_dataloader: DataLoader,
                booster: Booster, coordinator: DistCoordinator):
    model.train()
    with tqdm(train_dataloader, desc=f'Epoch [{epoch + 1}/{NUM_EPOCHS}]', disable=not coordinator.is_master()) as pbar:
        for images, labels in pbar:
            images = images.cuda()
            labels = labels.cuda()
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward and optimize
            booster.backward(loss, optimizer)
            optimizer.step()
            optimizer.zero_grad()

            # Print log info
            pbar.set_postfix({'loss': loss.item()})

def main():
    # ==============================
    # Parse Arguments
    # ==============================
    parser = argparse.ArgumentParser()
    # FIXME(ver217): gemini is not supported resnet now
    parser.add_argument('-p',
                        '--plugin',
                        type=str,
                        default='torch_ddp',
                        choices=['torch_ddp', 'torch_ddp_fp16', 'low_level_zero'],
                        help="plugin to use")
    parser.add_argument('-r', '--resume', type=int, default=-1, help="resume from the epoch's checkpoint")
    parser.add_argument('-c', '--checkpoint', type=str, default='./checkpoint', help="checkpoint directory")
    parser.add_argument('-i', '--interval', type=int, default=5, help="interval of saving checkpoint")
    parser.add_argument('--target_acc',
                        type=float,
                        default=None,
                        help="target accuracy. Raise exception if not reached")
    args = parser.parse_args()

    # ==============================
    # Prepare Checkpoint Directory
    # ==============================
    if args.interval > 0:
        Path(args.checkpoint).mkdir(parents=True, exist_ok=True)

    # ==============================
    # Launch Distributed Environment
    # ==============================
    colossalai.launch_from_torch(config={})
    coordinator = DistCoordinator()

    # update the learning rate with linear scaling
    # old_gpu_num / old_lr = new_gpu_num / new_lr
    global LEARNING_RATE
    LEARNING_RATE *= coordinator.world_size

    # ==============================
    # Instantiate Plugin and Booster
    # ==============================
    booster_kwargs = {}
    if args.plugin == 'torch_ddp_fp16':
        booster_kwargs['mixed_precision'] = 'fp16'
    if args.plugin.startswith('torch_ddp'):
        plugin = TorchDDPPlugin()
    elif args.plugin == 'gemini':
        plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2**5)
    elif args.plugin == 'low_level_zero':
        plugin = LowLevelZeroPlugin(initial_scale=2**5)

    booster = Booster(plugin=plugin, **booster_kwargs)

    # ==============================
    # Prepare Dataloader
    # ==============================
    train_dataloader, test_dataloader = build_dataloader(100, coordinator, plugin)

    # ====================================
    # Prepare model, optimizer, criterion
    # ====================================
    # resent50
    model = torchvision.models.resnet18(num_classes=10)
    param_groups = []
    for name, param in model.named_parameters():
        if name == 'conv1.weight':
            param_groups.append({'params': [param], 'lr': 0.01})
        else:
            param_groups.append({'params': [param], 'lr': 0.1})
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = FusedAdam(param_groups, lr=LEARNING_RATE)

    # lr scheduler
    lr_scheduler = MultiStepLR(optimizer, milestones=[20, 40, 60, 80], gamma=1 / 3)

    # ==============================
    # Boost with ColossalAI
    # ==============================
    model, optimizer, criterion, _, lr_scheduler = booster.boost(model,
                                                                 optimizer,
                                                                 criterion=criterion,
                                                                 lr_scheduler=lr_scheduler)

    # ==============================
    # Resume from checkpoint
    # ==============================
    if args.resume >= 0:
        booster.load_model(model, f'{args.checkpoint}/model_{args.resume}.pth')
        booster.load_optimizer(optimizer, f'{args.checkpoint}/optimizer_{args.resume}.pth')
        booster.load_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{args.resume}.pth')

    # ==============================
    # Train model
    # ==============================
    start_epoch = args.resume if args.resume >= 0 else 0
    for epoch in range(start_epoch, NUM_EPOCHS):
        train_epoch(epoch, model, optimizer, criterion, train_dataloader, booster, coordinator)
        lr_scheduler.step()

        # save checkpoint
        if args.interval > 0 and (epoch + 1) % args.interval == 0:
            booster.save_model(model, f'{args.checkpoint}/model_{epoch + 1}.pth')
            booster.save_optimizer(optimizer, f'{args.checkpoint}/optimizer_{epoch + 1}.pth')
            booster.save_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{epoch + 1}.pth')

    accuracy = evaluate(model, test_dataloader, coordinator)
    if args.target_acc is not None:
        assert accuracy >= args.target_acc, f'Accuracy {accuracy} is lower than target accuracy {args.target_acc}'

if __name__ == '__main__':
    main()

You'll get the error like this:

Traceback (most recent call last):
  File "/home/yehaochen/codebase/mmengine.worktrees/flexible_runner_colosalai/examples/test_colossalai.py", line 209, in <module>
    main()
  File "/home/yehaochen/codebase/mmengine.worktrees/flexible_runner_colosalai/examples/test_colossalai.py", line 176, in main
    model, optimizer, criterion, _, lr_scheduler = booster.boost(model,
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/colossalai/booster/booster.py", line 119, in boost
    model, optimizer, criterion, dataloader, lr_scheduler = self.plugin.configure(
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/colossalai/booster/plugin/low_level_zero_plugin.py", line 199, in configure
    optimizer = LowLevelZeroOptimizer(model.unwrap(), optimizer, self.zero_optim_config, self.optim_kwargs,
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/colossalai/booster/plugin/low_level_zero_plugin.py", line 74, in __init__
    optimizer = zero_optim_wrapper(module,
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/colossalai/zero/wrapper.py", line 110, in zero_optim_wrapper
    return LowLevelZeroOptimizer(optimizer, **config_dict, verbose=verbose)
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/colossalai/zero/low_level/low_level_optim.py", line 165, in __init__
    flat_tensor = flatten(tensor_list)
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/colossalai/zero/low_level/_utils.py", line 14, in flatten
    return _flatten_dense_tensors(input_)
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/torch/_utils.py", line 459, in _flatten_dense_tensors
    return torch._C._nn.flatten_dense_tensors(tensors)
RuntimeError: torch.cat(): expected a non-empty list of Tensors

Besides, I found that the gemini mode cannot be enabled in here:

https://github.com/hpcaitech/ColossalAI/blob/57a6d7685cf05b0763eeb65eb62e7d8cce2f6955/examples/tutorial/new_api/cifar_resnet/train.py#L143

If I force to enable it, an error will also be raised:

RuntimeError: Output 0 of AliasBackward0 is a view and is being modified inplace. This view was created inside a custom Function (or because an input was returned as-is) and the autograd logic to handle view+inplace would override the custom backward associated with the custom Function, leading to incorrect gradients. This behavior is forbidden. You can fix this by cloning the output of the custom Function.
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 190766) of binary: /home/yehaochen/anaconda3/envs/py310torch20/bin/python

I guess this is caused by the in-place operation in torchvision, and the gemini mode could work after a small modification:

origin:

        out += identity
        out = self.relu(out)

        return out

after

        res = out + identity
        res = self.relu(res)

        return res

Although the error message is clear enough in this case, I encounter a more intricate situation with my model (vit-large). Its parameters are divided into multiple param_groups with varying learning rates. Consequently, I experience a training interruption due to a system signal.

WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 167965 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -11) local_rank: 0 (pid: 167964) of binary: /home/yehaochen/anaconda3/envs/py310torch20/bin/python
Traceback (most recent call last):
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/debugpy/__main__.py", line 45, in <module>
    cli.main()
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/debugpy/server/cli.py", line 444, in main
    run()
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/debugpy/server/cli.py", line 331, in run_module
    run_module_as_main(target_as_str, alter_argv=True)
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/torch/distributed/launch.py", line 196, in <module>
    main()
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/torch/distributed/launch.py", line 192, in main
    launch(args)
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/torch/distributed/launch.py", line 177, in launch
    run(args)
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
    elastic_launch(
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/yehaochen/anaconda3/envs/py310torch20/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
========================================================
./tools/train.py FAILED
--------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
--------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2023-06-06_02:43:25
  host      : SH-IDC1-10-140-1-184
  rank      : 0 (local_rank: 0)
  exitcode  : -11 (pid: 167964)
  error_file: <N/A>
  traceback : Signal 11 (SIGSEGV) received by PID 167964

I'm not sure about what happened, could you give me some advice?

Environment

No response

The interruption appears to be an indication of a CUDA out-of-memory error.

HAOCHENYE commented 1 year ago

The interruption appears to be an indication of a CUDA out-of-memory error.

Actually, this model can be trained with native PyTorch using 28 GB of allocated GPU memory. It is strange that an OOM error is raised when using ColossalAI. If there is only one parameter group in the optimizer, no error is raised and only 20 GB of memory is allocated.
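
To narrow down where the extra memory goes, one option is to log per-rank allocated and peak CUDA memory at a few fixed points (for example after `booster.boost` and after the first `optimizer.step`) and compare the single-group and multi-group runs. A minimal sketch (the helper name is illustrative):

    import torch
    import torch.distributed as dist

    def log_cuda_memory(tag: str) -> None:
        # Current and peak CUDA memory allocated by tensors on this rank, in GiB.
        rank = dist.get_rank() if dist.is_initialized() else 0
        allocated = torch.cuda.memory_allocated() / 1024**3
        peak = torch.cuda.max_memory_allocated() / 1024**3
        print(f'[rank {rank}] {tag}: allocated={allocated:.2f} GiB, peak={peak:.2f} GiB')

    # Example placement (illustrative):
    #   log_cuda_memory('after booster.boost')
    #   log_cuda_memory('after first optimizer.step')
    # torch.cuda.reset_peak_memory_stats() restarts peak tracking if per-phase peaks are needed.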