graphcore / poptorch

PyTorch interface for the IPU
https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/
MIT License

IPUs hang when pipelining with more than 2 IPUs #1

Closed Cy-r0 closed 3 years ago

Cy-r0 commented 3 years ago

Hello, I'm training a model with poptorch on a DELL DSS8840. I need to use pipelining because I'll eventually be training on high-resolution images, which won't fit on a single IPU. However, when I split the model over more than 2 IPUs, I get the following error:

[06:58:00.549] [poptorch::python] [critical] RuntimeError: ERROR in poptorch/python/poptorch.cpp:722: 'std::exception' exception: IPUDevice: still waiting for host sync after: 300 seconds. Use the engine option target.hostSyncTimeout to increase the timeout.

Traceback (most recent call last):
  File "reproducible_example.py", line 132, in <module>
    main()
  File "reproducible_example.py", line 128, in main
    train()
  File "reproducible_example.py", line 119, in train
    loss = training_model(image)
  File "/home/ciroc/cnn-env/lib/python3.6/site-packages/poptorch/_impl.py", line 1294, in __call__
    in_tensors.asTuple(), {})
RuntimeError: ERROR in poptorch/python/poptorch.cpp:722: 'std::exception' exception: IPUDevice: still waiting for host sync after: 300 seconds. Use the engine option target.hostSyncTimeout to increase the timeout.

This error doesn't appear when I pipeline over 2 IPUs; it only shows up when I split over 4 or more. It also doesn't go away if I manually increase target.hostSyncTimeout to 1200 or above.
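For reference, this is roughly how I'm setting the timeout (a sketch; I'm assuming engine options still get forwarded to Poplar through the PopART options interface, so please correct me if there is a more direct knob):

opts = poptorch.Options()
# Assumption: entries under "engineOptions" are forwarded to the Poplar engine;
# values are strings, 1200 seconds here instead of the default 300.
opts._Popart.set("engineOptions", {"target.hostSyncTimeout": "1200"})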

Below is a complete example to reproduce the error:

"""
Test pipelining on multiple IPUs.
"""

import sys

from timeit import default_timer as timer

import torch
import torch.nn as nn
from tqdm import tqdm

import poptorch

class ExampleDataset(torch.utils.data.Dataset):
    def __init__(self, shape, length):
        super().__init__()
        self._shape = shape
        self._length = length

        self._all_data = []

        torch.manual_seed(0)
        for _ in range(length):
            data = torch.rand(self._shape)
            self._all_data.append(data)

    def __len__(self):
        return self._length

    def __getitem__(self, index):
        return self._all_data[index]

class TestModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.Conv1 = nn.Conv2d(3, 10, 3, padding=1)
        self.Conv2 = nn.Conv2d(10, 10, 3, padding=1)
        self.Conv3 = nn.Conv2d(10, 10, 3, padding=1)
        self.Conv4 = nn.Conv2d(10, 3, 3, padding=1)

    def forward(self, x):

        # Split model among IPUs
        with poptorch.Block(user_id="B1", ipu_id=0):
            y = self.Conv1(x)

        with poptorch.Block(user_id="B2", ipu_id=1):
            y = self.Conv2(y)

        with poptorch.Block(user_id="B3", ipu_id=2):
            y = self.Conv3(y)

        with poptorch.Block(user_id="B4", ipu_id=3):
            x_hat = self.Conv4(y)
            loss = mse(x_hat, x)
            return loss

def mse(pred, target):
    mse = torch.mean((target - pred) ** 2, dim=(-1, -2, -3)) * 255 ** 2
    return poptorch.identity_loss(mse, "mean")

def main():

    # Experiment parameters
    EPOCHS = 1
    LR = 1e-4
    BATCH_SIZE = 1
    IMG_H = 64
    IMG_W = IMG_H

    if not poptorch.ipuHardwareIsAvailable():
        poptorch.logger.warn("This example requires IPU hardware to run")
        sys.exit(0)

    train_opt = poptorch.Options()
    train_opt.deviceIterations(4)
    train_opt.replicationFactor(1)
    train_opt.randomSeed(42)
    train_opt.useIpuModel(False)
    train_opt.setExecutionStrategy(
        poptorch.PipelinedExecution(poptorch.AutoStage.SameAsIpu))
    train_opt.Training.gradientAccumulation(8)

    # Getting memory proportion right is important to avoid OOM errors
    # Tutorial here: 
    # https://docs.graphcore.ai/projects/available-memory/en/latest/available-memory.html#worked-example
    train_opt.setAvailableMemoryProportion(
        {"IPU0": 0.6, "IPU1": 0.6, "IPU2": 0.6, "IPU3": 0.6})

    # Define dataloader for training set with appropriate image transformations
    train_set = ExampleDataset((3, IMG_H, IMG_W), 1024)
    train_loader = poptorch.DataLoader(
        train_opt,
        train_set, 
        batch_size=BATCH_SIZE, 
        shuffle=True, 
        num_workers=0)

    model = TestModel()
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    training_model = poptorch.trainingModel(model, train_loader.options, optimizer)

    def train():

        start = timer()

        for _ in range(EPOCHS):
            with tqdm(total=len(train_loader), desc="Train", position=-1) as t:
                for i, image in enumerate(train_loader):

                    loss = training_model(image)

                    if i % 100 == 0:
                        print(f"image mse: {loss}")

                    # Advance the progress bar once per host-side step
                    t.update()

        stop = timer()
        print(f"Training time: {stop - start:.3f} s")

    train()

if __name__ == "__main__":
    main()
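One thing I can try on my end, to rule out the hardware itself, is running the identical 4-stage split on the IPU Model emulator. As far as I can tell that only needs the flag already used above flipped, plus skipping the ipuHardwareIsAvailable() early-exit (a sketch):

# Run the same pipelined split on the software IPU Model instead of real IPUs
# (and skip the ipuHardwareIsAvailable() check near the top of main()).
train_opt.useIpuModel(True)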

Let me know if there's something I'm missing here.

LRVerkin commented 3 years ago

Hello Cy-r0,

Thank you for letting us know about this! You are registered on Graphcore's Support platform, so I have taken the liberty of turning your message into a Support ticket in your name, which will help us process it faster. You will have received an email at the address associated with your Support account.

For clarity, we will leave this GitHub issue open until it has been resolved to your satisfaction.

cBog commented 3 years ago

Resolved by updating to the latest firmware.