laekov / fastmoe

A fast MoE impl for PyTorch
https://fastmoe.ai
Apache License 2.0
1.56k stars 187 forks source link

DDP error #202

Closed Peg-Wu closed 6 months ago

Peg-Wu commented 6 months ago

我用手写数字数据集跑了一个小案例(代码如下), 当我使用DDP时遇到了点问题,会出现错误:RuntimeError: No backend type associated with device type cpu 可以帮我看一下代码中有什么问题嘛,谢谢!

import os
import argparse

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import torchvision
import torchvision.transforms as transforms

import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler

from fmoe.distributed import DistributedGroupedDataParallel as fmoeDDP
from fmoe.transformer import FMoETransformerMLP
from fmoe.gates import *

# Command-line configuration for the training run.
parser = argparse.ArgumentParser()
parser.add_argument('-d', '--device_ids', type=str, default='6,7', help='Train_Devices')
parser.add_argument('-e', '--epochs', type=int, default=20, help='Train_Epochs')
parser.add_argument('-b', '--batch_size', type=int, default=256, help='Batch_Size')
# BUG FIX: the original wrote "'-l' '--lr'" (missing comma). Adjacent string
# literals concatenate, producing the single option '-l--lr', so argparse
# never creates `args.lr` and the optimizer line below raises AttributeError.
parser.add_argument('-l', '--lr', type=float, default=0.001, help='Learning_Rate')
args = parser.parse_args()

# params
# Per-node rank of this process, injected by torchrun.
local_rank = int(os.environ["LOCAL_RANK"])
# Physical GPU ids to use, e.g. "6,7" -> [6, 7].
device_ids = list(map(int, args.device_ids.split(",")))
# NOTE(review): world_size here is the number of devices listed for ONE node;
# for multi-node runs this would not match dist.get_world_size() — confirm.
world_size = len(device_ids)

# init (current process)
# NOTE(review): only the 'nccl' backend is initialized, so every collective
# must operate on CUDA tensors. A collective issued on CPU tensors raises
# exactly the reported "No backend type associated with device type cpu" —
# the model below is built on CPU and never moved to the GPU before being
# wrapped for distributed training, which is the likely trigger.
dist.init_process_group(backend='nccl')
device = torch.device(f"cuda:{device_ids[local_rank]}")
torch.cuda.set_device(device)

# ds & dl
os.makedirs('./mnist_dataset', exist_ok=True)
# MNIST is single-channel; Normalize's documented form takes one mean/std per
# channel as a sequence. The original wrote (0.1307) / (0.3081), which are
# plain floats — parentheses alone do not make a tuple. Recent torchvision
# tolerates scalars, but the 1-tuple is the version-safe, documented form.
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
train_ds = torchvision.datasets.MNIST('./mnist_dataset', train=True, transform=transform, download=True)
# DistributedSampler shards the dataset so each rank sees a disjoint subset.
train_sampler = DistributedSampler(train_ds)
train_dl = DataLoader(train_ds, batch_size=args.batch_size, sampler=train_sampler)

# model
model = FMoETransformerMLP(num_expert=3, 
                           world_size=world_size, 
                           d_model=1*28*28, 
                           d_hidden=128, 
                           top_k=2,
                           gate=NaiveGate)

model.add_module(name="classifier", module=nn.Linear(1*28*28, 10))
model = fmoeDDP(model, device_ids=[device_ids[local_rank]], output_device=device_ids[local_rank])

# loss & optim
# reduction="sum" so the per-epoch average below divides by the dataset size.
loss_fn = nn.CrossEntropyLoss(reduction="sum").to(device)
# NOTE(review): args.lr requires the '--lr' option to be registered correctly;
# as written above ("'-l' '--lr'" with a missing comma, which concatenates
# into '-l--lr') this line raises AttributeError — verify the argparse block.
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

# train_process
for epoch in range(args.epochs):
    # Re-seed the sampler so each epoch shuffles the shards differently.
    train_sampler.set_epoch(epoch)

    train_loss, train_true = 0.0, 0.0

    model.train()
    for X, y in train_dl:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        # BUG FIX: X.flatten() collapses the WHOLE batch into one 1-D vector
        # of shape (B*784,), destroying the batch dimension the model and the
        # loss expect. Flatten only the image dims, keeping (B, 784).
        X = X.flatten(start_dim=1)
        output = model(X)
        loss = loss_fn(output, y)
        loss.backward()
        # fastmoe's DDP wrapper needs an explicit gradient all-reduce call.
        model.allreduce_gradients()
        optimizer.step()

        with torch.no_grad():
            train_loss += loss.item()
            train_true += torch.eq(output.argmax(dim=-1), y).sum().item()

    # NOTE(review): train_loss / train_true are per-rank sums, but they are
    # divided by len(train_ds), the FULL dataset size; with 2 ranks each
    # process only sees half the samples, so the printed metrics are roughly
    # half the true values unless all-reduced first — confirm intended.
    if local_rank == 0:
        print(f"Epoch: {epoch + 1}, Loss: {(train_loss / len(train_ds)):.6f}, Acc: {(train_true / len(train_ds)):.6f}")

r"""
Usage:
torchrun --standalone --nnodes=1 --nproc-per-node=2 demo.py --device_ids='6,7'
"""