我用手写数字数据集跑了一个小案例(代码如下),
当我使用DDP时遇到了点问题,会出现错误:RuntimeErrorRuntimeError: : No backend type associated with device type cpuNo backend type associated with device type cpu
可以帮我看一下代码中有什么问题嘛,谢谢!
import os
import argparse
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler
from fmoe.distributed import DistributedGroupedDataParallel as fmoeDDP
from fmoe.transformer import FMoETransformerMLP
from fmoe.gates import *
parser = argparse.ArgumentParser()
parser.add_argument('-d', '--device_ids', type=str, default='6,7', help='Train_Devices')
parser.add_argument('-e', '--epochs', type=int, default=20, help='Train_Epochs')
parser.add_argument('-b', '--batch_size', type=int, default=256, help='Batch_Size')
parser.add_argument('-l' '--lr', type=float, default=0.001, help='Learning_Rate')
args = parser.parse_args()
# params
local_rank = int(os.environ["LOCAL_RANK"])
device_ids = list(map(int, args.device_ids.split(",")))
world_size = len(device_ids)
# init (current process)
dist.init_process_group(backend='nccl')
device = torch.device(f"cuda:{device_ids[local_rank]}")
torch.cuda.set_device(device)
# ds & dl
os.makedirs('./mnist_dataset', exist_ok=True)
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307), (0.3081))])
train_ds = torchvision.datasets.MNIST('./mnist_dataset', train=True, transform=transform, download=True)
train_sampler = DistributedSampler(train_ds)
train_dl = DataLoader(train_ds, batch_size=args.batch_size, sampler=train_sampler)
# model
model = FMoETransformerMLP(num_expert=3,
world_size=world_size,
d_model=1*28*28,
d_hidden=128,
top_k=2,
gate=NaiveGate)
model.add_module(name="classifier", module=nn.Linear(1*28*28, 10))
model = fmoeDDP(model, device_ids=[device_ids[local_rank]], output_device=device_ids[local_rank])
# loss & optim
loss_fn = nn.CrossEntropyLoss(reduction="sum").to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
# train_process
for epoch in range(args.epochs):
train_sampler.set_epoch(epoch)
train_loss, train_true = 0.0, 0.0
model.train()
for X, y in train_dl:
X, y = X.to(device), y.to(device)
optimizer.zero_grad()
X = X.flatten()
output = model(X)
loss = loss_fn(output, y)
loss.backward()
model.allreduce_gradients()
optimizer.step()
with torch.no_grad():
train_loss += loss.item()
train_true += torch.eq(output.argmax(dim=-1), y).sum().item()
if local_rank == 0:
print(f"Epoch: {epoch + 1}, Loss: {(train_loss / len(train_ds)):.6f}, Acc: {(train_true / len(train_ds)):.6f}")
r"""
Usage:
torchrun --standalone --nnodes=1 --nproc-per-node=2 demo.py --device_ids='6,7'
"""
我用手写数字数据集跑了一个小案例(代码如下), 当我使用DDP时遇到了点问题,会出现错误:
RuntimeErrorRuntimeError: : No backend type associated with device type cpuNo backend type associated with device type cpu
可以帮我看一下代码中有什么问题嘛,谢谢!