785985821 opened this issue 4 years ago (status: Open)
The model expects a 3-channel input, but the input you are giving it has 1 channel, so the two don't match.
Oh, I see. I was just running your code as-is and don't know how to change it. I've been trying for a whole day without success; I'm a complete beginner.
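For reference, a minimal sketch of the fix being discussed here, assuming the stock torchvision `resnet18` (this is the same idea the `MnistResNet` class below uses): replace the first convolution so it accepts 1-channel MNIST images.

```python
import torch
from torchvision.models import resnet18

# Sketch: adapt the stock resnet18 to 1-channel MNIST input by
# swapping its 3-channel first conv for a 1-channel one.
net = resnet18(num_classes=10)
net.conv1 = torch.nn.Conv2d(1, 64, kernel_size=7, stride=2,
                            padding=3, bias=False)
```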
Run command:

```
python -m torch.distributed.launch --nproc_per_node=8 --master_port=29500 test_ddp01.py --world_size 8
```
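For context, `init_method='env://'` in the script below works because `torch.distributed.launch` exports the rendezvous variables into the environment of every worker process it spawns. A sketch of what each worker can observe (the variable names are the standard ones; values are per-process):

```python
import os

# The launcher sets these for every spawned worker; init_process_group
# with init_method='env://' reads them to rendezvous with the other ranks.
for key in ("MASTER_ADDR", "MASTER_PORT", "RANK", "WORLD_SIZE"):
    print(key, os.environ.get(key))
```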
Code:
```python
import argparse
import time
import torch
import torchvision
from torch import distributed as dist
from torchvision.models import resnet18
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler
import numpy as np


def reduce_loss(tensor, rank, world_size):
    # Sum the loss onto rank 0, then divide by world_size there,
    # so only rank 0 holds the averaged value.
    with torch.no_grad():
        dist.reduce(tensor, dst=0)
        if rank == 0:
            tensor /= world_size


parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', type=int, help="local gpu id")
parser.add_argument('--world_size', type=int, help="num of processes")
args = parser.parse_args()

batch_size = 128
epochs = 5
lr = 0.001

dist.init_process_group(backend='nccl', init_method='env://')
torch.cuda.set_device(args.local_rank)
global_rank = dist.get_rank()

from torchvision.models.resnet import ResNet, BasicBlock


class MnistResNet(ResNet):
    def __init__(self):
        super(MnistResNet, self).__init__(BasicBlock, [2, 2, 2, 2], num_classes=10)
        # Replace the stock 3-channel conv1 with a 1-channel one for MNIST.
        self.conv1 = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2),
                                     padding=(3, 3), bias=False)

    def forward(self, x):
        return torch.softmax(super(MnistResNet, self).forward(x), dim=-1)


# net = resnet18()
net = MnistResNet()
net.cuda()
net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)
net = DDP(net, device_ids=[args.local_rank], output_device=args.local_rank)


class ToNumpy(object):
    def __call__(self, sample):
        return np.array(sample)


data_root = 'dataset'
trainset = MNIST(root=data_root,
                 download=True,
                 train=True,
                 transform=torchvision.transforms.Compose(
                     [ToNumpy(), torchvision.transforms.ToTensor()])
                 )
valset = MNIST(root=data_root,
               download=True,
               train=False,
               transform=torchvision.transforms.Compose(
                   [ToNumpy(), torchvision.transforms.ToTensor()])
               )

sampler = DistributedSampler(trainset)
train_loader = DataLoader(trainset,
                          batch_size=batch_size,
                          shuffle=False,
                          pin_memory=True,
                          sampler=sampler)
val_loader = DataLoader(valset,
                        batch_size=batch_size,
                        shuffle=False,
                        pin_memory=True)

criterion = torch.nn.CrossEntropyLoss()
opt = torch.optim.Adam(net.parameters(), lr=lr)

for e in range(epochs):
    # DistributedSampler shuffles deterministically by seeding with the
    # current epoch number; without calling set_epoch at the start of
    # each epoch, the shuffled order would be identical every epoch.
    sampler.set_epoch(e)
    # Switch back to train mode (the eval at the end of the previous
    # epoch left the model in eval mode).
    net.train()
    for idx, (imgs, labels) in enumerate(train_loader):
        imgs = imgs.cuda()
        labels = labels.cuda()
        output = net(imgs)
        loss = criterion(output, labels)
        opt.zero_grad()
        loss.backward()
        opt.step()
        reduce_loss(loss, global_rank, args.world_size)
        if idx % 10 == 0 and global_rank == 0:
            print('Epoch: {} step: {} loss: {}'.format(e, idx, loss.item()))
    net.eval()
    with torch.no_grad():
        cnt = 0
        total = len(val_loader.dataset)
        for imgs, labels in val_loader:
            imgs, labels = imgs.cuda(), labels.cuda()
            output = net(imgs)
            predict = torch.argmax(output, dim=1)
            cnt += (predict == labels).sum().item()
    if global_rank == 0:
        print('eval accuracy: {}'.format(cnt / total))
```
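A side note on `reduce_loss` above: it produces the averaged loss only on rank 0. If every rank needs the averaged value (for example, to log from any rank), an `all_reduce` variant is a common alternative; a minimal sketch:

```python
import torch
import torch.distributed as dist

def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
    """Average `tensor` across all ranks, in place, visible on every rank."""
    with torch.no_grad():
        dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
        tensor /= dist.get_world_size()
    return tensor
```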
Hi, I ran into a problem: the script fails when I run it, showing

```
  File "main.py", line 98, in <module>
    output = net(imgs)
  File "/home/hexianglong/.local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/hexianglong/.local/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 376, in forward
    output = self.module(*inputs[0], **kwargs[0])
  File "/home/hexianglong/.local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/hexianglong/.local/lib/python3.7/site-packages/torchvision/models/resnet.py", line 192, in forward
    x = self.conv1(x)
  File "/home/hexianglong/.local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/hexianglong/.local/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 338, in forward
    self.padding, self.dilation, self.groups)
RuntimeError: Given groups=1, weight of size 64 3 7 7, expected input[128, 1, 28, 28] to have 3 channels, but got 1 channels instead
```

How can I fix this?
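As the first reply in this thread says, the stock `resnet18` first conv expects 3 channels while MNIST images have 1. Besides replacing `conv1` (as the `MnistResNet` class above does), another minimal sketch is to expand the grayscale image to 3 channels in the dataset transform, leaving the model untouched:

```python
import torchvision.transforms as T

# Repeat the single MNIST channel three times so the unmodified
# resnet18 first conv (3 input channels) accepts the batch.
transform = T.Compose([
    T.ToTensor(),                           # tensor of shape (1, 28, 28)
    T.Lambda(lambda x: x.repeat(3, 1, 1)),  # -> (3, 28, 28)
])
```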