lesliejackson / PyTorch-Distributed-Training

Example of PyTorch DistributedDataParallel

RuntimeError: Given groups=1, weight of size 64 3 7 7, expected input[128, 1, 28, 28] to have 3 channels, but got 1 channels instead #2

Open 785985821 opened 4 years ago

785985821 commented 4 years ago

Hi, I ran into a problem. The run fails with the following traceback:

File "main.py", line 98, in <module>
    output = net(imgs)
File "/home/hexianglong/.local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
File "/home/hexianglong/.local/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 376, in forward
    output = self.module(*inputs[0], **kwargs[0])
File "/home/hexianglong/.local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
File "/home/hexianglong/.local/lib/python3.7/site-packages/torchvision/models/resnet.py", line 192, in forward
    x = self.conv1(x)
File "/home/hexianglong/.local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
File "/home/hexianglong/.local/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 338, in forward
    self.padding, self.dilation, self.groups)
RuntimeError: Given groups=1, weight of size 64 3 7 7, expected input[128, 1, 28, 28] to have 3 channels, but got 1 channels instead

How can I fix this?

lesliejackson commented 4 years ago

The model expects a 3-channel input, but the input you are feeding it has only 1 channel, so the two don't match.
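
For example, a minimal fix (just a sketch, not necessarily the repo's own approach) is either to replace the first conv layer so it accepts a single channel, or to expand the grayscale input to 3 channels in the transform:

import torch
from torchvision.models import resnet18

# Option 1: swap the first conv so the ResNet accepts 1-channel MNIST images
net = resnet18(num_classes=10)
net.conv1 = torch.nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)

# Option 2: keep the 3-channel model and convert the images instead, e.g. add
# torchvision.transforms.Grayscale(num_output_channels=3) to the MNIST transform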

785985821 commented 4 years ago

Oh, I was just running your code directly and I don't know what to change. I spent a whole day on it and couldn't figure it out. I'm a complete beginner.

wdfnst commented 4 years ago

Command to run:

python -m torch.distributed.launch --nproc_per_node=8 --master_port=29500 test_ddp01.py --world_size 8
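
(On newer PyTorch versions, where torch.distributed.launch is deprecated, a roughly equivalent launch would be torchrun --nproc_per_node=8 --master_port=29500 test_ddp01.py --world_size 8; note that torchrun passes the local rank through the LOCAL_RANK environment variable rather than a --local_rank argument, so the argparse line below would need to read int(os.environ['LOCAL_RANK']) instead.)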

Code:

import argparse
import time
import torch
import torchvision
from torch import distributed as dist
from torchvision.models import resnet18
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler
import numpy as np

def reduce_loss(tensor, rank, world_size):
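    # Sum the loss from every process onto rank 0 (dist.reduce defaults to SUM),
    # then average it there in place so rank 0 can log the mean loss.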
    with torch.no_grad():
        dist.reduce(tensor, dst=0)
        if rank == 0:
            tensor /= world_size

parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', type=int, help="local gpu id")
parser.add_argument('--world_size', type=int, help="num of processes")
args = parser.parse_args()

batch_size = 128
epochs = 5
lr = 0.001

dist.init_process_group(backend='nccl', init_method='env://')
torch.cuda.set_device(args.local_rank)
global_rank = dist.get_rank()

from torchvision.models.resnet import ResNet, BasicBlock
class MnistResNet(ResNet):
    def __init__(self):
        super(MnistResNet, self).__init__(BasicBlock, [2, 2, 2, 2], num_classes=10)
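        # Override the first conv layer so the network accepts 1-channel MNIST
        # images instead of the 3-channel RGB input ResNet expects by default.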
        self.conv1 = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)

    def forward(self, x):
        return torch.softmax(super(MnistResNet, self).forward(x), dim=-1)

# net = resnet18()
net = MnistResNet()
net.cuda()
net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)
net = DDP(net, device_ids=[args.local_rank], output_device=args.local_rank)

class ToNumpy(object):
    def __call__(self, sample):
        return np.array(sample)

data_root = 'dataset'
trainset = MNIST(root=data_root,
                 download=True,
                 train=True,
                 transform=torchvision.transforms.Compose(
                     [ToNumpy(), torchvision.transforms.ToTensor()])
                 )

valset = MNIST(root=data_root,
               download=True,
               train=False,
               transform=torchvision.transforms.Compose(
                   [ToNumpy(), torchvision.transforms.ToTensor()])
               )

sampler = DistributedSampler(trainset)
train_loader = DataLoader(trainset,
                          batch_size=batch_size,
                          shuffle=False,
                          pin_memory=True,
                          sampler=sampler)

val_loader = DataLoader(valset,
                        batch_size=batch_size,
                        shuffle=False,
                        pin_memory=True)

criterion = torch.nn.CrossEntropyLoss()
opt = torch.optim.Adam(net.parameters(), lr=lr)

net.train()
for e in range(epochs):
    # DistributedSampler shuffles the data deterministically,
    # seeding the shuffle with the current epoch number,
    # so if set_epoch is not called at the start of each epoch
    # the shuffled order stays the same every epoch.
    sampler.set_epoch(e)
    for idx, (imgs, labels) in enumerate(train_loader):
        imgs = imgs.cuda()
        labels = labels.cuda()
        output = net(imgs)
        loss = criterion(output, labels)
        opt.zero_grad()
        loss.backward()
        opt.step()
        reduce_loss(loss, global_rank, args.world_size)
        if idx % 10 == 0 and global_rank == 0:
            print('Epoch: {} step: {} loss: {}'.format(e, idx, loss.item()))
net.eval()
with torch.no_grad():
    cnt = 0
    total = len(val_loader.dataset)
    for imgs, labels in val_loader:
        imgs, labels = imgs.cuda(), labels.cuda()
        output = net(imgs)
        predict = torch.argmax(output, dim=1)
        cnt += (predict == labels).sum().item()

if global_rank == 0:
    print('eval accuracy: {}'.format(cnt / total))
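
As written, the validation loop runs the full validation set on every process and only rank 0 prints the result. If you instead shard validation with a DistributedSampler, one way to combine the per-rank correct counts before reporting accuracy is sketched below (variable names match the script above):

# sketch: sum the per-rank correct counts across all processes
cnt_tensor = torch.tensor([cnt], dtype=torch.float32, device='cuda')
dist.all_reduce(cnt_tensor, op=dist.ReduceOp.SUM)
if global_rank == 0:
    # total is still the size of the whole validation set
    print('eval accuracy: {}'.format(cnt_tensor.item() / total))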