Open bcd33bcd opened 6 years ago
Maybe you should make sure that all the arguments are located on the same GPU. Could you show the code?
Thank you for your reply. Here is the discriminator model:
class Discriminator(nn.Module):
    """Convolutional discriminator made of four DataParallel-wrapped stages.

    The first three stages are Conv2d(kernel 4, stride 2, padding 1) ->
    BatchNorm2d -> LeakyReLU(0.2), each halving the spatial size.  The last
    stage is a 4x4 convolution without padding followed by a sigmoid and
    produces two output channels.

    Args:
        nc: number of channels of the input images.
        ndf: number of feature maps produced by the first stage; the next
            stages produce ``ndf * 2`` and ``ndf * 4``.
    """

    def __init__(self, nc, ndf):
        # The dunder names are required: a method merely called ``init`` is
        # never invoked by ``Discriminator(nc, ndf)``, so no layer would exist.
        super(Discriminator, self).__init__()
        # Spatial sizes in the comments below are for a 32 x 32 input.
        # 32 x 32 -> 16 x 16
        self.layer1 = nn.DataParallel(self._down_stage(nc, ndf))
        # 16 x 16 -> 8 x 8
        self.layer2 = nn.DataParallel(self._down_stage(ndf, ndf * 2))
        # 8 x 8 -> 4 x 4
        self.layer3 = nn.DataParallel(self._down_stage(ndf * 2, ndf * 4))
        # 4 x 4 -> 1 x 1, two channels squashed into (0, 1)
        self.layer4 = nn.DataParallel(nn.Sequential(
            nn.Conv2d(ndf * 4, 2, kernel_size=4, stride=1, padding=0),
            nn.Sigmoid()))

    @staticmethod
    def _down_stage(in_channels, out_channels):
        """Build one stride-2 Conv2d -> BatchNorm2d -> LeakyReLU(0.2) stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(0.2, inplace=True))

    def forward(self, x):
        """Run ``x`` through the four stages in order and return the result."""
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        return out
And in main.py, there is:
# Instantiate both networks.
netD = Discriminator(nc, ndf)
netG = Generator(nc, ngf, opt.nz)
# Move them to the GPU only when CUDA was requested.
if opt.cuda:
    netD.cuda()
    netG.cuda()
Variables are initialized by:
########## GLOBAL VARIABLES ###########
noise = torch.FloatTensor(opt.batchSize, opt.nz, 1, 1)
real = torch.FloatTensor(opt.batchSize, nc, opt.imageSize, opt.imageSize)
real_label = 1
fake_label = 0
real_list = torch.FloatTensor(opt.batchSize)
fake_list = torch.FloatTensor(opt.batchSize)
# Wrap every pre-allocated buffer in an autograd Variable.
noise, real, real_list, fake_list = (
    Variable(buf) for buf in (noise, real, real_list, fake_list)
)
# When CUDA was requested, replace each Variable with its GPU copy.
if opt.cuda:
    noise, real, real_list, fake_list = (
        var.cuda() for var in (noise, real, real_list, fake_list)
    )
Training:
# opt.batchSize images
# Copy the current batch into the pre-allocated input buffer.
real.data.resize_(images.size()).copy_(images)
# Resize and fill the label buffers in place.  The names are deliberately not
# rebound to the raw tensors, so fake_list / real_list stay Variables.
fake_list.data.resize_(images.size(0), 1).fill_(fake_label)
real_list.data.resize_(images.size(0), 1).fill_(real_label)
# One row per image: column 0 holds fake_label, column 1 holds real_label.
label = Variable(torch.cat((fake_list.data, real_list.data), 1))
if opt.cuda:
    # .cuda() returns the GPU copy instead of moving the object in place,
    # so the result has to be assigned back.
    label = label.cuda()
errD_real = netD(real)
# NOTE(review): label has shape (N, 2); confirm that the discriminator output
# has a matching shape before it reaches the criterion.
prob = nn.functional.softmax(errD_real)
errD_real = criterion(prob, label)
errD_real.backward()
Please point out my mistake. Thanks. In addition, the error traceback is:
File "dcgan.py", line 214, in <module>
  gradient_penalty.backward()
File "/home/workstation1/anaconda2/lib/python2.7/site-packages/torch/autograd/variable.py", line 156, in backward
  torch.autograd.backward(self, gradient, retain_graph, create_graph, retain_variables)
File "/home/workstation1/anaconda2/lib/python2.7/site-packages/torch/autograd/__init__.py", line 98, in backward
  variables, grad_variables, retain_graph)
RuntimeError: arguments are located on different GPUs at /opt/conda/conda-bld/pytorch_1503966894950/work/torch/lib/THC/generated/../generic/THCTensorMathPointwise.cu:215
The gradient penalty function is unchanged:
def calc_gradient_penalty(netD, real_data, fake_data, BATCH_SIZE, LAMBDA, use_cuda):
    """Return the gradient penalty of ``netD`` on random real/fake mixtures.

    Each sample of ``real_data`` is blended with the matching sample of
    ``fake_data`` using one random coefficient per sample.  The penalty is
    ``LAMBDA * mean((||grad||_2 - 1) ** 2)``, where ``grad`` is the gradient
    of ``netD``'s output with respect to the blended input and the norm is
    taken over all elements of each sample.

    Args:
        netD: callable mapping a batch of inputs to the critic output.
        real_data: batch of real samples; its first dimension must be
            ``BATCH_SIZE``.  Any per-sample shape is accepted.
        fake_data: batch of generated samples, same shape as ``real_data``.
        BATCH_SIZE: number of samples in the batch.
        LAMBDA: weight applied to the penalty.
        use_cuda: when true, intermediate tensors are moved with ``.cuda()``.

    Returns:
        A scalar holding the weighted penalty; it is built with
        ``create_graph=True`` so it can be back-propagated.
    """
    # One mixing coefficient per sample, repeated over that sample's elements.
    # Floor division keeps the expand size an integer on Python 3 as well, and
    # the shape is taken from real_data instead of a hard-coded 3 x 32 x 32.
    elements_per_sample = real_data.nelement() // BATCH_SIZE
    alpha = torch.rand(BATCH_SIZE, 1)
    alpha = alpha.expand(BATCH_SIZE, elements_per_sample).contiguous().view(real_data.size())
    alpha = alpha.cuda() if use_cuda else alpha

    interpolates = alpha * real_data + ((1 - alpha) * fake_data)
    if use_cuda:
        interpolates = interpolates.cuda()
    # The gradient is taken with respect to the interpolates themselves.
    interpolates = Variable(interpolates, requires_grad=True)

    disc_interpolates = netD(interpolates)

    grad_outputs = torch.ones(disc_interpolates.size())
    if use_cuda:
        grad_outputs = grad_outputs.cuda()
    gradients = torch.autograd.grad(outputs=disc_interpolates, inputs=interpolates,
                                    grad_outputs=grad_outputs,
                                    create_graph=True, retain_graph=True, only_inputs=True)[0]
    # Flatten to (batch, features) so dim=1 spans the whole sample; on a 4-D
    # gradient norm(2, dim=1) would only reduce over the channel axis.
    gradients = gradients.contiguous().view(gradients.size(0), -1)

    gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean() * LAMBDA
    return gradient_penalty
This error does not appear to be related to using multiple GPUs as such, but rather to the use of the nn.parallel
functions. If I use the following:
def forward(self, input):
    """Apply ``self.forward_`` to ``input`` via ``nn.parallel.data_parallel`` on ``self.gpu_ids``."""
    return nn.parallel.data_parallel(
        module=self.forward_,
        inputs=input,
        device_ids=self.gpu_ids,
    )
I get the error even though I'm only using one GPU, but if I just do
def forward(self, input):
    """Apply ``self.forward_`` directly to ``input``, without any parallel wrapper."""
    result = self.forward_(input)
    return result
I do not get any errors (input and the model are loaded on the current GPU).
I tried to run your code on multiple GPUs, but the network runs into a problem
when I add "nn.DataParallel()" to the Discriminator and Generator. Following the suggestion of "gchanan", the error changes to
How can I fix the problem? Thank you.