hanzhanggit / StackGAN-Pytorch


GPU out of memory during evaluation. #3

Closed shenkev closed 7 years ago

shenkev commented 7 years ago

Hi Han,

I'm getting a "cuda runtime error (2) : out of memory" error when I try to evaluate the model using the pretrained weights. What are the hardware requirements to run this code? I have an Nvidia GTX 1080.

Console:

$ python main.py --cfg cfg/coco_eval.yml --gpu 0

Using config:
{'CONFIG_NAME': 'stageII',
 'CUDA': True,
 'DATASET_NAME': 'coco',
 'DATA_DIR': '../data/coco',
 'EMBEDDING_TYPE': 'cnn-rnn',
 'GAN': {'CONDITION_DIM': 128, 'DF_DIM': 96, 'GF_DIM': 192, 'R_NUM': 2},
 'GPU_ID': '0',
 'IMSIZE': 256,
 'NET_D': '',
 'NET_G': '../models/coco/netG_epoch_90.pth',
 'STAGE': 2,
 'STAGE1_G': '',
 'TEXT': {'DIMENSION': 1024},
 'TRAIN': {'BATCH_SIZE': 40,
           'COEFF': {'KL': 2.0},
           'DISCRIMINATOR_LR': 0.0002,
           'FLAG': False,
           'GENERATOR_LR': 0.0002,
           'LR_DECAY_EPOCH': 600,
           'MAX_EPOCH': 600,
           'PRETRAINED_EPOCH': 600,
           'PRETRAINED_MODEL': '',
           'SNAPSHOT_INTERVAL': 50},
 'VIS_COUNT': 64,
 'WORKERS': 4,
 'Z_DIM': 100}
STAGE2_G (
  (STAGE1_G): STAGE1_G (
    (ca_net): CA_NET (
      (fc): Linear (1024 -> 256)
      (relu): ReLU ()
    )
    (fc): Sequential (
      (0): Linear (228 -> 24576)
      (1): BatchNorm1d(24576, eps=1e-05, momentum=0.1, affine=True)
      (2): ReLU (inplace)
    )
    (upsample1): Sequential (
      (0): Upsample(scale_factor=2, mode=nearest)
      (1): Conv2d(1536, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (2): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True)
      (3): ReLU (inplace)
    )
    (upsample2): Sequential (
      (0): Upsample(scale_factor=2, mode=nearest)
      (1): Conv2d(768, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (2): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True)
      (3): ReLU (inplace)
    )
    (upsample3): Sequential (
      (0): Upsample(scale_factor=2, mode=nearest)
      (1): Conv2d(384, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (2): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True)
      (3): ReLU (inplace)
    )
    (upsample4): Sequential (
      (0): Upsample(scale_factor=2, mode=nearest)
      (1): Conv2d(192, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (2): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True)
      (3): ReLU (inplace)
    )
    (img): Sequential (
      (0): Conv2d(96, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): Tanh ()
    )
  )
  (ca_net): CA_NET (
    (fc): Linear (1024 -> 256)
    (relu): ReLU ()
  )
  (encoder): Sequential (
    (0): Conv2d(3, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): ReLU (inplace)
    (2): Conv2d(192, 384, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (3): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True)
    (4): ReLU (inplace)
    (5): Conv2d(384, 768, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (6): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True)
    (7): ReLU (inplace)
  )
  (hr_joint): Sequential (
    (0): Conv2d(896, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True)
    (2): ReLU (inplace)
  )
  (residual): Sequential (
    (0): ResBlock (
      (block): Sequential (
        (0): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True)
        (2): ReLU (inplace)
        (3): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (4): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True)
      )
      (relu): ReLU (inplace)
    )
    (1): ResBlock (
      (block): Sequential (
        (0): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True)
        (2): ReLU (inplace)
        (3): Conv2d(768, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (4): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True)
      )
      (relu): ReLU (inplace)
    )
  )
  (upsample1): Sequential (
    (0): Upsample(scale_factor=2, mode=nearest)
    (1): Conv2d(768, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (2): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True)
    (3): ReLU (inplace)
  )
  (upsample2): Sequential (
    (0): Upsample(scale_factor=2, mode=nearest)
    (1): Conv2d(384, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (2): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True)
    (3): ReLU (inplace)
  )
  (upsample3): Sequential (
    (0): Upsample(scale_factor=2, mode=nearest)
    (1): Conv2d(192, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (2): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True)
    (3): ReLU (inplace)
  )
  (upsample4): Sequential (
    (0): Upsample(scale_factor=2, mode=nearest)
    (1): Conv2d(96, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (2): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True)
    (3): ReLU (inplace)
  )
  (img): Sequential (
    (0): Conv2d(48, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): Tanh ()
  )
)
Load from:  ../models/coco/netG_epoch_90.pth
STAGE2_D (
  (encode_img): Sequential (
    (0): Conv2d(3, 96, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (1): LeakyReLU (0.2, inplace)
    (2): Conv2d(96, 192, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (3): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True)
    (4): LeakyReLU (0.2, inplace)
    (5): Conv2d(192, 384, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (6): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True)
    (7): LeakyReLU (0.2, inplace)
    (8): Conv2d(384, 768, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (9): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True)
    (10): LeakyReLU (0.2, inplace)
    (11): Conv2d(768, 1536, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (12): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True)
    (13): LeakyReLU (0.2, inplace)
    (14): Conv2d(1536, 3072, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (15): BatchNorm2d(3072, eps=1e-05, momentum=0.1, affine=True)
    (16): LeakyReLU (0.2, inplace)
    (17): Conv2d(3072, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (18): BatchNorm2d(1536, eps=1e-05, momentum=0.1, affine=True)
    (19): LeakyReLU (0.2, inplace)
    (20): Conv2d(1536, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (21): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True)
    (22): LeakyReLU (0.2, inplace)
  )
  (get_cond_logits): D_GET_LOGITS (
    (outlogits): Sequential (
      (0): Conv2d(896, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(768, eps=1e-05, momentum=0.1, affine=True)
      (2): LeakyReLU (0.2, inplace)
      (3): Conv2d(768, 1, kernel_size=(4, 4), stride=(4, 4))
      (4): Sigmoid ()
    )
  )
  (get_uncond_logits): D_GET_LOGITS (
    (outlogits): Sequential (
      (0): Conv2d(768, 1, kernel_size=(4, 4), stride=(4, 4))
      (1): Sigmoid ()
    )
  )
)
Successfully load sentences from:  ../data/coco/test/val_captions.t7
Total number of sentences: 40470
num_embeddings: 40470 (40470, 1024)
THCudaCheck FAIL file=/pytorch/torch/lib/THC/generic/THCStorage.cu line=66 error=2 : out of memory
Traceback (most recent call last):
  File "main.py", line 77, in <module>
    algo.sample(datapath, cfg.STAGE)
  File "/home/shenkev/Downloads/StackGAN-Pytorch/code/trainer.py", line 278, in sample
    nn.parallel.data_parallel(netG, inputs, self.gpus)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/parallel/data_parallel.py", line 102, in data_parallel
    return module(*inputs[0], **module_kwargs[0])
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 224, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/shenkev/Downloads/StackGAN-Pytorch/code/model.py", line 257, in forward
    h_code = self.upsample4(h_code)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 224, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/container.py", line 67, in forward
    input = module(input)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 224, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/upsampling.py", line 80, in forward
    return F.upsample(input, self.size, self.scale_factor, self.mode)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/functional.py", line 911, in upsample
    return _functions.thnn.UpsamplingNearest2d(_pair(size), scale_factor)(input)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/_functions/thnn/upsampling.py", line 52, in forward
    self.scale_factor
RuntimeError: cuda runtime error (2) : out of memory at /pytorch/torch/lib/THC/generic/THCStorage.cu:66
htoyryla commented 7 years ago

Change the batch size in the yml file to a smaller value, e.g. 8.
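
For example, in cfg/coco_eval.yml the batch size sits under the TRAIN section (a minimal sketch, assuming the yml mirrors the nesting of the config dump printed above; only BATCH_SIZE needs to change from its current value of 40):

TRAIN:
    FLAG: False        # evaluation mode, as in the dump above
    BATCH_SIZE: 8      # reduce from 40; lower further if OOM persists

The traceback shows the failure inside the Stage-II generator's upsample4 block, i.e. while allocating the 256x256 activations, so memory use scales directly with batch size. A GTX 1080 has 8 GB of VRAM; if a batch size of 8 still runs out of memory, try an even smaller value such as 4.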