lucidrains / imagen-pytorch

Implementation of Imagen, Google's Text-to-Image Neural Network, in Pytorch

Shape mismatch error when trying to train basic example using MNIST #243

Closed jameshball closed 2 years ago

jameshball commented 2 years ago

Hi,

I've copied the Dataloader example to test this on my machine and make sure everything works before using the library properly, but after making some small modifications and switching to the MNIST dataset I'm getting the following error:

Traceback (most recent call last):
  File "C:\dev\venv\lib\site-packages\einops\einops.py", line 413, in reduce
    return _apply_recipe(recipe, tensor, reduction_type=reduction)
  File "C:\dev\venv\lib\site-packages\einops\einops.py", line 236, in _apply_recipe
    _reconstruct_from_shape(recipe, backend.shape(tensor))
  File "C:\dev\venv\lib\site-packages\einops\einops.py", line 192, in _reconstruct_from_shape_uncached
    raise EinopsError('Shape mismatch, {} != {}'.format(length, known_product))
einops.EinopsError: Shape mismatch, 1 != 3

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\dev\main.py", line 54, in <module>
    main()
  File "C:\dev\main.py", line 41, in main
    loss = trainer.train_step(unet_number=1, max_batch_size=4)
  File "C:\dev\venv\lib\site-packages\imagen_pytorch\trainer.py", line 590, in train_step
    loss = self.step_with_dl_iter(self.train_dl_iter, unet_number = unet_number, **kwargs)
  File "C:\dev\venv\lib\site-packages\imagen_pytorch\trainer.py", line 608, in step_with_dl_iter
    loss = self.forward(**{**kwargs, **model_input})
  File "C:\dev\venv\lib\site-packages\imagen_pytorch\trainer.py", line 135, in inner
    out = fn(model, *args, **kwargs)
  File "C:\dev\venv\lib\site-packages\imagen_pytorch\trainer.py", line 961, in forward
    loss = self.imagen(*chunked_args, unet = self.unet_being_trained, unet_number = unet_number, **chunked_kwargs)
  File "C:\dev\venv\lib\site-packages\torch\nn\modules\module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "C:\dev\venv\lib\site-packages\imagen_pytorch\imagen_pytorch.py", line 2470, in forward
    check_shape(images, 'b c ...', c = self.channels)
  File "C:\dev\venv\lib\site-packages\einops_exts\einops_exts.py", line 12, in check_shape
    return rearrange(tensor, f"{pattern} -> {pattern}", **kwargs)
  File "C:\dev\venv\lib\site-packages\einops\einops.py", line 484, in rearrange
    return reduce(tensor, pattern, reduction='rearrange', **axes_lengths)
  File "C:\dev\venv\lib\site-packages\einops\einops.py", line 421, in reduce
    raise EinopsError(message + '\n {}'.format(e))
einops.EinopsError:  Error while processing rearrange-reduction pattern "b c ... -> b c ...".
 Input tensor shape: torch.Size([4, 1, 28, 28]). Additional info: {'c': 3}.
 Shape mismatch, 1 != 3
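
For reference, the failing check can be reproduced in isolation (a minimal sketch using only torch and einops):

import torch
from einops import rearrange

x = torch.zeros(4, 1, 28, 28)            # a greyscale MNIST batch
rearrange(x, 'b c ... -> b c ...', c=3)  # raises EinopsError: Shape mismatch, 1 != 3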

This is the code I'm using, which is only slightly modified from the example to change the image size:

from imagen_pytorch import Unet, Imagen, ImagenTrainer
from imagen_pytorch.data import Dataset

def main():
    # unets for unconditional imagen

    unet = Unet(
        dim=32,
        dim_mults=(1, 2, 4, 8),
        num_resnet_blocks=1,
        layer_attns=(False, False, False, True),
        layer_cross_attns=False
    )

    # imagen, which contains the unet above

    imagen = Imagen(
        condition_on_text=False,  # this must be set to False for unconditional Imagen
        unets=unet,
        image_sizes=28,
        timesteps=1000
    )

    trainer = ImagenTrainer(
        imagen=imagen,
        split_valid_from_train=True  # whether to split the validation dataset from the training set
    ).cuda()

    # instantiate your dataloader, which returns the necessary inputs to the DDPM as a tuple in the order of images, text embeddings, then text masks. In this case, only images are returned as it is unconditional training

    dataset = Dataset('mnist/training', image_size=28)

    trainer.add_train_dataset(dataset, batch_size=16)

    # working training loop

    for i in range(200000):
        loss = trainer.train_step(unet_number=1, max_batch_size=4)
        print(f'loss: {loss}')

        if not (i % 50):
            valid_loss = trainer.valid_step(unet_number=1, max_batch_size=4)
            print(f'valid loss: {valid_loss}')

        if not (i % 100) and trainer.is_main:  # is_main makes sure this can run in distributed
            images = trainer.sample(batch_size=1, return_pil_images=True)  # returns List[Image]
            images[0].save(f'./sample-{i // 100}.png')

if __name__ == '__main__':
    main()

I'm running this on CUDA with an NVIDIA GeForce RTX 3060 Laptop GPU on Windows 11. Please let me know if there's any extra info I can provide!

Thanks.

jameshball commented 2 years ago

I suspect the issue is that the library expects 3 colour channels in the training images but doesn't get them, because MNIST is greyscale-only.
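
You can confirm this by loading one of the images directly (a quick sketch; the path below is just a placeholder):

from PIL import Image
from torchvision import transforms as T

img = Image.open('mnist/training/0/1.png')  # hypothetical example path
print(img.mode)                             # 'L' for a typical MNIST PNG, i.e. one luminance channel
print(T.ToTensor()(img).shape)              # torch.Size([1, 28, 28]), not [3, 28, 28]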

jameshball commented 2 years ago

I fixed this error by creating my own version of imagen_pytorch.data.Dataset that converts each image to grayscale with 3 output channels (the images are already grayscale, so this just replicates the single channel), so that the shape is [3, 28, 28] instead.
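
An alternative that might avoid touching the dataset at all, assuming Imagen exposes a channels argument, would be to train single-channel directly:

imagen = Imagen(
    condition_on_text=False,
    unets=unet,
    image_sizes=28,
    timesteps=1000,
    channels=1  # assumption: Imagen supports single-channel training via `channels`
)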

The new Dataset class:

from pathlib import Path
from functools import partial

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms as T, utils

from PIL import Image

# helpers functions

def exists(val):
    return val is not None

def cycle(dl):
    while True:
        for data in dl:
            yield data

def convert_image_to(img_type, image):
    if image.mode != img_type:
        return image.convert(img_type)
    return image

# dataset and dataloader

class Dataset(Dataset):
    def __init__(
            self,
            folder,
            image_size,
            exts=['jpg', 'jpeg', 'png', 'tiff'],
            convert_image_to_type=None
    ):
        super().__init__()
        self.folder = folder
        self.image_size = image_size
        self.paths = [p for ext in exts for p in Path(f'{folder}').glob(f'**/*.{ext}')]

        convert_fn = partial(convert_image_to, convert_image_to_type) if exists(convert_image_to_type) else nn.Identity()

        self.transform = T.Compose([
            T.Lambda(convert_fn),
            T.Resize(image_size),
            T.RandomHorizontalFlip(),
            T.CenterCrop(image_size),
            # Added this so the shape is correct!
            T.Grayscale(3),
            T.ToTensor()
        ])

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, index):
        path = self.paths[index]
        img = Image.open(path)
        return self.transform(img)

def get_images_dataloader(
        folder,
        *,
        batch_size,
        image_size,
        shuffle=True,
        cycle_dl=False,
        pin_memory=True
):
    ds = Dataset(folder, image_size)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=shuffle, pin_memory=pin_memory)

    if cycle_dl:
        dl = cycle(dl)
    return dl
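
T.Grayscale(3) just replicates the single luminance channel three times, so the tensors match the expected [3, 28, 28] shape without changing the image content. Using it is a drop-in replacement (the module name below is hypothetical):

from my_data import Dataset  # the patched class above

dataset = Dataset('mnist/training', image_size=28)
trainer.add_train_dataset(dataset, batch_size=16)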

jameshball commented 2 years ago

Also got the following error after I did this:

einops.EinopsError: Shape mismatch, can't divide axis of length 7 in chunks of 2

This is just because I was trying to train with 28x28 images, which the Unet can't downsample cleanly: with dim_mults=(1, 2, 4, 8) the Unet halves the resolution three times, so the image size needs to be divisible by 2^3 = 8 (28 -> 14 -> 7, and 7 can't be halved again, hence the error). Changing the image size to 32x32 seems to have resolved this!
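
As a quick sanity check (a sketch based on the assumption of one halving per Unet stage after the first):

dim_mults = (1, 2, 4, 8)
required_divisor = 2 ** (len(dim_mults) - 1)  # 8

print(28 % required_divisor)  # 4 -> 28x28 fails
print(32 % required_divisor)  # 0 -> 32x32 works (32 -> 16 -> 8 -> 4)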