Lightning-Universe / lightning-bolts

Toolbox of models, callbacks, and datasets for AI/ML researchers.
https://lightning-bolts.readthedocs.io
Apache License 2.0

pretrained VAE bug? #622

Open staniPetrox opened 3 years ago

staniPetrox commented 3 years ago

🐛 Bug

Hey hey! I am trying to train a pretrained VAE. I have 3-channel RGB byte (uint8) data and try to train pl_bolts.models.autoencoders.VAE on it, which eventually raises the error below.

Please reproduce using the BoringModel

# Imports used by the snippet below; load_data_numeric is defined elsewhere in my script.
import numpy as np
import torch
import pytorch_lightning as pl
from PIL import Image
from torch.utils.data import Dataset
from pl_bolts.models.autoencoders import VAE

# I load my data here, converted to 3-channel RGB.
def load_data(path="./no_obstacle/black_and_white_compressed/"):
    return [np.array(Image.open(f"{path}{i}.png").convert("RGB")) for i in range(2500)]

class NewSet(Dataset):

    def __init__(self, data, label):
        self.data = data
        self.label = label

    def __len__(self):
        return len(self.data)

    # The pretrained model expects (x, y) batches, so the image is returned twice
    # and the label is ignored for now.
    def __getitem__(self, idx):
        return self.data[idx], self.data[idx]  # self.label[idx]

def data_loaders(data_source="../no_obstacle/black_and_white_compressed_shape10/", batchsize=25, train_size=2000, val_size=250, test_size=250):
    # Load the images and the numeric labels.
    data = load_data(path=data_source)
    data_num = load_data_numeric(path="../no_obstacle/state/")  # TODO: important, only valid for the no-obstacle case

    # Shuffle data with a fixed permutation.
    np.random.seed(123)
    p = np.random.permutation(len(data)).astype(int)
    data, data_num = np.array(data), np.array(data_num)[:, 2].reshape(len(data_num), 1)
    data = data[p]
    data_num = data_num[p].astype(float)

    # Split into train/val/test (np.split itself is deterministic; the only randomness is the seeded permutation above).
    split_points = [train_size, train_size + val_size, train_size + val_size + test_size]
    data = np.split(data, split_points)
    num_data = np.split(data_num, split_points)
    train, val, test = data[0], data[1], data[2]
    train_num, val_num, test_num = num_data[0], num_data[1], num_data[2]

    # Wrap everything in PyTorch DataLoaders.
    init = lambda a: torch.utils.data.DataLoader(a, batch_size=batchsize, shuffle=False)
    # Move the channel axis to position 1: (N, H, W, C) -> (N, C, W, H); for the square
    # 10x10 images this gives the NCHW layout the encoder expects.
    swap = lambda a: np.swapaxes(a, 1, 3)

    train, val, test = swap(train), swap(val), swap(test)

    train_loader = init(NewSet(data=train, label=train_num))
    val_loader = init(NewSet(data=val, label=val_num))
    test_loader = init(NewSet(data=test, label=test_num))

    return train_loader, val_loader, test_loader

loader = data_loaders()[0]

vae = VAE(
        input_height = 10,
        enc_type = 'resnet18',
        first_conv = False,
        maxpool1 = False,
        enc_out_dim = 512,
        kl_coeff = 0.1,
        latent_dim = 256,
        lr = 1e-4
    ).from_pretrained('cifar10-resnet18')  # not sure if the last one is really necessary?

trainer = pl.Trainer(max_epochs=10)
trainer.fit(vae, loader)
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 599, in run_train
    self.train_loop.run_training_epoch()
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/training_loop.py", line 480, in run_training_epoch
    batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/training_loop.py", line 639, in run_training_batch
    self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/training_loop.py", line 414, in optimizer_step
    model_ref.optimizer_step(
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/core/lightning.py", line 1400, in optimizer_step
    optimizer.step(closure=optimizer_closure)
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/core/optimizer.py", line 214, in step
    self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/core/optimizer.py", line 134, in __optimizer_step
    trainer.accelerator.optimizer_step(optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/accelerators/accelerator.py", line 303, in optimizer_step
    self.run_optimizer_step(optimizer, opt_idx, lambda_closure, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/accelerators/accelerator.py", line 310, in run_optimizer_step
    self.training_type_plugin.optimizer_step(optimizer, lambda_closure=lambda_closure, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 184, in optimizer_step
    optimizer.step(closure=lambda_closure, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/torch/autograd/grad_mode.py", line 26, in decorate_context
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/torch/optim/adam.py", line 66, in step
    loss = closure()
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/training_loop.py", line 633, in train_step_and_backward_closure
    result = self.training_step_and_backward(
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/training_loop.py", line 727, in training_step_and_backward
    result = self.training_step(split_batch, batch_idx, opt_idx, hiddens)
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/training_loop.py", line 281, in training_step
    training_step_output = self.trainer.accelerator.training_step(args)
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/accelerators/accelerator.py", line 182, in training_step
    return self.training_type_plugin.training_step(*args)
  File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 146, in training_step
    return self.lightning_module.training_step(*args, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/pl_bolts/models/autoencoders/basic_vae/basic_vae_module.py", line 154, in training_step
    loss, logs = self.step(batch, batch_idx)
  File "/usr/local/lib/python3.8/dist-packages/pl_bolts/models/autoencoders/basic_vae/basic_vae_module.py", line 133, in step
    z, x_hat, p, q = self._run_step(x)
  File "/usr/local/lib/python3.8/dist-packages/pl_bolts/models/autoencoders/basic_vae/basic_vae_module.py", line 118, in _run_step
    x = self.encoder(x)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/pl_bolts/models/autoencoders/components.py", line 247, in forward
    x = self.conv1(x)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py", line 423, in forward
    return self._conv_forward(input, self.weight)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py", line 419, in _conv_forward
    return F.conv2d(input, weight, self.bias, self.stride,
RuntimeError: expected scalar type Byte but found Float
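
The final frame shows F.conv2d rejecting the batch because its dtype does not match the encoder's weights. Stripped of the Lightning stack, the same kind of failure can be reproduced in isolation; this is just a sketch of my understanding, and the exact error wording may differ between PyTorch versions:

import torch
import torch.nn.functional as F

x = torch.zeros(25, 3, 10, 10, dtype=torch.uint8)  # a uint8 batch like mine
weight = torch.randn(64, 3, 3, 3)                   # float32 conv weights, like the pretrained encoder's
F.conv2d(x, weight)                                  # raises a RuntimeError about the dtype mismatch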

The obvious thing to check is whether my data is really in byte format (uint8, as I understand it). It is: the tensors are uint8 both in my script and inside basic_vae_module.py; I didn't trace it any further. Maybe useful to know: the input batch has shape torch.Size([25, 3, 10, 10]), i.e. a batch of 25 images with 3 channels and 10x10 pixels.
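
Since the encoder's weights are float32, a likely workaround is to hand the model float tensors instead of uint8 ones. Below is a minimal sketch of what I mean (the class name FloatNewSet and the scaling to [0, 1] are my own assumptions, not something from the bolts docs):

import torch
from torch.utils.data import Dataset

class FloatNewSet(Dataset):
    """Same as NewSet above, but returns float32 tensors."""

    def __init__(self, data, label):
        self.data = data
        self.label = label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Cast the uint8 image to float32 and scale to [0, 1] so its dtype
        # matches the float32 weights of the pretrained encoder.
        img = torch.as_tensor(self.data[idx], dtype=torch.float32) / 255.0
        return img, img

With this change the batch that reaches self.encoder(x) should be float32, so F.conv2d no longer sees mixed dtypes; whether the pretrained CIFAR-10 weights also expect CIFAR-style normalization is a separate question.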

Thanks in advance!

akihironitta commented 3 years ago

Let me transfer this issue to lightning-bolts, as it seems related to Bolts rather than to pytorch-lightning.

github-actions[bot] commented 3 years ago

Hi! Thanks for your contribution, great first issue!