kevinzakka / recurrent-visual-attention

A PyTorch Implementation of "Recurrent Models of Visual Attention"
MIT License

Issues faced while using my own dataset #18

Closed duygusar closed 6 years ago

duygusar commented 6 years ago

Hello, I want to use recurrent visual attention with my own dataset, so I wrote the custom dataloader shown below. The code runs on MNIST without any trouble, but with my own dataset I am running into issues.

```python
from __future__ import print_function, division #ds
import numpy as np
from utils import plot_images

import os #ds
import pandas as pd #ds
from skimage import io, transform #ds
import torch
from torchvision import datasets
from torch.utils.data import Dataset, DataLoader #ds
from torchvision import transforms
from torchvision import utils #ds
from torch.utils.data.sampler import SubsetRandomSampler

class CDataset(Dataset):

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.frame)

    def __getitem__(self, idx):
        img_name = os.path.join(self.root_dir,
                                self.frame.iloc[idx, 0]+'.jpg')
        image = io.imread(img_name)
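        # (io.imread returns a uint8 ndarray of shape (H, W, C))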
#       image = image.transpose((2, 0, 1))
        labels = np.array(self.frame.iloc[idx, 1])#.as_matrix() #ds
        #landmarks = landmarks.astype('float').reshape(-1, 2)
        #print(image.shape)
        #print(img_name,labels)
        sample = {'image': image, 'labels': labels}

        if self.transform:
            sample = self.transform(sample)

        return sample

class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, sample):
        image, labels = sample['image'], sample['labels']
        #print(image)
        #print(labels)
        # swap color axis because
        # numpy image: H x W x C
        # torch image: C X H X W
        image = image.transpose((2, 0, 1))
        #print(image.shape)
        #print((torch.from_numpy(image)))
        #print((torch.from_numpy(labels)))
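        # (note: torch.from_numpy keeps the numpy dtype, so a uint8 image
        # becomes a ByteTensor)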
        return {'image': torch.from_numpy(image),
                'labels': torch.from_numpy(labels)}

def get_train_valid_loader(data_dir,
                           batch_size,
                           random_seed,
                           #valid_size=0.1, #ds
                           #shuffle=True,
                           show_sample=False,
                           num_workers=4,
                           pin_memory=False):
    """
    Utility function for loading and returning train and valid
    multi-process iterators over the dataset. A sample
    9x9 grid of the images can be optionally displayed.

    If using CUDA, num_workers should be set to 1 and pin_memory to True.

    Args
    ----
    - data_dir: path directory to the dataset.
    - batch_size: how many samples per batch to load.
    - random_seed: fix seed for reproducibility.
    - #ds valid_size: percentage split of the training set used for
      the validation set. Should be a float in the range [0, 1].
      In the paper, this number is set to 0.1.
    - shuffle: whether to shuffle the train/validation indices.
    - show_sample: plot 9x9 sample grid of the dataset.
    - num_workers: number of subprocesses to use when loading the dataset.
    - pin_memory: whether to copy tensors into CUDA pinned memory. Set it to
      True if using GPU.

    Returns
    -------
    - train_loader: training set iterator.
    - valid_loader: validation set iterator.
    """
    #ds
    #error_msg = "[!] valid_size should be in the range [0, 1]."
    #assert ((valid_size >= 0) and (valid_size <= 1)), error_msg
    #ds

    # define transforms
    #normalize = transforms.Normalize((0.1307,), (0.3081,))
    trans = transforms.Compose([
        ToTensor(), #normalize,
    ])

    # load train dataset
    #train_dataset = datasets.MNIST(
    #    data_dir, train=True, download=True, transform=trans
    #)

    train_dataset = CDataset(csv_file='/home/Desktop/6June17/util/train.csv',
                             root_dir='/home/caffe/data/images/', transform=trans)

    # load validation dataset
    #valid_dataset = datasets.MNIST( #ds
    #    data_dir, train=True, download=True, transform=trans #ds
    #)

    valid_dataset = CDataset(csv_file='/home/Desktop/6June17/util/eval.csv',
                             root_dir='/home/caffe/data/images/', transform=trans)

    num_train = len(train_dataset) 
    train_indices = list(range(num_train)) 
    #ds split = int(np.floor(valid_size * num_train))

    num_valid = len(valid_dataset) #ds
    valid_indices = list(range(num_valid)) #ds

    #if shuffle:
    #    np.random.seed(random_seed)
    #    np.random.shuffle(indices)

    #ds train_idx, valid_idx = indices[split:], indices[:split]
    train_idx = train_indices #ds
    valid_idx = valid_indices #ds

    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler,
        num_workers=num_workers, pin_memory=pin_memory,
    )

    print(train_loader)

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=batch_size, sampler=valid_sampler,
        num_workers=num_workers, pin_memory=pin_memory,
    )

    # visualize some images
    if show_sample:
        sample_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=9, #shuffle=shuffle,
            num_workers=num_workers, pin_memory=pin_memory
        )
        data_iter = iter(sample_loader)
        batch = next(data_iter)
        images, labels = batch['image'], batch['labels']
        X = images.numpy()
        X = np.transpose(X, [0, 2, 3, 1])
        plot_images(X, labels)

    return (train_loader, valid_loader)

def get_test_loader(data_dir,
                    batch_size,
                    num_workers=4,
                    pin_memory=False):
    """
    Utility function for loading and returning a multi-process
    test iterator over the dataset.

    If using CUDA, num_workers should be set to 1 and pin_memory to True.

    Args
    ----
    - data_dir: path directory to the dataset.
    - batch_size: how many samples per batch to load.
    - num_workers: number of subprocesses to use when loading the dataset.
    - pin_memory: whether to copy tensors into CUDA pinned memory. Set it to
      True if using GPU.

    Returns
    -------
    - data_loader: test set iterator.
    """
    # define transforms
    #normalize = transforms.Normalize((0.1307,), (0.3081,))
    trans = transforms.Compose([
        ToTensor(), #normalize,
    ])

    # load dataset
    #dataset = datasets.MNIST(
    #    data_dir, train=False, download=True, transform=trans
    #)

    test_dataset = CDataset(csv_file='/home/Desktop/6June17/util/test.csv',
                            root_dir='/home/caffe/data/images/', transform=trans)

    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False,
        num_workers=num_workers, pin_memory=pin_memory,
    )

    return test_loader

#for i_batch, sample_batched in enumerate(dataloader):
#    print(i_batch, sample_batched['image'].size(),
#          sample_batched['landmarks'].size())

#    # observe 4th batch and stop.
#    if i_batch == 3:
#        plt.figure()
#        show_landmarks_batch(sample_batched)
#        plt.axis('off')
#        plt.ioff()
#        plt.show()
#        break
```

The other main change I have made is removing the parameter intake for validation size and shuffling, since I am using a pre-existing train/validation/test split and those splits are already shuffled.
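As an aside, since the full index range is handed to each sampler, this is equivalent to simply letting the DataLoader shuffle, so the samplers could be dropped entirely. A minimal sketch, assuming nothing else in the loader changes:

```python
# SubsetRandomSampler(range(len(dataset))) draws a random permutation of the
# whole dataset, which is exactly what shuffle=True already does.
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True,
    num_workers=num_workers, pin_memory=pin_memory,
)
# (the validation loader is analogous; shuffle=False is the usual choice there)
```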

My last change is in the iteration in trainer.py's train_one_epoch(self, epoch) function. I changed this part because x, y were previously being set to the strings "image" and "labels" (the keys of the Python dictionary) rather than the batched values:

```python
for i, batch in enumerate(self.train_loader):
    x, y = batch["image"], batch["labels"]
```
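An alternative sketch (illustrative only, not from the post above): have CDataset.__getitem__ return a plain (image, label) tuple and transform only the image, so the repo's stock `for i, (x, y) in enumerate(self.train_loader)` loop works unchanged:

```python
# Sketch: return a tuple and transform the image alone.
def __getitem__(self, idx):
    img_name = os.path.join(self.root_dir, self.frame.iloc[idx, 0] + '.jpg')
    image = io.imread(img_name)            # uint8 ndarray, H x W x C
    label = int(self.frame.iloc[idx, 1])
    if self.transform:
        # torchvision's transforms.ToTensor() does the H x W x C -> C x H x W
        # swap and converts uint8 to float32 in [0, 1] in one step
        image = self.transform(image)
    return image, label
```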

But now I get errors that I cannot figure out:

Without the GPU, I get this error:

```
[*] Train on 64034 samples, validate on 18951 samples
Epoch: 1/200 - LR: 0.000300
<torch.utils.data.dataloader.DataLoader object at 0x7fe065fd4f60>
  0%|          | 0/64034 [00:00<?, ?it/s]
/home/duygu/recurrent-visual-attention-master/modules.py:106: UserWarning: invalid index of a 0-dim tensor. This will be an error in PyTorch 0.5. Use tensor.item() to convert a 0-dim tensor to a Python number
  from_x, to_x = from_x.data[0], to_x.data[0]
/home/duygu/recurrent-visual-attention-master/modules.py:107: UserWarning: invalid index of a 0-dim tensor. This will be an error in PyTorch 0.5. Use tensor.item() to convert a 0-dim tensor to a Python number
  from_y, to_y = from_y.data[0], to_y.data[0]

Traceback (most recent call last):
  File "main.py", line 49, in <module>
    main(config)
  File "main.py", line 40, in main
    trainer.train()
  File "/home/duygu/recurrent-visual-attention-master/trainer.py", line 168, in train
    train_loss, train_acc = self.train_one_epoch(epoch)
  File "/home/duygu/recurrent-visual-attention-master/trainer.py", line 252, in train_one_epoch
    h_t, l_t, b_t, p = self.model(x, l_t, h_t)
  File "/usr/local/lib/python3.5/dist-packages/torch/nn/modules/module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/duygu/recurrent-visual-attention-master/model.py", line 101, in forward
    g_t = self.sensor(x, l_t_prev)
  File "/usr/local/lib/python3.5/dist-packages/torch/nn/modules/module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/duygu/recurrent-visual-attention-master/modules.py", line 214, in forward
    phi_out = F.relu(self.fc1(phi))
  File "/usr/local/lib/python3.5/dist-packages/torch/nn/modules/module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/torch/nn/modules/linear.py", line 55, in forward
    return F.linear(input, self.weight, self.bias)
  File "/usr/local/lib/python3.5/dist-packages/torch/nn/functional.py", line 992, in linear
    return torch.addmm(bias, input, weight.t())
RuntimeError: Expected object of type torch.FloatTensor but found type torch.ByteTensor for argument #4 'mat1'
```

Also, are there any modifications we can make to use the GPU? Frankly, I chose this implementation thinking GPU support was included, so I am a little discouraged by other comments saying it is not. I could potentially try it out, but the most crucial part is first having a running example and making sure I am not doing anything wrong (which is hard to track, as I am new to PyTorch).

Thanks.

duygusar commented 6 years ago

I cast the type with image = image.astype(float) (since it is a numpy array), but this time I get RuntimeError: Expected object of type torch.FloatTensor but found type torch.DoubleTensor for argument #4 'mat1'.
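For what it's worth, numpy's plain float is float64, which torch.from_numpy maps to a DoubleTensor; the model's weights are float32, so the image needs an explicit float32 cast. A minimal sketch of a ToTensor that avoids both dtype errors, assuming skimage's uint8 output:

```python
import numpy as np
import torch

class ToTensor(object):
    """Convert ndarrays in sample to float32 tensors (sketch)."""

    def __call__(self, sample):
        image, labels = sample['image'], sample['labels']
        # numpy image: H x W x C  ->  torch image: C x H x W
        image = image.transpose((2, 0, 1))
        # cast uint8 -> np.float32 (plain float would be float64, i.e. double);
        # dividing by 255 also rescales pixel values into [0, 1]
        image = image.astype(np.float32) / 255.0
        return {'image': torch.from_numpy(image),
                'labels': torch.from_numpy(labels)}
```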

duygusar commented 6 years ago

I have cast explicitly to float32 (so that it is certainly not a double) and that solves that error, but now I get a size mismatch error. The network was working just fine with MNIST, so I believe the problem is indeed in my custom dataloader. Or could it be an inherent problem, since this repository is built around MNIST and customized to that kind of data?

File "main.py", line 40, in main trainer.train() File "/home/duygu/recurrent-visual-attention-master/trainer.py", line 168, in train train_loss, train_acc = self.train_one_epoch(epoch) File "/home/duygu/recurrent-visual-attention-master/trainer.py", line 252, in train_one_epoch h_t, l_t, b_t, p = self.model(x, l_t, h_t) File "/usr/local/lib/python3.5/dist-packages/torch/nn/modules/module.py", line 491, in call result = self.forward(*input, kwargs) File "/home/duygu/recurrent-visual-attention-master/model.py", line 101, in forward g_t = self.sensor(x, l_t_prev) File "/usr/local/lib/python3.5/dist-packages/torch/nn/modules/module.py", line 491, in call result = self.forward(*input, *kwargs) File "/home/duygu/recurrent-visual-attention-master/modules.py", line 214, in forward phi_out = F.relu(self.fc1(phi)) File "/usr/local/lib/python3.5/dist-packages/torch/nn/modules/module.py", line 491, in call 307440 1 result = self.forward(input, kwargs) File "/usr/local/lib/python3.5/dist-packages/torch/nn/modules/linear.py", line 55, in forward return F.linear(input, self.weight, self.bias) File "/usr/local/lib/python3.5/dist-packages/torch/nn/functional.py", line 992, in linear return torch.addmm(bias, input, weight.t()) RuntimeError: size mismatch, m1: [32 x 192], m2: [64 x 128] at /pytorch/aten/src/TH/generic/THTensorMath.c:2033