spatiallysaying commented 5 years ago

🐛 Bug

Retraining the 'fasterrcnn_resnet50_fpn' model on a custom dataset fails.

To Reproduce

Steps to reproduce the behavior:

1. Successfully ran the classification sample with custom data. The classification works as expected.
2. Wanted to do object detection with custom data, following the example "Faster R-CNN Object Detection with PyTorch".
3. Combined the above two examples: replaced `model_ft = models.resnet50(pretrained=True)` with `model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)`.

Expected behavior

The object detection model is retrained for custom object detection, similar to https://tensorflow-object-detection-api-tutorial.readthedocs.io/en/latest/training.html

Environment

Google colab

# Build a pretrained Faster R-CNN detector, adapt its box head to the custom
# number of classes, and create the optimizer / LR scheduler used for training.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# our dataset has two classes only
num_classes = 2

# number of input features of the existing box classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features

# replace the pretrained box predictor with one sized for num_classes;
# without this, in_features and num_classes above are computed but never used
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(
    in_features, num_classes
)

# move model to the right device
model.to(device)

# construct an optimizer over the trainable parameters only
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

# and a learning rate scheduler which decreases the learning rate by
# 10x every 3 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

# NOTE(review): kept because train_and_validate() takes a loss criterion, but
# torchvision detection models appear to compute their own losses in training
# mode — confirm against the torchvision detection documentation.
loss_func = nn.NLLLoss()

def train_and_validate(model, loss_criterion, optimizer, epochs=25):
    """Train and validate a classification-style model.

    The loop calls ``model(inputs)`` with the inputs only and feeds the
    outputs to ``loss_criterion(outputs, labels)``. Models that need the
    targets passed to ``forward`` in training mode (the torchvision detection
    model in this report raises ``ValueError: In training mode, targets should
    be passed``) cannot be trained with this function.

    Reads the module-level names ``train_data_loader``, ``valid_data_loader``,
    ``train_data_size``, ``valid_data_size``, ``device`` and ``dataset``.

    :param model: Model to train and validate
    :param loss_criterion: Loss criterion to minimize
    :param optimizer: Optimizer used to update the parameters
    :param epochs: Number of epochs (default=25)

    Returns
        model: The model after the final epoch (a checkpoint of the whole
            model is also written to disk after every epoch)
        history: (list) one ``[avg_train_loss, avg_valid_loss,
            avg_train_acc, avg_valid_acc]`` entry per epoch
    """
    history = []

    for epoch in range(epochs):
        epoch_start = time.time()
        print("Epoch: {}/{}".format(epoch+1, epochs))

        # Set to training mode
        model.train()

        # Loss and accuracy accumulated within the epoch (weighted by batch size)
        train_loss = 0.0
        train_acc = 0.0

        valid_loss = 0.0
        valid_acc = 0.0

        for inputs, labels in train_data_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Clean existing gradients
            optimizer.zero_grad()

            # Forward pass - compute outputs on input data using the model
            outputs = model(inputs)

            # Compute loss
            loss = loss_criterion(outputs, labels)

            # Backpropagate the gradients
            loss.backward()

            # Update the parameters
            optimizer.step()

            # Scale the batch loss by the batch size so that dividing by the
            # dataset size below yields a per-sample average
            train_loss += loss.item() * inputs.size(0)

            # Predicted class is the index of the largest output along dim 1
            _, predictions = torch.max(outputs.detach(), 1)
            correct_counts = predictions.eq(labels.view_as(predictions))

            # Fraction of correct predictions in this batch
            acc = torch.mean(correct_counts.float())
            train_acc += acc.item() * inputs.size(0)

        # Validation - No gradient tracking needed
        with torch.no_grad():

            # Set to evaluation mode
            model.eval()

            for inputs, labels in valid_data_loader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Forward pass - compute outputs on input data using the model
                outputs = model(inputs)

                # Compute loss
                loss = loss_criterion(outputs, labels)
                valid_loss += loss.item() * inputs.size(0)

                # Calculate validation accuracy
                _, predictions = torch.max(outputs, 1)
                correct_counts = predictions.eq(labels.view_as(predictions))
                acc = torch.mean(correct_counts.float())
                valid_acc += acc.item() * inputs.size(0)

        # Find average training loss and training accuracy
        avg_train_loss = train_loss/train_data_size
        avg_train_acc = train_acc/train_data_size

        # Find average validation loss and validation accuracy
        avg_valid_loss = valid_loss/valid_data_size
        avg_valid_acc = valid_acc/valid_data_size

        history.append([avg_train_loss, avg_valid_loss, avg_train_acc, avg_valid_acc])

        epoch_end = time.time()

        # Report the 1-based epoch number, matching the "Epoch: i/n" header above
        print("Epoch : {:03d}, Training: Loss: {:.4f}, Accuracy: {:.4f}%, \n\t\tValidation : Loss : {:.4f}, Accuracy: {:.4f}%, Time: {:.4f}s".format(epoch+1, avg_train_loss, avg_train_acc*100, avg_valid_loss, avg_valid_acc*100, epoch_end-epoch_start))

        # Save a checkpoint of the whole model after every epoch
        # (file name keeps the 0-based epoch index)
        torch.save(model, dataset+'_model_'+str(epoch)+'.pt')

    return model, history

# Run the training/validation loop and persist the per-epoch history.
num_epochs = 30
trained_model, history = train_and_validate(model, loss_func, optimizer, num_epochs)

torch.save(history, dataset+'_history.pt')

#Errors while applying transfer learning to object detection
Epoch: 1/30
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-37-6035cd5d39dd> in <module>()
      1 num_epochs = 30
----> 2 trained_model, history = train_and_validate(model, loss_func, optimizer, num_epochs)
      3 
      4 torch.save(history, dataset+'_history.pt')

2 frames
/usr/local/lib/python3.6/dist-packages/torchvision/models/detection/generalized_rcnn.py in forward(self, images, targets)
     43         """
     44         if self.training and targets is None:
---> 45             raise ValueError("In training mode, targets should be passed")
     46         original_image_sizes = [img.shape[-2:] for img in images]
     47         images, targets = self.transform(images, targets)

ValueError: In training mode, targets should be passed

fmassa commented 5 years ago

Finetuning an object detection model requires a slightly different training code, and the classification code that you used is not adapted.

Check the tutorial in https://colab.research.google.com/github/pytorch/vision/blob/temp-tutorial/tutorials/torchvision_finetuning_instance_segmentation.ipynb for learning how to finetune an instance segmentation model. An object detection model is very similar.

spatiallysaying commented 5 years ago

@fmassa, the example is really helpful. I ignored the masks and got the bounding boxes for the 'PennFudanDataset' dataset given in the example. This example detects a single class, 'person'. I want to extend it to multiple classes (custom dataset) and would appreciate pointers in that direction.

Note that I have accomplished this in TensorFlow using TFRecord, a model config file and .pbtxt, albeit in a harder way. I am new to PyTorch and struggling to replicate the same. My impression is that PyTorch is much simpler.

In the torchvision example for multiclass classification, the dataset is organised into 'train' and 'val' folders. I expected a similar data organization for object detection too:

data\train\class1\img1.jpg, img2.jpg, ...
data\train\class2\img1.jpg, img2.jpg, ...
data\val\class1\img1.jpg, img2.jpg, ...
data\val\class2\img1.jpg, img2.jpg, ...

```python
import os

import numpy as np
import torch
import torch.utils.data
from PIL import Image

class PennFudanDataset(torch.utils.data.Dataset):
    """Dataset over a Penn-Fudan style directory yielding ``(img, target)``.

    Images are read from ``<root>/PNGImages`` and instance masks from
    ``<root>/PedMasks``. Both directory listings are sorted and paired by
    position, so the two folders must contain matching files.

    :param root: Directory containing the ``PNGImages`` and ``PedMasks`` folders
    :param transforms: Optional callable invoked as ``transforms(img, target)``
        that returns the transformed ``(img, target)`` pair
    """

    def __init__(self, root, transforms=None):
        self.root = root
        self.transforms = transforms
        # load all image files, sorting them to
        # ensure that they are aligned
        self.imgs = sorted(os.listdir(os.path.join(root, "PNGImages")))
        self.masks = sorted(os.listdir(os.path.join(root, "PedMasks")))

    def __getitem__(self, idx):
        """Return the RGB image and the target dict for sample ``idx``.

        The target holds ``boxes`` (one ``[xmin, ymin, xmax, ymax]`` row per
        object, float32), ``labels`` (all ones, int64), ``masks`` (one binary
        mask per object, uint8), ``image_id``, ``area`` and ``iscrowd``.
        """
        # load images and masks
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
        # context managers close the underlying files once the pixel data
        # has been copied out
        with Image.open(img_path) as img_file:
            img = img_file.convert("RGB")
        # note that we haven't converted the mask to RGB,
        # because each color corresponds to a different instance
        # with 0 being background
        with Image.open(mask_path) as mask_file:
            mask = np.array(mask_file)

        # instances are encoded as different colors
        obj_ids = np.unique(mask)
        # first id is the background, so remove it
        # NOTE(review): assumes every mask contains at least one background
        # pixel; otherwise the first real instance is dropped — confirm.
        obj_ids = obj_ids[1:]

        # split the color-encoded mask into a set
        # of binary masks
        masks = mask == obj_ids[:, None, None]

        # get bounding box coordinates for each mask
        num_objs = len(obj_ids)
        boxes = []
        for obj_mask in masks:
            pos = np.where(obj_mask)
            xmin = np.min(pos[1])
            xmax = np.max(pos[1])
            ymin = np.min(pos[0])
            ymax = np.max(pos[0])
            boxes.append([xmin, ymin, xmax, ymax])

        # reshape keeps the box tensor two-dimensional even when the image has
        # no objects, so the column indexing for `area` below does not fail
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        # there is only one class, so every object gets label 1; a multi-class
        # dataset would put each object's class index here instead
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Return the number of images found under ``PNGImages``."""
        return len(self.imgs)