pytorch / vision

Datasets, Transforms and Models specific to Computer Vision
BSD 3-Clause "New" or "Revised" License
16.31k stars 6.97k forks source link

fasterrcnn_resnet50_fpn errors #1071

Closed spatiallysaying closed 5 years ago

spatiallysaying commented 5 years ago

🐛 Bug

Retraining the 'fasterrcnn_resnet50_fpn' model on a custom dataset fails.

To Reproduce

Steps to reproduce the behavior:

  1. Successfully ran the sample with custom data. The classification works as expected.
  2. Wanted to work on object detection with custom data, following "Faster R-CNN Object Detection with PyTorch".
  3. Combined the above two examples, replacing
    `model_ft = models.resnet50(pretrained=True)` with `model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)`.

Expected behavior

Object detection retrained for custom object detection similar to


Google colab

` model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

our dataset has two classes only

num_classes = 2

in_features = model.roi_heads.box_predictor.cls_score.in_features

move model to the right device

construct an optimizer

params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

and a learning rate scheduler which decreases the learning rate by

10x every 3 epochs

lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

loss_func = nn.NLLLoss()`

`def train_and_validate(model, loss_criterion, optimizer, epochs=25): ''' Function to train and validate Parameters :param model: Model to train and validate :param loss_criterion: Loss Criterion to minimize :param optimizer: Optimizer for computing gradients :param epochs: Number of epochs (default=25)

    model: Trained Model with best validation accuracy
    history: (dict object): Having training loss, accuracy and validation loss, accuracy

start = time.time()
history = []
best_acc = 0.0

for epoch in range(epochs):
    epoch_start = time.time()
    print("Epoch: {}/{}".format(epoch+1, epochs))

    # Set to training mode

    # Loss and Accuracy within the epoch
    train_loss = 0.0
    train_acc = 0.0

    valid_loss = 0.0
    valid_acc = 0.0

    for i, (inputs, labels) in enumerate(train_data_loader):

        inputs =
        labels =

        # Clean existing gradients

        # Forward pass - compute outputs on input data using the model
        outputs = model(inputs)

        # Compute loss
        loss = loss_criterion(outputs, labels)

        # Backpropagate the gradients

        # Update the parameters

        # Compute the total loss for the batch and add it to train_loss
        train_loss += loss.item() * inputs.size(0)

        # Compute the accuracy
        ret, predictions = torch.max(, 1)
        correct_counts = predictions.eq(

        # Convert correct_counts to float and then compute the mean
        acc = torch.mean(correct_counts.type(torch.FloatTensor))

        # Compute total accuracy in the whole batch and add to train_acc
        train_acc += acc.item() * inputs.size(0)

        #print("Batch number: {:03d}, Training: Loss: {:.4f}, Accuracy: {:.4f}".format(i, loss.item(), acc.item()))

    # Validation - No gradient tracking needed
    with torch.no_grad():

        # Set to evaluation mode

        # Validation loop
        for j, (inputs, labels) in enumerate(valid_data_loader):
            inputs =
            labels =

            # Forward pass - compute outputs on input data using the model
            outputs = model(inputs)

            # Compute loss
            loss = loss_criterion(outputs, labels)

            # Compute the total loss for the batch and add it to valid_loss
            valid_loss += loss.item() * inputs.size(0)

            # Calculate validation accuracy
            ret, predictions = torch.max(, 1)
            correct_counts = predictions.eq(

            # Convert correct_counts to float and then compute the mean
            acc = torch.mean(correct_counts.type(torch.FloatTensor))

            # Compute total accuracy in the whole batch and add to valid_acc
            valid_acc += acc.item() * inputs.size(0)

            #print("Validation Batch number: {:03d}, Validation: Loss: {:.4f}, Accuracy: {:.4f}".format(j, loss.item(), acc.item()))

    # Find average training loss and training accuracy
    avg_train_loss = train_loss/train_data_size 
    avg_train_acc = train_acc/train_data_size

    # Find average validation loss and validation accuracy
    avg_valid_loss = valid_loss/valid_data_size 
    avg_valid_acc = valid_acc/valid_data_size

    history.append([avg_train_loss, avg_valid_loss, avg_train_acc, avg_valid_acc])

    epoch_end = time.time()

    print("Epoch : {:03d}, Training: Loss: {:.4f}, Accuracy: {:.4f}%, \n\t\tValidation : Loss : {:.4f}, Accuracy: {:.4f}%, Time: {:.4f}s".format(epoch, avg_train_loss, avg_train_acc*100, avg_valid_loss, avg_valid_acc*100, epoch_end-epoch_start))

    # Save if the model has best accuracy till now, dataset+'_model_'+str(epoch)+'.pt')

return model, history
num_epochs = 30
trained_model, history = train_and_validate(model, loss_func, optimizer, num_epochs), dataset+'')`
#Errors while applying transfer learning to object detection
Epoch: 1/30
ValueError                                Traceback (most recent call last)
<ipython-input-37-6035cd5d39dd> in <module>()
      1 num_epochs = 30
----> 2 trained_model, history = train_and_validate(model, loss_func, optimizer, num_epochs)
      4, dataset+'')

2 frames
/usr/local/lib/python3.6/dist-packages/torchvision/models/detection/ in forward(self, images, targets)
     43         """
     44         if self.training and targets is None:
---> 45             raise ValueError("In training mode, targets should be passed")
     46         original_image_sizes = [img.shape[-2:] for img in images]
     47         images, targets = self.transform(images, targets)

ValueError: In training mode, targets should be passed
fmassa commented 5 years ago

Finetuning an object detection model requires a slightly different training code, and the classification code that you used is not adapted.

Check the tutorial for learning how to finetune an instance segmentation model. Finetuning an object detection model is very similar.

spatiallysaying commented 5 years ago

@fmassa, the example is really helpful. I have ignored the masks and got the bounding boxes for the 'PennFudanDataset' dataset given in the example. This example detects a single class, 'person'. I want to extend this to multiple classes (custom dataset). I would appreciate pointers in that direction.

Note that I have accomplished this in TensorFlow using TFRecord, a model config file and a .pbtxt, albeit in a harder way. I am new to PyTorch and struggling to replicate the same. My impression is that PyTorch is much simpler.

In the torchvision example for multiclass classification, the dataset is organised into 'train' and 'val' folders. I expected a similar data organization for object detection too: data\train\ class1\img1.jpg,img2.jpg..... class2\img1.jpg,img2.jpg.... data\val\ class1\img1.jpg,img2.jpg..... class2\img1.jpg,img2.jpg....

`` import os import numpy as np import torch import from PIL import Image

class PennFudanDataset( def init(self, root, transforms=None): self.root = root self.transforms = transforms

load all image files, sorting them to

    # ensure that they are aligned
    self.imgs = list(sorted(os.listdir(os.path.join(root, "PNGImages"))))
    self.masks = list(sorted(os.listdir(os.path.join(root, "PedMasks"))))

def __getitem__(self, idx):
    # load images and masks
    img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
    mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
    img = Image.open(img_path).convert("RGB")
    # note that we haven't converted the mask to RGB,
    # because each color corresponds to a different instance
    # with 0 being background
    mask = Image.open(mask_path)

    mask = np.array(mask)
    # instances are encoded as different colors
    obj_ids = np.unique(mask)
    # first id is the background, so remove it
    obj_ids = obj_ids[1:]

    # split the color-encoded mask into a set
    # of binary masks
    masks = mask == obj_ids[:, None, None]

    # get bounding box coordinates for each mask
    num_objs = len(obj_ids)
    boxes = []
    for i in range(num_objs):
        pos = np.where(masks[i])
        xmin = np.min(pos[1])
        xmax = np.max(pos[1])
        ymin = np.min(pos[0])
        ymax = np.max(pos[0])
        boxes.append([xmin, ymin, xmax, ymax])

    boxes = torch.as_tensor(boxes, dtype=torch.float32)
    **# there is only one class
    labels = torch.ones((num_objs,), dtype=torch.int64)**
    masks = torch.as_tensor(masks, dtype=torch.uint8)

    image_id = torch.tensor([idx])
    area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
    # suppose all instances are not crowd
    iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

    target = {}
    target["boxes"] = boxes
    target["labels"] = labels
    target["masks"] = masks
    target["image_id"] = image_id
    target["area"] = area
    target["iscrowd"] = iscrowd

    if self.transforms is not None:
        img, target = self.transforms(img, target)

    return img, target

def __len__(self):
    return len(self.imgs)
fmassa commented 5 years ago

Extending it to multiple classes should be a matter of changing the labels in the dataset to represent the multiple classes you want (as numbers from 1 to the number of classes), and adding a larger num_classes in the model.

This should be straightforward, but without further information it is hard to understand where you got blocked