Reproducing semantic segmentation results

Hello,

I am trying to reproduce the semantic segmentation results on Cityscapes. I reproduced the model and the training procedure described in the paper, and I’m using the MoCo v1 weights.

Here is the model I ended up with (the code is inspired by the one here):

import torch
from torch import nn
import torch.nn.functional as F

import torchvision.models as models


class FCN(nn.Module):
    """FCN-style semantic segmentation network built on a ResNet-50 trunk.

    The trunk is created with ``models.resnet.resnet50()``; no weights are
    loaded here, so any pretrained checkpoint must be loaded by the caller.
    The first bottleneck of ``layer4`` is switched to stride 1 / dilation 2,
    a small convolutional head scores the ``layer4`` features, and those
    scores are fused with a 1x1 scoring of the ``layer3`` features before
    being upsampled and cropped back to the input resolution.

    Args:
        classes: Number of output channels (one score map per class).
        criterion: Loss module called as ``criterion(logits, y)`` in
            training mode. ``None`` (the default) creates a fresh
            ``nn.CrossEntropyLoss(ignore_index=255)`` for this instance.
    """

    def __init__(self, classes=2, criterion=None):
        super().__init__()

        # Build the default loss per instance instead of sharing one module
        # object (created at function-definition time) across every model.
        if criterion is None:
            criterion = nn.CrossEntropyLoss(ignore_index=255)
        self.criterion = criterion

        resnet = models.resnet.resnet50()

        # Stem: conv -> BN -> ReLU -> max-pool, reused from the ResNet.
        self.layer0 = nn.Sequential(
            resnet.conv1,
            resnet.bn1,
            resnet.relu,
            resnet.maxpool,
        )

        self.layer1 = resnet.layer1
        self.layer2 = resnet.layer2
        self.layer3 = resnet.layer3
        self.layer4 = resnet.layer4
        # Remove the stride from the first block of layer4: its 3x3 conv
        # becomes stride 1 with dilation 2 (padding 2 keeps the size), and
        # the 1x1 shortcut conv becomes stride 1 so both branches match.
        # Only block 0 is modified; the remaining blocks are left as built.
        self.layer4[0].conv2 = nn.Conv2d(
            512, 512, kernel_size=(3, 3), dilation=(2, 2), stride=(1, 1),
            padding=(2, 2), bias=False,
        )
        self.layer4[0].downsample[0] = nn.Conv2d(
            1024, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False,
        )

        # Head: two 3x3 convs (256 channels, dilation 6) with BN + ReLU,
        # then a 1x1 conv producing per-class scores.
        # The 3x3 convs use the default padding of 0, so each one trims
        # 12 pixels from both the height and the width of its input.
        # NOTE(review): if the head is meant to preserve resolution,
        # padding=6 would be required on both -- confirm against the
        # reference architecture being reproduced.
        self.classifier = nn.Sequential(
            nn.Conv2d(2048, 256, kernel_size=3, dilation=6),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, dilation=6),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, classes, kernel_size=1),
        )

        # 1x1 scoring of the layer3 features (1024 channels) for the skip.
        self.score_pool4 = nn.Conv2d(1024, classes, kernel_size=1)
        # Learned upsampling: x2 for the head scores, x16 for the fused map.
        self.upsampling2 = nn.ConvTranspose2d(
            classes, classes, kernel_size=4, stride=2, bias=False)
        self.upsampling16 = nn.ConvTranspose2d(
            classes, classes, kernel_size=32, stride=16, bias=False)

    def forward(self, x, y=None):
        """Compute per-pixel class scores for ``x``.

        Args:
            x: Input batch; dimensions 2 and 3 are treated as height and
                width when cropping the output.
            y: Targets passed to ``self.criterion``. Required in training
                mode, ignored in eval mode.

        Returns:
            In training mode, a tuple ``(pred, main_loss, aux_loss)`` where
            ``pred`` is the arg-max over dimension 1 of the logits,
            ``main_loss`` is ``self.criterion(logits, y)`` and ``aux_loss``
            is a zero tensor shaped like ``main_loss``.
            In eval mode, the logits cropped to the input height and width.

        Raises:
            ValueError: If the module is in training mode and ``y`` is None.
        """
        if self.training and y is None:
            # Fail early with a clear message instead of an opaque error
            # raised from inside the loss after the full forward pass.
            raise ValueError(
                "FCN.forward requires targets `y` when the model is in "
                "training mode"
            )

        in_h, in_w = x.shape[2], x.shape[3]

        feats = self.layer0(x)
        feats = self.layer1(feats)
        feats = self.layer2(feats)
        skip = self.layer3(feats)

        # Score the layer4 features and upsample them by 2.
        deep = self.upsampling2(self.classifier(self.layer4(skip)))

        skip = self.score_pool4(skip)

        # Crop the upsampled scores (offset 1) to the skip map's size so
        # the two can be summed element-wise.
        skip_h, skip_w = skip.size(2), skip.size(3)
        deep = deep[:, :, 1:1 + skip_h, 1:1 + skip_w]
        y_pred = self.upsampling16(deep + skip)

        # Centre-crop the upsampled logits back to the input resolution.
        off_w = (y_pred.shape[3] - in_w) // 2
        off_h = (y_pred.shape[2] - in_h) // 2
        y_pred = y_pred[:, :, off_h:off_h + in_h, off_w:off_w + in_w]

        if self.training:
            main_loss = self.criterion(y_pred, y)
            # No auxiliary head: report a zero auxiliary loss so the return
            # keeps the three-element (pred, main, aux) shape.
            return y_pred.max(1)[1], main_loss, torch.zeros_like(main_loss)
        return y_pred

mIoU_train:

Validation: mIoU/mAcc/allAcc 0.5278/0.6497/0.7965.

I’m definitely doing something wrong. Are you using different loss weights for the different classes?

facebookresearch / moco

Reproducing semantic segmentation results #65