yjh0410 / CenterNet-plus

A Simple Baseline for Object Detection

Help needed in CenterNet model script and COCOEval script #9

Closed YashRunwal closed 3 years ago

YashRunwal commented 3 years ago

Hi,

I am adapting the scripts for my custom images with shape (512, 1536). So I wrote my own custom resnet18 backbone (it only needed minor changes) and am now editing the CenterNet_plus script, and I have some doubts about it:

import torch
import torch.nn as nn

# Conv, ResizeConv, DilateEncoder and the custom resnet18 are assumed to be imported from the repo's modules
class ObjectDetectionModel(nn.Module):

    def __init__(self,
                 device,
                 act='relu',
                 input_size=None,
                 trainable=True,
                 num_classes=None,
                 backbone='resnet18',
                 conf_thresh=0.05,
                 nms_thresh=0.45,
                 topk=100,
                 gs=1.0,
                 use_nms=False):

        super(ObjectDetectionModel, self).__init__()
        self.device = device
        self.bk = backbone
        self.num_classes = num_classes
        self.trainable = trainable
        self.input_size = input_size
        self.stride = 4
        self.gs = gs
        self.conf_thresh = conf_thresh
        self.nms_thresh = nms_thresh
        self.use_nms = use_nms
        self.topk = topk

        # Load Backbone
        if self.bk == 'resnet18':
            self.backbone = resnet18(pretrained=False)
            c2, c3, c4, c5 = 64, 128, 256, 512
            p2, p3, p4, p5 = 256, 256, 256, 256
        else:
            print('Currently only resnet18 backbone is supported')
            exit()

        # Neck
        self.neck = DilateEncoder(c1=c5, c2=p5)

        # Decoder
        # P4
        self.deconv4 = ResizeConv(c1=p5, c2=p4, act=act, scale_factor=2)  # 32 -> 16
        self.latter4 = Conv(c4, p4, k=1, act=None)
        self.smooth4 = Conv(p4, p4, k=3, p=1, act=act)  # shape remains the same

        # P3
        self.deconv3 = ResizeConv(c1=p4, c2=p3, act=act, scale_factor=2)  # 16 -> 8
        self.latter3 = Conv(c3, p3, k=1, act=None)
        self.smooth3 = Conv(p3, p3, k=3, p=1, act=act)  # shape remains the same

        # P2
        self.deconv2 = ResizeConv(c1=p3, c2=p2, act=act, scale_factor=2)  # 8 -> 4
        self.latter2 = Conv(c2, p2, k=1, act=None)
        self.smooth2 = Conv(p2, p2, k=3, p=1, act=act)  # shape remains the same

        # Detection heads
        self.cls_pred = nn.Sequential(
            Conv(p2, 64, k=3, p=1, act=act),
            nn.Conv2d(64, self.num_classes, kernel_size=1)
        )

        self.txty_pred = nn.Sequential(
            Conv(p2, 64, k=3, p=1, act=act),
            nn.Conv2d(64, 2, kernel_size=1)
        )

        self.twth_pred = nn.Sequential(
            Conv(p2, 64, k=3, p=1, act=act),
            nn.Conv2d(64, 2, kernel_size=1)
        )

        self.iou_aware_pred = nn.Sequential(
            Conv(p2, 64, k=3, p=1, act=act),
            nn.Conv2d(64, 1, kernel_size=1)
        )

        # Initialize weights for class pred
        init_prob = 0.01
        bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob))
        nn.init.constant_(self.cls_pred[-1].bias, bias_value)

    def create_grid(self, input_size):
        # Change here, h=512, w=1536
        h, w = input_size[0], input_size[1]
        # generate grid cells
        ws, hs = w // self.stride, h // self.stride
        grid_y, grid_x = torch.meshgrid([torch.arange(hs), torch.arange(ws)])
        grid_xy = torch.stack([grid_x, grid_y], dim=-1).float()
        grid_xy = grid_xy.view(1, hs * ws, 2).to(self.device)
        print(f'grid_xy: {grid_xy.size()}')
        return grid_xy

    def set_grid(self, input_size):
        self.grid_cell = self.create_grid(input_size)
        self.input_size = input_size

    def decode_boxes(self, pred):
        """
        input box :  [delta_x, delta_y, sqrt(w), sqrt(h)]
        output box : [xmin, ymin, xmax, ymax]
        """
        self.set_grid(self.input_size)
        output = torch.zeros_like(pred)
        pred[:, :, :2] = (self.grid_cell + self.gs * torch.sigmoid(pred[:, :, :2]) - (self.gs - 1.0) / 2) * self.stride
        pred[:, :, 2:] = (torch.exp(pred[:, :, 2:])) * self.stride

        # [c_x, c_y, w, h] -> [xmin, ymin, xmax, ymax]
        output[:, :, 0] = pred[:, :, 0] - pred[:, :, 2] / 2
        output[:, :, 1] = pred[:, :, 1] - pred[:, :, 3] / 2
        output[:, :, 2] = pred[:, :, 0] + pred[:, :, 2] / 2
        output[:, :, 3] = pred[:, :, 1] + pred[:, :, 3] / 2

        print(f'output: {output.size()}')

        return output

    def forward(self, x):
        # Backbone: ResNet18; x is the input tensor with spatial size (512, 1536)
        c2, c3, c4, c5 = self.backbone(x)

        B = c5.size(0)
        print(f'B: {B}')

        # Decoder (top-down feature fusion)
        p5 = self.neck(c5)  # torch.Size([1, 256, 16, 48])
        p4 = self.smooth4(self.latter4(c4) + self.deconv4(p5))  # torch.Size([1, 256, 32, 96])
        p3 = self.smooth3(self.latter3(c3) + self.deconv3(p4))  # torch.Size([1, 256, 64, 192])
        p2 = self.smooth2(self.latter2(c2) + self.deconv2(p3))  # torch.Size([1, 256, 128, 384])

        # detection head
        cls_pred = self.cls_pred(p2)
        txty_pred = self.txty_pred(p2)
        twth_pred = self.twth_pred(p2)
        iou_aware_pred = self.iou_aware_pred(p2)

        if self.trainable:
            # [B, H*W, num_classes]
            cls_pred = cls_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, self.num_classes)
            # [B, H*W, 2]
            txty_pred = txty_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, 2)
            # [B, H*W, 2]
            twth_pred = twth_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, 2)
            # [B, H*W, 1]
            iou_aware_pred = iou_aware_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, 1)

            # Computing the iou score between gt and pred boxes
            txtytwth_pred = torch.cat([txty_pred, twth_pred], dim=-1)  # [B, H*W, 4]
            print(f'txtytwth: {txtytwth_pred.size()}')
            x1y1x2y2_pred = (self.decode_boxes(txtytwth_pred) / self.input_size).view(-1, 4)
            print(f'x1y1x2y2_pred: {x1y1x2y2_pred}')

        return cls_pred, txty_pred, twth_pred, iou_aware_pred

def main():
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    dummy_input = torch.randn(1, 1, 512, 1536).to(device)  # single-channel dummy image of size (512, 1536)
    model = ObjectDetectionModel(device=device, input_size=(512, 1536), num_classes=8).to(device)
    # summary(model, dummy_input)
    model(dummy_input)
    print('Stop')


if __name__ == '__main__':
    main()
  1. In the forward function, at this line: x1y1x2y2_pred = (self.decode_boxes(txtytwth_pred) / self.input_size).view(-1, 4). The docstring of decode_boxes says the input box is [delta_x, delta_y, sqrt(w), sqrt(h)], yet the size of the pred variable is pred: torch.Size([1, 49152, 4]). How is this the case? To me this looks like [B, H*W, 4]. Are the variables under the `self.trainable` branch permuted correctly?

  2. Also, as you can see, I have changed the height and width variables in the create_grid function, since my input size is non-square. But in the forward function, where decode_boxes is used, you divide by input_size, which in your case is 512 (or any square size). What do I do here? My image is (512, 1536), so I cannot divide by input_size directly, as it is a tuple and will throw an error.

  3. You used the COCO dataset for training and evaluation, and can therefore use the COCOeval class from pycocotools directly. However, my annotation format is different: I have a .txt file for each image, YOLO-style. How do I use COCOeval in this case? Any ideas? (A rough conversion I am considering is sketched below.)
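Here is roughly the conversion I am considering: turn my per-image YOLO-style .txt labels into a COCO-format json so that pycocotools can score the detections. This is only an untested sketch; the label directory layout, the (class, cx, cy, w, h) line format and the category ids are assumptions from my own data, not something from this repo:

import json
import os
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

def yolo_txt_to_coco_gt(label_dir, image_infos, class_names, out_json='gt_coco.json'):
    # image_infos: list of dicts like {'id': 1, 'file_name': 'xxx.png', 'width': 1536, 'height': 512}
    images, annotations = [], []
    ann_id = 1
    for info in image_infos:
        images.append(info)
        txt_path = os.path.join(label_dir, os.path.splitext(info['file_name'])[0] + '.txt')
        if not os.path.exists(txt_path):
            continue
        with open(txt_path) as f:
            for line in f:
                cls, xc, yc, w, h = map(float, line.split())      # YOLO: class, normalized cx, cy, w, h
                w_px, h_px = w * info['width'], h * info['height']
                x_min = xc * info['width'] - w_px / 2
                y_min = yc * info['height'] - h_px / 2
                annotations.append({'id': ann_id, 'image_id': info['id'], 'category_id': int(cls),
                                    'bbox': [x_min, y_min, w_px, h_px],  # COCO boxes are [x, y, w, h]
                                    'area': w_px * h_px, 'iscrowd': 0})
                ann_id += 1
    coco_dict = {'images': images, 'annotations': annotations,
                 'categories': [{'id': i, 'name': n} for i, n in enumerate(class_names)]}
    with open(out_json, 'w') as f:
        json.dump(coco_dict, f)
    return COCO(out_json)

# detections: list of {'image_id', 'category_id', 'bbox': [x, y, w, h], 'score'} dicts from the model
# coco_gt = yolo_txt_to_coco_gt('labels/', image_infos, class_names)
# coco_dt = coco_gt.loadRes(detections)
# coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
# coco_eval.evaluate(); coco_eval.accumulate(); coco_eval.summarize()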

Thanks.

Edit 1: @yjh0410 So I printed the shapes of the variables before the decode_boxes function is called, with input shape (512, 1536): the shape is [1, 49152, 4], which comes from [1, 128, 384, 4]. My idea is to divide the 128 channel values by 512 and 384 channel values by 1536. What do you think? How can I do this, and does it even make sense? @developer0hye What do you think? Sorry for tagging you here, but I am also trying to extend the functionality of this model.

yjh0410 commented 3 years ago

"My idea is to divide the 128 channel values by 512 and 384 channel values by 1536..."

I am a little confused about this, sorry~

In this model, txtytwth_pred has size [B, H, W, 4], where H is img_h / 4 and W is img_w / 4. I then reshape it to [B, HxW, 4] (for example, your 49152 is equal to 128 x 384).

I reshape it purely for convenience when decoding the boxes and computing the IoU between the predicted and target boxes. You can also keep [B, H, W, 4] instead of [B, HW, 4] for the subsequent processing; that is fine.

In CenterNet, one bounding box is predicted at each pixel location of P2, so the model outputs HxW bounding boxes, but only the topk bounding boxes are kept as the final predictions.
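Roughly, that topk step looks like the following. This is a simplified sketch with assumed shapes ([B, H*W, num_classes] class logits and [B, H*W, 4] decoded boxes), not the exact post-processing in the repo:

import torch

# cls_pred: [B, H*W, num_classes] raw class logits, bboxes: [B, H*W, 4] decoded boxes
scores = torch.sigmoid(cls_pred)[0]                        # [H*W, num_classes]
best_scores, labels = scores.max(dim=-1)                   # best class score per pixel location
topk_scores, topk_inds = best_scores.topk(k=100, dim=0)    # keep only the topk locations (self.topk)
topk_labels = labels[topk_inds]
topk_boxes = bboxes[0][topk_inds]                          # [topk, 4]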

YashRunwal commented 3 years ago

@yjh0410 Hi, please bear with me.

So I have changed the code to use [1, 128, 384, channels], but I am now stuck at the following line of code:

bbox_pred = torch.clamp((self.decode_boxes(txtytwth_pred) / self.input_size)[0], 0., 1.)

For me, the output of decode_boxes has shape [1, 128, 384, 4]. In the line above, you divide decode_boxes(txtytwth_pred) by input_size. However, my input_size is (512, 1536), so by which value should I divide?

Edit: @yjh0410 Consider the following example with real targets (bboxes normalized between 0 and 1, plus the class_id):

targets: [[0.00911458 0.26171875 0.53190104 0.89648438 4.        ]]
c_x: 415.5, c_y:296.5, box_w:802.9999999999999,box_h:325.0
tx, ty, tw, th:0.875, 0.125, 5.302060352826871, 4.3975308212098465

All the variables above come from the generate_txtytwth() function. The stride used is 4. I presume this stride was chosen for images of size (512, 512); if the image size is (512, 1536), do you think I should use a stride of (12, 4), so that the following becomes:

box_w_s = box_w / 12  # Is this logic correct? I don't think it is, since the heatmap has size H/4, W/4.
box_h_s = box_h / 4
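For reference, a uniform stride of 4 on both axes reproduces exactly the tx, ty, tw, th values printed above (assuming generate_txtytwth() takes the fractional part of the stride-scaled center and the log of the stride-scaled size), which is why I doubt the (12, 4) stride:

import math

c_x, c_y, box_w, box_h = 415.5, 296.5, 803.0, 325.0
stride = 4
grid_x, grid_y = int(c_x / stride), int(c_y / stride)
tx, ty = c_x / stride - grid_x, c_y / stride - grid_y         # 0.875, 0.125
tw, th = math.log(box_w / stride), math.log(box_h / stride)   # 5.3020..., 4.3975...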

Then, using the decode_boxes() function, the output is output = self.decode_boxes(txtytwth_pred).

But output.max() is 1532.76 and output.min() is -1.1088. This takes me back to the same question: do I divide this output by 512 or by 1536?

YashRunwal commented 3 years ago

@yjh0410 Any ideas? Need some help.

yjh0410 commented 3 years ago

The size of the P2 feature map is [B, C, H/4, W/4], so the heatmap you create should also have size [B, C, H/4, W/4], following your actual input height and width rather than assuming a square input. The 512 x 512 size I used is just an example; other sizes such as 640x640 or 800x1024 can also be used.

When I normalize the pred boxes, I simply divide them by the input_size 512, because I set H and W to the same size, 512. In your case, you should divide the height values of the pred boxes by H (512) and the width values by W (1536).

YashRunwal commented 3 years ago

@yjh0410 Yes, my P2 feature map size is [B, C, 128, 384], i.e. the output of self.decode_boxes(txtytwth_pred) has size [B, C, 128, 384]. But how do I divide the height values by 512 and the width values by 1536? I have tried a lot but can't get anywhere.

yjh0410 commented 3 years ago

@YashRunwal The size of box_pred = self.decode_boxes(txtytwth_pred) is [B, 4, H//4, W//4], so you can divide box_pred[B, 0::2, :, :] by the width and box_pred[B, 1::2, :, :] by the height.
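In code, something along these lines should work (just a sketch; the channel order and the image size are taken from the discussion above):

img_h, img_w = 512, 1536
# box_pred = self.decode_boxes(txtytwth_pred), shape [B, 4, H//4, W//4],
# channels ordered [xmin, ymin, xmax, ymax] in input-pixel coordinates
box_pred_norm = box_pred.clone()
box_pred_norm[:, 0::2, :, :] = box_pred[:, 0::2, :, :] / img_w  # x coordinates: divide by width
box_pred_norm[:, 1::2, :, :] = box_pred[:, 1::2, :, :] / img_h  # y coordinates: divide by height
box_pred_norm = box_pred_norm.clamp(0., 1.)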

YashRunwal commented 3 years ago

@yjh0410 Thanks. There is a slight mistake: it should be box_pred[B, 1::3, :, :] by the height. Thanks, will close this issue now.