The speed problem - Githubissues

import time import torch import cv2 from torch import nn import numpy as np from torch.backends import cudnn import yaml import itertools import torch import torch.nn as nn import numpy as np from typing import Union from efficientnet import EfficientNet as EffNet import math from torch.nn import functional as F from torchvision.ops.boxes import batched_nms class Conv2dStaticSamePadding(nn.Module): """ created by Zylo117 The real keras/tensorflow conv2d with same padding """

def __init__(self, in_channels, out_channels, kernel_size, stride=1, bias=True, groups=1, dilation=1, **kwargs):
    super().__init__()
    self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride,
                          bias=bias, groups=groups)
    self.stride = self.conv.stride
    self.kernel_size = self.conv.kernel_size
    self.dilation = self.conv.dilation

    if isinstance(self.stride, int):
        self.stride = [self.stride] * 2
    elif len(self.stride) == 1:
        self.stride = [self.stride[0]] * 2

    if isinstance(self.kernel_size, int):
        self.kernel_size = [self.kernel_size] * 2
    elif len(self.kernel_size) == 1:
        self.kernel_size = [self.kernel_size[0]] * 2

def forward(self, x):
    h, w = x.shape[-2:]

    extra_h = (math.ceil(w / self.stride[1]) - 1) * self.stride[1] - w + self.kernel_size[1]
    extra_v = (math.ceil(h / self.stride[0]) - 1) * self.stride[0] - h + self.kernel_size[0]

    left = extra_h // 2
    right = extra_h - left
    top = extra_v // 2
    bottom = extra_v - top

    x = F.pad(x, [left, right, top, bottom])

    x = self.conv(x)
    return x

class SeparableConvBlock(nn.Module): """ created by Zylo117 """

def __init__(self, in_channels, out_channels=None, norm=True, activation=False, onnx_export=False):
    super(SeparableConvBlock, self).__init__()
    if out_channels is None:
        out_channels = in_channels

    # Q: whether separate conv
    #  share bias between depthwise_conv and pointwise_conv
    #  or just pointwise_conv apply bias.
    # A: Confirmed, just pointwise_conv applies bias, depthwise_conv has no bias.

    self.depthwise_conv = Conv2dStaticSamePadding(in_channels, in_channels,
                                                  kernel_size=3, stride=1, groups=in_channels, bias=False)
    self.pointwise_conv = Conv2dStaticSamePadding(in_channels, out_channels, kernel_size=1, stride=1)

    self.norm = norm
    if self.norm:
        # Warning: pytorch momentum is different from tensorflow's, momentum_pytorch = 1 - momentum_tensorflow
        self.bn = nn.BatchNorm2d(num_features=out_channels, momentum=0.01, eps=1e-3)

    self.activation = activation
    if self.activation:
        self.swish = MemoryEfficientSwish() if not onnx_export else Swish()

def forward(self, x):
    x = self.depthwise_conv(x)
    x = self.pointwise_conv(x)

    if self.norm:
        x = self.bn(x)

    if self.activation:
        x = self.swish(x)

    return x

class MaxPool2dStaticSamePadding(nn.Module): """ created by Zylo117 The real keras/tensorflow MaxPool2d with same padding """

def __init__(self, *args, **kwargs):
    super().__init__()
    self.pool = nn.MaxPool2d(*args, **kwargs)
    self.stride = self.pool.stride
    self.kernel_size = self.pool.kernel_size

    if isinstance(self.stride, int):
        self.stride = [self.stride] * 2
    elif len(self.stride) == 1:
        self.stride = [self.stride[0]] * 2

    if isinstance(self.kernel_size, int):
        self.kernel_size = [self.kernel_size] * 2
    elif len(self.kernel_size) == 1:
        self.kernel_size = [self.kernel_size[0]] * 2

def forward(self, x):
    h, w = x.shape[-2:]

    extra_h = (math.ceil(w / self.stride[1]) - 1) * self.stride[1] - w + self.kernel_size[1]
    extra_v = (math.ceil(h / self.stride[0]) - 1) * self.stride[0] - h + self.kernel_size[0]

    left = extra_h // 2
    right = extra_h - left
    top = extra_v // 2
    bottom = extra_v - top

    x = F.pad(x, [left, right, top, bottom])

    x = self.pool(x)
    return x

class SwishImplementation(torch.autograd.Function): @staticmethod def forward(ctx, i): result = i torch.sigmoid(i) ctx.save_for_backward(i) return result class MemoryEfficientSwish(nn.Module): def forward(self, x): return SwishImplementation.apply(x) class Swish(nn.Module): def forward(self, x): return x torch.sigmoid(x) class BiFPN(nn.Module): """ modified by Zylo117 """

def __init__(self, num_channels, conv_channels, first_time=False, epsilon=1e-4, onnx_export=False, attention=True,
             use_p8=False):
    """

    Args:
        num_channels:
        conv_channels:
        first_time: whether the input comes directly from the efficientnet,
                    if True, downchannel it first, and downsample P5 to generate P6 then P7
        epsilon: epsilon of fast weighted attention sum of BiFPN, not the BN's epsilon
        onnx_export: if True, use Swish instead of MemoryEfficientSwish
    """
    super(BiFPN, self).__init__()
    self.epsilon = epsilon
    self.use_p8 = use_p8

    # Conv layers
    self.conv6_up = SeparableConvBlock(num_channels, onnx_export=onnx_export)
    self.conv5_up = SeparableConvBlock(num_channels, onnx_export=onnx_export)
    self.conv4_up = SeparableConvBlock(num_channels, onnx_export=onnx_export)
    self.conv3_up = SeparableConvBlock(num_channels, onnx_export=onnx_export)
    self.conv4_down = SeparableConvBlock(num_channels, onnx_export=onnx_export)
    self.conv5_down = SeparableConvBlock(num_channels, onnx_export=onnx_export)
    self.conv6_down = SeparableConvBlock(num_channels, onnx_export=onnx_export)
    self.conv7_down = SeparableConvBlock(num_channels, onnx_export=onnx_export)
    if use_p8:
        self.conv7_up = SeparableConvBlock(num_channels, onnx_export=onnx_export)
        self.conv8_down = SeparableConvBlock(num_channels, onnx_export=onnx_export)

    # Feature scaling layers
    self.p6_upsample = nn.Upsample(scale_factor=2, mode='nearest')
    self.p5_upsample = nn.Upsample(scale_factor=2, mode='nearest')
    self.p4_upsample = nn.Upsample(scale_factor=2, mode='nearest')
    self.p3_upsample = nn.Upsample(scale_factor=2, mode='nearest')

    self.p4_downsample = MaxPool2dStaticSamePadding(3, 2)
    self.p5_downsample = MaxPool2dStaticSamePadding(3, 2)
    self.p6_downsample = MaxPool2dStaticSamePadding(3, 2)
    self.p7_downsample = MaxPool2dStaticSamePadding(3, 2)
    if use_p8:
        self.p7_upsample = nn.Upsample(scale_factor=2, mode='nearest')
        self.p8_downsample = MaxPool2dStaticSamePadding(3, 2)

    self.swish = MemoryEfficientSwish() if not onnx_export else Swish()

    self.first_time = first_time
    if self.first_time:
        self.p5_down_channel = nn.Sequential(
            Conv2dStaticSamePadding(conv_channels[2], num_channels, 1),
            nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3),
        )
        self.p4_down_channel = nn.Sequential(
            Conv2dStaticSamePadding(conv_channels[1], num_channels, 1),
            nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3),
        )
        self.p3_down_channel = nn.Sequential(
            Conv2dStaticSamePadding(conv_channels[0], num_channels, 1),
            nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3),
        )

        self.p5_to_p6 = nn.Sequential(
            Conv2dStaticSamePadding(conv_channels[2], num_channels, 1),
            nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3),
            MaxPool2dStaticSamePadding(3, 2)
        )
        self.p6_to_p7 = nn.Sequential(
            MaxPool2dStaticSamePadding(3, 2)
        )
        if use_p8:
            self.p7_to_p8 = nn.Sequential(
                MaxPool2dStaticSamePadding(3, 2)
            )

        self.p4_down_channel_2 = nn.Sequential(
            Conv2dStaticSamePadding(conv_channels[1], num_channels, 1),
            nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3),
        )
        self.p5_down_channel_2 = nn.Sequential(
            Conv2dStaticSamePadding(conv_channels[2], num_channels, 1),
            nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3),
        )

    # Weight
    self.p6_w1 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True)
    self.p6_w1_relu = nn.ReLU()
    self.p5_w1 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True)
    self.p5_w1_relu = nn.ReLU()
    self.p4_w1 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True)
    self.p4_w1_relu = nn.ReLU()
    self.p3_w1 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True)
    self.p3_w1_relu = nn.ReLU()

    self.p4_w2 = nn.Parameter(torch.ones(3, dtype=torch.float32), requires_grad=True)
    self.p4_w2_relu = nn.ReLU()
    self.p5_w2 = nn.Parameter(torch.ones(3, dtype=torch.float32), requires_grad=True)
    self.p5_w2_relu = nn.ReLU()
    self.p6_w2 = nn.Parameter(torch.ones(3, dtype=torch.float32), requires_grad=True)
    self.p6_w2_relu = nn.ReLU()
    self.p7_w2 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True)
    self.p7_w2_relu = nn.ReLU()

    self.attention = attention

def forward(self, inputs):

    # downsample channels using same-padding conv2d to target phase's if not the same
    # judge: same phase as target,
    # if same, pass;
    # elif earlier phase, downsample to target phase's by pooling
    # elif later phase, upsample to target phase's by nearest interpolation

    if self.attention:
        outs = self._forward_fast_attention(inputs)
    else:
        outs = self._forward(inputs)

    return outs

def _forward_fast_attention(self, inputs):
    if self.first_time:
        p3, p4, p5 = inputs

        p6_in = self.p5_to_p6(p5)
        p7_in = self.p6_to_p7(p6_in)

        p3_in = self.p3_down_channel(p3)
        p4_in = self.p4_down_channel(p4)
        p5_in = self.p5_down_channel(p5)

    else:
        # P3_0, P4_0, P5_0, P6_0 and P7_0
        p3_in, p4_in, p5_in, p6_in, p7_in = inputs

    # P7_0 to P7_2

    # Weights for P6_0 and P7_0 to P6_1
    p6_w1 = self.p6_w1_relu(self.p6_w1)
    weight = p6_w1 / (torch.sum(p6_w1, dim=0) + self.epsilon)
    # Connections for P6_0 and P7_0 to P6_1 respectively
    p6_up = self.conv6_up(self.swish(weight[0] * p6_in + weight[1] * self.p6_upsample(p7_in)))

    # Weights for P5_0 and P6_1 to P5_1
    p5_w1 = self.p5_w1_relu(self.p5_w1)
    weight = p5_w1 / (torch.sum(p5_w1, dim=0) + self.epsilon)
    # Connections for P5_0 and P6_1 to P5_1 respectively
    p5_up = self.conv5_up(self.swish(weight[0] * p5_in + weight[1] * self.p5_upsample(p6_up)))

    # Weights for P4_0 and P5_1 to P4_1
    p4_w1 = self.p4_w1_relu(self.p4_w1)
    weight = p4_w1 / (torch.sum(p4_w1, dim=0) + self.epsilon)
    # Connections for P4_0 and P5_1 to P4_1 respectively
    p4_up = self.conv4_up(self.swish(weight[0] * p4_in + weight[1] * self.p4_upsample(p5_up)))

    # Weights for P3_0 and P4_1 to P3_2
    p3_w1 = self.p3_w1_relu(self.p3_w1)
    weight = p3_w1 / (torch.sum(p3_w1, dim=0) + self.epsilon)
    # Connections for P3_0 and P4_1 to P3_2 respectively
    p3_out = self.conv3_up(self.swish(weight[0] * p3_in + weight[1] * self.p3_upsample(p4_up)))

    if self.first_time:
        p4_in = self.p4_down_channel_2(p4)
        p5_in = self.p5_down_channel_2(p5)

    # Weights for P4_0, P4_1 and P3_2 to P4_2
    p4_w2 = self.p4_w2_relu(self.p4_w2)
    weight = p4_w2 / (torch.sum(p4_w2, dim=0) + self.epsilon)
    # Connections for P4_0, P4_1 and P3_2 to P4_2 respectively
    p4_out = self.conv4_down(
        self.swish(weight[0] * p4_in + weight[1] * p4_up + weight[2] * self.p4_downsample(p3_out)))

    # Weights for P5_0, P5_1 and P4_2 to P5_2
    p5_w2 = self.p5_w2_relu(self.p5_w2)
    weight = p5_w2 / (torch.sum(p5_w2, dim=0) + self.epsilon)
    # Connections for P5_0, P5_1 and P4_2 to P5_2 respectively
    p5_out = self.conv5_down(
        self.swish(weight[0] * p5_in + weight[1] * p5_up + weight[2] * self.p5_downsample(p4_out)))

    # Weights for P6_0, P6_1 and P5_2 to P6_2
    p6_w2 = self.p6_w2_relu(self.p6_w2)
    weight = p6_w2 / (torch.sum(p6_w2, dim=0) + self.epsilon)
    # Connections for P6_0, P6_1 and P5_2 to P6_2 respectively
    p6_out = self.conv6_down(
        self.swish(weight[0] * p6_in + weight[1] * p6_up + weight[2] * self.p6_downsample(p5_out)))

    # Weights for P7_0 and P6_2 to P7_2
    p7_w2 = self.p7_w2_relu(self.p7_w2)
    weight = p7_w2 / (torch.sum(p7_w2, dim=0) + self.epsilon)
    # Connections for P7_0 and P6_2 to P7_2
    p7_out = self.conv7_down(self.swish(weight[0] * p7_in + weight[1] * self.p7_downsample(p6_out)))

    return p3_out, p4_out, p5_out, p6_out, p7_out

def _forward(self, inputs):
    if self.first_time:
        p3, p4, p5 = inputs

        p6_in = self.p5_to_p6(p5)
        p7_in = self.p6_to_p7(p6_in)
        if self.use_p8:
            p8_in = self.p7_to_p8(p7_in)

        p3_in = self.p3_down_channel(p3)
        p4_in = self.p4_down_channel(p4)
        p5_in = self.p5_down_channel(p5)

    else:
        if self.use_p8:
            # P3_0, P4_0, P5_0, P6_0, P7_0 and P8_0
            p3_in, p4_in, p5_in, p6_in, p7_in, p8_in = inputs
        else:
            # P3_0, P4_0, P5_0, P6_0 and P7_0
            p3_in, p4_in, p5_in, p6_in, p7_in = inputs

    if self.use_p8:
        # P8_0 to P8_2

        # Connections for P7_0 and P8_0 to P7_1 respectively
        p7_up = self.conv7_up(self.swish(p7_in + self.p7_upsample(p8_in)))

        # Connections for P6_0 and P7_0 to P6_1 respectively
        p6_up = self.conv6_up(self.swish(p6_in + self.p6_upsample(p7_up)))
    else:
        # P7_0 to P7_2

        # Connections for P6_0 and P7_0 to P6_1 respectively
        p6_up = self.conv6_up(self.swish(p6_in + self.p6_upsample(p7_in)))

    # Connections for P5_0 and P6_1 to P5_1 respectively
    p5_up = self.conv5_up(self.swish(p5_in + self.p5_upsample(p6_up)))

    # Connections for P4_0 and P5_1 to P4_1 respectively
    p4_up = self.conv4_up(self.swish(p4_in + self.p4_upsample(p5_up)))

    # Connections for P3_0 and P4_1 to P3_2 respectively
    p3_out = self.conv3_up(self.swish(p3_in + self.p3_upsample(p4_up)))

    if self.first_time:
        p4_in = self.p4_down_channel_2(p4)
        p5_in = self.p5_down_channel_2(p5)

    # Connections for P4_0, P4_1 and P3_2 to P4_2 respectively
    p4_out = self.conv4_down(
        self.swish(p4_in + p4_up + self.p4_downsample(p3_out)))

    # Connections for P5_0, P5_1 and P4_2 to P5_2 respectively
    p5_out = self.conv5_down(
        self.swish(p5_in + p5_up + self.p5_downsample(p4_out)))

    # Connections for P6_0, P6_1 and P5_2 to P6_2 respectively
    p6_out = self.conv6_down(
        self.swish(p6_in + p6_up + self.p6_downsample(p5_out)))

    if self.use_p8:
        # Connections for P7_0, P7_1 and P6_2 to P7_2 respectively
        p7_out = self.conv7_down(
            self.swish(p7_in + p7_up + self.p7_downsample(p6_out)))

        # Connections for P8_0 and P7_2 to P8_2
        p8_out = self.conv8_down(self.swish(p8_in + self.p8_downsample(p7_out)))

        return p3_out, p4_out, p5_out, p6_out, p7_out, p8_out
    else:
        # Connections for P7_0 and P6_2 to P7_2
        p7_out = self.conv7_down(self.swish(p7_in + self.p7_downsample(p6_out)))

        return p3_out, p4_out, p5_out, p6_out, p7_out

class BBoxTransform(nn.Module): def forward(self, anchors, regression): """ decode_box_outputs adapted from https://github.com/google/automl/blob/master/efficientdet/anchors.py

    Args:
        anchors: [batchsize, boxes, (y1, x1, y2, x2)]
        regression: [batchsize, boxes, (dy, dx, dh, dw)]

    Returns:

    """
    y_centers_a = (anchors[..., 0] + anchors[..., 2]) / 2
    x_centers_a = (anchors[..., 1] + anchors[..., 3]) / 2
    ha = anchors[..., 2] - anchors[..., 0]
    wa = anchors[..., 3] - anchors[..., 1]

    w = regression[..., 3].exp() * wa
    h = regression[..., 2].exp() * ha

    y_centers = regression[..., 0] * ha + y_centers_a
    x_centers = regression[..., 1] * wa + x_centers_a

    ymin = y_centers - h / 2.
    xmin = x_centers - w / 2.
    ymax = y_centers + h / 2.
    xmax = x_centers + w / 2.

    return torch.stack([xmin, ymin, xmax, ymax], dim=2)

class ClipBoxes(nn.Module):

def __init__(self):
    super(ClipBoxes, self).__init__()

def forward(self, boxes, img):
    batch_size, num_channels, height, width = img.shape

    boxes[:, :, 0] = torch.clamp(boxes[:, :, 0], min=0)
    boxes[:, :, 1] = torch.clamp(boxes[:, :, 1], min=0)

    boxes[:, :, 2] = torch.clamp(boxes[:, :, 2], max=width - 1)
    boxes[:, :, 3] = torch.clamp(boxes[:, :, 3], max=height - 1)

    return boxes

class Regressor(nn.Module): """ modified by Zylo117 """

def __init__(self, in_channels, num_anchors, num_layers, pyramid_levels=5, onnx_export=False):
    super(Regressor, self).__init__()
    self.num_layers = num_layers

    self.conv_list = nn.ModuleList(
        [SeparableConvBlock(in_channels, in_channels, norm=False, activation=False) for i in range(num_layers)])
    self.bn_list = nn.ModuleList(
        [nn.ModuleList([nn.BatchNorm2d(in_channels, momentum=0.01, eps=1e-3) for i in range(num_layers)]) for j in
         range(pyramid_levels)])
    self.header = SeparableConvBlock(in_channels, num_anchors * 4, norm=False, activation=False)
    self.swish = MemoryEfficientSwish() if not onnx_export else Swish()

def forward(self, inputs):
    feats = []
    for feat, bn_list in zip(inputs, self.bn_list):
        for i, bn, conv in zip(range(self.num_layers), bn_list, self.conv_list):
            feat = conv(feat)
            feat = bn(feat)
            feat = self.swish(feat)
        feat = self.header(feat)

        feat = feat.permute(0, 2, 3, 1)
        feat = feat.contiguous().view(feat.shape[0], -1, 4)

        feats.append(feat)

    feats = torch.cat(feats, dim=1)

    return feats

class Classifier(nn.Module): """ modified by Zylo117 """

def __init__(self, in_channels, num_anchors, num_classes, num_layers, pyramid_levels=5, onnx_export=False):
    super(Classifier, self).__init__()
    self.num_anchors = num_anchors
    self.num_classes = num_classes
    self.num_layers = num_layers
    self.conv_list = nn.ModuleList(
        [SeparableConvBlock(in_channels, in_channels, norm=False, activation=False) for i in range(num_layers)])
    self.bn_list = nn.ModuleList(
        [nn.ModuleList([nn.BatchNorm2d(in_channels, momentum=0.01, eps=1e-3) for i in range(num_layers)]) for j in
         range(pyramid_levels)])
    self.header = SeparableConvBlock(in_channels, num_anchors * num_classes, norm=False, activation=False)
    self.swish = MemoryEfficientSwish() if not onnx_export else Swish()

def forward(self, inputs):
    feats = []
    for feat, bn_list in zip(inputs, self.bn_list):
        for i, bn, conv in zip(range(self.num_layers), bn_list, self.conv_list):
            feat = conv(feat)
            feat = bn(feat)
            feat = self.swish(feat)
        feat = self.header(feat)

        feat = feat.permute(0, 2, 3, 1)
        feat = feat.contiguous().view(feat.shape[0], feat.shape[1], feat.shape[2], self.num_anchors,
                                      self.num_classes)
        feat = feat.contiguous().view(feat.shape[0], -1, self.num_classes)

        feats.append(feat)

    feats = torch.cat(feats, dim=1)
    feats = feats.sigmoid()

    return feats

class EfficientNet(nn.Module): """ modified by Zylo117 """

def __init__(self, compound_coef, load_weights=False):
    super(EfficientNet, self).__init__()
    model = EffNet.from_pretrained(f'efficientnet-b{compound_coef}', load_weights)
    del model._conv_head
    del model._bn1
    del model._avg_pooling
    del model._dropout
    del model._fc
    self.model = model

def forward(self, x):
    x = self.model._conv_stem(x)
    x = self.model._bn0(x)
    x = self.model._swish(x)
    feature_maps = []

    # TODO: temporarily storing extra tensor last_x and del it later might not be a good idea,
    #  try recording stride changing when creating efficientnet,
    #  and then apply it here.
    last_x = None
    for idx, block in enumerate(self.model._blocks):
        drop_connect_rate = self.model._global_params.drop_connect_rate
        if drop_connect_rate:
            drop_connect_rate *= float(idx) / len(self.model._blocks)
        x = block(x, drop_connect_rate=drop_connect_rate)

        if block._depthwise_conv.stride == [2, 2]:
            feature_maps.append(last_x)
        elif idx == len(self.model._blocks) - 1:
            feature_maps.append(x)
        last_x = x
    del last_x
    return feature_maps[1:]

class Anchors(nn.Module): """ adapted and modified from https://github.com/google/automl/blob/master/efficientdet/anchors.py by Zylo117 """

def __init__(self, anchor_scale=4., pyramid_levels=None, **kwargs):
    super().__init__()
    self.anchor_scale = anchor_scale

    if pyramid_levels is None:
        self.pyramid_levels = [3, 4, 5, 6, 7]
    else:
        self.pyramid_levels = pyramid_levels

    self.strides = kwargs.get('strides', [2 ** x for x in self.pyramid_levels])
    self.scales = np.array(kwargs.get('scales', [2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]))
    self.ratios = kwargs.get('ratios', [(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)])

    self.last_anchors = {}
    self.last_shape = None

def forward(self, image, dtype=torch.float32):
    """Generates multiscale anchor boxes.

    Args:
      image_size: integer number of input image size. The input image has the
        same dimension for width and height. The image_size should be divided by
        the largest feature stride 2^max_level.
      anchor_scale: float number representing the scale of size of the base
        anchor to the feature stride 2^level.
      anchor_configs: a dictionary with keys as the levels of anchors and
        values as a list of anchor configuration.

    Returns:
      anchor_boxes: a numpy array with shape [N, 4], which stacks anchors on all
        feature levels.
    Raises:
      ValueError: input size must be the multiple of largest feature stride.
    """
    image_shape = image.shape[2:]

    if image_shape == self.last_shape and image.device in self.last_anchors:
        return self.last_anchors[image.device]

    if self.last_shape is None or self.last_shape != image_shape:
        self.last_shape = image_shape

    if dtype == torch.float16:
        dtype = np.float16
    else:
        dtype = np.float32

    boxes_all = []
    for stride in self.strides:
        boxes_level = []
        for scale, ratio in itertools.product(self.scales, self.ratios):
            if image_shape[1] % stride != 0:
                raise ValueError('input size must be divided by the stride.')
            base_anchor_size = self.anchor_scale * stride * scale
            anchor_size_x_2 = base_anchor_size * ratio[0] / 2.0
            anchor_size_y_2 = base_anchor_size * ratio[1] / 2.0

            x = np.arange(stride / 2, image_shape[1], stride)
            y = np.arange(stride / 2, image_shape[0], stride)
            xv, yv = np.meshgrid(x, y)
            xv = xv.reshape(-1)
            yv = yv.reshape(-1)

            # y1,x1,y2,x2
            boxes = np.vstack((yv - anchor_size_y_2, xv - anchor_size_x_2,
                               yv + anchor_size_y_2, xv + anchor_size_x_2))
            boxes = np.swapaxes(boxes, 0, 1)
            boxes_level.append(np.expand_dims(boxes, axis=1))
        # concat anchors on the same level to the reshape NxAx4
        boxes_level = np.concatenate(boxes_level, axis=1)
        boxes_all.append(boxes_level.reshape([-1, 4]))

    anchor_boxes = np.vstack(boxes_all)

    anchor_boxes = torch.from_numpy(anchor_boxes.astype(dtype)).to(image.device)
    anchor_boxes = anchor_boxes.unsqueeze(0)

    # save it for later use to reduce overhead
    self.last_anchors[image.device] = anchor_boxes
    return anchor_boxes

class EfficientDetBackbone(nn.Module): def init(self, num_classes=159, compound_coef=0, load_weights=False, **kwargs): super(EfficientDetBackbone, self).init() self.compound_coef = compound_coef

    self.backbone_compound_coef = [0, 1, 2, 3, 4, 5, 6, 6, 7]
    self.fpn_num_filters = [64, 88, 112, 160, 224, 288, 384, 384, 384]
    self.fpn_cell_repeats = [3, 4, 5, 6, 7, 7, 8, 8, 8]
    self.input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536, 1536]
    self.box_class_repeats = [3, 3, 3, 4, 4, 4, 5, 5, 5]
    self.pyramid_levels = [5, 5, 5, 5, 5, 5, 5, 5, 6]
    self.anchor_scale = [4., 4., 4., 4., 4., 4., 4., 5., 4.]
    self.aspect_ratios = kwargs.get('ratios', [(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)])
    self.num_scales = len(kwargs.get('scales', [2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]))
    conv_channel_coef = {
        # the channels of P3/P4/P5.
        0: [40, 112, 320],
        1: [40, 112, 320],
        2: [48, 120, 352],
        3: [48, 136, 384],
        4: [56, 160, 448],
        5: [64, 176, 512],
        6: [72, 200, 576],
        7: [72, 200, 576],
        8: [80, 224, 640],
    }

    num_anchors = len(self.aspect_ratios) * self.num_scales

    self.bifpn = nn.Sequential(
        *[BiFPN(self.fpn_num_filters[self.compound_coef],
                conv_channel_coef[compound_coef],
                True if _ == 0 else False,
                attention=True if compound_coef < 6 else False,
                use_p8=compound_coef > 7)
          for _ in range(self.fpn_cell_repeats[compound_coef])])

    self.num_classes = num_classes
    self.regressor = Regressor(in_channels=self.fpn_num_filters[self.compound_coef], num_anchors=num_anchors,
                               num_layers=self.box_class_repeats[self.compound_coef],
                               pyramid_levels=self.pyramid_levels[self.compound_coef])
    self.classifier = Classifier(in_channels=self.fpn_num_filters[self.compound_coef], num_anchors=num_anchors,
                                 num_classes=num_classes,
                                 num_layers=self.box_class_repeats[self.compound_coef],
                                 pyramid_levels=self.pyramid_levels[self.compound_coef])

    self.anchors = Anchors(anchor_scale=self.anchor_scale[compound_coef],
                           pyramid_levels=(torch.arange(self.pyramid_levels[self.compound_coef]) + 3).tolist(),
                           **kwargs)

    self.backbone_net = EfficientNet(self.backbone_compound_coef[compound_coef], load_weights)

def freeze_bn(self):
    for m in self.modules():
        if isinstance(m, nn.BatchNorm2d):
            m.eval()

def forward(self, inputs):
    max_size = inputs.shape[-1]

    _, p3, p4, p5 = self.backbone_net(inputs)

    features = (p3, p4, p5)
    features = self.bifpn(features)

    regression = self.regressor(features)
    classification = self.classifier(features)
    anchors = self.anchors(inputs, inputs.dtype)

    return features, regression, classification, anchors

def init_backbone(self, path):
    state_dict = torch.load(path)
    try:
        ret = self.load_state_dict(state_dict, strict=False)
        print(ret)
    except RuntimeError as e:
        print('Ignoring ' + str(e) + '"')

class detect: def init(self,threshold=0.2,iou_threshold=0.2,weight_file='',obj_list=[],use_cuda=True, use_float16 = False, compound_coef = 2,force_input_size= None,): self.compound_coef = compound_coef self.force_input_size = force_input_size self.threshold = threshold self.use_cuda = use_cuda self.use_float16 = use_float16 self.iou_threshold = iou_threshold

    self.input_sizes = input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]
    self.input_size = input_sizes[compound_coef] if force_input_size is None else force_input_size
    self.obj_list = obj_list
    self.weight_file = weight_file
    self.model = EfficientDetBackbone(compound_coef=self.compound_coef, num_classes=len(self.obj_list),map_location=lambda storage, loc: storage.cuda(1))
    self.model.load_state_dict(torch.load(self.weight_file))
    self.model.requires_grad_(False)
    self.model.eval()
    if self.use_cuda:
        self.model = self.model.cuda()
    if self.use_float16:
        self.model = self.model.half()
    self.regressBoxes = BBoxTransform()
    self.clipBoxes = ClipBoxes()
def run(self, frame):
    cudnn.enabled = True
    cudnn.benchmark = True
    ori_imgs, framed_imgs, framed_metas = preprocess_video(frame, max_size=self.input_size)
    if self.use_cuda:
        x = torch.stack([torch.from_numpy(fi).cuda() for fi in framed_imgs], 0)
    else:
        x = torch.stack([torch.from_numpy(fi) for fi in framed_imgs], 0)
    x = x.to(torch.float32 if not self.use_float16 else self.torch.float16).permute(0, 3, 1, 2)
    # model predict
    with torch.no_grad():
        features, regression, classification, anchors = self.model(x)
        out = postprocess(x,
                          anchors, regression, classification,
                          self.regressBoxes, self.clipBoxes,
                          self.threshold, self.iou_threshold)

    # result
    out = invert_affine(framed_metas, out)
    return out

def invert_affine(metas: Union[float, list, tuple], preds): for i in range(len(preds)): if len(preds[i]['rois']) == 0: continue else: if metas is float: preds[i]['rois'][:, [0, 2]] = preds[i]['rois'][:, [0, 2]] / metas preds[i]['rois'][:, [1, 3]] = preds[i]['rois'][:, [1, 3]] / metas else: new_w, new_h, old_w, old_h, padding_w, padding_h = metas[i] preds[i]['rois'][:, [0, 2]] = preds[i]['rois'][:, [0, 2]] / (new_w / old_w) preds[i]['rois'][:, [1, 3]] = preds[i]['rois'][:, [1, 3]] / (new_h / old_h) return preds

def aspectaware_resize_padding(image, width, height, interpolation=None, means=None): old_h, old_w, c = image.shape if old_w > old_h: new_w = width new_h = int(width / old_w old_h) else: new_w = int(height / old_h old_w) new_h = height

canvas = np.zeros((height, height, c), np.float32)
if means is not None:
    canvas[...] = means

if new_w != old_w or new_h != old_h:
    if interpolation is None:
        image = cv2.resize(image, (new_w, new_h))
    else:
        image = cv2.resize(image, (new_w, new_h), interpolation=interpolation)

padding_h = height - new_h
padding_w = width - new_w

if c > 1:
    canvas[:new_h, :new_w] = image
else:
    if len(image.shape) == 2:
        canvas[:new_h, :new_w, 0] = image
    else:
        canvas[:new_h, :new_w] = image

return canvas, new_w, new_h, old_w, old_h, padding_w, padding_h,

def preprocess_video(*frame_from_video, max_size=512, mean=(0.406, 0.456, 0.485), std=(0.225, 0.224, 0.229)): ori_imgs = frame_from_video normalized_imgs = [(img / 255 - mean) / std for img in ori_imgs] imgs_meta = [aspectaware_resize_padding(img[..., ::-1], max_size, max_size, means=None) for img in normalized_imgs] framed_imgs = [img_meta[0] for img_meta in imgs_meta] framed_metas = [img_meta[1:] for img_meta in imgs_meta]

return ori_imgs, framed_imgs, framed_metas

def postprocess(x, anchors, regression, classification, regressBoxes, clipBoxes, threshold, iou_threshold): transformed_anchors = regressBoxes(anchors, regression) transformed_anchors = clipBoxes(transformed_anchors, x) scores = torch.max(classification, dim=2, keepdim=True)[0] scores_over_thresh = (scores > threshold)[:, :, 0] out = [] for i in range(x.shape[0]): if scores_over_thresh[i].sum() == 0: out.append({ 'rois': np.array(()), 'class_ids': np.array(()), 'scores': np.array(()), }) continue

    classification_per = classification[i, scores_over_thresh[i, :], ...].permute(1, 0)
    transformed_anchors_per = transformed_anchors[i, scores_over_thresh[i, :], ...]
    scores_per = scores[i, scores_over_thresh[i, :], ...]
    scores_, classes_ = classification_per.max(dim=0)
    anchors_nms_idx = batched_nms(transformed_anchors_per, scores_per[:, 0], classes_,
                                  iou_threshold=iou_threshold)

    if anchors_nms_idx.shape[0] != 0:
        classes_ = classes_[anchors_nms_idx]
        scores_ = scores_[anchors_nms_idx]
        boxes_ = transformed_anchors_per[anchors_nms_idx, :]

        out.append({
            'rois': boxes_.cpu().numpy(),
            'class_ids': classes_.cpu().numpy(),
            'scores': scores_.cpu().numpy(),
        })
    else:
        out.append({
            'rois': np.array(()),
            'class_ids': np.array(()),
            'scores': np.array(()),
        })
return out

if name == 'main':

object_list = ['aeroplane', 'baleiwuqun', 'bangexigua', 'banma', 'bijibendiannao', 'buqiang', 'car_1', 'cat', 'chahu',
    'changjinglu', 'chaoguo', 'chuanzhedequnzi', 'chuanzhedetxu', 'dalingdaixizhuang', 'datiqin', 'dayoulun',
    'dengju', 'denglong', 'dengshanbao', 'diandongche', 'diannaoyi', 'diaodai', 'dog', 'duobaoge', 'erhuan',
    'erji_1', 'fanchuan', 'feitin', 'feixingyuantoukui', 'food_one_plate', 'fuzi', 'gangqin', 'gangqin_1',
    'gaogenxie', 'gaojiaobei', 'gaotie', 'glass', 'gongjiaoche', 'guanzhuangnaifen', 'gudaijiugang', 'guopan',
    'hanbaobao', 'helicopter', 'hongjiu', 'honglvdeng', 'hu_die', 'huaqiu', 'hunsha', 'huwaimao1', 'jeep',
    'jianpan', 'jiaoche', 'jingjurenwu', 'jipuche', 'jita', 'jiuhu', 'jiweijiubei', 'junjian', 'junmao',
    'junzhuang', 'kafeibeici', 'kafeibeizhi', 'kaola', 'kuabao', 'kuaiting', 'laganxiang', 'laohu',
    'laoshijiaoche', 'laoshiliushengji', 'lianxiqun', 'lu', 'lunyi', 'luotuo', 'ma', 'maikefeng_shouli',
    'miandian', 'motuoche', 'motuochetoukui', 'motuoting', 'nanbiao', 'nanshipixie', 'niaolong', 'nvbiao',
    'nvshineiyi', 'nvxingtoushi', 'paoche', 'penzai', 'person_face', 'person_run', 'pijiuping', 'pingbandiannao',
    'putongmotuoche', 'qiaokeli', 'qie', 'renxingdeng', 'reqiqiu', 'shafa', 'shanghaidongfangmingzhudianshita',
    'shengridangao', 'shizhong', 'shoufengqin', 'shouji', 'shounachuifengji', 'shouqiang', 'shoutixiang', 'shu1',
    'shu2', 'shubiao', 'shuilongtou', 'shuiniu', 'songshu', 'suv', 'taideng', 'tanke', 'tushuguan', 'tuzi',
    'wanjupenshuiqiang', 'weika', 'weimian', 'wugui', 'xiandaidianhuazuoji', 'xiangbinta', 'xianglian',
    'xiaotiqin', 'xiaoxueshengshubao', 'xibuniuzaimao', 'xinge', 'xiniu', 'xinlvjianceyi', 'xiong', 'xiongmao',
    'xiongwawa', 'xizhuanglingjie', 'xueren', 'ya_1', 'yangjiu', 'yangjiujiubei', 'yangqin', 'yanying', 'yijia',
    'yingwu', 'yinxiang', 'yipaihongjiuping', 'yizhuang', 'yugang1', 'yuhangyuanquanshen', 'yundongfu',
    'yundongxie', 'yurongfu', 'zhandouji', 'zhangpeng', 'zhaoxiangji', 'zheshan', 'zhongguogudaihuafang',
    'zhongshihunsha', 'zhuangjiache', 'zhuguo', 'zidongshouhuoji', 'zixingche']
weight_file ='/data/Lx/Yet-Another-EfficientDet-Pytorch/logs/val/efficientdet-d2_27_128500.pth'
a = detect(obj_list=object_list,weight_file=weight_file)
t1 = time.time()
c = cv2.imread('/data/Lx/Yet-Another-EfficientDet-Pytorch/cat.jpg')
b  = a.run(c)
t2 = time.time()
tact_time = (t2 - t1)
print(f'{tact_time} seconds, {1 / tact_time} FPS, @batch_size 1')
print(b) @zylo117

zylo117 / Yet-Another-EfficientDet-Pytorch

The speed problem #499