marcoslucianops / DeepStream-Yolo

NVIDIA DeepStream SDK 7.1 / 7.0 / 6.4 / 6.3 / 6.2 / 6.1.1 / 6.1 / 6.0.1 / 6.0 / 5.1 implementation for YOLO models

I added the CoordAttention module to yolov5-6.0. How do I use gen_wts_yoloV5.py, or do I need to modify the project source code? #304

Open hupu1dong opened 1 year ago

hupu1dong commented 1 year ago

My steps:

  1. Add the following code to common.py (a quick shape-check sketch for CoordAtt is included after step 4 below):

    
    class h_sigmoid(nn.Module):
        def __init__(self, inplace=True):
            super(h_sigmoid, self).__init__()
            self.relu = nn.ReLU6(inplace=inplace)

        def forward(self, x):
            return self.relu(x + 3) / 6


    class h_swish(nn.Module):
        def __init__(self, inplace=True):
            super(h_swish, self).__init__()
            self.sigmoid = h_sigmoid(inplace=inplace)

        def forward(self, x):
            return x * self.sigmoid(x)

    class CoordAtt(nn.Module):
        def __init__(self, inp, oup, reduction=32):
            super(CoordAtt, self).__init__()
            self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
            self.pool_w = nn.AdaptiveAvgPool2d((1, None))

            mip = max(8, inp // reduction)

            self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0)
            self.bn1 = nn.BatchNorm2d(mip)
            self.act = h_swish()

            self.conv_h = nn.Conv2d(mip, oup, kernel_size=1, stride=1, padding=0)
            self.conv_w = nn.Conv2d(mip, oup, kernel_size=1, stride=1, padding=0)

        def forward(self, x):
            identity = x

            n, c, h, w = x.size()
            x_h = self.pool_h(x)
            x_w = self.pool_w(x).permute(0, 1, 3, 2)

            y = torch.cat([x_h, x_w], dim=2)
            y = self.conv1(y)
            y = self.bn1(y)
            y = self.act(y)

            x_h, x_w = torch.split(y, [h, w], dim=2)
            x_w = x_w.permute(0, 1, 3, 2)

            a_h = self.conv_h(x_h).sigmoid()
            a_w = self.conv_w(x_w).sigmoid()

            out = identity * a_w * a_h

            return out

    class SELayer(nn.Module):
        def __init__(self, c1, r=16):
            super(SELayer, self).__init__()
            self.avgpool = nn.AdaptiveAvgPool2d(1)
            self.l1 = nn.Linear(c1, c1 // r, bias=False)
            self.relu = nn.ReLU(inplace=True)
            self.l2 = nn.Linear(c1 // r, c1, bias=False)
            self.sig = nn.Sigmoid()

        def forward(self, x):
            b, c, _, _ = x.size()
            y = self.avgpool(x).view(b, c)
            y = self.l1(y)
            y = self.relu(y)
            y = self.l2(y)
            y = self.sig(y)
            y = y.view(b, c, 1, 1)
            return x * y.expand_as(x)

    class eca_layer(nn.Module):
        """Constructs an ECA module.
        Args:
            channel: Number of channels of the input feature map
            k_size: Adaptive selection of kernel size
        """
        def __init__(self, channel, k_size=3):
            super(eca_layer, self).__init__()
            self.avg_pool = nn.AdaptiveAvgPool2d(1)
            self.conv = nn.Conv1d(1, 1, kernel_size=k_size, padding=(k_size - 1) // 2, bias=False)
            self.sigmoid = nn.Sigmoid()

        def forward(self, x):
            # feature descriptor on the global spatial information
            y = self.avg_pool(x)

            # Two different branches of ECA module
            y = self.conv(y.squeeze(-1).transpose(-1, -2)).transpose(-1, -2).unsqueeze(-1)

            # Multi-scale information fusion
            y = self.sigmoid(y)

            return x * y.expand_as(x)

    class ChannelAttention(nn.Module):
        def __init__(self, in_planes, ratio=16):
            super(ChannelAttention, self).__init__()
            self.avg_pool = nn.AdaptiveAvgPool2d(1)
            self.max_pool = nn.AdaptiveMaxPool2d(1)

            self.f1 = nn.Conv2d(in_planes, in_planes // ratio, 1, bias=False)
            self.relu = nn.ReLU()
            self.f2 = nn.Conv2d(in_planes // ratio, in_planes, 1, bias=False)
            self.sigmoid = nn.Sigmoid()

        def forward(self, x):
            avg_out = self.f2(self.relu(self.f1(self.avg_pool(x))))
            max_out = self.f2(self.relu(self.f1(self.max_pool(x))))
            out = self.sigmoid(avg_out + max_out)
            return out

    class SpatialAttention(nn.Module):
        def __init__(self, kernel_size=7):
            super(SpatialAttention, self).__init__()
            assert kernel_size in (3, 7), 'kernel size must be 3 or 7'
            padding = 3 if kernel_size == 7 else 1

            self.conv = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
            self.sigmoid = nn.Sigmoid()

        def forward(self, x):
            avg_out = torch.mean(x, dim=1, keepdim=True)
            max_out, _ = torch.max(x, dim=1, keepdim=True)
            x = torch.cat([avg_out, max_out], dim=1)
            x = self.conv(x)
            return self.sigmoid(x)

    class CBAMC3(nn.Module):
        # CSP Bottleneck with 3 convolutions
        def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
            super(CBAMC3, self).__init__()
            c_ = int(c2 * e)  # hidden channels
            self.cv1 = Conv(c1, c_, 1, 1)
            self.cv2 = Conv(c1, c_, 1, 1)
            self.cv3 = Conv(2 * c_, c2, 1)
            self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])
            self.channel_attention = ChannelAttention(c2, 16)
            self.spatial_attention = SpatialAttention(7)

            # self.m = nn.Sequential(*[CrossConv(c_, c_, 3, 1, g, 1.0, shortcut) for _ in range(n)])

        def forward(self, x):
            out = self.channel_attention(x) * x
            print('outchannels:{}'.format(out.shape))
            out = self.spatial_attention(out) * out
            return out

  2. In yolo.py, add the CoordAtt module to parse_model(d, ch):
    if m in [Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, DWConv, MixConv2d, Focus, CrossConv,
             BottleneckCSP, C3, C3TR, C3SPP, C3Ghost, CoordAtt]:
        c1, c2 = ch[f], args[0]
        if c2 != no:  # if not output
            c2 = make_divisible(c2 * gw, 8)

        args = [c1, c2, *args[1:]]
        if m in [BottleneckCSP, C3, C3TR, C3Ghost]:
            args.insert(2, n)  # number of repeats
            n = 1

  3. I used yolov5s.yaml as the template and inserted the CA (Coordinate Attention) module:

    # YOLOv5 🚀 by Ultralytics, GPL-3.0 license

    # Parameters
    nc: 2  # number of classes
    depth_multiple: 0.33  # model depth multiple
    width_multiple: 0.50  # layer channel multiple
    anchors:
      - [10,13, 16,30, 33,23]  # P3/8
      - [30,61, 62,45, 59,119]  # P4/16
      - [116,90, 156,198, 373,326]  # P5/32

    # YOLOv5 v6.0 backbone
    backbone:
      # [from, number, module, args]
      [[-1, 1, Conv, [64, 6, 2, 2]],  # 0-P1/2
       [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
       [-1, 3, C3, [128]],
       [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
       [-1, 6, C3, [256]],
       [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
       [-1, 9, C3, [512]],
       [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
       [-1, 3, C3, [1024]],
       [-1, 1, CoordAtt, [1024]],
       [-1, 1, SPPF, [1024, 5]],  # 10
      ]

    # YOLOv5 v6.0 head
    head:
      [[-1, 1, Conv, [512, 1, 1]],
       [-1, 1, nn.Upsample, [None, 2, 'nearest']],
       [[-1, 6], 1, Concat, [1]],  # cat backbone P4
       [-1, 3, C3, [512, False]],  # 14

       [-1, 1, Conv, [256, 1, 1]],
       [-1, 1, nn.Upsample, [None, 2, 'nearest']],
       [[-1, 4], 1, Concat, [1]],  # cat backbone P3
       [-1, 3, C3, [256, False]],  # 18 (P3/8-small)

       [-1, 1, Conv, [256, 3, 2]],
       [[-1, 15], 1, Concat, [1]],  # cat head P4
       [-1, 3, C3, [512, False]],  # 21 (P4/16-medium)

       [-1, 1, Conv, [512, 3, 2]],
       [[-1, 11], 1, Concat, [1]],  # cat head P5
       [-1, 3, C3, [1024, False]],  # 24 (P5/32-large)

       [[18, 21, 24], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
      ]



  4. I trained for 408 epochs and got the .pt weights.
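
For reference, here is a quick shape check of the CoordAtt block from step 1 (a minimal sketch only; the channel count and feature-map size are illustrative, and the import assumes the classes were added to models/common.py):

    import torch
    from models.common import CoordAtt  # assumes the step 1 classes were added to models/common.py

    ca = CoordAtt(inp=1024, oup=1024)  # illustrative channels, matching the [-1, 1, CoordAtt, [1024]] yaml entry before width scaling
    x = torch.randn(1, 1024, 20, 20)   # dummy feature map (N, C, H, W)
    y = ca(x)
    assert y.shape == x.shape          # an attention block must preserve the feature-map shape
    print(y.shape)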

How do I use gen_wts_yoloV5.py? Do I need to modify the project source code, and if so, how?
marcoslucianops commented 1 year ago

For now, this layer isn't supported. I will look into adding it in the next update.

hupu1dong commented 1 year ago

I look forward to this layer being supported.

Alberto1404 commented 1 year ago

What is this for?

marcoslucianops commented 1 year ago

> I look forward to this layer being supported.

Now you can use the ONNX conversion for DeepStream. It will probably work with your custom layers now.

YOLOv5
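
For reference, the ONNX conversion here means exporting the trained .pt weights to an .onnx file that DeepStream can load. A rough sketch with plain torch.onnx.export is below (best.pt, the 640x640 input size, and the opset are only assumptions; for the DeepStream-specific output layout, follow the export script referenced in the YOLOv5 doc linked above):

    import torch
    from models.experimental import attempt_load  # YOLOv5 v6.0 helper for loading a .pt checkpoint

    model = attempt_load('best.pt', map_location='cpu')  # 'best.pt' is assumed to be the weights from step 4
    model.eval()

    dummy = torch.zeros(1, 3, 640, 640)  # assumed export resolution
    torch.onnx.export(model, dummy, 'best.onnx', opset_version=12,
                      input_names=['images'], output_names=['output'])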