Abstract
Paper link: https://arxiv.org/ftp/arxiv/papers/2206/2206.02424.pdf
The authors study generic methods for strengthening a CNN's learning ability, such as DenseNet, VoVNet, and CSPNet, and then design the Slim-Neck structure based on the theory behind these methods.
The lightweight convolution method GSConv is used to replace standard convolution (SC). Its computational cost is about 60%~70% of SC's, but its contribution to the model's learning ability is on par with the latter. Building on GSConv, the GSbottleneck is then introduced; figure (a) below shows the structure of the GSbottleneck module.
Similarly, the one-shot aggregation method is used to design the cross-stage partial network (GSCSP) module VoV-GSCSP. The VoV-GSCSP module reduces computational and structural complexity while keeping sufficient accuracy. (b) shows the structure of VoV-GSCSP. When VoV-GSCSP replaces the Neck's CSP layers, which are built from standard convolutions, FLOPs drop by 15.72% on average.
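As a quick back-of-envelope check of the 60%~70% figure (my own sketch, not from the paper; the layer sizes are arbitrary): a k×k standard convolution costs roughly k²·C1·C2·H·W multiply-adds, while GSConv pays for a standard convolution down to C2/2 channels plus a 5×5 depthwise convolution over those C2/2 channels.

def sc_flops(k, c1, c2, h, w):
    # dense k x k convolution: k*k*c1 multiply-adds per output element
    return k * k * c1 * c2 * h * w

def gsconv_flops(k, c1, c2, h, w):
    half = c2 // 2
    dense = k * k * c1 * half * h * w  # SC branch producing c2/2 channels
    dw = 5 * 5 * half * h * w          # 5x5 depthwise conv on those c2/2 channels
    return dense + dw

k, c1, c2, h, w = 3, 128, 256, 40, 40
print(gsconv_flops(k, c1, c2, h, w) / sc_flops(k, c1, c2, h, w))  # ~0.51

The raw multiply-add ratio lands near 50% for typical sizes; the quoted 60%~70% cost presumably also reflects the shuffle and memory traffic that FLOP counts ignore.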
Finally, the three modules (GSConv, GSbottleneck, and VoV-GSCSP) need to be used flexibly.
The authors use the slim-neck modules to rebuild the Neck of Scaled-YOLOv4 and YOLOv5, as shown in the figure above. The results are as follows:
Judging by those results, the gains on YOLOv5 look solid. So what happens if we use it to improve YOLOv8? First, the baseline YOLOv8l on my dataset:
YOLOv8l summary (fused): 268 layers, 43631280 parameters, 0 gradients, 165.0 GFLOPs
Class Images Instances Box(P R mAP50 mAP50-95): 100%|██████████| 29/29
all 230 1412 0.922 0.957 0.986 0.737
c17 230 131 0.973 0.992 0.995 0.825
c5 230 68 0.945 1 0.995 0.836
helicopter 230 43 0.96 0.907 0.951 0.607
c130 230 85 0.984 1 0.995 0.655
f16 230 57 0.955 0.965 0.985 0.669
b2 230 2 0.704 1 0.995 0.722
other 230 86 0.903 0.942 0.963 0.534
b52 230 70 0.96 0.971 0.978 0.831
kc10 230 62 0.999 0.984 0.99 0.847
command 230 40 0.97 1 0.995 0.811
f15 230 123 0.891 1 0.992 0.701
kc135 230 91 0.971 0.989 0.986 0.712
a10 230 27 1 0.555 0.899 0.456
b1 230 20 0.972 1 0.995 0.793
aew 230 25 0.945 1 0.99 0.784
f22 230 17 0.913 1 0.995 0.725
p3 230 105 0.99 1 0.995 0.801
p8 230 1 0.637 1 0.995 0.597
f35 230 32 0.939 0.938 0.978 0.574
f18 230 125 0.985 0.992 0.987 0.817
v22 230 41 0.983 1 0.995 0.69
su-27 230 31 0.925 1 0.995 0.859
il-38 230 27 0.972 1 0.995 0.811
tu-134 230 1 0.663 1 0.995 0.895
su-33 230 2 1 0.611 0.995 0.796
an-70 230 2 0.766 1 0.995 0.73
tu-22 230 98 0.984 1 0.995 0.831
Speed: 0.2ms preprocess, 3.8ms inference, 0.0ms loss, 0.8ms postprocess per image
The Slim-Neck modules below follow the official implementation (https://github.com/AlanLi1997/slim-neck-by-gsconv); I've added the imports and comments. Conv and DWConv are the standard YOLOv8 conv blocks (adjust the import if you keep this file elsewhere in the package):

import torch
import torch.nn as nn
from ultralytics.nn.modules import Conv, DWConv


def autopad(k, p=None):  # kernel, padding
    # Pad to 'same' output size
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]  # auto-pad
    return p
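# Illustrative behaviour of autopad (my own note):
#   autopad(3)      -> 1       'same' padding for a 3x3 kernel
#   autopad((3, 5)) -> [1, 2]  per-dimension padding for non-square kernels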
class GSConv(nn.Module):
    # GSConv https://github.com/AlanLi1997/slim-neck-by-gsconv
    def __init__(self, c1, c2, k=1, s=1, g=1, act=True):
        super().__init__()
        c_ = c2 // 2
        # act is passed by keyword: YOLOv8's Conv has a dilation arg before act
        self.cv1 = Conv(c1, c_, k, s, None, g, act=act)   # standard conv to half the output channels
        self.cv2 = Conv(c_, c_, 5, 1, None, c_, act=act)  # 5x5 depthwise conv (groups = channels)

    def forward(self, x):
        x1 = self.cv1(x)
        x2 = torch.cat((x1, self.cv2(x1)), 1)  # concat the SC and DSC branches
        # channel shuffle: interleave the two halves so SC and DSC features mix
        b, n, h, w = x2.size()
        b_n = b * n // 2
        y = x2.reshape(b_n, 2, h * w)
        y = y.permute(1, 0, 2)
        y = y.reshape(2, -1, n // 2, h, w)
        return torch.cat((y[0], y[1]), 1)
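# Shuffle illustration (my own note): with n = 4 channels ordered
# [s0, s1, d0, d1] (s* from cv1/SC, d* from cv2/DSC), the reshape/permute
# above reorders them to [s0, d0, s1, d1], interleaving SC and DSC features.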
class GSConvns(GSConv):
    # GSConv with a normative-shuffle https://github.com/AlanLi1997/slim-neck-by-gsconv
    def __init__(self, c1, c2, k=1, s=1, g=1, act=True):
        super().__init__(c1, c2, k, s, g, act)  # pass the args through rather than hard-coding k=1, s=1
        c_ = c2 // 2
        self.shuf = nn.Conv2d(c_ * 2, c2, 1, 1, 0, bias=False)  # 1x1 conv replaces the reshape/permute shuffle

    def forward(self, x):
        x1 = self.cv1(x)
        x2 = torch.cat((x1, self.cv2(x1)), 1)
        # normative-shuffle, TRT supported (no reshape/permute ops)
        # apply ReLU via the functional API (nn.ReLU(tensor) would construct a module, not apply it)
        return nn.functional.relu(self.shuf(x2))
class GSBottleneck(nn.Module):
    # GS Bottleneck https://github.com/AlanLi1997/slim-neck-by-gsconv
    def __init__(self, c1, c2, k=3, s=1, e=0.5):
        super().__init__()
        c_ = int(c2 * e)
        # lightweight branch: two stacked GSConvs
        self.conv_lighting = nn.Sequential(
            GSConv(c1, c_, 1, 1),
            GSConv(c_, c2, 3, 1, act=False))
        self.shortcut = Conv(c1, c2, 1, 1, act=False)  # 1x1 conv shortcut

    def forward(self, x):
        return self.conv_lighting(x) + self.shortcut(x)


class GSBottleneckC(GSBottleneck):
    # cheap GS Bottleneck https://github.com/AlanLi1997/slim-neck-by-gsconv
    def __init__(self, c1, c2, k=3, s=1):
        super().__init__(c1, c2, k, s)
        self.shortcut = DWConv(c1, c2, k, s, act=False)  # depthwise shortcut is cheaper
class VoVGSCSP(nn.Module):
    # VoVGSCSP module with GSBottleneck
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.gsb = nn.Sequential(*(GSBottleneck(c_, c_, e=1.0) for _ in range(n)))
        self.res = Conv(c_, c_, 3, 1, act=False)  # kept from the reference code; unused in forward
        self.cv3 = Conv(2 * c_, c2, 1)

    def forward(self, x):
        x1 = self.gsb(self.cv1(x))  # GS bottleneck branch
        y = self.cv2(x)             # cross-stage branch
        return self.cv3(torch.cat((y, x1), dim=1))
class VoVGSCSPC(VoVGSCSP):
    # cheap VoVGSCSP module with GSBottleneckC
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        super().__init__(c1, c2)
        c_ = int(c2 * 0.5)  # hidden channels
        self.gsb = GSBottleneckC(c_, c_, 1, 1)  # swap in the cheap (depthwise-shortcut) bottleneck
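Before plugging the modules into a model, a minimal smoke test (my own sketch; it assumes Conv and DWConv were imported as shown at the top of the listing):

import torch

x = torch.randn(1, 64, 40, 40)         # dummy neck feature map
print(GSConv(64, 128, 3, 2)(x).shape)  # torch.Size([1, 128, 20, 20])
print(GSBottleneck(64, 64)(x).shape)   # torch.Size([1, 64, 40, 40])
print(VoVGSCSP(64, 64, n=1)(x).shape)  # torch.Size([1, 64, 40, 40])

To actually use them in YOLOv8, the new classes still have to be registered: typically that means importing them in ultralytics/nn/tasks.py, adding them to the module branches inside parse_model, and replacing the corresponding Conv/C2f entries in the neck section of the model YAML (the article linked at the end has the full walkthrough and code). Validating the modified YOLOv8l on the same dataset: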
YOLOv8l summary (fused): 426 layers, 33290544 parameters, 0 gradients, 134.7 GFLOPs
Class Images Instances Box(P R mAP50 mAP50-95): 100%|██████████| 15/15 [00:13<00:00, 1.11it/s]
all 230 1412 0.974 0.959 0.983 0.745
c17 230 131 0.991 0.992 0.995 0.828
c5 230 68 0.964 0.985 0.994 0.836
helicopter 230 43 0.975 0.924 0.967 0.606
c130 230 85 0.996 0.965 0.995 0.663
f16 230 57 1 0.913 0.992 0.667
b2 230 2 0.942 1 0.995 0.75
other 230 86 0.974 0.888 0.969 0.541
b52 230 70 0.982 0.971 0.975 0.82
kc10 230 62 0.996 0.968 0.987 0.816
command 230 40 0.996 1 0.995 0.851
f15 230 123 0.967 0.966 0.992 0.69
kc135 230 91 0.989 0.988 0.982 0.709
a10 230 27 1 0.52 0.781 0.377
b1 230 20 1 0.953 0.995 0.734
aew 230 25 0.955 1 0.993 0.769
f22 230 17 0.908 1 0.995 0.748
p3 230 105 1 0.965 0.995 0.806
p8 230 1 0.899 1 0.995 0.895
f35 230 32 0.966 0.892 0.987 0.571
f18 230 125 0.992 0.99 0.99 0.824
v22 230 41 0.997 1 0.995 0.725
su-27 230 31 0.992 1 0.995 0.83
il-38 230 27 0.993 1 0.995 0.825
tu-134 230 1 0.895 1 0.995 0.895
su-33 230 2 1 1 0.995 0.759
an-70 230 2 0.925 1 0.995 0.749
tu-22 230 98 0.999 1 0.995 0.815
Speed: 0.5ms preprocess, 7.6ms inference, 0.0ms loss, 7.8ms postprocess per image
mAP50-95 improves a bit (0.737 → 0.745) while mAP50 drops slightly (0.986 → 0.983), and both compute and parameter count go down noticeably (165.0 → 134.7 GFLOPs, 43.6M → 33.3M parameters).
Article and code:
https://blog.csdn.net/m0_47867638/article/details/132645114?spm=1001.2014.3001.5501