💡💡💡本文独家改进:RepGhost,通过重参数化实现硬件高效的Ghost模块,性能优于GhostNet、MobileNetV3等,在移动设备上具有更少的参数和可比的延迟。
RepGhost和C2f结合 | 在轻量化的同时,在数据集上还有小幅涨点;
性能比较
model | layers | parameters | GFLOPs | kb |
---|---|---|---|---|
YOLOv8s | 168 | 11125971 | 28.4 | 21991 |
yolov8_C2f_repghosts | 352 | 2589699 | 7 | 5341 |
论文:https://arxiv.org/pdf/2211.06088.pdf
特征重用一直是轻量级卷积神经网络(CNN)设计中的关键技术。当前的方法通常利用级联运算符通过重用来自其他层的特征图来廉价地保持大通道数(从而大网络容量)。尽管级联是无参数和无FLOPs的,但其在硬件设备上的计算成本是不可忽略的。
为了解决这个问题,本文提供了一个通过结构重参数化技术实现特征重用的新视角。提出了一种新的硬件高效的RepGhost模块,用于通过重参数化实现隐式特征重用,而不是使用级联运算符。
图3.从Ghost模块到RepGhost模块的演变。我们省略了输入的1x1卷积,更多的结构细节请参见图4。dconv:深度卷积层(后跟BN)。Cat:连接层。a)带有ReLU的Ghost模块[14];b)用add替换concat;c)向后移动ReLU,使模块满足结构重参数化的规则;d)训练过程中的RepGhost模块;e)推理过程中的RepGhost模块。模块c和模块d可以在推理过程中融合到模块e中。
图4:与Ghost bottleneck[14]相比。S Block:跳跃连接块,DS:下采样层,SE:SE块[21]。RG-block:RepGhost bottleneck。虚线中的方块只在必要时插入。Cin、Cmid和Cout分别表示bottleneck的输入通道、中间通道和输出通道。请注意,RepGhost bottleneck与Ghost bottleneck的区别在于bottleneck的内部通道数(图中以红色标出)。
核心代码:
class RepGhostModule(nn.Module):
    """Ghost-style module whose parallel branches can be fused for inference.

    Training-time data flow (``deploy=False``)::

        x1  = primary_conv(x)                # conv -> BN -> (SiLU)
        out = cheap_operation(x1)            # depthwise conv -> BN
        out = out + sum(bn_i(conv_i(x1)))    # optional fusion branches
        out = relu(out)                      # SiLU or no-op

    ``switch_to_deploy()`` folds the depthwise conv, its BatchNorm and every
    fusion branch into a single depthwise ``nn.Conv2d`` with bias, after
    which ``forward`` only runs ``primary_conv -> cheap_operation -> relu``.

    Args:
        inp: Number of input channels.
        oup: Number of output channels (used for both internal stages).
        kernel_size: Kernel size of the primary convolution.
        dw_size: Kernel size of the depthwise "cheap" convolution.  Must be
            odd so that ``dw_size // 2`` padding preserves the spatial size.
        stride: Stride of the primary convolution.
        relu: If true, apply an activation after the primary conv and on the
            output.  Despite the name the activation used is ``nn.SiLU``.
        deploy: Build the already-fused inference structure directly.
        reparam_bn: Add an ``Identity -> BatchNorm2d`` fusion branch
            (training structure only).
        reparam_identity: Add an ``Identity -> Identity`` fusion branch
            (training structure only).
    """

    def __init__(
        self, inp, oup, kernel_size=1, dw_size=3, stride=1, relu=True, deploy=False, reparam_bn=True,
        reparam_identity=False
    ):
        super().__init__()
        init_channels = oup
        new_channels = oup
        self.deploy = deploy

        self.primary_conv = nn.Sequential(
            nn.Conv2d(
                inp, init_channels, kernel_size, stride, kernel_size // 2, bias=False,
            ),
            nn.BatchNorm2d(init_channels),
            nn.SiLU(inplace=True) if relu else nn.Sequential(),
        )

        # Parallel branches applied to the primary output and summed with the
        # cheap operation; they only exist in the training-time structure.
        fusion_conv = []
        fusion_bn = []
        if not deploy and reparam_bn:
            fusion_conv.append(nn.Identity())
            fusion_bn.append(nn.BatchNorm2d(init_channels))
        if not deploy and reparam_identity:
            fusion_conv.append(nn.Identity())
            fusion_bn.append(nn.Identity())
        self.fusion_conv = nn.Sequential(*fusion_conv)
        self.fusion_bn = nn.Sequential(*fusion_bn)

        # Depthwise conv (+ BN while training).  In deploy mode the conv
        # carries the fused bias itself and the BN is dropped.
        self.cheap_operation = nn.Sequential(
            nn.Conv2d(
                init_channels,
                new_channels,
                dw_size,
                1,
                dw_size // 2,
                groups=init_channels,
                bias=deploy,
            ),
            nn.BatchNorm2d(new_channels) if not deploy else nn.Sequential(),
        )
        if deploy:
            self.cheap_operation = self.cheap_operation[0]

        self.relu = nn.SiLU(inplace=False) if relu else nn.Sequential()

    def forward(self, x):
        """Run the module; fusion branches are empty once deployed."""
        x1 = self.primary_conv(x)
        x2 = self.cheap_operation(x1)
        for conv, bn in zip(self.fusion_conv, self.fusion_bn):
            x2 = x2 + bn(conv(x1))
        return self.relu(x2)

    def get_equivalent_kernel_bias(self):
        """Return ``(kernel, bias)`` of the single depthwise conv equivalent
        to the cheap operation plus all fusion branches.

        Only valid for the training-time structure, where
        ``self.cheap_operation`` is a ``Sequential(conv, bn)``.
        """
        kernel, bias = self._fuse_bn_tensor(self.cheap_operation[0], self.cheap_operation[1])
        # The 1x1 branch kernels must be zero-padded up to the depthwise
        # kernel size, whatever ``dw_size`` was (not just 3x3).
        pad = kernel.shape[-1] // 2
        for conv, bn in zip(self.fusion_conv, self.fusion_bn):
            branch_kernel, branch_bias = self._fuse_bn_tensor(conv, bn, kernel.shape[0], kernel.device)
            kernel = kernel + self._pad_1x1_to_3x3_tensor(branch_kernel, pad)
            bias = bias + branch_bias
        return kernel, bias

    @staticmethod
    def _pad_1x1_to_3x3_tensor(kernel1x1, pad=1):
        """Zero-pad a 1x1 kernel by ``pad`` on every spatial side.

        With the default ``pad=1`` the result is 3x3 (hence the name);
        ``pad=k // 2`` yields a ``k x k`` kernel for odd ``k``.
        Returns ``0`` when ``kernel1x1`` is ``None``.
        """
        if kernel1x1 is None:
            return 0
        return torch.nn.functional.pad(kernel1x1, [pad, pad, pad, pad])

    @staticmethod
    def _fuse_bn_tensor(conv, bn, in_channels=None, device=None):
        """Fold ``bn`` into ``conv`` and return the fused ``(kernel, bias)``.

        ``conv`` is either a bias-free ``nn.Conv2d`` or ``nn.Identity`` (which
        is treated as a depthwise 1x1 kernel of ones); ``bn`` is either
        ``nn.BatchNorm2d`` (its running statistics are used) or
        ``nn.Identity``.  ``in_channels`` and ``device`` are required when
        they cannot be read from ``bn``.
        """
        in_channels = in_channels if in_channels else bn.running_mean.shape[0]
        device = device if device else bn.weight.device
        if isinstance(conv, nn.Conv2d):
            kernel = conv.weight
            assert conv.bias is None
        else:
            assert isinstance(conv, nn.Identity)
            # Depthwise identity: one 1x1 filter with weight 1 per channel.
            kernel = torch.ones(in_channels, 1, 1, 1, dtype=torch.float32, device=device)

        if isinstance(bn, nn.BatchNorm2d):
            running_mean = bn.running_mean
            gamma = bn.weight
            beta = bn.bias
            std = (bn.running_var + bn.eps).sqrt()
            t = (gamma / std).reshape(-1, 1, 1, 1)
            return kernel * t, beta - running_mean * gamma / std
        assert isinstance(bn, nn.Identity)
        return kernel, torch.zeros(in_channels).to(kernel.device)

    def switch_to_deploy(self):
        """Fuse the training-time branches into one depthwise conv in place.

        Does nothing when there are no fusion branches (including when the
        module has already been deployed).
        """
        if len(self.fusion_conv) == 0 and len(self.fusion_bn) == 0:
            return
        kernel, bias = self.get_equivalent_kernel_bias()
        dw_conv = self.cheap_operation[0]
        fused = nn.Conv2d(
            in_channels=dw_conv.in_channels,
            out_channels=dw_conv.out_channels,
            kernel_size=dw_conv.kernel_size,
            stride=dw_conv.stride,
            padding=dw_conv.padding,
            dilation=dw_conv.dilation,
            groups=dw_conv.groups,
            bias=True,
        )
        # Detach so the stored parameters carry no autograd history from the
        # fusion arithmetic.
        fused.weight.data = kernel.detach()
        fused.bias.data = bias.detach()
        self.cheap_operation = fused
        # Unregister the branch modules, then leave empty lists so that
        # ``forward`` can still iterate over them.
        del self.fusion_conv
        del self.fusion_bn
        self.fusion_conv = []
        self.fusion_bn = []
        self.deploy = True
class RepGhostBottleneck(nn.Module):
    """Bottleneck built from two RepGhost modules, with optional SE.

    Pipeline: ``ghost1`` (expansion, activated) -> strided depthwise conv + BN
    (only when ``stride > 1``) -> squeeze-and-excite (only when
    ``se_ratio > 0``) -> ``ghost2`` (projection, no activation) -> residual
    add.

    The residual add is skipped only when ``shortcut`` is false *and* the
    block keeps both channel count and resolution (``in_chs == out_chs`` and
    ``stride == 1``); in every other configuration the shortcut path is added.
    """

    def __init__(
        self,
        in_chs,
        mid_chs,
        out_chs,
        dw_kernel_size=3,
        stride=1,
        se_ratio=0.0,
        shortcut=True,
        reparam=True,
        reparam_bn=True,
        reparam_identity=False,
        deploy=False,
    ):
        super().__init__()
        self.stride = stride
        self.enable_shortcut = shortcut
        self.in_chs = in_chs
        self.out_chs = out_chs

        # Both ghost modules share the same re-parameterization settings;
        # `reparam` acts as a master switch over the two branch flags.
        use_bn_branch = reparam and reparam_bn
        use_identity_branch = reparam and reparam_identity
        dw_padding = (dw_kernel_size - 1) // 2

        # Point-wise expansion
        self.ghost1 = RepGhostModule(
            in_chs,
            mid_chs,
            relu=True,
            reparam_bn=use_bn_branch,
            reparam_identity=use_identity_branch,
            deploy=deploy,
        )

        # Depth-wise convolution, only needed for spatial down-sampling
        if self.stride > 1:
            self.conv_dw = nn.Conv2d(
                mid_chs,
                mid_chs,
                dw_kernel_size,
                stride=stride,
                padding=dw_padding,
                groups=mid_chs,
                bias=False,
            )
            self.bn_dw = nn.BatchNorm2d(mid_chs)

        # Squeeze-and-excitation
        wants_se = se_ratio is not None and se_ratio > 0.0
        self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio) if wants_se else None

        # Point-wise linear projection
        self.ghost2 = RepGhostModule(
            mid_chs,
            out_chs,
            relu=False,
            reparam_bn=use_bn_branch,
            reparam_identity=use_identity_branch,
            deploy=deploy,
        )

        # Shortcut: identity when shapes already match, otherwise a
        # depthwise (strided) conv followed by a 1x1 projection.
        shapes_match = in_chs == out_chs and self.stride == 1
        if shapes_match:
            self.shortcut = nn.Sequential()
        else:
            depthwise = nn.Conv2d(
                in_chs,
                in_chs,
                dw_kernel_size,
                stride=stride,
                padding=dw_padding,
                groups=in_chs,
                bias=False,
            )
            depthwise_bn = nn.BatchNorm2d(in_chs)
            pointwise = nn.Conv2d(
                in_chs, out_chs, 1, stride=1,
                padding=0, bias=False,
            )
            pointwise_bn = nn.BatchNorm2d(out_chs)
            self.shortcut = nn.Sequential(depthwise, depthwise_bn, pointwise, pointwise_bn)

    def forward(self, x):
        """Apply the bottleneck and, unless disabled, add the shortcut path."""
        identity = x

        out = self.ghost1(x)
        if self.stride > 1:
            out = self.bn_dw(self.conv_dw(out))
        if self.se is not None:
            out = self.se(out)
        out = self.ghost2(out)

        shapes_match = self.in_chs == self.out_chs and self.stride == 1
        if shapes_match and not self.enable_shortcut:
            return out
        return out + self.shortcut(identity)
详见:
https://blog.csdn.net/m0_63774211/article/details/132022000
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。