[Open] Issue opened by ffxxy, 2 months ago
class AdditiveTokenMixer(nn.Module):
    """Additive token mixer.

    Changed the input of ``proj``: instead of convolving over ``q + k``
    directly, ``proj`` is applied to the fused result ``dwc(q + k) * v``.
    No N x N attention matrix is formed; the "attention" weights come from
    an additive combination of q and k, applied to v elementwise.

    Args:
        dim: number of input/output channels.
        attn_bias: whether the 1x1 qkv convolution uses a bias term.
        proj_drop: dropout probability applied after the output projection.
    """

    def __init__(self, dim=512, attn_bias=False, proj_drop=0.):
        # NOTE(review): the pasted snippet read ``def init`` / ``super().init()``;
        # markdown stripped the double underscores — restored to
        # ``__init__`` / ``super().__init__()`` here.
        super().__init__()
        # Single 1x1 conv producing q, k, v in one pass (3*dim channels).
        self.qkv = nn.Conv2d(dim, 3 * dim, 1, stride=1, padding=0, bias=attn_bias)
        # Spatial then channel gating, applied to q and k independently.
        # (SpatialOperation / ChannelOperation are defined elsewhere in the
        # project — presumably lightweight gating modules; not visible here.)
        self.oper_q = nn.Sequential(
            SpatialOperation(dim),
            ChannelOperation(dim),
        )
        self.oper_k = nn.Sequential(
            SpatialOperation(dim),
            ChannelOperation(dim),
        )
        # Depthwise 3x3 conv that mixes the additive combination q + k.
        self.dwc = nn.Conv2d(dim, dim, 3, 1, 1, groups=dim)
        # Depthwise 3x3 output projection (used instead of a pointwise conv,
        # per the class docstring's stated change).
        self.proj = nn.Conv2d(dim, dim, 3, 1, 1, groups=dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        """Return ``proj_drop(proj(dwc(q + k) * v))`` for input ``x``.

        ``x`` is assumed to be a 4D (B, C, H, W) tensor — TODO confirm
        against callers; the Conv2d layers require it.
        """
        q, k, v = self.qkv(x).chunk(3, dim=1)
        q = self.oper_q(q)
        k = self.oper_k(k)
        # Additive attention: per-position weights from dwc(q + k) scale v
        # elementwise — no dot-product similarity matrix is computed.
        out = self.proj(self.dwc(q + k) * v)
        out = self.proj_drop(out)
        return out
这个类里面,似乎没有自注意力矩阵,self.dwc(q + k) * v,这里似乎并没有上下文交互。这违背了transformer的初衷。
自注意力机制可以被看作对图像各块的特征的加权求和,权重是由相似度得到。这个工作没用点积相似度,而是用了一种加性注意力来获取对V的组合权重
class AdditiveTokenMixer(nn.Module): """ 改变了proj函数的输入,不对q+k卷积,而是对融合之后的结果proj """ def init(self, dim=512, attn_bias=False, proj_drop=0.): super().init() self.qkv = nn.Conv2d(dim, 3 * dim, 1, stride=1, padding=0, bias=attn_bias) self.oper_q = nn.Sequential( SpatialOperation(dim), ChannelOperation(dim), ) self.oper_k = nn.Sequential( SpatialOperation(dim), ChannelOperation(dim), ) self.dwc = nn.Conv2d(dim, dim, 3, 1, 1, groups=dim)
这个类里面,似乎没有自注意力矩阵,self.dwc(q + k) * v,这里似乎并没有上下文交互。这违背了transformer的初衷。