Closed: HYUNJS closed this issue 3 years ago
Hi~
Below is the code of multi-head attention in the dynamic head (a modified `RCNNHead`):

```python
class RCNNHead(nn.Module):

    def __init__(self, cfg, d_model, num_classes, dim_feedforward=2048, nhead=8,
                 dropout=0.1, activation="relu",
                 scale_clamp: float = _DEFAULT_SCALE_CLAMP,
                 bbox_weights=(2.0, 2.0, 1.0, 1.0)):
        super().__init__()

        self.d_model = d_model
        self.attention = cfg.MODEL.SparseRCNN.ATTENTION

        # head.
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        if self.attention:
            # instance interaction via multi-head cross-attention
            self.inst_interact = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        else:
            # original dynamic instance interaction
            self.inst_interact = DynamicConv(cfg)

        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)

        # cls.
        num_cls = cfg.MODEL.SparseRCNN.NUM_CLS
        cls_module = list()
        for _ in range(num_cls):
            cls_module.append(nn.Linear(d_model, d_model, False))
            cls_module.append(nn.LayerNorm(d_model))
            cls_module.append(nn.ReLU(inplace=True))
        self.cls_module = nn.ModuleList(cls_module)

        # reg.
        num_reg = cfg.MODEL.SparseRCNN.NUM_REG
        reg_module = list()
        for _ in range(num_reg):
            reg_module.append(nn.Linear(d_model, d_model, False))
            reg_module.append(nn.LayerNorm(d_model))
            reg_module.append(nn.ReLU(inplace=True))
        self.reg_module = nn.ModuleList(reg_module)

        # pred.
        self.use_focal = cfg.MODEL.SparseRCNN.USE_FOCAL
        if self.use_focal:
            self.class_logits = nn.Linear(d_model, num_classes)
        else:
            self.class_logits = nn.Linear(d_model, num_classes + 1)
        self.bboxes_delta = nn.Linear(d_model, 4)
        self.scale_clamp = scale_clamp
        self.bbox_weights = bbox_weights

    def forward(self, features, bboxes, pro_features, pooler):
        """
        :param bboxes: (N, nr_boxes, 4)
        :param pro_features: (N, nr_boxes, d_model)
        """
        N, nr_boxes = bboxes.shape[:2]

        # roi_feature.
        proposal_boxes = list()
        for b in range(N):
            proposal_boxes.append(Boxes(bboxes[b]))
        roi_features = pooler(features, proposal_boxes)
        # (N*nr_boxes, d_model, 7*7) -> (7*7, N*nr_boxes, d_model)
        roi_features = roi_features.view(N * nr_boxes, self.d_model, -1).permute(2, 0, 1)

        # self_att.
        pro_features = pro_features.view(N, nr_boxes, self.d_model).permute(1, 0, 2)
        pro_features2 = self.self_attn(pro_features, pro_features, value=pro_features)[0]
        pro_features = pro_features + self.dropout1(pro_features2)
        pro_features = self.norm1(pro_features)

        # inst_interact.
        pro_features = pro_features.view(nr_boxes, N, self.d_model).permute(1, 0, 2).reshape(1, N * nr_boxes, self.d_model)
        if self.attention:
            # each proposal feature (length-1 query) attends over its own 7x7 RoI features (key/value)
            pro_features2 = self.inst_interact(query=pro_features, key=roi_features, value=roi_features)[0]
        else:
            pro_features2 = self.inst_interact(pro_features, roi_features)
        pro_features = pro_features + self.dropout2(pro_features2)
        obj_features = self.norm2(pro_features)

        # obj_feature.
        obj_features2 = self.linear2(self.dropout(self.activation(self.linear1(obj_features))))
        obj_features = obj_features + self.dropout3(obj_features2)
        obj_features = self.norm3(obj_features)

        fc_feature = obj_features.transpose(0, 1).reshape(N * nr_boxes, -1)
        cls_feature = fc_feature.clone()
        reg_feature = fc_feature.clone()
        for cls_layer in self.cls_module:
            cls_feature = cls_layer(cls_feature)
        for reg_layer in self.reg_module:
            reg_feature = reg_layer(reg_feature)
        class_logits = self.class_logits(cls_feature)
        bboxes_deltas = self.bboxes_delta(reg_feature)
        # apply_deltas is unchanged from the original RCNNHead and omitted here.
        pred_bboxes = self.apply_deltas(bboxes_deltas, bboxes.view(-1, 4))

        return class_logits.view(N, nr_boxes, -1), pred_bboxes.view(N, nr_boxes, -1), obj_features
```
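For reference, here is a minimal, standalone shape check of the `inst_interact` cross-attention used above (not from the repo; the sizes are made-up values just for the check). The proposal features form a length-1 query per box and the 7x7 RoI features form the key/value sequence, so the flattened RoI features do not need to be reduced to dimension C first:

```python
import torch
from torch import nn

# Hypothetical sizes, chosen only for this check.
N, nr_boxes, d_model, nhead = 2, 100, 256, 8

inst_interact = nn.MultiheadAttention(d_model, nhead, dropout=0.1)

# Shapes as produced inside RCNNHead.forward above:
pro_features = torch.randn(1, N * nr_boxes, d_model)      # query: one token per proposal
roi_features = torch.randn(7 * 7, N * nr_boxes, d_model)  # key/value: 49 spatial tokens per proposal

out = inst_interact(query=pro_features, key=roi_features, value=roi_features)[0]
print(out.shape)  # torch.Size([1, 200, 256]), i.e. same shape as the query
```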
Thank you for your answer :)
@HYUNJS Hello, we have re-checked the 37.2 vs. 35.7 numbers and there is no problem; we are sorry we did not describe this clearly in the paper. For the 35.7 result, just replace the dynamic module with multi-head attention in our code. For 37.2, we use a modified fast head that is much more complicated than the FFN in our current dynamic instance interaction head, in order to reach a comparable result (we found that simply using the Fast R-CNN style head (2-fc) performs badly, AP < 20). Here is the code of the modified fast head. If you are interested, replace the head at https://github.com/PeizeSun/SparseR-CNN/blob/main/projects/SparseRCNN/sparsercnn/head.py#L46 and set https://github.com/PeizeSun/SparseR-CNN/blob/main/projects/SparseRCNN/sparsercnn/detector.py#L56 to None; if any other variable then raises an issue, set that variable to None as well, and you will reproduce the result (~38 AP in our current code).
```python
import copy
import math
from typing import Optional, List

import torch
import torch.nn.functional as F
from torch import nn, Tensor

from detectron2.modeling.poolers import ROIPooler, cat
from detectron2.structures import Boxes

_DEFAULT_SCALE_CLAMP = math.log(100000.0 / 16)


class FastHeadLayer(nn.Module):

    def __init__(self, cfg, num_classes, scale_clamp: float = _DEFAULT_SCALE_CLAMP,
                 weights=(10.0, 10.0, 5.0, 5.0)):
        super().__init__()
        self.cfg = cfg
        self.use_focal = cfg.MODEL.SparseRCNN.USE_FOCAL
        self.self_attn = cfg.MODEL.SparseRCNN.FAST_SELF_ATTN

        # Implementation of Feedforward model
        self.linear1 = nn.Linear(7 * 7 * 256, 256)
        self.norm1 = nn.LayerNorm(256)
        self.linear2 = nn.Linear(256, 256)
        self.norm2 = nn.LayerNorm(256)

        self.linear3 = nn.Linear(7 * 7 * 256, 256)
        self.norm3 = nn.LayerNorm(256)
        self.linear4 = nn.Linear(256, 256)
        self.norm4 = nn.LayerNorm(256)
        self.linear5 = nn.Linear(512, 256)
        self.norm5 = nn.LayerNorm(256)
        self.linear6 = nn.Linear(256, 256)
        self.norm6 = nn.LayerNorm(256)

        self.scale_clamp = scale_clamp
        self.weights = weights

        if self.use_focal:
            self.class_logits = nn.Linear(256, num_classes)
        else:
            self.class_logits = nn.Linear(256, num_classes + 1)
        self.bboxes_delta = nn.Linear(256, 4)

        if self.self_attn:
            # the config flag is replaced by the attention module itself,
            # so `if self.self_attn:` in forward() stays truthy in this case
            self.self_attn = nn.MultiheadAttention(256, 8, dropout=0.1)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, features, bboxes, tgt, pooler=None, norm=None, query_pos=None):
        """
        :param features: can be original features or memory (RoI feature).
        :param bboxes: (N, nr_boxes, 4)
        :param tgt: (nr_boxes, N, d_model)
        :param pooler:
        :return:
        """
        N, nr_boxes = bboxes.shape[:2]

        proposals = list()
        for b in range(N):
            proposals.append(Boxes(bboxes[b]))

        # roi_feature
        memory = pooler(features, proposals)
        memory = memory.view(N * nr_boxes, -1)

        feat_box = self.relu(self.norm1(self.linear1(memory)))
        feat_box = self.relu(self.norm2(self.linear2(feat_box)))
        bboxes_deltas = self.bboxes_delta(feat_box)
        pred_bboxes = self.apply_deltas(bboxes_deltas, bboxes.view(-1, 4))

        feat_cls = self.relu(self.norm3(self.linear3(memory)))
        feat_cls = self.relu(self.norm4(self.linear4(feat_cls)))

        if tgt is None:
            memory_attn = feat_cls.clone()
        else:
            memory_attn = tgt.clone()

        if self.self_attn:
            memory_attn = memory_attn.view(N, nr_boxes, 256).permute(1, 0, 2)
            memory_attn = self.self_attn(memory_attn, memory_attn, value=memory_attn)[0]
            memory_attn = memory_attn.transpose(0, 1).reshape(N * nr_boxes, -1)

        feat_cls = torch.cat((feat_cls, memory_attn), dim=-1)
        feat_cls = self.relu(self.norm5(self.linear5(feat_cls)))
        feat_cls = self.relu(self.norm6(self.linear6(feat_cls)))
        class_logits = self.class_logits(feat_cls)

        return class_logits.view(N, nr_boxes, -1), pred_bboxes.view(N, nr_boxes, -1), feat_cls

    @staticmethod
    def with_pos_embed(tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def apply_deltas(self, deltas, boxes):
        """
        Apply transformation `deltas` (dx, dy, dw, dh) to `boxes`.

        Args:
            deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1.
                deltas[i] represents k potentially different class-specific
                box transformations for the single box boxes[i].
            boxes (Tensor): boxes to transform, of shape (N, 4)
        """
        boxes = boxes.to(deltas.dtype)

        widths = boxes[:, 2] - boxes[:, 0]
        heights = boxes[:, 3] - boxes[:, 1]
        ctr_x = boxes[:, 0] + 0.5 * widths
        ctr_y = boxes[:, 1] + 0.5 * heights

        wx, wy, ww, wh = self.weights
        dx = deltas[:, 0::4] / wx
        dy = deltas[:, 1::4] / wy
        dw = deltas[:, 2::4] / ww
        dh = deltas[:, 3::4] / wh

        # Prevent sending too large values into torch.exp()
        dw = torch.clamp(dw, max=self.scale_clamp)
        dh = torch.clamp(dh, max=self.scale_clamp)

        pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
        pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
        pred_w = torch.exp(dw) * widths[:, None]
        pred_h = torch.exp(dh) * heights[:, None]

        pred_boxes = torch.zeros_like(deltas)
        pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w  # x1
        pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h  # y1
        pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w  # x2
        pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h  # y2

        return pred_boxes
```
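To make the swap concrete, here is a rough sketch of the two edits described above. This is my own reading of the current repo, not an official patch: the constructor call being replaced and the `init_proposal_features` name reflect how I read the linked lines and may differ slightly in your checkout.

```python
# projects/SparseRCNN/sparsercnn/head.py (the line linked above):
# build the per-stage head from FastHeadLayer instead of RCNNHead.
# FastHeadLayer.forward(features, bboxes, tgt, pooler) should be positionally
# compatible with how DynamicHead calls its per-stage heads.
#
#   before: rcnn_head = RCNNHead(cfg, d_model, num_classes, dim_feedforward, nhead, dropout, activation)
rcnn_head = FastHeadLayer(cfg, num_classes)

# projects/SparseRCNN/sparsercnn/detector.py (the line linked above):
# disable the learnable proposal features so that `tgt` reaching
# FastHeadLayer.forward is None; if other variables then error out,
# set them to None as well, as suggested above.
#
#   before: self.init_proposal_features = nn.Embedding(self.num_proposals, self.hidden_dim)
self.init_proposal_features = None
```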
Hi :) Thank you for sharing the code of Sparse R-CNN. I really like the novel ideas in this paper.
I would like to ask about the instance interaction module in the dynamic head.
According to Table 4 in your paper, 37.2 AP is achieved without the instance interaction module. However, in Table 8, using multi-head attention as the instance interaction decreases the AP to 35.7 (-1.8). What is the reason for this performance degradation? Could it be that the training schedule is too short for the self-attention to be trained sufficiently? If so, have you trained with a longer schedule (I would appreciate it if you could share the training curves)?
Also, may I ask how you implemented multi-head attention in the dynamic head? I guess you may have reduced the dimension of the flattened RoI features (7x7xC -> C) in order to apply attention with the object features (since the object feature dimension is C). Which method did you use in the ablation study?
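For concreteness, the reduction I am guessing at would look roughly like the sketch below (just an illustration to make the question precise, not code from the repo; all names and sizes are made up). Each flattened 7x7xC RoI feature is projected to C so that every box contributes a single C-dimensional token to the attention, in contrast to keeping the 49 spatial positions as the key/value sequence as in the snippet earlier in this thread:

```python
import torch
from torch import nn

# Hypothetical sizes, only to make the question concrete.
N, nr_boxes, C, nhead = 2, 100, 256, 8

roi_features = torch.randn(N * nr_boxes, C, 7, 7)   # pooled RoI features
obj_features = torch.randn(nr_boxes, N, C)          # object (proposal) features

# Guessed variant: reduce each flattened RoI feature 7*7*C -> C ...
reduce_fc = nn.Linear(7 * 7 * C, C)
roi_tokens = reduce_fc(roi_features.flatten(1))                 # (N*nr_boxes, C)
roi_tokens = roi_tokens.view(N, nr_boxes, C).permute(1, 0, 2)   # (nr_boxes, N, C)

# ... so that attention runs over same-sized object and RoI tokens.
attn = nn.MultiheadAttention(C, nhead)
out = attn(query=obj_features, key=roi_tokens, value=roi_tokens)[0]
print(out.shape)  # torch.Size([100, 2, 256])
```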
Furthermore, I would like to ask about the inference time of Sparse R-CNN.