VanLinLin opened 1 year ago
@VanLinLin Based on your configuration, this error shouldn't occur. Can you provide further information?
@hhaAndroid Hi, thanks for your reply.
I copied some files from InternImage into mmdetection and reran python setup.py install to register them.
The steps are shown below:
pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 -f https://download.pytorch.org/whl/torch_stable.html
pip install openmim
pip install mmcv-full==1.7.1
pip install opencv-python termcolor yacs pyyaml scipy
python setup.py install
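For reference, an alternative to reinstalling mmdetection is to import the copied modules from the config so their register_module() decorators run; a minimal sketch, assuming the files live in a local project package (the module paths below are illustrative, not my actual layout):

# Hypothetical config fragment: importing these modules at config-load time
# executes the @BACKBONES/@HEADS/@DETECTORS register_module() decorators.
custom_imports = dict(
    imports=[
        'projects.models.intern_image',   # illustrative paths
        'projects.models.dino_head',
        'projects.models.dino',
    ],
    allow_failed_imports=False)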
The intern_image.py file:
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import torch
import torch.nn as nn
from collections import OrderedDict
import torch.utils.checkpoint as checkpoint
from timm.models.layers import trunc_normal_, DropPath
from mmcv.runner import _load_checkpoint
from mmcv.cnn import constant_init, trunc_normal_init
from mmdet.utils import get_root_logger
from mmdet.models.builder import BACKBONES
import torch.nn.functional as F
from ops_dcnv3 import modules as opsm
class to_channels_first(nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
return x.permute(0, 3, 1, 2)
class to_channels_last(nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
return x.permute(0, 2, 3, 1)
def build_norm_layer(dim,
norm_layer,
in_format='channels_last',
out_format='channels_last',
eps=1e-6):
layers = []
if norm_layer == 'BN':
if in_format == 'channels_last':
layers.append(to_channels_first())
layers.append(nn.BatchNorm2d(dim))
if out_format == 'channels_last':
layers.append(to_channels_last())
elif norm_layer == 'LN':
if in_format == 'channels_first':
layers.append(to_channels_last())
layers.append(nn.LayerNorm(dim, eps=eps))
if out_format == 'channels_first':
layers.append(to_channels_first())
else:
raise NotImplementedError(
f'build_norm_layer does not support {norm_layer}')
return nn.Sequential(*layers)
def build_act_layer(act_layer):
if act_layer == 'ReLU':
return nn.ReLU(inplace=True)
elif act_layer == 'SiLU':
return nn.SiLU(inplace=True)
elif act_layer == 'GELU':
return nn.GELU()
raise NotImplementedError(f'build_act_layer does not support {act_layer}')
class CrossAttention(nn.Module):
r""" Cross Attention Module
Args:
dim (int): Number of input channels.
num_heads (int): Number of attention heads. Default: 8
qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
Default: False.
qk_scale (float | None, optional): Override default qk scale of
head_dim ** -0.5 if set. Default: None.
attn_drop (float, optional): Dropout ratio of attention weight.
Default: 0.0
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
attn_head_dim (int, optional): Dimension of attention head.
out_dim (int, optional): Dimension of output.
"""
def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
qk_scale=None,
attn_drop=0.,
proj_drop=0.,
attn_head_dim=None,
out_dim=None):
super().__init__()
if out_dim is None:
out_dim = dim
self.num_heads = num_heads
head_dim = dim // num_heads
if attn_head_dim is not None:
head_dim = attn_head_dim
all_head_dim = head_dim * self.num_heads
self.scale = qk_scale or head_dim ** -0.5
assert all_head_dim == dim
self.q = nn.Linear(dim, all_head_dim, bias=False)
self.k = nn.Linear(dim, all_head_dim, bias=False)
self.v = nn.Linear(dim, all_head_dim, bias=False)
if qkv_bias:
self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
self.k_bias = nn.Parameter(torch.zeros(all_head_dim))
self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
else:
self.q_bias = None
self.k_bias = None
self.v_bias = None
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(all_head_dim, out_dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x, k=None, v=None):
B, N, C = x.shape
N_k = k.shape[1]
N_v = v.shape[1]
q_bias, k_bias, v_bias = None, None, None
if self.q_bias is not None:
q_bias = self.q_bias
k_bias = self.k_bias
v_bias = self.v_bias
q = F.linear(input=x, weight=self.q.weight, bias=q_bias)
q = q.reshape(B, N, 1, self.num_heads,
-1).permute(2, 0, 3, 1,
4).squeeze(0) # (B, N_head, N_q, dim)
k = F.linear(input=k, weight=self.k.weight, bias=k_bias)
k = k.reshape(B, N_k, 1, self.num_heads, -1).permute(2, 0, 3, 1,
4).squeeze(0)
v = F.linear(input=v, weight=self.v.weight, bias=v_bias)
v = v.reshape(B, N_v, 1, self.num_heads, -1).permute(2, 0, 3, 1,
4).squeeze(0)
q = q * self.scale
attn = (q @ k.transpose(-2, -1)) # (B, N_head, N_q, N_k)
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
x = self.proj(x)
x = self.proj_drop(x)
return x
class AttentiveBlock(nn.Module):
r"""Attentive Block
Args:
dim (int): Number of input channels.
num_heads (int): Number of attention heads. Default: 8
qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
Default: False.
qk_scale (float | None, optional): Override default qk scale of
head_dim ** -0.5 if set. Default: None.
drop (float, optional): Dropout rate. Default: 0.0.
attn_drop (float, optional): Attention dropout rate. Default: 0.0.
drop_path (float | tuple[float], optional): Stochastic depth rate.
Default: 0.0.
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm.
attn_head_dim (int, optional): Dimension of attention head. Default: None.
out_dim (int, optional): Dimension of output. Default: None.
"""
def __init__(self,
dim,
num_heads,
qkv_bias=False,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
norm_layer="LN",
attn_head_dim=None,
out_dim=None):
super().__init__()
self.norm1_q = build_norm_layer(dim, norm_layer, eps=1e-6)
self.norm1_k = build_norm_layer(dim, norm_layer, eps=1e-6)
self.norm1_v = build_norm_layer(dim, norm_layer, eps=1e-6)
self.cross_dcn = CrossAttention(dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop,
attn_head_dim=attn_head_dim,
out_dim=out_dim)
self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()
def forward(self,
x_q,
x_kv,
pos_q,
pos_k,
bool_masked_pos,
rel_pos_bias=None):
x_q = self.norm1_q(x_q + pos_q)
x_k = self.norm1_k(x_kv + pos_k)
x_v = self.norm1_v(x_kv)
x = self.cross_dcn(x_q, k=x_k, v=x_v)
return x
class AttentionPoolingBlock(AttentiveBlock):
def forward(self, x):
x_q = x.mean(1, keepdim=True)
x_kv = x
pos_q, pos_k = 0, 0
x = super().forward(x_q, x_kv, pos_q, pos_k,
bool_masked_pos=None,
rel_pos_bias=None)
x = x.squeeze(1)
return x
class StemLayer(nn.Module):
r""" Stem layer of InternImage
Args:
in_chans (int): number of input channels
out_chans (int): number of output channels
act_layer (str): activation layer
norm_layer (str): normalization layer
"""
def __init__(self,
in_chans=3,
out_chans=96,
act_layer='GELU',
norm_layer='BN'):
super().__init__()
self.conv1 = nn.Conv2d(in_chans,
out_chans // 2,
kernel_size=3,
stride=2,
padding=1)
self.norm1 = build_norm_layer(out_chans // 2, norm_layer,
'channels_first', 'channels_first')
self.act = build_act_layer(act_layer)
self.conv2 = nn.Conv2d(out_chans // 2,
out_chans,
kernel_size=3,
stride=2,
padding=1)
self.norm2 = build_norm_layer(out_chans, norm_layer, 'channels_first',
'channels_last')
def forward(self, x):
x = self.conv1(x)
x = self.norm1(x)
x = self.act(x)
x = self.conv2(x)
x = self.norm2(x)
return x
class DownsampleLayer(nn.Module):
r""" Downsample layer of InternImage
Args:
channels (int): number of input channels
norm_layer (str): normalization layer
"""
def __init__(self, channels, norm_layer='LN'):
super().__init__()
self.conv = nn.Conv2d(channels,
2 * channels,
kernel_size=3,
stride=2,
padding=1,
bias=False)
self.norm = build_norm_layer(2 * channels, norm_layer,
'channels_first', 'channels_last')
def forward(self, x):
x = self.conv(x.permute(0, 3, 1, 2))
x = self.norm(x)
return x
class MLPLayer(nn.Module):
r""" MLP layer of InternImage
Args:
in_features (int): number of input features
hidden_features (int): number of hidden features
out_features (int): number of output features
act_layer (str): activation layer
drop (float): dropout rate
"""
def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer='GELU',
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = build_act_layer(act_layer)
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
class InternImageLayer(nn.Module):
r""" Basic layer of InternImage
Args:
core_op (nn.Module): core operation of InternImage
channels (int): number of input channels
groups (list): Groups of each block.
mlp_ratio (float): ratio of mlp hidden features to input channels
drop (float): dropout rate
drop_path (float): drop path rate
act_layer (str): activation layer
norm_layer (str): normalization layer
post_norm (bool): whether to use post normalization
layer_scale (float): layer scale
offset_scale (float): offset scale
with_cp (bool): whether to use checkpoint
"""
def __init__(self,
core_op,
channels,
groups,
mlp_ratio=4.,
drop=0.,
drop_path=0.,
act_layer='GELU',
norm_layer='LN',
post_norm=False,
layer_scale=None,
offset_scale=1.0,
with_cp=False,
dw_kernel_size=None, # for InternImage-H/G
res_post_norm=False, # for InternImage-H/G
center_feature_scale=False): # for InternImage-H/G
super().__init__()
self.channels = channels
self.groups = groups
self.mlp_ratio = mlp_ratio
self.with_cp = with_cp
self.norm1 = build_norm_layer(channels, 'LN')
self.post_norm = post_norm
self.dcn = core_op(
channels=channels,
kernel_size=3,
stride=1,
pad=1,
dilation=1,
group=groups,
offset_scale=offset_scale,
act_layer=act_layer,
norm_layer=norm_layer,
dw_kernel_size=dw_kernel_size, # for InternImage-H/G
center_feature_scale=center_feature_scale) # for InternImage-H/G
self.drop_path = DropPath(drop_path) if drop_path > 0. \
else nn.Identity()
self.norm2 = build_norm_layer(channels, 'LN')
self.mlp = MLPLayer(in_features=channels,
hidden_features=int(channels * mlp_ratio),
act_layer=act_layer,
drop=drop)
self.layer_scale = layer_scale is not None
if self.layer_scale:
self.gamma1 = nn.Parameter(layer_scale * torch.ones(channels),
requires_grad=True)
self.gamma2 = nn.Parameter(layer_scale * torch.ones(channels),
requires_grad=True)
self.res_post_norm = res_post_norm
if res_post_norm:
self.res_post_norm1 = build_norm_layer(channels, 'LN')
self.res_post_norm2 = build_norm_layer(channels, 'LN')
def forward(self, x):
def _inner_forward(x):
if not self.layer_scale:
if self.post_norm:
x = x + self.drop_path(self.norm1(self.dcn(x)))
x = x + self.drop_path(self.norm2(self.mlp(x)))
elif self.res_post_norm: # for InternImage-H/G
x = x + self.drop_path(self.res_post_norm1(self.dcn(self.norm1(x))))
x = x + self.drop_path(self.res_post_norm2(self.mlp(self.norm2(x))))
else:
x = x + self.drop_path(self.dcn(self.norm1(x)))
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
if self.post_norm:
x = x + self.drop_path(self.gamma1 * self.norm1(self.dcn(x)))
x = x + self.drop_path(self.gamma2 * self.norm2(self.mlp(x)))
else:
x = x + self.drop_path(self.gamma1 * self.dcn(self.norm1(x)))
x = x + self.drop_path(self.gamma2 * self.mlp(self.norm2(x)))
return x
if self.with_cp and x.requires_grad:
x = checkpoint.checkpoint(_inner_forward, x)
else:
x = _inner_forward(x)
return x
class InternImageBlock(nn.Module):
r""" Block of InternImage
Args:
core_op (nn.Module): core operation of InternImage
channels (int): number of input channels
depths (list): Depth of each block.
groups (list): Groups of each block.
mlp_ratio (float): ratio of mlp hidden features to input channels
drop (float): dropout rate
drop_path (float): drop path rate
act_layer (str): activation layer
norm_layer (str): normalization layer
post_norm (bool): whether to use post normalization
layer_scale (float): layer scale
offset_scale (float): offset scale
with_cp (bool): whether to use checkpoint
"""
def __init__(self,
core_op,
channels,
depth,
groups,
downsample=True,
mlp_ratio=4.,
drop=0.,
drop_path=0.,
act_layer='GELU',
norm_layer='LN',
post_norm=False,
offset_scale=1.0,
layer_scale=None,
with_cp=False,
dw_kernel_size=None, # for InternImage-H/G
post_norm_block_ids=None, # for InternImage-H/G
res_post_norm=False, # for InternImage-H/G
center_feature_scale=False): # for InternImage-H/G
super().__init__()
self.channels = channels
self.depth = depth
self.post_norm = post_norm
self.center_feature_scale = center_feature_scale
self.blocks = nn.ModuleList([
InternImageLayer(
core_op=core_op,
channels=channels,
groups=groups,
mlp_ratio=mlp_ratio,
drop=drop,
drop_path=drop_path[i] if isinstance(
drop_path, list) else drop_path,
act_layer=act_layer,
norm_layer=norm_layer,
post_norm=post_norm,
layer_scale=layer_scale,
offset_scale=offset_scale,
with_cp=with_cp,
dw_kernel_size=dw_kernel_size, # for InternImage-H/G
res_post_norm=res_post_norm, # for InternImage-H/G
center_feature_scale=center_feature_scale # for InternImage-H/G
) for i in range(depth)
])
if not self.post_norm or center_feature_scale:
self.norm = build_norm_layer(channels, 'LN')
self.post_norm_block_ids = post_norm_block_ids
if post_norm_block_ids is not None: # for InternImage-H/G
self.post_norms = nn.ModuleList(
[build_norm_layer(channels, 'LN', eps=1e-6) for _ in post_norm_block_ids]
)
self.downsample = DownsampleLayer(
channels=channels, norm_layer=norm_layer) if downsample else None
def forward(self, x, return_wo_downsample=False):
for i, blk in enumerate(self.blocks):
x = blk(x)
if (self.post_norm_block_ids is not None) and (i in self.post_norm_block_ids):
index = self.post_norm_block_ids.index(i)
x = self.post_norms[index](x) # for InternImage-H/G
if not self.post_norm or self.center_feature_scale:
x = self.norm(x)
if return_wo_downsample:
x_ = x
if self.downsample is not None:
x = self.downsample(x)
if return_wo_downsample:
return x, x_
return x
@BACKBONES.register_module()
class InternImage(nn.Module):
r""" InternImage
A PyTorch impl of : `InternImage: Exploring Large-Scale Vision Foundation Models with Deformable Convolutions` -
https://arxiv.org/pdf/2103.14030
Args:
core_op (str): Core operator. Default: 'DCNv3'
channels (int): Number of the first stage. Default: 64
depths (list): Depth of each block. Default: [3, 4, 18, 5]
groups (list): Groups of each block. Default: [3, 6, 12, 24]
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
drop_rate (float): Probability of an element to be zeroed. Default: 0.
drop_path_rate (float): Stochastic depth rate. Default: 0.
act_layer (str): Activation layer. Default: 'GELU'
norm_layer (str): Normalization layer. Default: 'LN'
layer_scale (bool): Whether to use layer scale. Default: False
cls_scale (bool): Whether to use class scale. Default: False
with_cp (bool): Use checkpoint or not. Using checkpoint will save some
dw_kernel_size (int): Size of the dwconv. Default: None
level2_post_norm (bool): Whether to use level2 post norm. Default: False
level2_post_norm_block_ids (list): Indexes of post norm blocks. Default: None
res_post_norm (bool): Whether to use res post norm. Default: False
center_feature_scale (bool): Whether to use center feature scale. Default: False
"""
def __init__(self,
core_op='DCNv3',
channels=64,
depths=[3, 4, 18, 5],
groups=[3, 6, 12, 24],
mlp_ratio=4.,
drop_rate=0.,
drop_path_rate=0.2,
drop_path_type='linear',
act_layer='GELU',
norm_layer='LN',
layer_scale=None,
offset_scale=1.0,
post_norm=False,
with_cp=False,
dw_kernel_size=None, # for InternImage-H/G
level2_post_norm=False, # for InternImage-H/G
level2_post_norm_block_ids=None, # for InternImage-H/G
res_post_norm=False, # for InternImage-H/G
center_feature_scale=False, # for InternImage-H/G
out_indices=(0, 1, 2, 3),
init_cfg=None,
**kwargs):
super().__init__()
self.core_op = core_op
self.num_levels = len(depths)
self.depths = depths
self.channels = channels
self.num_features = int(channels * 2**(self.num_levels - 1))
self.post_norm = post_norm
self.mlp_ratio = mlp_ratio
self.init_cfg = init_cfg
self.out_indices = out_indices
self.level2_post_norm_block_ids = level2_post_norm_block_ids
logger = get_root_logger()
logger.info(f'using core type: {core_op}')
logger.info(f'using activation layer: {act_layer}')
logger.info(f'using main norm layer: {norm_layer}')
logger.info(f'using dpr: {drop_path_type}, {drop_path_rate}')
logger.info(f"level2_post_norm: {level2_post_norm}")
logger.info(f"level2_post_norm_block_ids: {level2_post_norm_block_ids}")
logger.info(f"res_post_norm: {res_post_norm}")
in_chans = 3
self.patch_embed = StemLayer(in_chans=in_chans,
out_chans=channels,
act_layer=act_layer,
norm_layer=norm_layer)
self.pos_drop = nn.Dropout(p=drop_rate)
dpr = [
x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
]
if drop_path_type == 'uniform':
for i in range(len(dpr)):
dpr[i] = drop_path_rate
self.levels = nn.ModuleList()
for i in range(self.num_levels):
post_norm_block_ids = level2_post_norm_block_ids if level2_post_norm and (
i == 2) else None # for InternImage-H/G
level = InternImageBlock(
core_op=getattr(opsm, core_op),
channels=int(channels * 2**i),
depth=depths[i],
groups=groups[i],
mlp_ratio=self.mlp_ratio,
drop=drop_rate,
drop_path=dpr[sum(depths[:i]):sum(depths[:i + 1])],
act_layer=act_layer,
norm_layer=norm_layer,
post_norm=post_norm,
downsample=(i < self.num_levels - 1),
layer_scale=layer_scale,
offset_scale=offset_scale,
with_cp=with_cp,
dw_kernel_size=dw_kernel_size, # for InternImage-H/G
post_norm_block_ids=post_norm_block_ids, # for InternImage-H/G
res_post_norm=res_post_norm, # for InternImage-H/G
center_feature_scale=center_feature_scale # for InternImage-H/G
)
self.levels.append(level)
self.num_layers = len(depths)
self.apply(self._init_weights)
self.apply(self._init_deform_weights)
def init_weights(self):
logger = get_root_logger()
if self.init_cfg is None:
logger.warn(f'No pre-trained weights for '
f'{self.__class__.__name__}, '
f'training start from scratch')
for m in self.modules():
if isinstance(m, nn.Linear):
trunc_normal_init(m, std=.02, bias=0.)
elif isinstance(m, nn.LayerNorm):
constant_init(m, 1.0)
else:
assert 'checkpoint' in self.init_cfg, f'Only support ' \
f'specify `Pretrained` in ' \
f'`init_cfg` in ' \
f'{self.__class__.__name__} '
ckpt = _load_checkpoint(self.init_cfg.checkpoint,
logger=logger,
map_location='cpu')
if 'state_dict' in ckpt:
_state_dict = ckpt['state_dict']
elif 'model' in ckpt:
_state_dict = ckpt['model']
else:
_state_dict = ckpt
state_dict = OrderedDict()
for k, v in _state_dict.items():
if k.startswith('backbone.'):
state_dict[k[9:]] = v
else:
state_dict[k] = v
# strip prefix of state_dict
if list(state_dict.keys())[0].startswith('module.'):
state_dict = {k[7:]: v for k, v in state_dict.items()}
# load state_dict
meg = self.load_state_dict(state_dict, False)
logger.info(meg)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
def _init_deform_weights(self, m):
if isinstance(m, getattr(opsm, self.core_op)):
m._reset_parameters()
def forward(self, x):
x = self.patch_embed(x)
x = self.pos_drop(x)
seq_out = []
for level_idx, level in enumerate(self.levels):
x, x_ = level(x, return_wo_downsample=True)
if level_idx in self.out_indices:
seq_out.append(x_.permute(0, 3, 1, 2).contiguous())
return seq_out
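For context, once registered the backbone is selected from the BACKBONES registry by its name in the model config; a minimal sketch (the channel/depth/group values below are illustrative, not a specific InternImage variant):

# Hypothetical mmdet config fragment; 'InternImage' is resolved through the
# BACKBONES registry populated by the @BACKBONES.register_module() decorator.
model = dict(
    backbone=dict(
        type='InternImage',
        core_op='DCNv3',
        channels=64,
        depths=[4, 4, 18, 4],
        groups=[4, 8, 16, 32],
        mlp_ratio=4.,
        drop_path_rate=0.2,
        norm_layer='LN',
        offset_scale=1.0,
        post_norm=False,
        with_cp=False,
        out_indices=(1, 2, 3)),
    neck=dict(in_channels=[128, 256, 512]))  # channels * 2**i for levels 1-3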
The dino_head.py file:
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmdet.core import (bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh, multi_apply,
reduce_mean)
from ..utils import build_dn_generator
from mmdet.models.utils.transformer import inverse_sigmoid
from mmdet.models.builder import HEADS
from .deformable_detr_head import DeformableDETRHead
from mmcv.runner import force_fp32
@HEADS.register_module()
class DINOHead(DeformableDETRHead):
def __init__(self, *args, dn_cfg=None, **kwargs):
super(DINOHead, self).__init__(*args, **kwargs)
self._init_layers()
self.init_denoising(dn_cfg)
assert self.as_two_stage, \
'as_two_stage must be True for DINO'
assert self.with_box_refine, \
'with_box_refine must be True for DINO'
def _init_layers(self):
super()._init_layers()
# NOTE The original repo of DINO set the num_embeddings 92 for coco,
# 91 (0~90) of which represents target classes and the 92 (91)
# indicates [Unknown] class. However, the embedding of unknown class
# is not used in the original DINO
self.label_embedding = nn.Embedding(self.cls_out_channels,
self.embed_dims)
def init_denoising(self, dn_cfg):
if dn_cfg is not None:
dn_cfg['num_classes'] = self.num_classes
dn_cfg['num_queries'] = self.num_query
dn_cfg['hidden_dim'] = self.embed_dims
self.dn_generator = build_dn_generator(dn_cfg)
def forward_train(self,
x,
img_metas,
gt_bboxes,
gt_labels=None,
gt_bboxes_ignore=None,
proposal_cfg=None,
**kwargs):
assert proposal_cfg is None, '"proposal_cfg" must be None'
assert self.dn_generator is not None, '"dn_cfg" must be set'
dn_label_query, dn_bbox_query, attn_mask, dn_meta = \
self.dn_generator(gt_bboxes, gt_labels,
self.label_embedding, img_metas)
outs = self(x, img_metas, dn_label_query, dn_bbox_query, attn_mask)
if gt_labels is None:
loss_inputs = outs + (gt_bboxes, img_metas, dn_meta)
else:
loss_inputs = outs + (gt_bboxes, gt_labels, img_metas, dn_meta)
losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
return losses
def forward(self,
mlvl_feats,
img_metas,
dn_label_query=None,
dn_bbox_query=None,
attn_mask=None):
batch_size = mlvl_feats[0].size(0)
input_img_h, input_img_w = img_metas[0]['batch_input_shape']
img_masks = mlvl_feats[0].new_ones(
(batch_size, input_img_h, input_img_w))
for img_id in range(batch_size):
img_h, img_w, _ = img_metas[img_id]['img_shape']
img_masks[img_id, :img_h, :img_w] = 0
mlvl_masks = []
mlvl_positional_encodings = []
for feat in mlvl_feats:
mlvl_masks.append(
F.interpolate(
img_masks[None],
size=feat.shape[-2:]).to(torch.bool).squeeze(0))
mlvl_positional_encodings.append(
self.positional_encoding(mlvl_masks[-1]))
query_embeds = None
hs, inter_references, topk_score, topk_anchor = \
self.transformer(
mlvl_feats,
mlvl_masks,
query_embeds,
mlvl_positional_encodings,
dn_label_query,
dn_bbox_query,
attn_mask,
reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501
cls_branches=self.cls_branches if self.as_two_stage else None # noqa:E501
)
hs = hs.permute(0, 2, 1, 3)
if dn_label_query is not None and dn_label_query.size(1) == 0:
# NOTE: If there is no target in the image, the parameters of
# label_embedding won't be used in producing loss, which raises
# RuntimeError when using distributed mode.
hs[0] += self.label_embedding.weight[0, 0] * 0.0
outputs_classes = []
outputs_coords = []
for lvl in range(hs.shape[0]):
reference = inter_references[lvl]
reference = inverse_sigmoid(reference, eps=1e-3)
outputs_class = self.cls_branches[lvl](hs[lvl])
tmp = self.reg_branches[lvl](hs[lvl])
if reference.shape[-1] == 4:
tmp += reference
else:
assert reference.shape[-1] == 2
tmp[..., :2] += reference
outputs_coord = tmp.sigmoid()
outputs_classes.append(outputs_class)
outputs_coords.append(outputs_coord)
outputs_classes = torch.stack(outputs_classes)
outputs_coords = torch.stack(outputs_coords)
return outputs_classes, outputs_coords, topk_score, topk_anchor
@force_fp32(apply_to=('all_cls_scores', 'all_bbox_preds'))
def loss(self,
all_cls_scores,
all_bbox_preds,
enc_topk_scores,
enc_topk_anchors,
gt_bboxes_list,
gt_labels_list,
img_metas,
dn_meta=None,
gt_bboxes_ignore=None):
assert gt_bboxes_ignore is None, \
f'{self.__class__.__name__} only supports ' \
f'for gt_bboxes_ignore setting to None.'
loss_dict = dict()
# extract denoising and matching part of outputs
all_cls_scores, all_bbox_preds, dn_cls_scores, dn_bbox_preds = \
self.extract_dn_outputs(all_cls_scores, all_bbox_preds, dn_meta)
if enc_topk_scores is not None:
# calculate loss from encode feature maps
# NOTE The DeformDETR calculate binary cls loss
# for all encoder embeddings, while DINO calculate
# multi-class loss for topk embeddings.
enc_loss_cls, enc_losses_bbox, enc_losses_iou = \
self.loss_single(enc_topk_scores, enc_topk_anchors,
gt_bboxes_list, gt_labels_list,
img_metas, gt_bboxes_ignore)
# collate loss from encode feature maps
loss_dict['interm_loss_cls'] = enc_loss_cls
loss_dict['interm_loss_bbox'] = enc_losses_bbox
loss_dict['interm_loss_iou'] = enc_losses_iou
# calculate loss from all decoder layers
num_dec_layers = len(all_cls_scores)
all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
all_gt_bboxes_ignore_list = [
gt_bboxes_ignore for _ in range(num_dec_layers)
]
img_metas_list = [img_metas for _ in range(num_dec_layers)]
losses_cls, losses_bbox, losses_iou = multi_apply(
self.loss_single, all_cls_scores, all_bbox_preds,
all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
all_gt_bboxes_ignore_list)
# collate loss from the last decoder layer
loss_dict['loss_cls'] = losses_cls[-1]
loss_dict['loss_bbox'] = losses_bbox[-1]
loss_dict['loss_iou'] = losses_iou[-1]
# collate loss from other decoder layers
num_dec_layer = 0
for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],
losses_bbox[:-1],
losses_iou[:-1]):
loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
num_dec_layer += 1
if dn_cls_scores is not None:
# calculate denoising loss from all decoder layers
dn_meta = [dn_meta for _ in img_metas]
dn_losses_cls, dn_losses_bbox, dn_losses_iou = self.loss_dn(
dn_cls_scores, dn_bbox_preds, gt_bboxes_list, gt_labels_list,
img_metas, dn_meta)
# collate denoising loss
loss_dict['dn_loss_cls'] = dn_losses_cls[-1]
loss_dict['dn_loss_bbox'] = dn_losses_bbox[-1]
loss_dict['dn_loss_iou'] = dn_losses_iou[-1]
num_dec_layer = 0
for loss_cls_i, loss_bbox_i, loss_iou_i in zip(
dn_losses_cls[:-1], dn_losses_bbox[:-1],
dn_losses_iou[:-1]):
loss_dict[f'd{num_dec_layer}.dn_loss_cls'] = loss_cls_i
loss_dict[f'd{num_dec_layer}.dn_loss_bbox'] = loss_bbox_i
loss_dict[f'd{num_dec_layer}.dn_loss_iou'] = loss_iou_i
num_dec_layer += 1
return loss_dict
def loss_dn(self, dn_cls_scores, dn_bbox_preds, gt_bboxes_list,
gt_labels_list, img_metas, dn_meta):
num_dec_layers = len(dn_cls_scores)
all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
img_metas_list = [img_metas for _ in range(num_dec_layers)]
dn_meta_list = [dn_meta for _ in range(num_dec_layers)]
return multi_apply(self.loss_dn_single, dn_cls_scores, dn_bbox_preds,
all_gt_bboxes_list, all_gt_labels_list,
img_metas_list, dn_meta_list)
def loss_dn_single(self, dn_cls_scores, dn_bbox_preds, gt_bboxes_list,
gt_labels_list, img_metas, dn_meta):
num_imgs = dn_cls_scores.size(0)
bbox_preds_list = [dn_bbox_preds[i] for i in range(num_imgs)]
cls_reg_targets = self.get_dn_target(bbox_preds_list, gt_bboxes_list,
gt_labels_list, img_metas,
dn_meta)
(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
num_total_pos, num_total_neg) = cls_reg_targets
labels = torch.cat(labels_list, 0)
label_weights = torch.cat(label_weights_list, 0)
bbox_targets = torch.cat(bbox_targets_list, 0)
bbox_weights = torch.cat(bbox_weights_list, 0)
# classification loss
cls_scores = dn_cls_scores.reshape(-1, self.cls_out_channels)
# construct weighted avg_factor to match with the official DETR repo
cls_avg_factor = \
num_total_pos * 1.0 + num_total_neg * self.bg_cls_weight
if self.sync_cls_avg_factor:
cls_avg_factor = reduce_mean(
cls_scores.new_tensor([cls_avg_factor]))
cls_avg_factor = max(cls_avg_factor, 1)
if len(cls_scores) > 0:
loss_cls = self.loss_cls(
cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
else:
loss_cls = torch.zeros( # TODO: How to better return zero loss
1,
dtype=cls_scores.dtype,
device=cls_scores.device)
# Compute the average number of gt boxes across all gpus, for
# normalization purposes
num_total_pos = loss_cls.new_tensor([num_total_pos])
num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()
# construct factors used for rescale bboxes
factors = []
for img_meta, bbox_pred in zip(img_metas, dn_bbox_preds):
img_h, img_w, _ = img_meta['img_shape']
factor = bbox_pred.new_tensor([img_w, img_h, img_w,
img_h]).unsqueeze(0).repeat(
bbox_pred.size(0), 1)
factors.append(factor)
factors = torch.cat(factors, 0)
# DETR regress the relative position of boxes (cxcywh) in the image,
# thus the learning target is normalized by the image size. So here
# we need to re-scale them for calculating IoU loss
bbox_preds = dn_bbox_preds.reshape(-1, 4)
bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors
# regression IoU loss, defaultly GIoU loss
loss_iou = self.loss_iou(
bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)
# regression L1 loss
loss_bbox = self.loss_bbox(
bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
return loss_cls, loss_bbox, loss_iou
def get_dn_target(self, dn_bbox_preds_list, gt_bboxes_list, gt_labels_list,
img_metas, dn_meta):
(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
pos_inds_list,
neg_inds_list) = multi_apply(self._get_dn_target_single,
dn_bbox_preds_list, gt_bboxes_list,
gt_labels_list, img_metas, dn_meta)
num_total_pos = sum((inds.numel() for inds in pos_inds_list))
num_total_neg = sum((inds.numel() for inds in neg_inds_list))
return (labels_list, label_weights_list, bbox_targets_list,
bbox_weights_list, num_total_pos, num_total_neg)
def _get_dn_target_single(self, dn_bbox_pred, gt_bboxes, gt_labels,
img_meta, dn_meta):
num_groups = dn_meta['num_dn_group']
pad_size = dn_meta['pad_size']
assert pad_size % num_groups == 0
single_pad = pad_size // num_groups
num_bboxes = dn_bbox_pred.size(0)
if len(gt_labels) > 0:
t = torch.range(0, len(gt_labels) - 1).long().cuda()
t = t.unsqueeze(0).repeat(num_groups, 1)
pos_assigned_gt_inds = t.flatten()
pos_inds = (torch.tensor(range(num_groups)) *
single_pad).long().cuda().unsqueeze(1) + t
pos_inds = pos_inds.flatten()
else:
pos_inds = pos_assigned_gt_inds = torch.tensor([]).long().cuda()
neg_inds = pos_inds + single_pad // 2
# label targets
labels = gt_bboxes.new_full((num_bboxes, ),
self.num_classes,
dtype=torch.long)
labels[pos_inds] = gt_labels[pos_assigned_gt_inds]
label_weights = gt_bboxes.new_ones(num_bboxes)
# bbox targets
bbox_targets = torch.zeros_like(dn_bbox_pred)
bbox_weights = torch.zeros_like(dn_bbox_pred)
bbox_weights[pos_inds] = 1.0
img_h, img_w, _ = img_meta['img_shape']
# DETR regress the relative position of boxes (cxcywh) in the image.
# Thus the learning target should be normalized by the image size, also
# the box format should be converted from defaultly x1y1x2y2 to cxcywh.
factor = dn_bbox_pred.new_tensor([img_w, img_h, img_w,
img_h]).unsqueeze(0)
gt_bboxes_normalized = gt_bboxes / factor
gt_bboxes_targets = bbox_xyxy_to_cxcywh(gt_bboxes_normalized)
bbox_targets[pos_inds] = gt_bboxes_targets.repeat([num_groups, 1])
return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
neg_inds)
@staticmethod
def extract_dn_outputs(all_cls_scores, all_bbox_preds, dn_meta):
# if dn_meta and dn_meta['pad_size'] > 0:
if dn_meta is not None:
denoising_cls_scores = all_cls_scores[:, :, :
dn_meta['pad_size'], :]
denoising_bbox_preds = all_bbox_preds[:, :, :
dn_meta['pad_size'], :]
matching_cls_scores = all_cls_scores[:, :, dn_meta['pad_size']:, :]
matching_bbox_preds = all_bbox_preds[:, :, dn_meta['pad_size']:, :]
else:
denoising_cls_scores = None
denoising_bbox_preds = None
matching_cls_scores = all_cls_scores
matching_bbox_preds = all_bbox_preds
return (matching_cls_scores, matching_bbox_preds, denoising_cls_scores,
denoising_bbox_preds)
The dino.py file:
# Copyright (c) OpenMMLab. All rights reserved.
from mmdet.models.builder import DETECTORS
from mmdet.models.detectors.detr import DETR
@DETECTORS.register_module()
class DINO(DETR):
def __init__(self, *args, **kwargs):
super(DETR, self).__init__(*args, **kwargs)
The query_denoising.py file:
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmcv.runner import BaseModule
from mmdet.core import bbox_xyxy_to_cxcywh
from mmdet.models.utils.transformer import inverse_sigmoid
class DnQueryGenerator(BaseModule):
def __init__(self,
num_queries,
hidden_dim,
num_classes,
noise_scale=dict(label=0.5, box=0.4),
group_cfg=dict(
dynamic=True, num_groups=None, num_dn_queries=None)):
super(DnQueryGenerator, self).__init__()
self.num_queries = num_queries
self.hidden_dim = hidden_dim
self.num_classes = num_classes
self.label_noise_scale = noise_scale['label']
self.box_noise_scale = noise_scale['box']
self.dynamic_dn_groups = group_cfg.get('dynamic', False)
if self.dynamic_dn_groups:
assert 'num_dn_queries' in group_cfg, \
'num_dn_queries should be set when using ' \
'dynamic dn groups'
self.num_dn = group_cfg['num_dn_queries']
else:
assert 'num_groups' in group_cfg, \
'num_groups should be set when using ' \
'static dn groups'
self.num_dn = group_cfg['num_groups']
assert isinstance(self.num_dn, int) and self.num_dn >= 1, \
f'Expected the num in group_cfg to have type int. ' \
f'Found {type(self.num_dn)} '
def get_num_groups(self, group_queries=None):
"""
Args:
group_queries (int): Number of dn queries in one group.
"""
if self.dynamic_dn_groups:
assert group_queries is not None, \
'group_queries should be provided when using ' \
'dynamic dn groups'
if group_queries == 0:
num_groups = 1
else:
num_groups = self.num_dn // group_queries
else:
num_groups = self.num_dn
if num_groups < 1: # avoid num_groups < 1 in query generator
num_groups = 1
return int(num_groups)
def forward(self,
gt_bboxes,
gt_labels=None,
label_enc=None,
img_metas=None):
"""
Args:
gt_bboxes (List[Tensor]): List of ground truth bboxes
of the image, shape of each (num_gts, 4).
gt_labels (List[Tensor]): List of ground truth labels
of the image, shape of each (num_gts,), if None,
TODO:noisy_label would be None.
Returns:
TODO
"""
# TODO: temp only support for CDN
# TODO: temp assert gt_labels is not None and label_enc is not None
if self.training:
if gt_labels is not None:
assert len(gt_bboxes) == len(gt_labels), \
f'the length of provided gt_labels ' \
f'{len(gt_labels)} should be equal to' \
f' that of gt_bboxes {len(gt_bboxes)}'
assert gt_labels is not None \
and label_enc is not None \
and img_metas is not None # TODO: adjust args
batch_size = len(gt_bboxes)
# convert bbox
gt_bboxes_list = []
for img_meta, bboxes in zip(img_metas, gt_bboxes):
img_h, img_w, _ = img_meta['img_shape']
factor = bboxes.new_tensor([img_w, img_h, img_w,
img_h]).unsqueeze(0)
bboxes_normalized = bbox_xyxy_to_cxcywh(bboxes) / factor
gt_bboxes_list.append(bboxes_normalized)
gt_bboxes = gt_bboxes_list
known = [torch.ones_like(labels) for labels in gt_labels]
known_num = [sum(k) for k in known]
num_groups = self.get_num_groups(int(max(known_num)))
unmask_bbox = unmask_label = torch.cat(known)
labels = torch.cat(gt_labels)
boxes = torch.cat(gt_bboxes)
batch_idx = torch.cat([
torch.full_like(t.long(), i) for i, t in enumerate(gt_labels)
])
known_indice = torch.nonzero(unmask_label + unmask_bbox)
known_indice = known_indice.view(-1)
known_indice = known_indice.repeat(2 * num_groups, 1).view(-1)
known_labels = labels.repeat(2 * num_groups, 1).view(-1)
known_bid = batch_idx.repeat(2 * num_groups, 1).view(-1)
known_bboxs = boxes.repeat(2 * num_groups, 1)
known_labels_expand = known_labels.clone()
known_bbox_expand = known_bboxs.clone()
if self.label_noise_scale > 0:
p = torch.rand_like(known_labels_expand.float())
chosen_indice = torch.nonzero(
p < (self.label_noise_scale * 0.5)).view(-1)
new_label = torch.randint_like(chosen_indice, 0,
self.num_classes)
known_labels_expand.scatter_(0, chosen_indice, new_label)
single_pad = int(max(known_num)) # TODO
pad_size = int(single_pad * 2 * num_groups)
positive_idx = torch.tensor(range(
len(boxes))).long().cuda().unsqueeze(0).repeat(num_groups, 1)
positive_idx += (torch.tensor(range(num_groups)) * len(boxes) *
2).long().cuda().unsqueeze(1)
positive_idx = positive_idx.flatten()
negative_idx = positive_idx + len(boxes)
if self.box_noise_scale > 0:
known_bbox_ = torch.zeros_like(known_bboxs)
known_bbox_[:, : 2] = \
known_bboxs[:, : 2] - known_bboxs[:, 2:] / 2
known_bbox_[:, 2:] = \
known_bboxs[:, :2] + known_bboxs[:, 2:] / 2
diff = torch.zeros_like(known_bboxs)
diff[:, :2] = known_bboxs[:, 2:] / 2
diff[:, 2:] = known_bboxs[:, 2:] / 2
rand_sign = torch.randint_like(
known_bboxs, low=0, high=2, dtype=torch.float32)
rand_sign = rand_sign * 2.0 - 1.0
rand_part = torch.rand_like(known_bboxs)
rand_part[negative_idx] += 1.0
rand_part *= rand_sign
known_bbox_ += \
torch.mul(rand_part, diff).cuda() * self.box_noise_scale
known_bbox_ = known_bbox_.clamp(min=0.0, max=1.0)
known_bbox_expand[:, :2] = \
(known_bbox_[:, :2] + known_bbox_[:, 2:]) / 2
known_bbox_expand[:, 2:] = \
known_bbox_[:, 2:] - known_bbox_[:, :2]
m = known_labels_expand.long().to('cuda')
input_label_embed = label_enc(m)
input_bbox_embed = inverse_sigmoid(known_bbox_expand, eps=1e-3)
padding_label = torch.zeros(pad_size, self.hidden_dim).cuda()
padding_bbox = torch.zeros(pad_size, 4).cuda()
input_query_label = padding_label.repeat(batch_size, 1, 1)
input_query_bbox = padding_bbox.repeat(batch_size, 1, 1)
map_known_indice = torch.tensor([]).to('cuda')
if len(known_num):
map_known_indice = torch.cat(
[torch.tensor(range(num)) for num in known_num])
map_known_indice = torch.cat([
map_known_indice + single_pad * i
for i in range(2 * num_groups)
]).long()
if len(known_bid):
input_query_label[(known_bid.long(),
map_known_indice)] = input_label_embed
input_query_bbox[(known_bid.long(),
map_known_indice)] = input_bbox_embed
tgt_size = pad_size + self.num_queries
attn_mask = torch.ones(tgt_size, tgt_size).to('cuda') < 0
# match query cannot see the reconstruct
attn_mask[pad_size:, :pad_size] = True
# reconstruct cannot see each other
for i in range(num_groups):
if i == 0:
attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1),
single_pad * 2 * (i + 1):pad_size] = True
if i == num_groups - 1:
attn_mask[single_pad * 2 * i:single_pad * 2 *
(i + 1), :single_pad * i * 2] = True
else:
attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1),
single_pad * 2 * (i + 1):pad_size] = True
attn_mask[single_pad * 2 * i:single_pad * 2 *
(i + 1), :single_pad * 2 * i] = True
dn_meta = {
'pad_size': pad_size,
'num_dn_group': num_groups,
}
else:
input_query_label = None
input_query_bbox = None
attn_mask = None
dn_meta = None
return input_query_label, input_query_bbox, attn_mask, dn_meta
class CdnQueryGenerator(DnQueryGenerator):
def __init__(self, *args, **kwargs):
super(CdnQueryGenerator, self).__init__(*args, **kwargs)
def build_dn_generator(dn_args):
"""
Args:
dn_args (dict):
Returns:
"""
if dn_args is None:
return None
type = dn_args.pop('type')
if type == 'DnQueryGenerator':
return DnQueryGenerator(**dn_args)
elif type == 'CdnQueryGenerator':
return CdnQueryGenerator(**dn_args)
else:
raise NotImplementedError(f'{type} is not supported yet')
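For reference, build_dn_generator is driven by a small config dict; a minimal usage sketch (field values are illustrative, and in practice DINOHead.init_denoising fills in num_classes, num_queries and hidden_dim before building):

# Hypothetical dn_cfg (illustrative values only).
dn_cfg = dict(
    type='CdnQueryGenerator',
    noise_scale=dict(label=0.5, box=1.0),
    group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100))

# DINOHead.init_denoising() would add these three keys before building:
dn_generator = build_dn_generator(
    dict(num_classes=80, num_queries=900, hidden_dim=256, **dn_cfg))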
Furthermore, I reproduced the error and used print() to check the actual values of the weights. The error message is shown below:
loss_cls['loss_weight']:1.0, assigner['cls_cost']['weight']:2.0 # <- this is where the error occurs
Traceback (most recent call last):
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmcv\utils\registry.py", line 69, in build_from_cfg
return obj_cls(**args)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmdet-2.28.0-py3.9.egg\mmdet\models\dense_heads\dino_head.py", line 19, in __init__
super(DINOHead, self).__init__(*args, **kwargs)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmdet-2.28.0-py3.9.egg\mmdet\models\dense_heads\deformable_detr_head.py", line 47, in __init__
super(DeformableDETRHead, self).__init__(
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmdet-2.28.0-py3.9.egg\mmdet\models\dense_heads\detr_head.py", line 111, in __init__
assert loss_cls['loss_weight'] == assigner['cls_cost']['weight'], \
AssertionError: The classification weight for loss and matcher should beexactly the same.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmcv\utils\registry.py", line 69, in build_from_cfg
return obj_cls(**args)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmdet-2.28.0-py3.9.egg\mmdet\models\detectors\dino.py", line 10, in __init__
super(DETR, self).__init__(*args, **kwargs)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmdet-2.28.0-py3.9.egg\mmdet\models\detectors\single_stage.py", line 37, in __init__
self.bbox_head = build_head(bbox_head)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmdet-2.28.0-py3.9.egg\mmdet\models\builder.py", line 40, in build_head
return HEADS.build(cfg)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmcv\utils\registry.py", line 237, in build
return self.build_func(*args, **kwargs, registry=self)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmcv\cnn\builder.py", line 27, in build_model_from_cfg
return build_from_cfg(cfg, registry, default_args)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmcv\utils\registry.py", line 72, in build_from_cfg
raise type(e)(f'{obj_cls.__name__}: {e}')
AssertionError: DINOHead: The classification weight for loss and matcher should beexactly the same.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\Van\mmdetection-2.28.0\tools\train.py", line 250, in <module>
main()
File "D:\Van\mmdetection-2.28.0\tools\train.py", line 215, in main
model = build_detector(
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmdet-2.28.0-py3.9.egg\mmdet\models\builder.py", line 58, in build_detector
return DETECTORS.build(
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmcv\utils\registry.py", line 237, in build
return self.build_func(*args, **kwargs, registry=self)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmcv\cnn\builder.py", line 27, in build_model_from_cfg
return build_from_cfg(cfg, registry, default_args)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmcv\utils\registry.py", line 72, in build_from_cfg
raise type(e)(f'{obj_cls.__name__}: {e}')
AssertionError: DINO: DINOHead: The classification weight for loss and matcher should beexactly the same.
loss_cls['loss_weight']:1.0, assigner['cls_cost']['weight']:2.0
Traceback (most recent call last):
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmcv\utils\registry.py", line 69, in build_from_cfg
return obj_cls(**args)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmdet-2.28.0-py3.9.egg\mmdet\models\dense_heads\dino_head.py", line 19, in __init__
super(DINOHead, self).__init__(*args, **kwargs)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmdet-2.28.0-py3.9.egg\mmdet\models\dense_heads\deformable_detr_head.py", line 47, in __init__
super(DeformableDETRHead, self).__init__(
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmdet-2.28.0-py3.9.egg\mmdet\models\dense_heads\detr_head.py", line 111, in __init__
assert loss_cls['loss_weight'] == assigner['cls_cost']['weight'], \
AssertionError: The classification weight for loss and matcher should beexactly the same.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmcv\utils\registry.py", line 69, in build_from_cfg
return obj_cls(**args)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmdet-2.28.0-py3.9.egg\mmdet\models\detectors\dino.py", line 10, in __init__
super(DETR, self).__init__(*args, **kwargs)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmdet-2.28.0-py3.9.egg\mmdet\models\detectors\single_stage.py", line 37, in __init__
self.bbox_head = build_head(bbox_head)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmdet-2.28.0-py3.9.egg\mmdet\models\builder.py", line 40, in build_head
return HEADS.build(cfg)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmcv\utils\registry.py", line 237, in build
return self.build_func(*args, **kwargs, registry=self)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmcv\cnn\builder.py", line 27, in build_model_from_cfg
return build_from_cfg(cfg, registry, default_args)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmcv\utils\registry.py", line 72, in build_from_cfg
raise type(e)(f'{obj_cls.__name__}: {e}')
AssertionError: DINOHead: The classification weight for loss and matcher should beexactly the same.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\Van\mmdetection-2.28.0\tools\train.py", line 250, in <module>
main()
File "D:\Van\mmdetection-2.28.0\tools\train.py", line 215, in main
model = build_detector(
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmdet-2.28.0-py3.9.egg\mmdet\models\builder.py", line 58, in build_detector
return DETECTORS.build(
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmcv\utils\registry.py", line 237, in build
return self.build_func(*args, **kwargs, registry=self)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmcv\cnn\builder.py", line 27, in build_model_from_cfg
return build_from_cfg(cfg, registry, default_args)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\mmcv\utils\registry.py", line 72, in build_from_cfg
raise type(e)(f'{obj_cls.__name__}: {e}')
AssertionError: DINO: DINOHead: The classification weight for loss and matcher should beexactly the same.
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2688) of binary: D:\Van\mmdetection-2.28.0\.venv\Scripts\python.exe
Traceback (most recent call last):
File "C:\Users\Lab516\AppData\Local\Programs\Python\Python39\lib\runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "C:\Users\Lab516\AppData\Local\Programs\Python\Python39\lib\runpy.py", line 87, in _run_code
exec(code, run_globals)
File "D:\Van\mmdetection-2.28.0\.venv\Scripts\torchrun.exe\__main__.py", line 7, in <module>
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\torch\distributed\elastic\multiprocessing\errors\__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\torch\distributed\run.py", line 724, in main
run(args)
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\torch\distributed\run.py", line 715, in run
elastic_launch(
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\torch\distributed\launcher\api.py", line 131, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "D:\Van\mmdetection-2.28.0\.venv\lib\site-packages\torch\distributed\launcher\api.py", line 245, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
.\tools\train.py FAILED
------------------------------------------------------------
Failures:
[1]:
time : 2023-08-10_16:56:44
host : Lab516
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 8356)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2023-08-10_16:56:44
host : Lab516
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 2688)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
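The printed values show the mismatch the assertion checks: loss_cls['loss_weight'] is 1.0 in my config while train_cfg.assigner.cls_cost.weight is 2.0, and detr_head.py requires them to be identical (the same holds for the bbox and IoU terms). A minimal sketch of the relationship the assertion expects, using typical mmdet DETR-family config keys (the concrete values depend on which DINO config is being reproduced and are only illustrative here):

# Hypothetical config fragment: each loss weight must equal the corresponding
# matcher cost weight, otherwise DETRHead.__init__ raises this AssertionError.
bbox_head = dict(
    type='DINOHead',
    loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25,
                  loss_weight=2.0),               # == assigner.cls_cost.weight
    loss_bbox=dict(type='L1Loss', loss_weight=5.0),   # == assigner.reg_cost.weight
    loss_iou=dict(type='GIoULoss', loss_weight=2.0))  # == assigner.iou_cost.weight
train_cfg = dict(
    assigner=dict(
        type='HungarianAssigner',
        cls_cost=dict(type='FocalLossCost', weight=2.0),
        reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
        iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0)))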
This error occurred in detr_head.py; the detailed code is shown below:
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import Conv2d, Linear, build_activation_layer
from mmcv.cnn.bricks.transformer import FFN, build_positional_encoding
from mmcv.runner import force_fp32
from mmdet.core import (bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh,
build_assigner, build_sampler, multi_apply,
reduce_mean)
from mmdet.models.utils import build_transformer
from ..builder import HEADS, build_loss
from .anchor_free_head import AnchorFreeHead
@HEADS.register_module()
class DETRHead(AnchorFreeHead):
"""Implements the DETR transformer head.
See `paper: End-to-End Object Detection with Transformers
<https://arxiv.org/pdf/2005.12872>`_ for details.
Args:
num_classes (int): Number of categories excluding the background.
in_channels (int): Number of channels in the input feature map.
num_query (int): Number of query in Transformer.
num_reg_fcs (int, optional): Number of fully-connected layers used in
`FFN`, which is then used for the regression head. Default 2.
transformer (obj:`mmcv.ConfigDict`|dict): Config for transformer.
Default: None.
sync_cls_avg_factor (bool): Whether to sync the avg_factor of
all ranks. Default to False.
positional_encoding (obj:`mmcv.ConfigDict`|dict):
Config for position encoding.
loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the
classification loss. Default `CrossEntropyLoss`.
loss_bbox (obj:`mmcv.ConfigDict`|dict): Config of the
regression loss. Default `L1Loss`.
loss_iou (obj:`mmcv.ConfigDict`|dict): Config of the
regression iou loss. Default `GIoULoss`.
tran_cfg (obj:`mmcv.ConfigDict`|dict): Training config of
transformer head.
test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of
transformer head.
init_cfg (dict or list[dict], optional): Initialization config dict.
Default: None
"""
_version = 2
def __init__(self,
num_classes,
in_channels,
num_query=100,
num_reg_fcs=2,
transformer=None,
sync_cls_avg_factor=False,
positional_encoding=dict(
type='SinePositionalEncoding',
num_feats=128,
normalize=True),
loss_cls=dict(
type='CrossEntropyLoss',
bg_cls_weight=0.1,
use_sigmoid=False,
loss_weight=1.0,
class_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=5.0),
loss_iou=dict(type='GIoULoss', loss_weight=2.0),
train_cfg=dict(
assigner=dict(
type='HungarianAssigner',
cls_cost=dict(type='ClassificationCost', weight=1.),
reg_cost=dict(type='BBoxL1Cost', weight=5.0),
iou_cost=dict(
type='IoUCost', iou_mode='giou', weight=2.0))),
test_cfg=dict(max_per_img=100),
init_cfg=None,
**kwargs):
# NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
# since it brings inconvenience when the initialization of
# `AnchorFreeHead` is called.
super(AnchorFreeHead, self).__init__(init_cfg)
self.bg_cls_weight = 0
self.sync_cls_avg_factor = sync_cls_avg_factor
class_weight = loss_cls.get('class_weight', None)
if class_weight is not None and (self.__class__ is DETRHead):
assert isinstance(class_weight, float), 'Expected ' \
'class_weight to have type float. Found ' \
f'{type(class_weight)}.'
# NOTE following the official DETR rep0, bg_cls_weight means
# relative classification weight of the no-object class.
bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight)
assert isinstance(bg_cls_weight, float), 'Expected ' \
'bg_cls_weight to have type float. Found ' \
f'{type(bg_cls_weight)}.'
class_weight = torch.ones(num_classes + 1) * class_weight
# set background class as the last indice
class_weight[num_classes] = bg_cls_weight
loss_cls.update({'class_weight': class_weight})
if 'bg_cls_weight' in loss_cls:
loss_cls.pop('bg_cls_weight')
self.bg_cls_weight = bg_cls_weight
if train_cfg:
assert 'assigner' in train_cfg, 'assigner should be provided '\
'when train_cfg is set.'
assigner = train_cfg['assigner']
print(F"loss_cls['loss_weight']:{loss_cls['loss_weight']}, assigner['cls_cost']['weight']:{assigner['cls_cost']['weight']}")
assert loss_cls['loss_weight'] == assigner['cls_cost']['weight'], \
'The classification weight for loss and matcher should be' \
'exactly the same.'
assert loss_bbox['loss_weight'] == assigner['reg_cost'][
'weight'], 'The regression L1 weight for loss and matcher ' \
'should be exactly the same.'
assert loss_iou['loss_weight'] == assigner['iou_cost']['weight'], \
'The regression iou weight for loss and matcher should be' \
'exactly the same.'
self.assigner = build_assigner(assigner)
# DETR sampling=False, so use PseudoSampler
sampler_cfg = dict(type='PseudoSampler')
self.sampler = build_sampler(sampler_cfg, context=self)
self.num_query = num_query
self.num_classes = num_classes
self.in_channels = in_channels
self.num_reg_fcs = num_reg_fcs
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.fp16_enabled = False
self.loss_cls = build_loss(loss_cls)
self.loss_bbox = build_loss(loss_bbox)
self.loss_iou = build_loss(loss_iou)
if self.loss_cls.use_sigmoid:
self.cls_out_channels = num_classes
else:
self.cls_out_channels = num_classes + 1
self.act_cfg = transformer.get('act_cfg',
dict(type='ReLU', inplace=True))
self.activate = build_activation_layer(self.act_cfg)
self.positional_encoding = build_positional_encoding(
positional_encoding)
self.transformer = build_transformer(transformer)
self.embed_dims = self.transformer.embed_dims
assert 'num_feats' in positional_encoding
num_feats = positional_encoding['num_feats']
assert num_feats * 2 == self.embed_dims, 'embed_dims should' \
f' be exactly 2 times of num_feats. Found {self.embed_dims}' \
f' and {num_feats}.'
self._init_layers()
def _init_layers(self):
"""Initialize layers of the transformer head."""
self.input_proj = Conv2d(
self.in_channels, self.embed_dims, kernel_size=1)
self.fc_cls = Linear(self.embed_dims, self.cls_out_channels)
self.reg_ffn = FFN(
self.embed_dims,
self.embed_dims,
self.num_reg_fcs,
self.act_cfg,
dropout=0.0,
add_residual=False)
self.fc_reg = Linear(self.embed_dims, 4)
self.query_embedding = nn.Embedding(self.num_query, self.embed_dims)
def init_weights(self):
"""Initialize weights of the transformer head."""
# The initialization for transformer is important
self.transformer.init_weights()
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs):
"""load checkpoints."""
# NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
# since `AnchorFreeHead._load_from_state_dict` should not be
# called here. Invoking the default `Module._load_from_state_dict`
# is enough.
# Names of some parameters in has been changed.
version = local_metadata.get('version', None)
if (version is None or version < 2) and self.__class__ is DETRHead:
convert_dict = {
'.self_attn.': '.attentions.0.',
'.ffn.': '.ffns.0.',
'.multihead_attn.': '.attentions.1.',
'.decoder.norm.': '.decoder.post_norm.'
}
state_dict_keys = list(state_dict.keys())
for k in state_dict_keys:
for ori_key, convert_key in convert_dict.items():
if ori_key in k:
convert_key = k.replace(ori_key, convert_key)
state_dict[convert_key] = state_dict[k]
del state_dict[k]
super(AnchorFreeHead,
self)._load_from_state_dict(state_dict, prefix, local_metadata,
strict, missing_keys,
unexpected_keys, error_msgs)
def forward(self, feats, img_metas):
"""Forward function.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
img_metas (list[dict]): List of image information.
Returns:
tuple[list[Tensor], list[Tensor]]: Outputs for all scale levels.
- all_cls_scores_list (list[Tensor]): Classification scores \
for each scale level. Each is a 4D-tensor with shape \
[nb_dec, bs, num_query, cls_out_channels]. Note \
`cls_out_channels` should includes background.
- all_bbox_preds_list (list[Tensor]): Sigmoid regression \
outputs for each scale level. Each is a 4D-tensor with \
normalized coordinate format (cx, cy, w, h) and shape \
[nb_dec, bs, num_query, 4].
"""
num_levels = len(feats)
img_metas_list = [img_metas for _ in range(num_levels)]
return multi_apply(self.forward_single, feats, img_metas_list)
def forward_single(self, x, img_metas):
""""Forward function for a single feature level.
Args:
x (Tensor): Input feature from backbone's single stage, shape
[bs, c, h, w].
img_metas (list[dict]): List of image information.
Returns:
all_cls_scores (Tensor): Outputs from the classification head,
shape [nb_dec, bs, num_query, cls_out_channels]. Note
cls_out_channels should includes background.
all_bbox_preds (Tensor): Sigmoid outputs from the regression
head with normalized coordinate format (cx, cy, w, h).
Shape [nb_dec, bs, num_query, 4].
"""
# construct binary masks which used for the transformer.
# NOTE following the official DETR repo, non-zero values representing
# ignored positions, while zero values means valid positions.
batch_size = x.size(0)
input_img_h, input_img_w = img_metas[0]['batch_input_shape']
masks = x.new_ones((batch_size, input_img_h, input_img_w))
for img_id in range(batch_size):
img_h, img_w, _ = img_metas[img_id]['img_shape']
masks[img_id, :img_h, :img_w] = 0
x = self.input_proj(x)
# interpolate masks to have the same spatial shape with x
masks = F.interpolate(
masks.unsqueeze(1), size=x.shape[-2:]).to(torch.bool).squeeze(1)
# position encoding
pos_embed = self.positional_encoding(masks) # [bs, embed_dim, h, w]
# outs_dec: [nb_dec, bs, num_query, embed_dim]
outs_dec, _ = self.transformer(x, masks, self.query_embedding.weight,
pos_embed)
all_cls_scores = self.fc_cls(outs_dec)
all_bbox_preds = self.fc_reg(self.activate(
self.reg_ffn(outs_dec))).sigmoid()
return all_cls_scores, all_bbox_preds
@force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
def loss(self,
all_cls_scores_list,
all_bbox_preds_list,
gt_bboxes_list,
gt_labels_list,
img_metas,
gt_bboxes_ignore=None):
""""Loss function.
Only outputs from the last feature level are used for computing
losses by default.
Args:
all_cls_scores_list (list[Tensor]): Classification outputs
for each feature level. Each is a 4D-tensor with shape
[nb_dec, bs, num_query, cls_out_channels].
all_bbox_preds_list (list[Tensor]): Sigmoid regression
outputs for each feature level. Each is a 4D-tensor with
normalized coordinate format (cx, cy, w, h) and shape
[nb_dec, bs, num_query, 4].
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels_list (list[Tensor]): Ground truth class indices for each
image with shape (num_gts, ).
img_metas (list[dict]): List of image meta information.
gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
which can be ignored for each image. Default None.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
# NOTE by default only the outputs from the last feature scale are used.
all_cls_scores = all_cls_scores_list[-1]
all_bbox_preds = all_bbox_preds_list[-1]
assert gt_bboxes_ignore is None, \
'Only supports for gt_bboxes_ignore setting to None.'
num_dec_layers = len(all_cls_scores)
all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
all_gt_bboxes_ignore_list = [
gt_bboxes_ignore for _ in range(num_dec_layers)
]
img_metas_list = [img_metas for _ in range(num_dec_layers)]
losses_cls, losses_bbox, losses_iou = multi_apply(
self.loss_single, all_cls_scores, all_bbox_preds,
all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
all_gt_bboxes_ignore_list)
loss_dict = dict()
# loss from the last decoder layer
loss_dict['loss_cls'] = losses_cls[-1]
loss_dict['loss_bbox'] = losses_bbox[-1]
loss_dict['loss_iou'] = losses_iou[-1]
# loss from other decoder layers
num_dec_layer = 0
for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],
losses_bbox[:-1],
losses_iou[:-1]):
loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
num_dec_layer += 1
return loss_dict
def loss_single(self,
cls_scores,
bbox_preds,
gt_bboxes_list,
gt_labels_list,
img_metas,
gt_bboxes_ignore_list=None):
""""Loss function for outputs from a single decoder layer of a single
feature level.
Args:
cls_scores (Tensor): Box score logits from a single decoder layer
for all images. Shape [bs, num_query, cls_out_channels].
bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
for all images, with normalized coordinate (cx, cy, w, h) and
shape [bs, num_query, 4].
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels_list (list[Tensor]): Ground truth class indices for each
image with shape (num_gts, ).
img_metas (list[dict]): List of image meta information.
gt_bboxes_ignore_list (list[Tensor], optional): Bounding
boxes which can be ignored for each image. Default None.
Returns:
dict[str, Tensor]: A dictionary of loss components for outputs from
a single decoder layer.
"""
num_imgs = cls_scores.size(0)
cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
gt_bboxes_list, gt_labels_list,
img_metas, gt_bboxes_ignore_list)
(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
num_total_pos, num_total_neg) = cls_reg_targets
labels = torch.cat(labels_list, 0)
label_weights = torch.cat(label_weights_list, 0)
bbox_targets = torch.cat(bbox_targets_list, 0)
bbox_weights = torch.cat(bbox_weights_list, 0)
# classification loss
cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
# construct weighted avg_factor to match with the official DETR repo
cls_avg_factor = num_total_pos * 1.0 + \
num_total_neg * self.bg_cls_weight
if self.sync_cls_avg_factor:
cls_avg_factor = reduce_mean(
cls_scores.new_tensor([cls_avg_factor]))
cls_avg_factor = max(cls_avg_factor, 1)
loss_cls = self.loss_cls(
cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
# Compute the average number of gt boxes across all gpus, for
# normalization purposes
num_total_pos = loss_cls.new_tensor([num_total_pos])
num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()
# construct factors used for rescale bboxes
factors = []
for img_meta, bbox_pred in zip(img_metas, bbox_preds):
img_h, img_w, _ = img_meta['img_shape']
factor = bbox_pred.new_tensor([img_w, img_h, img_w,
img_h]).unsqueeze(0).repeat(
bbox_pred.size(0), 1)
factors.append(factor)
factors = torch.cat(factors, 0)
# DETR regresses the relative position of boxes (cxcywh) in the image,
# thus the learning target is normalized by the image size. So here
# we need to re-scale them for calculating the IoU loss
bbox_preds = bbox_preds.reshape(-1, 4)
bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors
# regression IoU loss, GIoU loss by default
loss_iou = self.loss_iou(
bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)
# regression L1 loss
loss_bbox = self.loss_bbox(
bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
return loss_cls, loss_bbox, loss_iou
def get_targets(self,
cls_scores_list,
bbox_preds_list,
gt_bboxes_list,
gt_labels_list,
img_metas,
gt_bboxes_ignore_list=None):
""""Compute regression and classification targets for a batch image.
Outputs from a single decoder layer of a single feature level are used.
Args:
cls_scores_list (list[Tensor]): Box score logits from a single
decoder layer for each image with shape [num_query,
cls_out_channels].
bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
decoder layer for each image, with normalized coordinate
(cx, cy, w, h) and shape [num_query, 4].
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels_list (list[Tensor]): Ground truth class indices for each
image with shape (num_gts, ).
img_metas (list[dict]): List of image meta information.
gt_bboxes_ignore_list (list[Tensor], optional): Bounding
boxes which can be ignored for each image. Default None.
Returns:
tuple: a tuple containing the following targets.
- labels_list (list[Tensor]): Labels for all images.
- label_weights_list (list[Tensor]): Label weights for all \
images.
- bbox_targets_list (list[Tensor]): BBox targets for all \
images.
- bbox_weights_list (list[Tensor]): BBox weights for all \
images.
- num_total_pos (int): Number of positive samples in all \
images.
- num_total_neg (int): Number of negative samples in all \
images.
"""
assert gt_bboxes_ignore_list is None, \
'Only supports for gt_bboxes_ignore setting to None.'
num_imgs = len(cls_scores_list)
gt_bboxes_ignore_list = [
gt_bboxes_ignore_list for _ in range(num_imgs)
]
(labels_list, label_weights_list, bbox_targets_list,
bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply(
self._get_target_single, cls_scores_list, bbox_preds_list,
gt_bboxes_list, gt_labels_list, img_metas, gt_bboxes_ignore_list)
num_total_pos = sum((inds.numel() for inds in pos_inds_list))
num_total_neg = sum((inds.numel() for inds in neg_inds_list))
return (labels_list, label_weights_list, bbox_targets_list,
bbox_weights_list, num_total_pos, num_total_neg)
def _get_target_single(self,
cls_score,
bbox_pred,
gt_bboxes,
gt_labels,
img_meta,
gt_bboxes_ignore=None):
""""Compute regression and classification targets for one image.
Outputs from a single decoder layer of a single feature level are used.
Args:
cls_score (Tensor): Box score logits from a single decoder layer
for one image. Shape [num_query, cls_out_channels].
bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
for one image, with normalized coordinate (cx, cy, w, h) and
shape [num_query, 4].
gt_bboxes (Tensor): Ground truth bboxes for one image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (Tensor): Ground truth class indices for one image
with shape (num_gts, ).
img_meta (dict): Meta information for one image.
gt_bboxes_ignore (Tensor, optional): Bounding boxes
which can be ignored. Default None.
Returns:
tuple[Tensor]: a tuple containing the following for one image.
- labels (Tensor): Labels of each image.
- label_weights (Tensor]): Label weights of each image.
- bbox_targets (Tensor): BBox targets of each image.
- bbox_weights (Tensor): BBox weights of each image.
- pos_inds (Tensor): Sampled positive indices for each image.
- neg_inds (Tensor): Sampled negative indices for each image.
"""
num_bboxes = bbox_pred.size(0)
# assigner and sampler
assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes,
gt_labels, img_meta,
gt_bboxes_ignore)
sampling_result = self.sampler.sample(assign_result, bbox_pred,
gt_bboxes)
pos_inds = sampling_result.pos_inds
neg_inds = sampling_result.neg_inds
# label targets
labels = gt_bboxes.new_full((num_bboxes, ),
self.num_classes,
dtype=torch.long)
labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
label_weights = gt_bboxes.new_ones(num_bboxes)
# bbox targets
bbox_targets = torch.zeros_like(bbox_pred)
bbox_weights = torch.zeros_like(bbox_pred)
bbox_weights[pos_inds] = 1.0
img_h, img_w, _ = img_meta['img_shape']
# DETR regresses the relative position of boxes (cxcywh) in the image.
# Thus the learning target should be normalized by the image size, and
# the box format should be converted from the default x1y1x2y2 to cxcywh.
factor = bbox_pred.new_tensor([img_w, img_h, img_w,
img_h]).unsqueeze(0)
pos_gt_bboxes_normalized = sampling_result.pos_gt_bboxes / factor
pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized)
bbox_targets[pos_inds] = pos_gt_bboxes_targets
return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
neg_inds)
# over-write because img_metas are needed as inputs for bbox_head.
def forward_train(self,
x,
img_metas,
gt_bboxes,
gt_labels=None,
gt_bboxes_ignore=None,
proposal_cfg=None,
**kwargs):
"""Forward function for training mode.
Args:
x (list[Tensor]): Features from backbone.
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes (Tensor): Ground truth bboxes of the image,
shape (num_gts, 4).
gt_labels (Tensor): Ground truth labels of each box,
shape (num_gts,).
gt_bboxes_ignore (Tensor): Ground truth bboxes to be
ignored, shape (num_ignored_gts, 4).
proposal_cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
assert proposal_cfg is None, '"proposal_cfg" must be None'
outs = self(x, img_metas)
if gt_labels is None:
loss_inputs = outs + (gt_bboxes, img_metas)
else:
loss_inputs = outs + (gt_bboxes, gt_labels, img_metas)
losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
return losses
@force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
def get_bboxes(self,
all_cls_scores_list,
all_bbox_preds_list,
img_metas,
rescale=False):
"""Transform network outputs for a batch into bbox predictions.
Args:
all_cls_scores_list (list[Tensor]): Classification outputs
for each feature level. Each is a 4D-tensor with shape
[nb_dec, bs, num_query, cls_out_channels].
all_bbox_preds_list (list[Tensor]): Sigmoid regression
outputs for each feature level. Each is a 4D-tensor with
normalized coordinate format (cx, cy, w, h) and shape
[nb_dec, bs, num_query, 4].
img_metas (list[dict]): Meta information of each image.
rescale (bool, optional): If True, return boxes in original
image space. Default False.
Returns:
list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple. \
The first item is an (n, 5) tensor, where the first 4 columns \
are bounding box positions (tl_x, tl_y, br_x, br_y) and the \
5-th column is a score between 0 and 1. The second item is a \
(n,) tensor where each item is the predicted class label of \
the corresponding box.
"""
# NOTE by default only the outputs from the last feature level are used,
# and only the outputs from the last decoder layer are used.
cls_scores = all_cls_scores_list[-1][-1]
bbox_preds = all_bbox_preds_list[-1][-1]
result_list = []
for img_id in range(len(img_metas)):
cls_score = cls_scores[img_id]
bbox_pred = bbox_preds[img_id]
img_shape = img_metas[img_id]['img_shape']
scale_factor = img_metas[img_id]['scale_factor']
proposals = self._get_bboxes_single(cls_score, bbox_pred,
img_shape, scale_factor,
rescale)
result_list.append(proposals)
return result_list
def _get_bboxes_single(self,
cls_score,
bbox_pred,
img_shape,
scale_factor,
rescale=False):
"""Transform outputs from the last decoder layer into bbox predictions
for each image.
Args:
cls_score (Tensor): Box score logits from the last decoder layer
for each image. Shape [num_query, cls_out_channels].
bbox_pred (Tensor): Sigmoid outputs from the last decoder layer
for each image, with coordinate format (cx, cy, w, h) and
shape [num_query, 4].
img_shape (tuple[int]): Shape of input image, (height, width, 3).
scale_factor (ndarray, optional): Scale factor of the image arranged
as (w_scale, h_scale, w_scale, h_scale).
rescale (bool, optional): If True, return boxes in original image
space. Default False.
Returns:
tuple[Tensor]: Results of detected bboxes and labels.
- det_bboxes: Predicted bboxes with shape [num_query, 5], \
where the first 4 columns are bounding box positions \
(tl_x, tl_y, br_x, br_y) and the 5-th column are scores \
between 0 and 1.
- det_labels: Predicted labels of the corresponding box with \
shape [num_query].
"""
assert len(cls_score) == len(bbox_pred)
max_per_img = self.test_cfg.get('max_per_img', self.num_query)
# exclude background
if self.loss_cls.use_sigmoid:
cls_score = cls_score.sigmoid()
scores, indexes = cls_score.view(-1).topk(max_per_img)
det_labels = indexes % self.num_classes
bbox_index = indexes // self.num_classes
bbox_pred = bbox_pred[bbox_index]
else:
scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1)
scores, bbox_index = scores.topk(max_per_img)
bbox_pred = bbox_pred[bbox_index]
det_labels = det_labels[bbox_index]
det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred)
det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1]
det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0]
det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1])
det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0])
if rescale:
det_bboxes /= det_bboxes.new_tensor(scale_factor)
det_bboxes = torch.cat((det_bboxes, scores.unsqueeze(1)), -1)
return det_bboxes, det_labels
def simple_test_bboxes(self, feats, img_metas, rescale=False):
"""Test det bboxes without test-time augmentation.
Args:
feats (tuple[torch.Tensor]): Multi-level features from the
upstream network, each is a 4D-tensor.
img_metas (list[dict]): List of image information.
rescale (bool, optional): Whether to rescale the results.
Defaults to False.
Returns:
list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
The first item is ``bboxes`` with shape (n, 5),
where 5 represent (tl_x, tl_y, br_x, br_y, score).
The shape of the second tensor in the tuple is ``labels``
with shape (n,)
"""
# forward of this head requires img_metas
outs = self.forward(feats, img_metas)
results_list = self.get_bboxes(*outs, img_metas, rescale=rescale)
return results_list
def forward_onnx(self, feats, img_metas):
"""Forward function for exporting to ONNX.
Over-write `forward` because: `masks` is directly created with
zero (valid position tag) and has the same spatial size as `x`.
Thus the construction of `masks` is different from that in `forward`.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
img_metas (list[dict]): List of image information.
Returns:
tuple[list[Tensor], list[Tensor]]: Outputs for all scale levels.
- all_cls_scores_list (list[Tensor]): Classification scores \
for each scale level. Each is a 4D-tensor with shape \
[nb_dec, bs, num_query, cls_out_channels]. Note \
`cls_out_channels` should include background.
- all_bbox_preds_list (list[Tensor]): Sigmoid regression \
outputs for each scale level. Each is a 4D-tensor with \
normalized coordinate format (cx, cy, w, h) and shape \
[nb_dec, bs, num_query, 4].
"""
num_levels = len(feats)
img_metas_list = [img_metas for _ in range(num_levels)]
return multi_apply(self.forward_single_onnx, feats, img_metas_list)
def forward_single_onnx(self, x, img_metas):
""""Forward function for a single feature level with ONNX exportation.
Args:
x (Tensor): Input feature from backbone's single stage, shape
[bs, c, h, w].
img_metas (list[dict]): List of image information.
Returns:
all_cls_scores (Tensor): Outputs from the classification head,
shape [nb_dec, bs, num_query, cls_out_channels]. Note
cls_out_channels should include background.
all_bbox_preds (Tensor): Sigmoid outputs from the regression
head with normalized coordinate format (cx, cy, w, h).
Shape [nb_dec, bs, num_query, 4].
"""
# Note `img_shape` is not dynamically traceable to ONNX,
# since the related augmentation was done with numpy under
# CPU. Thus `masks` is directly created with zeros (valid tag)
# and the same spatial shape as `x`.
# The difference between torch and exported ONNX model may be
# ignored, since the same performance is achieved (e.g.
# 40.1 vs 40.1 for DETR)
batch_size = x.size(0)
h, w = x.size()[-2:]
masks = x.new_zeros((batch_size, h, w)) # [B,h,w]
x = self.input_proj(x)
# interpolate masks to have the same spatial shape with x
masks = F.interpolate(
masks.unsqueeze(1), size=x.shape[-2:]).to(torch.bool).squeeze(1)
pos_embed = self.positional_encoding(masks)
outs_dec, _ = self.transformer(x, masks, self.query_embedding.weight,
pos_embed)
all_cls_scores = self.fc_cls(outs_dec)
all_bbox_preds = self.fc_reg(self.activate(
self.reg_ffn(outs_dec))).sigmoid()
return all_cls_scores, all_bbox_preds
def onnx_export(self, all_cls_scores_list, all_bbox_preds_list, img_metas):
"""Transform network outputs into bbox predictions, with ONNX
exportation.
Args:
all_cls_scores_list (list[Tensor]): Classification outputs
for each feature level. Each is a 4D-tensor with shape
[nb_dec, bs, num_query, cls_out_channels].
all_bbox_preds_list (list[Tensor]): Sigmoid regression
outputs for each feature level. Each is a 4D-tensor with
normalized coordinate format (cx, cy, w, h) and shape
[nb_dec, bs, num_query, 4].
img_metas (list[dict]): Meta information of each image.
Returns:
tuple[Tensor, Tensor]: dets of shape [N, num_det, 5]
and class labels of shape [N, num_det].
"""
assert len(img_metas) == 1, \
'Only support one input image while in exporting to ONNX'
cls_scores = all_cls_scores_list[-1][-1]
bbox_preds = all_bbox_preds_list[-1][-1]
# Note `img_shape` is not dynamically traceable to ONNX,
# here `img_shape_for_onnx` (padded shape of image tensor)
# is used.
img_shape = img_metas[0]['img_shape_for_onnx']
max_per_img = self.test_cfg.get('max_per_img', self.num_query)
batch_size = cls_scores.size(0)
# `batch_index_offset` is used for the gather of concatenated tensor
batch_index_offset = torch.arange(batch_size).to(
cls_scores.device) * max_per_img
batch_index_offset = batch_index_offset.unsqueeze(1).expand(
batch_size, max_per_img)
# supports dynamic batch inference
if self.loss_cls.use_sigmoid:
cls_scores = cls_scores.sigmoid()
scores, indexes = cls_scores.view(batch_size, -1).topk(
max_per_img, dim=1)
det_labels = indexes % self.num_classes
bbox_index = indexes // self.num_classes
bbox_index = (bbox_index + batch_index_offset).view(-1)
bbox_preds = bbox_preds.view(-1, 4)[bbox_index]
bbox_preds = bbox_preds.view(batch_size, -1, 4)
else:
scores, det_labels = F.softmax(
cls_scores, dim=-1)[..., :-1].max(-1)
scores, bbox_index = scores.topk(max_per_img, dim=1)
bbox_index = (bbox_index + batch_index_offset).view(-1)
bbox_preds = bbox_preds.view(-1, 4)[bbox_index]
det_labels = det_labels.view(-1)[bbox_index]
bbox_preds = bbox_preds.view(batch_size, -1, 4)
det_labels = det_labels.view(batch_size, -1)
det_bboxes = bbox_cxcywh_to_xyxy(bbox_preds)
# use `img_shape_tensor` for dynamically exporting to ONNX
img_shape_tensor = img_shape.flip(0).repeat(2) # [w,h,w,h]
img_shape_tensor = img_shape_tensor.unsqueeze(0).unsqueeze(0).expand(
batch_size, det_bboxes.size(1), 4)
det_bboxes = det_bboxes * img_shape_tensor
# dynamically clip bboxes
x1, y1, x2, y2 = det_bboxes.split((1, 1, 1, 1), dim=-1)
from mmdet.core.export import dynamic_clip_for_onnx
x1, y1, x2, y2 = dynamic_clip_for_onnx(x1, y1, x2, y2, img_shape)
det_bboxes = torch.cat([x1, y1, x2, y2], dim=-1)
det_bboxes = torch.cat((det_bboxes, scores.unsqueeze(-1)), -1)
return det_bboxes, det_labels
Thanks again for your reply!!
Hi @VanLinLin ! I had the same error with Co-DETR. In my case, it happened because the repository already ships a custom 'mmdet' folder that is meant to override the mmdetection files. The error occurred when I used Anaconda, where the custom files did not override the installed mmdetection source code. At first, I also copied the customized Python files one by one, just like you did, but it did not work. So the fastest fix was to copy the whole custom mmdet folder directly into the environment's mmdet folder, roughly as sketched below. The issue did not occur when I was using a Python virtualenv, because there the custom mmdet did override the mmdetection source code.
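A minimal sketch of that copy step in Python, assuming the customized mmdet folder sits at ./mmdet in your project checkout (the paths and the use of sysconfig are my assumptions, not something taken from the repo):
import os
import shutil
import sysconfig
# site-packages of the currently active conda env / virtualenv
site_packages = sysconfig.get_paths()['purelib']
installed_mmdet = os.path.join(site_packages, 'mmdet')
# customized mmdet shipped with the project checkout (adjust this path to your setup)
custom_mmdet = os.path.abspath('mmdet')
print(f'copying {custom_mmdet} -> {installed_mmdet}')
# dirs_exist_ok needs Python >= 3.8; it merges the trees and overwrites files present in both
shutil.copytree(custom_mmdet, installed_mmdet, dirs_exist_ok=True)
Note that this overwrites the installed package in place, so keep a backup or be ready to reinstall mmdet if you want to undo it.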
@NVigne-cloud Hi, thanks for your reply! Could you share more details about how you copied the whole custom mmdet folder into the environment's mmdet folder? I really appreciate your help!
@VanLinLin Hi! In my case you can see the mmdet folder here: https://github.com/Sense-X/Co-DETR/tree/main/mmdet. Your error comes from an assert in one of the mmdet config/builder source files, and that assert does not appear in the Co-DETR mmdet folder. I'm assuming you are using this InternImage repo: https://github.com/OpenGVLab/InternImage/tree/master/detection. It's rather odd that you hit this error, since the README specifies installing mmdet through pip install.
Thanks @NVigne-cloud ! I have already solved it.
I am getting this error:
AssertionError: DETR: DETRHead: Expected class_weight to have type float. Found <class 'torch.Tensor'>.
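For what it's worth, if I read the mmdet 2.x source correctly, that assertion comes from DETRHead's __init__, which expects loss_cls.class_weight to be a plain float and then builds the per-class weight tensor (including the background weight) itself, so passing a list or tensor as class_weight in the config triggers it. A sketch of the relevant part of the head config, with values roughly matching the stock DETR config from memory, so double-check against your own setup:
# Illustrative loss_cls snippet (not a full config): the key point is that
# class_weight stays a plain float; DETRHead expands it into a per-class tensor itself.
loss_cls = dict(
    type='CrossEntropyLoss',
    bg_cls_weight=0.1,    # relative weight of the no-object (background) class
    use_sigmoid=False,
    loss_weight=1.0,
    class_weight=1.0)     # float, not torch.ones(num_classes + 1) or a list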
Thanks @NVigne-cloud ! I have already solved it.
how did you solve it?
@saadyousuf45 You can just copy the mmdet_custom or mmcv_custom folder into your project's root directory, next to the configs and training scripts; a quick way to check which modules Python actually picks up is sketched below.
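A small sanity-check sketch (my own, not from the repo) to confirm which mmdet build gets imported and whether the custom packages are importable from the current working directory:
import importlib.util
import mmdet
# should point at the env's site-packages copy (or your custom copy, if you overwrote it)
print(mmdet.__version__, mmdet.__file__)
# the custom packages must be importable (working dir or PYTHONPATH) so that their
# register_module() calls actually run and the custom backbones get registered
for name in ('mmdet_custom', 'mmcv_custom'):
    spec = importlib.util.find_spec(name)
    print(name, '->', spec.origin if spec else 'NOT importable')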
Describe the issue
Hi, I'm trying to merge the InternImage model into mmdetection==2.28.1, and I encounter the error:
I found that when I change the weight of cls_cost, this error disappears.
But I don't think this is the correct way, since deformable_detr_r50_16x2_50e_coco.py uses the same train_cfg and it works fine. So I wonder how to fix this problem?
Reproduction
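For reference, this is roughly the train_cfg that deformable_detr_r50_16x2_50e_coco.py uses, written out from memory rather than copied from the repo, so treat the exact values as illustrative:
train_cfg = dict(
    assigner=dict(
        type='HungarianAssigner',
        cls_cost=dict(type='FocalLossCost', weight=2.0),
        reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
        iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0)))
# changing cls_cost.weight only changes the Hungarian matching cost,
# which is why I don't think it is the right fix for the class_weight assertion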
Thanks for anyone's reply!