Closed · rrjia closed this issue 2 years ago
Replying a bit late ~ been grinding hard all day ~ 🙃 Here is what's going on. You read the code very carefully, nice 👍🏻. The reason is that yolov5 already implements the descale and anchor matching in PyTorch, inside the Detect module. When exporting to ONNX, that part is baked into the model and exported along with it. Below is my annotated copy of the relevant yolov5 source; it's a bit long, but you can use it as a reference.
```python
class Detect(nn.Module):
    stride = None  # strides computed during build
    onnx_dynamic = False  # ONNX export parameter

    def __init__(self, nc=80, anchors=(), ch=(), inplace=True):  # detection layer
        super(Detect, self).__init__()
        self.nc = nc  # number of classes, nc=80
        self.no = nc + 5  # number of outputs per anchor
        self.nl = len(anchors)  # number of detection layers; each predicts boxes at a different scale
        self.na = len(anchors[0]) // 2  # number of anchors per grid point
        self.grid = [torch.zeros(1)] * self.nl  # init grid
        a = torch.tensor(anchors).float().view(self.nl, -1, 2)  # (nl,na,2); the 2 is width/height
        # register_buffer: https://blog.csdn.net/weixin_38145317/article/details/104917218
        # Buffers are not updated by gradients and are treated as constants, so
        # self.anchors and self.anchor_grid can be used directly in forward()
        self.register_buffer('anchors', a)  # shape(nl,na,2)
        self.register_buffer('anchor_grid', a.clone().view(self.nl, 1, -1, 1, 1, 2))  # shape(nl,1,na,1,1,2)
        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch)  # output conv (bs,na*no,ny,nx)
        self.inplace = inplace  # use in-place ops (e.g. slice assignment)

    def forward(self, x):
        # x = x.copy()  # for profiling
        z = []  # inference output
        for i in range(self.nl):
            x[i] = self.m[i](x[i])  # conv (bs,na*no,ny,nx)
            bs, _, ny, nx = x[i].shape
            # x(bs,255,20,20) to x(bs,3,20,20,85=80+5) (bs,na,ny,nx,no=nc+5=4+1+nc)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
            # On self.grid:
            # In training mode the grid is never used, so grid[i] should in theory stay
            # torch.zeros(1), and the original check
            # self.grid[i].shape[2:4] != x[i].shape[2:4] would always be True. However,
            # train.py runs a validation pass test.test(...) that calls model.eval(), which
            # sets nn.Module's self.training to False; the eval-mode forward then enters the
            # branch below and mutates self.grid, so when freezing weights later the printed
            # grid[i].shape is no longer (1,). The catch is that the image and grid size the
            # mutated self.grid corresponds to is not necessarily the one we want at freeze
            # time, so the shape check may come out either True or False, which breaks jit
            # tracing (TracerWarning). The fix is to drop the check and always rebuild the
            # grid from the current input size; logically this does not change yolov5's
            # final inference result.
            if not self.training:  # inference
                # if self.grid[i].shape[2:4] != x[i].shape[2:4] or self.onnx_dynamic:
                #     self.grid[i] = self._make_grid(nx, ny).to(x[i].device)
                # update at 20210515
                self.grid[i] = self._make_grid(nx, ny).to(x[i].device)
                # update at 20210515

                y = x[i].sigmoid()  # (bs,na,ny,nx,no=nc+5=4+1+nc)
                # The sigmoid squashes every raw prediction into (0,1), so xywh + conf +
                # cls_prob all start out in (0,1). Per the decode below, xy is the offset of
                # the box center from its grid cell: the 2-vector stored at position (ii,jj)
                # of grid[i] is exactly (ii,jj), the coordinate of that cell's anchor point,
                # and the raw (0,1) sigmoid offset is mapped by *2-0.5 to an effective
                # offset in (-0.5,1.5) around it.
                # Also: when exporting to ONNX, computing GFLOPS raised
                # AttributeError: 'Detect' object has no attribute 'inplace'. Presumably the
                # checkpoint was saved before self.inplace was added to the code; after
                # commenting out the self.inplace branch, GFLOPS can be computed again.
                # update at 20210515
                xy = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy (bs,na,ny,nx,2)
                wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i].view(1, self.na, 1, 1, 2)  # wh (bs,na,ny,nx,2)
                y = torch.cat((xy, wh, y[..., 4:]), -1)  # (bs,na,ny,nx,2+2+1+nc=xy+wh+conf+cls_prob)
                # update at 20210515
                # During ONNX export, self.inplace defaults to False
                # if self.inplace:
                #     y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                #     y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                # else:  # for YOLOv5 on AWS Inferentia https://github.com/ultralytics/yolov5/pull/2953
                #     # compute the predicted box centers and map them back to input-image coordinates
                #     xy = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                #     wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i].view(1, self.na, 1, 1, 2)  # wh
                #     y = torch.cat((xy, wh, y[..., 4:]), -1)
                z.append(y.view(bs, -1, self.no))  # (bs,na*ny*nx,no=2+2+1+nc=xy+wh+conf+cls_prob)

        return x if self.training else (torch.cat(z, 1), x)
        # torch.cat(z, 1): (bs,na*ny*nx*nl,no=2+2+1+nc=xy+wh+conf+cls_prob)

    @staticmethod
    def _make_grid(nx=20, ny=20):
        yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
        return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()
```
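To make the decode concrete, here is a standalone toy sketch of the same math. All numbers are made up for illustration; the stride-8 layer, cell (3, 5), and anchor (10, 13) are assumptions, not values from the thread:

```python
import torch

# Toy setup: one prediction at grid cell (ii=3, jj=5) on the stride-8 feature
# map, with a (w=10, h=13) anchor expressed in input-image pixels.
grid_xy = torch.tensor([3., 5.])          # value stored in grid[i] at (jj=5, ii=3)
stride = 8.
anchor_wh = torch.tensor([10., 13.])      # anchor_grid entry (input-image pixels)

sig = torch.tensor([0.6, 0.4, 0.5, 0.5])  # sigmoid outputs for (tx, ty, tw, th)

# Same formulas as Detect.forward:
xy = (sig[0:2] * 2. - 0.5 + grid_xy) * stride  # box center in input-image pixels
wh = (sig[2:4] * 2) ** 2 * anchor_wh           # box size in input-image pixels
print(xy)  # tensor([29.6000, 42.4000]): (0.6*2-0.5+3)*8 and (0.4*2-0.5+5)*8
print(wh)  # tensor([10., 13.]): (0.5*2)**2 == 1 leaves the anchor unscaled
```

Note the two mappings: `*2 - 0.5` lets the center offset range over (-0.5, 1.5) grid cells, and `(y*2)**2` caps the predicted width/height at 4x the anchor.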
```python
class Model(nn.Module):
    def __init__(self, cfg='yolov5s.yaml', ch=3, nc=None, anchors=None):  # model, input channels, number of classes
        super(Model, self).__init__()
        if isinstance(cfg, dict):
            self.yaml = cfg  # model dict
        else:  # is *.yaml
            import yaml  # for torch hub
            self.yaml_file = Path(cfg).name
            with open(cfg) as f:
                self.yaml = yaml.safe_load(f)  # model dict

        # Define model
        ch = self.yaml['ch'] = self.yaml.get('ch', ch)  # input channels
        if nc and nc != self.yaml['nc']:
            logger.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
            self.yaml['nc'] = nc  # override yaml value
        if anchors:
            logger.info(f'Overriding model.yaml anchors with anchors={anchors}')
            self.yaml['anchors'] = round(anchors)  # override yaml value
        self.model, self.save = parse_model(deepcopy(self.yaml), ch=[ch])  # model, savelist
        self.names = [str(i) for i in range(self.yaml['nc'])]  # default names
        self.inplace = self.yaml.get('inplace', True)
        # logger.info([x.shape for x in self.forward(torch.zeros(1, ch, 64, 64))])

        # Build strides, anchors
        m = self.model[-1]  # Detect()
        if isinstance(m, Detect):
            s = 256  # 2x min stride
            m.inplace = self.inplace
            m.stride = torch.tensor([s / x.shape[-2] for x in self.forward(torch.zeros(1, ch, s, s))])  # forward
            m.anchors /= m.stride.view(-1, 1, 1)
            check_anchor_order(m)
            self.stride = m.stride
            self._initialize_biases()  # only run once
            # logger.info('Strides: %s' % m.stride.tolist())

        # Init weights, biases
        initialize_weights(self)
        self.info()
        logger.info('')

    def forward(self, x, augment=False, profile=False):
        if augment:
            return self.forward_augment(x)  # augmented inference, None
        else:
            return self.forward_once(x, profile)  # single-scale inference, train

    def forward_augment(self, x):
        img_size = x.shape[-2:]  # height, width
        s = [1, 0.83, 0.67]  # scales
        f = [None, 3, None]  # flips (2-ud, 3-lr)
        y = []  # outputs
        for si, fi in zip(s, f):
            xi = scale_img(x.flip(fi) if fi else x, si, gs=int(self.stride.max()))
            yi = self.forward_once(xi)[0]  # forward
            # cv2.imwrite(f'img_{si}.jpg', 255 * xi[0].cpu().numpy().transpose((1, 2, 0))[:, :, ::-1])  # save
            yi = self._descale_pred(yi, fi, si, img_size)
            y.append(yi)
        return torch.cat(y, 1), None  # augmented inference, train

    def forward_once(self, x, profile=False):
        y, dt = [], []  # outputs
        for m in self.model:
            if m.f != -1:  # if not from previous layer
                # Note: when m is Detect, f=[17, 20, 23] (the final detection layer), and x
                # becomes the list of 3 feature maps taken from layers [17, 20, 23]
                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers

            if profile:
                o = thop.profile(m, inputs=(x,), verbose=False)[0] / 1E9 * 2 if thop else 0  # FLOPS
                t = time_synchronized()
                for _ in range(10):
                    _ = m(x)
                dt.append((time_synchronized() - t) * 100)
                if m == self.model[0]:
                    logger.info(f"{'time (ms)':>10s} {'GFLOPS':>10s} {'params':>10s} {'module'}")
                logger.info(f'{dt[-1]:10.2f} {o:10.2f} {m.np:10.0f} {m.type}')

            # Note: at the final Detect layer, the input x is a list of 3 tensors and the
            # output is (torch.cat(z, 1), x). Flattened, that is 4 outputs: the first is the
            # prediction, the rest are intermediate feature maps. Inference with the .pth
            # directly returns a tuple of length 2, but the exported ONNX model has 4
            # outputs; presumably ONNX flattens the nested tuple. If you only name one
            # output during export, output_names=["pred"], the other 3 names are
            # auto-assigned, e.g. %pred, %778, %876, %974
            x = m(x)  # run
            y.append(x if m.i in self.save else None)  # save output

        if profile:
            logger.info('%.1fms total' % sum(dt))
        return x

    def _descale_pred(self, p, flips, scale, img_size):
        # de-scale predictions following augmented inference (inverse operation)
        if self.inplace:
            p[..., :4] /= scale  # de-scale
            if flips == 2:
                p[..., 1] = img_size[0] - p[..., 1]  # de-flip ud
            elif flips == 3:
                p[..., 0] = img_size[1] - p[..., 0]  # de-flip lr
        else:
            x, y, wh = p[..., 0:1] / scale, p[..., 1:2] / scale, p[..., 2:4] / scale  # de-scale
            if flips == 2:
                y = img_size[0] - y  # de-flip ud
            elif flips == 3:
                x = img_size[1] - x  # de-flip lr
            p = torch.cat((x, y, wh, p[..., 4:]), -1)
        return p
```
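To tie the comment in forward_once back to the export itself, here is a minimal export sketch. It assumes `model` is an already-loaded Model from this file with weights restored; the file name, input name 'images', 640x640 size, and opset are my choices for illustration, and yolov5's own export script remains the authoritative version:

```python
import torch

# Assumes `model` is a loaded yolov5 Model in this file, weights restored.
model.eval()  # Detect.forward now returns (torch.cat(z, 1), x); ONNX flattens it to 4 outputs
dummy = torch.zeros(1, 3, 640, 640)  # assumed 640x640 input

torch.onnx.export(
    model, dummy, 'yolov5s.onnx',
    opset_version=11,        # assumption; match what your runtime supports
    input_names=['images'],
    output_names=['pred'],   # only the first output is named here; the other 3
                             # get auto-generated names like %778, %876, %974
)
```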
I would also recommend taking a look at my notes on porting yolov5 to a C++ project, ort_yolov5.zh.md; hopefully it resolves your remaining doubts.
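Before moving to the C++ side, a quick way to sanity-check the exported graph is onnxruntime from Python. This small sketch assumes the standard 640x640, nc=80 export produced above:

```python
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession('yolov5s.onnx')
print([o.name for o in sess.get_outputs()])  # e.g. ['pred', '778', '876', '974']

img = np.zeros((1, 3, 640, 640), dtype=np.float32)  # dummy input
outs = sess.run(None, {sess.get_inputs()[0].name: img})
print(outs[0].shape)  # (1, 25200, 85): 3*(80*80+40*40+20*20) boxes x (4+1+80)
```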
Thanks for replying even after grinding this late. When I read the yolov5 source yesterday I overlooked the `not` in

if not self.training: # inference

and assumed the anchor multiplication was never handled inside the network. Such a detailed, source-annotated answer really clears things up.
(Original question: yolov5 is an anchor-based algorithm, so forward inference should involve anchor computation, yet I could not find any anchor handling in the code. How is it handled?)