Have you read the source code of the paper?
utils.py
import torch
import torch.nn.functional as F
import numpy as np
from scipy import interpolate
class InputPadder:
    """ Pads images such that dimensions are divisible by 8 """
    def __init__(self, dims, mode='sintel'):
        self.ht, self.wd = dims[-2:]
        pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8
        pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8
        if mode == 'sintel':
            self._pad = [pad_wd//2, pad_wd - pad_wd//2, pad_ht//2, pad_ht - pad_ht//2]
        else:
            self._pad = [pad_wd//2, pad_wd - pad_wd//2, 0, pad_ht]

    def pad(self, *inputs):
        return [F.pad(x, self._pad, mode='replicate') for x in inputs]

    def unpad(self, x):
        ht, wd = x.shape[-2:]
        c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]]
        return x[..., c[0]:c[1], c[2]:c[3]]
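# Usage sketch (added note, not part of the original file): pad a pair of
# frames before inference, then crop a result back to the source resolution.
#   padder = InputPadder(image1.shape)           # image1: (N, C, H, W)
#   image1, image2 = padder.pad(image1, image2)  # H, W rounded up to multiples of 8
#   flow = padder.unpad(flow_up)                 # back to the original H x W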
def forward_interpolate(flow):
    flow = flow.detach().cpu().numpy()
    dx, dy = flow[0], flow[1]

    ht, wd = dx.shape
    x0, y0 = np.meshgrid(np.arange(wd), np.arange(ht))

    x1 = x0 + dx
    y1 = y0 + dy

    x1 = x1.reshape(-1)
    y1 = y1.reshape(-1)
    dx = dx.reshape(-1)
    dy = dy.reshape(-1)

    valid = (x1 > 0) & (x1 < wd) & (y1 > 0) & (y1 < ht)
    x1 = x1[valid]
    y1 = y1[valid]
    dx = dx[valid]
    dy = dy[valid]

    flow_x = interpolate.griddata(
        (x1, y1), dx, (x0, y0), method='nearest', fill_value=0)

    flow_y = interpolate.griddata(
        (x1, y1), dy, (x0, y0), method='nearest', fill_value=0)

    flow = np.stack([flow_x, flow_y], axis=0)
    return torch.from_numpy(flow).float()
def bilinear_grid_sample(im, grid, align_corners=False):
    """Given an input and a flow-field grid, computes the output using input
    values and pixel locations from the grid. Only bilinear interpolation is
    supported for sampling the input pixels.

    Args:
        im (torch.Tensor): Input feature map, shape (N, C, H, W)
        grid (torch.Tensor): Point coordinates, shape (N, Hg, Wg, 2)
        align_corners (bool): If set to True, the extrema (-1 and 1) are
            considered as referring to the center points of the input's
            corner pixels. If set to False, they are instead considered as
            referring to the corner points of the input's corner pixels,
            making the sampling more resolution agnostic.

    Returns:
        torch.Tensor: A tensor with sampled points, shape (N, C, Hg, Wg)
    """
    n, c, h, w = im.shape
    gn, gh, gw, _ = grid.shape
    assert n == gn

    x = grid[:, :, :, 0]
    y = grid[:, :, :, 1]

    if align_corners:
        x = ((x + 1) / 2) * (w - 1)
        y = ((y + 1) / 2) * (h - 1)
    else:
        x = ((x + 1) * w - 1) / 2
        y = ((y + 1) * h - 1) / 2

    x = x.view(n, -1)
    y = y.view(n, -1)

    x0 = torch.floor(x).long()
    y0 = torch.floor(y).long()
    x1 = x0 + 1
    y1 = y0 + 1

    wa = ((x1 - x) * (y1 - y)).unsqueeze(1)
    wb = ((x1 - x) * (y - y0)).unsqueeze(1)
    wc = ((x - x0) * (y1 - y)).unsqueeze(1)
    wd = ((x - x0) * (y - y0)).unsqueeze(1)

    # Apply the grid_sample default of zero padding
    im_padded = F.pad(im, pad=[1, 1, 1, 1], mode='constant', value=0)
    padded_h = h + 2
    padded_w = w + 2
    # shift point positions to account for the padding
    x0, x1, y0, y1 = x0 + 1, x1 + 1, y0 + 1, y1 + 1

    # Clip coordinates to the padded image size
    x0 = torch.where(x0 < 0, torch.tensor(0), x0)
    x0 = torch.where(x0 > padded_w - 1, torch.tensor(padded_w - 1), x0)
    x1 = torch.where(x1 < 0, torch.tensor(0), x1)
    x1 = torch.where(x1 > padded_w - 1, torch.tensor(padded_w - 1), x1)
    y0 = torch.where(y0 < 0, torch.tensor(0), y0)
    y0 = torch.where(y0 > padded_h - 1, torch.tensor(padded_h - 1), y0)
    y1 = torch.where(y1 < 0, torch.tensor(0), y1)
    y1 = torch.where(y1 > padded_h - 1, torch.tensor(padded_h - 1), y1)

    im_padded = im_padded.view(n, c, -1)

    x0_y0 = (x0 + y0 * padded_w).unsqueeze(1).expand(-1, c, -1)
    x0_y1 = (x0 + y1 * padded_w).unsqueeze(1).expand(-1, c, -1)
    x1_y0 = (x1 + y0 * padded_w).unsqueeze(1).expand(-1, c, -1)
    x1_y1 = (x1 + y1 * padded_w).unsqueeze(1).expand(-1, c, -1)

    Ia = torch.gather(im_padded, 2, x0_y0)
    Ib = torch.gather(im_padded, 2, x0_y1)
    Ic = torch.gather(im_padded, 2, x1_y0)
    Id = torch.gather(im_padded, 2, x1_y1)

    return (Ia * wa + Ib * wb + Ic * wc + Id * wd).reshape(n, c, gh, gw)
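# Sanity-check sketch (added note): this hand-written sampler is meant to match
# F.grid_sample with zero padding, which lets it stand in for GridSample when
# exporting to ONNX opsets that lack the operator (GridSample arrived in opset 16).
#   im = torch.randn(1, 2, 8, 8)
#   grid = torch.rand(1, 4, 4, 2) * 2 - 1  # normalized coordinates in [-1, 1]
#   ref = F.grid_sample(im, grid, mode='bilinear', padding_mode='zeros', align_corners=True)
#   out = bilinear_grid_sample(im, grid, align_corners=True)
#   assert torch.allclose(ref, out, atol=1e-5)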
def bilinear_sampler(img, coords, mode='bilinear', mask=False):
    """ Wrapper for grid_sample, uses pixel coordinates """
    H, W = img.shape[-2:]
    xgrid, ygrid = coords.split([1, 1], dim=-1)
    xgrid = 2*xgrid/(W-1) - 1
    ygrid = 2*ygrid/(H-1) - 1

    grid = torch.cat([xgrid, ygrid], dim=-1)
    # img = F.grid_sample(img, grid, align_corners=True)
    img = bilinear_grid_sample(img, grid, align_corners=True)

    if mask:
        mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1)
        return img, mask.float()

    return img
def coords_grid(batch, ht, wd, device):
    coords = torch.meshgrid(torch.arange(ht, device=device), torch.arange(wd, device=device))
    coords = torch.stack(coords[::-1], dim=0).float()
    return coords[None].repeat(batch, 1, 1, 1)


def upflow8(flow, mode='bilinear'):
    new_size = (8 * flow.shape[2], 8 * flow.shape[3])
    return 8 * F.interpolate(flow, size=new_size, mode=mode, align_corners=True)
raft.py
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from update import BasicUpdateBlock, SmallUpdateBlock
from extractor import BasicEncoder, SmallEncoder
from corr import CorrBlock, AlternateCorrBlock
from utils.utils import bilinear_sampler, coords_grid, upflow8
try:
    autocast = torch.cuda.amp.autocast
except:
    # dummy autocast for PyTorch < 1.6
    class autocast:
        def __init__(self, enabled):
            pass
        def __enter__(self):
            pass
        def __exit__(self, *args):
            pass
class RAFT(nn.Module):
    def __init__(self, args):
        super(RAFT, self).__init__()
        self.args = args

        if args.small:
            self.hidden_dim = hdim = 96
            self.context_dim = cdim = 64
            args.corr_levels = 4
            args.corr_radius = 3
        else:
            self.hidden_dim = hdim = 128
            self.context_dim = cdim = 128
            args.corr_levels = 4
            args.corr_radius = 4

        if 'dropout' not in self.args:
            self.args.dropout = 0

        if 'alternate_corr' not in self.args:
            self.args.alternate_corr = False

        # feature network, context network, and update block
        if args.small:
            self.fnet = SmallEncoder(output_dim=128, norm_fn='instance', dropout=args.dropout)
            self.cnet = SmallEncoder(output_dim=hdim+cdim, norm_fn='none', dropout=args.dropout)
            self.update_block = SmallUpdateBlock(self.args, hidden_dim=hdim)
        else:
            self.fnet = BasicEncoder(output_dim=256, norm_fn='instance', dropout=args.dropout)
            self.cnet = BasicEncoder(output_dim=hdim+cdim, norm_fn='batch', dropout=args.dropout)
            self.update_block = BasicUpdateBlock(self.args, hidden_dim=hdim)

    def freeze_bn(self):
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()
    def initialize_flow(self, img):
        """ Flow is represented as the difference between two coordinate grids: flow = coords1 - coords0 """
        N, C, H, W = img.shape
        coords0 = coords_grid(N, H//8, W//8, device=img.device)
        coords1 = coords_grid(N, H//8, W//8, device=img.device)

        # optical flow computed as difference: flow = coords1 - coords0
        return coords0, coords1
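    # Added note: coords_grid returns pixel coordinates of shape (N, 2, H/8, W/8)
    # with channel order (x, y); both grids start out identical, so the initial
    # flow coords1 - coords0 is zero everywhere.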
    def upsample_flow(self, flow, mask):
        """ Upsample flow field [H/8, W/8, 2] -> [H, W, 2] using convex combination """
        N, _, H, W = flow.shape
        mask = mask.view(N, 1, 9, 8, 8, H, W)
        mask = torch.softmax(mask, dim=2)

        up_flow = F.unfold(8 * flow, [3, 3], padding=1)
        up_flow = up_flow.view(N, 2, 9, 1, 1, H, W)

        up_flow = torch.sum(mask * up_flow, dim=2)
        up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)
        return up_flow.reshape(N, 2, 8*H, 8*W)
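    # Shape walk-through for upsample_flow (added note): each coarse pixel gets
    # a softmax over its 9 neighbors for each of its 8x8 fine sub-pixels, and
    # the fine flow is the convex combination of the 3x3 neighborhood of 8*flow:
    #   mask:    (N, 64*9, H, W)  -> (N, 1, 9, 8, 8, H, W)
    #   up_flow: unfold(8*flow)   -> (N, 2*9, H*W) -> (N, 2, 9, 1, 1, H, W)
    #   sum over the 9 weights    -> (N, 2, 8, 8, H, W) -> (N, 2, 8H, 8W)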
    # def forward(self, image1, image2, iters=12, flow_init=None, upsample=True, test_mode=False):
    def forward(self, image1, image2):
        """ Estimate optical flow between a pair of frames """
        # NOTE: the signature is fixed to two tensor inputs because ONNX tracing
        # cannot export Python keyword arguments; the remaining options are hardcoded.
        iters = 10
        flow_init = None
        upsample = True
        test_mode = True

        image1 = 2 * (image1 / 255.0) - 1.0
        image2 = 2 * (image2 / 255.0) - 1.0

        image1 = image1.contiguous()
        image2 = image2.contiguous()

        hdim = self.hidden_dim
        cdim = self.context_dim

        # run the feature network
        with autocast(enabled=self.args.mixed_precision):
            fmap1, fmap2 = self.fnet([image1, image2])

        fmap1 = fmap1.float()
        fmap2 = fmap2.float()
        if self.args.alternate_corr:
            corr_fn = AlternateCorrBlock(fmap1, fmap2, radius=self.args.corr_radius)
        else:
            corr_fn = CorrBlock(fmap1, fmap2, radius=self.args.corr_radius)

        # run the context network
        with autocast(enabled=self.args.mixed_precision):
            cnet = self.cnet(image1)
            net, inp = torch.split(cnet, [hdim, cdim], dim=1)
            net = torch.tanh(net)
            inp = torch.relu(inp)

        coords0, coords1 = self.initialize_flow(image1)

        if flow_init is not None:
            coords1 = coords1 + flow_init

        flow_predictions = []
        for itr in range(iters):
            coords1 = coords1.detach()
            corr = corr_fn(coords1)  # index correlation volume

            flow = coords1 - coords0
            with autocast(enabled=self.args.mixed_precision):
                net, up_mask, delta_flow = self.update_block(net, inp, corr, flow)

            # F(t+1) = F(t) + \Delta(t)
            coords1 = coords1 + delta_flow

            # upsample predictions
            if up_mask is None:
                flow_up = upflow8(coords1 - coords0)
            else:
                flow_up = self.upsample_flow(coords1 - coords0, up_mask)

            flow_predictions.append(flow_up)

        # if test_mode:
        #     return coords1 - coords0, flow_up
        return coords1 - coords0, flow_up
        # return flow_predictions
demo.py
import sys
sys.path.append('core')

import argparse
import os
import cv2
import glob
import numpy as np
import torch
from PIL import Image

from core.raft import RAFT
from core.utils import flow_viz
from core.utils.utils import InputPadder

DEVICE = 'cuda'

def load_image(imfile):
    img = np.array(Image.open(imfile)).astype(np.uint8)
    img = torch.from_numpy(img).permute(2, 0, 1).float()
    return img[None].to(DEVICE)
def viz(img, flo):
    img = img[0].permute(1, 2, 0).cpu().numpy()
    flo = flo[0].permute(1, 2, 0).cpu().numpy()

    # map flow to rgb image
    flo = flow_viz.flow_to_image(flo)
    img_flo = np.concatenate([img, flo], axis=0)

    # import matplotlib.pyplot as plt
    # plt.imshow(img_flo / 255.0)
    # plt.show()

    cv2.imshow('image', img_flo[:, :, [2, 1, 0]] / 255.0)
    cv2.waitKey()
def demo(args):
    model = torch.nn.DataParallel(RAFT(args))
    model.load_state_dict(torch.load(args.model))

    model = model.module
    model.to(DEVICE)
    model.eval()
    model.cpu()  # moved to CPU for the export below

    with torch.no_grad():
        import onnx
        from onnxsim import simplify

        MODEL = 'raft_things'
        RESOLUTION = [
            # [240, 320],
            # [360, 480],
            [480, 640],
            # [720, 1280],
        ]
        ITERS = 10
        for H, W in RESOLUTION:
            onnx_file = f"{MODEL}_iter{ITERS}_{H}x{W}.onnx"
            x1 = torch.randn(1, 3, H, W).cpu()  # .cuda()
            x2 = torch.randn(1, 3, H, W).cpu()  # .cuda()
            torch.onnx.export(
                model,
                args=(x1, x2),
                f=onnx_file,
                opset_version=11,
                # dynamic_axes={
                #     'input.1': {2: 'height', 3: 'width'},
                #     'gray': {2: 'height', 3: 'width'},
                #     '242': {2: 'height', 3: 'width'},
                #     '252': {2: 'height', 3: 'width'},
                # }
            )
            # NOTE: `model` is overwritten with the ONNX graph here, so only one
            # resolution can be exported per run as written.
            model = onnx.load(onnx_file)
            model_simp, check = simplify(
                model,
                # input_shapes={
                #     "input.1": [1, 3, H, W],
                #     "gray": [1, 1, H, W],
                # }
            )
            onnx.save(model_simp, onnx_file)
        import sys
        sys.exit(0)

        # NOTE: everything below is unreachable after sys.exit(0), and the call
        # with iters/test_mode kwargs no longer matches the modified forward().
        images = glob.glob(os.path.join(args.path, '*.png')) + glob.glob(os.path.join(args.path, '*.jpg'))
        images = sorted(images)
        for imfile1, imfile2 in zip(images[:-1], images[1:]):
            image1 = load_image(imfile1)
            image2 = load_image(imfile2)

            padder = InputPadder(image1.shape)
            image1, image2 = padder.pad(image1, image2)

            flow_low, flow_up = model(image1, image2, iters=20, test_mode=True)
            viz(image1, flow_up)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', help="restore checkpoint")
    parser.add_argument('--path', help="dataset for evaluation")
    parser.add_argument('--small', action='store_true', help='use small model')
    parser.add_argument('--mixed_precision', action='store_true', help='use mixed precision')
    parser.add_argument('--alternate_corr', action='store_true', help='use efficient correlation implementation')
    args = parser.parse_args()

    demo(args)
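Once the export finishes, the graph can be smoke-tested with onnxruntime before worrying about size. A minimal sketch, assuming onnxruntime is installed; no input_names were passed to torch.onnx.export, so the names are read from the session rather than hardcoded:

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("raft_things_iter10_480x640.onnx",
                            providers=["CPUExecutionProvider"])
names = [i.name for i in sess.get_inputs()]
x1 = np.random.randn(1, 3, 480, 640).astype(np.float32)
x2 = np.random.randn(1, 3, 480, 640).astype(np.float32)
flow_low, flow_up = sess.run(None, {names[0]: x1, names[1]: x2})
print(flow_low.shape, flow_up.shape)  # expected (1, 2, 60, 80) and (1, 2, 480, 640)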
Thank you for your support
Issue Type
Performance, Support
OS
Ubuntu
OS architecture
x86_64
Programming Language
Python
Framework
PyTorch, ONNX
Model name and Weights/Checkpoints URL
https://github.com/PINTO0309/PINTO_model_zoo/tree/main/252_RAFT
Description
Hi @PINTO0309, regarding the RAFT ONNX model conversion: I converted the model with PyTorch 1.12.0 and opset version 16, but the model size I get is 21.1 MB, while the model uploaded here is 97 MB for the same resolution (640x480) and the same number of iterations. Any suggestion on what might be missing? The code snippets used and the environment are attached above.
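For reference, the RAFT paper reports about 5.3M parameters for the full model, and 5.3M fp32 values come to roughly 21 MB, so a 21.1 MB file is consistent with a single copy of the weights. One hedged way to see where the extra bytes in the larger file live (an assumption to verify, not a diagnosis) is to compare initializer sizes and dtypes between the two exports; the zoo file name below is a placeholder for the downloaded model:

import onnx
from onnx import numpy_helper

def initializer_stats(path):
    # sum the raw bytes of every weight tensor baked into the graph, per dtype
    model = onnx.load(path)
    total, by_dtype = 0, {}
    for init in model.graph.initializer:
        arr = numpy_helper.to_array(init)
        total += arr.nbytes
        by_dtype[arr.dtype.name] = by_dtype.get(arr.dtype.name, 0) + arr.nbytes
    return total, by_dtype

# "zoo_raft_things_480x640.onnx" is a placeholder path for the zoo model
for path in ["raft_things_iter10_480x640.onnx", "zoo_raft_things_480x640.onnx"]:
    total, by_dtype = initializer_stats(path)
    print(path, f"{total / 1e6:.1f} MB of initializers", by_dtype)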
Relevant Log Output