schyun9212 / maskrcnn-benchmark

Converting maskrcnn-benchmark model to TorchScript or ONNX

Different result on exported RPNPostProcess module #8

Closed: schyun9212 closed this issue 4 years ago

schyun9212 commented 4 years ago

🐛 Bug

I tested two approaches: computing the inputs of rpn_post_processor inside the Module, and passing precomputed inputs in as constants. The results are the same: bbox is all zeros and objectness is incorrect. It seems that values are lost during inference.
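For reference, a minimal toy of the two tracing situations (hypothetical module names, unrelated to the repro below): a tensor computed inside forward is traced as data flow from the input, while a tensor captured from the enclosing scope is recorded as a constant.

import torch

head = torch.nn.Linear(8, 4)
feature = torch.rand(1, 8)
precomputed = head(feature)  # computed once, outside any module

class ComputedInside(torch.nn.Module):
    def forward(self, feature):
        # computed inside forward: traced as data flow from the input
        return head(feature).sum()

class GivenAsConstant(torch.nn.Module):
    def forward(self, feature):
        # captured from the enclosing scope: the tracer records the
        # tensor's current value as a constant in the graph
        return precomputed.sum()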

To Reproduce

import numpy as np
import torch
import io

from maskrcnn_benchmark.structures.image_list import ImageList
from maskrcnn_benchmark.structures.bounding_box import BoxList
from maskrcnn_benchmark.config import cfg

from demo.predictor import COCODemo
from demo.utils import load_image
from demo.transform import transform_image
from demo.onnx.utils import infer_shapes

import unittest

# NOTE: this unittest does not work with onnxruntime 1.1.1
import onnxruntime

config_file = "./configs/caffe2/e2e_mask_rcnn_R_50_FPN_1x_caffe2.yaml"
cfg.merge_from_file(config_file)
cfg.merge_from_list(["MODEL.DEVICE", "cpu"])
cfg.freeze()

coco_demo = COCODemo(
    cfg,
    confidence_threshold=0.7,
    min_image_size=800,
)

for param in coco_demo.model.parameters():
    param.requires_grad = False

original_image = load_image("./demo/sample.jpg")
sample_image, t_width, t_height = transform_image(cfg, original_image)

ONNX_OPSET_VERSION = 10

VALIDATION_TYPE = "FILE"  # one of "FILE", "IO"

class ONNXExportTester(unittest.TestCase):
    def run_model(self, model, inputs):
        with torch.no_grad():
            if isinstance(inputs, torch.Tensor):
                inputs = (inputs,)
            outputs = model(*inputs)
            if isinstance(outputs, torch.Tensor):
                outputs = (outputs,)

        return (inputs, outputs)

    def ort_validate(self, onnx_io, inputs, outputs):
        inputs, _ = torch.jit._flatten(inputs)
        outputs, _ = torch.jit._flatten(outputs)

        def to_numpy(tensor):
            if tensor.requires_grad:
                return tensor.detach().cpu().numpy()
            else:
                return tensor.cpu().numpy()

        inputs = list(map(to_numpy, inputs))
        outputs = list(map(to_numpy, outputs))

        if isinstance(onnx_io, str):
            ort_session = onnxruntime.InferenceSession(onnx_io)
        else:
            ort_session = onnxruntime.InferenceSession(onnx_io.getvalue())

        # compute onnxruntime output prediction
        ort_inputs = dict((ort_session.get_inputs()[i].name, inpt) for i, inpt in enumerate(inputs))
        ort_outs = ort_session.run(None, ort_inputs)

        for i in range(len(outputs)):
            torch.testing.assert_allclose(outputs[i].astype(np.float32), ort_outs[i].astype(np.float32), rtol=1e-02, atol=1e-04)

    def test_anchor_generator(self):
        from maskrcnn_benchmark.modeling.rpn.inference import make_rpn_postprocessor
        from maskrcnn_benchmark.modeling.rpn.anchor_generator import make_anchor_generator
        from maskrcnn_benchmark.modeling.box_coder import BoxCoder

        image_list = ImageList(sample_image.unsqueeze(0), [(sample_image.size(-2), sample_image.size(-1))])
        sample_features = coco_demo.model.backbone(image_list.tensors)

        anchor_generator = make_anchor_generator(cfg)

        objectness, rpn_box_regression = coco_demo.model.rpn.head(sample_features)
        anchors = anchor_generator(image_list, sample_features)

        class RPNPostProcessor(torch.nn.Module):
            def __init__(self):
                super(RPNPostProcessor, self).__init__()
                rpn_box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))

                # self.anchor_generator = make_anchor_generator(cfg)
                self.box_selector = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=False)

            def forward(self, image, features):
                """
                Arguments:
                    image (Tensor): images for which we want to compute the predictions
                    features (list[Tensor]): features computed from the images that are
                        used for computing the predictions. Each tensor in the list
                        correspond to different feature levels
                """
                # image_list = ImageList(image.unsqueeze(0), [(image.size(-2), image.size(-1))])

                # objectness, rpn_box_regression = coco_demo.model.rpn.head(features)
                # anchors = self.anchor_generator(image_list, features)
                # NOTE: anchors, objectness and rpn_box_regression are captured
                # from the enclosing scope, so the tracer records them as constants
                boxes = self.box_selector(anchors, objectness, rpn_box_regression)[0]

                return (boxes.bbox, boxes.get_field("objectness"))

        rpn_post_processor = RPNPostProcessor()
        rpn_post_processor.eval()

        inputs, outputs = self.run_model(rpn_post_processor, (sample_image, sample_features))

        if VALIDATION_TYPE == "IO":
            onnx_io = io.BytesIO()
        else:
            onnx_io = "./demo/onnx_test_models/rpn_post_processor.onnx"

        torch.onnx.export(rpn_post_processor, inputs, onnx_io,
                            verbose=False,
                            do_constant_folding=False,
                            input_names=["image", "feature_0", "feature_1", "feature_2", "feature_3", "feature_4"],
                            opset_version=ONNX_OPSET_VERSION)

        self.ort_validate(onnx_io, inputs, outputs)

if __name__ == '__main__':
    unittest.main()

Actual behavior

/home/jade/Workspace/maskrcnn/maskrcnn-benchmark-1-3-1/maskrcnn_benchmark/modeling/rpn/inference.py:94: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  pre_nms_top_n = min(self.pre_nms_top_n, num_anchors)
/home/jade/Workspace/maskrcnn/maskrcnn-benchmark-1-3-1/maskrcnn_benchmark/modeling/box_coder.py:87: TracerWarning: There are 2 live references to the data region being modified when tracing in-place operator copy_ (possibly due to an assignment). This might cause the trace to be incorrect, because all other views that also reference this data will not reflect this change in the trace! On the other hand, if all other views use the same memory chunk, but are disjoint (e.g. are outputs of torch.split), this might still be safe.
  pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w
/home/jade/Workspace/maskrcnn/maskrcnn-benchmark-1-3-1/maskrcnn_benchmark/modeling/box_coder.py:89: TracerWarning: There are 2 live references to the data region being modified when tracing in-place operator copy_ (possibly due to an assignment). This might cause the trace to be incorrect, because all other views that also reference this data will not reflect this change in the trace! On the other hand, if all other views use the same memory chunk, but are disjoint (e.g. are outputs of torch.split), this might still be safe.
  pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h
/home/jade/Workspace/maskrcnn/maskrcnn-benchmark-1-3-1/maskrcnn_benchmark/modeling/box_coder.py:91: TracerWarning: There are 2 live references to the data region being modified when tracing in-place operator copy_ (possibly due to an assignment). This might cause the trace to be incorrect, because all other views that also reference this data will not reflect this change in the trace! On the other hand, if all other views use the same memory chunk, but are disjoint (e.g. are outputs of torch.split), this might still be safe.
  pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1
/home/jade/Workspace/maskrcnn/maskrcnn-benchmark-1-3-1/maskrcnn_benchmark/modeling/box_coder.py:93: TracerWarning: There are 2 live references to the data region being modified when tracing in-place operator copy_ (possibly due to an assignment). This might cause the trace to be incorrect, because all other views that also reference this data will not reflect this change in the trace! On the other hand, if all other views use the same memory chunk, but are disjoint (e.g. are outputs of torch.split), this might still be safe.
  pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1
/home/jade/.pyenv/versions/maskrcnn-benchmark-1-3-1/lib/python3.7/site-packages/torch/tensor.py:426: RuntimeWarning: Iterating over a tensor might cause the trace to be incorrect. Passing a tensor of different shape won't change the number of iterations executed (and might lead to errors or silently give incorrect results).
  'incorrect results).', category=RuntimeWarning)
/home/jade/.pyenv/versions/maskrcnn-benchmark-1-3-1/lib/python3.7/site-packages/torch/tensor.py:427: TracerWarning: Converting a tensor to a Python index might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  return iter(imap(lambda i: self[i], range(self.size(0))))
/home/jade/Workspace/maskrcnn/maskrcnn-benchmark-1-3-1/maskrcnn_benchmark/structures/bounding_box.py:21: TracerWarning: torch.as_tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
  bbox = torch.as_tensor(bbox, dtype=torch.float32, device=device)
/home/jade/Workspace/maskrcnn/maskrcnn-benchmark-1-3-1/maskrcnn_benchmark/structures/bounding_box.py:26: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  if bbox.size(-1) != 4:
/home/jade/Workspace/maskrcnn/maskrcnn-benchmark-1-3-1/maskrcnn_benchmark/structures/bounding_box.py:216: TracerWarning: There are 4 live references to the data region being modified when tracing in-place operator clamp_. This might cause the trace to be incorrect, because all other views that also reference this data will not reflect this change in the trace! On the other hand, if all other views use the same memory chunk, but are disjoint (e.g. are outputs of torch.split), this might still be safe.
  self.bbox[:, 0].clamp_(min=0, max=self.size[0] - TO_REMOVE)
/home/jade/Workspace/maskrcnn/maskrcnn-benchmark-1-3-1/maskrcnn_benchmark/structures/bounding_box.py:217: TracerWarning: There are 4 live references to the data region being modified when tracing in-place operator clamp_. This might cause the trace to be incorrect, because all other views that also reference this data will not reflect this change in the trace! On the other hand, if all other views use the same memory chunk, but are disjoint (e.g. are outputs of torch.split), this might still be safe.
  self.bbox[:, 1].clamp_(min=0, max=self.size[1] - TO_REMOVE)
/home/jade/Workspace/maskrcnn/maskrcnn-benchmark-1-3-1/maskrcnn_benchmark/structures/bounding_box.py:218: TracerWarning: There are 4 live references to the data region being modified when tracing in-place operator clamp_. This might cause the trace to be incorrect, because all other views that also reference this data will not reflect this change in the trace! On the other hand, if all other views use the same memory chunk, but are disjoint (e.g. are outputs of torch.split), this might still be safe.
  self.bbox[:, 2].clamp_(min=0, max=self.size[0] - TO_REMOVE)
/home/jade/Workspace/maskrcnn/maskrcnn-benchmark-1-3-1/maskrcnn_benchmark/structures/bounding_box.py:219: TracerWarning: There are 4 live references to the data region being modified when tracing in-place operator clamp_. This might cause the trace to be incorrect, because all other views that also reference this data will not reflect this change in the trace! On the other hand, if all other views use the same memory chunk, but are disjoint (e.g. are outputs of torch.split), this might still be safe.
  self.bbox[:, 3].clamp_(min=0, max=self.size[1] - TO_REMOVE)
/home/jade/Workspace/maskrcnn/maskrcnn-benchmark-1-3-1/maskrcnn_benchmark/modeling/rpn/inference.py:95: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  objectness, topk_idx = objectness.topk(pre_nms_top_n, dim=1, sorted=True)
/home/jade/Workspace/maskrcnn/maskrcnn-benchmark-1-3-1/maskrcnn_benchmark/modeling/rpn/inference.py:176: TracerWarning: Converting a tensor to a Python index might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  post_nms_top_n = min(self.fpn_post_nms_top_n, len(objectness))
/home/jade/.pyenv/versions/maskrcnn-benchmark-1-3-1/lib/python3.7/site-packages/torch/onnx/symbolic_opset9.py:1881: UserWarning: Exporting aten::index operator of advanced indexing in opset 10 is achieved by combination of multiple ONNX operators, including Reshape, Transpose, Concat, and Gather. If indices include negative values, the exported graph will produce incorrect results.
  "If indices include negative values, the exported graph will produce incorrect results.")
F
======================================================================
FAIL: test_anchor_generator (__main__.ONNXExportTester)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/home/jade/Workspace/maskrcnn/maskrcnn-benchmark-1-3-1/demo/unittest/onnx/export/rpn_post_processor.py", line 129, in test_anchor_generator
    self.ort_validate(onnx_io, inputs, outputs)
  File "/home/jade/Workspace/maskrcnn/maskrcnn-benchmark-1-3-1/demo/unittest/onnx/export/rpn_post_processor.py", line 74, in ort_validate
    torch.testing.assert_allclose(outputs[i].astype(np.float32), ort_outs[i].astype(np.float32), rtol=1e-02, atol=1e-04)
  File "/home/jade/.pyenv/versions/maskrcnn-benchmark-1-3-1/lib/python3.7/site-packages/torch/testing/__init__.py", line 59, in assert_allclose
    count - 1, 100 * count / actual.numel()))
AssertionError: Not within tolerance rtol=0.01 atol=0.0001 at input[985, 2] (1087.0 vs. 0.0) and 3862 other locations (96.00%)
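The TracerWarnings above already hint at the root cause: wherever a tensor value is converted to a Python number, the tracer bakes the value from the sample input into the graph. A simplified sketch of the pattern flagged at inference.py:94-95 (not the actual maskrcnn-benchmark code):

import torch

def select_top(objectness, pre_nms_top_n=1000):
    # min() and topk() consume Python ints here, so k is recorded as a
    # constant during tracing; a different anchor count at inference
    # time will not change how many proposals the traced graph keeps
    k = min(pre_nms_top_n, objectness.size(1))
    return objectness.topk(k, dim=1, sorted=True)

traced = torch.jit.trace(select_top, torch.rand(1, 500))
# the traced graph always takes the top 500, even for wider inputs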

Environment

PyTorch version: 1.3.1
Is debug build: No
CUDA used to build PyTorch: 10.1.243

OS: Ubuntu 18.04.3 LTS
GCC version: (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0
CMake version: version 3.10.2

Python version: 3.7
Is CUDA available: Yes
CUDA runtime version: 10.1.243
GPU models and configuration: GPU 0: GeForce RTX 2080 Ti
Nvidia driver version: 440.48.02
cuDNN version: Probably one of the following:
/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudnn.so.7
/usr/local/cuda-10.1/targets/x86_64-linux/lib/libcudnn.so.7
/usr/local/cuda-10.2/targets/x86_64-linux/lib/libcudnn.so.7

Versions of relevant libraries:
[pip3] numpy==1.18.1
[pip3] onnx==1.6.0
[pip3] onnxruntime==1.1.0
[pip3] Pillow==6.2.2
[pip3] torch==1.3.1
[pip3] torchvision==0.4.2
[conda] Could not collect

schyun9212 commented 4 years ago

The zero-valued output problem is fixed in cf74a0db5483c514d526239df2e8880bcab0b1fa. It seems that overwriting attributes on self during tracing is not captured in the trace.
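The copy_ TracerWarnings in the log point at the same family of problems: in-place writes into views. A hedged sketch of one common workaround (not necessarily what the commit above does) is to build the result with pure ops instead of assigning into strided slices:

import torch

def decode_inplace(deltas):
    # in-place slice assignment; under tracing these writes may not be
    # reflected in other views of the same storage
    boxes = torch.zeros_like(deltas)
    boxes[:, 0::4] = deltas[:, 0::4] - 1.0
    boxes[:, 1::4] = deltas[:, 1::4] - 1.0
    boxes[:, 2::4] = deltas[:, 2::4] + 1.0
    boxes[:, 3::4] = deltas[:, 3::4] + 1.0
    return boxes

def decode_functional(deltas):
    # pure ops only; stack + reshape interleaves the columns back into
    # the original (N, 4K) layout
    x1 = deltas[:, 0::4] - 1.0
    y1 = deltas[:, 1::4] - 1.0
    x2 = deltas[:, 2::4] + 1.0
    y2 = deltas[:, 3::4] + 1.0
    return torch.stack([x1, y1, x2, y2], dim=2).reshape(deltas.shape)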

But about 90% of the output values still differ.

schyun9212 commented 4 years ago

The final result computed from the ONNX-exported RPN module's output did not differ significantly from the expected one. The remaining difference seems to be caused by the difference between ONNX's NMS and PyTorch's.
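A small generic illustration (not a diff of the two runtimes) of how sensitive the kept set is to the suppression rule: boxes whose IoU sits near the threshold can survive under one implementation and be dropped under another.

import torch
import torchvision

# box 1 overlaps box 0 with IoU 0.9 and is suppressed; implementations
# that evaluate the IoU test or break score ties slightly differently
# (e.g. ONNX NonMaxSuppression vs. torchvision) can keep different boxes
boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0],
                      [0.0, 0.0, 10.0, 9.0],
                      [20.0, 20.0, 30.0, 30.0]])
scores = torch.tensor([0.9, 0.8, 0.7])
keep = torchvision.ops.nms(boxes, scores, iou_threshold=0.7)
print(keep)  # tensor([0, 2])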

Result of ONNX (backbone + rpn) + PyTorch (roi align)

[image]

Expected result

[image]