Fatal error: ATen is not a registered function/op

schyun9212 commented 4 years ago

🐛 Bug

The exported roi_head contains an ATen operator, which is not supported in ONNX.

To Reproduce

import torch
import io
import unittest

from maskrcnn_benchmark.structures.image_list import ImageList

from demo.unittest.onnx.export import ONNXExportTester, ONNX_OPSET_VERSION, VALIDATION_TYPE, cfg, coco_demo, sample_features, sample_proposals, t_width, t_height

class FeatureExtractorTester(ONNXExportTester):
    """ONNX export test for the ROI box feature extractor.

    Wraps the feature extractor in a small ``torch.nn.Module`` that accepts
    plain tensors, exports it with ``torch.onnx.export`` and hands the result
    to ``ort_validate`` for checking.
    """

    def test_feature_extractor(self):
        """Export the ROI box feature extractor to ONNX and validate it."""
        from maskrcnn_benchmark.structures.bounding_box import BoxList
        from maskrcnn_benchmark.modeling.roi_heads.box_head.roi_box_feature_extractors import make_roi_box_feature_extractor

        class FeatureExtractor(torch.nn.Module):
            """Adapter that rebuilds a ``BoxList`` from raw tensors before
            calling the real feature extractor."""

            def __init__(self):
                super(FeatureExtractor, self).__init__()
                # NOTE(review): 256 is presumably the input channel count of
                # the feature maps -- confirm against the config in use.
                self.feature_extractor = make_roi_box_feature_extractor(cfg, 256)

            def forward(self, features, proposals):
                """Return the extracted features for one image.

                ``proposals`` is a ``(bbox, objectness)`` pair; it is wrapped
                in a single ``BoxList`` before being passed on.
                """
                bbox, objectness = proposals

                proposals = BoxList(bbox, (t_width, t_height), mode="xyxy")
                # Field key fixed: it was misspelled "objectenss".
                proposals.add_field("objectness", objectness)

                # The extractor takes a list of BoxLists; only one is built here.
                x = self.feature_extractor(features, [proposals])

                return x

        feature_extractor = FeatureExtractor()
        feature_extractor.eval()

        inputs, outputs = self.run_model(feature_extractor, (sample_features, sample_proposals))

        # Export either to an in-memory buffer or to a file on disk,
        # depending on the configured validation type.
        if VALIDATION_TYPE == "IO":
            onnx_io = io.BytesIO()
        else:
            onnx_io = "./demo/onnx_test_models/feature_extractor.onnx"

        torch.onnx.export(feature_extractor, inputs, onnx_io,
                            verbose=False,
                            do_constant_folding=False,
                            input_names=["feature_0", "feature_1", "feature_2", "feature_3", "feature_4", "bbox", "objectness"],
                            opset_version=ONNX_OPSET_VERSION)

        # Check the exported model against the inputs/outputs recorded above.
        self.ort_validate(onnx_io, inputs, outputs)

# Run the unittest test runner when this file is executed directly.
if __name__ == '__main__':
    unittest.main()

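Expected behavior

The exported model should load and validate in ONNX Runtime. Instead, the test fails with the following error: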

ERROR: test_feature_extractor (__main__.FeatureExtractorTester)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/home/jade/Workspace/maskrcnn/maskrcnn-benchmark-1-3-1/demo/unittest/onnx/export/feature_extractor.py", line 45, in test_feature_extractor
    self.ort_validate(onnx_io, inputs, outputs)
  File "/home/jade/Workspace/maskrcnn/maskrcnn-benchmark-1-3-1/demo/unittest/onnx/export/__init__.py", line 73, in ort_validate
    ort_session = onnxruntime.InferenceSession(onnx_io)
  File "/home/jade/.pyenv/versions/maskrcnn-benchmark-1-3-1/lib/python3.7/site-packages/onnxruntime/capi/session.py", line 25, in __init__
    self._load_model(providers)
  File "/home/jade/.pyenv/versions/maskrcnn-benchmark-1-3-1/lib/python3.7/site-packages/onnxruntime/capi/session.py", line 43, in _load_model
    self._sess.load_model(providers)
onnxruntime.capi.onnxruntime_pybind11_state.Fail: [ONNXRuntimeError] : 1 : FAIL : Fatal error: ATen is not a registered function/op

----------------------------------------------------------------------
Ran 1 test in 1.102s

FAILED (errors=1)

Environment

PyTorch version: 1.3.1 Is debug build: No CUDA used to build PyTorch: 10.1.243

OS: Ubuntu 18.04.3 LTS GCC version: (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0 CMake version: version 3.10.2

Python version: 3.7 Is CUDA available: Yes CUDA runtime version: 10.1.243 GPU models and configuration: GPU 0: GeForce RTX 2080 Ti Nvidia driver version: 440.48.02 cuDNN version: Probably one of the following: /usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudnn.so.7 /usr/local/cuda-10.1/targets/x86_64-linux/lib/libcudnn.so.7 /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcudnn.so.7

Versions of relevant libraries: [pip3] numpy==1.18.1 [pip3] onnx==1.6.0 [pip3] onnxruntime==1.1.0 [pip3] Pillow==6.2.2 [pip3] torch==1.3.1 [pip3] torchvision==0.4.2 [conda] Could not collect

schyun9212 commented 4 years ago

The ATen operator is included in ROIAlign, a custom operator of maskrcnn-benchmark. I registered the custom operator for export, but there was an unexpected additional ATen operator after ROIAlign in the graph.

Screenshot from 2020-01-31 17-41-33

  %109 = RoiAlign[output_height = 7, output_width = 7, sampling_ratio = 2, spatial_scale = 0.0625](%feature_2, %108, %106)
  %110 = Cast[to = 1](%109)
  %111 = Cast[to = 7](%101)
  %112 = Constant[value = <Scalar Tensor []>]()
  %113 = ATen[operator = 'index_put'](%94, %111, %110, %112)

schyun9212 commented 4 years ago

Reference on index_put: https://discuss.pytorch.org/t/torchscript-indexing-question-filling-nans/53100

Suspected cause

# pooler.py
# Positions of the entries in `levels` that equal `level`, as a 1-D index tensor.
idx_in_level = torch.nonzero(levels.type(torch.int32) == level).squeeze(1)
# Select the ROIs at those positions.
rois_per_level = rois[idx_in_level]
# Indexed assignment of the pooled features into `result`; this is the line
# suspected of producing the ATen `index_put` node in the exported graph.
result[idx_in_level] = pooler(per_level_feature, rois_per_level).to(dtype) # <---

schyun9212 commented 4 years ago

This issue was solved in ffe5ded by replacing the ATen-related operator with a supported operator.

schyun9212 / maskrcnn-benchmark