tensorflow / models

Models and examples built with TensorFlow

Object Detection: ArithmeticOptimizer and Speed Issues with Instance Segmentation #6470

Closed gavincmartin closed 4 years ago

gavincmartin commented 5 years ago

System information

Describe the problem

I am encountering two odd (and possibly related) behaviors when attempting to run the "mask_rcnn_inception_v2_coco_2018_01_28" model.

  1. When I run a session with a different (non-instance-segmentation) model like "ssd_inception_v2_coco_2018_01_28", the inference time for the first image is very slow (~9 sec), while the inference time for each subsequent image is much faster (~0.14 sec). This phenomenon is reproducible across models. When I run a session with an instance segmentation model, however, the inference time is consistently slow (~10 sec) and never improves, even though I am not restarting the session. (A small warm-up timing sketch follows this list.)
  2. I get an ArithmeticOptimizer error when running an instance segmentation model, but the model still runs afterwards. This is the same error seen in #6215.
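
The warm-up timing sketch referenced above: it simply reuses the run_inference_for_single_image helper, the image_np array, and the open sess from the full script below, and is only meant to illustrate the first-run vs. steady-state split, not to change the repro.

# Illustrative snippet; all names are taken from the script further down.
import time

_ = run_inference_for_single_image(image_np, sess)      # warm-up run, not timed
for i in range(3):
    start = time.time()
    _ = run_inference_for_single_image(image_np, sess)  # steady-state runs
    print("Steady-state inference {}: {:.4f} sec".format(i, time.time() - start))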

Source code / logs

I have modified the object_detection_tutorial.ipynb code slightly, converting it into a script that runs multiple models sequentially for comparison. I also added a few extra images to the test_images directory within object_detection. The script compares the performance of SSD_Inception_V2 vs. Mask_RCNN_Inception_V2 and produces the error described above.

import numpy as np
import os
import six.moves.urllib as urllib
import sys
import tarfile
import tensorflow as tf
import zipfile

from distutils.version import StrictVersion
from collections import defaultdict
from io import StringIO

from matplotlib import pyplot as plt
from PIL import Image

import time

# This is needed since the notebook is stored in the object_detection folder.
sys.path.append("..")
from object_detection.utils import ops as utils_ops

if StrictVersion(tf.__version__) < StrictVersion('1.12.0'):
    raise ImportError(
        'Please upgrade your TensorFlow installation to v1.12.*.')

from utils import label_map_util

from utils import visualization_utils as vis_util

def load_image_into_numpy_array(image):
    (im_width, im_height) = image.size
    return np.array(image.getdata()).reshape((im_height, im_width, 3)).astype(
        np.uint8)

def run_inference_for_single_image(image, session):
    # Get handles to input and output tensors
    ops = tf.get_default_graph().get_operations()
    all_tensor_names = {output.name for op in ops for output in op.outputs}
    tensor_dict = {}
    for key in [
            'num_detections', 'detection_boxes', 'detection_scores',
            'detection_classes', 'detection_masks'
    ]:
        tensor_name = key + ':0'
        if tensor_name in all_tensor_names:
            tensor_dict[key] = tf.get_default_graph().get_tensor_by_name(
                tensor_name)
    if 'detection_masks' in tensor_dict:
        # The following processing is only for single image
        detection_boxes = tf.squeeze(tensor_dict['detection_boxes'], [0])
        detection_masks = tf.squeeze(tensor_dict['detection_masks'], [0])
        # Reframe is required to translate mask from box coordinates to image coordinates and fit the image size.
        real_num_detection = tf.cast(tensor_dict['num_detections'][0],
                                     tf.int32)
        detection_boxes = tf.slice(detection_boxes, [0, 0],
                                   [real_num_detection, -1])
        detection_masks = tf.slice(detection_masks, [0, 0, 0],
                                   [real_num_detection, -1, -1])
        detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(
            detection_masks, detection_boxes, image.shape[0], image.shape[1])
        detection_masks_reframed = tf.cast(
            tf.greater(detection_masks_reframed, 0.5), tf.uint8)
        # Follow the convention by adding back the batch dimension
        tensor_dict['detection_masks'] = tf.expand_dims(
            detection_masks_reframed, 0)
    image_tensor = tf.get_default_graph().get_tensor_by_name('image_tensor:0')

    # Run inference
    output_dict = session.run(
        tensor_dict, feed_dict={image_tensor: np.expand_dims(image, 0)})

    # all outputs are float32 numpy arrays, so convert types as appropriate
    output_dict['num_detections'] = int(output_dict['num_detections'][0])
    output_dict['detection_classes'] = output_dict['detection_classes'][
        0].astype(np.uint8)
    output_dict['detection_boxes'] = output_dict['detection_boxes'][0]
    output_dict['detection_scores'] = output_dict['detection_scores'][0]
    if 'detection_masks' in output_dict:
        output_dict['detection_masks'] = output_dict['detection_masks'][0]
    return output_dict

MODEL_NAMES = [
    'ssd_inception_v2_coco_2018_01_28',
    'mask_rcnn_inception_v2_coco_2018_01_28'
]

for MODEL_NAME in MODEL_NAMES:
    print("Model Name: {}".format(MODEL_NAME))

    MODEL_FILE = MODEL_NAME + '.tar.gz'
    DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/'

    # Path to frozen detection graph. This is the actual model that is used for the object detection.
    PATH_TO_FROZEN_GRAPH = MODEL_NAME + '/frozen_inference_graph.pb'

    # List of the strings that is used to add correct label for each box.
    PATH_TO_LABELS = os.path.join('data', 'mscoco_label_map.pbtxt')

    if not os.path.exists(PATH_TO_FROZEN_GRAPH):
        opener = urllib.request.URLopener()
        opener.retrieve(DOWNLOAD_BASE + MODEL_FILE, MODEL_FILE)
        tar_file = tarfile.open(MODEL_FILE)
        for file in tar_file.getmembers():
            file_name = os.path.basename(file.name)
            if 'frozen_inference_graph.pb' in file_name:
                tar_file.extract(file, os.getcwd())

    detection_graph = tf.Graph()
    with detection_graph.as_default():
        od_graph_def = tf.GraphDef()
        with tf.gfile.GFile(PATH_TO_FROZEN_GRAPH, 'rb') as fid:
            serialized_graph = fid.read()
            od_graph_def.ParseFromString(serialized_graph)
            tf.import_graph_def(od_graph_def, name='')

    category_index = label_map_util.create_category_index_from_labelmap(
        PATH_TO_LABELS, use_display_name=True)

    # Gather every .jpg in the test_images directory (the stock tutorial uses only 2 images).
    # If you want to test the code with your own images, just add them to test_images.
    PATH_TO_TEST_IMAGES_DIR = 'test_images'
    TEST_IMAGE_PATHS = []
    for file in os.scandir(PATH_TO_TEST_IMAGES_DIR):
        if file.name.endswith(".jpg"):
            TEST_IMAGE_PATHS.append(
                os.path.join(PATH_TO_TEST_IMAGES_DIR, file.name))

    with detection_graph.as_default():
        with tf.Session() as sess:
            for i, image_path in enumerate(TEST_IMAGE_PATHS):
                image = Image.open(image_path)
                image = image.resize((640, 480))
                # the array based representation of the image will be used later in order to prepare the
                # result image with boxes and labels on it.
                image_np = load_image_into_numpy_array(image)

                # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
                image_np_expanded = np.expand_dims(image_np, axis=0)
                # Actual detection.
                start = time.time()
                output_dict = run_inference_for_single_image(image_np, sess)
                end = time.time()
                print("Inference time {}: {:.4f}".format(i, end - start))
                # Visualization of the results of a detection.
                vis_util.visualize_boxes_and_labels_on_image_array(
                    image_np,
                    output_dict['detection_boxes'],
                    output_dict['detection_classes'],
                    output_dict['detection_scores'],
                    category_index,
                    instance_masks=output_dict.get('detection_masks'),
                    use_normalized_coordinates=True,
                    line_thickness=8)

This produces the following console output:

Model Name: ssd_inception_v2_coco_2018_01_28
2019-03-28 14:46:24.770281: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
Inference time 0: 8.1120
Inference time 1: 0.1437
Inference time 2: 0.1325
Inference time 3: 0.1400
Inference time 4: 0.1409
Model Name: mask_rcnn_inception_v2_coco_2018_01_28
WARNING:tensorflow:From /Users/gavinmartin/tensorflow_repo/models/research/object_detection/tf-od-venv/lib/python3.6/site-packages/tensorflow/python/ops/control_flow_ops.py:423: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
2019-03-28 14:46:44.738988: W ./tensorflow/core/grappler/optimizers/graph_optimizer_stage.h:241] Failed to run optimizer ArithmeticOptimizer, stage RemoveStackStridedSliceSameAxis node Preprocessor/map/while/ResizeToRange/strided_slice_3. Error: Pack node (Preprocessor/map/while/ResizeToRange/stack_2) axis attribute is out of bounds: 0
2019-03-28 14:46:47.741016: W ./tensorflow/core/grappler/optimizers/graph_optimizer_stage.h:241] Failed to run optimizer ArithmeticOptimizer, stage RemoveStackStridedSliceSameAxis node Preprocessor/map/while/ResizeToRange/strided_slice_3. Error: Pack node (Preprocessor/map/while/ResizeToRange/stack_2) axis attribute is out of bounds: 0
Inference time 0: 10.0803
2019-03-28 14:46:55.723338: W ./tensorflow/core/grappler/optimizers/graph_optimizer_stage.h:241] Failed to run optimizer ArithmeticOptimizer, stage RemoveStackStridedSliceSameAxis node Preprocessor/map/while/ResizeToRange/strided_slice_3. Error: Pack node (Preprocessor/map/while/ResizeToRange/stack_2) axis attribute is out of bounds: 0
2019-03-28 14:46:59.112565: W ./tensorflow/core/grappler/optimizers/graph_optimizer_stage.h:241] Failed to run optimizer ArithmeticOptimizer, stage RemoveStackStridedSliceSameAxis node Preprocessor/map/while/ResizeToRange/strided_slice_3. Error: Pack node (Preprocessor/map/while/ResizeToRange/stack_2) axis attribute is out of bounds: 0
Inference time 1: 10.7539
2019-03-28 14:47:08.015176: W ./tensorflow/core/grappler/optimizers/graph_optimizer_stage.h:241] Failed to run optimizer ArithmeticOptimizer, stage RemoveStackStridedSliceSameAxis node Preprocessor/map/while/ResizeToRange/strided_slice_3. Error: Pack node (Preprocessor/map/while/ResizeToRange/stack_2) axis attribute is out of bounds: 0
2019-03-28 14:47:11.565041: W ./tensorflow/core/grappler/optimizers/graph_optimizer_stage.h:241] Failed to run optimizer ArithmeticOptimizer, stage RemoveStackStridedSliceSameAxis node Preprocessor/map/while/ResizeToRange/strided_slice_3. Error: Pack node (Preprocessor/map/while/ResizeToRange/stack_2) axis attribute is out of bounds: 0
Inference time 2: 11.2618
2019-03-28 14:47:19.476457: W ./tensorflow/core/grappler/optimizers/graph_optimizer_stage.h:241] Failed to run optimizer ArithmeticOptimizer, stage RemoveStackStridedSliceSameAxis node Preprocessor/map/while/ResizeToRange/strided_slice_3. Error: Pack node (Preprocessor/map/while/ResizeToRange/stack_2) axis attribute is out of bounds: 0
2019-03-28 14:47:22.729805: W ./tensorflow/core/grappler/optimizers/graph_optimizer_stage.h:241] Failed to run optimizer ArithmeticOptimizer, stage RemoveStackStridedSliceSameAxis node Preprocessor/map/while/ResizeToRange/strided_slice_3. Error: Pack node (Preprocessor/map/while/ResizeToRange/stack_2) axis attribute is out of bounds: 0
Inference time 3: 10.4220
2019-03-28 14:47:31.414572: W ./tensorflow/core/grappler/optimizers/graph_optimizer_stage.h:241] Failed to run optimizer ArithmeticOptimizer, stage RemoveStackStridedSliceSameAxis node Preprocessor/map/while/ResizeToRange/strided_slice_3. Error: Pack node (Preprocessor/map/while/ResizeToRange/stack_2) axis attribute is out of bounds: 0
2019-03-28 14:47:35.067059: W ./tensorflow/core/grappler/optimizers/graph_optimizer_stage.h:241] Failed to run optimizer ArithmeticOptimizer, stage RemoveStackStridedSliceSameAxis node Preprocessor/map/while/ResizeToRange/strided_slice_3. Error: Pack node (Preprocessor/map/while/ResizeToRange/stack_2) axis attribute is out of bounds: 0
Inference time 4: 11.3128

I am running the script from within a venv that has the following dependencies installed:

Package              Version
-------------------- --------
absl-py              0.7.1
appnope              0.1.0
astor                0.7.1
attrs                19.1.0
backcall             0.1.0
bleach               3.1.0
contextlib2          0.5.5
cycler               0.10.0
Cython               0.29.6
decorator            4.4.0
defusedxml           0.5.0
entrypoints          0.3
gast                 0.2.2
grpcio               1.19.0
h5py                 2.9.0
ipykernel            5.1.0
ipython              7.4.0
ipython-genutils     0.2.0
ipywidgets           7.4.2
jedi                 0.13.3
Jinja2               2.10
jsonschema           3.0.1
jupyter              1.0.0
jupyter-client       5.2.4
jupyter-console      6.0.0
jupyter-core         4.4.0
Keras-Applications   1.0.7
Keras-Preprocessing  1.0.9
kiwisolver           1.0.1
lxml                 4.3.3
Markdown             3.1
MarkupSafe           1.1.1
matplotlib           3.0.3
mistune              0.8.4
mock                 2.0.0
nbconvert            5.4.1
nbformat             4.4.0
notebook             5.7.7
numpy                1.16.2
object-detection     0.1
opencv-python        4.0.0.21
pandocfilters        1.4.2
parso                0.3.4
pbr                  5.1.3
pexpect              4.6.0
pickleshare          0.7.5
Pillow               5.4.1
pip                  10.0.1
prometheus-client    0.6.0
prompt-toolkit       2.0.9
protobuf             3.7.1
ptyprocess           0.6.0
Pygments             2.3.1
pyparsing            2.3.1
pyrsistent           0.14.11
python-dateutil      2.8.0
pyzmq                18.0.1
qtconsole            4.4.3
Send2Trash           1.5.0
setuptools           39.0.1
six                  1.12.0
tensorboard          1.13.1
tensorflow           1.13.1
tensorflow-estimator 1.13.0
termcolor            1.1.0
terminado            0.8.1
testpath             0.4.2
tornado              6.0.2
traitlets            4.3.2
wcwidth              0.1.7
webencodings         0.5.1
Werkzeug             0.15.1
wheel                0.33.1
widgetsnbextension   3.4.2

Interestingly enough, if I downgrade to TensorFlow 1.12 and run this script again, the speeds are still problematic, but the ArithmeticOptimizer error goes away:

Model Name: ssd_inception_v2_coco_2018_01_28
2019-03-28 14:51:04.669860: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
Inference time 0: 9.2073
Inference time 1: 0.1434
Inference time 2: 0.1429
Inference time 3: 0.1598
Inference time 4: 0.2158
Model Name: mask_rcnn_inception_v2_coco_2018_01_28
Inference time 0: 12.2215
Inference time 1: 11.5347
Inference time 2: 11.7466
Inference time 3: 14.0183
Inference time 4: 15.0711
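
If the ArithmeticOptimizer warning itself needs to be silenced on 1.13, one possible workaround (a sketch only, not something I have verified against this repo) is to disable just the arithmetic-optimization rewriter pass through the session config:

# Hedged workaround sketch: turn off only Grappler's arithmetic optimization pass.
from tensorflow.core.protobuf import rewriter_config_pb2

rewrite_options = rewriter_config_pb2.RewriterConfig(
    arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF)
session_config = tf.ConfigProto(
    graph_options=tf.GraphOptions(rewrite_options=rewrite_options))

with detection_graph.as_default():
    with tf.Session(config=session_config) as sess:
        pass  # same inference loop as in the script above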
gavincmartin commented 5 years ago

The huge speed disparity between the mask and box architectures shown here appears similar to the one seen in #6254.

jeansitr commented 5 years ago

The problem here is that you look up your tensors and rebuild the mask-resizing ops for each image, when you really only need to do that once; rebuilding the mask ops in particular takes a lot of processing power, on my end at least. And since your images are resized to always be the same size, you can do this instead, which will definitely reduce the time it takes to run the algorithm:

def run_inference_for_single_image(image, session, tensor_dict, image_tensor):
    # Run inference
    output_dict = session.run(
        tensor_dict, feed_dict={image_tensor: np.expand_dims(image, 0)})

    # all outputs are float32 numpy arrays, so convert types as appropriate
    output_dict['num_detections'] = int(output_dict['num_detections'][0])
    output_dict['detection_classes'] = output_dict['detection_classes'][
        0].astype(np.uint8)
    output_dict['detection_boxes'] = output_dict['detection_boxes'][0]
    output_dict['detection_scores'] = output_dict['detection_scores'][0]
    if 'detection_masks' in output_dict:
        output_dict['detection_masks'] = output_dict['detection_masks'][0]
    return output_dict

MODEL_NAMES = [
    'ssd_inception_v2_coco_2018_01_28',
    'mask_rcnn_inception_v2_coco_2018_01_28'
]

for MODEL_NAME in MODEL_NAMES:
    print("Model Name: {}".format(MODEL_NAME))

    MODEL_FILE = MODEL_NAME + '.tar.gz'
    DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/'

    # Path to frozen detection graph. This is the actual model that is used for the object detection.
    PATH_TO_FROZEN_GRAPH = MODEL_NAME + '/frozen_inference_graph.pb'

    # List of the strings that is used to add correct label for each box.
    PATH_TO_LABELS = os.path.join('data', 'mscoco_label_map.pbtxt')

    if not os.path.exists(PATH_TO_FROZEN_GRAPH):
        opener = urllib.request.URLopener()
        opener.retrieve(DOWNLOAD_BASE + MODEL_FILE, MODEL_FILE)
        tar_file = tarfile.open(MODEL_FILE)
        for file in tar_file.getmembers():
            file_name = os.path.basename(file.name)
            if 'frozen_inference_graph.pb' in file_name:
                tar_file.extract(file, os.getcwd())

    detection_graph = tf.Graph()
    with detection_graph.as_default():
        od_graph_def = tf.GraphDef()
        with tf.gfile.GFile(PATH_TO_FROZEN_GRAPH, 'rb') as fid:
            serialized_graph = fid.read()
            od_graph_def.ParseFromString(serialized_graph)
            tf.import_graph_def(od_graph_def, name='')

    category_index = label_map_util.create_category_index_from_labelmap(
        PATH_TO_LABELS, use_display_name=True)

    # Gather every .jpg in the test_images directory (the stock tutorial uses only 2 images).
    # If you want to test the code with your own images, just add them to test_images.
    PATH_TO_TEST_IMAGES_DIR = 'test_images'
    TEST_IMAGE_PATHS = []
    for file in os.scandir(PATH_TO_TEST_IMAGES_DIR):
        if file.name.endswith(".jpg"):
            TEST_IMAGE_PATHS.append(
                os.path.join(PATH_TO_TEST_IMAGES_DIR, file.name))

    with detection_graph.as_default():
        with tf.Session() as sess:
            tensor_dict = {}
            image_tensor = None
            for i, image_path in enumerate(TEST_IMAGE_PATHS):
                image = Image.open(image_path)
                image = image.resize((640, 480))
                # the array based representation of the image will be used later in order to prepare the
                # result image with boxes and labels on it.
                image_np = load_image_into_numpy_array(image)

                # This builds the tensors and the mask-resizing ops only once, which saves a lot of processing power.
                if i == 0:
                    # Get handles to input and output tensors
                    ops = tf.get_default_graph().get_operations()
                    all_tensor_names = {output.name for op in ops for output in op.outputs}
                    for key in [
                            'num_detections', 'detection_boxes', 'detection_scores',
                            'detection_classes', 'detection_masks'
                    ]:
                        tensor_name = key + ':0'
                        if tensor_name in all_tensor_names:
                            tensor_dict[key] = tf.get_default_graph().get_tensor_by_name(
                                tensor_name)

                    if 'detection_masks' in tensor_dict:
                        # The following processing is only for single image
                        detection_boxes = tf.squeeze(tensor_dict['detection_boxes'], [0])
                        detection_masks = tf.squeeze(tensor_dict['detection_masks'], [0])
                        # Reframe is required to translate mask from box coordinates to image coordinates and fit the image size.
                        real_num_detection = tf.cast(tensor_dict['num_detections'][0],
                                                    tf.int32)
                        detection_boxes = tf.slice(detection_boxes, [0, 0],
                                                [real_num_detection, -1])
                        detection_masks = tf.slice(detection_masks, [0, 0, 0],
                                                [real_num_detection, -1, -1])
                        detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(
                            detection_masks, detection_boxes, image_np.shape[0], image_np.shape[1])
                        detection_masks_reframed = tf.cast(
                            tf.greater(detection_masks_reframed, 0.5), tf.uint8)
                        # Follow the convention by adding back the batch dimension
                        tensor_dict['detection_masks'] = tf.expand_dims(
                            detection_masks_reframed, 0)
                    image_tensor = tf.get_default_graph().get_tensor_by_name('image_tensor:0')

                # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
                image_np_expanded = np.expand_dims(image_np, axis=0)
                # Actual detection.
                start = time.time()
                output_dict = run_inference_for_single_image(image_np, sess, tensor_dict, image_tensor)
                end = time.time()
                print("Inference time {}: {:.4f}".format(i, end - start))
                # Visualization of the results of a detection.
                vis_util.visualize_boxes_and_labels_on_image_array(
                    image_np,
                    output_dict['detection_boxes'],
                    output_dict['detection_classes'],
                    output_dict['detection_scores'],
                    category_index,
                    instance_masks=output_dict.get('detection_masks'),
                    use_normalized_coordinates=True,
                    line_thickness=8)

Since I've done it this way, I haven't seen the "axis attribute is out of bounds: 0" error.
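
The same idea can be written a bit more directly by building the tensor handles once, before the image loop, instead of guarding with if i == 0. A rough sketch (it reuses detection_graph, TEST_IMAGE_PATHS, and load_image_into_numpy_array from above and omits the mask-reframing ops, which would likewise be added to tensor_dict a single time):

with detection_graph.as_default():
    with tf.Session() as sess:
        graph = tf.get_default_graph()
        all_tensor_names = {out.name for op in graph.get_operations() for out in op.outputs}
        # Build the output-tensor handles a single time for the whole session.
        tensor_dict = {
            key: graph.get_tensor_by_name(key + ':0')
            for key in ['num_detections', 'detection_boxes', 'detection_scores',
                        'detection_classes', 'detection_masks']
            if key + ':0' in all_tensor_names
        }
        image_tensor = graph.get_tensor_by_name('image_tensor:0')

        for image_path in TEST_IMAGE_PATHS:
            # Every image is resized to the same 640x480 shape before inference.
            image = Image.open(image_path).resize((640, 480))
            image_np = load_image_into_numpy_array(image)
            output_dict = sess.run(
                tensor_dict, feed_dict={image_tensor: np.expand_dims(image_np, 0)})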

tensorflowbutler commented 4 years ago

Hi There, We are checking to see if you still need help on this, as this seems to be an old issue. Please update this issue with the latest information, code snippet to reproduce your issue and error you are seeing. If we don't hear from you in the next 7 days, this issue will be closed automatically. If you don't need help on this issue any more, please consider closing this.