Inconsistent output from quantized tflite and float32 tflite

Issue Type

Performance

OS

Ubuntu

OS architecture

x86_64

Programming Language

Python

Framework

TensorFlowLite

Model name and Weights/Checkpoints URL

https://github.com/PINTO0309/PINTO_model_zoo/tree/main/194_face_recognizer_fast

Description

The outputs of the float32 TFLite model and the fully quantized (uint8) TFLite model are quite different for the same input image. I am not sure whether a difference of this size is expected for a face recognition model.

Relevant Log Output

No response

URL or source code for simple inference testing code


def get_quant_int8_output(interpreter, output_index):
    """Return an output tensor of ``interpreter`` as real (dequantized) values.

    The tensor is read with ``interpreter.get_tensor(output_index)``. If its
    dtype is ``uint8`` or ``int8`` it is dequantized with the parameters
    stored in the matching entry of ``interpreter.get_output_details()``::

        real = (quantized - zero_points) * scales

    Tensors of any other dtype are returned unchanged.

    Args:
        interpreter: Object exposing ``get_tensor(index)`` and
            ``get_output_details()`` (e.g. a TFLite interpreter after
            ``invoke()``).
        output_index: Tensor index of the output to read.

    Returns:
        The dequantized values as a floating-point array, or the raw tensor
        when it is not integer-quantized or carries no quantization
        parameters.

    Raises:
        ValueError: If the tensor is ``uint8``/``int8`` but no output detail
            has ``index == output_index``.
    """
    feature = interpreter.get_tensor(output_index)

    # Only integer-quantized tensors need dequantization.
    if feature.dtype not in (np.uint8, np.int8):
        return feature

    detail = next(
        (d for d in interpreter.get_output_details()
         if d["index"] == output_index),
        None,
    )
    if detail is None:
        # Falling back to some other tensor's parameters would silently
        # produce wrong values, so fail loudly instead.
        raise ValueError(
            f"no output tensor with index {output_index!r} in output details"
        )

    params = detail["quantization_parameters"]
    zero_points = np.asarray(params["zero_points"])
    scales = np.asarray(params["scales"])
    if scales.size == 0:
        # No quantization parameters recorded: the raw values are all we have.
        return feature

    # Widen before subtracting so the difference can never wrap around in
    # 8-bit arithmetic, whatever type the zero points are stored as.
    # Parameters are applied with NumPy broadcasting rules.
    return (feature.astype(np.int32) - zero_points) * scales

# Load the test image and resize it to 112x112.
# NOTE(review): presumably an 8-bit RGB image with values in 0-255 — confirm.
image = plt.imread("Wu_Peng_0001.jpg")
image = cv2.resize(image, (112, 112))
# Float copy for the float32 model; only the dtype changes, no scaling or
# mean subtraction is applied here.
image_float = image.astype(np.float32)

# --- float32 model ---------------------------------------------------------
model_tf = tflite.Interpreter("194_face_recognizer_fast/saved_model_face_recognizer_fast/model_float32.tflite")
model_tf.allocate_tensors()
# Despite the name, this holds the input tensor *index*, not the details dict.
input_details = model_tf.get_input_details()[0]['index']
# `[None, ...]` prepends a leading axis of size 1 to the image.
model_tf.set_tensor(input_details, image_float[None, ...])
# Likewise the output tensor *index*.
output_details = model_tf.get_output_details()[0]['index']
model_tf.invoke()
embedding_tf = model_tf.get_tensor(output_details)

# --- fully integer-quantized model -----------------------------------------
# `input_details` / `output_details` are reused and overwritten below.
model_tf_uint8 = tflite.Interpreter("194_face_recognizer_fast/saved_model_face_recognizer_fast/model_full_integer_quant.tflite") 
model_tf_uint8.allocate_tensors()
input_details = model_tf_uint8.get_input_details()[0]['index']
# The resized image is passed as-is (no float cast).
# NOTE(review): assumes the model's input dtype matches `image.dtype` — confirm.
model_tf_uint8.set_tensor(input_details, image[None, ...])
output_details = model_tf_uint8.get_output_details()[0]['index']
model_tf_uint8.invoke()
# Read the output and dequantize it with the helper so that it can be
# compared against `embedding_tf`.
embedding_tf_uint8 = get_quant_int8_output(interpreter=model_tf_uint8, output_index=output_details)

import tensorflow as tf
import time
import numpy as np
from pprint import pprint

# Height and width used to build the dummy input tensors in this script.
H = 112
W = 112

############################################################
# OpenVINO FP32 run: feed an all-ones tensor, time the inference and print
# the 'fc1' output so it can be compared with other runtimes.
from openvino.inference_engine import IECore

ie = IECore()
net = ie.read_network(
    model='openvino/FP32/face_recognizer_fast.xml',
    weights='openvino/FP32/face_recognizer_fast.bin',
)
input_blob = next(iter(net.input_info))  # name of the first network input
exec_net = ie.load_network(network=net, device_name='CPU')

roop = 1  # number of timed inference iterations
e = 0.0  # accumulated inference time in seconds
result = None
# Dummy input of shape (1, 3, H, W) — presumably NCHW; confirm against the IR.
inp = np.ones((1, 3, H, W), dtype=np.float32)
for _ in range(roop):
    # perf_counter() is monotonic and high-resolution, unlike the wall
    # clock, so short intervals are measured reliably.
    s = time.perf_counter()
    result = exec_net.infer(inputs={input_blob: inp})
    e += time.perf_counter() - s
print('OpenVINO output @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
print(f'elapsed time: {e/roop*1000}ms')  # mean time per iteration
print(f'shape: {result["fc1"].shape}')
pprint(result['fc1'])

############################################################

# TFLite float32 run: feed an all-ones tensor, time the inference and print
# the first output so it can be compared with other runtimes.
interpreter = tf.lite.Interpreter(model_path='model_float32.tflite', num_threads=4)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

roop = 1  # number of timed inference iterations
e = 0.0  # accumulated inference time in seconds
result = None
# Dummy input of shape (1, H, W, 3) — presumably NHWC; confirm against the model.
inp = np.ones((1, H, W, 3), dtype=np.float32)
for _ in range(roop):
    # perf_counter() is monotonic and high-resolution, unlike the wall
    # clock, so short intervals are measured reliably.
    s = time.perf_counter()
    # Setting the input and reading the output are deliberately inside the
    # timed region, as in the original measurement.
    interpreter.set_tensor(input_details[0]['index'], inp)
    interpreter.invoke()
    result = interpreter.get_tensor(output_details[0]['index'])
    e += time.perf_counter() - s
print('tflite output @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
print(f'elapsed time: {e/roop*1000}ms')  # mean time per iteration
print(f'shape: {result.shape}')
pprint(result)

OpenVINO output @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
elapsed time: 7.77435302734375ms
shape: (1, 128)
array([[-0.30791214, -0.00172161, -0.32203954, -0.4465644 ,  0.5506708 ,
         0.37454242,  0.12765971, -0.34852087, -0.55883896,  0.05936675,
         0.1695568 , -0.2994346 , -0.32539135, -0.03171316,  0.1584038 ,
         0.06941229, -0.7254103 ,  0.22967511, -0.40271038,  0.03725373,
         0.17559841, -0.24316749,  0.20910957, -0.38083535,  0.2821527 ,
         0.64665926,  0.14585382, -0.01600966, -0.12683704, -0.16275045,
         0.046739  , -0.3108144 ,  0.42539626, -0.04507577, -0.01005561,
         0.33453947,  0.05367449,  0.09337044, -0.11997464, -0.25770757,
        -0.07461177, -0.6524699 ,  0.08730686, -0.06027391,  0.41346878,
        -0.31559795,  0.03768324,  0.41175154, -0.23365799, -0.17392859,
         0.44504082, -0.07452258,  0.12696928,  0.01730857, -0.4296226 ,
         0.19510299, -0.10751891, -0.04601097, -0.39241505,  0.27709037,
        -0.15694436, -0.2031919 , -0.5533962 , -0.04073964,  0.0481807 ,
        -0.5410749 ,  0.26406363,  0.00634025,  0.23910517,  0.5124856 ,
        -0.00100496,  0.41695806, -0.29561228,  0.1865741 , -0.02398361,
         0.06689882, -0.00815826,  0.20982082, -0.1156534 , -0.24292208,
         0.30511013,  0.08169898, -0.06288522,  0.05135342, -0.21524626,
         0.0141438 , -0.2397377 , -0.13586396,  0.29874226,  0.10599164,
        -0.38982812,  0.22435834,  0.1403134 ,  0.26996595, -0.27892074,
         0.45670548, -0.22077468,  0.05652796, -0.6642338 ,  0.6857936 ,
         0.18472582, -0.11868192, -0.20869748, -0.2998895 , -0.02720285,
         0.6399674 ,  0.05989804,  0.29125848,  0.06604294, -0.13117297,
        -0.01385057, -0.50441426, -0.14871584, -0.41186276, -0.15124089,
         0.27266443, -0.25697875, -0.33428097, -0.10980904,  0.14820883,
         0.3153484 ,  0.37297386,  0.07718931,  0.1712934 ,  0.18546768,
        -0.59192634,  0.33165112,  0.02169564]], dtype=float32)
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
tflite output @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
elapsed time: 7.680654525756836ms
shape: (1, 128)
array([[-0.30791512, -0.00172003, -0.32203746, -0.44656596,  0.55067044,
         0.37454244,  0.12766045, -0.34852353, -0.55884385,  0.05936857,
         0.16955893, -0.29943365, -0.32538933, -0.03171276,  0.15840298,
         0.06941136, -0.7254181 ,  0.22967474, -0.40271398,  0.03725343,
         0.17559725, -0.24316718,  0.20910642, -0.38083786,  0.28215402,
         0.6466574 ,  0.14585344, -0.01601134, -0.12683675, -0.16274995,
         0.04673719, -0.3108138 ,  0.4253984 , -0.04507551, -0.01005552,
         0.33454174,  0.05367091,  0.0933719 , -0.11997414, -0.257707  ,
        -0.07460804, -0.6524724 ,  0.08730798, -0.0602743 ,  0.4134705 ,
        -0.31559613,  0.03768187,  0.41174823, -0.23365946, -0.17392643,
         0.44504157, -0.07452238,  0.12697074,  0.01730801, -0.42962405,
         0.19510286, -0.10752033, -0.04601185, -0.39241552,  0.27709046,
        -0.15694287, -0.20319283, -0.55339617, -0.04074154,  0.04818019,
        -0.5410755 ,  0.26406595,  0.00633784,  0.2391063 ,  0.51248306,
        -0.00100307,  0.41696066, -0.29561004,  0.18657266, -0.0239817 ,
         0.06689788, -0.00816114,  0.20982026, -0.11565492, -0.24292044,
         0.30510852,  0.08169975, -0.06288571,  0.0513528 , -0.21524292,
         0.01414456, -0.23973477, -0.13586488,  0.29874215,  0.10599194,
        -0.389825  ,  0.22436565,  0.14031234,  0.2699634 , -0.27892292,
         0.45670462, -0.22077489,  0.05652812, -0.6642385 ,  0.6857954 ,
         0.18472388, -0.11868376, -0.20869698, -0.29988924, -0.02720007,
         0.639966  ,  0.05989749,  0.29125947,  0.06604305, -0.13117433,
        -0.01385185, -0.50442   , -0.14871615, -0.41186142, -0.15123957,
         0.27266696, -0.25697806, -0.33427885, -0.10980733,  0.14821012,
         0.31534806,  0.3729769 ,  0.07718804,  0.17129503,  0.1854662 ,
        -0.591922  ,  0.33165163,  0.02169659]], dtype=float32)

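The float32 outputs before and after conversion (OpenVINO vs. TFLite, shown above) are almost identical, so the difference you are seeing comes from quantization rather than from the conversion. Please adjust the quantization parameters in the conversion script to suit your needs: https://github.com/PINTO0309/PINTO_model_zoo/blob/main/194_face_recognizer_fast/convert_script.txt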

PINTO0309 / PINTO_model_zoo

Inconsistent output from quantized tflite and float32 tflite #263