MaybeShewill-CV / CRNN_Tensorflow

Convolutional Recurrent Neural Networks(CRNN) for Scene Text Recognition
MIT License
1.03k stars 388 forks source link

固化模型为pb文件的时候,单张图片推理速度为5S。 #444

Closed zhu2bowen closed 2 years ago

zhu2bowen commented 2 years ago

如题:本地执行 .\tools\test_shadownet.py 时,单张图片推理耗时不到 1 s;把模型转换成 pb 文件后再加载推理,单张图片耗时约 5 s,慢了约 5 倍。转换脚本如下:

#!/usr/bin/env python3
# Export script: freezes a CRNN checkpoint into a single protobuf graph file.

import argparse
import sys

import numpy as np
import tensorflow as tf

# The project root has to be on sys.path before the project-local imports
# below can be resolved.
sys.path.append(r"D:\develop\crnn_tensorflow\CRNN_Tensorflow")
from config import global_config
from crnn_model import crnn_net

CFG = global_config.cfg
# Graph nodes that are kept as outputs when the variables are frozen.
OUTPUT_NODE_NAME = ["CTCBeamSearchDecoder"]

def init_args(argv=None):
    """
    Parse the command-line arguments of the export script.

    :param argv: optional list of argument strings; when None, argparse
        reads the process arguments (``sys.argv[1:]``)
    :return: argparse.Namespace with the attributes ``ckpt_path`` and
        ``pb_path`` (each is None when the option is not given)
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('-i', '--ckpt_path', type=str, help='The pretrained ckpt model weights file path')
    parser.add_argument('-s', '--pb_path', type=str, help='The model export dir')

    return parser.parse_args(argv)

def build_saved_model(ckpt_path, pb_path, beam_width=100):
    """
    Build the inference graph, restore the ckpt weights and write a frozen
    graph (.pb) that ends in a CTC beam search decoder.

    :param ckpt_path: checkpoint path handed to ``saver.restore``
    :param pb_path: path of the frozen graph file to write
    :param beam_width: beam width of the CTC beam search decoder baked into
        the graph; 100 is the value TensorFlow uses when none is given
    :return: None
    """
    # build inference tensorflow graph
    image_size = tuple(CFG.ARCH.INPUT_SIZE)
    # NOTE(review): INPUT_SIZE is indexed [1] then [0] for the two spatial
    # dims, so it is presumably (width, height) - confirm against the config.
    image_tensor = tf.placeholder(
        dtype=tf.float32,
        shape=[1, image_size[1], image_size[0], 3],
        name='input_tensor')

    # set crnn net
    net = crnn_net.ShadowNet(
        phase='test',
        hidden_nums=CFG.ARCH.HIDDEN_UNITS,
        layers_nums=CFG.ARCH.HIDDEN_LAYERS,
        num_classes=CFG.ARCH.NUM_CLASSES
    )

    # compute inference logits
    inference_ret = net.inference(
        inputdata=image_tensor,
        name='shadow_net',
        reuse=False
    )

    # beam search decode; the returned tensors are not needed here, the call
    # only has to add the decoder op (listed in OUTPUT_NODE_NAME) to the graph
    tf.nn.ctc_beam_search_decoder(
        inputs=inference_ret,
        sequence_length=CFG.ARCH.SEQ_LENGTH * np.ones(1),
        beam_width=beam_width,
        merge_repeated=False
    )

    saver = tf.train.Saver()

    # Set sess configuration
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.per_process_gpu_memory_fraction = CFG.TRAIN.GPU_MEMORY_FRACTION
    sess_config.gpu_options.allow_growth = CFG.TRAIN.TF_ALLOW_GROWTH
    sess_config.gpu_options.allocator_type = 'BFC'

    # the context manager closes the session once the graph has been written
    with tf.Session(config=sess_config) as sess:
        saver.restore(sess=sess, save_path=ckpt_path)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess, sess.graph_def, OUTPUT_NODE_NAME
        )
        # save frozen graph to pb file
        with tf.gfile.GFile(pb_path, 'wb') as fp:
            fp.write(output_graph_def.SerializeToString())

def ckpt2pb(ckpt_file, pb_path, output_node_name=None):
    """Transfer ckpt format to pb.

    The graph structure is taken from the ``<ckpt_file>.meta`` file that
    accompanies the checkpoint.

    Parameters
    ----------
    ckpt_file : str
        full name of ckpt model file
    pb_path : str
        full name of protobuf model file
    output_node_name : list, optional
        a list containing names of output nodes; when omitted the
        module-level OUTPUT_NODE_NAME is used
    """
    if output_node_name is None:
        output_node_name = OUTPUT_NODE_NAME

    with tf.Session() as sess:
        # restore graph
        saver = tf.train.import_meta_graph(ckpt_file + ".meta")

        # restore weights
        saver.restore(sess, ckpt_file)

        # define frozen graph
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess, sess.graph_def, output_node_name
        )

        # save frozen graph to pb file
        with tf.gfile.GFile(pb_path, 'wb') as fp:
            fp.write(output_graph_def.SerializeToString())

if __name__ == '__main__':
    # Script entry point: build saved model from the command-line arguments.

    # init args
    args = init_args()

    # build saved model
    build_saved_model(args.ckpt_path, args.pb_path)

推理的脚本如下:

class Inference:
    """Text recognition with a frozen graph (.pb) loaded into a TF session.

    Typical use: construct, call ``setup()`` once, call ``infer()`` per
    image, then ``close()``.
    """

    def __init__(self, model_dir):
        """
        :param model_dir: directory that holds ``char_dict_cn.json``,
            ``ord_map_cn.json`` and ``inference_model_cn.pb``
        """
        char_dict_path = osp.join(model_dir, "char_dict_cn.json")
        ord_map_dict_path = osp.join(model_dir, "ord_map_cn.json")
        self.codec = CharDict(char_dict_path, ord_map_dict_path)

        self.pb_path = osp.join(model_dir, "inference_model_cn.pb")
        # session and graph endpoints are filled in by setup() / load_graph()
        self._sess = None
        self.inputs = None
        self.outputs = None

    def load_graph(self):
        """Decode model file and import it into a fresh graph.

        Also stores the input placeholder in ``self.inputs`` and the three
        decoder output tensors in ``self.outputs``.

        Returns
        -------
        default_graph : obj
            tf graph
        """
        with open(self.pb_path, 'rb') as fp:
            decoded = fp.read()

        graph_def = tf.GraphDef()
        graph_def.ParseFromString(decoded)
        # report every node that carries an explicit device assignment
        for node in graph_def.node:
            if node.device != '':
                print(node.device)

        # load the graph_def in the default graph_def
        # TODO: read the threshold parameters from the pb file and override
        #       them with the configured thresholds
        # TODO: batch size and img size could both be read from the model
        with tf.Graph().as_default() as default_graph:
            tf.import_graph_def(graph_def, name='')
            # get input tensor
            self.inputs = default_graph.get_tensor_by_name('input_tensor:0')
            # get output tensor
            output_0 = default_graph.get_tensor_by_name('CTCBeamSearchDecoder:0')
            output_1 = default_graph.get_tensor_by_name('CTCBeamSearchDecoder:1')
            output_2 = default_graph.get_tensor_by_name('CTCBeamSearchDecoder:2')
            self.outputs = [output_0, output_1, output_2]

        return default_graph

    def setup(self, tf_config=None, gpu_total_memory=0, batch_size=10):
        """Initialize inference session.

        Parameters
        ----------
        tf_config : tf.ConfigProto
            TF config proto; used unchanged when given
        gpu_total_memory: int
            total memory in MB of GPU device, if >0 set fixed memory fraction for TF session, else allow_growth = True
        batch_size : int
            size of a batch, only used to estimate the required GPU memory
        """
        # release a session left over from an earlier setup() call
        self.close()

        # restore model from frozen graph
        default_graph = self.load_graph()

        # config gpu allocation
        config = tf_config

        if config is None:
            # soft placement is kept from the session config this method
            # used to hard-code
            config = tf.ConfigProto(allow_soft_placement=True)
            if gpu_total_memory > 0:
                min_memory = 3000  # Mb
                required_memory = max(batch_size * 500, min_memory)
                config.gpu_options.per_process_gpu_memory_fraction = required_memory / gpu_total_memory
            else:
                config.gpu_options.allow_growth = True

        # the config resolved above is the one that is actually applied
        self._sess = tf.Session(graph=default_graph, config=config)
        logger.info('session setup.')

    def infer(self, img, target_size=640, conf_thres=0.25, iou_thres=0.45, keep_cls_idxs=None):
        """Recognize the text on one image.

        ``target_size``, ``conf_thres``, ``iou_thres`` and ``keep_cls_idxs``
        are accepted but not read by this method.

        :param img: image array as accepted by ``cv2.resize``
        :return: first element of ``self.codec.sparse(...)`` applied to the
            decoder outputs (presumably the decoded string - confirm against
            CharDict)
        :raises RuntimeError: when ``setup()`` has not been called
        """
        # local import: the timing prints must not depend on a module-level
        # ``time`` that only exists when the file runs as a script
        import time

        if self._sess is None:
            raise RuntimeError('setup() must be called before infer()')

        img = cv2.resize(img, tuple(CFG.INPUT_SIZE), interpolation=cv2.INTER_LINEAR)
        # maps a pixel value v to v / 127.5 - 1.0 (0..255 becomes -1..1)
        img = np.array(img, np.float32) / 127.5 - 1.0
        # the two timestamps bracket the session run for manual timing
        print(time.time())
        pred = self._sess.run(self.outputs, feed_dict={self.inputs: [img]})
        print(time.time())
        pred = self.codec.sparse(*pred)[0]
        return pred

    def close(self):
        """Close the session if one is open; safe to call repeatedly."""
        if self._sess:
            self._sess.close()
            self._sess = None

if __name__ == "__main__":
    # Manual smoke test: load the frozen model, run one image and print
    # timestamps around the call.
    import time

    infer = Inference(r"D:\develop\crnn_tensorflow\pretrained_models\saved_pb_diy")
    infer.setup()
    try:
        img_path = r"D:\develop\data\projects\distribution_license_detection\tests\tmp.JPG"
        image = cv2.imread(img_path, cv2.IMREAD_COLOR)
        print(time.time())
        print(infer.infer(image))
        print(time.time())
    finally:
        # release the session even when inference fails
        infer.close()

zhu2bowen commented 2 years ago

补充:用的是windows系统,上CPU,没有GPU;没有使用推理服务,耗时是上述脚本本地推理

MaybeShewill-CV commented 2 years ago

@zhu2bowen sry 我这边没有在windows环境上测试过这个pb模型,你可以测试下同样的模型代码在ubuntu上有没有问题,或者确认下是不是tf版本不同导致的问题

zhu2bowen commented 2 years ago

@MaybeShewill-CV 谢谢。后来的测试发现主要问题是 tf.nn.ctc_beam_search_decoder 带来的。有两个发现:1、加载冻结网络后,第一次推理时网络主体和 ctc_beam_search_decoder 算子的耗时都比较长,均超过 1 s;从第二次推理开始,网络主体的耗时迅速缩短到 0.1 s 以内,而 ctc_beam_search_decoder 算子的耗时仍然超过 1 s。把 ctc_beam_search_decoder 的 beam_width 调小后,第二次及之后的推理耗时会明显缩短,beam_width 越小,加速越明显。2、使用 tensorflow.python.tools.optimize_for_inference_lib 优化 pb 文件没有效果。一个猜测:冻结过程导致 TensorFlow 运行机制下 ctc_beam_search_decoder 算子的加速失效。参考问题:https://github.com/tensorflow/tensorflow/issues/26200

MaybeShewill-CV commented 2 years ago

@zhu2bowen beam search确实比较慢 可以尝试下greedy search能不能兼顾精度和速度:)