PaddlePaddle / Paddle

PArallel Distributed Deep LEarning: Machine Learning Framework from Industrial Practice (『飞桨』核心框架,深度学习&机器学习高性能单机、分布式训练和跨平台部署)
http://www.paddlepaddle.org/
Apache License 2.0
22.17k stars 5.57k forks source link

attention-ocr模型加载保存参数问题 #18271

Closed xiangyubo closed 5 years ago

xiangyubo commented 5 years ago

标题:简洁、精准描述您的问题,例如“ssd 模型前置lstm报错  ”

我自己稍微改了一下 attention 模型的写法,使用自己准备的一个小数据集。数据量大约 1w 出头。我训练的时候,从loss 和编辑距离,预估准确率上看都很正常。在训练的时候,我会使用save_persistables保存当前预估准确率最好的模型参数。 训练结束后,我想把保存的参数通过 save_inference_model 转化成预测形式保存的参数。所以我的思路是先用 load_persistables 加载进来,然后再 save_inference_model 保存。如果我没有调用自己改写的 infer,整个过程不会报错。如果我调用自己改写 infer,在重新加载参数的时候会报错说找不到 conv_8 的参数。但实际上模型在卷积部分只有0-7号.....所以很懵逼

以下是我的代码: 训练部分 `

# -*- coding: UTF-8 -*-
"""
训练基于attention-ocr的网络,文字行识别
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import uuid
import numpy as np
import time
import six
import math
import random
import paddle
import paddle.fluid as fluid
import logging
import xml.etree.ElementTree
import codecs
import json

from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.regularizer import L2Decay
from PIL import Image, ImageEnhance, ImageDraw

logger = None
train_parameters = {
    "input_size": [1, 48, 512],
    "data_dir": "data/data6927/word-recognition",
    "train_dir": "trainImageSet",
    "eval_dir": "evalImageSet",
    "train_list": "train.txt",
    "eval_list": "eval.txt",
    "label_list": "label_list.txt",
    "class_dim": -1,
    "label_dict": {},
    "image_count": -1,
    "continue_train": True,
    "pretrained": False,
    "pretrained_model_dir": "./pretrained-model",
    "save_model_dir": "./attention-ocr-model",
    "num_epochs": 250,
    "train_batch_size": 256,
    "use_gpu": True,
    "decoder_size": 128,
    "word_vector_dim": 128,
    "max_char_length": 40,      # 最大识别字符串长度
    "gradient_clip": 10,
    "sos": 0,
    "eos": 1,
    "mean_color": 127.5,
    "mode": "train",
    "multi_data_reader_count": 4,
    "apply_distort": True,
    "image_distort_strategy": {
        "expand_prob": 0.5,
        "expand_max_ratio": 2,
        "hue_prob": 0.5,
        "hue_delta": 18,
        "contrast_prob": 0.5,
        "contrast_delta": 0.5,
        "saturation_prob": 0.5,
        "saturation_delta": 0.5,
        "brightness_prob": 0.5,
        "brightness_delta": 0.125
    },
    "sgd_strategy": {
        "learning_rate": 0.001,
        "lr_epochs": [70, 140, 200],
        "lr_decay": [1, 0.5, 0.1, 0.05]
    },
    "early_stop": {
        "sample_frequency": 50,
        "successive_limit": 3,
        "min_accuracy": 0.95
    }
}

class AttentionOCR(object):

    def __init__(self, num_classes, decoder_size, word_vector_dim, max_char_length ,label_dict):
        self.outputs = None
        self.decoder_size = decoder_size
        self.word_vector_dim = word_vector_dim
        self.label_dict = label_dict
        self.max_char_length = max_char_length
        self.num_classes = num_classes

    def name(self):
        return 'attention-ocr'

    def conv_bn_pool(self, input, group, out_ch, act="relu", is_test=False, pooling=True, use_cudnn=False):
        tmp = input
        for i in six.moves.xrange(group):
            filter_size = 3
            conv_std = (2.0 / (filter_size**2 * tmp.shape[1]))**0.5
            conv_param = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, conv_std))
            tmp = fluid.layers.conv2d(
                input=tmp,
                num_filters=out_ch[i],
                filter_size=3,
                padding=1,
                bias_attr=False,
                param_attr=conv_param,
                act=None,  # LinearActivation
                use_cudnn=use_cudnn)
            tmp = fluid.layers.batch_norm(input=tmp, act=act, is_test=is_test)
        if pooling:
            tmp = fluid.layers.pool2d(
                input=tmp,
                pool_size=2,
                pool_type='max',
                pool_stride=2,
                use_cudnn=use_cudnn,
                ceil_mode=True)

        return tmp

    def ocr_convs(self, input, is_test=False, use_cudnn=True):
        tmp = input
        tmp = self.conv_bn_pool(tmp, 2, [16, 16], is_test=is_test, use_cudnn=use_cudnn)
        tmp = self.conv_bn_pool(tmp, 2, [32, 32], is_test=is_test, use_cudnn=use_cudnn)
        tmp = self.conv_bn_pool(tmp, 2, [64, 64], is_test=is_test, use_cudnn=use_cudnn)
        tmp = self.conv_bn_pool(tmp, 2, [128, 128], is_test=is_test, pooling=False, use_cudnn=use_cudnn)

        return tmp

    def encoder_net(self, images, rnn_hidden_size=200, is_test=False, use_cudnn=True):
        conv_features = self.ocr_convs(images, is_test=is_test, use_cudnn=use_cudnn)

        sliced_feature = fluid.layers.im2sequence(
            input=conv_features,
            stride=[1, 1],
            filter_size=[conv_features.shape[2], 1])

        para_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, 0.02))
        bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0)

        fc_1 = fluid.layers.fc(input=sliced_feature,
                               size=rnn_hidden_size * 3,
                               param_attr=para_attr,
                               bias_attr=False)
        fc_2 = fluid.layers.fc(input=sliced_feature,
                               size=rnn_hidden_size * 3,
                               param_attr=para_attr,
                               bias_attr=False)

        gru_forward = fluid.layers.dynamic_gru(
            input=fc_1,
            size=rnn_hidden_size,
            param_attr=para_attr,
            bias_attr=bias_attr,
            candidate_activation='relu')
        gru_backward = fluid.layers.dynamic_gru(
            input=fc_2,
            size=rnn_hidden_size,
            is_reverse=True,
            param_attr=para_attr,
            bias_attr=bias_attr,
            candidate_activation='relu')

        encoded_vector = fluid.layers.concat(
            input=[gru_forward, gru_backward], axis=1)
        encoded_proj = fluid.layers.fc(input=encoded_vector,
                                       size=self.decoder_size,
                                       bias_attr=False)

        return gru_backward, encoded_vector, encoded_proj

    def gru_decoder_with_attention(self, target_embedding, encoder_vec, encoder_proj, decoder_boot):
        def simple_attention(encoder_vec, encoder_proj, decoder_state):
            decoder_state_proj = fluid.layers.fc(input=decoder_state,
                                                 size=self.decoder_size,
                                                 bias_attr=False)
            decoder_state_expand = fluid.layers.sequence_expand(
                x=decoder_state_proj, y=encoder_proj)
            concated = encoder_proj + decoder_state_expand
            concated = fluid.layers.tanh(x=concated)
            attention_weights = fluid.layers.fc(input=concated, size=1, act=None, bias_attr=False)
            attention_weights = fluid.layers.sequence_softmax( input=attention_weights)
            weigths_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1])
            scaled = fluid.layers.elementwise_mul( x=encoder_vec, y=weigths_reshape, axis=0)
            context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
            return context

        rnn = fluid.layers.DynamicRNN()

        with rnn.block():
            current_word = rnn.step_input(target_embedding)
            encoder_vec = rnn.static_input(encoder_vec)
            encoder_proj = rnn.static_input(encoder_proj)
            hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
            context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
            fc_1 = fluid.layers.fc(input=context, size=self.decoder_size * 3, bias_attr=False)
            fc_2 = fluid.layers.fc(input=current_word, size=self.decoder_size * 3, bias_attr=False)
            decoder_inputs = fc_1 + fc_2
            h, _, _ = fluid.layers.gru_unit(input=decoder_inputs, hidden=hidden_mem, size=self.decoder_size * 3)
            rnn.update_memory(hidden_mem, h)
            out = fluid.layers.fc(input=h, size=self.num_classes + 2, bias_attr=True, act='softmax')
            rnn.output(out)
        return rnn()

    def net(self, images, label_in):

        gru_backward, encoded_vector, encoded_proj = self.encoder_net(images)

        backward_first = fluid.layers.sequence_pool(input=gru_backward, pool_type='first')
        decoder_boot = fluid.layers.fc(input=backward_first, size=self.decoder_size, 
                                        bias_attr=False, act="relu")

        label_in = fluid.layers.cast(x=label_in, dtype='int64')
        trg_embedding = fluid.layers.embedding(
            input=label_in,
            size=[self.num_classes + 2, self.word_vector_dim],
            dtype='float32')
        prediction = self.gru_decoder_with_attention(trg_embedding, encoded_vector,
                                                encoded_proj, decoder_boot)
        return prediction

    def infer(self, images, use_cudnn=True):
        beam_size = 1
        gru_backward, encoded_vector, encoded_proj = self.encoder_net(images, is_test=True, use_cudnn=use_cudnn)

        backward_first = fluid.layers.sequence_pool(input=gru_backward, pool_type='first')
        decoder_boot = fluid.layers.fc(input=backward_first, size=self.decoder_size, 
                                        bias_attr=False, act="relu")
        init_state = decoder_boot
        array_len = fluid.layers.fill_constant(shape=[1], dtype='int64', value=self.max_char_length)
        counter = fluid.layers.zeros(shape=[1], dtype='int64', force_cpu=True)

        # fill the first element with init_state
        state_array = fluid.layers.create_array('float32')
        fluid.layers.array_write(init_state, array=state_array, i=counter)

        # ids, scores as memory
        ids_array = fluid.layers.create_array('int64')
        scores_array = fluid.layers.create_array('float32')

        init_ids = fluid.layers.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
        init_scores = fluid.layers.data(name="init_scores", shape=[1], dtype="float32", lod_level=2)

        fluid.layers.array_write(init_ids, array=ids_array, i=counter)
        fluid.layers.array_write(init_scores, array=scores_array, i=counter)

        cond = fluid.layers.less_than(x=counter, y=array_len)
        while_op = fluid.layers.While(cond=cond)
        with while_op.block():
            pre_ids = fluid.layers.array_read(array=ids_array, i=counter)
            pre_state = fluid.layers.array_read(array=state_array, i=counter)
            pre_score = fluid.layers.array_read(array=scores_array, i=counter)

            pre_ids_emb = fluid.layers.embedding(
                input=pre_ids,
                size=[self.num_classes + 2, self.word_vector_dim],
                dtype='float32')

            context = self._simple_attention(encoded_vector, encoded_proj, pre_state, self.decoder_size)

            # expand the recursive_sequence_lengths of pre_state to be the same with pre_score
            pre_state_expanded = fluid.layers.sequence_expand(pre_state, pre_score)
            context_expanded = fluid.layers.sequence_expand(context, pre_score)
            fc_1 = fluid.layers.fc(input=context_expanded, size=self.decoder_size * 3, bias_attr=False)
            fc_2 = fluid.layers.fc(input=pre_ids_emb, size=self.decoder_size * 3, bias_attr=False)

            decoder_inputs = fc_1 + fc_2
            current_state, _, _ = fluid.layers.gru_unit(
                input=decoder_inputs,
                hidden=pre_state_expanded,
                size=self.decoder_size * 3)

            current_state_with_lod = fluid.layers.lod_reset(x=current_state, y=pre_score)
            # use score to do beam search
            current_score = fluid.layers.fc(input=current_state_with_lod,
                                            size=self.num_classes + 2,
                                            bias_attr=True,
                                            act='softmax')
            topk_scores, topk_indices = fluid.layers.topk(current_score, k=beam_size)

            # calculate accumulated scores after topk to reduce computation cost
            accu_scores = fluid.layers.elementwise_add(
                x=fluid.layers.log(topk_scores),
                y=fluid.layers.reshape(pre_score, shape=[-1]),
                axis=0)
            selected_ids, selected_scores = fluid.layers.beam_search(
                pre_ids,
                pre_score,
                topk_indices,
                accu_scores,
                beam_size,
                1,  # end_id
                #level=0
            )

            fluid.layers.increment(x=counter, value=1, in_place=True)

            # update the memories
            fluid.layers.array_write(current_state, array=state_array, i=counter)
            fluid.layers.array_write(selected_ids, array=ids_array, i=counter)
            fluid.layers.array_write(selected_scores, array=scores_array, i=counter)

            # update the break condition: up to the max length or all candidates of
            # source sentences have ended.
            length_cond = fluid.layers.less_than(x=counter, y=array_len)
            finish_cond = fluid.layers.logical_not(fluid.layers.is_empty(x=selected_ids))
            fluid.layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        ids, scores = fluid.layers.beam_search_decode(ids_array, scores_array,
                                                      beam_size, eos)
        return ids

    def _simple_attention(self, encoder_vec, encoder_proj, decoder_state, decoder_size):
        decoder_state_proj = fluid.layers.fc(input=decoder_state, size=decoder_size, bias_attr=False)
        decoder_state_expand = fluid.layers.sequence_expand(x=decoder_state_proj, y=encoder_proj)
        concated = fluid.layers.elementwise_add(encoder_proj, decoder_state_expand)
        concated = fluid.layers.tanh(x=concated)
        attention_weights = fluid.layers.fc(input=concated, size=1, act=None, bias_attr=False)
        attention_weights = fluid.layers.sequence_softmax(input=attention_weights)
        weigths_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1])
        scaled = fluid.layers.elementwise_mul(x=encoder_vec, y=weigths_reshape, axis=0)
        context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
        return context

def init_train_parameters():
    """
    初始化训练参数,主要是初始化图片数量,类别数
    :return:
    """
    train_list = os.path.join(train_parameters['data_dir'], train_parameters['train_list'])
    label_list = os.path.join(train_parameters['data_dir'], train_parameters['label_list'])
    index = 0
    with codecs.open(label_list, encoding='utf-8') as flist:
        lines = [line.strip() for line in flist]
        for line in lines:
            parts = line.split()
            train_parameters['label_dict'][parts[0]] = int(parts[1])
            index += 1
        train_parameters['class_dim'] = index
    with codecs.open(train_list, encoding='utf-8') as flist:
        lines = [line.strip() for line in flist]
        train_parameters['image_count'] = len(lines)

def init_log_config():
    """
    初始化日志相关配置
    :return:
    """
    global logger
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_path = os.path.join(os.getcwd(), 'logs')
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    log_name = os.path.join(log_path, 'train.log')
    sh = logging.StreamHandler()
    fh = logging.FileHandler(log_name, mode='w')
    fh.setLevel(logging.DEBUG)
    formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
    fh.setFormatter(formatter)
    sh.setFormatter(formatter)
    logger.addHandler(sh)
    logger.addHandler(fh)

def resize_img(img, input_size):
    target_size = input_size
    percent_h = float(target_size[1]) / img.size[1]
    percent_w = float(target_size[2]) / img.size[0]
    percent = min(percent_h, percent_w)
    resized_width = int(round(img.size[0] * percent))
    resized_height = int(round(img.size[1] * percent))
    w_off = (target_size[2] - resized_width) / 2
    h_off = (target_size[1] - resized_height) / 2
    img = img.resize((resized_width, resized_height), Image.ANTIALIAS)
    array = np.ndarray((target_size[1], target_size[2], 3), np.uint8)
    array[:, :, 0] = 127
    array[:, :, 1] = 127
    array[:, :, 2] = 127
    ret = Image.fromarray(array)
    ret.paste(img, (np.random.randint(0, w_off + 1), int(h_off)))
    return ret

def random_brightness(img):
    prob = np.random.uniform(0, 1)
    if prob < train_parameters['image_distort_strategy']['brightness_prob']:
        brightness_delta = train_parameters['image_distort_strategy']['brightness_delta']
        delta = np.random.uniform(-brightness_delta, brightness_delta) + 1
        img = ImageEnhance.Brightness(img).enhance(delta)
    return img

def random_contrast(img):
    prob = np.random.uniform(0, 1)
    if prob < train_parameters['image_distort_strategy']['contrast_prob']:
        contrast_delta = train_parameters['image_distort_strategy']['contrast_delta']
        delta = np.random.uniform(-contrast_delta, contrast_delta) + 1
        img = ImageEnhance.Contrast(img).enhance(delta)
    return img

def random_saturation(img):
    prob = np.random.uniform(0, 1)
    if prob < train_parameters['image_distort_strategy']['saturation_prob']:
        saturation_delta = train_parameters['image_distort_strategy']['saturation_delta']
        delta = np.random.uniform(-saturation_delta, saturation_delta) + 1
        img = ImageEnhance.Color(img).enhance(delta)
    return img

def random_hue(img):
    prob = np.random.uniform(0, 1)
    if prob < train_parameters['image_distort_strategy']['hue_prob']:
        hue_delta = train_parameters['image_distort_strategy']['hue_delta']
        delta = np.random.uniform(-hue_delta, hue_delta)
        img_hsv = np.array(img.convert('HSV'))
        img_hsv[:, :, 0] = img_hsv[:, :, 0] + delta
        img = Image.fromarray(img_hsv, mode='HSV').convert('RGB')
    return img

def distort_image(img):
    prob = np.random.uniform(0, 1)
    # Apply different distort order
    if prob > 0.5:
        img = random_brightness(img)
        img = random_contrast(img)
        img = random_saturation(img)
        img = random_hue(img)
    else:
        img = random_brightness(img)
        img = random_saturation(img)
        img = random_hue(img)
        img = random_contrast(img)
    return img

def rotate_image(img):
    """
    图像增强,增加随机旋转角度
    """
    prob = np.random.uniform(0, 1)
    if prob > 0.5:
        angle = np.random.randint(-8, 8)
        img = img.rotate(angle)
    return img

def random_expand(img, keep_ratio=True):
    if np.random.uniform(0, 1) < train_parameters['image_distort_strategy']['expand_prob']:
        return img

    max_ratio = train_parameters['image_distort_strategy']['expand_max_ratio']
    w, h = img.size
    c = 3
    ratio_x = random.uniform(1, max_ratio)
    if keep_ratio:
        ratio_y = ratio_x
    else:
        ratio_y = random.uniform(1, max_ratio)
    oh = int(h * ratio_y)
    ow = int(w * ratio_x)
    off_x = random.randint(0, ow - w)
    off_y = random.randint(0, oh - h)

    out_img = np.zeros((oh, ow, c), np.uint8)
    for i in range(c):
        out_img[:, :, i] = train_parameters['mean_color']

    out_img[off_y: off_y + h, off_x: off_x + w, :] = img

    return Image.fromarray(out_img)

def preprocess(img, input_size):
    img_width, img_height = img.size
    if train_parameters['apply_distort']:
        img = distort_image(img)
    img = random_expand(img)
    img = rotate_image(img)
    # img = resize_img(img, input_size)
    # img = img.convert('L')
    # img = np.array(img).astype('float32') - train_parameters['mean_color']
    # img *= 0.007843
    return img

def custom_reader(file_list, data_dir, input_size, mode):
    def reader():
        np.random.shuffle(file_list)
        for line in file_list:
            # img_name, label
            parts = line.split()
            image_path = parts[0]
            img = Image.open(image_path)
            # img = Image.open(os.path.join(data_dir, image_path))
            if img.mode != 'RGB':
                img = img.convert('RGB')
            label = [int(train_parameters['label_dict'][c]) for c in parts[-1]]
            if len(label) == 0:
                continue
            if mode == 'train':
                img = preprocess(img, input_size)
            img = resize_img(img, input_size)
            img = img.convert('L')
            # img.save(image_path)
            img = np.array(img).astype('float32') - train_parameters['mean_color']
            # img *= 0.007843
            img = img[np.newaxis, ...]
            # print("{0} {1}".format(image_path, label))
            sos = train_parameters['sos']
            eos = train_parameters['eos']
            yield img, [sos] + label, label + [eos]

    return reader

def multi_process_custom_reader(file_path, data_dir, num_workers, input_size, mode):
    file_path = os.path.join(data_dir, file_path)
    readers = []
    images = [line.strip() for line in open(file_path)]
    n = int(math.ceil(len(images) // num_workers))
    image_lists = [images[i: i + n] for i in range(0, len(images), n)]
    train_path = os.path.join(train_parameters['data_dir'], train_parameters['train_dir'])
    for l in image_lists:
        reader = paddle.batch(custom_reader(l, train_path, input_size, mode),
                              batch_size=train_parameters['train_batch_size'])
        readers.append(paddle.reader.shuffle(reader, train_parameters['train_batch_size']))
    return paddle.reader.multiprocess_reader(readers, False)

def create_eval_reader(file_path, data_dir, input_size, mode):
    file_path = os.path.join(data_dir, file_path)
    images = [line.strip() for line in open(file_path)]
    eval_path = os.path.join(train_parameters['data_dir'], train_parameters['eval_dir'])
    return paddle.batch(custom_reader(images, eval_path, input_size, mode),
                        batch_size=train_parameters['train_batch_size'])

def optimizer_sgd_setting():
    batch_size = train_parameters["train_batch_size"]
    iters = train_parameters["image_count"] // batch_size
    learning_strategy = train_parameters['sgd_strategy']
    lr = learning_strategy['learning_rate']

    boundaries = [i * iters for i in learning_strategy["lr_epochs"]]
    values = [i * lr for i in learning_strategy["lr_decay"]]

    optimizer = fluid.optimizer.SGDOptimizer(
        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
        regularization=fluid.regularizer.L2Decay(0.00005))

    return optimizer

def build_train_program_with_async_reader(main_prog, startup_prog):
    with fluid.program_guard(main_prog, startup_prog):
        img = fluid.layers.data(name='img', shape=train_parameters['input_size'], dtype='float32')
        label_in = fluid.layers.data(name='label_in', shape=[1], dtype='int32', lod_level=1)
        label_out = fluid.layers.data(name='label_out', shape=[1], dtype='int32', lod_level=1)

        data_reader = fluid.layers.create_py_reader_by_data(capacity=train_parameters['train_batch_size'],
                                                            feed_list=[img, label_in, label_out],
                                                            name='train')
        multi_reader = multi_process_custom_reader(train_parameters['train_list'],
                                                   train_parameters['data_dir'],
                                                   train_parameters['multi_data_reader_count'],
                                                   train_parameters['input_size'],
                                                   'train')
        data_reader.decorate_paddle_reader(multi_reader)
        img, label_in, label_out = fluid.layers.read_file(data_reader)
        loss, distances, seq_num, decoded_out = get_loss(img, label_in, label_out)
        return data_reader, loss, distances, seq_num, decoded_out

def build_eval_program_with_feeder(main_prog, startup_prog, place):
    with fluid.program_guard(main_prog, startup_prog):
        img = fluid.layers.data(name='img', shape=train_parameters['input_size'], dtype='float32')
        label_in = fluid.layers.data(name='label_in', shape=[1], dtype='int32', lod_level=1)
        label_out = fluid.layers.data(name='label_out', shape=[1], dtype='int32', lod_level=1)

        feeder = fluid.DataFeeder(feed_list=[img, label_in, label_out], place=place, program=main_prog)
        reader = create_eval_reader(train_parameters['eval_list'],
                                    train_parameters['data_dir'],
                                    train_parameters['input_size'],
                                    'eval')
        loss, distances, seq_num, decoded_out = get_loss(img, label_in, label_out)
        return reader, loss, distances, seq_num, decoded_out

def get_loss(img, label_in, label_out):
    with fluid.unique_name.guard():
        class_dim = train_parameters['class_dim']
        decoder_size = train_parameters['decoder_size']
        word_vector_dim = train_parameters['word_vector_dim']
        label_dict = train_parameters['label_dict']
        max_char_length = train_parameters['max_char_length']
        model = AttentionOCR(class_dim, decoder_size, word_vector_dim, max_char_length, label_dict)
        prediction = model.net(img, label_in)

        label_out = fluid.layers.cast(x=label_out, dtype='int64')
        cost = fluid.layers.cross_entropy(input=prediction, label=label_out)
        loss = fluid.layers.reduce_sum(cost)

        optimizer = optimizer_sgd_setting()
        optimizer.minimize(loss)

        _, decoded_out = fluid.layers.topk(input=prediction, k=1)
        casted_label = fluid.layers.cast(x=label_out, dtype='int64')
        sos = train_parameters['sos']
        eos = train_parameters['eos']
        distances, seq_num = fluid.layers.edit_distance(decoded_out, label_out, ignored_tokens=[sos, eos])

        return loss, distances, seq_num, decoded_out

def load_pretrained_params(exe, program):
    if train_parameters['continue_train'] and os.path.exists(train_parameters['save_model_dir']):
        logger.info('load param from retrain model')
        fluid.io.load_persistables(executor=exe,
                                   dirname=train_parameters['save_model_dir'],
                                   main_program=program)
    elif train_parameters['pretrained'] and os.path.exists(train_parameters['pretrained_model_dir']):
        logger.info('load param from pretrained model')

        def if_exist(var):
            return os.path.exists(os.path.join(train_parameters['pretrained_model_dir'], var.name))

        fluid.io.load_vars(exe, train_parameters['pretrained_model_dir'], main_program=program,
                           predicate=if_exist)

def train():
    init_log_config()
    init_train_parameters()
    logger.info("start train attention-ocr, train params:%s", str(train_parameters))

    logger.info("create place, use gpu:" + str(train_parameters['use_gpu']))
    place = fluid.CUDAPlace(0) if train_parameters['use_gpu'] else fluid.CPUPlace()

    logger.info("build network and program")
    train_program = fluid.Program()
    train_start_program = fluid.Program()
    eval_program = fluid.Program()
    eval_start_program = fluid.Program()
    train_reader, loss, distances, seq_num, decoded_out = build_train_program_with_async_reader(train_program,
                                                                                                train_start_program)
    # eval_reader, eval_loss, eval_distances, eval_seq_num, eval_decoded_out = build_eval_program_with_feeder(eval_program, eval_start_program, place)
    # eval_program = eval_program.clone(for_test=True)

    logger.info("build executor and init params")
    exe = fluid.Executor(place)
    exe.run(train_start_program)
    train_fetch_list = [loss.name, distances.name, seq_num.name, decoded_out.name]
    # eval_fetch_list = [output.name]
    load_pretrained_params(exe, train_program)

    stop_strategy = train_parameters['early_stop']
    successive_limit = stop_strategy['successive_limit']
    sample_freq = stop_strategy['sample_frequency']
    min_accuracy = stop_strategy['min_accuracy']
    current_best_accuracy = 0.0
    stop_train = False
    successive_count = 0
    total_batch_count = 0
    distance_evaluator = fluid.metrics.EditDistance("edit-distance")
    for pass_id in range(train_parameters["num_epochs"]):
        logger.info("current pass: %d, start read image", pass_id)
        batch_id = 0
        train_reader.start()
        distance_evaluator.reset()
        try:
            while True:
                t1 = time.time()
                loss, distances, seq_num, decoded_out = exe.run(train_program, fetch_list=train_fetch_list,
                                                                return_numpy=False)
                distances = np.array(distances)
                seq_num = np.array(seq_num)
                distance_evaluator.update(distances, seq_num)
                period = time.time() - t1
                loss = np.mean(np.array(loss))
                batch_id += 1
                total_batch_count += 1

                if batch_id % 10 == 0:
                    distance, instance_error = distance_evaluator.eval()
                    # logger.info(np.array(decoded_out))
                    logger.info("Pass {0}, trainbatch {1}, loss {2} distance {3} instance error {4} time {5}"
                                .format(pass_id, batch_id, loss, distance, instance_error, "%2.2f sec" % period))

        except fluid.core.EOFException:
            train_reader.reset()

        distance, instance_error = distance_evaluator.eval()
        logger.info("Pass {0} distance {1} instance error {2}".format(pass_id, distance, instance_error))

        if 1.0 - instance_error >= current_best_accuracy:
            logger.info("temp save pass {0} train result, current bset accuracy {1}".format(pass_id, 1.0 - instance_error))
            current_best_accuracy = 1.0 - instance_error
            fluid.io.save_persistables(dirname=train_parameters['save_model_dir'], main_program=train_program, executor=exe)

    logger.info("training till last epcho, end training")

if __name__ == '__main__':
    train()

`

把模型保存成推理形式的代码: `

# -*- coding: UTF-8 -*-
"""
固化基于attention-ocr的网络,文字行识别
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import numpy as np
import random
import time
import codecs
import sys
import six
import functools
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.param_attr import ParamAttr
from PIL import Image, ImageEnhance

class_dim = 63
decoder_size = 128
word_vector_dim = 128

target_size = [1, 48, 512]
mean_rgb = 127.5
use_gpu = True
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
sos = 0
eos = 1
max_char_length = 40
save_freeze_dir = "./attention-ocr-model"

class AttentionOCR(object):

    def __init__(self, num_classes, decoder_size, word_vector_dim, max_char_length ,label_dict):
        self.outputs = None
        self.decoder_size = decoder_size
        self.word_vector_dim = word_vector_dim
        self.label_dict = label_dict
        self.max_char_length = max_char_length
        self.num_classes = num_classes

    def name(self):
        return 'attention-ocr'

    def conv_bn_pool(self, input, group, out_ch, act="relu", is_test=False, pooling=True, use_cudnn=False):
        tmp = input
        for i in six.moves.xrange(group):
            filter_size = 3
            conv_std = (2.0 / (filter_size**2 * tmp.shape[1]))**0.5
            conv_param = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, conv_std))
            tmp = fluid.layers.conv2d(
                input=tmp,
                num_filters=out_ch[i],
                filter_size=3,
                padding=1,
                bias_attr=False,
                param_attr=conv_param,
                act=None,  # LinearActivation
                use_cudnn=use_cudnn)
            tmp = fluid.layers.batch_norm(input=tmp, act=act, is_test=is_test)
        if pooling:
            tmp = fluid.layers.pool2d(
                input=tmp,
                pool_size=2,
                pool_type='max',
                pool_stride=2,
                use_cudnn=use_cudnn,
                ceil_mode=True)

        return tmp

    def ocr_convs(self, input, is_test=False, use_cudnn=True):
        tmp = input
        tmp = self.conv_bn_pool(tmp, 2, [16, 16], is_test=is_test, use_cudnn=use_cudnn)
        tmp = self.conv_bn_pool(tmp, 2, [32, 32], is_test=is_test, use_cudnn=use_cudnn)
        tmp = self.conv_bn_pool(tmp, 2, [64, 64], is_test=is_test, use_cudnn=use_cudnn)
        tmp = self.conv_bn_pool(tmp, 2, [128, 128], is_test=is_test, pooling=False, use_cudnn=use_cudnn)

        return tmp

    def encoder_net(self, images, rnn_hidden_size=200, is_test=False, use_cudnn=True):
        conv_features = self.ocr_convs(images, is_test=is_test, use_cudnn=use_cudnn)

        sliced_feature = fluid.layers.im2sequence(
            input=conv_features,
            stride=[1, 1],
            filter_size=[conv_features.shape[2], 1])

        para_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, 0.02))
        bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0)

        fc_1 = fluid.layers.fc(input=sliced_feature,
                               size=rnn_hidden_size * 3,
                               param_attr=para_attr,
                               bias_attr=False)
        fc_2 = fluid.layers.fc(input=sliced_feature,
                               size=rnn_hidden_size * 3,
                               param_attr=para_attr,
                               bias_attr=False)

        gru_forward = fluid.layers.dynamic_gru(
            input=fc_1,
            size=rnn_hidden_size,
            param_attr=para_attr,
            bias_attr=bias_attr,
            candidate_activation='relu')
        gru_backward = fluid.layers.dynamic_gru(
            input=fc_2,
            size=rnn_hidden_size,
            is_reverse=True,
            param_attr=para_attr,
            bias_attr=bias_attr,
            candidate_activation='relu')

        encoded_vector = fluid.layers.concat(
            input=[gru_forward, gru_backward], axis=1)
        encoded_proj = fluid.layers.fc(input=encoded_vector,
                                       size=self.decoder_size,
                                       bias_attr=False)

        return gru_backward, encoded_vector, encoded_proj

    def gru_decoder_with_attention(self, target_embedding, encoder_vec, encoder_proj, decoder_boot):
        def simple_attention(encoder_vec, encoder_proj, decoder_state):
            decoder_state_proj = fluid.layers.fc(input=decoder_state,
                                                 size=self.decoder_size,
                                                 bias_attr=False)
            decoder_state_expand = fluid.layers.sequence_expand(
                x=decoder_state_proj, y=encoder_proj)
            concated = encoder_proj + decoder_state_expand
            concated = fluid.layers.tanh(x=concated)
            attention_weights = fluid.layers.fc(input=concated, size=1, act=None, bias_attr=False)
            attention_weights = fluid.layers.sequence_softmax( input=attention_weights)
            weigths_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1])
            scaled = fluid.layers.elementwise_mul( x=encoder_vec, y=weigths_reshape, axis=0)
            context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
            return context

        rnn = fluid.layers.DynamicRNN()

        with rnn.block():
            current_word = rnn.step_input(target_embedding)
            encoder_vec = rnn.static_input(encoder_vec)
            encoder_proj = rnn.static_input(encoder_proj)
            hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
            context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
            fc_1 = fluid.layers.fc(input=context, size=self.decoder_size * 3, bias_attr=False)
            fc_2 = fluid.layers.fc(input=current_word, size=self.decoder_size * 3, bias_attr=False)
            decoder_inputs = fc_1 + fc_2
            h, _, _ = fluid.layers.gru_unit(input=decoder_inputs, hidden=hidden_mem, size=self.decoder_size * 3)
            rnn.update_memory(hidden_mem, h)
            out = fluid.layers.fc(input=h, size=self.num_classes + 2, bias_attr=True, act='softmax')
            rnn.output(out)
        return rnn()

    def net(self, images, label_in):

        gru_backward, encoded_vector, encoded_proj = self.encoder_net(images)

        backward_first = fluid.layers.sequence_pool(input=gru_backward, pool_type='first')
        decoder_boot = fluid.layers.fc(input=backward_first, size=self.decoder_size, 
                                        bias_attr=False, act="relu")

        label_in = fluid.layers.cast(x=label_in, dtype='int64')
        trg_embedding = fluid.layers.embedding(
            input=label_in,
            size=[self.num_classes + 2, self.word_vector_dim],
            dtype='float32')
        prediction = self.gru_decoder_with_attention(trg_embedding, encoded_vector,
                                                encoded_proj, decoder_boot)
        return prediction

    def infer(self, images, use_cudnn=True):
        beam_size = 1
        gru_backward, encoded_vector, encoded_proj = self.encoder_net(images, is_test=True, use_cudnn=use_cudnn)

        backward_first = fluid.layers.sequence_pool(input=gru_backward, pool_type='first')
        decoder_boot = fluid.layers.fc(input=backward_first, size=self.decoder_size, 
                                        bias_attr=False, act="relu")
        init_state = decoder_boot
        array_len = fluid.layers.fill_constant(shape=[1], dtype='int64', value=self.max_char_length)
        counter = fluid.layers.zeros(shape=[1], dtype='int64', force_cpu=True)

        # fill the first element with init_state
        state_array = fluid.layers.create_array('float32')
        fluid.layers.array_write(init_state, array=state_array, i=counter)

        # ids, scores as memory
        ids_array = fluid.layers.create_array('int64')
        scores_array = fluid.layers.create_array('float32')

        init_ids = fluid.layers.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
        init_scores = fluid.layers.data(name="init_scores", shape=[1], dtype="float32", lod_level=2)

        fluid.layers.array_write(init_ids, array=ids_array, i=counter)
        fluid.layers.array_write(init_scores, array=scores_array, i=counter)

        cond = fluid.layers.less_than(x=counter, y=array_len)
        while_op = fluid.layers.While(cond=cond)
        with while_op.block():
            pre_ids = fluid.layers.array_read(array=ids_array, i=counter)
            pre_state = fluid.layers.array_read(array=state_array, i=counter)
            pre_score = fluid.layers.array_read(array=scores_array, i=counter)

            pre_ids_emb = fluid.layers.embedding(
                input=pre_ids,
                size=[self.num_classes + 2, self.word_vector_dim],
                dtype='float32')

            context = self._simple_attention(encoded_vector, encoded_proj, pre_state, self.decoder_size)

            # expand the recursive_sequence_lengths of pre_state to be the same with pre_score
            pre_state_expanded = fluid.layers.sequence_expand(pre_state, pre_score)
            context_expanded = fluid.layers.sequence_expand(context, pre_score)
            fc_1 = fluid.layers.fc(input=context_expanded, size=self.decoder_size * 3, bias_attr=False)
            fc_2 = fluid.layers.fc(input=pre_ids_emb, size=self.decoder_size * 3, bias_attr=False)

            decoder_inputs = fc_1 + fc_2
            current_state, _, _ = fluid.layers.gru_unit(
                input=decoder_inputs,
                hidden=pre_state_expanded,
                size=self.decoder_size * 3)

            current_state_with_lod = fluid.layers.lod_reset(x=current_state, y=pre_score)
            # use score to do beam search
            current_score = fluid.layers.fc(input=current_state_with_lod,
                                            size=self.num_classes + 2,
                                            bias_attr=True,
                                            act='softmax')
            topk_scores, topk_indices = fluid.layers.topk(current_score, k=beam_size)

            # calculate accumulated scores after topk to reduce computation cost
            accu_scores = fluid.layers.elementwise_add(
                x=fluid.layers.log(topk_scores),
                y=fluid.layers.reshape(pre_score, shape=[-1]),
                axis=0)
            selected_ids, selected_scores = fluid.layers.beam_search(
                pre_ids,
                pre_score,
                topk_indices,
                accu_scores,
                beam_size,
                1,  # end_id
                #level=0
            )

            fluid.layers.increment(x=counter, value=1, in_place=True)

            # update the memories
            fluid.layers.array_write(current_state, array=state_array, i=counter)
            fluid.layers.array_write(selected_ids, array=ids_array, i=counter)
            fluid.layers.array_write(selected_scores, array=scores_array, i=counter)

            # update the break condition: up to the max length or all candidates of
            # source sentences have ended.
            length_cond = fluid.layers.less_than(x=counter, y=array_len)
            finish_cond = fluid.layers.logical_not(fluid.layers.is_empty(x=selected_ids))
            fluid.layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        ids, scores = fluid.layers.beam_search_decode(ids_array, scores_array,
                                                      beam_size, eos)
        return ids

    def _simple_attention(self, encoder_vec, encoder_proj, decoder_state, decoder_size):
        decoder_state_proj = fluid.layers.fc(input=decoder_state, size=decoder_size, bias_attr=False)
        decoder_state_expand = fluid.layers.sequence_expand(x=decoder_state_proj, y=encoder_proj)
        concated = fluid.layers.elementwise_add(encoder_proj, decoder_state_expand)
        concated = fluid.layers.tanh(x=concated)
        attention_weights = fluid.layers.fc(input=concated, size=1, act=None, bias_attr=False)
        attention_weights = fluid.layers.sequence_softmax(input=attention_weights)
        weigths_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1])
        scaled = fluid.layers.elementwise_mul(x=encoder_vec, y=weigths_reshape, axis=0)
        context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
        return context

def freeze_model():

    exe = fluid.Executor(fluid.CPUPlace())
    image = fluid.layers.data(name='image', shape=target_size, dtype='float32')
    label_in = fluid.layers.data(name='label_in', shape=[1], dtype='int32', lod_level=1)
    model = AttentionOCR(class_dim, decoder_size, word_vector_dim, max_char_length, {})
    pred = model.net(image, label_in)
    out = model.infer(image)

    freeze_program = fluid.default_main_program()
    exe.run(fluid.Program())
    fluid.io.load_persistables(exe, save_freeze_dir, freeze_program)
    freeze_program = freeze_program.clone(for_test=True)
    fluid.io.save_inference_model("./freeze_model", ['image'], out, exe, freeze_program)

if __name__ == '__main__':
    freeze_model()

`

NHZlX commented 5 years ago
pred = model.net(image, label_in)
out = model.infer(image)

在一个空间中进行了两次网络的构造。因此会出现问题。