Closed: xiangyubo closed this issue 5 years ago
Title: describe your issue concisely and precisely, for example "ssd model with a preceding lstm reports an error"
I slightly modified the attention model and trained it on a small dataset I prepared myself, a little over 10k samples. During training, the loss, the edit distance, and the estimated accuracy all look normal. While training, I use save_persistables to save the parameters of the model with the best estimated accuracy so far. After training finishes, I want to convert the saved parameters into inference format with save_inference_model. So my plan is to first load them back with load_persistables and then save them with save_inference_model. If I do not call the infer method I rewrote, the whole process runs without errors. If I do call my rewritten infer, reloading the parameters fails with an error saying the parameter for conv_8 cannot be found. But the convolutional part of the model only has conv 0 through 7, so I am quite confused.
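In outline, the round trip I have in mind looks like this (schematic only, not the full script; exe, program, save_model_dir, inference_dir and prediction stand for the executor, the program that defines the trained network, the two output directories and the network output):

```python
# During training: persist the currently best parameters.
fluid.io.save_persistables(exe, save_model_dir, main_program=program)

# After training: reload the persistables into a program that defines
# the same network, then export it in inference format.
fluid.io.load_persistables(exe, save_model_dir, main_program=program)
fluid.io.save_inference_model(inference_dir,
                              feeded_var_names=['img'],
                              target_vars=[prediction],
                              executor=exe,
                              main_program=program)
```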
Here is my code. The training part:

```python
# -*- coding: UTF-8 -*- """ 训练基于attention-ocr的网络,文字行识别 """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import uuid import numpy as np import time import six import math import random import paddle import paddle.fluid as fluid import logging import xml.etree.ElementTree import codecs import json from paddle.fluid.initializer import MSRA from paddle.fluid.param_attr import ParamAttr from paddle.fluid.regularizer import L2Decay from PIL import Image, ImageEnhance, ImageDraw logger = None train_parameters = { "input_size": [1, 48, 512], "data_dir": "data/data6927/word-recognition", "train_dir": "trainImageSet", "eval_dir": "evalImageSet", "train_list": "train.txt", "eval_list": "eval.txt", "label_list": "label_list.txt", "class_dim": -1, "label_dict": {}, "image_count": -1, "continue_train": True, "pretrained": False, "pretrained_model_dir": "./pretrained-model", "save_model_dir": "./attention-ocr-model", "num_epochs": 250, "train_batch_size": 256, "use_gpu": True, "decoder_size": 128, "word_vector_dim": 128, "max_char_length": 40, # 最大识别字符串长度 "gradient_clip": 10, "sos": 0, "eos": 1, "mean_color": 127.5, "mode": "train", "multi_data_reader_count": 4, "apply_distort": True, "image_distort_strategy": { "expand_prob": 0.5, "expand_max_ratio": 2, "hue_prob": 0.5, "hue_delta": 18, "contrast_prob": 0.5, "contrast_delta": 0.5, "saturation_prob": 0.5, "saturation_delta": 0.5, "brightness_prob": 0.5, "brightness_delta": 0.125 }, "sgd_strategy": { "learning_rate": 0.001, "lr_epochs": [70, 140, 200], "lr_decay": [1, 0.5, 0.1, 0.05] }, "early_stop": { "sample_frequency": 50, "successive_limit": 3, "min_accuracy": 0.95 } } class AttentionOCR(object): def __init__(self, num_classes, decoder_size, word_vector_dim, max_char_length ,label_dict): self.outputs = None self.decoder_size = decoder_size self.word_vector_dim = word_vector_dim self.label_dict = label_dict self.max_char_length = max_char_length self.num_classes = num_classes def name(self): return 'attention-ocr' def conv_bn_pool(self, input, group, out_ch, act="relu", is_test=False, pooling=True, use_cudnn=False): tmp = input for i in six.moves.xrange(group): filter_size = 3 conv_std = (2.0 / (filter_size**2 * tmp.shape[1]))**0.5 conv_param = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, conv_std)) tmp = fluid.layers.conv2d( input=tmp, num_filters=out_ch[i], filter_size=3, padding=1, bias_attr=False, param_attr=conv_param, act=None, # LinearActivation use_cudnn=use_cudnn) tmp = fluid.layers.batch_norm(input=tmp, act=act, is_test=is_test) if pooling: tmp = fluid.layers.pool2d( input=tmp, pool_size=2, pool_type='max', pool_stride=2, use_cudnn=use_cudnn, ceil_mode=True) return tmp def ocr_convs(self, input, is_test=False, use_cudnn=True): tmp = input tmp = self.conv_bn_pool(tmp, 2, [16, 16], is_test=is_test, use_cudnn=use_cudnn) tmp = self.conv_bn_pool(tmp, 2, [32, 32], is_test=is_test, use_cudnn=use_cudnn) tmp = self.conv_bn_pool(tmp, 2, [64, 64], is_test=is_test, use_cudnn=use_cudnn) tmp = self.conv_bn_pool(tmp, 2, [128, 128], is_test=is_test, pooling=False, use_cudnn=use_cudnn) return tmp def encoder_net(self, images, rnn_hidden_size=200, is_test=False, use_cudnn=True): conv_features = self.ocr_convs(images, is_test=is_test, use_cudnn=use_cudnn) sliced_feature = fluid.layers.im2sequence( input=conv_features, stride=[1, 1], filter_size=[conv_features.shape[2], 1]) para_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, 0.02)) bias_attr = 
fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0) fc_1 = fluid.layers.fc(input=sliced_feature, size=rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False) fc_2 = fluid.layers.fc(input=sliced_feature, size=rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False) gru_forward = fluid.layers.dynamic_gru( input=fc_1, size=rnn_hidden_size, param_attr=para_attr, bias_attr=bias_attr, candidate_activation='relu') gru_backward = fluid.layers.dynamic_gru( input=fc_2, size=rnn_hidden_size, is_reverse=True, param_attr=para_attr, bias_attr=bias_attr, candidate_activation='relu') encoded_vector = fluid.layers.concat( input=[gru_forward, gru_backward], axis=1) encoded_proj = fluid.layers.fc(input=encoded_vector, size=self.decoder_size, bias_attr=False) return gru_backward, encoded_vector, encoded_proj def gru_decoder_with_attention(self, target_embedding, encoder_vec, encoder_proj, decoder_boot): def simple_attention(encoder_vec, encoder_proj, decoder_state): decoder_state_proj = fluid.layers.fc(input=decoder_state, size=self.decoder_size, bias_attr=False) decoder_state_expand = fluid.layers.sequence_expand( x=decoder_state_proj, y=encoder_proj) concated = encoder_proj + decoder_state_expand concated = fluid.layers.tanh(x=concated) attention_weights = fluid.layers.fc(input=concated, size=1, act=None, bias_attr=False) attention_weights = fluid.layers.sequence_softmax( input=attention_weights) weigths_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1]) scaled = fluid.layers.elementwise_mul( x=encoder_vec, y=weigths_reshape, axis=0) context = fluid.layers.sequence_pool(input=scaled, pool_type='sum') return context rnn = fluid.layers.DynamicRNN() with rnn.block(): current_word = rnn.step_input(target_embedding) encoder_vec = rnn.static_input(encoder_vec) encoder_proj = rnn.static_input(encoder_proj) hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True) context = simple_attention(encoder_vec, encoder_proj, hidden_mem) fc_1 = fluid.layers.fc(input=context, size=self.decoder_size * 3, bias_attr=False) fc_2 = fluid.layers.fc(input=current_word, size=self.decoder_size * 3, bias_attr=False) decoder_inputs = fc_1 + fc_2 h, _, _ = fluid.layers.gru_unit(input=decoder_inputs, hidden=hidden_mem, size=self.decoder_size * 3) rnn.update_memory(hidden_mem, h) out = fluid.layers.fc(input=h, size=self.num_classes + 2, bias_attr=True, act='softmax') rnn.output(out) return rnn() def net(self, images, label_in): gru_backward, encoded_vector, encoded_proj = self.encoder_net(images) backward_first = fluid.layers.sequence_pool(input=gru_backward, pool_type='first') decoder_boot = fluid.layers.fc(input=backward_first, size=self.decoder_size, bias_attr=False, act="relu") label_in = fluid.layers.cast(x=label_in, dtype='int64') trg_embedding = fluid.layers.embedding( input=label_in, size=[self.num_classes + 2, self.word_vector_dim], dtype='float32') prediction = self.gru_decoder_with_attention(trg_embedding, encoded_vector, encoded_proj, decoder_boot) return prediction def infer(self, images, use_cudnn=True): beam_size = 1 gru_backward, encoded_vector, encoded_proj = self.encoder_net(images, is_test=True, use_cudnn=use_cudnn) backward_first = fluid.layers.sequence_pool(input=gru_backward, pool_type='first') decoder_boot = fluid.layers.fc(input=backward_first, size=self.decoder_size, bias_attr=False, act="relu") init_state = decoder_boot array_len = fluid.layers.fill_constant(shape=[1], dtype='int64', value=self.max_char_length) counter = fluid.layers.zeros(shape=[1], 
dtype='int64', force_cpu=True) # fill the first element with init_state state_array = fluid.layers.create_array('float32') fluid.layers.array_write(init_state, array=state_array, i=counter) # ids, scores as memory ids_array = fluid.layers.create_array('int64') scores_array = fluid.layers.create_array('float32') init_ids = fluid.layers.data(name="init_ids", shape=[1], dtype="int64", lod_level=2) init_scores = fluid.layers.data(name="init_scores", shape=[1], dtype="float32", lod_level=2) fluid.layers.array_write(init_ids, array=ids_array, i=counter) fluid.layers.array_write(init_scores, array=scores_array, i=counter) cond = fluid.layers.less_than(x=counter, y=array_len) while_op = fluid.layers.While(cond=cond) with while_op.block(): pre_ids = fluid.layers.array_read(array=ids_array, i=counter) pre_state = fluid.layers.array_read(array=state_array, i=counter) pre_score = fluid.layers.array_read(array=scores_array, i=counter) pre_ids_emb = fluid.layers.embedding( input=pre_ids, size=[self.num_classes + 2, self.word_vector_dim], dtype='float32') context = self._simple_attention(encoded_vector, encoded_proj, pre_state, self.decoder_size) # expand the recursive_sequence_lengths of pre_state to be the same with pre_score pre_state_expanded = fluid.layers.sequence_expand(pre_state, pre_score) context_expanded = fluid.layers.sequence_expand(context, pre_score) fc_1 = fluid.layers.fc(input=context_expanded, size=self.decoder_size * 3, bias_attr=False) fc_2 = fluid.layers.fc(input=pre_ids_emb, size=self.decoder_size * 3, bias_attr=False) decoder_inputs = fc_1 + fc_2 current_state, _, _ = fluid.layers.gru_unit( input=decoder_inputs, hidden=pre_state_expanded, size=self.decoder_size * 3) current_state_with_lod = fluid.layers.lod_reset(x=current_state, y=pre_score) # use score to do beam search current_score = fluid.layers.fc(input=current_state_with_lod, size=self.num_classes + 2, bias_attr=True, act='softmax') topk_scores, topk_indices = fluid.layers.topk(current_score, k=beam_size) # calculate accumulated scores after topk to reduce computation cost accu_scores = fluid.layers.elementwise_add( x=fluid.layers.log(topk_scores), y=fluid.layers.reshape(pre_score, shape=[-1]), axis=0) selected_ids, selected_scores = fluid.layers.beam_search( pre_ids, pre_score, topk_indices, accu_scores, beam_size, 1, # end_id #level=0 ) fluid.layers.increment(x=counter, value=1, in_place=True) # update the memories fluid.layers.array_write(current_state, array=state_array, i=counter) fluid.layers.array_write(selected_ids, array=ids_array, i=counter) fluid.layers.array_write(selected_scores, array=scores_array, i=counter) # update the break condition: up to the max length or all candidates of # source sentences have ended. 
length_cond = fluid.layers.less_than(x=counter, y=array_len) finish_cond = fluid.layers.logical_not(fluid.layers.is_empty(x=selected_ids)) fluid.layers.logical_and(x=length_cond, y=finish_cond, out=cond) ids, scores = fluid.layers.beam_search_decode(ids_array, scores_array, beam_size, eos) return ids def _simple_attention(self, encoder_vec, encoder_proj, decoder_state, decoder_size): decoder_state_proj = fluid.layers.fc(input=decoder_state, size=decoder_size, bias_attr=False) decoder_state_expand = fluid.layers.sequence_expand(x=decoder_state_proj, y=encoder_proj) concated = fluid.layers.elementwise_add(encoder_proj, decoder_state_expand) concated = fluid.layers.tanh(x=concated) attention_weights = fluid.layers.fc(input=concated, size=1, act=None, bias_attr=False) attention_weights = fluid.layers.sequence_softmax(input=attention_weights) weigths_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1]) scaled = fluid.layers.elementwise_mul(x=encoder_vec, y=weigths_reshape, axis=0) context = fluid.layers.sequence_pool(input=scaled, pool_type='sum') return context def init_train_parameters(): """ 初始化训练参数,主要是初始化图片数量,类别数 :return: """ train_list = os.path.join(train_parameters['data_dir'], train_parameters['train_list']) label_list = os.path.join(train_parameters['data_dir'], train_parameters['label_list']) index = 0 with codecs.open(label_list, encoding='utf-8') as flist: lines = [line.strip() for line in flist] for line in lines: parts = line.split() train_parameters['label_dict'][parts[0]] = int(parts[1]) index += 1 train_parameters['class_dim'] = index with codecs.open(train_list, encoding='utf-8') as flist: lines = [line.strip() for line in flist] train_parameters['image_count'] = len(lines) def init_log_config(): """ 初始化日志相关配置 :return: """ global logger logger = logging.getLogger() logger.setLevel(logging.INFO) log_path = os.path.join(os.getcwd(), 'logs') if not os.path.exists(log_path): os.makedirs(log_path) log_name = os.path.join(log_path, 'train.log') sh = logging.StreamHandler() fh = logging.FileHandler(log_name, mode='w') fh.setLevel(logging.DEBUG) formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s") fh.setFormatter(formatter) sh.setFormatter(formatter) logger.addHandler(sh) logger.addHandler(fh) def resize_img(img, input_size): target_size = input_size percent_h = float(target_size[1]) / img.size[1] percent_w = float(target_size[2]) / img.size[0] percent = min(percent_h, percent_w) resized_width = int(round(img.size[0] * percent)) resized_height = int(round(img.size[1] * percent)) w_off = (target_size[2] - resized_width) / 2 h_off = (target_size[1] - resized_height) / 2 img = img.resize((resized_width, resized_height), Image.ANTIALIAS) array = np.ndarray((target_size[1], target_size[2], 3), np.uint8) array[:, :, 0] = 127 array[:, :, 1] = 127 array[:, :, 2] = 127 ret = Image.fromarray(array) ret.paste(img, (np.random.randint(0, w_off + 1), int(h_off))) return ret def random_brightness(img): prob = np.random.uniform(0, 1) if prob < train_parameters['image_distort_strategy']['brightness_prob']: brightness_delta = train_parameters['image_distort_strategy']['brightness_delta'] delta = np.random.uniform(-brightness_delta, brightness_delta) + 1 img = ImageEnhance.Brightness(img).enhance(delta) return img def random_contrast(img): prob = np.random.uniform(0, 1) if prob < train_parameters['image_distort_strategy']['contrast_prob']: contrast_delta = train_parameters['image_distort_strategy']['contrast_delta'] delta = 
np.random.uniform(-contrast_delta, contrast_delta) + 1 img = ImageEnhance.Contrast(img).enhance(delta) return img def random_saturation(img): prob = np.random.uniform(0, 1) if prob < train_parameters['image_distort_strategy']['saturation_prob']: saturation_delta = train_parameters['image_distort_strategy']['saturation_delta'] delta = np.random.uniform(-saturation_delta, saturation_delta) + 1 img = ImageEnhance.Color(img).enhance(delta) return img def random_hue(img): prob = np.random.uniform(0, 1) if prob < train_parameters['image_distort_strategy']['hue_prob']: hue_delta = train_parameters['image_distort_strategy']['hue_delta'] delta = np.random.uniform(-hue_delta, hue_delta) img_hsv = np.array(img.convert('HSV')) img_hsv[:, :, 0] = img_hsv[:, :, 0] + delta img = Image.fromarray(img_hsv, mode='HSV').convert('RGB') return img def distort_image(img): prob = np.random.uniform(0, 1) # Apply different distort order if prob > 0.5: img = random_brightness(img) img = random_contrast(img) img = random_saturation(img) img = random_hue(img) else: img = random_brightness(img) img = random_saturation(img) img = random_hue(img) img = random_contrast(img) return img def rotate_image(img): """ 图像增强,增加随机旋转角度 """ prob = np.random.uniform(0, 1) if prob > 0.5: angle = np.random.randint(-8, 8) img = img.rotate(angle) return img def random_expand(img, keep_ratio=True): if np.random.uniform(0, 1) < train_parameters['image_distort_strategy']['expand_prob']: return img max_ratio = train_parameters['image_distort_strategy']['expand_max_ratio'] w, h = img.size c = 3 ratio_x = random.uniform(1, max_ratio) if keep_ratio: ratio_y = ratio_x else: ratio_y = random.uniform(1, max_ratio) oh = int(h * ratio_y) ow = int(w * ratio_x) off_x = random.randint(0, ow - w) off_y = random.randint(0, oh - h) out_img = np.zeros((oh, ow, c), np.uint8) for i in range(c): out_img[:, :, i] = train_parameters['mean_color'] out_img[off_y: off_y + h, off_x: off_x + w, :] = img return Image.fromarray(out_img) def preprocess(img, input_size): img_width, img_height = img.size if train_parameters['apply_distort']: img = distort_image(img) img = random_expand(img) img = rotate_image(img) # img = resize_img(img, input_size) # img = img.convert('L') # img = np.array(img).astype('float32') - train_parameters['mean_color'] # img *= 0.007843 return img def custom_reader(file_list, data_dir, input_size, mode): def reader(): np.random.shuffle(file_list) for line in file_list: # img_name, label parts = line.split() image_path = parts[0] img = Image.open(image_path) # img = Image.open(os.path.join(data_dir, image_path)) if img.mode != 'RGB': img = img.convert('RGB') label = [int(train_parameters['label_dict'][c]) for c in parts[-1]] if len(label) == 0: continue if mode == 'train': img = preprocess(img, input_size) img = resize_img(img, input_size) img = img.convert('L') # img.save(image_path) img = np.array(img).astype('float32') - train_parameters['mean_color'] # img *= 0.007843 img = img[np.newaxis, ...] 
# print("{0} {1}".format(image_path, label)) sos = train_parameters['sos'] eos = train_parameters['eos'] yield img, [sos] + label, label + [eos] return reader def multi_process_custom_reader(file_path, data_dir, num_workers, input_size, mode): file_path = os.path.join(data_dir, file_path) readers = [] images = [line.strip() for line in open(file_path)] n = int(math.ceil(len(images) // num_workers)) image_lists = [images[i: i + n] for i in range(0, len(images), n)] train_path = os.path.join(train_parameters['data_dir'], train_parameters['train_dir']) for l in image_lists: reader = paddle.batch(custom_reader(l, train_path, input_size, mode), batch_size=train_parameters['train_batch_size']) readers.append(paddle.reader.shuffle(reader, train_parameters['train_batch_size'])) return paddle.reader.multiprocess_reader(readers, False) def create_eval_reader(file_path, data_dir, input_size, mode): file_path = os.path.join(data_dir, file_path) images = [line.strip() for line in open(file_path)] eval_path = os.path.join(train_parameters['data_dir'], train_parameters['eval_dir']) return paddle.batch(custom_reader(images, eval_path, input_size, mode), batch_size=train_parameters['train_batch_size']) def optimizer_sgd_setting(): batch_size = train_parameters["train_batch_size"] iters = train_parameters["image_count"] // batch_size learning_strategy = train_parameters['sgd_strategy'] lr = learning_strategy['learning_rate'] boundaries = [i * iters for i in learning_strategy["lr_epochs"]] values = [i * lr for i in learning_strategy["lr_decay"]] optimizer = fluid.optimizer.SGDOptimizer( learning_rate=fluid.layers.piecewise_decay(boundaries, values), regularization=fluid.regularizer.L2Decay(0.00005)) return optimizer def build_train_program_with_async_reader(main_prog, startup_prog): with fluid.program_guard(main_prog, startup_prog): img = fluid.layers.data(name='img', shape=train_parameters['input_size'], dtype='float32') label_in = fluid.layers.data(name='label_in', shape=[1], dtype='int32', lod_level=1) label_out = fluid.layers.data(name='label_out', shape=[1], dtype='int32', lod_level=1) data_reader = fluid.layers.create_py_reader_by_data(capacity=train_parameters['train_batch_size'], feed_list=[img, label_in, label_out], name='train') multi_reader = multi_process_custom_reader(train_parameters['train_list'], train_parameters['data_dir'], train_parameters['multi_data_reader_count'], train_parameters['input_size'], 'train') data_reader.decorate_paddle_reader(multi_reader) img, label_in, label_out = fluid.layers.read_file(data_reader) loss, distances, seq_num, decoded_out = get_loss(img, label_in, label_out) return data_reader, loss, distances, seq_num, decoded_out def build_eval_program_with_feeder(main_prog, startup_prog, place): with fluid.program_guard(main_prog, startup_prog): img = fluid.layers.data(name='img', shape=train_parameters['input_size'], dtype='float32') label_in = fluid.layers.data(name='label_in', shape=[1], dtype='int32', lod_level=1) label_out = fluid.layers.data(name='label_out', shape=[1], dtype='int32', lod_level=1) feeder = fluid.DataFeeder(feed_list=[img, label_in, label_out], place=place, program=main_prog) reader = create_eval_reader(train_parameters['eval_list'], train_parameters['data_dir'], train_parameters['input_size'], 'eval') loss, distances, seq_num, decoded_out = get_loss(img, label_in, label_out) return reader, loss, distances, seq_num, decoded_out def get_loss(img, label_in, label_out): with fluid.unique_name.guard(): class_dim = train_parameters['class_dim'] 
decoder_size = train_parameters['decoder_size'] word_vector_dim = train_parameters['word_vector_dim'] label_dict = train_parameters['label_dict'] max_char_length = train_parameters['max_char_length'] model = AttentionOCR(class_dim, decoder_size, word_vector_dim, max_char_length, label_dict) prediction = model.net(img, label_in) label_out = fluid.layers.cast(x=label_out, dtype='int64') cost = fluid.layers.cross_entropy(input=prediction, label=label_out) loss = fluid.layers.reduce_sum(cost) optimizer = optimizer_sgd_setting() optimizer.minimize(loss) _, decoded_out = fluid.layers.topk(input=prediction, k=1) casted_label = fluid.layers.cast(x=label_out, dtype='int64') sos = train_parameters['sos'] eos = train_parameters['eos'] distances, seq_num = fluid.layers.edit_distance(decoded_out, label_out, ignored_tokens=[sos, eos]) return loss, distances, seq_num, decoded_out def load_pretrained_params(exe, program): if train_parameters['continue_train'] and os.path.exists(train_parameters['save_model_dir']): logger.info('load param from retrain model') fluid.io.load_persistables(executor=exe, dirname=train_parameters['save_model_dir'], main_program=program) elif train_parameters['pretrained'] and os.path.exists(train_parameters['pretrained_model_dir']): logger.info('load param from pretrained model') def if_exist(var): return os.path.exists(os.path.join(train_parameters['pretrained_model_dir'], var.name)) fluid.io.load_vars(exe, train_parameters['pretrained_model_dir'], main_program=program, predicate=if_exist) def train(): init_log_config() init_train_parameters() logger.info("start train attention-ocr, train params:%s", str(train_parameters)) logger.info("create place, use gpu:" + str(train_parameters['use_gpu'])) place = fluid.CUDAPlace(0) if train_parameters['use_gpu'] else fluid.CPUPlace() logger.info("build network and program") train_program = fluid.Program() train_start_program = fluid.Program() eval_program = fluid.Program() eval_start_program = fluid.Program() train_reader, loss, distances, seq_num, decoded_out = build_train_program_with_async_reader(train_program, train_start_program) # eval_reader, eval_loss, eval_distances, eval_seq_num, eval_decoded_out = build_eval_program_with_feeder(eval_program, eval_start_program, place) # eval_program = eval_program.clone(for_test=True) logger.info("build executor and init params") exe = fluid.Executor(place) exe.run(train_start_program) train_fetch_list = [loss.name, distances.name, seq_num.name, decoded_out.name] # eval_fetch_list = [output.name] load_pretrained_params(exe, train_program) stop_strategy = train_parameters['early_stop'] successive_limit = stop_strategy['successive_limit'] sample_freq = stop_strategy['sample_frequency'] min_accuracy = stop_strategy['min_accuracy'] current_best_accuracy = 0.0 stop_train = False successive_count = 0 total_batch_count = 0 distance_evaluator = fluid.metrics.EditDistance("edit-distance") for pass_id in range(train_parameters["num_epochs"]): logger.info("current pass: %d, start read image", pass_id) batch_id = 0 train_reader.start() distance_evaluator.reset() try: while True: t1 = time.time() loss, distances, seq_num, decoded_out = exe.run(train_program, fetch_list=train_fetch_list, return_numpy=False) distances = np.array(distances) seq_num = np.array(seq_num) distance_evaluator.update(distances, seq_num) period = time.time() - t1 loss = np.mean(np.array(loss)) batch_id += 1 total_batch_count += 1 if batch_id % 10 == 0: distance, instance_error = distance_evaluator.eval() # 
logger.info(np.array(decoded_out)) logger.info("Pass {0}, trainbatch {1}, loss {2} distance {3} instance error {4} time {5}" .format(pass_id, batch_id, loss, distance, instance_error, "%2.2f sec" % period)) except fluid.core.EOFException: train_reader.reset() distance, instance_error = distance_evaluator.eval() logger.info("Pass {0} distance {1} instance error {2}".format(pass_id, distance, instance_error)) if 1.0 - instance_error >= current_best_accuracy: logger.info("temp save pass {0} train result, current bset accuracy {1}".format(pass_id, 1.0 - instance_error)) current_best_accuracy = 1.0 - instance_error fluid.io.save_persistables(dirname=train_parameters['save_model_dir'], main_program=train_program, executor=exe) logger.info("training till last epcho, end training") if __name__ == '__main__': train()
```
The code that converts the saved model into inference format:

```python
# -*- coding: UTF-8 -*- """ 固化基于attention-ocr的网络,文字行识别 """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import numpy as np import random import time import codecs import sys import six import functools import math import paddle import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.param_attr import ParamAttr from PIL import Image, ImageEnhance class_dim = 63 decoder_size = 128 word_vector_dim = 128 target_size = [1, 48, 512] mean_rgb = 127.5 use_gpu = True place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) sos = 0 eos = 1 max_char_length = 40 save_freeze_dir = "./attention-ocr-model" class AttentionOCR(object): def __init__(self, num_classes, decoder_size, word_vector_dim, max_char_length ,label_dict): self.outputs = None self.decoder_size = decoder_size self.word_vector_dim = word_vector_dim self.label_dict = label_dict self.max_char_length = max_char_length self.num_classes = num_classes def name(self): return 'attention-ocr' def conv_bn_pool(self, input, group, out_ch, act="relu", is_test=False, pooling=True, use_cudnn=False): tmp = input for i in six.moves.xrange(group): filter_size = 3 conv_std = (2.0 / (filter_size**2 * tmp.shape[1]))**0.5 conv_param = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, conv_std)) tmp = fluid.layers.conv2d( input=tmp, num_filters=out_ch[i], filter_size=3, padding=1, bias_attr=False, param_attr=conv_param, act=None, # LinearActivation use_cudnn=use_cudnn) tmp = fluid.layers.batch_norm(input=tmp, act=act, is_test=is_test) if pooling: tmp = fluid.layers.pool2d( input=tmp, pool_size=2, pool_type='max', pool_stride=2, use_cudnn=use_cudnn, ceil_mode=True) return tmp def ocr_convs(self, input, is_test=False, use_cudnn=True): tmp = input tmp = self.conv_bn_pool(tmp, 2, [16, 16], is_test=is_test, use_cudnn=use_cudnn) tmp = self.conv_bn_pool(tmp, 2, [32, 32], is_test=is_test, use_cudnn=use_cudnn) tmp = self.conv_bn_pool(tmp, 2, [64, 64], is_test=is_test, use_cudnn=use_cudnn) tmp = self.conv_bn_pool(tmp, 2, [128, 128], is_test=is_test, pooling=False, use_cudnn=use_cudnn) return tmp def encoder_net(self, images, rnn_hidden_size=200, is_test=False, use_cudnn=True): conv_features = self.ocr_convs(images, is_test=is_test, use_cudnn=use_cudnn) sliced_feature = fluid.layers.im2sequence( input=conv_features, stride=[1, 1], filter_size=[conv_features.shape[2], 1]) para_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, 0.02)) bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0) fc_1 = fluid.layers.fc(input=sliced_feature, size=rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False) fc_2 = fluid.layers.fc(input=sliced_feature, size=rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False) gru_forward = fluid.layers.dynamic_gru( input=fc_1, size=rnn_hidden_size, param_attr=para_attr, bias_attr=bias_attr, candidate_activation='relu') gru_backward = fluid.layers.dynamic_gru( input=fc_2, size=rnn_hidden_size, is_reverse=True, param_attr=para_attr, bias_attr=bias_attr, candidate_activation='relu') encoded_vector = fluid.layers.concat( input=[gru_forward, gru_backward], axis=1) encoded_proj = fluid.layers.fc(input=encoded_vector, size=self.decoder_size, bias_attr=False) return gru_backward, encoded_vector, encoded_proj def gru_decoder_with_attention(self, target_embedding, encoder_vec, encoder_proj, decoder_boot): def simple_attention(encoder_vec, encoder_proj, decoder_state): 
decoder_state_proj = fluid.layers.fc(input=decoder_state, size=self.decoder_size, bias_attr=False) decoder_state_expand = fluid.layers.sequence_expand( x=decoder_state_proj, y=encoder_proj) concated = encoder_proj + decoder_state_expand concated = fluid.layers.tanh(x=concated) attention_weights = fluid.layers.fc(input=concated, size=1, act=None, bias_attr=False) attention_weights = fluid.layers.sequence_softmax( input=attention_weights) weigths_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1]) scaled = fluid.layers.elementwise_mul( x=encoder_vec, y=weigths_reshape, axis=0) context = fluid.layers.sequence_pool(input=scaled, pool_type='sum') return context rnn = fluid.layers.DynamicRNN() with rnn.block(): current_word = rnn.step_input(target_embedding) encoder_vec = rnn.static_input(encoder_vec) encoder_proj = rnn.static_input(encoder_proj) hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True) context = simple_attention(encoder_vec, encoder_proj, hidden_mem) fc_1 = fluid.layers.fc(input=context, size=self.decoder_size * 3, bias_attr=False) fc_2 = fluid.layers.fc(input=current_word, size=self.decoder_size * 3, bias_attr=False) decoder_inputs = fc_1 + fc_2 h, _, _ = fluid.layers.gru_unit(input=decoder_inputs, hidden=hidden_mem, size=self.decoder_size * 3) rnn.update_memory(hidden_mem, h) out = fluid.layers.fc(input=h, size=self.num_classes + 2, bias_attr=True, act='softmax') rnn.output(out) return rnn() def net(self, images, label_in): gru_backward, encoded_vector, encoded_proj = self.encoder_net(images) backward_first = fluid.layers.sequence_pool(input=gru_backward, pool_type='first') decoder_boot = fluid.layers.fc(input=backward_first, size=self.decoder_size, bias_attr=False, act="relu") label_in = fluid.layers.cast(x=label_in, dtype='int64') trg_embedding = fluid.layers.embedding( input=label_in, size=[self.num_classes + 2, self.word_vector_dim], dtype='float32') prediction = self.gru_decoder_with_attention(trg_embedding, encoded_vector, encoded_proj, decoder_boot) return prediction def infer(self, images, use_cudnn=True): beam_size = 1 gru_backward, encoded_vector, encoded_proj = self.encoder_net(images, is_test=True, use_cudnn=use_cudnn) backward_first = fluid.layers.sequence_pool(input=gru_backward, pool_type='first') decoder_boot = fluid.layers.fc(input=backward_first, size=self.decoder_size, bias_attr=False, act="relu") init_state = decoder_boot array_len = fluid.layers.fill_constant(shape=[1], dtype='int64', value=self.max_char_length) counter = fluid.layers.zeros(shape=[1], dtype='int64', force_cpu=True) # fill the first element with init_state state_array = fluid.layers.create_array('float32') fluid.layers.array_write(init_state, array=state_array, i=counter) # ids, scores as memory ids_array = fluid.layers.create_array('int64') scores_array = fluid.layers.create_array('float32') init_ids = fluid.layers.data(name="init_ids", shape=[1], dtype="int64", lod_level=2) init_scores = fluid.layers.data(name="init_scores", shape=[1], dtype="float32", lod_level=2) fluid.layers.array_write(init_ids, array=ids_array, i=counter) fluid.layers.array_write(init_scores, array=scores_array, i=counter) cond = fluid.layers.less_than(x=counter, y=array_len) while_op = fluid.layers.While(cond=cond) with while_op.block(): pre_ids = fluid.layers.array_read(array=ids_array, i=counter) pre_state = fluid.layers.array_read(array=state_array, i=counter) pre_score = fluid.layers.array_read(array=scores_array, i=counter) pre_ids_emb = fluid.layers.embedding( input=pre_ids, 
size=[self.num_classes + 2, self.word_vector_dim], dtype='float32') context = self._simple_attention(encoded_vector, encoded_proj, pre_state, self.decoder_size) # expand the recursive_sequence_lengths of pre_state to be the same with pre_score pre_state_expanded = fluid.layers.sequence_expand(pre_state, pre_score) context_expanded = fluid.layers.sequence_expand(context, pre_score) fc_1 = fluid.layers.fc(input=context_expanded, size=self.decoder_size * 3, bias_attr=False) fc_2 = fluid.layers.fc(input=pre_ids_emb, size=self.decoder_size * 3, bias_attr=False) decoder_inputs = fc_1 + fc_2 current_state, _, _ = fluid.layers.gru_unit( input=decoder_inputs, hidden=pre_state_expanded, size=self.decoder_size * 3) current_state_with_lod = fluid.layers.lod_reset(x=current_state, y=pre_score) # use score to do beam search current_score = fluid.layers.fc(input=current_state_with_lod, size=self.num_classes + 2, bias_attr=True, act='softmax') topk_scores, topk_indices = fluid.layers.topk(current_score, k=beam_size) # calculate accumulated scores after topk to reduce computation cost accu_scores = fluid.layers.elementwise_add( x=fluid.layers.log(topk_scores), y=fluid.layers.reshape(pre_score, shape=[-1]), axis=0) selected_ids, selected_scores = fluid.layers.beam_search( pre_ids, pre_score, topk_indices, accu_scores, beam_size, 1, # end_id #level=0 ) fluid.layers.increment(x=counter, value=1, in_place=True) # update the memories fluid.layers.array_write(current_state, array=state_array, i=counter) fluid.layers.array_write(selected_ids, array=ids_array, i=counter) fluid.layers.array_write(selected_scores, array=scores_array, i=counter) # update the break condition: up to the max length or all candidates of # source sentences have ended. length_cond = fluid.layers.less_than(x=counter, y=array_len) finish_cond = fluid.layers.logical_not(fluid.layers.is_empty(x=selected_ids)) fluid.layers.logical_and(x=length_cond, y=finish_cond, out=cond) ids, scores = fluid.layers.beam_search_decode(ids_array, scores_array, beam_size, eos) return ids def _simple_attention(self, encoder_vec, encoder_proj, decoder_state, decoder_size): decoder_state_proj = fluid.layers.fc(input=decoder_state, size=decoder_size, bias_attr=False) decoder_state_expand = fluid.layers.sequence_expand(x=decoder_state_proj, y=encoder_proj) concated = fluid.layers.elementwise_add(encoder_proj, decoder_state_expand) concated = fluid.layers.tanh(x=concated) attention_weights = fluid.layers.fc(input=concated, size=1, act=None, bias_attr=False) attention_weights = fluid.layers.sequence_softmax(input=attention_weights) weigths_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1]) scaled = fluid.layers.elementwise_mul(x=encoder_vec, y=weigths_reshape, axis=0) context = fluid.layers.sequence_pool(input=scaled, pool_type='sum') return context def freeze_model(): exe = fluid.Executor(fluid.CPUPlace()) image = fluid.layers.data(name='image', shape=target_size, dtype='float32') label_in = fluid.layers.data(name='label_in', shape=[1], dtype='int32', lod_level=1) model = AttentionOCR(class_dim, decoder_size, word_vector_dim, max_char_length, {}) pred = model.net(image, label_in) out = model.infer(image) freeze_program = fluid.default_main_program() exe.run(fluid.Program()) fluid.io.load_persistables(exe, save_freeze_dir, freeze_program) freeze_program = freeze_program.clone(for_test=True) fluid.io.save_inference_model("./freeze_model", ['image'], out, exe, freeze_program) if __name__ == '__main__': freeze_model()
```

```python
pred = model.net(image, label_in)
out = model.infer(image)
```
These two calls construct the network twice inside the same program. The second construction (the infer call) therefore gets a fresh set of auto-generated layer names, conv_8 and onwards, and load_persistables then fails because no parameters with those names exist in the saved model directory. That is where the problem comes from.
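A minimal, untested sketch of one way around this, based on the freeze_model above: build only the inference graph, in its own program and inside unique_name.guard(), so the auto-generated names start from conv_0 again and line up with the persistables saved during training. main_prog and startup_prog are names introduced here for illustration:

```python
def freeze_model():
    exe = fluid.Executor(fluid.CPUPlace())
    main_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(main_prog, startup_prog):
        with fluid.unique_name.guard():
            image = fluid.layers.data(name='image', shape=target_size, dtype='float32')
            model = AttentionOCR(class_dim, decoder_size, word_vector_dim,
                                 max_char_length, {})
            # Build the graph exactly once: no model.net() call here,
            # only the decoding graph that is going to be exported.
            ids = model.infer(image)

    exe.run(startup_prog)
    fluid.io.load_persistables(exe, save_freeze_dir, main_program=main_prog)

    freeze_program = main_prog.clone(for_test=True)
    # infer() also declares init_ids / init_scores with fluid.layers.data,
    # so they have to appear in the feed list as well.
    fluid.io.save_inference_model("./freeze_model",
                                  ['image', 'init_ids', 'init_scores'],
                                  [ids], exe, freeze_program)
```

Since fluid derives parameter names from the order in which layers are created, this only lines up if infer() creates its parameterized layers in the same order as net() did during training; giving the shared layers explicit ParamAttr(name=...) attributes in both graphs is the more robust way to keep the names stable.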