PaddlePaddle / PaddleHub

Awesome pre-trained models toolkit based on PaddlePaddle. (400+ models including Image, Text, Audio, Video and Cross-Modal with Easy Inference & Serving)【安全加固,暂停交互,请耐心等待】
https://www.paddlepaddle.org.cn/hub
Apache License 2.0
12.72k stars 2.08k forks source link

模型部署中模型预加载错误 #694

Closed Malestudents closed 4 years ago

Malestudents commented 4 years ago
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os

import numpy as np
from paddlehub.common.logger import logger
from paddlehub.module.module import moduleinfo, serving
import paddlehub as hub

@moduleinfo(
    name="ERNIEFinetuned",
    version="1.0.0",
    summary="ERNIE tiny which was fine-tuned on the chnsenticorp dataset.",
    author="anonymous",
    author_email="",
    type="nlp/semantic_model")
class ERNIEFinetuned(hub.Module):
    """Serving wrapper around a fine-tuned ERNIE SequenceLabelTask."""

    def _initialize(self,
                    ckpt_dir="ckpt_ner",
                    num_class=3,
                    max_seq_len=128,
                    use_gpu=False,
                    batch_size=16):
        """Rebuild the prediction task and point it at the checkpoint.

        Args:
            ckpt_dir: checkpoint directory (relative to the module
                directory) produced by fine-tuning; must contain
                best_model/ with the saved parameters.
            num_class: number of sequence labels.
            max_seq_len: maximum tokenized sequence length.
            use_gpu: whether to predict on GPU.
            batch_size: prediction batch size.
        """
        self.ckpt_dir = os.path.join(self.directory, ckpt_dir)
        self.num_class = num_class
        self.MAX_SEQ_LEN = max_seq_len

        self.params_path = os.path.join(self.ckpt_dir, 'best_model')

        # Load PaddleHub ERNIE pretrained model.
        self.module = hub.Module(name='ernie')
        # BUG FIX: pass the configured max_seq_len instead of a
        # hard-coded 128 (same for batch_size/num_class/use_gpu below) —
        # the parameters were previously accepted but silently ignored.
        inputs, outputs, program = self.module.context(
            max_seq_len=max_seq_len)

        # For ernie_tiny, sub-word tokenization also needs sp_model_path
        # and word_dict_path; for plain ernie they stay unset (None).
        reader = hub.reader.SequenceLabelReader(
            vocab_path=self.module.get_vocab_path(),
            max_seq_len=max_seq_len)

        # "sequence_output" gives token-level features, which sequence
        # labeling needs ("pooled_output" is for sentence-level tasks).
        sequence_output = outputs["sequence_output"]

        # Feed list must cover every input tensor the module expects.
        feed_list = [
            inputs["input_ids"].name,
            inputs["position_ids"].name,
            inputs["segment_ids"].name,
            inputs["input_mask"].name,
        ]
        strategy = hub.AdamWeightDecayStrategy(
            weight_decay=0.01,
            warmup_proportion=0.1,
            learning_rate=5e-5)
        # Running config for the PaddleHub Finetune API; checkpoint_dir
        # is where the fine-tuned parameters are restored from.
        config = hub.RunConfig(
            use_cuda=use_gpu,
            num_epoch=1,
            checkpoint_dir=self.ckpt_dir,
            batch_size=batch_size,
            eval_interval=50,
            strategy=strategy)

        # The task definition must match the one used at fine-tune time,
        # otherwise saved parameters cannot be restored.
        self.cls_task = hub.SequenceLabelTask(
            data_reader=reader,
            feature=sequence_output,
            feed_list=feed_list,
            add_crf=True,
            max_seq_len=max_seq_len,
            num_classes=num_class,
            config=config)

    def predict(self, data, return_result=False, accelerate_mode=True):
        """Run prediction and return the task's run states.

        Args:
            data: list of [text] samples to label.
            return_result: forwarded to SequenceLabelTask.predict.
            accelerate_mode: forwarded to SequenceLabelTask.predict.
        """
        run_states = self.cls_task.predict(
            data=data,
            return_result=return_result,
            accelerate_mode=accelerate_mode)
        print(run_states)
        # BUG FIX: the original returned the undefined name `result1`,
        # which raised NameError at runtime.
        return run_states

if __name__ == "__main__":
    # Smoke test: load the fine-tuned module and label one sentence.
    model = ERNIEFinetuned(ckpt_dir="./ckpt_ner", num_class=3)
    # Sample text to be predicted; joining with "\002" separates the
    # characters so the reader tokenizes each one individually.
    text = '梅小二主治医师查房记录'
    data = [["\002".join(text)]]
    print(data)
    print(model.predict(data=data))

错误示例: 1592271505(1)

Steffy-zxf commented 4 years ago

你好!以上错误说明,finetune之后保存的模型参数没有被正确加载。请确认ckpt/best_model文件夹下是否存在cls_out_b文件。另外请注意,在finetune时和上述转module过程中创建SequenceLabelTask的代码保持一致。

Malestudents commented 4 years ago

finetune时和上述转module过程中创建SequenceLabelTask的代码是一致的, ckpt/best_model文件夹是finetune之后生成的文件夹,整体代码在finetune的时候运行正常 1592272758(1)

Steffy-zxf commented 4 years ago

Paddle 和 PaddleHub版本各是多少?另外你的finetune代码是怎么写的?

Malestudents commented 4 years ago

paddle1.7.1 paddlehub1.6.1 finetune代码:

from collections import defaultdict
import numpy as np
import re
import paddlehub as hub
import dataset_train
import datetime
import logging
import os
# Suppress everything below ERROR on the root logger.
root_logger = logging.getLogger()
root_logger.setLevel(logging.ERROR)
# Run Paddle fine-tuning with 3 CPU threads.
os.environ['CPU_NUM'] = str(3)
def call_module(context=None):
    """Fine-tune ERNIE on the express NER dataset and evaluate it.

    ``context`` is unused; it is kept for interface compatibility.
    """
    ernie = hub.Module(name="ernie")
    inputs, outputs, program = ernie.context(max_seq_len=128)

    # Token-level features feed the sequence-labeling head.
    token_features = outputs["sequence_output"]
    # Every input tensor the module expects must be fed.
    feed_list = [
        inputs[key].name
        for key in ("input_ids", "position_ids", "segment_ids", "input_mask")
    ]

    ner_dataset = dataset_train.Express_NER()
    data_reader = hub.reader.SequenceLabelReader(
        dataset=ner_dataset,
        vocab_path=ernie.get_vocab_path(),
        max_seq_len=128)

    # AdamW with linear warmup, as recommended for ERNIE fine-tuning.
    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=0.01,
        warmup_proportion=0.1,
        learning_rate=5e-5)
    run_config = hub.RunConfig(
        use_cuda=False,
        num_epoch=1,
        checkpoint_dir="ckpt_ner",
        batch_size=16,
        eval_interval=50,
        strategy=strategy)

    task = hub.SequenceLabelTask(
        data_reader=data_reader,
        feature=token_features,
        feed_list=feed_list,
        add_crf=True,
        max_seq_len=128,
        num_classes=ner_dataset.num_labels,
        config=run_config)
    print(ner_dataset.num_labels)
    task.finetune_and_eval()

if __name__ == '__main__':
    # Time the full fine-tuning run.
    started = datetime.datetime.now()
    call_module()
    elapsed = datetime.datetime.now() - started
    print("使用时间:", elapsed)
Malestudents commented 4 years ago

我更新过paddlehub版本,现在finetune报错,应该是版本有问题,这个paddle和paddlehub有版本对照表吗

Steffy-zxf commented 4 years ago

@Malestudents 可以将Paddle 和 PaddleHub 都升级到最新版本,Paddle v1.8.0以上,PaddleHub v1.7.1最新版本。升级之后重新finetune训练保存新的ckpt。

Malestudents commented 4 years ago

好的,我都重新更新一遍多谢解答

Malestudents commented 4 years ago

您好 我重新更新,重新训练了一遍还是这个错误,目前paddle1.8.2 paddlehub 1.7.1,best_model文件夹中还是不存在cls_out_b文件 模型训练完成后调用没有问题 但是模型转module出现错误 以下是调用代码

#!/usr/bin/env Python
# coding=utf-8
from collections import defaultdict
import numpy as np
import re
import paddlehub as hub
import datetime
import logging

import os
# Limit Paddle to 2 CPU threads for prediction.
os.environ['CPU_NUM'] = str(2)
# Only show ERROR-level logs.
root_logger = logging.getLogger()
root_logger.setLevel(logging.ERROR)
def call_module(context=None):
    """Rebuild the SequenceLabelTask used at fine-tune time and return it.

    ``context`` is unused; it is kept for interface compatibility.
    """
    ernie = hub.Module(name="ernie")
    inputs, outputs, program = ernie.context(max_seq_len=128)

    # Token-level features feed the sequence-labeling head.
    token_features = outputs["sequence_output"]
    feed_list = [
        inputs[key].name
        for key in ("input_ids", "position_ids", "segment_ids", "input_mask")
    ]
    data_reader = hub.reader.SequenceLabelReader(
        vocab_path=ernie.get_vocab_path(),
        max_seq_len=128)
    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=0.01,
        warmup_proportion=0.1,
        learning_rate=5e-5)
    run_config = hub.RunConfig(
        use_cuda=False,
        num_epoch=1,
        checkpoint_dir="ckpt_ner",
        batch_size=16,
        eval_interval=50,
        strategy=strategy)
    # Must mirror the fine-tune task exactly so saved parameters load.
    return hub.SequenceLabelTask(
        data_reader=data_reader,
        feature=token_features,
        feed_list=feed_list,
        add_crf=True,
        max_seq_len=128,
        num_classes=3,
        config=run_config)
def Predict(cls_task, context):
    """Run the sequence-label task on `context` and collect raw results.

    Args:
        cls_task: a SequenceLabelTask built identically to the one used
            at fine-tune time.
        context: list of [text] prediction samples.

    Returns:
        A list with one `run_results` entry per prediction run state.
    """
    # Removed dead code: `keys`, `inv_label_map` and `result1` were
    # assigned but never used.
    run_states = cls_task.predict(data=context)
    return [run_state.run_results for run_state in run_states]

if __name__ == '__main__':
    # Time task construction plus one prediction.
    started = datetime.datetime.now()
    task = call_module()
    # Join characters with "\002" so the reader tokenizes per character.
    sample = '梅小二主治医师查房记录'
    data = [["\002".join(sample)]]
    print(Predict(task, data))
    print("使用时间:", datetime.datetime.now() - started)
Steffy-zxf commented 4 years ago

贴下完整的堆栈信息吧?另外,可以先试试按照我们的序列标注任务预测demo 会不会出现错误https://github.com/PaddlePaddle/PaddleHub/blob/release/v1.7/demo/sequence_labeling/predict.py

Malestudents commented 4 years ago

demo没有出现问题

Steffy-zxf commented 4 years ago

@Malestudents 其实fine-tune模型转module就是创建task的过程。如果demo没有问题的话,说明加载fine-tune模型以及创建task预测都是正常的。是否方便贴下完整的出错堆栈信息?是否能提供最小复现代码

Malestudents commented 4 years ago

模型的fine-tune没有问题就是转模型出现这个错误 模型fine-tune代码

from collections import defaultdict
import numpy as np
import re
import paddlehub as hub
import dataset_train
import datetime
import logging
import os
# Suppress everything below ERROR on the root logger.
root_logger = logging.getLogger()
root_logger.setLevel(logging.ERROR)
# Run Paddle fine-tuning with 3 CPU threads.
os.environ['CPU_NUM'] = str(3)
def call_module(context=None):
    """Fine-tune ERNIE on the express NER dataset and evaluate it.

    ``context`` is unused; it is kept for interface compatibility.
    """
    ernie = hub.Module(name="ernie")
    inputs, outputs, program = ernie.context(max_seq_len=128)

    # Token-level features feed the sequence-labeling head.
    token_features = outputs["sequence_output"]
    feed_list = [
        inputs[key].name
        for key in ("input_ids", "position_ids", "segment_ids", "input_mask")
    ]

    ner_dataset = dataset_train.Express_NER()
    data_reader = hub.reader.SequenceLabelReader(
        dataset=ner_dataset,
        vocab_path=ernie.get_vocab_path(),
        max_seq_len=128)

    # AdamW with linear warmup, as recommended for ERNIE fine-tuning.
    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=0.01,
        warmup_proportion=0.1,
        learning_rate=5e-5)
    run_config = hub.RunConfig(
        use_cuda=False,
        num_epoch=1,
        checkpoint_dir="hub_ernie_express_demo",
        batch_size=16,
        eval_interval=50,
        strategy=strategy)

    task = hub.SequenceLabelTask(
        data_reader=data_reader,
        feature=token_features,
        feed_list=feed_list,
        add_crf=True,
        max_seq_len=128,
        num_classes=ner_dataset.num_labels,
        config=run_config)
    print(ner_dataset.num_labels)
    task.finetune_and_eval()

if __name__ == '__main__':
    # Time the full fine-tuning run.
    started = datetime.datetime.now()
    call_module()
    elapsed = datetime.datetime.now() - started
    print("使用时间:", elapsed)

模型读取文件的代码

import os
import codecs
import csv

from paddlehub.dataset import InputExample

class Express_NER():
    """
    A set of manually annotated Chinese word-segmentation data about express
    information extraction. For more information please refer to
    https://aistudio.baidu.com/aistudio/projectdetail/131360
    """

    def __init__(self, dataset_dir='data'):
        """Load train/dev/test examples from `dataset_dir`.

        Args:
            dataset_dir: directory holding data_train.txt, data_dev.txt
                and data_test.txt (tab-separated with a header row).
                Was hard-coded to 'data'; now a parameter with the same
                default for backward compatibility.
        """
        self.dataset_dir = dataset_dir

        self._load_train_examples()
        self._load_test_examples()
        self._load_dev_examples()

    def _load_train_examples(self):
        # CONSISTENCY FIX: keep the path on the instance, as the dev and
        # test loaders already do (original only used a local variable).
        self.train_file = os.path.join(self.dataset_dir, "data_train.txt")
        self.train_examples = self._read_file(self.train_file)

    def _load_dev_examples(self):
        self.dev_file = os.path.join(self.dataset_dir, "data_dev.txt")
        self.dev_examples = self._read_file(self.dev_file)

    def _load_test_examples(self):
        self.test_file = os.path.join(self.dataset_dir, "data_test.txt")
        self.test_examples = self._read_file(self.test_file)

    def get_train_examples(self):
        """Return the list of training InputExample objects."""
        return self.train_examples

    def get_dev_examples(self):
        """Return the list of dev InputExample objects."""
        return self.dev_examples

    def get_test_examples(self):
        """Return the list of test InputExample objects."""
        return self.test_examples

    def get_labels(self):
        """Return the label vocabulary for this NER task."""
        return [
            "B-PER", "I-PER", "O"
        ]

    @property
    def num_labels(self):
        """
        Return the number of labels in the dataset.
        """
        return len(self.get_labels())

    def _read_file(self, input_file, quotechar=None):
        """Read a tab-separated value file into InputExample objects.

        The first row is treated as a header and skipped; column 0 is the
        text and column 1 the label sequence.
        """
        with codecs.open(input_file, "r", encoding="UTF-8") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            next(reader)  # skip header
            examples = []
            # enumerate replaces the hand-rolled seq_id counter.
            for seq_id, line in enumerate(reader):
                examples.append(
                    InputExample(guid=seq_id, label=line[1], text_a=line[0]))
            return examples