Open wjmzjx opened 4 years ago
你好!PaddleHub 1.8.0版本对Fine-tune API做了升级,可以使用以下用法:
# coding:utf-8
import argparse
import ast
import paddle.fluid as fluid
import paddlehub as hub
# yapf: disable
# Command-line options for this script.
# The module docstring is passed as ``description``: ArgumentParser's first
# positional parameter is ``prog``, so a positional ``__doc__`` would replace
# the program name in the usage line instead of describing the script.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.")
# ast.literal_eval turns the literal text "True"/"False" into a real bool
# (a plain ``bool`` type would treat any non-empty string as True).
parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for fine-tuning, input should be True or False")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest seqence.")
parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
# Parsed once at import time; the rest of the script reads options from ``args``.
args = parser.parse_args()
# yapf: enable.
jieba_paddle = hub.Module(name='jieba_paddle')
def cut(text):
    """Word-segmentation callback passed to ``hub.CustomTokenizer``.

    Delegates to the module-level ``jieba_paddle`` module's ``cut`` with
    ``use_paddle=False`` and returns its result unchanged.
    """
    return jieba_paddle.cut(text, use_paddle=False)
if __name__ == '__main__':
# Load Paddlehub senta pretrained model
module = hub.Module(name="senta_bow", version='1.2.0')
inputs, outputs, program = module.context(trainable=True, max_seq_len=args.max_seq_len)
# Tokenizer tokenizes the text data and encodes the data as model needed.
# If you use transformer modules (ernie, bert, roberta and so on), tokenizer should be hub.BertTokenizer.
# Otherwise, tokenizer should be hub.CustomTokenizer.
# If you choose CustomTokenizer, you can also change the chinese word segmentation tool, for example jieba.
tokenizer = hub.CustomTokenizer(
cut_function=cut, # jieba.cut as cut function
dataset = hub.dataset.ChnSentiCorp(tokenizer=tokenizer, max_seq_len=args.max_seq_len)
# Construct transfer learning network
# Use sentence-level output.
sent_feature = outputs["sentence_feature"]
# Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig(
# Define a classification fine-tune task by PaddleHub's API
cls_task = hub.TextClassifierTask(
# Fine-tune and evaluate by PaddleHub's API
# will finish training, evaluation, testing, save model automatically
python -u \
--batch_size=24 \
--max_seq_len=96 \
--use_gpu=True \
--checkpoint_dir=${CKPT_DIR} \
# coding:utf-8
import argparse
import ast
import paddle.fluid as fluid
import paddlehub as hub
# yapf: disable
# Command-line options for this script.
# The module docstring is passed as ``description``: ArgumentParser's first
# positional parameter is ``prog``, so a positional ``__doc__`` would replace
# the program name in the usage line instead of describing the script.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.")
# ast.literal_eval turns the literal text "True"/"False" into a real bool
# (a plain ``bool`` type would treat any non-empty string as True).
parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for fine-tuning, input should be True or False")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest seqence.")
parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
# Parsed once at import time; the rest of the script reads options from ``args``.
args = parser.parse_args()
# yapf: enable.
jieba_paddle = hub.Module(name='jieba_paddle')
def cut(text):
    """Word-segmentation callback passed to ``hub.CustomTokenizer``.

    Delegates to the module-level ``jieba_paddle`` module's ``cut`` with
    ``use_paddle=False`` and returns its result unchanged.
    """
    return jieba_paddle.cut(text, use_paddle=False)
if __name__ == '__main__':
# Load Paddlehub senta pretrained model
module = hub.Module(name="senta_bow", version='1.2.0')
inputs, outputs, program = module.context(trainable=True, max_seq_len=args.max_seq_len)
# Tokenizer tokenizes the text data and encodes the data as model needed.
# If you use transformer modules (ernie, bert, roberta and so on), tokenizer should be hub.BertTokenizer.
# Otherwise, tokenizer should be hub.CustomTokenizer.
# If you choose CustomTokenizer, you can also change the chinese word segmentation tool, for example jieba.
tokenizer = hub.CustomTokenizer(
cut_function=cut, # jieba.cut as cut function
dataset = hub.dataset.ChnSentiCorp(tokenizer=tokenizer, max_seq_len=args.max_seq_len)
num_classes = dataset.num_labels
label_list = dataset.get_labels()
# Construct transfer learning network
# Use sentence-level output.
sent_feature = outputs["sentence_feature"]
# Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig(
# Define a classification fine-tune task by PaddleHub's API
cls_task = hub.TextClassifierTask(
# Data to be predicted
data = ["这家餐厅很好吃", "这部电影真的很差劲"]
encoded_data = [
tokenizer.encode(text=text, max_seq_len=args.max_seq_len)
for text in data
print(cls_task.predict(data=encoded_data, label_list=label_list))
CKPT_DIR="./ckpt_chnsenticorp"
python -u --checkpoint_dir $CKPT_DIR --use_gpu True --max_seq_len=96
[2020-08-01 09:29:37,563] [ INFO] - Processing the train set...
100%|██████████| 9600/9600 [00:06<00:00, 1564.96it/s]
E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\ UserWarning: Caution! 'set_gradient_clip' is not recommended and may be deprecated in future! We recommend a new strategy: set 'grad_clip' when initializing the 'optimizer'. This method can reduce the mistakes, please refer to documention of 'optimizer'.
warnings.warn("Caution! 'set_gradient_clip' is not recommended "
[2020-08-01 09:29:43,783] [ INFO] - Strategy with warmup, linear decay, slanted triangle learning rate, weight decay regularization,
E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\ UserWarning: There are no operators in the program to be executed. If you pass Program manually, please use fluid.program_guard to ensure the current Program is being used.
[2020-08-01 09:29:43,786] [ INFO] - Try loading checkpoint from ckpt_20200801092937\ckpt.meta
[2020-08-01 09:29:43,787] [ INFO] - PaddleHub model checkpoint not found, start from scratch...
[2020-08-01 09:29:44,128] [ INFO] - PaddleHub finetune start
E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\ UserWarning: The following exception is not an EOF exception.
"The following exception is not an EOF exception.")
Traceback (most recent call last):
File "E:/Ss_working/Python_workSpace/fastNPLtest/", line 65, in
Windows not support stack backtrace yet.
File "E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\", line 2610, in append_op
attrs=kwargs.get("attrs", None))
File "E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\", line 43, in append_op
return self.main_program.current_block().append_op(*args, **kwargs)
File "E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\layers\", line 1057, in sequence_unpad
outputs={'Out': out})
File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\", line 253, in _build_net
self.feature, length=self.seq_len_used)
File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\", line 419, in _build_env
self.env.outputs = self._build_net()
File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\", line 545, in main_program
File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\", line 911, in load_checkpoint
File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\", line 384, in init_if_necessary
if not self.load_checkpoint():
File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\", line 960, in finetune
File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\", line 945, in finetune_and_eval
return self.finetune(do_eval=True)
File "E:/Ss_working/Python_workSpace/fastNPLtest/", line 65, in
Error: The padded sequence length can not be less than its original length. [Hint: Expected pad_seq_len >= valid_seq_len, but received pad_seq_len:96 < valid_seq_len:115.] at (D:\1.8.3\paddle\paddle\fluid\operators\math\ [operator < sequence_unpad > error]
Process finished with exit code 1
inputs, outputs, program = module.context(trainable=True, max_seq_len=args.max_seq_len)
dataset = hub.dataset.ChnSentiCorp(tokenizer=tokenizer, max_seq_len=args.max_seq_len)
上面两处的max_seq_len 是否设置相同呢?
我运行过以上我提供的代码,在paddle 1.8.3, paddlehub 1.8.0,python3.6环境下可以正常运行的。你可以试试调试,打印下上述两处的max_seq_len
paddle 1.8.4 paddlehub 1.8.2 python 3.7.3
Error: The padded sequence length can not be less than its original length. [Hint: Expected pad_seq_len >= valid_seq_len, but received pad_seq_len:96 < valid_seq_len:117.] at (/paddle/paddle/fluid/operators/math/ [operator < sequence_unpad > error]
paddle 1.8.4 paddlehub 1.8.2 python 3.7.3
Error Message Summary:
Error: The padded sequence length can not be less than its original length. [Hint: Expected pad_seq_len >= valid_seq_len, but received pad_seq_len:96 < valid_seq_len:117.] at (/paddle/paddle/fluid/operators/math/ [operator < sequence_unpad > error]
我这边可以跑起来了。 max_length 不能大于96,否则会报上述错误。 麻烦开发同学看看,是否是bug,修复一下~
版本信息: paddle.version '1.8.3' paddlehub.version '1.8.0' win10
Error Message Summary:
InvalidArgumentError: The Tensor in the squeeze2 Op's Input Variable X(seq_len) is not initialized. [Hint: Expected t->IsInitialized() == true, but received t->IsInitialized():0 != true:1.] at (D:\1.8.3\paddle\paddle\fluid\framework\ [operator < squeeze2 > error]
Process finished with exit code 1
完整报错信息如下: E:\gl\sf\envs\python36\python.exe E:/Ss_working/Python_workSpace/fastNPLtest/ [2020-07-31 15:42:20,413] [ INFO] - Installing senta_bilstm module [2020-07-31 15:42:20,495] [ INFO] - Module senta_bilstm already installed in C:\Users\Administrator.paddlehub\modules\senta_bilstm [2020-07-31 15:42:23,655] [ INFO] - Dataset C:\Users\Administrator.paddlehub\dataset\chnsenticorp already cached. [2020-07-31 15:42:23,792] [ INFO] - Dataset label map = {'0': 0, '1': 1} [2020-07-31 15:42:23,792] [ INFO] - Installing lac module [2020-07-31 15:42:23,794] [ INFO] - Module lac already installed in C:\Users\Administrator.paddlehub\modules\lac [2020-07-31 15:42:26,467] [ WARNING] - The parameter use_pyreader has been dropped! PaddleHub over v1.8.0 will use pyreader by default. [2020-07-31 15:42:26,467] [ INFO] - Checkpoint dir: ckpt_20200731154226 !!! The CPU_NUM is not specified, you should set CPU_NUM in the environment variable list. CPU_NUM indicates that how many CPUPlace are used in the current task. And if this parameter are set as N (equal to the number of physical CPU core) the program may be faster.
export CPU_NUM=8 # for example, set CPU_NUM as number of physical CPU core which is 8.
!!! The default number of CPU_NUM=1. [2020-07-31 15:42:26,546] [ INFO] - processing train data now... this may take a few minutes E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\ UserWarning: Caution! 'set_gradient_clip' is not recommended and may be deprecated in future! We recommend a new strategy: set 'grad_clip' when initializing the 'optimizer'. This method can reduce the mistakes, please refer to documention of 'optimizer'. warnings.warn("Caution! 'set_gradient_clip' is not recommended " [2020-07-31 15:43:01,224] [ INFO] - Strategy with warmup, linear decay, slanted triangle learning rate, weight decay regularization, E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\ UserWarning: There are no operators in the program to be executed. If you pass Program manually, please use fluid.program_guard to ensure the current Program is being used. warnings.warn(error_info) [2020-07-31 15:43:01,225] [ INFO] - Try loading checkpoint from ckpt_20200731154226\ckpt.meta [2020-07-31 15:43:01,225] [ INFO] - PaddleHub model checkpoint not found, start from scratch... [2020-07-31 15:43:01,570] [ INFO] - PaddleHub finetune start E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\ UserWarning: The following exception is not an EOF exception. "The following exception is not an EOF exception.") Traceback (most recent call last): File "E:/Ss_working/Python_workSpace/fastNPLtest/", line 52, in
File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\", line 945, in finetune_and_eval
return self.finetune(do_eval=True)
File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\", line 966, in finetune
run_states = self._run(do_eval=do_eval)
File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\", line 1212, in _run
File "E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\", line 1071, in run
File "E:\gl\sf\envs\python36\lib\site-packages\", line 703, in reraise
raise value
File "E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\", line 1066, in run
File "E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\", line 1154, in _run_impl
File "E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\", line 1229, in _run_program
C++ Call Stacks (More useful to developers):
Windows not support stack backtrace yet.
Python Call Stacks (More useful to users):
File "E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\", line 2610, in append_op attrs=kwargs.get("attrs", None)) File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\common\", line 191, in _copy_vars_and_ops_in_blocks to_block.append_op(**op_info) File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\common\", line 299, in clone_program dest_program.global_block()) File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\", line 413, in _build_env self._base_main_program, for_test=False) File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\", line 545, in main_program self._build_env() File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\", line 911, in load_checkpoint main_program=self.main_program) File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\", line 384, in init_if_necessary if not self.load_checkpoint(): File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\", line 960, in finetune self.init_if_necessary() File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\", line 945, in finetune_and_eval return self.finetune(do_eval=True) File "E:/Ss_working/Python_workSpace/fastNPLtest/", line 52, in
Error Message Summary:
InvalidArgumentError: The Tensor in the squeeze2 Op's Input Variable X(seq_len) is not initialized. [Hint: Expected t->IsInitialized() == true, but received t->IsInitialized():0 != true:1.] at (D:\1.8.3\paddle\paddle\fluid\framework\ [operator < squeeze2 > error]
Process finished with exit code 1
此外 senta_finetune.py中将feed_list = [inputs["words"].name]改动为feed_list = [inputs["text"].name] 不然报错 Traceback (most recent call last): File "E:/Ss_working/Python_workSpace/fastNPLtest/", line 30, in
feed_list = [inputs["words"].name]
KeyError: 'words'