Open wjmzjx opened 4 years ago
你好!PaddleHub 1.8.0版本对Fine-tune API做了升级,可以使用以下用法:
迁移学习:
# coding:utf-8
import argparse
import ast
import paddle.fluid as fluid
import paddlehub as hub
# yapf: disable
# Command-line options for the fine-tuning script.
# The module docstring is passed as `description`: the first positional
# parameter of ArgumentParser is `prog`, so passing __doc__ positionally would
# replace the program name shown in the usage/help output.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.")
# ast.literal_eval converts the literal text "True"/"False" into a real bool;
# type=bool would treat any non-empty string (including "False") as True.
parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for fine-tuning, input should be True or False")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest seqence.")
parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
# Parsed at import time; the resulting namespace is read by the script body.
args = parser.parse_args()
# yapf: enable.
# Word-segmentation module, loaded once at import time and reused by cut().
jieba_paddle = hub.Module(name='jieba_paddle')


def cut(text):
    """Segment ``text`` with the jieba_paddle module.

    Forwards ``text`` to ``jieba_paddle.cut`` with ``use_paddle=False`` and
    returns whatever that call returns, unchanged.
    """
    return jieba_paddle.cut(text, use_paddle=False)
if __name__ == '__main__':
    max_len = args.max_seq_len

    # Fetch the pretrained senta_bow module (version 1.2.0) together with its
    # input/output variables and program.
    module = hub.Module(name="senta_bow", version='1.2.0')
    inputs, outputs, program = module.context(trainable=True, max_seq_len=max_len)

    # The tokenizer splits raw text and encodes it in the form the model needs.
    # Transformer modules (ernie, bert, roberta, ...) should use hub.BertTokenizer;
    # any other module should use hub.CustomTokenizer, whose Chinese word
    # segmentation tool can be swapped through cut_function (jieba here).
    vocab_path = module.get_vocab_path()
    tokenizer = hub.CustomTokenizer(
        vocab_file=vocab_path,
        tokenize_chinese_chars=True,
        cut_function=cut,
    )
    dataset = hub.dataset.ChnSentiCorp(tokenizer=tokenizer, max_seq_len=max_len)

    # The transfer-learning network is built on the sentence-level output.
    sentence_feature = outputs["sentence_feature"]

    # Run configuration for the PaddleHub Fine-tune API.
    strategy = hub.AdamWeightDecayStrategy()
    run_config = hub.RunConfig(
        use_cuda=args.use_gpu,
        use_data_parallel=True,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    # Text-classification fine-tune task on top of the sentence feature.
    task = hub.TextClassifierTask(
        dataset=dataset,
        feature=sentence_feature,
        num_classes=dataset.num_labels,
        config=run_config)

    # Runs training, evaluation and testing, and saves the model automatically.
    task.finetune_and_eval()
启动脚本
# Make only GPU 0 visible to the process.
export CUDA_VISIBLE_DEVICES=0
# Directory handed to the script through --checkpoint_dir.
CKPT_DIR="./ckpt_chnsenticorp"
# Launch fine-tuning; -u keeps Python's stdout/stderr unbuffered so logs appear immediately.
python -u senta_finetune.py \
--batch_size=24 \
--max_seq_len=96 \
--use_gpu=True \
--checkpoint_dir=${CKPT_DIR} \
--num_epoch=3
预测用法:
# coding:utf-8
import argparse
import ast
import paddle.fluid as fluid
import paddlehub as hub
# yapf: disable
# Command-line options for the prediction script.
# The module docstring is passed as `description`: the first positional
# parameter of ArgumentParser is `prog`, so passing __doc__ positionally would
# replace the program name shown in the usage/help output.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.")
# ast.literal_eval converts the literal text "True"/"False" into a real bool;
# type=bool would treat any non-empty string (including "False") as True.
parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for fine-tuning, input should be True or False")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest seqence.")
parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
# Parsed at import time; the resulting namespace is read by the script body.
args = parser.parse_args()
# yapf: enable.
# Word-segmentation module, loaded once at import time and reused by cut().
jieba_paddle = hub.Module(name='jieba_paddle')


def cut(text):
    """Segment ``text`` with the jieba_paddle module.

    Forwards ``text`` to ``jieba_paddle.cut`` with ``use_paddle=False`` and
    returns whatever that call returns, unchanged.
    """
    return jieba_paddle.cut(text, use_paddle=False)
if __name__ == '__main__':
    max_len = args.max_seq_len

    # Fetch the pretrained senta_bow module (version 1.2.0) together with its
    # input/output variables and program.
    module = hub.Module(name="senta_bow", version='1.2.0')
    inputs, outputs, program = module.context(trainable=True, max_seq_len=max_len)

    # The tokenizer splits raw text and encodes it in the form the model needs.
    # Transformer modules (ernie, bert, roberta, ...) should use hub.BertTokenizer;
    # any other module should use hub.CustomTokenizer, whose Chinese word
    # segmentation tool can be swapped through cut_function (jieba here).
    vocab_path = module.get_vocab_path()
    tokenizer = hub.CustomTokenizer(
        vocab_file=vocab_path,
        tokenize_chinese_chars=True,
        cut_function=cut,
    )

    # The dataset supplies the number of classes and the label list that
    # are passed to the task and to predict() below.
    dataset = hub.dataset.ChnSentiCorp(tokenizer=tokenizer, max_seq_len=max_len)
    num_classes = dataset.num_labels
    label_list = dataset.get_labels()

    # The transfer-learning network is built on the sentence-level output.
    sentence_feature = outputs["sentence_feature"]

    # Run configuration for the PaddleHub Fine-tune API.
    strategy = hub.AdamWeightDecayStrategy()
    run_config = hub.RunConfig(
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    # Text-classification task on top of the sentence feature; no dataset is
    # passed here because the samples to classify are provided to predict().
    task = hub.TextClassifierTask(
        feature=sentence_feature,
        num_classes=num_classes,
        config=run_config)

    # Texts to classify, encoded with the same tokenizer and sequence length.
    texts = ["这家餐厅很好吃", "这部电影真的很差劲"]
    encoded_data = []
    for text in texts:
        encoded_data.append(tokenizer.encode(text=text, max_seq_len=max_len))

    print(task.predict(data=encoded_data, label_list=label_list))
启动脚本:
# Make only GPU 0 visible to the process.
export CUDA_VISIBLE_DEVICES=0
# Same checkpoint directory as the fine-tuning launch script. There must be no
# trailing space inside the quotes: it would become part of the path as soon as
# the variable is expanded inside quotes.
CKPT_DIR="./ckpt_chnsenticorp"
# Quote the expansion so the path is always passed as a single argument.
# --max_seq_len matches the value used for fine-tuning (96).
python -u predict.py --checkpoint_dir "$CKPT_DIR" --use_gpu True --max_seq_len=96
fine-tune部分还是无法运行
[2020-08-01 09:29:37,563] [ INFO] - Processing the train set...
100%|██████████| 9600/9600 [00:06<00:00, 1564.96it/s]
E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\clip.py:779: UserWarning: Caution! 'set_gradient_clip' is not recommended and may be deprecated in future! We recommend a new strategy: set 'grad_clip' when initializing the 'optimizer'. This method can reduce the mistakes, please refer to documention of 'optimizer'.
warnings.warn("Caution! 'set_gradient_clip' is not recommended "
[2020-08-01 09:29:43,783] [ INFO] - Strategy with warmup, linear decay, slanted triangle learning rate, weight decay regularization,
E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\executor.py:1093: UserWarning: There are no operators in the program to be executed. If you pass Program manually, please use fluid.program_guard to ensure the current Program is being used.
warnings.warn(error_info)
[2020-08-01 09:29:43,786] [ INFO] - Try loading checkpoint from ckpt_20200801092937\ckpt.meta
[2020-08-01 09:29:43,787] [ INFO] - PaddleHub model checkpoint not found, start from scratch...
[2020-08-01 09:29:44,128] [ INFO] - PaddleHub finetune start
E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\executor.py:1070: UserWarning: The following exception is not an EOF exception.
"The following exception is not an EOF exception.")
Traceback (most recent call last):
File "E:/Ss_working/Python_workSpace/fastNPLtest/test.py", line 65, in <module>
Windows not support stack backtrace yet.
File "E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\framework.py", line 2610, in append_op
attrs=kwargs.get("attrs", None))
File "E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\layer_helper.py", line 43, in append_op
return self.main_program.current_block().append_op(*args, **kwargs)
File "E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\layers\sequence_lod.py", line 1057, in sequence_unpad
outputs={'Out': out})
File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\classifier_task.py", line 253, in _build_net
self.feature, length=self.seq_len_used)
File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\base_task.py", line 419, in _build_env
self.env.outputs = self._build_net()
File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\base_task.py", line 545, in main_program
self._build_env()
File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\base_task.py", line 911, in load_checkpoint
main_program=self.main_program)
File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\base_task.py", line 384, in init_if_necessary
if not self.load_checkpoint():
File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\base_task.py", line 960, in finetune
self.init_if_necessary()
File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\base_task.py", line 945, in finetune_and_eval
return self.finetune(do_eval=True)
File "E:/Ss_working/Python_workSpace/fastNPLtest/test.py", line 65, in <module>
Error: The padded sequence length can not be less than its original length. [Hint: Expected pad_seq_len >= valid_seq_len, but received pad_seq_len:96 < valid_seq_len:115.] at (D:\1.8.3\paddle\paddle\fluid\operators\math\sequence_padding.cc:38) [operator < sequence_unpad > error]
Process finished with exit code 1
inputs, outputs, program = module.context(trainable=True, max_seq_len=args.max_seq_len)
dataset = hub.dataset.ChnSentiCorp(tokenizer=tokenizer, max_seq_len=args.max_seq_len)
上面两处的max_seq_len 是否设置相同呢?
我直接copy了迁移学习的代码,并未改动
我运行过以上我提供的代码,在paddle 1.8.3, paddlehub 1.8.0,python3.6环境下可以正常运行的。你可以试试调试,打印下上述两处的max_seq_len
是否一致?或者还有其他的差异?
paddle 1.8.4 paddlehub 1.8.2 python 3.7.3
Error: The padded sequence length can not be less than its original length. [Hint: Expected pad_seq_len >= valid_seq_len, but received pad_seq_len:96 < valid_seq_len:117.] at (/paddle/paddle/fluid/operators/math/sequence_padding.cc:38) [operator < sequence_unpad > error]
paddle 1.8.4 paddlehub 1.8.2 python 3.7.3
同样无法运行:
Error Message Summary:
Error: The padded sequence length can not be less than its original length. [Hint: Expected pad_seq_len >= valid_seq_len, but received pad_seq_len:96 < valid_seq_len:117.] at (/paddle/paddle/fluid/operators/math/sequence_padding.cc:38) [operator < sequence_unpad > error]
我这边可以跑起来了。max_seq_len 不能大于 96,否则会报上述错误。麻烦开发同学看看这是否是 bug,并修复一下~
版本信息: paddle.version '1.8.3' paddlehub.version '1.8.0' win10
在运行PaddleHub/demo/senta/senta_finetune.py时报错
Error Message Summary:
InvalidArgumentError: The Tensor in the squeeze2 Op's Input Variable X(seq_len) is not initialized. [Hint: Expected t->IsInitialized() == true, but received t->IsInitialized():0 != true:1.] at (D:\1.8.3\paddle\paddle\fluid\framework\operator.cc:1289) [operator < squeeze2 > error]
Process finished with exit code 1
完整报错信息如下: E:\gl\sf\envs\python36\python.exe E:/Ss_working/Python_workSpace/fastNPLtest/check_pkg.py [2020-07-31 15:42:20,413] [ INFO] - Installing senta_bilstm module [2020-07-31 15:42:20,495] [ INFO] - Module senta_bilstm already installed in C:\Users\Administrator.paddlehub\modules\senta_bilstm [2020-07-31 15:42:23,655] [ INFO] - Dataset C:\Users\Administrator.paddlehub\dataset\chnsenticorp already cached. [2020-07-31 15:42:23,792] [ INFO] - Dataset label map = {'0': 0, '1': 1} [2020-07-31 15:42:23,792] [ INFO] - Installing lac module [2020-07-31 15:42:23,794] [ INFO] - Module lac already installed in C:\Users\Administrator.paddlehub\modules\lac [2020-07-31 15:42:26,467] [ WARNING] - The parameter use_pyreader has been dropped! PaddleHub over v1.8.0 will use pyreader by default. [2020-07-31 15:42:26,467] [ INFO] - Checkpoint dir: ckpt_20200731154226 !!! The CPU_NUM is not specified, you should set CPU_NUM in the environment variable list. CPU_NUM indicates that how many CPUPlace are used in the current task. And if this parameter are set as N (equal to the number of physical CPU core) the program may be faster.
export CPU_NUM=8 # for example, set CPU_NUM as number of physical CPU core which is 8.
!!! The default number of CPU_NUM=1. [2020-07-31 15:42:26,546] [ INFO] - processing train data now... this may take a few minutes E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\clip.py:779: UserWarning: Caution! 'set_gradient_clip' is not recommended and may be deprecated in future! We recommend a new strategy: set 'grad_clip' when initializing the 'optimizer'. This method can reduce the mistakes, please refer to documention of 'optimizer'. warnings.warn("Caution! 'set_gradient_clip' is not recommended " [2020-07-31 15:43:01,224] [ INFO] - Strategy with warmup, linear decay, slanted triangle learning rate, weight decay regularization, E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\executor.py:1093: UserWarning: There are no operators in the program to be executed. If you pass Program manually, please use fluid.program_guard to ensure the current Program is being used. warnings.warn(error_info) [2020-07-31 15:43:01,225] [ INFO] - Try loading checkpoint from ckpt_20200731154226\ckpt.meta [2020-07-31 15:43:01,225] [ INFO] - PaddleHub model checkpoint not found, start from scratch... [2020-07-31 15:43:01,570] [ INFO] - PaddleHub finetune start E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\executor.py:1070: UserWarning: The following exception is not an EOF exception. "The following exception is not an EOF exception.") Traceback (most recent call last): File "E:/Ss_working/Python_workSpace/fastNPLtest/check_pkg.py", line 52, in
cls_task.finetune_and_eval()
File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\base_task.py", line 945, in finetune_and_eval
return self.finetune(do_eval=True)
File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\base_task.py", line 966, in finetune
run_states = self._run(do_eval=do_eval)
File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\base_task.py", line 1212, in _run
return_numpy=self.return_numpy)
File "E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\executor.py", line 1071, in run
six.reraise(*sys.exc_info())
File "E:\gl\sf\envs\python36\lib\site-packages\six.py", line 703, in reraise
raise value
File "E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\executor.py", line 1066, in run
return_merged=return_merged)
File "E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\executor.py", line 1154, in _run_impl
use_program_cache=use_program_cache)
File "E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\executor.py", line 1229, in _run_program
fetch_var_name)
paddle.fluid.core_avx.EnforceNotMet:
C++ Call Stacks (More useful to developers):
Windows not support stack backtrace yet.
Python Call Stacks (More useful to users):
File "E:\gl\sf\envs\python36\lib\site-packages\paddle\fluid\framework.py", line 2610, in append_op attrs=kwargs.get("attrs", None)) File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\common\paddle_helper.py", line 191, in _copy_vars_and_ops_in_blocks to_block.append_op(**op_info) File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\common\paddle_helper.py", line 299, in clone_program dest_program.global_block()) File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\base_task.py", line 413, in _build_env self._base_main_program, for_test=False) File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\base_task.py", line 545, in main_program self._build_env() File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\base_task.py", line 911, in load_checkpoint main_program=self.main_program) File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\base_task.py", line 384, in init_if_necessary if not self.load_checkpoint(): File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\base_task.py", line 960, in finetune self.init_if_necessary() File "E:\gl\sf\envs\python36\lib\site-packages\paddlehub\finetune\task\base_task.py", line 945, in finetune_and_eval return self.finetune(do_eval=True) File "E:/Ss_working/Python_workSpace/fastNPLtest/check_pkg.py", line 52, in
cls_task.finetune_and_eval()
Error Message Summary:
InvalidArgumentError: The Tensor in the squeeze2 Op's Input Variable X(seq_len) is not initialized. [Hint: Expected t->IsInitialized() == true, but received t->IsInitialized():0 != true:1.] at (D:\1.8.3\paddle\paddle\fluid\framework\operator.cc:1289) [operator < squeeze2 > error]
Process finished with exit code 1
此外,需要将 senta_finetune.py 中的 feed_list = [inputs["words"].name] 改为 feed_list = [inputs["text"].name],否则会报错: Traceback (most recent call last): File "E:/Ss_working/Python_workSpace/fastNPLtest/check_pkg.py", line 30, in <module>
feed_list = [inputs["words"].name]
KeyError: 'words'