Software Environment

From the logs and backtrace below: paddle commit id 0e92adceae06b6b7463f2dc7790ffb0601730009, CPU-only, Python 3.7 (anaconda), model ernie-3.0-tiny-nano-v2-zh.

Duplicate Issue

Error Description

Training and evaluation complete normally, but the Python process then crashes on exit with "double free or corruption (!prev)" immediately after the tokenizer files are saved. The full backtrace and memory map are at the end of this report.

Steps to Reproduce & Code

This project contains three code files: the training script, model.py, and utils.py (listed under "Code Files" below).

Run the Code
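The exact launch command was not captured in the report. An illustrative invocation, assuming the training script is named finetune.py and taking the flag values from the training-configuration dump in the log below (paths shortened here), might look like:

python finetune.py \
    --do_train \
    --do_eval \
    --do_export \
    --output_dir ./output \
    --num_train_epochs 1 \
    --per_device_train_batch_size 8 \
    --per_device_eval_batch_size 8 \
    --learning_rate 5e-05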
Data

Each sample in the data file is a single line with the format: sentence + "\t" + label
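For illustration only (the real label set comes from the user's label file, which is not included in the report), two lines of train.txt might look like:

这家餐厅的菜很好吃	正面
快递太慢而且包装破损	负面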
Code Files
Training script:

import os
from dataclasses import dataclass, field
from typing import Optional

import paddle
import sklearn
from model import ErnieClassificationModel
from utils import compute_metrics, get_label_name, read_example
from paddle.nn import CrossEntropyLoss

from paddlenlp.data import DataCollatorForTokenClassification, DataCollatorWithPadding
from paddlenlp.datasets import load_dataset
from paddlenlp.trainer import (
    CompressionArguments,
    PdArgumentParser,
    Trainer,
    get_last_checkpoint,
)
from paddlenlp.transformers import AutoTokenizer, ErnieConfig
from paddlenlp.utils.log import logger


@dataclass
class DataArguments:
    """
    Arguments pertaining to what data we are going to input to our model for training and eval.
    Using PdArgumentParser we can turn this class into argparse arguments that can be
    specified on the command line.
    """

    train_path: str = field(
        default="train.txt",
        metadata={"help": "Data path for the train dataset. Defaults to 'train.txt'."},
    )
    dev_path: str = field(
        default="dev.txt",
        metadata={"help": "Data path for the dev dataset. Defaults to 'dev.txt'."},
    )
    test_path: str = field(default=None, metadata={"help": "Test data path. Defaults to None."})
    label_path: str = field(default="labels", metadata={"help": "Label dict path. Defaults to 'labels'."})
    max_seq_length: Optional[int] = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded. Defaults to 128."
        },
    )
    max_vocab_size: Optional[int] = field(
        default=8000,
        metadata={"help": "The maximum vocab size after pruning word embeddings. Defaults to 8000."},
    )
    ignore_index: Optional[int] = field(
        default=0,
        metadata={"help": "Padding index used for labels. Defaults to 0."},
    )


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: Optional[str] = field(
        default="ernie-3.0-tiny-nano-v2-zh",
        metadata={"help": "Path to pretrained model. Defaults to 'ernie-3.0-tiny-nano-v2-zh'."},
    )
    dropout: float = field(default=0.1, metadata={"help": "Dropout rate. Defaults to 0.1."})


def main():
    parser = PdArgumentParser((ModelArguments, DataArguments, CompressionArguments))
    model_args, data_args, compression_args = parser.parse_args_into_dataclasses()
    # ... (the body of main() is truncated in the report)


if __name__ == "__main__":
    main()
model.py:

from cmath import log

from paddle import nn
from paddlenlp.transformers import ErnieModel, ErniePretrainedModel


class ErnieClassificationModel(ErniePretrainedModel):
    def __init__(self, config, n_class, dropout=None):
        super(ErnieClassificationModel, self).__init__(config)
        self.num_labels = n_class
        self.ernie = ErnieModel(config)
        self.dropout = nn.Dropout(dropout if dropout is not None else config["hidden_dropout_prob"])
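The class body is truncated in the report. Judging from the newly initialized weights in the log below ('classifier.weight', 'classifier.bias'), the full model presumably adds a linear head over ERNIE's pooled output. A minimal sketch of such a completion, not the author's exact code:

from paddle import nn
from paddlenlp.transformers import ErnieModel, ErniePretrainedModel


class ErnieClassificationModelSketch(ErniePretrainedModel):
    def __init__(self, config, n_class, dropout=None):
        super().__init__(config)
        self.num_labels = n_class
        self.ernie = ErnieModel(config)
        self.dropout = nn.Dropout(dropout if dropout is not None else config.hidden_dropout_prob)
        # Hypothetical classification head: hidden_size (312 for this model) -> n_class
        self.classifier = nn.Linear(config.hidden_size, n_class)

    def forward(self, input_ids, token_type_ids=None):
        # ErnieModel returns (sequence_output, pooled_output) by default
        _, pooled_output = self.ernie(input_ids, token_type_ids=token_type_ids)
        return self.classifier(self.dropout(pooled_output))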
utils.py:

# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
def get_label_name(filename_label):
    label_names, label2id = [], {}
    for idx, line in enumerate(open(filename_label)):
        line = line.strip()
        label_names.append(line)
        label2id[line] = idx
    return label_names, label2id
def read_example(filename, label2id, tokenizer, max_seq_length=16):
    """
    Reads data from file.
    """
    for line in open(filename):
        example = {}
        line = line.strip().split("\t")
        if len(line) != 2:
            continue
        title, label = line
        tokenized_input = tokenizer(title, max_seq_len=max_seq_length, padding="max_length", truncation=True)
        example["label"] = label2id[label]
        example["input_ids"] = tokenized_input["input_ids"]
        yield example
def compute_metrics(p):
    """Compute accuracy over the eval set."""
    logits, label = p.predictions, p.label_ids
    preds = logits.argmax(axis=-1)
    right = sum(preds == label)
    accuracy = right / label.shape[0] * 100
    return {"accuracy": accuracy}
def read_test_file(filename):
    """Read test data."""
    for line in open(filename):
        line = line.strip()
        yield {"title": line}
def input_preprocess(text, tokenizer, max_seq_length=16):
    """Preprocess input text for inference."""
    data = tokenizer(text, max_length=max_seq_length)
    input_ids = data["input_ids"]
    return {
        "input_ids": np.array(input_ids, dtype="int32"),
    }
def cls_postprocess(logits, id2label):
    """Postprocess predictions: softmax over the logits, then map to a label."""
    max_value = np.max(logits, axis=1, keepdims=True)
    exp_data = np.exp(logits - max_value)
    probs = exp_data / np.sum(exp_data, axis=1, keepdims=True)
    out_dict = {"label": id2label[int(probs.argmax(axis=-1))], "confidence": probs.max(axis=-1)}
    return out_dict
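input_preprocess and cls_postprocess pair up for single-example inference. A hedged usage sketch; the label mapping and logits below are fabricated stand-ins for illustration (in the real pipeline the logits would come from the trained or exported model):

import numpy as np
from paddlenlp.transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-tiny-nano-v2-zh")
id2label = {0: "label_a", 1: "label_b"}  # hypothetical mapping

features = input_preprocess("这家餐厅的菜很好吃", tokenizer, max_seq_length=16)
logits = np.array([[0.3, 1.2]], dtype="float32")  # fabricated logits for the demo
print(cls_postprocess(logits, id2label))
# -> {'label': 'label_b', 'confidence': array([0.711...], dtype=float32)}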
Training log:

[2023-04-17 20:46:20,224] [ INFO] - The default value for the training argument --report_to will change in v5 (from all installed integrations to none). In v5, you will need to use --report_to all to get the same behavior as now. You should start updating your code and make this info disappear :-).
[2023-04-17 20:46:20,226] [ INFO] - Model config ErnieConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": 0.1,
  "enable_recompute": false,
  "fuse": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 312,
  "initializer_range": 0.02,
  "intermediate_size": 1248,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 2048,
  "model_type": "ernie",
  "num_attention_heads": 12,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "paddlenlp_version": null,
  "pool_act": "tanh",
  "task_id": 0,
  "task_type_vocab_size": 16,
  "type_vocab_size": 4,
  "use_task_id": false,
  "vocab_size": 40000
}
[2023-04-17 20:46:22,150] [ INFO] - All model checkpoint weights were used when initializing ErnieClassificationModel.
[2023-04-17 20:46:22,150] [ WARNING] - Some weights of ErnieClassificationModel were not initialized from the model checkpoint at ernie-3.0-tiny-nano-v2-zh and are newly initialized: ['ernie.pooler.dense.weight', 'classifier.bias', 'ernie.pooler.dense.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2023-04-17 20:46:22,152] [ INFO] - We are using <class 'paddlenlp.transformers.ernie.tokenizer.ErnieTokenizer'> to load 'ernie-3.0-tiny-nano-v2-zh'.
[2023-04-17 20:46:22,152] [ INFO] - Already cached /home/xxx/.paddlenlp/models/ernie-3.0-tiny-nano-v2-zh/ernie_3.0_tiny_nano_v2_vocab.txt
[2023-04-17 20:46:22,180] [ INFO] - tokenizer config file saved in /home/xxx/.paddlenlp/models/ernie-3.0-tiny-nano-v2-zh/tokenizer_config.json
[2023-04-17 20:46:22,180] [ INFO] - Special tokens file saved in /home/xxx/.paddlenlp/models/ernie-3.0-tiny-nano-v2-zh/special_tokens_map.json
[2023-04-17 20:46:22,505] [ INFO] - ============================================================
[2023-04-17 20:46:22,505] [ INFO] - Training Configuration Arguments
[2023-04-17 20:46:22,505] [ INFO] - paddle commit id :0e92adceae06b6b7463f2dc7790ffb0601730009
[2023-04-17 20:46:22,506] [ INFO] - _no_sync_in_gradient_accumulation:True
[2023-04-17 20:46:22,506] [ INFO] - activation_quantize_type :None
[2023-04-17 20:46:22,506] [ INFO] - adam_beta1 :0.9
[2023-04-17 20:46:22,506] [ INFO] - adam_beta2 :0.999
[2023-04-17 20:46:22,506] [ INFO] - adam_epsilon :1e-08
[2023-04-17 20:46:22,506] [ INFO] - algo_list :None
[2023-04-17 20:46:22,506] [ INFO] - batch_num_list :None
[2023-04-17 20:46:22,506] [ INFO] - batch_size_list :None
[2023-04-17 20:46:22,506] [ INFO] - bf16 :False
[2023-04-17 20:46:22,506] [ INFO] - bf16_full_eval :False
[2023-04-17 20:46:22,506] [ INFO] - bias_correction :False
[2023-04-17 20:46:22,507] [ INFO] - current_device :cpu
[2023-04-17 20:46:22,507] [ INFO] - dataloader_drop_last :False
[2023-04-17 20:46:22,507] [ INFO] - dataloader_num_workers :0
[2023-04-17 20:46:22,507] [ INFO] - device :cpu
[2023-04-17 20:46:22,507] [ INFO] - disable_tqdm :False
[2023-04-17 20:46:22,507] [ INFO] - do_compress :False
[2023-04-17 20:46:22,507] [ INFO] - do_eval :True
[2023-04-17 20:46:22,507] [ INFO] - do_export :True
[2023-04-17 20:46:22,507] [ INFO] - do_predict :False
[2023-04-17 20:46:22,507] [ INFO] - do_train :True
[2023-04-17 20:46:22,507] [ INFO] - eval_batch_size :8
[2023-04-17 20:46:22,507] [ INFO] - eval_steps :None
[2023-04-17 20:46:22,508] [ INFO] - evaluation_strategy :IntervalStrategy.EPOCH
[2023-04-17 20:46:22,508] [ INFO] - flatten_param_grads :False
[2023-04-17 20:46:22,508] [ INFO] - fp16 :False
[2023-04-17 20:46:22,508] [ INFO] - fp16_full_eval :False
[2023-04-17 20:46:22,508] [ INFO] - fp16_opt_level :O1
[2023-04-17 20:46:22,508] [ INFO] - gradient_accumulation_steps :1
[2023-04-17 20:46:22,508] [ INFO] - greater_is_better :None
[2023-04-17 20:46:22,508] [ INFO] - ignore_data_skip :False
[2023-04-17 20:46:22,508] [ INFO] - input_dtype :int64
[2023-04-17 20:46:22,508] [ INFO] - input_infer_model_path :None
[2023-04-17 20:46:22,508] [ INFO] - label_names :None
[2023-04-17 20:46:22,508] [ INFO] - lazy_data_processing :True
[2023-04-17 20:46:22,508] [ INFO] - learning_rate :5e-05
[2023-04-17 20:46:22,509] [ INFO] - load_best_model_at_end :False
[2023-04-17 20:46:22,509] [ INFO] - local_process_index :0
[2023-04-17 20:46:22,509] [ INFO] - local_rank :-1
[2023-04-17 20:46:22,509] [ INFO] - log_level :-1
[2023-04-17 20:46:22,509] [ INFO] - log_level_replica :-1
[2023-04-17 20:46:22,509] [ INFO] - log_on_each_node :True
[2023-04-17 20:46:22,509] [ INFO] - logging_dir :/home/xxx/project/paddle/
[2023-04-17 20:46:22,509] [ INFO] - logging_first_step :False
[2023-04-17 20:46:22,509] [ INFO] - logging_steps :100
[2023-04-17 20:46:22,509] [ INFO] - logging_strategy :IntervalStrategy.STEPS
[2023-04-17 20:46:22,509] [ INFO] - lr_scheduler_type :SchedulerType.LINEAR
[2023-04-17 20:46:22,509] [ INFO] - max_grad_norm :1.0
[2023-04-17 20:46:22,510] [ INFO] - max_steps :-1
[2023-04-17 20:46:22,510] [ INFO] - metric_for_best_model :None
[2023-04-17 20:46:22,510] [ INFO] - minimum_eval_times :None
[2023-04-17 20:46:22,510] [ INFO] - moving_rate :0.9
[2023-04-17 20:46:22,510] [ INFO] - no_cuda :False
[2023-04-17 20:46:22,510] [ INFO] - num_train_epochs :1.0
[2023-04-17 20:46:22,510] [ INFO] - onnx_format :True
[2023-04-17 20:46:22,510] [ INFO] - optim :OptimizerNames.ADAMW
[2023-04-17 20:46:22,510] [ INFO] - output_dir :/home/xxx/project/paddle/
[2023-04-17 20:46:22,510] [ INFO] - overwrite_output_dir :False
[2023-04-17 20:46:22,510] [ INFO] - past_index :-1
[2023-04-17 20:46:22,510] [ INFO] - per_device_eval_batch_size :8
[2023-04-17 20:46:22,510] [ INFO] - per_device_train_batch_size :8
[2023-04-17 20:46:22,511] [ INFO] - prediction_loss_only :False
[2023-04-17 20:46:22,511] [ INFO] - process_index :0
[2023-04-17 20:46:22,511] [ INFO] - prune_embeddings :False
[2023-04-17 20:46:22,511] [ INFO] - recompute :False
[2023-04-17 20:46:22,511] [ INFO] - remove_unused_columns :True
[2023-04-17 20:46:22,511] [ INFO] - report_to :['visualdl']
[2023-04-17 20:46:22,511] [ INFO] - resume_from_checkpoint :None
[2023-04-17 20:46:22,511] [ INFO] - round_type :round
[2023-04-17 20:46:22,511] [ INFO] - run_name :/home/xxx/project/paddle
[2023-04-17 20:46:22,511] [ INFO] - save_on_each_node :False
[2023-04-17 20:46:22,511] [ INFO] - save_steps :100
[2023-04-17 20:46:22,511] [ INFO] - save_strategy :IntervalStrategy.STEPS
[2023-04-17 20:46:22,512] [ INFO] - save_total_limit :None
[2023-04-17 20:46:22,512] [ INFO] - scale_loss :32768
[2023-04-17 20:46:22,512] [ INFO] - seed :42
[2023-04-17 20:46:22,512] [ INFO] - sharding :[]
[2023-04-17 20:46:22,512] [ INFO] - sharding_degree :-1
[2023-04-17 20:46:22,512] [ INFO] - should_log :True
[2023-04-17 20:46:22,512] [ INFO] - should_save :True
[2023-04-17 20:46:22,512] [ INFO] - skip_memory_metrics :True
[2023-04-17 20:46:22,512] [ INFO] - strategy :dynabert+ptq
[2023-04-17 20:46:22,512] [ INFO] - train_batch_size :8
[2023-04-17 20:46:22,512] [ INFO] - use_pact :True
[2023-04-17 20:46:22,512] [ INFO] - warmup_ratio :0.1
[2023-04-17 20:46:22,512] [ INFO] - warmup_steps :0
[2023-04-17 20:46:22,513] [ INFO] - weight_decay :0.0
[2023-04-17 20:46:22,513] [ INFO] - weight_quantize_type :channel_wise_abs_max
[2023-04-17 20:46:22,513] [ INFO] - width_mult_list :None
[2023-04-17 20:46:22,513] [ INFO] - world_size :1
[2023-04-17 20:46:22,513] [ INFO] -
[2023-04-17 20:46:22,514] [ INFO] - Running training
[2023-04-17 20:46:22,514] [ INFO] - Num examples = 450
[2023-04-17 20:46:22,514] [ INFO] - Num Epochs = 1
[2023-04-17 20:46:22,514] [ INFO] - Instantaneous batch size per device = 8
[2023-04-17 20:46:22,514] [ INFO] - Total train batch size (w. parallel, distributed & accumulation) = 8
[2023-04-17 20:46:22,514] [ INFO] - Gradient Accumulation steps = 1
[2023-04-17 20:46:22,514] [ INFO] - Total optimization steps = 57.0
[2023-04-17 20:46:22,514] [ INFO] - Total num train samples = 450.0
[2023-04-17 20:46:22,515] [ INFO] - Number of trainable parameters = 17907866
100%|██████████| 57/57 [00:44<00:00, 1.58it/s]
[2023-04-17 20:47:06,738] [ INFO] - Running Evaluation
[2023-04-17 20:47:06,738] [ INFO] - Num examples = 113
[2023-04-17 20:47:06,738] [ INFO] - Total prediction steps = 15
[2023-04-17 20:47:06,738] [ INFO] - Pre device batch size = 8
[2023-04-17 20:47:06,738] [ INFO] - Total Batch size = 8
eval_loss: 0.4924260079860687, eval_accuracy: 71.68141592920354, eval_runtime: 3.0462, eval_samples_per_second: 37.095, eval_steps_per_second: 4.924, epoch: 1.0
100%|██████████| 57/57 [00:47<00:00, 1.58it/s]
[2023-04-17 20:47:09,785] [ INFO] - Training completed.
train_runtime: 47.2703, train_samples_per_second: 9.52, train_steps_per_second: 1.206, train_loss: 0.5549765135112562, epoch: 1.0
100%|██████████| 57/57 [00:47<00:00, 1.21it/s]
[2023-04-17 20:47:12,484] [ INFO] - tokenizer config file saved in /home/xxx/project/paddle/
[2023-04-17 20:47:12,484] [ INFO] - Special tokens file saved in /home/xxx/project/paddle/
Crash on exit:

*** Error in `python': double free or corruption (!prev): 0x00007f9496386800 ***
======= Backtrace: =========
/lib64/libc.so.6(+0x784b6)[0x7f9493f154b6]
/lib64/libc.so.6(+0x7a403)[0x7f9493f17403]
python(+0x181c1c)[0x7f9494f13c1c]
python(+0x1803ab)[0x7f9494f123ab]
python(PyDict_Clear+0x141)[0x7f9494e8e0b1]
python(+0xfc179)[0x7f9494e8e179]
python(+0x1638c6)[0x7f9494ef58c6]
python(_PyGC_CollectNoFail+0x2a)[0x7f9494f9375a]
python(PyImport_Cleanup+0x4bc)[0x7f9494f3ef9c]
python(Py_FinalizeEx+0x64)[0x7f9494fb3714]
python(+0x232e20)[0x7f9494fc4e20]
python(_Py_UnixMain+0x3c)[0x7f9494fc518c]
/lib64/libc.so.6(__libc_start_main+0x100)[0x7f9493ebc850]
python(+0x1d803a)[0x7f9494f6a03a]
======= Memory map: ========
7f93f0000000-7f93f0021000 rw-p 00000000 00:00 0
7f93f0021000-7f93f4000000 ---p 00000000 00:00 0
7f93f8000000-7f93f8021000 rw-p 00000000 00:00 0
7f93f8021000-7f93fc000000 ---p 00000000 00:00 0
7f93fc131000-7f9403c54000 r-xp 00000000 fc:10 57938935 /home/xxx/anaconda3/envs/paddle/lib/python3.7/site-packages/paddle/libs/libmklml_intel.so
7f9403c54000-7f9403e53000 ---p 07b23000 fc:10 57938935 /home/xxx/anaconda3/envs/paddle/lib/python3.7/site-packages/paddle/libs/libmklml_intel.so
7f9403e53000-7f9403e79000 r--p 07b22000 fc:10 57938935 /home/xxx/anaconda3/envs/paddle/lib/python3.7/site-packages/paddle/libs/libmklml_intel.so
7f9403e79000-7f9403f43000 rw-p 07b48000 fc:10 57938935 /home/xxx/anaconda3/envs/paddle/lib/python3.7/site-packages/paddle/libs/libmklml_intel.so
7f9403f43000-7f9403f69000 rw-p 00000000 00:00 0
7f9403f69000-7f9404000000 rw-p 07c12000 fc:10 57938935 /home/xxx/anaconda3/envs/paddle/lib/python3.7/site-packages/paddle/libs/libmklml_intel.so
7f9404000000-7f9404021000 rw-p 00000000 00:00 0
7f9404021000-7f9408000000 ---p 00000000 00:00 0
7f9408000000-7f9408021000 rw-p 00000000 00:00 0
7f9408021000-7f940c000000 ---p 00000000 00:00 0
7f940c000000-7f940c021000 rw-p 00000000 00:00 0
7f940c021000-7f9410000000 ---p 00000000 00:00 0
7f9410000000-7f9410021000 rw-p 00000000 00:00 0
7f9410021000-7f9414000000 ---p 00000000 00:00 0
7f9414000000-7f9414021000 rw-p 00000000 00:00 0
7f9414021000-7f9418000000 ---p 00000000 00:00 0
7f941a618000-7f9439667000 rw-p 00000000 00:00 0
7f943a668000-7f943a728000 rw-p 00000000 00:00 0
7f943a7a8000-7f943a928000 rw-p 00000000 00:00 0
7f943aa28000-7f943aa68000 rw-p 00000000 00:00 0
7f943aaa8000-7f943abe8000 rw-p 00000000 00:00 0
7f943ac28000-7f943ac68000 rw-p 00000000 00:00 0
7f943ace8000-7f943aea8000 rw-p 00000000 00:00 0
7f943b6a9000-7f943b6aa000 ---p 00000000 00:00 0
7f943b6aa000-7f943cd7a000 rw-p 00000000 00:00 0
7f943cd7a000-7f943cd7b000 ---p 00000000 00:00 0
7f943cd7b000-7f943d57b000 rw-p 00000000 00:00 0