import os, re
os.environ['TF_KERAS'] = '1'  # must use tf.keras
import tensorflow as tf
from bert4keras.models import build_transformer_model
from bert4keras.backend import keras, K
from bert4keras.optimizers import Adam
from bert4keras.optimizers import extend_with_weight_decay
from bert4keras.optimizers import extend_with_layer_adaptation
from bert4keras.optimizers import extend_with_piecewise_linear_lr
from bert4keras.optimizers import extend_with_gradient_accumulation
from keras.layers import Input, Lambda
from keras.models import Model

model = 'roberta'
sequence_length = 512
batch_size = 4096
config_path = r'E:\dl_model\chinese_roberta_wwm_ext_L-12_H-768_A-12\bert_config.json'
checkpoint_path = r'E:\dl_model\chinese_roberta_wwm_ext_L-12_H-768_A-12\bert_model.ckpt'  # set to None to train from scratch
learning_rate = 0.00176
weight_decay_rate = 0.01
num_warmup_steps = 3125
num_train_steps = 125000
steps_per_epoch = 10000
grad_accum_steps = 16  # > 1 means gradient accumulation is used
epochs = num_train_steps * grad_accum_steps // steps_per_epoch
exclude_from_weight_decay = ['Norm', 'bias']
tpu_address = None
which_optimizer = 'lamb'  # 'adam' or 'lamb'; both come with weight decay
lr_schedule = {
    num_warmup_steps * grad_accum_steps: 1.0,
    num_train_steps * grad_accum_steps: 0.0,
}
floatx = K.floatx()


def build_transformer_model_with_mlm():
    """BERT model with the MLM head."""
    bert = build_transformer_model(
        config_path, with_mlm='linear', return_keras_model=False
    )
    proba = bert.model.output
    # auxiliary inputs
    token_ids = Input(shape=(None,), dtype='int64', name='token_ids')  # target token ids
    is_masked = Input(shape=(None,), dtype=floatx, name='is_masked')  # mask flags

    def mlm_loss(inputs):
        """Loss computation; has to be wrapped in a layer."""
        y_true, y_pred, mask = inputs
        print(f'{y_true.shape} {y_pred.shape}')
        loss = K.sparse_categorical_crossentropy(
            y_true, y_pred, from_logits=True
        )
        loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
        return loss

    def mlm_acc(inputs):
        """Accuracy computation; has to be wrapped in a layer."""
        y_true, y_pred, mask = inputs
        y_true = K.cast(y_true, floatx)
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
        return acc

    mlm_loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
    mlm_acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])
    train_model = Model(
        bert.model.inputs + [token_ids, is_masked], [mlm_loss, mlm_acc]
    )
    loss = {
        'mlm_loss': lambda y_true, y_pred: y_pred,
        'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
    }
    return bert, train_model, loss


def build_transformer_model_for_pretraining():
    """Build the training model; works on both TPU and GPU.
    Note: use standard Keras layer style throughout; looser
    "cut-and-graft" constructions may fail when training on TPU.
    Also, TPUs do not support every TensorFlow op, in particular
    dynamic (variable-length) ops, so write the corresponding
    computations with extra care.
    """
    if model == 'roberta':
        bert, train_model, loss = build_transformer_model_with_mlm()
    # optimizer
    optimizer = extend_with_weight_decay(Adam)
    if which_optimizer == 'lamb':
        optimizer = extend_with_layer_adaptation(optimizer)
    optimizer = extend_with_piecewise_linear_lr(optimizer)
    optimizer_params = {
        'learning_rate': learning_rate,
        'lr_schedule': lr_schedule,
        'weight_decay_rate': weight_decay_rate,
        'exclude_from_weight_decay': exclude_from_weight_decay,
        'bias_correction': False,
    }
    if grad_accum_steps > 1:
        optimizer = extend_with_gradient_accumulation(optimizer)
        optimizer_params['grad_accum_steps'] = grad_accum_steps
    optimizer = optimizer(**optimizer_params)
    # compile the model
    train_model.compile(loss=loss, optimizer=optimizer)
    # Load weights if a checkpoint is given. Note: it must be loaded
    # here to guarantee no errors.
    if checkpoint_path is not None:
        bert.load_model(checkpoint_path)
    return train_model


train_model = build_transformer_model_for_pretraining()
train_model.summary()
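(For reference: as I understand it, extend_with_piecewise_linear_lr reads lr_schedule as (step, multiplier) points and interpolates linearly between them, starting from 0 at step 0. So the dict above means linear warmup to the peak learning rate over the first 3125 * 16 = 50,000 steps, then linear decay to 0 by step 2,000,000. A minimal sketch of that reading, my paraphrase of the dict's semantics rather than bert4keras's actual code:

def piecewise_linear(step, schedule):
    """Multiplier at a given step: 0 at step 0, then linear
    interpolation between the sorted (step, value) points."""
    points = sorted(schedule.items())  # e.g. [(50000, 1.0), (2000000, 0.0)]
    last_step, last_value = 0, 0.0
    for s, v in points:
        if step < s:
            # interpolate between the previous point and this one
            return last_value + (v - last_value) * (step - last_step) / (s - last_step)
        last_step, last_value = s, v
    return points[-1][1]

# e.g. piecewise_linear(25000, lr_schedule) == 0.5, halfway through warmup.)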
I get an error when building the model for pretraining:
Traceback (most recent call last):
File "C:\public\study\anaconda\lib\site-packages\tensorflow_core\python\framework\ops.py", line 1610, in _create_c_op
c_op = c_api.TF_FinishOperation(op_desc)
tensorflow.python.framework.errors_impl.InvalidArgumentError: slice index -1 of dimension 0 out of bounds. for 'loss/mlm_loss_loss/strided_slice' (op: 'StridedSlice') with input shapes: [0], [1], [1], [1] and with computed input tensors: input[1] = <-1>, input[2] = <0>, input[3] = <1>.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\public\study\anaconda\lib\site-packages\tensorflow_core\python\training\tracking\base.py", line 457, in _method_wrapper
result = method(self, *args, **kwargs)
File "C:\public\study\anaconda\lib\site-packages\tensorflow_core\python\keras\engine\training.py", line 373, in compile
self._compile_weights_loss_and_weighted_metrics()
File "C:\public\study\anaconda\lib\site-packages\tensorflow_core\python\training\tracking\base.py", line 457, in _method_wrapper
result = method(self, *args, **kwargs)
File "C:\public\study\anaconda\lib\site-packages\tensorflow_core\python\keras\engine\training.py", line 1653, in _compile_weights_loss_and_weighted_metrics
self.total_loss = self._prepare_total_loss(masks)
File "C:\public\study\anaconda\lib\site-packages\tensorflow_core\python\keras\engine\training.py", line 1713, in _prepare_total_loss
per_sample_losses = loss_fn.call(y_true, y_pred)
File "C:\public\study\anaconda\lib\site-packages\tensorflow_core\python\keras\losses.py", line 220, in call
y_pred, y_true)
File "C:\public\study\anaconda\lib\site-packages\tensorflow_core\python\ops\losses\util.py", line 79, in squeeze_or_expand_dimensions
is_last_dim_1 = math_ops.equal(1, array_ops.shape(y_pred)[-1])
File "C:\public\study\anaconda\lib\site-packages\tensorflow_core\python\ops\array_ops.py", line 813, in _slice_helper
name=name)
File "C:\public\study\anaconda\lib\site-packages\tensorflow_core\python\ops\array_ops.py", line 979, in strided_slice
shrink_axis_mask=shrink_axis_mask)
File "C:\public\study\anaconda\lib\site-packages\tensorflow_core\python\ops\gen_array_ops.py", line 10394, in strided_slice
shrink_axis_mask=shrink_axis_mask, name=name)
File "C:\public\study\anaconda\lib\site-packages\tensorflow_core\python\framework\op_def_library.py", line 793, in _apply_op_helper
op_def=op_def)
File "C:\public\study\anaconda\lib\site-packages\tensorflow_core\python\framework\func_graph.py", line 548, in create_op
compute_device)
File "C:\public\study\anaconda\lib\site-packages\tensorflow_core\python\framework\ops.py", line 3429, in _create_op_internal
op_def=op_def)
File "C:\public\study\anaconda\lib\site-packages\tensorflow_core\python\framework\ops.py", line 1773, in __init__
control_input_ops)
File "C:\public\study\anaconda\lib\site-packages\tensorflow_core\python\framework\ops.py", line 1613, in _create_c_op
raise ValueError(str(e))
ValueError: slice index -1 of dimension 0 out of bounds. for 'loss/mlm_loss_loss/strided_slice' (op: 'StridedSlice') with input shapes: [0], [1], [1], [1] and with computed input tensors: input[1] = <-1>, input[2] = <0>, input[3] = <1>.
Process finished with exit code 1
When debugging, the path is build_transformer_model_for_pretraining -> _compile_weights_loss_and_weighted_metrics -> _prepare_total_loss -> loss_fn.call.
Execution finally reaches
y_pred, y_true = tf_losses_util.squeeze_or_expand_dimensions(y_pred, y_true)
at which point the values of y_pred and y_true are:
y_pred Tensor("mlm_loss/Identity:0", shape=(), dtype=float32)
y_true Tensor("mlm_loss_target:0", dtype=float32)
and then the error above is raised.
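For what it's worth, the failing op is consistent with the Lambda returning a scalar: K.sum in mlm_loss reduces over every axis, so the model output y_pred has shape (), and squeeze_or_expand_dimensions then evaluates array_ops.shape(y_pred)[-1], a [-1] slice of a length-0 shape tensor, which matches the strided_slice in the traceback. A standalone repro of exactly that slice error (my sketch, independent of bert4keras):

import tensorflow as tf

scalar_loss = tf.constant(0.0)  # rank-0, like the scalar the mlm_loss Lambda returns
shape = tf.shape(scalar_loss)   # a shape tensor of length 0
shape[-1]                       # InvalidArgumentError: slice index -1 of dimension 0 out of bounds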
While searching for the problem I found https://stackoverflow.com/questions/57748812/how-to-make-custom-loss-with-extra-input-in-tensorflow-2-0, which looks a lot like my case, so I reinstalled TensorFlow: conda install tensorflow-gpu==2.0.0
Still no luck.
keras:2.3.1 bert4keras:0.8.6 tensorflow-gpu:2.0.0
苏神, please help!
It looks like you're actually using 2.0.0-beta1.
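One possible workaround for this pattern, a sketch under my own assumption about the cause rather than an official fix: have the loss Lambda return a per-sample loss of shape (batch_size,) instead of a scalar, so Keras's loss plumbing sees a rank-1 y_pred. A drop-in replacement for the inner mlm_loss in the script above:

def mlm_loss(inputs):
    """Per-sample MLM loss, shape (batch_size,); keeping the batch axis
    avoids the rank-0 output that trips squeeze_or_expand_dimensions."""
    y_true, y_pred, mask = inputs
    loss = K.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
    # reduce over the sequence axis only, not over the batch axis
    return K.sum(loss * mask, axis=1) / (K.sum(mask, axis=1) + K.epsilon())

The 'mlm_loss': lambda y_true, y_pred: y_pred entry then lets Keras average it over the batch as usual.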
bert4keras 0.7.6 tf 2.0.0 keras 2.3.1
I have a question.
In data_utils.py, the corpus is converted into the pretraining format via some_texts. As I read the logic, each document is split into sentences, which are then collected into one list, with no marker separating documents.
But in BERT's pretraining, documents are separated by blank lines, and masking and prediction happen within a single document, not across documents.
From the BERT source (create_pretraining_data, create_training_instances):
Input file format: (1) One sentence per line. These should ideally be actual sentences, not entire paragraphs or arbitrary spans of text. (Because we use the sentence boundaries for the "next sentence prediction" task). (2) Blank lines between documents. Document boundaries are needed so that the "next sentence prediction" task doesn't span between documents.
Am I misunderstanding this? Do I need to split the corpus by document before pretraining? Also, does pretraining only bring a very small improvement? A model trained from the original RoBERTa weights doesn't perform well on Weibo (short) text.
I'd like to do incremental pretraining on this text, but a Meituan article (https://tech.meituan.com/2019/11/14/nlp-bert-practice.html) reports an improvement of only 0.5%.
It seems that even pretraining can't pick up this knowledge.
Here I follow RoBERTa's training scheme: samples may cross document boundaries; just concatenate text into samples as close to length 512 as possible.
As for the gains from pretraining: some people report several points of improvement, others report none; it depends. If you have already hit the ceiling, then even a 0.5% gain is remarkable.
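To illustrate that packing scheme: a rough sketch (my illustration, not data_utils.py's actual code) of RoBERTa-style packing, where tokenized sentences from a flat stream, crossing document boundaries, are greedily concatenated into samples as close to 512 tokens as possible:

def pack_sentences(sentences, max_length=512):
    """Greedily pack tokenized sentences into samples of at most
    max_length tokens, allowing samples to cross document boundaries."""
    sample = []
    for tokens in sentences:  # tokens: the token ids of one sentence
        if sample and len(sample) + len(tokens) > max_length:
            yield sample
            sample = []
        sample.extend(tokens)  # a single over-long sentence still needs truncation
    if sample:
        yield sample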
As for the compile error above: I haven't run into it myself; I'd suggest trying tf 1.14/1.15.
OK, thanks a lot!