Closed: apple55bc closed this issue 4 years ago
Which version are you using, and what changes did you make? This looks like a problem where the model cannot be trained after being reused, but I tried both the current GitHub version and the older 0.5.7 release and could not reproduce it.
version='0.5.3'. I have merged the code into a single script; running it should reproduce the error. Every change is marked with a comment ending in <<<. There are quite a few, but in fact it is enough to replace MultiHeadAttention with a very old version, roughly from Nov-Dec 2019, and the problem disappears.
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
"""
@Author :apple.li
@Time :2020/3/6 13:27
@File :test_bert_error.py
@Desc :
"""
from bert4keras.bert import BertModel
from bert4keras.layers import *
class MultiHeadAttention(MultiHeadAttention):
def call(self, inputs, q_mask=None, v_mask=None, a_mask=None):
"""实现多头注意力
q_mask: 对输入的query序列的mask。
主要是将输出结果的padding部分置0。
v_mask: 对输入的value序列的mask。
主要是防止attention读取到padding信息。
a_mask: 对attention矩阵的mask。
不同的attention mask对应不同的应用。
"""
q, k, v = inputs[:3]
if a_mask:
if len(inputs) == 3:
a_mask = 'history_only'
else:
a_mask = inputs[3]
if isinstance(q_mask, str): # <<<
if not hasattr(self, 'q_mask_layer'):
self.q_mask_layer = search_layer(q, q_mask)
q_mask = self.q_mask_layer.output_mask
if isinstance(v_mask, str): # <<<
if not hasattr(self, 'v_mask_layer'):
self.v_mask_layer = search_layer(v, v_mask)
v_mask = self.v_mask_layer.output_mask
# Pooling
if self.pool_size > 1:
is_self_attention = (q is k is v)
q_in_len = K.shape(q)[1]
q = sequence_masking(q, q_mask, 0)
q = divisible_temporal_padding(q, self.pool_size)
q = pool1d(q, self.pool_size, self.pool_size, pool_mode='avg')
if is_self_attention:
k = v = q
else:
k = sequence_masking(k, v_mask, 0)
k = divisible_temporal_padding(k, self.pool_size)
k = pool1d(k, self.pool_size, self.pool_size, pool_mode='avg')
v = sequence_masking(v, v_mask, 0)
v = divisible_temporal_padding(v, self.pool_size)
v = pool1d(v, self.pool_size, self.pool_size, pool_mode='avg')
if v_mask is not None:
v_mask = v_mask[:, ::self.pool_size]
if a_mask is not None and not is_string(a_mask):
a_mask = a_mask[..., ::self.pool_size, ::self.pool_size]
# Linear projections
qw = self.q_dense(q)
kw = self.k_dense(k)
vw = self.v_dense(v)
# Reshape
qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
# Attention
a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
# Relative position encoding
if self.max_relative_position is not None:
q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
q_idxs = K.expand_dims(q_idxs, 1)
v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
v_idxs = K.expand_dims(v_idxs, 0)
pos_ids = v_idxs - q_idxs
pos_ids = K.clip(pos_ids, -self.max_relative_position,
self.max_relative_position)
pos_ids = pos_ids + self.max_relative_position
pos_embeddings = K.gather(self.relative_embeddings, pos_ids)
a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
# Attention (continued)
a = a / self.key_size**0.5
a = sequence_masking(a, v_mask, 1, -1)
if a_mask is not None:
if is_string(a_mask):
ones = K.ones_like(a[:1, :1])
a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12
a = a - a_mask
else:
a = a - (1 - a_mask) * 1e12
a = K.softmax(a)
# Produce the output
o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
if self.max_relative_position is not None:
o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
o = self.o_dense(o)
# Restore the sequence length
if self.pool_size > 1:
o = K.repeat_elements(o, self.pool_size, 1)[:, :q_in_len]
# Return the result
o = sequence_masking(o, q_mask, 0)
return o
class NewBert(BertModel):
def build(self,
position_ids=None,
layer_norm_cond=None,
layer_norm_cond_size=None,
layer_norm_cond_hidden_size=None,
layer_norm_cond_hidden_act=None,
additional_input_layers=None):
"""Bert模型构建函数
layer_norm_*系列参数为实现Conditional Layer Normalization时使用,
用来实现以“固定长度向量”为条件的条件Bert。
"""
# Build the input layers
x_in = Input(shape=(None,), name='Input-Token')
s_in = Input(shape=(None,), name='Input-Segment')
emb_in = Input(shape=(None, self.embedding_size), name='Input-Emb') # <<<
x, s = input_layers = [x_in, s_in]
# Conditional input
if layer_norm_cond is not None:
z = layer_norm_cond
elif layer_norm_cond_size is not None:
z = Input(shape=(layer_norm_cond_size,), name='LayerNorm-Condition')
input_layers.append(z)
else:
z = None
layer_norm_cond_hidden_act = layer_norm_cond_hidden_act or 'linear'
# Additional input layers
if additional_input_layers is not None:
if isinstance(additional_input_layers, list):
input_layers.extend(additional_input_layers)
else:
input_layers.append(additional_input_layers)
# Sequence mask
# x = ZeroMasking(name='Sequence-Mask')(x) # <<<
zero_mask = ZeroMasking(name='Sequence-Mask') # <<<
x = zero_mask(x) # <<<
self._sequence_mask = zero_mask._output_mask # <<<
# Embedding part
self._token_embedding = Embedding(input_dim=self.vocab_size,
output_dim=self.embedding_size,
embeddings_initializer=self.initializer,
name='Embedding-Token') # <<<
x = self._token_embedding(x) # <<<
self.model_emb = keras.models.Model([x_in], x) # <<<
s = Embedding(input_dim=2,
output_dim=self.embedding_size,
embeddings_initializer=self.initializer,
name='Embedding-Segment')(s)
x = Add(name='Embedding-Token-Segment')([emb_in, s]) # <<<
if self.max_relative_position is None:
x = self.filter([x, position_ids])
x = PositionEmbedding(input_dim=self.max_position_embeddings,
output_dim=self.embedding_size,
merge_mode='add',
embeddings_initializer=self.initializer,
name='Embedding-Position')(x)
x = LayerNormalization(conditional=(z is not None),
hidden_units=layer_norm_cond_hidden_size,
hidden_activation=layer_norm_cond_hidden_act,
hidden_initializer=self.initializer,
name='Embedding-Norm')(self.filter([x, z]))
if self.dropout_rate > 0:
x = Dropout(rate=self.dropout_rate, name='Embedding-Dropout')(x)
if self.embedding_size != self.hidden_size:
x = Dense(units=self.hidden_size,
kernel_initializer=self.initializer,
name='Embedding-Mapping')(x)
# Main Transformer stack
layers = None
for i in range(self.num_hidden_layers):
attention_name = 'Transformer-%d-MultiHeadSelfAttention' % (i + 1)
feed_forward_name = 'Transformer-%d-FeedForward' % (i + 1)
x, layers = self.transformer_block(
inputs=[x, z],
attention_mask=self.compute_attention_mask(i, s_in),
attention_name=attention_name,
feed_forward_name=feed_forward_name,
layer_norm_cond_hidden_size=layer_norm_cond_hidden_size,
layer_norm_cond_hidden_act=layer_norm_cond_hidden_act,
attention_pool_size=self.att_pool_size[i],
feed_forward_pool_size=self.ffn_pool_size[i],
layers=layers)
if not self.block_sharing:
layers = None
outputs = [x]
if self.with_pool or self.with_nsp:
# Pooler part (extract the CLS vector)
x = outputs[0]
x = Lambda(lambda x: x[:, 0], name='Pooler')(x)
pool_activation = 'tanh' if self.with_pool is True else self.with_pool
x = Dense(units=self.hidden_size,
activation=pool_activation,
kernel_initializer=self.initializer,
name='Pooler-Dense')(x)
if self.with_nsp:
# Next Sentence Prediction part
x = Dense(units=2,
activation='softmax',
kernel_initializer=self.initializer,
name='NSP-Proba')(x)
outputs.append(x)
if self.with_mlm:
# Masked Language Model part
x = outputs[0]
x = Dense(units=self.embedding_size,
activation=self.hidden_act,
kernel_initializer=self.initializer,
name='MLM-Dense')(x)
x = LayerNormalization(conditional=(z is not None),
hidden_units=layer_norm_cond_hidden_size,
hidden_activation=layer_norm_cond_hidden_act,
hidden_initializer=self.initializer,
name='MLM-Norm')(self.filter([x, z]))
mlm_activation = 'softmax' if self.with_mlm is True else self.with_mlm
embedding_dense = EmbeddingDense(embedding_name='Embedding-Token',
activation=mlm_activation,
name='MLM-Proba') # <<<
embedding_dense.kernel = K.transpose(self._token_embedding.embeddings) # <<<
x = embedding_dense(x) # <<<
outputs.append(x)
outputs += self.additional_outputs
if len(outputs) == 1:
outputs = outputs[0]
elif len(outputs) == 2:
outputs = outputs[1]
else:
outputs = outputs[1:]
embed = self.model_emb([x_in]) # <<<
self.model_emb_out = keras.models.Model([emb_in] + input_layers, outputs) # <<<
embed_out = self.model_emb_out([embed] + input_layers) # <<<
self.model = keras.models.Model(input_layers, embed_out) # <<<
def transformer_block(self,
inputs,
attention_mask=None,
attention_name='attention',
feed_forward_name='feed-forward',
layer_norm_cond_hidden_size=None,
layer_norm_cond_hidden_act='linear',
attention_pool_size=None,
feed_forward_pool_size=None,
layers=None):
"""构建单个Transformer Block
如果没传入layers则新建层;如果传入则重用旧层。
"""
x, z = inputs
layers = layers or [
MultiHeadAttention(heads=self.num_attention_heads,
head_size=self.attention_head_size,
kernel_initializer=self.initializer,
max_relative_position=self.max_relative_position,
pool_size=attention_pool_size,
name=attention_name),
Dropout(rate=self.dropout_rate,
name='%s-Dropout' % attention_name),
Add(name='%s-Add' % attention_name),
LayerNormalization(conditional=(z is not None),
hidden_units=layer_norm_cond_hidden_size,
hidden_activation=layer_norm_cond_hidden_act,
hidden_initializer=self.initializer,
name='%s-Norm' % attention_name),
FeedForward(units=self.intermediate_size,
groups=self.num_feed_forward_groups,
activation=self.hidden_act,
kernel_initializer=self.initializer,
pool_size=feed_forward_pool_size,
name=feed_forward_name),
Dropout(rate=self.dropout_rate,
name='%s-Dropout' % feed_forward_name),
Add(name='%s-Add' % feed_forward_name),
LayerNormalization(conditional=(z is not None),
hidden_units=layer_norm_cond_hidden_size,
hidden_activation=layer_norm_cond_hidden_act,
hidden_initializer=self.initializer,
name='%s-Norm' % feed_forward_name),
]
# Self Attention
xi, x = x, [x, x, x]
mask = self._sequence_mask # <<<
if attention_mask is None:
x = layers[0](x, q_mask=mask, v_mask=mask)
elif attention_mask == 'history_only':
x = layers[0](x, q_mask=mask, v_mask=mask, a_mask=True)
else:
x.append(attention_mask)
x = layers[0](x, q_mask=mask, v_mask=mask, a_mask=True)
if self.dropout_rate > 0:
x = layers[1](x)
x = layers[2]([xi, x])
x = layers[3](self.filter([x, z]))
# Feed Forward
xi = x
x = layers[4](x, mask=mask)
if self.dropout_rate > 0:
x = layers[5](x)
x = layers[6]([xi, x])
x = layers[7](self.filter([x, z]))
return x, layers
config = {'attention_probs_dropout_prob': 0.1, 'directionality': 'bidi', 'hidden_act': 'gelu',
'hidden_dropout_prob': 0.1, 'hidden_size': 516, 'initializer_range': 0.02, 'intermediate_size': 1024,
'max_position_embeddings': 256, 'num_attention_heads': 4, 'num_hidden_layers': 1,
'pooler_fc_size': 516, 'pooler_num_attention_heads': 6, 'pooler_num_fc_layers': 3,
'pooler_size_per_head': 128, 'pooler_type': 'first_token_transform', 'type_vocab_size': 2,
'vocab_size': 21128}
bert = NewBert(vocab_size=config['vocab_size'],
max_position_embeddings=config.get('max_position_embeddings'),
hidden_size=config['hidden_size'],
num_hidden_layers=config['num_hidden_layers'],
num_attention_heads=config['num_attention_heads'],
intermediate_size=config['intermediate_size'],
hidden_act=config['hidden_act'],
dropout_rate=config['hidden_dropout_prob'],
initializer_range=config.get('initializer_range'),
embedding_size=config.get('embedding_size'),
max_relative_position=None,
num_feed_forward_groups=config.get('num_feed_forward_groups'))
bert.build()
l_input_ids = keras.layers.Input(shape=(None,))
l_token_type_ids = keras.layers.Input(shape=(None,))
y_in = keras.Input(shape=(None,), name='Output')
emb = bert.model_emb([l_input_ids]) # the model is split so that an adversarial (Adv) loss can be applied to the embeddings; see the sketch after this script
output = bert.model_emb_out([emb, l_input_ids, l_token_type_ids])
output_id = keras.layers.Lambda(lambda x: K.argmax(x, axis=-1))(output)
model = keras.Model(inputs=[l_input_ids, l_token_type_ids], outputs=output_id)
model.summary()
model.predict([[[3,456,4]],[[0,0,0]]])
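For context on the "split for adversarial loss" comment above: separating bert.model_emb from bert.model_emb_out makes it possible to perturb the embedding output before it enters the Transformer stack. A minimal sketch of that idea, assuming an FGM-style perturbation (epsilon and the loss below are illustrative, not code from this issue, and the loss presumes a per-token probability output such as with_mlm=True):

loss = K.sparse_categorical_crossentropy(y_in, output)   # illustrative loss; assumes a probabilistic output
grad = K.gradients(loss, [emb])[0]                        # gradient of the loss w.r.t. the embedding output
epsilon = 0.5                                             # hypothetical perturbation size
adv_emb = emb + epsilon * K.l2_normalize(K.stop_gradient(grad), axis=-1)
adv_output = bert.model_emb_out([adv_emb, l_input_ids, l_token_type_ids])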
I found that the problem comes from the search_layer part. If the beginning of the call function in MultiHeadAttention is changed to:
q, k, v = inputs[:3]
idx = 3
if q_mask is True:
q_mask = inputs[idx]
idx += 1
else:
q_mask = None
if v_mask is True:
v_mask = inputs[idx]
idx += 1
else:
v_mask = None
if a_mask is True:
if len(inputs) > idx:
a_mask = inputs[idx]
else:
a_mask = 'history_only'
else:
a_mask = None
# q, k, v = inputs[:3]
# if a_mask:
# if len(inputs) == 3:
# a_mask = 'history_only'
# else:
# a_mask = inputs[3]
# if isinstance(q_mask, str):
# if not hasattr(self, 'q_mask_layer'):
# self.q_mask_layer = search_layer(q, q_mask)
# q_mask = self.q_mask_layer.output_mask
# if isinstance(v_mask, str):
# if not hasattr(self, 'v_mask_layer'):
# self.v_mask_layer = search_layer(v, v_mask)
# v_mask = self.v_mask_layer.output_mask
then the problem no longer occurs. I don't know whether the latest version still has this issue.
I think I roughly understand the cause of the bug now. The mask mechanism previously implemented via search_layer can indeed break when the model is reused; I have since decided to go back to Keras's built-in mask mechanism.
Versions 0.5.5 and later should no longer have this problem. I recommend using the latest version: its API has been revised and standardized, and it will not change for quite a long time.
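For reference, here is a minimal standalone sketch of the Keras-native mask mechanism mentioned above (plain Keras, not bert4keras internals): the mask is emitted by one layer and handed to downstream layers through call(..., mask=...), so no layer ever needs to locate another layer by name or cache a tensor from a previously built graph.

import numpy as np
from tensorflow import keras
from tensorflow.keras import backend as K

class ZeroOutPadding(keras.layers.Layer):
    """Consumes the propagated mask instead of searching for other layers by name."""
    def __init__(self, **kwargs):
        super(ZeroOutPadding, self).__init__(**kwargs)
        self.supports_masking = True                      # keep passing the mask downstream
    def call(self, inputs, mask=None):
        if mask is None:
            return inputs
        return inputs * K.expand_dims(K.cast(mask, K.floatx()), axis=-1)

tokens = keras.layers.Input(shape=(None,), dtype='int32')
x = keras.layers.Embedding(100, 8, mask_zero=True)(tokens)   # this layer emits the mask
x = ZeroOutPadding()(x)                                       # this layer receives it automatically
model = keras.Model(tokens, x)
model.predict(np.array([[3, 7, 0, 0]]))                       # padding positions come out as zeros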
OK, thanks!
After the bert model is built, it is used externally as follows:
I made some small changes to the bert source code. With this usage the model builds fine, but training raises the following error:
tensorflow.python.framework.errors_impl.InvalidArgumentError: You must feed a value for placeholder tensor 'Input-Token' with dtype float and shape [?,?] [[{{node Input-Token}}]]
After some debugging, the problem turned out to be in the MultiHeadAttention class. If it is replaced with an old version (from before December 2019), the problem does not occur. I hope this can be fixed, thanks!
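For anyone hitting the same error, the failure mode can be illustrated with a deliberately minimal toy (my reconstruction, not bert4keras code; it assumes TF1-style graph execution, e.g. the multi-backend keras 2.3 on TF 1.x used with bert4keras 0.5.x): a layer caches a tensor from the graph it was first built in, and reusing the enclosing model then drags the stale 'Input-Token' placeholder into the new graph, so predict/fit demand a value for a placeholder that is never fed.

import numpy as np
import keras   # assumption: TF1-era multi-backend Keras

class CachingLayer(keras.layers.Layer):
    """Caches a tensor from the first graph it is built in (the broken pattern)."""
    def call(self, inputs):
        if not hasattr(self, 'cached'):
            self.cached = inputs              # tied to the first model's Input-Token placeholder
        return inputs + 0 * self.cached       # on reuse, this still references the stale tensor

inner_in = keras.layers.Input(shape=(3,), name='Input-Token')
inner = keras.models.Model(inner_in, CachingLayer()(inner_in))

outer_in = keras.layers.Input(shape=(3,))
outer = keras.models.Model(outer_in, inner(outer_in))   # reuse: CachingLayer still holds inner_in
outer.predict(np.ones((1, 3)))   # InvalidArgumentError: You must feed a value for 'Input-Token'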