bringtree opened this issue 5 years ago
In char mode, `dump_token_embeddings` is the function responsible for turning characters into the token embeddings we want to use. The rough flow of the function is this:
a word is split into its characters, and each character is converted to an id (its ASCII/byte code). Inside there is a model: `BidirectionalLanguageModel` -> `BidirectionalLanguageModelGraph`. Since we are in char mode, it starts by preparing the input to the CNN:
```python
with tf.device("/cpu:0"):
    self.embedding_weights = tf.get_variable(
            "char_embed", [n_chars, char_embed_dim],
            dtype=DTYPE,
            initializer=tf.random_uniform_initializer(-1.0, 1.0)
    )
    # shape (batch_size, unroll_steps, max_chars, embed_dim)
    self.char_embedding = tf.nn.embedding_lookup(self.embedding_weights,
                                                 self.ids_placeholder)
```
`self.ids_placeholder` is looked up in `self.embedding_weights` (learned during training, with dimension `char_embed_dim`) to produce `self.char_embedding`.
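To make the shapes concrete, here is a tiny numpy sketch of that lookup (toy sizes only; the real `n_chars`, `char_embed_dim`, and the char-id scheme come from the options file, and the ids themselves are produced by the batcher):

```python
import numpy as np

# toy sizes for illustration; the published ELMo configs use
# n_characters = 262 and a char embedding dim of 16
n_chars, char_embed_dim = 262, 16
batch_size, n_tokens, max_chars = 1, 1, 50

# embedding_weights: one trainable vector per character id
embedding_weights = np.random.uniform(-1.0, 1.0, (n_chars, char_embed_dim))

# ids_placeholder holds the char ids of every token,
# shape (batch_size, n_tokens, max_chars)
char_ids = np.zeros((batch_size, n_tokens, max_chars), dtype=np.int32)

# tf.nn.embedding_lookup is just fancy indexing: the result has shape
# (batch_size, n_tokens, max_chars, char_embed_dim)
char_embedding = embedding_weights[char_ids]
print(char_embedding.shape)  # (1, 1, 50, 16)
```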
```python
# the convolutions
def make_convolutions(inp):
    with tf.variable_scope('CNN') as scope:
        convolutions = []
        for i, (width, num) in enumerate(filters):
            if cnn_options['activation'] == 'relu':
                # He initialization for ReLU activation
                # with char embeddings init between -1 and 1
                #w_init = tf.random_normal_initializer(
                #    mean=0.0,
                #    stddev=np.sqrt(2.0 / (width * char_embed_dim))
                #)

                # Kim et al 2015, +/- 0.05
                w_init = tf.random_uniform_initializer(
                    minval=-0.05, maxval=0.05)
            elif cnn_options['activation'] == 'tanh':
                # glorot init
                w_init = tf.random_normal_initializer(
                    mean=0.0,
                    stddev=np.sqrt(1.0 / (width * char_embed_dim))
                )
            w = tf.get_variable(
                "W_cnn_%s" % i,
                [1, width, char_embed_dim, num],
                initializer=w_init,
                dtype=DTYPE)
            b = tf.get_variable(
                "b_cnn_%s" % i, [num], dtype=DTYPE,
                initializer=tf.constant_initializer(0.0))

            conv = tf.nn.conv2d(
                    inp, w,
                    strides=[1, 1, 1, 1],
                    padding="VALID") + b
            # now max pool
            conv = tf.nn.max_pool(
                    conv, [1, 1, max_chars - width + 1, 1],
                    [1, 1, 1, 1], 'VALID')

            # activation
            conv = activation(conv)
            conv = tf.squeeze(conv, squeeze_dims=[2])

            convolutions.append(conv)

    return tf.concat(convolutions, 2)

embedding = make_convolutions(self.char_embedding)
```
Next comes the CNN itself. The filter widths and counts, and the activation function, are set in the options file; the stride is 1 and max pooling is used. The outputs of all the filters are concatenated into `embedding`. Right after that you will see a projection W and b:
```python
with tf.variable_scope('CNN_proj') as scope:
    W_proj_cnn = tf.get_variable(
        "W_proj", [n_filters, projection_dim],
        initializer=tf.random_normal_initializer(
            mean=0.0, stddev=np.sqrt(1.0 / n_filters)),
        dtype=DTYPE)
    b_proj_cnn = tf.get_variable(
        "b_proj", [projection_dim],
        initializer=tf.constant_initializer(0.0),
        dtype=DTYPE)
```
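For reference, all of these sizes (the `filters` list, `n_chars`, `max_chars`, `n_filters`, `projection_dim`) come from the options file. A rough sketch of a typical `char_cnn` section, loosely based on the published ELMo options and shown here only as an example:

```python
# Sketch of the char_cnn options that drive make_convolutions and CNN_proj.
# The numbers follow the published "original" ELMo config; your model may differ.
char_cnn_options = {
    "activation": "relu",
    "embedding": {"dim": 16},            # char_embed_dim
    "filters": [[1, 32], [2, 32], [3, 64], [4, 128],
                [5, 256], [6, 512], [7, 1024]],   # (width, num) pairs
    "max_characters_per_token": 50,      # max_chars
    "n_characters": 262,                 # n_chars
    "n_highway": 2,
}

# n_filters is the total number of feature maps after concatenation;
# projection_dim is the size the CNN output is projected down to for the LSTM.
n_filters = sum(num for _, num in char_cnn_options["filters"])  # 2048
projection_dim = 512                     # example value
```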
You also see a gated function: `carry_gate` controls the mix between the `transform_gate` output and the input x. Is the author about to write a recurrence? Oh, it is actually a highway network:
```python
def high(x, ww_carry, bb_carry, ww_tr, bb_tr):
    carry_gate = tf.nn.sigmoid(tf.matmul(x, ww_carry) + bb_carry)
    transform_gate = tf.nn.relu(tf.matmul(x, ww_tr) + bb_tr)
    return carry_gate * transform_gate + (1.0 - carry_gate) * x

for i in range(n_highway):
    with tf.variable_scope('CNN_high_%s' % i) as scope:
        W_carry = tf.get_variable(
            'W_carry', [highway_dim, highway_dim],
            # glorot init
            initializer=tf.random_normal_initializer(
                mean=0.0, stddev=np.sqrt(1.0 / highway_dim)),
            dtype=DTYPE)
        b_carry = tf.get_variable(
            'b_carry', [highway_dim],
            initializer=tf.constant_initializer(-2.0),
            dtype=DTYPE)
        W_transform = tf.get_variable(
            'W_transform', [highway_dim, highway_dim],
            initializer=tf.random_normal_initializer(
                mean=0.0, stddev=np.sqrt(1.0 / highway_dim)),
            dtype=DTYPE)
        b_transform = tf.get_variable(
            'b_transform', [highway_dim],
            initializer=tf.constant_initializer(0.0),
            dtype=DTYPE)

    embedding = high(embedding, W_carry, b_carry,
                     W_transform, b_transform)
```
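One detail worth noticing: `b_carry` is initialized to -2.0, so right after initialization the sigmoid gate sits near sigmoid(-2) ≈ 0.12 and each highway layer mostly passes its input through unchanged. A small numpy sketch of the same formula, with made-up dimensions:

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

highway_dim = 4                       # toy size, just for illustration
x = np.random.randn(1, highway_dim)

# freshly initialized parameters: small random weights, b_carry = -2.0
W_carry = np.random.randn(highway_dim, highway_dim) * np.sqrt(1.0 / highway_dim)
b_carry = np.full(highway_dim, -2.0)
W_transform = np.random.randn(highway_dim, highway_dim) * np.sqrt(1.0 / highway_dim)
b_transform = np.zeros(highway_dim)

carry_gate = sigmoid(x @ W_carry + b_carry)            # ~0.12 everywhere at init
transform = np.maximum(x @ W_transform + b_transform, 0.0)
y = carry_gate * transform + (1.0 - carry_gate) * x    # mostly x at the start

print(carry_gate.mean())  # close to sigmoid(-2) ≈ 0.12, i.e. near the identity map
```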
Then comes the projection:
```python
embedding = tf.matmul(embedding, W_proj_cnn) + b_proj_cnn
```
What follows is just some shape manipulation. In the end we have the word vectors that are ready to be fed into the LSTM:
```python
self.embedding = embedding
```
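The "shape manipulation" is essentially a reshape: the highway and projection matmuls work on a flattened `(batch_size * n_tokens, dim)` matrix, which is reshaped back to `(batch_size, n_tokens, projection_dim)` before going to the LSTM. A rough numpy sketch of the idea (not the exact repo code):

```python
import numpy as np

batch_size, n_tokens, projection_dim = 2, 5, 512   # example sizes

# the highway/projection matmuls operate on a flattened 2-D matrix ...
flat = np.zeros((batch_size * n_tokens, projection_dim))

# ... which is reshaped back to (batch_size, n_tokens, projection_dim)
# before being handed to the LSTM
embedding = flat.reshape(batch_size, n_tokens, projection_dim)
print(embedding.shape)  # (2, 5, 512)
```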
```python
def _build(self):
    if self.use_character_inputs:
        self._build_word_char_embeddings()
    else:
        self._build_word_embeddings()
    self._build_lstms()
```
Now the LSTM part begins. It is just a bidirectional LSTM: for the target word it takes the forward hidden states h_1 ... h_{n-1} and the backward hidden states h_1 ... h_{n-1}, plus the two final states. This part is honestly not easy to write; you can think of it as if you were training a slot-filling model. The sentence lengths are computed here, and a mask records which positions are 0 (padding) and which are non-zero (see the sketch below).
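The mask/length idea itself is simple: padded positions have id 0, so a token is real if any of its char ids is non-zero, and the sentence length is the count of such tokens. A rough numpy sketch (not the exact bilm-tf code):

```python
import numpy as np

# char ids for a batch of 2 sentences, 0 means padding;
# shape (batch_size, n_tokens, max_chars)
ids = np.zeros((2, 5, 50), dtype=np.int32)
ids[0, :3, :4] = 1    # first sentence has 3 real tokens
ids[1, :5, :4] = 1    # second sentence has 5 real tokens

mask = (ids > 0).any(axis=-1)          # (batch_size, n_tokens), True for real tokens
sequence_lengths = mask.sum(axis=1)    # array([3, 5])
print(sequence_lengths)
```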
We won't dig into that code here; grasping the overall picture is enough. Once this `_build` finishes, the initialization of `BidirectionalLanguageModelGraph` is done, and with it the initialization of `BidirectionalLanguageModel`. Remember that this whole process is what produces the `embedding_op`:
```python
embedding_op = model(ids_placeholder)['token_embeddings']

n_tokens = vocab.size
embed_dim = int(embedding_op.shape[2])

embeddings = np.zeros((n_tokens, embed_dim), dtype=DTYPE)

config = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    for k in range(n_tokens):
        token = vocab.id_to_word(k)
        char_ids = batcher.batch_sentences([[token]])[0, 1, :].reshape(
            1, 1, -1)
        embeddings[k, :] = sess.run(
            embedding_op, feed_dict={ids_placeholder: char_ids}
        )
```
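Seen from the outside, this whole walkthrough is what the public `dump_token_embeddings` helper runs (see `usage_token.py` in the repo). A minimal usage sketch; the file paths are placeholders you would replace with your own:

```python
from bilm import dump_token_embeddings

# placeholder paths, substitute your own files
vocab_file = 'vocab.txt'
options_file = 'options.json'
weight_file = 'lm_weights.hdf5'
token_embedding_file = 'elmo_token_embeddings.hdf5'

# runs the char-CNN + highway + projection described above over every word in
# the vocabulary and writes the context-independent token embeddings to HDF5
dump_token_embeddings(
    vocab_file, options_file, weight_file, token_embedding_file
)
```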
One more remark: because of the constraint at the output layer, ELMo does not perform particularly well on polysemy (words with multiple senses).
Source code:
https://github.com/allenai/bilm-tf.git