def process_encoding_input(target_data, vocab_to_int, batch_size):
    """Build the decoder input: drop the last word id of each batch row and
    prepend the '<GO>' token id to every row.

    Example (batch_size=4, '<GO>' id = 9999):
        [[11 12 13],          [[9999 11 12],
         [31 32 33],   --->    [9999 31 32],
         [51 52 53],           [9999 51 52],
         [61 62 63]]           [9999 61 62]]

    Args:
        target_data: int tensor of shape [batch_size, seq_len] with target ids.
        vocab_to_int: dict mapping tokens to ids; must contain '<GO>'.
        batch_size: number of rows in target_data.

    Returns:
        Tensor of the same shape as target_data, shifted right by one.
    """
    # end=[batch_size, -1] keeps every row but stops one column short, i.e.
    # removes the last token of each row. (Renamed from `slice`, which
    # shadowed the builtin.)
    trimmed = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    # Prepend a [batch_size, 1] column filled with the '<GO>' id so the
    # decoder always starts from the same start-of-sequence token.
    process_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), trimmed], 1)
    return process_input
def encoding_layer(rnn_size, article_length, num_layers, rnn_inputs, keep_prob):
    """Create the (stacked) bidirectional LSTM encoder.

    Args:
        rnn_size: number of units per LSTM cell.
        article_length: 1-D tensor of true sequence lengths per batch row.
        num_layers: how many bidirectional layers to stack.
        rnn_inputs: embedded inputs, presumably [batch, time, embed_dim] —
            TODO confirm against the caller.
        keep_prob: dropout input-keep probability.

    Returns:
        (enc_output, enc_state): the last layer's concatenated fw/bw outputs
        and its final (fw, bw) state tuple.
    """
    for layer in range(num_layers):
        with tf.variable_scope('encoder_{}'.format(layer)):
            # Fixed seed so the random init is reproducible across runs.
            cell_fw = tf.contrib.rnn.LSTMCell(rnn_size,
                                              initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            # Dropout on the inputs of each cell.
            cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, input_keep_prob=keep_prob)
            cell_bw = tf.contrib.rnn.LSTMCell(rnn_size,
                                              initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, input_keep_prob=keep_prob)
            enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, rnn_inputs,
                                                                    article_length, dtype=tf.float32)
            # Join forward and backward outputs along the feature axis.
            enc_output = tf.concat(enc_output, 2)
            # BUG FIX: feed this layer's output into the next layer.
            # Previously every layer re-read the original rnn_inputs, so
            # stacking num_layers > 1 had no effect beyond wasted compute.
            rnn_inputs = enc_output
    return enc_output, enc_state
def train_decoding_layer(dec_embed_input, headline_length, dec_cell, initial_state, output_layer,
                         max_headline_length):
    '''Create the training logits'''
    # NOTE(review): the body of this function is missing from this chunk —
    # as written it implicitly returns None, yet decoding_layer() uses its
    # result as `training_logits` and build_graph() later reads
    # `.rnn_output` from it. Presumably it should build a
    # tf.contrib.seq2seq.TrainingHelper + BasicDecoder and run
    # dynamic_decode — restore the body from the original source.
def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, article_length, headline_length,
                   max_headline_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers):
    """Create the decoding cell and attention for the training and inference
    decoding layers.

    Args:
        dec_embed_input: embedded decoder inputs (training).
        embeddings: embedding matrix, used by the inference decoder.
        enc_output: encoder memory for attention.
        enc_state: final encoder state (currently unused; attention starts
            from a zero state — see note below).
        vocab_size: size of the output vocabulary.
        article_length: encoder sequence lengths (attention memory lengths).
        headline_length / max_headline_length: decoder sequence lengths.
        rnn_size, vocab_to_int, keep_prob, batch_size, num_layers: as elsewhere.

    Returns:
        (training_logits, inference_logits) from the two decoding passes.
    """
    # BUG FIX: the original loop overwrote dec_cell on every iteration, so
    # only the last layer's cell was ever used. Collect the cells and stack
    # them with MultiRNNCell; a single layer keeps the original plain cell.
    cells = []
    for layer in range(num_layers):
        with tf.variable_scope('decoder_{}'.format(layer)):
            lstm = tf.contrib.rnn.LSTMCell(rnn_size,
                                           initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cells.append(tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob=keep_prob))
    dec_cell = cells[0] if num_layers == 1 else tf.contrib.rnn.MultiRNNCell(cells)
    # Output projection to vocabulary logits.
    # BUG FIX: tensorflow.python.layers.core.Dense takes a callable
    # activation, not the string 'relu' (which raises at call time) — and
    # logits passed to sequence_loss must be unactivated anyway, so the
    # projection is linear.
    output_layer = Dense(vocab_size,
                         kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))
    # Bahdanau (additive) attention over the encoder outputs.
    attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size, enc_output, article_length,
                                                     normalize=False, name='BahdanauAttention')
    dec_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell, attn_mech, rnn_size)
    # NOTE(review): the decoder starts from the wrapper's zero state rather
    # than from enc_state (the original's attempts to seed it with enc_state
    # were commented out) — confirm this is intentional.
    initial_state = dec_cell.zero_state(batch_size=batch_size, dtype=tf.float32)
    # Training pass: learns the model parameters.
    with tf.variable_scope("decode"):
        training_logits = train_decoding_layer(dec_embed_input,
                                               headline_length,
                                               dec_cell,
                                               initial_state,
                                               output_layer,
                                               max_headline_length)
    # Inference pass: reuses the same variables to generate headlines.
    with tf.variable_scope("decode", reuse=True):
        inference_logits = inference_decoding_layer(embeddings,
                                                    vocab_to_int['<GO>'],
                                                    vocab_to_int['<EOS>'],
                                                    dec_cell,
                                                    initial_state,
                                                    output_layer,
                                                    max_headline_length,
                                                    batch_size)
    return training_logits, inference_logits
def seq2seq_model(input_data, target_data, keep_prob, article_length, headline_length, max_headliney_length,
                  vocab_size, rnn_size, num_layers, vocab_to_int, batch_size, word_embedding_matrix):
    """Use the previous functions to create the training and inference logits.

    Args:
        input_data: int tensor of article word ids.
        target_data: int tensor of headline word ids.
        word_embedding_matrix: pretrained embedding matrix (fastText plus
            newly created vectors) used for both encoder and decoder lookups.
        (Remaining args are threaded through to the encoder/decoder builders.
         NOTE: `max_headliney_length` keeps its original — misspelled — name
         so keyword callers are unaffected.)

    Returns:
        (training_logits, inference_logits) from decoding_layer().
    """
    embeddings = word_embedding_matrix
    # embedding_lookup maps each word id in input_data to its vector.
    print("Geting embedding for encoder input")
    enc_embed_input = tf.nn.embedding_lookup(embeddings, input_data)
    print("Initializing encoder layers")
    enc_output, enc_state = encoding_layer(rnn_size, article_length, num_layers, enc_embed_input, keep_prob)
    print("Adding 'GO' to start text")
    dec_input = process_encoding_input(target_data, vocab_to_int, batch_size)
    # BUG FIX (log message): this lookup is for the *decoder* input; the
    # original message said "encoder".
    print("Getting embedding for decoder input")
    dec_embed_input = tf.nn.embedding_lookup(embeddings, dec_input)
    print("Getting decoding_layer logits ... ")
    # Train: learn model parameters. Inference: apply model on unseen data.
    training_logits, inference_logits = decoding_layer(dec_embed_input,
                                                       embeddings,
                                                       enc_output,
                                                       enc_state,
                                                       vocab_size,
                                                       article_length,
                                                       headline_length,
                                                       max_headliney_length,
                                                       rnn_size,
                                                       vocab_to_int,
                                                       keep_prob,
                                                       batch_size,
                                                       num_layers)
    return training_logits, inference_logits
def build_graph(vocab_to_int, word_embedding_matrix):
    """Assemble the full training graph: inputs, seq2seq model, loss, and
    the clipped-gradient Adam training op.

    NOTE(review): this body appeared at module level ending in a bare
    `return` (a SyntaxError) — the enclosing `def` line was lost when this
    file was extracted. The signature is restored from the stub that appears
    later in the file; confirm against the original source.

    Returns:
        (train_graph, train_op, cost, input_data, targets, lr, keep_prob,
         headline_length, max_headline_length, article_length)
    """
    train_graph = tf.Graph()
    # Set the graph to default to ensure that it is ready for training.
    with train_graph.as_default():
        # Load the model inputs (placeholders).
        print("Load input parameter ...")
        input_data, targets, lr, keep_prob, headline_length, max_headline_length, article_length = model_inputs()
        print("Create instance of seq2seq model parameter ...")
        # training_logits: probabilities used while fitting the model;
        # inference_logits: used when predicting a summary.
        # The input is reversed along the time axis before encoding.
        training_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]),
                                                          targets,
                                                          keep_prob,
                                                          article_length,
                                                          headline_length,
                                                          max_headline_length,
                                                          len(vocab_to_int) + 1,
                                                          rnn_size,
                                                          num_layers,
                                                          vocab_to_int,
                                                          batch_size,
                                                          word_embedding_matrix)
        # Name the tensors so they can be fetched by name after restore.
        training_logits = tf.identity(training_logits.rnn_output, 'logits')
        # sample_id is what the basic (greedy) decoder emits.
        inference_logits = tf.identity(inference_logits.sample_id, name='predictions')
        # Mask padding positions out of the loss.
        masks = tf.sequence_mask(headline_length, max_headline_length, dtype=tf.float32, name='masks')
        with tf.name_scope("optimization"):
            # Loss function.
            cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks)
            # Optimizer (learning_rate is the module-level hyperparameter).
            optimizer = tf.train.AdamOptimizer(learning_rate)
            # Gradient clipping to [-5, 5] to limit exploding gradients.
            gradients = optimizer.compute_gradients(cost)
            capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var)
                                for grad, var in gradients if grad is not None]
            train_op = optimizer.apply_gradients(capped_gradients)
    print("Graph is built.")
    return train_graph, train_op, cost, input_data, targets, lr, keep_prob, headline_length, \
        max_headline_length, article_length
import tensorflow as tf
from tensorflow.python.layers.core import Dense

import config
import vectorization
# BUG FIX: tf.version is a module in TF 1.13+ (and absent earlier), so the
# original printed a module repr (or raised); tf.__version__ is the string.
print('TensorFlow Version: {}'.format(tf.__version__))
# Getting the hyperparameters (the original bare-prose line here was a
# SyntaxError; the six assignments were fused onto one line).
epochs = config.epochs
batch_size = config.batch_size
rnn_size = config.rnn_size
num_layers = config.num_layers
learning_rate = config.learning_rate
keep_probability = config.keep_probability
def model_inputs():
    '''Create palceholders for inputs to the model'''
    # NOTE(review): body missing from this chunk — build_graph() unpacks
    # seven tensors from this call (input_data, targets, lr, keep_prob,
    # headline_length, max_headline_length, article_length), so this must
    # create those placeholders; restore the body from the original source.


# NOTE(review): the following one-line defs are extraction artifacts that
# duplicated — and, because they appeared *after* them in the file, would
# shadow — the full implementations defined above with None-returning
# stubs. They are kept here as comments so the real implementations stay
# in effect. (`sample` was dev scratch data for process_encoding_input.)
# def process_encoding_input(target_data, vocab_to_int, batch_size): ...
# sample = [[11, 12, 13], [31, 32, 33], [51, 52, 53], [61, 62, 63]]
# def encoding_layer(rnn_size, article_length, num_layers, rnn_inputs, keep_prob): ...
# def train_decoding_layer(dec_embed_input, headline_length, dec_cell, initial_state, output_layer,
#                          max_headline_length): ...
# def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, article_length,
#                    headline_length, max_headline_length, rnn_size, vocab_to_int, keep_prob,
#                    batch_size, num_layers): ...
# def seq2seq_model(input_data, target_data, keep_prob, article_length, headline_length,
#                   max_headliney_length, vocab_size, rnn_size, num_layers, vocab_to_int,
#                   batch_size, word_embedding_matrix): ...


def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, initial_state, output_layer,
                             max_headline_length, batch_size):
    '''Create the inference logits'''
    # NOTE(review): body missing from this chunk — this is the only
    # definition of inference_decoding_layer, and decoding_layer() expects
    # its result to expose `.sample_id` (see build_graph). Presumably a
    # GreedyEmbeddingHelper + BasicDecoder + dynamic_decode; restore the
    # body from the original source.
def build_graph(vocab_to_int, word_embedding_matrix):
    '''Build the graph'''
    # NOTE(review): extraction artifact. The original line after this `def`
    # was the bare prose `Build the graph` (a SyntaxError); it is now the
    # docstring. The real body of build_graph appears earlier in this file
    # as an orphaned module-level section ending in a bare `return` — this
    # later stub would shadow any implementation restored above, so the two
    # must be reconciled against the original source.
def main():
    """Entry point: vectorize the corpus, then build the training graph.

    (The original had all four statements fused onto the `def` line, which
    is a SyntaxError.)
    """
    print("Prepare input parameters ...")
    # create_input_for_graph() supplies the sorted corpus, the vocabulary
    # mapping, and the pretrained embedding matrix.
    sorted_articles, sorted_headlines, vocab_to_int, word_embedding_matrix = vectorization.create_input_for_graph()
    print("Build Graph parameters ...")
    build_graph(vocab_to_int, word_embedding_matrix)
# -------------------------main------------------------------
# Guard the entry point so importing this module (e.g. for its layer
# builders) does not trigger a full graph build.
if __name__ == '__main__':
    main()