keras-team / keras

Deep Learning for humans
http://keras.io/

How to combine LSTM layers with varying input-data shapes (word + character)? #6353

Closed. Horsmann closed this issue 3 years ago

Horsmann commented 7 years ago

Hi,

I have an NN in which I train a seq-2-seq model.

I want to combine word-level and character-level information. Currently I get an exception saying setting an array element with a sequence, which probably originates in the way I provide the word and character information.

The words are represented like this:

[
 [1 2 3 4]  # sequence A
 [3 2 1 2] # sequence B
 ...
]

For the characters I have an additional dimension, since one word consists of several characters:

[
  [
    [1 2 0] # word 1 in sequence A
    [1 2 1 ] # word 2 in sequence A
   ....
  ]
]

The modelling of the input data thus seems to be wrong. How would you correctly train two models, one on words and one on characters, and concatenate them? I attached my code as a minimal (not-)working example which demonstrates this error. The error message changes when using TensorFlow, but the error occurs with both backends.
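For illustration, a minimal sketch (with made-up numbers) of the kind of mismatch that raises this error: numpy cannot build a rectangular array when one entry of a sentence is a per-word character vector and another is a bare scalar.

import numpy as np

# word 1 padded to 3 characters, followed by a plain 0 appended as padding for a missing word
row = [[1, 2, 0], 0]

np.asarray([row], dtype='int32')
# ValueError: setting an array element with a sequence.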

Code:

import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, TimeDistributed, Bidirectional
from keras.layers import LSTM, Merge, Dropout
from keras.utils import np_utils

np.set_printoptions(threshold=np.nan)

def mapValuesCharLevel(sequences, map):

    all = []
    for s in sequences:
        sout=[]
        for w in s:
            wout=[]
            for c in w:
                wout.append(map[c])
            sout.append(wout)
        all.append(sout)
    return all

def buildIntMap(input):
    startIdx=1
    map = {}
    for s in input:
        for w in s:
            if w not in map:
               map[w]=startIdx
               startIdx+=1
    return map

def mapValues(input, map):
    out = []
    for s in input:
        out_s = []
        for w in s:
            out_s.append(map[w])
        out.append(out_s)
    return out

def padWordLevel(data, max,longest_sequence):
    aout=[]
    for s in data:
        sout = []
        for w in s:
            zeros = [0] * max
            zeros[:-len(w)] = w
            sout.append(np.array(zeros))
        while len(sout) < longest_sequence:
            sout.append(np.array(0))
        aout.append(sout)
    return np.array(aout)

#TRAIN = "en_train.txt"
#TEST = "en_test.txt"

#word_train_token, word_train_label = nnu.loadCorpus(TRAIN)
#word_test_token, word_test_label = nnu.loadCorpus(TEST)

word_train_token = word_test_token = [
    ['Great', 'Western', 'said', 'it', 'had', 'a', 'sharp', 'increase', 'in', 'margins', 'in', 'the', 'recent', 'third', 'quarter', '.'],
    ['Margins', 'are', 'the', 'difference', 'between', 'the', 'yield', 'on', 'the', 'company', 's', 'earning', 'assets', 'and', 'its', 'own', 'cost', 'of', 'funds', '.'],
    ['But', 'a', 'reduction', 'in', 'one-time', 'gains', 'on', 'the', 'sale', 'of', 'various', 'assets', 'and', 'an', 'increase', 'in', 'the', 'company', 's', 'provision', 'for ', 'loan', 'losses', 'held', 'down', 'the', 'earnings', 'gain', ', ', 'the', 'company', 'said', '.']
]

word_train_label = word_test_label = [
    ['NNP', 'NNP', 'VBD', 'PRP', 'VBD', 'DT', 'JJ', 'NN', 'IN', 'NNS', 'IN', 'DT', 'JJ', 'JJ', 'NN', '.'],
    ['NNS', 'VBP', 'DT', 'NN', 'IN', 'DT', 'NN', 'IN', 'DT', 'NN', 'POS', 'VBG', 'NNS', 'CC', 'PRP$', 'JJ', 'NN', 'IN', 'NNS', '.'],
    ['CC', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'IN', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'CC', 'DT', 'NN', 'IN', 'DT', 'NN', 'POS', 'NN', 'IN', 'NN', 'NNS', 'VBD', 'RP', 'DT', 'NNS', 'NN', ',', 'DT', 'NN', 'VBD', '.']
]

longest_sequence = max(len(s) for s in (word_train_token+word_test_token))
print("Longest seq: ", longest_sequence)

word_tokenMap = buildIntMap(word_train_token+word_test_token)
word_labelMap = buildIntMap(word_train_label+word_test_label)

word_trainTokenInt = mapValues(word_train_token, word_tokenMap)
word_trainLabelInt = mapValues(word_train_label, word_labelMap)

word_testTokenInt = mapValues(word_test_token, word_tokenMap)
word_testLabelInt = mapValues(word_test_label, word_labelMap)

word_vocabSize=len(word_tokenMap)

word_trainTokenIntPad = sequence.pad_sequences(word_trainTokenInt, maxlen=longest_sequence)
word_trainLabelIntPad = sequence.pad_sequences(word_trainLabelInt, maxlen=longest_sequence)
word_testTokenIntPad = sequence.pad_sequences(word_testTokenInt, maxlen=longest_sequence)
word_testLabelIntPad = sequence.pad_sequences(word_testLabelInt, maxlen=longest_sequence)

word_maximal_value_A = max([ys for sent in word_trainLabelInt for ys in sent])
word_maximal_value_B = max([ys for sent in word_testLabelInt for ys in sent])
word_maximal_value = max([word_maximal_value_A, word_maximal_value_B]) + 1

word_train_label = np.array([np_utils.to_categorical(seq, word_maximal_value) for seq in word_trainLabelIntPad])
word_test_label = np.array([np_utils.to_categorical(seq, word_maximal_value) for seq in word_testLabelIntPad])

longest_word = max(len(w) for s in (word_train_token+word_test_token)for w in s)
print("Longest word: ", longest_word)
char_list = list(set(''.join([w for s in word_train_token for w in s])))

char_indices = dict((c, i) for i, c in enumerate(char_list))
indices_char = dict((i, c) for i, c in enumerate(char_list))

char_trainTokenInt = mapValuesCharLevel(word_train_token, char_indices)
char_testTokenInt = mapValuesCharLevel(word_test_token, char_indices)

char_trainTokenIntPad = padWordLevel(char_trainTokenInt, longest_word, longest_sequence)
char_testTokenIntPad = padWordLevel(char_testTokenInt, longest_word, longest_sequence)

#print("Word data: ", word_trainTokenIntPad[0:2])
#print("Char data: ", char_trainTokenIntPad[0:2])

word_maximal_value_A = max([ys for sent in word_trainLabelInt for ys in sent])
word_maximal_value_B = max([ys for sent in word_testLabelInt for ys in sent])
word_maximal_value = max([word_maximal_value_A, word_maximal_value_B]) + 1

word_train_label = np.array([np_utils.to_categorical(seq, word_maximal_value) for seq in word_trainLabelIntPad])
word_test_label = np.array([np_utils.to_categorical(seq, word_maximal_value) for seq in word_testLabelIntPad])

print(word_trainTokenIntPad.shape)
print(char_trainTokenIntPad.shape)
print(word_train_label.shape)

EMBEDDING_DIM=64
LSTM_DIM=64

word = Sequential()
word.add(Embedding(input_dim=len(word_tokenMap), output_dim=64, input_length=word_trainTokenIntPad.shape[1]))
word.add(Dropout(0.2))

char = Sequential()
print("Num chars: ", len(char_list))
char.add(Embedding(input_dim=len(char_list), output_dim=64, input_length=word_trainTokenIntPad.shape[1]))
char.add(Dropout(0.2))
char.add(LSTM(LSTM_DIM, return_sequences=True))
char.add(TimeDistributed(Dense(word_maximal_value)))

merge = Sequential()
merge.add(Merge([word, char], mode="concat"))
merge.add(Bidirectional(LSTM(LSTM_DIM, return_sequences=True)))
merge.add(TimeDistributed(Dense(word_maximal_value)))
merge.add(Activation('softmax'))

# try using different optimizers and different optimizer configs
merge.compile(loss='categorical_crossentropy',
                 optimizer='sgd',
                 metrics=['accuracy'])

for i in range(0,20):
    print("===== EPOCHE "+str(i+1)+" =====")
    merge.fit([word_trainTokenIntPad,char_trainTokenIntPad], word_train_label, nb_epoch=1, shuffle=True, verbose=1)
kaya27 commented 7 years ago

Hello! Actually I'm facing the same problem. Have you solved the issue?

Horsmann commented 7 years ago

No, sorry. Still unsolved :(

zhhongzhi commented 7 years ago
def padWordLevel(data, max,longest_sequence):
    aout=[]
    for s in data:
        sout = []
        for w in s:
            zeros = [0] * max
            zeros[:-len(w)] = w
            sout.append(np.array(zeros))
        while len(sout) < longest_sequence:
            sout.append(np.array(0))
        aout.append(sout)
    return np.array(aout)

Change the line sout.append(np.array(0)) to sout.append([]); it works. Maybe you should check whether the padding works as expected. The padding result should be a (3, 33, 10) array, not a (3, 33) array whose lower-level elements mix vectors and ints.

My implementation of the char-sequence padding function:


import numpy as np
from keras.preprocessing.sequence import pad_sequences

def padWordLevel(char_TokenInt, maxlen, longest_seq):
    char_TokenIntPad = []
    for char_in_token in char_TokenInt:
        char_in_token = list(char_in_token)
        # pad the sentence with empty "words" up to the longest sentence length
        for j in range(longest_seq - len(char_in_token)):
            char_in_token.append([])
        # pad every word to the same number of characters
        char_TokenIntPad.append(pad_sequences(char_in_token, maxlen=maxlen, padding='post'))
    return np.asarray(char_TokenIntPad)

Hope it helps. And this issue may help.
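As a quick sanity check, padding a small toy batch this way (made-up character ids; longest sentence of 3 words, longest word of 4 characters) yields one homogeneous 3-D integer array instead of an object array:

import numpy as np
from keras.preprocessing.sequence import pad_sequences

# toy char-level data: 2 sentences, longest sentence = 3 words, longest word = 4 chars
sentences = [[[1, 2], [3]],
             [[2, 2, 2], [1], [4, 4, 4, 4]]]
longest_seq, longest_word = 3, 4

padded = []
for words in sentences:
    words = list(words) + [[]] * (longest_seq - len(words))  # pad missing words with empty lists
    padded.append(pad_sequences(words, maxlen=longest_word, padding='post'))
padded = np.asarray(padded)

print(padded.shape)  # (2, 3, 4)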

mrmutator commented 7 years ago

Hi, I'm also trying to find a solution for that problem. As far as I understand, the embedding layer in Keras only accepts a two-dimensional array. For the character-level embeddings above, a three-dimensional array is defined (num_sentences x num_words_per_sentence x num_characters_per_word). Therefore, I get the following error when running the code above:

ValueError: Error when checking input: expected embedding_2_input to have 2 dimensions, but got array with shape (3, 33, 10)

Is there a nice way to solve this?

Neway6655 commented 7 years ago

@mrmutator any updates?

I met the same issue: my embedding inputs have 3 dimensions, and it seems the Keras Embedding layer does not support this.

Dragon615 commented 7 years ago

Is there any update? I'm having the same issue. The Keras Embedding layer seems to accept only a 2-dimensional array!

Neway6655 commented 7 years ago

I worked around it by using tf.nn.embedding_lookup, which supports 3-dimensional inputs.

embedding = tf.Variable(tf.random_uniform((features_total_size, n_embedding), -1, 1))
deep_inputs = tf.placeholder(tf.int32, (batch_size, time_step, features), name='deep_inputs')
embed_x = tf.nn.embedding_lookup(embedding, deep_inputs)
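This works because tf.nn.embedding_lookup simply gathers a row of the embedding matrix for every integer index, whatever the rank of the index tensor, so a (batch_size, time_step, features) input comes back as (batch_size, time_step, features, n_embedding).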
RA-Danny commented 6 years ago

@Neway6655 Can you give a complete model with the shapes of your data, please? I'm struggling to combine word + character embeddings with Keras.

Many thanks :)

Neway6655 commented 6 years ago

@RA-Danny I don't know how to do this using Keras; I am using TensorFlow's embedding API directly.

saroufimc1 commented 6 years ago

@Neway6655: can you please share your complete code using TensorFlow? Thanks :)

Neway6655 commented 6 years ago

@saroufimc1 here is my code snippet; I added some comments along the code for better understanding.

brand_size = 4190     # there are 4190 brands, with ids from 1 to 4190
n_embedding = 20      # each brand is embedded into a vector of length 20
time_step = 12
features_cnt = 100    # each user has at most 100 brands in their purchase history within the 12 months
batch_size = 160

with tf.name_scope('inputs'):
    embedding = tf.Variable(tf.random_uniform((brand_size, n_embedding), -1, 1))
    deep_inputs = tf.placeholder(tf.int32, (batch_size, time_step, features_cnt), name='deep_inputs')
    print(deep_inputs.shape)            # it prints (160, 12, 100)
    embed_x = tf.nn.embedding_lookup(embedding, deep_inputs)
    print(embed_x.shape)        # it prints (160, 12, 100, 20)
    deep_reshape_inputs = tf.reshape(embed_x, [-1, time_step, features_cnt * n_embedding])

with tf.name_scope('targets'):
    deep_targets = tf.placeholder(tf.int32, (batch_size, 1) , name='deep_targets')

keep_prob = tf.placeholder(tf.float32, name='keep_prob')

with tf.name_scope("RNN_cells"):
    purchase_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell(purchase_lstm_size, keep_prob) for _ in range(num_layers)], state_is_tuple = True)

with tf.name_scope("RNN_init_state"):
    purchase_initial_state = purchase_cell.zero_state(batch_size, tf.float32)

# Run the data through the RNN layers
with tf.variable_scope("RNN_purchase_forward"):
    purchase_outputs, purchase_state = tf.nn.dynamic_rnn(purchase_cell, deep_reshape_inputs, initial_state=purchase_initial_state)
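The tf.reshape before the RNN flattens the features_cnt per-brand embeddings at each time step into a single vector of length features_cnt * n_embedding, since tf.nn.dynamic_rnn expects a rank-3 input of shape (batch, time, features).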
armandidandeh commented 6 years ago

Wondering if the problem was solved properly. On a side note, how did you decide on the architecture of the network? Are you following a specific publication/logic/heuristic?

Horsmann commented 5 years ago

I found this website here: https://guillaumegenthial.github.io/sequence-tagging-with-tensorflow.html

It uses TensorFlow 1.x rather than Keras, but the bottom line is that you run a neural network which generates LSTM outputs that you just append to the word-embedding lookup table of the actual sequence-tagging task you are trying to solve. It is essentially solved with a pre-processing step. So doing both, char and word, in a single network does seem to work with TensorFlow/Keras.

Hopefully, this helps someone..
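For reference, in later Keras versions the character-level layers can be wrapped in TimeDistributed so that the word and char inputs are combined in a single model. A rough sketch (placeholder sizes and vocabularies, not taken from this thread):

from keras.models import Model
from keras.layers import (Input, Embedding, LSTM, Bidirectional,
                          TimeDistributed, Dense, concatenate)

max_sent_len, max_word_len = 33, 10            # longest sentence / longest word, as in the toy data above
word_vocab, char_vocab, n_tags = 120, 40, 30   # placeholder sizes

word_in = Input(shape=(max_sent_len,), dtype='int32')
char_in = Input(shape=(max_sent_len, max_word_len), dtype='int32')

# word-level embeddings: (batch, max_sent_len, 64)
word_emb = Embedding(input_dim=word_vocab, output_dim=64)(word_in)

# char-level: embed each character, then run an LSTM over each word's characters;
# TimeDistributed applies the wrapped layer to every word position independently
char_emb = TimeDistributed(Embedding(input_dim=char_vocab, output_dim=32))(char_in)
char_vec = TimeDistributed(LSTM(32))(char_emb)              # (batch, max_sent_len, 32)

x = concatenate([word_emb, char_vec])                       # (batch, max_sent_len, 96)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
out = TimeDistributed(Dense(n_tags, activation='softmax'))(x)

model = Model(inputs=[word_in, char_in], outputs=out)
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
model.summary()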