strongio / keras-bert

A simple technique to integrate BERT from TF Hub into Keras

Bert Embeddings Layer for Word Sense Disambiguation task #30

Closed · elsheikh21 closed 4 years ago

elsheikh21 commented 4 years ago

I am trying to use your Keras BERT embedding layer wrapper for a Word Sense Disambiguation (WSD) task, but I get the following error every time:

Traceback
```
Traceback (most recent call last):
  File "D:/SVC/GitLab/ahmed_elsheikh_1873337_nlp19project/code/model_bert_prova.py", line 234, in <module>
    model = baseline_model(output_size, visualize=True)
  File "D:/SVC/GitLab/ahmed_elsheikh_1873337_nlp19project/code/model_bert_prova.py", line 61, in baseline_model
    )(bert_embedding)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\keras\layers\wrappers.py", line 473, in __call__
    return super(Bidirectional, self).__call__(inputs, **kwargs)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\keras\engine\base_layer.py", line 746, in __call__
    self.build(input_shapes)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\keras\layers\wrappers.py", line 612, in build
    self.forward_layer.build(input_shape)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\keras\utils\tf_utils.py", line 149, in wrapper
    output_shape = fn(instance, input_shape)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\keras\layers\recurrent.py", line 552, in build
    self.cell.build(step_input_shape)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\keras\utils\tf_utils.py", line 149, in wrapper
    output_shape = fn(instance, input_shape)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\keras\layers\recurrent.py", line 1934, in build
    constraint=self.kernel_constraint)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\keras\engine\base_layer.py", line 609, in add_weight
    aggregation=aggregation)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\training\checkpointable\base.py", line 639, in _add_variable_with_custom_getter
    **kwargs_for_getter)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\keras\engine\base_layer.py", line 1977, in make_variable
    aggregation=aggregation)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\ops\variables.py", line 183, in __call__
    return cls._variable_v1_call(*args, **kwargs)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\ops\variables.py", line 146, in _variable_v1_call
    aggregation=aggregation)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\ops\variables.py", line 125, in <lambda>
    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\ops\variable_scope.py", line 2437, in default_variable_creator
    import_scope=import_scope)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\ops\variables.py", line 187, in __call__
    return super(VariableMetaclass, cls).__call__(*args, **kwargs)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 297, in __init__
    constraint=constraint)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 409, in _init_from_args
    initial_value() if init_from_fn else initial_value,
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\keras\engine\base_layer.py", line 1959, in <lambda>
    shape, dtype=dtype, partition_info=partition_info)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\ops\init_ops.py", line 473, in __call__
    scale /= max(1., (fan_in + fan_out) / 2.)
TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'

Exception ignored in: >
Traceback (most recent call last):
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\client\session.py", line 738, in __del__
TypeError: 'NoneType' object is not callable
```
The Wrapper Layer by you guys
```python
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer


class BertEmbeddingLayer(Layer):
    '''
    Integrate BERT embeddings from TensorFlow Hub into a custom Keras layer.

    References:
        1. https://github.com/strongio/keras-bert
        2. https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1
    '''

    def __init__(self, n_fine_tune_layers=10, pooling="mean",
                 bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
                 **kwargs):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        self.pooling = pooling
        self.bert_path = bert_path
        if self.pooling not in ["first", "mean"]:
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {self.pooling})")

        super(BertEmbeddingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.bert = hub.Module(self.bert_path, trainable=self.trainable,
                               name=f"{self.name}_module")

        # Remove unused layers
        trainable_vars = self.bert.variables
        if self.pooling == "first":
            trainable_vars = [var for var in trainable_vars
                              if "/cls/" not in var.name]
            trainable_layers = ["pooler/dense"]
        elif self.pooling == "mean":
            trainable_vars = [var for var in trainable_vars
                              if "/cls/" not in var.name and "/pooler/" not in var.name]
            trainable_layers = []
        else:
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {self.pooling})")

        # Select how many layers to fine tune
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append(f"encoder/layer_{str(11 - i)}")

        # Update trainable vars to contain only the specified layers
        trainable_vars = [
            var for var in trainable_vars
            if any([l in var.name for l in trainable_layers])
        ]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertEmbeddingLayer, self).build(input_shape)

    def call(self, inputs):
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(input_ids=input_ids, input_mask=input_mask,
                           segment_ids=segment_ids)

        if self.pooling == "first":
            pooled = self.bert(inputs=bert_inputs, signature="tokens",
                               as_dict=True)["pooled_output"]
        elif self.pooling == "mean":
            result = self.bert(inputs=bert_inputs, signature="tokens",
                               as_dict=True)["sequence_output"]

            def mul_mask(x, m):
                return x * tf.expand_dims(m, axis=-1)

            def masked_reduce_mean(x, m):
                return tf.reduce_sum(mul_mask(x, m), axis=1) / (
                    tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)

            input_mask = tf.cast(input_mask, tf.float32)
            pooled = masked_reduce_mean(result, input_mask)
        else:
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {self.pooling})")

        return pooled

    def compute_output_shape(self, input_shape):
        return input_shape[0][0], input_shape[0][1], self.output_size
```
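To see what the downstream BiLSTM actually receives, this is a small probe I would run right after defining the layer (hypothetical snippet, the variable names are mine):

```python
# Hypothetical probe: build the three int32 inputs and print the static shape
# of the BertEmbeddingLayer output before attaching the BiLSTM. With
# pooling="mean" the layer returns one pooled 768-dim vector per sentence
# (the sequence axis is reduced away), not one vector per token.
from tensorflow.keras.layers import Input

max_seq_len = 64
in_id = Input(shape=(max_seq_len,), name="input_ids")
in_mask = Input(shape=(max_seq_len,), name="input_masks")
in_segment = Input(shape=(max_seq_len,), name="segment_ids")

bert_embedding = BertEmbeddingLayer(pooling="mean")([in_id, in_mask, in_segment])
print(bert_embedding.shape)
```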
My BiLSTM Model
```python
import os
import yaml
import numpy as np
from argparse import ArgumentParser

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.layers import (LSTM, Add, Bidirectional, Dense, Input,
                                     TimeDistributed, Embedding)
from tensorflow.keras.preprocessing.sequence import pad_sequences

try:
    from bert.tokenization import FullTokenizer
except ModuleNotFoundError:
    os.system('pip install bert-tensorflow')

from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tqdm import tqdm

from keras_bert import BertEmbeddingLayer
from model_utils import visualize_plot_mdl
from parsing_dataset import load_dataset
from utilities import configure_tf, initialize_logger


def parse_args():
    parser = ArgumentParser(description="WSD")
    parser.add_argument("--model_type", default='baseline', type=str,
                        help="""Choose the model:
                                baseline: BiLSTM Model.
                                attention: Attention Stacked BiLSTM Model.
                                seq2seq: Seq2Seq Attention.""")
    return vars(parser.parse_args())


def train_model(mdl, data, epochs=1, batch_size=32):
    [train_input_ids, train_input_masks, train_segment_ids], train_labels = data
    history = mdl.fit([train_input_ids, train_input_masks, train_segment_ids],
                      train_labels, epochs=epochs, batch_size=batch_size)
    return history


def baseline_model(output_size):
    hidden_size = 128
    max_seq_len = 64

    in_id = Input(shape=(max_seq_len,), name="input_ids")
    in_mask = Input(shape=(max_seq_len,), name="input_masks")
    in_segment = Input(shape=(max_seq_len,), name="segment_ids")
    bert_inputs = [in_id, in_mask, in_segment]

    bert_embedding = BertEmbeddingLayer()(bert_inputs)
    embedding_size = 768

    bilstm = Bidirectional(LSTM(hidden_size, return_sequences=True,
                                input_shape=(None, None, embedding_size)),
                           merge_mode='sum')(bert_embedding)

    output = TimeDistributed(Dense(output_size, activation="softmax"))(bilstm)

    mdl = Model(inputs=bert_inputs, outputs=output, name="Bert_BiLSTM")
    mdl.compile(loss="sparse_categorical_crossentropy",
                optimizer='adadelta', metrics=["acc"])
    return mdl


def initialize_vars(sess):
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    K.set_session(sess)


class PaddingInputExample(object):
    """Fake example so the num input examples is a multiple of the batch size.

    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it
    means the entire output data won't be generated.

    We use this class instead of `None` because treating `None` as padding
    batches could cause silent errors.
    """


class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs an InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The un-tokenized text of the first sequence.
                For single sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The un-tokenized text of the second
                sequence. Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
                specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


def create_tokenizer_from_hub_module(bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"):
    """Get the vocab file and casing info from the Hub module."""
    bert_module = hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    vocab_file, do_lower_case = sess.run(
        [
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ]
    )
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)


def convert_single_example(tokenizer, example, max_seq_length=256):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    if isinstance(example, PaddingInputExample):
        input_ids = [0] * max_seq_length
        input_mask = [0] * max_seq_length
        segment_ids = [0] * max_seq_length
        label = [0] * max_seq_length
        return input_ids, input_mask, segment_ids, label

    tokens_a = tokenizer.tokenize(example.text_a)
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0: (max_seq_length - 2)]

    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    example.label.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)
    example.label.append(0)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        example.label.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label


def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
    """Convert a set of `InputExample`s to a list of `InputFeatures`."""
    input_ids, input_masks, segment_ids, labels = [], [], [], []
    for example in tqdm(examples, desc="Converting examples to features"):
        input_id, input_mask, segment_id, label = convert_single_example(
            tokenizer, example, max_seq_length)
        input_ids.append(np.array(input_id))
        input_masks.append(np.array(input_mask))
        segment_ids.append(np.array(segment_id))
        labels.append(np.array(label))

    return (np.array(input_ids), np.array(input_masks), np.array(segment_ids),
            np.array(labels).reshape(-1, 1))


def convert_text_to_examples(texts, labels):
    """Create InputExamples"""
    InputExamples = []
    for text, label in zip(texts, labels):
        InputExamples.append(
            InputExample(guid=None, text_a=" ".join(text), text_b=None, label=label)
        )
    return InputExamples


# Initialize session
sess = tf.Session()

params = parse_args()
initialize_logger()
configure_tf()

# Load our config file
config_file_path = os.path.join(os.getcwd(), "config.yaml")
config_file = open(config_file_path)
config_params = yaml.load(config_file)

# This parameter allows train_x to stay in the form of words, so your keras-elmo layer can be used
elmo = config_params["use_elmo"]

dataset = load_dataset(elmo=elmo)
vocabulary_size = dataset.get("vocabulary_size")
output_size = dataset.get("output_size")

# Parse data in BERT format
max_seq_length = 64
train_x = dataset.get("train_x")
train_text = [' '.join(x) for x in train_x]
train_text = [' '.join(t.split()[0:max_seq_length]) for t in train_text]
train_text = np.array(train_text, dtype=object)[:, np.newaxis]
# print(train_text.shape)  # (37184, 1)
train_labels = dataset.get("train_y")

# Instantiate tokenizer
tokenizer = create_tokenizer_from_hub_module()

# Convert data to InputExample format
train_examples = convert_text_to_examples(train_text, train_labels)

# Extract features
(train_input_ids, train_input_masks,
 train_segment_ids, train_labels) = convert_examples_to_features(
    tokenizer, train_examples, max_seq_length=max_seq_length)

bert_inputs = [train_input_ids, train_input_masks, train_segment_ids]
data = bert_inputs, train_labels
del dataset

model = baseline_model(output_size)

# Instantiate variables
initialize_vars(sess)

history = train_model(model, data)
```
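Since the model ends in `TimeDistributed(Dense(output_size, activation="softmax"))` trained with `sparse_categorical_crossentropy`, it needs one integer label per token, so I also check the array shapes right before fitting (again, just a debugging sketch, not part of the original script):

```python
# Debugging sketch: the feature arrays should match the model inputs, i.e.
# (num_examples, 64) each, and for a per-token softmax trained with
# sparse_categorical_crossentropy the labels should be (num_examples, 64)
# or (num_examples, 64, 1). Note that convert_examples_to_features reshapes
# the labels to (-1, 1).
print(train_input_ids.shape, train_input_masks.shape, train_segment_ids.shape)
print(train_labels.shape)
```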

Can you please let me know how to solve it?