kpe / bert-for-tf2

A Keras TensorFlow 2.0 implementation of BERT, ALBERT and adapter-BERT.
https://github.com/kpe/bert-for-tf2
MIT License

maybe there is some problem working with tf hub #89

Open Kiris-tingna opened 3 years ago

Kiris-tingna commented 3 years ago

hi, I am using this script to generate an ALBERT saved model that is compatible with TF Serving.

Since I generated the model, when the input is { "instances": [ {"inputs": ["你好么"]} ] }, the output result does not seem right; what I actually want is the ALBERT output embedding vector.

{ "predictions": [ [ 101, 872, 1962, 720, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] ] }

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import bert, os

from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

model_name = "albert_base"
model_dir = bert.fetch_brightmart_albert_model(model_name, ".models")
model_ckpt = os.path.join(model_dir, "albert_model.ckpt")

bert_params = bert.params_from_pretrained_ckpt(model_dir)
l_bert = bert.BertModelLayer.from_params(bert_params, name="bert")
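# (note: BertModelLayer consumes integer token ids of shape [batch_size, max_seq_len]
#  and outputs embeddings of shape [batch_size, max_seq_len, hidden_size])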

class BertTokenizerLayer(tf.keras.layers.Layer):
    def __init__(self, vocab_file_path, sequence_length=128, lower_case=True):
        super(BertTokenizerLayer, self).__init__()
        self.CLS_ID = tf.constant(101, dtype=tf.int64)
        self.SEP_ID = tf.constant(102, dtype=tf.int64)
        self.PAD_ID = tf.constant(0, dtype=tf.int64)
        self.sequence_length = tf.constant(sequence_length)
        vocab = self.load_vocab(vocab_file_path)
        # These two lines are basically what makes it work:
        # assigning the vocab to a tf.Module and then later assigning the
        # instantiated Module to e.g. a Keras Model
        self.create_vocab_table(vocab)
        self.bert_tokenizer = text.BertTokenizer(
            vocab_lookup_table=self.vocab_table,
            token_out_type=tf.int64,
            lower_case=lower_case,
        )

    def load_vocab(self, vocab_file):
        """Loads a vocabulary file into a list."""
        vocab = []
        with tf.io.gfile.GFile(vocab_file, "r") as reader:
            while True:
                token = reader.readline()
                if not token:
                    break
                token = token.strip()
                vocab.append(token)
        return vocab

    def create_vocab_table(self, vocab, num_oov=1):
        vocab_values = tf.range(tf.size(vocab, out_type=tf.int64), dtype=tf.int64)
        self.init = tf.lookup.KeyValueTensorInitializer(
            keys=vocab, values=vocab_values, key_dtype=tf.string, value_dtype=tf.int64
        )
        self.vocab_table = tf.lookup.StaticVocabularyTable(
            self.init, num_oov, lookup_key_dtype=tf.string
        )

    @tf.function
    def call(self, inputs: tf.Tensor) -> tf.Tensor:
        """
        Perform the BERT preprocessing: text -> input token ids
        """
        # Convert text into token ids
        tokens = self.bert_tokenizer.tokenize(inputs)

        # Flatten the ragged tensors
        tokens = tokens.merge_dims(1, 2)

        # Add start and end token ids to the id sequence
        start_tokens = tf.fill([tf.shape(inputs)[0], 1], self.CLS_ID)
        end_tokens = tf.fill([tf.shape(inputs)[0], 1], self.SEP_ID)
        tokens = tf.concat([start_tokens, tokens, end_tokens], axis=1)

        # Truncate to sequence length
        tokens = tokens[:, : self.sequence_length]

        # Convert ragged tensor to tensor and pad with PAD_ID
        tokens = tokens.to_tensor(default_value=self.PAD_ID)

        # Pad to sequence length
        pad = self.sequence_length - tf.shape(tokens)[1]
        tokens = tf.pad(tokens, [[0, 0], [0, pad]], constant_values=self.PAD_ID)

        return tf.reshape(tokens, [-1, self.sequence_length])
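
# (hypothetical sanity check, assuming eager execution were enabled: the layer
#  should map a batch of strings to int64 ids of shape [batch_size, sequence_length])
# tok = BertTokenizerLayer(vocab_file_path=os.path.join(model_dir, "vocab.txt"))
# print(tok(tf.constant(["你好么"])))   # -> (1, 128): [101, 872, 1962, 720, 102, 0, ...]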

# text_input = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
# tokenizerd = BertTokenizerLayer(vocab_file_path=os.path.join(model_dir, "vocab.txt"))
# input_tokens = tokenizerd(text_input)
# embed_output = l_bert(input_tokens)    # output: [batch_size, max_seq_len, hidden_size]
# model = tf.keras.Model(inputs=text_input, outputs=embed_output)
# model.save("./models/albert-zh/1", signatures=tokenizerd.call.get_concrete_function(tf.TensorSpec([], tf.string)))

model = tf.keras.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    l_bert
])
# note: the tokenizer here is only attached as a Python attribute, not wired into
# the model graph, and the exported signature is the tokenizer's call alone, so
# TF Serving runs just the tokenizer and returns token ids instead of embeddings
model.tokenizer = BertTokenizerLayer(vocab_file_path=os.path.join(model_dir, "vocab.txt"))
model.save("./models/albert-zh/1", signatures=model.tokenizer.call.get_concrete_function(tf.TensorSpec(None, tf.string)))