keras-team / keras-core

A multi-backend implementation of the Keras API, with support for TensorFlow, JAX, and PyTorch.
Apache License 2.0

bert-mlm port to keras-core with tf backend #843

Closed Mrutyunjay01 closed 1 year ago

Mrutyunjay01 commented 1 year ago

As mentioned in #837, this PR provides a fix for the `Constant` initializer. I believe there is no need to use `ops.cast()`, since the initializer already applies the standard dtype in `call()`.
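
For context, here is a minimal sketch of the pattern the port relies on: passing the precomputed positional-encoding matrix to `Constant` as `embeddings_initializer` instead of the legacy `weights=` argument. This assumes `Constant` accepts an array-valued `value` (the behavior discussed in #837); the sizes and names below are illustrative, and the real call site is in the diff further down.

```python
import numpy as np
from keras_core import layers
from keras_core.initializers import Constant

MAX_LEN, EMBED_DIM = 256, 128  # illustrative sizes, matching Config in the diff

# Precomputed (MAX_LEN, EMBED_DIM) matrix, e.g. sinusoidal position encodings.
pos_matrix = np.random.rand(MAX_LEN, EMBED_DIM).astype("float32")

# Previously: layers.Embedding(..., weights=[pos_matrix], ...)
# With keras-core, the matrix goes through the Constant initializer instead.
position_embedding = layers.Embedding(
    input_dim=MAX_LEN,
    output_dim=EMBED_DIM,
    embeddings_initializer=Constant(pos_matrix),
    name="position_embedding",
)

positions = position_embedding(np.arange(MAX_LEN))
print(positions.shape)  # (256, 128)
```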

The issue was found while porting the End-to-End Masked Language Modeling with BERT example to keras-core; the changes are shown in the diff below.

Kindly take a look at the PR and suggest changes where necessary.

Here's the diff:

diff --git a/./keras-io/examples/nlp/masked_language_modeling.py b/./keras-core/examples/keras_io/tensorflow/nlp/end_to_end_mlm_with_bert.py
index 7c6ed30..02990ca 100644
--- a/./keras-io/examples/nlp/masked_language_modeling.py
+++ b/./keras-core/examples/keras_io/tensorflow/nlp/end_to_end_mlm_with_bert.py
@@ -1,8 +1,9 @@
 """
 Title: End-to-end Masked Language Modeling with BERT
 Author: [Ankur Singh](https://twitter.com/ankur310794)
+Converted to Keras-Core: [Mrutyunjay Biswal](https://twitter.com/LearnStochastic)
 Date created: 2020/09/18
-Last modified: 2020/09/18
+Last modified: 2023/09/06
 Description: Implement a Masked Language Model (MLM) with BERT and fine-tune it on the IMDB Reviews dataset.
 Accelerator: GPU
 """
@@ -30,34 +31,35 @@ This example teaches you how to build a BERT model from scratch,
 train it with the masked language modeling task,
 and then fine-tune this model on a sentiment classification task.

-We will use the Keras `TextVectorization` and `MultiHeadAttention` layers
+We will use the Keras-Core `TextVectorization` and `MultiHeadAttention` layers
 to create a BERT Transformer-Encoder network architecture.

-Note: This example should be run with `tf-nightly`.
+Note: This example is compatible only with the TensorFlow backend.
 """

 """
 ## Setup
-
-Install `tf-nightly` via `pip install tf-nightly`.
 """

-import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
-from tensorflow.keras.layers import TextVectorization
-from dataclasses import dataclass
-import pandas as pd
-import numpy as np
-import glob
+import os
 import re
-from pprint import pprint
+import glob
+import numpy as np
+import pandas as pd
+from pathlib import Path
+from dataclasses import dataclass
+
+import tensorflow as tf
+import keras_core as keras
+from keras_core import layers
+from keras_core.models import Model
+from keras_core.initializers import Constant
+from keras_core.layers import TextVectorization

 """
-## Set-up Configuration
+## Configuration
 """

-
 @dataclass
 class Config:
     MAX_LEN = 256
@@ -68,21 +70,45 @@ class Config:
     NUM_HEAD = 8  # used in bert model
     FF_DIM = 128  # used in bert model
     NUM_LAYERS = 1
+    NUM_EPOCHS = 1
+    STEPS_PER_EPOCH = 2

 config = Config()

 """
-## Load the data
+## Download the Data: IMDB Movie Review Sentiment Classification
+
+Download the IMDB data and load into a Pandas DataFrame.
+"""
+
+fpath = keras.utils.get_file(
+    origin="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
+)
+dirpath = Path(fpath).parent.absolute()
+os.system(f"tar -xf {fpath} -C {dirpath}")

-We will first download the IMDB data and load into a Pandas dataframe.
 """
+The `aclImdb` folder contains a `train` and `test` subfolder:
+"""
+
+os.system(f"ls {dirpath}/aclImdb")
+os.system(f"ls {dirpath}/aclImdb/train")
+os.system(f"ls {dirpath}/aclImdb/test")

-"""shell
-curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
-tar -xf aclImdb_v1.tar.gz
 """
+We are only interested in the `pos` and `neg` subfolders, so let's delete the rest:
+"""
+
+os.system(f"rm -r {dirpath}/aclImdb/train/unsup")
+os.system(f"rm -r {dirpath}/aclImdb/train/*.feat")
+os.system(f"rm -r {dirpath}/aclImdb/train/*.txt")
+os.system(f"rm -r {dirpath}/aclImdb/test/*.feat")
+os.system(f"rm -r {dirpath}/aclImdb/test/*.txt")

+"""
+Let's read the dataset from the text files to a DataFrame.
+"""

 def get_text_list_from_files(files):
     text_list = []
@@ -94,9 +120,10 @@ def get_text_list_from_files(files):

 def get_data_from_text_files(folder_name):
-    pos_files = glob.glob("aclImdb/" + folder_name + "/pos/*.txt")
+
+    pos_files = glob.glob(f"{dirpath}/aclImdb/" + folder_name + "/pos/*.txt")
     pos_texts = get_text_list_from_files(pos_files)
-    neg_files = glob.glob("aclImdb/" + folder_name + "/neg/*.txt")
+    neg_files = glob.glob(f"{dirpath}/aclImdb/" + folder_name + "/neg/*.txt")
     neg_texts = get_text_list_from_files(neg_files)
     df = pd.DataFrame(
         {
@@ -107,11 +134,11 @@ def get_data_from_text_files(folder_name):
     df = df.sample(len(df)).reset_index(drop=True)
     return df

-
 train_df = get_data_from_text_files("train")
 test_df = get_data_from_text_files("test")

-all_data = train_df.append(test_df)
+all_data = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)
+assert len(all_data) != 0, f'{all_data} is empty'

 """
 ## Dataset preparation
@@ -125,11 +152,9 @@ Below, we define 3 preprocessing functions.

 1.  The `get_vectorize_layer` function builds the `TextVectorization` layer.
 2.  The `encode` function encodes raw text into integer token ids.
-3.  The `get_masked_input_and_labels` function will mask input token ids.
-It masks 15% of all input tokens in each sequence at random.
+3.  The `get_masked_input_and_labels` function will mask input token ids. It masks 15% of all input tokens in each sequence at random.
 """

-
 def custom_standardization(input_data):
     lowercase = tf.strings.lower(input_data)
     stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
@@ -248,6 +273,7 @@ mlm_ds = tf.data.Dataset.from_tensor_slices(
 )
 mlm_ds = mlm_ds.shuffle(1000).batch(config.BATCH_SIZE)

+
 """
 ## Create BERT model (Pretraining Model) for masked language modeling

@@ -257,19 +283,18 @@ It will take token ids as inputs (including masked tokens)
 and it will predict the correct ids for the masked input tokens.
 """

-
-def bert_module(query, key, value, i):
+def bert_module(query, key, value, layer_num):
     # Multi headed self-attention
     attention_output = layers.MultiHeadAttention(
         num_heads=config.NUM_HEAD,
         key_dim=config.EMBED_DIM // config.NUM_HEAD,
-        name="encoder_{}/multiheadattention".format(i),
+        name=f"encoder_{layer_num}_multiheadattention",
     )(query, key, value)
-    attention_output = layers.Dropout(0.1, name="encoder_{}/att_dropout".format(i))(
+    attention_output = layers.Dropout(0.1, name=f"encoder_{layer_num}_att_dropout")(
         attention_output
     )
     attention_output = layers.LayerNormalization(
-        epsilon=1e-6, name="encoder_{}/att_layernormalization".format(i)
+        epsilon=1e-6, name=f"encoder_{layer_num}_att_layernormalization"
     )(query + attention_output)

     # Feed-forward layer
@@ -278,14 +303,14 @@ def bert_module(query, key, value, i):
             layers.Dense(config.FF_DIM, activation="relu"),
             layers.Dense(config.EMBED_DIM),
         ],
-        name="encoder_{}/ffn".format(i),
+        name=f"encoder_{layer_num}_ffn",
     )
     ffn_output = ffn(attention_output)
-    ffn_output = layers.Dropout(0.1, name="encoder_{}/ffn_dropout".format(i))(
+    ffn_output = layers.Dropout(0.1, name=f"encoder_{layer_num}_ffn_dropout")(
         ffn_output
     )
     sequence_output = layers.LayerNormalization(
-        epsilon=1e-6, name="encoder_{}/ffn_layernormalization".format(i)
+        epsilon=1e-6, name=f"encoder_{layer_num}_ffn_layernormalization"
     )(attention_output + ffn_output)
     return sequence_output

@@ -305,12 +330,12 @@ def get_pos_encoding_matrix(max_len, d_emb):

 loss_fn = keras.losses.SparseCategoricalCrossentropy(
-    reduction=tf.keras.losses.Reduction.NONE
+    reduction=None
 )
-loss_tracker = tf.keras.metrics.Mean(name="loss")
+loss_tracker = keras.metrics.Mean(name="loss")

-class MaskedLanguageModel(tf.keras.Model):
+class MaskedLanguageModel(Model):
     def train_step(self, inputs):
         if len(inputs) == 3:
             features, labels, sample_weight = inputs
@@ -351,12 +376,14 @@ def create_masked_language_bert_model():
     word_embeddings = layers.Embedding(
         config.VOCAB_SIZE, config.EMBED_DIM, name="word_embedding"
     )(inputs)
+    
     position_embeddings = layers.Embedding(
         input_dim=config.MAX_LEN,
         output_dim=config.EMBED_DIM,
-        weights=[get_pos_encoding_matrix(config.MAX_LEN, config.EMBED_DIM)],
+        embeddings_initializer=Constant(get_pos_encoding_matrix(config.MAX_LEN, config.EMBED_DIM)),
         name="position_embedding",
     )(tf.range(start=0, limit=config.MAX_LEN, delta=1))
+    
     embeddings = word_embeddings + position_embeddings

     encoder_output = embeddings
@@ -372,49 +399,6 @@ def create_masked_language_bert_model():
     mlm_model.compile(optimizer=optimizer)
     return mlm_model

-
-id2token = dict(enumerate(vectorize_layer.get_vocabulary()))
-token2id = {y: x for x, y in id2token.items()}
-
-
-class MaskedTextGenerator(keras.callbacks.Callback):
-    def __init__(self, sample_tokens, top_k=5):
-        self.sample_tokens = sample_tokens
-        self.k = top_k
-
-    def decode(self, tokens):
-        return " ".join([id2token[t] for t in tokens if t != 0])
-
-    def convert_ids_to_tokens(self, id):
-        return id2token[id]
-
-    def on_epoch_end(self, epoch, logs=None):
-        prediction = self.model.predict(self.sample_tokens)
-
-        masked_index = np.where(self.sample_tokens == mask_token_id)
-        masked_index = masked_index[1]
-        mask_prediction = prediction[0][masked_index]
-
-        top_indices = mask_prediction[0].argsort()[-self.k :][::-1]
-        values = mask_prediction[0][top_indices]
-
-        for i in range(len(top_indices)):
-            p = top_indices[i]
-            v = values[i]
-            tokens = np.copy(sample_tokens[0])
-            tokens[masked_index[0]] = p
-            result = {
-                "input_text": self.decode(sample_tokens[0].numpy()),
-                "prediction": self.decode(tokens),
-                "probability": v,
-                "predicted mask token": self.convert_ids_to_tokens(p),
-            }
-            pprint(result)
-
-
-sample_tokens = vectorize_layer(["I have watched this [mask] and it was awesome"])
-generator_callback = MaskedTextGenerator(sample_tokens.numpy())
-
 bert_masked_model = create_masked_language_bert_model()
 bert_masked_model.summary()

@@ -422,8 +406,8 @@ bert_masked_model.summary()
 ## Train and Save
 """

-bert_masked_model.fit(mlm_ds, epochs=5, callbacks=[generator_callback])
-bert_masked_model.save("bert_mlm_imdb.h5")
+bert_masked_model.fit(mlm_ds, epochs=Config.NUM_EPOCHS, steps_per_epoch=Config.STEPS_PER_EPOCH)
+bert_masked_model.save("bert_mlm_imdb.keras")

 """
 ## Fine-tune a sentiment classification model
@@ -431,15 +415,14 @@ bert_masked_model.save("bert_mlm_imdb.h5")
 We will fine-tune our self-supervised model on a downstream task of sentiment classification.
 To do this, let's create a classifier by adding a pooling layer and a `Dense` layer on top of the
 pretrained BERT features.
-
 """

 # Load pretrained bert model
 mlm_model = keras.models.load_model(
-    "bert_mlm_imdb.h5", custom_objects={"MaskedLanguageModel": MaskedLanguageModel}
+    "bert_mlm_imdb.keras", custom_objects={"MaskedLanguageModel": MaskedLanguageModel}
 )
-pretrained_bert_model = tf.keras.Model(
-    mlm_model.input, mlm_model.get_layer("encoder_0/ffn_layernormalization").output
+pretrained_bert_model = Model(
+    mlm_model.input, mlm_model.get_layer("encoder_0_ffn_layernormalization").output
 )

 # Freeze it
@@ -466,7 +449,8 @@ classifer_model.summary()
 # Train the classifier with frozen BERT stage
 classifer_model.fit(
     train_classifier_ds,
-    epochs=5,
+    epochs=Config.NUM_EPOCHS,
+    steps_per_epoch=Config.STEPS_PER_EPOCH,
     validation_data=test_classifier_ds,
 )

@@ -478,7 +462,8 @@ classifer_model.compile(
 )
 classifer_model.fit(
     train_classifier_ds,
-    epochs=5,
+    epochs=Config.NUM_EPOCHS,
+    steps_per_epoch=Config.STEPS_PER_EPOCH,
     validation_data=test_classifier_ds,
 )

@@ -492,7 +477,6 @@ the `TextVectorization` layer, and let's evaluate. Our model will accept raw str
 as input.
 """

-
 def get_end_to_end(model):
     inputs_string = keras.Input(shape=(1,), dtype="string")
     indices = vectorize_layer(inputs_string)
@@ -506,4 +490,4 @@ def get_end_to_end(model):

 end_to_end_classification_model = get_end_to_end(classifer_model)
-end_to_end_classification_model.evaluate(test_raw_classifier_ds)
+end_to_end_classification_model.evaluate(test_raw_classifier_ds)
\ No newline at end of file
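
One more change worth calling out from the diff above: the pretraining model is saved in the native `.keras` format instead of HDF5 and reloaded with `custom_objects`, so the `MaskedLanguageModel` subclass resolves during deserialization. Below is a self-contained sketch of that pattern; the tiny model and class name are stand-ins, not the example's real BERT model.

```python
import numpy as np
import keras_core as keras


class TinyCustomModel(keras.Model):
    """Stand-in for MaskedLanguageModel, just to show the custom_objects mapping."""


inputs = keras.Input(shape=(4,))
outputs = keras.layers.Dense(2)(inputs)
model = TinyCustomModel(inputs, outputs)

# Save in the native .keras format (replaces the legacy .h5 file).
model.save("tiny_custom_model.keras")

# Reload, mapping the custom subclass name so deserialization succeeds.
restored = keras.models.load_model(
    "tiny_custom_model.keras",
    custom_objects={"TinyCustomModel": TinyCustomModel},
)
print(restored.predict(np.zeros((1, 4))).shape)  # (1, 2)
```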

cc: @fchollet

google-cla[bot] commented 1 year ago

Thanks for your pull request! It looks like this may be your first contribution to a Google open source project. Before we can look at your pull request, you'll need to sign a Contributor License Agreement (CLA).

View this failed invocation of the CLA check for more information.

For the most up to date status, view the checks section at the bottom of the pull request.

Mrutyunjay01 commented 1 year ago

Addressed the review comments in the latest commit, please take a look.

@fchollet

codecov[bot] commented 1 year ago

Codecov Report

:exclamation: No coverage uploaded for pull request base (main@f0f8f1c). Patch has no changes to coverable lines.

Additional details and impacted files

```diff
@@           Coverage Diff           @@
##             main     #843   +/-   ##
=======================================
  Coverage        ?   70.90%
=======================================
  Files           ?      344
  Lines           ?    33300
  Branches        ?     6409
=======================================
  Hits            ?    23612
  Misses          ?     8093
  Partials        ?     1595
```

:umbrella: View full report in Codecov by Sentry.
:loudspeaker: Have feedback on the report? Share it here.

Mrutyunjay01 commented 1 year ago

Fixed the lint-check failures from the previous build.

@fchollet