This may make a suitable base model, but may need further preprocessing:
```python
import tensorflow as tf
from minbpe import GPT4Tokenizer
from keras_nlp.models import GPT2Preprocessor  # for comparison; not used below


class TextEncoderLayer(tf.keras.layers.Layer):
    """Tokenizes a batch of strings with minbpe's GPT4Tokenizer and pads
    the token ids to a fixed sequence length."""

    def __init__(self, sequence_length=100):
        super().__init__()
        self.tokenizer = GPT4Tokenizer()
        self.sequence_length = sequence_length

    def call(self, text):
        def _encode(batch):
            # Runs eagerly inside tf.py_function, so .numpy() is available.
            tokens = [
                self.tokenizer.encode(t.decode("utf-8"), allowed_special="all")
                for t in batch.numpy()
            ]
            return tf.keras.preprocessing.sequence.pad_sequences(
                tokens, maxlen=self.sequence_length, padding='post')

        # tf.py_function lets the pure-Python tokenizer run even when this
        # layer is traced into a graph (e.g. inside a functional model).
        padded = tf.py_function(_encode, [text], tf.int32)
        padded.set_shape((None, self.sequence_length))
        return padded
```
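Before wiring the layer into a model, a quick eager sanity check of the raw tokenizer output can help. This is a sketch, not from the original snippet; the exact ids assume minbpe's GPT4Tokenizer reproduces tiktoken's cl100k_base encoding, which is its stated goal:

```python
tok = GPT4Tokenizer()
ids = tok.encode("<|endoftext|>hello world", allowed_special="all")
print(ids)  # expected: [100257, 15339, 1917]; <|endoftext|> is id 100257
```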
```python
# Usage example
text_1 = tf.constant(["<|endoftext|>hello world"], dtype=tf.string)
text = tf.constant(["<|endoftext|>hello world", "test 9"], dtype=tf.string)

text_encoder_layer = TextEncoderLayer()
print("2 tensor: as layer:")
print(text_encoder_layer(text))
print("One tensor: as layer:")
print(text_encoder_layer(text_1))
```
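Because the tokenization is wrapped in tf.py_function, the layer should also work inside a tf.data input pipeline; a minimal sketch (this usage is my assumption, not something tested in the original):

```python
ds = tf.data.Dataset.from_tensor_slices(
    ["<|endoftext|>hello world", "test 9"]).batch(2)
encoder = TextEncoderLayer()
for encoded in ds.map(encoder):
    print(encoded.shape)  # (2, 100)
```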
```python
# Check if compatible with a functional model as a preprocessor:
inp = tf.keras.layers.Input(shape=(), dtype=tf.string)
tokens_1 = TextEncoderLayer()(inp)

# cl100k_base token ids run up to 100276 (<|endofprompt|>), so the
# embedding needs input_dim = 100277 to cover the largest id.
vocab_size = 100277
embedded = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=18,
    input_length=100)(tokens_1)
flat = tf.keras.layers.Flatten()(embedded)
m1 = tf.keras.Model(inputs=inp, outputs=flat)

result_1 = m1(text_1)
print("1 Tensor:")
print(result_1)
result = m1(text)
print("2 tensor:")
print(result)
```
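As a last check that this can actually anchor a trainable base model, here is a minimal end-to-end fit on toy data. The classification head and labels are illustrative additions, not part of the original snippet:

```python
head = tf.keras.layers.Dense(1, activation="sigmoid")(flat)
m2 = tf.keras.Model(inputs=inp, outputs=head)
m2.compile(optimizer="adam", loss="binary_crossentropy")
m2.fit(text, tf.constant([0.0, 1.0]), epochs=1, verbose=0)  # toy labels
print(m2.predict(text))
```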
Kind of issue: Feature / enhancement; Natural Language Processing.
TLDR: