vishesh9131 opened 6 months ago
It's temporarily replaced with the Keras implementation:

from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, LayerNormalization, Dropout, Layer, MultiHeadAttention

I have removed my custom transformer code. I haven't pushed it yet, but if anybody wants to use it, here is the code:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, LayerNormalization, Dropout, Layer, MultiHeadAttention
from sklearn.model_selection import train_test_split


class TransformerEncoder(Layer):
    """A single transformer encoder block: multi-head self-attention plus a
    feed-forward network, each with dropout, a residual connection, and layer norm."""

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerEncoder, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim)]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=False):
        # Self-attention: query and value are both the input sequence.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        # Position-wise feed-forward network.
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


def build_model(vocab_size, embedding_dim, max_length):
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
        TransformerEncoder(embed_dim=embedding_dim, num_heads=8, ff_dim=512),
        GlobalAveragePooling1D(),
        Dense(vocab_size, activation='softmax')
    ])
    return model


# Load the corpus, split it into lines, then into train/test sets.
filepath = 'data_1.txt'
with open(filepath, 'r', encoding='utf-8') as file:
    text = file.read()
text = text.split('\n')
train_text, test_text = train_test_split(text, test_size=0.2, random_state=42)

# Tokenize and pad every line to a fixed length.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)
X_train_seq = tokenizer.texts_to_sequences(train_text)
X_test_seq = tokenizer.texts_to_sequences(test_text)
max_length = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

model = build_model(vocab_size, embedding_dim, max_length)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Placeholder labels: random word indices, only to exercise the training loop.
y_train = np.random.randint(vocab_size, size=len(X_train_pad))
y_test = np.random.randint(vocab_size, size=len(X_test_pad))

num_epochs = 20
history = model.fit(X_train_pad, y_train, epochs=num_epochs, validation_split=0.2, verbose=2)

test_loss, test_accuracy = model.evaluate(X_test_pad, y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

# Generate text by repeatedly sampling the next word and sliding the window forward.
seed_text = "Once upon a time"
num_words = 100
seed_sequence = tokenizer.texts_to_sequences([seed_text])[0]
generated_text = seed_text

for _ in range(num_words):
    padded_sequence = pad_sequences([seed_sequence], maxlen=max_length, padding='post')
    predicted_probs = model.predict(padded_sequence, verbose=0)[0]
    predicted_probs = predicted_probs / np.sum(predicted_probs)
    predicted_word_index = np.random.choice(len(predicted_probs), p=predicted_probs)
    predicted_word = tokenizer.index_word.get(predicted_word_index, '')
    if predicted_word == '':
        break
    seed_sequence.append(predicted_word_index)
    seed_sequence = seed_sequence[1:]
    generated_text += ' ' + predicted_word

print(generated_text)
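As a quick sanity check (my own minimal sketch, not part of the original script), you can run the encoder block on a random batch by itself; since the block only mixes information across positions, the output should keep the same (batch, sequence, embedding) shape as the input:

# Minimal sketch: confirm TransformerEncoder preserves the input shape.
dummy = tf.random.uniform((2, 100, 100))           # (batch, max_length, embedding_dim)
encoder = TransformerEncoder(embed_dim=100, num_heads=8, ff_dim=512)
out = encoder(dummy, training=False)
print(out.shape)                                    # expected: (2, 100, 100)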
Issue: Multi-Head Attention Producing Incorrect Vectors
The multi-head attention mechanism in our transformer model appears to be producing incorrect vectors. Specifically, the attention matrix does not accurately capture the relationships between different elements of the input sequence, which leads to erroneous outputs.
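One way to start narrowing this down (my own suggestion, not something from the original report) is to inspect the attention weights directly: Keras's MultiHeadAttention can return the score matrix when called with return_attention_scores=True, and each row of that matrix should sum to roughly 1 after the softmax.

# Hypothetical diagnostic sketch: inspect the raw attention weights of a
# standalone MultiHeadAttention layer and check they form valid distributions.
import tensorflow as tf
from tensorflow.keras.layers import MultiHeadAttention

x = tf.random.uniform((2, 100, 100))                # (batch, seq_len, embed_dim)
mha = MultiHeadAttention(num_heads=8, key_dim=100)
output, scores = mha(x, x, return_attention_scores=True)
print(output.shape)                                  # (2, 100, 100)
print(scores.shape)                                  # (2, 8, 100, 100): per-head attention matrix
print(tf.reduce_sum(scores, axis=-1))                # every entry should be ~1.0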
Potential Causes:
Next Steps:
By addressing these potential causes, we aim to rectify the issues with the multi-head attention mechanism and improve the overall performance of our transformer model.