
transformer, word embedding by GPT and from my head #1

Open non2021115 opened 1 year ago

non2021115 commented 1 year ago

    import torch
    import torch.nn as nn
    import nltk
    from gensim.models import Word2Vec
    from nltk.tokenize import word_tokenize, sent_tokenize
    from collections import Counter

    # Download the NLTK resources used for tokenization
    nltk.download('stopwords')
    nltk.download('punkt')

Define the Transformer model

    class Transformer(nn.Module):
        def __init__(self, input_dim, hidden_dim, num_layers, num_heads, dropout):
            super(Transformer, self).__init__()

            # Multi-Head Self-Attention Layer
            self.self_attention = nn.MultiheadAttention(input_dim, num_heads, dropout=dropout)

            # Position-wise Feedforward Layer
            self.feedforward = nn.Sequential(
                nn.Linear(input_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, input_dim)
            )

            # Layer Normalization
            self.layer_norm1 = nn.LayerNorm(input_dim)
            self.layer_norm2 = nn.LayerNorm(input_dim)

            self.num_layers = num_layers

        def forward(self, x):
            # Self-Attention Layer (residual connection + layer norm)
            attn_output, _ = self.self_attention(x, x, x)
            x = x + attn_output
            x = self.layer_norm1(x)

            # Position-wise Feedforward Layer (residual connection + layer norm)
            ff_output = self.feedforward(x)
            x = x + ff_output
            x = self.layer_norm2(x)

            return x

Example usage

    input_dim = 512     # Input dimension
    hidden_dim = 2048   # Hidden layer dimension
    num_layers = 6      # Number of transformer layers
    num_heads = 8       # Number of attention heads
    dropout = 0.1       # Dropout probability

Create a Transformer model

    transformer_model = Transformer(input_dim, hidden_dim, num_layers, num_heads, dropout)
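
As a quick sanity check (a minimal sketch, not part of the original script), the block can be fed a random tensor in the layout `nn.MultiheadAttention` expects by default, i.e. `(sequence_length, batch_size, embedding_dim)`; the output should come back with the same shape:

    # Hypothetical sanity check: run a random (seq_len, batch, embed) tensor through the block
    dummy = torch.randn(10, 2, input_dim)   # 10 tokens, batch of 2, 512-dim embeddings
    with torch.no_grad():
        dummy_out = transformer_model(dummy)
    print(dummy_out.shape)                  # expected: torch.Size([10, 2, 512])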

    # Read the source text (a raw string keeps '\r' and '\a' in the path from being treated as escapes)
    with open(r'D:\rstudio 연습\attention_is_all_you_need.txt', 'r', encoding='utf-8') as file:
        text1 = file.read()

    sentences = sent_tokenize(text1.lower())

    # Overwrite the file contents with a small hard-coded example corpus
    text1 = "I have an apple. I have a bear. I see an apple. Tiger is big."
    sentences = sent_tokenize(text1.lower())

    print(sentences)

    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

Count the frequency of each word in each sentence

    word_freq_per_sentence = [Counter(sentence) for sentence in tokenized_sentences]

Sort the words based on frequency in each sentence

    sorted_words_per_sentence = [sorted(freq.items(), key=lambda x: x[1], reverse=True) for freq in word_freq_per_sentence]

Print the sorted words and their frequencies for each sentence

    for i, sentence_words in enumerate(sorted_words_per_sentence):
        print(f"Sentence {i + 1}:")
        for word, freq in sentence_words:
            print(f"{word}: {freq}")
        print()

    # Train a Word2Vec model so that each token gets a 512-dimensional vector
    model = Word2Vec(tokenized_sentences, vector_size=512, min_count=1)
    print(model)
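
To see what Word2Vec actually learned, one can inspect a single token's vector and its nearest neighbours (an illustrative check; `'apple'` is simply a word known to be in the toy corpus, and with a corpus this small the similarity scores are essentially noise):

    # Inspect the learned embedding for one token from the toy corpus
    print(model.wv['apple'].shape)          # (512,) -- matches vector_size
    print(model.wv.most_similar('apple'))   # nearest neighbours by cosine similarity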

Tokenize the query text

    text = input("what would u search:")
    word_by_word = word_tokenize(text.lower())
    print(word_by_word)

    # Look up the Word2Vec vector for each query token
    word_vectors = [model.wv[word] for word in word_by_word]

    # Stack into a tensor and add a batch dimension:
    # (sequence_length, 512) -> (sequence_length, 1, 512)
    input_tensor = torch.tensor(word_vectors)
    input_tensor = input_tensor.unsqueeze(1)

    output = transformer_model(input_tensor)
    print(output)

non2021115 commented 1 year ago

word embedding - transformer

About the output:

The outermost dimension (the first dimension) is the sequence length. Because the input was built with `unsqueeze(1)` and `nn.MultiheadAttention` defaults to `batch_first=False`, the tokens come first: a value of 4 means the query was tokenized into four words.

The second dimension is the batch size, which is 1 here because a single query sequence was passed through the model.

The last dimension (512 in this case) is the embedding dimension, i.e. the feature vector produced for each token in the sequence.
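
A quick way to confirm this interpretation is to print the shape directly (a minimal check; the exact first value depends on how many tokens the query produced):

    # Output layout: (sequence_length, batch_size, embedding_dim)
    print(output.shape)   # e.g. torch.Size([4, 1, 512]) for a four-token query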