
transformer, word embedding by GPT and from my head #1

Open non2021115 opened 1 year ago

non2021115 commented 1 year ago

    import torch
    import torch.nn as nn
    import nltk
    from gensim.models import Word2Vec
    from nltk.tokenize import word_tokenize, sent_tokenize
    from collections import Counter

    # Download the NLTK resources used for tokenization
    nltk.download('stopwords')
    nltk.download('punkt')

Define the Transformer model

    class Transformer(nn.Module):
        def __init__(self, input_dim, hidden_dim, num_layers, num_heads, dropout):
            super(Transformer, self).__init__()

            # Multi-Head Self-Attention Layer
            self.self_attention = nn.MultiheadAttention(input_dim, num_heads, dropout=dropout)

            # Position-wise Feedforward Layer
            self.feedforward = nn.Sequential(
                nn.Linear(input_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, input_dim)
            )

            # Layer Normalization
            self.layer_norm1 = nn.LayerNorm(input_dim)
            self.layer_norm2 = nn.LayerNorm(input_dim)

            self.num_layers = num_layers

        def forward(self, x):
            # Self-Attention Layer (residual connection + layer norm)
            attn_output, _ = self.self_attention(x, x, x)
            x = x + attn_output
            x = self.layer_norm1(x)

            # Position-wise Feedforward Layer (residual connection + layer norm)
            ff_output = self.feedforward(x)
            x = x + ff_output
            x = self.layer_norm2(x)

            return x

Example usage

    input_dim = 512     # Input dimension
    hidden_dim = 2048   # Hidden layer dimension
    num_layers = 6      # Number of transformer layers
    num_heads = 8       # Number of attention heads
    dropout = 0.1       # Dropout probability

Create a Transformer model

    transformer_model = Transformer(input_dim, hidden_dim, num_layers, num_heads, dropout)
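
As a quick sanity check (a minimal sketch, not part of the original script), the block can be fed a random tensor in the layout `nn.MultiheadAttention` expects by default, i.e. `(sequence_length, batch_size, embedding_dim)`; the output should come back with the same shape:

    # Hypothetical sanity check: run a random (seq_len, batch, embed) tensor through the block
    dummy = torch.randn(10, 2, input_dim)   # 10 tokens, batch of 2, 512-dim embeddings
    with torch.no_grad():
        dummy_out = transformer_model(dummy)
    print(dummy_out.shape)                  # expected: torch.Size([10, 2, 512])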

    # Read the source text (a raw string keeps '\r' and '\a' in the path from being treated as escapes)
    with open(r'D:\rstudio 연습\attention_is_all_you_need.txt', 'r', encoding='utf-8') as file:
        text1 = file.read()

    sentences = sent_tokenize(text1.lower())

    # Overwrite the file contents with a small hard-coded example corpus
    text1 = "I have an apple. I have a bear. I see an apple. Tiger is big."
    sentences = sent_tokenize(text1.lower())

    print(sentences)

    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

Count the frequency of each word in each sentence

    word_freq_per_sentence = [Counter(sentence) for sentence in tokenized_sentences]

Sort the words based on frequency in each sentence

    sorted_words_per_sentence = [sorted(freq.items(), key=lambda x: x[1], reverse=True) for freq in word_freq_per_sentence]

Print the sorted words and their frequencies for each sentence

    for i, sentence_words in enumerate(sorted_words_per_sentence):
        print(f"Sentence {i + 1}:")
        for word, freq in sentence_words:
            print(f"{word}: {freq}")
        print()

    # Train a Word2Vec model so that each token gets a 512-dimensional vector
    model = Word2Vec(tokenized_sentences, vector_size=512, min_count=1)
    print(model)
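
To see what Word2Vec actually learned, one can inspect a single token's vector and its nearest neighbours (an illustrative check; `'apple'` is simply a word known to be in the toy corpus, and with a corpus this small the similarity scores are essentially noise):

    # Inspect the learned embedding for one token from the toy corpus
    print(model.wv['apple'].shape)          # (512,) -- matches vector_size
    print(model.wv.most_similar('apple'))   # nearest neighbours by cosine similarity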

Tokenize the query text

    text = input("what would u search:")
    word_by_word = word_tokenize(text.lower())
    print(word_by_word)

    # Look up the Word2Vec vector for each query token
    word_vectors = [model.wv[word] for word in word_by_word]

    # Stack into a tensor and add a batch dimension:
    # (sequence_length, 512) -> (sequence_length, 1, 512)
    input_tensor = torch.tensor(word_vectors)
    input_tensor = input_tensor.unsqueeze(1)

    output = transformer_model(input_tensor)
    print(output)

non2021115 commented 1 year ago

word embedding - transformer

About the output:

The outermost dimension (the first dimension) is the sequence length. Because the input was built with `unsqueeze(1)` and `nn.MultiheadAttention` defaults to `batch_first=False`, the tokens come first: a value of 4 means the query was tokenized into four words.

The second dimension is the batch size, which is 1 here because a single query sequence was passed through the model.

The last dimension (512 in this case) is the embedding dimension, i.e. the feature vector produced for each token in the sequence.
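
A quick way to confirm this interpretation is to print the shape directly (a minimal check; the exact first value depends on how many tokens the query produced):

    # Output layout: (sequence_length, batch_size, embedding_dim)
    print(output.shape)   # e.g. torch.Size([4, 1, 512]) for a four-token query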