Open non2021115 opened 1 year ago
word embedding - transformer
the output
The outermost dimension (the first dimension) corresponds to the batch size. In your case, it appears to be 4, which means you processed four sentences or input sequences simultaneously.
The second dimension corresponds to the sequence length. Each sentence or input sequence may have a different length, and the model processes them as sequences of tokens.
The last dimension (512 in your case) is the embedding dimension: the number of features that represent each token in the input sequence.
import torch import torch.nn as nn import nltk from gensim.models import Word2Vec from nltk.tokenize import word_tokenize, sent_tokenize from collections import Counter
# Fetch the NLTK data packages this script asks for, in the original order.
# NOTE(review): 'punkt' is presumably what sent_tokenize/word_tokenize rely on;
# 'stopwords' is not used in the visible part of the script.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)
# Define the Transformer model
class Transformer(nn.Module):
    """Transformer encoder stack applied to already-embedded token vectors.

    NOTE(review): only the constructor signature survived in the original
    snippet; the layers below are a reconstruction from its hyperparameters
    using PyTorch's stock encoder modules. Confirm against the author's intent.

    Args:
        input_dim: Size of each token vector (the encoder's ``d_model``).
            Must be divisible by ``num_heads``.
        hidden_dim: Width of the position-wise feed-forward sub-layer.
        num_layers: Number of stacked encoder layers.
        num_heads: Number of attention heads per layer.
        dropout: Dropout probability used inside every encoder layer.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, num_heads, dropout):
        # The dunder names are required: a method called plain ``init`` is never
        # invoked on construction, so nn.Module would stay uninitialised.
        super().__init__()
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=dropout,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    def forward(self, x):
        """Encode ``x``.

        Args:
            x: Float tensor of shape ``(seq_len, batch, input_dim)`` (the
                default, non-batch-first layout of ``nn.TransformerEncoder``).

        Returns:
            Tensor with the same shape as ``x``.
        """
        return self.encoder(x)
# Example usage
# One assignment per line: when these share a single line, everything after
# the first "#" is a comment and only input_dim is actually defined.
input_dim = 512  # Input dimension
hidden_dim = 2048  # Hidden layer dimension
num_layers = 6  # Number of transformer layers
num_heads = 8  # Number of attention heads
dropout = 0.1  # Dropout probability
# Create a Transformer model from the hyperparameters defined above.
transformer_model = Transformer(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    num_heads=num_heads,
    dropout=dropout,
)
# Optional corpus file. Raw string: in a plain literal the "\r" and "\a" in
# this Windows path are escape sequences (carriage return / bell) and would
# silently corrupt the path.
corpus_path = r'D:\rstudio 연습\attention_is_all_you_need.txt'

# The script used to read the file and then immediately overwrite the result
# with the sample text, so the sample text was always the effective corpus.
# That choice is now explicit; set this to True to train on the file instead.
use_corpus_file = False

if use_corpus_file:
    with open(corpus_path, 'r', encoding='utf-8') as file:
        text1 = file.read()
else:
    text1 = "I have an apple. I have a bear. I see an apple. Tiger is big."

# Lower-case before splitting so that differently-cased words share one token.
sentences = sent_tokenize(text1.lower())
print(sentences)

# One list of word tokens per sentence.
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
# Count how often each word occurs within each sentence.
word_freq_per_sentence = [Counter(tokens) for tokens in tokenized_sentences]

# Order each sentence's (word, count) pairs from most to least frequent;
# words with equal counts keep their first-seen order.
sorted_words_per_sentence = [counts.most_common() for counts in word_freq_per_sentence]

# Print the ranked words and their frequencies, one group per sentence.
for i, ranked_words in enumerate(sorted_words_per_sentence):
    print(f"Sentence {i + 1}:")
    for word, freq in ranked_words:
        print(f"{word}: {freq}")
    print()
# Train Word2Vec on the tokenized sentences. vector_size is the length of each
# word vector; min_count=1 keeps words that occur only once in the corpus.
model = Word2Vec(sentences=tokenized_sentences, vector_size=512, min_count=1)
print(model)
# Tokenize the search query the same way as the corpus (lower-cased word tokens).
text = input("what would u search:")
word_by_word = word_tokenize(text.lower())
print(word_by_word)

# Word2Vec only holds vectors for words seen during training; indexing an
# unknown word raises KeyError, so only look up the words it actually knows.
known_words = [word for word in word_by_word if word in model.wv]
unknown_words = [word for word in word_by_word if word not in model.wv]
if unknown_words:
    print(f"not in vocabulary, skipped: {unknown_words}")

if known_words:
    # Indexing with a list returns a single 2-D array (one row per word), which
    # converts to a tensor in one step instead of element-by-element from a
    # Python list of separate arrays.
    word_vectors = model.wv[known_words]
    input_tensor = torch.tensor(word_vectors)
    # (num_words, dim) -> (num_words, 1, dim): insert a size-1 axis at position 1.
    input_tensor = input_tensor.unsqueeze(1)

    # Inference only: eval() switches off dropout (if the model uses it) so the
    # same query gives the same output, and no_grad() skips gradient tracking.
    transformer_model.eval()
    with torch.no_grad():
        output = transformer_model(input_tensor)
    print(output)
else:
    # Nothing to stack into a tensor (empty query or every word unknown).
    print("no query words are in the vocabulary; nothing to encode")