non2021115 / non202


word embedding by gpt and from my fucking head #2

Open non2021115 opened 11 months ago

non2021115 commented 11 months ago

```python
import nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter

nltk.download('stopwords')  # downloaded but not used below
nltk.download('punkt')      # needed by word_tokenize / sent_tokenize

# Read the corpus and split it into lowercased sentences
with open('txt_file_route', 'r', encoding='utf-8') as file:
    text1 = file.read()
sentences = sent_tokenize(text1.lower())

print(sentences)

tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

# Count the frequency of each word in each sentence
word_freq_per_sentence = [Counter(sentence) for sentence in tokenized_sentences]

# Sort the words based on frequency in each sentence
sorted_words_per_sentence = [sorted(freq.items(), key=lambda x: x[1], reverse=True)
                             for freq in word_freq_per_sentence]

# Print the sorted words and their frequencies for each sentence
for i, sentence_words in enumerate(sorted_words_per_sentence):
    print(f"Sentence {i + 1}:")
    for word, freq in sentence_words:
        print(f"{word}: {freq}")
    print()

# Train a Word2Vec model on the tokenized sentences
model = Word2Vec(tokenized_sentences, vector_size=100, min_count=1)
print(model)

# Tokenize a search query and look up its word vectors
text = input("what would u search:")
word_by_word = word_tokenize(text.lower())
print(word_by_word)

# Keep only words the model has seen; indexing an unknown word raises KeyError
known_words = [w for w in word_by_word if w in model.wv]
vector = model.wv[known_words]
print(vector)
```
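As a quick sanity check of the trained embeddings (a minimal sketch, not part of the original script; it assumes `model` and `word_by_word` from the code above, and picking the first token as the query is an arbitrary choice):

```python
# Nearest neighbours of the first query token, if the model knows it
query = word_by_word[0]  # hypothetical choice: just the first token
if query in model.wv:
    for neighbour, score in model.wv.most_similar(query, topn=5):
        print(f"{neighbour}: {score:.3f}")
else:
    print(f"'{query}' is not in the training vocabulary")
```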

non2021115 commented 11 months ago

Output:

Next step: convert sentences to vectors.
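A common way to do that with the model above (a sketch, not something the issue specifies; `sentence_vector` is a hypothetical helper, and mean pooling is just one standard choice) is to average the word vectors of each sentence:

```python
import numpy as np

def sentence_vector(tokens, model):
    """Average the Word2Vec vectors of the in-vocabulary tokens.

    Returns a zero vector when no token is in the vocabulary.
    """
    known = [w for w in tokens if w in model.wv]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[known], axis=0)

sentence_vectors = [sentence_vector(tokens, model) for tokens in tokenized_sentences]
print(len(sentence_vectors), sentence_vectors[0].shape)
```

Mean pooling ignores word order; if that matters, a model such as gensim's Doc2Vec is the usual next step.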