amankhullar / mast

Code for the paper Multimodal Abstractive Summarization with Trimodal Hierarchical Attention

how do I get the vocabulary file? #2

Open xlHuang0719 opened 3 years ago

xlHuang0719 commented 3 years ago

Hi, when I run the project, I don't know how to create the vocab file specified in the conf file:

```ini
[vocabulary]
tran: ${data:root}/text_300/sum_train_300/tran.tok.vocab.txt
desc: ${data:root}/text_300/sum_train_300/tran.tok.vocab.txt
```

I want to know how to create the tran.tok.vocab.txt file; I'd appreciate your reply.

xlHuang0719 commented 3 years ago

Sorry, that was a simple NLP problem. I have figured it out.

xiang-xiang-zhu commented 3 years ago

> Sorry, that was a simple NLP problem. I have figured it out.

Hello, how did you solve this problem?

xlHuang0719 commented 3 years ago

> Hello, how did you solve this problem?

Hi, I constructed the vocab file with the following Python script.

```python
import os
import json
import nltk
import tensorlayer as tl

nltk.download('punkt')  # tl.nlp.process_sentence tokenizes with NLTK

# Special tokens as in nmtpytorch's vocabulary; the angle-bracket names were
# stripped by the issue tracker's HTML rendering in the original comment.
TOKENS = {"<pad>": 0, "<bos>": 1, "<eos>": 2, "<unk>": 3}


def main(src_path, mid_path, dest_path):
    try:
        with open(src_path, 'r', encoding='UTF-8') as f:
            txt = f.readlines()
    except OSError:
        print("Could not open", src_path)
        raise

    # Tokenize each transcript line, dropping the leading video id.
    processed_capts = []
    for tran in txt:
        vid_idx = tran.find(' ')
        sentences = tran[vid_idx + 1:]
        # Pass the token strings (not their ids) as sentence delimiters.
        c = tl.nlp.process_sentence(sentences, start_word='<bos>', end_word='<eos>')
        processed_capts.append(c)

    # Write "word count" pairs for every word seen at least 10 times.
    tl.nlp.create_vocab(processed_capts, word_counts_output_file=mid_path,
                        min_word_count=10)

    # Final vocab maps each token to "index count", special tokens first.
    dict_save = {}
    word_index = 0
    for tok in TOKENS:
        dict_save[tok] = str(word_index) + ' 0'
        word_index += 1
    with open(mid_path, 'r', encoding='UTF-8') as f:
        vocabs = f.readlines()
    for word in vocabs:
        if word.split()[0] not in dict_save:
            dict_save[word.split()[0]] = str(word_index) + ' ' + word.split()[1]
            word_index += 1
    with open(dest_path, 'w') as f:
        json.dump(dict_save, f)


if __name__ == "__main__":
    text_path = '/workspace/how2'
    src_path = os.path.join(text_path, 'text_300/sum_train_300/tran.tok.txt')
    mid_path = os.path.join(text_path, 'text_300/sum_train_300/tran.tok.vocab_original.txt')
    dest_path = os.path.join(text_path, 'text_300/sum_train_300/tran.tok.vocab.txt')
    main(src_path, mid_path, dest_path)
```
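For reference, the same file can also be built without the tensorlayer dependency; here is a minimal sketch (the helper name `build_vocab` and the plain whitespace tokenization are mine, not from the original script) that writes the same `token -> "index count"` JSON layout:

```python
import json
from collections import Counter

# Same special-token convention as nmtpytorch's vocabulary.
TOKENS = {"<pad>": 0, "<bos>": 1, "<eos>": 2, "<unk>": 3}

def build_vocab(src_path, dest_path, min_word_count=10):
    # Count whitespace tokens per line, skipping the leading video id.
    counts = Counter()
    with open(src_path, 'r', encoding='UTF-8') as f:
        for line in f:
            counts.update(line.split()[1:])

    # Special tokens first (with count 0), then frequent words.
    vocab = {tok: '%d 0' % idx for tok, idx in TOKENS.items()}
    for word, count in counts.most_common():
        if count >= min_word_count and word not in vocab:
            vocab[word] = '%d %d' % (len(vocab), count)

    with open(dest_path, 'w') as f:
        json.dump(vocab, f)
```

Either way, indices 0-3 stay reserved for the special tokens and only words occurring at least `min_word_count` times are kept, matching the script above apart from the simpler tokenization.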

xiang-xiang-zhu commented 3 years ago

> Hi, I constructed the vocab file with the following Python script. […]

Thank you very much! I want to know whether you ran into many bugs when using nmtpytorch. I run it on torch 1.8.0, but there are always various problems.

xlHuang0719 commented 3 years ago

> Thank you very much! I want to know whether you ran into many bugs when using nmtpytorch. I run it on torch 1.8.0, but there are always various problems.

Maybe you can try downgrading your PyTorch version to 1.6 or lower.
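For anyone landing here later, a minimal sketch of that check (the 1.6 cutoff comes from this thread, not from nmtpytorch's documentation):

```python
import torch

# Per this thread, nmtpytorch runs more reliably on torch 1.6 or lower;
# if a newer build is installed, downgrading (e.g. `pip install torch==1.6.0`)
# may resolve the errors discussed above.
major, minor = (int(x) for x in torch.__version__.split('.')[:2])
if (major, minor) > (1, 6):
    print('torch %s detected; consider downgrading to 1.6 or lower' % torch.__version__)
```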