Open — yourSylvia opened this issue 5 years ago
In your registered problem, there is a function named generate_encoded_samples:
@registry.register_problem
class TranslateEnzhSub50k(translate.TranslateProblem):
    """EN->ZH translation problem using pre-built source/target vocabularies."""
    ...

    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        """Yield raw samples encoded with separate source/target vocabularies."""
        generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
        encoder = self.get_vocab(data_dir)
        target_encoder = self.get_vocab(data_dir, is_target=True)
        return text_problems.text2text_generate_encoded(
            generator, encoder, target_encoder, has_inputs=self.has_inputs)

    def get_vocab(self, data_dir, is_target=False):
        """Load the source (default) or target vocabulary encoder from data_dir.

        Raises:
            ValueError: if the vocabulary file does not exist.
        """
        vocab_filename = os.path.join(
            data_dir,
            self.target_vocab_name if is_target else self.source_vocab_name)
        if not tf.gfile.Exists(vocab_filename):
            raise ValueError("Vocab %s not found" % vocab_filename)
        # TokenTextEncoder, not SubwordTextEncoder: only TokenTextEncoder
        # takes `replace_oov` and provides `encode_without_tokenizing`, which
        # text2text_generate_encoded calls on this encoder.
        return text_encoder.TokenTextEncoder(vocab_filename, replace_oov="UNK")
The function text2text_generate_encoded is used for encoding your inputs. So your problem should be changed to:
def text2text_generate_encoded(sample_generator,
                               vocab,
                               targets_vocab=None,
                               has_inputs=True):
    """Encode Text2Text samples from the generator with the vocab.

    Args:
        sample_generator: yields dicts with "targets" (and "inputs" when
            has_inputs is True) as raw strings.
        vocab: encoder for inputs (and targets when targets_vocab is None).
        targets_vocab: optional separate encoder for targets.
        has_inputs: whether samples carry an "inputs" field.

    Yields:
        The same dicts with the text fields replaced by id lists, each
        terminated by EOS.
    """
    targets_vocab = targets_vocab or vocab
    for sample in sample_generator:
        if has_inputs:
            # Encode without re-tokenizing: the text is already tokenized.
            sample["inputs"] = vocab.encode_without_tokenizing(sample["inputs"])
            sample["inputs"].append(text_encoder.EOS_ID)
        sample["targets"] = targets_vocab.encode_without_tokenizing(
            sample["targets"])
        sample["targets"].append(text_encoder.EOS_ID)
        yield sample
@registry.register_problem
class TranslateEnzhSub50k(translate.TranslateProblem):
    """EN->ZH translation problem encoding with the local generator above."""
    ...

    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        """Yield raw samples encoded with separate source/target vocabularies."""
        generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
        encoder = self.get_vocab(data_dir)
        target_encoder = self.get_vocab(data_dir, is_target=True)
        # call local encoding generator
        return text2text_generate_encoded(generator, encoder, target_encoder,
                                          has_inputs=self.has_inputs)

    def get_vocab(self, data_dir, is_target=False):
        """Load the source (default) or target vocabulary encoder from data_dir.

        Raises:
            ValueError: if the vocabulary file does not exist.
        """
        vocab_filename = os.path.join(
            data_dir,
            self.target_vocab_name if is_target else self.source_vocab_name)
        if not tf.gfile.Exists(vocab_filename):
            raise ValueError("Vocab %s not found" % vocab_filename)
        # TokenTextEncoder, not SubwordTextEncoder: only TokenTextEncoder
        # takes `replace_oov` and provides `encode_without_tokenizing`.
        return text_encoder.TokenTextEncoder(vocab_filename, replace_oov="UNK")
Thank you so much for your reply!
In your code section, what is the source_vocab_name in get_vocab?
I found this in source code: tensor2tensor/data_generators/translate_enzh.py, but my problem is based on text_problems.Text2TextProblem.
Following your suggestion, my code is now:
def text2text_generate_encoded(sample_generator,
                               vocab,
                               targets_vocab=None,
                               has_inputs=True):
    """Encode Text2Text samples from the generator with the vocab.

    Yields the generator's dicts with "inputs" (when has_inputs) and
    "targets" replaced by id lists, each terminated by EOS.
    """
    targets_vocab = targets_vocab or vocab
    for sample in sample_generator:
        if has_inputs:
            sample["inputs"] = vocab.encode_without_tokenizing(sample["inputs"])
            sample["inputs"].append(text_encoder.EOS_ID)
        sample["targets"] = targets_vocab.encode_without_tokenizing(
            sample["targets"])
        sample["targets"].append(text_encoder.EOS_ID)
        yield sample
@registry.register_problem
class MyProblem(text_problems.Text2TextProblem):
    """Grammar-correction problem: maps an original sentence to its edited form.

    Reads paired raw-text files from fixed directories, sentence-tokenizes
    them with nltk, and encodes samples with separate source/target
    token vocabularies.
    """

    @property
    def approx_vocab_size(self):
        # ~32k tokens.
        return 2**15

    @property
    def is_generate_per_split(self):
        # generate_data will shard the data into TRAIN and EVAL for us.
        return False

    @property
    def raw_data_splits(self):
        """Splits of data to produce and number of output shards for each.

        10% evaluation data.

        NOTE(review): Text2TextProblem consults `dataset_splits`, not
        `raw_data_splits` — confirm this override is actually picked up.
        """
        return [{
            "split": problem.DatasetSplit.TRAIN,
            "shards": 9,
        }, {
            "split": problem.DatasetSplit.EVAL,
            "shards": 1,
        }]

    def generate_samples(self, data_dir, tmp_dir, raw_data_splits):
        """Yield {"inputs": original, "targets": edited} sentence pairs."""
        del data_dir
        del tmp_dir
        del raw_data_splits
        # Read the original training data. Edited (target) and original
        # (source) files are paired positionally; sort both listings so the
        # pairing is deterministic (os.listdir order is arbitrary).
        edit_dir = '/userhome/34/xwli/t2t/raw_data/edit'
        orig_dir = '/userhome/34/xwli/t2t/raw_data/orig'
        edit_files = sorted(os.listdir(edit_dir))
        orig_files = sorted(os.listdir(orig_dir))
        for index, (edit_file, orig_file) in enumerate(
                zip(edit_files, orig_files)):
            # `with` guarantees the files are closed even if tokenizing fails.
            with open(os.path.join(edit_dir, edit_file), 'r',
                      encoding='UTF-8') as q_r:
                edited_list = nltk.sent_tokenize(q_r.read())
            with open(os.path.join(orig_dir, orig_file), 'r',
                      encoding='UTF-8') as a_r:
                original_list = nltk.sent_tokenize(a_r.read())
            for original, edited in zip(original_list, edited_list):
                yield {
                    "inputs": original,
                    "targets": edited,
                }
            print('finish NO. %d file' % index)

    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        """Encode raw samples with separate source/target vocabularies."""
        generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
        encoder = self.get_vocab(data_dir)
        target_encoder = self.get_vocab(data_dir, is_target=True)
        # call local encoding generator
        return text2text_generate_encoded(generator, encoder, target_encoder,
                                          has_inputs=self.has_inputs)

    def get_vocab(self, data_dir, is_target=False):
        """Load the source (default) or target vocabulary encoder from data_dir.

        NOTE(review): `source_vocab_name`/`target_vocab_name` are not defined
        by Text2TextProblem — they must be added to this class; confirm.

        Raises:
            ValueError: if the vocabulary file does not exist.
        """
        vocab_filename = os.path.join(
            data_dir,
            self.target_vocab_name if is_target else self.source_vocab_name)
        if not tf.gfile.Exists(vocab_filename):
            raise ValueError("Vocab %s not found" % vocab_filename)
        # TokenTextEncoder, not SubwordTextEncoder: only TokenTextEncoder
        # takes `replace_oov` and provides `encode_without_tokenizing`, which
        # text2text_generate_encoded calls on this encoder.
        return text_encoder.TokenTextEncoder(vocab_filename, replace_oov="UNK")
By the way, I am not working on translation but on something similar: a grammar correction system.
yanwii notifications@github.com 于2019年3月18日周一 下午6:10写道:
In your registered problem, there is a function name generate_encoded_samples
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
    """Yield raw samples encoded with separate source/target vocabularies."""
    generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
    encoder = self.get_vocab(data_dir)
    target_encoder = self.get_vocab(data_dir, is_target=True)
    return text_problems.text2text_generate_encoded(
        generator, encoder, target_encoder, has_inputs=self.has_inputs)
def get_vocab(self, data_dir, is_target=False):
    """Load the source (default) or target TokenTextEncoder from data_dir.

    Raises:
        ValueError: if the vocabulary file does not exist.
    """
    vocab_filename = os.path.join(
        data_dir,
        self.target_vocab_name if is_target else self.source_vocab_name)
    if not tf.gfile.Exists(vocab_filename):
        raise ValueError("Vocab %s not found" % vocab_filename)
    return text_encoder.TokenTextEncoder(vocab_filename, replace_oov="UNK")
The function text2text_generate_encoded is used for encoding your inputs. So your problem should look like this:
def text2text_generate_encoded(sample_generator,
                               vocab,
                               targets_vocab=None,
                               has_inputs=True):
    """Encode Text2Text samples from the generator with the vocab."""
    targets_vocab = targets_vocab or vocab
    for sample in sample_generator:
        if has_inputs:
            # call the function
            sample["inputs"] = vocab.encode_without_tokenizing(sample["inputs"])
            sample["inputs"].append(text_encoder.EOS_ID)
        sample["targets"] = targets_vocab.encode_without_tokenizing(
            sample["targets"])
        sample["targets"].append(text_encoder.EOS_ID)
        yield sample
@registry.register_problem
class TranslateEnzhSub50k(translate.TranslateProblem):
    """EN->ZH translation problem encoding with the local generator above."""
    ...

    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        """Yield raw samples encoded with separate source/target vocabularies."""
        generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
        encoder = self.get_vocab(data_dir)
        target_encoder = self.get_vocab(data_dir, is_target=True)
        # call local encoding generator
        return text2text_generate_encoded(generator, encoder, target_encoder,
                                          has_inputs=self.has_inputs)

    def get_vocab(self, data_dir, is_target=False):
        """Load the source (default) or target vocabulary encoder from data_dir.

        Raises:
            ValueError: if the vocabulary file does not exist.
        """
        vocab_filename = os.path.join(
            data_dir,
            self.target_vocab_name if is_target else self.source_vocab_name)
        if not tf.gfile.Exists(vocab_filename):
            raise ValueError("Vocab %s not found" % vocab_filename)
        # TokenTextEncoder, not SubwordTextEncoder: only TokenTextEncoder
        # takes `replace_oov` and provides `encode_without_tokenizing`.
        return text_encoder.TokenTextEncoder(vocab_filename, replace_oov="UNK")
— You are receiving this because you authored the thread. Reply to this email directly, view it on GitHub https://github.com/tensorflow/tensor2tensor/issues/1498#issuecomment-473847506, or mute the thread https://github.com/notifications/unsubscribe-auth/AfIQMOeMDLb6E8OCymfIWKKunFVsL4aSks5vX2YMgaJpZM4b5Txb .
You should create a directory named usr_dir, and there are two files __init__.py and problem.py:
# file __init__.py
from .problem import MyProblem
and when you use tensor2tensor, add a argument:
t2t-trainer ... --t2t_usr_dir=usr_dir
Yes, I had already added my problem to the initial file.
Do you know why there are words from other languages in the vocab_file? I thought the vocab_file should be like a vocabulary dictionary, and the output files from t2t-datagen should contain the ids of the tokens in the vocab_file. Now I'm confused.
yanwii notifications@github.com 于2019年3月19日周二 上午9:36写道:
You should create a directory named usr_dir, and there are two files init.py and problem.py:
file init.py
from .problem import MyProblem
and when you use tensor2tensor, add a argument:
t2t-trainer ... --t2t_usr_dir=usr_dir
— You are receiving this because you authored the thread. Reply to this email directly, view it on GitHub https://github.com/tensorflow/tensor2tensor/issues/1498#issuecomment-474164041, or mute the thread https://github.com/notifications/unsubscribe-auth/AfIQMBz7Iexg96V08vckxLCMnPmXa1ueks5vYD81gaJpZM4b5Txb .
The index of each word in the vocab_file represents its id. And the training set is the source of the words from other languages.
In my case, the source text and the target text are both English, can I use the same vocab_file as well?
Description
I came across some trouble in generating training data using t2t-datagen. The tokenized results are messy.
I found a function in tensor2tensor/tensor2tensor/data_generators/text_encoder.py which can be used to generate data without tokenizing. How do I use it with my own registered problem? I have no idea where to put it...
Anyone can help?
Environment information
tensorflow-gpu 1.13.1 tensor2tensor 1.13.0