Open shankharaj29 opened 5 years ago
In my case, the task is English-to-Chinese translation. Here is my registered problem:
# Base names of the raw parallel corpora, keyed by dataset split. The language
# suffix (".en" / ".zh") is appended by the code that opens the files.
ENZH_RAW_DATASETS = {
    "TRAIN": "raw-train.zh-en",
    "DEV": "raw-dev.zh-en",
}
def get_enzh_raw_dataset(directory, filename):
    """Return the suffix-less path of a raw corpus after checking both sides exist.

    Args:
      directory: Directory expected to contain the raw corpus files.
      filename: Corpus base name, without the ".en" / ".zh" suffix.

    Returns:
      `directory` joined with `filename`; callers append ".en" or ".zh".

    Raises:
      Exception: If the ".en" file or the ".zh" file does not exist.
    """
    base_path = os.path.join(directory, filename)
    # Check the English side first, then the Chinese side; stop at the first
    # missing file.
    for suffix in (".en", ".zh"):
        if not tf.gfile.Exists(base_path + suffix):
            raise Exception("there should be some training/dev data in the tmp dir.")
    return base_path
@registry.register_problem
class TranslateEnzhSub50k(translate.TranslateProblem):
    """Translation problem reading ".en" inputs and ".zh" targets from raw files.

    Each side is encoded with its own token vocabulary file, which must
    already exist in the data directory; no vocabulary is built here.
    """

    @property
    def approx_vocab_size(self):
        """Vocabulary size used to build the vocabulary file names."""
        return 50000

    @property
    def source_vocab_name(self):
        """File name of the source-side (English) vocabulary."""
        return "vocab.enzh-sub-en.%d" % self.approx_vocab_size

    @property
    def target_vocab_name(self):
        """File name of the target-side (Chinese) vocabulary."""
        return "vocab.enzh-sub-zh.%d" % self.approx_vocab_size

    def get_vocab(self, data_dir, is_target=False):
        """Load the source or target token encoder from `data_dir`.

        Args:
          data_dir: Directory holding the vocabulary files.
          is_target: If true, load the target vocabulary; otherwise the source.

        Returns:
          A `text_encoder.TokenTextEncoder` built from the vocabulary file,
          with `replace_oov` set to "UNK".

        Raises:
          ValueError: If the vocabulary file does not exist.
        """
        if is_target:
            vocab_name = self.target_vocab_name
        else:
            vocab_name = self.source_vocab_name
        vocab_filename = os.path.join(data_dir, vocab_name)
        if tf.gfile.Exists(vocab_filename):
            return text_encoder.TokenTextEncoder(vocab_filename, replace_oov="UNK")
        raise ValueError("Vocab %s not found" % vocab_filename)

    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        """Return an iterator over the raw sentence pairs of one split.

        The TRAIN split reads the "TRAIN" corpus; every other split reads the
        "DEV" corpus. Files are looked up in `tmp_dir`; `data_dir` is unused.
        """
        split_key = "TRAIN" if dataset_split == problem.DatasetSplit.TRAIN else "DEV"
        corpus_path = get_enzh_raw_dataset(tmp_dir, ENZH_RAW_DATASETS[split_key])
        source_file = corpus_path + ".en"
        target_file = corpus_path + ".zh"
        return text_problems.text2text_txt_iterator(source_file, target_file)

    def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
        """Return the samples of one split encoded with the two vocabularies."""
        # The raw corpus is located before the vocabularies are loaded, so a
        # missing corpus is reported ahead of a missing vocabulary file.
        samples = self.generate_samples(data_dir, tmp_dir, dataset_split)
        source_encoder = self.get_vocab(data_dir)
        target_encoder = self.get_vocab(data_dir, is_target=True)
        return text_problems.text2text_generate_encoded(
            samples,
            source_encoder,
            target_encoder,
            has_inputs=self.has_inputs,
        )

    def feature_encoders(self, data_dir):
        """Return the encoders for the "inputs" and "targets" features."""
        return {
            "inputs": self.get_vocab(data_dir),
            "targets": self.get_vocab(data_dir, is_target=True),
        }
My training data is space-separated:
# raw-train.zh-en.en
For greater sharpness, but with a slight increase in graininess, you can use a 1:1 dilution of this developer.
# raw-train.zh-en.zh
为 了 更 好 的 锐 度 , 但 是 附 带 的 会 多 一 些 颗 粒 度 , 可 以 使 用 这 个 显 影 剂 的 1 : 1 稀 释 液 。
Description
I was unable to use my own dataset. I was trying to use a dataset containing English-Hindi translation data. I updated translation_ende.py, text_problems.py and generate_utils.py, hardcoding them so that they would download my own dataset. Yet when I ran it, it still downloaded the previously existing dataset. I have also written a Python script in data_generators. The output did not change even though I created a class and registered it as a subclass of Problem. I have already read the previous issues on this topic, but I was unable to find a solution to this problem.
...
Environment information
For bugs: reproduction and error logs