zfallahnejad opened 5 years ago
Here's an example I made for a character-level vocabulary: https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/algorithmic_math_two_variables.py
Thanks. I defined two text2text problems as follows:
```python
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_problems
from tensor2tensor.models import transformer
from tensor2tensor.utils import registry


@registry.register_problem
class T2tChatbot(text_problems.Text2TextProblem):

  @property
  def approx_vocab_size(self):
    return 2**13  # ~8k

  @property
  def is_generate_per_split(self):
    # generate_data will shard the data into TRAIN and EVAL for us.
    return False

  @property
  def dataset_splits(self):
    """Splits of data to produce and number of output shards for each."""
    # 10% evaluation data
    return [{
        "split": problem.DatasetSplit.TRAIN,
        "shards": 90,
    }, {
        "split": problem.DatasetSplit.EVAL,
        "shards": 10,
    }]

  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    with open('data/joorak2_emoji/raw2_emoji.txt', 'r') as rawfp:
      for line in rawfp:
        question, answer = line.strip().split('\t')
        if len(question) > 0 and len(answer) > 0:
          yield {
              "inputs": question,
              "targets": answer,
          }


# Smaller than the typical translate model, and with more regularization.
@registry.register_hparams
def transformer_chatbot():
  hparams = transformer.transformer_base()
  hparams.num_hidden_layers = 2
  hparams.hidden_size = 128
  hparams.filter_size = 512
  hparams.num_heads = 4
  hparams.attention_dropout = 0.6
  hparams.layer_prepostprocess_dropout = 0.6
  hparams.learning_rate = 0.05
  return hparams
```
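In case it is useful: the registry derives problem names by snake_casing the class name, so this problem registers as `t2t_chatbot`, while the hparams set is registered under the function name `transformer_chatbot`. A quick sanity check (a sketch, assuming the module defining the class has already been imported, e.g. via `--t2t_usr_dir`):

```python
from tensor2tensor.utils import registry

# Problem names are the snake_case form of the class name,
# so T2tChatbot is looked up as "t2t_chatbot".
chatbot_problem = registry.problem("t2t_chatbot")
print(type(chatbot_problem).__name__)  # T2tChatbot
```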
And the second one, which switches to a character vocabulary (trained with the same `transformer_chatbot` hparams as above):

```python
# Same imports as above.
@registry.register_problem
class T2tChatbot(text_problems.Text2TextProblem):

  @property
  def vocab_type(self):
    return text_problems.VocabType.CHARACTER

  # @property
  # def approx_vocab_size(self):
  #   return 2**13  # ~8k

  @property
  def is_generate_per_split(self):
    # generate_data will shard the data into TRAIN and EVAL for us.
    return False

  @property
  def dataset_splits(self):
    """Splits of data to produce and number of output shards for each."""
    # 10% evaluation data
    return [{
        "split": problem.DatasetSplit.TRAIN,
        "shards": 90,
    }, {
        "split": problem.DatasetSplit.EVAL,
        "shards": 10,
    }]

  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    with open('data/joorak2_emoji/raw2_emoji.txt', 'r') as rawfp:
      for line in rawfp:
        question, answer = line.strip().split('\t')
        if len(question) > 0 and len(answer) > 0:
          yield {
              "inputs": question,
              "targets": answer,
          }
```
The first one uses the subword vocabulary and generates very good outputs after 50k steps. The second one uses the character vocabulary and generates one fixed word (like "ok" or "yes") for every input, even after 150k steps. Do you know why?
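One thing worth checking is sequence length: with `VocabType.CHARACTER`, every example becomes several times longer in tokens, so a 2-layer model with 0.6 dropout faces a much harder task at 150k steps than the subword model does at 50k. A rough way to see the difference (a sketch; `ByteTextEncoder` is the encoder the character vocab type uses, and the word count is just a stand-in for subword length):

```python
from tensor2tensor.data_generators import text_encoder

sample = "how are you doing today?"

# The CHARACTER vocab type uses ByteTextEncoder: one id per byte.
char_ids = text_encoder.ByteTextEncoder().encode(sample)
print(len(char_ids))  # 24 -- every byte becomes a timestep

# A subword vocab covers whole words or word pieces, so the same
# sentence is only a handful of ids (roughly one per word here).
print(len(sample.split()))  # 5 -- a rough proxy for subword length
```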
Following this tutorial (https://cloud.google.com/blog/products/gcp/cloud-poetry-training-and-hyperparameter-tuning-custom-text-models-on-cloud-ml-engine), I have trained a Text2Text model for a chatbot. The `VocabType` of this model is set to subword. Why? Why does subword work better? How can I change the vocabulary type to `Character`?
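For the last question: on any `Text2TextProblem` subclass, the vocabulary type comes from the `vocab_type` property, which defaults to `VocabType.SUBWORD`. A minimal sketch of overriding it (the class name below is hypothetical; apply the override to the tutorial's own problem class):

```python
from tensor2tensor.data_generators import text_problems
from tensor2tensor.utils import registry


@registry.register_problem
class MyCharChatbot(text_problems.Text2TextProblem):  # hypothetical name
  """Same problem as the tutorial's, but with a character vocabulary."""

  @property
  def vocab_type(self):
    # Overrides the default (VocabType.SUBWORD); no approx_vocab_size
    # is needed, since the character vocabulary is fixed.
    return text_problems.VocabType.CHARACTER

  # ... keep is_generate_per_split, dataset_splits and generate_samples
  # exactly as in the tutorial's problem class ...
```

After switching the vocab type, the data must be regenerated with t2t-datagen, since the existing TFRecords were encoded with the subword vocabulary.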