One simple solution is to use a wrapper around simplifier(source_filepath, pred_filepath)
that takes a string as input and outputs a string, as you mentioned in #8.
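For instance, a rough sketch of such a wrapper (assuming simplifier reads one sentence per line from source_filepath and writes one prediction per line to pred_filepath, both UTF-8):

import tempfile
from pathlib import Path

def simplify_string(text):
    # Write the input string to a temp source file, run the file-based
    # simplifier, then read the prediction back as a string.
    with tempfile.TemporaryDirectory() as tmp_dir:
        source_filepath = Path(tmp_dir) / "source.txt"
        pred_filepath = Path(tmp_dir) / "pred.txt"
        source_filepath.write_text(text + "\n", encoding="utf8")
        simplifier(source_filepath, pred_filepath)
        return pred_filepath.read_text(encoding="utf8").strip()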
Another solution is to use some fairseq code directly, but I am not sure it will be better, so you might not want to go the extra mile for it.
Here is some dirty code for that, which might not work, just for inspiration:
# Imports needed by the snippet below (the preprocessor import path is a guess, adjust it to this repo's layout).
import os
from collections import namedtuple
from functools import lru_cache
from subprocess import PIPE, Popen

import numpy as np
import torch

import fairseq.data
from fairseq import checkpoint_utils, options, tasks, utils
from fairseq.sequence_generator import SequenceGenerator

from preprocessors import (  # assumed location of the repo's preprocessor classes
    BPEPreprocessor,
    ComposedPreprocessor,
    DependencyTreeDepthRatioPreprocessor,
    FKGLRatioPreprocessor,
    LengthRatioPreprocessor,
    LevenshteinPreprocessor,
    WordRankRatioPreprocessor,
)


class StreamTextSimplification:
Batch = namedtuple("Batch", "srcs sample")
Translation = namedtuple("Translation", "src_str hypos pos_scores alignments")
Result = namedtuple("Result", "translation score")
def __init__(self):
self.model_path = "text_simplification/model"
self.checkpoint_path = os.path.join(self.model_path, "checkpoint_last.pt")
input_args = [
self.model_path,
"--path",
self.checkpoint_path,
"--source-lang",
"complex",
"--target-lang",
"simple",
"--beam",
"8",
"--nbest",
"1",
"--lenpen",
"1.0",
"--diverse-beam-groups",
"-1",
"--diverse-beam-strength",
"0.5",
"--print-alignment",
"--gen-subset",
"tmp",
"--model-overrides",
'{"encoder_embed_path": None, "decoder_embed_path": None}',
"--cpu",
]
parser = options.get_generation_parser(interactive=True)
args = options.parse_args_and_arch(parser, input_args)
self.args = args
if args.buffer_size < 1:
args.buffer_size = 1
if args.max_tokens is None and args.max_sentences is None:
args.max_sentences = 1
assert (
not args.max_sentences or args.max_sentences <= args.buffer_size
), "--max-sentences/--batch-size cannot be larger than --buffer-size"
print(args)
self.use_cuda = torch.cuda.is_available() and not args.cpu
# Setup task, e.g., translation
self.task = tasks.setup_task(args)
# Load ensemble
print("| loading model(s) from {}".format(args.path))
model_paths = args.path.split(":")
self.models, model_args = checkpoint_utils.load_model_ensemble(
model_paths, arg_overrides=eval(args.model_overrides), task=self.task
)
# Set dictionaries
self.src_dict = self.task.source_dictionary
self.tgt_dict = self.task.target_dictionary
# Optimize ensemble for generation
for model in self.models:
model.make_generation_fast_(
beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
need_attn=args.print_alignment,
)
if args.fp16:
model.half()
if self.use_cuda:
model.cuda()
# Initialize generator
self.translator = SequenceGenerator(
tgt_dict=self.tgt_dict,
beam_size=args.beam,
normalize_scores=(not args.unnormalized),
len_penalty=args.lenpen,
unk_penalty=args.unkpen,
temperature=args.temperature,
max_len_a=args.max_len_a,
max_len_b=args.max_len_b,
min_len=args.min_len,
)
print(self.translator)
# Load alignment dictionary for unknown word replacement
# (None if no unknown word replacement, empty if no path to align dictionary)
self.align_dict = utils.load_align_dict(args.replace_unk)
self.bpe_codes_path = "bpe_codes_70000"
self.mosesdecoder_path = "mosesdecoder-master/scripts/tokenizer"
self.normalize_punctuation_path = os.path.join(
self.mosesdecoder_path, "normalize-punctuation.perl"
)
self.tokenizer_path = os.path.join(self.mosesdecoder_path, "tokenizer.perl")
self.detokenizer_path = os.path.join(self.mosesdecoder_path, "detokenizer.perl")
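    # The three helpers below pipe text through the Moses perl scripts
    # (punctuation normalization, tokenization, detokenization).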
def normalize_punctuation(self, text):
process = Popen(
["perl", self.normalize_punctuation_path, "-l", "en"],
stdout=PIPE,
stdin=PIPE,
stderr=PIPE,
encoding="utf8",
)
return process.communicate(text)[0].strip()
def tokenize(self, text):
process = Popen(
["perl", self.tokenizer_path, "-a", "-l", "en", "-q", "-no-escape"],
stdout=PIPE,
stdin=PIPE,
stderr=PIPE,
encoding="utf8",
)
return process.communicate(text)[0].strip()
def detokenize(self, text):
process = Popen(
["perl", self.detokenizer_path, "-q"],
stdout=PIPE,
stdin=PIPE,
stderr=PIPE,
encoding="utf8",
)
return process.communicate(text)[0].strip()
@lru_cache(maxsize=1000)
def translate(
self,
text,
dep_tree_depth_ratio=0.95,
wordrank_ratio=0.8,
length_ratio=0.9,
levenshtein_ratio=0.85,
fkgl_ratio=0.4,
):
preprocessors = [
DependencyTreeDepthRatioPreprocessor(
bucket_size=0.05, target_ratio=dep_tree_depth_ratio
),
WordRankRatioPreprocessor(bucket_size=0.05, target_ratio=wordrank_ratio),
LengthRatioPreprocessor(bucket_size=0.05, target_ratio=length_ratio),
LevenshteinPreprocessor(bucket_size=0.05, target_ratio=levenshtein_ratio),
FKGLRatioPreprocessor(bucket_size=0.05, target_ratio=fkgl_ratio),
BPEPreprocessor(bpe_codes_path=self.bpe_codes_path, n_bpe_codes=70000),
]
composed_preprocessor = ComposedPreprocessor(preprocessors)
text = self.tokenize(self.normalize_punctuation(text))
inputs = [composed_preprocessor.encode_sentence(text)]
args = self.args
def make_result(src_str, hypos):
result = self.Translation(
src_str="O\t{}".format(src_str), hypos=[], pos_scores=[], alignments=[]
)
# Process top predictions
for hypo in hypos[: min(len(hypos), args.nbest)]:
hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
hypo_tokens=hypo["tokens"].int().cpu(),
src_str=src_str,
alignment=hypo["alignment"].int().cpu()
if hypo["alignment"] is not None
else None,
align_dict=self.align_dict,
tgt_dict=self.tgt_dict,
remove_bpe=args.remove_bpe,
)
result.hypos.append(hypo_str)
result.pos_scores.append(hypo["score"])
result.alignments.append(
"A\t{}".format(
" ".join(map(lambda x: str(utils.item(x)), alignment))
)
if alignment is not None
else None
)
return result
def process_batch(batch):
if self.use_cuda:
batch.sample["net_input"]["src_tokens"] = batch.sample["net_input"][
"src_tokens"
].cuda()
batch.sample["net_input"]["src_lengths"] = batch.sample["net_input"][
"src_lengths"
].cuda()
translations = self.translator.generate(
models=self.models, sample=batch.sample
)
return [make_result(batch.srcs[i], t) for i, t in enumerate(translations)]
indices = []
results = []
for batch, batch_indices in self.make_batches(
inputs, args, self.src_dict, self.models[0].max_positions()
):
indices.extend(batch_indices)
results += process_batch(batch)
for i in np.argsort(indices):
result = results[i]
print(result.src_str)
for hypo, pos_score, align in zip(
result.hypos, result.pos_scores, result.alignments
):
print(hypo)
print(pos_score)
print(align)
translation = self.detokenize(
composed_preprocessor.decode_sentence(hypo)
)
return self.Result(translation=translation, score=pos_score)
def make_batches(self, lines, args, src_dict, max_positions):
tokens = [
src_dict.encode_line(src_str, add_if_not_exist=False).long()
for src_str in lines
]
lengths = np.array([t.numel() for t in tokens])
itr = self.task.get_batch_iterator(
dataset=fairseq.data.LanguagePairDataset(tokens, lengths, src_dict),
max_tokens=args.max_tokens,
max_sentences=args.max_sentences,
max_positions=max_positions,
).next_epoch_itr(shuffle=False)
for batch in itr:
yield self.Batch(srcs=[lines[i] for i in batch["id"]], sample=batch), batch[
"id"
]
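With something like the above, usage would roughly be (untested, names as in the sketch):

simplifier_service = StreamTextSimplification()
result = simplifier_service.translate("Some complex news title to simplify.")
print(result.translation, result.score)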
I implemented your model for sentence simplification, and I would like to test it on streams of news titles and see the output. However, I have problems when using it as an API, because it generates input, output, and some temp files for each title. So I was wondering whether it is possible to use it as a service, i.e. the input would be a string (UTF-8 encoded) and the model would return a simplified string as output. Any help on this?