facebookresearch / access

Code to reproduce the experiments from the paper.

Using ACCESS as a streamline service?! #7

Closed: igormis closed this issue 4 years ago

igormis commented 4 years ago

I implemented your model for sentence simplification and would like to test it on a stream of news titles and see the output. However, I run into problems when using it as an API, because for each title it generates input, output, and some temp files. I was wondering whether it is possible to use it as a service, i.e. the input would be a (UTF-8 encoded) string and the model would return a simplified string as output. Any help on this?

louismartin commented 4 years ago

One simple solution is to use a wrapper around simplifier(source_filepath, pred_filepath) that takes a string as input and outputs a string, as you mentioned in #8.
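
A minimal sketch of such a wrapper, assuming a simplifier(source_filepath, pred_filepath) callable with the file-to-file interface mentioned above (the helper name and temp-file handling here are illustrative, not the repo's exact API):

import tempfile
from pathlib import Path

def simplify_string(simplifier, text):
    # Round-trip through temporary files: write the input sentence to a source
    # file, run the file-based simplifier, then read the prediction back.
    with tempfile.TemporaryDirectory() as tmp_dir:
        source_filepath = Path(tmp_dir) / "source.txt"
        pred_filepath = Path(tmp_dir) / "pred.txt"
        source_filepath.write_text(text + "\n", encoding="utf8")
        simplifier(source_filepath, pred_filepath)  # assumed callable from #8
        return pred_filepath.read_text(encoding="utf8").strip()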

Another solution is to use some fairseq code directly, but I am not sure it is going to be better, so you might not want to go the extra mile for it.

Here is some dirty code for that, which might not be working, for inspiration:

# Imports assumed by the snippet below; the preprocessor import path is a guess,
# adjust it to wherever these classes are defined in your setup
# (FKGLRatioPreprocessor in particular may be a custom addition).
import os
from collections import namedtuple
from functools import lru_cache
from subprocess import PIPE, Popen

import numpy as np
import torch

import fairseq.data
from fairseq import checkpoint_utils, options, tasks, utils
from fairseq.sequence_generator import SequenceGenerator

from access.preprocessors import (
    BPEPreprocessor,
    ComposedPreprocessor,
    DependencyTreeDepthRatioPreprocessor,
    FKGLRatioPreprocessor,
    LengthRatioPreprocessor,
    LevenshteinPreprocessor,
    WordRankRatioPreprocessor,
)


class StreamTextSimplification:
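    # Lightweight containers for batches, per-sentence translations, and the
    # final result (in the style of fairseq's interactive.py).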
    Batch = namedtuple("Batch", "srcs sample")
    Translation = namedtuple("Translation", "src_str hypos pos_scores alignments")
    Result = namedtuple("Result", "translation score")

    def __init__(self):
            self.model_path = "text_simplification/model"
            self.checkpoint_path = os.path.join(self.model_path, "checkpoint_last.pt")

            input_args = [
                self.model_path,
                "--path",
                self.checkpoint_path,
                "--source-lang",
                "complex",
                "--target-lang",
                "simple",
                "--beam",
                "8",
                "--nbest",
                "1",
                "--lenpen",
                "1.0",
                "--diverse-beam-groups",
                "-1",
                "--diverse-beam-strength",
                "0.5",
                "--print-alignment",
                "--gen-subset",
                "tmp",
                "--model-overrides",
                '{"encoder_embed_path": None, "decoder_embed_path": None}',
                "--cpu",
            ]

            parser = options.get_generation_parser(interactive=True)
            args = options.parse_args_and_arch(parser, input_args)
            self.args = args

            if args.buffer_size < 1:
                args.buffer_size = 1
            if args.max_tokens is None and args.max_sentences is None:
                args.max_sentences = 1

            assert (
                not args.max_sentences or args.max_sentences <= args.buffer_size
            ), "--max-sentences/--batch-size cannot be larger than --buffer-size"

            print(args)

            self.use_cuda = torch.cuda.is_available() and not args.cpu

            # Setup task, e.g., translation
            self.task = tasks.setup_task(args)

            # Load ensemble
            print("| loading model(s) from {}".format(args.path))
            model_paths = args.path.split(":")
            self.models, model_args = checkpoint_utils.load_model_ensemble(
                model_paths, arg_overrides=eval(args.model_overrides), task=self.task
            )

            # Set dictionaries
            self.src_dict = self.task.source_dictionary
            self.tgt_dict = self.task.target_dictionary

            # Optimize ensemble for generation
            for model in self.models:
                model.make_generation_fast_(
                    beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
                    need_attn=args.print_alignment,
                )
                if args.fp16:
                    model.half()
                if self.use_cuda:
                    model.cuda()

            # Initialize generator
            self.translator = SequenceGenerator(
                tgt_dict=self.tgt_dict,
                beam_size=args.beam,
                normalize_scores=(not args.unnormalized),
                len_penalty=args.lenpen,
                unk_penalty=args.unkpen,
                temperature=args.temperature,
                max_len_a=args.max_len_a,
                max_len_b=args.max_len_b,
                min_len=args.min_len,
            )

            print(self.translator)

            # Load alignment dictionary for unknown word replacement
            # (None if no unknown word replacement, empty if no path to align dictionary)
            self.align_dict = utils.load_align_dict(args.replace_unk)
            self.bpe_codes_path = "bpe_codes_70000"
            self.mosesdecoder_path = "mosesdecoder-master/scripts/tokenizer"
            self.normalize_punctuation_path = os.path.join(
                self.mosesdecoder_path, "normalize-punctuation.perl"
            )
            self.tokenizer_path = os.path.join(self.mosesdecoder_path, "tokenizer.perl")
            self.detokenizer_path = os.path.join(self.mosesdecoder_path, "detokenizer.perl")

    def normalize_punctuation(self, text):
        process = Popen(
            ["perl", self.normalize_punctuation_path, "-l", "en"],
            stdout=PIPE,
            stdin=PIPE,
            stderr=PIPE,
            encoding="utf8",
        )
        return process.communicate(text)[0].strip()

    def tokenize(self, text):
        process = Popen(
            ["perl", self.tokenizer_path, "-a", "-l", "en", "-q", "-no-escape"],
            stdout=PIPE,
            stdin=PIPE,
            stderr=PIPE,
            encoding="utf8",
        )
        return process.communicate(text)[0].strip()

    def detokenize(self, text):
        process = Popen(
            ["perl", self.detokenizer_path, "-q"],
            stdout=PIPE,
            stdin=PIPE,
            stderr=PIPE,
            encoding="utf8",
        )
        return process.communicate(text)[0].strip()

    @lru_cache(maxsize=1000)
    def translate(
        self,
        text,
        dep_tree_depth_ratio=0.95,
        wordrank_ratio=0.8,
        length_ratio=0.9,
        levenshtein_ratio=0.85,
        fkgl_ratio=0.4,
    ):
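        # Each *Ratio preprocessor prepends its control token with the given
        # target ratio to the source sentence; BPE is applied last.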
        preprocessors = [
            DependencyTreeDepthRatioPreprocessor(
                bucket_size=0.05, target_ratio=dep_tree_depth_ratio
            ),
            WordRankRatioPreprocessor(bucket_size=0.05, target_ratio=wordrank_ratio),
            LengthRatioPreprocessor(bucket_size=0.05, target_ratio=length_ratio),
            LevenshteinPreprocessor(bucket_size=0.05, target_ratio=levenshtein_ratio),
            FKGLRatioPreprocessor(bucket_size=0.05, target_ratio=fkgl_ratio),
            BPEPreprocessor(bpe_codes_path=self.bpe_codes_path, n_bpe_codes=70000),
        ]

        composed_preprocessor = ComposedPreprocessor(preprocessors)
        text = self.tokenize(self.normalize_punctuation(text))
        inputs = [composed_preprocessor.encode_sentence(text)]

        args = self.args

        def make_result(src_str, hypos):
            result = self.Translation(
                src_str="O\t{}".format(src_str), hypos=[], pos_scores=[], alignments=[]
            )

            # Process top predictions
            for hypo in hypos[: min(len(hypos), args.nbest)]:
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo["tokens"].int().cpu(),
                    src_str=src_str,
                    alignment=hypo["alignment"].int().cpu()
                    if hypo["alignment"] is not None
                    else None,
                    align_dict=self.align_dict,
                    tgt_dict=self.tgt_dict,
                    remove_bpe=args.remove_bpe,
                )
                result.hypos.append(hypo_str)
                result.pos_scores.append(hypo["score"])
                result.alignments.append(
                    "A\t{}".format(
                        " ".join(map(lambda x: str(utils.item(x)), alignment))
                    )
                    if alignment is not None
                    else None
                )
            return result

        def process_batch(batch):
            if self.use_cuda:
                batch.sample["net_input"]["src_tokens"] = batch.sample["net_input"][
                    "src_tokens"
                ].cuda()
                batch.sample["net_input"]["src_lengths"] = batch.sample["net_input"][
                    "src_lengths"
                ].cuda()
            translations = self.translator.generate(
                models=self.models, sample=batch.sample
            )

            return [make_result(batch.srcs[i], t) for i, t in enumerate(translations)]

        indices = []
        results = []
        for batch, batch_indices in self.make_batches(
            inputs, args, self.src_dict, self.models[0].max_positions()
        ):
            indices.extend(batch_indices)
            results += process_batch(batch)

        for i in np.argsort(indices):
            result = results[i]
            print(result.src_str)
            for hypo, pos_score, align in zip(
                result.hypos, result.pos_scores, result.alignments
            ):
                print(hypo)
                print(pos_score)
                print(align)
                translation = self.detokenize(
                    composed_preprocessor.decode_sentence(hypo)
                )
                return self.Result(translation=translation, score=pos_score)

    def make_batches(self, lines, args, src_dict, max_positions):
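        # Binarize the input lines with the source dictionary, then batch them
        # with fairseq's iterator.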
        tokens = [
            src_dict.encode_line(src_str, add_if_not_exist=False).long()
            for src_str in lines
        ]
        lengths = np.array([t.numel() for t in tokens])
        itr = self.task.get_batch_iterator(
            dataset=fairseq.data.LanguagePairDataset(tokens, lengths, src_dict),
            max_tokens=args.max_tokens,
            max_sentences=args.max_sentences,
            max_positions=max_positions,
        ).next_epoch_itr(shuffle=False)
        for batch in itr:
            yield self.Batch(srcs=[lines[i] for i in batch["id"]], sample=batch), batch[
                "id"
            ]
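
For reference, a usage sketch (the input sentence is just an example; the model and BPE paths are the ones hard-coded in __init__ above):

if __name__ == "__main__":
    simplifier = StreamTextSimplification()
    result = simplifier.translate(
        "The incident took place shortly after the committee had convened."
    )
    print(result.translation)  # simplified, detokenized sentence
    print(result.score)        # score of the top hypothesis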