writer / replaCy

spaCy match and replace, maintaining conjugation
https://pypi.org/project/replacy/
MIT License
34 stars 8 forks source link

Support list variables in match_dicts #72

Open sam-writer opened 4 years ago

sam-writer commented 4 years ago

In our newest project we are using a wrapped version of replaCy that supports list variables in match_dicts, like so:

import json
import os
from typing import List

from replacy import ReplaceMatcher
from replacy.db import load_json as load_replacy_files_from_directory

# Absolute path of the directory containing this module; resource paths are joined onto it.
here = os.path.dirname(os.path.abspath(__file__))

class ModifiedReplaceMatcher:
    """Wrap replaCy's ``ReplaceMatcher`` with support for list variables.

    A string of the form ``"$REF:<name>"`` inside a match dict is a reference
    to the list stored under ``<name>`` in the vocab refs JSON file. When the
    reference is an element of a list, the referenced values are spliced into
    that list in its place, so ``{"IN": ["$REF:greetings"]}`` becomes
    ``{"IN": ["hello", "hi", "yo"]}``.

    The expanded match dict is stored on ``self.rmatch_dict``.
    """

    # Prefix that marks a string as a reference into the vocab refs mapping.
    _REF_PREFIX = "$REF:"

    def __init__(self):
        """Load the match dicts and vocab refs from disk and expand the refs."""
        rd_path = os.path.join(here, "resources/match_dicts")
        proto_match_dict = load_replacy_files_from_directory(rd_path)
        vocab_refs = self._load_vocab_refs("resources/variables/vocab_refs.json")
        self.rmatch_dict = self._refine_match_dict(proto_match_dict, vocab_refs)

    def _load_vocab_refs(self, vocab_refs_path: str):
        """Parse and return the JSON file at ``vocab_refs_path``.

        The path is resolved relative to the directory of this module.
        """
        file_path = os.path.join(here, vocab_refs_path)
        with open(file_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _remove_square_brackets_from_list_of_strings(self, l: List[str]) -> str:
        """Render ``l`` as the inside of a JSON array (no surrounding brackets).

        Turns ``["a", "b", "c"]`` into ``'"a", "b", "c"'``. Every item is
        JSON-encoded, so quotes and backslashes inside an item are escaped
        and the fragment stays valid JSON. An empty list gives ``''``.

        ``_refine_match_dict`` no longer relies on this helper; it is kept so
        existing callers keep working.
        """
        return ", ".join(json.dumps(item, ensure_ascii=False) for item in l)

    def _refine_match_dict(self, match_dict: dict, vocab_refs: dict) -> dict:
        """Return a copy of ``match_dict`` with every ``$REF:<name>`` expanded.

        Expansion works on the parsed structure rather than on serialized
        text, so it is correct for values that contain quotes or backslashes,
        for non-ASCII reference names, and for empty lists (which contribute
        no elements).

        Rules:
          * a reference that is an element of a list is replaced by all the
            referenced values, spliced in at that position;
          * referenced values may contain references themselves; a reference
            that would recurse into itself is left as the literal string;
          * a reference used as a plain value (e.g. directly as a dict value)
            is replaced only if it resolves to exactly one value;
          * strings naming an unknown reference, and dict keys, are untouched.

        Args:
            match_dict: JSON-serializable match dict, possibly with references.
            vocab_refs: mapping of reference name to list of values.

        Returns:
            A new dict; ``match_dict`` itself is not modified.

        Raises:
            ValueError: a reference used as a plain value resolves to zero or
                several values and therefore cannot replace a single value.
            TypeError: ``match_dict`` is not JSON-serializable.
        """
        # The JSON round trip gives an independent deep copy made only of
        # JSON types (dict / list / scalars), as earlier versions returned.
        normalized = json.loads(json.dumps(match_dict))
        return self._expand_refs(normalized, vocab_refs, ())

    def _resolve_ref(self, value, vocab_refs: dict, active: tuple):
        """Return the reference name held by ``value``, or None if not expandable.

        ``active`` holds the names currently being expanded; a name already in
        it is reported as not expandable so circular references terminate.
        """
        if not isinstance(value, str) or not value.startswith(self._REF_PREFIX):
            return None
        name = value[len(self._REF_PREFIX):]
        if name in active or name not in vocab_refs:
            return None
        return name

    def _expand_refs(self, node, vocab_refs: dict, active: tuple):
        """Recursively rebuild ``node`` with references replaced by their values."""
        if isinstance(node, dict):
            return {
                key: self._expand_refs(value, vocab_refs, active)
                for key, value in node.items()
            }

        if isinstance(node, list):
            expanded = []
            for item in node:
                name = self._resolve_ref(item, vocab_refs, active)
                if name is None:
                    expanded.append(self._expand_refs(item, vocab_refs, active))
                else:
                    # Splice the referenced values in place of the reference.
                    expanded.extend(
                        self._expand_refs(
                            list(vocab_refs[name]), vocab_refs, active + (name,)
                        )
                    )
            return expanded

        name = self._resolve_ref(node, vocab_refs, active)
        if name is None:
            return node

        # A reference outside a list has room for exactly one value.
        values = self._expand_refs(
            list(vocab_refs[name]), vocab_refs, active + (name,)
        )
        if len(values) != 1:
            raise ValueError(
                f"{node!r} resolves to {len(values)} values and can only be "
                "expanded as an element of a list"
            )
        return values[0]

    def get_matcher(self, nlp, kenlm_path):
        """Build a ``ReplaceMatcher`` from ``nlp``, the expanded match dict and ``kenlm_path``."""
        return ReplaceMatcher(nlp, match_dict=self.rmatch_dict, lm_path=kenlm_path)

Here, resources/variables/vocab_refs.json would have an entry like:

{
  "variable-name": [
    "hello",
    "hi",
    "yo"
  ]
}

This allows for a match dict syntax like:

[
  {"LOWER":{"IN": ["$REF:variable-name"]}}
]

This is convenient for frequently used lists of words. Note that list variables are easier to support than dict variables, though.