In our newest project we are using a wrapped version of replaCy to support list variables in match_dicts, like so:
import json
import os
from typing import List
from replacy import ReplaceMatcher
from replacy.db import load_json as load_replacy_files_from_directory
# Absolute path of the directory that contains this module; the resource
# paths used below are joined onto it.
here = os.path.dirname(os.path.abspath(__file__))
class ModifiedReplaceMatcher:
    """Build a replaCy ``ReplaceMatcher`` whose match dict supports list variables.

    Match-dict files may contain placeholders of the form ``"$REF:<name>"``.
    On construction, every placeholder is replaced by the list of strings
    stored under ``<name>`` in ``resources/variables/vocab_refs.json``, so
    ``{"LOWER": {"IN": ["$REF:greetings"]}}`` becomes
    ``{"LOWER": {"IN": ["hello", "hi", "yo"]}}``.
    """

    def __init__(self):
        """Load the match dicts and vocab references, then expand the references."""
        rd_path = os.path.join(here, "resources/match_dicts")
        proto_match_dict = load_replacy_files_from_directory(rd_path)
        vocab_refs = self._load_vocab_refs("resources/variables/vocab_refs.json")
        # The fully expanded match dict handed to ReplaceMatcher in get_matcher().
        self.rmatch_dict = self._refine_match_dict(proto_match_dict, vocab_refs)

    def _load_vocab_refs(self, vocab_refs_path: str):
        """Read and parse the JSON file at ``vocab_refs_path``.

        Args:
            vocab_refs_path: Path relative to the directory of this module.

        Returns:
            The parsed JSON content (expected to map variable names to
            lists of strings).
        """
        file_path = os.path.join(here, vocab_refs_path)
        with open(file_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _remove_square_brackets_from_list_of_strings(self, l: List[str]) -> str:
        """Render ``l`` as the inside of a JSON array (no surrounding brackets).

        Turns ``["a", "b", "c"]`` into ``'"a", "b", "c"'``. Each item is
        serialised with ``json.dumps`` so that quotes, backslashes and control
        characters inside a word are escaped and the result is always valid
        JSON once spliced into a document.

        Args:
            l: The vocabulary items to render.

        Returns:
            The comma-separated JSON literals. An empty list renders as a
            single empty JSON string (``'""'``), matching the historical
            behaviour and keeping the surrounding array well-formed.
        """
        if not l:
            return '""'
        return ", ".join(json.dumps(item) for item in l)

    def _refine_match_dict(self, match_dict: dict, vocab_refs: dict) -> dict:
        """Replace every ``"$REF:<name>"`` placeholder by its vocab list.

        The match dict is serialised to JSON text, each placeholder string is
        textually replaced by the comma-separated items of the corresponding
        list, and the text is parsed back. Because the replacement happens at
        the text level, a placeholder is meant to be used as an element of a
        JSON array, e.g. ``{"LOWER": {"IN": ["$REF:name"]}}``.

        This should probably be replaCy functionality, and fancier parsing
        would allow richer features.

        Args:
            match_dict: The raw match dict, possibly containing placeholders.
            vocab_refs: Mapping of variable name to list of strings.

        Returns:
            A new match dict with the placeholders expanded.

        Raises:
            json.JSONDecodeError: If an expansion yields invalid JSON (for
                example a multi-item list substituted outside of an array).
        """
        r_matcher_str = json.dumps(match_dict)
        for ref_id, ref_list in vocab_refs.items():
            ref_list_str = self._remove_square_brackets_from_list_of_strings(ref_list)
            # Serialise the placeholder exactly as json.dumps wrote it above,
            # so ids needing escapes (quotes, non-ASCII) still match.
            target = json.dumps(f"$REF:{ref_id}")
            r_matcher_str = r_matcher_str.replace(target, ref_list_str)
        return json.loads(r_matcher_str)

    def get_matcher(self, nlp, kenlm_path):
        """Return a ``ReplaceMatcher`` built from the expanded match dict.

        Args:
            nlp: Passed through as the first argument of ``ReplaceMatcher``.
            kenlm_path: Passed through as ``lm_path``.
        """
        return ReplaceMatcher(nlp, match_dict=self.rmatch_dict, lm_path=kenlm_path)
Where resources/variables/vocab_refs.json would have an entry like:
{
"variable-name": [
"hello",
"hi",
"yo"
]
}
This allows for a match dict syntax like:
[
{"LOWER":{"IN": ["$REF:variable-name"]}}
]
which is convenient for frequently-used lists of words. Lists are easier than dicts though.
In our newest project we are using a wrapped version of replacy to support list variables in match_dicts, like so
Where resources/variables/vocab_refs.json would have an entry like the one shown above. This allows for a match dict syntax like the one shown above,
which is convenient for frequently-used lists of words. Lists are easier than dicts though.