This PR adds more language/country coverage to `PII_regexes`.
The new values were derived with the snippet below.
from collections import defaultdict
import pycountry
from babel.languages import get_official_languages
# Derive two lookup tables:
#   country_2_lang:   lower-case ISO 3166-1 alpha-2 country code -> list of language codes
#   lang_2_countries: language code -> list of country codes
#
# NOTE(review): `country_2_lang_old` is not defined in this snippet; presumably
# it is the existing country -> language mapping from PII_regexes and must be
# in scope before this runs -- confirm.

country_2_lang_new = {}
missing_countries = []  # countries for which babel reports no official language
for c in pycountry.countries:
    country_code = c.alpha_2.lower()
    # Merge locales: keep only the part before the first "_" (split() returns
    # the whole string when there is no "_"), then de-duplicate.
    # Sorting makes the result reproducible; list(set(...)) yields an order
    # that can change from run to run because of string hash randomisation.
    langs = sorted(
        {
            lang.split("_")[0]
            for lang in get_official_languages(
                country_code, regional=False, de_facto=True
            )
        }
    )
    if langs:
        country_2_lang_new[country_code] = langs
    else:
        missing_countries.append(country_code)

# Normalise the old mapping in place so every value is a list of languages;
# bare string values are wrapped in a one-element list.
for k, v in country_2_lang_old.items():
    if isinstance(v, str):
        country_2_lang_old[k] = [v]

# Later unpacking wins: for a country present in both mappings the newly
# derived languages replace the old entry; old-only countries are kept.
country_2_lang = {**country_2_lang_old, **country_2_lang_new}

# Invert the merged mapping: language -> every country that lists it.
lang_2_countries = defaultdict(list)
for country, langs in country_2_lang.items():
    for lang in langs:
        lang_2_countries[lang].append(country)
This PR adds more lang/country coverage in PII_regexes. Values were derived from the snippet below.