huu4ontocord / rio

Text pre-processing for NLP datasets
Apache License 2.0
11 stars 6 forks source link

Update language:country mappings #15

Closed j-chim closed 2 years ago

j-chim commented 2 years ago

This PR adds more lang/country coverage in PII_regexes. Values were derived from the snippet below.

from collections import defaultdict
import pycountry
from babel.languages import get_official_languages

# Build {lower-cased alpha-2 country code: [language codes]} for every entry in
# pycountry.countries, and collect the countries for which babel returns no
# language in missing_countries.
country_2_lang_new = dict()
missing_countries = []
for c in pycountry.countries:
    country_code = c.alpha_2.lower()
    # Merge locales: keep only the part before the first "_". Codes without
    # an underscore pass through unchanged, because split() then returns the
    # whole code as its single element.
    langs = [
        lang.split("_")[0]
        for lang in get_official_languages(country_code, regional=False, de_facto=True)
    ]
    if langs:
        # dict.fromkeys de-duplicates while keeping first-seen order.
        # list(set(...)) made the order depend on string hashing, so the
        # derived mapping could differ from one run to the next.
        country_2_lang_new[country_code] = list(dict.fromkeys(langs))
    else:
        missing_countries.append(country_code)

# Normalise the old mapping in place: a value stored as a bare string becomes
# a one-element list, so every value in the merged mapping is a list.
country_2_lang_old.update(
    {
        code: [single_lang]
        for code, single_lang in country_2_lang_old.items()
        if isinstance(single_lang, str)
    }
)

# Start from the old entries, then let the newly derived ones override them
# wherever both mappings contain the same country code.
country_2_lang = dict(country_2_lang_old)
country_2_lang.update(country_2_lang_new)

# Invert the mapping: for each language, list the countries whose entry in
# country_2_lang contains it, in the iteration order of country_2_lang.
lang_2_countries = defaultdict(list)
lang_country_pairs = (
    (spoken, territory)
    for territory, spoken_langs in country_2_lang.items()
    for spoken in spoken_langs
)
for spoken, territory in lang_country_pairs:
    lang_2_countries[spoken].append(territory)