Make sure that annotation titles do not need to be NFD normalized and cleaned by systems themselves

Some annotations may require their titles to be cleaned in order to match the wiki dump. Fix.

from cleantext import clean
import unicodedata

def clean_title(text):
    """Return a cleaned version of an annotation page title.

    The title is percent-decoded, stripped of surrounding whitespace and
    then passed through ``cleantext.clean`` with the options listed below.

    Args:
        text: Raw page title, possibly percent-encoded.

    Returns:
        The cleaned title string.
    """
    # Imported locally: the module-level imports do not provide `unquote`,
    # so the original call raised NameError.
    from urllib.parse import unquote

    text = unquote(text)
    text = clean(
        text.strip(),
        fix_unicode=True,            # fix various unicode errors
        to_ascii=False,              # transliterate to closest ASCII representation
        lower=False,                 # lowercase text
        no_line_breaks=False,        # fully strip line breaks as opposed to only normalizing them
        no_urls=True,                # replace all URLs with a special token
        no_emails=False,             # replace all email addresses with a special token
        no_phone_numbers=False,      # replace all phone numbers with a special token
        no_numbers=False,            # replace all numbers with a special token
        no_digits=False,             # replace all digits with a special token
        no_currency_symbols=False,   # replace all currency symbols with a special token
        no_punct=False,              # remove punctuations
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang="en",                   # set to 'de' for German special handling
    )
    return text

def get_wiki_page_from_title(page, db):
    """Look up a wiki page by its (possibly uncleaned) annotation title.

    The title is cleaned with ``clean_title`` and NFD-normalized before it
    is used as the lookup key, so callers can pass annotation titles as-is.

    Args:
        page: Page title as it appears in the annotations.
        db: Object exposing ``get_doc_json(title)``.

    Returns:
        A ``WikiPage`` built from the normalized title and the value
        returned by ``db.get_doc_json``.
    """
    page = clean_title(page)
    # NFD normalization is applied after cleaning; presumably the database
    # keys are stored in NFD form — verify against the wiki dump.
    page = unicodedata.normalize('NFD', page)
    lines = db.get_doc_json(page)
    wiki_page = WikiPage(page, lines)
    # The original returned the undefined name `pa` (NameError); return the
    # page object that was just constructed.
    return wiki_page

Raldir / FEVEROUS

Make sure that annotation titles do not need to be NFD normalized and cleaned by systems themselves #2