Add cleaning of HTML - Githubissues

It would be nice to be able to use HTML files as the corpus. Here is code for converting HTML markup to plain text, which could possibly be turned into an HTML parser in Sparv:

import re

from bs4 import BeautifulSoup

def clean_html(html, text_element='body', html_parser='lxml', keep_paragraphs=False):
    """Convert HTML markup to whitespace-normalised plain text.

    Args:
        html: The HTML markup to convert (anything BeautifulSoup accepts).
        text_element: Name of the element whose text is extracted. A falsy
            value, or a name that matches no element in the document, means
            that the whole parsed document is used.
        html_parser: Name of the parser handed to BeautifulSoup.
        keep_paragraphs: If true, the text of block-level elements is
            separated by blank lines ("\\n\\n"); otherwise all text ends up
            in one single paragraph.

    Returns:
        The extracted text after it has been passed through
        clean_whitespace().
    """
    soup = BeautifulSoup(html, html_parser)
    root = soup
    if text_element:
        # find() returns None when nothing matches (e.g. empty input, or a
        # fragment without <body> parsed by 'html.parser'). Fall back to the
        # whole document instead of failing with AttributeError further down.
        found = soup.find(text_element)
        if found is not None:
            root = found
    if keep_paragraphs:
        # Dissolve every element that is not block-level, so that only
        # block-level elements remain as boundaries between text nodes.
        for elem in root.find_all():
            if elem.name not in _block_level_elems:
                elem.unwrap()
        # Consolidate the adjacent strings left behind by unwrap(); otherwise
        # get_text() would put a separator in the middle of a paragraph.
        root.smooth()
        text = root.get_text('\n\n', strip=True)
    else:
        text = ''.join(root.strings)
    return clean_whitespace(text, keep_paragraphs)

def clean_whitespace(text, keep_paragraphs=False):
    """Collapse every run of whitespace in *text* into a single space.

    Args:
        text: The string to normalise.
        keep_paragraphs: If true, *text* is first split on
            regex_paragraph_break and each piece is normalised separately;
            otherwise the whole text is treated as one paragraph.

    Returns:
        The non-empty normalised paragraphs joined by "\\n\\n".
    """
    chunks = regex_paragraph_break.split(text) if keep_paragraphs else [text]
    # str.split() without arguments splits on any whitespace and drops
    # leading/trailing whitespace, so re-joining collapses and trims at once.
    normalised = (' '.join(chunk.split()) for chunk in chunks)
    # filter(None, ...) discards paragraphs that became empty strings.
    return '\n\n'.join(filter(None, normalised))

# Paragraph separator: two line breaks, optionally with other whitespace
# before, between and after them.
regex_paragraph_break = re.compile(
    r'\s*\n\s*\n\s*',
    flags=re.UNICODE,
)

# HTML block-level elements, taken from:
# https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
# I've also added <br> as a block-level element, because that's how it's often used

_block_level_elems = set('''
  address article aside blockquote details dialog 
  dd div dl dt
  fieldset figcaption figure footer form
  h1 h2 h3 h4 h5 h6
  header hgroup hr li main nav ol
  p pre section table ul br
'''.split())

If keep_paragraphs is True, the text of each block-level element is separated from the next by "\n\n", so that Sparv can split the text into paragraphs.

spraakbanken / sparv-pipeline

Add cleaning of HTML #102