It would be nice to be able to use HTML files as a corpus. Here is code for converting HTML markup to plain text, which could possibly be turned into an HTML parser in Sparv:
import re

from bs4 import BeautifulSoup
def clean_html(html, text_element='body', html_parser='lxml', keep_paragraphs=False):
    """Convert HTML markup to plain text.

    Args:
        html: The markup to convert; passed unchanged to ``BeautifulSoup``.
        text_element: Name of the element whose content should be extracted.
            A falsy value (``None`` or ``''``) means the whole document.
        html_parser: Name of the parser handed to ``BeautifulSoup``.
        keep_paragraphs: If true, the text of block-level elements (see
            ``_block_level_elems``) is separated by blank lines; otherwise
            all text is collapsed into a single paragraph.

    Returns:
        The extracted text with whitespace normalised by
        ``clean_whitespace``. An empty string is returned when
        ``text_element`` cannot be found in the document.
    """
    soup = BeautifulSoup(html, html_parser)
    root = soup.find(text_element) if text_element else soup
    if root is None:
        # find() yields None when the element is missing (e.g. empty input);
        # there is no text to extract, so avoid an AttributeError below.
        return ''

    if keep_paragraphs:
        # Dissolve every non-block element into its parent so that only
        # block-level elements remain as boundaries between text nodes.
        for elem in root.find_all():
            if elem.name not in _block_level_elems:
                elem.unwrap()
        # Merge the adjacent text nodes left behind by unwrap(), so the
        # separator is only inserted at block-level boundaries.
        root.smooth()
        text = root.get_text('\n\n', strip=True)
    else:
        # Strings are concatenated without a separator so that inline markup
        # inside a word does not split it; any spacing must come from the
        # document's own whitespace.
        text = ''.join(root.strings)
    return clean_whitespace(text, keep_paragraphs)
def clean_whitespace(text: str, keep_paragraphs: bool = False) -> str:
    """Normalise whitespace in ``text``.

    Every run of whitespace inside a paragraph is replaced by a single
    space, and leading/trailing whitespace is removed.

    Args:
        text: The text to clean.
        keep_paragraphs: If true, ``text`` is first split into paragraphs
            with ``regex_paragraph_break`` and the cleaned, non-empty
            paragraphs are joined with a blank line (``'\\n\\n'``). If false,
            the whole text is treated as one paragraph.

    Returns:
        The cleaned text; an empty string if ``text`` contains only
        whitespace.
    """
    if keep_paragraphs:
        paragraphs = regex_paragraph_break.split(text)
    else:
        paragraphs = [text]
    # split() without arguments splits on any whitespace run and drops empty
    # pieces, so re-joining with ' ' collapses and strips in one step.
    normalised = (' '.join(para.split()) for para in paragraphs)
    # Paragraphs that were whitespace-only become '' and are dropped.
    return '\n\n'.join(para for para in normalised if para)
# A paragraph break is two newlines, optionally surrounded or separated by
# other whitespace; the whole whitespace run is consumed by the match.
regex_paragraph_break = re.compile(
    r'\s*\n\s*\n\s*',
    flags=re.UNICODE,
)
# HTML block-level elements, taken from:
# https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
# I've also added <br> as a block-level element, because that's how it's often used
_block_level_elems = set('''
address article aside blockquote details dialog
dd div dl dt
fieldset figcaption figure footer form
h1 h2 h3 h4 h5 h6
header hgroup hr li main nav ol
p pre section table ul br
'''.split())
If `keep_paragraphs` is True, the text of each block-level element is separated by "\n\n", so that Sparv can split the text into paragraphs.
It would be nice to be able to use HTML files as a corpus. Here is code for converting HTML markup to plain text, which could possibly be turned into an HTML parser in Sparv:
If `keep_paragraphs` is True, the text of each block-level element is separated by "\n\n", so that Sparv can split the text into paragraphs.