Status: Open — ndarville opened this issue 10 years ago
from baseparser import BaseParser
from BeautifulSoup import BeautifulSoup
class JPParser(BaseParser):
    """Parser for article pages on jyllands-posten.dk."""

    # Matches article URLs in the indland, politik and international
    # sections; the trailing '.' requires at least one character after the
    # section slash.
    # NOTE(review): the dots in the host name are unescaped and therefore
    # match any character -- confirm whether that is worth tightening.
    feeder_pat = '^http://jyllands-posten.dk/(indland|politik|international)/.'
    # NOTE(review): presumably the page(s) BaseParser scans for article
    # links -- confirm against the base class.
    feeder_pages = ['http://jyllands-posten.dk/seneste/']

    def _parse(self, html):
        """Extract the entry fields from the HTML of an article page.

        Sets ``meta``, ``title``, ``date``, ``byline`` and ``body`` on the
        instance.

        The headline (``h1``), ``div.date`` and ``div.content-block`` are
        required: when one of them is absent ``find`` returns None and the
        resulting AttributeError propagates to the caller. The byline
        (``a.artByline``) and the summary (``div.artDescription``) are
        optional: a missing byline yields an empty string, and a missing
        summary is simply left out of ``body``.
        """
        soup = BeautifulSoup(html,
                             convertEntities=BeautifulSoup.HTML_ENTITIES,
                             fromEncoding='utf-8')

        self.meta = soup.findAll('meta')
        self.title = soup.find('h1').getText()
        self.date = soup.find('div', 'date').getText().lstrip().capitalize()

        # find() returns None when nothing matches, so guard the optional
        # elements instead of letting a missing byline abort the parse.
        byline_tag = soup.find('a', 'artByline')
        if byline_tag is None:
            self.byline = ''
        else:
            self.byline = byline_tag.getText().title()

        summary_tag = soup.find('div', 'artDescription')
        content = soup.find('div', 'content-block').getText()

        # TODO: Remove div.relArt -- its text is currently still part of
        # whatever getText() returns for the content block.
        if summary_tag is None:
            self.body = content
        else:
            self.body = '\n'.join([summary_tag.getText(), content])
- [ ] Ensure all articles (and their sections) are captured
- [x] Working URL schemes