ndarville / newsdiffs

Automatic scraper that tracks changes in news articles over time.
Other
1 star 0 forks source link

Parser for TV2.dk #4

Open ndarville opened 10 years ago

ndarville commented 10 years ago
ndarville commented 10 years ago
from baseparser import BaseParser
from BeautifulSoup import BeautifulSoup

class InformationParser(BaseParser):
    """Parser for news articles published on nyhederne.tv2.dk.

    NOTE(review): the class name does not mention TV2; it is kept as-is so
    that any existing references to ``InformationParser`` keep working.
    """

    # Regex for URLs under nyhederne.tv2.dk whose first path segment is a
    # word (captured as group 1). Raw string so ``\w`` reaches the regex
    # engine intact; dots are escaped so they only match a literal ".".
    # Presumably consumed by BaseParser to pick article links -- confirm.
    feeder_pat = r'^http://nyhederne\.tv2\.dk/(\w+)/'

    # Listing page(s) for this site.
    # Presumably crawled by BaseParser to discover article URLs -- confirm.
    feeder_pages = ['http://nyhederne.tv2.dk/list']

    def _parse(self, html):
        """Retrieve and serve the required fields to create an entry.

        Parses ``html`` and stores the results on the instance as ``meta``,
        ``title``, ``date``, ``byline`` and ``body``.

        Raises AttributeError if any of the expected elements (``h1``,
        ``time[itemprop=datetime]``, ``div.page-byline``, ``div.page-body``)
        is absent, because ``soup.find`` returns ``None`` in that case.
        """
        # Decode the document as UTF-8 and turn HTML entities (e.g. &aring;)
        # into their unicode characters.
        soup = BeautifulSoup(
            html,
            convertEntities=BeautifulSoup.HTML_ENTITIES,
            fromEncoding='utf-8',
        )

        self.meta = soup.findAll('meta')
        self.title = soup.find('h1').getText()
        self.date = soup.find('time', {'itemprop': 'datetime'}).getText()
        # The byline text is read from the element two parse-order steps
        # after the opening of div.page-byline.
        # NOTE(review): this depends on the exact markup layout -- confirm.
        self.byline = soup.find('div', 'page-byline').next.next.getText()
        # Full text content of the article body container.
        self.body = soup.find('div', 'page-body').getText()