chanelcolgate / hydroelectric-project


Scraping with Scrapy #25


chanelcolgate commented 2 years ago

Setting up Scrapy
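
A minimal setup, assuming pip is available (the nobel_winners project name matches the settings paths used later in these notes):

pip install scrapy
scrapy startproject nobel_winners
cd nobel_winners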

//E              select elements E anywhere in the document (relative to the root)
//E[@id="foo"]   select element E with id "foo"
//*[@id="foo"]   select any element with id "foo"
//E/F[1]         select the first F child of element E
//E/*[1]         select the first child (of any tag) of element E

Testing xpaths with the Scrapy shell

scrapy shell https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country
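
Inside the shell, the patterns above can be tested directly against the fetched response; a quick sketch using the same selectors the spiders below rely on:

response.xpath('//h3')
response.xpath('//h3/span[@class="mw-headline"]/text()').extract()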

A First Scrapy Spider

# nwinners_list_spider.py

import scrapy
import re
# A. Define the data to be scraped
class NWinnerItem(scrapy.Item):
    country = scrapy.Field()
    name = scrapy.Field()
    link_text = scrapy.Field()

# B. Create a named spider
class NWinnerSpider(scrapy.Spider):
    """ Scrapes the country and link-text of the Nobel-winners. """

    name = 'nwinners_list'
    allowed_domains = ['en.wikipedia.org']
    start_urls = [
        "https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"
    ]
    # C. A parse method to deal with the HTTP response
    def parse(self, response):
        h3s = response.xpath('//h3')
        items = []
        for h3 in h3s:
            country = h3.xpath('span[@class="mw-headline"]/text()').extract()
            if country:
                winners = h3.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    text = w.xpath('descendant-or-self::text()').extract()
                    items.append(NWinnerItem(
                        country=country[0], name=text[0],
                        link_text = ' '.join(text)
                    ))
        return items
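
To run the spider and write its items to JSON (assuming the file sits in the Scrapy project's spiders/ directory):

scrapy crawl nwinners_list -o nobel_winners.json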

The spider can then be extended with a process_winner_li helper that pulls structured fields (name, year, category, nationality) from each winner's <li> tag:

import scrapy
import re

BASE_URL = 'http://en.wikipedia.org'

def process_winner_li(w, country=None):
    """
    Process a winner's <li> tag, adding country of birth or
    nationality, as applicable
    """
    wdata = {}
    wdata['link'] = BASE_URL + w.xpath('a/@href').extract()[0]
    text = ' '.join(w.xpath('descendant-or-self::text()').extract())

    # get comma-delineated name and strip trailing white-space
    wdata['name'] = text.split(',')[0].strip()
    
    year = re.findall(r'\d{4}', text)
    if year:
        wdata['year'] = int(year[0])
    else:
        wdata['year'] = 0
        print('Oops, no year in ', text)
    
    category = re.findall(
        r'Physics|Chemistry|Physiology or Medicine|Literature|Peace|Economics', text)
    if category:
        wdata['category'] = category[0]
    else:
        wdata['category'] = ''
        print('Oops, no category in ', text)
    
    if country:
        if text.find(r'*') != -1:
            wdata['nationality'] = ''
            wdata['born_in'] = country
        else:
            wdata['nationality'] = country
            wdata['born_in'] = ''
    
    # store a copy of the link's text-string for any manual corrections
    wdata['text'] = text
    return wdata

# A. Define the data to be scraped
class NWinnerItem(scrapy.Item):
    category = scrapy.Field()
    date_of_birth = scrapy.Field()
    date_of_death = scrapy.Field()
    gender = scrapy.Field()
    link = scrapy.Field()
    name = scrapy.Field()
    nationality = scrapy.Field()
    born_in = scrapy.Field()
    place_of_birth = scrapy.Field()
    place_of_death = scrapy.Field()
    year = scrapy.Field()

# B. Create a named spider
class NWinnerSpider(scrapy.Spider):
    """ Scrapes the country and link-text of the Nobel-winners. """

    name = 'nwinners_list'
    allowed_domains = ['en.wikipedia.org']
    start_urls = [
        "https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"
    ]
    # C. A parse method to deal with the HTTP response
    def parse(self, response):
        h3s = response.xpath('//h3')
        items = []
        for h3 in h3s:
            country = h3.xpath('span[@class="mw-headline"]/text()').extract()
            if country:
                winners = h3.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    # text = w.xpath('descendant-or-self::text()').extract()
                    wdata = process_winner_li(w, country[0])
                    items.append(NWinnerItem(
                        category = wdata['category'],
                        link = wdata['link'],
                        name = wdata['name'],
                        nationality = wdata['nationality'],
                        born_in = wdata['born_in'],
                        year = wdata['year']
                    ))
        return items
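
A quick sanity check of process_winner_li (a sketch, not part of the original notes): open a Scrapy shell on the list page, paste in BASE_URL, the re import and the function definition, then try:

h3s = response.xpath('//h3')
country = h3s[1].xpath('span[@class="mw-headline"]/text()').extract()  # index 1 is illustrative
w = h3s[1].xpath('following-sibling::ol[1]/li')[0]
process_winner_li(w, country[0] if country else None)
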
#### Scraping the Individual Biography Pages

#### Chaining Requests and Yielding Data

##### Caching our pages

nobel_winners/nobel_winners/settings.py

HTTPCACHE_ENABLED = True
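
Two related settings are standard Scrapy options (shown here with their documented defaults; they are not in the original notes):

HTTPCACHE_DIR = 'httpcache'         # cache directory, created under the project's .scrapy/
HTTPCACHE_EXPIRATION_SECS = 0       # 0 means cached pages never expire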

##### Yielding requests
    ```python
    # nwinners_list_spider.py
    
    import scrapy
    import re
    
    BASE_URL = 'http://en.wikipedia.org'
    
    def process_winner_li(w, country=None):
        """
        Process a winner's <li> tag, adding country of birth or
        nationality, as applicable
        """
        wdata = {}
        wdata['link'] = BASE_URL + w.xpath('a/@href').extract()[0]
        text = ' '.join(w.xpath('descendant-or-self::text()').extract())
        # get comma-delineated name and strip trailing white-space
        wdata['name'] = text.split(',')[0].strip()
    
        year = re.findall(r'\d{4}', text)
        if year:
            wdata['year'] = int(year[0])
        else:
            wdata['year'] = 0
            print('Oops, no year in ', text)
    
        category = re.findall(
            r'Physics|Chemistry|Physiology or Medicine|Literature|Peace|Economics', text)
        if category:
            wdata['category'] = category[0]
        else:
            wdata['category'] = ''
            print('Oops, no category in ', text)
    
        if country:
            if text.find(r'*') != -1:
                wdata['nationality'] = ''
                wdata['born_in'] = country
            else:
                wdata['nationality'] = country
                wdata['born_in'] = ''
    
        # store a copy of the link's text-string for any manual corrections
        wdata['text'] = text
        return wdata
    
    # A. Define the data to be scraped
    class NWinnerItem(scrapy.Item):
        category = scrapy.Field()
        date_of_birth = scrapy.Field()
        date_of_death = scrapy.Field()
        gender = scrapy.Field()
        link = scrapy.Field()
        name = scrapy.Field()
        nationality = scrapy.Field()
        born_in = scrapy.Field()
        place_of_birth = scrapy.Field()
        place_of_death = scrapy.Field()
        year = scrapy.Field()
        text = scrapy.Field()
    
    # B. Create a named spider
    class NWinnerSpider(scrapy.Spider):
        """ Scrapes the country and link-text of the Nobel-winners. """
    
        name = 'nwinners_full'
        allowed_domains = ['en.wikipedia.org']
        start_urls = [
            "https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"
        ]
        # C. A parse method to deal with the HTTP response
        def parse(self, response):
            filename = response.url.split('/')[-1]
            h3s = response.xpath('//h3')
            items = []
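            # note: only the first two <h3> headings are processed,
            # which keeps the crawl small while testing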
            for h3 in list(h3s)[:2]:
                country = h3.xpath('span[@class="mw-headline"]/text()').extract()
                if country:
                    winners = h3.xpath('following-sibling::ol[1]')
                    for w in winners.xpath('li'):
                        # text = w.xpath('descendant-or-self::text()').extract()
                        wdata = process_winner_li(w, country[0])
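                        # dont_filter=True below bypasses Scrapy's duplicate-request
                        # filter, so repeated biography URLs are still fetched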
                        request = scrapy.Request(
                            wdata['link'],
                            callback=self.parse_bio,
                            dont_filter=True
                        )
                        request.meta['item'] = NWinnerItem(**wdata)
                        yield request
    
        def parse_bio(self, response):
            item = response.meta['item']
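            # the 'Wikidata item' link is assumed to look like
            # https://www.wikidata.org/wiki/Special:EntityPage/Q937;
            # popping path segment 4 ('Special:EntityPage') leaves
            # the plain item URL, e.g. https://www.wikidata.org/wiki/Q937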
            href = response.xpath("//*[@id='t-wikibase']/a/@href").extract()[0]
            href = href.split('/')
            href.pop(4)
            href = '/'.join(href)
            if href:
                request = scrapy.Request(
                    href,
                    callback=self.parse_wikidata,
                    dont_filter=True
                )
                request.meta['item'] = item
                yield request
    
        def parse_wikidata(self, response):
            item = response.meta['item']
            property_codes = [
                {'name': 'date_of_birth', 'code': 'P569'},
                {'name': 'date_of_death', 'code': 'P570'},
                {'name': 'place_of_birth', 'code': 'P19', 'link': True},
                {'name': 'place_of_death', 'code': 'P20', 'link': True},
                {'name': 'gender', 'code': 'P21', 'link': True}
            ]
            p_template = '//*[@id="%(code)s"]/div[2]/div/div/div[2]/div[1]/div/div[2]/div[2]/div[1]'
            for prop in property_codes:
                extra_html = ''
                if prop.get('link'):
                    extra_html = '/a'
                sel = response.xpath(p_template%prop + extra_html + '/text()')
                if sel:
                    item[prop['name']] = sel[0].extract()
    
            yield item
    ```

With our request chain in place, let's check that the spider is scraping our required data:

scrapy crawl nwinners_full

Scrapy Pipelines

# nobel_winners/nobel_winners/pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

class NobelWinnersPipeline:
    def process_item(self, item, spider):
        return item

class DropNonPersons(object):
    """Remove non-person winners"""

    def process_item(self, item, spider):
        if not item['gender']:
            raise DropItem("No gender for %s"%item['name'])
        return item

The DropNonPersons pipeline is then registered in the project's settings file (not in pipelines.py itself):

# nobel_winners/nobel_winners/settings.py
ITEM_PIPELINES = {'nobel_winners.pipelines.DropNonPersons': 1}
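
The integer value sets the pipeline's order: pipelines run in ascending order of these numbers, which conventionally lie in the 0-1000 range.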

Scraping Text and Images with a Pipeline

    # nwinners_list_spider.py
    
    import scrapy
    import re
    
    BASE_URL = 'http://en.wikipedia.org'
    
    # A. Define the data to be scraped
    class NWinnerItem(scrapy.Item):
        link = scrapy.Field()
        name = scrapy.Field()
        mini_bio = scrapy.Field()
        image_urls = scrapy.Field()
        bio_image = scrapy.Field()
        images = scrapy.Field()
    
    # B. Create a named spider
    class NWinnerSpider(scrapy.Spider):
        name = 'nwinners_minibio'
        allowed_domains = ['en.wikipedia.org']
        start_urls = [
            "https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"
        ]
        # C. A parse method to deal with the HTTP response
        def parse(self, response):
            filename = response.url.split('/')[-1]
            h3s = response.xpath('//h3')
            items = []
            for h3 in list(h3s)[:2]:
                country = h3.xpath('span[@class="mw-headline"]/text()').extract()
                if country:
                    winners = h3.xpath('following-sibling::ol[1]')
                    for w in winners.xpath('li'):
                        wdata = {}
                        wdata['link'] = BASE_URL + w.xpath('a/@href').extract()[0]
                        # Process the winner's bio-page with get_mini_bio
                        request = scrapy.Request(
                            wdata['link'],
                            callback=self.get_mini_bio)
                        request.meta['item'] = NWinnerItem(**wdata)
                        yield request
    
        def get_mini_bio(self, response):
            """ Get the winner's bio-text and photo """
    
            BASE_URL_ESCAPED = 'http:\/\/en.wikipedia.org'
            item = response.meta['item']
            item['image_urls'] = []
            img_src = response.xpath('//table[contains(@class,"infobox")]//img/@src')
            if img_src:
                item['image_urls'] = ['http:' + img_src[0].extract()]
            mini_bio = ''
            paras = response.xpath('//*[@id="mw-content-text"]/div[1]/p[2][text() or normalize-space(.)=""]').extract()
    
            for p in paras:
                if p == '<p></p>':
                    break
                mini_bio += p
    
            # correct for wiki links
        mini_bio = mini_bio.replace('href="/wiki', 'href="' + BASE_URL + '/wiki')
        mini_bio = mini_bio.replace('href="#', 'href="' + item['link'] + '#')
            item['mini_bio'] = mini_bio
            yield item
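
Running the mini-bio spider (the JSON file name is illustrative):

scrapy crawl nwinners_minibio -o minibios.json
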
    # nobel_winners/nobel_winners/pipelines.py
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    
    # useful for handling different item types with a single interface
    import scrapy
    from scrapy.pipelines.images import ImagesPipeline
    from scrapy.exceptions import DropItem
    from itemadapter import ItemAdapter
    
    class NobelImagesPipeline(ImagesPipeline):
        def get_media_requests(self, item, info):
            for image_url in item['image_urls']:
                yield scrapy.Request(image_url)
    
        def item_completed(self, results, item, info):
            image_paths = [x['path'] for ok, x in results if ok]
            if image_paths:
                item['bio_image'] = image_paths[0]
    
            return item
    
    class NobelWinnersPipeline:
        def process_item(self, item, spider):
            return item
    
    class DropNonPersons(object):
        """Remove non-person winners"""
    
        def process_item(self, item, spider):
            if not item['gender']:
                raise DropItem("No gender for %s"%item['name'])
            return item

    # nobel_winners/nobel_winners/settings.py
    ITEM_PIPELINES = {'nobel_winners.pipelines.NobelImagesPipeline': 1}
    IMAGES_STORE = 'images'
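
Note that Scrapy's ImagesPipeline depends on the Pillow imaging library (pip install Pillow) to process the downloaded images.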
chanelcolgate commented 2 years ago

Scrapy