Setting up Scrapy

//E select every Element E, wherever it occurs in the document (the leading // searches from the root Document)
//E[@id="foo"] select Element with id foo
//*[@id="foo"] select any element with id foo
//E/F[1] first F child element of element E
//E/*[1] first child (of any type) of element E

Testing xpaths with the Scrapy shell

scrapy shell

A First Scrapy Spider


import scrapy
import re
# A. Define the data to be scraped
class NWinnerItem(scrapy.Item):
    """One Nobel-winner entry scraped from the list page."""

    # Text of the country heading the entry is listed under.
    country = scrapy.Field()
    # First text node of the entry.
    name = scrapy.Field()
    # All text nodes of the entry, joined with single spaces.
    link_text = scrapy.Field()

# B. Create a named spider
class NWinnerSpider(scrapy.Spider):
    """ Scrapes the country and link-text of the Nobel-winners. """

    name = 'nwinners_list'
    allowed_domains = ['']
    # NOTE(review): the start URL(s) are missing from this copy of the spider
    # (the list literal was left unclosed). Fill them in before crawling.
    start_urls = []

    # C. A parse method to deal with the HTTP response
    def parse(self, response):
        """Build one NWinnerItem per list entry under each country heading.

        Every ``<h3>`` that carries a ``mw-headline`` span is treated as a
        country heading; the entries are the ``<li>`` children of the first
        ``<ol>`` sibling that follows it.

        Returns:
            A list of NWinnerItem objects.
        """
        items = []
        for h3 in response.xpath('//h3'):
            country = h3.xpath('span[@class="mw-headline"]/text()').extract()
            if not country:
                # Not a country heading; nothing to collect under it.
                continue
            winners = h3.xpath('following-sibling::ol[1]')
            for w in winners.xpath('li'):
                text = w.xpath('descendant-or-self::text()').extract()
                if not text:
                    # An entry without any text has no name to record.
                    continue
                items.append(NWinnerItem(
                    country=country[0], name=text[0],
                    link_text=' '.join(text),
                ))
        return items

    #### Scraping the Individual Biography Pages
    #### Chaining Requests and Yielding Data
    ##### Caching our pages



    ##### Yielding requests
    • With our request chain in place, let's check that the spider is scraping our required data:
      scrapy crawl nwinners_full

      Scrapy Pipelines

      # nobel_winners/nobel_winners/
      # Define your item pipelines here
      # Don't forget to add your pipeline to the ITEM_PIPELINES setting
      # See:

    useful for handling different item types with a single interface

    from itemadapter import ItemAdapter
    from scrapy.exceptions import DropItem

    class NobelWinnersPipeline:
        """Pass-through pipeline: every item is handed on unchanged."""

        def process_item(self, item, spider):
            """Return ``item`` as received; ``spider`` is not used."""
            return item

    class DropNonPersons(object):
        """Remove non-person winners"""

        def process_item(self, item, spider):
            """Return ``item`` if its 'gender' value is truthy, else drop it.

            Raises:
                DropItem: when ``item['gender']`` is empty or otherwise falsy.
            """
            if not item['gender']:
                raise DropItem("No gender for %s"%item['name'])
            return item
    # nobel_winners/nobel_winners/
    # Enable the DropNonPersons pipeline. Per the Scrapy documentation the
    # integer is the pipeline's order value: lower values run first.
    ITEM_PIPELINES = {'nobel_winners.pipelines.DropNonPersons':1}

    Scraping Text and Images with a Pipeline

    import scrapy
    import re
    # Prefix joined onto scraped hrefs and wiki links by the spider below.
    # NOTE(review): empty in this copy -- the value looks stripped; confirm.
    BASE_URL = ''
    # A. Define the data to be scraped
    class NWinnerItem(scrapy.Item):
        """Fields collected for one winner: link, mini-bio text and image data."""

        # --- set when the list page is parsed ---
        link = scrapy.Field()        # BASE_URL + the entry's href
        name = scrapy.Field()        # not assigned by the spider code shown here

        # --- set when the winner's own page is parsed ---
        mini_bio = scrapy.Field()    # concatenated paragraph HTML
        image_urls = scrapy.Field()  # list holding at most one image URL

        # --- image-pipeline related ---
        bio_image = scrapy.Field()   # first stored image path, if any
        images = scrapy.Field()      # presumably filled by Scrapy's images pipeline -- confirm
    # B. Create a named spider
    class NWinnerSpider(scrapy.Spider):
        """Follows each listed winner's link and scrapes a mini-bio and photo URL.

        ``parse`` walks the list page and issues one request per winner;
        ``get_mini_bio`` handles each of those responses and yields the
        completed item.
        """

        name = 'nwinners_minibio'
        allowed_domains = ['']
        # NOTE(review): the start URL(s) are missing from this copy of the
        # spider (the list literal was left unclosed). Fill them in before
        # crawling.
        start_urls = []

        # C. A parse method to deal with the HTTP response
        def parse(self, response):
            """Yield one request per winner listed under the first two ``<h3>``s.

            Each request targets the winner's link, is handled by
            ``get_mini_bio``, and carries a partially filled NWinnerItem in
            ``request.meta['item']``.
            """
            h3s = response.xpath('//h3')
            # Only the first two headings are processed, as in the original.
            for h3 in list(h3s)[:2]:
                country = h3.xpath('span[@class="mw-headline"]/text()').extract()
                if not country:
                    continue
                winners = h3.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    hrefs = w.xpath('a/@href').extract()
                    if not hrefs:
                        # No direct link on this entry, so there is no page
                        # to follow; skip it instead of raising IndexError.
                        continue
                    link = BASE_URL + hrefs[0]
                    # Process the winner's bio-page with get_mini_bio
                    request = scrapy.Request(link, callback=self.get_mini_bio)
                    request.meta['item'] = NWinnerItem(link=link)
                    yield request

        def get_mini_bio(self, response):
            """ Get the winner's bio-text and photo """
            item = response.meta['item']

            # At most one image URL: the first <img> inside an infobox table.
            item['image_urls'] = []
            img_src = response.xpath('//table[contains(@class,"infobox")]//img/@src')
            if img_src:
                # Assumes the src is protocol-relative ("//host/...") -- confirm.
                item['image_urls'] = ['http:' + img_src[0].extract()]

            paras = response.xpath('//*[@id="mw-content-text"]/div[1]/p[2][text() or normalize-space(.)=""]').extract()
            bio_parts = []
            for p in paras:
                if p == '<p></p>':
                    # NOTE(review): the body of this branch was missing from
                    # the source; an empty paragraph is taken as the end of
                    # the intro text -- confirm.
                    break
                bio_parts.append(p)
            mini_bio = ''.join(bio_parts)

            # correct for wiki links: make site-relative and in-page hrefs
            # absolute while keeping the attribute's quoting intact
            mini_bio = mini_bio.replace('href="/wiki', 'href="' + BASE_URL + '/wiki')
            mini_bio = mini_bio.replace('href="#', 'href="' + item['link'] + '#')
            item['mini_bio'] = mini_bio
            yield item
    # Define your item pipelines here
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See:
    # useful for handling different item types with a single interface
    import scrapy
    from scrapy.pipelines.images import ImagesPipeline
    from scrapy.exceptions import DropItem
    from itemadapter import ItemAdapter
    class NobelImagesPipeline(ImagesPipeline):
        """Images pipeline that records the first stored image path on the item."""

        def get_media_requests(self, item, info):
            """Yield a download request for every URL in ``item['image_urls']``."""
            yield from (scrapy.Request(url) for url in item['image_urls'])

        def item_completed(self, results, item, info):
            """Store the first successful result's path in ``item['bio_image']``.

            ``results`` is iterated as ``(ok, detail)`` pairs; ``detail['path']``
            is read for each pair whose ``ok`` flag is truthy. The item is
            returned whether or not a path was found.
            """
            stored_paths = []
            for succeeded, detail in results:
                if succeeded:
                    stored_paths.append(detail['path'])
            if stored_paths:
                item['bio_image'] = stored_paths[0]
            return item
    class NobelWinnersPipeline:
        """Pipeline stage that performs no processing."""

        def process_item(self, item, spider):
            """Hand ``item`` back exactly as received; ``spider`` is ignored."""
            return item
    class DropNonPersons(object):
        """Filter out winners that have no gender recorded."""

        def process_item(self, item, spider):
            """Pass ``item`` through when its 'gender' value is truthy.

            Raises:
                DropItem: when ``item['gender']`` is falsy.
            """
            if item['gender']:
                return item
            raise DropItem("No gender for %s" % item['name'])
    # Route items through NobelImagesPipeline; the integer is its order value.
    ITEM_PIPELINES = {'nobel_winners.pipelines.NobelImagesPipeline': 1}
    # Storage location used by Scrapy's images pipeline for downloaded files.
    IMAGES_STORE = 'images'
  • chanelcolgate commented 2 years ago
