Let's generate a fresh project for our Nobel-prize scraping, using the startproject option. This is going to generate a project folder so make sure you run it from a suitable work directory:
scrapy startproject nobel_winners
Establishing the Targets
//E
Element by relative reference (in this case relative to the root Document)
# nwiners_list_spider.py
import scrapy
import re
# A. Define the data to be scraped
class NWinnerItem(scrapy.Item):
    """Container for one scraped Nobel-winner list entry."""
    country = scrapy.Field()    # country heading the winner was listed under
    name = scrapy.Field()       # first text node of the winner's <li>
    link_text = scrapy.Field()  # full joined text of the winner's <li>
# B. Create a named spider
class NWinnerSpider(scrapy.Spider):
    """ Scrapes the country and link-text of the Nobel-winners. """
    name = 'nwinners_list'
    allowed_domains = ['en.wikipedia.org']
    start_urls = [
        "https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"
    ]

    # C. A parse method to deal with the HTTP response
    def parse(self, response):
        """Return an NWinnerItem for each winner <li> under a country heading."""
        h3s = response.xpath('//h3')
        items = []
        for h3 in h3s:
            # Country headings carry a span with class "mw-headline".
            country = h3.xpath('span[@class="mw-headline"]/text()').extract()
            if country:
                # The winners sit in the first <ol> after the heading.
                winners = h3.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    text = w.xpath('descendant-or-self::text()').extract()
                    items.append(NWinnerItem(
                        country=country[0], name=text[0],
                        link_text=' '.join(text)
                    ))
        return items
Let's see what scraping spiders are available:
scrapy list
To start it scraping we use the crawl command and direct the output to a nobel_winners.json file. By default we will get a lot of Python logging information accompanying the crawl:
scrapy crawl nwinners_list -o nobel_winners.json
The output of the scrapy crawl shows 1075 items successfully scraped. Let's look at our JSON output file to make sure things have gone according to plan:
def process_winner_li(w, country=None):
    """
    Process a winner's <li> tag, adding country of birth or
    nationality, as applicable.

    Args:
        w: selector for the winner's <li> element.
        country (str): text of the country heading the <li> was found under.
    Returns:
        dict: link, name, year, category, nationality/born_in and the
        raw <li> text.
    """
    wdata = {}
    # NOTE(review): BASE_URL must be defined at module level for this to
    # run (it is, in the full listing) — confirm when assembling the file.
    wdata['link'] = BASE_URL + w.xpath('a/@href').extract()[0]
    text = ' '.join(w.xpath('descendant-or-self::text()').extract())
    # get comma-delineated name and strip trailing white-space
    wdata['name'] = text.split(',')[0].strip()
    year = re.findall(r'\d{4}', text)
    if year:
        wdata['year'] = int(year[0])
    else:
        wdata['year'] = 0
        print('Oops, no year in ', text)
    category = re.findall(
        r'Physics|Chemistry|Physiology or Medicine|Literature|Peace|Economics', text)
    if category:
        wdata['category'] = category[0]
    else:
        wdata['category'] = ''
        print('Oops, no category in ', text)
    if country:
        # An asterisk in the list text marks country of birth rather
        # than nationality.
        if text.find(r'*') != -1:
            wdata['nationality'] = ''
            wdata['born_in'] = country
        else:
            wdata['nationality'] = country
            wdata['born_in'] = ''
    # store a copy of the link's text-string for any manual corrections
    wdata['text'] = text
    return wdata
# A. Define the data to be scraped
class NWinnerItem(scrapy.Item):
    """Full set of fields to be scraped for each Nobel winner."""
    category = scrapy.Field()        # prize category (Physics, Peace, ...)
    date_of_birth = scrapy.Field()
    date_of_death = scrapy.Field()
    gender = scrapy.Field()
    link = scrapy.Field()            # URL of the winner's Wikipedia page
    name = scrapy.Field()
    nationality = scrapy.Field()
    born_in = scrapy.Field()         # set instead of nationality for '*' entries
    place_of_birth = scrapy.Field()
    place_of_death = scrapy.Field()
    year = scrapy.Field()            # year of the prize
# B. Create a named spider
class NWinnerSpider(scrapy.Spider):
    """ Scrapes the country and link-text of the Nobel-winners. """
    name = 'nwinners_list'
    allowed_domains = ['en.wikipedia.org']
    start_urls = [
        "https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"
    ]

    # C. A parse method to deal with the HTTP response
    def parse(self, response):
        """Return an NWinnerItem for each winner <li> found on the list page."""
        h3s = response.xpath('//h3')
        items = []
        for h3 in h3s:
            country = h3.xpath('span[@class="mw-headline"]/text()').extract()
            if country:
                winners = h3.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    # Per-<li> parsing is delegated to process_winner_li.
                    wdata = process_winner_li(w, country[0])
                    items.append(NWinnerItem(
                        category=wdata['category'],
                        link=wdata['link'],
                        name=wdata['name'],
                        nationality=wdata['nationality'],
                        born_in=wdata['born_in'],
                        year=wdata['year']
                    ))
        return items
#### Scraping the Individual Biography Pages
#### Chaining Requests and Yielding Data
##### Caching our pages
nobel_winners/nobel_winners/settings.py
HTTPCACHE_ENABLED = True
##### Yielding requests
```python
# nwiners_list_spider.py
import scrapy
import re
BASE_URL = 'http://en.wikipedia.org'


def process_winner_li(w, country=None):
    """
    Process a winner's <li> tag, adding country of birth or
    nationality, as applicable.

    Args:
        w: selector for the winner's <li> element.
        country (str): text of the country heading the <li> was found under.
    Returns:
        dict: link, name, year, category, nationality/born_in and the
        raw <li> text.
    """
    wdata = {}
    wdata['link'] = BASE_URL + w.xpath('a/@href').extract()[0]
    text = ' '.join(w.xpath('descendant-or-self::text()').extract())
    # get comma-delineated name and strip trailing white-space
    wdata['name'] = text.split(',')[0].strip()
    year = re.findall(r'\d{4}', text)
    if year:
        wdata['year'] = int(year[0])
    else:
        wdata['year'] = 0
        print('Oops, no year in ', text)
    category = re.findall(
        r'Physics|Chemistry|Physiology or Medicine|Literature|Peace|Economics', text)
    if category:
        wdata['category'] = category[0]
    else:
        wdata['category'] = ''
        print('Oops, no category in ', text)
    if country:
        # An asterisk in the list text marks country of birth rather
        # than nationality.
        if text.find(r'*') != -1:
            wdata['nationality'] = ''
            wdata['born_in'] = country
        else:
            wdata['nationality'] = country
            wdata['born_in'] = ''
    # store a copy of the link's text-string for any manual corrections
    wdata['text'] = text
    return wdata
# A. Define the data to be scraped
class NWinnerItem(scrapy.Item):
    """Full set of fields to be scraped for each Nobel winner."""
    category = scrapy.Field()        # prize category (Physics, Peace, ...)
    date_of_birth = scrapy.Field()
    date_of_death = scrapy.Field()
    gender = scrapy.Field()
    link = scrapy.Field()            # URL of the winner's Wikipedia page
    name = scrapy.Field()
    nationality = scrapy.Field()
    born_in = scrapy.Field()         # set instead of nationality for '*' entries
    place_of_birth = scrapy.Field()
    place_of_death = scrapy.Field()
    year = scrapy.Field()            # year of the prize
    text = scrapy.Field()            # raw <li> text kept for manual corrections
# B. Create a named spider
class NWinnerSpider(scrapy.Spider):
    """ Scrapes the country and link-text of the Nobel-winners. """
    name = 'nwinners_full'
    allowed_domains = ['en.wikipedia.org']
    start_urls = [
        "https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"
    ]

    # C. A parse method to deal with the HTTP response
    def parse(self, response):
        """Yield a biography-page request for each winner found."""
        h3s = response.xpath('//h3')
        # Restrict to the first two country headings while developing.
        for h3 in list(h3s)[:2]:
            country = h3.xpath('span[@class="mw-headline"]/text()').extract()
            if country:
                winners = h3.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    wdata = process_winner_li(w, country[0])
                    # Chain a request for the winner's bio page, carrying
                    # the data scraped so far in the request's meta dict.
                    request = scrapy.Request(
                        wdata['link'],
                        callback=self.parse_bio,
                        dont_filter=True
                    )
                    request.meta['item'] = NWinnerItem(**wdata)
                    yield request

    def parse_bio(self, response):
        """Follow the bio page's Wikidata link, passing the item along."""
        item = response.meta['item']
        # The 't-wikibase' sidebar entry links to the winner's Wikidata page.
        href = response.xpath("//*[@id='t-wikibase']/a/@href").extract()[0]
        # NOTE(review): popping index 4 presumably removes a path segment
        # (e.g. 'Special:EntityPage') to get the canonical item URL —
        # confirm against the live link format.
        href = href.split('/')
        href.pop(4)
        href = '/'.join(href)
        if href:
            request = scrapy.Request(
                href,
                callback=self.parse_wikidata,
                dont_filter=True
            )
            request.meta['item'] = item
            yield request

    def parse_wikidata(self, response):
        """Scrape personal details from the Wikidata page, then yield the item."""
        item = response.meta['item']
        property_codes = [
            {'name': 'date_of_birth', 'code': 'P569'},
            {'name': 'date_of_death', 'code': 'P570'},
            {'name': 'place_of_birth', 'code': 'P19', 'link': True},
            {'name': 'place_of_death', 'code': 'P20', 'link': True},
            {'name': 'gender', 'code': 'P21', 'link': True}
        ]
        p_template = '//*[@id="%(code)s"]/div[2]/div/div/div[2]/div[1]/div/div[2]/div[2]/div[1]'
        for prop in property_codes:
            # 'link'-type properties wrap their value in an extra <a> tag.
            extra_html = ''
            if prop.get('link'):
                extra_html = '/a'
            sel = response.xpath(p_template % prop + extra_html + '/text()')
            if sel:
                item[prop['name']] = sel[0].extract()
        yield item
With our request chain in place, let's check that the spider is scraping our required data:
scrapy crawl nwinners_full
Scrapy Pipelines
# nobel_winners/nobel_winners/pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
class NobelWinnersPipeline:
    """Default pass-through pipeline generated by Scrapy's startproject."""
    def process_item(self, item, spider):
        return item
class DropNonPersons(object):
    """Remove non-person winners"""
    def process_item(self, item, spider):
        # Institutional winners (e.g. the Red Cross) have no gender field;
        # drop them from the output.
        if not item['gender']:
            raise DropItem("No gender for %s" % item['name'])
        return item
# nwiners_list_spider.py
import scrapy
import re
BASE_URL = 'http://en.wikipedia.org'
# A. Define the data to be scraped
class NWinnerItem(scrapy.Item):
    """Fields for a winner's mini-biography and portrait images."""
    link = scrapy.Field()        # URL of the winner's Wikipedia page
    name = scrapy.Field()
    mini_bio = scrapy.Field()    # HTML of the bio's opening paragraph(s)
    image_urls = scrapy.Field()  # consumed by the images pipeline
    bio_image = scrapy.Field()   # local path of the downloaded portrait
    images = scrapy.Field()      # populated by the images pipeline
# B. Create a named spider
class NWinnerSpider(scrapy.Spider):
    """Scrapes the Nobel winners' mini-biographies and photo links."""
    name = 'nwinners_minibio'
    allowed_domains = ['en.wikipedia.org']
    start_urls = [
        "https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"
    ]

    # C. A parse method to deal with the HTTP response
    def parse(self, response):
        """Yield a bio-page request for each winner found."""
        h3s = response.xpath('//h3')
        # Restrict to the first two country headings while developing.
        for h3 in list(h3s)[:2]:
            country = h3.xpath('span[@class="mw-headline"]/text()').extract()
            if country:
                winners = h3.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    wdata = {}
                    wdata['link'] = BASE_URL + w.xpath('a/@href').extract()[0]
                    # Process the winner's bio-page with get_mini_bio
                    request = scrapy.Request(
                        wdata['link'],
                        callback=self.get_mini_bio)
                    request.meta['item'] = NWinnerItem(**wdata)
                    yield request

    def get_mini_bio(self, response):
        """ Get the winner's bio-text and photo """
        item = response.meta['item']
        item['image_urls'] = []
        # First image in the infobox table is taken as the portrait.
        img_src = response.xpath('//table[contains(@class,"infobox")]//img/@src')
        if img_src:
            item['image_urls'] = ['http:' + img_src[0].extract()]
        mini_bio = ''
        # NOTE(review): 'p[2]' selects at most one paragraph, yet the loop
        # below expects a run of paragraphs terminated by an empty <p> —
        # confirm the predicate against the live page structure.
        paras = response.xpath('//*[@id="mw-content-text"]/div[1]/p[2][text() or normalize-space(.)=""]').extract()
        for p in paras:
            if p == '<p></p>':
                break
            mini_bio += p
        # correct for wiki links (was 'href="/wiki"', which never matches
        # the actual 'href="/wiki/...' attributes)
        mini_bio = mini_bio.replace('href="/wiki', 'href="' + BASE_URL + '/wiki')
        # make in-page anchors absolute (was dropping the 'href="' prefix,
        # which produced broken markup)
        mini_bio = mini_bio.replace('href="#', 'href="' + item['link'] + '#')
        item['mini_bio'] = mini_bio
        yield item
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from itemadapter import ItemAdapter
class NobelImagesPipeline(ImagesPipeline):
    """Downloads the winners' portraits and records their local paths."""

    def get_media_requests(self, item, info):
        # One download request per image URL gathered by the spider.
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        # results is a list of (success, info-dict) pairs; keep the paths
        # of the successful downloads only.
        image_paths = [x['path'] for ok, x in results if ok]
        if image_paths:
            item['bio_image'] = image_paths[0]
        return item
class NobelWinnersPipeline:
    """Default pass-through pipeline generated by Scrapy's startproject."""
    def process_item(self, item, spider):
        return item
class DropNonPersons(object):
    """Remove non-person winners"""
    def process_item(self, item, spider):
        # Institutional winners (e.g. the Red Cross) have no gender field;
        # drop them from the output.
        if not item['gender']:
            raise DropItem("No gender for %s" % item['name'])
        return item
Setting up Scrapy
startproject
option. This is going to generate a project folder so make sure you run it from a suitable work directory:Establishing the Targets
Testing xpaths with the Scrapy shell
A First Scrapy Spider
crawl
command and direct the output to anwinners.json
file. By default we will get a lot of Python logging information accompanying the crawl:crawl
shows 1075 items successfully scraped. Let's look at our JSON output file to make sure things have gone according to plan:import scrapy import re
BASE_URL = 'http://en.wikipedia.org'
def process_winner_li(w, country=None): """ Process a winner's
get comma-delineated name and strip trailing white-space
A. Define the data to be scraped
class NWinnerItem(scrapy.Item): category = scrapy.Field() date_of_birth = scrapy.Field() date_of_death = scrapy.Field() gender = scrapy.Field() link = scrapy.Field() name = scrapy.Field() nationality = scrapy.Field() born_in = scrapy.Field() place_of_birth = scrapy.Field() place_of_death = scrapy.Field() year = scrapy.Field()
B. Create a named spider
class NWinnerSpider(scrapy.Spider): """ Scrapes the country and link-text of the Nobel-winners. """
nobel_winners/nobel_winners/settings.py
HTTPCACHE_ENABLED = True
Scrapy Pipelines
useful for handling different item types with a single interface
from itemadapter import ItemAdapter from scrapy.exceptions import DropItem
class NobelWinnersPipeline: def process_item(self, item, spider): return item
class DropNonPersons(object): """Remove non-person winners"""
Scraping Text and Images with a Pipeline