Let's generate a fresh project for our Nobel-prize scraping, using the startproject option. This is going to generate a project folder so make sure you run it from a suitable work directory:
scrapy startproject nobel_winners
Establishing the Targets
//E
Element by relative reference (in this case relative to the root Document)
# nwiners_list_spider.py
import scrapy
import re
# A. Define the data to be scraped
class NWinnerItem(scrapy.Item):
    """Container for one scraped Nobel-winner list entry."""
    country = scrapy.Field()    # country heading the winner was listed under
    name = scrapy.Field()       # first text node of the winner's <li>
    link_text = scrapy.Field()  # full joined text of the winner's <li>
# B. Create a named spider
class NWinnerSpider(scrapy.Spider):
    """ Scrapes the country and link-text of the Nobel-winners. """
    name = 'nwinners_list'
    allowed_domains = ['en.wikipedia.org']
    start_urls = [
        "https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"
    ]

    # C. A parse method to deal with the HTTP response
    def parse(self, response):
        """Return an NWinnerItem for each winner <li> under a country heading."""
        h3s = response.xpath('//h3')
        items = []
        for h3 in h3s:
            # Country headings carry a span with class "mw-headline".
            country = h3.xpath('span[@class="mw-headline"]/text()').extract()
            if country:
                # The winners sit in the first <ol> after the heading.
                winners = h3.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    text = w.xpath('descendant-or-self::text()').extract()
                    items.append(NWinnerItem(
                        country=country[0], name=text[0],
                        link_text=' '.join(text)
                    ))
        return items
Let's see what scraping spiders are available:
scrapy list
To start it scraping we use the crawl command and direct the output to a nobel_winners.json file. By default we will get a lot of Python logging information accompanying the crawl:
scrapy crawl nwinners_list -o nobel_winners.json
The output of the scrapy crawl shows 1075 items successfully scraped. Let's look at our JSON output file to make sure things have gone according to plan:
def process_winner_li(w, country=None):
    """
    Process a winner's <li> tag, adding country of birth or
    nationality, as applicable.

    Args:
        w: selector for the winner's <li> element.
        country (str): text of the country heading the <li> was found under.
    Returns:
        dict: link, name, year, category, nationality/born_in and the
        raw <li> text.
    """
    wdata = {}
    # NOTE(review): BASE_URL must be defined at module level for this to
    # run (it is, in the full listing) — confirm when assembling the file.
    wdata['link'] = BASE_URL + w.xpath('a/@href').extract()[0]
    text = ' '.join(w.xpath('descendant-or-self::text()').extract())
    # get comma-delineated name and strip trailing white-space
    wdata['name'] = text.split(',')[0].strip()
    year = re.findall(r'\d{4}', text)
    if year:
        wdata['year'] = int(year[0])
    else:
        wdata['year'] = 0
        print('Oops, no year in ', text)
    category = re.findall(
        r'Physics|Chemistry|Physiology or Medicine|Literature|Peace|Economics', text)
    if category:
        wdata['category'] = category[0]
    else:
        wdata['category'] = ''
        print('Oops, no category in ', text)
    if country:
        # An asterisk in the list text marks country of birth rather
        # than nationality.
        if text.find(r'*') != -1:
            wdata['nationality'] = ''
            wdata['born_in'] = country
        else:
            wdata['nationality'] = country
            wdata['born_in'] = ''
    # store a copy of the link's text-string for any manual corrections
    wdata['text'] = text
    return wdata
# A. Define the data to be scraped
class NWinnerItem(scrapy.Item):
    """Full set of fields to be scraped for each Nobel winner."""
    category = scrapy.Field()        # prize category (Physics, Peace, ...)
    date_of_birth = scrapy.Field()
    date_of_death = scrapy.Field()
    gender = scrapy.Field()
    link = scrapy.Field()            # URL of the winner's Wikipedia page
    name = scrapy.Field()
    nationality = scrapy.Field()
    born_in = scrapy.Field()         # set instead of nationality for '*' entries
    place_of_birth = scrapy.Field()
    place_of_death = scrapy.Field()
    year = scrapy.Field()            # year of the prize
# B. Create a named spider
class NWinnerSpider(scrapy.Spider):
    """ Scrapes the country and link-text of the Nobel-winners. """
    name = 'nwinners_list'
    allowed_domains = ['en.wikipedia.org']
    start_urls = [
        "https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"
    ]

    # C. A parse method to deal with the HTTP response
    def parse(self, response):
        """Return an NWinnerItem for each winner <li> found on the list page."""
        h3s = response.xpath('//h3')
        items = []
        for h3 in h3s:
            country = h3.xpath('span[@class="mw-headline"]/text()').extract()
            if country:
                winners = h3.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    # Per-<li> parsing is delegated to process_winner_li.
                    wdata = process_winner_li(w, country[0])
                    items.append(NWinnerItem(
                        category=wdata['category'],
                        link=wdata['link'],
                        name=wdata['name'],
                        nationality=wdata['nationality'],
                        born_in=wdata['born_in'],
                        year=wdata['year']
                    ))
        return items
#### Scraping the Individual Biography Pages
#### Chaining Requests and Yielding Data
##### Caching our pages
nobel_winners/nobel_winners/settings.py
HTTPCACHE_ENABLED = True
##### Yielding requests
```python
# nwiners_list_spider.py
import scrapy
import re
BASE_URL = 'http://en.wikipedia.org'


def process_winner_li(w, country=None):
    """
    Process a winner's <li> tag, adding country of birth or
    nationality, as applicable.

    Args:
        w: selector for the winner's <li> element.
        country (str): text of the country heading the <li> was found under.
    Returns:
        dict: link, name, year, category, nationality/born_in and the
        raw <li> text.
    """
    wdata = {}
    wdata['link'] = BASE_URL + w.xpath('a/@href').extract()[0]
    text = ' '.join(w.xpath('descendant-or-self::text()').extract())
    # get comma-delineated name and strip trailing white-space
    wdata['name'] = text.split(',')[0].strip()
    year = re.findall(r'\d{4}', text)
    if year:
        wdata['year'] = int(year[0])
    else:
        wdata['year'] = 0
        print('Oops, no year in ', text)
    category = re.findall(
        r'Physics|Chemistry|Physiology or Medicine|Literature|Peace|Economics', text)
    if category:
        wdata['category'] = category[0]
    else:
        wdata['category'] = ''
        print('Oops, no category in ', text)
    if country:
        # An asterisk in the list text marks country of birth rather
        # than nationality.
        if text.find(r'*') != -1:
            wdata['nationality'] = ''
            wdata['born_in'] = country
        else:
            wdata['nationality'] = country
            wdata['born_in'] = ''
    # store a copy of the link's text-string for any manual corrections
    wdata['text'] = text
    return wdata
# A. Define the data to be scraped
class NWinnerItem(scrapy.Item):
    """Full set of fields to be scraped for each Nobel winner."""
    category = scrapy.Field()        # prize category (Physics, Peace, ...)
    date_of_birth = scrapy.Field()
    date_of_death = scrapy.Field()
    gender = scrapy.Field()
    link = scrapy.Field()            # URL of the winner's Wikipedia page
    name = scrapy.Field()
    nationality = scrapy.Field()
    born_in = scrapy.Field()         # set instead of nationality for '*' entries
    place_of_birth = scrapy.Field()
    place_of_death = scrapy.Field()
    year = scrapy.Field()            # year of the prize
    text = scrapy.Field()            # raw <li> text kept for manual corrections
# B. Create a named spider
class NWinnerSpider(scrapy.Spider):
    """ Scrapes the country and link-text of the Nobel-winners. """
    name = 'nwinners_full'
    allowed_domains = ['en.wikipedia.org']
    start_urls = [
        "https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"
    ]

    # C. A parse method to deal with the HTTP response
    def parse(self, response):
        """Yield a biography-page request for each winner found."""
        h3s = response.xpath('//h3')
        # Restrict to the first two country headings while developing.
        for h3 in list(h3s)[:2]:
            country = h3.xpath('span[@class="mw-headline"]/text()').extract()
            if country:
                winners = h3.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    wdata = process_winner_li(w, country[0])
                    # Chain a request for the winner's bio page, carrying
                    # the data scraped so far in the request's meta dict.
                    request = scrapy.Request(
                        wdata['link'],
                        callback=self.parse_bio,
                        dont_filter=True
                    )
                    request.meta['item'] = NWinnerItem(**wdata)
                    yield request

    def parse_bio(self, response):
        """Follow the bio page's Wikidata link, passing the item along."""
        item = response.meta['item']
        # The 't-wikibase' sidebar entry links to the winner's Wikidata page.
        href = response.xpath("//*[@id='t-wikibase']/a/@href").extract()[0]
        # NOTE(review): popping index 4 presumably removes a path segment
        # (e.g. 'Special:EntityPage') to get the canonical item URL —
        # confirm against the live link format.
        href = href.split('/')
        href.pop(4)
        href = '/'.join(href)
        if href:
            request = scrapy.Request(
                href,
                callback=self.parse_wikidata,
                dont_filter=True
            )
            request.meta['item'] = item
            yield request

    def parse_wikidata(self, response):
        """Scrape personal details from the Wikidata page, then yield the item."""
        item = response.meta['item']
        property_codes = [
            {'name': 'date_of_birth', 'code': 'P569'},
            {'name': 'date_of_death', 'code': 'P570'},
            {'name': 'place_of_birth', 'code': 'P19', 'link': True},
            {'name': 'place_of_death', 'code': 'P20', 'link': True},
            {'name': 'gender', 'code': 'P21', 'link': True}
        ]
        p_template = '//*[@id="%(code)s"]/div[2]/div/div/div[2]/div[1]/div/div[2]/div[2]/div[1]'
        for prop in property_codes:
            # 'link'-type properties wrap their value in an extra <a> tag.
            extra_html = ''
            if prop.get('link'):
                extra_html = '/a'
            sel = response.xpath(p_template % prop + extra_html + '/text()')
            if sel:
                item[prop['name']] = sel[0].extract()
        yield item
With our request chain in place, let's check that the spider is scraping our required data:
scrapy crawl nwinners_full
Scrapy Pipelines
# nobel_winners/nobel_winners/pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
class NobelWinnersPipeline:
    """Default pass-through pipeline generated by Scrapy's startproject."""
    def process_item(self, item, spider):
        return item
class DropNonPersons(object):
    """Remove non-person winners"""
    def process_item(self, item, spider):
        # Institutional winners (e.g. the Red Cross) have no gender field;
        # drop them from the output.
        if not item['gender']:
            raise DropItem("No gender for %s" % item['name'])
        return item
# nwiners_list_spider.py
import scrapy
import re
BASE_URL = 'http://en.wikipedia.org'
# A. Define the data to be scraped
class NWinnerItem(scrapy.Item):
    """Fields for a winner's mini-biography and portrait images."""
    link = scrapy.Field()        # URL of the winner's Wikipedia page
    name = scrapy.Field()
    mini_bio = scrapy.Field()    # HTML of the bio's opening paragraph(s)
    image_urls = scrapy.Field()  # consumed by the images pipeline
    bio_image = scrapy.Field()   # local path of the downloaded portrait
    images = scrapy.Field()      # populated by the images pipeline
# B. Create a named spider
class NWinnerSpider(scrapy.Spider):
    """Scrapes the Nobel winners' mini-biographies and photo links."""
    name = 'nwinners_minibio'
    allowed_domains = ['en.wikipedia.org']
    start_urls = [
        "https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"
    ]

    # C. A parse method to deal with the HTTP response
    def parse(self, response):
        """Yield a bio-page request for each winner found."""
        h3s = response.xpath('//h3')
        # Restrict to the first two country headings while developing.
        for h3 in list(h3s)[:2]:
            country = h3.xpath('span[@class="mw-headline"]/text()').extract()
            if country:
                winners = h3.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    wdata = {}
                    wdata['link'] = BASE_URL + w.xpath('a/@href').extract()[0]
                    # Process the winner's bio-page with get_mini_bio
                    request = scrapy.Request(
                        wdata['link'],
                        callback=self.get_mini_bio)
                    request.meta['item'] = NWinnerItem(**wdata)
                    yield request

    def get_mini_bio(self, response):
        """ Get the winner's bio-text and photo """
        item = response.meta['item']
        item['image_urls'] = []
        # First image in the infobox table is taken as the portrait.
        img_src = response.xpath('//table[contains(@class,"infobox")]//img/@src')
        if img_src:
            item['image_urls'] = ['http:' + img_src[0].extract()]
        mini_bio = ''
        # NOTE(review): 'p[2]' selects at most one paragraph, yet the loop
        # below expects a run of paragraphs terminated by an empty <p> —
        # confirm the predicate against the live page structure.
        paras = response.xpath('//*[@id="mw-content-text"]/div[1]/p[2][text() or normalize-space(.)=""]').extract()
        for p in paras:
            if p == '<p></p>':
                break
            mini_bio += p
        # correct for wiki links (was 'href="/wiki"', which never matches
        # the actual 'href="/wiki/...' attributes)
        mini_bio = mini_bio.replace('href="/wiki', 'href="' + BASE_URL + '/wiki')
        # make in-page anchors absolute (was dropping the 'href="' prefix,
        # which produced broken markup)
        mini_bio = mini_bio.replace('href="#', 'href="' + item['link'] + '#')
        item['mini_bio'] = mini_bio
        yield item
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from itemadapter import ItemAdapter
class NobelImagesPipeline(ImagesPipeline):
    """Downloads the winners' portraits and records their local paths."""

    def get_media_requests(self, item, info):
        # One download request per image URL gathered by the spider.
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        # results is a list of (success, info-dict) pairs; keep the paths
        # of the successful downloads only.
        image_paths = [x['path'] for ok, x in results if ok]
        if image_paths:
            item['bio_image'] = image_paths[0]
        return item
class NobelWinnersPipeline:
    """Default pass-through pipeline generated by Scrapy's startproject."""
    def process_item(self, item, spider):
        return item
class DropNonPersons(object):
    """Remove non-person winners"""
    def process_item(self, item, spider):
        # Institutional winners (e.g. the Red Cross) have no gender field;
        # drop them from the output.
        if not item['gender']:
            raise DropItem("No gender for %s" % item['name'])
        return item
Setting up Scrapy
startproject
option. This is going to generate a project folder so make sure you run it from a suitable work directory:Establishing the Targets
Testing xpaths with the Scrapy shell
A First Scrapy Spider
crawl
command and direct the output to anwinners.json
file. By default we will get a lot of Python logging information accompanying the crawl:crawl
shows 1075 items successfully scraped. Let's look at our JSON output file to make sure things have gone according to plan:import scrapy import re
BASE_URL = 'http://en.wikipedia.org'
def process_winner_li(w, country=None): """ Process a winner's
get comma-delineated name and strip trailing white-space
A. Define the data to be scraped
class NWinnerItem(scrapy.Item): category = scrapy.Field() date_of_birth = scrapy.Field() date_of_death = scrapy.Field() gender = scrapy.Field() link = scrapy.Field() name = scrapy.Field() nationality = scrapy.Field() born_in = scrapy.Field() place_of_birth = scrapy.Field() place_of_death = scrapy.Field() year = scrapy.Field()
B. Create a named spider
class NWinnerSpider(scrapy.Spider): """ Scrapes the country and link-text of the Nobel-winners. """
nobel_winners/nobel_winners/settings.py
HTTPCACHE_ENABLED = True
Scrapy Pipelines
useful for handling different item types with a single interface
from itemadapter import ItemAdapter from scrapy.exceptions import DropItem
class NobelWinnersPipeline: def process_item(self, item, spider): return item
class DropNonPersons(object): """Remove non-person winners"""
Scraping Text and Images with a Pipeline