# `src/spiders/title.py`
import math
import re
from typing import Generator
import scrapy
from scrapy.responsetypes import Response
from scrapy import Request
from apify import Actor
class TitleSpider(scrapy.Spider):
    """Crawl a Primark listing and scrape every product page it links to.

    The crawl has three levels:

    1. ``parse`` reads the article count from the start URL and yields one
       request per listing page.
    2. ``_parse_second`` collects the product links on a listing page and
       yields one request per product.
    3. ``_parse_third`` extracts the product fields and yields them as a dict.
    """

    name = 'title'
    allowed_domains = ['primark.com']
    start_urls = ['https://www.primark.com/fr-fr/c/femme/vetements/pulls-et-gilets']

    # Page size the pagination arithmetic assumes when turning the article
    # count into a number of listing pages.
    ITEMS_PER_PAGE = 24

    def parse(self, response: Response) -> Generator[Request, None, None]:
        """Yield one request per listing page, handled by ``_parse_second``.

        The number of pages is derived from the first integer found in the
        article-count text. If that text is missing or contains no digits,
        a warning is logged and nothing is yielded.
        """
        Actor.log.info(f'TitleSpider.parse is parsing {response}...')

        count_text = response.css('div.MuiBox-root > p.MuiTypography-body2::text').get()
        count_match = re.search(r'\d+', count_text) if count_text else None
        if count_match is None:
            # Previously this case ended the crawl without any trace in the logs.
            Actor.log.warning(
                f'TitleSpider.parse could not read the article count from {response}; '
                'no listing pages will be requested.'
            )
            return

        article_count = int(count_match.group())
        last_page = math.ceil(article_count / self.ITEMS_PER_PAGE)

        # Page 1 is the listing URL itself; later pages use the ?page=N query.
        page_urls = [response.url]
        page_urls += [f'{response.url}?page={page}' for page in range(2, last_page + 1)]

        for page_url in page_urls:
            # dont_filter=True: the first URL equals the one just fetched, so
            # the duplicate filter would presumably drop it otherwise.
            request = Request(dont_filter=True, url=page_url, callback=self._parse_second)
            Actor.log.debug(f'TitleSpider.parse is yielding a new request={request}...')
            yield request

    def _parse_second(self, response: Response) -> Generator[Request, None, None]:
        """Yield one request per distinct product link found on a listing page.

        Only hrefs starting with ``/fr-fr/p/`` are followed; each is made
        absolute against ``https://www.primark.com`` and handled by
        ``_parse_third``.
        """
        Actor.log.info(f'TitleSpider._parse_second is parsing {response}...')

        grid = response.css('div.MuiGrid-root.MuiGrid-container')
        grid_items = grid.css('div.MuiGrid-item')
        anchors = grid_items.css('a.MuiTypography-colorPrimary')
        hrefs = anchors.css('a::attr(href)').getall()

        # dict.fromkeys removes duplicates while keeping document order, so
        # the order in which requests are yielded is deterministic.
        product_urls = list(dict.fromkeys(
            f'https://www.primark.com{href}'
            for href in hrefs
            if href.startswith('/fr-fr/p/')
        ))

        for product_url in product_urls:
            request = Request(url=product_url, callback=self._parse_third)
            Actor.log.debug(f'TitleSpider._parse_second is yielding a new request={request}...')
            yield request

    def _parse_third(self, response: Response) -> Generator[dict, None, None]:
        """Yield a single dict describing the product on this page.

        Fields whose selector matches nothing are ``None``. The three
        breadcrumb-derived fields (``gender``, ``firstCategorie``,
        ``categorie``) are ``None`` when the page has fewer breadcrumb
        links than expected, instead of raising ``IndexError``.
        """
        Actor.log.info(f'TitleSpider._parse_third is parsing {response}...')

        productname = response.css('h1.MuiTypography-root.MuiTypography-body1::text').get()
        description = response.css('h5.MuiTypography-root.MuiTypography-body1::text').get()
        prix = response.css('p.MuiTypography-root.MuiTypography-body1::text').get()
        color = response.css('span.MuiTypography-root.MuiTypography-body2::text').get()

        breadcrumbs = response.css('li.MuiBreadcrumbs-li > a:first-child::text').getall()
        # Pad to three entries so a short breadcrumb trail cannot crash the
        # callback and lose the whole item.
        gender, first_category, category = (breadcrumbs + [None] * 3)[:3]

        # NOTE(review): `jss1088` looks like a generated class name and this
        # selector returns the serialized <img> elements, not their URLs —
        # confirm this is what downstream consumers expect.
        images = response.css('div.jss1088 > img').getall()

        result = {
            'productname': productname,
            'description': description,
            'prix': prix,
            'color': color,
            'gender': gender,
            'firstCategorie': first_category,
            'categorie': category,
            'link': response.url,
            'imageContainer': images,
        }
        Actor.log.debug(f'TitleSpider._parse_third is yielding a new result={result}...')
        yield result
What is wrong
When the first-level parse function (`parse`) yields a new request, Scheduler.enqueue_request is called immediately afterwards:
[apify][INFO] TitleSpider.parse is parsing <200 https://www.primark.com/fr-fr/c/femme/vetements/pulls-et-gilets>...
...
[apify][DEBUG] TitleSpider.parse is yielding a new request=<GET https://www.primark.com/fr-fr/c/femme/vetements/pulls-et-gilets?page=2>...
[apify][DEBUG] [N9og2C]: ApifyScheduler.enqueue_request was called (scrapy_request=<GET https://www.primark.com/fr-fr/c/femme/vetements/pulls-et-gilets?page=2>)...
[apify][DEBUG] [ZFwgrc]: to_apify_request was called (scrapy_request=<GET https://www.primark.com/fr-fr/c/femme/vetements/pulls-et-gilets?page=2>)...
[apify][DEBUG] [ZFwgrc]: scrapy_request was converted to the apify_request={'url': 'https://www.primark.com/fr-fr/c/femme/vetements/pulls-et-gilets?page=2', ...}
...
However, when the second-level parse function (`_parse_second`) yields a new request, Scheduler.enqueue_request is not called at all:
[apify][INFO] TitleSpider._parse_second is parsing <200 https://www.primark.com/fr-fr/c/femme/vetements/pulls-et-gilets?page=2>...
[apify][DEBUG] TitleSpider._parse_second is yielding a new request=<GET https://www.primark.com/fr-fr/p/gilet-a-col-ras-du-cou-gris-chine-991069427720>...
[apify][DEBUG] TitleSpider._parse_second is yielding a new request=<GET https://www.primark.com/fr-fr/p/pull-ras-du-cou-en-maille-fine-gris-991070596706>...
[apify][DEBUG] TitleSpider._parse_second is yielding a new request=<GET https://www.primark.com/fr-fr/p/pull-decontracte-a-col-roule-camel-991080764610>...
...
This means that a Request yielded from the second-level parse function never gets into the request queue.
Description
How to replicate it
What is wrong
If the first-level parse function yields a new request,
Scheduler.enqueue_request
is called immediately after. However, if the second-level parse function yields a new request,
Scheduler.enqueue_request
is not called at all. This means that the Request yielded from the second-level parse function never gets into the request queue.