Open blacksteel1288 opened 3 months ago
Interesting question. I've encountered two limitations while trying to make this work:
1. Scrapy allows `process_spider_output` methods in spider middlewares to be defined as coroutines, but does not support `async def process_spider_exception` methods. This is the reason I'm using `asyncio.create_task` in my example below.
2. The spider can close before the screenshot has been taken, which results in a `TargetClosedError: Page.screenshot: Target page, context or browser has been closed` exception. This can be handled by connecting to the `spider_idle` signal and raising `DontCloseSpider` if the screenshot has not yet been taken.

Full example:
import asyncio
import logging
import scrapy
from playwright.async_api import Page
from scrapy import signals
from scrapy.crawler import Crawler
class HandleExceptionMiddleware:
    """Spider middleware that screenshots the Playwright page when a callback fails.

    ``process_spider_exception`` is a plain (non-async) method, so the
    screenshot is scheduled as an asyncio task instead of being awaited.
    While such a task is still running, the ``spider_idle`` handler raises
    ``DontCloseSpider`` so the page is not torn down before the screenshot
    has been written.
    """

    # Destination of the screenshot; override in a subclass to change it.
    screenshot_path = "example_exception.png"

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        """Create the middleware for ``crawler``."""
        return cls(crawler)

    def __init__(self, crawler: Crawler) -> None:
        """Connect the idle handler and set up screenshot bookkeeping."""
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        # Set once a screenshot has been written successfully.
        self.screenshot_taken = asyncio.Event()
        # Strong references to in-flight screenshot tasks. The event loop only
        # keeps weak references to tasks, so a task nobody else references
        # could be garbage-collected before it finishes.
        self._pending_screenshots = set()

    def spider_idle(self, spider):
        """Keep the spider open only while a screenshot task is still running.

        Raises:
            scrapy.exceptions.DontCloseSpider: if at least one screenshot
                task has not finished yet.
        """
        # Checking pending tasks (rather than "was a screenshot ever taken")
        # lets the spider close normally when no callback raised at all, or
        # when taking the screenshot failed.
        if self._pending_screenshots:
            raise scrapy.exceptions.DontCloseSpider()

    def process_spider_exception(self, response, exception, spider):
        """Schedule a screenshot of the failing page and swallow the exception.

        Returns:
            An empty list, so the exception is treated as handled and the
            callback yields no output.
        """
        logging.info("Caught exception: %s", exception.__class__)
        page = response.meta.get("playwright_page")
        if page is None:
            # The response did not come with a Playwright page attached
            # ("playwright_include_page" not set), so there is nothing to capture.
            logging.warning("No Playwright page available for %s", response.url)
            return []
        task = asyncio.create_task(self.take_screenshot(page=page))
        self._pending_screenshots.add(task)
        task.add_done_callback(self._screenshot_finished)
        return []

    def _screenshot_finished(self, task):
        """Forget a finished screenshot task and report its failure, if any."""
        self._pending_screenshots.discard(task)
        if task.cancelled():
            return
        error = task.exception()
        if error is not None:
            logging.error("Taking the screenshot failed: %r", error)

    async def take_screenshot(self, page: Page):
        """Write a full-page screenshot of ``page``, then close the page."""
        try:
            await page.screenshot(path=self.screenshot_path, full_page=True)
            self.screenshot_taken.set()
        finally:
            # Close the page even when the screenshot fails, so it is not leaked.
            await page.close()
class HandleExceptionSpider(scrapy.Spider):
    """Demo spider whose callback always fails, to exercise the middleware.

    It issues one Playwright-backed request with the page object included in
    the response meta, then raises ``ZeroDivisionError`` in ``parse``.
    """

    name = "exception"
    custom_settings = {
        # Route spider-callback exceptions through the screenshot middleware.
        "SPIDER_MIDDLEWARES": {HandleExceptionMiddleware: 100},
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
    }

    def start_requests(self):
        """Yield the single request, asking for the Playwright page in meta."""
        playwright_meta = {"playwright": True, "playwright_include_page": True}
        request = scrapy.Request(url="https://example.org", meta=playwright_meta)
        yield request

    def parse(self, response, **kwargs):
        """Log the response URL, then fail on purpose."""
        logging.info("Received response for %s", response.url)
        # Deliberate ZeroDivisionError so process_spider_exception is triggered.
        1 / 0
Is there a way to take a screenshot for a `process_spider_exception` error? I can't figure out how to access the page object in that middleware.