scrapy-plugins / scrapy-playwright

🎭 Playwright integration for Scrapy

API to handle multiple browser contexts #12

Closed: elacuesta closed this issue 3 years ago

elacuesta commented 3 years ago

It would be good to have a way to manage multiple browser contexts, either defined at the start of the crawl (in settings, for instance) or created/destroyed during the crawl.
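For the settings-defined variant, something along these lines could work. This is a minimal sketch only: `PLAYWRIGHT_CONTEXTS` and the `playwright_context` meta key are hypothetical names used to make the idea concrete, not an implemented API.

```python
import scrapy

# settings.py (hypothetical): declare named contexts up front;
# each value would be passed to browser.new_context(**kwargs)
PLAYWRIGHT_CONTEXTS = {
    "default": {},
    "proxied": {"proxy": {"server": "127.0.0.1:5566"}},
}


class MySpider(scrapy.Spider):
    name = "example"

    def start_requests(self):
        # hypothetical meta key selecting which context handles the request
        yield scrapy.Request(
            "https://example.org",
            meta={"playwright": True, "playwright_context": "proxied"},
        )
```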

/cc @kalessin

michaelvsinko commented 3 years ago

That would be nice!

I'm trying to control the number of contexts and the number of pages per context.

The motivation for this is that I need to make some requests from a single IP and then switch to a different one. I'm using https://github.com/mattes/rotating-proxy as a rotating proxy.

I have modified the middleware to do that, and now I can set the maximum number of contexts using PLAYWRIGHT_MAX_CONTEXTS and the maximum number of pages per context using PLAYWRIGHT_MAX_PAGES_PER_CONTEXT. After setting the context args

```python
...
"PLAYWRIGHT_CONTEXT_ARGS": {
    "proxy": {
        "server": "127.0.0.1:5566",
    },
},
```

each context has its own IP.
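Putting the pieces together, the relevant settings for my fork look roughly like this (the values are illustrative):

```python
# settings.py -- values are illustrative
PLAYWRIGHT_MAX_CONTEXTS = 4           # never keep more than 4 contexts alive
PLAYWRIGHT_MAX_PAGES_PER_CONTEXT = 8  # a new context is created only once all
                                      # existing ones have 8 pages in flight
PLAYWRIGHT_CONTEXT_ARGS = {
    "proxy": {
        "server": "127.0.0.1:5566",   # rotating-proxy endpoint; each new
    },                                # context comes up with a fresh IP
}
```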

Links: fork, middleware, usage example

Middleware:

```python
import asyncio
import logging
from time import time
from typing import Callable, Optional, Type, TypeVar, Union
from urllib.parse import urlparse

from playwright.async_api import (
    BrowserContext,
    Page,
    PlaywrightContextManager,
    Request as PwRequest,
    Route,
)
from scrapy import Spider, signals
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.crawler import Crawler
from scrapy.http import Request, Response
from scrapy.http.headers import Headers
from scrapy.responsetypes import responsetypes
from scrapy.statscollectors import StatsCollector
from scrapy.utils.defer import deferred_from_coro
from scrapy.utils.reactor import verify_installed_reactor
from twisted.internet.defer import Deferred, inlineCallbacks

from scrapy_playwright.page import PageCoroutine

__all__ = ["ScrapyPlaywrightDownloadHandler"]

PlaywrightHandler = TypeVar("PlaywrightHandler", bound="ScrapyPlaywrightDownloadHandler")

logger = logging.getLogger("scrapy-playwright")


def _make_request_handler(
    browser_type: str,
    scrapy_request: Request,
    stats: StatsCollector,
) -> Callable:
    def request_handler(route: Route, pw_request: PwRequest) -> None:
        """Override request headers, method and body"""
        if pw_request.url == scrapy_request.url:
            overrides = {
                "method": scrapy_request.method,
                "headers": {
                    key.decode("utf-8").lower(): value[0].decode("utf-8")
                    for key, value in scrapy_request.headers.items()
                },
            }
            if scrapy_request.body:
                overrides["post_data"] = scrapy_request.body.decode(scrapy_request.encoding)
            # otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
            if browser_type == "firefox":
                overrides["headers"]["host"] = urlparse(pw_request.url).netloc
        else:
            overrides = {"headers": pw_request.headers.copy()}
            # override user agent, for consistency with other requests
            if scrapy_request.headers.get("user-agent"):
                user_agent = scrapy_request.headers["user-agent"].decode("utf-8")
                overrides["headers"]["user-agent"] = user_agent
        asyncio.create_task(route.continue_(**overrides))
        # increment stats
        stats.inc_value("playwright/request_method_count/{}".format(pw_request.method))
        stats.inc_value("playwright/request_count")
        if pw_request.is_navigation_request():
            stats.inc_value("playwright/request_count/navigation")

    return request_handler


class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler):

    browser_type: str = "chromium"  # default browser type
    default_navigation_timeout: Optional[int] = None
    launch_options: dict = dict()
    context_options: dict = dict()

    def __init__(self, crawler: Crawler) -> None:
        settings = crawler.settings
        super().__init__(settings=settings, crawler=crawler)
        verify_installed_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
        crawler.signals.connect(self._engine_started, signals.engine_started)
        self.stats = crawler.stats

        # read settings
        self.launch_options = settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}
        self.context_args = settings.getdict("PLAYWRIGHT_CONTEXT_ARGS") or {}
        self.default_navigation_timeout = (
            settings.getint("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT") or None
        )
        if settings.get("PLAYWRIGHT_BROWSER_TYPE"):
            self.browser_type = settings["PLAYWRIGHT_BROWSER_TYPE"]
        self.max_contexts = settings.getint("PLAYWRIGHT_MAX_CONTEXTS") or 1
        self.max_pages_per_context = (
            settings.getint("PLAYWRIGHT_MAX_PAGES_PER_CONTEXT") or None
        )

        self.contexts = []
        self._creating_lock = asyncio.Lock()

    @classmethod
    def from_crawler(cls: Type[PlaywrightHandler], crawler: Crawler) -> PlaywrightHandler:
        return cls(crawler)

    def _engine_started(self) -> Deferred:
        logger.info("Launching browser")
        return deferred_from_coro(self._launch_browser())

    async def _launch_browser(self) -> None:
        self.playwright_context_manager = PlaywrightContextManager()
        self.playwright = await self.playwright_context_manager.start()
        browser_launcher = getattr(self.playwright, self.browser_type).launch
        self.browser = await browser_launcher(**self.launch_options)
        logger.info(f"Browser {self.browser_type} launched")

    @inlineCallbacks
    def close(self) -> Deferred:
        yield super().close()
        if getattr(self, "contexts", None):
            logger.info("Closing browser contexts")
            for context in self.contexts:
                for page in context.pages:
                    yield deferred_from_coro(page.close())
                yield deferred_from_coro(context.close())
        if getattr(self, "browser", None):
            logger.info("Closing browser")
            yield deferred_from_coro(self.browser.close())
        yield deferred_from_coro(self.playwright_context_manager.__aexit__())

    def download_request(self, request: Request, spider: Spider) -> Deferred:
        if request.meta.get("playwright"):
            return deferred_from_coro(self._download_request(request, spider))
        return super().download_request(request, spider)

    async def _download_request(self, request: Request, spider: Spider) -> Response:
        context = request.meta.get("playwright_context")
        page = request.meta.get("playwright_page")
        if not isinstance(page, Page) and isinstance(context, BrowserContext):
            async with self._creating_lock:
                page = await self._create_page(context)
        elif not isinstance(page, Page):
            async with self._creating_lock:
                context = await self._create_or_get_context()
                page = await self._create_page(context)

        await page.unroute("**")
        await page.route(
            "**",
            _make_request_handler(
                browser_type=self.browser_type, scrapy_request=request, stats=self.stats
            ),
        )

        try:
            result = await self._download_request_with_page(request, spider, page)
        except Exception:
            if not page.is_closed():
                await page.close()
                self.stats.inc_value("playwright/page_count/closed")
            raise
        else:
            return result

    async def _create_or_get_context(self) -> BrowserContext:
        context_index = await self._get_free_context_index()
        if context_index is None and len(self.contexts) < self.max_contexts:
            context = await self._create_context()
        elif context_index is None:
            while (context_index := await self._get_free_context_index()) is None:
                logger.debug(
                    "Waiting to release some context, maximum number of contexts is reached",
                )
                await asyncio.sleep(1.0)
            context = self.contexts[context_index]
        else:
            context = self.contexts[context_index]
        return context

    async def _create_context(self) -> BrowserContext:
        context = await self.browser.new_context(**self.context_args)
        self.contexts.append(context)
        logger.info("Browser context started")
        if self.default_navigation_timeout:
            context.set_default_navigation_timeout(self.default_navigation_timeout)
        return context

    async def _get_free_context_index(self) -> Union[int, None]:
        free_context_index = None
        for i, context in enumerate(self.contexts):
            if (
                self.max_pages_per_context is None
                or len(context.pages) < self.max_pages_per_context
            ):
                free_context_index = i
                break
        return free_context_index

    async def _create_page(self, context: BrowserContext) -> Page:
        page = await context.new_page()
        if self.default_navigation_timeout:
            page.set_default_navigation_timeout(self.default_navigation_timeout)
        return page

    async def _download_request_with_page(
        self, request: Request, spider: Spider, page: Page
    ) -> Response:
        start_time = time()
        page_load_event = request.meta.get("playwright_page_load_event") or "load"
        response = await page.goto(url=request.url, wait_until=page_load_event)

        page_coroutines = request.meta.get("playwright_page_coroutines") or ()
        if isinstance(page_coroutines, dict):
            page_coroutines = page_coroutines.values()
        for pc in page_coroutines:
            if isinstance(pc, PageCoroutine):
                method = getattr(page, pc.method)
                pc.result = await method(*pc.args, **pc.kwargs)
                await page.wait_for_load_state(
                    state=page_load_event,
                    timeout=self.default_navigation_timeout,
                )

        body = (await page.content()).encode("utf8")
        request.meta["download_latency"] = time() - start_time

        if request.meta.get("playwright_include_page"):
            request.meta["playwright_page"] = page
        else:
            await page.close()
            self.stats.inc_value("playwright/page_count/closed")

        headers = Headers(response.headers)
        headers.pop("Content-Encoding", None)
        respcls = responsetypes.from_args(headers=headers, url=page.url, body=body)
        return respcls(
            url=page.url,
            status=response.status,
            headers=headers,
            body=body,
            request=request,
            flags=["playwright"],
        )
```
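For completeness, here is a minimal sketch of how the handler can be wired up and used from a spider. The module path `myproject.handler` is an assumption; the meta keys match the handler above (`playwright`, `playwright_include_page`, `playwright_page`):

```python
# settings.py -- route HTTP(S) downloads through the handler
# ("myproject.handler" is an assumed module path for the code above)
DOWNLOAD_HANDLERS = {
    "http": "myproject.handler.ScrapyPlaywrightDownloadHandler",
    "https": "myproject.handler.ScrapyPlaywrightDownloadHandler",
}
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
PLAYWRIGHT_MAX_CONTEXTS = 4
PLAYWRIGHT_MAX_PAGES_PER_CONTEXT = 8

# spider
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        # "playwright": True makes download_request() delegate to Playwright
        yield scrapy.Request(
            "http://quotes.toscrape.com",
            meta={"playwright": True, "playwright_include_page": True},
        )

    async def parse(self, response):
        # with playwright_include_page the handler leaves the page open and
        # exposes it via response.meta; the caller is responsible for closing it
        page = response.meta["playwright_page"]
        self.logger.info("Rendered title: %s", await page.title())
        await page.close()
```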