apify / crawlee-python

Crawlee—A web scraping and browser automation library for Python to build reliable crawlers. Extract data for AI, LLMs, RAG, or GPTs. Download HTML, PDF, JPG, PNG, and other files from websites. Works with BeautifulSoup, Playwright, and raw HTTP. Both headful and headless mode. With proxy rotation.
https://crawlee.dev/python/
Apache License 2.0

Error handler does not work #296

Closed: Hitreno-2 closed this issue 1 month ago

Hitreno-2 commented 1 month ago

I am using the latest version of Crawlee, Python 3.11, and Windows 11, and I tried both Chromium and Firefox. Below is a simple example. P.S. There is also a second error, ValueError: Cannot close the browser while there are open pages, and I don't know how to fix that either.

import asyncio
from datetime import timedelta as td

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext

# A non-existent domain, so navigation is guaranteed to fail.
urls = ["https://randomname32482395f.com"]

crawler = PlaywrightCrawler(
    headless=False,
    browser_type='firefox',  # tried chromium as well
    request_handler_timeout=td(seconds=30),
)

@crawler.router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
    print(await context.page.title())
    await context.page.close()

@crawler.failed_request_handler  # also tried error_handler
async def error_handler(context: PlaywrightCrawlingContext, error: Exception) -> None:
    print(f"Error processing {context.request.url}")

async def test():
    await crawler.add_requests(urls)
    await crawler.run()

asyncio.run(test())

Expected result: "Error processing https://randomname32482395f.com"

Obtained result:

[crawlee.statistics.statistics] INFO  crawlee.playwright_crawler.playwright_crawler request statistics {
        "requests_finished": 0,
        "requests_failed": 0,
        "retry_histogram": [
          0
        ],
        "request_avg_failed_duration": null,
        "request_avg_finished_duration": null,
        "requests_finished_per_minute": 0,
        "requests_failed_per_minute": 0,
        "request_total_duration": 0.0,
        "requests_total": 0,
        "crawler_runtime": 0.008034
      }
[crawlee.autoscaling.autoscaled_pool] INFO  current_concurrency = 0; desired_concurrency = 2; cpu = 0.0; mem = 0.0; event_loop = 0.0; client_info = 0.0
[crawlee.playwright_crawler.playwright_crawler] ERROR Request failed and reached maximum retries
      Traceback (most recent call last):
        File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\crawlee\basic_crawler\context_pipeline.py", line 62, in __call__
          result = await middleware_instance.__anext__()
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\crawlee\playwright_crawler\playwright_crawler.py", line 68, in _page_goto
          await crawlee_page.page.goto(context.request.url)
        File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\playwright\async_api\_generated.py", line 8657, in goto
          await self._impl_obj.goto(
        File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\playwright\_impl\_page.py", line 519, in goto
          return await self._main_frame.goto(**locals_to_params(locals()))
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\playwright\_impl\_frame.py", line 145, in goto
          await self._channel.send("goto", locals_to_params(locals()))
        File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\playwright\_impl\_connection.py", line 59, in send
          return await self._connection.wrap_api_call(
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\playwright\_impl\_connection.py", line 514, in wrap_api_call
          raise rewrite_error(error, f"{parsed_st['apiName']}: {error}") from None
      playwright._impl._errors.Error: Page.goto: NS_ERROR_UNKNOWN_HOST
      Call log:
      navigating to "https://randomname32482395f.com/", waiting until "load"

      The above exception was the direct cause of the following exception:

      Traceback (most recent call last):
        File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\crawlee\basic_crawler\basic_crawler.py", line 717, in __run_task_function
          await wait_for(
        File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\crawlee\_utils\wait.py", line 37, in wait_for
          return await asyncio.wait_for(operation(), timeout.total_seconds())
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\asyncio\tasks.py", line 489, in wait_for
          return fut.result()
                 ^^^^^^^^^^^^
        File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\crawlee\basic_crawler\basic_crawler.py", line 849, in __run_request_handler
          await self._context_pipeline(crawling_context, self.router)
        File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\crawlee\basic_crawler\context_pipeline.py", line 70, in __call__
          raise ContextPipelineInitializationError(e, crawling_context) from e
      crawlee.basic_crawler.errors.ContextPipelineInitializationError: (Error('Page.goto: NS_ERROR_UNKNOWN_HOST\nCall log:\nnavigating to "https://randomname32482395f.com/", waiting until "load"\n'), BasicCrawlingContext(request=Request(url='https://randomname32482395f.com', unique_key='https://randomname32482395f.com', method='get', payload=None, headers={}, user_data={'__crawlee': {'state': <RequestState.REQUEST_HANDLER: 3>}}, retry_count=2, no_retry=False, loaded_url=None, handled_at=None, id='9UKWVpeUSFHmghR', json_=None, order_no=None), session=<Session id='2058HIh7GZ' max_age=datetime.timedelta(seconds=3000) user_data={} max_error_score=3.0 error_score_decrement=0.5 created_at=datetime.datetime(2024, 7, 14, 11, 55, 40, 206931, tzinfo=datetime.timezone.utc) usage_count=0 max_usage_count=50 error_score=0.0 cookies={} blocked_status_codes=[401, 403, 429]>, proxy_info=None, send_request=<function BasicCrawler._prepare_send_request_function.<locals>.send_request at 0x000001C77D93D1C0>, add_requests=<bound method RequestHandlerRunResult.add_requests of RequestHandlerRunResult(add_requests_calls=[])>, push_data=<bound method BasicCrawler._push_data of <crawlee.playwright_crawler.playwright_crawler.PlaywrightCrawler object at 0x000001C7782F5590>>, log=<Logger crawlee.playwright_crawler.playwright_crawler (INFO)>))
[crawlee.autoscaling.autoscaled_pool] INFO  Waiting for remaining tasks to finish
Traceback (most recent call last):
  File "c:\Users\user\Desktop\parsera\check.py", line 28, in <module>
    asyncio.run(test())
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\asyncio\runners.py", line 190, in run
    return runner.run(main)
           ^^^^^^^^^^^^^^^^
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\asyncio\runners.py", line 118, in run
    return self._loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\asyncio\base_events.py", line 654, in run_until_complete
    return future.result()
           ^^^^^^^^^^^^^^^
  File "c:\Users\user\Desktop\parsera\check.py", line 25, in test
    await crawler.run()
  File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\crawlee\basic_crawler\basic_crawler.py", line 350, in run
    await run_task
  File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\crawlee\basic_crawler\basic_crawler.py", line 378, in _run_crawler
    async with AsyncExitStack() as exit_stack:
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\contextlib.py", line 745, in __aexit__
    raise exc_details[1]
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\contextlib.py", line 728, in __aexit__
    cb_suppress = await cb(*exc_details)
                  ^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\crawlee\browsers\browser_pool.py", line 173, in __aexit__
    await browser.close(force=True)
  File "C:\Users\user\Desktop\parsera\.venv\Lib\site-packages\crawlee\browsers\playwright_browser_controller.py", line 96, in close
    raise ValueError('Cannot close the browser while there are open pages.')
ValueError: Cannot close the browser while there are open pages.
Exception ignored in: <function BaseSubprocessTransport.__del__ at 0x000001C77A2E1BC0>
Traceback (most recent call last):
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\asyncio\base_subprocess.py", line 126, in __del__
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\asyncio\base_subprocess.py", line 104, in close
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\asyncio\proactor_events.py", line 109, in close
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\asyncio\base_events.py", line 762, in call_soon
  File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\asyncio\base_events.py", line 520, in _check_closed
RuntimeError: Event loop is closed
Hitreno-2 commented 1 month ago

I also tried changing the order of the handler registrations; it doesn't help.

janbuchar commented 1 month ago

Hello, thank you for your interest in Crawlee and for the bug report. Currently, error_handler only handles errors raised inside your request_handler function, but the error you are seeing (the failed page.goto navigation) happens before the request handler is ever invoked, which is why your handler never fires.
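
To illustrate the current behavior, here is a rough sketch of the kind of error the handlers do catch today: an exception raised inside the request handler itself (the URL and the raised error are just placeholders):

import asyncio

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext

crawler = PlaywrightCrawler()

@crawler.router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
    # An exception raised here, inside the request handler, is the kind
    # of error that error_handler / failed_request_handler currently see.
    raise RuntimeError('something went wrong while processing the page')

@crawler.failed_request_handler
async def failed_handler(context: PlaywrightCrawlingContext, error: Exception) -> None:
    # Invoked once the request has exhausted all of its retries.
    print(f'Failed {context.request.url}: {error}')

asyncio.run(crawler.run(['https://crawlee.dev']))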

We have decided to change this and let error_handler handle both kinds of errors, at the cost of the handler receiving only a BasicCrawlingContext rather than the full PlaywrightCrawlingContext. You can expect this change to be released in a matter of days.
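
Once that change lands, a handler registered via failed_request_handler should be typed against BasicCrawlingContext and should also fire for navigation failures like the NS_ERROR_UNKNOWN_HOST above. Roughly like this (a sketch only; the exact import path is an assumption and may differ by version):

from crawlee.basic_crawler import BasicCrawlingContext  # import path is an assumption

@crawler.failed_request_handler
async def error_handler(context: BasicCrawlingContext, error: Exception) -> None:
    # After the change, this should also catch errors raised before the
    # request handler runs (e.g. a failed page.goto). Note there is no
    # context.page here, since the context is not Playwright-specific.
    print(f'Error processing {context.request.url}: {error}')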