apify / crawlee-python

Crawlee: a web scraping and browser automation library for Python for building reliable crawlers. Extract data for AI, LLMs, RAG, or GPTs. Download HTML, PDF, JPG, PNG, and other files from websites. Works with BeautifulSoup, Playwright, and raw HTTP, in both headful and headless modes, with proxy rotation.
https://crawlee.dev/python/
Apache License 2.0

Crawler runs into an infinite loop and won't stop at the request limit, calling the handler again and again for the same context #437

Closed · robert-elles closed 2 months ago

robert-elles commented 2 months ago

I am observing strange behavior with my crawler, defined below. The parsing should stop after 5 requests, but it never stops. It seems that this line in routes.py:

price = await context.page.locator('#viewad-price').text_content() or ''

is provoking an infinite loop: the crawler tries to parse the same URL over and over, or rather the default_handler keeps being called again and again for the same context.

My main.py:

import asyncio
import logging

from crawlee.playwright_crawler import PlaywrightCrawler

from .routes import router

# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(message)s')   

async def main() -> None:
    """The crawler entry point."""
    crawler = PlaywrightCrawler(
        request_handler=router,     
        max_requests_per_crawl=5,
    )

    await crawler.run(
        [
            'https://www.kleinanzeigen.de/s-motorraeder-roller/motorrad/berlin/anzeige:angebote/preis:1200:10000/motorrad/k0c305l3331+motorraeder_roller.type_s:motorrad',
        ]
    )

    await crawler.export_data('results.json')

if __name__ == '__main__':
    asyncio.run(main())

my routes.py:

from crawlee.basic_crawler import Router
from crawlee.playwright_crawler import PlaywrightCrawlingContext

router = Router[PlaywrightCrawlingContext]()

@router.default_handler
async def default_handler(context: PlaywrightCrawlingContext) -> None:

    context.log.info(f'default_handler is processing {context.request.url}')

    await context.enqueue_links(
        selector='article h2 a',
        label='DETAILS',
    )

    # await context.enqueue_links(
    #     selector='.pagination-next',
    # )

@router.handler('DETAILS')
async def detail_handler(context: PlaywrightCrawlingContext) -> None:
    # This replaces the context.request.label == DETAIL branch of the if clause.
    # context.log.info(f'detail_handler is processing {context.request.url}')

    title = await context.page.locator('article h1').text_content() or ''
    title = title.replace('Reserviert • Gelöscht •', '').strip()

    context.log.info(f'parsing {title}')

    price = await context.page.locator('#viewad-price').text_content() or ''
    # price = context.soup.find('h2').text # this line and above somehow create an infinite loop
    location = await context.page.locator('#viewad-locality').text_content()
    view_cnt = await context.page.locator('#viewad-cntr span').text_content()
    created = await context.page.locator('#viewad-extra-info span').text_content()

    # details = context.page.locator('li.addetailslist--detail')
    # for i in range(await details.count()):
    #     detail = details.nth(i)
    #     key = await detail.locator('span').text_content()
    #     value = await detail.locator('strong').text_content()
    #     context.log.info(f'{key}: {value}')

    # d = [ k.text for k in context.soup.find_all('li', class_='addetailslist--detail') ]   

    data = {
        'title': title,
        'url': context.request.url,
        'price': price,
        'view_cnt': view_cnt,
        'location': location,
        'created': created,
        # 'created': view_cnt,
        # 'details': d,
    }

    context.log.info(f'pushing data: {data}')

    await context.push_data(data)

vdusek commented 2 months ago

Hi, I tried to execute your code with additional logging to better investigate what's going on.

Most of the logging went into the detail handler:

@router.handler('DETAILS')
async def detail_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'detail_handler: processing {context.request.url} ...')

    title = await context.page.locator('article h1').text_content() or ''
    title = title.replace('Reserviert • Gelöscht •', '').strip()
    context.log.info(f'title: {title}')

    price = (await context.page.locator('#viewad-price').text_content() or '').strip()
    context.log.info(f'price: {price}')

    location = (await context.page.locator('#viewad-locality').text_content() or '').strip()
    context.log.info(f'location: {location}')

    view_cnt = (await context.page.locator('#viewad-cntr span').text_content() or '').strip()
    context.log.info(f'view_cnt: {view_cnt}')

    created = (await context.page.locator('#viewad-extra-info span').text_content() or '').strip()
    context.log.info(f'created: {created}')

    data = {
        'title': title,
        'url': context.request.url,
        'price': price,
        'view_cnt': view_cnt,
        'location': location,
        'created': created,
    }

    context.log.info(f'all data extracted ({data}), pushing...')
    await context.push_data(data)

It did not run into an infinite loop; the crawler run ended with the following:

[crawlee.playwright_crawler.playwright_crawler] INFO  Final request statistics:
┌───────────────────────────────┬────────────┐
│ requests_finished             │ 1          │
│ requests_failed               │ 25         │
│ retry_histogram               │ [1, 0, 25] │
│ request_avg_failed_duration   │ 1.38054    │
│ request_avg_finished_duration │ 1.835258   │
│ requests_finished_per_minute  │ 2          │
│ requests_failed_per_minute    │ 44         │
│ request_total_duration        │ 36.348751  │
│ requests_total                │ 26         │
│ crawler_runtime               │ 33.728409  │
└───────────────────────────────┴────────────┘

However, all requests processed by the detail handler failed because of Playwright's "strict mode violation"; see the log below:

[crawlee.playwright_crawler.playwright_crawler] INFO  detail_handler: processing https://www.kleinanzeigen.de/s-anzeige/victory-polaris-v11-vegas-bj-08-2016-nur-5500-km-top-zustand-festpreis-/2736558027-305-3401 ...
[crawlee.playwright_crawler.playwright_crawler] INFO  title: Victory Polaris V11 Vegas Bj 08/2016 nur 5500 Km Top Zustand Festpreis !!!
[crawlee.playwright_crawler.playwright_crawler] INFO  price: 8.000 €
[crawlee.playwright_crawler.playwright_crawler] INFO  location: 10997 Friedrichshain-Kreuzberg - Kreuzberg
[crawlee.playwright_crawler.playwright_crawler] INFO  view_cnt: 7148
[crawlee.playwright_crawler.playwright_crawler] ERROR Request failed and reached maximum retries
      Traceback (most recent call last):
        File "/home/vdusek/Projects/crawlee-py/src/crawlee/basic_crawler/context_pipeline.py", line 76, in __call__
          await final_context_consumer(cast(TCrawlingContext, crawling_context))
        File "/home/vdusek/Projects/crawlee-py/src/crawlee/basic_crawler/router.py", line 55, in __call__
          return await handler(context)
                 ^^^^^^^^^^^^^^^^^^^^^^
        File "/home/vdusek/Projects/crawlee-py/run_issue_437.py", line 40, in detail_handler
          created = (await context.page.locator('#viewad-extra-info span').text_content() or '').strip()
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "/home/vdusek/Projects/crawlee-py/.venv/lib/python3.12/site-packages/playwright/async_api/_generated.py", line 17101, in text_content
          await self._impl_obj.text_content(timeout=timeout)
        File "/home/vdusek/Projects/crawlee-py/.venv/lib/python3.12/site-packages/playwright/_impl/_locator.py", line 607, in text_content
          return await self._frame.text_content(
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "/home/vdusek/Projects/crawlee-py/.venv/lib/python3.12/site-packages/playwright/_impl/_frame.py", line 613, in text_content
          return await self._channel.send("textContent", locals_to_params(locals()))
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "/home/vdusek/Projects/crawlee-py/.venv/lib/python3.12/site-packages/playwright/_impl/_connection.py", line 59, in send
          return await self._connection.wrap_api_call(
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "/home/vdusek/Projects/crawlee-py/.venv/lib/python3.12/site-packages/playwright/_impl/_connection.py", line 514, in wrap_api_call
          raise rewrite_error(error, f"{parsed_st['apiName']}: {error}") from None
      playwright._impl._errors.Error: Locator.text_content: Error: strict mode violation: locator("#viewad-extra-info span") resolved to 2 elements:
          1) <span>19.08.2024</span> aka get_by_text("19.08.2024")
          2) <span id="viewad-cntr-num">7148</span> aka get_by_text("7148")

      Call log:
      waiting for locator("#viewad-extra-info span")

Your locator is not specific enough and resolves to multiple elements, so every detail request fails and is then retried, which is why the handler appears to be called again and again for the same request.

The Strictness section of the Playwright docs may also help: https://playwright.dev/python/docs/locators#strictness.
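
For illustration, here is a minimal sketch of two ways to make that locator strict-mode safe. It assumes, based on the error above, that the creation date is the first <span> under #viewad-extra-info and that the second matching <span> is the view counter with id viewad-cntr-num:

# Option 1: take the first matching element explicitly.
created = (await context.page.locator('#viewad-extra-info span').first.text_content() or '').strip()

# Option 2: narrow the CSS selector so that exactly one element matches.
selector = '#viewad-extra-info span:not(#viewad-cntr-num)'
created = (await context.page.locator(selector).text_content() or '').strip()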

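As for the handler being called repeatedly: Crawlee retries failed requests (the retry_histogram above shows the 25 failing requests being retried before giving up), which is what looked like an infinite loop. A sketch of how to fail fast instead, assuming the max_request_retries option of BasicCrawler is forwarded by PlaywrightCrawler:

crawler = PlaywrightCrawler(
    request_handler=router,
    max_requests_per_crawl=5,
    # Assumption: with zero retries, a failed request is not re-enqueued,
    # so a handler never runs twice for the same request.
    max_request_retries=0,
)
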
Closing the issue as it seems like this is not a Crawlee-related issue.