unclecode / crawl4ai

🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper

Sync to Async #258

Closed: Armandd1 closed this issue 1 week ago

Armandd1 commented 1 week ago

Hello, I would like to ask for help switching from sync to async. I have the following code, and this part, "result_main = main_llm_extraction_strategy.run(url, [internal])", is still synchronous.

How can I make this async while keeping it functional, so that it still ties each extracted event to its link and the scraper can run on those links again later?

main_llm_extraction_strategy = LLMExtractionStrategy(
    provider=f'{PROVIDER}/{SCRAPER_MODEL}', 
    api_token=os.getenv('GEMINI_API_KEY'),
    schema=Event.model_json_schema(),
    extraction_type="schema",
    instruction=EXTRACTION_INSTRUCTION
)

async def extract_events_from_link(url: str) -> List[Dict]:
    all_extracted_events = []

    try:
        async with AsyncWebCrawler(verbose=True) as crawler:
            result = await crawler.arun(
                url,
                word_count_threshold=0,
                bypass_cache=True,
                bypass_headless=False
            )

            internal = '\n'.join([a['href'] for a in result.links.get('internal', [])])

            # this is the call that is still synchronous and blocks the event loop
            result_main = main_llm_extraction_strategy.run(url, [internal])

            extracted_events = parse_extracted_events(result_main)

            for event in extracted_events[:3]:
                event_link = event.get('event_link')
                if event_link:
                    detail_results = await crawler.arun(
                        url=event_link,
                        word_count_threshold=0,
                        extraction_strategy=sub_extraction_strategy,
                        bypass_cache=True,
                    )
unclecode commented 1 week ago

Hi @Armandd1, thanks for using Crawl4AI! Your current approach isn't the best way to do this; instead, pass the extraction strategy to the crawler. Here's a hypothetical code example to illustrate the concept, assuming some common structures you might want to extract. Feel free to modify it, and let me know if any issues arise.

import os
import json
from typing import List, Dict
from crawl4ai import AsyncWebCrawler 
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field

class EventDetail(BaseModel):
    description: str = Field(..., description="Detailed description of the event")
    location: str = Field(..., description="Event location")
    datetime: str = Field(..., description="Event date and time")

class Event(BaseModel):
    title: str = Field(..., description="Event title")
    event_link: str = Field(..., description="Link to event details")
    category: str = Field(..., description="Event category")

async def extract_events_from_link(url: str) -> List[Dict]:
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Initial extraction
        result = await crawler.arun(
            url=url,
            word_count_threshold=0,
            extraction_strategy=LLMExtractionStrategy(
                provider=f'{PROVIDER}/{SCRAPER_MODEL}',
                api_token=os.getenv('GEMINI_API_KEY'),
                schema=Event.model_json_schema(),
                extraction_type="schema",
                instruction=EXTRACTION_INSTRUCTION
            ),
            bypass_cache=True
        )

        events = json.loads(result.extracted_content)

        # Process the first 3 events' details
        for event in events[:3]:
            if event_link := event.get('event_link'):
                detail_result = await crawler.arun(
                    url=event_link,
                    word_count_threshold=0,
                    extraction_strategy=LLMExtractionStrategy(
                        provider=f'{PROVIDER}/{SCRAPER_MODEL}',
                        api_token=os.getenv('GEMINI_API_KEY'),
                        schema=EventDetail.model_json_schema(),
                        extraction_type="schema",
                        instruction="Extract detailed event information including description, location, and datetime"
                    ),
                    bypass_cache=True
                )
                event['details'] = json.loads(detail_result.extracted_content)

        return events
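
To run this from a normal (synchronous) script, you can drive the coroutine with asyncio.run. Here's a minimal, hypothetical usage sketch; the URL is just a placeholder, and PROVIDER, SCRAPER_MODEL, and EXTRACTION_INSTRUCTION are assumed to be defined as in your original code:

import asyncio

if __name__ == "__main__":
    # asyncio.run sets up the event loop, runs the coroutine to completion,
    # and tears the loop down again, so the surrounding script stays synchronous.
    events = asyncio.run(extract_events_from_link("https://example.com/events"))
    for event in events:
        print(event.get('title'), event.get('details'))

If you later want the three detail pages fetched concurrently rather than one after another, wrapping the crawler.arun calls in asyncio.gather is the usual next step.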