unclecode / crawl4ai

🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper

Sync to Async #258

Closed: Armandd1 closed this issue 1 week ago

Armandd1 commented 1 week ago

Hello, I would like to ask for help switching from sync to async. I have the following code, and this part, "result_main = main_llm_extraction_strategy.run(url, [internal])", is still synchronous.

How can I make this async while keeping it functional, so that it still ties each extracted event to its link and the scraper can run on those links again later?

main_llm_extraction_strategy = LLMExtractionStrategy(
    provider=f'{PROVIDER}/{SCRAPER_MODEL}', 
    api_token=os.getenv('GEMINI_API_KEY'),
    schema=Event.model_json_schema(),
    extraction_type="schema",
    instruction=EXTRACTION_INSTRUCTION
)

async def extract_events_from_link(url: str) -> List[Dict]:
    all_extracted_events = []

    try:
        async with AsyncWebCrawler(verbose=True) as crawler:
            result = await crawler.arun(
                url,
                word_count_threshold=0,
                bypass_cache=True,
                bypass_headless=False
            )

            internal = '\n'.join([a['href'] for a in result.links.get('internal', [])])

            # this is the call that is still synchronous and blocks the event loop
            result_main = main_llm_extraction_strategy.run(url, [internal])

            extracted_events = parse_extracted_events(result_main)

            for event in extracted_events[:3]:
                event_link = event.get('event_link')
                if event_link:
                    detail_results = await crawler.arun(
                        url=event_link,
                        word_count_threshold=0,
                        extraction_strategy=sub_extraction_strategy,
                        bypass_cache=True,
                    )
unclecode commented 1 week ago

Hi @Armandd1, thanks for using Crawl4AI! Your current approach isn't the best way to do this; instead, pass the extraction strategy to the crawler. Here's a hypothetical code example to illustrate the concept, assuming some common structures you might want to extract. Feel free to modify it, and let me know if any issues arise.

import os
import json
from typing import List, Dict
from crawl4ai import AsyncWebCrawler 
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field

class EventDetail(BaseModel):
    description: str = Field(..., description="Detailed description of the event")
    location: str = Field(..., description="Event location")
    datetime: str = Field(..., description="Event date and time")

class Event(BaseModel):
    title: str = Field(..., description="Event title")
    event_link: str = Field(..., description="Link to event details")
    category: str = Field(..., description="Event category")

async def extract_events_from_link(url: str) -> List[Dict]:
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Initial extraction
        result = await crawler.arun(
            url=url,
            word_count_threshold=0,
            extraction_strategy=LLMExtractionStrategy(
                provider=f'{PROVIDER}/{SCRAPER_MODEL}',
                api_token=os.getenv('GEMINI_API_KEY'),
                schema=Event.model_json_schema(),
                extraction_type="schema",
                instruction=EXTRACTION_INSTRUCTION
            ),
            bypass_cache=True
        )

        events = json.loads(result.extracted_content)

        # Process the first 3 events' details
        for event in events[:3]:
            if event_link := event.get('event_link'):
                detail_result = await crawler.arun(
                    url=event_link,
                    word_count_threshold=0,
                    extraction_strategy=LLMExtractionStrategy(
                        provider=f'{PROVIDER}/{SCRAPER_MODEL}',
                        api_token=os.getenv('GEMINI_API_KEY'),
                        schema=EventDetail.model_json_schema(),
                        extraction_type="schema",
                        instruction="Extract detailed event information including description, location, and datetime"
                    ),
                    bypass_cache=True
                )
                event['details'] = json.loads(detail_result.extracted_content)

        return events
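
To run this from a normal (synchronous) script, you can drive the coroutine with asyncio.run. Here's a minimal, hypothetical usage sketch; the URL is just a placeholder, and PROVIDER, SCRAPER_MODEL, and EXTRACTION_INSTRUCTION are assumed to be defined as in your original code:

import asyncio

if __name__ == "__main__":
    # asyncio.run sets up the event loop, runs the coroutine to completion,
    # and tears the loop down again, so the surrounding script stays synchronous.
    events = asyncio.run(extract_events_from_link("https://example.com/events"))
    for event in events:
        print(event.get('title'), event.get('details'))

If you later want the three detail pages fetched concurrently rather than one after another, wrapping the crawler.arun calls in asyncio.gather is the usual next step.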