Issue with JsonCssExtractionStrategy

unclecode / crawl4ai

🔥🕷️ Crawl4AI: Crawl Smarter, Faster, Freely. For AI.

Apache License 2.0

17k stars 1.26k forks source link

How can I change the browser?

import asyncio
from crawl4ai import AsyncWebCrawler
import base64
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
import json

async def main():
    """Crawl the NBC News business page and print the first extracted item.

    Builds a CSS-selector schema, hands it to ``JsonCssExtractionStrategy``,
    runs the crawl, then prints the number of extracted items followed by the
    first item rendered as indented JSON.
    """
    # Field specifications, kept separate so the schema itself stays short.
    title_field = {
        "name": "title",
        "selector": "h2",
        "type": "text",
    }
    summary_field = {
        "name": "summary",
        "selector": "div.tease-card__info",
        "type": "text",
    }
    schema = {
        "name": "News Articles",
        "baseSelector": "article.tease-card",
        "fields": [title_field, summary_field],
    }

    async with AsyncWebCrawler(verbose=True) as crawler:
        strategy = JsonCssExtractionStrategy(schema, verbose=True)
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            extraction_strategy=strategy,
        )
        # extracted_content is parsed as JSON; the code indexes it as a list.
        articles = json.loads(result.extracted_content)
        print(f"Extracted {len(articles)} articles")
        first_article = articles[0]
        print(json.dumps(first_article, indent=2))

# Run the async entry point only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())

Output:

Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.
[LOG] 🌤️  Warming up the AsyncWebCrawler
[LOG] 🌞 AsyncWebCrawler is ready to crawl
[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.05 seconds
[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.05 seconds.
Extracted 258 articles
{
  "index": 0,
  "tags": [],
  "content": "IE 11 is not supported. For an optimal experience visit our site on another\nbrowser."
}

import asyncio
import base64
import json

from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy


async def main():
    """Crawl the NBC News business page and print the first extracted article.

    Builds a CSS-selector schema, hands it to ``JsonCssExtractionStrategy``,
    runs the crawl with ``bypass_cache=True``, then prints the number of
    extracted items and the first item as indented JSON. If nothing was
    extracted, a short message is printed instead of raising.
    """
    schema = {
        "name": "News Articles",
        "baseSelector": ".wide-tease-item__info-wrapper",
        "fields": [
            {
                "name": "title",
                "selector": "h2",
                "type": "text",
            },
            {
                "name": "summary",
                "selector": "div.wide-tease-item__description",
                "type": "text",
            },
        ],
    }

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            # NOTE(review): presumably forces a fresh crawl instead of reusing
            # a cached result for this URL -- confirm against the Crawl4AI docs.
            bypass_cache=True,
            extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True),
        )

        # json.loads() raises TypeError on None, so stop early when the crawl
        # produced no extracted content at all.
        if not result.extracted_content:
            print("No content was extracted")
            return

        extracted_data = json.loads(result.extracted_content)
        print(f"Extracted {len(extracted_data)} articles")

        # Selectors that match nothing give an empty list; indexing [0] would
        # raise IndexError, so only print a sample when one exists.
        if extracted_data:
            print(json.dumps(extracted_data[0], indent=2))
        else:
            print("No articles matched the schema selectors")


if __name__ == "__main__":
    asyncio.run(main())

unclecode / crawl4ai

Issue with JsonCssExtractionStrategy #163