Crawlee is a web scraping and browser automation library for Python that helps you build reliable crawlers. It extracts data for AI, LLMs, RAG, or GPTs, downloads HTML, PDF, JPG, PNG, and other files from websites, works with BeautifulSoup, Playwright, and raw HTTP, and supports both headful and headless modes, with proxy rotation.
import asyncio

from crawlee.playwright_crawler import (
    PlaywrightCrawler,
    PlaywrightCrawlingContext,
)


async def main() -> None:
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f"Processing {context.request.url} ...")

        # Extract data from the page. Playwright's page.title() is an async
        # method, so it must be called and awaited, not read as an attribute.
        data = {
            "url": context.request.url,
            "title": await context.page.title(),
        }

        # Enqueue all links found on the page.
        await context.enqueue_links()

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    # Run the crawler with the initial list of URLs.
    await crawler.run(["https://crawlee.dev"])

    # Export the entire dataset to a CSV file.
    await crawler.export_data("results.csv")


if __name__ == "__main__":
    asyncio.run(main())
TERMINAL:
Traceback (most recent call last):
  File "g:\python\python_experiment\my-crawler\my-crawler\routes.py", line 40, in <module>
    asyncio.run(main())
  File "D:\python3.9\lib\asyncio\runners.py", line 44, in run
    return loop.run_until_complete(main)
  File "D:\python3.9\lib\asyncio\base_events.py", line 642, in run_until_complete
    return future.result()
  File "g:\python\python_experiment\my-crawler\my-crawler\routes.py", line 36, in main
    await crawler.export_data("results.csv")
  File "g:\python\python_experiment\venv\lib\site-packages\crawlee\basic_crawler\basic_crawler.py", line 474, in export_data
    return await dataset.write_to(content_type, path.open('w', newline=''))
  File "g:\python\python_experiment\venv\lib\site-packages\crawlee\storages\dataset.py", line 213, in write_to
    writer.writerows([items[0].keys(), *[item.values() for item in items]])
IndexError: list index out of range
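The IndexError comes from Dataset.write_to, which indexes items[0] to build the CSV header without first checking whether the dataset is empty. The dataset ends up empty when every request handler call raises before push_data runs, for example when the title was read as context.page.title.string instead of being awaited as context.page.title() (fixed in the code above). A minimal guard is sketched below, assuming crawler.get_data() returns a page object with an items list, as in the Crawlee Python API at the time of writing; verify against your installed version.

    # Sketch of a defensive export at the end of main(): skip the CSV export
    # when the crawl produced no items, avoiding the IndexError above.
    # Assumption: crawler.get_data() exposes the default dataset's items.
    dataset_page = await crawler.get_data()
    if dataset_page.items:
        await crawler.export_data("results.csv")
    else:
        print("Dataset is empty; skipping CSV export.")

With the awaited page.title() fix in place, the handler completes, items reach the dataset, and the plain export_data call succeeds as well.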