unstructured data download

If you are asking whether we can download files as part of crawling and scraping, the answer is yes; I will provide the code snippet right here. If you are asking whether we can extract information from PDFs, videos, and other formats, that is in the backlog. Hopefully we will have support for them by the end of this year as well.

async def download_example():
    """Download a file from python.org by clicking a link during a crawl.

    Creates ``~/.crawl4ai/downloads`` if it does not exist, crawls the
    Python downloads page with downloads enabled, and injects JavaScript
    that clicks the first anchor whose ``href`` ends in ``.exe``. Every
    path in ``result.downloaded_files`` is then printed together with its
    size in MB; if nothing was reported, a notice is printed instead.
    """
    # Files go under the user's home directory. To keep them next to the
    # script instead, use os.path.join(os.getcwd(), "downloads").
    downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
    os.makedirs(downloads_path, exist_ok=True)

    print(f"Downloads will be saved to: {downloads_path}")

    async with AsyncWebCrawler(
        accept_downloads=True,
        downloads_path=downloads_path,
        verbose=True
    ) as crawler:
        result = await crawler.arun(
            url="https://www.python.org/downloads/",
            js_code="""
            // Find and click the first Windows installer link
            const downloadLink = document.querySelector('a[href$=".exe"]');
            if (downloadLink) {
                console.log('Found download link:', downloadLink.href);
                downloadLink.click();
            } else {
                console.log('No .exe download link found');
            }
            """,
            # Wait 1 second so the download has time to start; increase this
            # value if downloads are being missed.
            delay_before_return_html=1,
            cache_mode=CacheMode.BYPASS
        )

        if result.downloaded_files:
            print("\nDownload successful!")
            print("Downloaded files:")
            for file_path in result.downloaded_files:
                print(f"- {file_path}")
                try:
                    size_mb = os.path.getsize(file_path) / (1024 * 1024)
                except OSError as exc:
                    # A reported path may be missing or unreadable on disk;
                    # note it and keep listing the remaining files.
                    print(f"  File size unavailable: {exc}")
                else:
                    print(f"  File size: {size_mb:.2f} MB")
        else:
            print("\nNo files were downloaded")

unclecode / crawl4ai

unstructured data download #283