Pranshu172 opened this issue 2 weeks ago
Hi, thank you for using the library. For the benefit of others who may have similar questions, I'll provide a detailed answer that can be used for future reference.
1. Add Delays Between Requests:
```python
from crawl4ai import AsyncWebCrawler
import asyncio

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        urls = ["https://example1.com", "https://example2.com"]

        # Method 1: Built-in delay
        results = await crawler.arun_many(
            urls,
            delay_between_requests=2.0,  # Add 2 second delay between requests
            # ...plus any other arun_many options you need
        )

        # Method 2: Custom throttling
        semaphore = asyncio.Semaphore(3)  # Limit to 3 concurrent requests

        async def crawl_with_throttle(url):
            async with semaphore:
                result = await crawler.arun(url)
                await asyncio.sleep(1)  # Add delay after each request
                return result

        tasks = [crawl_with_throttle(url) for url in urls]
        results = await asyncio.gather(*tasks)

if __name__ == "__main__":
    asyncio.run(main())
```
2. Add Retry Logic with Exponential Backoff:
```python
from crawl4ai import AsyncWebCrawler
import asyncio
import random

async def crawl_with_retry(crawler, url, max_retries=3):
    for attempt in range(max_retries):
        try:
            result = await crawler.arun(url)
            # Check content for rate limit messages
            if "too many requests" in result.markdown.lower():
                delay = (2 ** attempt) + random.uniform(0, 1)  # Exponential backoff
                print(f"Rate limited, waiting {delay:.2f}s before retry")
                await asyncio.sleep(delay)
                continue
            return result
        except Exception as e:
            if attempt == max_retries - 1:
                raise
            delay = (2 ** attempt) + random.uniform(0, 1)
            await asyncio.sleep(delay)
    return None

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        urls = ["https://example1.com", "https://example2.com"]
        results = await asyncio.gather(*[
            crawl_with_retry(crawler, url) for url in urls
        ])
```
3. Use Proxy Rotation:
```python
from crawl4ai import AsyncWebCrawler
import random

PROXY_LIST = ["http://proxy1:8080", "http://proxy2:8080", "http://proxy3:8080"]

async def main():
    urls = ["https://example1.com", "https://example2.com"]
    async with AsyncWebCrawler(
        verbose=True,
        proxy=random.choice(PROXY_LIST)  # Rotate proxies
    ) as crawler:
        results = await crawler.arun_many(urls)
```
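The snippet above picks one proxy for the whole crawler session. If you want a fresh proxy for each batch of URLs, a minimal sketch of my own (a wrapper, not a built-in crawl4ai feature; `crawl_in_batches` and the batch size are illustrative) looks like this:
```python
from crawl4ai import AsyncWebCrawler
import asyncio
import random

PROXY_LIST = ["http://proxy1:8080", "http://proxy2:8080", "http://proxy3:8080"]

async def crawl_in_batches(urls, batch_size=5):
    # Hypothetical helper: open a new crawler (and random proxy) per batch
    results = []
    for i in range(0, len(urls), batch_size):
        batch = urls[i:i + batch_size]
        async with AsyncWebCrawler(
            verbose=True,
            proxy=random.choice(PROXY_LIST)  # New random proxy for this batch
        ) as crawler:
            results.extend(await crawler.arun_many(batch))
        await asyncio.sleep(2)  # Small pause between batches
    return results
```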
## Regarding the status code issue:
- Many sites return 200 with rate limit messages in content instead of 429
- You can modify the crawler to check content for rate limit indicators:
There are two ways to handle this. First, rely on the result's `success` flag, which indicates true crawl success beyond the HTTP status code: it will be `False` if there are JavaScript errors, empty content, error messages, anti-bot notices, or unexpected page structures, even with a 200 response. Always check both `success` and `error_message` for accurate crawl validation.
```python
# Enhanced rate limit and success checking
async def check_crawl_success(result):
    # Check both rate limits and general success
    # (check_rate_limit is defined in the next snippet)
    is_rate_limited = await check_rate_limit(result)
    is_successful = result.success and result.markdown.strip() != ""
    if is_rate_limited:
        return False, "Rate limited"
    if not is_successful:
        return False, "Crawl failed"
    return True, "Success"

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        urls = ["https://example1.com", "https://example2.com"]
        results = await crawler.arun_many(urls)

        # Filter successful results
        successful_results = []
        for result in results:
            success, message = await check_crawl_success(result)
            if success:
                successful_results.append(result)
            else:
                print(f"Failed to crawl {result.url}: {message}")

        # Continue processing only successful results
        for result in successful_results:
            # Process your successful crawls
            pass
```
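As a quick standalone check, here is a minimal sketch that only uses the `success`, `error_message`, and `markdown` fields mentioned above to report why a single crawl failed:
```python
from crawl4ai import AsyncWebCrawler
import asyncio

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun("https://example1.com")
        if not result.success:
            # error_message explains the failure even when the HTTP status was 200
            print(f"Crawl failed: {result.error_message}")
        elif not result.markdown.strip():
            print("Crawl returned empty content")
        else:
            print("Crawl succeeded")

if __name__ == "__main__":
    asyncio.run(main())
```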
Second, you can manually check the content for special messages to detect rate limits or similar issues:
```python
from crawl4ai import AsyncWebCrawler

async def check_rate_limit(result):
    rate_limit_indicators = [
        "too many requests",
        "rate limit exceeded",
        "please try again later",
        "access temporarily limited"
    ]
    if any(indicator in result.markdown.lower() for indicator in rate_limit_indicators):
        return True
    return False

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun("https://example1.com")
        if await check_rate_limit(result):
            print("Rate limited despite 200 status code")
            # Handle accordingly
```
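To "handle accordingly", one option is to combine the helpers above: retry with exponential backoff whenever `check_rate_limit` fires. This is just a sketch; `crawl_until_not_limited` is an illustrative name, not part of the library:
```python
import asyncio
import random

async def crawl_until_not_limited(crawler, url, max_retries=3):
    # Sketch: retry with exponential backoff while check_rate_limit() detects
    # a rate-limit message in an otherwise "successful" 200 response
    for attempt in range(max_retries):
        result = await crawler.arun(url)
        if not await check_rate_limit(result):
            return result
        delay = (2 ** attempt) + random.uniform(0, 1)
        print(f"Rate limited, retrying in {delay:.2f}s")
        await asyncio.sleep(delay)
    return None
```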
I'm currently testing a new Scraper module for Crawl4AI. It uses graph search algorithms to intelligently crawl websites, handling everything from simple blog posts to complex nested pages. Whether you need to crawl an entire documentation site or extract data from multiple product pages, the Scraper will handle all the heavy lifting - navigation, content extraction, and optimization. Stay tuned for its release! In the meantime, the rate limiting solutions above should help with your current crawling needs.
Thanks for the detailed explanation, I will try these!
You're welcome.
While extracting multiple links, I encountered a situation where some of them returned a "Too Many Requests" message, but the status code was still 200.
Thanks!