This affects Python code most prominently, since wrong indentation produces syntax errors when the file is parsed.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from collections import deque
def crawl_website(start_url, max_depth=2):
    """Crawl a website breadth-first and collect all URLs up to a given depth.

    Args:
        start_url: The starting URL of the website to crawl.
        max_depth: The maximum link depth to follow from ``start_url``
            (0 crawls only the start page itself).

    Returns:
        A set of all URLs successfully visited during the crawl.
    """
    visited_urls = set()
    # Track each URL together with its own depth. The previous version
    # incremented a single counter once per dequeued URL, which stopped
    # the crawl after max_depth + 1 pages rather than after max_depth
    # levels of links.
    urls_to_visit = deque([(start_url, 0)])
    while urls_to_visit:
        url, depth = urls_to_visit.popleft()
        if url in visited_urls or depth > max_depth:
            continue
        visited_urls.add(url)
        print(f"Crawling: {url}")
        try:
            # A timeout keeps the crawl from hanging forever on an
            # unresponsive host (requests has no default timeout).
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error crawling {url}: {e}")
            continue
        soup = BeautifulSoup(response.content, 'html.parser')
        for link in soup.find_all('a', href=True):
            # Resolve relative hrefs against the page they appeared on.
            absolute_url = urljoin(url, link['href'])
            if absolute_url not in visited_urls:
                urls_to_visit.append((absolute_url, depth + 1))
    return visited_urls
# Example usage:
start_url = "https://www.example.com"  # Replace with your website
found_urls = crawl_website(start_url)
print("\nAll URLs found:")
# Emit one URL per line (same output as printing each in a loop).
print("\n".join(found_urls))
What version are you using?
This affects Python code most prominently, since wrong indentation produces syntax errors when the file is parsed.
What happened?
-
Steps to reproduce
Supporting info to reproduce
No response
Relevant log output
No response