souzatharsis closed this issue 1 month ago
I will try this to replace `website_extractor.py`:
"""
Website Extractor Module
This module is responsible for extracting clean text content from websites using
BeautifulSoup for local HTML parsing instead of the Jina AI API.
"""
import requests
import re
import html
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from podcastfy.utils.config import load_config
class WebsiteExtractor:
def __init__(self):
"""
Initialize the WebsiteExtractor.
"""
self.config = load_config()
self.website_extractor_config = self.config.get('website_extractor')
def extract_content(self, url: str) -> str:
"""
Extract clean text content from a website using BeautifulSoup.
Args:
url (str): Website URL.
Returns:
str: Extracted clean text content.
Raises:
Exception: If there's an error in extracting the content.
"""
try:
# Normalize the URL
normalized_url = self.normalize_url(url)
# Request the webpage
response = requests.get(normalized_url)
response.raise_for_status() # Raise an exception for bad status codes
# Parse the page content with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
# Extract and clean the text content
raw_text = soup.get_text(separator="\n") # Get all text content
cleaned_content = self.clean_markdown(raw_text)
return cleaned_content
except requests.RequestException as e:
raise Exception(f"Failed to extract content from {url}: {str(e)}")
except Exception as e:
raise Exception(f"An unexpected error occurred while extracting content from {url}: {str(e)}")
def normalize_url(self, url: str) -> str:
"""
Normalize the given URL by adding scheme if missing and ensuring it's a valid URL.
Args:
url (str): The URL to normalize.
Returns:
str: The normalized URL.
Raises:
ValueError: If the URL is invalid after normalization attempts.
"""
# If the URL doesn't start with a scheme, add 'https://'
if not url.startswith(('http://', 'https://')):
url = 'https://' + url
# Parse the URL
parsed = urlparse(url)
# Ensure the URL has a valid scheme and netloc
if not all([parsed.scheme, parsed.netloc]):
raise ValueError(f"Invalid URL: {url}")
return parsed.geturl()
def clean_markdown(self, markdown_content: str) -> str:
"""
Remove images, special markdown tags, URIs, and leftover brackets from the content.
Also remove specific headers and their content.
Args:
markdown_content (str): The markdown content to clean.
Returns:
str: Cleaned text content.
"""
# Decode HTML entities
cleaned_content = html.unescape(markdown_content)
# Remove image markdown
image_pattern = r'!\[.*?\]\(.*?\)'
cleaned_content = re.sub(image_pattern, '', cleaned_content)
# Remove inline links and URIs
link_pattern = r'\[([^\]]+)\]\([^\)]+\)'
cleaned_content = re.sub(link_pattern, r'\1', cleaned_content)
uri_pattern = r'https?://\S+|www\.\S+'
cleaned_content = re.sub(uri_pattern, '', cleaned_content)
# Remove special markdown tags (e.g., bold, italic, code)
special_tags_pattern = r'(\*{1,2}|_{1,2}|`)'
cleaned_content = re.sub(special_tags_pattern, '', cleaned_content)
# Remove any remaining markdown headers
header_pattern = r'^#+\s'
cleaned_content = re.sub(header_pattern, '', cleaned_content, flags=re.MULTILINE)
# Remove horizontal rules
hr_pattern = r'^\s*[-*_]{3,}\s*$'
cleaned_content = re.sub(hr_pattern, '', cleaned_content, flags=re.MULTILINE)
# Remove blockquotes
blockquote_pattern = r'^>\s'
cleaned_content = re.sub(blockquote_pattern, '', cleaned_content, flags=re.MULTILINE)
# Remove extra newlines
cleaned_content = re.sub(r'\n{3,}', '\n\n', cleaned_content)
# Remove leftover brackets from images and links
leftover_brackets_pattern = r'\[|\]|\(|\)'
cleaned_content = re.sub(leftover_brackets_pattern, '', cleaned_content)
# Remove specific headers and their content
title_pattern = r'^Title:.*\n'
url_source_pattern = r'^URL Source:.*\n'
markdown_content_pattern = r'^Markdown Content:\n'
warning_pattern = r'^Warning:.*\n'
cleaned_content = re.sub(title_pattern, '', cleaned_content, flags=re.MULTILINE)
cleaned_content = re.sub(url_source_pattern, '', cleaned_content, flags=re.MULTILINE)
cleaned_content = re.sub(markdown_content_pattern, '', cleaned_content, flags=re.MULTILINE)
cleaned_content = re.sub(warning_pattern, '', cleaned_content, flags=re.MULTILINE)
# Apply markdown cleaning patterns from config
for pattern in self.website_extractor_config['markdown_cleaning']['remove_patterns']:
cleaned_content = re.sub(pattern, '', cleaned_content)
return cleaned_content.strip()
def main(seed: int = 42) -> None:
"""
Main function to test the WebsiteExtractor class.
"""
# Load configuration
config = load_config()
# Create an instance of WebsiteExtractor
extractor = WebsiteExtractor()
# Test URL
test_url = "www.souzatharsis.com"
try:
# Extract content from the test URL
content = extractor.extract_content(test_url)
# Print the first 500 characters of the extracted content
print(f"Extracted content (first 500 characters):\n{content[:500]}...")
# Print the total length of the extracted content
print(f"Total length of extracted content: {len(content)} characters")
except Exception as e:
print(f"An error occurred: {str(e)}")
if __name__ == "__main__":
main()
Maybe we can also implement an `npx playwright test` suite to complement the scraping module? I can provide that code as well, though my main use case is feeding podcastfy from PDF, md, and docx files, like NotebookLM.
I have some sitemaps for news websites; it's easy to create and export the selectors and the overall sitemap with webscraper.io or the browser inspector, and turn them into a TypeScript test to run in Playwright. A rough sketch of the idea is below.
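Playwright also ships a Python API, so such a test could stay in the same language as this package rather than TypeScript. What follows is only a minimal, untested sketch: the URL is a placeholder, the import path for `WebsiteExtractor` is an assumption that depends on where the module ends up in the repo, and it assumes `pytest` plus `playwright` (with its browsers installed) are available.

```python
# Sketch of a Playwright smoke test comparing the static requests/BeautifulSoup
# extraction against what a real browser renders (catches JS-heavy pages).
from playwright.sync_api import sync_playwright

# Assumed import path; adjust to wherever website_extractor.py lives in the repo.
from podcastfy.content_parser.website_extractor import WebsiteExtractor


def test_static_extraction_overlaps_rendered_page():
    url = "https://www.example.com"  # placeholder; swap in one of the news sites

    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        page.goto(url)
        rendered_text = page.inner_text("body")  # text after JavaScript has run
        browser.close()

    extracted = WebsiteExtractor().extract_content(url)

    # Loose sanity checks: extraction is non-empty and at least some of its
    # lines also appear in the browser-rendered text.
    assert extracted.strip(), "extractor returned empty content"
    assert any(
        line.strip() and line.strip() in rendered_text
        for line in extracted.splitlines()[:50]
    )
```

The same structure would carry over to a TypeScript spec run via `npx playwright test` if the webscraper.io selectors are exported that way; keeping it in Python just avoids adding a Node toolchain to the project for tests alone.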
@240db This is the only issue left before we can enable usage with no API key! Are you interested in pushing a PR?
We went from requiring 3 API keys down to 1 (jina.ai), given what we have since delivered:
Done! Implemented it based on your original draft. Many thanks, @240db!
There is no reason to stick with Jina when BeautifulSoup can do the job of parsing websites to the extent needed in this package.