souzatharsis closed this issue 1 month ago
I will try this to replace `website_extractor.py`:
"""
Website Extractor Module
This module is responsible for extracting clean text content from websites using
BeautifulSoup for local HTML parsing instead of the Jina AI API.
"""
import requests
import re
import html
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from podcastfy.utils.config import load_config
class WebsiteExtractor:
def __init__(self):
"""
Initialize the WebsiteExtractor.
"""
self.config = load_config()
self.website_extractor_config = self.config.get('website_extractor')
def extract_content(self, url: str) -> str:
"""
Extract clean text content from a website using BeautifulSoup.
Args:
url (str): Website URL.
Returns:
str: Extracted clean text content.
Raises:
Exception: If there's an error in extracting the content.
"""
try:
# Normalize the URL
normalized_url = self.normalize_url(url)
# Request the webpage
response = requests.get(normalized_url)
response.raise_for_status() # Raise an exception for bad status codes
# Parse the page content with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
# Extract and clean the text content
raw_text = soup.get_text(separator="\n") # Get all text content
cleaned_content = self.clean_markdown(raw_text)
return cleaned_content
except requests.RequestException as e:
raise Exception(f"Failed to extract content from {url}: {str(e)}")
except Exception as e:
raise Exception(f"An unexpected error occurred while extracting content from {url}: {str(e)}")
def normalize_url(self, url: str) -> str:
"""
Normalize the given URL by adding scheme if missing and ensuring it's a valid URL.
Args:
url (str): The URL to normalize.
Returns:
str: The normalized URL.
Raises:
ValueError: If the URL is invalid after normalization attempts.
"""
# If the URL doesn't start with a scheme, add 'https://'
if not url.startswith(('http://', 'https://')):
url = 'https://' + url
# Parse the URL
parsed = urlparse(url)
# Ensure the URL has a valid scheme and netloc
if not all([parsed.scheme, parsed.netloc]):
raise ValueError(f"Invalid URL: {url}")
return parsed.geturl()
def clean_markdown(self, markdown_content: str) -> str:
"""
Remove images, special markdown tags, URIs, and leftover brackets from the content.
Also remove specific headers and their content.
Args:
markdown_content (str): The markdown content to clean.
Returns:
str: Cleaned text content.
"""
# Decode HTML entities
cleaned_content = html.unescape(markdown_content)
# Remove image markdown
image_pattern = r'!\[.*?\]\(.*?\)'
cleaned_content = re.sub(image_pattern, '', cleaned_content)
# Remove inline links and URIs
link_pattern = r'\[([^\]]+)\]\([^\)]+\)'
cleaned_content = re.sub(link_pattern, r'\1', cleaned_content)
uri_pattern = r'https?://\S+|www\.\S+'
cleaned_content = re.sub(uri_pattern, '', cleaned_content)
# Remove special markdown tags (e.g., bold, italic, code)
special_tags_pattern = r'(\*{1,2}|_{1,2}|`)'
cleaned_content = re.sub(special_tags_pattern, '', cleaned_content)
# Remove any remaining markdown headers
header_pattern = r'^#+\s'
cleaned_content = re.sub(header_pattern, '', cleaned_content, flags=re.MULTILINE)
# Remove horizontal rules
hr_pattern = r'^\s*[-*_]{3,}\s*$'
cleaned_content = re.sub(hr_pattern, '', cleaned_content, flags=re.MULTILINE)
# Remove blockquotes
blockquote_pattern = r'^>\s'
cleaned_content = re.sub(blockquote_pattern, '', cleaned_content, flags=re.MULTILINE)
# Remove extra newlines
cleaned_content = re.sub(r'\n{3,}', '\n\n', cleaned_content)
# Remove leftover brackets from images and links
leftover_brackets_pattern = r'\[|\]|\(|\)'
cleaned_content = re.sub(leftover_brackets_pattern, '', cleaned_content)
# Remove specific headers and their content
title_pattern = r'^Title:.*\n'
url_source_pattern = r'^URL Source:.*\n'
markdown_content_pattern = r'^Markdown Content:\n'
warning_pattern = r'^Warning:.*\n'
cleaned_content = re.sub(title_pattern, '', cleaned_content, flags=re.MULTILINE)
cleaned_content = re.sub(url_source_pattern, '', cleaned_content, flags=re.MULTILINE)
cleaned_content = re.sub(markdown_content_pattern, '', cleaned_content, flags=re.MULTILINE)
cleaned_content = re.sub(warning_pattern, '', cleaned_content, flags=re.MULTILINE)
# Apply markdown cleaning patterns from config
for pattern in self.website_extractor_config['markdown_cleaning']['remove_patterns']:
cleaned_content = re.sub(pattern, '', cleaned_content)
return cleaned_content.strip()
def main(seed: int = 42) -> None:
"""
Main function to test the WebsiteExtractor class.
"""
# Load configuration
config = load_config()
# Create an instance of WebsiteExtractor
extractor = WebsiteExtractor()
# Test URL
test_url = "www.souzatharsis.com"
try:
# Extract content from the test URL
content = extractor.extract_content(test_url)
# Print the first 500 characters of the extracted content
print(f"Extracted content (first 500 characters):\n{content[:500]}...")
# Print the total length of the extracted content
print(f"Total length of extracted content: {len(content)} characters")
except Exception as e:
print(f"An error occurred: {str(e)}")
if __name__ == "__main__":
main()
Maybe we can also implement an `npx playwright test` suite to complement the scraping module? I can provide that code as well, though my main use case is feeding podcastfy from PDF, md, and docx files, like NotebookLM.
I have some sitemaps for news websites; it's easy to create and export the selectors and the overall sitemap with webscraper.io or the browser inspector, and turn them into a TypeScript test to run in Playwright. A rough sketch of the idea is below.
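Playwright also ships a Python API, so such a test could stay in the same language as this package rather than TypeScript. What follows is only a minimal, untested sketch: the URL is a placeholder, the import path for `WebsiteExtractor` is an assumption that depends on where the module ends up in the repo, and it assumes `pytest` plus `playwright` (with its browsers installed) are available.

```python
# Sketch of a Playwright smoke test comparing the static requests/BeautifulSoup
# extraction against what a real browser renders (catches JS-heavy pages).
from playwright.sync_api import sync_playwright

# Assumed import path; adjust to wherever website_extractor.py lives in the repo.
from podcastfy.content_parser.website_extractor import WebsiteExtractor


def test_static_extraction_overlaps_rendered_page():
    url = "https://www.example.com"  # placeholder; swap in one of the news sites

    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        page.goto(url)
        rendered_text = page.inner_text("body")  # text after JavaScript has run
        browser.close()

    extracted = WebsiteExtractor().extract_content(url)

    # Loose sanity checks: extraction is non-empty and at least some of its
    # lines also appear in the browser-rendered text.
    assert extracted.strip(), "extractor returned empty content"
    assert any(
        line.strip() and line.strip() in rendered_text
        for line in extracted.splitlines()[:50]
    )
```

The same structure would carry over to a TypeScript spec run via `npx playwright test` if the webscraper.io selectors are exported that way; keeping it in Python just avoids adding a Node toolchain to the project for tests alone.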
@240db This is the only issue left before we can enable usage with no API key! Are you interested in pushing a PR?
We went from requiring 3 API keys down to 1 (jina.ai), given what we have since delivered:
Done! Implemented it based on your original draft. Many thanks, @240db!
There is no reason to stick with Jina when BeautifulSoup can do the job of parsing websites to the extent needed in this package.