omnivore-app / omnivore

Omnivore is a complete, open source read-it-later solution for people who like reading.
https://omnivore.app
GNU Affero General Public License v3.0
12.84k stars 648 forks source link

Search API endpoint does not correctly search for URLs #4116

Open AndreasThinks opened 4 months ago

AndreasThinks commented 4 months ago

Hey folks, I've been trying to do some searching by url (to check for duplicate articles) and it never seems to return anything. As you can see below, I have tried a lot, and just can't get URL based search to work!

My code is below. Searching by title absolutely works, but URLs just never seem to return anything. What's going on?

import asyncio
import os
from typing import Any, Dict, List, Optional
from urllib.parse import quote, quote_plus, urlparse, urlunparse

import httpx
from dotenv import load_dotenv

load_dotenv()

API_URL = "https://api-prod.omnivore.app/api/graphql"
API_KEY = os.getenv("OMNIVORE_API_KEY")

async def query_omnivore(query: str, variables: Dict[str, Any] = None) -> Dict[str, Any]:
    headers = {
        "Content-Type": "application/json",
        "Authorization": API_KEY
    }
    payload = {
        "query": query,
        "variables": variables or {}
    }
    async with httpx.AsyncClient() as client:
        response = await client.post(API_URL, headers=headers, json=payload)
        response.raise_for_status()
        return response.json()

async def search_with_strategy(strategy: str, search_term: str) -> List[Dict[str, Any]]:
    query = """
    query Search($after: String, $first: Int, $query: String) {
        search(after: $after, first: $first, query: $query) {
            ... on SearchSuccess {
                edges {
                    node {
                        id
                        url
                        title
                        createdAt
                    }
                }
            }
            ... on SearchError {
                errorCodes
            }
        }
    }
    """
    variables = {
        "query": f"{strategy}:{search_term}",
        "first": 10
    }
    result = await query_omnivore(query, variables)
    if "data" in result and "search" in result["data"] and "edges" in result["data"]["search"]:
        return [edge["node"] for edge in result["data"]["search"]["edges"]]
    return []

async def get_article_by_id(article_id: str) -> Dict[str, Any]:
    query = """
    query Article($id: ID!) {
        article(id: $id) {
            ... on ArticleSuccess {
                article {
                    id
                    title
                    url
                    originalArticleUrl
                    createdAt
                    savedAt
                    publishedAt
                    content
                }
            }
            ... on ArticleError {
                errorCodes
            }
        }
    }
    """
    variables = {"id": article_id}
    result = await query_omnivore(query, variables)
    if "data" in result and "article" in result["data"] and "article" in result["data"]["article"]:
        return result["data"]["article"]["article"]
    return {}

def generate_partial_url_matches(url: str) -> List[str]:
    parsed = urlparse(url)
    path_parts = parsed.path.split('/')

    variations = [
        url,
        url.rstrip('/'),
        parsed.netloc + parsed.path,
        parsed.path.lstrip('/'),
        '/'.join(path_parts[-2:]) if len(path_parts) > 1 else path_parts[-1],
        path_parts[-1],
        '-'.join(path_parts[-1].split('-')[:3]) if '-' in path_parts[-1] else path_parts[-1],
        parsed.netloc.split('.')[-2] if len(parsed.netloc.split('.')) > 1 else parsed.netloc,
    ]

    # Add variations with different numbers of path segments
    for i in range(1, len(path_parts) + 1):
        variations.append('/'.join(path_parts[-i:]))

    # Add variations with domain and different numbers of path segments
    for i in range(1, len(path_parts) + 1):
        variations.append(parsed.netloc + '/' + '/'.join(path_parts[-i:]))

    return list(set(variations))  # Remove duplicates

async def debug_partial_url_search(url: str, title: str):
    print(f"Debugging partial URL search for: {url}")
    print(f"Article title: {title}")

    partial_matches = generate_partial_url_matches(url)

    for partial in partial_matches:
        print(f"\nTrying partial match: {partial}")

        # Try exact match
        results = await search_with_strategy("url", f'"{partial}"')
        if results:
            print(f"Found results for exact partial match: {partial}")
            for result in results:
                print(f"  ID: {result['id']}, Title: {result['title']}, URL: {result['url']}")
        else:
            print("No results found for exact partial match")

        # Try contains match
        results = await search_with_strategy("url", f'*{partial}*')
        if results:
            print(f"Found results for contains partial match: {partial}")
            for result in results:
                print(f"  ID: {result['id']}, Title: {result['title']}, URL: {result['url']}")
        else:
            print("No results found for contains partial match")

        # Try starts with match
        results = await search_with_strategy("url", f'{partial}*')
        if results:
            print(f"Found results for starts with partial match: {partial}")
            for result in results:
                print(f"  ID: {result['id']}, Title: {result['title']}, URL: {result['url']}")
        else:
            print("No results found for starts with partial match")

        # Try ends with match
        results = await search_with_strategy("url", f'*{partial}')
        if results:
            print(f"Found results for ends with partial match: {partial}")
            for result in results:
                print(f"  ID: {result['id']}, Title: {result['title']}, URL: {result['url']}")
        else:
            print("No results found for ends with partial match")

    # Try title search as a fallback
    print("\nTrying title search as fallback")
    results = await search_with_strategy("title", f'"{title}"')
    if results:
        print("Found articles by title search:")
        for result in results:
            print(f"  ID: {result['id']}, Title: {result['title']}, URL: {result['url']}")

            # Fetch full article details
            article_details = await get_article_by_id(result['id'])
            if article_details:
                print("  Article details:")
                print(f"    Stored URL: {article_details.get('url')}")
                print(f"    Original URL: {article_details.get('originalArticleUrl')}")
                print(f"    Created At: {article_details.get('createdAt')}")
                print(f"    Saved At: {article_details.get('savedAt')}")
                print(f"    Published At: {article_details.get('publishedAt')}")
                content = article_details.get('content', '')[:100]
                print(f"    Content preview: {content}...")
    else:
        print("No articles found by title search")

async def main():
    url_to_debug = "https://www.nature.com/articles/d41586-024-02012-5"
    title_to_debug = "Not all"
    await debug_partial_url_search(url_to_debug, title_to_debug)

if __name__ == "__main__":
    asyncio.run(main())
jacksonh commented 3 months ago

We don't have a matcher for full URL, just site. So one would have to be added here: https://github.com/omnivore-app/omnivore/blob/ca7ceeba7ea3355e640e4013957bf956afb0bc9a/packages/api/src/services/library_item.ts#L1447

AndreasThinks commented 3 months ago

Ah, thanks. Have ambitiously tried to fix this, and have a PR I'm working on in #4201