lemon24 / reader

A Python feed reader library.
https://reader.readthedocs.io

Handle 429 Too Many Requests gracefully #307

Open lemon24 opened 1 year ago

lemon24 commented 1 year ago

https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429

Somewhat related to #246.

Implementation mechanism suggested in #332 overview.
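
For reference, the Retry-After header can carry either a delay in seconds or an HTTP-date, so both forms need handling:

Retry-After: 120
Retry-After: Wed, 21 Oct 2015 07:28:00 GMT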

lemon24 commented 3 months ago

Pseudocode for what a plugin would look like, to illustrate current API gaps.

Update: Changed to bump the interval based on retry-after.

from datetime import timedelta

import werkzeug.http

# update intervals, in minutes
INTERVALS = [60 * hours for hours in [1, 3, 6, 12, 24, 2 * 24, 7 * 24]]

def after_feed_update_hook(reader, url, response):
    # response does not exist as of reader 3.13;
    # it would be nice to unify feed and response somehow

    if response.status not in {429, 503}:
        return

    # everything below needs to catch FeedNotFoundError;
    # ...but would not, if we passed FeedUpdateIntent
    feed = reader.get_feed(url)

    if retry_after_str := response.headers.get('retry-after'):
        try:
            seconds = int(retry_after_str)
        except ValueError:
            retry_after = werkzeug.http.parse_date(retry_after_str)
        else:
            assert seconds >= 0
            retry_after = feed.last_retrieved + timedelta(seconds=seconds)
    else:
        retry_after = None

    if retry_after and retry_after > feed.update_after:
        reader._storage.set_feed_update_after(feed, retry_after)

    if response.status != 429:
        return

    full_config = get_config(reader, feed, resolve=True)
    old_interval = full_config['interval']
    if retry_after:
        retry_after_interval = (retry_after - feed.last_retrieved).total_seconds() // 60
    else:
        retry_after_interval = 0

    # pick the smallest interval at least as long as both
    # the server-requested delay and the current interval
    for new_interval in INTERVALS:
        if retry_after_interval > new_interval:
            continue
        if old_interval > new_interval:
            continue
        break

    if old_interval == new_interval:
        return

    feed_config = get_config(reader, feed)
    feed_config['interval'] = new_interval
    set_config(reader, feed, feed_config)

    if not retry_after:
        # set_config does not refresh update_after, so we have to do it;
        # if it did so automatically, this step would go away
        full_config['interval'] = new_interval
        update_after = next_update_after(feed.last_retrieved, **full_config)
        reader._storage.set_feed_update_after(feed, update_after)
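
For completeness, here's how the plugin would be wired up; this assumes the usual init_reader plugin entry point, and that the existing after_feed_update_hooks grow the response argument discussed above:

def init_reader(reader):
    # hypothetical registration; the hook itself is defined above
    reader.after_feed_update_hooks.append(after_feed_update_hook)
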
lemon24 commented 3 months ago

API changes:

lemon24 commented 2 months ago

Idea: We could also bump the update interval for servers that don't send caching headers (ETag / Last-Modified), or that don't honor conditional requests made with them. (original discussion)
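
A minimal sketch of the check (has_caching_headers is a hypothetical helper; assumes the hook gets a case-insensitive headers mapping, as in the pseudocode above):

def has_caching_headers(headers):
    # ETag / Last-Modified mean the server supports conditional requests
    return 'etag' in headers or 'last-modified' in headers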

lemon24 commented 2 months ago

So, per the previous comments, the parser API will need to change.

Here's what it looks like as of 3.14, stripped to illustrate the data flow:

class Parser:

    def parallel(
        feeds: Iterable[FeedArgument], ... 
    ) -> Iterable[tuple[FeedArgument, ParsedFeed | None | ParseError]]:
        """Retrieve and parse many feeds, possibly in parallel."""

        def retrieve(
            feed: FeedArgument,
        ) -> tuple[FeedArgument, ContextManager[RetrieveResult[T] | None] | Exception]:
            """Single-argument retrieve() wrapper used with map()."""

    def __call__(url, http_etag, http_last_modified) -> ParsedFeed | None:
        """Convenience wrapper over parallel()."""

    def retrieve(
        url, http_etag, http_last_modified, ... 
    ) -> ContextManager[RetrieveResult[T] | None]:

    def parse(url, result: RetrieveResult[T]) -> ParsedFeed: 

def retriever_type( 
    url, http_etag, http_last_modified, http_accept
) -> ContextManager[RetrieveResult[T] | None]: 

def parser_type(
    url, resource: T, headers 
) -> tuple[FeedData, Collection[EntryData]]:

class FeedArgument:
    url: str
    http_etag: str
    http_last_modified: str

class RetrieveResult:
    resource: T
    mime_type: str
    http_etag: str
    http_last_modified: str
    headers: dict[str, str] 

class ParsedFeed:
    feed: FeedData
    entries: Iterable[EntryData]
    http_etag: str 
    http_last_modified: str 
    mime_type: str 
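
To make the data flow concrete, a single-feed update goes roughly like this (illustrative only, using the names from the sketch above):

parser = Parser()
with parser.retrieve(url, http_etag, http_last_modified) as result:
    if result is None:
        pass  # not modified, nothing to do
    else:
        parsed = parser.parse(url, result)
        # parsed.http_etag / parsed.http_last_modified get stored,
        # and passed back in on the next update
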
lemon24 commented 1 month ago

Here's what an ~ideal parser API would look like:

class Parser:

    def parallel(
        feeds: Iterable[FeedArgument], ... 
    ) -> Iterable[ParseResult]:
        """Retrieve and parse many feeds, possibly in parallel."""

    def __call__(url, caching_info: JSON) -> ParsedFeed | None:
        """Convenience wrapper over parallel()."""

    # Keeping retrieve() and parse() as convenience wrappers.

    def retrieve_fn(feed: FeedArgument, ...) -> RetrieveResult[T]:
        # For use with map(): single argument, does not raise.
        # Unhandled exceptions get wrapped in ParseError (as they do now).
        # As now, it enters the retriever context early to catch errors.

    def parse_fn(result: RetrieveResult[T]) -> ParseResult:
        # For use with map(): single argument, does not raise.
        # Unhandled exceptions get wrapped in ParseError (as they do now).
        # Pass-through exceptions, status, and headers from retrieve_fn().

class FeedArgument(Protocol):
    url: str
    # 'etag' and 'last-modified' used as well-known keys
    caching_info: JSON
    # (optional move) from FeedForUpdate, 
    # may remove the need for decider.process_feed_for_update
    stale: bool

def retriever_type( 
    feed: FeedArgument, accept
) -> ContextManager[RetrievedFeed[T] | T]:
    # FeedArgument replaces multiple arguments for extensibility.
    # Return types other than RetrievedFeed[T] are for convenience.
    # Can pass additional info by raising RetrieveError | NotModified;
    # this is reverting to an older retriever API, because 
    # "exceptions are for exceptional cases", see RetrieveResult.value.

class RetrieveResult:
    feed: FeedArgument

    # After multiple attempts, this seems like the right place
    # for the context manager. Worse alternatives:
    #
    # * `Parser.retrieve_fn() -> ContextManager[RetrieveResult[T]]`
    #   means parallel() can't access .feed before entering the context, 
    #   which means unexpected errors do not have access to a .feed
    #
    # * `RetrievedFeed.resource: ContextManager[T]` and
    #   `retriever_type() -> RetrieveResult | RetrievedFeed | None`
    #   means retrievers that use context managers (e.g. the session)
    #   have to conditionally exit that context for non-200 responses
    #
    value: ContextManager[RetrievedFeed[T]] | ParseError

    # propagated from either RetrievedFeed or RetrieveError
    status: int
    headers: dict[str, str] 

class RetrievedFeed:
    """Formerly known as RetrieveResult."""
    resource: T
    mime_type: str
    caching_info: JSON
    status: int
    headers: dict[str, str] 
    # (optional move) from RetrieverType, a bit more flexible
    slow_to_read: bool

class RetrieveError(ParseError):
    status: int
    headers: dict[str, str]

class NotModified(RetrieveError):
    pass

def parser_type(
    url, resource: T, headers 
) -> tuple[FeedData, Collection[EntryData]]:

class ParseResult:
    feed: FeedArgument
    value: ParsedFeed | None | ParseError
    # from RetrieveResult
    status: int
    headers: dict[str, str]

class ParsedFeed:
    feed: FeedData
    entries: Collection[EntryData]
    # from RetrievedFeed
    mime_type: str
    caching_info: JSON
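
For illustration, a retriever under the proposed API might look like this. A sketch only: the requests session is assumed, conditional request headers are omitted, and the RetrieveError / NotModified constructor signatures are guesses.

from contextlib import contextmanager

@contextmanager
def http_retriever(feed, accept):
    response = session.get(feed.url, headers={'Accept': accept}, stream=True)
    headers = dict(response.headers)
    if response.status_code == 304:
        # pass status and headers up through the exception, per RetrieveError
        raise NotModified(feed.url, status=304, headers=headers)
    if response.status_code != 200:
        raise RetrieveError(feed.url, status=response.status_code, headers=headers)
    with response:
        yield RetrievedFeed(
            resource=response.raw,
            mime_type=response.headers.get('content-type'),
            caching_info={'etag': response.headers.get('etag')},
            status=response.status_code,
            headers=headers,
        )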