Pseudocode for what a plugin would look like, to illustrate current API gaps.
Update: Changed to bump the interval based on retry-after.
```python
from datetime import timedelta

import werkzeug.http

# Candidate update intervals, in minutes (1 hour to 1 week).
INTERVALS = [60 * hours for hours in [1, 3, 6, 12, 24, 2 * 24, 7 * 24]]


def after_feed_update_hook(reader, url, response):
    # response does not exist as of reader 3.13;
    # it would be nice to unify feed and response somehow
    if response.status not in {429, 503}:
        return

    # everything below needs to catch FeedNotFoundError;
    # ...but would not, if we passed FeedUpdateIntent
    feed = reader.get_feed(url)

    # Retry-After is either delta-seconds or an HTTP-date.
    if retry_after_str := response.headers.get('retry-after'):
        try:
            seconds = int(retry_after_str)
        except ValueError:
            retry_after = werkzeug.http.parse_date(retry_after_str)
        else:
            assert seconds >= 0
            retry_after = feed.last_retrieved + timedelta(seconds=seconds)
    else:
        retry_after = None

    if retry_after and retry_after > feed.update_after:
        reader._storage.set_feed_update_after(feed, retry_after)

    if response.status != 429:
        return

    # Bump the interval to the smallest one that accommodates
    # both the current interval and Retry-After.
    # get_config/set_config: the plugin's own config helpers (not shown).
    full_config = get_config(reader, feed, resolve=True)
    old_interval = full_config['interval']
    if retry_after:
        retry_after_interval = (retry_after - feed.last_retrieved).total_seconds() // 60
    else:
        retry_after_interval = 0

    for new_interval in INTERVALS:
        if retry_after_interval > new_interval:
            continue
        if old_interval > new_interval:
            continue
        if old_interval == new_interval:
            # already big enough, nothing to do
            return
        break
    else:
        # nothing bigger available, keep the current interval
        return

    feed_config = get_config(reader, feed)
    feed_config['interval'] = new_interval
    set_config(reader, feed, feed_config)

    if not retry_after:
        # set_config does not refresh update_after, so we have to do it;
        # if we did it automatically, we wouldn't need to do this
        full_config['interval'] = new_interval
        update_after = next_update_after(feed.last_retrieved, **full_config)
        reader._storage.set_feed_update_after(feed, update_after)
```
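For completeness, wiring the hook up might look like this; a sketch assuming a registration point shaped like reader's existing update hooks, with the `response` argument being exactly the missing piece noted above:

```python
# Sketch only: as of reader 3.13, feed update hooks do not receive
# the HTTP response, so this registration is hypothetical.
from reader import make_reader

reader = make_reader('db.sqlite')
reader.after_feed_update_hooks.append(after_feed_update_hook)
reader.update_feeds()
```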
API changes (per the comments above):

* pass the HTTP response (status and headers) to after_feed_update_hook
* pass something like FeedUpdateIntent instead of just the URL, so the hook doesn't need to catch FeedNotFoundError
* a public way to set a feed's update_after (instead of reader._storage.set_feed_update_after())
* have setting the update interval refresh update_after automatically
Idea: We could also bump the update interval for servers that don't send caching headers (ETag / Last-Modified), or that don't honor the corresponding conditional requests. (original discussion)
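In hook terms, the first check might look like this; a sketch reusing the hypothetical `response` object from above, with bump_interval() standing in for the INTERVALS-based logic:

```python
# Hypothetical: response is the same proposed hook argument as above;
# bump_interval() stands in for the INTERVALS-based logic.
CACHING_HEADERS = ('etag', 'last-modified')

def bump_uncached_feed_hook(reader, url, response):
    if not any(h in response.headers for h in CACHING_HEADERS):
        # conditional requests can't work without these headers,
        # so frequent polling just re-downloads the whole feed
        bump_interval(reader, url)
```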
So, per the previous comments, the parser API will need to change.
Here's what it looks like as of 3.14, stripped to illustrate the data flow:
```python
class Parser:

    def parallel(
        feeds: Iterable[FeedArgument], ...
    ) -> Iterable[tuple[FeedArgument, ParsedFeed | None | ParseError]]:
        """Retrieve and parse many feeds, possibly in parallel."""

    def retrieve(
        feed: FeedArgument,
    ) -> tuple[FeedArgument, ContextManager[RetrieveResult[T] | None] | Exception]:
        """Single-argument retrieve() wrapper used with map()."""

    def __call__(url, http_etag, http_last_modified) -> ParsedFeed | None:
        """Convenience wrapper over parallel()."""

    def retrieve(
        url, http_etag, http_last_modified, ...
    ) -> ContextManager[RetrieveResult[T] | None]: ...

    def parse(url, result: RetrieveResult[T]) -> ParsedFeed: ...


def retriever_type(
    url, http_etag, http_last_modified, http_accept
) -> ContextManager[RetrieveResult[T] | None]: ...

def parser_type(
    url, resource: T, headers
) -> tuple[FeedData, Collection[EntryData]]: ...


class FeedArgument:
    url: str
    http_etag: str
    http_last_modified: str

class RetrieveResult:
    resource: T
    mime_type: str
    http_etag: str
    http_last_modified: str
    headers: dict[str, str]

class ParsedFeed:
    feed: FeedData
    entries: Iterable[EntryData]
    http_etag: str
    http_last_modified: str
    mime_type: str
```
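For orientation, a caller consumes parallel() roughly like this (illustrative only, not the actual updater code; `parser` and `feeds` assumed from context):

```python
# Illustrative: each item is (FeedArgument, ParsedFeed | None | ParseError).
for feed_arg, result in parser.parallel(feeds):
    if isinstance(result, ParseError):
        ...  # record the error against feed_arg.url
    elif result is None:
        ...  # not modified, nothing to store
    else:
        ...  # store result.feed and result.entries
```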
Here's what an ~ideal parser API would look like:
```python
class Parser:

    def parallel(
        feeds: Iterable[FeedArgument], ...
    ) -> Iterable[ParseResult]:
        """Retrieve and parse many feeds, possibly in parallel."""

    def __call__(url, caching_info: JSON) -> ParsedFeed | None:
        """Convenience wrapper over parallel()."""

    # Leaving retrieve() and parse() for convenience wrappers.

    def retrieve_fn(feed: FeedArgument, ...) -> RetrieveResult[T]:
        # For use with map(): single argument, does not raise.
        # Unhandled exceptions get wrapped in ParseError (as they do now).
        # As now, it enters the retriever context early to catch errors.
        ...

    def parse_fn(result: RetrieveResult[T]) -> ParseResult:
        # For use with map(): single argument, does not raise.
        # Unhandled exceptions get wrapped in ParseError (as they do now).
        # Passes through exceptions, status, and headers from retrieve_fn().
        ...


class FeedArgument(Protocol):
    url: str
    # 'etag' and 'last-modified' used as well-known keys
    caching_info: JSON
    # (optional move) from FeedForUpdate,
    # may remove the need for decider.process_feed_for_update
    stale: bool


def retriever_type(
    feed: FeedArgument, accept
) -> ContextManager[RetrievedFeed[T] | T]:
    # FeedArgument replaces multiple arguments for extensibility.
    # Return types other than RetrievedFeed[T] are for convenience.
    # Can pass additional info by raising RetrieveError | NotModified;
    # this is reverting to an older retriever API, because
    # "exceptions are for exceptional cases", see RetrieveResult.value.
    ...


class RetrieveResult:
    feed: FeedArgument
    # After multiple attempts, this seems like the right place
    # for the context manager. Worse alternatives:
    #
    # * `Parser.retrieve_fn() -> ContextManager[RetrieveResult[T]]`
    #   means parallel() can't access .feed before entering the context,
    #   which means unexpected errors do not have access to a .feed
    #
    # * `RetrievedFeed.resource: ContextManager[T]` and
    #   `retriever_type() -> RetrieveResult | RetrievedFeed | None`
    #   means retrievers that use context managers (e.g. the session)
    #   have to conditionally exit that context for non-200 responses
    #
    value: ContextManager[RetrievedFeed[T]] | ParseError
    # propagated from either RetrievedFeed or RetrieveError
    status: int
    headers: dict[str, str]


class RetrievedFeed:
    """Formerly known as RetrieveResult."""
    resource: T
    mime_type: str
    caching_info: JSON
    status: int
    headers: dict[str, str]
    # (optional move) from RetrieverType, a bit more flexible
    slow_to_read: bool


class RetrieveError(ParseError):
    status: int
    headers: dict[str, str]

class NotModified(RetrieveError):
    pass


def parser_type(
    url, resource: T, headers
) -> tuple[FeedData, Collection[EntryData]]: ...


class ParseResult:
    feed: FeedArgument
    value: ParsedFeed | None | ParseError
    # from RetrieveResult
    status: int
    headers: dict[str, str]


class ParsedFeed:
    feed: FeedData
    entries: Collection[EntryData]
    # from RetrievedFeed
    mime_type: str
    caching_info: JSON
```
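To sanity-check the proposal, here's what a minimal HTTP retriever against it could look like; a sketch only, assuming a requests session and guessing at the RetrieveError / NotModified constructor signatures (not settled API):

```python
# Sketch against the proposed protocol; the RetrieveError/NotModified
# constructor signatures and `session` are assumptions.
from contextlib import contextmanager

import requests

session = requests.Session()

@contextmanager
def http_retriever(feed, accept):
    headers = {'Accept': accept}
    if etag := feed.caching_info.get('etag'):
        headers['If-None-Match'] = etag
    response = session.get(feed.url, headers=headers, stream=True)
    try:
        if response.status_code == 304:
            raise NotModified(feed.url, status=304,
                              headers=dict(response.headers))
        if response.status_code != 200:
            raise RetrieveError(feed.url, status=response.status_code,
                                headers=dict(response.headers))
        yield RetrievedFeed(
            resource=response.raw,
            mime_type=response.headers.get('content-type'),
            caching_info={'etag': response.headers.get('etag')},
            status=response.status_code,
            headers=dict(response.headers),
        )
    finally:
        # the context manager shape lets us always release the connection
        response.close()
```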
OK, Retry-After support is done directly in the updater; bumping the interval should probably happen in a plugin (it feels somewhat icky to do it in core); also, the plugin may do other HTTP things, like #246.
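In updater terms, honoring Retry-After likely reduces to clamping the computed update_after (a sketch, not the actual code; next_update_after() is the same helper used in the pseudocode above):

```python
# Sketch: never schedule the next update before the server allows it.
def compute_update_after(feed, config, retry_after=None):
    update_after = next_update_after(feed.last_retrieved, **config)
    if retry_after and retry_after > update_after:
        update_after = retry_after
    return update_after
```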
https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429
Somewhat related to #246.
Implementation mechanism suggested in #332 overview.