Open edgarrmondragon opened 3 months ago
How-to guides
from __future__ import annotations import datetime import typing as t from urllib.parse import parse_qsl, urlparse from requests import Response from singer_sdk.helpers._compat import date_fromisoformat from singer_sdk.pagination import BaseAPIPaginator class PageValue(t.NamedTuple): """A page value for DateRangePage.""" start_date: datetime.date end_date: datetime.date page: int class DateRangeAndPagePaginator(BaseAPIPaginator[PageValue]): """Pagination class for DateRangePage.""" def __init__( self, start_date: datetime.date, ) -> None: self.today = datetime.datetime.now(tz=datetime.timezone.utc).date() start_value = self._get_next_date_range(start_date) super().__init__(start_value=start_value) def _get_next_date_range(self, start_date: datetime.date) -> PageValue | None: end_date = min(self.today, start_date + datetime.timedelta(days=60)) # End date is inclusive, so we need to start the next range a day after start_date = start_date + datetime.timedelta(days=1) return None if end_date == self.today else PageValue(start_date, end_date, 1) def get_next(self, response): """Return the next page tuple or None.""" next_url: str = response.links.get("next", {}).get("url") if next_url: parsed_url = urlparse(next_url) query = dict(parse_qsl(parsed_url.query)) start_date = date_fromisoformat(query.get("start_date")) end_date = date_fromisoformat(query.get("end_date")) page = int(query.get("page")) return PageValue(start_date, end_date, page) return self._get_next_date_range(self.current_value.end_date) paginator = DateRangeAndPagePaginator(start_date=datetime.date(2021, 1, 1)) print(paginator.current_value) # PageValue(start_date=datetime.date(2021, 1, 2), end_date=datetime.date(2021, 3, 2), page=1) # noqa: ERA001, E501 next_link_template = '<https://example.com?start_date={start_date}&end_date={end_date}&page={page}>; rel="next"' # noqa: E501 response = Response() response.headers = { "link": next_link_template.format( start_date=paginator.current_value.start_date, 
end_date=paginator.current_value.end_date, page=paginator.current_value.page + 1, ), } paginator.advance(response) print(paginator.current_value) # PageValue(start_date=datetime.date(2021, 1, 2), end_date=datetime.date(2021, 3, 2), page=2) # noqa: ERA001, E501 response = Response() paginator.advance(response) print(paginator.current_value) # PageValue(start_date=datetime.date(2021, 3, 3), end_date=datetime.date(2021, 5, 1), page=1) # noqa: ERA001, E501
The risk with this approach is that the last page in a date range may come back empty, which terminates pagination prematurely:
https://github.com/meltano/sdk/blob/950f391b3b64755cfe8e8250b66eed726d560353/singer_sdk/streams/rest.py#L389-L395
Related:
One way to avoid breaking out of the loop when a date range is empty would be to make the `break` behavior in that loop opt-out, controlled by a new attribute of the `RESTStream` class. PRs welcome!
break
RESTStream
https://meltano.slack.com/archives/C068YBQQF1V/p1720057717641439?thread_ts=1720043007.794179&cid=C068YBQQF1V
Documentation type
How-to guides
Description
The risk with this approach is that the last page in a date range may come back empty, which terminates pagination prematurely:
https://github.com/meltano/sdk/blob/950f391b3b64755cfe8e8250b66eed726d560353/singer_sdk/streams/rest.py#L389-L395
Related: