meltano / sdk

Write 70% less code by using the SDK to build custom extractors and loaders that adhere to the Singer standard: https://sdk.meltano.com
https://sdk.meltano.com
Apache License 2.0
87 stars 64 forks source link

docs: Document an example of date-range pagination #2318

Open edgarrmondragon opened 3 months ago

edgarrmondragon commented 3 months ago

Documentation type

How-to guides

Description

from __future__ import annotations

import datetime
import typing as t
from urllib.parse import parse_qsl, urlparse

from requests import Response

from singer_sdk.helpers._compat import date_fromisoformat
from singer_sdk.pagination import BaseAPIPaginator

class PageValue(t.NamedTuple):
    """Pagination token identifying one page within one date range.

    Used as the page-token type of ``DateRangeAndPagePaginator``: the paginator
    either moves to another page of the same date range or starts over at
    page 1 of the following range.
    """

    # First date covered by the range.
    start_date: datetime.date
    # Last date covered by the range; the paginator treats it as inclusive.
    end_date: datetime.date
    # Page number within the range; the paginator starts each new range at 1.
    page: int

class DateRangeAndPagePaginator(BaseAPIPaginator[PageValue]):
    """Paginator that walks consecutive date ranges, paging within each range.

    Within a date range, the ``next`` link of the response decides the
    following page. Once a response carries no ``next`` link, the paginator
    moves on to page 1 of the following date range, until today (UTC) has been
    covered.
    """

    #: Maximum number of days covered by a single date range.
    range_days: int = 60

    def __init__(
        self,
        start_date: datetime.date,
        *,
        range_days: int = 60,
    ) -> None:
        """Initialize the paginator.

        Args:
            start_date: Date after which the first range begins. The first
                range starts on the day *after* this date, so it behaves like
                the last date that has already been covered.
            range_days: Maximum number of days in each date range.

        Raises:
            ValueError: If ``range_days`` is smaller than 1.
        """
        if range_days < 1:
            msg = "range_days must be at least 1"
            raise ValueError(msg)

        # Both attributes are read by _get_next_date_range, so they must be
        # set before the initial range is computed.
        self.range_days = range_days
        self.today = datetime.datetime.now(tz=datetime.timezone.utc).date()
        start_value = self._get_next_date_range(start_date)
        super().__init__(start_value=start_value)

    def _get_next_date_range(self, start_date: datetime.date) -> PageValue | None:
        """Return page 1 of the range following ``start_date``.

        Args:
            start_date: End date of the previous range (already covered).

        Returns:
            The first page of the next range, clamped so that it never extends
            past today, or ``None`` when no dates are left to request.
        """
        end_date = min(
            self.today,
            start_date + datetime.timedelta(days=self.range_days),
        )

        # End date is inclusive, so we need to start the next range a day after
        next_start = start_date + datetime.timedelta(days=1)

        # Stop only when the next range would be empty. Stopping as soon as the
        # end date reaches today would skip the final (partial) range.
        if next_start > end_date:
            return None

        return PageValue(next_start, end_date, 1)

    def get_next(self, response: Response) -> PageValue | None:
        """Return the next page tuple or None.

        Args:
            response: The API response for the current page.

        Returns:
            The page described by the response's ``next`` link if there is
            one, otherwise page 1 of the following date range, or ``None``
            when there is nothing left to request.

        Raises:
            KeyError: If the ``next`` link lacks the ``start_date``,
                ``end_date`` or ``page`` query parameter.
        """
        next_url: str | None = response.links.get("next", {}).get("url")
        if next_url:
            query = dict(parse_qsl(urlparse(next_url).query))
            return PageValue(
                date_fromisoformat(query["start_date"]),
                date_fromisoformat(query["end_date"]),
                int(query["page"]),
            )

        current = self.current_value
        if current is None:
            # The paginator was created with nothing to request (the start
            # date is today or later), so there is no range to continue from.
            return None

        return self._get_next_date_range(current.end_date)

# Walk the paginator through a page change and then a date-range change.
paginator = DateRangeAndPagePaginator(start_date=datetime.date(2021, 1, 1))
print(paginator.current_value)
# PageValue(start_date=datetime.date(2021, 1, 2), end_date=datetime.date(2021, 3, 2), page=1)  # noqa: ERA001, E501

# Simulate a response whose Link header points at the next page of the
# current date range.
next_link_template = '<https://example.com?start_date={start_date}&end_date={end_date}&page={page}>; rel="next"'  # noqa: E501
current = paginator.current_value
link_header = next_link_template.format(
    start_date=current.start_date,
    end_date=current.end_date,
    page=current.page + 1,
)
response = Response()
response.headers = {"link": link_header}

paginator.advance(response)
print(paginator.current_value)
# PageValue(start_date=datetime.date(2021, 1, 2), end_date=datetime.date(2021, 3, 2), page=2)  # noqa: ERA001, E501

# A response without a "next" link moves the paginator to the next date range.
response = Response()
paginator.advance(response)
print(paginator.current_value)
# PageValue(start_date=datetime.date(2021, 3, 3), end_date=datetime.date(2021, 5, 1), page=1)  # noqa: ERA001, E501

The risk with this approach is that the last page in a date range may come back empty, in which case pagination is terminated prematurely:

https://github.com/meltano/sdk/blob/950f391b3b64755cfe8e8250b66eed726d560353/singer_sdk/streams/rest.py#L389-L395


Related:

edgarrmondragon commented 4 days ago

One way to avoid breaking out of the loop when a date range is empty would be to make the break behavior in that loop opt-out, controlled by a new attribute of the RESTStream class. PRs welcome!

https://meltano.slack.com/archives/C068YBQQF1V/p1720057717641439?thread_ts=1720043007.794179&cid=C068YBQQF1V