gjoseph92 / stackstac

Turn a STAC catalog into a dask-based xarray
https://stackstac.readthedocs.io
MIT License
232 stars 49 forks source link

Stacking local assets - slow performance #246

Open fkroeber opened 3 months ago

fkroeber commented 3 months ago

Description: Stacking the same assets - once read from a local directory, once read from the original remote source - shows different performance. Stacking the local assets takes more time on average.

Expected behaviour: Stacking assets that are available locally (on fast SSD storage close to the CPU) should be faster than stacking remote assets, which first have to be fetched over a lower-bandwidth connection.

Investigation/Reproduction: So far I have only been able to narrow the problem down to the point that it does not seem to be related to the actual I/O, because the time difference between stacking local and remote assets persists when using the FakeReader. The time difference in the example below is as follows:

Minimum local computation time: 10.83 seconds
Minimum remote computation time: 4.38 seconds

import timeit
import planetary_computer as pc
import stac_asset
import stackstac
import warnings
from pystac.item_collection import ItemCollection
from pystac_client import Client
from shapely.geometry import box
from stackstac.reader_protocol import FakeReader

def prepare_env():
    """Search for Landsat items and download their ``qa_pixel`` assets.

    The search result is saved to ``remote_items.json`` and the same items
    are then downloaded into ``test_dir``.

    Returns:
        A ``(remote_path, local_path)`` tuple: the JSON file the searched
        item collection was saved to, and the path under ``test_dir`` where
        the download step is expected to have written its item collection.
    """
    # area of interest (xmin, ymin, xmax, ymax) and time window
    bounds = (142.13, -34.2, 142.18, -34.15)
    date_range = ["2020-04-01", "2020-06-01"]

    # open the catalog; the modifier signs assets in place
    client = Client.open(
        "https://planetarycomputer.microsoft.com/api/stac/v1",
        modifier=pc.sign_inplace,
    )

    # look up the matching items and persist them as the "remote" collection
    search = client.search(
        collections="landsat-c2-l2",
        datetime=date_range,
        limit=100,
        intersects=box(*bounds),
    )
    remote_path = "remote_items.json"
    remote_items = search.item_collection()
    remote_items.save_object(remote_path)

    # fetch only the qa_pixel asset of every item into the local directory
    download_config = stac_asset.Config(include=["qa_pixel"], warn=True)
    stac_asset.download_item_collection(
        item_collection=remote_items,
        directory="test_dir",
        keep_non_downloaded=False,
        config=download_config,
    )

    return remote_path, "test_dir/item-collection.json"

def test_fun(items):
    """Load the item collection stored at *items*, stack it and compute it.

    The stack uses ``FakeReader`` as its reader, and all warnings raised
    while loading and computing are suppressed. Nothing is returned; the
    computed array is discarded.
    """
    stack_options = dict(
        assets=["qa_pixel"],
        resolution=100,
        epsg=3857,
        dtype="uint8",
        fill_value=255,
        reader=FakeReader,
    )
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        collection = ItemCollection.from_file(items)
        stackstac.stack(collection, **stack_options).compute()

def time_operations():
    """Benchmark ``test_fun`` on the local and the remote item collection.

    Each variant is timed with ``timeit.repeat`` using 5 repetitions of
    3 calls each, so every sample is the total time of 3 consecutive calls.
    The smallest sample per variant is printed as the best-case figure.

    The setup strings import ``test_fun``, ``local_path`` and ``remote_path``
    from ``__main__``, so those names must exist as globals of the running
    script.
    """
    # local variant
    local_samples = timeit.repeat(
        stmt="test_fun(local_path)",
        setup="from __main__ import test_fun, local_path",
        repeat=5,
        number=3,
    )
    best_local = min(local_samples)
    print(f"Minimum local computation time: {best_local} seconds")

    # remote variant
    remote_samples = timeit.repeat(
        stmt="test_fun(remote_path)",
        setup="from __main__ import test_fun, remote_path",
        repeat=5,
        number=3,
    )
    best_remote = min(remote_samples)
    print(f"Minimum remote computation time: {best_remote} seconds")

# Script entry point: prepare the item collections, then run the benchmark.
# remote_path and local_path are bound at module level on purpose: the timeit
# setup strings in time_operations() import them from __main__.
if __name__ == "__main__":
    remote_path, local_path = prepare_env()
    time_operations()