stac-utils / pystac-client

Python client for searching STAC APIs
https://pystac-client.readthedocs.io
Other
155 stars 48 forks source link

Querying CMR STAC with more than 10k assets #112

Closed joemcglinchy closed 2 years ago

joemcglinchy commented 2 years ago

Hello,

I'm trying to query https://cmr.earthdata.nasa.gov/stac/LPDAAC_ECS for all available assets for the AG100.v3 collection. I see that there is a limit of 10k assets per search, so I'm trying to get around that by chunking the lon-lat coordinate space. My environment and code:

pystac 1.1.0 pyhd8ed1ab_0 conda-forge pystac-client 0.2.0 pypi_0 pypi

python 3.9.6 windows 10

# Reproduction script: split the globe into lon/lat tiles and run one STAC
# search per tile, to stay under the API's per-search result limit.

# Tile size in degrees: 45 wide (longitude) by 90 tall (latitude).
xsplit = 45
ysplit = 90
# Tile edges. The stop value is padded by one step so the upper bound is
# included: x_coords has 9 edges (-180 ... 180), y_coords has 3 (-90, 0, 90).
x_coords = np.arange(-180,180+xsplit,xsplit)
y_coords = np.arange(-90,90+ysplit, ysplit)
# Bare expression; presumably displays both arrays as notebook cell output.
x_coords, y_coords

# Bounding boxes as [xmin, ymin, xmax, ymax].
bboxes = []
# Southern row: latitude -90 .. 0.
# NOTE(review): 9 edges define 8 columns, but range(8 - 1) only yields
# i = 0..6, so the last column [135, 180] is never appended. The logged
# output is consistent with this: the first row ends at [90, -90, 135, 0].
for i in range(8 -1):
    xmin = x_coords[i]
    xmax = x_coords[i+1]

    ymin = y_coords[0]
    ymax = y_coords[1]

    bboxes.append([xmin, ymin, xmax, ymax])

# Northern row: latitude 0 .. 90. Same range(8 - 1) bound as above, so the
# [135, 180] column is skipped here as well.
for i in range(8 -1):
    xmin = x_coords[i]
    xmax = x_coords[i+1]

    ymin = y_coords[1]
    ymax = y_coords[2]

    bboxes.append([xmin, ymin, xmax, ymax])

# CMR STAC endpoint for the LPDAAC_ECS provider and the collection to search.
URL = "https://cmr.earthdata.nasa.gov/stac/LPDAAC_ECS/"
coll = "AG100.v003"
catalog2 = pystac_client.Client.open(URL)

# One entry per bbox is appended below, so this ends up as a list of lists.
all_items = []
for bbox in bboxes:

    print(f'on bbox {bbox}')
    # STAC item search... can't use both intersects and bounding box together, one or the other
    stac_items = catalog2.search(
        #intersects=dict(type="Point", coordinates=[geom.centroid.x, geom.centroid.y]), #this works...
        #intersects=area_of_interest,
        bbox=bbox, 
        collections=[coll]
    )

    print(f'stac_items.matched = {stac_items.matched()}')

    # Per the traceback, get_item_collections() is a generator that yields one
    # ItemCollection per result page, so the page requests are only issued
    # while the list comprehension below iterates it. That is why the APIError
    # surfaces on the comprehension line rather than on this call.
    items_collection = stac_items.get_item_collections()
    # NOTE(review): each `i` appears to be a page-level ItemCollection, not a
    # single item, so `items` holds one dict per page - confirm before
    # extracting asset hrefs from it.
    items = [i.to_dict() for i in items_collection] # <---- error occurs here

    all_items.append(items)

This returns the following output and error:

on bbox [-180, -90, -135, 0]
stac_items.matched = 0
on bbox [-135, -90, -90, 0]
stac_items.matched = 104
on bbox [-90, -90, -45, 0]
stac_items.matched = 1941
on bbox [-45, -90, 0, 0]
stac_items.matched = 332
on bbox [0, -90, 45, 0]
stac_items.matched = 916
on bbox [45, -90, 90, 0]
stac_items.matched = 92
on bbox [90, -90, 135, 0]
stac_items.matched = 866
on bbox [-180, 0, -135, 90]
stac_items.matched = 962
on bbox [-135, 0, -90, 90]
stac_items.matched = 2740
---------------------------------------------------------------------------
APIError                                  Traceback (most recent call last)
C:\software\Anaconda3\envs\hydrosat\lib\site-packages\pystac_client\stac_api_io.py in request(self, href, method, headers, parameters)
    108             if resp.status_code != 200:
--> 109                 raise APIError(resp.text)
    110             return resp.content.decode("utf-8")

APIError: "connect EMFILE 99.84.210.119:443 - Local (undefined:undefined)"

During handling of the above exception, another exception occurred:

APIError                                  Traceback (most recent call last)
C:\Users\JOSEPH~1\AppData\Local\Temp/ipykernel_1048/2798668582.py in <module>
     20 
     21     items_collection = stac_items.get_item_collections()
---> 22     items = [i.to_dict() for i in items_collection]
     23 
     24     all_items.append(items)

C:\Users\JOSEPH~1\AppData\Local\Temp/ipykernel_1048/2798668582.py in <listcomp>(.0)
     20 
     21     items_collection = stac_items.get_item_collections()
---> 22     items = [i.to_dict() for i in items_collection]
     23 
     24     all_items.append(items)

C:\software\Anaconda3\envs\hydrosat\lib\site-packages\pystac_client\item_search.py in get_item_collections(self)
    400         item_collection : pystac_client.ItemCollection
    401         """
--> 402         for page in self._stac_io.get_pages(self.url, self.method, self.get_parameters()):
    403             yield ItemCollection.from_dict(page, root=self.client)
    404 

C:\software\Anaconda3\envs\hydrosat\lib\site-packages\pystac_client\stac_api_io.py in get_pages(self, url, method, parameters)
    189         while next_link:
    190             link = Link.from_dict(next_link)
--> 191             page = self.read_json(link, parameters=parameters)
    192             yield page
    193 

C:\software\Anaconda3\envs\hydrosat\lib\site-packages\pystac\stac_io.py in read_json(self, source, *args, **kwargs)
    195             given source.
    196         """
--> 197         txt = self.read_text(source, *args, **kwargs)
    198         return self.json_loads(txt)
    199 

C:\software\Anaconda3\envs\hydrosat\lib\site-packages\pystac_client\stac_api_io.py in read_text(self, source, parameters, *args, **kwargs)
     85                 # parameters are already in the link href
     86                 parameters = {}
---> 87             return self.request(href, *args, method=method, headers=headers, parameters=parameters)
     88 
     89     def request(self,

C:\software\Anaconda3\envs\hydrosat\lib\site-packages\pystac_client\stac_api_io.py in request(self, href, method, headers, parameters)
    110             return resp.content.decode("utf-8")
    111         except Exception as err:
--> 112             raise APIError(str(err))
    113 
    114     def write_text_to_href(self, href: str, *args: Any, **kwargs: Any) -> None:

APIError: "connect EMFILE 99.84.210.119:443 - Local (undefined:undefined)"

Previously, before chunking the coordinates, I was getting a Bad Gateway error when calling stac_items.get_item_collection().

I realize these assets will be h5 files and I won't be able to retrieve the pixels. I am trying to extract the href for each item so I can download them via curl.

Any thoughts?

thanks! Joe

matthewhanson commented 2 years ago

Hi @joemcglinchy, sorry for not responding to this in 6 months!
This doesn't seem to be a problem with pystac-client, but rather with the CMR-STAC API, which does have some issues.

I'm going to close this for now, but if this is still currently an issue please respond here and we can investigate more.