microsoft / GlobalMLBuildingFootprints

Worldwide building footprints derived from satellite imagery

STAC API fails for some countries (e.g., Belgium) #40

Open · HassanMasoomi opened 1 year ago

HassanMasoomi commented 1 year ago

I am trying to download a representative lat/long for each building footprint. Using the STAC API as in code chunk 1 below fails to get data for several countries (basically, the items search does not return the full set of data it normally would). So I tried code chunk 2 to get what I wanted, but it is very slow. Any reason why this info is missing for some countries?

Code chunk 1

import os
import geopandas
import planetary_computer
import pystac_client
import pandas as pd

locations = ['Belgium']  # list of region names to fetch

catalog = pystac_client.Client.open(
    "https://planetarycomputer.microsoft.com/api/stac/v1",
    modifier=planetary_computer.sign_inplace,
)

#os.chdir("xxxx")
for i in location:
    if not os.path.isfile(i + '_MS_Bldgs.csv'):
        try:
            print(i)
            items = catalog.search(
            collections=["ms-buildings"], query={"msbuildings:region": {"eq": i}}
            )
            item = next(items.items())
            asset = item.assets["data"]

            df = geopandas.read_parquet(
                asset.href, storage_options=asset.extra_fields["table:storage_options"]
            )
            s = df.representative_point()
            #s.head()
            ss = pd.DataFrame(data = {'long': s.x, 'lat': s.y})
            #ss.head()
            ss.to_csv(i + '_MS_Bldgs.csv')
        except:
            print("#################################")
            print("Failed to run for: " + i)
            print("#################################")

Code chunk 2

import pandas as pd
import geopandas as gpd
from shapely.geometry import shape

location = 'Belgium'

# dataset-links.csv has columns Location, QuadKey, Url, Size - one file per quadkey
dataset_links = pd.read_csv("https://minedbuildings.blob.core.windows.net/global-buildings/dataset-links.csv")
my_links = dataset_links[dataset_links.Location == location]
out_file_name = location + '.csv'
pd.DataFrame(columns=['long', 'lat']).to_csv(out_file_name, index=False)
for _, row in my_links.iterrows():
    # Each URL points to a line-delimited GeoJSON file; parse the geometries with shapely
    df = pd.read_json(row.Url, lines=True)
    df['geometry'] = df['geometry'].apply(shape)
    df = gpd.GeoDataFrame(df, crs=4326)
    df = df.representative_point()
    df = pd.DataFrame(data={'long': df.x, 'lat': df.y})
    # Append this quadkey's points to the output CSV
    df.to_csv(out_file_name, mode='a', index=False, header=False)
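
Most of the time here goes into fetching and parsing each quadkey file one at a time, so processing the files in parallel should help. A sketch, assuming threads are acceptable since the work is mostly network-bound:

import pandas as pd
import geopandas as gpd
from concurrent.futures import ThreadPoolExecutor
from shapely.geometry import shape

def points_for_url(url):
    # Download one quadkey file and reduce it to representative points
    df = pd.read_json(url, lines=True)
    df['geometry'] = df['geometry'].apply(shape)
    s = gpd.GeoDataFrame(df, crs=4326).representative_point()
    return pd.DataFrame(data={'long': s.x, 'lat': s.y})

# my_links as defined above; 8 workers is an arbitrary starting point
with ThreadPoolExecutor(max_workers=8) as pool:
    parts = list(pool.map(points_for_url, my_links.Url))
pd.concat(parts).to_csv(location + '.csv', index=False)
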
tomalrussell commented 10 months ago

Either the data has been updated, or the slightly different approach helps, but this works fine for Belgium as of January 2024:

import pandas as pd
import planetary_computer
import pystac_client
import dask_geopandas
import deltalake

LOCATION = "Belgium"

def get_table_and_credentials():
    catalog = pystac_client.Client.open(
        "https://planetarycomputer.microsoft.com/api/stac/v1",
        modifier=planetary_computer.sign_inplace,
    )
    collection = catalog.get_collection("ms-buildings")
    asset = collection.assets["delta"]
    storage_options = {
        "account_name": asset.extra_fields["table:storage_options"]["account_name"],
        "sas_token": asset.extra_fields["table:storage_options"]["credential"],
    }

    # Set up DeltaTable to query URIs
    table = deltalake.DeltaTable(asset.href, storage_options=storage_options)

    return table, storage_options

# Storage options are only valid for a limited time - if they need to be
# reused, wrap any data read in try/except and retry after re-requesting
table, storage_options = get_table_and_credentials()

# Query based on RegionName or quadkey: (key, "=", value) and/or (key, "in", values)
uris = table.file_uris([("RegionName", "=", LOCATION)])

# Read into a dask-geopandas dataframe
df = dask_geopandas.read_parquet(uris, storage_options=storage_options)

# then process as before
s = df.representative_point()
ss = pd.DataFrame(data = {'lng': s.x, 'lat': s.y})
ss.to_csv(f"{LOCATION}_MS_Bldgs.csv", index=False)