GFDRR / CCDR-tools

Geoanalytics for climate and disaster risk screening
https://gfdrr.github.io/CCDR-tools/

Auto-collection of Population and Built-up data layers #6

Closed matamadio closed 5 days ago

matamadio commented 2 years ago

Until now, all data must be cooked and hand-fed to the script. A prototype for auto-fetching from an API/REST endpoint is already there for WorldPop; it only needs refinement with a "year" selector.

It should be something like:

    if exp_cat_dd.value == 'pop':
        year = year_pop_dd.value
        exp_ras = f"{DATA_DIR}/EXP/{country}_WPOP{year}.tif"
        if not os.path.exists(exp_ras):
            # do the magic stuff (API harvesting, see below)
            ...

The magic stuff (API harvesting):

    # DATA_DIR and CACHE_DIR are assumed to be defined earlier in the notebook
    import os
    import json
    import warnings
    import requests
    from tqdm import tqdm

    # Load or save ISO3 country list
    iso3_path = os.path.join(DATA_DIR, "cache/iso3.json")
    if not os.path.exists(iso3_path):
        resp = json.loads(requests.get(f"https://www.worldpop.org/rest/data/pop/wpgp?iso3={country}").text)

        with open(iso3_path, 'w') as outfile:
            json.dump(resp, outfile)
    else:
        with open(iso3_path, 'r') as infile:
            resp = json.load(infile)

    # TODO: Download WorldPop data from API if the layer is not found (see except before)
    # Target population data files are extracted from the JSON list downloaded above
    metadata = resp['data'][1]  # hard-coded entry index; see the year-selector note after this block
    data_src = metadata['files']

    # Save population data to cache location
    for data_fn in tqdm(data_src):
        fid = metadata['id']
        cache_fn = os.path.basename(data_fn)

        # Look for indicated file in cache directory
        # Use the data file if it is found, but warn the user. 
        # (if data is incorrect or corrupted, they should delete it from cache)
        if f"{fid}_{cache_fn}" in os.listdir(CACHE_DIR):
            warnings.warn(f"Found {fid}_{cache_fn} in cache, skipping...")
            continue

        # Write to cache file if not found
        with open(os.path.join(CACHE_DIR, f"{fid}_{cache_fn}"), "wb") as handle:
            response = requests.get(data_fn)
            handle.write(response.content)

    # Run analysis
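
On the "year" selector mentioned above: instead of the hard-coded resp['data'][1], the entry matching the requested year could be picked from the API response. A sketch, assuming each entry in the WorldPop listing carries a popyear field (to be verified against the live response):

    def pick_worldpop_entry(resp, year):
        """Return the metadata entry for the requested year, if present."""
        for entry in resp['data']:
            if str(entry.get('popyear')) == str(year):
                return entry
        raise ValueError(f"No WorldPop layer found for year {year}")

    metadata = pick_worldpop_entry(resp, year_pop_dd.value)
    data_src = metadata['files']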

I am discussing with DLR to make the same thing possible with WSF19 and WSF-Evo data, which would be cool for calculating the change in risk across years.

matamadio commented 1 year ago

WorldPop superseded; move to GHS. The same reasoning applies, as GHS comes as tiles that can be processed automatically.
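
For reference, the generic tile workflow would look roughly like the sketch below; the tile-index path and the download_url attribute are placeholders, not a real GHSL endpoint:

    import os
    import requests
    import geopandas as gpd
    import rasterio
    from rasterio.merge import merge
    from shapely.geometry import box

    def fetch_and_mosaic_tiles(tile_index_path, bbox, out_dir, out_path):
        """Download the tiles intersecting bbox and mosaic them into one raster."""
        tiles = gpd.read_file(tile_index_path)              # hypothetical tile footprint layer
        hits = tiles[tiles.intersects(box(*bbox))]          # tiles overlapping the country bbox
        os.makedirs(out_dir, exist_ok=True)
        paths = []
        for url in hits["download_url"]:                    # hypothetical attribute with the tile URL
            local = os.path.join(out_dir, os.path.basename(url))
            with open(local, "wb") as f:
                f.write(requests.get(url).content)
            paths.append(local)
        sources = [rasterio.open(p) for p in paths]
        mosaic, transform = merge(sources)
        meta = sources[0].meta.copy()
        meta.update(height=mosaic.shape[1], width=mosaic.shape[2], transform=transform)
        with rasterio.open(out_path, "w", **meta) as dst:
            dst.write(mosaic)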

matamadio commented 1 month ago

Need to think of a better approach than the tiles zip download. Alternatives to explore:

matamadio commented 4 weeks ago

The only practical population dataset to fetch online is WorldPop: there is no API for GHSL, and the only way is an export from GEE, which takes too long and requires a Google Drive connection. Thus WorldPop is included as the default option for population, but it can be overridden by providing a different file.
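
Roughly, the selection logic (a sketch; the widget and helper names here are placeholders, not the actual ones in the notebook):

    custom_pop_file = custom_exp_dd.value          # user-provided raster path, or None
    if custom_pop_file:
        exp_ras = custom_pop_file                  # a user file always overrides the default
    else:
        exp_ras = f"{DATA_DIR}/EXP/{country}_WPOP2020.tif"
        if not os.path.exists(exp_ras):
            fetch_worldpop(country, 2020)          # hypothetical helper wrapping the WorldPop harvest above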

The reason this matters is that GHSL looks much more realistic compared to WorldPop, and that is reflected in the risk estimates. Compare POP

matamadio commented 2 weeks ago

For built-up, we can easily leverage the WSF datasets (2019, Evolution, 3D) via STAC: https://geoservice.dlr.de/eoc/ogc/stac/v1/. I made a test notebook that harvests and mosaics the WSF-Evolution data depending on the selected country. It works, but it is a bit slow (a couple of minutes).

import os
import shutil
import concurrent.futures

import requests
import geopandas as gpd
from shapely.geometry import shape, Polygon, MultiPolygon

from merge_utils import merge_tifs  # custom helper that mosaics all .tif files in a folder

# Define the REST API URL
rest_api_url = "https://services.arcgis.com/iQ1dY19aHwbSDYIF/ArcGIS/rest/services/World_Bank_Global_Administrative_Divisions_VIEW/FeatureServer"

# Function to get the correct layer ID based on administrative level
def get_layer_id_for_adm(adm_level):
    layers_url = f"{rest_api_url}/layers"
    response = requests.get(layers_url, params={'f': 'json'})

    if response.status_code == 200:
        layers_info = response.json().get('layers', [])
        target_layer_name = f"WB_GAD_ADM{adm_level}"

        for layer in layers_info:
            if layer['name'] == target_layer_name:
                return layer['id']

        print(f"Layer matching {target_layer_name} not found.")
        return None
    else:
        print(f"Failed to fetch layers. Status code: {response.status_code}")
        return None

# Function to fetch the ADM data using the correct layer ID
def get_adm_data(country, adm_level):
    layer_id = get_layer_id_for_adm(adm_level)

    if layer_id is not None:
        query_url = f"{rest_api_url}/{layer_id}/query"
        params = {
            'where': f"ISO_A3 = '{country}'",
            'outFields': '*',
            'f': 'geojson'
        }

        response = requests.get(query_url, params=params)

        if response.status_code == 200:
            data = response.json()
            features = data.get('features', [])
            if features:
                geometry = [shape(feature['geometry']) for feature in features]
                properties = [feature['properties'] for feature in features]
                gdf = gpd.GeoDataFrame(properties, geometry=geometry)

                return gdf
            else:
                print("No features found for the specified query.")
                return None
        else:
            print(f"Error fetching data: {response.status_code}")
            return None
    else:
        print("Invalid administrative level or layer mapping not found.")
        return None

# Function to download files
def download_file(url, dest_folder):
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)

    local_filename = os.path.join(dest_folder, url.split('/')[-1])
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    print(f"Downloaded {local_filename}")
    return local_filename
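
# Possible speedup (untested sketch): the downloads dominate the "couple of minutes" runtime,
# so the tiles could be fetched in parallel using the concurrent.futures import above.
def download_files_parallel(urls, dest_folder, max_workers=4):
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(download_file, url, dest_folder) for url in urls]
        return [f.result() for f in futures]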

# Mapping of administrative levels to field names
adm_field_mapping = {
    0: {'code': 'HASC_0', 'name': 'NAM_0'},
    1: {'code': 'HASC_1', 'name': 'NAM_1'},
    2: {'code': 'HASC_2', 'name': 'NAM_2'},
    # Add mappings for additional levels as needed
}

# Example usage
country = 'BGD'  # Bangladesh's ISO_A3 code
adm_level = 2    # Administrative level (e.g., 2)

# Fetch the data
adm_data = get_adm_data(country, adm_level)

if adm_data is not None:
    # Step 1: Extract and combine the geometry into a single geometry
    ADM_area = adm_data.unary_union  # Combine all geometries into one

    if isinstance(ADM_area, Polygon):
        # Wrap a single Polygon into a MultiPolygon for consistency
        ADM_area = MultiPolygon([ADM_area])

    # Convert to bounding box
    bbox = ADM_area.bounds  # (minx, miny, maxx, maxy)

    # Step 2: Construct the STAC search query
    search_query = {
        "bbox": list(bbox),  # [minx, miny, maxx, maxy]
        "collections": ["WSF_Evolution"],  # Assuming this is the correct collection name
        "limit": 100  # Adjust as needed
    }
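    # The collection id above is an assumption; it can be checked against the catalogue first
    # via the standard STAC /collections endpoint, e.g.:
    #   cols = requests.get("https://geoservice.dlr.de/eoc/ogc/stac/v1/collections").json()
    #   print([c["id"] for c in cols.get("collections", [])])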

    # Step 3: Send the POST request to the STAC API
    stac_search_url = "https://geoservice.dlr.de/eoc/ogc/stac/v1/search"
    headers = {
        "Content-Type": "application/json"
    }

    response = requests.post(stac_search_url, headers=headers, json=search_query)

    # Step 4: Check the response status and parse the results
    if response.status_code == 200:
        search_results = response.json()
        items = search_results.get("features", [])
        if items:
            print(f"Found {len(items)} items.")
            # Directory to save downloaded files
            subfolder_name = f"{country}_tifs"
            download_folder = os.path.join(f"wd/data/{country}_WSF_evo/", subfolder_name)
            if not os.path.exists(download_folder):
                os.makedirs(download_folder)

            # Step 5: Download .tif assets into the subdirectory
            tif_files = []
            for item in items:
                print(f"ID: {item['id']}")
                for asset_key, asset_value in item['assets'].items():
                    if asset_value['href'].endswith('.tif'):
                        print(f"Downloading {asset_key} from {asset_value['href']}")
                        tif_file = download_file(asset_value['href'], download_folder)
                        tif_files.append(tif_file)

            # Step 6: Mosaic the downloaded .tif files using the subdirectory
            if tif_files:
                print("Mosaicing downloaded .tif files...")
                merge_tifs(download_folder)  # assumes the helper writes <folder_name>.tif inside the folder

                # Step 7: Rename the mosaiced file
                merged_tif_path = os.path.join(download_folder, f"{subfolder_name}.tif")
                output_filename = os.path.join(f"wd/data/{country}_WSF_evo/", f"{country}_WSF-evo.tif")
                os.rename(merged_tif_path, output_filename)
                print(f"Mosaiced file saved as {output_filename}")
                # Delete the tiles subfolder after mosaicing
                shutil.rmtree(download_folder)
                print(f"Deleted temporary folder: {download_folder}")
            else:
                print("No .tif files found for mosaicing.")
        else:
            print("No items found for the specified query.")
    else:
        print(f"Error {response.status_code}: {response.text}")
else:
    print("Missing ADM data!")
matamadio commented 5 days ago

Task completed: WorldPop2020 and WSF2019 are fetched and processed automatically by the script as default layers.