Open mdsumner opened 5 months ago
working on example here: https://discourse.pangeo.io/t/reading-hdf-eos-hdf4-files-in-parallel-gdal-rasterio-etc
get the files
url <- "https://e4ftl01.cr.usgs.gov/MOTA/MCD43A4.061/2001.01.01" l <- readLines(url) xs <- paste0(url, "/", as.character(gsub("\"$", "", na.omit(stringr::str_extract(l, "MCD43A4.*hdf\""))))) header <- readLines("~/earthdata")[1] cmd <- sprintf("wget %s --header \"%s\" -O %s", xs, header, basename(xs)) library(furrr) plan(multicore) #,workers = 16 ) future_map(cmd, system) plan(sequential)
now VRT them into one big longlat raster and then merge into one big dask xarray
from osgeo import gdal import rasterio import xarray as xr import glob from osgeo import gdal gdal.UseExceptions() import time import tempfile hdf = glob.glob("pangeo_hdf/*.hdf") band1_src = [f"HDF4_EOS:EOS_GRID:\"{file}\":MOD_Grid_BRDF:Nadir_Reflectance_Band1" for file in hdf] band2_src = [f"HDF4_EOS:EOS_GRID:\"{file}\":MOD_Grid_BRDF:Nadir_Reflectance_Band2" for file in hdf] ## we can capture the VRT text in memory here band1 = gdal.BuildVRT("", band1_src) band2 = gdal.BuildVRT("", band2_src) opts = gdal.WarpOptions(outputBounds = [-180, -90, 180, 90], width = 86400, height = 43200, dstSRS = "EPSG:4326", format = "VRT") wband1vrt = tempfile.NamedTemporaryFile(suffix = ".vrt", prefix = "band1_") wband2vrt = tempfile.NamedTemporaryFile(suffix = ".vrt", prefix = "band2_") ## but we can't chain that VRT into this WarpedVRT, so we actually create files here w1 = gdal.Warp(wband1vrt.name, band1.GetMetadata("xml:VRT")[0], options = opts) w2 = gdal.Warp(wband2vrt.name, band2.GetMetadata("xml:VRT")[0], options = opts) ## destroy so that file gets flushed into existence w1 = None w2 = None ## literal virtual files ds = xr.merge([xr.open_dataset(wband1vrt.name, chunks = {}, engine = "rasterio").band_data.rename("Band1"), xr.open_dataset(wband2vrt.name, chunks = {}, engine = "rasterio").band_data.rename("Band2")]) time.strftime("%H:%M:%S") ## takes about 4 minutes on 48 cores print(ds.Band1.mean().values) time.strftime("%H:%M:%S")
working on example here: https://discourse.pangeo.io/t/reading-hdf-eos-hdf4-files-in-parallel-gdal-rasterio-etc
get the files
now VRT them into one big longlat raster and then merge into one big dask xarray