mdsumner / rw-book

https://mdsumner.github.io/rw-book/
4 stars 0 forks source link

merging HDF4 from Sinusoidal modis #4

Open mdsumner opened 5 months ago

mdsumner commented 5 months ago

working on example here: https://discourse.pangeo.io/t/reading-hdf-eos-hdf4-files-in-parallel-gdal-rasterio-etc

get the files

url <- "https://e4ftl01.cr.usgs.gov/MOTA/MCD43A4.061/2001.01.01"
l <- readLines(url)
xs <- paste0(url, "/", as.character(gsub("\"$", "", na.omit(stringr::str_extract(l, "MCD43A4.*hdf\"")))))
header <- readLines("~/earthdata")[1]

cmd <- sprintf("wget %s --header \"%s\" -O %s", xs, header, basename(xs))

library(furrr)
plan(multicore) #,workers = 16 )
future_map(cmd, system)
plan(sequential)

now VRT them into one big longlat raster and then merge into one big dask xarray

from osgeo import gdal
import rasterio
import xarray as xr
import glob
from osgeo import gdal
gdal.UseExceptions()
import time
import tempfile

hdf  = glob.glob("pangeo_hdf/*.hdf")

band1_src = [f"HDF4_EOS:EOS_GRID:\"{file}\":MOD_Grid_BRDF:Nadir_Reflectance_Band1" for file in hdf]
band2_src = [f"HDF4_EOS:EOS_GRID:\"{file}\":MOD_Grid_BRDF:Nadir_Reflectance_Band2" for file in hdf]

## we can capture the VRT text in memory here
band1 = gdal.BuildVRT("", band1_src)
band2 = gdal.BuildVRT("", band2_src)

opts = gdal.WarpOptions(outputBounds = [-180, -90, 180, 90], width = 86400, height = 43200, dstSRS = "EPSG:4326", format = "VRT")

wband1vrt = tempfile.NamedTemporaryFile(suffix = ".vrt", prefix = "band1_")
wband2vrt = tempfile.NamedTemporaryFile(suffix = ".vrt", prefix = "band2_")

## but we can't chain that VRT into this WarpedVRT, so we actually create files here
w1 = gdal.Warp(wband1vrt.name, band1.GetMetadata("xml:VRT")[0], options = opts)
w2 = gdal.Warp(wband2vrt.name, band2.GetMetadata("xml:VRT")[0], options = opts)

## destroy so that file gets flushed into existence
w1 = None
w2 = None

## literal virtual files
ds = xr.merge([xr.open_dataset(wband1vrt.name,  chunks = {}, engine = "rasterio").band_data.rename("Band1"), 
               xr.open_dataset(wband2vrt.name,  chunks = {}, engine = "rasterio").band_data.rename("Band2")])

time.strftime("%H:%M:%S")

## takes about 4 minutes on 48 cores
print(ds.Band1.mean().values)

time.strftime("%H:%M:%S")