AustralianAntarcticDivision / raadtools

Tools for Synoptic Environmental Spatial Data
http://australianantarcticdivision.github.io/raadtools/
24 stars 3 forks source link

notes on xarray/MDIM #145

Open mdsumner opened 1 month ago

mdsumner commented 1 month ago

here's the basics of how we could return an xarray object (sometimes you'll want the date from the fileset, sometimes it will be in the object already)

library(reticulate)
xarray <- import("xarray")
pandas <- import("pandas")

ftodate <- function(x) pandas$DatetimeIndex(list(
  format(as.Date(stringr::str_extract(basename(x), "[0-9]{8}"), "%Y%m%d"))
))

da <- function(x) {
  ds <- xarray$open_dataset(x)
  ds$expand_dims(time = ftodate(x))
}
files <- raadtools::read_par(returnfiles = TRUE)
ds <- xarray$concat(lapply(files$fullname[1:100], da), dim = "time")
<xarray.Dataset> Size: 15GB
Dimensions:  (time: 100, lat: 4320, lon: 8640, rgb: 3, eightbitcolor: 256)
Coordinates:
  * time     (time) datetime64[ns] 800B 2002-07-04 2002-07-12 ... 2004-08-28
  * lat      (lat) float32 17kB 89.98 89.94 89.9 89.85 ... -89.9 -89.94 -89.98
  * lon      (lon) float32 35kB -180.0 -179.9 -179.9 ... 179.9 179.9 180.0
Dimensions without coordinates: rgb, eightbitcolor
Data variables:
    par      (time, lat, lon) float32 15GB dask.array<chunksize=(1, 44, 87), meta=np.ndarray>
    palette  (time, rgb, eightbitcolor) uint8 77kB dask.array<chunksize=(1, 3, 256), meta=np.ndarray>
Attributes: (12/62)
    product_name:                     AQUA_MODIS.20020704_20020711.L3m.8D.PAR...
    instrument:                       MODIS
    title:                            MODISA Level-3 Standard Mapped Image
    project:                          Ocean Biology Processing Group (NASA/GS...
    platform:                         Aqua
    source:                           satellite observations from MODIS-Aqua
    ...                               ...
    cdm_data_type:                    grid
    keywords:                         Earth Science > Oceans > Ocean Optics >...
    keywords_vocabulary:              NASA Global Change Master Directory (GC...
    data_bins:                        20363532
    data_minimum:                     0.018797303
    data_maximum:                     66.28954
>

Don't yet know how to specify what the time is (interval or start or middle or whatever)

I think this could avoid the actual calls to open datasets by using kerchunk ...

mdsumner commented 1 month ago

next increment, here write write to Zarr and then open that with GDAL multdim, read at /10 and plot

library(reticulate)
xarray <- import("xarray")
pandas <- import("pandas")

ftodate <- function(x) pandas$DatetimeIndex(list(
  format(as.Date(stringr::str_extract(basename(x), "[0-9]{8}"), "%Y%m%d"))
))

da <- function(x) {
  ds <- xarray$open_dataset(x)
  ds$expand_dims(time = ftodate(x))
}
files <- raadtools::read_par(returnfiles = TRUE)
ds <- xarray$concat(lapply(files$fullname[1:10], da), dim = "time")

ds$to_zarr(store = "/tmp/par.zarr")

gdal <- import("osgeo.gdal")
gdal$UseExceptions()
ds <- gdal$OpenEx("/tmp/par.zarr", gdal$OF_MULTIDIM_RASTER)
rg <- ds$GetRootGroup()
rg$GetMDArrayNames()
# [1] "lat"     "lon"     "time"    "palette" "par"
par <- rg$OpenMDArray("par")

par$GetBlockSize()

par$GetDimensionCount()

lapply(par$GetDimensions(), function(.x) .x$GetSize())

b <- par$GetView("[:,:,1]")
b$GetBlockSize()
d <- b$GetDimensions()
lapply(d, function(.x) .x$GetSize())
bytes <- b$Read()
length(bytes)
##10 * 4320 * 2
#m <- matrix(readBin(bytes, "integer", n = 4320 * 10, size = 2), 10)
#ximage::ximage(m)

b <- par$GetView("[0:1,::10,::10]")
b$GetBlockSize()
d <- b$GetDimensions()
shape <- lapply(d, function(.x) .x$GetSize())
sbytes <- b$Read()
length(sbytes)
m <- matrix(readBin(sbytes, "integer", n = length(sbytes)/2, size = 2), shape[[2]] , byrow = TRUE)

ximage::ximage(m, c(-180, 180, -90, 90))
mdsumner commented 1 month ago

ReadAsArray() is giving me a honky error in the libs, so I'm unpacking the bytes above, but it should be as easy as this:

lon <- rg$OpenMDArray("lon")
lon$Read()  ## I don't know if we can get these from the View() defined above, probably
mdsumner commented 1 month ago

ah, we can do this (but the GeoTransform is not very useful ...)

cd <- b$AsClassicDataset(1L, 2L)
> cd$RasterXSize
[1] 432
> cd$RasterYSize
[1] 864
> cd$GetGeoTransform()
mdsumner commented 1 month ago

If we don't use a View, the geotransform is intact, but the coordinates are still not accessible from the MDArray.

dd <- par$AsClassicDataset(1L, 2L)
> dd$RasterYSize
[1] 8640
> dd$RasterXSize
[1] 4320
> dd$GetGeoTransform()
[[1]]
[1] 90

[[2]]
[1] -0.04166667

[[3]]
[1] 0

[[4]]
[1] -180

[[5]]
[1] 0

[[6]]
[1] 0.04166667

So, tighten this up and might be a few bug reports. This at least is a pretty nice way to permute dims (from multdim read to Classic)

mdsumner commented 1 month ago

Works fine on GDAL autotest examples, so we might have a report-worthy example (so long as my xarray write is up to scratch, do that all in python on same system).

f <- "autotest/gdrivers/data/zarr/array_dimensions.zarr"
> library(reticulate)
> gdal <- import("osgeo.gdal")
Would you like to create a default Python environment for the reticulate package? (Yes/no/cancel) no
> gdal$OpenEx(f, gdal$OF_MULTIDIM_RASTER)
/usr/local/lib/python3/dist-packages/osgeo/gdal.py:314: FutureWarning: Neither gdal.UseExceptions() nor gdal.DontUseExceptions() has been explicitly called. In GDAL 4.0, exceptions will be enabled by default.
  warnings.warn(
<osgeo.gdal.Dataset; proxy of <Swig Object of type 'GDALDatasetShadow *' at 0x15499e3a1ad0> >
> ds <- gdal$OpenEx(f, gdal$OF_MULTIDIM_RASTER)
> rg <- ds$GetRootGroup()
var <- rg$OpenMDArray("var")
> var$ReadAsArray()
      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
 [1,]  
mdsumner commented 1 month ago

works fine here, so maybe specifically a netcdf related thing

import xarray
ds = xarray.open_dataset("autotest/gdrivers/data/zarr/array_dimensions.zarr")
ds
<xarray.Dataset> Size: 7kB
Dimensions:  (lat: 20, lon: 40)
Coordinates:
  * lat      (lat) float64 160B 4.662e-310 0.0 ... 1.128e-310 1.128e-310
  * lon      (lon) float64 320B 4.662e-310 0.0 ... 1.128e-310 1.128e-310
Data variables:
    var      (lat, lon) float64 6kB ...
ds.to_zarr("../var.zarr")
<xarray.backends.zarr.ZarrStore object at 0x14c406def940>
from osgeo import gdal
gdal.UseExceptions()
ds = gdal.OpenEx("../var.zarr", gdal.OF_MULTIDIM_RASTER)
rg = ds.GetRootGroup()
var = rg.OpenMDArray("var")
var.ReadAsArray()
array([[0.00000e+000, 0.00000e+000, 0.00000e+000, 0.00000e+000,
mdsumner commented 1 month ago

works fine for these ones ... so I still have to try on the par above where I'm munging my own time coord

import xarray
unc =  ["https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/202202/oisst-avhrr-v02r01.20220218.nc#mode=bytes",
 "https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/202202/oisst-avhrr-v02r01.20220219.nc#mode=bytes"]
arrds  = [xarray.open_dataset(x, engine = "netcdf4", chunks = {}) for x in unc]
ds = xarray.concat(arrds, dim = "time")
ds
<xarray.Dataset> Size: 33MB
Dimensions:  (time: 2, zlev: 1, lat: 720, lon: 1440)
Coordinates:
  * time     (time) datetime64[ns] 16B 2022-02-18T12:00:00 2022-02-19T12:00:00
  * zlev     (zlev) float32 4B 0.0
  * lat      (lat) float32 3kB -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88
  * lon      (lon) float32 6kB 0.125 0.375 0.625 0.875 ... 359.4 359.6 359.9
Data variables:
    sst      (time, zlev, lat, lon) float32 8MB nan nan nan ... -1.8 -1.8 -1.8
    anom     (time, zlev, lat, lon) float32 8MB nan nan nan nan ... 0.0 0.0 0.0
    err      (time, zlev, lat, lon) float32 8MB nan nan nan nan ... 0.3 0.3 0.3
    ice      (time, zlev, lat, lon) float32 8MB nan nan nan ... 0.98 0.98 0.98
Attributes: (12/37)
    Conventions:                CF-1.6, ACDD-1.3
    title:                      NOAA/NCEI 1/4 Degree Daily Optimum Interpolat...
    references:                 Reynolds, et al.(2007) Daily High-Resolution-...
    source:                     ICOADS, NCEP_GTS, GSFC_ICE, NCEP_ICE, Pathfin...
    id:                         oisst-avhrr-v02r01.20220218.nc
    naming_authority:           gov.noaa.ncei
    ...                         ...
    time_coverage_start:        2022-02-18T00:00:00Z
    time_coverage_end:          2022-02-18T23:59:59Z
    metadata_link:              https://doi.org/10.25921/RE9P-PT57
    ncei_template_version:      NCEI_NetCDF_Grid_Template_v2.0
    comment:                    Data was converted from NetCDF-3 to NetCDF-4 ...
    sensor:                     Thermometer, AVHRR

ds.to_zarr("../oisst.zarr")

<xarray.backends.zarr.ZarrStore object at 0x14c466dc95c0>
from osgeo import gdal
gds = gdal.OpenEx("../oisst.zarr", gdal.OF_MULTIDIM_RASTER)
 rg = gds.GetRootGroup()
rg.GetMDArrayNames()
['lat', 'lon', 'time', 'zlev', 'anom', 'err', 'ice', 'sst']
sst = rg.OpenMDArray("sst")
a = sst.ReadAsArray()
a.shape
(2, 1, 720, 1440)
mdsumner commented 1 month ago

Here's something from the MDIM world, in python/reticulate for now

#f <- "/rdsi/PUBLIC/raad/data/www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810901.nc"

u <- "/vsicurl/https://github.com/mdsumner/fixoisst/raw/main/file.nc"
library(reticulate)
gdal <- import("osgeo.gdal")
gdal$UseExceptions()

drv <-  gdal$GetDriverByName("VRT")
ds <- drv$CreateMultiDimensional("myds")
rg <-ds$GetRootGroup()
None <- reticulate::py_none()
dim0 = rg$CreateDimension("time", None, None, 1L)
dim1 = rg$CreateDimension("zlev", None, None, 1L)
dim2 = rg$CreateDimension("lat", None, None, 360L)
dim3 = rg$CreateDimension("lon", None, None, 360L)

#https://gdal.org/api/python/mdim_api.html#osgeo.gdal.MDArray.GetResampled

ids <- gdal$OpenEx(u, gdal$OF_MULTIDIM_RASTER)
rg2 <- ids$GetRootGroup()
sst <- rg2$OpenMDArray("sst")

#GetResampled(MDArray self, int nDimensions, GDALRIOResampleAlg resample_alg, OSRSpatialReferenceShadow ** srs, char ** options=None)→ MDArray

osr <- import("osgeo.osr")
crs <- osr$SpatialReference()
crs$SetFromUserInput("+proj=laea +lon_0=147")
v <- sst$GetResampled(list(dim0, dim1, dim2, dim3), gdal$GRA_Bilinear, crs)
a <- v$Read()
ximage::ximage(matrix(readBin(a, "integer", size = 2, n = 360*360, signed = F), 360, byrow = TRUE))

v$AsClassicDataset(2L, 3L)$GetGeoTransform()
[[1]]
[1] 10223952

[[2]]
[1] -56784.64

[[3]]
[1] 0

[[4]]
[1] -12230967

[[5]]
[1] 0

[[6]]
[1] 44017.02

image

issues obviously with nodata and scaling and the usual antimeridian stuff, and what's with not having a target extent for the output grid ?? just that only Even has used this?? ... but this is promising

mdsumner commented 1 month ago

this works on pawsey, the readasarray problem is local to aceecostats, some mix of gdal lying around with my source builds

#wget https://github.com/mdsumner/fixoisst/raw/main/file.nc
library(reticulate)
gdal <- import("osgeo.gdal")
gdal$UseExceptions()
ds <- gdal$OpenEx("file.nc", gdal$OF_MULTIDIM_RASTER)
rg <- ds$GetRootGroup()
sst <- rg$OpenMDArray("sst")
gdal$VersionInfo()
#[1] "3090000"
bd <- gdal$ExtendedDataType$Create(gdal$GDT_UInt16)
d <- sst$ReadAsArray(buffer_datatype = bd)
[1] "3100000"
> range(d)
[1]    0 3240
mdsumner commented 1 month ago

and perhaps more interesting

gdal$SetConfigOption("AWS_NO_SIGN_REQUEST", "YES")
ds <- gdal$OpenEx("/vsis3/mur-sst/zarr", gdal$OF_MULTIDIM_RASTER)

(ds$GetRootGroup()$OpenMDArray("analysed_sst")$GetDimensions()[[1]])$GetSize()
[1] 6443
> (ds$GetRootGroup()$OpenMDArray("analysed_sst")$GetDimensions()[[2]])$GetSize()
[1] 17999
> (ds$GetRootGroup()$OpenMDArray("analysed_sst")$GetDimensions()[[3]])$GetSize()
[1] 36000

bd <- gdal$ExtendedDataType$Create(gdal$GDT_UInt16)
sst <- ds$GetRootGroup()$OpenMDArray("analysed_sst")
str(sst$ReadAsArray(buffer_datatype=bd, count = c(2L, 24L, 56L)))
 int [1:2, 1:24, 1:56] 0 0 0 0 0 0 0 0 0 0 ...
mdsumner commented 2 days ago

Here we use the 'https://..nc#mode=bytes' method for xarray/netcdf4, then slice with reticulate idioms

library(reticulate)
xarray <- import("xarray")
#ym:  198109
#ymd:  19810901
dates <- seq(as.Date("1981-09-01"), by =  "1 day", length.out = 10)
ym <- format(dates, "%Y%m")
ymd <- format(dates, "%Y%m%d")

pat <- "https://projects.pawsey.org.au/www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/%s/oisst-avhrr-v02r01.%s.nc#mode=bytes"

u <- sprintf(pat, ym, ymd)

## can't do this with chunks = list()
library(furrr)
plan(multisession)
arrds <- future_map(u, xarray$open_dataset, engine = "netcdf4", chunks = py_eval("{}"))
plan(sequential)

x <- xarray$concat(arrds, dim = "time")

#py_run_string("result = r.x.sel(time = slice('1981-09-01', '1981-09-04'))")
#py$result

## OR   https://github.com/rstudio/reticulate/issues/122
py_slice <- import_builtins()$slice
x$sel(time = py_slice('1981-09-01', '1981-09-04'), lon = py_slice(130, 150))