Open mdsumner opened 1 month ago
Next increment: here we write to Zarr and then open that with GDAL multidim, read at /10 and plot.
## Stack single-time NetCDF files into one xarray Dataset and write it as Zarr.
library(reticulate)
xarray <- import("xarray")
pandas <- import("pandas")

## One-element pandas DatetimeIndex from the yyyymmdd stamp in a file name.
ftodate <- function(path) {
  stamp <- stringr::str_extract(basename(path), "[0-9]{8}")
  pandas$DatetimeIndex(list(format(as.Date(stamp, "%Y%m%d"))))
}

## Open a single file and prepend a length-1 "time" dimension from its name.
da <- function(path) {
  xarray$open_dataset(path)$expand_dims(time = ftodate(path))
}

files <- raadtools::read_par(returnfiles = TRUE)
## Concatenate the first ten files along time, then persist to a local Zarr store.
ds <- xarray$concat(lapply(files$fullname[1:10], da), dim = "time")
ds$to_zarr(store = "/tmp/par.zarr")
## Open the Zarr store with GDAL's multidimensional raster API and read a
## decimated view of the "par" variable, unpacking bytes manually.
gdal <- import("osgeo.gdal")
gdal$UseExceptions()
ds <- gdal$OpenEx("/tmp/par.zarr", gdal$OF_MULTIDIM_RASTER)
rg <- ds$GetRootGroup()
rg$GetMDArrayNames()
# [1] "lat" "lon" "time" "palette" "par"
par <- rg$OpenMDArray("par")
par$GetBlockSize()
par$GetDimensionCount()
## Size of each dimension of the full array.
lapply(par$GetDimensions(), function(.x) .x$GetSize())
## Numpy-style slice string: a view of a single index on the third dimension.
b <- par$GetView("[:,:,1]")
b$GetBlockSize()
d <- b$GetDimensions()
lapply(d, function(.x) .x$GetSize())
## Raw bytes of the view — unpacked by hand because ReadAsArray() errors
## in this environment (see discussion below).
bytes <- b$Read()
length(bytes)
##10 * 4320 * 2
#m <- matrix(readBin(bytes, "integer", n = 4320 * 10, size = 2), 10)
#ximage::ximage(m)
## First time step, decimated by 10 in both spatial dimensions.
b <- par$GetView("[0:1,::10,::10]")
b$GetBlockSize()
d <- b$GetDimensions()
shape <- lapply(d, function(.x) .x$GetSize())
sbytes <- b$Read()
length(sbytes)
## 2-byte integers; assumes GDAL returns row-major order, hence byrow = TRUE.
m <- matrix(readBin(sbytes, "integer", n = length(sbytes)/2, size = 2), shape[[2]] , byrow = TRUE)
## Plot assuming a global longlat extent — TODO confirm against lon/lat arrays.
ximage::ximage(m, c(-180, 180, -90, 90))
ReadAsArray() is giving me an obscure error down in the libraries, so I'm unpacking the bytes above — but it should be as easy as this:
## Coordinate arrays live alongside the data variable in the root group.
lon <- rg$OpenMDArray("lon")
lon$Read() ## unclear whether coordinates can be obtained from the View() defined above — TODO check
ah, we can do this (but the GeoTransform is not very useful ...)
cd <- b$AsClassicDataset(1L, 2L)
> cd$RasterXSize
[1] 432
> cd$RasterYSize
[1] 864
> cd$GetGeoTransform()
If we don't use a View, the geotransform is intact, but the coordinates are still not accessible from the MDArray.
dd <- par$AsClassicDataset(1L, 2L)
> dd$RasterYSize
[1] 8640
> dd$RasterXSize
[1] 4320
> dd$GetGeoTransform()
[[1]]
[1] 90
[[2]]
[1] -0.04166667
[[3]]
[1] 0
[[4]]
[1] -180
[[5]]
[1] 0
[[6]]
[1] 0.04166667
So, tighten this up and might be a few bug reports. This at least is a pretty nice way to permute dims (from multdim read to Classic)
Works fine on GDAL autotest examples, so we might have a report-worthy example (so long as my xarray write is up to scratch, do that all in python on same system).
## GDAL autotest sample: a Zarr array with named dimension arrays.
f <- "autotest/gdrivers/data/zarr/array_dimensions.zarr"
> library(reticulate)
> gdal <- import("osgeo.gdal")
Would you like to create a default Python environment for the reticulate package? (Yes/no/cancel) no
> gdal$OpenEx(f, gdal$OF_MULTIDIM_RASTER)
/usr/local/lib/python3/dist-packages/osgeo/gdal.py:314: FutureWarning: Neither gdal.UseExceptions() nor gdal.DontUseExceptions() has been explicitly called. In GDAL 4.0, exceptions will be enabled by default.
warnings.warn(
<osgeo.gdal.Dataset; proxy of <Swig Object of type 'GDALDatasetShadow *' at 0x15499e3a1ad0> >
> ds <- gdal$OpenEx(f, gdal$OF_MULTIDIM_RASTER)
> rg <- ds$GetRootGroup()
var <- rg$OpenMDArray("var")
> var$ReadAsArray()
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
[1,]
Works fine here, so maybe it's specifically a NetCDF-related thing.
import xarray
ds = xarray.open_dataset("autotest/gdrivers/data/zarr/array_dimensions.zarr")
ds
<xarray.Dataset> Size: 7kB
Dimensions: (lat: 20, lon: 40)
Coordinates:
* lat (lat) float64 160B 4.662e-310 0.0 ... 1.128e-310 1.128e-310
* lon (lon) float64 320B 4.662e-310 0.0 ... 1.128e-310 1.128e-310
Data variables:
var (lat, lon) float64 6kB ...
ds.to_zarr("../var.zarr")
<xarray.backends.zarr.ZarrStore object at 0x14c406def940>
from osgeo import gdal
gdal.UseExceptions()
ds = gdal.OpenEx("../var.zarr", gdal.OF_MULTIDIM_RASTER)
rg = ds.GetRootGroup()
var = rg.OpenMDArray("var")
var.ReadAsArray()
array([[0.00000e+000, 0.00000e+000, 0.00000e+000, 0.00000e+000,
Works fine for these ones ... so I still have to try it on the par data above, where I'm munging my own time coordinate.
import xarray
unc = ["https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/202202/oisst-avhrr-v02r01.20220218.nc#mode=bytes",
"https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/202202/oisst-avhrr-v02r01.20220219.nc#mode=bytes"]
arrds = [xarray.open_dataset(x, engine = "netcdf4", chunks = {}) for x in unc]
ds = xarray.concat(arrds, dim = "time")
ds
<xarray.Dataset> Size: 33MB
Dimensions: (time: 2, zlev: 1, lat: 720, lon: 1440)
Coordinates:
* time (time) datetime64[ns] 16B 2022-02-18T12:00:00 2022-02-19T12:00:00
* zlev (zlev) float32 4B 0.0
* lat (lat) float32 3kB -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88
* lon (lon) float32 6kB 0.125 0.375 0.625 0.875 ... 359.4 359.6 359.9
Data variables:
sst (time, zlev, lat, lon) float32 8MB nan nan nan ... -1.8 -1.8 -1.8
anom (time, zlev, lat, lon) float32 8MB nan nan nan nan ... 0.0 0.0 0.0
err (time, zlev, lat, lon) float32 8MB nan nan nan nan ... 0.3 0.3 0.3
ice (time, zlev, lat, lon) float32 8MB nan nan nan ... 0.98 0.98 0.98
Attributes: (12/37)
Conventions: CF-1.6, ACDD-1.3
title: NOAA/NCEI 1/4 Degree Daily Optimum Interpolat...
references: Reynolds, et al.(2007) Daily High-Resolution-...
source: ICOADS, NCEP_GTS, GSFC_ICE, NCEP_ICE, Pathfin...
id: oisst-avhrr-v02r01.20220218.nc
naming_authority: gov.noaa.ncei
... ...
time_coverage_start: 2022-02-18T00:00:00Z
time_coverage_end: 2022-02-18T23:59:59Z
metadata_link: https://doi.org/10.25921/RE9P-PT57
ncei_template_version: NCEI_NetCDF_Grid_Template_v2.0
comment: Data was converted from NetCDF-3 to NetCDF-4 ...
sensor: Thermometer, AVHRR
ds.to_zarr("../oisst.zarr")
<xarray.backends.zarr.ZarrStore object at 0x14c466dc95c0>
from osgeo import gdal
gds = gdal.OpenEx("../oisst.zarr", gdal.OF_MULTIDIM_RASTER)
rg = gds.GetRootGroup()
rg.GetMDArrayNames()
['lat', 'lon', 'time', 'zlev', 'anom', 'err', 'ice', 'sst']
sst = rg.OpenMDArray("sst")
a = sst.ReadAsArray()
a.shape
(2, 1, 720, 1440)
Here's something from the MDIM world, in python/reticulate for now
## Resample the OISST "sst" array onto a Lambert azimuthal equal-area grid
## via GDALMDArray::GetResampled(), driven through reticulate.
## Fixes: `<-` for assignment (was `=` on the CreateDimension lines),
## `signed = FALSE` (was `F`), consistent spacing.
#f <- "/rdsi/PUBLIC/raad/data/www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810901.nc"
u <- "/vsicurl/https://github.com/mdsumner/fixoisst/raw/main/file.nc"
library(reticulate)
gdal <- import("osgeo.gdal")
gdal$UseExceptions()

## Target dimensions on an in-memory multidimensional VRT dataset.
drv <- gdal$GetDriverByName("VRT")
ds <- drv$CreateMultiDimensional("myds")
rg <- ds$GetRootGroup()
None <- reticulate::py_none()
dim0 <- rg$CreateDimension("time", None, None, 1L)
dim1 <- rg$CreateDimension("zlev", None, None, 1L)
dim2 <- rg$CreateDimension("lat", None, None, 360L)
dim3 <- rg$CreateDimension("lon", None, None, 360L)
#https://gdal.org/api/python/mdim_api.html#osgeo.gdal.MDArray.GetResampled
ids <- gdal$OpenEx(u, gdal$OF_MULTIDIM_RASTER)
rg2 <- ids$GetRootGroup()
sst <- rg2$OpenMDArray("sst")
#GetResampled(MDArray self, int nDimensions, GDALRIOResampleAlg resample_alg, OSRSpatialReferenceShadow ** srs, char ** options=None)→ MDArray
osr <- import("osgeo.osr")
crs <- osr$SpatialReference()
crs$SetFromUserInput("+proj=laea +lon_0=147")
v <- sst$GetResampled(list(dim0, dim1, dim2, dim3), gdal$GRA_Bilinear, crs)
## Unpack the raw bytes (unsigned 2-byte integers) and plot row-major.
a <- v$Read()
ximage::ximage(matrix(readBin(a, "integer", size = 2, n = 360*360, signed = FALSE), 360, byrow = TRUE))
## Geotransform of the resampled grid, via the classic (2D) dataset view.
v$AsClassicDataset(2L, 3L)$GetGeoTransform()
[[1]]
[1] 10223952
[[2]]
[1] -56784.64
[[3]]
[1] 0
[[4]]
[1] -12230967
[[5]]
[1] 0
[[6]]
[1] 44017.02
There are obvious issues with nodata, scaling, and the usual antimeridian stuff — and why is there no way to set a target extent for the output grid? Perhaps only Even has used this so far ... but this is promising.
This works on pawsey; the ReadAsArray problem is local to aceecostats — some mix of GDAL libraries lying around from my source builds.
## Minimal reproduction: read "sst" with an explicit UInt16 buffer type.
#wget https://github.com/mdsumner/fixoisst/raw/main/file.nc
library(reticulate)
gdal <- import("osgeo.gdal")
gdal$UseExceptions()
ds <- gdal$OpenEx("file.nc", gdal$OF_MULTIDIM_RASTER)
rg <- ds$GetRootGroup()
sst <- rg$OpenMDArray("sst")
gdal$VersionInfo()
#[1] "3090000"
## Request the read in UInt16 rather than the array's native data type.
bd <- gdal$ExtendedDataType$Create(gdal$GDT_UInt16)
d <- sst$ReadAsArray(buffer_datatype = bd)
[1] "3100000"
> range(d)
[1] 0 3240
and perhaps more interesting
## Anonymous S3 access to the MUR SST Zarr store; query dimension sizes only.
gdal$SetConfigOption("AWS_NO_SIGN_REQUEST", "YES")
ds <- gdal$OpenEx("/vsis3/mur-sst/zarr", gdal$OF_MULTIDIM_RASTER)
(ds$GetRootGroup()$OpenMDArray("analysed_sst")$GetDimensions()[[1]])$GetSize()
[1] 6443
> (ds$GetRootGroup()$OpenMDArray("analysed_sst")$GetDimensions()[[2]])$GetSize()
[1] 17999
> (ds$GetRootGroup()$OpenMDArray("analysed_sst")$GetDimensions()[[3]])$GetSize()
[1] 36000
## Read a small 2 x 24 x 56 hyperslab as UInt16 (start presumably defaults
## to the array origin when only `count` is given — TODO confirm).
bd <- gdal$ExtendedDataType$Create(gdal$GDT_UInt16)
sst <- ds$GetRootGroup()$OpenMDArray("analysed_sst")
str(sst$ReadAsArray(buffer_datatype=bd, count = c(2L, 24L, 56L)))
int [1:2, 1:24, 1:56] 0 0 0 0 0 0 0 0 0 0 ...
Here we use the 'https://..nc#mode=bytes' method for xarray/netcdf4, then slice with reticulate idioms
## Open ten daily OISST files over HTTP (netCDF4 "#mode=bytes" trick) in
## parallel with furrr, concatenate along time, then slice via Python idioms.
library(reticulate)
xarray <- import("xarray")
#ym: 198109
#ymd: 19810901
day_seq <- seq(as.Date("1981-09-01"), by = "1 day", length.out = 10)
ym <- format(day_seq, "%Y%m")
ymd <- format(day_seq, "%Y%m%d")
pat <- "https://projects.pawsey.org.au/www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/%s/oisst-avhrr-v02r01.%s.nc#mode=bytes"
urls <- sprintf(pat, ym, ymd)
## can't do this with chunks = list()
library(furrr)
plan(multisession)
## An empty Python dict ({}) is required for `chunks`; an R list() won't do.
per_file <- future_map(urls, xarray$open_dataset, engine = "netcdf4", chunks = py_eval("{}"))
plan(sequential)
combined <- xarray$concat(per_file, dim = "time")
#py_run_string("result = r.combined.sel(time = slice('1981-09-01', '1981-09-04'))")
#py$result
## OR https://github.com/rstudio/reticulate/issues/122
py_slice <- import_builtins()$slice
combined$sel(time = py_slice('1981-09-01', '1981-09-04'), lon = py_slice(130, 150))
Here's the basics of how we could return an xarray object (sometimes you'll want the date from the file set, sometimes it will already be in the object).
Don't yet know how to specify what the time is (interval or start or middle or whatever)
I think this could avoid the actual calls to open datasets by using kerchunk ...