DOI-USGS / geoknife

R tools for geo-web processing of gridded data via the Geo Data Portal. geoknife slices up gridded data according to overlap with irregular features, such as watersheds, lakes, points, etc.
https://doi-usgs.github.io/geoknife/
Other
69 stars 23 forks source link

Long term: geoknife future with python version of gdp and local processing of zarr data. #427

Open dblodgett-usgs opened 1 year ago

dblodgett-usgs commented 1 year ago

RNetCDF now functions with zarr and the migration to a new version of the GDP processing service is under way. It's time to start planning what this transition means for geoknife.

The initial version of this issue is just dropping in some example zarr data access showing proof of function -- as the issue progresses, I want to work up a strategy for migrating sbtools to work against NetCDF-binary or zarr data as well as using the new gdptools processing service that is in development right now.

Super basic netcdf-java test data:

u <- "https://github.com/Unidata/netcdf-java/raw/799f4ab239f4ea70da0c2a230c28f5594e4fe54c/cdm/zarr/src/test/data/zarr_test_data.zip"
zfile <- file.path(tempdir(), basename(u))
download.file(u, zfile, mode = "wb")

zdir <- gsub("zip", "zarr", zfile)

zip::unzip(zfile, exdir = zdir)

setwd(dirname(zfile))

zdir <- paste0("file://", basename(zdir), "/#mode=zarr,file")

cat(system2("C:/Program Files/netCDF 4.9.0/bin/ncdump", c("-sh", zdir), stdout = TRUE))
#> netcdf \#mode\=zarr\,file { dimensions:  _zdim_20 = 20 ;  // global attributes:      :_SuperblockVersion = 0 ;       :_IsNetcdf4 = 1 ;       :_Format = "netCDF-4" ;  group: group_with_attrs {   variables:     int F_order_array(_zdim_20, _zdim_20) ;         F_order_array:bar = "apples" ;          F_order_array:baz = 1b, 2b, 3b, 4b ;        F_order_array:foo = 42b ;           F_order_array:_Storage = "chunked" ;        F_order_array:_ChunkSizes = 4, 5 ;          F_order_array:_Endianness = "little" ;      short nested(_zdim_20, _zdim_20) ;          nested:_Storage = "chunked" ;           nested:_ChunkSizes = 10, 10 ;           nested:_Endianness = "little" ;     float partial_fill1(_zdim_20, _zdim_20) ;           partial_fill1:_Storage = "chunked" ;        partial_fill1:_ChunkSizes = 10, 10 ;        partial_fill1:_Endianness = "little" ;      float partial_fill2(_zdim_20, _zdim_20) ;           partial_fill2:_Storage = "chunked" ;        partial_fill2:_ChunkSizes = 10, 10 ;        partial_fill2:_Endianness = "little" ;      float uninitialized(_zdim_20, _zdim_20) ;           uninitialized:_Storage = "chunked" ;        uninitialized:_ChunkSizes = 10, 10 ;        uninitialized:_Endianness = "little" ;    // group attributes:          :group_attr = "foo" ;   } // group group_with_attrs  group: group_with_dims {   variables:      int var1D(_zdim_20) ;           var1D:_Storage = "chunked" ;        var1D:_ChunkSizes = 5 ;         var1D:_Endianness = "little" ;      int var2D(_zdim_20, _zdim_20) ;         var2D:_Storage = "chunked" ;        var2D:_ChunkSizes = 5, 5 ;          var2D:_Endianness = "little" ;      int var3D(_zdim_20, _zdim_20, _zdim_20) ;           var3D:_Storage = "chunked" ;        var3D:_ChunkSizes = 5, 5, 5 ;           var3D:_Endianness = "little" ;      int var4D(_zdim_20, _zdim_20, _zdim_20, _zdim_20) ;         var4D:_Storage = "chunked" ;        var4D:_ChunkSizes = 5, 5, 5, 5 ;        var4D:_Endianness = "little" ;    // group attributes:   } // group group_with_dims }

nc <- RNetCDF::open.nc(zdir)

RNetCDF::file.inq.nc(nc)
#> $ndims
#> [1] 1
#> 
#> $nvars
#> [1] 0
#> 
#> $ngatts
#> [1] 0
#> 
#> $unlimdimid
#> [1] NA
#> 
#> $format
#> [1] "netcdf4"
#> 
#> $libvers
#> [1] "4.9.0 of Jun 24 2022 09:44:47 $"

Created on 2022-12-05 with reprex v2.0.2

Another test -- I've had a little flakeyness with specifying paths, but things seem pretty close.

options(timeout=600)

u <- "https://cida.usgs.gov/thredds/fileServer/demo/thredds/gridmet/gridmet/pr_2022.nc"

f <- basename(u)

ncfile <- file.path(tempdir(), f)

download.file(u, ncfile, mode = "wb")

setwd(dirname(ncfile))

z <- gsub("nc", "zarr", f)

unlink(z, recursive = TRUE)

(z_path <- paste0("file://", z, "/#mode=zarr,file"))
#> [1] "file://pr_2022.zarr/#mode=zarr,file"

system2("C:/Program Files/netCDF 4.9.0/bin/nccopy", c(f, z_path), stdout = TRUE)
#> character(0)

cat(system2("C:/Program Files/netCDF 4.9.0/bin/ncdump.exe", c("-h", z_path), stdout = TRUE), sep = "\n")
#> netcdf \#mode\=zarr\,file {
#> dimensions:
#>  crs = 1 ;
#>  day = 155 ;
#>  lat = 585 ;
#>  lon = 1386 ;
#> variables:
#>  ushort crs(crs) ;
#>      crs:grid_mapping_name = "latitude_longitude" ;
#>      crs:longitude_of_prime_meridian = 0b ;
#>      crs:semi_major_axis = 6378140. ;
#>      crs:long_name = "WGS 84" ;
#>      crs:inverse_flattening = 298.257 ;
#>      crs:GeoTransform = "-124.7666666333333 0.041666666666666 0  49.400000000000000 -0.041666666666666" ;
#>      crs:spatial_ref = "GEOGCS[\"WGS 84\",DATUM[\"WGS_1984\",SPHEROID[\"WGS 84\",6378137,298.257223563,AUTHORITY[\"EPSG\",\"7030\"]],AUTHORITY[\"EPSG\",\"6326\"]],PRIMEM[\"Greenwich\",0,AUTHORITY[\"EPSG\",\"8901\"]],UNIT[\"degree\",0.0174532925199433,AUTHORITY[\"EPSG\",\"9122\"]],AUTHORITY[\"EPSG\",\"4326\"]]" ;
#>  double day(day) ;
#>      day:description = "days since 1900-01-01" ;
#>      day:units = "days since 1900-01-01 00:00:00" ;
#>      day:long_name = "time" ;
#>      day:standard_name = "time" ;
#>      day:calendar = "gregorian" ;
#>  double lat(lat) ;
#>      lat:units = "degrees_north" ;
#>      lat:description = "latitude" ;
#>      lat:long_name = "latitude" ;
#>      lat:standard_name = "latitude" ;
#>      lat:axis = "Y" ;
#>  double lon(lon) ;
#>      lon:units = "degrees_east" ;
#>      lon:description = "longitude" ;
#>      lon:long_name = "longitude" ;
#>      lon:standard_name = "longitude" ;
#>      lon:axis = "X" ;
#>  ushort precipitation_amount(day, lat, lon) ;
#>      precipitation_amount:_FillValue = 32767s ;
#>      precipitation_amount:units = "mm" ;
#>      precipitation_amount:description = "Daily Accumulated Precipitation" ;
#>      precipitation_amount:long_name = "pr" ;
#>      precipitation_amount:standard_name = "pr" ;
#>      precipitation_amount:missing_value = 32767s ;
#>      precipitation_amount:dimensions = "lon lat time" ;
#>      precipitation_amount:grid_mapping = "crs" ;
#>      precipitation_amount:coordinate_system = "WGS84,EPSG:4326" ;
#>      precipitation_amount:scale_factor = 0.1 ;
#>      precipitation_amount:add_offset = 0b ;
#>      precipitation_amount:coordinates = "lon lat time" ;
#>      precipitation_amount:_Unsigned = "true" ;
#> 
#> // global attributes:
#>      :geospatial_bounds_crs = "EPSG:4326" ;
#>      :Conventions = "CF-1.6" ;
#>      :geospatial_bounds = "POLYGON((-124.7666666333333 49.400000000000000, -124.7666666333333 25.066666666666666, -67.058333300000015 25.066666666666666, -67.058333300000015 49.400000000000000, -124.7666666333333 49.400000000000000))" ;
#>      :geospatial_lat_min = "25.066666666666666" ;
#>      :geospatial_lat_max = "49.40000000000000" ;
#>      :geospatial_lon_min = "-124.7666666333333" ;
#>      :geospatial_lon_max = "-67.058333300000015" ;
#>      :geospatial_lon_resolution = "0.041666666666666" ;
#>      :geospatial_lat_resolution = "0.041666666666666" ;
#>      :geospatial_lat_units = "decimal_degrees north" ;
#>      :geospatial_lon_units = "decimal_degrees east" ;
#>      :coordinate_system = "EPSG:4326" ;
#>      :author = "John Abatzoglou - University of California Merced, jabatzoglou@ucmerced.edu" ;
#>      :date = "05 June 2022" ;
#>      :note1 = "The projection information for this file is: GCS WGS 1984." ;
#>      :note2 = "Citation: Abatzoglou, J.T., 2013, Development of gridded surface meteorological data for ecological applications and modeling, International Journal of Climatology, DOI: 10.1002/joc.3413" ;
#>      :last_permanent_slice = "95" ;
#>      :last_early_slice = "155" ;
#>      :last_provisional_slice = "149" ;
#>      :note3 = "Data in slices after last_permanent_slice (1-based) are considered provisional and subject to change with subsequent updates" ;
#>      :note4 = "Data in slices after last_provisional_slice (1-based) are considered early and subject to change with subsequent updates" ;
#>      :note5 = "Days correspond approximately to calendar days ending at midnight, Mountain Standard Time (7 UTC the next calendar day)" ;
#> }

nc <- RNetCDF::open.nc(f)

RNetCDF::file.inq.nc(nc)
#> $ndims
#> [1] 4
#> 
#> $nvars
#> [1] 5
#> 
#> $ngatts
#> [1] 22
#> 
#> $unlimdimid
#> [1] NA
#> 
#> $format
#> [1] "netcdf4"
#> 
#> $libvers
#> [1] "4.9.0 of Jun 24 2022 09:44:47 $"

RNetCDF::close.nc(nc)

nc <- RNetCDF::open.nc(z_path)

RNetCDF::file.inq.nc(nc)
#> $ndims
#> [1] 4
#> 
#> $nvars
#> [1] 5
#> 
#> $ngatts
#> [1] 22
#> 
#> $unlimdimid
#> [1] NA
#> 
#> $format
#> [1] "netcdf4"
#> 
#> $libvers
#> [1] "4.9.0 of Jun 24 2022 09:44:47 $"

RNetCDF::close.nc(nc)

ncmeta::nc_meta(z_path)
#> $dimension
#> # A tibble: 4 × 5
#>      id name  length unlim coord_dim
#>   <int> <chr>  <dbl> <lgl> <lgl>    
#> 1     0 crs        1 FALSE TRUE     
#> 2     1 day      155 FALSE TRUE     
#> 3     2 lat      585 FALSE TRUE     
#> 4     3 lon     1386 FALSE TRUE     
#> 
#> $variable
#> # A tibble: 5 × 6
#>      id name                 type      ndims natts dim_coord
#>   <int> <chr>                <chr>     <int> <int> <lgl>    
#> 1     0 crs                  NC_USHORT     1     7 TRUE     
#> 2     1 day                  NC_DOUBLE     1     5 TRUE     
#> 3     2 lat                  NC_DOUBLE     1     5 TRUE     
#> 4     3 lon                  NC_DOUBLE     1     5 TRUE     
#> 5     4 precipitation_amount NC_USHORT     3    13 FALSE    
#> 
#> $attribute
#> # A tibble: 57 × 4
#>       id name                        variable value       
#>    <int> <chr>                       <chr>    <named list>
#>  1     0 grid_mapping_name           crs      <chr [1]>   
#>  2     1 longitude_of_prime_meridian crs      <dbl [1]>   
#>  3     2 semi_major_axis             crs      <dbl [1]>   
#>  4     3 long_name                   crs      <chr [1]>   
#>  5     4 inverse_flattening          crs      <dbl [1]>   
#>  6     5 GeoTransform                crs      <chr [1]>   
#>  7     6 spatial_ref                 crs      <chr [1]>   
#>  8     0 description                 day      <chr [1]>   
#>  9     1 units                       day      <chr [1]>   
#> 10     2 long_name                   day      <chr [1]>   
#> # … with 47 more rows
#> 
#> $axis
#> # A tibble: 7 × 3
#>    axis variable             dimension
#>   <int> <chr>                    <int>
#> 1     1 crs                          0
#> 2     2 day                          1
#> 3     3 lat                          2
#> 4     4 lon                          3
#> 5     5 precipitation_amount         3
#> 6     6 precipitation_amount         2
#> 7     7 precipitation_amount         1
#> 
#> $grid
#> # A tibble: 5 × 4
#>   grid     ndims variables        nvars
#>   <chr>    <int> <list>           <int>
#> 1 D3,D2,D1     3 <tibble [1 × 1]>     1
#> 2 D0           1 <tibble [1 × 1]>     1
#> 3 D1           1 <tibble [1 × 1]>     1
#> 4 D2           1 <tibble [1 × 1]>     1
#> 5 D3           1 <tibble [1 × 1]>     1
#> 
#> $source
#> # A tibble: 1 × 2
#>   access              source                             
#>   <dttm>              <chr>                              
#> 1 2022-12-05 09:28:14 file://pr_2022.zarr/#mode=zarr,file
#> 
#> attr(,"class")
#> [1] "ncmeta"

Created on 2022-12-05 with reprex v2.0.2

amsnyder commented 1 year ago

@lekoenig this may be of interest to you

dblodgett-usgs commented 1 year ago

@mikejohnson51 -- would you consider contributing some of the core components of opendap.catalog and climateR to geoknife? In thinking about a v2 or geoknife, I want it to work with local or remote data as well as R geoprocessing or remote web service calls to the new gdp. I feel like the components of your work could fit here and we could get them onto cran through geoknife?