mdsumner / ghrsst.coop

0 stars 0 forks source link

drafting the osgeo.gdal pathway #6

Open mdsumner opened 4 months ago

mdsumner commented 4 months ago

keen to compare this with the xarray/odc way

I may have mistakes I'm just going to keep iterating until I'm confident.

may need to update the missing value

from datetime import datetime
from os import path
from osgeo import gdal

gdal.UseExceptions()

## I just put this file in my home
gdal.SetConfigOption("GDAL_HTTP_HEADER_FILE", f"{path.expanduser('~/')}earthdata")

## full from-url version
filename = "https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1/20240227090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc"
sds = "analysed_sst" 
dsn = f"vrt://NetCDF:\"/vsicurl/{filename}\":{sds}?a_srs=OGC:CRS84&a_ullr=-180,89.995,180,-89.995&a_scale=0.001&a_offset=25"

## for a quick local test
#filename = "/scratch/pawsey0973/mdsumner/20240227090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc"
#dsn = f"vrt://NetCDF:\"{filename}\":{sds}?a_srs=OGC:CRS84&a_ullr=-180,89.995,180,-89.995&a_scale=0.001&a_offset=25"

ds = gdal.Open(dsn)

ds.SetMetadata({})  ## kill all the global meta, because we only have one var per file

## modify metadata at band level (only for analysed_sst, and error, and anom I guess)
b = ds.GetRasterBand(1)
b.SetMetadataItem("units", "celsius")
b.SetMetadataItem("add_offset", "25")
b.SetMetadataItem("scale_factor", "0.001")

## RESAMPLING is for the overviews
opts = gdal.TranslateOptions(format = "COG", 
        creationOptions = [ "BLOCKSIZE=1024",  "COMPRESS=ZSTD", "PREDICTOR=STANDARD", 
        "RESAMPLING=AVERAGE", "SPARSE_OK=YES"])

destName = path.join("/tmp/", path.basename(filename).replace(".nc", f".{sds}.tif"))

## remember ice, sst, mask, error have different types and offset/scale so a_scale/a_offset needs to be a special case
gdal.Translate(destName, ds, format = "COG", outputType = ds.GetRasterBand(1).DataType, options = opts)

now prove that unscale works (this fails if I add "&ovr=5" so I think that's a GDAL bug which I will take aside

gdalinfo "vrt:///tmp/20240227090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.analysed_sst.tif?unscale=true&ot=Float32" -stats

Also I think unscale&ot should somehow change the missing value report here (or is that just harmless because we changed to float??)

Driver: VRT/Virtual Raster
Files: /tmp/20240227090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.analysed_sst.tif
Size is 36000, 17999
Coordinate System is:
GEOGCRS["WGS 84",
    ENSEMBLE["World Geodetic System 1984 ensemble",
        MEMBER["World Geodetic System 1984 (Transit)"],
        MEMBER["World Geodetic System 1984 (G730)"],
        MEMBER["World Geodetic System 1984 (G873)"],
        MEMBER["World Geodetic System 1984 (G1150)"],
        MEMBER["World Geodetic System 1984 (G1674)"],
        MEMBER["World Geodetic System 1984 (G1762)"],
        MEMBER["World Geodetic System 1984 (G2139)"],
        ELLIPSOID["WGS 84",6378137,298.257223563,
            LENGTHUNIT["metre",1]],
        ENSEMBLEACCURACY[2.0]],
    PRIMEM["Greenwich",0,
        ANGLEUNIT["degree",0.0174532925199433]],
    CS[ellipsoidal,2],
        AXIS["geodetic latitude (Lat)",north,
            ORDER[1],
            ANGLEUNIT["degree",0.0174532925199433]],
        AXIS["geodetic longitude (Lon)",east,
            ORDER[2],
            ANGLEUNIT["degree",0.0174532925199433]],
    USAGE[
        SCOPE["Horizontal component of 3D system."],
        AREA["World."],
        BBOX[-90,-180,90,180]],
    ID["EPSG",4326]]
Data axis to CRS axis mapping: 2,1
Origin = (-180.000000000000000,89.995000000000005)
Pixel Size = (0.010000000000000,-0.010000000000000)
Metadata:
  AREA_OR_POINT=Area
Image Structure Metadata:
  COMPRESSION=ZSTD
  INTERLEAVE=BAND
Corner Coordinates:
Upper Left  (-180.0000000,  89.9950000) (180d 0' 0.00"W, 89d59'42.00"N)
Lower Left  (-180.0000000, -89.9950000) (180d 0' 0.00"W, 89d59'42.00"S)
Upper Right ( 180.0000000,  89.9950000) (180d 0' 0.00"E, 89d59'42.00"N)
Lower Right ( 180.0000000, -89.9950000) (180d 0' 0.00"E, 89d59'42.00"S)
Center      (   0.0000000,   0.0000000) (  0d 0' 0.01"E,  0d 0' 0.01"N)
Band 1 Block=1024x1024 Type=Float32, ColorInterp=Gray
  Minimum=-1.800, Maximum=32.206, Mean=14.268, StdDev=11.932
  NoData Value=-32768
  Overviews: 18000x8999, 9000x4499, 4500x2249, 2250x1124, 1125x562, 562x281
  Metadata:
    add_offset=25
    comment=Interim near-real-time (nrt) version using Multi-Resolution Variational Analysis (MRVA) method for interpolation; to be replaced by Final version
    coordinates=lon lat
    long_name=analysed sea surface temperature
    NETCDF_DIM_time=1361869200
    NETCDF_VARNAME=analysed_sst
    scale_factor=0.001
    source=MODIS_T-JPL, MODIS_A-JPL, AMSR2-REMSS, AVHRRMTB_G-NAVO, iQUAM-NOAA/NESDIS, Ice_Conc-OSISAF
    standard_name=sea_surface_foundation_temperature
    STATISTICS_MAXIMUM=32.206001281738
    STATISTICS_MEAN=14.267902365861
    STATISTICS_MINIMUM=-1.7999999523163
    STATISTICS_STDDEV=11.93164601663
    STATISTICS_VALID_PERCENT=66.52
    units=celsius
    valid_max=32767
    valid_min=-32767
    _FillValue=-32768
mdsumner commented 4 months ago

to compare in very bare bones way the osgeo.gdal path to odc

import odc
import xarray
from odc.geo.geobox import GeoBox
from odc.geo.xr import assign_crs, xr_reproject
from odc.geo.cog._rio import _write_cog
import affine
from typing import Tuple, Union
from pathlib import Path
from s3path import S3Path

f = "/rdsi/PUBLIC/raad/data/www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810901.nc"
data = xarray.open_dataset(
                    f,
                    mask_and_scale=False,

                    engine="h5netcdf",
                ).load()

data = assign_crs(data, crs="EPSG:4326")

new_geobox = GeoBox(
        data.odc.geobox.shape,
        affine.Affine(0.01, 0.0, -180.0, 0.0, 0.01, -89.995, 0.0, 0.0, 1.0),
        data.odc.geobox.crs,
    )

output_location = ""
var = "sst"
cog_file = f"_{var}.tif"

_write_cog(
                    data[var].data.squeeze(),
                    new_geobox,
                    cog_file,
                    overwrite=True,
                    nodata=data.attrs.get("nodata", None),

                )
mdsumner commented 3 months ago

I have put up two "rough_.py" scripts that do things the odc way, and the osgeo.gdal way.

Each create a date at random (no safety net for the very few missing days), and write the analysed_sst to COG.

https://github.com/mdsumner/ghrsst.coop/commit/dd37390213fcb989d0d3ee4eed12884a3ff78f71

They produce comparable COGs in comparable time (~200seconds). I don't yet know if the multicore affects, I added that and blocksize to the odc way (it doesn't seem to respect the blocksize creation option). ODC creates tidier overviews and fewer of them, I don't yet know how (if) to control those with GDAL itself. (We could write them externally and bundle together, but that's kind of a pain, I assume odc is doing that more or less).

I think the majority of the time is spent on streaming from the source, not so much on the write (but I'd like to check that).

mdsumner commented 3 months ago

added a warper comparison, to resolve the grid entirely to a sensible 36000x18000 grid, one problem is that using bilinear resampling is probably not meaningful, but I just don't know if it matters - the result is different of course, though.

I set them all to use "ALL_CPUS", but I'm not sure it makes any difference, writing the COG is fast, streaming the netcdf from url takes about 3minutes and that's all there is to it.

Doing a barebones comparison on the files in dd37390213fcb989d0d3ee4eed12884a3ff78f71:

s <- list.files(pattern = "*way*")
for (i in 1:3) {print(s[i]); print(system.time(system(sprintf("python3 %s", s[i]))))}
[1] "rough_gdalway.py"
   user  system elapsed 
 87.203  10.016 160.980 
[1] "rough_odcway.py"
   user  system elapsed 
 52.548  19.926 210.189 
[1] "warp_gdalway.py"
   user  system elapsed 
106.744  10.926 242.980 

A more critical issue is using AVERAGE resampling for the overviews, this biases the values in the lower resolutions because of the lack of cosine weighting for the latitude (something I need to get across with a bit of advice, and using that OISST example).

mdsumner commented 3 months ago

actually, probably can do workingType and outputType for the warp, and set multithread and warpMemoryLimit , in that case we see

[1] "rough_gdalway.py"
   user  system elapsed 
 83.970  11.118 158.541 
[1] "rough_odcway.py"
   user  system elapsed 
 50.447  18.996 147.876 
[1] "warp_gdalway.py"
   user  system elapsed 
106.124  11.535 195.569 

I expect the speedup for odc is about network variability, but I'll try some local tests.

mdsumner commented 3 months ago

odc is slightly faster

mdsumner commented 1 month ago

finally we can do this all in R, with latest gdalraster VSIFile class (just test mode here, but we can use this to write to VRT to draft the correct metadata keys, and then ultimately write to vismem and then read that into with VSIFile)

Sys.setenv("GDAL_HTTP_HEADER_FILE"="/perm_storage/home/mdsumner/earthdata")
dsn <- sprintf("vrt://%s?a_ullr=-180,89.995,180,-89.995&a_srs=EPSG:4326&a_scale=0.001&a_offset=25&sd_name=analysed_sst", "/vsicurl/https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1/20020601090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc")
library(gdalraster)
gdalraster::translate(dsn, "ghrsst1.vrt")
mdsumner commented 1 month ago

all the "junk metadata" is on the dataset level, not the band levelso that's all ok. Note that assigning the geotransform does not clear the GEOLOCATION domain (with the fuzzy rectlinear x/y coords).

  <Metadata>
    <MDI key="analysed_sst#add_offset">298.15</MDI>
    <MDI key="analysed_sst#comment">"Final" version using Multi-Resolution Variational Analysis (MRVA) method for interpolation</MDI>
    <MDI key="analysed_sst#coordinates">lon lat</MDI>
    <MDI key="analysed_sst#long_name">analysed sea surface temperature</MDI>
    <MDI key="analysed_sst#scale_factor">0.001</MDI>
    <MDI key="analysed_sst#source">AMSRE-REMSS, AVHRR_Pathfinder-PFV5.2-NODC_day, AVHRR_Pathfinder-PFV5.2-NODC_night, MODIS_T-JPL, iQUAM-NOAA/NESDIS, Ice_Conc-OSISAF</MDI>
    <MDI key="analysed_sst#standard_name">sea_surface_foundation_temperature</MDI>
    <MDI key="analysed_sst#units">kelvin</MDI>
    <MDI key="analysed_sst#valid_max">32767</MDI>
    <MDI key="analysed_sst#valid_min">-32767</MDI>
    <MDI key="analysed_sst#_FillValue">-32768</MDI>
    <MDI key="lat#axis">Y</MDI>
    <MDI key="lat#comment">none</MDI>
    <MDI key="lat#long_name">latitude</MDI>
    <MDI key="lat#standard_name">latitude</MDI>
    <MDI key="lat#units">degrees_north</MDI>
    <MDI key="lat#valid_max">90</MDI>
    <MDI key="lat#valid_min">-90</MDI>
    <MDI key="lon#axis">X</MDI>
    <MDI key="lon#comment">none</MDI>
    <MDI key="lon#long_name">longitude</MDI>
    <MDI key="lon#standard_name">longitude</MDI>
    <MDI key="lon#units">degrees_east</MDI>
    <MDI key="lon#valid_max">180</MDI>
    <MDI key="lon#valid_min">-180</MDI>
    <MDI key="NC_GLOBAL#acknowledgment">Please acknowledge the use of these data with the following statement:  These data were provided by JPL under support by NASA MEaSUREs program.</MDI>
    <MDI key="NC_GLOBAL#cdm_data_type">grid</MDI>
    <MDI key="NC_GLOBAL#comment">MUR = "Multi-scale Ultra-high Reolution"</MDI>
    <MDI key="NC_GLOBAL#Conventions">CF-1.5</MDI>
    <MDI key="NC_GLOBAL#creator_email">ghrsst@podaac.jpl.nasa.gov</MDI>
    <MDI key="NC_GLOBAL#creator_name">JPL MUR SST project</MDI>
    <MDI key="NC_GLOBAL#creator_url">http://mur.jpl.nasa.gov</MDI>
    <MDI key="NC_GLOBAL#date_created">20150819T103929Z</MDI>
    <MDI key="NC_GLOBAL#easternmost_longitude">180</MDI>
    <MDI key="NC_GLOBAL#file_quality_level">1</MDI>
    <MDI key="NC_GLOBAL#gds_version_id">2.0</MDI>
    <MDI key="NC_GLOBAL#geospatial_lat_resolution">0.01 degrees</MDI>
    <MDI key="NC_GLOBAL#geospatial_lat_units">degrees north</MDI>
    <MDI key="NC_GLOBAL#geospatial_lon_resolution">0.01 degrees</MDI>
    <MDI key="NC_GLOBAL#geospatial_lon_units">degrees east</MDI>
    <MDI key="NC_GLOBAL#history">created at nominal 4-day latency; replaced nrt (1-day latency) version.</MDI>
    <MDI key="NC_GLOBAL#id">MUR-JPL-L4-GLOB-v04.1</MDI>
    <MDI key="NC_GLOBAL#institution">Jet Propulsion Laboratory</MDI>
    <MDI key="NC_GLOBAL#keywords">Oceans &gt; Ocean Temperature &gt; Sea Surface Temperature</MDI>
    <MDI key="NC_GLOBAL#keywords_vocabulary">NASA Global Change Master Directory (GCMD) Science Keywords</MDI>
    <MDI key="NC_GLOBAL#license">These data are available free of charge under data policy of JPL PO.DAAC.</MDI>
    <MDI key="NC_GLOBAL#Metadata_Conventions">Unidata Observation Dataset v1.0</MDI>
    <MDI key="NC_GLOBAL#metadata_link">http://podaac.jpl.nasa.gov/ws/metadata/dataset/?format=iso&amp;shortName=MUR-JPL-L4-GLOB-v04.1</MDI>
    <MDI key="NC_GLOBAL#naming_authority">org.ghrsst</MDI>
    <MDI key="NC_GLOBAL#netcdf_version_id">4.1</MDI>
    <MDI key="NC_GLOBAL#northernmost_latitude">90</MDI>
    <MDI key="NC_GLOBAL#platform">Aqua, DMSP, NOAA-POES, Suomi-NPP, Terra</MDI>
    <MDI key="NC_GLOBAL#processing_level">L4</MDI>
    <MDI key="NC_GLOBAL#product_version">04.1</MDI>
    <MDI key="NC_GLOBAL#project">NASA Making Earth Science Data Records for Use in Research Environments (MEaSUREs) Program</MDI>
    <MDI key="NC_GLOBAL#publisher_email">ghrsst-po@nceo.ac.uk</MDI>
    <MDI key="NC_GLOBAL#publisher_name">GHRSST Project Office</MDI>
    <MDI key="NC_GLOBAL#publisher_url">http://www.ghrsst.org</MDI>
    <MDI key="NC_GLOBAL#references">http://podaac.jpl.nasa.gov/Multi-scale_Ultra-high_Resolution_MUR-SST</MDI>
    <MDI key="NC_GLOBAL#sensor">AMSR-E, AVHRR, MODIS, SSM/I, VIIRS, in-situ</MDI>
    <MDI key="NC_GLOBAL#source">AMSRE-REMSS, AVHRR_Pathfinder-PFV5.2-NODC_day, AVHRR_Pathfinder-PFV5.2-NODC_night, MODIS_T-JPL, iQUAM-NOAA/NESDIS, Ice_Conc-OSISAF</MDI>
    <MDI key="NC_GLOBAL#southernmost_latitude">-90</MDI>
    <MDI key="NC_GLOBAL#spatial_resolution">0.01 degrees</MDI>
    <MDI key="NC_GLOBAL#standard_name_vocabulary">NetCDF Climate and Forecast (CF) Metadata Convention</MDI>
    <MDI key="NC_GLOBAL#start_time">20020601T090000Z</MDI>
    <MDI key="NC_GLOBAL#stop_time">20020601T090000Z</MDI>
    <MDI key="NC_GLOBAL#summary">A merged, multi-sensor L4 Foundation SST analysis product from JPL.</MDI>
    <MDI key="NC_GLOBAL#time_coverage_end">20020601T210000Z</MDI>
    <MDI key="NC_GLOBAL#time_coverage_start">20020531T210000Z</MDI>
    <MDI key="NC_GLOBAL#title">Daily MUR SST, Final product</MDI>
    <MDI key="NC_GLOBAL#uuid">27665bc0-d5fc-11e1-9b23-0800200c9a66</MDI>
    <MDI key="NC_GLOBAL#westernmost_longitude">-180</MDI>
    <MDI key="NETCDF_DIM_EXTRA">{time}</MDI>
    <MDI key="NETCDF_DIM_time_DEF">{1,4}</MDI>
    <MDI key="NETCDF_DIM_time_VALUES">675766800</MDI>
    <MDI key="time#axis">T</MDI>
    <MDI key="time#comment">Nominal time of analyzed fields</MDI>
    <MDI key="time#long_name">reference time of sst field</MDI>
    <MDI key="time#standard_name">time</MDI>
    <MDI key="time#units">seconds since 1981-01-01 00:00:00 UTC</MDI>
  </Metadata>
  <Metadata domain="GEOLOCATION">
    <MDI key="GEOREFERENCING_CONVENTION">PIXEL_CENTER</MDI>
    <MDI key="LINE_OFFSET">0</MDI>
    <MDI key="LINE_STEP">1</MDI>
    <MDI key="PIXEL_OFFSET">0</MDI>
    <MDI key="PIXEL_STEP">1</MDI>
    <MDI key="SRS">GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]</MDI>
    <MDI key="X_BAND">1</MDI>
    <MDI key="X_DATASET">NETCDF:"/vsicurl/https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1/20020601090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc":lon</MDI>
    <MDI key="Y_BAND">1</MDI>
    <MDI key="Y_DATASET">NETCDF:"/vsicurl/https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1/20020601090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc":lat</MDI>
  </Metadata>
  <VRTRasterBand dataType="Int16" band="1" blockXSize="2047" blockYSize="1023">
    <Metadata>
      <MDI key="add_offset">298.15</MDI>
      <MDI key="comment">"Final" version using Multi-Resolution Variational Analysis (MRVA) method for interpolation</MDI>
      <MDI key="coordinates">lon lat</MDI>
      <MDI key="long_name">analysed sea surface temperature</MDI>
      <MDI key="NETCDF_DIM_time">675766800</MDI>
      <MDI key="NETCDF_VARNAME">analysed_sst</MDI>
      <MDI key="scale_factor">0.001</MDI>
      <MDI key="source">AMSRE-REMSS, AVHRR_Pathfinder-PFV5.2-NODC_day, AVHRR_Pathfinder-PFV5.2-NODC_night, MODIS_T-JPL, iQUAM-NOAA/NESDIS, Ice_Conc-OSISAF</MDI>
      <MDI key="standard_name">sea_surface_foundation_temperature</MDI>
      <MDI key="units">kelvin</MDI>
      <MDI key="valid_max">32767</MDI>
      <MDI key="valid_min">-32767</MDI>
      <MDI key="_FillValue">-32768</MDI>
    </Metadata>
mdsumner commented 1 month ago

This drops all the pesky dataset level metadata but not the band level.

import os
from osgeo import gdal
from datetime import datetime
from os import path
os.environ["GDAL_HTTP_HEADER_FILE"] = "~/earthdata"

gdal.UseExceptions()

datestring = '2002-06-01'
sds = "analysed_sst"
dt    = datetime.strptime(datestring, '%Y-%m-%d')
year  = datetime.strftime(dt, "%Y")
month = datetime.strftime(dt, "%m")
day   = datetime.strftime(dt, "%d")
jday  = datetime.strftime(dt, "%j")

## local
filename = f'/rdsi/PUBLIC/raad/data/podaac-opendap.jpl.nasa.gov/opendap/allData/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/{year}/{jday}/{year}{month}{day}090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc'
## url
#filename = f'/vsicurl/https://podaac-opendap.jpl.nasa.gov/opendap/allData/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/{year}/{jday}/{year}{month}{day}090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc'

opts = gdal.TranslateOptions(format = "COG", 
     creationOptions = [ "BLOCKSIZE=1024",  "COMPRESS=ZSTD", "PREDICTOR=STANDARD", "RESAMPLING=AVERAGE", "SPARSE_OK=YES", "COPY_SRC_MDD=NO"])

if (sds == "analysed_sst") | (sds == "analysis_error"):
 sn = f"vrt://{filename}?a_srs=OGC:CRS84&a_ullr=-180,89.995,180,-89.995&a_scale=0.001&a_offset=25&sd_name={sds}"
else:
 dsn = f"vrt://{filename}?a_srs=OGC:CRS84&a_ullr=-180,89.995,180,-89.995"

ds = gdal.Open(dsn)
destName = "/tmp/mytif.tif"

gdal.Translate(destName, ds, options = opts)

we also could do everything in one swoop with a chunk of VRT:

https://gist.github.com/mdsumner/4cdd7b5c92e7c69fe766e1fc19ac16b5

to use that template we just do

opts = gdal.TranslateOptions(format = "COG", 
     creationOptions = [ "BLOCKSIZE=1024",  "COMPRESS=ZSTD", "PREDICTOR=STANDARD", "RESAMPLING=AVERAGE", "SPARSE_OK=YES", "COPY_SRC_MDD=NO"])

gdal.Translate(<tif>, <vrt>, options = opts)
mdsumner commented 1 month ago

doing this with reticulate on GADI

library(reticulate)
Sys.setenv("PYTHONPATH" = "$PYTHONPATH:/usr/local/lib/python3/dist-packages")

reticulate::use_python("/usr/bin/python3")
gdal <- import("osgeo.gdal")
gdal$UseExceptions()
earthdata <- "~/earthdata"

Sys.setenv("GDAL_HTTP_HEADER_FILE" = earthdata)
library(dplyr)
root <- ghrsstcogs"

files <- tibble(date = seq(as.Date("2002-06-01"), Sys.Date(), by = "1 day"), 
                base = "https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1") |> 
 mutate(day = format(date, "%d"), year = format(date, "%Y"), jday = format(date, "%j"), 
        month = format(date, "%m")) |> 
 mutate(url = glue::glue("{base}/{year}{month}{day}090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc")) |>
  mutate(local = gsub(".nc$", "_analysed_sst.tif", file.path(root, basename(url)))) |> 
  mutate(dsn = glue::glue("vrt:///vsicurl/{url}?a_srs=EPSG:4326&&a_gt=-180,0.01,0,89.995,0,-0.01&a_scale=0.001&a_offset=25&sd_name=analysed_sst")) |> 
  select(date, dsn, local)

gdal$SetConfigOption("GDAL_HTTP_HEADER_FILE", earthdata)

for (i in rev(seq_len(nrow(files)))) {

ds = try(gdal$Open(files$dsn[i]))

if (!file.exists(files$local[i])) {
if (!inherits(ds, "try-error")) {
    ds$GetRasterBand(1L)$SetMetadata(list())
  fs::dir_create(dirname(files$local[i]), recurse = TRUE)
  gdal$Translate(files$local[i], ds, format = "COG", 
               creationOptions = c( "BLOCKSIZE=1024",  "COMPRESS=ZSTD", "PREDICTOR=STANDARD", "RESAMPLING=AVERAGE", "SPARSE_OK=YES", "COPY_SRC_MDD=NO"))

}}

}
mdsumner commented 1 month ago

quick test in R to read the bytes (no file/disk) via reticulate, to draft into python:

library(reticulate)
Sys.setenv("PYTHONPATH" = "$PYTHONPATH:/usr/local/lib/python3/dist-packages")

reticulate::use_python("/usr/bin/python3")
gdal <- import("osgeo.gdal")
gdal$UseExceptions()
earthdata <- normalizePath("~/earthdata")

Sys.setenv("GDAL_HTTP_HEADER_FILE" = earthdata)
library(dplyr)
root <- "/tmp/ghrsstcogs"

files <- tibble(date = seq(as.Date("2002-06-01"), Sys.Date(), by = "1 day"), 
                base = "https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1") |> 
 mutate(day = format(date, "%d"), year = format(date, "%Y"), jday = format(date, "%j"), 
        month = format(date, "%m")) |> 
 mutate(url = glue::glue("{base}/{year}{month}{day}090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc")) |>
  mutate(local = gsub(".nc$", "_analysed_sst.tif", file.path(root, basename(url)))) |> 
  mutate(dsn = glue::glue("vrt:///vsicurl/{url}?a_srs=EPSG:4326&&a_gt=-180,0.01,0,89.995,0,-0.01&a_scale=0.001&a_offset=25&sd_name=analysed_sst")) |> 
  select(date, dsn, local)

gdal$SetConfigOption("GDAL_HTTP_HEADER_FILE", earthdata)

i <- 1
ds = try(gdal$Open(files$dsn[i]))

  ds$GetRasterBand(1L)$SetMetadata(list())
  #fs::dir_create(dirname(files$local[i]), recurse = TRUE)
  local <- file.path("/vsimem", basename(files$local[i]))
  gdal$Translate(local, ds, format = "COG", 
               creationOptions = c( "BLOCKSIZE=1024",  "COMPRESS=ZSTD", "PREDICTOR=STANDARD", "RESAMPLING=AVERAGE", "SPARSE_OK=YES", "COPY_SRC_MDD=NO"))

  f <- gdal$VSIFOpenL(local, 'rb')
  gdal$VSIFSeekL(f, 0L, 2L)  # seek to end
  size < gdal$VSIFTellL(f)
  gdal$VSIFSeekL(f, 0L, 0L)  # seek to beginning
  bytes <- gdal$VSIFReadL(1, size, f)
  gdal$VSIFCloseL(f)
  # Upload the raw data to s3
  ##s3.put_object(Key=key, Bucket=bucket_name, Body=bytes, ContentLength=size)
  gdal$Unlink(local)
mdsumner commented 1 month ago

a few corrections

library(reticulate)
Sys.setenv("PYTHONPATH" = "$PYTHONPATH:/usr/local/lib/python3/dist-packages")

reticulate::use_python("/usr/bin/python3")
gdal <- import("osgeo.gdal")
gdal$UseExceptions()
earthdata <- normalizePath("~/earthdata")

Sys.setenv("GDAL_HTTP_HEADER_FILE" = earthdata)
library(dplyr)
root <- "/tmp/ghrsstcogs"

files <- tibble(date = seq(as.Date("2002-06-01"), Sys.Date(), by = "1 day"), 
                base = "https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1") |> 
 mutate(day = format(date, "%d"), year = format(date, "%Y"), jday = format(date, "%j"), 
        month = format(date, "%m")) |> 
 mutate(url = glue::glue("{base}/{year}{month}{day}090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc")) |>
  mutate(local = gsub(".nc$", "_analysed_sst.tif", file.path(root, basename(url)))) |> 
  mutate(dsn = glue::glue("vrt:///vsicurl/{url}?a_srs=EPSG:4326&&a_gt=-180,0.01,0,89.995,0,-0.01&a_scale=0.001&a_offset=25&sd_name=analysed_sst")) |> 
  select(date, dsn, local)

gdal$SetConfigOption("GDAL_HTTP_HEADER_FILE", earthdata)

i <- 1
ds = try(gdal$Open(files$dsn[i]))

  ds$GetRasterBand(1L)$SetMetadata(list())
  #fs::dir_create(dirname(files$local[i]), recurse = TRUE)
  local <- file.path("/vsimem", basename(files$local[i]))
  gdal$Translate(local, ds, format = "COG", 
               creationOptions = c( "BLOCKSIZE=1024",  "COMPRESS=ZSTD", "PREDICTOR=STANDARD", "RESAMPLING=AVERAGE", "SPARSE_OK=YES", "COPY_SRC_MDD=NO"))

  f <- gdal$VSIFOpenL(local, 'rb')
  gdal$VSIFSeekL(f, 0L, 2L)  # seek to end
  size <- gdal$VSIFTellL(f)
  gdal$VSIFSeekL(f, 0L, 0L)  # seek to beginning
  bytes <- gdal$VSIFReadL(1, size, f)
  gdal$VSIFCloseL(f)
  # Upload the raw data to s3
  ##s3.put_object(Key=key, Bucket=bucket_name, Body=bytes, ContentLength=size)
  gdal$Unlink(local)
mdsumner commented 1 month ago

pure python, entirely in-memory cog from nc url (can't see how to zap the band metadata ...):

requires 3.9 for the sd_name syntax, otherwise could use NETCDF:{/vicurl/url}:subdataset, but the other vrt:// protocol args are 3.7 or 3.8


from datetime import datetime
from os import path
from os import environ
import pathlib
import re

from osgeo import gdal

os.environ["GDAL_HTTP_HEADER_FILE"] = str(pathlib.Path("~/earthdata").expanduser().resolve())
gdal.UseExceptions()
subdatasets = ["analysed_sst"]
datestring = '2002-06-01'
dt    = datetime.strptime(datestring, '%Y-%m-%d')
year  = datetime.strftime(dt, "%Y")
month = datetime.strftime(dt, "%m")
day   = datetime.strftime(dt, "%d")
jday  = datetime.strftime(dt, "%j")

base = "https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1"
file = f"{year}{month}{day}090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc"
local = re.sub("\\.nc$", "_analysed_sst.tif", file)
local = f"/vsimem/{local}"

dsn = f"vrt:///vsicurl/{base}/{file}?a_srs=EPSG:4326&a_gt=-180,0.01,0,89.995,0,-0.01&a_scale=0.001&a_offset=25&sd_name=analysed_sst"

opts = gdal.TranslateOptions(format = "COG",
     creationOptions = [  "BLOCKSIZE=1024",  "COMPRESS=ZSTD", "PREDICTOR=STANDARD", "RESAMPLING=AVERAGE", "SPARSE_OK=YES", "COPY_SRC_MDD=NO"])
ds = gdal.Open(dsn)
gdal.Translate(local, ds, options = opts)
ds.Close(d)

## now open in rw, to zap band level md (but breaks COG layout, I don't know how to do this)
#lds = gdal.OpenEx(local, gdal.GA_Update, open_options=["IGNORE_COG_LAYOUT_BREAK=YES"])
#lds.GetRasterBand(1).SetMetadata({})
#lds.Close()

f = gdal.VSIFOpenL(local, 'rb')
gdal.VSIFSeekL(f, 0, 2)  # seek to end
size = gdal.VSIFTellL(f)
gdal.VSIFSeekL(f, 0, 0)  # seek to beginning
bytes = gdal.VSIFReadL(1, size, f)
gdal.VSIFCloseL(f)
# Upload the raw data to s3
##s3.put_object(Key=key, Bucket=bucket_name, Body=bytes, ContentLength=size)
gdal.Unlink(local)