HARPgroup / model_meteorology

0 stars 0 forks source link

Tiling and raster performance #70

Open rburghol opened 3 months ago

rburghol commented 3 months ago

Try retiling according to https://postgis.net/docs/RT_Retile.html

Tasks:

-- get a randomish raster to work with select tid, to_timestamp(tstime) from dh_timeseries_weather as a left outer join dh_variabledefinition as b on (a.varid = b.hydroid and b.varkey='nldas2_obs_hourly') where a.tid is not null limit 1; -- tid | to_timestamp -----------+------------------------ -- 110396782 | 2006-01-01 07:00:00-05

create temp table ts_rast_tile as select a.tid, a.tstime, a.tsendtime, b.hydroid as varid, a.featureid, a.entity_type, st_tile(a.rast, 1, 10, 10) as rast from dh_timeseries_weather as a left outer join dh_variabledefinition as b on (a.varid = b.hydroid and b.varkey='nldas2_precip_hourly') where a.tid = 110396782 ;


#### Verify Data

select to_timestamp(min(tstime)), max(to_timetamp(tstime)) from ts_rast_tile ;

select count(*) from ts_rast_tile;

3504000 / 3504001

select 3504000/( 365 * 24); ?column?

  400

-- so, we have 400 tiles over the bay area


### Do a whole year of tiles for performance testing

create temp table ts_rast_tile as select a.tid, a.tstime, a.tsendtime, b.hydroid as varid, a.featureid, a.entity_type, st_tile(a.rast, 1, 50, 50) as rast from dh_timeseries_weather as a left outer join dh_variabledefinition as b on (a.varid = b.hydroid) where extract(year from to_timestamp(a.tstime)) = 2001 and b.varkey='nldas2_obs_hourly';

-- create an index CREATE INDEX trast_ix ON ts_rast_tile USING GIST(ST_Envelope(rast)); CREATE INDEX trast_fix ON ts_rast_tile (featureid); CREATE INDEX trast_vix ON ts_rast_tile (varid); CREATE INDEX trast_tix ON ts_rast_tile (tstime);


#### Check Indices
- The old table dh_timeseries_weather

explain select count(*) from dh_timeseries_weather as met left outer join field_data_dh_geofield as cov on (met.rast && cov.dh_geofield_geom) where cov.entity_id = 290010 and met.rast is not null and met.varid = 1451;

Aggregate (cost=16.37..16.38 rows=1 width=8) -> Nested Loop (cost=0.99..16.37 rows=1 width=0) Join Filter: ((met.rast)::geometry && cov.dh_geofield_geom) -> Index Scan using dh_timeseries_weather_dh_tsw_vix_idx on dh_timeseries_weather met (cost=0.57..7.16 rows=1 width=18) Index Cond: (varid = 1451) Filter: (rast IS NOT NULL) -> Index Scan using field_data_dh_geofield_entity_id_idx on field_data_dh_geofield cov (cost=0.42..8.44 rows=1 width=2248) Index Cond: (entity_id = 290010) (8 rows)

- the tiled table

explain select count(*) from ts_rast_tile as met left outer join field_data_dh_geofield as cov on (met.rast && cov.dh_geofield_geom) where cov.entity_id = 290010 and met.varid = 1451;

Aggregate (cost=2883822.45..2883822.46 rows=1 width=8) -> Nested Loop (cost=0.42..2883813.69 rows=3505 width=0) Join Filter: ((met.rast)::geometry && cov.dh_geofield_geom) -> Index Scan using field_data_dh_geofield_entity_id_idx on field_data_dh_geofield cov (cost=0.42..8.44 rows=1 width=2248) Index Cond: (entity_id = 290010) -> Seq Scan on ts_rast_tile met (cost=0.00..211616.38 rows=3504510 width=293) Filter: (varid = 1451) JIT: Functions: 10 Options: Inlining true, Optimization true, Expressions true, Deforming true (10 rows)


### Now Compare Performance

#### Using Non-tiled in dh_timeseries_weather

\set band '1' \set varkey 'nldas2_obs_hourly' \set hydrocode 'usgs_ws_02031000'

select met.featureid, to_timestamp(met.tsendtime) as obs_date, extract(year from to_timestamp(met.tsendtime)) as yr, extract(month from to_timestamp(met.tsendtime)) as mo, extract(day from to_timestamp(met.tsendtime)) as da, (ST_summarystats(st_clip(met.rast, fgeo.dh_geofield_geom), :band, TRUE)).mean as precip_mm, 0.0393701 * (ST_summarystats(st_clip(met.rast, fgeo.dh_geofield_geom), :band, TRUE)).mean as precip_in from dh_feature as f left outer join field_data_dh_geofield as fgeo on ( fgeo.entity_id = f.hydroid and fgeo.entity_type = 'dh_feature' ) left outer join dh_variabledefinition as v on ( v.varkey = :'varkey' ) left outer join dh_feature as mcov on ( mcov.hydrocode = 'cbp6_met_coverage' ) left outer join dh_timeseries_weather as met on ( mcov.hydroid = met.featureid and met.varid = v.hydroid AND extract(year from to_timestamp(met.tstime)) = 2001 AND fgeo.dh_geofield_geom && met.rast ) where f.hydrocode = :'hydrocode' and met.tid is not null order by met.tsendtime;


#### The Tiled table

select met.featureid, to_timestamp(met.tsendtime) as obs_date, extract(year from to_timestamp(met.tsendtime)) as yr, extract(month from to_timestamp(met.tsendtime)) as mo, extract(day from to_timestamp(met.tsendtime)) as da, (ST_summarystats(st_clip(met.rast, fgeo.dh_geofield_geom), :band, TRUE)).mean as precip_mm, 0.0393701 * (ST_summarystats(st_clip(met.rast, fgeo.dh_geofield_geom), :band, TRUE)).mean as precip_in from dh_feature as f left outer join field_data_dh_geofield as fgeo on ( fgeo.entity_id = f.hydroid and fgeo.entity_type = 'dh_feature' ) left outer join dh_variabledefinition as v on ( v.varkey = :'varkey' ) left outer join dh_feature as mcov on ( mcov.hydrocode = 'cbp6_met_coverage' ) left outer join dh_timeseries_weather as met on ( mcov.hydroid = met.featureid and met.varid = v.hydroid ) where f.hydrocode = :'hydrocode' order by met.tsendtime;


#### Insert final tiled dataset in, using a unique varkey
- TBD

insert into dh_timeseries_weather(tstime,tsendtime, varid, featureid, entity_type, rast) select a.tstime, a.tsendtime, c.hydroid as varid, a.featureid, a.entity_type, st_tile(a.rast, 1, 50, 50) as rast from dh_timeseries_weather as a left outer join dh_variabledefinition as b on (a.varid = b.hydroid) left outer join dh_variabledefinition as c on (c.varkey = 'nldas2_precip_hourly_tiled') where b.varkey='nldas2_obs_hourly' -- comment this to get them all and extract(year from to_timestamp(a.tstime)) = 2001 and c.varkey is not null;


#### Tiled data in `dh_timeseries_weather` table

\set band '1' \set varkey 'nldas2_precip_hourly_tiled' \set hydrocode 'usgs_ws_02031000'

select met.featureid, to_timestamp(met.tsendtime) as obs_date, extract(year from to_timestamp(met.tsendtime)) as yr, extract(month from to_timestamp(met.tsendtime)) as mo, extract(day from to_timestamp(met.tsendtime)) as da, (ST_summarystats(st_clip(met.rast, fgeo.dh_geofield_geom), :band, TRUE)).mean as precip_mm, 0.0393701 * (ST_summarystats(st_clip(met.rast, fgeo.dh_geofield_geom), :band, TRUE)).mean as precip_in from dh_feature as f left outer join field_data_dh_geofield as fgeo on ( fgeo.entity_id = f.hydroid and fgeo.entity_type = 'dh_feature' ) left outer join dh_variabledefinition as v on ( v.varkey = :'varkey' ) left outer join dh_feature as mcov on ( mcov.hydrocode = 'cbp6_met_coverage' ) left outer join dh_timeseries_weather as met on ( mcov.hydroid = met.featureid and met.varid = v.hydroid AND extract(year from to_timestamp(met.tstime)) = 2001 AND fgeo.dh_geofield_geom && met.rast ) where f.hydrocode = :'hydrocode' and met.tid is not null order by met.tsendtime;


#### Use Explicit Feature IDs to simplify query planner
- This totally helps to use explicit IDs.
- Timing comparison 
   - 2001 only (`AND extract(year from to_timestamp(met.tstime)) = 2001`):
      - 1000x1000: `Time: 18457.256 ms (00:18.457)`
      - 50x50: `Time: 12951.363 ms (00:12.951)`
   - All data (no date filter):
      - 1000x1000: `Time: 845279.936 ms (14:05.280)`
      - 50x50: `Time: 578732.285 ms (09:38.732)`

\set band '1' \set varkey 'nldas2_precip_hourly_tiled' \set hydrocode 'usgs_ws_02031000' \set covid 290010 \set metid 617850 \set metvarid 1455 -- not resampled into small files -- \set metvarid 1455

select met.featureid, to_timestamp(met.tsendtime) as obs_date, extract(year from to_timestamp(met.tsendtime)) as yr, extract(month from to_timestamp(met.tsendtime)) as mo, extract(day from to_timestamp(met.tsendtime)) as da, (ST_summarystats(st_clip(met.rast, fgeo.dh_geofield_geom), :band, TRUE)).mean as precip_mm, 0.0393701 * (ST_summarystats(st_clip(met.rast, fgeo.dh_geofield_geom), :band, TRUE)).mean as precip_in from field_data_dh_geofield as fgeo left outer join dh_timeseries_weather as met on ( met.featureid = :metid and met.varid = :metvarid AND extract(year from to_timestamp(met.tstime)) = 2001 AND fgeo.dh_geofield_geom && met.rast ) where fgeo.entity_id = :covid and met.tid is not null and fgeo.entity_type = 'dh_feature' order by met.tsendtime;

#### Insert the remaining 40+ years of data

insert into dh_timeseries_weather(tstime,tsendtime, varid, featureid, entity_type, rast) select a.tstime, a.tsendtime, c.hydroid as varid, a.featureid, a.entity_type, st_tile(a.rast, 1, 50, 50) as rast from dh_timeseries_weather as a left outer join dh_variabledefinition as b on (a.varid = b.hydroid) left outer join dh_variabledefinition as c on (c.varkey = 'nldas2_precip_hourly_tiled') where extract(year from to_timestamp(a.tstime)) <> 2001 and b.varkey='nldas2_obs_hourly' and c.varkey is not null;

COBrogan commented 1 month ago

Returning to raster index testing. Looking into NLDAS rasters, we see:

gdalinfo /backup/meteorology/2023/070/NLDAS_FORA0125_H.A20230311.2300.002.grb_CBP.gtiff

Driver: GTiff/GeoTIFF
Files: /backup/meteorology/2023/070/NLDAS_FORA0125_H.A20230311.2300.002.grb_CBP.gtiff
Size is 75, 65
Coordinate System is:
GEOGCRS["WGS 84"...
Band 1 Block=75x13 Type=Float64, ColorInterp=Gray

So, we are storing float64 data. To effectively tile this raster, we'd want to keep each tile to 2048 bytes per tile. float64 is 8 bytes per pixel, so the maximum size of our tile should be 16x16 per the recommendations here. So,

create temp table ts_rast_tile as
select a.tid, a.tstime, a.tsendtime, b.hydroid as varid, a.featureid, a.entity_type, st_tile(a.rast, 1, 16, 16) as rast
from dh_timeseries_weather as a
left outer join dh_variabledefinition as b
on (a.varid = b.hydroid) 
where extract(year from to_timestamp(a.tstime)) = 2023
and b.varkey='nldas2_obs_hourly';

-- create an index
CREATE INDEX trast_ix ON ts_rast_tile USING GIST(ST_ConvexHull(rast));
--Add all constraints
SELECT AddRasterConstraints('ts_rast_tile'::name, 'rast'::name);

Let's evaluate the constraints on our rasters to see if these make sense. My current understanding is that constraints can speed up certain processes by ensuring postGIS knows all rasters in the tables fall in the bounds of the constraint. Notice that dh_timeseries_weather has no constrains nor a spatial index. The lack of constraints makes sense given that the data is representative of many different varykeys. But, it should have a spatial index.

select r_table_catalog,r_table_schema,r_table_name, r_raster_column,srid,scale_x,scale_y,blocksize_x,blocksize_y,same_alignment,regular_blocking,num_bands,pixel_types, nodata_values ,out_db, spatial_index--,extent
from raster_columns;
r_table_name r_raster_column srid scale_x scale_y blocksize_x blocksize_y same_alignment regular_blocking num_bands pixel_types nodata_values out_db spatial_index
dh_timeseries_weather rast 0 f f f
tmp_precipgrid rast 0 f f f
ts_rast_tile rast 4326 0.125 -0.125 16 16 t f 1 {64BF} {9999} {f} t

Let's give dh_timeseries_weather a spatial index:

CREATE INDEX trast_ix ON dh_timeseries_weather USING GIST(ST_Envelope(rast));

From here, we should be able to query our temporary table ts_rast_tile to test if the tile raster has improved our performance at all.

WITH usgs_features AS (
  SELECT * 
  FROM  dh_feature
  WHERE hydrocode = 'usgs_ws_01668000'
)

Select met.featureid, to_timestamp(met.tsendtime) as obs_date,
    extract(year from to_timestamp(met.tsendtime)) as yr,
    extract(month from to_timestamp(met.tsendtime)) as mo,
    extract(day from to_timestamp(met.tsendtime)) as da,
    (ST_summarystats(st_clip(met.rast, fgeo.dh_geofield_geom), 1, TRUE)).mean as precip_mm,
    0.0393701 * (ST_summarystats(st_clip(met.rast, fgeo.dh_geofield_geom), 1, TRUE)).mean as precip_in
FROM usgs_features as f
left outer join field_data_dh_geofield as fgeo
on (
   fgeo.entity_id = f.hydroid
   and fgeo.entity_type = 'dh_feature' 
) 
JOIN(
    select * 
    from ts_rast_tile as met
    left outer join dh_feature as mcov
        on (
        mcov.hydrocode = 'cbp6_met_coverage'
    )
    left outer join dh_variabledefinition as v
        on (
        v.varkey = 'nldas2_obs_hourly'--'nldas2_precip_hourly_tiled'
    )
    where mcov.hydroid = met.featureid 
        and met.varid = v.hydroid
        AND extract(year from to_timestamp(met.tsendtime)) = '2023'
) AS met
    ON ST_Intersects(ST_ConvexHull(met.rast),fgeo.dh_geofield_geom)
WHERE f.hydrocode = 'usgs_ws_01668000'

order by met.tsendtime;

Time: 36921.604 ms (00:36.922)

One year summarized in 36 seconds. When comparing to dh_timeseries_weather:

WITH usgs_features AS (
  SELECT * 
  FROM  dh_feature
  WHERE hydrocode = 'usgs_ws_01668000'
)

Select met.featureid, to_timestamp(met.tsendtime) as obs_date,
    extract(year from to_timestamp(met.tsendtime)) as yr,
    extract(month from to_timestamp(met.tsendtime)) as mo,
    extract(day from to_timestamp(met.tsendtime)) as da,
    (ST_summarystats(st_clip(met.rast, fgeo.dh_geofield_geom), 1, TRUE)).mean as precip_mm,
    0.0393701 * (ST_summarystats(st_clip(met.rast, fgeo.dh_geofield_geom), 1, TRUE)).mean as precip_in
FROM usgs_features as f
left outer join field_data_dh_geofield as fgeo
on (
   fgeo.entity_id = f.hydroid
   and fgeo.entity_type = 'dh_feature' 
) 
JOIN(
    select *
        from dh_timeseries_weather as met
        left outer join dh_variabledefinition as b
            on (met.varid = b.hydroid) 
        where extract(year from to_timestamp(met.tstime)) = 2023
            and b.varkey='nldas2_obs_hourly'
) AS met
    ON ST_Intersects(ST_ConvexHull(met.rast),fgeo.dh_geofield_geom)
WHERE f.hydrocode = 'usgs_ws_01668000'

order by met.tsendtime;

Time: 29694.877 ms (00:29.695)

Interestingly not a big gain, but this method now seems faster than previously. The spatial index likely helped improve the speed. From dh_timeserires_weather, the above query took about 26 minutes to run for the full 40-year period for usgs_ws_01668000. calc_raster_ts took 49 minutes. Using the tiled temporary table above without constraints took 35 minutes.

Confirm the number of tiles we get make sense:

COPY(
WITH usgs_features AS (
  SELECT * 
  FROM  dh_feature
  WHERE hydrocode = 'usgs_ws_01668000'
)

Select st_astext(st_convexhull(met.rast))
FROM usgs_features as f
left outer join field_data_dh_geofield as fgeo
on (
   fgeo.entity_id = f.hydroid
   and fgeo.entity_type = 'dh_feature' 
) 
JOIN(
    select *
        from ts_rast_tile as met
        left outer join dh_variabledefinition as b
            on (met.varid = b.hydroid) 
        where extract(year from to_timestamp(met.tstime)) = 2023
            and b.varkey='nldas2_obs_hourly'
            and extract(month from to_timestamp(met.tstime)) = 12
            and extract(day from to_timestamp(met.tstime)) = 12
) AS met
    ON ST_Intersects(ST_ConvexHull(met.rast),fgeo.dh_geofield_geom)
WHERE f.hydrocode = 'usgs_ws_01668000'
) to '/tmp/getRasterTiles.csv' CSV;

Switching to 4x4 tiles results in one year summaries taking 1 minute, nearly twice the time as the 16x16 tiled raster. However, the below query ran extremely quickly. The only difference is no clipping is used here, so this is the sum of all tiles (which is bigger than our boundary by a lot).

WITH usgs_features AS (
  SELECT * 
  FROM  dh_feature
  WHERE hydrocode = 'usgs_ws_01668000'
)

Select met.featureid, to_timestamp(met.tsendtime) as obs_date,
    extract(year from to_timestamp(met.tsendtime)) as yr,
    extract(month from to_timestamp(met.tsendtime)) as mo,
    extract(day from to_timestamp(met.tsendtime)) as da,
    sum((ST_summarystats(met.rast, 1, TRUE)).mean)as precip_mm,
    sum(0.0393701 * (ST_summarystats(met.rast, 1, TRUE)).mean) as precip_in
FROM usgs_features as f
left outer join field_data_dh_geofield as fgeo
on (
   fgeo.entity_id = f.hydroid
   and fgeo.entity_type = 'dh_feature' 
) 
JOIN(
    select * 
    from ts_rast_tile as met
    left outer join dh_feature as mcov
        on (
        mcov.hydrocode = 'cbp6_met_coverage'
    )
    left outer join dh_variabledefinition as v
        on (
        v.varkey = 'nldas2_obs_hourly'--'nldas2_precip_hourly_tiled'
    )
    where mcov.hydroid = met.featureid 
        and met.varid = v.hydroid
        AND extract(year from to_timestamp(met.tsendtime)) = '2023'
        --and extract(month from to_timestamp(met.tstime)) = 12
        --and extract(day from to_timestamp(met.tstime)) = 12
) AS met
    ON ST_Intersects(ST_ConvexHull(met.rast),fgeo.dh_geofield_geom)
WHERE f.hydrocode = 'usgs_ws_01668000'

GROUP BY met.featureid,met.tsendtime

order by met.tsendtime;

Time: 2665.869 ms (00:02.666)

Insert 4x4 tiles into db and repeat for 16x16 via 'nldas2_precip_hourly_tiled_16x16 varkey

'nldas2_precip_hourly_tiled_4by4' as varkey,
v.datatype,
'nldas2_precip_hourly_tiled_4by4' as varcode,
v.timestep,
v.timeunits,
v.nodataval,
'NLDAS_tiled4by4' as varabbrev,
v.multiplicity
from dh_variabledefinition as v
where v.varkey='nldas2_obs_hourly';

insert into dh_timeseries_weather(tstime,tsendtime, varid, featureid, entity_type, rast)
select a.tstime, a.tsendtime, c.hydroid as varid, a.featureid, a.entity_type, st_tile(a.rast, 1, 4, 4) as rast
from dh_timeseries_weather as a
left outer join dh_variabledefinition as b
on (a.varid = b.hydroid) 
left outer join dh_variabledefinition as c
on (c.varkey = 'nldas2_precip_hourly_tiled_4by4') 
where b.varkey='nldas2_obs_hourly'
-- comment this to get them all
and  extract(year from to_timestamp(a.tstime)) = 2023
and c.varkey is not null;
INSERT 0 4051680

Use the new tiled data set to confirm results above:

WITH usgs_features AS (
  SELECT * 
  FROM  dh_feature
  WHERE hydrocode = 'usgs_ws_01668000'
)

Select met.featureid, to_timestamp(met.tsendtime) as obs_date,
    extract(year from to_timestamp(met.tsendtime)) as yr,
    extract(month from to_timestamp(met.tsendtime)) as mo,
    extract(day from to_timestamp(met.tsendtime)) as da,
    sum((ST_summarystats(st_clip(met.rast, fgeo.dh_geofield_geom), 1, TRUE)).mean) as precip_mm,
    sum(0.0393701 * (ST_summarystats(st_clip(met.rast, fgeo.dh_geofield_geom), 1, TRUE)).mean) as precip_in
FROM usgs_features as f
left outer join field_data_dh_geofield as fgeo
on (
   fgeo.entity_id = f.hydroid
   and fgeo.entity_type = 'dh_feature' 
) 
JOIN(
    select *
        from dh_timeseries_weather as met
        left outer join dh_variabledefinition as b
            on (met.varid = b.hydroid) 
        where extract(year from to_timestamp(met.tstime)) = 2023
            and b.varkey='nldas2_precip_hourly_tiled_4by4'
) AS met
    ON ST_Intersects(ST_ConvexHull(met.rast),fgeo.dh_geofield_geom)
WHERE f.hydrocode = 'usgs_ws_01668000'

GROUP BY met.featureid,met.tsendtime

order by met.tsendtime;

We can restructure the query to use union. So, the query would first get the USGS feature, then find the tiles that overlap with the feature, Union them into a single raster, and then proceed with clipping and summary. Removing one of the st_summarystats calls speeds up performance by half. Overall, this query took 16 seconds. Which is still one second slower than the dh_timeseries_weather equivalent. The 16x16 tiles were about on par with dh_timeseries_weather

WITH usgs_features AS (
  SELECT * 
  FROM  dh_feature
  WHERE hydrocode = 'usgs_ws_01668000'
),
metUnion as (
    Select met.featureid, met.tsendtime,
        st_union(met.rast) as rast
    FROM usgs_features as f
    left outer join field_data_dh_geofield as fgeo
    on (
        fgeo.entity_id = f.hydroid
        and fgeo.entity_type = 'dh_feature' 
    ) 
    JOIN(
        select *
        from dh_timeseries_weather as met
        left outer join dh_variabledefinition as b
            on (met.varid = b.hydroid) 
        where extract(year from to_timestamp(met.tstime)) = 2023
            and b.varkey='nldas2_precip_hourly_tiled_4by4'
    ) AS met
    ON ST_Intersects(ST_ConvexHull(met.rast),fgeo.dh_geofield_geom)

    group by met.featureid, met.tsendtime
)

Select met.featureid, to_timestamp(met.tsendtime) as obs_date,
    extract(year from to_timestamp(met.tsendtime)) as yr,
    extract(month from to_timestamp(met.tsendtime)) as mo,
    extract(day from to_timestamp(met.tsendtime)) as da,
    sum((ST_summarystats(st_clip(met.rast, fgeo.dh_geofield_geom), 1, TRUE)).mean) as precip_mm
FROM usgs_features as f
left outer join field_data_dh_geofield as fgeo
on (
   fgeo.entity_id = f.hydroid
   and fgeo.entity_type = 'dh_feature' 
) 
JOIN metUnion AS met
    ON ST_Intersects(ST_ConvexHull(met.rast),fgeo.dh_geofield_geom)

GROUP BY met.featureid,met.tsendtime

order by met.tsendtime;
rburghol commented 1 month ago

Just check the overlap:

\set band '1'
 \set ftype 'cbp6_landseg'
 \set varkey 'nldas2_obs_hourly'
 \set resample_varkey 'daymet_mod_daily'
 \set hydrocode 'N51137'
 \set fname '/tmp/N51137-nldas2-all.csv'
 \timing ON
select met.featureid, to_timestamp(met.tsendtime) as obs_date, 
  met.tstime, met.tsendtime, 
  extract(year from to_timestamp(met.tsendtime)) as yr, 
  extract(month from to_timestamp(met.tsendtime)) as mo, 
  extract(day from to_timestamp(met.tsendtime)) as da, 
  extract(hour from to_timestamp(met.tsendtime)) as hr, 
  count(met.*) as num_tiles
from dh_feature as f 
left outer join field_data_dh_geofield as fgeo 
on ( fgeo.entity_id = f.hydroid and fgeo.entity_type = 'dh_feature' ) 
left outer join dh_variabledefinition as v 
on ( v.varkey = :'varkey' ) 
left outer join raster_templates as rt 
on ( rt.varkey = :'resample_varkey' ) 
left outer join dh_feature as mcov 
on ( mcov.hydrocode = 'cbp6_met_coverage' ) 
left outer join dh_timeseries_weather as met 
on ( mcov.hydroid = met.featureid and met.varid = v.hydroid and fgeo.dh_geofield_geom && met.rast ) 
where f.hydrocode = :'hydrocode' 
and f.ftype = :'ftype' 
and ( (met.tstime >= 441777600) or (441777600 = -1) ) and ( (met.tsendtime <= 1704085199) or (1704085199 = -1) ) 
group by met.featureid, to_timestamp(met.tsendtime), 
  met.tstime, met.tsendtime, 
  extract(year from to_timestamp(met.tsendtime)) , 
  extract(month from to_timestamp(met.tsendtime)) , 
  extract(day from to_timestamp(met.tsendtime)) , 
  extract(hour from to_timestamp(met.tsendtime)) 
order by met.tsendtime 
;
rburghol commented 1 month ago

Just check the overlap:

\set band '1'
 \set ftype 'cbp6_landseg'
 \set varkey 'nldas2_precip_hourly_tiled'
 \set resample_varkey 'daymet_mod_daily'
 \set hydrocode 'N51137'
 \set fname '/tmp/N51137-nldas2-all.csv'
 \timing ON
select met.featureid, to_timestamp(met.tsendtime) as obs_date, 
  met.tstime, met.tsendtime, 
  extract(year from to_timestamp(met.tsendtime)) as yr, 
  extract(month from to_timestamp(met.tsendtime)) as mo, 
  extract(day from to_timestamp(met.tsendtime)) as da, 
  extract(hour from to_timestamp(met.tsendtime)) as hr, 
  count(met.*) as num_tiles
from dh_feature as f 
left outer join field_data_dh_geofield as fgeo 
on ( fgeo.entity_id = f.hydroid and fgeo.entity_type = 'dh_feature' ) 
left outer join dh_variabledefinition as v 
on ( v.varkey = :'varkey' ) 
left outer join raster_templates as rt 
on ( rt.varkey = :'resample_varkey' ) 
left outer join dh_feature as mcov 
on ( mcov.hydrocode = 'cbp6_met_coverage' ) 
left outer join dh_timeseries_weather as met 
on ( mcov.hydroid = met.featureid and met.varid = v.hydroid and fgeo.dh_geofield_geom && met.rast ) 
where f.hydrocode = :'hydrocode' 
and f.ftype = :'ftype' 
and ( (met.tstime >= 441777600) or (441777600 = -1) ) and ( (met.tsendtime <= 1704085199) or (1704085199 = -1) ) 
group by met.featureid, to_timestamp(met.tsendtime), 
  met.tstime, met.tsendtime, 
  extract(year from to_timestamp(met.tsendtime)) , 
  extract(month from to_timestamp(met.tsendtime)) , 
  extract(day from to_timestamp(met.tsendtime)) , 
  extract(hour from to_timestamp(met.tsendtime)) 
order by met.tsendtime 
;

Re-order the tables

WITH f as (select * from dh_feature where hydrocode = :'hydrocode' and ftype = :'ftype' ),
v as (select * from dh_variabledefinition where varkey = :'varkey'),
rt as (select * from raster_templates where varkey = :'resample_varkey' ),
mcov as (select * from dh_feature where hydrocode = 'cbp6_met_coverage' ) 
select met.featureid, to_timestamp(met.tsendtime) as obs_date, 
  met.tstime, met.tsendtime, 
  extract(year from to_timestamp(met.tsendtime)) as yr, 
  extract(month from to_timestamp(met.tsendtime)) as mo, 
  extract(day from to_timestamp(met.tsendtime)) as da, 
  extract(hour from to_timestamp(met.tsendtime)) as hr, 
  count(met.*) as num_tiles
from dh_timeseries_weather as met, field_data_dh_geofield as fgeo
where met.featureid in (select hydroid from mcov)
  and met.varid in (select hydroid from v) 
  and ( (met.tstime >= 441777600) or (441777600 = -1) ) 
  and ( (met.tsendtime <= 1704085199) or (1704085199 = -1) )
  and fgeo.entity_type = 'dh_feature' 
  and fgeo.entity_id in (select hydroid from f)
  and (fgeo.dh_geofield_geom && met.rast )  
group by met.featureid, to_timestamp(met.tsendtime), 
  met.tstime, met.tsendtime, 
  extract(year from to_timestamp(met.tsendtime)) , 
  extract(month from to_timestamp(met.tsendtime)) , 
  extract(day from to_timestamp(met.tsendtime)) , 
  extract(hour from to_timestamp(met.tsendtime)) 
order by met.tsendtime 
;

- Inventory query: count only

WITH f as (select from dh_feature where hydrocode = :'hydrocode' and ftype = :'ftype' ), v as (select hydroid from dh_variabledefinition where varkey = :'varkey'), rt as (select from raster_templates where varkey = :'resample_varkey' ), mcov as (select * from dh_feature where hydrocode = 'cbp6_met_coverage' ) select count(met.featureid) from dh_timeseries_weather as met, field_data_dh_geofield as fgeo where met.featureid in (select hydroid from mcov) and met.varid in (select hydroid from v) and met.tstime >= 441777600 and met.tsendtime <= 1704085199 and fgeo.entity_type = 'dh_feature' and fgeo.entity_id in (select hydroid from f) and (fgeo.dh_geofield_geom && met.rast )
;


- use varid and featureid hard coded
   - These resulted in change from 30 seconds down to 10 seconds for the inventory query 

\set band '1' \set ftype 'cbp6_landseg' \set varkey 'nldas2_precip_hourly_tiled' \set resample_varkey 'daymet_mod_daily' \set hydrocode 'N51137' \set fname '/tmp/N51137-nldas2-all.csv' \timing ON

-- sets all integer feature and varid with query select hydroid as met_varid from dh_variabledefinition where varkey = :'varkey' \gset select hydroid as fid from dh_feature where hydrocode = :'hydrocode' and ftype = :'ftype' \gset select hydroid as covid from dh_feature where hydrocode = 'cbp6_met_coverage' \gset

select count(met.featureid) from dh_timeseries_weather as met, field_data_dh_geofield as fgeo where met.featureid = :covid and met.varid = :met_varid and met.tstime >= 441777600 and met.tsendtime <= 1704085199 and fgeo.entity_type = 'dh_feature' and fgeo.entity_id = :fid and (fgeo.dh_geofield_geom && met.rast ) ;

count

350635 (1 row)

Time: 413137.081 ms (06:53.137)


- Add a polygon column with the envelope

alter table dh_timeseries_weather add column bbox geometry; update dh_timeseries_weather set bbox = st_envelope(rast) where rast is not null;

select count(met.featureid) from dh_timeseries_weather as met, field_data_dh_geofield as fgeo where met.featureid = :covid and met.varid = :met_varid and met.tstime >= 441777600 and met.tsendtime <= 1704085199 and fgeo.entity_type = 'dh_feature' and fgeo.entity_id = :fid and (fgeo.dh_geofield_geom && met.bbox ) ;

count

350635 (1 row)

Time: 24043.444 ms (00:24.043)


- Now retrieve some data:

select met.featureid, to_timestamp(met.tsendtime) as obs_date, met.tstime, met.tsendtime, extract(year from to_timestamp(met.tsendtime)) as yr, extract(month from to_timestamp(met.tsendtime)) as mo, extract(day from to_timestamp(met.tsendtime)) as da, extract(hour from to_timestamp(met.tsendtime)) as hr, (ST_summarystats(st_clip(met.rast, fgeo.dh_geofield_geom), 1, TRUE)).mean as precip_mm, 0.0393701 (ST_summarystats(st_clip(met.rast, fgeo.dh_geofield_geom), 1, TRUE)).mean as precip_in from dh_timeseries_weather as met, field_data_dh_geofield as fgeo where met.featureid = :covid and met.varid = :met_varid and met.tstime >= 441777600 -- and met.tsendtime <= 441777600 + 86400 21 and met.tsendtime <= 1704085199 and fgeo.entity_type = 'dh_feature' and fgeo.entity_id = :fid and (fgeo.dh_geofield_geom && met.bbox ) order by met.tsendtime ;

rburghol commented 1 month ago

This is a mis-statement of what I did -- this is just the time for the NON-resampled. Still an improvement over previous, which took like 2 hours I think? Using bbox polygon field, tiled nldas2 rasters resampling to daymet resolution (in conjunction with the spatial summary) took less than 10 minutes

@COBrogan @mwdunlap2004 - reconfiguring the query saved a couple minutes

image

\set band '1'
\set ftype 'cbp6_landseg'
\set varkey 'nldas2_precip_hourly_tiled'
\set resample_varkey 'daymet_mod_daily'
\set hydrocode 'N51177'
\set fname '/tmp/N51177-nldas2-all.csv'
-- sets all integer feature and varid with query 
select hydroid as met_varid from dh_variabledefinition where varkey = :'varkey' \gset
select hydroid as fid from dh_feature where hydrocode = :'hydrocode' and ftype = :'ftype' \gset
select hydroid as covid from dh_feature where hydrocode = 'cbp6_met_coverage' \gset

\timing ON

select met.featureid, to_timestamp(met.tsendtime) as obs_date, 
  met.tid, met.tstime, met.tsendtime, 
  extract(year from to_timestamp(met.tsendtime)) as yr, 
  extract(month from to_timestamp(met.tsendtime)) as mo, 
  extract(day from to_timestamp(met.tsendtime)) as da, 
  extract(hour from to_timestamp(met.tsendtime)) as hr, 
  (ST_summarystats(st_clip(st_resample(met.rast, rt.rast), fgeo.dh_geofield_geom), 1, TRUE)).count,
  (ST_summarystats(st_clip(st_resample(met.rast, rt.rast), fgeo.dh_geofield_geom), 1, TRUE)).mean as precip_mm,
  0.0393701 * (ST_summarystats(st_clip(st_resample(met.rast, rt.rast), fgeo.dh_geofield_geom), 1, TRUE)).mean as precip_in
 from dh_timeseries_weather as met, field_data_dh_geofield as fgeo, raster_templates as rt 
where met.featureid = :covid
  and met.varid = :met_varid
  and met.tstime >= 441777600
  and rt.varkey = :'resample_varkey' 
  and met.tsendtime <= 441777600 + 86400 
  and met.tsendtime <= 1704085199
  and fgeo.entity_type = 'dh_feature'
  and fgeo.entity_id = :fid
  and (fgeo.dh_geofield_geom && met.bbox )
  order by met.tsendtime
;
rburghol commented 1 month ago

Timing on one that works

\set band '1'
\set ftype 'cbp6_landseg'
\set varkey 'nldas2_precip_hourly_tiled'
\set resample_varkey 'daymet_mod_daily'
\set hydrocode 'H51113'
\set fname '/tmp/H51113-nldas2-all.csv'
-- sets all integer feature and varid with query 
select hydroid as met_varid from dh_variabledefinition where varkey = :'varkey' \gset
select hydroid as fid from dh_feature where hydrocode = :'hydrocode' and ftype = :'ftype' \gset
select hydroid as covid from dh_feature where hydrocode = 'cbp6_met_coverage' \gset

\timing ON

select met.featureid, to_timestamp(met.tsendtime) as obs_date, 
  met.tid, met.tstime, met.tsendtime, 
  extract(year from to_timestamp(met.tsendtime)) as yr, 
  extract(month from to_timestamp(met.tsendtime)) as mo, 
  extract(day from to_timestamp(met.tsendtime)) as da, 
  extract(hour from to_timestamp(met.tsendtime)) as hr, 
  (ST_summarystats(st_clip(st_resample(met.rast, rt.rast), fgeo.dh_geofield_geom), 1, TRUE)).count,
  (ST_summarystats(st_clip(st_resample(met.rast, rt.rast), fgeo.dh_geofield_geom), 1, TRUE)).mean as precip_mm,
  0.0393701 * (ST_summarystats(st_clip(st_resample(met.rast, rt.rast), fgeo.dh_geofield_geom), 1, TRUE)).mean as precip_in
 from dh_timeseries_weather as met, field_data_dh_geofield as fgeo, raster_templates as rt 
where met.featureid = :covid
  and met.varid = :met_varid
  and met.tstime >= 441777600
  and rt.varkey = :'resample_varkey' 
  and met.tsendtime <= 1704085199
  and fgeo.entity_type = 'dh_feature'
  and fgeo.entity_id = :fid
  and (fgeo.dh_geofield_geom && met.bbox )
  order by met.tsendtime
;