HARPgroup / model_meteorology

0 stars 0 forks source link

Landseg Raster Tile Case Study: H51113 #90

Open rburghol opened 2 months ago

rburghol commented 2 months ago

Case Study H51113

\timing ON


#### Full Time Period with 3 calls to st_summarystats

select met.featureid, to_timestamp(met.tsendtime) as obs_date, met.tid, met.tstime, met.tsendtime, extract(year from to_timestamp(met.tsendtime)) as yr, extract(month from to_timestamp(met.tsendtime)) as mo, extract(day from to_timestamp(met.tsendtime)) as da, extract(hour from to_timestamp(met.tsendtime)) as hr, (ST_summarystats(st_clip(st_resample(met.rast, rt.rast), fgeo.dh_geofield_geom), 1, TRUE)).count, (ST_summarystats(st_clip(st_resample(met.rast, rt.rast), fgeo.dh_geofield_geom), 1, TRUE)).mean as precip_mm, 0.0393701 * (ST_summarystats(st_clip(st_resample(met.rast, rt.rast), fgeo.dh_geofield_geom), 1, TRUE)).mean as precip_in from dh_timeseries_weather as met, field_data_dh_geofield as fgeo, raster_templates as rt where met.featureid = :covid and met.varid = :met_varid and met.tstime >= 441777600 and rt.varkey = :'resample_varkey' and met.tsendtime <= 1704085199 and fgeo.entity_type = 'dh_feature' and fgeo.entity_id = :fid and (fgeo.dh_geofield_geom && met.bbox ) order by met.tsendtime ;


#### 1 Year - with 3 calls to `st_summarystats()`

select met.featureid, to_timestamp(met.tsendtime) as obs_date, met.tid, met.tstime, met.tsendtime, extract(year from to_timestamp(met.tsendtime)) as yr, extract(month from to_timestamp(met.tsendtime)) as mo, extract(day from to_timestamp(met.tsendtime)) as da, extract(hour from to_timestamp(met.tsendtime)) as hr, (ST_summarystats(st_clip(st_resample(met.rast, rt.rast), fgeo.dh_geofield_geom), 1, TRUE)).count, (ST_summarystats(st_clip(st_resample(met.rast, rt.rast), fgeo.dh_geofield_geom), 1, TRUE)).mean as precip_mm, 0.0393701 (ST_summarystats(st_clip(st_resample(met.rast, rt.rast), fgeo.dh_geofield_geom), 1, TRUE)).mean as precip_in from dh_timeseries_weather as met, field_data_dh_geofield as fgeo, raster_templates as rt where met.featureid = :covid and met.varid = :met_varid and met.tstime >= 441777600 and rt.varkey = :'resample_varkey' and met.tsendtime <= 441777600 + 86400 365 and fgeo.entity_type = 'dh_feature' and fgeo.entity_id = :fid and (fgeo.dh_geofield_geom && met.bbox ) order by met.tsendtime ;


#### 1 Year - 1 call to `st_summarystats()`
- H51113 `Time: 53 seconds`
   - compare to time for 3 calls to st_summarystats: `Time: 157599.771 ms (02:37.600)`
- 

select met.featureid, to_timestamp(met.tsendtime) as obs_date, met.tid, met.tstime, met.tsendtime, extract(year from to_timestamp(met.tsendtime)) as yr, extract(month from to_timestamp(met.tsendtime)) as mo, extract(day from to_timestamp(met.tsendtime)) as da, extract(hour from to_timestamp(met.tsendtime)) as hr, 0.0393701 (ST_summarystats(st_clip(st_resample(met.rast, rt.rast), fgeo.dh_geofield_geom), 1, TRUE)).mean as precip_in from dh_timeseries_weather as met, field_data_dh_geofield as fgeo, raster_templates as rt where met.featureid = :covid and met.varid = :met_varid and met.tstime >= 441777600 and rt.varkey = :'resample_varkey' and met.tsendtime <= 441777600 + 86400 365 and fgeo.entity_type = 'dh_feature' and fgeo.entity_id = :fid and (fgeo.dh_geofield_geom && met.bbox ) order by met.tsendtime ;


#### 1-year with `st_summarystats()` using sub-query, and then extract 3 aspects
- with 50x50 tile: `Time: 54735.048 ms (00:54.735)`
   - pretty much same as 1 call example above
- with 16x16 tile: ``

select featureid, obs_date, yr, mo, da, hr, (stats).mean as precip_mm, 0.0393701 (stats).mean as precip_in, (stats).count from ( select met.featureid, to_timestamp(met.tsendtime) as obs_date, met.tid, met.tstime, met.tsendtime, extract(year from to_timestamp(met.tsendtime)) as yr, extract(month from to_timestamp(met.tsendtime)) as mo, extract(day from to_timestamp(met.tsendtime)) as da, extract(hour from to_timestamp(met.tsendtime)) as hr, (ST_summarystats(st_clip(st_resample(met.rast, rt.rast), fgeo.dh_geofield_geom), 1, TRUE)) as stats from dh_timeseries_weather as met, field_data_dh_geofield as fgeo, raster_templates as rt where met.featureid = :covid and met.varid = :met_varid and met.tstime >= 441777600 and rt.varkey = :'resample_varkey' and met.tsendtime <= 441777600 + 86400 365 and fgeo.entity_type = 'dh_feature' and fgeo.entity_id = :fid and (fgeo.dh_geofield_geom && met.bbox ) order by met.tsendtime ) as foo;


#### 1-year with sub-query and ST_UNION prior to raster summary
- H51113, nldas_precip_hourly_tiled: `Time: 160398.636 ms (02:40.399)`
- N51177, nldas_precip_hourly_tiled: `Time: 293739.298 ms (04:53.739)`

select featureid, obs_date, yr, mo, da, hr, (stats).mean as precip_mm, 0.0393701 (stats).mean as precip_in, (stats).count from ( select featureid, to_timestamp(bar.tsendtime) as obs_date, extract(year from to_timestamp(bar.tsendtime)) as yr, extract(month from to_timestamp(bar.tsendtime)) as mo, extract(day from to_timestamp(bar.tsendtime)) as da, extract(hour from to_timestamp(bar.tsendtime)) as hr, (ST_summarystats(st_clip(st_resample(bar.rast, rt.rast), fgeo.dh_geofield_geom), 1, TRUE)) as stats from ( select met.featureid, met.tstime, met.tsendtime, st_union(met.rast) as rast from dh_timeseries_weather as met, field_data_dh_geofield as fgeo where met.featureid = :covid and met.varid = :met_varid and met.tstime >= 441777600 and met.tsendtime <= 441777600 + 86400 365 and fgeo.entity_type = 'dh_feature' and fgeo.entity_id = :fid and (fgeo.dh_geofield_geom && met.bbox ) group by met.featureid, met.tstime, met.tsendtime order by met.tsendtime ) as bar, field_data_dh_geofield as fgeo, raster_templates as rt where rt.varkey = :'resample_varkey' and fgeo.entity_type = 'dh_feature' and fgeo.entity_id = :fid ) as foo;


#### Just extract a Union'ed Raster for overlapping tiles for 1-year
- This is fast: 
- H51113 (1 cell): `Time: 4597.188 ms (00:04.597)`
- N51177 (6 cells): `Time: 8319.706 ms (00:08.320)`

drop table rast_ts ; create temp table rast_ts as select tstime, tsendtime, rast, rnum from ( select met.featureid, met.tstime, met.tsendtime, st_union(met.rast) as rast, count(met.rast) as rnum from dh_timeseries_weather as met, field_data_dh_geofield as fgeo where met.featureid = :covid and met.varid = :met_varid and met.tstime >= 441777600 and met.tsendtime <= 441777600 + 86400 * 365 and fgeo.entity_type = 'dh_feature' and fgeo.entity_id = :fid and (fgeo.dh_geofield_geom && met.bbox ) group by met.featureid, met.tstime, met.tsendtime order by met.tsendtime ) as bar;


#### Union plus resample at same time

#### Resample after creating a temp table
- H51113 (1 cells), nldas2_precip_hourly_tiled: `Time: 147068.928 ms (02:27.069`
- N51177 (6 cells), nldas2_precip_hourly_tiled: `Time: 265888.826 ms (04:25.889)`

update rast_ts set rast = st_resample(rast, rt.rastem) from ( select rast as rastem from raster_templates where varkey = :'resample_varkey' ) as rt;


#### Use map algebra instead of st_clip 
- Time: 23003.892 ms (00:23.004)

select met.tsendtime, st_summarystats( st_mapalgebra( feat.rast, 1, met.rast, 1, '[rast1] * [rast2]'), 1, TRUE ) as precip_mm from ( select ST_AsRaster(geo.dh_geofield_geom, rt.rast, st_bandpixeltype(rt.rast, 1), 1.0) as rast from field_data_dh_geofield as geo, raster_templates as rt where geo.entity_id = :fid and rt.varkey = :'resample_varkey' ) as feat, res_rast_ts as met;


#### 1 year with sub-query and `st_union` before stats, plus `map_algebra` instead of `st_clip`
- H51113 (1 cells), nldas2_precip_hourly_tiled: Time: 210566.539 ms (03:30.567)

select featureid, obs_date, yr, mo, da, hr, (stats).mean as precip_mm, 0.0393701 (stats).mean as precip_in, (stats).count from ( select featureid, to_timestamp(bar.tsendtime) as obs_date, extract(year from to_timestamp(bar.tsendtime)) as yr, extract(month from to_timestamp(bar.tsendtime)) as mo, extract(day from to_timestamp(bar.tsendtime)) as da, extract(hour from to_timestamp(bar.tsendtime)) as hr, (ST_summarystats(st_mapalgebra( feat.rast, 1, st_resample(bar.rast, rt.rast), 1, '[rast1] [rast2]'), 1, TRUE)) as stats from ( select met.featureid, met.tstime, met.tsendtime, st_union(met.rast) as rast from dh_timeseries_weather as met, field_data_dh_geofield as fgeo where met.featureid = :covid and met.varid = :met_varid and met.tstime >= 441777600 and met.tsendtime <= 441777600 + 86400 * 365 and fgeo.entity_type = 'dh_feature' and fgeo.entity_id = :fid and (fgeo.dh_geofield_geom && met.bbox ) group by met.featureid, met.tstime, met.tsendtime order by met.tsendtime ) as bar, field_data_dh_geofield as fgeo, raster_templates as rt, ( select ST_AsRaster(geo.dh_geofield_geom, rt.rast, st_bandpixeltype(rt.rast, 1), 1.0) as rast from field_data_dh_geofield as geo, raster_templates as rt where geo.entity_id = :fid and rt.varkey = :'resample_varkey' ) as feat where rt.varkey = :'resample_varkey' and fgeo.entity_type = 'dh_feature' and fgeo.entity_id = :fid ) as foo;



#### Comparing Results
- First one, all in single query

| featureid |        obs_date        |    tid    |  tstime   | tsendtime |  yr  | mo | da | hr |       precip_in |
| --- |        -------------------------   |    --- |  ----   | ----- |  --  | -- | -- | -- | ---- |
| 617850 | 1984-01-05 03:00:00-05 | 206821290 | 442134000 | 442137600 | 1984 |  1 |  5 |  3 |   0.007274584990256395 |

- single query with `st_union` and `map_algebra `

| featureid |        obs_date        |  yr  | mo | da | hr |       precip_mm        |       precip_in        | count |
| --|        --|  --| ---| --| --|       --------|       ------| ----|
| 617850 | 1984-01-05 03:00:00-05 | 1984 |  1 |  5 |  3 |    0.18477436393881455 |   0.007274585185707522 |   156 |
COBrogan commented 2 months ago

@rburghol This behemoth of a query ran in 8 seconds to extract one year of data with clipping and resampling for the land segment. It takes about 30 seconds to run for usgs_ws_01668000 (1595 sq mi drainage area). For forty years with this land segment, the below query ran in 6 mins 17 seconds.

\set hydrocode 'H51113'
\set fname '/tmp/H51113-nldas2-all.csv'
\set band '1'
\set varkey 'nldas2_precip_hourly_tiled_16x16'
\set resample_varkey 'daymet_mod_daily'
-- sets all integer feature and varid with query 
select hydroid as covid from dh_feature where hydrocode = 'cbp6_met_coverage' \gset

WITH usgs_features AS (
  SELECT * 
  FROM  dh_feature
  WHERE hydrocode = :'hydrocode'
),
metUnion as (
    Select met.featureid, met.tsendtime,
        st_union(met.rast) as rast
    FROM usgs_features as f
    left outer join field_data_dh_geofield as fgeo
    on (
        fgeo.entity_id = f.hydroid
        and fgeo.entity_type = 'dh_feature' 
    ) 
    JOIN(
        select *
        from dh_timeseries_weather as met
        left outer join dh_variabledefinition as b
            on (met.varid = b.hydroid) 
        where extract(year from to_timestamp(met.tstime)) = 2023
            and b.varkey=:'varkey'
            and met.featureid = :covid
    ) AS met
    ON ST_Intersects(ST_ConvexHull(met.rast),fgeo.dh_geofield_geom)

    group by met.featureid, met.tsendtime
),
met as (
    Select met.featureid, to_timestamp(met.tsendtime) as obs_date,
        extract(year from to_timestamp(met.tsendtime)) as yr,
        extract(month from to_timestamp(met.tsendtime)) as mo,
        extract(day from to_timestamp(met.tsendtime)) as da,
        extract(hour from to_timestamp(met.tsendtime)) as hr,
        (ST_summarystats(st_clip(st_resample(met.rast,rt.rast), fgeo.dh_geofield_geom), :'band', TRUE)).mean as stats
    FROM usgs_features as f
    left outer join field_data_dh_geofield as fgeo
    on (
        fgeo.entity_id = f.hydroid
        and fgeo.entity_type = 'dh_feature' 
    ) 
    JOIN metUnion AS met
    ON ST_Intersects(ST_ConvexHull(met.rast),fgeo.dh_geofield_geom)
    left join (select rast from raster_templates where varkey = :'resample_varkey') as rt
    ON 1 = 1
)
select featureid, obs_date, yr, mo, da, hr, 
  0.0393701 * stats precip_in
from met
order by met.obs_date;
COBrogan commented 2 months ago

All 40 years of NLDAS has been retiled into dh_timeseries_weather with the varkey nldas2_precip_hourly_tiled_16x16

SQL Insert ``` sql insert into dh_timeseries_weather(tstime,tsendtime, varid, featureid, entity_type, rast) select a.tstime, a.tsendtime, c.hydroid as varid, a.featureid, a.entity_type, st_tile(a.rast, 1, 16, 16) as rast from dh_timeseries_weather as a left outer join dh_variabledefinition as b on (a.varid = b.hydroid) left outer join dh_variabledefinition as c on (c.varkey = 'nldas2_precip_hourly_tiled_16x16') where b.varkey='nldas2_obs_hourly' and c.varkey is not null; ```

A new workflow is available to reference these, stored under the configs nldas2_tiled and stormVol_nldas2_tiled. The only change is the varkey. I also updated nldas2_resampled and nldas2_resamptile to reference the tiled dataset. I tested nldas2_tiled and had no issues:

sbatch /opt/model/meta_model/run_model raster_met nldas2_tiled usgs_ws_01668000 auto geo
COBrogan commented 2 months ago

The above query uses ST_Intersects and takes about 18 seconds to run for 2023 for landsegment H51113. To try and improve efficiency, I replaced the ST_Intersects call with a simple && referencing dh_timeseries_weather.bbox. The query ran in 13 seconds and was faster. Several SO posts imply && should either be equivalent to ST_Intersects or faster as it does not create the bounding box intersection.

\set hydrocode 'H51113'
\set fname '/tmp/H51113-nldas2-all.csv'
\set band '1'
\set varkey 'nldas2_precip_hourly_tiled_16x16'
\set resample_varkey 'daymet_mod_daily'
-- sets all integer feature and varid with query 
select hydroid as covid from dh_feature where hydrocode = 'cbp6_met_coverage' \gset

WITH usgs_features AS (
  SELECT * 
  FROM  dh_feature
  WHERE hydrocode = :'hydrocode'
),
metUnion as (
    Select met.featureid, met.tsendtime,
        st_union(met.rast) as rast
    FROM usgs_features as f
    left outer join field_data_dh_geofield as fgeo
    on (
        fgeo.entity_id = f.hydroid
        and fgeo.entity_type = 'dh_feature' 
    ) 
    JOIN(
        select *
        from dh_timeseries_weather as met
        left outer join dh_variabledefinition as b
            on (met.varid = b.hydroid) 
        where extract(year from to_timestamp(met.tstime)) = 2023
            and b.varkey=:'varkey'
            and met.featureid = :covid
    ) AS met
    ON fgeo.dh_geofield_geom && met.bbox

    group by met.featureid, met.tsendtime
),
met as (
    Select met.featureid, to_timestamp(met.tsendtime) as obs_date,
        extract(year from to_timestamp(met.tsendtime)) as yr,
        extract(month from to_timestamp(met.tsendtime)) as mo,
        extract(day from to_timestamp(met.tsendtime)) as da,
        extract(hour from to_timestamp(met.tsendtime)) as hr,
        (ST_summarystats(st_clip(st_resample(met.rast,rt.rast), fgeo.dh_geofield_geom), :'band', TRUE)).mean as stats
    FROM usgs_features as f
    left outer join field_data_dh_geofield as fgeo
    on (
        fgeo.entity_id = f.hydroid
        and fgeo.entity_type = 'dh_feature' 
    ) 
    JOIN metUnion AS met
    ON ST_Intersects(ST_ConvexHull(met.rast),fgeo.dh_geofield_geom)
    left join (select rast from raster_templates where varkey = :'resample_varkey') as rt
    ON 1 = 1
)
select featureid, obs_date, yr, mo, da, hr, 
  0.0393701 * stats precip_in
from met
order by met.obs_date;

Building off this, the full forty year period ran in 8 minutes and 22 seconds using ST_Intersection but only 8 minutes and 4 seconds via &&. Minor time savings, so I'll update calc_raster_ts and our resample workflow

rburghol commented 2 months ago

@COBrogan dang that 18 to 13 second drop for a single year had me pretty stoked... tho saving only 18 seconds on the full 40 years took the wind outta my sails!!