tembo-io / pg_timeseries

Simple and focused time-series tables for PostgreSQL, from Tembo
PostgreSQL License
305 stars 5 forks source link

Use assignment instead of select into to set variables in last_agg and first_agg #22

Closed brianpursley closed 3 months ago

brianpursley commented 3 months ago

I don't know if there is a specific reason to use select into versus assignment to set old_rank, but changing it to use assignment cuts execution time in half for me: 3.8s --> 1.8s in the test below, and similar relative improvements on larger tests I ran.


CREATE TABLE sensor_readings (
    timestamp TIMESTAMPTZ NOT NULL,
    location_id INT NOT NULL,
    temperature DECIMAL(5, 2) NOT NULL,
    humidity DECIMAL(5, 2) NOT NULL,
    voc DECIMAL(7, 2) NOT NULL
) PARTITION BY RANGE (timestamp);
CREATE INDEX idx_sensor_readings_timestamp ON sensor_readings (timestamp);
CLUSTER sensor_readings USING idx_sensor_readings_timestamp;
CREATE INDEX idx_sensor_readings_location_id ON sensor_readings (location_id);
SELECT enable_ts_table('sensor_readings');

DO
$$
DECLARE
    start_date TIMESTAMPTZ := '2024-01-01 00:00:00+00';
    end_date TIMESTAMPTZ := '2024-05-01 00:00:00+00';
    c TIMESTAMPTZ;
    location_id INT;
    temperature DECIMAL(5, 2);
    humidity DECIMAL(5, 2);
    voc DECIMAL(7, 2);
BEGIN
    c := start_date;
    WHILE c < end_date LOOP
        FOR location_id IN 1..10 LOOP
            IF random() > 0.1 THEN
                temperature := 20 + (random() * 15);
                humidity := 30 + (random() * 70);
                voc := 100 + (random() * 900);
                INSERT INTO sensor_readings (timestamp, location_id, temperature, humidity, voc)
                VALUES (c, location_id, temperature, humidity, voc);
            END IF;
        END LOOP;
        c := c + INTERVAL '5 minute';
    END LOOP;
END;
$$;

select count(*) from sensor_readings;

/*
 count  
--------
 313733
(1 row)
*/

select location_id, last(temperature, timestamp) from sensor_readings group by location_id order by location_id;

/*
 location_id | last  
-------------+-------
           1 | 20.48
           2 | 25.03
           3 | 26.39
           4 | 33.50
           5 | 31.57
           6 | 20.93
           7 | 34.23
           8 | 22.32
           9 | 32.09
          10 | 27.67
(10 rows)
*/

explain analyze select location_id, last(temperature, timestamp) from sensor_readings group by location_id order by location_id;

/*
                                                                             QUERY PLAN                                                                              
---------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Sort  (cost=89193.02..89193.52 rows=200 width=36) (actual time=3841.259..3841.266 rows=10 loops=1)
   Sort Key: sensor_readings.location_id
   Sort Method: quicksort  Memory: 25kB
   ->  HashAggregate  (cost=89133.38..89185.38 rows=200 width=36) (actual time=3841.230..3841.256 rows=10 loops=1)
         Group Key: sensor_readings.location_id
         Batches: 1  Memory Usage: 40kB
         ->  Append  (cost=0.00..7552.39 rows=323093 width=18) (actual time=0.007..130.770 rows=313733 loops=1)
               ->  Seq Scan on sensor_readings_p20240423 sensor_readings_1  (cost=0.00..333.33 rows=18133 width=18) (actual time=0.007..2.595 rows=18133 loops=1)
               ->  Seq Scan on sensor_readings_p20240430 sensor_readings_2  (cost=0.00..47.94 rows=2594 width=18) (actual time=0.009..0.323 rows=2594 loops=1)
               ->  Seq Scan on sensor_readings_p20240507 sensor_readings_3  (cost=0.00..20.40 rows=1040 width=24) (actual time=0.003..0.003 rows=0 loops=1)
               ->  Seq Scan on sensor_readings_p20240514 sensor_readings_4  (cost=0.00..20.40 rows=1040 width=24) (actual time=0.002..0.002 rows=0 loops=1)
               ->  Seq Scan on sensor_readings_p20240521 sensor_readings_5  (cost=0.00..20.40 rows=1040 width=24) (actual time=0.003..0.003 rows=0 loops=1)
               ->  Seq Scan on sensor_readings_p20240528 sensor_readings_6  (cost=0.00..20.40 rows=1040 width=24) (actual time=0.002..0.003 rows=0 loops=1)
               ->  Seq Scan on sensor_readings_p20240604 sensor_readings_7  (cost=0.00..20.40 rows=1040 width=24) (actual time=0.002..0.002 rows=0 loops=1)
               ->  Seq Scan on sensor_readings_p20240611 sensor_readings_8  (cost=0.00..20.40 rows=1040 width=24) (actual time=0.002..0.002 rows=0 loops=1)
               ->  Seq Scan on sensor_readings_p20240618 sensor_readings_9  (cost=0.00..20.40 rows=1040 width=24) (actual time=0.002..0.002 rows=0 loops=1)
               ->  Seq Scan on sensor_readings_p20240625 sensor_readings_10  (cost=0.00..20.40 rows=1040 width=24) (actual time=0.002..0.002 rows=0 loops=1)
               ->  Seq Scan on sensor_readings_p20240702 sensor_readings_11  (cost=0.00..20.40 rows=1040 width=24) (actual time=0.002..0.002 rows=0 loops=1)
               ->  Seq Scan on sensor_readings_default sensor_readings_12  (cost=0.00..5372.06 rows=293006 width=18) (actual time=0.004..36.851 rows=293006 loops=1)
 Planning Time: 0.258 ms
 Execution Time: 3841.316 ms
(21 rows)
*/

CREATE OR REPLACE FUNCTION public.last_agg(state text[], value anyelement, rank anycompatible)
 RETURNS text[]
 LANGUAGE plpgsql
AS $function$
DECLARE
  new_state text[] := state;
  old_rank rank%TYPE;
BEGIN
  IF rank IS NULL THEN
    RETURN new_state;
  END IF;

  old_rank := state[2];
  IF (state IS NULL) OR
     (rank > old_rank) THEN
    new_state := ARRAY[value::text, rank::text];
  END IF;

  RETURN new_state;
END;
$function$;

select location_id, last(temperature, timestamp) from sensor_readings group by location_id order by location_id;

/*
 location_id | last  
-------------+-------
           1 | 20.48
           2 | 25.03
           3 | 26.39
           4 | 33.50
           5 | 31.57
           6 | 20.93
           7 | 34.23
           8 | 22.32
           9 | 32.09
          10 | 27.67
(10 rows)
*/

explain analyze select location_id, last(temperature, timestamp) from sensor_readings group by location_id order by location_id;

/*
                                                                              QUERY PLAN                                                                              
----------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Sort  (cost=89193.02..89193.52 rows=200 width=36) (actual time=1822.059..1822.067 rows=10 loops=1)
   Sort Key: sensor_readings.location_id
   Sort Method: quicksort  Memory: 25kB
   ->  HashAggregate  (cost=89133.38..89185.38 rows=200 width=36) (actual time=1822.028..1822.058 rows=10 loops=1)
         Group Key: sensor_readings.location_id
         Batches: 1  Memory Usage: 40kB
         ->  Append  (cost=0.00..7552.39 rows=323093 width=18) (actual time=0.007..353.218 rows=313733 loops=1)
               ->  Seq Scan on sensor_readings_p20240423 sensor_readings_1  (cost=0.00..333.33 rows=18133 width=18) (actual time=0.007..2.070 rows=18133 loops=1)
               ->  Seq Scan on sensor_readings_p20240430 sensor_readings_2  (cost=0.00..47.94 rows=2594 width=18) (actual time=0.011..0.318 rows=2594 loops=1)
               ->  Seq Scan on sensor_readings_p20240507 sensor_readings_3  (cost=0.00..20.40 rows=1040 width=24) (actual time=0.003..0.003 rows=0 loops=1)
               ->  Seq Scan on sensor_readings_p20240514 sensor_readings_4  (cost=0.00..20.40 rows=1040 width=24) (actual time=0.002..0.002 rows=0 loops=1)
               ->  Seq Scan on sensor_readings_p20240521 sensor_readings_5  (cost=0.00..20.40 rows=1040 width=24) (actual time=0.003..0.003 rows=0 loops=1)
               ->  Seq Scan on sensor_readings_p20240528 sensor_readings_6  (cost=0.00..20.40 rows=1040 width=24) (actual time=0.003..0.003 rows=0 loops=1)
               ->  Seq Scan on sensor_readings_p20240604 sensor_readings_7  (cost=0.00..20.40 rows=1040 width=24) (actual time=0.002..0.003 rows=0 loops=1)
               ->  Seq Scan on sensor_readings_p20240611 sensor_readings_8  (cost=0.00..20.40 rows=1040 width=24) (actual time=0.002..0.002 rows=0 loops=1)
               ->  Seq Scan on sensor_readings_p20240618 sensor_readings_9  (cost=0.00..20.40 rows=1040 width=24) (actual time=0.002..0.002 rows=0 loops=1)
               ->  Seq Scan on sensor_readings_p20240625 sensor_readings_10  (cost=0.00..20.40 rows=1040 width=24) (actual time=0.002..0.002 rows=0 loops=1)
               ->  Seq Scan on sensor_readings_p20240702 sensor_readings_11  (cost=0.00..20.40 rows=1040 width=24) (actual time=0.002..0.002 rows=0 loops=1)
               ->  Seq Scan on sensor_readings_default sensor_readings_12  (cost=0.00..5372.06 rows=293006 width=18) (actual time=0.005..262.178 rows=293006 loops=1)
 Planning Time: 0.250 ms
 Execution Time: 1822.119 ms
(21 rows)
*/
ChuckHend commented 3 months ago

Thank you @brianpursley