fi-ts / postgreslet

A small controller which act as bridge between zalando-postgres-operator
GNU Affero General Public License v3.0
6 stars 0 forks source link

Add postgres-exporter services and servicemonitor for every postgres-operator environment #9

Open y16144 opened 3 years ago

y16144 commented 3 years ago
apiVersion: v1
kind: Service
metadata:
  labels:
    application: spilo
    app: postgres-exporter
  name: postgres-exporter
  namespace: {{ operator_config.namespace }}
spec:
  ports:
  - name: metrics
    port: 9187
    protocol: TCP
    targetPort: 9187
  selector:
    application: spilo
  type: ClusterIP
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ operator_config.name_prometheus_svc }}
  namespace: {{ operator_config.namespace }}
  labels:
    app: postgres-exporter
    release: prometheus
spec:
  endpoints:
    - port: metrics
  namespaceSelector:
    matchNames:
    - {{ operator_config.namespace }}
  selector:
    matchLabels:
      app: postgres-exporter
apiVersion: v1
kind: ConfigMap
metadata:
  name: postgres-exporter-configmap
  namespace: "{{ operator_config.namespace }}"
data:
  queries.yaml  : |+

    pg_postmaster:
      query: "SELECT pg_postmaster_start_time as start_time_seconds from pg_postmaster_start_time()"
      master: true
      metrics:
        - start_time_seconds:
            usage: "GAUGE"
            description: "Time at which postmaster started"

    pg_is_in_recovery:
      query: "SELECT CASE WHEN pg_is_in_recovery = true THEN 1 ELSE 2 END AS status from pg_is_in_recovery();"
      metrics:
        - status:
            usage: "GAUGE"
            description: "Return value of 1 means database is in recovery. Otherwise 2 it is a primary."

    pg_replication_lag:
      query: "SELECT
               CASE
               WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
               ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())::INTEGER
               END
            AS replay_time"
      metrics:
        - replay_time:
            usage: "GAUGE"
            description: "Length of time since the last transaction was replayed on replica. Will always increase if no writes on primary."

    pg_replication_global_status:
      query: "SELECT (extract(epoch from now()) * 1e9)::int8 as epoch_ns, application_name as tag_application_name,
              concat(coalesce(client_addr::text, client_hostname), '_', client_port::text) as tag_client_info,
              coalesce(pg_wal_lsn_diff(pg_current_wal_lsn(), write_lsn)::int8, 0) as write_lag_b,
              coalesce(pg_wal_lsn_diff(pg_current_wal_lsn(), flush_lsn)::int8, 0) as flush_lag_b,
              coalesce(pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn)::int8, 0) as replay_lag_b,
              state,
              sync_state,
              case when sync_state in ('sync', 'quorum') then 1 else 0 end as is_sync_int
            from
              pg_catalog.pg_stat_replication"
      metrics:
        - tag_application_name:
            usage: "LABEL"
            description: "Replication Database (Standby)"
        - tag_client_info:
            usage: "LABEL"
            description: "Replication Client Info (Standby)"
        - state:
            usage: "LABEL"
            description: "Replication State"
        - sync_state:
            usage: "LABEL"
            description: "Replication Sync State"
        - write_lag_b:
            usage: "GAUGE"
            description: "Replication Write Lag Master"
        - flush_lag_b:
            usage: "GAUGE"
            description: "Replication Flush Lag Master"
        - replay_lag_b:
            usage: "GAUGE"
            description: "Replication Replay Lag Master"

    pg_replication_global_status_standby:
      query: "select
             (extract(epoch from now()) * 1e9)::int8 as epoch_ns,
             pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) as replay_lag_b,
             extract(epoch from (now() - pg_last_xact_replay_timestamp()))::int8 as last_replay_s;"
      metrics: 
        - replay_lag_b:
            usage: "GAUGE"
            description: "Replication Replay Lag Standby"
        - last_replay_s:
            usage: "GAUGE"
            description: "Last Replication Lag Time Standby"

    pg_replication_lag_size:
      query: "SELECT client_addr as replica
            , client_hostname as replica_hostname
            , client_port as replica_port
            , pg_wal_lsn_diff(sent_lsn, replay_lsn) as bytes 
            FROM pg_catalog.pg_stat_replication"
      metrics:
        - replica:
            usage: "LABEL"
            description: "Replica address"
        - replica_hostname:
            usage: "LABEL"
            description: "Replica hostname"
        - replica_port:
            usage: "LABEL"
            description: "Replica port"
        - bytes:
            usage: "GAUGE"
            description: "Replication lag in bytes"

    pg_replication_slots:
      query: "SELECT slot_name, active::int, pg_wal_lsn_diff(pg_current_wal_insert_lsn(), restart_lsn) AS retained_bytes FROM pg_catalog.pg_replication_slots"
      metrics:
        - slot_name:
            usage: "LABEL"
            description: "Name of replication slot"
        - active: 
            usage: "GAUGE" 
            description: "Active state of slot. 1 = true. 0 = false."
        - retained_bytes:
            usage: "GAUGE"
            description: "The amount of WAL (in bytes) being retained for this slot"

    pg_wal_activity:
      query: "SELECT last_5_min_size_bytes,
          (SELECT COALESCE(sum(size),0) FROM pg_catalog.pg_ls_waldir()) AS total_size_bytes
          FROM (SELECT COALESCE(sum(size),0) AS last_5_min_size_bytes FROM pg_catalog.pg_ls_waldir() WHERE modification > CURRENT_TIMESTAMP - '5 minutes'::interval) x;"
      metrics:
        - last_5_min_size_bytes:
            usage: "GAUGE"
            description: "Current size in bytes of the last 5 minutes of WAL generation. Includes recycled WALs."
        - total_size_bytes:
            usage: "GAUGE"
            description: "Current size in bytes of the WAL directory"

    pg_stat_wal_receiver:
      query: |
        SELECT case status when 'stopped' then 0 when 'starting' then 1 when 'streaming' then 2 when 'waiting' then 3 when 'restarting' then 4 when 'stopping' then 5 else -1 end as status,
               (receive_start_lsn- '0/0') % (2^52)::bigint as receive_start_lsn,
               receive_start_tli,
               (received_lsn- '0/0') % (2^52)::bigint as received_lsn,
               received_tli,
               extract(epoch from last_msg_send_time) as last_msg_send_time,
               extract(epoch from last_msg_receipt_time) as last_msg_receipt_time,
               (latest_end_lsn - '0/0') % (2^52)::bigint as latest_end_lsn,
               extract(epoch from latest_end_time) as latest_end_time,
               substring(slot_name from 'repmgr_slot_([0-9]*)') as upstream_node,
               trim(both '''' from substring(conninfo from 'host=([^ ]*)')) as upstream_host,
               slot_name
          FROM pg_stat_wal_receiver
      metrics:
        - status:
            usage: "GAUGE"
            description: "Activity status of the WAL receiver process (0=stopped 1=starting 2=streaming 3=waiting 4=restarting 5=stopping)"
        - receive_start_lsn:
            usage: "COUNTER"
            description: "First transaction log position used when WAL receiver is started"
        - receive_start_tli:
            usage: "GAUGE"
            description: "First timeline number used when WAL receiver is started"
        - received_lsn:
            usage: "COUNTER"
            description: "Last transaction log position already received and flushed to disk, the initial value of this field being the first log position used when WAL receiver is started"
        - received_tli:
            usage: "GAUGE"
            description: "Timeline number of last transaction log position received and flushed to disk, the initial value of this field being the timeline number of the first log position used when WAL receiver is started"
        - last_msg_send_time:
            usage: "COUNTER"
            description: "Send time of last message received from origin WAL sender"
        - last_msg_receipt_time:
            usage: "COUNTER"
            description: "Receipt time of last message received from origin WAL sender"
        - latest_end_lsn:
            usage: "COUNTER"
            description: "Last transaction log position reported to origin WAL sender"
        - latest_end_time:
            usage: "COUNTER"
            description: "Time of last transaction log position reported to origin WAL sender"
        - upstream_node:
            usage: "GAUGE"
            description: "The repmgr node from the upstream slot name"
        - upstream_host:
            usage: "LABEL"
            description: "The upstream host this node is replicating from"
        - slot_name:
            usage: "LABEL"
            description: "The upstream slot_name this node is replicating from"

    pg_archive_command_status:
      query: "SELECT CASE 
        WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) IS NULL THEN 0
        WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) < 0 THEN 0
        ELSE EXTRACT(epoch from (last_failed_time - last_archived_time)) 
        END AS seconds_since_last_fail
        FROM pg_catalog.pg_stat_archiver"
      metrics:
        - seconds_since_last_fail:
            usage: "GAUGE"
            description: "Seconds since the last recorded failure of the archive_command"

    pg_stat_user_tables:
      query: "SELECT current_database() datname, schemaname, relname, seq_scan, seq_tup_read, idx_scan, idx_tup_fetch, n_tup_ins, n_tup_upd, n_tup_del, n_tup_hot_upd, n_live_tup, n_dead_tup, n_mod_since_analyze, COALESCE(last_vacuum, '1970-01-01Z'), COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum, COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum, COALESCE(last_analyze, '1970-01-01Z') as last_analyze, COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze, vacuum_count, autovacuum_count, analyze_count, autoanalyze_count FROM pg_stat_user_tables"
      metrics:
        - datname:
            usage: "LABEL"
            description: "Name of current database"
        - schemaname:
            usage: "LABEL"
            description: "Name of the schema that this table is in"
        - relname:
            usage: "LABEL"
            description: "Name of this table"
        - seq_scan:
            usage: "COUNTER"
            description: "Number of sequential scans initiated on this table"
        - seq_tup_read:
            usage: "COUNTER"
            description: "Number of live rows fetched by sequential scans"
        - idx_scan:
            usage: "COUNTER"
            description: "Number of index scans initiated on this table"
        - idx_tup_fetch:
            usage: "COUNTER"
            description: "Number of live rows fetched by index scans"
        - n_tup_ins:
            usage: "COUNTER"
            description: "Number of rows inserted"
        - n_tup_upd:
            usage: "COUNTER"
            description: "Number of rows updated"
        - n_tup_del:
            usage: "COUNTER"
            description: "Number of rows deleted"
        - n_tup_hot_upd:
            usage: "COUNTER"
            description: "Number of rows HOT updated (i.e., with no separate index update required)"
        - n_live_tup:
            usage: "GAUGE"
            description: "Estimated number of live rows"
        - n_dead_tup:
            usage: "GAUGE"
            description: "Estimated number of dead rows"
        - n_mod_since_analyze:
            usage: "GAUGE"
            description: "Estimated number of rows changed since last analyze"
        - last_vacuum:
            usage: "GAUGE"
            description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)"
        - last_autovacuum:
            usage: "GAUGE"
            description: "Last time at which this table was vacuumed by the autovacuum daemon"
        - last_analyze:
            usage: "GAUGE"
            description: "Last time at which this table was manually analyzed"
        - last_autoanalyze:
            usage: "GAUGE"
            description: "Last time at which this table was analyzed by the autovacuum daemon"
        - vacuum_count:
            usage: "COUNTER"
            description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)"
        - autovacuum_count:
            usage: "COUNTER"
            description: "Number of times this table has been vacuumed by the autovacuum daemon"
        - analyze_count:
            usage: "COUNTER"
            description: "Number of times this table has been manually analyzed"
        - autoanalyze_count:
            usage: "COUNTER"
            description: "Number of times this table has been analyzed by the autovacuum daemon"

    pg_statio_user_tables:
      query: "SELECT current_database() datname, schemaname, relname, heap_blks_read, heap_blks_hit, idx_blks_read, idx_blks_hit, toast_blks_read, toast_blks_hit, tidx_blks_read, tidx_blks_hit FROM pg_statio_user_tables"
      metrics:
        - datname:
            usage: "LABEL"
            description: "Name of current database"
        - schemaname:
            usage: "LABEL"
            description: "Name of the schema that this table is in"
        - relname:
            usage: "LABEL"
            description: "Name of this table"
        - heap_blks_read:
            usage: "COUNTER"
            description: "Number of disk blocks read from this table"
        - heap_blks_hit:
            usage: "COUNTER"
            description: "Number of buffer hits in this table"
        - idx_blks_read:
            usage: "COUNTER"
            description: "Number of disk blocks read from all indexes on this table"
        - idx_blks_hit:
            usage: "COUNTER"
            description: "Number of buffer hits in all indexes on this table"
        - toast_blks_read:
            usage: "COUNTER"
            description: "Number of disk blocks read from this table's TOAST table (if any)"
        - toast_blks_hit:
            usage: "COUNTER"
            description: "Number of buffer hits in this table's TOAST table (if any)"
        - tidx_blks_read:
            usage: "COUNTER"
            description: "Number of disk blocks read from this table's TOAST table indexes (if any)"
        - tidx_blks_hit:
            usage: "COUNTER"
            description: "Number of buffer hits in this table's TOAST table indexes (if any)"

    pg_wal_activity:
      query: "SELECT last_5_min_size_bytes,
          (SELECT COALESCE(sum(size),0) FROM pg_catalog.pg_ls_waldir()) AS total_size_bytes
          FROM (SELECT COALESCE(sum(size),0) AS last_5_min_size_bytes FROM pg_catalog.pg_ls_waldir() WHERE modification > CURRENT_TIMESTAMP - '5 minutes'::interval) x;"
      metrics:
        - last_5_min_size_bytes:
            usage: "GAUGE"
            description: "Current size in bytes of the last 5 minutes of WAL generation. Includes recycled WALs."
        - total_size_bytes:
            usage: "GAUGE"
            description: "Current size in bytes of the WAL directory"

    pg_database:
      query: "SELECT pg_database.datname, pg_database_size(pg_database.datname) as size FROM pg_database"
      metrics:
        - datname:
            usage: "LABEL"
            description: "Name of the database"
        - size:
            usage: "GAUGE"
            description: "Disk space used by the database"
eberlep commented 3 years ago

ServiceMonitor still missing. Prometheus Federation needs some discussion.