Additions to queries.yml for the postgres exporter

We are considering using the omnibus postgres exporter for the repeatable db provisioning effort and would like to use the omnibus postgres exporter.
In omnibus there is a template to configure it, though it is missing some sections that we currently have on .com production that we would need to add. It's possible given that this is very specific to .com it might be better to reference one managed outside of omnibus, or we could extend the template a bit more. This is not a serious blocker for the project, but it will mean that we can't use the bundled exporter.
Here is the production queries.yml, there are several sections like:
pg_integer_capacity
pg_replication_slots
pg_xlog_position
pg_stat_wal_receiver
Obviously some of these are pretty .com specific.
pg_replication:
  query: "SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))::INT as lag, CASE WHEN pg_is_in_recovery() THEN 1 ELSE 0 END as is_replica"
  metrics:
    - lag:
        usage: "GAUGE"
        description: "Replication lag behind master in seconds"
    - is_replica:
        usage: "GAUGE"
        description: "Indicates if this host is a slave"

# Note that upstream_host and slot_name labels will cause broken time
# series which can be worked around using "ignoring" but it seems
# better to have these labels and have to work around them than to
# lose the data?

# Also in 9.6 and prior this view, like the pg_stat_statements view
# below requires a hacky SECURITY DEFINER function to grant access to
# the data inside as they filter out data for non-superuser even if
# you've granted access explicitly.
# Below statement must be executed to grant postgres-exporter user proper permissions

# CREATE FUNCTION public.f_pg_stat_wal_receiver() RETURNS SETOF pg_stat_wal_receiver
#     LANGUAGE sql SECURITY DEFINER
#     AS $$select * from pg_catalog.pg_stat_wal_receiver$$;


# ALTER FUNCTION public.f_pg_stat_wal_receiver() OWNER TO "gitlab-psql";

# CREATE VIEW postgres_exporter.pg_stat_wal_receiver AS
#  SELECT f_pg_stat_wal_receiver.pid,
#     f_pg_stat_wal_receiver.status,
#     f_pg_stat_wal_receiver.receive_start_lsn,
#     f_pg_stat_wal_receiver.receive_start_tli,
#     f_pg_stat_wal_receiver.received_lsn,
#     f_pg_stat_wal_receiver.received_tli,
#     f_pg_stat_wal_receiver.last_msg_send_time,
#     f_pg_stat_wal_receiver.last_msg_receipt_time,
#     f_pg_stat_wal_receiver.latest_end_lsn,
#     f_pg_stat_wal_receiver.latest_end_time,
#     f_pg_stat_wal_receiver.slot_name,
#     f_pg_stat_wal_receiver.conninfo
#    FROM public.f_pg_stat_wal_receiver() f_pg_stat_wal_receiver(pid, status, receive_start_lsn, receive_start_tli, received_lsn, received_tli, last_msg_send_time, last_msg_receipt_time, latest_end_lsn, latest_end_time, slot_name, conninfo);

# GRANT SELECT ON TABLE postgres_exporter.pg_stat_wal_receiver TO postgres_exporter;

pg_stat_wal_receiver:
  query: |
    SELECT case status when 'stopped' then 0 when 'starting' then 1 when 'streaming' then 2 when 'waiting' then 3 when 'restarting' then 4 when 'stopping' then 5 else -1 end as status,
           (receive_start_lsn- '0/0') % (2^52)::bigint as receive_start_lsn,
           receive_start_tli,
           (received_lsn- '0/0') % (2^52)::bigint as received_lsn,
           received_tli,
           extract(epoch from last_msg_send_time) as last_msg_send_time,
           extract(epoch from last_msg_receipt_time) as last_msg_receipt_time,
           (latest_end_lsn - '0/0') % (2^52)::bigint as latest_end_lsn,
           extract(epoch from latest_end_time) as latest_end_time,
           substring(slot_name from 'repmgr_slot_([0-9]*)') as upstream_node,
           trim(both '''' from substring(conninfo from 'host=([^ ]*)')) as upstream_host,
           slot_name
      FROM pg_stat_wal_receiver
  metrics:
    - status:
        usage: "GAUGE"
        description: "Activity status of the WAL receiver process (0=stopped 1=starting 2=streaming 3=waiting 4=restarting 5=stopping)"
    - receive_start_lsn:
        usage: "COUNTER"
        description: "First transaction log position used when WAL receiver is started"
    - receive_start_tli:
        usage: "GAUGE"
        description: "First timeline number used when WAL receiver is started"
    - received_lsn:
        usage: "COUNTER"
        description: "Last transaction log position already received and flushed to disk, the initial value of this field being the first log position used when WAL receiver is started"
    - received_tli:
        usage: "GAUGE"
        description: "Timeline number of last transaction log position received and flushed to disk, the initial value of this field being the timeline number of the first log position used when WAL receiver is started"
    - last_msg_send_time:
        usage: "COUNTER"
        description: "Send time of last message received from origin WAL sender"
    - last_msg_receipt_time:
        usage: "COUNTER"
        description: "Receipt time of last message received from origin WAL sender"
    - latest_end_lsn:
        usage: "COUNTER"
        description: "Last transaction log position reported to origin WAL sender"
    - latest_end_time:
        usage: "COUNTER"
        description: "Time of last transaction log position reported to origin WAL sender"
    - upstream_node:
        usage: "GAUGE"
        description: "The repmgr node from the upstream slot name"
    - upstream_host:
        usage: "LABEL"
        description: "The upstream host this node is replicating from"
    - slot_name:
        usage: "LABEL"
        description: "The upstream slot_name this node is replicating from"

pg_archiver:
  query: |
    WITH
      current_wal_file AS (
         SELECT CASE WHEN NOT pg_is_in_recovery() THEN pg_xlogfile_name(pg_current_xlog_insert_location()) ELSE NULL END pg_xlogfile_name
      ),
      current_wal AS (
        SELECT
          ('x'||substring(pg_xlogfile_name,9,8))::bit(32)::int log,
          ('x'||substring(pg_xlogfile_name,17,8))::bit(32)::int seg,
          pg_xlogfile_name
        FROM current_wal_file
      ),
      archive_wal AS(
        SELECT
          ('x'||substring(last_archived_wal,9,8))::bit(32)::int log,
          ('x'||substring(last_archived_wal,17,8))::bit(32)::int seg,
          last_archived_wal
        FROM pg_stat_archiver
      )
    SELECT coalesce(((cw.log - aw.log) * 256) + (cw.seg-aw.seg),'NaN'::float) as pending_wal_count FROM current_wal cw, archive_wal aw
  metrics:
    - pending_wal_count:
        usage: "GAUGE"
        description: "No. of pending WAL files to be archived"

pg_postmaster:
  query: "SELECT pg_postmaster_start_time as start_time_seconds from pg_postmaster_start_time()"
  metrics:
    - start_time_seconds:
        usage: "GAUGE"
        description: "Time at which postmaster started"

pg_stat_user_tables:
  query: |
    SELECT
      current_database() datname,
      schemaname,
      relname,
      seq_scan,
      seq_tup_read,
      idx_scan,
      idx_tup_fetch,
      n_tup_ins,
      n_tup_upd,
      n_tup_del,
      n_tup_hot_upd,
      n_live_tup,
      n_dead_tup,
      GREATEST(last_autovacuum, last_vacuum, '1970-01-01Z') as last_vacuum,
      GREATEST(last_autoanalyze, last_analyze, '1970-01-01Z') as last_analyze,
      (vacuum_count + autovacuum_count) as vacuum_count,
      (analyze_count + autoanalyze_count) as analyze_count
    FROM
      pg_stat_user_tables
  metrics:
    - datname:
        usage: "LABEL"
        description: "Name of current database"
    - schemaname:
        usage: "LABEL"
        description: "Name of the schema that this table is in"
    - relname:
        usage: "LABEL"
        description: "Name of this table"
    - seq_scan:
        usage: "COUNTER"
        description: "Number of sequential scans initiated on this table"
    - seq_tup_read:
        usage: "COUNTER"
        description: "Number of live rows fetched by sequential scans"
    - idx_scan:
        usage: "COUNTER"
        description: "Number of index scans initiated on this table"
    - idx_tup_fetch:
        usage: "COUNTER"
        description: "Number of live rows fetched by index scans"
    - n_tup_ins:
        usage: "COUNTER"
        description: "Number of rows inserted"
    - n_tup_upd:
        usage: "COUNTER"
        description: "Number of rows updated"
    - n_tup_del:
        usage: "COUNTER"
        description: "Number of rows deleted"
    - n_tup_hot_upd:
        usage: "COUNTER"
        description: "Number of rows HOT updated (i.e., with no separate index update required)"
    - n_live_tup:
        usage: "GAUGE"
        description: "Estimated number of live rows"
    - n_dead_tup:
        usage: "GAUGE"
        description: "Estimated number of dead rows"
    - last_vacuum:
        usage: "GAUGE"
        description: "Last time at which this table was vacuumed (not counting VACUUM FULL)"
    - last_analyze:
        usage: "GAUGE"
        description: "Last time at which this table was analyzed"
    - vacuum_count:
        usage: "COUNTER"
        description: "Number of times this table has been vacuumed"
    - analyze_count:
        usage: "COUNTER"
        description: "Number of times this table has been analyzed"

pg_statio_user_tables:
  query: "SELECT schemaname, relname, heap_blks_read, heap_blks_hit, idx_blks_read, idx_blks_hit, toast_blks_read, toast_blks_hit, tidx_blks_read, tidx_blks_hit FROM pg_statio_user_tables"
  metrics:
    - schemaname:
        usage: "LABEL"
        description: "Name of the schema that this table is in"
    - relname:
        usage: "LABEL"
        description: "Name of this table"
    - heap_blks_read:
        usage: "COUNTER"
        description: "Number of disk blocks read from this table"
    - heap_blks_hit:
        usage: "COUNTER"
        description: "Number of buffer hits in this table"
    - idx_blks_read:
        usage: "COUNTER"
        description: "Number of disk blocks read from all indexes on this table"
    - idx_blks_hit:
        usage: "COUNTER"
        description: "Number of buffer hits in all indexes on this table"
    - toast_blks_read:
        usage: "COUNTER"
        description: "Number of disk blocks read from this table's TOAST table (if any)"
    - toast_blks_hit:
        usage: "COUNTER"
        description: "Number of buffer hits in this table's TOAST table (if any)"
    - tidx_blks_read:
        usage: "COUNTER"
        description: "Number of disk blocks read from this table's TOAST table indexes (if any)"
    - tidx_blks_hit:
        usage: "COUNTER"
        description: "Number of buffer hits in this table's TOAST table indexes (if any)"

pg_stat_user_indexes:
  query: "SELECT schemaname, relname, indexrelname, idx_scan, idx_tup_read, idx_tup_fetch FROM pg_stat_user_indexes"
  metrics:
    - schemaname:
        usage: "LABEL"
        description: "Name of the schema that this table is in"
    - relname:
        usage: "LABEL"
        description: "Name of the table for this index"
    - indexrelname:
        usage: "LABEL"
        description: "Name of this index"
    - idx_scan:
        usage: "COUNTER"
        description: "Number of index scans initiated on this index"
    - idx_tup_read:
        usage: "COUNTER"
        description: "Number of index entries returned by scans on this index"
    - idx_tup_fetch:
        usage: "COUNTER"
        description: "Number of live table rows fetched by simple index scans using this index"

pg_statio_user_indexes:
  query: "SELECT schemaname, relname, indexrelname, idx_blks_read, idx_blks_hit FROM pg_statio_user_indexes"
  metrics:
    - schemaname:
        usage: "LABEL"
        description: "Name of the schema that this table is in"
    - relname:
        usage: "LABEL"
        description: "Name of the table for this index"
    - indexrelname:
        usage: "LABEL"
        description: "Name of this index"
    - idx_blks_read:
        usage: "COUNTER"
        description: "Number of disk blocks read from this index"
    - idx_blks_hit:
        usage: "COUNTER"
        description: "Number of buffer hits in this index"

pg_stat_statements:
  query: |
    SELECT
      pg_get_userbyid(userid) as user,
      pg_database.datname,
      pg_stat_statements.queryid,
      pg_stat_statements.calls,
      pg_stat_statements.total_time / 1000.0 as seconds_total,
      pg_stat_statements.rows,
      pg_stat_statements.blk_read_time / 1000.0 as block_read_seconds_total,
      pg_stat_statements.blk_write_time / 1000.0 as block_write_seconds_total
      FROM pg_stat_statements
      JOIN pg_database
        ON pg_database.oid = pg_stat_statements.dbid
      WHERE
        total_time > (
          SELECT percentile_cont(0.1)
            WITHIN GROUP (ORDER BY total_time)
            FROM pg_stat_statements
        )
      ORDER BY seconds_total DESC
      LIMIT 500
  metrics:
    - user:
        usage: "LABEL"
        description: "The user who executed the statement"
    - datname:
        usage: "LABEL"
        description: "The database in which the statement was executed"
    - queryid:
        usage: "LABEL"
        description: "Internal hash code, computed from the statement's parse tree"
    - calls:
        usage: "COUNTER"
        description: "Number of times executed"
    - seconds_total:
        usage: "COUNTER"
        description: "Total time spent in the statement, in seconds"
    - rows:
        usage: "COUNTER"
        description: "Total number of rows retrieved or affected by the statement"
    - block_read_seconds_total:
        usage: "COUNTER"
        description: "Total time the statement spent reading blocks, in seconds"
    - block_write_seconds_total:
        usage: "COUNTER"
        description: "Total time the statement spent writing blocks, in seconds"

pg_total_relation_size:
  query: |
    SELECT relnamespace::regnamespace as schemaname,
           relname as relname,
           pg_total_relation_size(oid) bytes
      FROM pg_class
     WHERE relkind = 'r';
  metrics:
    - schemaname:
        usage: "LABEL"
        description: "Name of the schema that this table is in"
    - relname:
        usage: "LABEL"
        description: "Name of this table"
    - bytes:
        usage: "GAUGE"
        description: "total disk space usage for the specified table and associated indexes"

pg_blocked:
  query: |
    SELECT
      count(blocked.transactionid) AS queries,
      '__transaction__' AS table
    FROM pg_catalog.pg_locks blocked
    WHERE NOT blocked.granted AND locktype = 'transactionid'
    GROUP BY locktype
    UNION
    SELECT
      count(blocked.relation) AS queries,
      blocked.relation::regclass::text AS table
    FROM pg_catalog.pg_locks blocked
    WHERE NOT blocked.granted AND locktype != 'transactionid'
    GROUP BY relation
  metrics:
    - queries:
        usage: "GAUGE"
        description: "The current number of blocked queries"
    - table:
        usage: "LABEL"
        description: "The table on which a query is blocked"

pg_oldest_blocked:
  query: |
    SELECT coalesce(extract('epoch' from max(clock_timestamp() - state_change)), 0) age_seconds
      FROM pg_stat_activity
     WHERE wait_event_type = 'Lock'
       AND state='active'
  metrics:
    - age_seconds:
        usage: "GAUGE"
        description: "Largest number of seconds any transaction is currently waiting on a lock"

pg_slow:
  query: |
    SELECT COUNT(*) AS queries
    FROM pg_stat_activity
    WHERE state = 'active' AND (now() - query_start) > '1 seconds'::interval
  metrics:
    - queries:
        usage: "GAUGE"
        description: "Current number of slow queries"

pg_long_running_transactions:
  query: |
    SELECT COUNT(*) as transactions,
    MAX(EXTRACT(EPOCH FROM (clock_timestamp() - xact_start))) AS age_in_seconds
    FROM pg_stat_activity
    WHERE state is distinct from 'idle' AND (now() - xact_start) > '1 minutes'::interval AND query not like 'autovacuum:%'
  metrics:
    - queries:
        usage: "GAUGE"
        description: "Current number of long running transactions"
    - age_in_seconds:
        usage: "GAUGE"
        description: "The current maximum transaction age in seconds"

pg_stuck_idle_in_transaction:
  query: |
    SELECT COUNT(*) AS queries
    FROM pg_stat_activity
    WHERE state = 'idle in transaction' AND (now() - query_start) > '10 minutes'::interval
  metrics:
    - queries:
        usage: "GAUGE"
        description: "Current number of queries that are stuck being idle in transactions"

# All xid and lsn metrics here are reported mod 2^52 to ensure they
# fit within a float for Prometheus :( Really annoying that counters
# aren't stored in a 64-bit integer. Note that for queries that report
# floats this only works because postgres_exporter does know to set
# extra_float_digits (which it sets to 2). So they don't print in
# exponential notation and precision is maintained up to 2^53-1.

pg_txid:
  query: |
    SELECT
      CASE WHEN pg_is_in_recovery() THEN 'NaN'::float ELSE txid_current() % (2^52)::bigint END AS current,
      CASE WHEN pg_is_in_recovery() THEN 'NaN'::float ELSE txid_snapshot_xmin(txid_current_snapshot()) % (2^52)::bigint END AS xmin,
      CASE WHEN pg_is_in_recovery() THEN 'NaN'::float ELSE txid_current() - txid_snapshot_xmin(txid_current_snapshot()) END AS xmin_age
  metrics:
    - current:
        usage: "COUNTER"
        description: "Current 64-bit transaction id of the query used to collect this metric (truncated to low 52 bits)"
    - xmin:
        usage: "COUNTER"
        description: "Oldest transaction id of a transaction still in progress, i.e. not known committed or aborted (truncated to low 52 bits)"
    - xmin_age:
        usage: "GAUGE"
        description: "Age of oldest transaction still not committed or aborted measured in transaction ids"

pg_database_wraparound:
  query: >
    SELECT
      datname,
      age(d.datfrozenxid) as age_datfrozenxid,
      mxid_age(d.datminmxid) as age_datminmxid
    FROM
      pg_catalog.pg_database d
    WHERE
      d.datallowconn
  metrics:
    - datname:
        usage: "LABEL"
        description: "Database Name"
    - age_datfrozenxid:
        usage: "GAUGE"
        description: "Age of the oldest transaction ID that has not been frozen."
    - age_datminmxid:
        usage: "GAUGE"
        description: "Age of the oldest multi-transaction ID that has been replaced with a transaction ID."

pg_xlog_position:
  query: |
    SELECT CASE
           WHEN pg_is_in_recovery()
           THEN (pg_last_xlog_replay_location() - '0/0') % (2^52)::bigint
           ELSE (pg_current_xlog_location() - '0/0') % (2^52)::bigint
           END AS bytes
  metrics:
    - bytes:
        usage: "COUNTER"
        description: "Postgres LSN (log sequence number) being generated on primary or replayed on replica (truncated to low 52 bits)"

# This should be pushed upstream. We really just want "active" so we
# can monitor for orphaned slots causing xlog space usage to grow

pg_replication_slots:
  query: |
    SELECT slot_name, slot_type,
           case when active then 1.0 else 0.0 end AS active,
           age(xmin) AS xmin_age,
           age(catalog_xmin) AS catalog_xmin_age,
           CASE WHEN pg_is_in_recovery() THEN pg_last_xlog_replay_location() ELSE pg_current_xlog_location() END - restart_lsn AS restart_lsn_bytes,
           CASE WHEN pg_is_in_recovery() THEN pg_last_xlog_replay_location() ELSE pg_current_xlog_location() END - confirmed_flush_lsn AS confirmed_flush_lsn_bytes
      FROM pg_replication_slots
  metrics:
    - slot_name:
        usage: "LABEL"
        description: "Slot Name"
    - slot_type:
        usage: "LABEL"
        description: "Slot Type"
    - active:
        usage: "GAUGE"
        description: "Boolean flag indicating whether this slot has a consumer streaming from it"
    - xmin_age:
        usage: "GAUGE"
        description: "Age of oldest transaction that cannot be vacuumed due to this replica"
    - catalog_xmin_age:
        usage: "GAUGE"
        description: "Age of oldest transaction that cannot be vacuumed from catalogs due to this replica (used by logical replication)"
    - restart_lsn_bytes:
        usage: "GAUGE"
        description: "Amount of data on in xlog that must be this replica may need to complete recovery"
    - confirmed_flush_lsn_bytes:
        usage: "GAUGE"
        description: "Amount of data on in xlog that must be this replica has not yet received"

pg_stat_ssl:
  query: |
    SELECT pid, bits,
           CASE WHEN ssl THEN 1.0 ELSE 0.0 END AS active,
           CASE WHEN compression THEN 1.0 ELSE 0.0 END AS compression
           FROM pg_stat_ssl
  metrics:
    - pid:
        usage: "LABEL"
        description: "Process ID of a backend or WAL sender process"
    - active:
        usage: "GAUGE"
        description: "Boolean flag indicating if SSL is used on this connection"
    - bits:
        usage: "GAUGE"
        description: "Number of bits in the encryption algorithm is in use"
    - compression:
        usage: "GAUGE"
        description: "Boolean flag indicating if SSL compression is in use"

# This tracks the remaining integer capacity for primary keys of selected tables
# This is specific to GitLab.com and a temporary measure until those primary keys
# have been migrated to int8 data type.
# See https://gitlab.com/groups/gitlab-org/-/epics/4785
#
pg_integer_capacity:
  query: >
    SELECT 'events' as table_name, 'id' as column_name, max(id) as current, (select (2^(numeric_precision-1)-1) FROM information_schema.columns WHERE (table_name, column_name) = ('events', 'id')) as maximum FROM events
      UNION ALL
    SELECT 'ci_job_artifacts', 'id', max(id), (select (2^(numeric_precision-1)-1) FROM information_schema.columns WHERE (table_name, column_name) = ('ci_job_artifacts', 'id')) FROM ci_job_artifacts
      UNION ALL
    SELECT 'ci_builds', 'id', max(id), (select (2^(numeric_precision-1)-1) FROM information_schema.columns WHERE (table_name, column_name) = ('ci_builds', 'id')) FROM ci_builds
      UNION ALL
    SELECT 'ci_builds_metadata', 'id', max(id), (select (2^(numeric_precision-1)-1) FROM information_schema.columns WHERE (table_name, column_name) = ('ci_builds_metadata', 'id')) FROM ci_builds_metadata
      UNION ALL
    SELECT 'web_hook_logs', 'id', max(id), (select (2^(numeric_precision-1)-1) FROM information_schema.columns WHERE (table_name, column_name) = ('web_hook_logs', 'id')) FROM web_hook_logs
      UNION ALL
    SELECT 'sent_notifications', 'id', max(id), (select (2^(numeric_precision-1)-1) FROM information_schema.columns WHERE (table_name, column_name) = ('sent_notifications', 'id')) FROM sent_notifications
      UNION ALL
    SELECT 'taggings', 'id', max(id), (select (2^(numeric_precision-1)-1) FROM information_schema.columns WHERE (table_name, column_name) = ('taggings', 'id')) FROM taggings
      UNION ALL
    SELECT 'notes', 'id', max(id), (select (2^(numeric_precision-1)-1) FROM information_schema.columns WHERE (table_name, column_name) = ('notes', 'id')) FROM notes
      UNION ALL
    SELECT 'ci_stages', 'id', max(id), (select (2^(numeric_precision-1)-1) FROM information_schema.columns WHERE (table_name, column_name) = ('ci_stages', 'id')) FROM ci_stages
  metrics:
    - table_name:
        usage: LABEL
        description: Name of the table
    - column_name:
        usage: LABEL
        description: Name of the column
    - current:
        usage: GAUGE
        description: Current maximum value for this column
    - maximum:
        usage: GAUGE
        description: Maximum value this column based on its data type

#
# This query extracts marginalia comments from pg_stat_activity and provides a sampled summary of the type of
# endpoints that are actively making calls
#
pg_stat_activity_marginalia_sampler:
  query: >
    SELECT
      usename AS usename,
      a.matches[1] AS application,
      a.matches[2] AS endpoint,
      a.matches[3] AS command,
      a.wait_event AS wait_event,
      a.state AS state,
      a.wait_event_type AS wait_event_type,
      COUNT(*) active_count,
      MAX(age_in_seconds) AS max_tx_age_in_seconds
    FROM (
      SELECT
        usename,
        regexp_matches(query, '^\s*(?:\/\*(?:application:(\w+),?)?(?:correlation_id:\w+,?)?(?:jid:\w+,?)?(?:endpoint_id:([\w/\-\.:\#\s]+),?)?\*\/)?\s*(\w+)') AS matches,
        state,
        wait_event,
        wait_event_type,
        EXTRACT(EPOCH FROM (clock_timestamp() - xact_start)) AS age_in_seconds
      FROM
        pg_stat_activity
      ) a
    GROUP BY usename, application, endpoint, command, state, wait_event, wait_event_type
    ORDER BY active_count DESC
  metrics:
    - usename:
        usage: LABEL
        description: The user running the command
    - application:
        usage: LABEL
        description: Name of the application as presented in marginalia comment
    - endpoint:
        usage: LABEL
        description: Name of the web endpoint or sidekiq job as presented in marginalia comment
    - command:
        usage: LABEL
        description: The first word for the running command
    - wait_event:
        usage: LABEL
        description: Wait event  of the activity as presented by pg_stat_activity.wait_event
    - state:
        usage: LABEL
        description: State of the activity as presented by pg_stat_activity.state
    - wait_event_type:
        usage: LABEL
        description: Wait event type of the activity as presented by pg_stat_activity.wait_event_type
    - active_count:
        usage: GAUGE
        description: Number of active queries at time of sample
    - max_tx_age_in_seconds:
        usage: GAUGE
        description: Number of active queries at time of sample

# This records long running autovacuum processes
# The reason we limit to long-running processes is to avoid
# cardinality problems in prometheus
# Since we're only concerned about long running processes
# we limit the output only to those
pg_stat_activity_autovacuum:
  query: >
    SELECT
      SPLIT_PART(query, '.', 2) AS relname,
      EXTRACT(EPOCH FROM (clock_timestamp() - xact_start)) AS age_in_seconds
    FROM
      pg_stat_activity
    WHERE
      query like 'autovacuum:%'
    AND
      EXTRACT(EPOCH FROM (clock_timestamp() - xact_start)) > 1200
  metrics:
    - relname:
        usage: LABEL
        description: The table being vacuumed
    - age_in_seconds:
        usage: GAUGE
        description: The age of the vacuum process in seconds

# Keep track of the number of the total number of autovacuum workers
# currently active
pg_stat_activity_autovacuum_active:
  query: >
    SELECT v.phase,
           CASE
             when a.query ~ '^autovacuum.*to prevent wraparound' then 'wraparound'
             when a.query ~* '^vacuum' then 'user'
             when a.pid is null then null
             ELSE 'regular'
           END as mode,
           count(1) as workers_count
      FROM pg_stat_progress_vacuum v
      LEFT JOIN pg_stat_activity a using (pid)
     GROUP BY 1,2
  metrics:
    - phase:
        usage: LABEL
        description: Vacuum phase
    - mode:
        usage: LABEL
        description: Vacuum mode
    - workers_count:
        usage: GAUGE
        description: The number of active autovacuum workers in this state

#
# This query extracts marginalia metadata from pg_stat_activity and provides a sampled summary about the long running transactions.
#
pg_long_running_transactions_marginalia:
  query: >
    SELECT
      activity.matches[1] AS application,
      activity.matches[2] AS endpoint,
      MAX(age_in_seconds) AS max_age_in_seconds
    FROM (
      SELECT
        regexp_matches(query, '^\s*(?:\/\*(?:application:(\w+),?)?(?:correlation_id:\w+,?)?(?:jid:\w+,?)?(?:endpoint_id:([\w/\-\.:\#\s]+),?)?\*\/)?\s*(\w+)') AS matches,
        EXTRACT(EPOCH FROM (clock_timestamp() - xact_start)) AS age_in_seconds
      FROM
        pg_stat_activity
      WHERE state <> 'idle'
        AND (clock_timestamp() - xact_start) > '30 seconds'::interval
        AND query NOT LIKE 'autovacuum:%'
      ) activity
    GROUP BY application, endpoint
    ORDER BY max_age_in_seconds DESC
  metrics:
    - application:
        usage: LABEL
        description: Name of the application as presented in marginalia comment
    - endpoint:
        usage: LABEL
        description: Name of the web endpoint or sidekiq job as presented in marginalia comment
    - max_age_in_seconds:
        usage: GAUGE
        description: The current maximum transaction age in seconds

#    NOT
# DO NOT ADD GENERAL PURPOSE GITLAB MONITORING HERE
#    NOT
#
# This file is for PostgreSQL statistics exporting. Keep in mind any metrics exported from here will be:
#  a) exported from every database including read-only replicas
#  b) scraped frequently based on database-monitoring needs
#  c) Queried live on demand for every scrape
#
# Moreover this file does not have tests and if you get this file
# wrong you can easily disable all database monitoring metrics.
#
# If you want to monitor the Gitlab system as a whole you're almost
# certainly looking for gitlab-monitor:
#
#  https://gitlab.com/gitlab-org/gitlab-monitor
#
# There you will be able to write Ruby code, cache data, access Redis,
# etc.