PostgreSQL 모니터링은 Prometheus,alertmanager,grafana를 활용하여 구축할 수 있습니다.
방법은 이전에 MySQL 모니터링 구축하는 방법에 대해 공유했던 글을 참고 부탁드리며
이번 글에서는 제가 사용하는 모니터링 항목과 그에 따른 alert rule을 공유드리겠습니다.
exporter
- exporter 기동 커맨드
### postgres_exporter
wget https://github.com/wrouesnel/postgres_exporter/releases/download/v0.8.0/postgres_exporter_v0.8.0_linux-amd64.tar.gz
export DATA_SOURCE_NAME="postgresql://login:password@hostname:port/dbname"
./postgres_exporter --exclude-databases="template0,template1" --web.listen-address=:9187 --extend.query-path='./queries.yaml'
queries.yaml
postgres_exporter 는 DB 내 pg_stat* view 의 데이터를 긁어오는데 이외에도 다른 데이터를 수집하고 싶다면
exporter 기동 시 –extend.query-path=’./queries.yaml’ 옵션을 지정해주고 queries.yaml 파일에 그 내용을 작성해야 합니다.
pg_replication:
query: "SELECT CASE WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) END AS log_delay, CASE WHEN pg_is_in_recovery() THEN 1 ELSE 0 END AS is_replica;"
master: true
metrics:
- lag:
usage: "GAUGE"
description: "Replication lag behind master in seconds"
- is_replica:
usage: "GAUGE"
description: "Indicates if this host is a replica"
pg_postmaster:
query: "SELECT pg_postmaster_start_time as start_time_seconds from pg_postmaster_start_time()"
master: true
metrics:
- start_time_seconds:
usage: "GAUGE"
description: "Time at which postmaster started"
pg_stat_user_tables:
query: "SELECT current_database() datname, schemaname, relname, seq_scan, seq_tup_read, idx_scan, idx_tup_fetch, n_tup_ins, n_tup_upd, n_tup_del, n_tup_hot_upd, n_live_tup, n_dead_tup, n_mod_since_analyze, COALESCE(last_vacuum, '1970-01-01Z'), COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum, COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum, COALESCE(last_analyze, '1970-01-01Z') as last_analyze, COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze, vacuum_count, autovacuum_count, analyze_count, autoanalyze_count FROM pg_stat_user_tables"
metrics:
- datname:
usage: "LABEL"
description: "Name of current database"
- schemaname:
usage: "LABEL"
description: "Name of the schema that this table is in"
- relname:
usage: "LABEL"
description: "Name of this table"
- seq_scan:
usage: "COUNTER"
description: "Number of sequential scans initiated on this table"
- seq_tup_read:
usage: "COUNTER"
description: "Number of live rows fetched by sequential scans"
- idx_scan:
usage: "COUNTER"
description: "Number of index scans initiated on this table"
- idx_tup_fetch:
usage: "COUNTER"
description: "Number of live rows fetched by index scans"
- n_tup_ins:
usage: "COUNTER"
description: "Number of rows inserted"
- n_tup_upd:
usage: "COUNTER"
description: "Number of rows updated"
- n_tup_del:
usage: "COUNTER"
description: "Number of rows deleted"
- n_tup_hot_upd:
usage: "COUNTER"
description: "Number of rows HOT updated (i.e., with no separate index update required)"
- n_live_tup:
usage: "GAUGE"
description: "Estimated number of live rows"
- n_dead_tup:
usage: "GAUGE"
description: "Estimated number of dead rows"
- n_mod_since_analyze:
usage: "GAUGE"
description: "Estimated number of rows changed since last analyze"
- last_vacuum:
usage: "GAUGE"
description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)"
- last_autovacuum:
usage: "GAUGE"
description: "Last time at which this table was vacuumed by the autovacuum daemon"
- last_analyze:
usage: "GAUGE"
description: "Last time at which this table was manually analyzed"
- last_autoanalyze:
usage: "GAUGE"
description: "Last time at which this table was analyzed by the autovacuum daemon"
- vacuum_count:
usage: "COUNTER"
description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)"
- autovacuum_count:
usage: "COUNTER"
description: "Number of times this table has been vacuumed by the autovacuum daemon"
- analyze_count:
usage: "COUNTER"
description: "Number of times this table has been manually analyzed"
- autoanalyze_count:
usage: "COUNTER"
description: "Number of times this table has been analyzed by the autovacuum daemon"
pg_statio_user_tables:
query: "SELECT current_database() datname, schemaname, relname, heap_blks_read, heap_blks_hit, idx_blks_read, idx_blks_hit, toast_blks_read, toast_blks_hit, tidx_blks_read, tidx_blks_hit FROM pg_statio_user_tables"
metrics:
- datname:
usage: "LABEL"
description: "Name of current database"
- schemaname:
usage: "LABEL"
description: "Name of the schema that this table is in"
- relname:
usage: "LABEL"
description: "Name of this table"
- heap_blks_read:
usage: "COUNTER"
description: "Number of disk blocks read from this table"
- heap_blks_hit:
usage: "COUNTER"
description: "Number of buffer hits in this table"
- idx_blks_read:
usage: "COUNTER"
description: "Number of disk blocks read from all indexes on this table"
- idx_blks_hit:
usage: "COUNTER"
description: "Number of buffer hits in all indexes on this table"
- toast_blks_read:
usage: "COUNTER"
description: "Number of disk blocks read from this table's TOAST table (if any)"
- toast_blks_hit:
usage: "COUNTER"
description: "Number of buffer hits in this table's TOAST table (if any)"
- tidx_blks_read:
usage: "COUNTER"
description: "Number of disk blocks read from this table's TOAST table indexes (if any)"
- tidx_blks_hit:
usage: "COUNTER"
description: "Number of buffer hits in this table's TOAST table indexes (if any)"
pg_database:
query: "SELECT pg_database.datname, pg_database_size(pg_database.datname) as size_bytes FROM pg_database"
master: true
cache_seconds: 30
metrics:
- datname:
usage: "LABEL"
description: "Name of the database"
- size_bytes:
usage: "GAUGE"
description: "Disk space used by the database"
pg_stat_statements:
query: "SELECT t2.rolname, t3.datname, queryid, calls, total_time / 1000 as total_time_seconds, min_time / 1000 as min_time_seconds, max_time / 1000 as max_time_seconds, mean_time / 1000 as mean_time_seconds, stddev_time / 1000 as stddev_time_seconds, rows, shared_blks_hit, shared_blks_read, shared_blks_dirtied, shared_blks_written, local_blks_hit, local_blks_read, local_blks_dirtied, local_blks_written, temp_blks_read, temp_blks_written, blk_read_time / 1000 as blk_read_time_seconds, blk_write_time / 1000 as blk_write_time_seconds FROM pg_stat_statements t1 JOIN pg_roles t2 ON (t1.userid=t2.oid) JOIN pg_database t3 ON (t1.dbid=t3.oid) WHERE t2.rolname != 'rdsadmin'"
master: true
metrics:
- rolname:
usage: "LABEL"
description: "Name of user"
- datname:
usage: "LABEL"
description: "Name of database"
- queryid:
usage: "LABEL"
description: "Query ID"
- calls:
usage: "COUNTER"
description: "Number of times executed"
- total_time_seconds:
usage: "COUNTER"
description: "Total time spent in the statement, in milliseconds"
- min_time_seconds:
usage: "GAUGE"
description: "Minimum time spent in the statement, in milliseconds"
- max_time_seconds:
usage: "GAUGE"
description: "Maximum time spent in the statement, in milliseconds"
- mean_time_seconds:
usage: "GAUGE"
description: "Mean time spent in the statement, in milliseconds"
- stddev_time_seconds:
usage: "GAUGE"
description: "Population standard deviation of time spent in the statement, in milliseconds"
- rows:
usage: "COUNTER"
description: "Total number of rows retrieved or affected by the statement"
- shared_blks_hit:
usage: "COUNTER"
description: "Total number of shared block cache hits by the statement"
- shared_blks_read:
usage: "COUNTER"
description: "Total number of shared blocks read by the statement"
- shared_blks_dirtied:
usage: "COUNTER"
description: "Total number of shared blocks dirtied by the statement"
- shared_blks_written:
usage: "COUNTER"
description: "Total number of shared blocks written by the statement"
- local_blks_hit:
usage: "COUNTER"
description: "Total number of local block cache hits by the statement"
- local_blks_read:
usage: "COUNTER"
description: "Total number of local blocks read by the statement"
- local_blks_dirtied:
usage: "COUNTER"
description: "Total number of local blocks dirtied by the statement"
- local_blks_written:
usage: "COUNTER"
description: "Total number of local blocks written by the statement"
- temp_blks_read:
usage: "COUNTER"
description: "Total number of temp blocks read by the statement"
- temp_blks_written:
usage: "COUNTER"
description: "Total number of temp blocks written by the statement"
- blk_read_time_seconds:
usage: "COUNTER"
description: "Total time the statement spent reading blocks, in milliseconds (if track_io_timing is enabled, otherwise zero)"
- blk_write_time_seconds:
usage: "COUNTER"
description: "Total time the statement spent writing blocks, in milliseconds (if track_io_timing is enabled, otherwise zero)"
pg_replication_slots:
query: "SELECT slot_name, slot_type,case when active then 1.0 else 0.0 end AS active, age(xmin) AS xmin_age, age(catalog_xmin) AS catalog_xmin_age FROM pg_replication_slots"
metrics:
- slot_name:
usage: "LABEL"
description: "Slot Name"
- slot_type:
usage: "LABEL"
description: "Slot Type"
- active:
usage: "GAUGE"
description: "Boolean flag indicating whether this slot has a consumer streaming from it"
- xmin_age:
usage: "GAUGE"
description: "Age of oldest transaction that cannot be vacuumed due to this replica"
- catalog_xmin_age:
usage: "GAUGE"
description: "Age of oldest transaction that cannot be vacuumed from catalogs due to this replica"
queries.yaml 수정
- 버전별 replication , vacuum 관련 테이블이 다를 수 있음
#### pg >=10
SELECT CASE WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn()
THEN 0
ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())
END AS log_delay,
CASE WHEN pg_is_in_recovery() THEN 1 ELSE 0 END AS is_replica;
#### pg <10
SELECT CASE WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location()
THEN 0
ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())
END AS log_delay,
CASE WHEN pg_is_in_recovery() THEN 1 ELSE 0 END AS is_replica;
alertmanager
### postgresql down
- alert: PostgresqlDown
expr: pg_up == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Postgresql down (instance {{ $labels.instance }})"
description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
### postgresql restarted
- alert: PostgresqlRestarted
expr: time() - pg_postmaster_start_time_seconds < 60
for: 5m
labels:
severity: critical
annotations:
summary: "Postgresql restarted (instance {{ $labels.instance }})"
description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
### replication lag
- alert: PostgresqlReplicationLag
expr: (pg_replication_lag) > 600
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql replication lag (instance {{ $labels.instance }})"
description: "PostgreSQL replication lag is going up (> 600s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
### table not vacuumed
- alert: PostgresqlTableNotVaccumed
expr: time() - pg_stat_user_tables_last_autovacuum > 60 * 60 * 24
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql table not vaccumed (instance {{ $labels.instance }})"
description: "Table has not been vaccum for 24 hours\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
### too many connections
- alert: PostgresqlTooManyConnections
expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.9
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql too many connections (instance {{ $labels.instance }})"
description: "PostgreSQL instance has too many connections\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
### slow queries
- alert: PostgresqlSlowQueries
expr: IF avg(rate(pg_stat_activity_max_tx_duration{datname!~"template.*"}[2m])) by (datname) > 2 * 60
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql slow queries (instance {{ $labels.instance }})"
description: "PostgreSQL executes slow queries\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
### unused replication slot
- alert: PostgresqlUnusedReplicationSlot
expr: pg_replication_slots_active == 0
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql unused replication slot (instance {{ $labels.instance }})"
description: "Unused Replication Slots\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
### too many dead tuples
- alert: PostgresqlTooManyDeadTuples
expr: ((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1 unless ON(instance) (pg_replication_is_replica == 1)
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql too many dead tuples (instance {{ $labels.instance }})"
description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
### standby promoted
- alert: PostgresqlPromotedNode
expr: pg_replication_is_replica and changes(pg_replication_is_replica[1m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql promoted node (instance {{ $labels.instance }})"
description: "Postgresql standby server has been promoted as primary node\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
### too many locks aquired
- alert: PostgresqlTooManyLocksAcquired
expr: ((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20
for: 5m
labels:
severity: critical
annotations:
summary: "Postgresql too many locks acquired (instance {{ $labels.instance }})"
description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"