Setup Database Metrics Alerts (CPU, Memory, Disk, Connections)
An alert is a notification before users notice the problem. "Disk 95% full" is a disaster. "Disk 75% full, growing 2 GB/day" is 10 days to solve it. Correct thresholds and correct delivery channels are half the work.
Monitoring Stack
Standard stack for databases:
PostgreSQL/MySQL/MongoDB
│
Exporter (postgres_exporter / mysqld_exporter / mongodb_exporter)
│ metrics in Prometheus format
Prometheus (collection and storage of metrics)
│
Alertmanager (alert routing)
│
Telegram / Slack / PagerDuty
│
Grafana (visualization)
Installing Exporters
# PostgreSQL
docker run -d \
--name postgres_exporter \
-e DATA_SOURCE_NAME="postgresql://monitoring:password@localhost:5432/postgres?sslmode=disable" \
-p 9187:9187 \
quay.io/prometheuscommunity/postgres-exporter:latest
# MySQL
docker run -d \
--name mysqld_exporter \
-e DATA_SOURCE_NAME="monitoring:password@(localhost:3306)/" \
-p 9104:9104 \
prom/mysqld-exporter:latest
# Node Exporter for system metrics (CPU, RAM, disk)
docker run -d \
--name node_exporter \
--pid="host" \
-v /:/host:ro,rslave \
-p 9100:9100 \
quay.io/prometheus/node-exporter:latest \
--path.rootfs=/host
Monitoring user for PostgreSQL (minimum privileges):
CREATE USER monitoring WITH PASSWORD 'monitoring_password';
GRANT pg_monitor TO monitoring; -- system role PG 10+
-- or manually:
GRANT SELECT ON pg_stat_database TO monitoring;
GRANT SELECT ON pg_stat_replication TO monitoring;
GRANT EXECUTE ON FUNCTION pg_current_wal_lsn() TO monitoring;
Prometheus Alert Rules
# /etc/prometheus/rules/database.yml
groups:
- name: postgresql_critical
rules:
# Database unavailable
- alert: PostgreSQLDown
expr: pg_up == 0
for: 30s
labels:
severity: critical
annotations:
summary: "PostgreSQL unavailable on {{ $labels.instance }}"
description: "Database not responding for over 30 seconds"
# Disk > 85% full
- alert: DiskSpaceHigh
expr: |
(node_filesystem_size_bytes{mountpoint="/var/lib/postgresql"} -
node_filesystem_free_bytes{mountpoint="/var/lib/postgresql"}) /
node_filesystem_size_bytes{mountpoint="/var/lib/postgresql"} * 100 > 85
for: 5m
labels:
severity: warning
annotations:
summary: "PostgreSQL disk {{ $value | printf \"%.1f\" }}% full"
# Critical disk fill
- alert: DiskSpaceCritical
expr: |
(node_filesystem_size_bytes{mountpoint="/var/lib/postgresql"} -
node_filesystem_free_bytes{mountpoint="/var/lib/postgresql"}) /
node_filesystem_size_bytes{mountpoint="/var/lib/postgresql"} * 100 > 95
for: 1m
labels:
severity: critical
annotations:
summary: "CRITICAL: PostgreSQL disk {{ $value | printf \"%.1f\" }}% full"
# Too many connections
- alert: PostgreSQLTooManyConnections
expr: |
pg_stat_activity_count / pg_settings_max_connections * 100 > 80
for: 2m
labels:
severity: warning
annotations:
summary: "{{ $value | printf \"%.0f\" }}% of PostgreSQL connections used"
description: "Consider PgBouncer or increase max_connections"
# Long running transaction (> 10 minutes)
- alert: PostgreSQLLongRunningTransaction
expr: |
pg_stat_activity_max_tx_duration{state="active"} > 600
for: 1m
labels:
severity: warning
annotations:
summary: "Transaction running for {{ $value | printf \"%.0f\" }} seconds"
# Replication lag > 60 seconds
- alert: PostgreSQLReplicationLag
expr: pg_replication_lag > 60
for: 2m
labels:
severity: critical
annotations:
summary: "Replica is {{ $value | printf \"%.0f\" }} seconds behind"
- name: postgresql_warning
rules:
# Cache hit rate < 99%
- alert: PostgreSQLLowCacheHitRate
expr: |
(sum(pg_stat_database_blks_hit) /
(sum(pg_stat_database_blks_hit) + sum(pg_stat_database_blks_read))) * 100 < 99
for: 10m
labels:
severity: warning
annotations:
summary: "Cache hit rate: {{ $value | printf \"%.1f\" }}% (normal > 99%)"
# CPU > 80%
- alert: HighCPUUsage
expr: |
100 - (avg by(instance)(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "CPU {{ $labels.instance }}: {{ $value | printf \"%.0f\" }}%"
# Free RAM < 10%
- alert: LowFreeMemory
expr: |
node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Free memory: {{ $value | printf \"%.1f\" }}%"
Alertmanager: Telegram Routing
# /etc/alertmanager/alertmanager.yml
global:
resolve_timeout: 5m
route:
group_by: ['alertname', 'instance']
group_wait: 30s
group_interval: 5m
repeat_interval: 4h
receiver: telegram-critical
routes:
- match:
severity: critical
receiver: telegram-critical
repeat_interval: 30m # repeat critical every 30 minutes
- match:
severity: warning
receiver: telegram-warning
repeat_interval: 4h
receivers:
- name: telegram-critical
telegram_configs:
- api_url: "https://api.telegram.org"
bot_token: "BOT_TOKEN"
chat_id: -1001234567890 # group/channel ID







