# node_exporter_rules.yml
# Prometheus alerting rules for host-level (node_exporter) metrics.
# All rules live in a single rule group named HOST. Each rule exposes the
# triggering sample through the `value` annotation where the original did so.
groups:
  - name: HOST
    rules:
      # The federation scrape job ('federate') has reported up == 0 for 10m.
      - alert: MasterDown
        expr: up{job='federate'} == 0
        for: 10m
        labels:
          severity: info
        annotations:
          summary: "Master 主机服务异常"
          description: "{{ $labels.instance }} Master 9090 端口服务异常"

      # A 'node' target is down and its reported uptime exceeds 30 minutes.
      # NOTE(review): when a scrape fails, the series from that target are
      # marked stale, so the right-hand side of `and` is probably empty for
      # exactly the instances that are down and this rule may never fire.
      # Consider comparing against max_over_time(...) of both metrics instead;
      # verify against a real outage before changing.
      - alert: InstanceDown
        expr: (up{job='node'} == 0) and ((node_time_seconds-node_boot_time_seconds) > 1800)
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "监控数据获取异常"
          description: "{{ $labels.instance }} 主机可能宕机,所在节点 Master 私网IP {{ $labels.master_private_ip }} ;如果是已下线机器则忽略"

      # Uptime (node time minus boot time) is below 600 seconds.
      # No `for:` clause, so the alert is active as soon as the host reports.
      - alert: InstanceReboot
        expr: (node_time_seconds-node_boot_time_seconds) < 600
        labels:
          severity: info
        annotations:
          summary: "新启动"
          description: "{{ $labels.instance }} 新启动"
          value: "{{ $value }}"

      # CPU utilisation (100 - idle%) averaged per instance/job above 96%.
      # rate() is used instead of irate(): irate only looks at the last two
      # samples, so a single quiet sample would reset the `for:` timer.
      - alert: CPU
        expr: round(100 - (avg by (instance, job) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100), 0.01) > 96
        for: 7m
        labels:
          severity: critical
        annotations:
          summary: "CPU 使用率很高"
          description: "{{ $labels.instance }} CPU 使用率 {{ $value }}%"
          value: "{{ $value }}"

      # 5-minute load average divided by the number of CPUs is above 7.
      # The CPU count is the number of mode='system' series per instance.
      - alert: LOAD
        expr: node_load5 / on (instance) count by (instance) (node_cpu_seconds_total{mode='system'}) > 7
        for: 7m
        labels:
          severity: critical
        annotations:
          summary: "overload"
          description: "{{ $labels.instance }} 负载/CPU核数比 {{ $value }}%"
          value: "{{ $value }}"

      # Memory usage above 96%. Uses MemAvailable when the metric exists and
      # falls back to MemFree + Buffers + Cached otherwise (the `or` branch).
      - alert: MEM
        expr: |-
          round(
            (1 - (
              (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
              or
              ((node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) / node_memory_MemTotal_bytes)
            )) * 100,
            0.01
          ) > 96
        for: 10m
        labels:
          severity: critical
        annotations:
          # Fixed: the summary used to say "闪存" (flash storage); this rule is about RAM.
          summary: "内存不足"
          description: "{{ $labels.instance }} MEM 使用占比 {{ $value }}%"
          value: "{{ $value }}"

      # Filesystem more than 90% used AND less than 10 GiB (1073741824 bytes
      # per GiB) available. /boot is excluded; only ext*, ocfs* and xfs
      # filesystems are considered. The alert value is the used percentage.
      - alert: DISK
        expr: |-
          round(
            (
              100 - (node_filesystem_avail_bytes{mountpoint!='/boot',fstype=~'ext.+|ocfs.+|xfs'}
                / node_filesystem_size_bytes{mountpoint!='/boot',fstype=~'ext.+|ocfs.+|xfs'}) * 100 > 90
              and
              node_filesystem_avail_bytes{mountpoint!='/boot',fstype=~'ext.+|ocfs.+|xfs'} / 1073741824 < 10
            ),
            0.01
          )
        for: 28m
        labels:
          severity: info
        annotations:
          summary: "存储空间不足"
          description: "{{ $labels.instance }} {{ $labels.mountpoint }} 存储空间使用占比 {{ $value }}%"
          value: "{{ $value }}"

      # CPU time spent in iowait, averaged per instance/job, above 80%.
      # rate() rather than irate() so short dips do not reset `for:`.
      - alert: IOWAIT
        expr: round((avg by (instance, job) (rate(node_cpu_seconds_total{mode="iowait"}[3m])) * 100), 0.01) > 80
        for: 7m
        labels:
          severity: info
        annotations:
          summary: "CPU iowait 过高"
          description: "{{ $labels.instance }} CPU iowait {{ $value }}%"
          value: "{{ $value }}"

      # Disk busy time, averaged over all disks of an instance, above 40%.
      # The expression previously yielded (100 - busy%) with threshold < 60,
      # which is the same condition but made {{ $value }} the idle share while
      # the description labels it as the I/O time share. The mountpoint label
      # was dropped from the description because the aggregation keeps only
      # instance and job, so it always rendered empty.
      - alert: IO
        expr: round(avg by (instance, job) (rate(node_disk_io_time_seconds_total[3m])) * 100, 0.01) > 40
        for: 7m
        labels:
          severity: info
        annotations:
          summary: "磁盘 I/O 性能低"
          description: "{{ $labels.instance }} 磁盘 I/O 时间占比 {{ $value }}%"
          value: "{{ $value }}"

      # A process has more than 80% of its file-descriptor limit open.
      - alert: ProcessNearFDLimits
        expr: process_open_fds / process_max_fds > 0.8
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "A process hits 80% of the limit"
          description: "{{ $labels.instance }} 进程使用的文件描述符数占比 {{ $value }}"
          value: "{{ $value }}"

      # More than 20000 established TCP connections.
      - alert: TCP_ESTAB
        expr: node_netstat_Tcp_CurrEstab > 20000
        for: 3m
        labels:
          severity: info
        annotations:
          summary: "TCP 会话数很多"
          description: "{{ $labels.instance }} TCP_ESTABLISHED {{ $value }}"
          value: "{{ $value }}"

      # Blocked processes divided by the number of CPUs is above 2.
      # The CPU count is derived the same way as in the LOAD rule.
      - alert: Processes_Blocked
        expr: node_procs_blocked / on (instance) count by (instance) (node_cpu_seconds_total{mode="system"}) > 2
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "任务阻塞"
          description: "{{ $labels.instance }} 当前被阻塞的任务数/CPU核数比 {{ $value }}"
          value: "{{ $value }}"
# Prometheus 学习链接
# Prometheus 告警链接参考：Prometheus-book