Prometheus本身不支持告警功能,主要通过独立组件AlertManager来实现告警。AlertManager用于接收Prometheus发送的告警,并对告警进行一系列处理后发送给指定的用户。
Prometheus监控系统的报警规则是在Prometheus这个组件中完成配置的。
Prometheus支持2种类型的规则:记录规则(recording rules)和告警规则(alerting rules)。
[root@localhost opt]# ll alertmanager-0.20.0.linux-amd64.tar.gz
-rw-r--r--. 1 root root 23928771 May 21 20:02 alertmanager-0.20.0.linux-amd64.tar.gz
[root@localhost opt]# tar -zxvf alertmanager-0.20.0.linux-amd64.tar.gz
[root@localhost opt]# cp -r alertmanager-0.20.0.linux-amd64 /usr/local/alertmanager
[root@localhost ~]# vi /usr/lib/systemd/system/alertmanager.service
[Unit]
Description=Prometheus Alertmanager Service daemon
After=network.target

[Service]
User=root
Group=root
Type=simple
ExecStart=/usr/local/alertmanager/alertmanager \
    --config.file=/usr/local/alertmanager/alertmanager.yml \
    --storage.path=/usr/local/alertmanager/data/ \
    --data.retention=120h \
    --web.external-url=http://192.168.1.10:9093 \
    --web.listen-address=:9093
Restart=on-failure

[Install]
WantedBy=multi-user.target

# alertmanager选项说明
# ExecStart=/usr/local/alertmanager/alertmanager 启动运行alertmanager程序所在的路径
# --config.file=/usr/local/alertmanager/alertmanager.yml 指定alertmanager配置文件路径
# --storage.path=/usr/local/alertmanager/data/ 数据存储路径
# --data.retention=120h 历史数据最大保留时间,默认120小时
# --web.external-url 生成返回alertmanager的相对和绝对链接地址,可以在后续告警通知信息中直接点击链接地址访问alertmanager web ui。其格式为http://{ip或者域名}:9093
# --web.listen-address 监听web接口和API的地址端口

[root@localhost ~]# systemctl daemon-reload
[root@localhost ~]# systemctl restart alertmanager.service
[root@localhost ~]# systemctl status alertmanager.service
浏览器访问示例地址:http://192.168.2.136:9093/#/status
[root@localhost ~]# docker pull prom/alertmanager
[root@localhost ~]# docker images
REPOSITORY                    TAG      IMAGE ID       CREATED        SIZE
docker.io/prom/alertmanager   latest   0881eb8f169f   5 months ago   52.1 MB
[root@localhost ~]# docker run -d -p 9093:9093 -v /usr/local/alertmanager/simple.yml:/etc/alertmanager/config.yml --name alertmanager prom/alertmanager
[root@localhost ~]# docker ps
CONTAINER ID   IMAGE               COMMAND                  CREATED          STATUS          PORTS                    NAMES
121610a9f7ee   prom/alertmanager   "/bin/alertmanager..."   17 seconds ago   Up 16 seconds   0.0.0.0:9093->9093/tcp   alertmanager
打开prometheus.yml配置文件,去掉注释,修改如下:
# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 192.168.2.136:9093
......
  - job_name: 'AlertManager'
    static_configs:
    - targets: ['192.168.2.136:9093']
检测node_exporter是否是up的,不是的话会发告警。在prometheus.yml中配置规则文件路径:
rule_files:
  - "/usr/local/prometheus/rules/*_rules.yml"
创建规则:
[root@centos7_9-mod prometheus]# mkdir rules
[root@centos7_9-mod prometheus]# cd rules
[root@centos7_9-mod rules]# vi node_rules.yml
groups:
- name: test
  rules:
  - alert: prometheus
    expr: up{job="node_exporter"} == 0
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: "node down"
      description: "Node has been down for more than 3 minutes."
校验及重启:
[root@centos7_9-mod prometheus]# ./promtool check rules rules/node_rules.yml
Checking rules/node_rules.yml
  SUCCESS: 1 rules found
systemctl restart prometheus.service
模拟停止node_exporter:
systemctl stop node_exporter.service
告警页面
告警查询
持续3m后,告警状态由PENDING变为FIRING(变红)
vi rules/node_rules.yml
groups:
- name: test
  rules:
  - alert: prometheus
    expr: up{job="node_exporter"} == 0
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: "{{ $labels.instance }} down.up=={{ $value }}"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 3 minutes."
[root@centos7_9-mod prometheus]# systemctl restart node_exporter.service
[root@centos7_9-mod prometheus]# systemctl restart prometheus.service
正常
重新停止:
[root@centos7_9-mod prometheus]# systemctl stop node_exporter.service
有图有真相
[root@localhost alertmanager]# cat alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.sknfie.com:465'     # 邮箱SMTP服务器代理地址
  smtp_from: 'sknfie@163.com'               # 发送邮件的名称
  smtp_auth_username: 'sknfie@163.com'      # 邮箱用户名称
  smtp_auth_password: 'rkmdpoviehcvddde'    # 邮箱授权密码
  smtp_require_tls: false
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'email'
receivers:
- name: 'email'
  email_configs:
  - to: 'sknfie@163.com'
    headers: { Subject: " WARNING- -告警邮件" }
    send_resolved: true
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
[root@localhost alertmanager]# ./amtool check-config alertmanager.yml
Checking 'alertmanager.yml'  SUCCESS
[root@localhost alertmanager]# systemctl restart alertmanager
[root@localhost prometheus]# cat prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 192.168.2.136:9093
rule_files:
  - "/usr/local/prometheus/rules/*.yml"
scrape_configs:
  - job_name: 'prometheus'
    static_configs:
    - targets: ['localhost:9090']
  - job_name: 'node'
    static_configs:
    - targets: ['192.168.1.6:9100']
  - job_name: 'Alertmanager'
    static_configs:
    - targets: ['192.168.1.10:9093']
[root@localhost prometheus]# cat rules/up_rules.yml
groups:
- name: UP
  rules:
  - alert: node
    expr: up{job="node"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
      summary: "{{ $labels.instance }} down,up=={{ $value }}"
[root@localhost prometheus]# systemctl restart prometheus
停止node_exporter
[root@localhost ~]# systemctl stop node_exporter
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。