# Create a dedicated system user; its home (/var/lib/prometheus) doubles as the data dir.
useradd -r -m -d /var/lib/prometheus prometheus
# Download and unpack Prometheus, then create a version-independent symlink.
wget https://github.com/prometheus/prometheus/releases/download/v2.14.0/prometheus-2.14.0.linux-amd64.tar.gz
tar -xf prometheus-2.14.0.linux-amd64.tar.gz -C /usr/local/
cd /usr/local
ln -sv prometheus-2.14.0.linux-amd64 prometheus
vim /usr/lib/systemd/system/prometheus.service
[Unit]
Description=The Prometheus 2 monitoring system and time series database.
Documentation=https://prometheus.io
After=network.target

[Service]
# '-' prefix: a missing environment file is not an error.
EnvironmentFile=-/etc/sysconfig/prometheus
User=prometheus
# tsdb path points at the prometheus user's home created earlier (/var/lib/prometheus);
# the original /home/prometheus/prometheus does not exist and is not writable by this user.
ExecStart=/usr/local/prometheus/prometheus --storage.tsdb.path=/var/lib/prometheus --config.file=/usr/local/prometheus/prometheus.yml --web.listen-address=0.0.0.0:9090 --web.external-url= $PROM_EXTRA_ARGS
Restart=on-failure
StartLimitInterval=1
RestartSec=3

[Install]
WantedBy=multi-user.target
其他运行时参数: ./prometheus --help
systemctl daemon-reload
systemctl start prometheus.service
iptables -I INPUT -p tcp --dport 9090 -s NETWORK/MASK -j ACCEPT
http://IP:PORT
$ docker run --name prometheus -d -v ./prometheus:/etc/prometheus/ -v ./db/:/prometheus -p 9090:9090 prom/prometheus --config.file=/etc/prometheus/prometheus.yml --web.listen-address="0.0.0.0:9090" --storage.tsdb.path=/prometheus --web.console.libraries=/usr/share/prometheus/console_libraries --web.console.templates=/usr/share/prometheus/consoles --storage.tsdb.retention=30d
--config.file=/etc/prometheus/prometheus.yml # 指明主配置文件
--web.listen-address="0.0.0.0:9090" # 指明监听地址端口
--storage.tsdb.path=/prometheus # 指明数据库目录
--web.console.libraries=/usr/share/prometheus/console_libraries
--web.console.templates=/usr/share/prometheus/consoles # 指明console lib 和 tmpl
--storage.tsdb.retention=60d # 指明数据保留天数,默认15
Prometheus的主配置文件为prometheus.yml
它主要由global、rule_files、scrape_configs、alerting、remote_write和remote_read六个配置段组成:
- global:全局配置段;
- rule_files:指定告警规则文件的路径
- scrape_configs:
scrape配置集合,用于定义监控的目标对象(target)的集合,以及描述如何抓取(scrape)相关指标数据的配置参数;
通常,每个scrape配置对应于一个单独的作业(job),
而每个targets可通过静态配置(static_configs)直接给出定义,也可基于Prometheus支持的服务发现机制进行自动配置;
- job_name: 'nodes'
  static_configs: # 静态指定,targets中的 host:port/metrics 将会作为metrics抓取对象
    - targets: ['localhost:9100']
    - targets: ['172.20.94.1:9100']
- job_name: 'docker_host'
  file_sd_configs: # 基于文件的服务发现,文件中(yml 和 json 格式)定义的host:port/metrics将会成为抓取对象
    - files:
        - ./sd_files/docker_host.yml
      refresh_interval: 30s
可由Prometheus使用的Alertmanager实例的集合,以及如何同这些Alertmanager交互的配置参数;
每个Alertmanager可通过静态配置(static_configs)直接给出定义,也可基于Prometheus支持的服务发现机制进行自动配置;
配置“远程写”机制,Prometheus需要将数据保存于外部的存储系统(例如InfluxDB)时 定义此配置段,
随后Prometheus将样本数据通过HTTP协议发送给由URL指定适配器(Adaptor);
remote_read:
配置“远程读”机制,Prometheus将接收到的查询请求交给由URL指定的适配器(Adapter)执行,
Adapter将请求条件转换为远程存储服务中的查询请求,并将获取的响应数据转换为Prometheus可用的格式;
监控及告警规则配置文件:*.yml
rule_files:
- "test_rules.yml" # 指定配置告警规则的文件路径
服务发现定义文件:支持yaml 和 json 两种格式
file_sd_configs:
- files:
- ./sd_files/http.yml
refresh_interval: 30s
global:
  scrape_interval: 15s     # 每过15秒抓取一次指标数据
  evaluation_interval: 15s # 每过15秒执行一次报警规则,也就是说15秒执行一次报警
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["localhost:9093"] # 设置报警信息推送地址,一般而言设置的是alertmanager的地址
rule_files:
  - "test_rules.yml" # 指定配置告警规则的文件路径
scrape_configs:
  - job_name: 'node' # 自己定义的监控的job_name
    static_configs: # 配置静态规则,直接指定抓取的ip:port
      - targets: ['localhost:9100']
  - job_name: 'CDG-MS'
    honor_labels: true
    metrics_path: '/prometheus'
    static_configs:
      - targets: ['localhost:8089']
    relabel_configs:
      - target_label: env
        replacement: dev
  - job_name: 'eureka'
    file_sd_configs: # 基于文件的服务发现
      - files:
          - "/app/enmonster/basic/prometheus/prometheus-2.2.1.linux-amd64/eureka.json" # 支持json 和yml 两种格式
        refresh_interval: 30s # 30s自动刷新配置并读取文件,修改之后无需手动reload
    relabel_configs:
      - source_labels: [__job_name__]
        regex: (.*)
        target_label: job
        replacement: ${1}
      - target_label: env
        replacement: dev
告警规则配置文件示例:
[root@host40 monitor-bak]# cat prometheus/rules/docker_monitor.yml
groups:
name: "container monitor"
rules:
基于文件的服务发现定义文件: *.yml
[root@host40 monitor]# cat prometheus/sd_files/virtual_lan.yml
- targets: ['10.10.11.179:9100']
- targets: ['10.10.11.178:9100']
[root@host40 monitor]# cat prometheus/sd_files/tcp.yml
- targets: ['10.10.11.178:8001']
  labels:
    server_name: http_download
- targets: ['10.10.11.178:3307']
  labels:
    server_name: xiaojing_db
- targets: ['10.10.11.178:3001']
  labels:
    server_name: test_web
node_exporter 在被监控节点安装,抓取主机监控信息,并对外提供http服务,供prometheus抓取监控信息。
下载并解压
wget https://github.com/prometheus/node_exporter/releases/download/v0.18.1/node_exporter-0.18.1.linux-amd64.tar.gz
tar xf node_exporter-0.18.1.linux-amd64.tar.gz -C /usr/local/
cd /usr/local
ln -sv node_exporter-0.18.1.linux-amd64/ node_exporter
创建用户:
useradd -r -m -d /var/lib/prometheus prometheus
配置unit file:
vim /usr/lib/systemd/system/node_exporter.service
[Unit]
Description=Prometheus exporter for machine metrics, written in Go with pluggable metric collectors.
Documentation=https://github.com/prometheus/node_exporter
After=network.target

[Service]
# '-' prefix: a missing environment file is not an error.
EnvironmentFile=-/etc/sysconfig/node_exporter
User=prometheus
ExecStart=/usr/local/node_exporter/node_exporter $NODE_EXPORTER_OPTS
Restart=on-failure
StartLimitInterval=1
RestartSec=3

[Install]
WantedBy=multi-user.target
启动服务:
systemctl daemon-reload
systemctl start node_exporter.service
可以手动测试是否可以获取metrics信息:
curl http://localhost:9100/metrics
开启防火墙:
iptables -I INPUT -p tcp --dport 9100 -s NET/MASK -j ACCEPT
image: quay.io/prometheus/node-exporter,prom/node-exporter
启动命令:
docker run -d --net="host" --pid="host" -v "/:/host:ro,rslave" --name monitor-node-exporter --restart always quay.io/prometheus/node-exporter --path.rootfs=/host --web.listen-address=:9100
对于部分低版本的docker,出现报错:Error response from daemon: linux mounts: Could not find source mount of /
解决办法:-v "/:/host:ro,rslave" -> -v "/:/host:ro"
开启关闭collectors:
./node_exporter --help # 查看支持的所有collectors,可根据实际需求 enable 和 disabled 各项指标收集
如 --collector.cpu=disabled ,不再收集cpu相关信息
Textfile Collector: 文本文件收集器
通过 启动参数 --collector.textfile.directory="DIR" 可开启文本文件收集器
收集器会收集目录下所有*.prom的文件中的指标,指标必须满足 prom格式
示例:
echo my_batch_job_completion_time $(date +%s) > /path/to/directory/my_batch_job.prom.$$
mv /path/to/directory/my_batch_job.prom.$$ /path/to/directory/my_batch_job.prom
echo ‘role{role="application_server"} 1‘ > /path/to/directory/role.prom.$$
mv /path/to/directory/role.prom.$$ /path/to/directory/role.prom
rpc_duration_seconds{quantile="0.5"} 4773
http_request_duration_seconds_bucket{le="0.5"} 129389
即如果node_exporter 不能满足自身指标抓取,可以通过脚本形式将指标抓取之后写入文件,由node_exporter对外提供给prometheus抓取
这样可以省掉pushgateway
示例: prometheus.yml
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
static_configs:
job_name: ‘nodes‘
static_configs:
- job_name: ‘node_real_lan‘
file_sd_configs:
- files:
sudo docker run --volume=/:/rootfs:ro --volume=/var/run:/var/run:ro --volume=/sys:/sys:ro --volume=/var/lib/docker/:/var/lib/docker:ro --volume=/dev/disk/:/dev/disk:ro --publish=9080:8080 --detach=true --name=cadvisor --privileged --device=/dev/kmsg google/cadvisor:v0.33.0
配置示例:
- job_name: ‘docker‘
static_configs:
下载并安装
wget https://dl.grafana.com/oss/release/grafana-7.2.2-1.x86_64.rpm
sudo yum install grafana-7.2.2-1.x86_64.rpm
准备service 文件:
[Unit]
Description=Grafana instance
Documentation=http://docs.grafana.org
Wants=network-online.target
After=network-online.target
After=postgresql.service mariadb.service mysqld.service

[Service]
EnvironmentFile=/etc/sysconfig/grafana-server
User=grafana
Group=grafana
Type=notify
Restart=on-failure
WorkingDirectory=/usr/share/grafana
RuntimeDirectory=grafana
RuntimeDirectoryMode=0750
# 原文 "…grafana-server.pid--packaging=rpm" 缺少空格,会把 pidfile 参数和 --packaging 粘连
ExecStart=/usr/sbin/grafana-server --config=${CONF_FILE} --pidfile=${PID_FILE_DIR}/grafana-server.pid --packaging=rpm cfg:default.paths.logs=${LOG_DIR} cfg:default.paths.data=${DATA_DIR} cfg:default.paths.plugins=${PLUGINS_DIR} cfg:default.paths.provisioning=${PROVISIONING_CFG_DIR}
LimitNOFILE=10000
TimeoutStopSec=20

[Install]
WantedBy=multi-user.target
启动grafana
systemctl enable grafana-server.service
systemctl restart grafana-server.service
默认监听3000端口
开启防火墙:
iptables -I INPUT -p tcp --dport 3000 -s NET/MASK -j ACCEPT
image: grafana/grafana
docker run -d --name=grafana -p 3000:3000 grafana/grafana:7.2.2
web页面访问:
http://ip:port
首次登陆会要求自行设置账号密码
7.2版本会要求输入账号密码之后重置,初始账号密码都是admin
使用流程:
常用模板编号:
重置管理员密码:
查看Grafana配置文件,确定grafana.db的路径
配置文件路径:/etc/grafana/grafana.ini
[paths]
;data = /var/lib/grafana
[database]
# For "sqlite3" only, path relative to data_path setting
;path = grafana.db
通过配置文件得知grafana.db的完整路径如下:
/var/lib/grafana/grafana.db
使用sqlites修改admin密码
sqlite3 /var/lib/grafana/grafana.db
sqlite> update user set password =
‘59acf18b94d7eb0694c61e60ce44c110c7a683ac6a8f09580d626f90f4a242000746579358d77dd9e570e83fa24faa88a8a6‘,
salt = ‘F3FAxVm33R‘ where login = ‘admin‘;
.exit
使用admin admin 登录
grafana-server配置 smtp服务器,配置发件邮箱
vim /etc/grafana/grafana.ini
[smtp]
enabled = true
host = smtp.126.com:465
user = USER@126.com
password = PASS
skip_verify = false
from_address = USER@126.com
from_name = Grafana Alart
grafana页面添加Notification Channel
Alerting -> Notification Channel
save之前 可以send test
进入dashboard,添加alart rules
prometheus用来查询数据库的语法规则,用来将数据库中存储的由各exporter 采集到的metric指标组织成可视化的图标信息,以及告警规则
gauges: 返回单一数值,如:
node_boot_time_seconds{instance="10.10.11.40:9100",job="node_real_lan"} 1574040030
counters: 计数,
histograms: 直方图,统计数据的分布情况。比如最大值,最小值,中间值,中位数,百分位数等。
node_boot_time_seconds{instance="10.10.11.40:9100",job="node_real_lan"}
如上示例,这里的instance,和job 就是label
也可以在配置文件自行定义label,如:
- targets: [‘10.10.11.178:3001‘]
labels:
server_name: test_web
添加的label即会在prometheus查询数据使用:
metric{servername=...,}
计算cpu使用率:
(1-((sum(increase(node_cpu_seconds_total{mode="idle"}[1m])) by (instance))/(sum(increase(node_cpu_seconds_total[1m])) by (instance)))) * 100
其中metric:
node_cpu_seconds_total # 总cpu 使用时间
node_cpu_seconds_total{mode="idle"} # 空闲cpu使用时间,其他类似标签: user , system , steal , softirq , irq , nice , iowait , idle
用到的函数:
increase( [1m]) # 1分钟之类的增量。
sum()
sum() by (TAG) # 其中 TAG 是标签,此地 instance 代表的是机器名. 按主机名进行相加,否则多主机只会显示一条线。
匹配运算:
= #等于 Select labels that are exactly equal to the provided string.
!= #不等于 Select labels that are not equal to the provided string.
=~ #正则表达式匹配 Select labels that regex-match the provided string.
!~ #正则表达式不匹配 Select labels that do not regex-match the provided string.
示例:
node_cpu_seconds_total{mode="idle"} # mode : 标签,metric自带属性。
api_http_requests_total{method="POST", handler="/messages"}
http_requests_total{environment=~"staging|testing|development",method!="GET"}
注意: 必须指定一个名称或至少一个与空字符串不匹配的标签匹配器
{job=~".*"} # Bad!
{job=~".+"} # Good!
{job=~".*",method="get"} # Good!
时间范围:
s -秒
m - 分钟
h - 小时
d - 天
w -周
y -年
运算符:
+ (addition)
- (subtraction)
* (multiplication)
/ (division)
% (modulo)
^ (power/exponentiatio
== (equal)
!= (not-equal)
> (greater-than)
< (less-than)
>= (greater-or-equal)
<= (less-or-equal)
and (intersection)
or (union)
unless (complement)
集合运算符:
sum (calculate sum over dimensions)
min (select minimum over dimensions)
max (select maximum over dimensions)
avg (calculate the average over dimensions)
stddev (calculate population standard deviation over dimensions)
stdvar (calculate population standard variance over dimensions)
count (count number of elements in the vector)
count_values (count number of elements with the same value)
bottomk (smallest k elements by sample value)
topk (largest k elements by sample value)
quantile (calculate φ-quantile (0 ≤ φ ≤ 1) over dimensions)
sum() by (instance) #求和(根据条件求和)
increase() # 取增量,针对counter类型
示例:
increase(node_network_receive_bytes_total[30s]) # 接受流量
rate() # 专门搭配counter类型数据使用的函数,按照设置的一个时间段,取counter在这个时间段中的平均每秒的增量
示例:
rate(node_network_receive_bytes_total[30s])*8 # 入口带宽
topk() # 给定数字x,根据数值排序之后去最高的x个数
示例:
topk(5,node_cpu_seconds_total) # 取node_cpu_seconds_total 最长的前5个
topk(5,increase(node_network_receive_bytes_total[10m])) # 10m钟之内收到的流量前5
注意:
会造成图像散列。
console中执行,取一次值。
count() # 计数,如 count(node_load1 > 5)
avg() by () # 取均值,by(label)
配置规则以将抓取的数据汇总到新的时间序列中
示例:
将以下规则,记录进prometheus.rules.yml文件
avg(rate(rpc_durations_seconds_count[5m])) by (job, service)
记录进prometheus.rules.yml文件中
groups:
- name: example
rules:
record: job_service:rpc_durations_seconds_count:avg_rate5m
expr: avg(rate(rpc_durations_seconds_count[5m])) by (job, service)
在prometheus.yml文件中:
rule_files:
相当于生产了一个新的matric,不过此matric不是抓取来的,而是计算来的。
文档地址:https://prometheus.io/docs/alerting/latest/configuration/
下载并安装:
wget https://github.com/prometheus/alertmanager/releases/download/v0.20.0/
alertmanager-0.20.0.linux-amd64.tar.gz
tar -xf alertmanager-0.20.0.linux-amd64.tar.gz -C /usr/local
cd /usr/local && ln -sv alertmanager-0.20.0.linux-amd64/ alertmanager && cd alertmanager
启动:
nohup ./alertmanager --config.file="alertmanager.yml" --storage.path="data/ --web.listen-address=":9093" &
image: prom/alertmanager
docker run
docker run -dit --name monitor-alertmanager -v ./alertmanager/db/:/alertmanager -v ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml ./alertmanager/templates/:/etc/alertmanager/templates -p 9093:9093 --restart always --privileged true prom/alertmanager --config.file="/etc/alertmanager/alertmanager.yml" --storage.path="/alertmanager --web.listen-address=":9093"
grouping: 分组
示例:
发生网络分区时,群集中正在运行数十个或数百个服务实例。您有一半的服务实例不再可以访问数据库。
Prometheus中的警报规则配置为在每个服务实例无法与数据库通信时为其发送警报。结果,数百个警报被发送到Alertmanager。
作为用户,人们只希望获得一个页面,同时仍然能够准确查看受影响的服务实例。因此,可以将Alertmanager配置为按警报的群集和
警报名称分组警报,以便它发送一个紧凑的通知。
Inhibition: 抑制
抑制是一种概念,如果某些其他警报已经触发,则抑制某些警报的通知。
正在触发警报,通知您无法访问整个群集。可以将Alertmanager配置为使与该群集有关的所有其他警报静音。这样可以防止与实际问题无关的数百或数千个触发警报的通知。
Silences: 静默
alerting:
alerting:
alertmanagers:
static_configs:
rule_files:
# Load rules once and periodically evaluate them according to the global ‘evaluation_interval‘.
rule_files:
"rules/*.yml"
scrape_configs:
scrape_configs:
示例:
[root@xiang-03 /usr/local/prometheus]#cat rules/node.yml
groups:
name: "system info"
rules:
主配置文件中需要配置:
先看示例:
vim alertmanager.yml
global:
smtp_smarthost: ‘xxx‘
smtp_from: ‘xxx‘
smtp_auth_username: ‘xxx‘
smtp_auth_password: ‘xxx‘
smtp_require_tls: false
templates:
to: ‘xxx@xx.xx‘
html: ‘{{ template "xx.html" . }}‘
headers: { Subject: " {{ 第二路由匹配测试}}" }
vim test.tmpl
{{ define "xx.html" }}
<table border="5">
<tr><td>报警项</td>
<td>磁盘</td>
<td>报警阀值</td>
<td>开始时间</td>
</tr>
{{ range $i, $alert := .Alerts }}
<tr><td>{{ index $alert.Labels "alertname" }}</td>
<td>{{ index $alert.Labels "instance" }}</td>
<td>{{ index $alert.Annotations "value" }}</td>
<td>{{ $alert.StartsAt }}</td>
</tr>
{{ end }}
</table>
{{ end }}
global:
resolve_timeout: # 在没有报警的情况下声明为已解决的时间
route: # 所有报警信息进入后的根路由,用来设置报警的分发策略
group_by: [‘LABEL_NAME‘,‘alertname‘, ‘cluster‘,‘job‘,‘instance‘,...]
这里的标签列表是接收到报警信息后的重新分组标签,例如,接收到的报警信息里面有许多具有 cluster=A
和alertname=LatncyHigh 这样的标签的报警信息将会批量被聚合到一个分组里面
group_wait: 30s
当一个新的报警分组被创建后,需要等待至少group_wait时间来初始化通知,这种方式可以确保您能有足够的时间为同一分组来获取多个警报,然后一起触发这个报警信息。
group_interval: 5m
当第一个报警发送后,等待‘group_interval‘时间来发送新的一组报警信息。
repeat_interval: 5m
如果一个报警信息已经发送成功了,等待‘repeat_interval‘时间来重新发送他们
match:
label_name: NAME
匹配报警规则,满足条件的告警将被发给 receiver
match_re:
label_name: <regex>, ...
正则表达式匹配。满足条件的告警将被发给 receiver
receiver: receiver_name
将满足match 和 match_re的告警发给后端 告警媒介(邮件,webhook,pagerduty,wechat,...)
必须有一个default receivererr="root route must specify a default receiver"
routes:
- <route> ...
配置多条规则。
templates:
[ - <filepath> ... ]
? 配置模板,比如邮件告警页面模板
receivers:
- <receiver> ...# 列表
- name: receiver_name # 用于填写在route.receiver中的名字
email_configs: # 配置邮件告警
- to: <tmpl_string>
send_resolved: <boolean> | default = false # 故障恢复之后,是否发送恢复通知
配置接受邮件告警的邮箱,也可以配置单独配置发件邮箱。 详见官方文档
https://prometheus.io/docs/alerting/latest/configuration/#email_config
- name: ...
wechat_configs:
- send_resolved: <boolean> | default = false
api_secret: <secret> | default = global.wechat_api_secret
api_url: <string> | default = global.wechat_api_url
corp_id: <string> | default = global.wechat_api_corp_id
message: <tmpl_string> | default = ‘{{ template "wechat.default.message" . }}‘
agent_id: <string> | default = ‘{{ template "wechat.default.agent_id" . }}‘
to_user: <string> | default = ‘{{ template "wechat.default.to_user" . }}‘
to_party: <string> | default = ‘{{ template "wechat.default.to_party" . }}‘
to_tag: <string> | default = ‘{{ template "wechat.default.to_tag" . }}‘
# 说明
to_user: 企业微信用户ID
to_party: 需要发送的组id
corp_id: 企业微信账号唯一ID 可以在 我的企业 查看
agent_id: 应用的 ID,应用管理 --> 打开自定应用查看
api_secret: 应用的密钥
打开企业微信注册 https://work.weixin.qq.com
微信API官方文档 https://work.weixin.qq.com/api/doc#90002/90151/90854
企业微信告警配置
inhibit_rules:
- source_match:
severity: ‘critical‘
target_match:
severity: ‘warning‘
equal: [‘alertname‘, ‘dev‘, ‘instance‘]
抑制相关配置
注册企业: https://work.weixin.qq.com
可以注册未认证企业,人数上限200,绑定个人微信即可使用web后台
微信API官方文档 : https://work.weixin.qq.com/api/doc#90002/90151/90854
注册之后绑定私人微信即可扫码进入管理后台。
发送告警的应用需要新建,操作也很简单
需要注意的参数:
receivers:
- name: ‘default‘
email_configs:
- to: ‘XXX‘
send_resolved: true
wechat_configs:
- send_resolved: true
corp_id: ‘XXX‘
api_secret: ‘XXX‘
agent_id: 1000002
to_user: XXX
to_party: 2
message: ‘{{ template "wechat.html" . }}‘
template:
由于alertmanager默认的微信报警模板太过丑陋和冗长,所以使用自定义告警模板;邮件模板默认的倒是还可以
cat wechat.tmpl
{{ define "wechat.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}{{ range .Alerts }}
[@警报~]
实例: {{ .Labels.instance }}
信息: {{ .Annotations.summary }}
详情: {{ .Annotations.description }}
值: {{ .Annotations.value }}
时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end -}}
{{- if gt (len .Alerts.Resolved) 0 -}}{{ range .Alerts }}
[@恢复~]
实例: {{ .Labels.instance }}
信息: {{ .Annotations.summary }}
时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
恢复: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end -}}
{{- end }}
参考来源: https://blog.csdn.net/knight_zhou/article/details/106323719
Prometheus 邮件告警自定义模板的默认使用的是utc时间。
触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
修改之后:{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
(注意:Go 的时间格式化必须使用参考时间 2006-01-02 15:04:05,写成 2020 会输出错误内容)
vim rules/docker_monitor.yml
groups:
  - name: "container monitor"
    rules:
      - alert: "Container down: env1"
        # 容器超过60秒未被cadvisor看到即视为down
        expr: time() - container_last_seen{name="env1"} > 60
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Container down: {{$labels.instance}} name={{$labels.name}}"
注意:
此项指标只能监控容器down 掉,无法准确监控容器恢复(不准),即便容器没有成功启动,过一段时间,也会受到resolve通知
groups:
- name: 主机状态-监控告警
rules:
- alert: 主机状态
expr: up == 0
for: 1m
labels:
status: 非常严重
annotations:
summary: "{{$labels.instance}}:服务器宕机"
description: "{{$labels.instance}}:服务器延时超过5分钟"
- alert: CPU使用情况
expr: 100-(avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100)
for: 1m
labels:
status: 一般告警
annotations:
summary: "{{$labels.mountpoint}} CPU使用率过高!"
description: "{{$labels.mountpoint }} CPU使用大于60%(目前使用:{{$value}}%)"
- alert: cpu使用率过高告警 # 查询提供了hostname label
expr: (100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 10
nodename) (node_uname_info) > 85
for: 5m
labels:
region: 成都
annotations:
summary: "{{$labels.instance}}({{$labels.nodename}})CPU使用率过高!"
description: ‘服务器{{$labels.instance}}({{$labels.nodename}})CPU使用率超过85%(
$value}}%)‘
- alert: 系统负载过高
expr: (node_load1/count without (cpu, mode) (node_cpu_seconds_total{mode="system"}
nodename) (node_uname_info)>1.1
for: 3m
labels:
region: 成都
annotations:
summary: "{{$labels.instance}}({{$labels.nodename}})系统负载过高!"
description: ‘{{$labels.instance}}({{$labels.nodename}})当前负载超标率 {{printf
- alert: 内存不足告警
expr: (100 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100)* o
nodename) (node_uname_info) > 80
for: 3m
labels:
region: 成都
annotations:
summary: "{{$labels.instance}}({{$labels.nodename}})内存使用率过高!"
description: ‘服务器{{$labels.instance}}({{$labels.nodename}})内存使用率超过80%(
$value}}%)‘
- alert: IO操作耗时
expr: 100-(avg(irate(node_disk_io_time_seconds_total[1m])) by(instance)* 100) <
for: 1m
labels:
status: 严重告警
annotations:
summary: "{{$labels.mountpoint}} 流入磁盘IO使用率过高!"
description: "{{$labels.mountpoint }} 流入磁盘IO大于60%(目前使用:{{$value}})"
- alert: 网络流入
expr: ((sum(rate (node_network_receive_bytes_total{device!~‘tap.*|veth.*|br.*|do
instance)) / 100) > 102400
for: 1m
labels:
status: 严重告警
annotations:
summary: "{{$labels.mountpoint}} 流入网络带宽过高!"
description: "{{$labels.mountpoint }}流入网络带宽持续2分钟高于100M. RX带宽使用率{
- alert: 网络流出
expr: ((sum(rate (node_network_transmit_bytes_total{device!~‘tap.*|veth.*|br.*|d
instance)) / 100) > 102400
for: 1m
labels:
status: 严重告警
annotations:
summary: "{{$labels.mountpoint}} 流出网络带宽过高!"
description: "{{$labels.mountpoint }}流出网络带宽持续2分钟高于100M. RX带宽使用率{
- alert: network in
expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1
for: 1m
labels:
name: network
severity: Critical
annotations:
summary: "{{$labels.mountpoint}} 流入网络带宽过高"
description: "{{$labels.mountpoint }}流入网络异常,高于100M"
value: "{{ $value }}"
- alert: network out
expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 /
for: 1m
labels:
name: network
severity: Critical
annotations:
summary: "{{$labels.mountpoint}} 发送网络带宽过高"
description: "{{$labels.mountpoint }}发送网络异常,高于100M"
value: "{{ $value }}"
- alert: TCP会话
expr: node_netstat_Tcp_CurrEstab > 1000
for: 1m
labels:
status: 严重告警
annotations:
summary: "{{$labels.mountpoint}} TCP_ESTABLISHED过高!"
description: "{{$labels.mountpoint }} TCP_ESTABLISHED大于1000%(目前使用:{{$valu
- alert: 磁盘容量
expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_b
> 80
for: 1m
labels:
status: 严重告警
annotations:
summary: "{{$labels.mountpoint}} 磁盘分区使用率过高!"
description: "{{$labels.mountpoint }} 磁盘分区使用大于80%(目前使用:{{$value}}%)"
- alert: 硬盘空间不足告警 # 查询结果多了hostname等label
expr: (100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_by
)* on(instance) group_left(nodename) (node_uname_info)> 80
for: 3m
labels:
region: 成都
annotations:
summary: "{{$labels.instance}}({{$labels.nodename}})硬盘使用率过高!"
description: ‘服务器{{$labels.instance}}({{$labels.nodename}})硬盘使用率超过80%(
$value}}%)‘
- alert: volume fullIn fourdaysd # 预计磁盘4天后写满
expr: predict_linear(node_filesystem_free_bytes[2h], 4 * 24 * 3600) < 0
for: 5m
labels:
name: disk
severity: Critical
annotations:
summary: "{{$labels.mountpoint}} 预计主机可用磁盘空间4天后将写满"
description: "{{$labels.mountpoint }}"
value: "{{ $value }}%"
- alert: disk write rate
expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024
for: 1m
labels:container_memory_max_usage_bytes
name: disk
severity: Critical
annotations:
summary: "disk write rate (instance {{ $labels.instance }})"
description: "磁盘写入速率大于50MB/s"
value: "{{ $value }}%"
- alert: disk read latency
expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_complet
for: 1m
labels:
name: disk
severity: Critical
annotations:
summary: "unusual disk read latency (instance {{ $labels.instance }})"
description: "磁盘读取延迟大于100毫秒"
value: "{{ $value }}%"
- alert: disk write latency
expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_compl
for: 1m
labels:
name: disk
severity: Critical
annotations:
summary: "unusual disk write latency (instance {{ $labels.instance }})"
description: "磁盘写入延迟大于100毫秒"
value: "{{ $value }}%"
GET /-/healthy
GET /-/ready
POST /-/reload
curl -u monitor:fosafer.com 127.0.0.1:9093/-/healthy
OK
curl -XPOST -u monitor:fosafer.com 127.0.0.1:9093/-/reload
[root@host40 monitor]# curl -XPOST -u monitor:fosafer.com 127.0.0.1:9093/-/reload
failed to reload config: yaml: unmarshal errors:
line 26: field receiver already set in type config.plain
等同: docker exec -it monitor-alertmanager kill -1 1 ,但是失败会报错
blackbox_exporter是Prometheus 官方提供的 exporter 之一,可以提供 http、dns、tcp、icmp 的监控数据采集。
应用场景:
HTTP 测试
定义 Request Header 信息
判断 Http status / Http Respones Header / Http Body 内容
TCP 测试
业务组件端口状态监听
应用层协议定义与监听
ICMP 测试
主机探活机制
POST 测试
接口联通性
SSL 证书过期时间
下载并解压
wget https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/
blackbox_exporter-0.18.0.linux-amd64.tar.gz
tar -xf blackbox_exporter-0.18.0.linux-amd64.tar.gz -C /usr/local/
cd /usr/local
ln -sv blackbox_exporter-0.18.0.linux-amd64 blackbox_exporter
cd blackbox_exporter
./blackbox_exporter --version
添加systemd服务unit:
vim /lib/systemd/system/blackbox_exporter.service
[Unit]
Description=blackbox_exporter
After=network.target
[Service]
User=root
Type=simple
ExecStart=/usr/local/blackbox_exporter/blackbox_exporter --config.file=/usr/local/blackbox_exporter/blackbox.yml
Restart=on-failure
[Install]
WantedBy=multi-user.target
systemctl daemon-reload
systemctl enable blackbox_exporter
systemctl start blackbox_exporter
image: prom/blackbox-exporter:master
docker run:
docker run --rm -d -p 9115:9115 --name blackbox_exporter -v `pwd`:/config prom/blackbox-exporter:master --config.file=/config/blackbox.yml
默认配置文件:
blackbox_exporter 默认情况配置文件已经能够满足大多数需求,后续如需自行配置,参见官方文档,以及项目类一个示例配置文件
cat blackbox.yml
modules:
  http_2xx:
    prober: http
  http_post_2xx:
    prober: http
    http:
      method: POST
  tcp_connect:
    prober: tcp
  pop3s_banner:
    prober: tcp
    tcp:
      query_response:
        - expect: "^+OK"
      tls: true
      tls_config:
        insecure_skip_verify: false
  ssh_banner:
    prober: tcp
    tcp:
      query_response:
        - expect: "^SSH-2.0-"
  irc_banner:
    prober: tcp
    tcp:
      query_response:
        - send: "NICK prober"
        - send: "USER prober prober prober :prober"
        - expect: "PING :([^ ]+)"
          send: "PONG ${1}"
        - expect: "^:[^ ]+ 001"
  icmp:
    prober: icmp
官方介绍: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config
参考文档: https://blog.csdn.net/qq_25934401/article/details/84325356
说明:
labels:
job: job_name
__address__: <host>:<port>
instance: 默认__address__,如果没有被重新标签的话
__scheme__: scheme
__metrics_path__: path
__param_<name>: url 中第一个出现的 <name> 参数
scrape_configs:
  - job_name: 'blackbox'
    metrics_path: /probe
    params:
      module: [http_2xx] # Look for a HTTP 200 response.
    static_configs:
      - targets:
          - http://prometheus.io    # Target to probe with http.
          - https://prometheus.io   # Target to probe with https.
          - http://example.com:8080 # Target to probe with http on port 8080.
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 127.0.0.1:9115 # The blackbox exporter's real hostname:port.
# 原文 job_name 末尾多了一个 ']',已去除
- job_name: "blackbox_telnet_port"
  scrape_interval: 5s
  metrics_path: /probe
  params:
    module: [tcp_connect]
  static_configs:
    - targets: ['1x3.x1.xx.xx4:443']
      labels:
        group: 'xxxidc机房ip监控'
    - targets: ['10.xx.xx.xxx:443']
      labels:
        group: 'Process status of nginx(main) server'
  relabel_configs:
    - source_labels: [__address__]
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: instance
    - target_label: __address__
      replacement: 10.xxx.xx.xx:9115
- job_name: 'blackbox00_ping_idc_ip'
  scrape_interval: 10s
  metrics_path: /probe
  params:
    module: [icmp] # ping
  static_configs:
    - targets: ['1x.xx.xx.xx']
      labels:
        group: 'xxnginx 虚拟IP'
  relabel_configs:
    - source_labels: [__address__]
      regex: (.*)(:80)?
      target_label: __param_target
      replacement: ${1}
    - source_labels: [__param_target]
      regex: (.*)
      target_label: ping
      replacement: ${1}
    - source_labels: []
      regex: .*
      target_label: __address__
      replacement: 1x.xxx.xx.xx:9115
- job_name: 'blackbox_http_2xx_post'
  scrape_interval: 10s
  metrics_path: /probe
  params:
    module: [http_post_2xx_query]
  static_configs:
    - targets:
        - https://xx.xxx.com/api/xx/xx/fund/query.action
      labels:
        group: 'Interface monitoring'
  relabel_configs:
    - source_labels: [__address__]
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: instance
    - target_label: __address__
      replacement: 1x.xx.xx.xx:9115 # The blackbox exporter's real hostname:port.
# 原文使用了中文弯引号 ‘EOF‘,shell 无法识别,heredoc 会失败;改为 'EOF'(不展开变量)。
cat << 'EOF' > prometheus.yml
rule_files:
  - ssl_expiry.rules
scrape_configs:
  - job_name: 'blackbox'
    metrics_path: /probe
    params:
      module: [http_2xx] # Look for a HTTP 200 response.
    static_configs:
      - targets:
          - example.com # Target to probe
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 127.0.0.1:9115 # Blackbox exporter.
EOF
cat << 'EOF' > ssl_expiry.rules
groups:
- name: ssl_expiry.rules
  rules:
  - alert: SSLCertExpiringSoon
    expr: probe_ssl_earliest_cert_expiry{job="blackbox"} - time() < 86400 * 30
    for: 10m
EOF
类似于:
curl http://172.16.10.65:9115/probe?target=prometheus.io&module=http_2xx&debug=true
icmp、tcp、http、post 监测是否正常可以观察probe_success 这一指标
probe_success == 0 ##联通性异常
probe_success == 1 ##联通性正常
告警也是判断这个指标是否等于0,如等于0 则触发异常报警
[sss@prometheus01 prometheus]$ cat rules/blackbox-alert.rules
groups:
- name: blackbox_network_stats
rules:
prometheus
alertmanager
grafana
nginx
node_exporter
cadvisor
blackbox_exporter
prom/prometheus
prom/alertmanager
quay.io/prometheus/node-exporter ,prom/node-exporter
gcr.io/google_containers/cadvisor[:v0.36.0] # 需要能访问google
google/cadvisor:v0.33.0 # docker hub镜像,版本没有google的新
grafana/grafana
nginx
将iamge pull下来之后从新tag ,并上传至本地harbor 仓库
image: 10.10.11.40:80/base/nginx:1.19.3
image: 10.10.11.40:80/base/prometheus:2.22.0
image: 10.10.11.40:80/base/grafana:7.2.2
image: 10.10.11.40:80/base/alertmanager:0.21.0
image: 10.10.11.40:80/base/node_exporter:1.0.1
image: 10.10.11.40:80/base/cadvisor:v0.33.0
image: 10.10.11.40:80/base/blackbox-exporter:0.18.0
目录结构一览
mkdir /home/deploy/monitor
cd /home/deploy/monitor
[root@host40 monitor]# tree
.
├── alertmanager
│ ├── alertmanager.yml
│ ├── db
│ │ ├── nflog
│ │ └── silences
│ └── templates
│ └── wechat.tmpl
├── blackbox_exporter
│ └── blackbox.yml
├── docker-compose.yml
├── grafana
│ └── db
│ ├── grafana.db
│ ├── plugins
...
├── nginx
│ ├── auth
│ └── nginx.conf
├── node-exporter
│ └── textfiles
├── node_exporter_install_docker.sh
├── prometheus
│ ├── db
│ ├── prometheus.yml
│ ├── rules
│ │ ├── docker_monitor.yml
│ │ ├── system_monitor.yml
│ │ └── tcp_monitor.yml
│ └── sd_files
│ ├── docker_host.yml
│ ├── http.yml
│ ├── icmp.yml
│ ├── real_lan.yml
│ ├── real_wan.yml
│ ├── sedFDm5Rw
│ ├── tcp.yml
│ ├── virtual_lan.yml
│ └── virtual_wan.yml
└── sd_controler.sh
nginx basic认证需要的文件:
[root@host40 monitor-bak]# ls nginx/auth/ -a
. .. .htpasswd
部分挂在目录权限:
prometheus,grafana,alertmanager 的 db目录 需要777权限
单独挂在的配置文件 alertmanager.yml,prometheus.yml,nginx.conf 需要 666权限。
如果为了安全起见,建议将配置文件放入专门目录中挂载,并在command 中修改启动参数指定配置文件即可
[root@host40 monitor-bak]# cat docker-compose.yml
version: "3"
services:
nginx:
image: 10.10.11.40:80/base/nginx:1.19.3
hostname: nginx
container_name: monitor-nginx
restart: always
privileged: false
ports:
- 3001:3000
- 9090:9090
- 9093:9093
volumes:
- ./nginx/nginx.conf:/etc/nginx/nginx.conf
- ./nginx/auth:/etc/nginx/basic_auth
networks:
monitor:
aliases:
- nginx
logging:
driver: json-file
options:
max-file: ‘5‘
max-size: 50m
prometheus:
image: 10.10.11.40:80/base/prometheus:2.22.0
container_name: monitor-prometheus
hostname: prometheus
restart: always
privileged: true
volumes:
- ./prometheus/db/:/prometheus/
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
- ./prometheus/rules/:/etc/prometheus/rules/
- ./prometheus/sd_files/:/etc/prometheus/sd_files/
command:
- ‘--config.file=/etc/prometheus/prometheus.yml‘
- ‘--storage.tsdb.path=/prometheus‘
- ‘--web.console.libraries=/usr/share/prometheus/console_libraries‘
- ‘--web.console.templates=/usr/share/prometheus/consoles‘
- ‘--storage.tsdb.retention=60d‘
networks:
monitor:
aliases:
- prometheus
logging:
driver: json-file
options:
max-file: ‘5‘
max-size: 50m
grafana:
image: 10.10.11.40:80/base/grafana:7.2.2
container_name: monitor-grafana
hostname: grafana
restart: always
privileged: true
volumes:
- ./grafana/db/:/var/lib/grafana
networks:
monitor:
aliases:
- grafana
logging:
driver: json-file
options:
max-file: ‘5‘
max-size: 50m
alertmanger:
image: 10.10.11.40:80/base/alertmanager:0.21.0
container_name: monitor-alertmanager
hostname: alertmanager
restart: always
privileged: true
volumes:
- ./alertmanager/db/:/alertmanager
- ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
- ./alertmanager/templates/:/etc/alertmanager/templates
networks:
monitor:
aliases:
- alertmanager
logging:
driver: json-file
options:
max-file: ‘5‘
max-size: 50m
node-exporter:
image: 10.10.11.40:80/base/node_exporter:1.0.1
container_name: monitor-node-exporter
hostname: host40
restart: always
privileged: true
volumes:
- /:/host:ro,rslave
- ./node-exporter/textfiles/:/textfiles
network_mode: "host"
command:
- '--path.rootfs=/host'
- '--web.listen-address=:9100'
- '--collector.textfile.directory=/textfiles'
logging:
driver: json-file
options:
max-file: '5'
max-size: 50m
cadvisor:
image: 10.10.11.40:80/base/cadvisor:v0.33.0
container_name: monitor-cadvisor
hostname: cadvisor
restart: always
privileged: true
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
ports:
- 9080:8080
networks:
monitor:
logging:
driver: json-file
options:
max-file: '5'
max-size: 50m
blackbox_exporter:
image: 10.10.11.40:80/base/blackbox-exporter:0.18.0
container_name: monitor-blackbox
hostname: blackbox-exporter
restart: always
privileged: true
volumes:
- ./blackbox_exporter/:/etc/blackbox_exporter
networks:
monitor:
aliases:
- blackbox
command:
- '--config.file=/etc/blackbox_exporter/blackbox.yml'
logging:
driver: json-file
options:
max-file: '5'
max-size: 50m
networks:
monitor:
ipam:
config:
- subnet: 192.168.17.0/24
由于prometheus,alertmanager 本身不带认证功能,所以前端使用nginx完成调度和basic auth 认证,同一代理后端监听端口,便于管理。
prometheus: 9090
grafana:3000
alertmanager: 9093
node_exproter: 9100
cadvisor: 8080 (客户端)
echo monitor:`openssl passwd -crypt 123456` > .htpasswd
单独挂载配置文件时,宿主机上的修改不会自动在容器内生效:(当然也可以选择挂载目录,而不是直接挂载文件)
chmod 666 nginx.conf
nginx容器加载配置文件:
docker exec -it monitor-nginx nginx -s reload
nginx.conf
[root@host40 monitor-bak]# cat nginx/nginx.conf
user nginx;
worker_processes auto;
error_log /var/log/nginx/error.log;
pid /run/nginx.pid;
include /usr/share/nginx/modules/*.conf;
events {
worker_connections 10240;
}
http {
log_format main '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for"';
access_log /var/log/nginx/access.log main;
sendfile on;
tcp_nopush on;
tcp_nodelay on;
keepalive_timeout 65;
types_hash_max_size 2048;
include /etc/nginx/mime.types;
default_type application/octet-stream;
proxy_connect_timeout 500ms;
proxy_send_timeout 1000ms;
proxy_read_timeout 3000ms;
proxy_buffers 64 8k;
proxy_busy_buffers_size 128k;
proxy_temp_file_write_size 64k;
proxy_redirect off;
proxy_next_upstream error invalid_header timeout http_502 http_504;
proxy_http_version 1.1;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Real-Port $remote_port;
proxy_set_header Host $http_host;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
client_max_body_size 10m;
client_body_buffer_size 512k;
client_body_timeout 180;
client_header_timeout 10;
send_timeout 240;
gzip on;
gzip_min_length 1k;
gzip_buffers 4 16k;
gzip_comp_level 2;
gzip_types application/javascript application/x-javascript text/css text/javascript image/jpeg image/gif image/png;
gzip_vary off;
gzip_disable "MSIE [1-6].";
server {
listen 3000;
server_name _;
location / {
proxy_pass http://grafana:3000;
}
}
server {
listen 9090;
server_name _;
location / {
auth_basic "auth for monitor";
auth_basic_user_file /etc/nginx/basic_auth/.htpasswd;
proxy_pass http://prometheus:9090;
}
}
server {
listen 9093;
server_name _;
location / {
auth_basic "auth for monitor";
auth_basic_user_file /etc/nginx/basic_auth/.htpasswd;
proxy_pass http://alertmanager:9093;
}
}
}
### 9.5 prometheus
- 注意db目录需可写,给777权限
#### 9.5.1 主配置文件: prometheus.yml
[root@host40 monitor-bak]# cat prometheus/prometheus.yml
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
alerting:
alertmanagers:
rule_files:
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
job_name: 'prometheus'
static_configs:
job_name: ‘node_real_lan‘
file_sd_configs:
job_name: ‘node_virtual_lan‘
file_sd_configs:
job_name: ‘node_real_wan‘
file_sd_configs:
job_name: ‘node_virtual_wan‘
file_sd_configs:
ls prometheus/sd_files/
docker_host.yml http.yml icmp.yml real_lan.yml real_wan.yml sedFDm5Rw tcp.yml virtual_lan.yml virtual_wan.yml
cat prometheus/sd_files/docker_host.yml
- targets: ['10.10.11.178:9080']
- targets: ['10.10.11.99:9080']
- targets: ['10.10.11.40:9080']
- targets: ['10.10.11.35:9080']
- targets: ['10.10.11.45:9080']
- targets: ['10.10.11.46:9080']
- targets: ['10.10.11.48:9080']
- targets: ['10.10.11.47:9080']
- targets: ['10.10.11.65:9081']
- targets: ['10.10.11.61:9080']
- targets: ['10.10.11.66:9080']
- targets: ['10.10.11.68:9080']
- targets: ['10.10.11.98:9080']
- targets: ['10.10.11.75:9080']
- targets: ['10.10.11.97:9080']
- targets: ['10.10.11.179:9080']
cat prometheus/sd_files/tcp.yml
- targets: ['10.10.11.178:8001']
labels:
server_name: http_download
- targets: ['10.10.11.178:3307']
labels:
server_name: xiaojing_db
- targets: ['10.10.11.178:3001']
labels:
server_name: test_web
cat prometheus/rules/docker_monitor.yml
groups:
- name: "container monitor"
rules:
- alert: "Container down: env1"
expr: time() - container_last_seen{name="env1"} > 60
for: 30s
labels:
severity: critical
annotations:
summary: "Container down: {{$labels.instance}} name={{$labels.name}}"
tcp rules:
cat prometheus/rules/tcp_monitor.yml
groups:
- name: blackbox_network_stats
rules:
- alert: blackbox_network_stats
expr: probe_success == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Instance {{ $labels.instance }} ,server-name: {{ $labels.server_name }} is down"
description: "连接不通..."
cat prometheus/rules/system_monitor.yml
groups:
- name: "system info"
rules:
- alert: "服务器宕机"
expr: up == 0
for: 3m
labels:
severity: critical
annotations:
summary: "{{$labels.instance}}:服务器宕机"
description: "{{$labels.instance}}:服务器无法连接,持续时间已超过3mins"
- alert: "系统负载过高"
expr: (node_load1/count without (cpu, mode) (node_cpu_seconds_total{mode="system"}))* on(instance) group_left(
nodename) (node_uname_info) > 1.1
for: 3m
labels:
severity: warning
annotations:
summary: "{{$labels.instance}}:系统负载过高"
description: "{{$labels.instance}}:系统负载过高."
value: "{{$value}}"
- alert: "CPU 使用率超过90%"
expr: 100-(avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 90
for: 3m
labels:
severity: critical
annotations:
summary: "{{$labels.instance}}:CPU 使用率90%"
description: "{{$labels.instance}}:CPU 使用率超过90%."
value: "{{$value}}"
- alert: "内存使用率超过80%"
expr: (100 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100)* on(instance) group_left(
nodename) (node_uname_info) > 80
for: 3m
labels:
severity: critical
annotations:
summary: "{{$labels.instance}}:内存使用率80%"
description: "{{$labels.instance}}:内存使用率超过80%"
value: "{{$value}}"
- alert: "IO操作耗时超过60%"
expr: 100-(avg(irate(node_disk_io_time_seconds_total[1m])) by(instance)* 100) < 40
for: 3m
labels:
severity: critical
annotations:
summary: "{{$labels.instance}}:IO操作耗时超过60%"
description: "{{$labels.instance}}:IO操作耗时超过60%"
value: "{{$value}}"
- alert: "磁盘分区容量超过85"
expr: (100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes
{fstype=~"ext4|xfs"}*100) )* on(instance) group_left(nodename) (node_uname_info)> 85
for: 3m
labels:
severity: longtime
annotations:
summary: "{{$labels.instance}}:磁盘分区容量超过85%"
description: "{{$labels.instance}}:磁盘分区容量超过85%"
value: "{{$value}}"
- alert: "磁盘将在4天后写满"
expr: predict_linear(node_filesystem_free_bytes[2h], 4 * 24 * 3600) < 0
for: 3m
labels:
severity: longtime
annotations:
summary: "{{$labels.instance}}: 预计将有磁盘分区在4天后写满,"
description: "{{$labels.instance}}:预计将有磁盘分区在4天后写满,"
value: "{{$value}}"
注意db目录可写:
主配置文件:
cat alertmanager/alertmanager.yml
global:
resolve_timeout: 5m
smtp_smarthost: 'smtphz.qiye.163.com:25'
smtp_from: 'XXX@fosafer.com'
smtp_auth_username: 'XXX@fosafer.com'
smtp_auth_password: 'XXX'
smtp_hello: 'qiye.163.com'
smtp_require_tls: true
route:
group_by: ['instance']
group_wait: 30s
receiver: default
routes:
- group_interval: 3m
repeat_interval: 10m
match:
severity: warning
receiver: 'default'
- group_interval: 3m
repeat_interval: 30m
match:
severity: critical
receiver: 'default'
- group_interval: 5m
repeat_interval: 24h
match:
severity: longtime
receiver: 'default'
templates:
name: 'default'
email_configs:
wechat_configs:
name: 'critical'
email_configs:
告警模板文件
cat alertmanager/templates/wechat.tmpl
{{ define "wechat.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}{{ range .Alerts }}
[@警报~]
实例: {{ .Labels.instance }}
信息: {{ .Annotations.summary }}
详情: {{ .Annotations.description }}
值: {{ .Annotations.value }}
时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end -}}
{{- if gt (len .Alerts.Resolved) 0 -}}{{ range .Alerts }}
[@恢复~]
实例: {{ .Labels.instance }}
信息: {{ .Annotations.summary }}
时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
恢复: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end -}}
{{- end }}
安装脚本:
http://10.10.11.178:8001/node_exporter_install.sh
安装脚本:
http://10.10.11.178:8001/node_exporter_install_docker.sh
需要的image,对于没有添加10.10.11.40:80 仓库的docker主机,可以下载save的image,先load image 在安装
http://10.10.11.178:8001/monitor-client.tgz
所有的job都使用基于文件的服务发现,所以,只用将target写入sd_file即可,无需重读配置文件
基于此写了一个文本处理脚本作为sd_files的前端,通过命令行的形式添加和删除targets,无需手动编辑文件
脚本名称: sd_controler.sh
脚本使用:./sd_controler.sh 即可查看usage
完整脚本如下:
[root@host40 monitor]# cat sd_controler.sh
#!/bin/bash
#version: 1.0
#Description: add | del | show instance from|to prometheus file_sd_files.
# rl | vl | dk | rw | vw | tcp | http | icmp : short for job name, each one means a sd_file.
# tcp | http | icmp ( because with ports for service ) add with label (server_name by default) to easy read in alert emails.
# each time can only add|del for one instance.
# Notes: this script adds, deletes or lists entries (IP:PORT combinations or
# FQDNs) in Prometheus file-based service-discovery (file_sd) files.
# rl | vl | dk | rw | vw | tcp | http | icmp are abbreviations of the
# Prometheus job names; each one maps to exactly one sd_file consumed by that
# job's file_sd discovery.
# tcp | http | icmp targets must be added together with a server_name label,
# because a bare port number rarely tells the on-call engineer which service
# is down; the label makes alert e-mails immediately readable.
# Only one entry can be added or deleted per invocation; for batch changes
# edit the sd_files directly with vim, or call this script in a for loop.
### vars
SD_DIR=./prometheus/sd_files          # directory holding all file_sd target files
DOCKER_SD=$SD_DIR/docker_host.yml     # dk: docker hosts (cadvisor, host port 9080)
RL_HOST_SD=$SD_DIR/real_lan.yml       # rl: physical hosts on the LAN
VL_HOST_SD=$SD_DIR/virtual_lan.yml    # vl: virtual hosts on the LAN
RW_HOST_SD=$SD_DIR/real_wan.yml       # rw: physical hosts on the WAN
VW_HOST_SD=$SD_DIR/virtual_wan.yml    # vw: virtual hosts on the WAN
TCP_SD=$SD_DIR/tcp.yml                # tcp probe targets (blackbox exporter)
HTTP_SD=$SD_DIR/http.yml              # http probe targets (blackbox exporter)
ICMP_SD=$SD_DIR/icmp.yml              # icmp probe targets (blackbox exporter)
SDFILE=                               # set by the dispatch case at the bottom
### funcs
# Print usage/help text and terminate the script.
# FIX: the original ended with a bare `exit`, which propagates the status of
# the last echo (always 0) — a usage error must exit non-zero. Exit 2 is the
# conventional "usage error" status.
usage(){
  echo -e "Usage: $0 < rl | vl | dk | rw | vw | tcp | http | icmp > < add | del | show > [ IP:PORT | FQDN ] [ server-name ]"
  echo -e " example: \n\t node add:\t $0 rl add | del 10.10.10.10:9100\n\t tcp,http,icmp add:\t $0 tcp add 10.10.10.10:3306 web-mysql\n\t del:\t $0 http del www.baidu.com\n\t show:\t $0 rl | vl | dk | rw | vw | tcp | http | icmp show."
  exit 2
}
# Append a bare target entry to a file_sd file (idempotent: skipped when the
# target is already present).
# $1: sd file path, $2: IP:PORT or FQDN
# FIX: the original emitted mojibake curly quotes (‘…‘) instead of the ASCII
# single quotes YAML requires, left all expansions unquoted (SC2086), and
# used regex grep so dots in the target matched any character. `-F --` makes
# the duplicate check a literal-string match and protects leading dashes.
add(){
  # $1: SDFILE, $2: IP:PORT
  grep -qF -- "$2" "$1" || echo "- targets: ['$2']" >> "$1"
}
# Delete every line containing the given target from a file_sd file.
# $1: sd file path, $2: IP:PORT or FQDN (used as a sed regex, so dots match
#     any character — acceptable for these host:port entries)
# FIX: the original sed expression was wrapped in mojibake curly quotes
# (‘/‘$2‘/d‘), which sed receives as an invalid script; expansions were
# unquoted. Note: GNU `sed -i`; on BSD/macOS this would need `-i ''`.
del(){
  # $1: SDFILE, $2: IP:PORT
  sed -i "/$2/d" "$1"
}
# Append a target entry followed by a server_name label block (3 lines of
# YAML), skipping the append when the target is already present.
# $1: sd file path, $2: IP[:PORT] or FQDN, $3: server_name label value
# FIX: the original duplicate check was `grep -q ‘$2‘ $1` — the (curly)
# single quotes suppress expansion, so it searched for the literal string
# ‘$2‘ and duplicates were appended every time. The emitted YAML also used
# mojibake quotes, and the label lines were indented so they do not parse as
# children of the `- targets:` list item.
add_with_label(){
  # $1: SDFILE, $2: [IP:[PORT]|FQDN] $3: SERVER-NAME
  LABEL_01="server_name"
  if ! grep -qF -- "$2" "$1";then
    echo "- targets: ['$2']" >> "$1"
    echo "  labels:" >> "$1"
    echo "    ${LABEL_01}: $3" >> "$1"
  fi
}
# Delete a labelled target entry: the matching line plus its two label lines
# (3 lines total, the inverse of add_with_label).
# $1: sd file path, $2: IP[:PORT] or FQDN
# FIX: the original ignored "$1" and read the global $SDFILE (contradicting
# its own documented signature); its backtick pipeline was broken by mojibake
# quotes in both the grep pattern and the awk program; and when the target
# was absent it ran `sed -i ,2d`, corrupting the invocation. This version
# matches the bare target string (quote-style agnostic), takes the first
# match only, and is a no-op when nothing matches.
del_with_label(){
  # $1: SDFILE, $2: [IP:[PORT]|FQDN]
  local start
  start=$(grep -nF -- "$2" "$1" | head -n1 | cut -d: -f1)
  [ -n "$start" ] || return 0
  sed -i "${start},$((start + 2))d" "$1"
}
# Dispatch add/del/show for label-less jobs, operating on the global $SDFILE
# chosen by the bottom-of-script case statement.
# $1: action (add|del|show), $2: target (ignored for show)
# Unknown actions fall through silently, exactly as the original if/elif did.
# FIX: all expansions are now quoted (SC2086) so file paths or targets with
# whitespace/glob characters cannot word-split.
action(){
  case "$1" in
    add)  add "$SDFILE" "$2" ;;
    del)  del "$SDFILE" "$2" ;;
    show) cat "$SDFILE" ;;
  esac
}
# Dispatch add/del/show for label-carrying jobs (tcp/http/icmp), operating on
# the global $SDFILE chosen by the bottom-of-script case statement.
# $1: action (add|del|show), $2: target, $3: server_name value (add only;
#     passed through to del as well, matching the original call shape)
# FIX: all expansions are now quoted (SC2086).
action_with_label(){
  case "$1" in
    add)  add_with_label "$SDFILE" "$2" "$3" ;;
    del)  del_with_label "$SDFILE" "$2" "$3" ;;
    show) cat "$SDFILE" ;;
  esac
}
### main
# The action argument must be one of add|del|show; anything else prints usage.
[ "$2" == "" ] || [[ ! "$2" =~ ^(add|del|show)$ ]] && usage
# curl is required below for the node-exporter reachability pre-check.
curl --version &>/dev/null || { echo -e "no curl found. " && exit 15; }
# For node jobs, verify the exporter answers on /metrics before adding it,
# unless -f (force) is supplied as the 4th argument.
if [[ $1 =~ ^(rl|vl|rw|vw|dk)$ ]] && [ "$2" == "add" ];then
  [ "$3" == "" ] && usage
  if [ "$4" != "-f" ];then
    # BUG FIX: the original read `COOD=curl -IL ...` with no $(...), so the
    # shell tried to run "-IL" as a command, COOD stayed empty, and the check
    # below always failed with exit 11. Capture the HTTP status code properly.
    COOD=$(curl -IL -o /dev/null --retry 3 --connect-timeout 3 -s -w "%{http_code}" "http://$3/metrics")
    [ "$COOD" != "200" ] && echo -e "http://$3/metrics is not reachable. check it again. or you can use -f to ignore it." && exit 11
  fi
fi
# tcp/http/icmp targets must carry a server_name so alert mails are readable.
if [[ $1 =~ ^(tcp|http|icmp)$ ]] && [ "$2" == "add" ];then
  [ "$4" == "" ] && echo -e "监听 tcp http icmp 服务时必须指明 server-name." && usage
fi
# Map the job shorthand to its sd_file, then run the requested action.
case "$1" in
  rl)
    SDFILE=$RL_HOST_SD
    action "$2" "$3" && echo "$2 OK"
    ;;
  vl)
    SDFILE=$VL_HOST_SD
    action "$2" "$3" && echo "$2 OK"
    ;;
  dk)
    SDFILE=$DOCKER_SD
    action "$2" "$3" && echo "$2 OK"
    ;;
  rw)
    SDFILE=$RW_HOST_SD
    action "$2" "$3" && echo "$2 OK"
    ;;
  vw)
    SDFILE=$VW_HOST_SD
    action "$2" "$3" && echo "$2 OK"
    ;;
  tcp)
    SDFILE=$TCP_SD
    action_with_label "$2" "$3" "$4" && echo "$2 OK"
    ;;
  http)
    SDFILE=$HTTP_SD
    action_with_label "$2" "$3" "$4" && echo "$2 OK"
    ;;
  icmp)
    SDFILE=$ICMP_SD
    action_with_label "$2" "$3" "$4" && echo "$2 OK"
    ;;
  *)
    usage
    ;;
esac
prometheus+grafana+alertmanager 安装配置文档
原文:https://blog.51cto.com/mageedu/2568334