# Prometheus 监控 Kubernetes
# 1、监控 Kubernetes 环境准备
[root@k8s-master01 ~]# kubectl get nodes | |
NAME STATUS ROLES AGE VERSION | |
k8s-master01 Ready <none> 46d v1.27.10 | |
k8s-master02 Ready <none> 46d v1.27.10 | |
k8s-master03 Ready <none> 46d v1.27.10 | |
k8s-node01 Ready <none> 46d v1.27.10 | |
k8s-node02 Ready <none> 46d v1.27.10 |
# 2、部署 AlertManager ⾄ Kubernetes
- ①先交付 webhook_wechat、webhook_dingding;
- ②创建 ConfigMap,准备邮件的告警 template 模板;
- ③创建 ConfigMap,准备告警路由相关的配置;
- ④创建 HeadlessService;
- ⑤创建 statefulSet,运⾏ 3 个节点 AlertManager;
- ⑥创建 Ingress 对外提供;
#创建 namespace | |
[root@k8s-master01 ~]# kubectl create ns monitoring | |
[root@k8s-master01 01-alert-webhook-wechat]# sed -i "s#kube-prom#monitoring#g" *.yaml |
# 2.1 部署 webhook_wechat
1、编写 deployment,运⾏ webhook-wechat
[root@k8s-master01 01-alert-webhook-wechat]# cat 01-webhook-wechat-deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: webhook-wechat
namespace: monitoring
spec:
replicas: 1
selector:
matchLabels:
app: wechat
template:
metadata:
labels:
app: wechat
spec:
containers:
- name: webchat
image: oldxu3957/webhook_wechat_oldxu:v1.0
args: ["--port","5001"] # 默认就是5001端口
ports:
- containerPort: 5001
[root@k8s-master01 01-alert-webhook-wechat]# kubectl apply -f 01-webhook-wechat-deploy.yaml
[root@k8s-master01 01-alert-webhook-wechat]# kubectl get pods -n monitoring
NAME READY STATUS RESTARTS AGE
webhook-wechat-54b5bbf677-rmr26 1/1 Running 0 16s
2、编写 Service
[root@k8s-master01 01-alert-webhook-wechat]# cat 02-webhook-wechat-service.yaml
apiVersion: v1
kind: Service
metadata:
name: webhook-wechat-svc
namespace: monitoring
spec:
selector:
app: wechat
ports:
- port: 5001
targetPort: 5001
[root@k8s-master01 01-alert-webhook-wechat]# kubectl apply -f 02-webhook-wechat-service.yaml
[root@k8s-master01 01-alert-webhook-wechat]# kubectl get svc -n monitoring
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
webhook-wechat-svc ClusterIP 10.96.82.7 <none> 5001/TCP 13s
3、测试 webhook-wechat 是否能正常发送消息 (8e24a24d-3f48-4ea8-bbde-36ca84d857e4)
[root@prom-node01 ~]# curl -X POST http://10.96.82.7:5001/alert?token=8e24a24d-3f48-4ea8-bbde-36ca84d857e4 \
-H "Content-Type: application/json" \
-d '{
"alerts": [
{
"status": "firing",
"labels": {
"severity": "critical",
"alertname": "InstanceDown",
"instance": "example1"
},
"annotations": {
"summary": "Instance example1 down",
"description": "The instance example1 is down."
},
"startsAt": "2024-12-20T15:04:05Z",
"endsAt": "0001-01-01T00:00:00Z"
},
{
"status": "resolved",
"labels": {
"severity": "critical",
"alertname": "InstanceDown",
"instance": "example1"
},
"annotations": {
"summary": "Instance example1 is back up",
"description": "The instance example1 has recovered."
},
"startsAt": "2024-12-20T15:04:05Z",
"endsAt": "2024-12-20T16:04:05Z"
}
]
}'
# 2.2 部署 webhook_dingding
1、编写 deployment,运⾏ webhook-dingding
[root@k8s-master01 02-alert-webhook-dingding]# cat 01-webhook-dingding-deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: webhook-dingding
namespace: monitoring
spec:
replicas: 1
selector:
matchLabels:
app: dingding
template:
metadata:
labels:
app: dingding
spec:
containers:
- name: dingding
image: oldxu3957/webhook_dingding_oldxu:v1.0
args: ["--port","5002"]
ports:
- containerPort: 5002
[root@k8s-master01 02-alert-webhook-dingding]# kubectl apply -f 01-webhook-dingding-deploy.yaml
[root@k8s-master01 02-alert-webhook-dingding]# kubectl get pods -n monitoring
NAME READY STATUS RESTARTS AGE
webhook-dingding-6d68854649-r6smd 1/1 Running 0 14s
webhook-wechat-54b5bbf677-rmr26 1/1 Running 0 13m
2、编写 Service
[root@k8s-master01 02-alert-webhook-dingding]# cat 02-webhook-dingding-service.yaml
apiVersion: v1
kind: Service
metadata:
name: webhook-dingding-svc
namespace: monitoring
spec:
selector:
app: dingding
ports:
- port: 5002
targetPort: 5002
[root@k8s-master01 02-alert-webhook-dingding]# kubectl apply -f 02-webhook-dingding-service.yaml
[root@k8s-master01 02-alert-webhook-dingding]# kubectl get svc -n monitoring
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
webhook-dingding-svc ClusterIP 10.96.241.84 <none> 5002/TCP 0s
webhook-wechat-svc ClusterIP 10.96.82.7 <none> 5001/TCP 12m
3、测试 webhook-wechat 是否能正常发送消息,49989606592d7ff06ee4b83120bf5a81ed1e4c3860696dcd7e663be1c66ef43f
[root@k8s-master01 ~]# curl -X POST http://10.96.241.84:5002/alert?token=49989606592d7ff06ee4b83120bf5a81ed1e4c3860696dcd7e663be1c66ef43f \
-H "Content-Type: application/json" \
-d '{
"alerts": [
{
"status": "firing",
"labels": {
"severity": "critical",
"alertname": "InstanceDown",
"instance": "example1"
},
"annotations": {
"summary": "Instance example1 down",
"description": "The instance example1 is down."
},
"startsAt": "2024-12-20T15:04:05Z",
"endsAt": "0001-01-01T00:00:00Z"
},
{
"status": "resolved",
"labels": {
"severity": "critical",
"alertname": "InstanceDown",
"instance": "example1"
},
"annotations": {
"summary": "Instance example1 is back up",
"description": "The instance example1 has recovered."
},
"startsAt": "2024-12-20T15:04:05Z",
"endsAt": "2024-12-20T16:04:05Z"
}
]
}'
# 2.3 创建 AlertManager 配置
1、使⽤ ConfigMap 创建 AlertManager 所需的配置⽂件,名称为: alert-configs
[root@k8s-master01 03-alertmanager]# cat 01-alert-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: alert-configs
namespace: monitoring
data:
alertmanager.yml: |-
# 全局配置
global:
smtp_smarthost: 'smtp.qq.com:25'
smtp_from: '373370405@qq.com'
smtp_auth_username: '373370405@qq.com'
smtp_auth_password: 'jmtpwlkuijaybhic'
smtp_hello: 'qq.com'
smtp_require_tls: false
# 加载模板的路径
templates:
- '/etc/alertmanager/template/*.tmpl'
# 路由规则
route:
group_by: ['alertname']
group_wait: 30s
group_interval: 30s
repeat_interval: 5m
receiver: webhook-dingding-ops # 默认发送给钉钉
# 子路由
routes:
- match_re:
job: 'kube.*'
receiver: 'webhook-wechat' # 如果匹配到job=kube.*的都发送给微信
continue: true
- match_re:
job: 'redis_exporter'
receiver: 'email' # 如果job=domain_exporter则都发送给email
continue: true
receivers:
- name: 'email'
email_configs:
- to: '373370405@qq.com'
send_resolved: true
html: '{{ template "email.html" . }}' # 发送邮件内容,调用该模板进行渲染
- name: 'webhook-wechat'
webhook_configs:
- url: 'http://webhook-wechat-svc:5001/alert?token=8e24a24d-3f48-4ea8-bbde-36ca84d857e4'
- name: 'webhook-dingding-ops'
webhook_configs:
- url: 'http://webhook-dingding-svc:5002/alert?token=49989606592d7ff06ee4b83120bf5a81ed1e4c3860696dcd7e663be1c66ef43f'
[root@k8s-master01 03-alertmanager]# kubectl apply -f 01-alert-configs-configmap.yaml
[root@k8s-master01 03-alertmanager]# kubectl get cm -n monitoring
NAME DATA AGE
alert-configs 1 25s
alert-template 1 19s
kube-root-ca.crt 1 27m
2、使⽤ configmap 创建 AlertManager 邮件所依赖的模板⽂件,名称为: alert-template
[root@k8s-master01 03-alertmanager]# cat 02-alert-template-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: alert-template
namespace: monitoring
data:
email.tmpl: |-
{{ define "email.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{ range .Alerts }}
<h2 style="color: red;">@告警通知</h2>
告警程序: AlertManager <br>
告警级别: {{ .Labels.severity }} <br>
告警类型: {{ .Labels.alertname }} <br>
故障主机: {{ .Labels.instance }} <br>
告警主题: {{ .Annotations.summary }} <br>
告警详情: {{ .Annotations.description }} <br>
触发时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }} <br>
{{ end }}{{ end -}}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{ range .Alerts }}
<h2 style="color: green;">@告警恢复</h2>
告警程序: AlertManager <br>
告警级别: {{ .Labels.severity }} <br>
告警类型: {{ .Labels.alertname }} <br>
告警主机: {{ .Labels.instance }} <br>
告警主题: {{ .Annotations.summary }} <br>
告警详情: {{ .Annotations.description }} <br>
触发时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }} <br>
恢复时间: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }} <br>
{{ end }}{{ end -}}
{{ end }}
[root@k8s-master01 03-alertmanager]# kubectl apply -f 02-alert-template-configmap.yaml
[root@k8s-master01 03-alertmanager]# kubectl get cm -n monitoring
NAME DATA AGE
alert-configs 1 25s
alert-template 1 19s
kube-root-ca.crt 1 27m
# 2.4 创建 HeadLessService
[root@k8s-master01 03-alertmanager]# cat 03-alertmanager-headlessService.yaml
apiVersion: v1
kind: Service
metadata:
name: alertmanager-svc
namespace: monitoring
spec:
clusterIP: "None"
selector:
app: alert
ports:
- name: web
port: 9093
targetPort: 9093
- name: cluster
port: 9094
targetPort: 9094
[root@k8s-master01 03-alertmanager]# kubectl apply -f 03-alertmanager-headlessService.yaml
[root@k8s-master01 03-alertmanager]# kubectl get svc -n monitoring
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
alertmanager-svc ClusterIP None <none> 9093/TCP,9094/TCP 6s
webhook-dingding-svc ClusterIP 10.96.241.84 <none> 5002/TCP 12m
webhook-wechat-svc ClusterIP 10.96.82.7 <none> 5001/TCP 24m
# 2.5 部署 AlertManager 服务
使⽤ statefulSet 编写 AlertManager 的⾼可⽤清单⽂件
- 1、定义 Alertmanager 实例的启动命令,包含了对应的 “配置⽂件路径 “、“数据存储路径” 以及 “Gossip 集群通信” 相关的参数。
- 2、定义 AlertManager 实例挂载 “邮件模板” 的 ConfigMap 资源,以及 AlertManager 主配置⽂件的 ConfigMap 资源。
- 3、Alertmanager 的每个实例,都需要使⽤ PVC 模板来提供数据持久化;
- 4、AlertManager 在启动时可以采⽤并⾏的⽅式,因为集群之间没有先后依赖关系;
[root@k8s-master01 03-alertmanager]# cat 04-alertmanager-statefulset.yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: alertmanager
namespace: monitoring
spec:
serviceName: "alertmanager-svc"
podManagementPolicy: "Parallel" #采用并行方式
replicas: 3
selector:
matchLabels:
app: alert
template:
metadata:
labels:
app: alert
spec:
volumes:
- name: alert-cfg
configMap:
name: alert-configs
- name: alert-temp-cfg
configMap:
name: alert-template
containers:
- name: alertmanager
image: prom/alertmanager:v0.26.0
args:
- "--web.listen-address=:9093"
- "--cluster.listen-address=0.0.0.0:9094"
- "--cluster.peer=alertmanager-0.alertmanager-svc:9094"
- "--cluster.peer=alertmanager-1.alertmanager-svc:9094"
- "--cluster.peer=alertmanager-2.alertmanager-svc:9094"
- "--cluster.peer-timeout=60s"
- "--config.file=/etc/alertmanager/alertmanager.yml"
- "--storage.path=/etc/alertmanager/data"
- "--data.retention=120h"
volumeMounts:
- name: alert-cfg
mountPath: /etc/alertmanager/
- name: alert-temp-cfg
mountPath: /etc/alertmanager/template
- name: alert-data
mountPath: /etc/alertmanager/data
ports:
- name: web
containerPort: 9093
- name: cluster
containerPort: 9094
resources:
requests:
cpu: 200m
memory: 200Mi
limits:
cpu: 300m
memory: 300Mi
volumeClaimTemplates:
- metadata:
name: alert-data
spec:
accessModes: ["ReadWriteMany"]
storageClassName: "nfs-storage"
resources:
requests:
storage: 3Gi
[root@k8s-master01 03-alertmanager]# kubectl apply -f 04-alertmanager-statefulset.yaml
[root@k8s-master01 03-alertmanager]# kubectl get pods -n monitoring
NAME READY STATUS RESTARTS AGE
alertmanager-0 1/1 Running 0 97s
alertmanager-1 1/1 Running 0 97s
alertmanager-2 1/1 Running 0 97s
webhook-dingding-6d68854649-r6smd 1/1 Running 0 19m
webhook-wechat-54b5bbf677-rmr26 1/1 Running 0 32m
# 2.6 发布 AlertManager 服务
1、编写 Ingress 资源清单
[root@k8s-master01 03-alertmanager]# cat 05-alertmanager-ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: alert-ingress
namespace: monitoring
spec:
ingressClassName: "nginx"
rules:
- host: "k8s-alert.hmallleasing.com"
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: alertmanager-svc
port:
number: 9093
[root@k8s-master01 03-alertmanager]# kubectl apply -f 05-alertmanager-ingress.yaml
[root@k8s-master01 03-alertmanager]# kubectl get ingress -n monitoring
NAME CLASS HOSTS ADDRESS PORTS AGE
alert-ingress nginx k8s-alert.hmallleasing.com 192.168.40.103,192.168.40.104,192.168.40.105 80 21s
2、访问 AlertManager 的⻚⾯
# 2.7 测试 AlertManager 告警
1、模拟故障测试 AlertManager 告警是否能正常发送
[root@k8s-master01 ~]# kubectl run tools --image=uhub.service.ucloud.cn/oldxu/tools:v1.0
[root@k8s-master01 ~]# kubectl exec -it tools -- /bin/bash
# 路由给钉钉
/alert_test_oldxu --alertURL="http://alertmanager-svc.monitoring.svc.cluster.local:9093/api/v1/alerts" --label="alertname=CPU故障,instance=dingding,severity=critical,job=node_exporter"
# 路由给微信
/alert_test_oldxu --alertURL="http://alertmanager-svc.monitoring.svc.cluster.local:9093/api/v1/alerts" --label="alertname=节点故障,instance=wechat,severity=critical,job=kube-nodes"
# 路由给邮件
/alert_test_oldxu --alertURL="http://alertmanager-svc.monitoring.svc.cluster.local:9093/api/v1/alerts" --label="alertname=redis故障,instance=email,severity=critical,job=redis_exporter"
# 3、部署 Prometheus ⾄ Kubernetes
- 1、创建 ConfigMap,准备 Prometheus 配置⽂件,定义 AlertManager 的地址、以及 Rules 规则⽂件的路径等
- 2、创建 ConfigMap,准备 Prometheus 告警相关的规则⽂件;
- 3、创建 RBAC 权限,我们需要使⽤ Prometheus 访问 APIServer 来抓取各种资源的指标,这意味着 Prometheus 的 Pod 需要相应的权限来访问 Kubernetes API
- 4、创建 HeadlessService;
- 5、创建 statefulSet,运⾏单节点的 Prometheus(⽣产环境不建议使⽤ NFS 作为后端存储);
- 6、创建 Ingress,对外提供 Prometheus
# 3.1 创建 Prometheus 配置⽂件
1、编辑 Prometheus 配置⽂件,先使⽤最⼩化配置。(后期监控其他组件在进⾏修改或增加。) configMap 资源名称: prom-configs
[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prom-configs
namespace: monitoring
data:
prometheus.yml: |-
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml
[root@k8s-master01 04-prometheus]# kubectl get cm -n monitoring
NAME DATA AGE
alert-configs 1 25m
alert-template 1 25m
kube-root-ca.crt 1 52m
prom-configs 1 16s
# 3.2 创建 Prometheus 告警规则
1、编辑 PrometheusRules 告警规则⽂件 以 node、pods、jvm、redis、blackbox 等告警规则⽂件为例,后续根据情况在添加, configMap 资源名称: prom-rules
注意:这些 rules 是基于此前节点监控的规则,然后结合 K8S 的标签进⾏了重新修订。
[root@k8s-master01 04-prometheus]# cat 02-prom-rules-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prom-rules
namespace: monitoring
data:
node_rules.yml: |-
groups:
- name: CPU告警规则
rules:
- alert: 节点CPU使用率超过80%
expr: ( 1 - avg(irate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance,job) ) * 100 > 80
for: 1m
labels:
severity: warning
annotations:
summary: "主机CPU利用率过高,实例:{{ $labels.instance }} , {{ $labels.job }}"
description: "该实例的CPU利用率低于20%,当前利用率:{{ $value }}%。可能存在CPU资源浪费情况。"
- alert: CPU饱和度过高
expr: sum(node_load1) by (instance,job) / (count(node_cpu_seconds_total{mode="idle"}) by (instance,job) * 2) * 100 > 80
for: 2m
labels:
severity: critical
annotations:
summary: "CPU饱和度过高,实例:{{ $labels.instance }} , {{ $labels.job }}"
description: "该实例的1分钟平均CPU负载超过了核心数的两倍,已经持续2分钟,当前CPU饱和度:{{ $value }}%。需要立即检查系统负载情况。"
- alert: 主机内存不足
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 80
for: 2m
labels:
severity: warning
annotations:
summary: "主机内存使用率较高, 实例:{{ $labels.instance }}, 任务:{{ $labels.job }}"
description: "该实例的内存使用率持续2分钟高于80%,当前利用率:{{ $value }}%"
- alert: 内存饱和度高
expr: ( 1 - node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes ) * 100 > 30
for: 2m
labels:
severity: warning
annotations:
summary: "主机内存内存饱和度高, 实例:{{ $labels.instance }}, 任务:{{ $labels.job }}"
description: "SWAP内存使用率已连续2分钟超过30%,表明内存饱和度过高,当前SWAP使用率为:{{ $value }}%。"
- alert: 磁盘空间告急
expr: ( node_filesystem_size_bytes{device!="tmpfs"} - node_filesystem_avail_bytes{device!="tmpfs"} ) / node_filesystem_size_bytes{device!="tmpfs"} * 100 > 70
for: 1m
labels:
severity: critical
annotations:
summary: "实例 {{ $labels.instance }} 磁盘{{ $labels.mountpoint }} 分区空间不足"
description: "实例 {{ $labels.instance }} 的磁盘空间使用率已超过 70%,当前使用率为 {{ $value }}%,请及时处理。"
- alert: 磁盘Inode空间告急
expr: (node_filesystem_files{device!="tmpfs"} - node_filesystem_files_free{device!="tmpfs"} ) / node_filesystem_files{device!="tmpfs"} * 100 > 70
for: 1m
labels:
severity: critical
annotations:
summary: "实例 {{ $labels.instance }} 磁盘空间不足"
description: "实例 {{ $labels.instance }} 的磁盘Inode空间使用率已超过 70%,当前使用率为 {{ $value }}%,请及时处理。"
- alert: 磁盘IOPS写入较高
#expr: sum(rate(node_disk_writes_completed_total[1m])) by (instance,job) / 120 * 100 >60
#round函数可以对值进行四舍五入
expr: round(max(irate(node_disk_writes_completed_total[1m])) by (instance,job) / 120 * 100) > 60
for: 1m
labels:
severity: critical
annotations:
summary: "实例 {{ $labels.instance }} IOPS每秒写入次数超过120次/s"
description:
目前磁盘IOPS写入饱和度是 {{ $value }}%
目前磁盘IOPS每秒写入最大 {{ printf `max(rate(node_disk_writes_completed_total{instance="%s",job="%s"}[1m]))` $labels.instance $labels.job | query | first | value | printf "%.2f" }} 次/s
- alert: 磁盘IOPS读取较高
expr: round(max(irate(node_disk_reads_completed_total[1m])) by (instance,job) / 120 * 100) > 60
for: 1m
labels:
severity: critical
annotations:
summary: "实例 {{ $labels.instance }} IOPS每秒读取次数超过120次/s"
description:
目前磁盘IOPS读取饱和度是 {{ $value }}%
目前磁盘IOPS每秒读取最大 {{ printf `max(rate(node_disk_reads_completed_total{instance="%s",job="%s"}[1m]))` $labels.instance $labels.job | query | first | value | printf "%.2f" }} 次/s
- alert: 磁盘IO写入吞吐较高
expr: round(max(rate(node_disk_written_bytes_total[1m])) by (instance,job) / 1024 /1024 / 30 * 100) > 60
for: 1m
labels:
severity: critical
annotations:
summary: "实例 {{ $labels.instance }} 磁盘IO写入每秒超过最大30MB/s"
description:
目前磁盘IO写入吞吐量的饱和度是 {{ $value }}%。
目前磁盘IO写入吞吐量每秒最大是 {{ printf `max(rate(node_disk_written_bytes_total{instance="%s",job="%s"}[1m])) /1024/1024` $labels.instance $labels.job | query | first | value | printf "%.2f" }}MB/s
- alert: 磁盘IO读取吞吐较高
expr: round(max(rate(node_disk_read_bytes_total[1m])) by (instance,job) / 1024 /1024 /30 * 100 ) > 60
for: 1m
labels:
severity: critical
annotations:
summary: "实例 {{ $labels.instance }} 磁盘IO读取每秒超过最大30MB/s"
description:
目前磁盘IO读取吞吐量的饱和度是 {{ $value }}%。
目前磁盘IO读取吞吐量每秒最大是 {{ printf `max(rate(node_disk_read_bytes_total{instance="%s",job="%s"}[1m])) /1024/1024` $labels.instance $labels.job | query | first | value | printf "%.2f" }}MB/s
- alert: 网络下载带宽异常
expr: max(irate(node_network_receive_bytes_total[1m]) * 8 / 1024 / 1024) by (instance,job,device) / 50 * 100 >= 80
for: 1m
labels:
severity: critical
annotations:
summary: "实例 {{ $labels.instance }} 的 {{ $labels.device }}接口下载流量已经超过公司实际50Mbps"
description:
目前下载带宽已经达到 {{ printf `(irate(node_network_receive_bytes_total{instance="%s",job="%s",device="%s"}[1m]) * 8 / 1024 / 1024)` $labels.instance $labels.job $labels.device | query | first | value | printf "%.2f" }} Mbps/s
目前下载带宽使用率在 {{ $value }}%
- alert: 网络上传带宽异常
expr: max(irate(node_network_transmit_bytes_total[1m]) * 8 / 1024 / 1024) by (instance,job,device) / 50 * 100 >= 80
for: 1m
labels:
severity: critical
annotations:
summary: "实例 {{ $labels.instance }} 的 {{ $labels.device }}接口上传流量已经超过公司实际50Mbps"
description:
目前上传带宽已经达到 {{ printf `(irate(node_network_transmit_bytes_total{instance="%s",job="%s",device="%s"}[1m]) * 8 / 1024 / 1024)` $labels.instance $labels.job $labels.device | query | first | value | printf "%.2f" }} Mbps/s
目前上传带宽使用率在 {{ $value }}%
- alert: 网络TCP连接数异常
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit * 100 > 80
for: 1m
labels:
severity: critical
annotations:
summary: "实例 {{ $labels.instance }} 的 tcp连接数超过80%"
description:
目前TCP连接数是 {{ printf `node_nf_conntrack_entries{instance="%s",job="%s"}` $labels.instance $labels.job | query | first | value | printf "%.2f" }}
目前TCP连接使用率是 {{ $value }}%
- alert: 节点处于Down状态
expr: up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "实例:{{ $labels.instance }} 处于Down状态"
description: "{{ $labels.instance }} 节点已连接超时"
kube_pods_rules.yml: |-
groups:
- name: Pods的告警规则文件
rules:
- alert: Pod中容器的CPU利用率高
expr: sum (rate(container_cpu_usage_seconds_total{image!=""}[5m])) by (instance,job,pod,namespace) * 100 > 80
for: 1m
labels:
severity: warning
annotations:
summary: "在 '{{ $labels.instance }}' 节点上运行的 '{{ $labels.pod }}' Pod CPU利用率高"
description: "在 '{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod }}' Pod的CPU利用率当前为 {{ $value }}%,超过了80%的阈值。"
- alert: Pod中容器内存利用率高
expr: |
sum(container_memory_working_set_bytes{name!=""}) by (instance,job,pod,namespace)
/
sum(container_spec_memory_limit_bytes{name!=""} > 0) by (instance,job,pod,namespace) * 100 > 80
for: 1m
labels:
severity: warning
annotations:
summary: "在 '{{ $labels.instance }}' 节点上运行的 '{{ $labels.pod }}' Pod内存利用率高"
description: 在 '{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod }}' Pod的内存最大限制是 {{ printf `sum (container_spec_memory_limit_bytes{namespace="%s",pod="%s"} > 0 ) /1024 /1024` $labels.namespace $labels.pod | query | first | value }}MB , 目前利用率已达{{ $value }}%,超过限制的80%。
- alert: Pod容器网络发送速率过高
expr: sum(rate(container_network_transmit_bytes_total{image!=""}[1m])) by (instance,job,pod,namespace) * 8 /1024 /1024 > 50
for: 1m
labels:
severity: warning
annotations:
summary: "在 '{{ $labels.instance }}' 节点上运行的 '{{ $labels.pod }}' Pod网络发送速率过高"
description: "在 '{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod }}' Pod的网络发送速率达到{{ $value }}Mbps,超过了50Mbps的阈值。"
- alert: Pod容器网络接收速率过高
expr: sum(rate(container_network_receive_bytes_total{image!=""}[1m])) by (instance,job,pod,namespace) * 8 /1024 /1024 > 50
for: 1m
labels:
severity: warning
annotations:
summary: "在 '{{ $labels.instance }}' 节点上运行的 '{{ $labels.pod }}' Pod网络发送速率过高"
description: "在 '{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod }}' Pod的网络接收速率达到{{ $value }}Mbps,超过了50Mbps的阈值。"
- alert: Pod容器磁盘写入吞吐量过大
expr: sum (rate(container_fs_writes_bytes_total{name!=""}[1m])) by (instance,job,pod,namespace) /1024 /1024 > 20
for: 1m
labels:
severity: warning
annotations:
summary: "在 '{{ $labels.instance }}' 节点上运行的 '{{ $labels.pod }}' Pod磁盘写入吞吐量过大"
description: "在 '{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod }}' Pod的磁盘写入吞吐量达到{{ $value }}MB/s,超过了20MB/s的阈值。"
- alert: Pod容器磁盘读取吞吐量过大
expr: sum (rate(container_fs_reads_bytes_total{name!=""}[1m])) by (instance,job,pod,namespace) /1024 /1024 > 20
for: 1m
labels:
severity: warning
annotations:
summary: "在 '{{ $labels.instance }}' 节点上运行的 '{{ $labels.pod }}' Pod磁盘读取吞吐量过大"
description: "在 '{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod }}' Pod的磁盘读取吞吐量达到{{ $value }}MB/s,超过了20MB/s的阈值。"
jvm_rules.yml: |-
groups:
- name: "JVM告警规则"
rules:
- alert: JVM堆内存使用率过高
expr: jvm_memory_bytes_used{area="heap",} / jvm_memory_bytes_max{area="heap",} * 100 > 90
for: 1m
labels:
severity: critical
annotations:
summary: "'{{ $labels.instance }}' 实例的JVM 堆内存使用率超过80%"
description: "'{{ $labels.namespace }}' 名称空间下的 '{{ $labels.pod_name }}' PodJVM堆内存使用率超过80%, 当前使用率是 {{ $value }}%"
- alert: JVMGC时间过长
expr: sum (rate(jvm_gc_collection_seconds_sum[5m]) / rate(jvm_gc_collection_seconds_count[5m])) by (instance,job,gc,namespace,pod_name) > 1
for: 1m
labels:
severity: critical
annotations:
summary: "'{{ $labels.instance }}' 实例的JVM GC时间超过了1秒。"
description: "'{{ $labels.namespace }}' 名称空间下的 '{{ $labels.pod_name }}' Pod使用 {{ $labels.gc }} GC垃圾回收算法时间超过1s,当前值 {{ $value }}秒"
- alert: JVM死锁线程过多
expr: min_over_time(jvm_threads_deadlocked[5m]) > 0
for: 1m
labels:
severity: critical
annotations:
summary: "JVM检测到'{{ $labels.instance }}' 实例有死锁线程"
description: "'{{ $labels.namespace }}' 名称空间下的 '{{ $labels.pod_name }}' Pod,在过去5分钟检测到死锁线程, 当前死锁线程数是 {{ $value }}。"
redis_rules.yml: |-
groups:
- name: redis告警规则
rules:
- alert: Redis实例宕机
expr: redis_up == 0
for: 5m
labels:
severity: critical
annotations:
summary: "'{{ $labels.instance }}' Redis实例宕机"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod在过去5分钟内无法连接。"
- alert: Redis连接数过高
expr: redis_connected_clients / redis_config_maxclients * 100 > 80
for: 5m
labels:
severity: warning
annotations:
summary: "'{{ $labels.instance }}' Redis实例连接数超过80%"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod当前连接数占最大连接数的比率超过80%。当前比率: {{ $value }}%。"
- alert: Redis连接被拒绝
expr: increase(redis_rejected_connections_total[1h]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "'{{ $labels.instance }}' Redis实例有连接被拒绝"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod在过去1小时内有连接被拒绝。当前被拒绝的连接数: {{ $value }}。"
- alert: Redis内存使用率过高
expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80
for: 5m
labels:
severity: critical
annotations:
summary: "'{{ $labels.instance }}' Redis实例内存使用率超过80%"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod的内存使用率超过配置的最大内存值的80%。当前内存使用率: {{ $value }}%。"
- alert: Redis缓存命中率低
expr: |
irate(redis_keyspace_hits_total[5m])
/
(irate(redis_keyspace_hits_total[5m]) + irate(redis_keyspace_misses_total[5m])) * 100 < 90
for: 5m
labels:
severity: warning
annotations:
summary: "'{{ $labels.instance }}' Redis实例缓存命中率低于90%"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod最近5分钟内的缓存命中率低于90%。当前命中率: {{ $value }}%。"
- alert: Redis即将过期的Key数量过多
expr: |
sum(redis_db_keys_expiring) by (instance, job, namespace,pod_name,db)
/
sum(redis_db_keys) by (instance, job, namespace,pod_name,db) * 100 > 50
for: 5m
labels:
severity: warning
annotations:
summary: "'{{ $labels.instance }}' Redis实例中的 '{{ $labels.db }}' 数据库有大量即将过期的Key"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod中的 '{{ $labels.db }}' 数据库有超过50%的Key即将过期。当前过期比率: {{ $value }}%。"
- alert: RedisRDB备份失败
expr: redis_rdb_last_bgsave_status == 0
for: 5m
labels:
severity: critical
annotations:
summary: "'{{ $labels.instance }}' Redis实例 RDB备份失败"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod最近的RDB备份失败。"
- alert: RedisRDB备份时间过长
expr: redis_rdb_last_bgsave_duration_sec > 3 and redis_rdb_last_bgsave_status == 1
for: 5m
labels:
severity: warning
annotations:
summary: "'{{ $labels.instance }}' Redis实例 RDB备份成功但耗时超过3秒"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod, RDB备份成功但耗时超过了3秒。持续时间: {{ $value }}秒。"
- alert: RedisRDB备份过期
expr: (time() - redis_rdb_last_save_timestamp_seconds) > 36000
for: 24h
labels:
severity: critical
annotations:
summary: "'{{ $labels.instance }}' Redis实例超过10小时未进行RDB备份"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod已超过10小时没有生成新的RDB备份文件。"
- alert: Redis命令拒绝率过高
expr: |
sum(irate(redis_commands_rejected_calls_total[5m])) by (instance,job,namespace,pod_name)
/
sum(irate(redis_commands_total[5m])) by (instance,job,namespace,pod_name) * 100 > 25
for: 5m
labels:
severity: warning
annotations:
summary: "'{{ $labels.instance }}' Redis实例命令拒绝率超过25%"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod的命令拒绝率超过了25%。当前拒绝率: {{ $value }}%。"
- alert: Redis命令平均响应时间过长
expr: |
sum(rate(redis_commands_duration_seconds_total[5m])) by (instance,job,namespace,pod_name)
/
sum(rate(redis_commands_processed_total[5m])) by (instance,job,namespace,pod_name) > 0.250
for: 5m
labels:
severity: critical
annotations:
summary: "'{{ $labels.instance }}' Redis实例命令平均响应时间超过250ms"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod的执行命令平均响应时间超过了250毫秒。当前平均响应时间: {{ $value }}秒。"
blackbox_tcp_rules.yml: |-
groups:
- name: Blackbox_tcp告警规则文件
rules:
- alert: Service TCP探测失败
expr: sum(probe_success{job=~".*tcp"}) by (instance,job,namespace,service_name) == 0
for: 5m
labels:
severity: critical
annotations:
summary: "探测 '{{ $labels.instance }}' Service 的TCP接口探测失败。"
description: "'{{ $labels.namespace }}' 名称空间中的 '{{ $labels.service_name }}' Service资源 '{{ $labels.instance }}' 地址探测失败。"
- alert: Service TCP请求的响应时间过长
expr: probe_duration_seconds{job=~".*tcp"} > 0.500
for: 5m
labels:
severity: critical
annotations:
summary: "探测 '{{ $labels.instance }}' Service 的TCP响应时间超过了500毫秒。"
description: "'{{ $labels.namespace }}' 名称空间中的 '{{ $labels.service_name }}' Service资源 '{{ $labels.instance }}' 当前响应时长为 {{ $value }} 秒。"
- alert: Service的DNS解析响应时间过长
expr: probe_dns_lookup_time_seconds{job=~".*tcp"} > 0.500
for: 5m
labels:
severity: critical
annotations:
summary: "探测 '{{ $labels.instance }}' Service 的DNS解析响应时间超过了500毫秒。"
description: "'{{ $labels.namespace }}' 名称空间中的 '{{ $labels.service_name }}' Service资源 '{{ $labels.instance }}' 当前响应时长为 {{ $value }} 秒。"
blackbox_http_rules.yml: |-
groups:
- name: Blackbox_http告警规则文件
rules:
- alert: 站点平均请求过长
expr: sum (avg_over_time(probe_http_duration_seconds[1m])) by (instance,job,namespace,ingress_name) > 3
for: 1m
labels:
severity: warning
annotations:
summary: "'{{ $labels.instance }}' 域名整体请求时间超过了3秒。"
description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名最近1分钟的平均请求时间超过3秒。当前平均请求时间:{{ $value }}秒。"
- alert: 站点阶段耗时过长
expr: |
(
probe_http_duration_seconds{phase="connect"} > 1 or
probe_http_duration_seconds{phase="processing"} > 1 or
probe_http_duration_seconds{phase="resolve"} > 1 or
probe_http_duration_seconds{phase="tls"} > 1 or
probe_http_duration_seconds{phase="transfer"} > 1
)
for: 1m
labels:
severity: warning
annotations:
summary: "'{{ $labels.instance }}' 域名在 '{{ $labels.phase }}' 阶段耗时过长"
description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名在阶段 '{{ $labels.phase }}' 的耗时超过0.5秒。当前耗时:{{ $value }}秒。"
- alert: 站点响应状态码异常
expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
for: 5m
labels:
severity: critical
annotations:
summary: "'{{ $labels.instance }}' 域名返回异常状态码"
description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名返回的状态码为 {{ $value }},表明请求可能存在问题。"
- alert: 重定向次数过多
expr: probe_http_redirects > 5
for: 5m
labels:
severity: warning
annotations:
summary: "'{{ $labels.instance }}' 域名重定向次数过多"
description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名在最近的探测中重定向次数超过5次。当前次数:{{ $value }}次。"
- alert: 证书即将过期<30
expr: (probe_ssl_earliest_cert_expiry - time()) /86400 < 30
for: 24h
labels:
severity: critical
annotations:
summary: "'{{ $labels.instance }}' 域名的 SSL 证书即将过期"
description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名的 SSL 证书将在 {{ $value }} 天内过期。"
- alert: 证书即将过期<7
expr: (probe_ssl_earliest_cert_expiry - time()) /86400 < 7
for: 24h
labels:
severity: critical
annotations:
summary: "{{ $labels.instance }}' 域名的 SSL 证书即将过期"
description: "{{ $labels.namespace }} 名称空间 {{ $labels.instance }}' 域名的 SSL 证书将在 {{ $value }} 天内过期。"
2、创建 ConfigMap
[root@k8s-master01 04-prometheus]# kubectl apply -f 02-prom-rules-configmap.yaml
[root@k8s-master01 04-prometheus]# kubectl get cm -n monitoring
NAME DATA AGE
alert-configs 1 34m
alert-template 1 34m
kube-root-ca.crt 1 61m
prom-configs 1 9m2s
prom-rules 6 13s
# 3.3 创建 PrometheusRBAC 权限
1、创建⼀个 ServiceAccount ⽤户,名称为 prometheus-sa
2、创建 ClusterRole,设定对应的权限规则,名称为 Prometheus-role 。
3、创建 ClusterRoleBinding,名称为 prometheus-rolebinding ,将 Prometheus-role ⻆⾊的权限关联⾄ kube-prom
名称空间 prometheus-sa ⽤户。
[root@k8s-master01 04-prometheus]# cat 03-prometheus-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus-sa
namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus-role
rules:
- apiGroups:
- ""
resources:
- nodes
- services
- endpoints
- pods
- nodes/proxy
verbs:
- get
- list
- watch
- apiGroups:
- "networking.k8s.io"
resources:
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- configmaps
- nodes/metrics
verbs:
- get
- nonResourceURLs:
- /metrics
verbs:
- get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus-rolebinding
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus-role
subjects:
- kind: ServiceAccount
name: prometheus-sa
namespace: monitoring
[root@k8s-master01 04-prometheus]# kubectl apply -f 03-prometheus-rbac.yaml
serviceaccount/prometheus-sa created
clusterrole.rbac.authorization.k8s.io/prometheus-role created
clusterrolebinding.rbac.authorization.k8s.io/prometheus-rolebinding created
# 3.4 创建 headlessSevice
[root@k8s-master01 04-prometheus]# cat 04-prometheus-headlessService.yaml
apiVersion: v1
kind: Service
metadata:
name: prometheus-svc
namespace: monitoring
spec:
clusterIP: "None"
selector:
app: prometheus
ports:
- name: http
port: 9090
targetPort: 9090
[root@k8s-master01 04-prometheus]# kubectl apply -f 04-prometheus-headlessService.yaml
[root@k8s-master01 04-prometheus]# kubectl get svc -n monitoring
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
alertmanager-svc ClusterIP None <none> 9093/TCP,9094/TCP 37m
prometheus-svc ClusterIP None <none> 9090/TCP 16s
webhook-dingding-svc ClusterIP 10.96.241.84 <none> 5002/TCP 49m
webhook-wechat-svc ClusterIP 10.96.82.7 <none> 5001/TCP 62m
# 3.5 部署 Prometheus 服务
使⽤ statefulSet 编写 Prometheus 清单⽂件
- 1、定义 Prometheus 所关联的 ServiceAccount 账户,确保有权限能访问 APIServer
- 1、定义 Prometheus 实例的启动命令,包含了对应的 “配置⽂件路径 “、“数据存储路径” 相关的参数。
- 2、定义 Prometheus 实例挂载 “配置⽂件” 的 ConfigMap 资源,以及 rules 告警规则⽂件的 ConfigMap 资源。
- 3、Prometheus 使⽤ PVC 模板来提供数据持久化;
[root@k8s-master01 04-prometheus]# cat 05-prometheus-statefulset.yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: prometheus
namespace: monitoring
spec:
serviceName: "prometheus-svc"
replicas: 2
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
spec:
serviceAccountName: "prometheus-sa" # sa账号
volumes:
- name: prom-cfg
configMap:
name: prom-configs
- name: prom-rules-cfg
configMap:
name: prom-rules
containers:
- name: prom
image: prom/prometheus:v2.49.1
args:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/etc/prometheus/data"
- "--storage.tsdb.retention.time=10d"
- "--web.enable-lifecycle"
volumeMounts:
- name: prom-cfg
mountPath: /etc/prometheus
- name: prom-rules-cfg
mountPath: /etc/prometheus/rules
- name: prom-data
mountPath: /etc/prometheus/data
ports:
- containerPort: 9090
resources:
requests:
cpu: 1000m
memory: 1024Mi
limits:
cpu: 1000m
memory: 1024Mi
volumeClaimTemplates:
- metadata:
name: prom-data
spec:
accessModes: ["ReadWriteMany"]
storageClassName: "nfs-storage"
resources:
requests:
storage: 3Gi
[root@k8s-master01 04-prometheus]# kubectl apply -f 05-prometheus-statefulset.yaml
[root@k8s-master01 04-prometheus]# kubectl get pods -n monitoring
NAME READY STATUS RESTARTS AGE
alertmanager-0 1/1 Running 0 38m
alertmanager-1 1/1 Running 0 38m
alertmanager-2 1/1 Running 0 38m
prometheus-0 1/1 Running 0 2m35s
prometheus-1 1/1 Running 0 92s
webhook-dingding-6d68854649-r6smd 1/1 Running 0 56m
webhook-wechat-54b5bbf677-rmr26 1/1 Running 0 69m
# 3.6 发布 Prometheus 服务
1、编写 Ingress 资源清单
[root@k8s-master01 04-prometheus]# cat 06-prometheus-ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: prom-ingress
namespace: monitoring
spec:
ingressClassName: "nginx"
rules:
- host: "k8s-prom.hmallleasing.com"
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: prometheus-svc
port:
number: 9090
[root@k8s-master01 04-prometheus]# kubectl apply -f 06-prometheus-ingress.yaml
[root@k8s-master01 04-prometheus]# kubectl get ingress -n monitoring
NAME CLASS HOSTS ADDRESS PORTS AGE
alert-ingress nginx k8s-alert.hmallleasing.com 192.168.40.103,192.168.40.104,192.168.40.105 80 38m
prom-ingress nginx k8s-prom.hmallleasing.com 192.168.40.103,192.168.40.104,192.168.40.105 80 78s
# 4、部署 Grafana ⾄ Kubernetes
- 1、创建 HeadlessService;
- 2、部署 Grafana,使⽤ statefulSet;
- 3、创建 Ingress,对外提供 Grafana;
# 4.1 创建 HeadlessService
1、编写 Grafana 的 headlessService
[root@k8s-master01 05-grafana]# cat 01-grafana-headlessService.yaml
apiVersion: v1
kind: Service
metadata:
name: grafana-svc
namespace: monitoring
spec:
clusterIP: "None"
selector:
app: grafana
ports:
- name: http
port: 3000
targetPort: 3000
2、检查 service
[root@k8s-master01 05-grafana]# kubectl apply -f 01-grafana-headlessService.yaml
[root@k8s-master01 05-grafana]# kubectl get svc -n monitoring
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
alertmanager-svc ClusterIP None <none> 9093/TCP,9094/TCP 23h
grafana-svc ClusterIP None <none> 3000/TCP 5s
prometheus-svc ClusterIP None <none> 9090/TCP 22h
webhook-dingding-svc ClusterIP 10.96.241.84 <none> 5002/TCP 23h
webhook-wechat-svc ClusterIP 10.96.82.7 <none> 5001/TCP 23h
# 4.2 部署 Grafana 服务
使⽤ statefulSet 编写 Grafana 清单⽂件
- 1、Grafana 需要通过 GF_SECURITY_ADMIN_USER 传递⽤户名, GF_SECURITY_ADMIN_PASSWORD 传递密码。
- 2、Grafana 的实例,需要使⽤ PVC 模板来提供数据持久化;
- 3、Grafana 的持久存储需要考虑权限, fsGroup: 472
[root@k8s-master01 05-grafana]# cat 02-grafana-statefulset.yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: grafana
namespace: monitoring
spec:
serviceName: "grafana-svc"
replicas: 1
selector:
matchLabels:
app: grafana
template:
metadata:
labels:
app: grafana
spec:
securityContext:
fsGroup: 472 # 当Pod启动时,Kubernetes会自动将此组ID应用到Pod级别共享的存储上(比如持久卷)。
containers:
- name: grafana
image: grafana/grafana:10.2.2
env:
- name: GF_SECURITY_ADMIN_USER
value: "admin"
- name: GF_SECURITY_ADMIN_PASSWORD
value: "talent"
volumeMounts:
- name: grafana-data
mountPath: /var/lib/grafana
ports:
- containerPort: 3000
readinessProbe:
httpGet:
path: /api/health
port: 3000
scheme: HTTP
initialDelaySeconds: 60
failureThreshold: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 30
livenessProbe:
httpGet:
path: /api/health
port: 3000
scheme: HTTP
initialDelaySeconds: 60
failureThreshold: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 30
resources:
requests:
cpu: 500m
memory: 2048Mi
limits:
cpu: 500m
memory: 2048Mi
volumeClaimTemplates:
- metadata:
name: grafana-data
spec:
accessModes: ["ReadWriteMany"]
storageClassName: "nfs-storage"
resources:
requests:
storage: 3Gi
[root@k8s-master01 05-grafana]# kubectl apply -f 02-grafana-statefulset.yaml
[root@k8s-master01 ~]# kubectl get pods -n monitoring
NAME READY STATUS RESTARTS AGE
alertmanager-0 1/1 Running 1 (6m18s ago) 23h
alertmanager-1 1/1 Running 1 (<invalid> ago) 23h
alertmanager-2 1/1 Running 1 (11m ago) 23h
grafana-0 1/1 Running 0 2m22s
prometheus-0 1/1 Running 1 (4m34s ago) 23h
prometheus-1 1/1 Running 1 (6m18s ago) 23h
webhook-dingding-6d68854649-r6smd 1/1 Running 1 (4m34s ago) 23h
webhook-wechat-54b5bbf677-rmr26 1/1 Running 1 (6m18s ago) 24h
# 4.3 发布 Grafana 服务
1、编辑 Grafana 的 Ingress
[root@k8s-master01 05-grafana]# cat 03-grafana-ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: grafana-ingress
namespace: monitoring
spec:
ingressClassName: "nginx"
rules:
- host: "k8s-grafana.hmallleasing.com"
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: grafana-svc
port:
number: 3000
[root@k8s-master01 05-grafana]# kubectl apply -f 03-grafana-ingress.yaml
[root@k8s-master01 05-grafana]# kubectl get ingress -n monitoring
NAME CLASS HOSTS ADDRESS PORTS AGE
alert-ingress nginx k8s-alert.hmallleasing.com 192.168.40.103,192.168.40.105 80 23h
grafana-ingress nginx k8s-grafana.hmallleasing.com 192.168.40.103,192.168.40.105 80 21s
prom-ingress nginx k8s-prom.hmallleasing.com 192.168.40.103,192.168.40.105 80 23h
2、访问 Grafana 的 web 界⾯
# 4.4 配置 Grafana 连接 Prometheus
1、点击 Connections-->Add newConnection,搜索 Prometheus,点击添加 DataSource
2、点击测试并保存
# 5、部署 blackbox ⾄ Kubernetes
- 1、创建 ConfigMap,定义 blackbox.yml 中的检测模块;
- 2、部署 Blackbox,使⽤ Deployment;
- 3、创建 Service、ingress,对外发布 Blackbox;
# 5.1 创建 Blackbox 的配置⽂件
1、创建 ConfigMap 配置,定义 Blackbox 的检测⽅法,名称为 blackbox-configs
[root@k8s-master01 06-blackbox]# cat 01-blackbox-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: blackbox-configs
namespace: monitoring
data:
blackbox.yml: |-
modules:
# http检查模块
http_2xx:
prober: http
http:
preferred_ip_protocol: "ip4"
valid_http_versions: [ "HTTP/1.1", "HTTP/2.0" ]
# Http Post检查模块
http_post_2xx:
prober: http
http:
method: POST
preferred_ip_protocol: "ip4"
valid_http_versions: [ "HTTP/1.1", "HTTP/2.0" ]
# TCP检查模块
tcp_connect:
prober: tcp
timeout: 5s
# ICMP检查模块
icmp:
prober: icmp
timeout: 5s
icmp:
preferred_ip_protocol: "ip4"
# DNS检查模块
dns_tcp:
prober: dns
dns:
transport_protocol: "tcp"
preferred_ip_protocol: "ip4"
query_name: "kubernetes.default.svc.cluster.local"
# SSH检查模块
ssh_banner:
prober: tcp
tcp:
query_response:
- expect: "^SSH-2.0-"
- send: "SSH-2.0-blackbox-ssh-check"
2、创建 configmap
[root@k8s-master01 06-blackbox]# kubectl apply -f 01-blackbox-configs-configmap.yaml
configmap/blackbox-configs created
[root@k8s-master01 06-blackbox]# kubectl get cm -n monitoring
NAME DATA AGE
alert-configs 1 23h
alert-template 1 23h
blackbox-configs 1 1s
kube-root-ca.crt 1 24h
prom-configs 1 23h
prom-rules 6 23h
# 5.2 部署 Blackbox 服务
编写 Blackbox 的部署清单⽂件
[root@k8s-master01 06-blackbox]# cat 02-blackbox-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: blackbox
namespace: monitoring
spec:
replicas: 1
selector:
matchLabels:
app: blackbox
template:
metadata:
labels:
app: blackbox
spec:
volumes:
- name: blackbox-cfg
configMap:
name: blackbox-configs
containers:
- name: blackbox
image: prom/blackbox-exporter:v0.24.0
args:
- "--web.listen-address=:9115"
- "--config.file=/etc/blackbox_exporter/blackbox.yml"
volumeMounts:
- name: blackbox-cfg
mountPath: /etc/blackbox_exporter
ports:
- containerPort: 9115
[root@k8s-master01 06-blackbox]# kubectl apply -f 02-blackbox-deployment.yaml
[root@k8s-master01 06-blackbox]# kubectl get pods -n monitoring
NAME READY STATUS RESTARTS AGE
alertmanager-0 1/1 Running 1 (20m ago) 23h
alertmanager-1 1/1 Running 1 (<invalid> ago) 23h
alertmanager-2 1/1 Running 1 (25m ago) 23h
blackbox-7c7c8db4f7-hqs4c 1/1 Running 0 112s
grafana-0 1/1 Running 0 16m
prometheus-0 1/1 Running 1 (19m ago) 23h
prometheus-1 1/1 Running 1 (20m ago) 23h
webhook-dingding-6d68854649-r6smd 1/1 Running 1 (19m ago) 24h
webhook-wechat-54b5bbf677-rmr26 1/1 Running 1 (20m ago) 24h
# 5.3 发布 blackbox 服务
1、创建 Service
[root@k8s-master01 06-blackbox]# cat 03-blackbox-service.yaml
apiVersion: v1
kind: Service
metadata:
name: blackbox-svc
namespace: monitoring
spec:
selector:
app: blackbox
ports:
- name: http
port: 9115
targetPort: 9115
[root@k8s-master01 06-blackbox]# kubectl apply -f 03-blackbox-service.yaml
2、创建 Ingress
[root@k8s-master01 06-blackbox]# cat 04-blackbox-ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: blackbox-ingress
namespace: monitoring
spec:
ingressClassName: "nginx"
rules:
- host: "k8s-blackbox.hmallleasing.com"
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: blackbox-svc
port:
number: 9115
[root@k8s-master01 06-blackbox]# kubectl apply -f 04-blackbox-ingress.yaml
3、访问 blackbox ⻚⾯
#
6、监控 Kubernetes 集群节点
使⽤ Prometheus 监控 Kubernetes 集群中的 节点,⼤体需要如下⼏步:
- 1、使⽤ DaemonSet 部署 Node_exporter
- 2、使⽤ Prometheus 的 Kubernetes 服务发现功能,⾃动识别集群中的 Node 节点
- 3、使⽤ relabeling 功能调整⽬标地址和端⼝:
- 4、使⽤ relabel 功能,为节点增加⼀些必要的标签维度;
# 6.1 部署 Node-Exporter
使⽤ DaemonSet 部署 Node_exporter
- 1、使⽤ DaemonSet 部署 Node Exporter,确保设置 hostPID、hostIPC 和 hostNetwork 为 true,让 Node Exporter 容器能够访问宿主机的⽹络、进程和 IPC 空间。
- 2、需要确保 Node Exporter 可以在所有节点上运⾏,包括 Master 节点,因此需要在 DaemonSet 的 Pod 规范中添加容忍度,允许调度到有污点的节点上运⾏。
- 3、通过 volumeMounts,将宿主机的 /proc、/sys 和根⽬录 / 挂载到 Node Exporter 容器的相应位置。确保 Node Exporter 能够直接访问宿主机的系统信息。
- 4、最后在 Node Exporter 的启动参数中,指定 /proc、/sys 等挂载后的路径,以便正确读取宿主机的数据。
[root@k8s-master01 07-node_exporter]# cat 01-node-exporter-daemonset.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: node-exporter
namespace: monitoring
spec:
selector:
matchLabels:
app: node-exporter
template:
metadata:
labels:
app: node-exporter
spec:
hostPID: true
hostIPC: true
hostNetwork: true
# 容忍度
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: "Exists"
volumes:
- name: proc
hostPath:
path: /proc
- name: sys
hostPath:
path: /sys
- name: root
hostPath:
path: /root
containers:
- name: node
image: prom/node-exporter:v1.7.0
args:
- "--web.listen-address=:9100"
- "--web.max-requests=40"
- "--collector.mountstats"
- "--collector.systemd"
- "--collector.ethtool"
- "--collector.tcpstat"
- "--path.procfs=/host/proc"
- "--path.sysfs=/host/sys"
- "--path.rootfs=/host/root"
volumeMounts:
- name: proc
mountPath: /host/proc
- name: sys
mountPath: /host/sys
- name: root
mountPath: /host/root
ports:
- containerPort: 9100
resources:
requests:
cpu: 200m
memory: 200Mi
limits:
cpu: 200m
memory: 200Mi
2、检查 Node_exporter 的部署情况
[root@k8s-master01 07-node_exporter]# kubectl apply -f 01-node-exporter-daemonset.yaml
[root@k8s-master01 07-node_exporter]# kubectl get pods -n monitoring -o wide|grep node-exporter
node-exporter-55bsf 1/1 Running 0 2m48s 192.168.40.103 k8s-master03 <none> <none>
node-exporter-5ldbs 1/1 Running 0 2m48s 192.168.40.104 k8s-node01 <none> <none>
node-exporter-dj269 1/1 Running 0 2m48s 192.168.40.101 k8s-master01 <none> <none>
node-exporter-kqdmg 1/1 Running 0 2m48s 192.168.40.102 k8s-master02 <none> <none>
node-exporter-ml2sk 1/1 Running 0 2m48s 192.168.40.105 k8s-node02 <none> <none>
# 6.2 配置 Prometheus 监控 Node
1、修改 Prometheus 的配置⽂件中,使⽤ kubernetes_sd_configs 配置项指定服务发现的⻆⾊为 node。这样 Prometheus 将会⾃动发现 Kubernetes 集群中的所有 Node,并获取它们的元数据。
[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prom-configs
namespace: monitoring
data:
prometheus.yml: |-
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml
2、检查配置是否更新
[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
3、重启 Prometheus,⽽后检查 Prometheus 的 Target
#确保k8s-prom.hmallleasing.com可以解析
[root@k8s-master01 04-prometheus]# cat /etc/hosts
127.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4
::1 localhost localhost.localdomain localhost6 localhost6.localdomain6
192.168.40.101 k8s-master01
192.168.40.102 k8s-master02
192.168.40.103 k8s-master03 k8s-prom.hmallleasing.com
192.168.40.100 k8s-master-lb # 如果不是高可用集群,该IP为Master01的IP
192.168.40.104 k8s-node01
192.168.40.105 k8s-node02
[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload
# 6.3 relabel 修改抓取的节点端⼝
1、重新修改配置,通过 relabel ⽅式将 10250 端⼝,修改为 9100 端⼝
[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prom-configs
namespace: monitoring
data:
prometheus.yml: |-
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml
2、检查配置是否更新
[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
3、重新加载 Prometheus,然后检查节点的 targets,发现端⼝变成了 9100,能正常监控 Kubernets 的节点
[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload
# 6.4 relabel 为节点添加新标签
查询节点的元数据标签,发现如下的⼀些 labels,希望保留下来,以便在监控中提供更多的维度和上下⽂。
1、修改 Prometheus 的配置⽂件,使⽤ labelmap 将标签映射到每个节点上。
[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prom-configs
namespace: monitoring
data:
prometheus.yml: |-
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml
2、检查配置是否更新
[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
3、重新加载 Prometheus,⽽后这些元数据是否附加到节点的 Labels 上了。
[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload
# 6.5 导⼊节点可视化图形
ID:16098
# 7、监控 K8S 控制组件
# 7.1 监控控制平⾯组件
监控 Kubernetes 集群的控制平⾯,⾸先需要知道要监控哪些组件,然后了解它们是如何提供 Metrics 指标的,最后确定这些指标的监控⽅法,是通过⼿动配置还是⾃动发现。
- 关键组件包括:APIServer、ControllerManager、Scheduler、etcd、CoreDNS、kubelet、kube-proxy
- Metrics 指标获取:这些控制平⾯组件都内建 Metrics 端点。但是某些组件可能默认只在本地接⼝(127.0.0.1)上暴露 Metrics,因此需要修改对应组件的配置,以确保这些 Metrics 可通过远程的⽅式进⾏访问。
# 7.2 监控控制平⾯组件策略
监控这些组件有两种主要⽅法:
- 1、⼿动配置监控,在 Prometheus 配置⽂件中,⼿动指定每个控制器组件服务的地址和端⼝,来完成监控。很明显这种⽅式⾮常繁琐,且维护成本太⾼。
- 2、⾃动化监控,利⽤ Kubernetes 的服务发现机制,来动态发现服务实例。有如下两种常⽤的⽅法
⾃动化监控⽅式:
- 1、基于 endpoints 的服务发现,⾃动发现所有的 endpints 端点,然后使⽤ relabel 匹配,来保留符合条件的端点实例。
- 2、基于 Pod 的服务发现,⾃动发现所有 Pod,然后使⽤ relabel 匹配,只保留符合标签条件的 Pod 实例。
注意事项:
1、使⽤ endpoints 作为服务发现的⽅式,它要求被监控的⽬标必须有对应的 Service 才可以实现,否则⽆法获取端点。但基于 Pod 的发现⽅式不依赖是否有 Service。
2、不论采⽤哪种服务的发现⽅式,最终都必须通过 relabel 来基于标签筛选所需要的端点,因此获取所需要监控的⽬标标签,⾄关重要。
# 7.3 监控 APIServer
# 7.3.1 获取 APIServer 的 Metrics
1、API-Server 在 https 协议的, 6443/metrics 接⼝上提供了指标数据。
[root@k8s-master01 ~]# netstat -lntp|grep 6443
tcp 0 0 127.0.0.1:16443 0.0.0.0:* LISTEN 1126/haproxy
tcp 0 0 0.0.0.0:16443 0.0.0.0:* LISTEN 1126/haproxy
tcp6 0 0 :::6443 :::* LISTEN 2301/kube-apiserver
2、APIServer 有对应的 Service,所以采⽤ Endpints ⽅式发现服务,因此我们需要获取 APIServer 的 Service 标签(labels),以便 Prometheus 只抓取 APIServer 服务的 Pod 实例
[root@k8s-master01 ~]# kubectl describe service -n default kubernet
Name: kubernetes
Namespace: default
Labels: component=apiserver
provider=kubernetes
Annotations: <none>
Selector: <none>
Type: ClusterIP
IP Family Policy: SingleStack
IP Families: IPv4
IP: 10.96.0.1
IPs: 10.96.0.1
Port: https 443/TCP
TargetPort: 6443/TCP
Endpoints: 192.168.40.101:6443,192.168.40.102:6443,192.168.40.103:6443
Session Affinity: None
Events: <none>
# 7.3.2 配置 Prometheus 监控 APIServer
- 1、添加⼀个新的 Job,名为: kube-apiserver ,metrics 路径是 /metrics ,协议是 https
- 2、基于 Kubernetes 的 Endpoints 来实现⾃动发现,由于 APIServer 采⽤的是 HTTPS,因此还需要指定 TLS 相关的配置;
- 3、使⽤ relabel_configs,仅保留标签名为 __meta_kubernetes_service_label_component ,标签值为 apiserver 。(会查询所有名称空间,意味着范围查询会更⼴)
- 4、使⽤ relabel_configs,仅保留 __meta_kubernetes_namespace=default 、 __meta_kubernetes_service_name=kubernetes 并且 __meta_kubernetes_endpoint_port_name=https 的实例(明确名称空间和 service 名称,以及对应的端⼝,这种⽅式会更精准⼀些)
1、修改 Prometheus 配置
[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prom-configs
namespace: monitoring
data:
prometheus.yml: |-
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过值映射获取标签
replacement: $1
action: labelmap
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml
2、检查配置是否更新
[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过值映射获取标签
replacement: $1
action: labelmap[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
3、重新加载 Prometheus,然后检查节点的 targets,能正常监控 Kubernets 的节点
[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload
4、检查最终结果
# 7.3.3 APIServer 告警规则⽂件
1、编辑 Prometheus 的 Alert 规则⽂件
kube_apiserver_rules.yml: |-
groups:
- name: APIServer告警规则
rules:
- alert: APIServer请求错误率过高
expr: |
sum by (instance, namespace, job, group, code, resource, verb,subresource) (rate(apiserver_request_total{code=~"5..|4.."}[5m]))
/
sum by (instance, namespace, job, group, code, resource, verb,subresource) (rate(apiserver_request_total[5m])) * 100 > 10
for: 5m
labels:
severity: critical
annotations:
summary: "APIServer请求错误率超过10%"
description: "APIServer实例 {{ $labels.instance }} 在命名空间 {{ $labels.namespace }} 中的 {{ $labels.group }} 组中 {{ $labels.resource }} 类型请求错误率超过10%。当前错误率: {{ $value }}%,请求类型: {{ $labels.verb }},状态码: {{ $labels.code }}。"
- alert: APIServer Mutating请求负载过高
expr: avg_over_time(apiserver_current_inflight_requests{request_kind="mutating"}[5m]) > (400 * 0.8)
for: 5m
labels:
severity: warning
annotations:
summary: "APIServer Mutating请求负载过高"
description: "APIServer处理变更性请求的平均负载超过了最大限制的80%。当前负载: {{ $value }}。"
- alert: APIServer ReadOnly请求负载过高
expr: avg_over_time(apiserver_current_inflight_requests{request_kind="readOnly"}[5m]) > (800 * 0.8)
for: 5m
labels:
severity: warning
annotations:
summary: "APIServer ReadOnly请求负载过高"
description: "APIServer处理只读请求的平均负载超过了最大限制的80%。当前负载: {{ $value }},实例: {{ $labels.instance }},命名空间: {{ $labels.namespace }}。"
- alert: APIServer平均延迟过高
expr: |
rate(apiserver_request_duration_seconds_sum{verb!="WATCH"}[5m])
/
rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[5m]) > 5
for: 5m
labels:
severity: critical
annotations:
summary: "APIServer平均延迟过高"
description: "APIServer实例 {{ $labels.instance }} 对资源 {{ $labels.resource }} 的 {{ $labels.verb }} 请求的平均延迟超过5秒。当前平均延迟: {{ $value }}秒。"
[root@k8s-master01 04-prometheus]# kubectl apply -f 02-prom-rules-configmap.yaml
2、检查配置是否更新
[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- ls /etc/prometheus/rules/
[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/rules/kube_apiserver_rules.yml
3、重新加载 Prometheus
[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload
4、检查告警规则
# 7.4 监控 K8S 核⼼组件 - Controller
# 7.4.1 获取 Controller 的 Metrics
Controller Manager 默认在 10257/metrics 接⼝上提供了指标数据,默认情况下,ControllerManager 的 Metrics 接⼝仅在本地(127.0.0.1)地址监听,因此我们需要修改其监听地址的参数,将 --bind-address=127.0.0.1 修改为 --bind-address=0.0.0.0
1、修改 ControllerManager Pod 的配置清单,所有的 master 节点都需修改,生产环境需分开修改,防止业务不可用。
[root@k8s-master01 ~]# cat /etc/kubernetes/manifests/kube-controller-manager.yaml
apiVersion: v1
kind: Pod
metadata:
creationTimestamp: null
labels:
component: kube-controller-manager
tier: control-plane
name: kube-controller-manager
namespace: kube-system
spec:
containers:
- command:
- kube-controller-manager
- --allocate-node-cidrs=true
- --authentication-kubeconfig=/etc/kubernetes/controller-manager.conf
- --authorization-kubeconfig=/etc/kubernetes/controller-manager.conf
- --bind-address=0.0.0.0
- --client-ca-file=/etc/kubernetes/pki/ca.crt
...
2、在 Kubernetes 1.28 版本中,默认配置不再允许通过 HTTP 访问 ControllerManager 的 metrics 端点,因此必须使⽤ HTTPS。在本地测试访问 Controller Manager 的 Metrics ,我们可以使⽤已存在的 prometheus-sa 服务账户来创建⼀个令牌 (token),然后在来访问 controllerManager 的 Metrics。
[root@k8s-master01 ~]# TOKEN=$(kubectl create token -n monitoring prometheus-sa)
[root@k8s-master01 ~]# curl -s -k -H "Authorization: Bearer $TOKEN" https://192.168.40.101:10257/metrics |grep kube-controller
leader_election_master_status{name="kube-controller-manager"} 1
running_managed_controllers{manager="kube-controller-manager",name="nodeipam"} 1
3、Controller Manager 没有 Service,因此我们直接获取对应 Pod 的标签(labels),以便 Prometheus 只抓取提供 Controller Manager 服务的 Pod 实例。
# 查看kube-controllerManager的标签
[root@k8s-master01 ~]# kubectl describe pod -n kube-system kube-controller-manager-k8s-master01|grep -i label
Labels: component=kube-controller-manager
# 7.4.2 配置 Prometheus 监控 Controller
- 1、添加⼀个新的 Job,名为: kube-controller ,metrics 路径是 /metrics ,协议是 https
- 2、基于 Kubernetes 的 Pod ⽅式实现⾃动发现,由于 ControllerManager 采⽤的是 HTTPS,因此还需要指定 TLS 相关的配置;
- 3、使⽤ relabel_configs,仅保留标签名为 __meta_kubernetes_pod_label_component ,标签值为 kube-controller-manager
1、修改 Prometheus 配置
[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prom-configs
namespace: monitoring
data:
prometheus.yml: |-
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml
2、检查配置是否更新
[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
3、重新加载 Prometheus,然后检查节点的 targets,能正常监控 Kubernets 的节点
[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload
4、检查最终结果
抓取的 ControllerManager 的地址没错,但是端口不是 10257,而是 https 默认的 443,因此我们需要重新
# 7.4.3 relabel 修改抓取的 Pod 端⼝
1、修改 Prometheus 配置
[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prom-configs
namespace: monitoring
data:
prometheus.yml: |-
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml
2、检查配置是否更新
[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
3、重新加载 Prometheus,然后检查节点的 targets,能正常监控 Kubernets 的节点
[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload
4、检查最终结果
# 7.4.4 relabel 为 Pod 添加新标签
Pod 的元数据标签,我们希望保留 __meta_kubernetes_namespace、__meta_kubernetes_pod_name ,这两个维度的标签。
1、修改 Prometheus 配置
[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prom-configs
namespace: monitoring
data:
prometheus.yml: |-
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml
2、检查配置是否更新
[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
3、重新加载 Prometheus,然后检查节点的 targets,能正常监控 Kubernets 的节点
[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload
4、检查最终结果
# 7.5 监控 K8S 核⼼组件 - Scheduler
# 7.5.1 获取 Scheduler 的 Metrics
Scheduler 默认在 10259/metrics 接⼝上提供了指标数据,默认情况下,Scheduler 的 Metrics 接⼝仅在本地(127.0.0.1)地址监听,因此我们需要修改其监听地址的参数,将 --bind-address=127.0.0.1 修改为 --bind-address=0.0.0.0。
[root@k8s-master01 ~]# netstat -lntp|grep 10259
tcp 0 0 127.0.0.1:10259 0.0.0.0:* LISTEN 1762/kube-scheduler
1、修改 Scheduler Pod 的配置清单,3 台 master 都需修改,生产环境需不要同时修改,避免业务不可用。
[root@k8s-master01 ~]# cat /etc/kubernetes/manifests/kube-scheduler.yaml
apiVersion: v1
kind: Pod
metadata:
creationTimestamp: null
labels:
component: kube-scheduler
tier: control-plane
name: kube-scheduler
namespace: kube-system
spec:
containers:
- command:
- kube-scheduler
- --authentication-kubeconfig=/etc/kubernetes/scheduler.conf
- --authorization-kubeconfig=/etc/kubernetes/scheduler.conf
- --bind-address=0.0.0.0
- --kubeconfig=/etc/kubernetes/scheduler.conf
...
2、本地测试访问 Scheduler 的 Metrics ,我们可以使⽤已存在的 prometheussa 服务账户来创建⼀个令牌 (token),然后在来访问 Scheduler 的 Metrics。
[root@k8s-master01 ~]# TOKEN=$(kubectl create token -n monitoring prometheus-sa)
[root@k8s-master01 ~]# curl -s -k -H "Authorization: Bearer $TOKEN" https://192.168.40.101:10259/metrics |grep kube-scheduler
leader_election_master_status{name="kube-scheduler"} 1
3、Scheduler 没有 Service,因此我们直接获取对应 Pod 的标签(labels),以便 Prometheus 只抓取提供 Scheduler 服务的 Pod 实例。
[root@k8s-master01 ~]# kubectl describe pod kube-scheduler-k8s-master01 -n kube-system
Name: kube-scheduler-k8s-master01
Namespace: kube-system
Priority: 2000001000
Priority Class Name: system-node-critical
Node: k8s-master01/192.168.40.101
Start Time: Sun, 17 Mar 2024 13:59:17 +0800
Labels: component=kube-scheduler
tier=control-plane
...
# 7.5.2 配置 Prometheus 监控 Scheduler
- 1、添加⼀个新的 Job,名为: kube-scheduler ,metrics 路径是 /metrics ,协议是 https
- 2、基于 Kubernetes 的 Pod 来实现⾃动发现,由于 scheduler 采⽤的是 HTTPS,因此还需要指定 TLS 相关的配置;
- 3、使⽤ relabel_configs,仅保留标签名为 __meta_kubernetes_pod_label_component ,标签值为 kube-scheduler
- 4、使⽤ relabel_configs,修改抓取 Pod 的端⼝为 10259,默认 pod ⾃动抓取的实例,使⽤ http80 端⼝和 https443 端⼝;
- 5、使⽤ relabel_configs,保留 __meta_kubernetes_namespace、___meta_kubernetes_pod_name ,这两个维度的标签。
1、修改 Prometheus 配置
[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prom-configs
namespace: monitoring
data:
prometheus.yml: |-
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控Scheduler
- job_name: "kube-schduler"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 基于标签进行过滤
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-scheduler"
action: keep
# 修订抓取的端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:10259
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml
2、检查配置是否更新
[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控Scheduler
- job_name: "kube-schduler"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 基于标签进行过滤
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-scheduler"
action: keep
# 修订抓取的端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:10259
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
3、重新加载 Prometheus,然后检查节点的 targets,能正常监控 Kubernets 的节点
[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload
4、检查 Prometheus 抓取的结果
# 7.5.3 Schedule 告警规则⽂件
1、编写告警规则⽂件
kube_scheduler_rules.yml: |-
groups:
- name: scheduler告警规则文件
rules:
- alert: 调度器每秒调度Pod次数过高
expr: rate(scheduler_pod_scheduling_attempts_sum[1m]) > 20
for: 1m
labels:
severity: critical
annotations:
summary: "调度器每秒调度Pod次数过高 (当前值: {{ $value }}次)"
description: "调度器实例 {{ $labels.instance }} 在过去的一分钟内每秒调度的Pod次数超过了20次,当前值为 {{ $value }}次。"
- alert: Pending状态的Pod数量过多
expr: avg_over_time(scheduler_pending_pods{queue!="active"}[5m]) > 10
for: 5m
labels:
severity: warning
annotations:
summary: "Pending状态的Pod数量过多 (当前值: {{ $value }}个)"
description: "调度器实例 {{ $labels.instance }} 在过去五分钟内处于Pending状态的Pod数量平均超过了10个,当前值为 {{ $value }}个。"
- alert: 'Pod平均调度尝试次数过多'
expr: avg(rate(scheduler_pod_scheduling_attempts_sum[5m])) by (instance, job, pod_name) > 5
for: 5m
labels:
severity: warning
annotations:
summary: "Pod平均调度尝试次数过多 (当前值: {{ $value }}次)"
description: "调度器实例 `{{ $labels.instance }}` 的Pod在过去五分钟内平均尝试调度次数超过5次,当前值为 {{ $value }}次。"
- alert: '调度器扩展点平均延迟过高'
expr: |
rate(scheduler_framework_extension_point_duration_seconds_sum[5m])
/
rate(scheduler_framework_extension_point_duration_seconds_count[5m]) > 1
for: 5m
labels:
severity: critical
annotations:
summary: "调度器扩展点平均延迟过高 (当前值: {{ $value }}秒)"
description: "调度器实例 `{{ $labels.instance }}` 的扩展点 `{{ $labels.extension_point }}` 在过去五分钟内平均延迟超过了1秒,当前值为 {{ $value }}秒。
# 7.6 监控 K8S 核⼼组件 - Etcd
# 7.6.1 获取 Etcd 的 Metrics
etcd 默认在 2381/metrics 接⼝上提供了指标数据,默认情况下,etcd 的 Metrics 接⼝仅在本地(127.0.0.1)地址监听,因此我们需要修改其监听地址的参数,将 --listen-metrics-urls=http://127.0.0.1:2381 修改为 ---listen-metrics-urls=http://0.0.0.0:2381
[root@k8s-master01 04-prometheus]# netstat -lntp|grep 2381
tcp 0 0 127.0.0.1:2381 0.0.0.0:* LISTEN 1808/etcd
1、修改 etcd Pod 的配置清单,3 台 master 都需修改,生产环境需不要同时修改,避免业务不可用。
[root@k8s-master01 04-prometheus]# cat /etc/kubernetes/manifests/etcd.yaml
apiVersion: v1
kind: Pod
metadata:
annotations:
kubeadm.kubernetes.io/etcd.advertise-client-urls: https://192.168.40.101:2379
creationTimestamp: null
labels:
component: etcd
tier: control-plane
name: etcd
namespace: kube-system
spec:
containers:
- command:
- etcd
- --advertise-client-urls=https://192.168.40.101:2379
- --cert-file=/etc/kubernetes/pki/etcd/server.crt
- --client-cert-auth=true
- --data-dir=/var/lib/etcd
- --experimental-initial-corrupt-check=true
- --experimental-watch-progress-notify-interval=5s
- --initial-advertise-peer-urls=https://192.168.40.101:2380
- --initial-cluster=k8s-master01=https://192.168.40.101:2380
- --key-file=/etc/kubernetes/pki/etcd/server.key
- --listen-client-urls=https://127.0.0.1:2379,https://192.168.40.101:2379
- --listen-metrics-urls=http://0.0.0.0:2381
- --listen-peer-urls=https://192.168.40.101:2380
...
2、本地访问 etcd 的 Metrics
[root@k8s-master01 04-prometheus]# curl -s http://192.168.40.101:2381/metrics |grep etcd|head -n 10
# HELP etcd_cluster_version Which version is running. 1 for 'cluster_version' label with current cluster version
# TYPE etcd_cluster_version gauge
etcd_cluster_version{cluster_version="3.5"} 1
# HELP etcd_debugging_auth_revision The current revision of auth store.
# TYPE etcd_debugging_auth_revision gauge
etcd_debugging_auth_revision 1
# HELP etcd_debugging_disk_backend_commit_rebalance_duration_seconds The latency distributions of commit.rebalance called by bboltdb backend.
# TYPE etcd_debugging_disk_backend_commit_rebalance_duration_seconds histogram
etcd_debugging_disk_backend_commit_rebalance_duration_seconds_bucket{le="0.001"} 933
etcd_debugging_disk_backend_commit_rebalance_duration_seconds_bucket{le="0.002"} 933
3、etcd 没有 Service,因此我们直接获取对应 Pod 的标签(labels),以便 Prometheus 只抓取提供 etcd 服务的 Pod 实例。
[root@k8s-master01 04-prometheus]# kubectl describe pods etcd-k8s-master01 -n kube-system
Name: etcd-k8s-master01
Namespace: kube-system
Priority: 2000001000
Priority Class Name: system-node-critical
Node: k8s-master01/192.168.40.101
Start Time: Sun, 17 Mar 2024 13:48:58 +0800
Labels: component=etcd
tier=control-plane
...
# 7.6.2 配置 Prometheus 监控 Etcd
- 1、添加⼀个新的 Job,名为: kube-etcd ,metrics 路径是 /metrics,协议是 http
- 2、基于 Kubernetes 的 Pod 来实现⾃动发现;
- 3、使⽤ relabel_configs,仅保留标签名为 __meta_kubernetes_pod_label_component ,标签值为 etcd
- 4、使⽤ relabel_configs,修改抓取 Pod 的端⼝为 2381,默认 pod ⾃动抓取的实例,使⽤ http80 端⼝和 https443 端⼝;
- 5、使⽤ relabel_configs,保留 ____meta_kubernetes_namespace、__meta_kubernetes_pod_name ,这两个维度的标签。
1、修改 Prometheus 配置
[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prom-configs
namespace: monitoring
data:
prometheus.yml: |-
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控Scheduler
- job_name: "kube-schduler"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 基于标签进行过滤
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-scheduler"
action: keep
# 修订抓取的端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:10259
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控etcd
- job_name: "kube-etcd"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "etcd"
action: keep
# 修订端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:2381
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml
2、检查配置是否更新
[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控Scheduler
- job_name: "kube-schduler"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 基于标签进行过滤
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-scheduler"
action: keep
# 修订抓取的端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:10259
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控etcd
- job_name: "kube-etcd"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "etcd"
action: keep
# 修订端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:2381
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
3、重新加载 Prometheus,然后检查节点的 targets,能正常监控 Kubernets 的节点
[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload
4、检查 Prometheus 抓取的结果
# 7.6.3 Etcd 告警规则⽂件
kube_etcd_rules.yml: |-
groups:
- name: etcd告警规则文件
rules:
- alert: Etcd成员异常下线
expr: count(etcd_server_id) by (job) % 2 == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Etcd成员异常下线"
description: "Etcd集群成员数量为偶数,可能有成员下线导致集群无法正常提供服务。"
- alert: Etcd通信异常
expr: etcd_server_has_leader == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Etcd通信异常 (实例: {{ $labels.instance }})"
description: "实例 {{ $labels.instance }} 的Etcd节点无法与集群中的其它节点通信。"
- alert: Etcd领导者变更频繁
expr: rate(etcd_server_leader_changes_seen_total[5m]) > 5
for: 5m
labels:
severity: warning
annotations:
summary: "Etcd领导者变更频繁 (实例: {{ $labels.instance }})"
description: "在过去的5分钟内,实例 {{ $labels.instance }} 的Etcd领导者变更次数超过了5次,这可能会影响集群稳定性。"
- alert: Etcd后端提交到磁盘耗时异常
expr: |
sum by (instance, job, pod_name) (rate(etcd_disk_backend_commit_duration_seconds_sum[5m]))
/
sum by (instance, job, pod_name) (rate(etcd_disk_backend_commit_duration_seconds_count[5m])) > 2
for: 5m
labels:
severity: warning
annotations:
summary: "Etcd后端提交耗时异常 (实例: {{ $labels.instance }})"
description: "实例 {{ $labels.instance }} 的Etcd后端在过去5分钟内提交到磁盘的操作平均耗时超过了2秒。"
- alert: Etcd wal日志fsync耗时异常
expr: |
sum by (instance, job, pod_name) (rate(etcd_disk_wal_fsync_duration_seconds_sum[5m]))
/
sum by (instance, job, pod_name) (rate(etcd_disk_wal_fsync_duration_seconds_count[5m])) > 2
for: 5m
labels:
severity: warning
annotations:
summary: "Etcd wal日志fsync耗时异常 (实例: {{ $labels.instance }})"
description: "实例 {{ $labels.instance }} 的Etcd节点在过去5分钟内日志文件的fsync调用平均耗时超过了2秒。"
# 7.6.4 导⼊ Etcd 图形
导⼊ ID:9733
# 7.7 监控 K8S 核⼼组件 - CoreDNS
# 7.7.1 获取 CoreDNS 的 Metrics
1、CoreDNS 通过 9153 端⼝的 /metrics 路径提供指标数据。
[root@k8s-master01 04-prometheus]# kubectl get svc -n kube-system
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
calico-typha ClusterIP 10.96.237.85 <none> 5473/TCP 198d
kube-dns ClusterIP 10.96.0.10 <none> 53/UDP,53/TCP,9153/TCP 198d
metrics-server ClusterIP 10.96.23.183 <none> 443/TCP 198d
[root@k8s-master01 04-prometheus]# curl -s http://10.96.0.10:9153/metrics|head -n 10
# HELP coredns_build_info A metric with a constant '1' value labeled by version, revision, and goversion from which CoreDNS was built.
# TYPE coredns_build_info gauge
coredns_build_info{goversion="go1.20",revision="055b2c3",version="1.10.1"} 1
# HELP coredns_cache_entries The number of elements in the cache.
# TYPE coredns_cache_entries gauge
coredns_cache_entries{server="dns://:53",type="denial",view="",zones="."} 39
coredns_cache_entries{server="dns://:53",type="success",view="",zones="."} 12
# HELP coredns_cache_hits_total The count of cache hits.
# TYPE coredns_cache_hits_total counter
coredns_cache_hits_total{server="dns://:53",type="denial",view="",zones="."} 878
2、CoreDNS 有对应的 Service,因此我们需要获取 DNS 对应的 Service 的标签(labels),以便 Prometheus 只抓取提供 DNS 服务的 Pod 实例。
[root@k8s-master01 04-prometheus]# kubectl describe svc kube-dns -n kube-system
Name: kube-dns
Namespace: kube-system
Labels: k8s-app=kube-dns
kubernetes.io/cluster-service=true
kubernetes.io/name=CoreDNS
...
# 7.7.2 配置 Prometheus 监控 DNS
- 1、添加⼀个新的 Job,名为: kube-dns ,metrics 路径是 /metrics ,协议是 http
- 2、基于 Kubernetes 的 endpoints 来实现⾃动发现;
- 3、使⽤ relabel_configs,仅保留标签名为 __meta_kubernetes_service_label_k8s_app ,标签值为 kube-dns
- 4、使⽤ relabel_configs,重新调整 Pod 的端⼝,将标签 __meta_kubernetes_pod_ip 修改为 IP:9153 端⼝。默认将匹配到的 53、9153 端⼝ ,都视作⼀独⽴的实例。
- 5、使⽤ relabel_configs,保留 ___meta_kubernetes_namespace、____meta_kubernetes_pod_name、__meta_kubernetes_service_name ,这三个维度的标签。
1、修改 Prometheus 配置
[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prom-configs
namespace: monitoring
data:
prometheus.yml: |-
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控Scheduler
- job_name: "kube-schduler"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 基于标签进行过滤
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-scheduler"
action: keep
# 修订抓取的端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:10259
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控etcd
- job_name: "kube-etcd"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "etcd"
action: keep
# 修订端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:2381
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控CoreDNS
- job_name: "kube-dns"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_k8s_app"]
regex: "kube-dns"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:9153
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
- source_labels: ["__meta_kubernetes_service_name"]
regex: "(.*)"
replacement: $1
target_label: service_name
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml
2、检查配置是否更新
[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控Scheduler
- job_name: "kube-schduler"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 基于标签进行过滤
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-scheduler"
action: keep
# 修订抓取的端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:10259
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控etcd
- job_name: "kube-etcd"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "etcd"
action: keep
# 修订端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:2381
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控CoreDNS
- job_name: "kube-dns"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_k8s_app"]
regex: "kube-dns"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:9153
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
- source_labels: ["__meta_kubernetes_service_name"]
regex: "(.*)"
replacement: $1
target_label: service_name
3、重新加载 Prometheus,然后检查节点的 targets,能正常监控 Kubernets 的节点
[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload
4、检查 Prometheus 抓取的结果
# 7.7.3 CoreDNS 告警规则⽂件
1、编写告警规则⽂件
kube_coredns_rules.yml: |-
groups:
- name: CoreDNS告警规则文件
rules:
- alert: CoreDNS SERVFAIL响应率过高
expr: |
sum(rate(coredns_dns_responses_total{rcode="SERVFAIL"}[5m])) by (instance, job, server, pod_name, zone) > 10
for: 5m
labels:
severity: critical
annotations:
summary: "实例 {{ $labels.instance }} 的 CoreDNS SERVFAIL 响应率过高"
description: "在过去5分钟内,实例 {{ $labels.instance }},CoreDNS Pod名称 {{ $labels.pod_name }},服务端点 {{ $labels.server }},区域 {{ $labels.zone }} 的SERVFAIL响应率超过了10次,当前值:{{ $value }}次/秒。请检查CoreDNS服务状态。"
- alert: CoreDNS域名解析时延过高
expr: |
sum(rate(coredns_dns_request_duration_seconds_sum[5m])) by (instance, job, server, pod_name, zone)
/
sum(rate(coredns_dns_request_duration_seconds_count[5m])) by (instance, job, server, pod_name, zone) > 1
for: 5m
labels:
severity: warning
annotations:
summary: "实例 {{ $labels.instance }} 的 CoreDNS 解析时延超过1秒"
description: "在过去5分钟内,实例 {{ $labels.instance }},CoreDNS Pod名称 {{ $labels.pod_name }},服务端点 {{ $labels.server }},区域 {{ $labels.zone }} 的平均域名解析时延超过了1秒,当前平均时延:{{ $value }}秒。
# 7.7.4 导⼊ CoreDNS 图形
导⼊ ID:15762
# 7.8 监控 K8S 核⼼组件 - Kubeproxy
# 7.8.1 获取 kube-proxy 的 Metrics
1、Kube-Proxy 在 10249/metrics 接⼝上提供指标数据,默认情况下 kube-proxy 的 Metrics 接⼝仅在本地(127.0.0.1)地址监听,因此我们需要修改其监听地址的参数,将 metricsBindAddress: ""修改 metricsBindAddress:"0.0.0.0" ,然后重启 kube-proxy
[root@k8s-master01 04-prometheus]# kubectl edit configmap -n kube-system kube-proxy
...
kind: KubeProxyConfiguration
metricsBindAddress: "0.0.0.0"
mode: "ipvs"
nodePortAddresses: null
oomScoreAdj: null
portRange: ""
showHiddenMetricsForVersion: ""
winkernel:
...
# 重启kube-proxy的pod
[root@k8s-master01 04-prometheus]# kubectl rollout restart daemonset -n kube-system kube-proxy
2、检查 kube-proxy 的监听地址
[root@k8s-master01 04-prometheus]# netstat -lntp |grep kube-proxy
tcp6 0 0 :::10256 :::* LISTEN 60042/kube-proxy
tcp6 0 0 :::10249 :::* LISTEN 60042/kube-proxy
3、kube-proxy 没有 Service,因此我们直接获取对应 Pod 的标签(labels),以便 Prometheus 只抓取提供 kube-proxy 服务的 Pod 实例。
[root@k8s-master01 04-prometheus]# kubectl describe daemonsets.apps -n kube-system kube-proxy
Name: kube-proxy
Selector: k8s-app=kube-proxy
Node-Selector: kubernetes.io/os=linux
Labels: k8s-app=kube-proxy
Annotations: deprecated.daemonset.template.generation: 3
...
# 7.8.2 配置 Prometheus 监控 kube-proxy
- 1、添加⼀个新的 Job,名为: kube-proxy ,metrics 路径是 /metrics ,协议是 http
- 2、基于 Kubernetes 的 pod 来实现⾃动发现;
- 3、使⽤ relabel_configs,仅保留标签名为 __meta_kubernetes_pod_label_k8s_app ,标签值为 kube-proxy
- 4、使⽤ relabel_configs,重新调整 Pod 的端⼝,将标签 __meta_kubernetes_pod_ip 修改为 IP:10249 端⼝,默认 Pod 的⾃动发现端⼝ http 是 80,https 是 443。
- 5、使⽤ relabel_configs,保留 __meta_kubernetes_namespace、__meta_kubernetes_pod_name ,这两个维度的标签。
1、修改 Prometheus 配置
[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prom-configs
namespace: monitoring
data:
prometheus.yml: |-
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控Scheduler
- job_name: "kube-schduler"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 基于标签进行过滤
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-scheduler"
action: keep
# 修订抓取的端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:10259
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控etcd
- job_name: "kube-etcd"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "etcd"
action: keep
# 修订端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:2381
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控CoreDNS
- job_name: "kube-dns"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_k8s_app"]
regex: "kube-dns"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:9153
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
- source_labels: ["__meta_kubernetes_service_name"]
regex: "(.*)"
replacement: $1
target_label: service_name
# 监控kube-proxy
- job_name: "kube-proxy"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_k8s_app"]
regex: "kube-proxy"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:10249
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml
2、检查配置是否更新
[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控Scheduler
- job_name: "kube-schduler"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 基于标签进行过滤
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-scheduler"
action: keep
# 修订抓取的端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:10259
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控etcd
- job_name: "kube-etcd"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "etcd"
action: keep
# 修订端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:2381
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控CoreDNS
- job_name: "kube-dns"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_k8s_app"]
regex: "kube-dns"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:9153
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
- source_labels: ["__meta_kubernetes_service_name"]
regex: "(.*)"
replacement: $1
target_label: service_name
# 监控kube-proxy
- job_name: "kube-proxy"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_k8s_app"]
regex: "kube-proxy"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:10249
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
3、重新加载 Prometheus,然后检查节点的 targets,能正常监控 Kubernets 的节点
[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload
4、检查 Prometheus 抓取的结果
# 7.8.3 Kube-Proxy 告警规则⽂件
1、编写告警规则⽂件
kube_proxy_rules.yml: |-
groups:
- name: kube-proxy告警规则文件
rules:
- alert: KubeProxy同步时间过长
expr: |
rate (kubeproxy_sync_proxy_rules_duration_seconds_sum[5m]) /
rate (kubeproxy_sync_proxy_rules_duration_seconds_count[5m]) > 3
for: 5m
labels:
severity: warning
annotations:
summary: "kube-proxy同步时间过长 (实例: {{ $labels.instance }})"
description: "实例 {{ $labels.instance }}, kube-proxy同步操作的平均时间超过了3秒。该Pod {{ $labels.pod_name }} 当前同步延迟:{{ $value }}/s"
- alert: Iptables规则同步失败次数过多
expr: rate(kubeproxy_sync_proxy_rules_iptables_restore_failures_total[5m]) > 10
for: 5m
labels:
severity: critical
annotations:
summary: "Iptables同步失败告警 (实例: {{ $labels.instance }})"
description: "实例 {{ $labels.instance }}, iptables规则同步失败次数超过10次。该Pod {{ $labels.pod_name }} 当前失败次数:{{ $value }}"
# 7.9 监控 Kubernetes 集群资源状态
# 7.9.1 什么是集群资源状态
集群资源状态是指, Kubernetes 集群中所有资源对象、以及这些资源对象的当前状态信息。这些资源对象,包括 Pod、Deployment、DaemonSet、StatefulSet、Job、CronJob 等 。⽽这些资源状态,则提供了这些资源的详细信息,例如:
- 1、当前集群中资源的数量;
- 2、当前集群中总共有多少个 Pod,分别处于什么状态(如 Running、Stopped、Terminated)。
- 3、有多少个 Deployment 正在运⾏,以及它们正在运⾏的 Pod 副本数量与实际期望运⾏的 Pod 副本数是否⼀致。
- 3、DaemonSet 控制的 Pod 是否已经在所有(或指定的)节点上运⾏。
- 4、Job 和 CronJob 是否按预定计划运⾏,以及执⾏成功与否。
- ...
但是在 Kubernetes 中的组件,并不提供关于资源状态的指标。因此我们需要使⽤ kube-state-metrics ,因为 kube-state-metrics 它会主动收集关于 Kubernetes 集群中的各种资源的状态信息。如 Pod、Deployment、Job 以及它们的数量、运⾏状况等,⽽后将些信息转换成 Prometheus 所兼容的指标格式,进⽽让 Prometheus 能抓取这些指标,并进⾏分析与展示。
# 7.9.2 安装 Kube-State-Metrics
kube-state-metrics 版本与 kubernetes 的版本对应关系(注意版本要兼容)
kube-state-metrics | Kubernetes client-go Version |
---|---|
v2.6.0 | v1.24 |
v2.7.0 | v1.25 |
v2.8.2 | v1.26 |
v2.9.2 | v1.26 |
v2.10.0 | v1.27 |
main | v1.28 |
1、安装 kube-state-metrics,⾸先克隆最新分⽀的源代码
[root@k8s-master01 ~]# yum install git -y
[root@k8s-master01 ~]# git clone https://github.com/kubernetes/kube-state-metrics.git
# 加速地址
[root@k8s-master01 ~]# git clone https://mirror.ghproxy.com/https://github.com/kubernetes/kube-state-metrics.git
2、修改 kube-state-metrics/examples/standard/deployment.yaml 镜像为国内的镜像
[root@k8s-master01 ~]# cat kube-state-metrics/examples/standard/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 2.11.0
name: kube-state-metrics
namespace: kube-system
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: kube-state-metrics
template:
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 2.11.0
spec:
automountServiceAccountToken: true
containers:
# - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.11.0
- image: uhub.service.ucloud.cn/oldxu/kube-state-metrics:v2.10.0
...
3、应⽤资源清单⽂件
[root@k8s-master01 ~]# kubectl apply -f kube-state-metrics/examples/standard/
clusterrolebinding.rbac.authorization.k8s.io/kube-state-metrics created
clusterrole.rbac.authorization.k8s.io/kube-state-metrics created
deployment.apps/kube-state-metrics created
serviceaccount/kube-state-metrics created
service/kube-state-metrics created
4、 Kube-state-metrics 的 Pod 运⾏在 kube-system 名称空间
[root@k8s-master01 standard]# kubectl get pods,svc -n kube-system -l app.kubernetes.io/name=kube-state-metrics
NAME READY STATUS RESTARTS AGE
pod/kube-state-metrics-5864c7d699-mwbp4 1/1 Running 0 2m15s
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
service/kube-state-metrics ClusterIP None <none> 8080/TCP,8081/TCP 2m15s
5、 Kube-state-metrics 有提供 Service ,因此我们需要获取 Service 的标签(labels),以便 Prometheus 只抓取提供 Kube-state-metrics 服务的实例。
[root@k8s-master01 standard]# curl -s http://172.16.85.199:8080/metrics | head -n 10
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 2.1866e-05
go_gc_duration_seconds{quantile="0.25"} 8.7894e-05
go_gc_duration_seconds{quantile="0.5"} 0.000165906
go_gc_duration_seconds{quantile="0.75"} 0.000179164
go_gc_duration_seconds{quantile="1"} 0.000420888
go_gc_duration_seconds_sum 0.001599982
go_gc_duration_seconds_count 10
# HELP go_goroutines Number of goroutines that currently exist.
[root@k8s-master01 standard]# kubectl describe service -n kube-system kube-state-metrics
Name: kube-state-metrics
Namespace: kube-system
Labels: app.kubernetes.io/component=exporter
app.kubernetes.io/name=kube-state-metrics
app.kubernetes.io/version=2.11.0
...
# 7.9.3 配置 Prometheus 监控 KSM
- 1、添加⼀个新的 Job,名为: kube-state-metrics ,metrics 路径是 /metrics ,协议是 http
- 2、基于 Kubernetes 的 endpoints 来⾃动发现所有的 endpoints 端点;
- 3、使⽤ relabel_configs,仅保留标签名 __meta_kubernetes_service_label_app_kubernetes_io_name ,标签值是 kube-state-metrics 的实例。
- 4、使⽤ relabel_configs,重新调整 Pod 的端⼝,将标签 __meta_kubernetes_pod_ip 修改为 IP:8080 端⼝。
- 5、使⽤ relabel_configs,映射 _meta_kubernetes_service_label (.*) ,所有的标签以及标签值。
1、配置 Prometheus
[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prom-configs
namespace: monitoring
data:
prometheus.yml: |-
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控Scheduler
- job_name: "kube-schduler"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 基于标签进行过滤
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-scheduler"
action: keep
# 修订抓取的端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:10259
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控etcd
- job_name: "kube-etcd"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "etcd"
action: keep
# 修订端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:2381
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控CoreDNS
- job_name: "kube-dns"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_k8s_app"]
regex: "kube-dns"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:9153
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
- source_labels: ["__meta_kubernetes_service_name"]
regex: "(.*)"
replacement: $1
target_label: service_name
# 监控kube-proxy
- job_name: "kube-proxy"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_k8s_app"]
regex: "kube-proxy"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:10249
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控kube-state-metrics
- job_name: "kube-state-metrics"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_app_kubernetes_io_name"]
regex: "kube-state-metrics"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:8080
target_label: __address__
# 添加维度标签
- regex: __meta_kubernetes_service_label_(.*)
action: labelmap
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml
2、检查配置是否更新
[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控Scheduler
- job_name: "kube-schduler"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 基于标签进行过滤
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-scheduler"
action: keep
# 修订抓取的端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:10259
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控etcd
- job_name: "kube-etcd"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "etcd"
action: keep
# 修订端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:2381
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控CoreDNS
- job_name: "kube-dns"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_k8s_app"]
regex: "kube-dns"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:9153
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
- source_labels: ["__meta_kubernetes_service_name"]
regex: "(.*)"
replacement: $1
target_label: service_name
# 监控kube-proxy
- job_name: "kube-proxy"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_k8s_app"]
regex: "kube-proxy"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:10249
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控kube-state-metrics
- job_name: "kube-state-metrics"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_app_kubernetes_io_name"]
regex: "kube-state-metrics"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:8080
target_label: __address__
# 添加维度标签
- regex: __meta_kubernetes_service_label_(.*)
action: labelmap
3、重新加载 Prometheus,然后检查节点的 targets,能正常监控 Kubernets 的节点
[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload
4、检查 Prometheus 抓取的结果
# 7.9.4 资源状态告警规则⽂件
1、编写告警规则⽂件
kube_state_metrics_rules.yml: |-
groups:
- name: KSM告警规则文件
rules:
- alert: 节点kubelet未就绪
expr: kube_node_status_condition{condition="Ready", status="true"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "节点 {{ $labels.node }} kubelet未就绪"
description: "节点 {{ $labels.node }} kubelet已经超过5分钟未处于就绪状态,需要立即检查。"
- alert: 节点内存压力大
expr: kube_node_status_condition{condition="MemoryPressure", status="true"} == 1
for: 5m
labels:
severity: warning
annotations:
summary: "节点 {{ $labels.node }} 内存压力过高"
description: "节点 {{ $labels.node }} 正在经历内存压力,可能需要增加内存资源或减少工作负载。"
- alert: 节点网络压力大
expr: kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1
for: 5m
labels:
severity: warning
annotations:
summary: "节点 {{ $labels.node }} 网络压力过高"
description: "节点 {{ $labels.node }} 的网络接压力过大,可能存在网络瓶颈。"
- alert: 节点磁盘压力大
expr: kube_node_status_condition{condition="DiskPressure", status="true"} == 1
for: 5m
labels:
severity: warning
annotations:
summary: "节点 {{ $labels.node }}磁盘压力过高"
description: "节点 {{ $labels.node }} 正在经历磁盘压力,磁盘空间或inode可能不足。"
- alert: 节点PID压力大
expr: kube_node_status_condition{condition="PIDPressure", status="true"} == 1
for: 5m
labels:
severity: warning
annotations:
summary: "节点 {{ $labels.node }} PID压力过高"
description: "节点 {{ $labels.node }} 上的进程数可能已经达到上限。"
- alert: 启动失败的Pod
expr: sum (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) by (job,namespace, pod) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Pod启动失败"
description: "'{{ $labels.namespace }}' 名称空间中的Pod '{{ $labels.pod }}'启动失败。"
- alert: 因为OOM重启的Pod
expr: |
(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1)
and ignoring (reason)
min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) >= 1
for: 10m
labels:
severity: warning
annotations:
summary: "{{ $labels.pod }} Pod因OOM重启"
description: "'{{ $labels.namespace }}' 名称空间中的Pod '{{ $labels.pod }}' 触发了OOM造成Pod重启,触发OOM的容器是 '{{ $labels.container }}'。"
- alert: Deployment副本数不一致
expr: kube_deployment_spec_replicas - kube_deployment_status_replicas_available > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Deployment副本数不一致"
description: "'{{ $labels.namespace }}' 名称空间中 {{ $labels.deployment }} 部署副本数与期望副本数不一致,当前偏差了{{ $value }}个副本。"
- alert: DaemonSet副本数不一致
expr: |
kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100
or
kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
for: 5m
labels:
severity: warning
annotations:
summary: "DaemonSet副本数不一致"
description: "'{{ $labels.namespace }}' 名称空间中 {{ $labels.daemonset }} 期望副本数与实际运行副本数不一致。"
- alert: DaemonSet调度出现错误
expr: kube_daemonset_status_number_misscheduled > 0
for: 5m
labels:
severity: warning
annotations:
summary: "DaemonSet调度错误"
description: "'{{ $labels.namespace }}' 名称空间中 {{ $labels.daemonset }} 调度了错误的Pod。"
- alert: StatefulSet副本数异常
expr: |
kube_statefulset_status_replicas_ready / kube_statefulset_replicas * 100 < 100
or
kube_statefulset_replicas - kube_statefulset_status_replicas_current > 0
for: 5m
labels:
severity: warning
annotations:
summary: "StatefulSet副本数异常"
description: "'{{ $labels.namespace }}' 名称空间中 {{ $labels.statefulset }} 期望副本数与实际运行副本数不一致。"
- alert: PV异常
expr: kube_persistentvolume_status_phase{phase="Failed"} > 0
for: 5m
labels:
severity: critical
annotations:
summary: "PV异常"
description: "持久卷'{{ $labels.persistentvolume }}'处于Failed状态。"
- alert: PVC异常
expr: kube_persistentvolumeclaim_status_phase{phase=~"Lost|Pending"} > 0
for: 5m
labels:
severity: warning
annotations:
summary: "PVC异常"
description: "'{{ $labels.namespace }}' 名称空间中 {{ $labels.persistentvolumeclaim }} 持久卷(PVC)处于 {{ $labels.phase }}状态。"
- alert: Job完成度低
expr: kube_job_status_succeeded / kube_job_spec_completions * 100 < 75
for: 5m
labels:
severity: warning
annotations:
summary: "Job完成度低于75%"
description: "'{{ $labels.namespace }}' 名称空间中 {{ $labels.job_name }} Job任务完成度低于预期的75%。"
- alert: Job失败次数高
expr: kube_job_status_failed > 5
for: 5m
labels:
severity: critical
annotations:
summary: "Job失败次数过高"
description: "'{{ $labels.namespace }}' 名称空间中 {{ $labels.job_name }} Job任务执行失败次数超过5次以上,当前失败 {{ $value }} 次。"
# 7.9.5 导⼊资源状态图形
ID:13332 或 15757
# 7.10 监控 Kubernetes 集群 Pod 资源
# 7.10.1 Pod 资源是什么
所谓 Pod 资源,指的是,运⾏在 Pod 中的 "容器" 所使⽤的计算资源。这些计算资源指的是 CPU、内存、⽹络以及磁盘 IO 等相关指标。之前我们监控容器的资源时,使⽤的是 cAdvisor ⼯具来监控。不过在 Kubernetes 中,Cadvisor ⼯具已经被内置到了 kubelet 的组件中。因此,我们可以直接监控节点的 kubelet,来收集相关 Pod 的指标数据。
kubelet 的指标可以通过以下⽅式访问:
- 端⼝:10250,kubelet ⽤于指标数据。
- 协议:HTTPS,确保数据传输的安全性(因此需要进⾏认证才可以抓取数据)。
- 路径: /metrics/cadvisor ,特定的 URL 路径,我们需要从这个路径获取 cAdvisor 提供的指标数据。
# 7.10.2 配置 Prometheus 监控 Pod
- 1、添加⼀个新的 Job,名为: kube-kubelet ,metrics 路径是 /metrics/cadvisor ,协议是 https
- 2、基于 Kubernetes 的 node ⽅式来发现所有的主机实例,由于 kubelet 采⽤的是 HTTPS,因此还需要指定 TLS 相关的配置;
- 3、使⽤ relabel_configs,映射 _meta_kubernetes_node_label (.*) ,所有的标签以及标签值。
[root@k8s-master01 04-prometheus]# TOKEN=$(kubectl create token -n monitoring prometheus-sa)
[root@k8s-master01 04-prometheus]# curl -s -k -H "Authorization: Bearer $TOKEN" https://192.168.40.101:10250/metrics|head -n 10
# HELP aggregator_discovery_aggregation_count_total [ALPHA] Counter of number of times discovery was aggregated
# TYPE aggregator_discovery_aggregation_count_total counter
aggregator_discovery_aggregation_count_total 0
# HELP apiserver_audit_event_total [ALPHA] Counter of audit events generated and sent to the audit backend.
# TYPE apiserver_audit_event_total counter
apiserver_audit_event_total 0
# HELP apiserver_audit_requests_rejected_total [ALPHA] Counter of apiserver requests rejected due to an error in audit logging backend.
# TYPE apiserver_audit_requests_rejected_total counter
apiserver_audit_requests_rejected_total 0
# HELP apiserver_client_certificate_expiration_seconds [ALPHA] Distribution of the remaining lifetime on the certificate used to authenticate a request.
1、配置 Prometheus
[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prom-configs
namespace: monitoring
data:
prometheus.yml: |-
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控Scheduler
- job_name: "kube-schduler"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 基于标签进行过滤
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-scheduler"
action: keep
# 修订抓取的端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:10259
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控etcd
- job_name: "kube-etcd"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "etcd"
action: keep
# 修订端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:2381
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控CoreDNS
- job_name: "kube-dns"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_k8s_app"]
regex: "kube-dns"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:9153
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
- source_labels: ["__meta_kubernetes_service_name"]
regex: "(.*)"
replacement: $1
target_label: service_name
# 监控kube-proxy
- job_name: "kube-proxy"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_k8s_app"]
regex: "kube-proxy"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:10249
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控kube-state-metrics
- job_name: "kube-state-metrics"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_app_kubernetes_io_name"]
regex: "kube-state-metrics"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:8080
target_label: __address__
# 添加维度标签
- regex: __meta_kubernetes_service_label_(.*)
action: labelmap
# 监控kubelet(Pod)
- job_name: "kube-kubelet"
metrics_path: "/metrics/cadvisor"
scheme: https
kubernetes_sd_configs:
- role: node
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 添加标签的映射
relabel_configs:
- regex: __meta_kubernetes_node_label_(.*)
action: labelmap
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml
2、检查配置是否更新
[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控Scheduler
- job_name: "kube-schduler"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 基于标签进行过滤
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-scheduler"
action: keep
# 修订抓取的端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:10259
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控etcd
- job_name: "kube-etcd"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "etcd"
action: keep
# 修订端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:2381
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控CoreDNS
- job_name: "kube-dns"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_k8s_app"]
regex: "kube-dns"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:9153
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
- source_labels: ["__meta_kubernetes_service_name"]
regex: "(.*)"
replacement: $1
target_label: service_name
# 监控kube-proxy
- job_name: "kube-proxy"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_k8s_app"]
regex: "kube-proxy"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:10249
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控kube-state-metrics
- job_name: "kube-state-metrics"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_app_kubernetes_io_name"]
regex: "kube-state-metrics"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:8080
target_label: __address__
# 添加维度标签
- regex: __meta_kubernetes_service_label_(.*)
action: labelmap
# 监控kubelet(Pod)
- job_name: "kube-kubelet"
metrics_path: "/metrics/cadvisor"
scheme: https
kubernetes_sd_configs:
- role: node
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 添加标签的映射
relabel_configs:
- regex: __meta_kubernetes_node_label_(.*)
action: labelmap
3、重新加载 Prometheus,然后检查节点的 targets,能正常监控 Kubernets 的节点
[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload
4、检查 Prometheus 抓取的结果
3、检查此前创建好的 kube_pods_rules.yml 告警规则⽂件
kube_pods_rules.yml: |-
groups:
- name: Pods的告警规则文件
rules:
- alert: Pod中容器的CPU利用率高
expr: sum (rate(container_cpu_usage_seconds_total{image!=""}[5m])) by (instance,job,pod,namespace) * 100 > 80
for: 1m
labels:
severity: warning
annotations:
summary: "在 '{{ $labels.instance }}' 节点上运行的 '{{ $labels.pod }}' Pod CPU利用率高"
description: "在 '{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod }}' Pod的CPU利用率当前为 {{ $value }}%,超过了80%的阈值。"
- alert: Pod中容器内存利用率高
expr: |
sum(container_memory_working_set_bytes{name!=""}) by (instance,job,pod,namespace)
/
sum(container_spec_memory_limit_bytes{name!=""} > 0) by (instance,job,pod,namespace) * 100 > 80
for: 1m
labels:
severity: warning
annotations:
summary: "在 '{{ $labels.instance }}' 节点上运行的 '{{ $labels.pod }}' Pod内存利用率高"
description: 在 '{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod }}' Pod的内存最大限制是 {{ printf `sum (container_spec_memory_limit_bytes{namespace="%s",pod="%s"} > 0 ) /1024 /1024` $labels.namespace $labels.pod | query | first | value }}MB , 目前利用率已达{{ $value }}%,超过限制的80%。
- alert: Pod容器网络发送速率过高
expr: sum(rate(container_network_transmit_bytes_total{image!=""}[1m])) by (instance,job,pod,namespace) * 8 /1024 /1024 > 50
for: 1m
labels:
severity: warning
annotations:
summary: "在 '{{ $labels.instance }}' 节点上运行的 '{{ $labels.pod }}' Pod网络发送速率过高"
description: "在 '{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod }}' Pod的网络发送速率达到{{ $value }}Mbps,超过了50Mbps的阈值。"
- alert: Pod容器网络接收速率过高
expr: sum(rate(container_network_receive_bytes_total{image!=""}[1m])) by (instance,job,pod,namespace) * 8 /1024 /1024 > 50
for: 1m
labels:
severity: warning
annotations:
summary: "在 '{{ $labels.instance }}' 节点上运行的 '{{ $labels.pod }}' Pod网络发送速率过高"
description: "在 '{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod }}' Pod的网络接收速率达到{{ $value }}Mbps,超过了50Mbps的阈值。"
- alert: Pod容器磁盘写入吞吐量过大
expr: sum (rate(container_fs_writes_bytes_total{name!=""}[1m])) by (instance,job,pod,namespace) /1024 /1024 > 20
for: 1m
labels:
severity: warning
annotations:
summary: "在 '{{ $labels.instance }}' 节点上运行的 '{{ $labels.pod }}' Pod磁盘写入吞吐量过大"
description: "在 '{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod }}' Pod的磁盘写入吞吐量达到{{ $value }}MB/s,超过了20MB/s的阈值。"
- alert: Pod容器磁盘读取吞吐量过大
expr: sum (rate(container_fs_reads_bytes_total{name!=""}[1m])) by (instance,job,pod,namespace) /1024 /1024 > 20
for: 1m
labels:
severity: warning
annotations:
summary: "在 '{{ $labels.instance }}' 节点上运行的 '{{ $labels.pod }}' Pod磁盘读取吞吐量过大"
description: "在 '{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod }}' Pod的磁盘读取吞吐量达到{{ $value }}MB/s,超过了20MB/s的阈值。"
# 7.11 监控 Kubernetes 集群 Service 资源
# 7.11.1 为何需要监控 Service 资源
监控 Service 资源是为了确保我们的服务,时刻处于持续运⾏状态,通常关注如下两个维度;
● 1、可⽤性:确保 Service 始终是可以被访问的,从⽽保障服务的连续性。
● 2、性能:监控 Service 的响应时间,确保其处理请求的速度,始终处于⼀个稳定的时间。
为了确保 Service 的服务始终可⽤,我们通常会采⽤ Blackbox 的 TCP 探测⽅法来进⾏监控。
之前 Blackbox 监控⽅式⽐较固定,不够灵活。为了改进这⼀点,在监控 Service 时,我们可以将监控⽬标配置为⾃动发现机制。这样,每当有新的 Service 出现或现有 Service 发⽣变化时,监控系统能够⾃动识别并开始监控,⽽不需要⼿动更新 Prometheus 的配置。
- job_name: 'blackbox_http'
metrics_path: /probe # metrics的path这次不是/metrics,⽽是/probe
params: # 传递参数
module: [http_2xx] # 调⽤哪个模块进⾏探测
static_configs:
- targets: ["https://www.xuliangwei.com","http://www.oldxu.net","https://www.baidu.com","http://httpbin.org/status/400","https://httpstat.us/500","https://httpstat.us/502"]
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: prom-node04.oldxu.net:9115
# relabel_configs是标签重写的配置,这⾥进⾏了三次操作:
# 1、将⽬标地址(__address__)赋予给__param_target,这是Blackbox Exporter需要的⽬标target参数。
# 2、将__param_target的内容复制到instance标签,这样Prometheus UI中显示的instance实例名称会是⽬标站点地址,⽽不是Blackbox的地址。
# 3、最后,将实际发送探测请求的地址(__address__)设置为运⾏Blackbox Exporter的节点地址和端⼝(prom-node04.oldxu.net:9115),这样Prometheus就会向这个地址发送探测请求。
# 7.11.2 配置 Prometheus 监控 Service
1、添加⼀个新的 Job,名为: kube-blackbox-tcp ,协议是 tcp , metrics 路径是 /probe
2、基于 Kubernetes 的 service 来实现⾃动发现所有的 Service,⽽后进⾏⾃动监控;
3、使⽤ relabel_configs,保留 __meta_kubernetes_namespace、__meta_kubernetes_service_name 这两个维度的标签。
1、配置 Prometheus
[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prom-configs
namespace: monitoring
data:
prometheus.yml: |-
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控Scheduler
- job_name: "kube-schduler"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 基于标签进行过滤
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-scheduler"
action: keep
# 修订抓取的端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:10259
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控etcd
- job_name: "kube-etcd"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "etcd"
action: keep
# 修订端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:2381
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控CoreDNS
- job_name: "kube-dns"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_k8s_app"]
regex: "kube-dns"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:9153
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
- source_labels: ["__meta_kubernetes_service_name"]
regex: "(.*)"
replacement: $1
target_label: service_name
# 监控kube-proxy
- job_name: "kube-proxy"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_k8s_app"]
regex: "kube-proxy"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:10249
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控kube-state-metrics
- job_name: "kube-state-metrics"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_app_kubernetes_io_name"]
regex: "kube-state-metrics"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:8080
target_label: __address__
# 添加维度标签
- regex: __meta_kubernetes_service_label_(.*)
action: labelmap
# 监控kubelet(Pod)
- job_name: "kube-kubelet"
metrics_path: "/metrics/cadvisor"
scheme: https
kubernetes_sd_configs:
- role: node
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 添加标签的映射
relabel_configs:
- regex: __meta_kubernetes_node_label_(.*)
action: labelmap
# 监控service
- job_name: "kube-blackbox-tcp"
metrics_path: "/probe"
params:
module: [tcp_connect] # 使用tcp_connect模块
kubernetes_sd_configs:
- role: service
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-svc:9115
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: (.*)
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_service_name]
regex: (.*)
replacement: $1
target_label: service_name
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml
2、检查配置是否更新
[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控Scheduler
- job_name: "kube-schduler"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 基于标签进行过滤
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-scheduler"
action: keep
# 修订抓取的端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:10259
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控etcd
- job_name: "kube-etcd"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "etcd"
action: keep
# 修订端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:2381
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控CoreDNS
- job_name: "kube-dns"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_k8s_app"]
regex: "kube-dns"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:9153
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
- source_labels: ["__meta_kubernetes_service_name"]
regex: "(.*)"
replacement: $1
target_label: service_name
# 监控kube-proxy
- job_name: "kube-proxy"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_k8s_app"]
regex: "kube-proxy"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:10249
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控kube-state-metrics
- job_name: "kube-state-metrics"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_app_kubernetes_io_name"]
regex: "kube-state-metrics"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:8080
target_label: __address__
# 添加维度标签
- regex: __meta_kubernetes_service_label_(.*)
action: labelmap
# 监控kubelet(Pod)
- job_name: "kube-kubelet"
metrics_path: "/metrics/cadvisor"
scheme: https
kubernetes_sd_configs:
- role: node
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 添加标签的映射
relabel_configs:
- regex: __meta_kubernetes_node_label_(.*)
action: labelmap
# 监控service
- job_name: "kube-blackbox-tcp"
metrics_path: "/probe"
params:
module: [tcp_connect] # 使用tcp_connect模块
kubernetes_sd_configs:
- role: service
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-svc:9115
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: (.*)
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_service_name]
regex: (.*)
replacement: $1
target_label: service_name
3、重新加载 Prometheus,然后检查节点的 targets,能正常监控 Kubernets 的节点
[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload
4、检查 Prometheus 抓取的结果
5、检查此前创建好的 kube_blackbox_tcp_rules.yml 告警规则⽂件
blackbox_tcp_rules.yml: |-
groups:
- name: Blackbox_tcp告警规则文件
rules:
- alert: Service TCP探测失败
expr: sum(probe_success{job=~".*tcp"}) by (instance,job,namespace,service_name) == 0
for: 5m
labels:
severity: critical
annotations:
summary: "探测 '{{ $labels.instance }}' Service 的TCP接口探测失败。"
description: "'{{ $labels.namespace }}' 名称空间中的 '{{ $labels.service_name }}' Service资源 '{{ $labels.instance }}' 地址探测失败。"
- alert: Service TCP请求的响应时间过长
expr: probe_duration_seconds{job=~".*tcp"} > 0.500
for: 5m
labels:
severity: critical
annotations:
summary: "探测 '{{ $labels.instance }}' Service 的TCP响应时间超过了500毫秒。"
description: "'{{ $labels.namespace }}' 名称空间中的 '{{ $labels.service_name }}' Service资源 '{{ $labels.instance }}' 当前响应时长为 {{ $value }} 秒。"
- alert: Service的DNS解析响应时间过长
expr: probe_dns_lookup_time_seconds{job=~".*tcp"} > 0.500
for: 5m
labels:
severity: critical
annotations:
summary: "探测 '{{ $labels.instance }}' Service 的DNS解析响应时间超过了500毫秒。"
description: "'{{ $labels.namespace }}' 名称空间中的 '{{ $labels.service_name }}' Service资源 '{{ $labels.instance }}' 当前响应时长为 {{ $value }} 秒。"
# 7.12 监控 Kubernetes 集群 Ingress 资源
# 7.12.1 为何需要监控 Ingress 资源
监控 Ingress 对应的域名,主要是为了确保⽤户,能时刻访问到对应域名所提供的服务;监控域名维度如下;
HTTP 请求延迟:监控站点处理请求的延迟,如果请求延迟过⾼可以推送告警消息,这样就可以第⼀时间进⾏处理。
证书过期时间:监控 TLS/SSL 证书的有效期,以便及时更新证书,避免因为证书过期造成访问中断。
可⽤性:持续检查 Ingress 绑定的域名是否可被持续访问,确保⽤户的访问不会中断。
为了实现这样的监控,通常会利⽤ Blackbox Exporter 的 HTTP 探测功能来检查 Ingress 对应域名的健康状态和响应时间。
之前 Blackbox 监控⽅式⽐较固定,不够灵活。为了改进这⼀点,在监控 Ingress 时,我们可以将监控⽬标配置为⾃动发现机制。这样,每当有新的 ingress 出现或现有 ingress 发⽣变化时,监控系统能够⾃动识别并开始监控,⽽不需要⼿动更新 Prometheus 的配置。
- job_name: 'blackbox_http'
metrics_path: /probe # metrics的path这次不是/metrics,⽽是/probe
params: # 传递参数
module: [http_2xx] # 调⽤哪个模块进⾏探测
static_configs:
- targets: ["https://www.xuliangwei.com","http://www.oldxu.net","https://www.baidu.com","http://httpbin.org/status/400","https://httpstat.us/500","https://httpstat.us/502"]
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: prom-node04.oldxu.net:9115
# relabel_configs是标签重写的配置,这⾥进⾏了三次操作:
# 1、将⽬标地址(__address__)赋予给__param_target,这是Blackbox Exporter需要的⽬标target参数。
# 2、将__param_target的内容复制到instance标签,这样Prometheus UI中显示的instance实例名称会是⽬标站点地址,⽽不是Blackbox的地址。
# 3、最后,将实际发送探测请求的地址(__address__)设置为运⾏Blackbox Exporter的节点地址和端⼝(prom-node04.oldxu.net:9115),这样Prometheus就会向这个地址发送探测请求。
# 7.12.2 配置 Prometheus 监控 Ingress
- 1、添加⼀个新的 Job,名为: kube-blackbox-http ,协议是 http
- 2、基于 Kubernetes 的 ingress 来⾃动发现所有的 Ingress 资源,⽽后进⾏监控。
- 3、使⽤ relabel_configs,保留 __meta_kubernetes_namespace、__meta_kubernetes_ingress_name、__meta_kubernetes_ingress_class_name 这三个维度的标签。
1、配置 Prometheus
[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prom-configs
namespace: monitoring
data:
prometheus.yml: |-
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控Scheduler
- job_name: "kube-schduler"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 基于标签进行过滤
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-scheduler"
action: keep
# 修订抓取的端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:10259
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控etcd
- job_name: "kube-etcd"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "etcd"
action: keep
# 修订端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:2381
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控CoreDNS
- job_name: "kube-dns"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_k8s_app"]
regex: "kube-dns"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:9153
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
- source_labels: ["__meta_kubernetes_service_name"]
regex: "(.*)"
replacement: $1
target_label: service_name
# 监控kube-proxy
- job_name: "kube-proxy"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_k8s_app"]
regex: "kube-proxy"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:10249
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控kube-state-metrics
- job_name: "kube-state-metrics"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_app_kubernetes_io_name"]
regex: "kube-state-metrics"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:8080
target_label: __address__
# 添加维度标签
- regex: __meta_kubernetes_service_label_(.*)
action: labelmap
# 监控kubelet(Pod)
- job_name: "kube-kubelet"
metrics_path: "/metrics/cadvisor"
scheme: https
kubernetes_sd_configs:
- role: node
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 添加标签的映射
relabel_configs:
- regex: __meta_kubernetes_node_label_(.*)
action: labelmap
# 监控service
- job_name: "kube-blackbox-tcp"
metrics_path: "/probe"
params:
module: [tcp_connect] # 使用tcp_connect模块
kubernetes_sd_configs:
- role: service
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-svc:9115
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: (.*)
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_service_name]
regex: (.*)
replacement: $1
target_label: service_name
# 监控ingress
- job_name: "kube-blackbox-http"
metrics_path: "/probe"
params:
module: [http_2xx] # 使用http模块进行探测
kubernetes_sd_configs:
- role: ingress
relabel_configs:
# 协议有可能是http或https,因此需要根据抓取到的协议+端⼝,拼接出具体的探测示例
- source_labels: [__meta_kubernetes_ingress_scheme,__address__]
regex: (.*);(.*)
replacement: $1://$2
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-svc:9115
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
target_label: namespace
- source_labels: [__meta_kubernetes_ingress_name]
target_label: ingress_name
- source_labels: [__meta_kubernetes_ingress_class_name]
target_label: ingress_class_name
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml
2、检查配置是否更新
[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控Scheduler
- job_name: "kube-schduler"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 基于标签进行过滤
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-scheduler"
action: keep
# 修订抓取的端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:10259
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控etcd
- job_name: "kube-etcd"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "etcd"
action: keep
# 修订端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:2381
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控CoreDNS
- job_name: "kube-dns"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_k8s_app"]
regex: "kube-dns"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:9153
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
- source_labels: ["__meta_kubernetes_service_name"]
regex: "(.*)"
replacement: $1
target_label: service_name
# 监控kube-proxy
- job_name: "kube-proxy"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_k8s_app"]
regex: "kube-proxy"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:10249
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控kube-state-metrics
- job_name: "kube-state-metrics"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_app_kubernetes_io_name"]
regex: "kube-state-metrics"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:8080
target_label: __address__
# 添加维度标签
- regex: __meta_kubernetes_service_label_(.*)
action: labelmap
# 监控kubelet(Pod)
- job_name: "kube-kubelet"
metrics_path: "/metrics/cadvisor"
scheme: https
kubernetes_sd_configs:
- role: node
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 添加标签的映射
relabel_configs:
- regex: __meta_kubernetes_node_label_(.*)
action: labelmap
# 监控service
- job_name: "kube-blackbox-tcp"
metrics_path: "/probe"
params:
module: [tcp_connect] # 使用tcp_connect模块
kubernetes_sd_configs:
- role: service
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-svc:9115
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: (.*)
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_service_name]
regex: (.*)
replacement: $1
target_label: service_name
# 监控ingress
- job_name: "kube-blackbox-http"
metrics_path: "/probe"
params:
module: [http_2xx] # 使用http模块进行探测
kubernetes_sd_configs:
- role: ingress
relabel_configs:
# 协议有可能是http或https,因此需要根据抓取到的协议+端⼝,拼接出具体的探测示例
- source_labels: [__meta_kubernetes_ingress_scheme,__address__]
regex: (.*);(.*)
replacement: $1://$2
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-svc:9115
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
target_label: namespace
- source_labels: [__meta_kubernetes_ingress_name]
target_label: ingress_name
- source_labels: [__meta_kubernetes_ingress_class_name]
target_label: ingress_class_name
3、重新加载 Prometheus,然后检查节点的 targets,能正常监控 Kubernets 的节点
[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload
4、检查 Prometheus 抓取的结果
5、检查此前创建好的 blackbox_http_rules.yml 告警规则⽂件
blackbox_http_rules.yml: |-
groups:
- name: Blackbox_http告警规则文件
rules:
- alert: 站点平均请求过长
expr: sum (avg_over_time(probe_http_duration_seconds[1m])) by (instance,job,namespace,ingress_name) > 3
for: 1m
labels:
severity: warning
annotations:
summary: "'{{ $labels.instance }}' 域名整体请求时间超过了3秒。"
description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名最近1分钟的平均请求时间超过3秒。当前平均请求时间:{{ $value }}秒。"
- alert: 站点阶段耗时过长
expr: |
(
probe_http_duration_seconds{phase="connect"} > 1 or
probe_http_duration_seconds{phase="processing"} > 1 or
probe_http_duration_seconds{phase="resolve"} > 1 or
probe_http_duration_seconds{phase="tls"} > 1 or
probe_http_duration_seconds{phase="transfer"} > 1
)
for: 1m
labels:
severity: warning
annotations:
summary: "'{{ $labels.instance }}' 域名在 '{{ $labels.phase }}' 阶段耗时过长"
description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名在阶段 '{{ $labels.phase }}' 的耗时超过0.5秒。当前耗时:{{ $value }}秒。"
- alert: 站点响应状态码异常
expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
for: 5m
labels:
severity: critical
annotations:
summary: "'{{ $labels.instance }}' 域名返回异常状态码"
description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名返回的状态码为 {{ $value }},表明请求可能存在问题。"
- alert: 重定向次数过多
expr: probe_http_redirects > 5
for: 5m
labels:
severity: warning
annotations:
summary: "'{{ $labels.instance }}' 域名重定向次数过多"
description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名在最近的探测中重定向次数超过5次。当前次数:{{ $value }}次。"
- alert: 证书即将过期<30
expr: (probe_ssl_earliest_cert_expiry - time()) /86400 < 30
for: 24h
labels:
severity: critical
annotations:
summary: "'{{ $labels.instance }}' 域名的 SSL 证书即将过期"
description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名的 SSL 证书将在 {{ $value }} 天内过期。"
- alert: 证书即将过期<7
expr: (probe_ssl_earliest_cert_expiry - time()) /86400 < 7
for: 24h
labels:
severity: critical
annotations:
summary: "{{ $labels.instance }}' 域名的 SSL 证书即将过期"
description: "{{ $labels.namespace }} 名称空间 {{ $labels.instance }}' 域名的 SSL 证书将在 {{ $value }} 天内过期。"
# 8、Prometheus 监控 Redis 应⽤的 Pod
# 8.1 监控 Redis 应⽤场景说明
运⾏ redis 的 Pod,⽽后为其创建⼀个 Service,我们现在希望监控它的 Pod 状态,Pod 资源的使⽤,redis 的指标、以及 redis 的 Service 的延迟和存活性。
- 1、redis 应⽤指标:可以在 Pod 中注⼊⼀个 redis_exporter 来抓取对应的 redis 指标,并暴露给 Prometheus;
- 2、pod 的运⾏状态:kube-state-metrics 能⾃动获取,因此⽆需考虑;
- 3、pod 的资源使⽤:kubelet 的 CadVisor 能⾃动获取到每个 Pod 的资源使⽤,因此⽆需考虑;
- 4、Pod 的存活性以及延迟:可以通过 Blackbox 来监控 6379、9121 端⼝,在此前创建的 Blackbox-tcp 会⾃动的将这些 Service 资源给监控起来,因此⽆需考虑。
# 8.2 运⾏ Redis 基础服务 Pod
1、在⼀个 Pod 中同时运⾏ Redis 和 Redis_exporter,清单⽂件如下:
[root@k8s-master01 08-redis-exporter]# cat 01-redis-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: redis
namespace: default
spec:
selector:
matchLabels:
app: redis
template:
metadata:
labels:
app: redis
annotations:
prometheus.io/scrape: "true"
prometheus.io/scheme: "http"
prometheus.io/path: "/metrics"
prometheus.io/port: "9121"
spec:
containers:
- name: redis
image: redis:6
# 设定redis最大的内存,默认最大内存为0
command: ["redis-server"]
args: ["--maxmemory", "200mb"]
ports:
- containerPort: 6379
- name: redis-exporter
image: oliver006/redis_exporter:v1.57.0
ports:
- containerPort: 9121
2、创建 Service 资源,需要暴露两个端⼝
[root@k8s-master01 08-redis-exporter]# cat 02-redis-service.yaml
kind: Service
apiVersion: v1
metadata:
name: redis-svc
namespace: default
labels:
app: redis
spec:
selector:
app: redis
ports:
- name: redis
port: 6379
targetPort: 6379
- name: exporter
port: 9121
targetPort: 9121
3、查看 Service 详情
[root@k8s-master01 08-redis-exporter]# kubectl apply -f .
[root@k8s-master01 08-redis-exporter]# kubectl describe service redis-svc
Name: redis-svc
Namespace: default
Labels: app=redis
Annotations: <none>
Selector: app=redis
Type: ClusterIP
IP Family Policy: SingleStack
IP Families: IPv4
IP: 10.96.140.137
IPs: 10.96.140.137
# 端点1
Port: redis 6379/TCP
TargetPort: 6379/TCP
Endpoints: 172.16.85.208:6379
# 端点2
Port: exporter 9121/TCP
TargetPort: 9121/TCP
Endpoints: 172.16.85.208:9121
Session Affinity: None
Events: <none>
# 8.3 配置 Prometheus 监控 Redis
- 1、添加⼀个新的 Job,名为: redis-pod
- 2、基于 Kubernetes 的 endpints 来实现⾃动发现
- 3、使⽤ relabel_configs,保留 address 地址对应的端⼝是 9121 的实例。
- 4、使⽤ relabel_configs,保留 __meta_kubernetes_namespace、__meta_kubernetes_service_name、__meta_kubernetes_pod_name 这三个维度的标签。
1、配置 Prometheus
[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prom-configs
namespace: monitoring
data:
prometheus.yml: |-
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控Scheduler
- job_name: "kube-schduler"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 基于标签进行过滤
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-scheduler"
action: keep
# 修订抓取的端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:10259
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控etcd
- job_name: "kube-etcd"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "etcd"
action: keep
# 修订端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:2381
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控CoreDNS
- job_name: "kube-dns"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_k8s_app"]
regex: "kube-dns"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:9153
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
- source_labels: ["__meta_kubernetes_service_name"]
regex: "(.*)"
replacement: $1
target_label: service_name
# 监控kube-proxy
- job_name: "kube-proxy"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_k8s_app"]
regex: "kube-proxy"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:10249
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控kube-state-metrics
- job_name: "kube-state-metrics"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_app_kubernetes_io_name"]
regex: "kube-state-metrics"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:8080
target_label: __address__
# 添加维度标签
- regex: __meta_kubernetes_service_label_(.*)
action: labelmap
# 监控kubelet(Pod)
- job_name: "kube-kubelet"
metrics_path: "/metrics/cadvisor"
scheme: https
kubernetes_sd_configs:
- role: node
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 添加标签的映射
relabel_configs:
- regex: __meta_kubernetes_node_label_(.*)
action: labelmap
# 监控service
- job_name: "kube-blackbox-tcp"
metrics_path: "/probe"
params:
module: [tcp_connect] # 使用tcp_connect模块
kubernetes_sd_configs:
- role: service
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-svc:9115
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: (.*)
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_service_name]
regex: (.*)
replacement: $1
target_label: service_name
# 监控ingress
- job_name: "kube-blackbox-http"
metrics_path: "/probe"
params:
module: [http_2xx] # 使用http模块进行探测
kubernetes_sd_configs:
- role: ingress
relabel_configs:
# 协议有可能是http或https,因此需要根据抓取到的协议+端⼝,拼接出具体的探测示例
- source_labels: [__meta_kubernetes_ingress_scheme,__address__]
regex: (.*);(.*)
replacement: $1://$2
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-svc:9115
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
target_label: namespace
- source_labels: [__meta_kubernetes_ingress_name]
target_label: ingress_name
- source_labels: [__meta_kubernetes_ingress_class_name]
target_label: ingress_class_name
# 监控redis
- job_name: "kube-redis"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
# 保留端口为9121的Pod实例
relabel_configs:
- source_labels: [__address__]
regex: (.*):9121
action: keep
# 保留特定维度的标签
- source_labels: [__meta_kubernetes_namespace]
target_label: namespace
- source_labels: [__meta_kubernetes_service_name]
target_label: service_name
- source_labels: [__meta_kubernetes_pod_name]
target_label: pod_name
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml
2、检查配置是否更新
[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控Scheduler
- job_name: "kube-schduler"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 基于标签进行过滤
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-scheduler"
action: keep
# 修订抓取的端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:10259
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控etcd
- job_name: "kube-etcd"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "etcd"
action: keep
# 修订端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:2381
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控CoreDNS
- job_name: "kube-dns"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_k8s_app"]
regex: "kube-dns"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:9153
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
- source_labels: ["__meta_kubernetes_service_name"]
regex: "(.*)"
replacement: $1
target_label: service_name
# 监控kube-proxy
- job_name: "kube-proxy"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_k8s_app"]
regex: "kube-proxy"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:10249
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控kube-state-metrics
- job_name: "kube-state-metrics"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_app_kubernetes_io_name"]
regex: "kube-state-metrics"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:8080
target_label: __address__
# 添加维度标签
- regex: __meta_kubernetes_service_label_(.*)
action: labelmap
# 监控kubelet(Pod)
- job_name: "kube-kubelet"
metrics_path: "/metrics/cadvisor"
scheme: https
kubernetes_sd_configs:
- role: node
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 添加标签的映射
relabel_configs:
- regex: __meta_kubernetes_node_label_(.*)
action: labelmap
# 监控service
- job_name: "kube-blackbox-tcp"
metrics_path: "/probe"
params:
module: [tcp_connect] # 使用tcp_connect模块
kubernetes_sd_configs:
- role: service
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-svc:9115
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: (.*)
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_service_name]
regex: (.*)
replacement: $1
target_label: service_name
# 监控ingress
- job_name: "kube-blackbox-http"
metrics_path: "/probe"
params:
module: [http_2xx] # 使用http模块进行探测
kubernetes_sd_configs:
- role: ingress
relabel_configs:
# 协议有可能是http或https,因此需要根据抓取到的协议+端⼝,拼接出具体的探测示例
- source_labels: [__meta_kubernetes_ingress_scheme,__address__]
regex: (.*);(.*)
replacement: $1://$2
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-svc:9115
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
target_label: namespace
- source_labels: [__meta_kubernetes_ingress_name]
target_label: ingress_name
- source_labels: [__meta_kubernetes_ingress_class_name]
target_label: ingress_class_name
# 监控redis
- job_name: "kube-redis"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
# 保留端口为9121的Pod实例
relabel_configs:
- source_labels: [__address__]
regex: (.*):9121
action: keep
# 保留特定维度的标签
- source_labels: [__meta_kubernetes_namespace]
target_label: namespace
- source_labels: [__meta_kubernetes_service_name]
target_label: service_name
- source_labels: [__meta_kubernetes_pod_name]
target_label: pod_name
3、重新加载 Prometheus,然后检查节点的 targets,能正常监控 Kubernets 的节点
[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload
4、检查 Prometheus 抓取的结果
5、检查此前创建好的 redis_rules.yml 告警规则⽂件
redis_rules.yml: |-
groups:
- name: redis告警规则
rules:
- alert: Redis实例宕机
expr: redis_up == 0
for: 5m
labels:
severity: critical
annotations:
summary: "'{{ $labels.instance }}' Redis实例宕机"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod在过去5分钟内无法连接。"
- alert: Redis连接数过高
expr: redis_connected_clients / redis_config_maxclients * 100 > 80
for: 5m
labels:
severity: warning
annotations:
summary: "'{{ $labels.instance }}' Redis实例连接数超过80%"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod当前连接数占最大连接数的比率超过80%。当前比率: {{ $value }}%。"
- alert: Redis连接被拒绝
expr: increase(redis_rejected_connections_total[1h]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "'{{ $labels.instance }}' Redis实例有连接被拒绝"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod在过去1小时内有连接被拒绝。当前被拒绝的连接数: {{ $value }}。"
- alert: Redis内存使用率过高
expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80
for: 5m
labels:
severity: critical
annotations:
summary: "'{{ $labels.instance }}' Redis实例内存使用率超过80%"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod的内存使用率超过配置的最大内存值的80%。当前内存使用率: {{ $value }}%。"
- alert: Redis缓存命中率低
expr: |
irate(redis_keyspace_hits_total[5m])
/
(irate(redis_keyspace_hits_total[5m]) + irate(redis_keyspace_misses_total[5m])) * 100 < 90
for: 5m
labels:
severity: warning
annotations:
summary: "'{{ $labels.instance }}' Redis实例缓存命中率低于90%"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod最近5分钟内的缓存命中率低于90%。当前命中率: {{ $value }}%。"
- alert: Redis即将过期的Key数量过多
expr: |
sum(redis_db_keys_expiring) by (instance, job, namespace,pod_name,db)
/
sum(redis_db_keys) by (instance, job, namespace,pod_name,db) * 100 > 50
for: 5m
labels:
severity: warning
annotations:
summary: "'{{ $labels.instance }}' Redis实例中的 '{{ $labels.db }}' 数据库有大量即将过期的Key"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod中的 '{{ $labels.db }}' 数据库有超过50%的Key即将过期。当前过期比率: {{ $value }}%。"
- alert: RedisRDB备份失败
expr: redis_rdb_last_bgsave_status == 0
for: 5m
labels:
severity: critical
annotations:
summary: "'{{ $labels.instance }}' Redis实例 RDB备份失败"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod最近的RDB备份失败。"
- alert: RedisRDB备份时间过长
expr: redis_rdb_last_bgsave_duration_sec > 3 and redis_rdb_last_bgsave_status == 1
for: 5m
labels:
severity: warning
annotations:
summary: "'{{ $labels.instance }}' Redis实例 RDB备份成功但耗时超过3秒"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod, RDB备份成功但耗时超过了3秒。持续时间: {{ $value }}秒。"
- alert: RedisRDB备份过期
expr: (time() - redis_rdb_last_save_timestamp_seconds) > 36000
for: 24h
labels:
severity: critical
annotations:
summary: "'{{ $labels.instance }}' Redis实例超过10小时未进行RDB备份"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod已超过10小时没有生成新的RDB备份文件。"
- alert: Redis命令拒绝率过高
expr: |
sum(irate(redis_commands_rejected_calls_total[5m])) by (instance,job,namespace,pod_name)
/
sum(irate(redis_commands_total[5m])) by (instance,job,namespace,pod_name) * 100 > 25
for: 5m
labels:
severity: warning
annotations:
summary: "'{{ $labels.instance }}' Redis实例命令拒绝率超过25%"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod的命令拒绝率超过了25%。当前拒绝率: {{ $value }}%。"
- alert: Redis命令平均响应时间过长
expr: |
sum(rate(redis_commands_duration_seconds_total[5m])) by (instance,job,namespace,pod_name)
/
sum(rate(redis_commands_processed_total[5m])) by (instance,job,namespace,pod_name) > 0.250
for: 5m
labels:
severity: critical
annotations:
summary: "'{{ $labels.instance }}' Redis实例命令平均响应时间超过250ms"
description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod的执行命令平均响应时间超过了250毫秒。当前平均响应时间: {{ $value }}秒。"
# 9、Prometheus 监控 java 业务应⽤的 Pod
# 9.1 监控业务应⽤场景说明
运⾏ javaapp 的业务应⽤ Pod,⽽后为其创建⼀个 Service、Ingress,现在我们希望监控 Pod 状态,Pod 资源的使⽤,jvm 内存相关指标、Service 的 TCP 检测、以及域名响应状态、延迟、存活性等检测。
1、java 应⽤指标:在初始化容器运⾏⼀个 jmx_exporter 的容器,并将相关的 jar 和 config.yaml 共享给主容器,主容器通过 JVM 环境变量传递相关启动参数,完成 jvm 的监控;
2、pod 的运⾏状态:kube-state-metrics 能⾃动获取,因此⽆需考虑;
3、pod 的资源使⽤:kubelet 的 CadVisor 能⾃动获取到每个 Pod 的资源使⽤,因此⽆需考虑;
4、service 的存活性以及延迟:通过此前 kube-blackbox-tcp,能⾃动监控其 Service 的 8080 端⼝和 12345 端⼝;
5、ingress 的 http 状态监控,通过此前 kube-blackbox-http,能⾃动监控其 ingress 的域名状态;
# 9.2 运⾏业务应⽤容器 Pod
1、由于官⽅没有提供 jmx_prometheus 的镜像,因此需要先⾃⾏制作镜像。(也可以直接使⽤我制作好的镜像 oldxu3957/jmx_prometheus:v0.20.0 )
[root@k8s-master01 dockerfile]# wget https://repo.maven.apache.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/0.20.0/jmx_prometheus_javaagent-0.20.0.jar
[root@k8s-master01 dockerfile]# cat config.yaml
rules:
- pattern: ".*"
# Dockerfile⽂件
[root@prom-node03 jmx_exporter]# cat Dockerfile
FROM alpine:latest
ENV VERSION="0.20.0"
ENV DIR=/jmx
COPY ./config.yaml ${DIR}/config.yaml
COPY ./jmx_prometheus_javaagent-${VERSION}.jar ${DIR}/jmx_prometheus.jar
2、运⾏ java 应⽤ Pod
[root@k8s-master01 09-java-exporter]# cat 01-javaapp-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: java-app
namespace: default
spec:
replicas: 2
selector:
matchLabels:
app: java
template:
metadata:
labels:
app: java
spec:
volumes:
- name: javaagent
emptyDir: {}
initContainers: # 运⾏初始化容器(将⽂件夹整体拷⻉⾄javaagent共享卷)
- name: jmx-prometheus
image: oldxu3957/jmx_prometheus:v0.20.0
command: ["sh","-c","cp -rp /jmx /data/"]
volumeMounts:
- name: javaagent
mountPath: /data
containers:
- name: javaapp
image: oldxu3957/javaapp:v1.0
env:
- name: JAVA_TOOL_OPTIONS # 通过JAVA_TOOL_OPTIONS传递JVM相关参数
value: "-Xms100m -Xmx100m \
-javaagent:/agent/jmx/jmx_prometheus.jar=12345:/agent/jmx/config.yaml"
volumeMounts:
- name: javaagent
mountPath: /agent
ports:
- name: java
containerPort: 8080
- name: jmx
containerPort: 12345
resources:
requests:
cpu: 100m
memory: 200Mi
limits:
cpu: 100m
memory: 200Mi
# 9.3 对外发布 java 业务应⽤
1、创建 Service,需要暴露 8080 端⼝和 12345 端⼝
[root@k8s-master01 09-java-exporter]# cat 02-javaapp-service.yaml
kind: Service
apiVersion: v1
metadata:
name: javaapp-svc
namespace: default
labels:
app: java
annotations:
prometheus.io/scrape: "true"
prometheus.io/scheme: "http"
prometheus.io/path: "/metrics"
prometheus.io/port: "12345"
spec:
selector:
app: java
ports:
- name: javaapp
port: 8080
targetPort: 8080
- name: jmx
port: 12345
targetPort: 12345
2、创建 Ingress
[root@k8s-master01 09-java-exporter]# cat 03-javaapp-ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: javaapp-ingress
namespace: default
spec:
ingressClassName: "nginx"
rules:
- host: "javaapp.hmallleasing.com"
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: javaapp-svc
port:
number: 8080
3、更新资源清单
[root@k8s-master01 09-java-exporter]# kubectl apply -f .
# 9.4 配置 Prometheus 监控业务应⽤
- 1、添加⼀个新的 Job,名为: java-pod
- 2、基于 Kubernetes 的 endpints 来实现⾃动发现
- 3、使⽤ relabel_configs,仅抓取 address 地址对应的端⼝是 12345 的实例。
- 4、使⽤ relabel_configs,保留 __meta_kubernetes_namespace、__meta_kubernetes_service_name、__meta_kubernetes_pod_name 这三个维度的标签。
1、配置 Prometheus
[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prom-configs
namespace: monitoring
data:
prometheus.yml: |-
global:
scrape_interval: 15s
scrape_timeout: 15s
# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager-svc:9093"]
# 告警规则文件
rule_files:
- "/etc/prometheus/rules/*.yml"
scrape_configs:
- job_name: "prometheus"
metrics_path: "/metrics"
static_configs:
- targets: ["localhost:9090"]
# 监控Kubernetes的节点
- job_name: "kube-nodes"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: ["__address__"]
regex: "(.*):10250"
replacement: "$1:9100"
target_label: __address__
action: replace
- regex: __meta_kubernetes_node_label_(.*)
replacement: $1
action: labelmap
# 监控APIServer
- job_name: "kube-apiserver"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
insecure_skip_verify: true # 跳过证书验证
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 标签重写
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_component"] #保留label为apiserver实例
regex: "apiserver"
action: "keep"
- source_labels: ["__meta_kubernetes_namespace"] #匹配__meta_kubernetes_namespace值,并赋值给namespace
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_service_name"] #__meta_kubernetes_service_name值并赋值给service_name
regex: "(.*)"
replacement: $1
target_label: service_name
- regex: __meta_kubernetes_service_label_(.*) #通过标签映射获取标签
replacement: $1
action: labelmap
# 监控controllerManager
- job_name: "kube-controller"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 仅保留标签名是component 值为kube-controller-manager
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-controller-manager"
action: keep
# 替换抓取的实例端口为10257
- source_labels: [__address__]
regex: (.*)
replacement: $1:10257
target_label: __address__
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_pod_name]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控Scheduler
- job_name: "kube-schduler"
metrics_path: "/metrics"
scheme: https
kubernetes_sd_configs:
- role: pod
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 基于标签进行过滤
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "kube-scheduler"
action: keep
# 修订抓取的端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:10259
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控etcd
- job_name: "kube-etcd"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_component"]
regex: "etcd"
action: keep
# 修订端口
- source_labels: ["__address__"]
regex: (.*)
replacement: $1:2381
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控CoreDNS
- job_name: "kube-dns"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_k8s_app"]
regex: "kube-dns"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:9153
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
- source_labels: ["__meta_kubernetes_service_name"]
regex: "(.*)"
replacement: $1
target_label: service_name
# 监控kube-proxy
- job_name: "kube-proxy"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: pod
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_pod_label_k8s_app"]
regex: "kube-proxy"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:10249
target_label: __address__
# 添加维度标签
- source_labels: ["__meta_kubernetes_namespace"]
regex: "(.*)"
replacement: $1
target_label: namespace
- source_labels: ["__meta_kubernetes_pod_name"]
regex: "(.*)"
replacement: $1
target_label: pod_name
# 监控kube-state-metrics
- job_name: "kube-state-metrics"
metrics_path: "/metrics"
scheme: http
kubernetes_sd_configs:
- role: endpoints
# 保留对应标签的Pod
relabel_configs:
- source_labels: ["__meta_kubernetes_service_label_app_kubernetes_io_name"]
regex: "kube-state-metrics"
action: keep
# 修订端口
- source_labels: ["__meta_kubernetes_pod_ip"]
regex: (.*)
replacement: $1:8080
target_label: __address__
# 添加维度标签
- regex: __meta_kubernetes_service_label_(.*)
action: labelmap
# 监控kubelet(Pod)
- job_name: "kube-kubelet"
metrics_path: "/metrics/cadvisor"
scheme: https
kubernetes_sd_configs:
- role: node
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
# 添加标签的映射
relabel_configs:
- regex: __meta_kubernetes_node_label_(.*)
action: labelmap
# 监控service
- job_name: "kube-blackbox-tcp"
metrics_path: "/probe"
params:
module: [tcp_connect] # 使用tcp_connect模块
kubernetes_sd_configs:
- role: service
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-svc:9115
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
regex: (.*)
replacement: $1
target_label: namespace
- source_labels: [__meta_kubernetes_service_name]
regex: (.*)
replacement: $1
target_label: service_name
# 监控ingress
- job_name: "kube-blackbox-http"
metrics_path: "/probe"
params:
module: [http_2xx] # 使用http模块进行探测
kubernetes_sd_configs:
- role: ingress
relabel_configs:
# 协议有可能是http或https,因此需要根据抓取到的协议+端⼝,拼接出具体的探测示例
- source_labels: [__meta_kubernetes_ingress_scheme,__address__]
regex: (.*);(.*)
replacement: $1://$2
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-svc:9115
# 保留特定标签
- source_labels: [__meta_kubernetes_namespace]
target_label: namespace
- source_labels: [__meta_kubernetes_ingress_name]
target_label: ingress_name
- source_labels: [__meta_kubernetes_ingress_class_name]
target_label: ingress_class_name
# 监控redis
- job_name: "kube-redis"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
# 保留端口为9121的Pod实例
relabel_configs:
- source_labels: [__address__]
regex: (.*):9121
action: keep
# 保留特定维度的标签
- source_labels: [__meta_kubernetes_namespace]
target_label: namespace
- source_labels: [__meta_kubernetes_service_name]
target_label: service_name
- source_labels: [__meta_kubernetes_pod_name]
target_label: pod_name
# 监控java
- job_name: "kube-java-pod"
metrics_path: "/metrics"
kubernetes_sd_configs:
- role: endpoints
# 保留端口为12345的Pod实例
relabel_configs:
- source_labels: [__address__]
regex: (.*):12345
action: keep
# 保留特定维度的标签
- source_labels: [__meta_kubernetes_namespace]
target_label: namespace
- source_labels: [__meta_kubernetes_service_name]
target_label: service_name
- source_labels: [__meta_kubernetes_pod_name]
target_label: pod_name
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml
2、检查配置是否更新
[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
3、重新加载 Prometheus,然后检查节点的 targets,能正常监控 Kubernets 的节点
[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload
4、检查 Prometheus 抓取的结果
# 9.5 检查此前创建好的 jvm_rules.yml 告警规则⽂件
jvm_rules.yml: |-
groups:
- name: "JVM告警规则"
rules:
- alert: JVM堆内存使用率过高
expr: jvm_memory_bytes_used{area="heap",} / jvm_memory_bytes_max{area="heap",} * 100 > 90
for: 1m
labels:
severity: critical
annotations:
summary: "'{{ $labels.instance }}' 实例的JVM 堆内存使用率超过80%"
description: "'{{ $labels.namespace }}' 名称空间下的 '{{ $labels.pod_name }}' PodJVM堆内存使用率超过80%, 当前使用率是 {{ $value }}%"
- alert: JVMGC时间过长
expr: sum (rate(jvm_gc_collection_seconds_sum[5m]) / rate(jvm_gc_collection_seconds_count[5m])) by (instance,job,gc,namespace,pod_name) > 1
for: 1m
labels:
severity: critical
annotations:
summary: "'{{ $labels.instance }}' 实例的JVM GC时间超过了1秒。"
description: "'{{ $labels.namespace }}' 名称空间下的 '{{ $labels.pod_name }}' Pod使用 {{ $labels.gc }} GC垃圾回收算法时间超过1s,当前值 {{ $value }}秒"
- alert: JVM死锁线程过多
expr: min_over_time(jvm_threads_deadlocked[5m]) > 0
for: 1m
labels:
severity: critical
annotations:
summary: "JVM检测到'{{ $labels.instance }}' 实例有死锁线程"
description: "'{{ $labels.namespace }}' 名称空间下的 '{{ $labels.pod_name }}' Pod,在过去5分钟检测到死锁线程, 当前死锁线程数是 {{ $value }}。"