# Monitoring Kubernetes with Prometheus

# 1. Preparing the Kubernetes Environment for Monitoring

[root@k8s-master01 ~]# kubectl get nodes
NAME           STATUS   ROLES    AGE   VERSION
k8s-master01   Ready    <none>   46d   v1.27.10
k8s-master02   Ready    <none>   46d   v1.27.10
k8s-master03   Ready    <none>   46d   v1.27.10
k8s-node01     Ready    <none>   46d   v1.27.10
k8s-node02     Ready    <none>   46d   v1.27.10

# 2. Deploying AlertManager to Kubernetes

  • ① First deploy webhook_wechat and webhook_dingding;
  • ② Create a ConfigMap holding the email alert template;
  • ③ Create a ConfigMap holding the alert-routing configuration;
  • ④ Create a Headless Service;
  • ⑤ Create a StatefulSet running 3 AlertManager instances;
  • ⑥ Create an Ingress to expose the service externally;
# Create the namespace
[root@k8s-master01 ~]# kubectl create ns monitoring
[root@k8s-master01 01-alert-webhook-wechat]# sed -i "s#kube-prom#monitoring#g" *.yaml
# 2.1 Deploying webhook_wechat

1. Write the Deployment that runs webhook-wechat

[root@k8s-master01 01-alert-webhook-wechat]# cat 01-webhook-wechat-deploy.yaml 
apiVersion: apps/v1
kind: Deployment
metadata:
  name: webhook-wechat
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: wechat
  template:
    metadata:
      labels:
        app: wechat
    spec:
      containers:
      - name: wechat
        image: oldxu3957/webhook_wechat_oldxu:v1.0
        args: ["--port","5001"]		# 5001 is already the default port
        ports:
        - containerPort: 5001

[root@k8s-master01 01-alert-webhook-wechat]# kubectl apply -f 01-webhook-wechat-deploy.yaml
[root@k8s-master01 01-alert-webhook-wechat]# kubectl get pods -n monitoring
NAME                              READY   STATUS    RESTARTS   AGE
webhook-wechat-54b5bbf677-rmr26   1/1     Running   0          16s

2. Write the Service

[root@k8s-master01 01-alert-webhook-wechat]# cat 02-webhook-wechat-service.yaml 
apiVersion: v1
kind: Service
metadata:
  name: webhook-wechat-svc
  namespace: monitoring
spec:
  selector:
    app: wechat
  ports:
   - port: 5001
     targetPort: 5001
     
[root@k8s-master01 01-alert-webhook-wechat]# kubectl apply -f 02-webhook-wechat-service.yaml 
[root@k8s-master01 01-alert-webhook-wechat]# kubectl get svc -n monitoring
NAME                 TYPE        CLUSTER-IP   EXTERNAL-IP   PORT(S)    AGE
webhook-wechat-svc   ClusterIP   10.96.82.7   <none>        5001/TCP   13s

3. Test that webhook-wechat can deliver messages (token: 8e24a24d-3f48-4ea8-bbde-36ca84d857e4)

[root@prom-node01 ~]# curl -X POST http://10.96.82.7:5001/alert?token=8e24a24d-3f48-4ea8-bbde-36ca84d857e4 \
-H "Content-Type: application/json" \
-d '{
  "alerts": [
    {
      "status": "firing",
      "labels": {
        "severity": "critical",
        "alertname": "InstanceDown",
        "instance": "example1"
      },
      "annotations": {
        "summary": "Instance example1 down",
        "description": "The instance example1 is down."
      },
      "startsAt": "2024-12-20T15:04:05Z",
      "endsAt": "0001-01-01T00:00:00Z"
    },
   {
      "status": "resolved",
      "labels": {
        "severity": "critical",
        "alertname": "InstanceDown",
        "instance": "example1"
      },
      "annotations": {
        "summary": "Instance example1 is back up",
        "description": "The instance example1 has recovered."
      },
      "startsAt": "2024-12-20T15:04:05Z",
      "endsAt": "2024-12-20T16:04:05Z"
    }
  ]
}'
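
If no message arrives, the webhook Pod's log is the first place to look. A minimal check, using nothing beyond the resource names created above:

# Tail the webhook-wechat log to confirm the POST was received and forwarded
kubectl logs -n monitoring deploy/webhook-wechat --tail=20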
# 2.2 Deploying webhook_dingding

1. Write the Deployment that runs webhook-dingding

[root@k8s-master01 02-alert-webhook-dingding]# cat 01-webhook-dingding-deploy.yaml 
apiVersion: apps/v1
kind: Deployment
metadata:
  name: webhook-dingding
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: dingding
  template:
    metadata:
      labels:
        app: dingding
    spec:
      containers:
      - name: dingding
        image: oldxu3957/webhook_dingding_oldxu:v1.0
        args: ["--port","5002"]
        ports:
        - containerPort: 5002

[root@k8s-master01 02-alert-webhook-dingding]# kubectl apply -f 01-webhook-dingding-deploy.yaml
[root@k8s-master01 02-alert-webhook-dingding]# kubectl get pods -n monitoring
NAME                                READY   STATUS    RESTARTS   AGE
webhook-dingding-6d68854649-r6smd   1/1     Running   0          14s
webhook-wechat-54b5bbf677-rmr26     1/1     Running   0          13m

2. Write the Service

[root@k8s-master01 02-alert-webhook-dingding]# cat 02-webhook-dingding-service.yaml 
apiVersion: v1
kind: Service
metadata:
  name: webhook-dingding-svc
  namespace: monitoring
spec:
  selector:
    app: dingding
  ports:
  - port: 5002
    targetPort: 5002

[root@k8s-master01 02-alert-webhook-dingding]# kubectl apply -f 02-webhook-dingding-service.yaml 
[root@k8s-master01 02-alert-webhook-dingding]# kubectl get svc -n monitoring
NAME                   TYPE        CLUSTER-IP     EXTERNAL-IP   PORT(S)    AGE
webhook-dingding-svc   ClusterIP   10.96.241.84   <none>        5002/TCP   0s
webhook-wechat-svc     ClusterIP   10.96.82.7     <none>        5001/TCP   12m

3. Test that webhook-dingding can deliver messages (token: 49989606592d7ff06ee4b83120bf5a81ed1e4c3860696dcd7e663be1c66ef43f)

[root@k8s-master01 ~]# curl -X POST http://10.96.241.84:5002/alert?token=49989606592d7ff06ee4b83120bf5a81ed1e4c3860696dcd7e663be1c66ef43f \
-H "Content-Type: application/json" \
-d '{
  "alerts": [
    {
      "status": "firing",
      "labels": {
        "severity": "critical",
        "alertname": "InstanceDown",
        "instance": "example1"
      },
      "annotations": {
        "summary": "Instance example1 down",
        "description": "The instance example1 is down."
      },
      "startsAt": "2024-12-20T15:04:05Z",
      "endsAt": "0001-01-01T00:00:00Z"
    },
   {
      "status": "resolved",
      "labels": {
        "severity": "critical",
        "alertname": "InstanceDown",
        "instance": "example1"
      },
      "annotations": {
        "summary": "Instance example1 is back up",
        "description": "The instance example1 has recovered."
      },
      "startsAt": "2024-12-20T15:04:05Z",
      "endsAt": "2024-12-20T16:04:05Z"
    }
  ]
}'
# 2.3 Creating the AlertManager Configuration

1. Use a ConfigMap to create the configuration file AlertManager needs; resource name: alert-configs

[root@k8s-master01 03-alertmanager]# cat 01-alert-configs-configmap.yaml 
apiVersion: v1
kind: ConfigMap
metadata:
  name: alert-configs
  namespace: monitoring
data:
  alertmanager.yml: |-
    # Global configuration
    global:
      smtp_smarthost: 'smtp.qq.com:25'
      smtp_from: '373370405@qq.com'  	
      smtp_auth_username: '373370405@qq.com'
      smtp_auth_password: 'jmtpwlkuijaybhic'
      smtp_hello: 'qq.com'
      smtp_require_tls: false
    
    # Path to load templates from
    templates:
      - '/etc/alertmanager/template/*.tmpl'
    
    # Routing rules
    route:
      group_by: ['alertname']
      group_wait: 30s
      group_interval: 30s
      repeat_interval: 5m
      receiver: webhook-dingding-ops		# send to DingTalk by default
    
      # Child routes
      routes:
      - match_re:
          job: 'kube.*'
        receiver: 'webhook-wechat'			# anything matching job=kube.* goes to WeChat
        continue: true
    
      - match_re:
          job: 'redis_exporter'
        receiver: 'email'		        	# job=redis_exporter goes to email
        continue: true
    
    receivers:
    - name: 'email'
      email_configs:
      - to: '373370405@qq.com'
        send_resolved: true
        html: '{{ template "email.html" . }}'   # email body, rendered through this template

    - name: 'webhook-wechat'
      webhook_configs:
      - url: 'http://webhook-wechat-svc:5001/alert?token=8e24a24d-3f48-4ea8-bbde-36ca84d857e4'
      
    - name: 'webhook-dingding-ops'
      webhook_configs:
      - url: 'http://webhook-dingding-svc:5002/alert?token=49989606592d7ff06ee4b83120bf5a81ed1e4c3860696dcd7e663be1c66ef43f'
      
[root@k8s-master01 03-alertmanager]# kubectl apply -f 01-alert-configs-configmap.yaml
[root@k8s-master01 03-alertmanager]# kubectl get cm -n monitoring
NAME               DATA   AGE
alert-configs      1      25s
alert-template     1      19s
kube-root-ca.crt   1      27m
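
Before loading a new version of this configuration, it can be validated offline with amtool, which ships inside the prom/alertmanager image. A minimal sketch, assuming Docker is available on the host:

# Extract alertmanager.yml from the ConfigMap, then let amtool check it
kubectl get cm alert-configs -n monitoring -o jsonpath='{.data.alertmanager\.yml}' > /tmp/alertmanager.yml
docker run --rm -v /tmp/alertmanager.yml:/tmp/alertmanager.yml \
    --entrypoint /bin/amtool prom/alertmanager:v0.26.0 check-config /tmp/alertmanager.yml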

2. Use a ConfigMap to create the email template file AlertManager depends on; resource name: alert-template

[root@k8s-master01 03-alertmanager]# cat 02-alert-template-configmap.yaml 
apiVersion: v1
kind: ConfigMap
metadata:
  name: alert-template
  namespace: monitoring
data:
  email.tmpl: |-
    {{ define "email.html" }}
    {{- if gt (len .Alerts.Firing) 0 -}}
    {{ range .Alerts }}
    <h2 style="color: red;">@告警通知</h2>
    告警程序: AlertManager <br>
    告警级别: {{ .Labels.severity }} <br>
    告警类型: {{ .Labels.alertname }} <br>
    故障主机: {{ .Labels.instance }} <br>
    告警主题: {{ .Annotations.summary }} <br>
    告警详情: {{ .Annotations.description }} <br>
    触发时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }} <br>
    {{ end }}{{ end -}}
    
    {{- if gt (len .Alerts.Resolved) 0 -}}
    {{ range .Alerts }}
    <h2 style="color: green;">@告警恢复</h2>
    告警程序: AlertManager <br>
    告警级别: {{ .Labels.severity }} <br>
    告警类型: {{ .Labels.alertname }} <br>
    告警主机: {{ .Labels.instance }} <br>
    告警主题: {{ .Annotations.summary }} <br>
    告警详情: {{ .Annotations.description }} <br>
    触发时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }} <br>
    恢复时间: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }} <br>
    {{ end }}{{ end -}}
    {{ end }}
    
[root@k8s-master01 03-alertmanager]# kubectl apply -f 02-alert-template-configmap.yaml 
[root@k8s-master01 03-alertmanager]# kubectl get cm -n monitoring
NAME               DATA   AGE
alert-configs      1      25s
alert-template     1      19s
kube-root-ca.crt   1      27m
# 2.4 Creating the Headless Service
[root@k8s-master01 03-alertmanager]# cat 03-alertmanager-headlessService.yaml 
apiVersion: v1
kind: Service
metadata:
  name: alertmanager-svc
  namespace: monitoring
spec:
  clusterIP: "None"
  selector:
    app: alert
  ports:
  - name: web
    port: 9093
    targetPort: 9093
  - name: cluster
    port: 9094
    targetPort: 9094
[root@k8s-master01 03-alertmanager]# kubectl apply -f 03-alertmanager-headlessService.yaml 
[root@k8s-master01 03-alertmanager]# kubectl get svc -n monitoring
NAME                   TYPE        CLUSTER-IP     EXTERNAL-IP   PORT(S)             AGE
alertmanager-svc       ClusterIP   None           <none>        9093/TCP,9094/TCP   6s
webhook-dingding-svc   ClusterIP   10.96.241.84   <none>        5002/TCP            12m
webhook-wechat-svc     ClusterIP   10.96.82.7     <none>        5001/TCP            24m
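
Because clusterIP is set to None, this Service does no load balancing; the cluster DNS instead returns one A record per ready Pod, and each Pod gets a stable name of the form <pod>.<service>.<namespace>.svc.cluster.local. The --cluster.peer arguments of the StatefulSet below rely on exactly those names. Once the StatefulSet is running, the records can be inspected from a throwaway Pod (busybox here is just an assumed test image):

# Expect one A record per AlertManager replica
kubectl run -it --rm dns-test --image=busybox:1.36 --restart=Never -n monitoring \
    -- nslookup alertmanager-svc.monitoring.svc.cluster.local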
# 2.5 Deploying the AlertManager Service

Write a highly available AlertManager manifest using a StatefulSet:

  • 1. Define the startup arguments for each Alertmanager instance, including the config file path, the data storage path, and the Gossip cluster-communication parameters.
  • 2. Mount the ConfigMap holding the email template and the ConfigMap holding the main AlertManager configuration into each instance.
  • 3. Every Alertmanager instance needs a PVC template for data persistence;
  • 4. The AlertManager instances can be started in parallel, since the cluster members have no startup-order dependency on each other;
[root@k8s-master01 03-alertmanager]# cat 04-alertmanager-statefulset.yaml 
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: alertmanager
  namespace: monitoring
spec:
  serviceName: "alertmanager-svc"
  podManagementPolicy: "Parallel"  # start the Pods in parallel
  replicas: 3
  selector:
    matchLabels:
      app: alert
  template:
    metadata:
      labels:
        app: alert
    spec:
      volumes:
      - name: alert-cfg
        configMap:
          name: alert-configs
      - name: alert-temp-cfg
        configMap:
          name: alert-template
      containers:
      - name: alertmanager
        image: prom/alertmanager:v0.26.0
        args:
        - "--web.listen-address=:9093"
        - "--cluster.listen-address=0.0.0.0:9094"
        - "--cluster.peer=alertmanager-0.alertmanager-svc:9094"
        - "--cluster.peer=alertmanager-1.alertmanager-svc:9094"
        - "--cluster.peer=alertmanager-2.alertmanager-svc:9094"
        - "--cluster.peer-timeout=60s"
        - "--config.file=/etc/alertmanager/alertmanager.yml"
        - "--storage.path=/etc/alertmanager/data"
        - "--data.retention=120h"
        volumeMounts:
        - name: alert-cfg
          mountPath: /etc/alertmanager/
        - name: alert-temp-cfg
          mountPath: /etc/alertmanager/template
        - name: alert-data
          mountPath: /etc/alertmanager/data
        ports:
        - name: web
          containerPort: 9093
        - name: cluster
          containerPort: 9094
        resources:
          requests:
            cpu: 200m
            memory: 200Mi
          limits:
            cpu: 300m
            memory: 300Mi

  volumeClaimTemplates:
    - metadata:
        name: alert-data
      spec:
        accessModes: ["ReadWriteMany"]
        storageClassName: "nfs-storage"
        resources:
          requests:
            storage: 3Gi

[root@k8s-master01 03-alertmanager]# kubectl apply -f 04-alertmanager-statefulset.yaml
[root@k8s-master01 03-alertmanager]# kubectl get pods -n monitoring
NAME                                READY   STATUS    RESTARTS   AGE
alertmanager-0                      1/1     Running   0          97s
alertmanager-1                      1/1     Running   0          97s
alertmanager-2                      1/1     Running   0          97s
webhook-dingding-6d68854649-r6smd   1/1     Running   0          19m
webhook-wechat-54b5bbf677-rmr26     1/1     Running   0          32m
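
To confirm the three replicas actually formed a single Gossip cluster rather than three independent AlertManagers, the /api/v2/status endpoint reports the cluster state and its peers. A sketch, assuming wget is available in the container (the image is busybox-based):

# The "cluster" field should show status "ready" and list all three peers
kubectl exec -n monitoring alertmanager-0 -- \
    wget -qO- http://localhost:9093/api/v2/status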
# 2.6 Publishing the AlertManager Service

1. Write the Ingress manifest

[root@k8s-master01 03-alertmanager]# cat 05-alertmanager-ingress.yaml 
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: alert-ingress
  namespace: monitoring
spec:
  ingressClassName: "nginx"
  rules:
  - host: "k8s-alert.hmallleasing.com"
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: alertmanager-svc
            port:
              number: 9093

[root@k8s-master01 03-alertmanager]# kubectl apply -f 05-alertmanager-ingress.yaml 
[root@k8s-master01 03-alertmanager]# kubectl get ingress -n monitoring
NAME            CLASS   HOSTS                        ADDRESS                                        PORTS   AGE
alert-ingress   nginx   k8s-alert.hmallleasing.com   192.168.40.103,192.168.40.104,192.168.40.105   80      21s
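
The hostname has to resolve to one of the Ingress controller nodes before the UI is reachable. On a test machine without DNS, either an /etc/hosts entry or an explicit Host header will do (addresses taken from the Ingress output above):

# Option 1: point the name at an Ingress node in /etc/hosts
echo "192.168.40.103 k8s-alert.hmallleasing.com" >> /etc/hosts
# Option 2: hit an Ingress node directly and pass the Host header
curl -H "Host: k8s-alert.hmallleasing.com" http://192.168.40.103/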

2. Open the AlertManager web UI

image-20240310213846950

# 2.7 Testing AlertManager Alerts

1. Simulate failures to verify that AlertManager can deliver alerts correctly

[root@k8s-master01 ~]# kubectl run tools --image=uhub.service.ucloud.cn/oldxu/tools:v1.0
[root@k8s-master01 ~]# kubectl exec -it tools -- /bin/bash

# Route to DingTalk
/alert_test_oldxu --alertURL="http://alertmanager-svc.monitoring.svc.cluster.local:9093/api/v1/alerts" --label="alertname=CPU故障,instance=dingding,severity=critical,job=node_exporter"

# Route to WeChat
/alert_test_oldxu --alertURL="http://alertmanager-svc.monitoring.svc.cluster.local:9093/api/v1/alerts" --label="alertname=节点故障,instance=wechat,severity=critical,job=kube-nodes"

# Route to email
/alert_test_oldxu --alertURL="http://alertmanager-svc.monitoring.svc.cluster.local:9093/api/v1/alerts" --label="alertname=redis故障,instance=email,severity=critical,job=redis_exporter"
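
The alert_test_oldxu helper is only a convenience; the same test can be done from the tools Pod with a plain curl against AlertManager's v1 alerts API (the endpoint the helper posts to). A hedged equivalent of the DingTalk case, where the label set is what drives the routing:

curl -X POST http://alertmanager-svc.monitoring.svc.cluster.local:9093/api/v1/alerts \
  -H "Content-Type: application/json" \
  -d '[{"labels":{"alertname":"CPU故障","instance":"dingding","severity":"critical","job":"node_exporter"},"annotations":{"summary":"manual routing test"}}]'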

# 3. Deploying Prometheus to Kubernetes

  • 1. Create a ConfigMap holding the Prometheus configuration file, defining the AlertManager address, the path to the rules files, and so on
  • 2. Create a ConfigMap holding the Prometheus alerting rule files;
  • 3. Create RBAC permissions: Prometheus scrapes metrics of all kinds of resources through the APIServer, which means the Prometheus Pod needs the corresponding permissions to access the Kubernetes API
  • 4. Create a Headless Service;
  • 5. Create a StatefulSet to run Prometheus (NFS is not recommended as backend storage in production);
  • 6. Create an Ingress to expose Prometheus externally
# 3.1 Creating the Prometheus Configuration

1. Edit the Prometheus configuration file, starting with a minimal config (it will be modified and extended later as other components are brought under monitoring). ConfigMap resource name: prom-configs

[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-configs
  namespace: monitoring
data:
  prometheus.yml: |-
    global:
      scrape_interval: 15s
      scrape_timeout:  15s
    
    # Alerting endpoint (the AlertManager load-balanced address is all that is needed)
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["alertmanager-svc:9093"]
    
    # Alerting rule files
    rule_files:
      - "/etc/prometheus/rules/*.yml"

    scrape_configs:
      - job_name: "prometheus"
        metrics_path: "/metrics"
        static_configs:
        - targets: ["localhost:9090"]
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml
[root@k8s-master01 04-prometheus]# kubectl get cm -n monitoring
NAME               DATA   AGE
alert-configs      1      25m
alert-template     1      25m
kube-root-ca.crt   1      52m
prom-configs       1      16s
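
Once the Prometheus Pod from section 3.5 is running, the mounted configuration can be syntax-checked after every edit with promtool, which is bundled in the prom/prometheus image. A sketch, assuming the Pod name prometheus-0 used later in this document:

kubectl exec -n monitoring prometheus-0 -- \
    promtool check config /etc/prometheus/prometheus.yml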
# 3.2 Creating Prometheus Alerting Rules

1. Edit the Prometheus alerting rule files, starting with node, pods, jvm, redis, and blackbox rules as examples; more can be added later as needed. ConfigMap resource name: prom-rules

Note: these rules are based on the earlier node-monitoring rules, revised to take the Kubernetes labels into account.

[root@k8s-master01 04-prometheus]# cat 02-prom-rules-configmap.yaml 
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-rules
  namespace: monitoring
data:
  node_rules.yml: |-
    groups:
    - name: CPU告警规则
      rules:
      - alert: 节点CPU使用率超过80%
        expr: ( 1 - avg(irate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance,job) ) * 100 > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "主机CPU利用率过高,实例:{{ $labels.instance }} , {{ $labels.job }}"
          description: "该实例的CPU利用率低于20%,当前利用率:{{ $value }}%。可能存在CPU资源浪费情况。"
      - alert: CPU饱和度过高
        expr: sum(node_load1) by (instance,job) / (count(node_cpu_seconds_total{mode="idle"}) by (instance,job) * 2) * 100 > 80
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "CPU饱和度过高,实例:{{ $labels.instance }} , {{ $labels.job }}"
          description: "该实例的1分钟平均CPU负载超过了核心数的两倍,已经持续2分钟,当前CPU饱和度:{{ $value }}%。需要立即检查系统负载情况。"
    
      - alert: 主机内存不足
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)  / node_memory_MemTotal_bytes * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "主机内存使用率较高, 实例:{{ $labels.instance }}, 任务:{{ $labels.job }}"
          description: "该实例的内存使用率持续2分钟高于80%,当前利用率:{{ $value }}%"
    
    
      - alert: 内存饱和度高
        expr: ( 1 - node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes ) * 100 > 30
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "主机内存内存饱和度高, 实例:{{ $labels.instance }}, 任务:{{ $labels.job }}"
          description: "SWAP内存使用率已连续2分钟超过30%,表明内存饱和度过高,当前SWAP使用率为:{{ $value }}%。"
    

      - alert: 磁盘空间告急
        expr: ( node_filesystem_size_bytes{device!="tmpfs"} - node_filesystem_avail_bytes{device!="tmpfs"} ) / node_filesystem_size_bytes{device!="tmpfs"} * 100 > 70
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "实例 {{ $labels.instance }} 磁盘{{ $labels.mountpoint }} 分区空间不足"
          description: "实例 {{ $labels.instance }} 的磁盘空间使用率已超过 70%,当前使用率为 {{ $value }}%,请及时处理。"
    
    
      - alert: 磁盘Inode空间告急
        expr: (node_filesystem_files{device!="tmpfs"} - node_filesystem_files_free{device!="tmpfs"} ) / node_filesystem_files{device!="tmpfs"} * 100 > 70
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "实例 {{ $labels.instance }} 磁盘空间不足"
          description: "实例 {{ $labels.instance }} 的磁盘Inode空间使用率已超过 70%,当前使用率为 {{ $value }}%,请及时处理。"
    
      - alert: 磁盘IOPS写入较高
        #expr: sum(rate(node_disk_writes_completed_total[1m])) by (instance,job) / 120 * 100 >60
        # the round() function rounds the value to the nearest integer
        expr: round(max(irate(node_disk_writes_completed_total[1m])) by (instance,job) / 120 * 100) > 60
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "实例 {{ $labels.instance }} IOPS每秒写入次数超过120次/s"
          description: 
            目前磁盘IOPS写入饱和度是 {{ $value }}%
            目前磁盘IOPS每秒写入最大 {{ printf `max(rate(node_disk_writes_completed_total{instance="%s",job="%s"}[1m]))` $labels.instance $labels.job | query | first | value | printf "%.2f" }} 次/s
    
      - alert: 磁盘IOPS读取较高
        expr: round(max(irate(node_disk_reads_completed_total[1m])) by (instance,job) / 120 * 100) > 60
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "实例 {{ $labels.instance }} IOPS每秒读取次数超过120次/s"
          description: 
            目前磁盘IOPS读取饱和度是 {{ $value }}%
            目前磁盘IOPS每秒读取最大 {{ printf `max(rate(node_disk_reads_completed_total{instance="%s",job="%s"}[1m]))` $labels.instance $labels.job | query | first | value | printf "%.2f" }} 次/s
    
    
      - alert: 磁盘IO写入吞吐较高
        expr: round(max(rate(node_disk_written_bytes_total[1m])) by (instance,job) / 1024 /1024 / 30 * 100) > 60
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "实例 {{ $labels.instance }} 磁盘IO写入每秒超过最大30MB/s"
          description: 
            目前磁盘IO写入吞吐量的饱和度是 {{ $value }}%。
            目前磁盘IO写入吞吐量每秒最大是 {{ printf `max(rate(node_disk_written_bytes_total{instance="%s",job="%s"}[1m])) /1024/1024` $labels.instance $labels.job | query | first | value | printf "%.2f" }}MB/s
    
      - alert: 磁盘IO读取吞吐较高
        expr: round(max(rate(node_disk_read_bytes_total[1m])) by (instance,job) / 1024 /1024 /30 * 100 ) > 60
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "实例 {{ $labels.instance }} 磁盘IO读取每秒超过最大30MB/s"
          description:
            目前磁盘IO读取吞吐量的饱和度是 {{ $value }}%。
            目前磁盘IO读取吞吐量每秒最大是 {{ printf `max(rate(node_disk_read_bytes_total{instance="%s",job="%s"}[1m])) /1024/1024` $labels.instance $labels.job | query | first | value | printf "%.2f" }}MB/s
    
    
      - alert: 网络下载带宽异常
        expr: max(irate(node_network_receive_bytes_total[1m]) * 8 / 1024 / 1024) by (instance,job,device) / 50 * 100  >= 80
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "实例 {{ $labels.instance }} 的 {{ $labels.device }}接口下载流量已经超过公司实际50Mbps"
          description: 
            目前下载带宽已经达到 {{ printf `(irate(node_network_receive_bytes_total{instance="%s",job="%s",device="%s"}[1m]) * 8 / 1024 / 1024)` $labels.instance $labels.job $labels.device | query | first | value | printf "%.2f" }} Mbps/s
            目前下载带宽使用率在 {{ $value }}%
    
      - alert: 网络上传带宽异常
        expr: max(irate(node_network_transmit_bytes_total[1m]) * 8 / 1024 / 1024) by (instance,job,device) / 50 * 100 >= 80
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "实例 {{ $labels.instance }} 的 {{ $labels.device }}接口上传流量已经超过公司实际50Mbps"
          description: 
            目前上传带宽已经达到 {{ printf `(irate(node_network_transmit_bytes_total{instance="%s",job="%s",device="%s"}[1m]) * 8 / 1024 / 1024)` $labels.instance $labels.job $labels.device | query | first | value | printf "%.2f" }} Mbps/s
            目前上传带宽使用率在 {{ $value }}%
    
    
      - alert: 网络TCP连接数异常
        expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit * 100 > 80
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "实例 {{ $labels.instance }} 的 tcp连接数超过80%"
          description: 
            目前TCP连接数是 {{ printf `node_nf_conntrack_entries{instance="%s",job="%s"}` $labels.instance $labels.job | query | first | value | printf "%.2f" }}
            目前TCP连接使用率是 {{ $value }}%
    
      - alert: 节点处于Down状态
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "实例:{{ $labels.instance }} 处于Down状态"
          description: "{{ $labels.instance }} 节点已连接超时"

  kube_pods_rules.yml: |-
    groups:
    - name: Pods的告警规则文件
      rules:
      - alert: Pod中容器的CPU利用率高
        expr: sum (rate(container_cpu_usage_seconds_total{image!=""}[5m])) by (instance,job,pod,namespace) * 100 > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "在 '{{ $labels.instance }}' 节点上运行的 '{{ $labels.pod }}' Pod CPU利用率高"
          description: "在 '{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod }}' Pod的CPU利用率当前为 {{ $value }}%,超过了80%的阈值。"
    
      - alert: Pod中容器内存利用率高
        expr: |
          sum(container_memory_working_set_bytes{name!=""}) by (instance,job,pod,namespace)
          /
          sum(container_spec_memory_limit_bytes{name!=""} > 0) by (instance,job,pod,namespace) * 100 > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "在 '{{ $labels.instance }}' 节点上运行的 '{{ $labels.pod }}' Pod内存利用率高"
          description: 在 '{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod }}' Pod的内存最大限制是 {{ printf `sum (container_spec_memory_limit_bytes{namespace="%s",pod="%s"} > 0 ) /1024 /1024` $labels.namespace $labels.pod | query | first | value }}MB , 目前利用率已达{{ $value }}%,超过限制的80%。

      - alert: Pod容器网络发送速率过高
        expr: sum(rate(container_network_transmit_bytes_total{image!=""}[1m])) by (instance,job,pod,namespace) * 8 /1024 /1024 > 50
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "在 '{{ $labels.instance }}' 节点上运行的 '{{ $labels.pod }}' Pod网络发送速率过高"
          description: "在 '{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod }}' Pod的网络发送速率达到{{ $value }}Mbps,超过了50Mbps的阈值。"
    
      - alert: Pod容器网络接收速率过高
        expr: sum(rate(container_network_receive_bytes_total{image!=""}[1m])) by (instance,job,pod,namespace) * 8 /1024 /1024 > 50
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "在 '{{ $labels.instance }}' 节点上运行的 '{{ $labels.pod }}' Pod网络发送速率过高"
          description: "在 '{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod }}' Pod的网络接收速率达到{{ $value }}Mbps,超过了50Mbps的阈值。"
    
      - alert: Pod容器磁盘写入吞吐量过大
        expr: sum (rate(container_fs_writes_bytes_total{name!=""}[1m])) by (instance,job,pod,namespace) /1024 /1024 > 20
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "在 '{{ $labels.instance }}' 节点上运行的 '{{ $labels.pod }}' Pod磁盘写入吞吐量过大"
          description: "在 '{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod }}' Pod的磁盘写入吞吐量达到{{ $value }}MB/s,超过了20MB/s的阈值。"
    
      - alert: Pod容器磁盘读取吞吐量过大
        expr: sum (rate(container_fs_reads_bytes_total{name!=""}[1m])) by (instance,job,pod,namespace) /1024 /1024 > 20
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "在 '{{ $labels.instance }}' 节点上运行的 '{{ $labels.pod }}' Pod磁盘读取吞吐量过大"
          description: "在 '{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod }}' Pod的磁盘读取吞吐量达到{{ $value }}MB/s,超过了20MB/s的阈值。"


  jvm_rules.yml: |-
    groups:
    - name: "JVM告警规则"
      rules:
      - alert: JVM堆内存使用率过高
        expr: jvm_memory_bytes_used{area="heap",} / jvm_memory_bytes_max{area="heap",} * 100 > 90
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "'{{ $labels.instance }}' 实例的JVM 堆内存使用率超过80%"
          description: "'{{ $labels.namespace }}' 名称空间下的 '{{ $labels.pod_name }}' PodJVM堆内存使用率超过80%, 当前使用率是 {{ $value }}%"

      - alert: JVMGC时间过长
        expr: sum (rate(jvm_gc_collection_seconds_sum[5m]) / rate(jvm_gc_collection_seconds_count[5m])) by (instance,job,gc,namespace,pod_name) > 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "'{{ $labels.instance }}' 实例的JVM  GC时间超过了1秒。"
          description: "'{{ $labels.namespace }}' 名称空间下的 '{{ $labels.pod_name }}' Pod使用 {{ $labels.gc }} GC垃圾回收算法时间超过1s,当前值 {{ $value }}秒"

      - alert: JVM死锁线程过多
        expr: min_over_time(jvm_threads_deadlocked[5m]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "JVM检测到'{{ $labels.instance }}' 实例有死锁线程"
          description: "'{{ $labels.namespace }}' 名称空间下的 '{{ $labels.pod_name }}' Pod,在过去5分钟检测到死锁线程, 当前死锁线程数是 {{ $value }}。"


  redis_rules.yml: |-
    groups:
    - name: redis告警规则
      rules:
      - alert: Redis实例宕机
        expr: redis_up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例宕机"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod在过去5分钟内无法连接。"

      - alert: Redis连接数过高
        expr: redis_connected_clients / redis_config_maxclients * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例连接数超过80%"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod当前连接数占最大连接数的比率超过80%。当前比率: {{ $value }}%。"

      - alert: Redis连接被拒绝
        expr: increase(redis_rejected_connections_total[1h]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例有连接被拒绝"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod在过去1小时内有连接被拒绝。当前被拒绝的连接数: {{ $value }}。"

      - alert: Redis内存使用率过高
        expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例内存使用率超过80%"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod的内存使用率超过配置的最大内存值的80%。当前内存使用率: {{ $value }}%。"

      - alert: Redis缓存命中率低
        expr: |
          irate(redis_keyspace_hits_total[5m])
          / 
          (irate(redis_keyspace_hits_total[5m]) + irate(redis_keyspace_misses_total[5m])) * 100 < 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例缓存命中率低于90%"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod最近5分钟内的缓存命中率低于90%。当前命中率: {{ $value }}%。"

      - alert: Redis即将过期的Key数量过多
        expr: |
          sum(redis_db_keys_expiring) by (instance, job, namespace,pod_name,db)
          / 
          sum(redis_db_keys) by (instance, job, namespace,pod_name,db) * 100 > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例中的 '{{ $labels.db }}' 数据库有大量即将过期的Key"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod中的 '{{ $labels.db }}' 数据库有超过50%的Key即将过期。当前过期比率: {{ $value }}%。"

      - alert: RedisRDB备份失败
        expr: redis_rdb_last_bgsave_status == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例 RDB备份失败"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod最近的RDB备份失败。"

      - alert: RedisRDB备份时间过长
        expr: redis_rdb_last_bgsave_duration_sec > 3 and redis_rdb_last_bgsave_status == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例 RDB备份成功但耗时超过3秒"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod, RDB备份成功但耗时超过了3秒。持续时间: {{ $value }}秒。"

      - alert: RedisRDB备份过期
        expr: (time() - redis_rdb_last_save_timestamp_seconds) > 36000
        for: 24h
        labels:
          severity: critical
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例超过10小时未进行RDB备份"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod已超过10小时没有生成新的RDB备份文件。"

      - alert: Redis命令拒绝率过高
        expr: |
          sum(irate(redis_commands_rejected_calls_total[5m])) by (instance,job,namespace,pod_name)
          / 
          sum(irate(redis_commands_total[5m])) by (instance,job,namespace,pod_name) * 100 > 25
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例命令拒绝率超过25%"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod的命令拒绝率超过了25%。当前拒绝率: {{ $value }}%。"

      - alert: Redis命令平均响应时间过长
        expr: |
          sum(rate(redis_commands_duration_seconds_total[5m])) by (instance,job,namespace,pod_name)
          / 
          sum(rate(redis_commands_processed_total[5m])) by (instance,job,namespace,pod_name) > 0.250
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例命令平均响应时间超过250ms"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod的执行命令平均响应时间超过了250毫秒。当前平均响应时间: {{ $value }}秒。"


  blackbox_tcp_rules.yml: |-
    groups:
    - name: Blackbox_tcp告警规则文件
      rules:
      - alert: Service TCP探测失败
        expr: sum(probe_success{job=~".*tcp"}) by (instance,job,namespace,service_name) == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "探测 '{{ $labels.instance }}' Service 的TCP接口探测失败。"
          description: "'{{ $labels.namespace }}' 名称空间中的 '{{ $labels.service_name }}' Service资源 '{{ $labels.instance }}' 地址探测失败。"
    
      - alert: Service TCP请求的响应时间过长
        expr: probe_duration_seconds{job=~".*tcp"} > 0.500
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "探测 '{{ $labels.instance }}' Service 的TCP响应时间超过了500毫秒。"
          description: "'{{ $labels.namespace }}' 名称空间中的 '{{ $labels.service_name }}' Service资源 '{{ $labels.instance }}' 当前响应时长为 {{ $value }} 秒。"

      - alert: Service的DNS解析响应时间过长
        expr: probe_dns_lookup_time_seconds{job=~".*tcp"} > 0.500
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "探测 '{{ $labels.instance }}' Service 的DNS解析响应时间超过了500毫秒。"
          description: "'{{ $labels.namespace }}' 名称空间中的 '{{ $labels.service_name }}' Service资源 '{{ $labels.instance }}' 当前响应时长为 {{ $value }} 秒。"	

  blackbox_http_rules.yml: |-
    groups:
    - name: Blackbox_http告警规则文件
      rules:
      - alert: 站点平均请求过长
        expr: sum (avg_over_time(probe_http_duration_seconds[1m])) by (instance,job,namespace,ingress_name) > 3
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "'{{ $labels.instance }}' 域名整体请求时间超过了3秒。"
          description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名最近1分钟的平均请求时间超过3秒。当前平均请求时间:{{ $value }}秒。"

      - alert: 站点阶段耗时过长
        expr: |
          (
            probe_http_duration_seconds{phase="connect"} > 1 or
            probe_http_duration_seconds{phase="processing"} > 1 or
            probe_http_duration_seconds{phase="resolve"} > 1 or
            probe_http_duration_seconds{phase="tls"} > 1 or
            probe_http_duration_seconds{phase="transfer"} > 1
          )
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "'{{ $labels.instance }}' 域名在 '{{ $labels.phase }}' 阶段耗时过长"
          description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名在阶段 '{{ $labels.phase }}' 的耗时超过0.5秒。当前耗时:{{ $value }}秒。"

      - alert: 站点响应状态码异常
        expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "'{{ $labels.instance }}' 域名返回异常状态码"
          description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名返回的状态码为 {{ $value }},表明请求可能存在问题。"
    
      - alert: 重定向次数过多
        expr: probe_http_redirects > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "'{{ $labels.instance }}' 域名重定向次数过多"
          description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名在最近的探测中重定向次数超过5次。当前次数:{{ $value }}次。"

      - alert: 证书即将过期<30
        expr: (probe_ssl_earliest_cert_expiry - time()) /86400 < 30
        for: 24h
        labels:
          severity: critical
        annotations:
          summary: "'{{ $labels.instance }}' 域名的 SSL 证书即将过期"
          description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名的 SSL 证书将在 {{ $value }} 天内过期。"
    
      - alert: 证书即将过期<7
        expr: (probe_ssl_earliest_cert_expiry - time()) /86400 < 7
        for: 24h
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.instance }}' 域名的 SSL 证书即将过期"
          description: "{{ $labels.namespace }} 名称空间 {{ $labels.instance }}' 域名的 SSL 证书将在 {{ $value }} 天内过期。"

2. Create the ConfigMap

[root@k8s-master01 04-prometheus]# kubectl apply -f 02-prom-rules-configmap.yaml 
[root@k8s-master01 04-prometheus]# kubectl get cm -n monitoring
NAME               DATA   AGE
alert-configs      1      34m
alert-template     1      34m
kube-root-ca.crt   1      61m
prom-configs       1      9m2s
prom-rules         6      13s
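
The rule files can be validated the same way as the main configuration. A sketch, again assuming the prometheus-0 Pod from section 3.5 is up and the ConfigMap is mounted at /etc/prometheus/rules:

# Validate every mounted rule file in one pass (sh expands the glob)
kubectl exec -n monitoring prometheus-0 -- \
    sh -c 'promtool check rules /etc/prometheus/rules/*.yml'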
# 3.3 Creating the Prometheus RBAC Permissions

1. Create a ServiceAccount named prometheus-sa.

2. Create a ClusterRole named prometheus-role with the required permission rules.

3. Create a ClusterRoleBinding named prometheus-rolebinding, binding the prometheus-role permissions to the prometheus-sa ServiceAccount in the monitoring namespace.

[root@k8s-master01 04-prometheus]# cat 03-prometheus-rbac.yaml 
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus-sa
  namespace: monitoring

---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus-role
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
      - services
      - endpoints
      - pods
      - nodes/proxy
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - "networking.k8s.io"
    resources:
      - ingresses
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - configmaps
      - nodes/metrics
    verbs:
      - get
  - nonResourceURLs:
      - /metrics
    verbs:
      - get

---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus-rolebinding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus-role
subjects:
  - kind: ServiceAccount
    name: prometheus-sa
    namespace: monitoring
    
[root@k8s-master01 04-prometheus]# kubectl apply -f 03-prometheus-rbac.yaml 
serviceaccount/prometheus-sa created
clusterrole.rbac.authorization.k8s.io/prometheus-role created
clusterrolebinding.rbac.authorization.k8s.io/prometheus-rolebinding created
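
Whether the binding really grants what Prometheus needs can be verified without starting a Pod, by letting kubectl impersonate the ServiceAccount:

# Both checks should print "yes"
kubectl auth can-i list nodes --as=system:serviceaccount:monitoring:prometheus-sa
kubectl auth can-i get nodes/metrics --as=system:serviceaccount:monitoring:prometheus-sa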
# 3.4 Creating the Headless Service
[root@k8s-master01 04-prometheus]# cat 04-prometheus-headlessService.yaml 
apiVersion: v1
kind: Service
metadata:
  name: prometheus-svc
  namespace: monitoring
spec:
  clusterIP: "None"
  selector:
    app: prometheus
  ports:
  - name: http
    port: 9090
    targetPort: 9090
    
[root@k8s-master01 04-prometheus]# kubectl apply -f 04-prometheus-headlessService.yaml 
[root@k8s-master01 04-prometheus]# kubectl get svc -n monitoring
NAME                   TYPE        CLUSTER-IP     EXTERNAL-IP   PORT(S)             AGE
alertmanager-svc       ClusterIP   None           <none>        9093/TCP,9094/TCP   37m
prometheus-svc         ClusterIP   None           <none>        9090/TCP            16s
webhook-dingding-svc   ClusterIP   10.96.241.84   <none>        5002/TCP            49m
webhook-wechat-svc     ClusterIP   10.96.82.7     <none>        5001/TCP            62m
# 3.5 Deploying the Prometheus Service

Write the Prometheus manifest using a StatefulSet:

  • 1. Define the ServiceAccount associated with Prometheus, so it has permission to access the APIServer.
  • 2. Define the startup arguments for the Prometheus instance, including the config file path and the data storage path.
  • 3. Mount the ConfigMap holding the configuration file and the ConfigMap holding the alerting rule files.
  • 4. Prometheus uses a PVC template for data persistence;
[root@k8s-master01 04-prometheus]# cat 05-prometheus-statefulset.yaml 
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: prometheus
  namespace: monitoring
spec:
  serviceName: "prometheus-svc"
  replicas: 2
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: "prometheus-sa"		# the ServiceAccount
      volumes:
      - name: prom-cfg
        configMap:
          name: prom-configs
      - name: prom-rules-cfg
        configMap:
          name: prom-rules
      containers:
      - name: prom
        image: prom/prometheus:v2.49.1
        args:
        - "--config.file=/etc/prometheus/prometheus.yml"
        - "--storage.tsdb.path=/etc/prometheus/data"
        - "--storage.tsdb.retention.time=10d"
        - "--web.enable-lifecycle"
        volumeMounts:
        - name: prom-cfg
          mountPath: /etc/prometheus
        - name: prom-rules-cfg
          mountPath: /etc/prometheus/rules
        - name: prom-data
          mountPath: /etc/prometheus/data
        ports:
        - containerPort: 9090
        resources:
          requests:
            cpu: 1000m
            memory: 1024Mi
          limits:
            cpu: 1000m
            memory: 1024Mi
  volumeClaimTemplates:
    - metadata:
        name: prom-data
      spec:
        accessModes: ["ReadWriteMany"]
        storageClassName: "nfs-storage"
        resources:
          requests:
            storage: 3Gi
            
[root@k8s-master01 04-prometheus]# kubectl apply -f 05-prometheus-statefulset.yaml
[root@k8s-master01 04-prometheus]# kubectl get pods -n monitoring
NAME                                READY   STATUS    RESTARTS   AGE
alertmanager-0                      1/1     Running   0          38m
alertmanager-1                      1/1     Running   0          38m
alertmanager-2                      1/1     Running   0          38m
prometheus-0                        1/1     Running   0          2m35s
prometheus-1                        1/1     Running   0          92s
webhook-dingding-6d68854649-r6smd   1/1     Running   0          56m
webhook-wechat-54b5bbf677-rmr26     1/1     Running   0          69m

# 3.6 Publishing the Prometheus Service

1. Write the Ingress manifest

[root@k8s-master01 04-prometheus]# cat 06-prometheus-ingress.yaml 
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prom-ingress
  namespace: monitoring
spec:
  ingressClassName: "nginx"
  rules:
  - host: "k8s-prom.hmallleasing.com"
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: prometheus-svc
            port:
              number: 9090

[root@k8s-master01 04-prometheus]# kubectl apply -f 06-prometheus-ingress.yaml 
[root@k8s-master01 04-prometheus]# kubectl get ingress -n monitoring
NAME            CLASS   HOSTS                        ADDRESS                                        PORTS   AGE
alert-ingress   nginx   k8s-alert.hmallleasing.com   192.168.40.103,192.168.40.104,192.168.40.105   80      38m
prom-ingress    nginx   k8s-prom.hmallleasing.com    192.168.40.103,192.168.40.104,192.168.40.105   80      78s

# 4. Deploying Grafana to Kubernetes

  • 1. Create a Headless Service;
  • 2. Deploy Grafana with a StatefulSet;
  • 3. Create an Ingress to expose Grafana externally;
# 4.1 Creating the Headless Service

1. Write the Grafana headless Service

[root@k8s-master01 05-grafana]# cat 01-grafana-headlessService.yaml 
apiVersion: v1
kind: Service
metadata:
  name: grafana-svc
  namespace: monitoring
spec:
  clusterIP: "None"
  selector:
    app: grafana
  ports:
  - name: http
    port: 3000
    targetPort: 3000

2. Check the Service

[root@k8s-master01 05-grafana]# kubectl apply -f 01-grafana-headlessService.yaml 
[root@k8s-master01 05-grafana]# kubectl get svc -n monitoring
NAME                   TYPE        CLUSTER-IP     EXTERNAL-IP   PORT(S)             AGE
alertmanager-svc       ClusterIP   None           <none>        9093/TCP,9094/TCP   23h
grafana-svc            ClusterIP   None           <none>        3000/TCP            5s
prometheus-svc         ClusterIP   None           <none>        9090/TCP            22h
webhook-dingding-svc   ClusterIP   10.96.241.84   <none>        5002/TCP            23h
webhook-wechat-svc     ClusterIP   10.96.82.7     <none>        5001/TCP            23h
# 4.2 Deploying the Grafana Service

Write the Grafana manifest using a StatefulSet:

  • 1. Grafana takes its admin username from GF_SECURITY_ADMIN_USER and its password from GF_SECURITY_ADMIN_PASSWORD.
  • 2. The Grafana instance needs a PVC template for data persistence;
  • 3. The persistent storage needs the right permissions, hence fsGroup: 472
[root@k8s-master01 05-grafana]# cat 02-grafana-statefulset.yaml 
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: grafana
  namespace: monitoring
spec:
  serviceName: "grafana-svc"
  replicas: 1
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      securityContext:
        fsGroup: 472    # When the Pod starts, Kubernetes applies this group ID to storage shared at the Pod level (such as persistent volumes).
      containers:
      - name: grafana
        image: grafana/grafana:10.2.2
        env:
        - name: GF_SECURITY_ADMIN_USER
          value: "admin"
        - name: GF_SECURITY_ADMIN_PASSWORD
          value: "talent"
        volumeMounts:
        - name: grafana-data
          mountPath: /var/lib/grafana
        ports:
        - containerPort: 3000
        readinessProbe:
          httpGet:
            path: /api/health
            port: 3000
            scheme: HTTP
          initialDelaySeconds: 60
          failureThreshold: 10
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 30
        livenessProbe:
          httpGet:
            path: /api/health
            port: 3000
            scheme: HTTP
          initialDelaySeconds: 60
          failureThreshold: 10
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 30
        resources:
          requests:
            cpu: 500m
            memory: 2048Mi
          limits:
            cpu: 500m
            memory: 2048Mi
  volumeClaimTemplates:
    - metadata:
        name: grafana-data
      spec:
        accessModes: ["ReadWriteMany"]
        storageClassName: "nfs-storage"
        resources:
          requests:
            storage: 3Gi

[root@k8s-master01 05-grafana]# kubectl apply -f 02-grafana-statefulset.yaml 
[root@k8s-master01 ~]# kubectl get pods -n monitoring
NAME                                READY   STATUS    RESTARTS            AGE
alertmanager-0                      1/1     Running   1 (6m18s ago)       23h
alertmanager-1                      1/1     Running   1 (<invalid> ago)   23h
alertmanager-2                      1/1     Running   1 (11m ago)         23h
grafana-0                           1/1     Running   0                   2m22s
prometheus-0                        1/1     Running   1 (4m34s ago)       23h
prometheus-1                        1/1     Running   1 (6m18s ago)       23h
webhook-dingding-6d68854649-r6smd   1/1     Running   1 (4m34s ago)       23h
webhook-wechat-54b5bbf677-rmr26     1/1     Running   1 (6m18s ago)       24h
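
The effect of fsGroup: 472 can be seen by inspecting the ownership of the data directory inside the Pod; the persistent volume should be writable by group 472 (the grafana group in the image):

kubectl exec -n monitoring grafana-0 -- ls -ld /var/lib/grafana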
# 4.3 Publishing the Grafana Service

1. Write the Grafana Ingress

[root@k8s-master01 05-grafana]# cat 03-grafana-ingress.yaml 
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: grafana-ingress
  namespace: monitoring
spec:
  ingressClassName: "nginx"
  rules:
  - host: "k8s-grafana.hmallleasing.com"
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: grafana-svc
            port:
              number: 3000
       
[root@k8s-master01 05-grafana]# kubectl apply -f 03-grafana-ingress.yaml 
[root@k8s-master01 05-grafana]# kubectl get ingress -n monitoring
NAME              CLASS   HOSTS                          ADDRESS                         PORTS   AGE
alert-ingress     nginx   k8s-alert.hmallleasing.com     192.168.40.103,192.168.40.105   80      23h
grafana-ingress   nginx   k8s-grafana.hmallleasing.com   192.168.40.103,192.168.40.105   80      21s
prom-ingress      nginx   k8s-prom.hmallleasing.com      192.168.40.103,192.168.40.105   80      23h

2. Open the Grafana web UI

image-20240311211754517

# 4.4 Connecting Grafana to Prometheus

1. Click Connections --> Add new connection, search for Prometheus, and add it as a data source

image-20240311212003641

2. Click test and save

image-20240311212027200

# 5. Deploying Blackbox Exporter to Kubernetes

  • 1. Create a ConfigMap defining the probe modules in blackbox.yml;
  • 2. Deploy Blackbox Exporter with a Deployment;
  • 3. Create a Service and an Ingress to publish Blackbox externally;
# 5.1 Creating the Blackbox Configuration

1. Create a ConfigMap defining the Blackbox probe methods; resource name: blackbox-configs

[root@k8s-master01 06-blackbox]# cat 01-blackbox-configs-configmap.yaml 
apiVersion: v1
kind: ConfigMap
metadata:
  name: blackbox-configs
  namespace: monitoring
data:
  blackbox.yml: |-
    modules:
      # HTTP probe module
      http_2xx:
        prober: http
        http:
          preferred_ip_protocol: "ip4"
          valid_http_versions: [ "HTTP/1.1", "HTTP/2.0" ]
      # HTTP POST probe module
      http_post_2xx:
        prober: http
        http:
          method: POST
          preferred_ip_protocol: "ip4"
          valid_http_versions: [ "HTTP/1.1", "HTTP/2.0" ]
    
      # TCP probe module
      tcp_connect:
        prober: tcp
        timeout: 5s
    
      # ICMP probe module
      icmp:
        prober: icmp
        timeout: 5s
        icmp:
          preferred_ip_protocol: "ip4"
    
      # DNS probe module
      dns_tcp:  
        prober: dns
        dns:
          transport_protocol: "tcp"
          preferred_ip_protocol: "ip4"
          query_name: "kubernetes.default.svc.cluster.local"

      # SSH probe module
      ssh_banner:
        prober: tcp
        tcp:
          query_response:
          - expect: "^SSH-2.0-"
          - send: "SSH-2.0-blackbox-ssh-check"

2. Create the ConfigMap

[root@k8s-master01 06-blackbox]# kubectl apply -f 01-blackbox-configs-configmap.yaml 
configmap/blackbox-configs created
[root@k8s-master01 06-blackbox]# kubectl get cm -n monitoring
NAME               DATA   AGE
alert-configs      1      23h
alert-template     1      23h
blackbox-configs   1      1s
kube-root-ca.crt   1      24h
prom-configs       1      23h
prom-rules         6      23h
# 5.2 Deploying the Blackbox Service

Write the Blackbox deployment manifest

[root@k8s-master01 06-blackbox]# cat 02-blackbox-deployment.yaml 
apiVersion: apps/v1
kind: Deployment
metadata:
  name: blackbox
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: blackbox
  template:
    metadata:
      labels:
        app: blackbox
    spec:
      volumes:
      - name: blackbox-cfg
        configMap:
          name: blackbox-configs
      containers:
      - name: blackbox
        image: prom/blackbox-exporter:v0.24.0
        args:
        - "--web.listen-address=:9115"
        - "--config.file=/etc/blackbox_exporter/blackbox.yml"
        volumeMounts:
        - name: blackbox-cfg
          mountPath: /etc/blackbox_exporter
        ports:
        - containerPort: 9115

[root@k8s-master01 06-blackbox]# kubectl apply -f 02-blackbox-deployment.yaml
[root@k8s-master01 06-blackbox]# kubectl get pods -n monitoring
NAME                                READY   STATUS    RESTARTS            AGE
alertmanager-0                      1/1     Running   1 (20m ago)         23h
alertmanager-1                      1/1     Running   1 (<invalid> ago)   23h
alertmanager-2                      1/1     Running   1 (25m ago)         23h
blackbox-7c7c8db4f7-hqs4c           1/1     Running   0                   112s
grafana-0                           1/1     Running   0                   16m
prometheus-0                        1/1     Running   1 (19m ago)         23h
prometheus-1                        1/1     Running   1 (20m ago)         23h
webhook-dingding-6d68854649-r6smd   1/1     Running   1 (19m ago)         24h
webhook-wechat-54b5bbf677-rmr26     1/1     Running   1 (20m ago)         24h
# 5.3 Publishing the Blackbox Service

1. Create the Service

[root@k8s-master01 06-blackbox]# cat 03-blackbox-service.yaml 
apiVersion: v1
kind: Service
metadata:
  name: blackbox-svc
  namespace: monitoring
spec:
  selector:
    app: blackbox
  ports:
  - name: http
    port: 9115
    targetPort: 9115

[root@k8s-master01 06-blackbox]# kubectl apply -f 03-blackbox-service.yaml 

2. Create the Ingress

[root@k8s-master01 06-blackbox]# cat 04-blackbox-ingress.yaml 
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: blackbox-ingress
  namespace: monitoring
spec:
  ingressClassName: "nginx"
  rules:
  - host: "k8s-blackbox.hmallleasing.com"
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: blackbox-svc
            port:
              number: 9115

[root@k8s-master01 06-blackbox]# kubectl apply -f 04-blackbox-ingress.yaml 

3. Open the Blackbox web UI
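
Besides the web UI, the exporter can be exercised directly through its /probe endpoint by passing one of the modules defined in blackbox.yml plus a target. A sketch via the Ingress host, probing the in-cluster Prometheus Service published earlier:

# probe_success 1 in the output means the target passed the http_2xx module
curl "http://k8s-blackbox.hmallleasing.com/probe?module=http_2xx&target=http://prometheus-svc.monitoring:9090"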

image-20240311213230140

# 6. Monitoring Kubernetes Cluster Nodes

Monitoring the nodes of a Kubernetes cluster with Prometheus takes roughly the following steps:

  • 1. Deploy node_exporter with a DaemonSet;
  • 2. Use Prometheus's Kubernetes service discovery to automatically identify the Node objects in the cluster;
  • 3. Use relabeling to adjust the target address and port;
  • 4. Use relabeling to add some useful label dimensions to the nodes;
# 6.1 Deploying Node Exporter

Deploy node_exporter with a DaemonSet:

  • 1. Deploy Node Exporter with a DaemonSet, setting hostPID, hostIPC, and hostNetwork to true so the Node Exporter container can access the host's network, process, and IPC namespaces.
  • 2. Node Exporter must run on every node, including the masters, so add tolerations to the DaemonSet's Pod spec to allow scheduling onto tainted nodes.
  • 3. Use volumeMounts to mount the host's /proc, /sys, and root directory / into the corresponding locations in the Node Exporter container, so that it can read the host's system information directly.
  • 4. Finally, point Node Exporter's startup arguments at the mounted /proc, /sys, etc. paths so it reads the host's data correctly.
[root@k8s-master01 07-node_exporter]# cat 01-node-exporter-daemonset.yaml 
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: monitoring
spec:
  selector:
    matchLabels:
      app: node-exporter
  template:
    metadata:
      labels:
        app: node-exporter
    spec:
      hostPID: true
      hostIPC: true
      hostNetwork: true
      # Tolerations
      tolerations:
      - key: node-role.kubernetes.io/control-plane
        operator: "Exists"
      volumes:
      - name: proc
        hostPath:
          path: /proc
      - name: sys
        hostPath:
          path: /sys
      - name: root
        hostPath:
          path: /    # the host's root filesystem, read via --path.rootfs
      containers:
      - name: node
        image: prom/node-exporter:v1.7.0
        args:
        - "--web.listen-address=:9100"
        - "--web.max-requests=40"
        - "--collector.mountstats"
        - "--collector.systemd"
        - "--collector.ethtool"
        - "--collector.tcpstat"
        - "--path.procfs=/host/proc"
        - "--path.sysfs=/host/sys"
        - "--path.rootfs=/host/root"
        volumeMounts:
        - name: proc
          mountPath: /host/proc
        - name: sys
          mountPath: /host/sys
        - name: root
          mountPath: /host/root
        ports:
        - containerPort: 9100
        resources:
          requests:
            cpu: 200m
            memory: 200Mi
          limits:
            cpu: 200m
            memory: 200Mi

2. Check the Node Exporter deployment

[root@k8s-master01 07-node_exporter]# kubectl apply -f 01-node-exporter-daemonset.yaml 
[root@k8s-master01 07-node_exporter]# kubectl get pods -n monitoring -o wide|grep node-exporter
node-exporter-55bsf                 1/1     Running   0                   2m48s   192.168.40.103   k8s-master03   <none>           <none>
node-exporter-5ldbs                 1/1     Running   0                   2m48s   192.168.40.104   k8s-node01     <none>           <none>
node-exporter-dj269                 1/1     Running   0                   2m48s   192.168.40.101   k8s-master01   <none>           <none>
node-exporter-kqdmg                 1/1     Running   0                   2m48s   192.168.40.102   k8s-master02   <none>           <none>
node-exporter-ml2sk                 1/1     Running   0                   2m48s   192.168.40.105   k8s-node02     <none>           <none>
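
Because the DaemonSet runs with hostNetwork: true, every exporter listens on its node's own IP at port 9100 and can be spot-checked from any machine that reaches the nodes:

# Spot-check one node's metrics endpoint (IP taken from the listing above)
curl -s http://192.168.40.101:9100/metrics | head -5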
# 6.2 Configuring Prometheus to Monitor Nodes

1. In the Prometheus configuration file, use the kubernetes_sd_configs block with the service-discovery role set to node. Prometheus will then automatically discover all the Nodes in the Kubernetes cluster and fetch their metadata.

[root@k8s-master01 04-prometheus]# cat  01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-configs
  namespace: monitoring
data:
  prometheus.yml: |-
    global:
      scrape_interval: 15s
      scrape_timeout:  15s
    
    # Alerting endpoint (the AlertManager load-balanced address is all that is needed)
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["alertmanager-svc:9093"]
    
    # Alerting rule files
    rule_files:
      - "/etc/prometheus/rules/*.yml"

    scrape_configs:
      - job_name: "prometheus"
        metrics_path: "/metrics"
        static_configs:
        - targets: ["localhost:9090"]
      # Monitor the Kubernetes nodes
      - job_name: "kube-nodes"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: node

[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml

2. Check that the configuration was updated

[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
  scrape_interval: 15s
  scrape_timeout:  15s

# Alerting endpoint (the AlertManager load-balanced address is all that is needed)
alerting:
  alertmanagers:
  - static_configs:
    - targets: ["alertmanager-svc:9093"]

# Alerting rule files
rule_files:
  - "/etc/prometheus/rules/*.yml"

scrape_configs:
  - job_name: "prometheus"
    metrics_path: "/metrics"
    static_configs:
    - targets: ["localhost:9090"]
  # Monitor the Kubernetes nodes
  - job_name: "kube-nodes"
    metrics_path: "/metrics"
    kubernetes_sd_configs:
    - role: node

3. Reload Prometheus, then check the Prometheus targets

# Make sure k8s-prom.hmallleasing.com resolves
[root@k8s-master01 04-prometheus]# cat /etc/hosts
127.0.0.1   localhost localhost.localdomain localhost4 localhost4.localdomain4
::1         localhost localhost.localdomain localhost6 localhost6.localdomain6
192.168.40.101 k8s-master01
192.168.40.102 k8s-master02
192.168.40.103 k8s-master03 k8s-prom.hmallleasing.com
192.168.40.100 k8s-master-lb # if this is not an HA cluster, this IP is Master01's IP
192.168.40.104 k8s-node01
192.168.40.105 k8s-node02
[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload

image-20240311222953117

# 6.3 Rewriting the Scrape Port with relabel

1. Modify the configuration again, using relabeling to rewrite port 10250 to port 9100

[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-configs
  namespace: monitoring
data:
  prometheus.yml: |-
    global:
      scrape_interval: 15s
      scrape_timeout:  15s
    
    # 告警地址(填写AlertManager的负载均衡地址即可)
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["alertmanager-svc:9093"]
    
    # 告警规则文件
    rule_files:
      - "/etc/prometheus/rules/*.yml"

    scrape_configs:
      - job_name: "prometheus"
        metrics_path: "/metrics"
        static_configs:
        - targets: ["localhost:9090"]
      # 监控Kubernetes的节点
      - job_name: "kube-nodes"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: node
        relabel_configs:
        - source_labels: ["__address__"]
          regex: "(.*):10250"
          replacement: "$1:9100"
          target_label: __address__
          action: replace
          
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml

2、Verify that the configuration was updated

[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
  scrape_interval: 15s
  scrape_timeout:  15s

# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
  alertmanagers:
  - static_configs:
    - targets: ["alertmanager-svc:9093"]

# 告警规则文件
rule_files:
  - "/etc/prometheus/rules/*.yml"

scrape_configs:
  - job_name: "prometheus"
    metrics_path: "/metrics"
    static_configs:
    - targets: ["localhost:9090"]
  # 监控Kubernetes的节点
  - job_name: "kube-nodes"
    metrics_path: "/metrics"
    kubernetes_sd_configs:
    - role: node
    relabel_configs:
    - source_labels: ["__address__"]
      regex: "(.*):10250"
      replacement: "$1:9100"
      target_label: __address__
      action: replace

3、Reload Prometheus, then check the node targets: the port has changed to 9100 and the Kubernetes nodes are monitored correctly

[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload


# 6.4 Using relabel to Add New Labels to Nodes

Inspecting the nodes' metadata labels reveals several labels worth keeping, so that the monitoring data carries more dimensions and context.

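labelmap takes the part of the metadata label name captured by the regex and turns it into a new label name, copying the value over. A hypothetical before/after illustration using the standard kubernetes.io/hostname node label (service discovery sanitizes it to underscores):

# discovered metadata label on a node target:
__meta_kubernetes_node_label_kubernetes_io_hostname="k8s-node01"
# after labelmap with regex __meta_kubernetes_node_label_(.*):
kubernetes_io_hostname="k8s-node01"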

1、Modify the Prometheus configuration file, using labelmap to map these metadata labels onto each node.

[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-configs
  namespace: monitoring
data:
  prometheus.yml: |-
    global:
      scrape_interval: 15s
      scrape_timeout:  15s
    
    # 告警地址(填写AlertManager的负载均衡地址即可)
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["alertmanager-svc:9093"]
    
    # 告警规则文件
    rule_files:
      - "/etc/prometheus/rules/*.yml"

    scrape_configs:
      - job_name: "prometheus"
        metrics_path: "/metrics"
        static_configs:
        - targets: ["localhost:9090"]
      # 监控Kubernetes的节点
      - job_name: "kube-nodes"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: node
        relabel_configs:
        - source_labels: ["__address__"]
          regex: "(.*):10250"
          replacement: "$1:9100"
          target_label: __address__
          action: replace
        - regex: __meta_kubernetes_node_label_(.*)
          replacement: $1
          action: labelmap
          
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml

2、Verify that the configuration was updated

[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
  scrape_interval: 15s
  scrape_timeout:  15s

# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
  alertmanagers:
  - static_configs:
    - targets: ["alertmanager-svc:9093"]

# 告警规则文件
rule_files:
  - "/etc/prometheus/rules/*.yml"

scrape_configs:
  - job_name: "prometheus"
    metrics_path: "/metrics"
    static_configs:
    - targets: ["localhost:9090"]
  # 监控Kubernetes的节点
  - job_name: "kube-nodes"
    metrics_path: "/metrics"
    kubernetes_sd_configs:
    - role: node
    relabel_configs:
    - source_labels: ["__address__"]
      regex: "(.*):10250"
      replacement: "$1:9100"
      target_label: __address__
      action: replace
    - regex: __meta_kubernetes_node_label_(.*)
      replacement: $1
      action: labelmap

3、Reload Prometheus, then verify that the metadata has been attached to the nodes' labels.

[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload


# 6.5 Import a Node Dashboard

Dashboard ID: 16098

# 7、Monitoring Kubernetes Control-Plane Components

# 7.1 Monitoring the Control-Plane Components

To monitor the Kubernetes control plane, first determine which components to monitor, then understand how they expose their Metrics, and finally decide how those metrics will be scraped: by manual configuration or by automatic discovery.

  • Key components: APIServer, ControllerManager, Scheduler, etcd, CoreDNS, kubelet, kube-proxy
  • Obtaining the Metrics: all of these control-plane components have built-in Metrics endpoints. Some of them, however, expose Metrics only on the local interface (127.0.0.1) by default, so their configuration must be changed to make the Metrics reachable remotely. The default ports are summarized below.
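
For reference, the default Metrics ports of these components in recent Kubernetes versions are listed below (verify against your own cluster, since distributions differ):

  • APIServer: 6443/metrics (HTTPS)
  • ControllerManager: 10257/metrics (HTTPS)
  • Scheduler: 10259/metrics (HTTPS)
  • etcd: 2381/metrics (HTTP, via --listen-metrics-urls)
  • kubelet: 10250/metrics (HTTPS)
  • kube-proxy: 10249/metrics (HTTP)
  • CoreDNS: 9153/metrics (HTTP)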
# 7.2 Strategies for Monitoring Control-Plane Components

There are two main ways to monitor these components:

  • 1、Manual configuration: list the address and port of every component service in the Prometheus configuration file. This is obviously tedious and costly to maintain.
  • 2、Automated monitoring: use Kubernetes service discovery to find service instances dynamically. Two approaches are common:

Automated discovery options:

  • 1、Endpoints-based discovery: automatically discover all endpoints, then keep only the qualifying endpoint instances via relabel matching.
  • 2、Pod-based discovery: automatically discover all Pods, then keep only the Pods whose labels qualify via relabel matching.

Notes:

1、Endpoints-based discovery requires the monitored target to have a corresponding Service; otherwise no endpoints can be obtained. Pod-based discovery does not depend on a Service existing.

2、Whichever discovery method is used, the desired targets must ultimately be filtered by their labels via relabel rules, so knowing the labels of the targets you want to monitor is essential. A minimal sketch of both roles follows.
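
A minimal sketch contrasting the two discovery roles (the job names and the some-component label value are placeholders, not taken from this cluster):

# endpoints-based discovery: requires a Service; filter on Service labels
- job_name: "example-endpoints"
  kubernetes_sd_configs:
  - role: endpoints
  relabel_configs:
  - source_labels: ["__meta_kubernetes_service_label_component"]
    regex: "some-component"
    action: keep

# pod-based discovery: no Service needed; filter on Pod labels
- job_name: "example-pods"
  kubernetes_sd_configs:
  - role: pod
  relabel_configs:
  - source_labels: ["__meta_kubernetes_pod_label_component"]
    regex: "some-component"
    action: keep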

# 7.3 Monitoring the APIServer
# 7.3.1 Obtaining the APIServer Metrics

1、The APIServer serves metrics over HTTPS on port 6443 at the /metrics endpoint.

[root@k8s-master01 ~]# netstat -lntp|grep 6443
tcp        0      0 127.0.0.1:16443         0.0.0.0:*               LISTEN      1126/haproxy        
tcp        0      0 0.0.0.0:16443           0.0.0.0:*               LISTEN      1126/haproxy        
tcp6       0      0 :::6443                 :::*                    LISTEN      2301/kube-apiserver 

2、The APIServer has a corresponding Service, so we use endpoints-based discovery. We therefore need the labels of the APIServer Service so that Prometheus scrapes only the Pod instances behind it.

[root@k8s-master01 ~]#  kubectl describe service -n default kubernetes
Name:              kubernetes
Namespace:         default
Labels:            component=apiserver
                   provider=kubernetes
Annotations:       <none>
Selector:          <none>
Type:              ClusterIP
IP Family Policy:  SingleStack
IP Families:       IPv4
IP:                10.96.0.1
IPs:               10.96.0.1
Port:              https  443/TCP
TargetPort:        6443/TCP
Endpoints:         192.168.40.101:6443,192.168.40.102:6443,192.168.40.103:6443
Session Affinity:  None
Events:            <none>
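
Before wiring this into Prometheus, the endpoint can be verified by hand with a ServiceAccount token, the same pattern this document uses later for the ControllerManager (this assumes the prometheus-sa ServiceAccount exists and is bound to a role that allows GET on the /metrics nonResourceURL):

[root@k8s-master01 ~]# TOKEN=$(kubectl create token -n monitoring prometheus-sa)
[root@k8s-master01 ~]# curl -s -k -H "Authorization: Bearer $TOKEN" https://192.168.40.101:6443/metrics | head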
# 7.3.2 Configure Prometheus to Monitor the APIServer
  • 1、Add a new job named kube-apiserver; the metrics path is /metrics and the scheme is https
  • 2、Use Kubernetes endpoints-based automatic discovery; because the APIServer uses HTTPS, TLS settings must also be supplied;
  • 3、Use relabel_configs to keep only targets whose __meta_kubernetes_service_label_component label has the value apiserver. (This matches across all namespaces, so the search scope is broader.)
  • 4、Or use relabel_configs to keep only instances where __meta_kubernetes_namespace=default, __meta_kubernetes_service_name=kubernetes, and __meta_kubernetes_endpoint_port_name=https (pinning the namespace, Service name, and port makes the match more precise; a sketch follows)
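
A sketch of the stricter variant from point 4 (when several source_labels are listed, their values are joined with ";" by default, hence the regex):

- source_labels: ["__meta_kubernetes_namespace", "__meta_kubernetes_service_name", "__meta_kubernetes_endpoint_port_name"]
  regex: "default;kubernetes;https"
  action: keep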

1、Modify the Prometheus configuration

[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-configs
  namespace: monitoring
data:
  prometheus.yml: |-
    global:
      scrape_interval: 15s
      scrape_timeout:  15s
    
    # 告警地址(填写AlertManager的负载均衡地址即可)
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["alertmanager-svc:9093"]
    
    # 告警规则文件
    rule_files:
      - "/etc/prometheus/rules/*.yml"

    scrape_configs:
      - job_name: "prometheus"
        metrics_path: "/metrics"
        static_configs:
        - targets: ["localhost:9090"]

      # 监控Kubernetes的节点
      - job_name: "kube-nodes"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: node
        relabel_configs:
        - source_labels: ["__address__"]
          regex: "(.*):10250"
          replacement: "$1:9100"
          target_label: __address__
          action: replace
        - regex: __meta_kubernetes_node_label_(.*)
          replacement: $1
          action: labelmap

      # 监控APIServer
      - job_name: "kube-apiserver"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: endpoints
        scheme: https
        tls_config:
          insecure_skip_verify: true   # 跳过证书验证
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        # 标签重写
        relabel_configs:
        - source_labels: ["__meta_kubernetes_service_label_component"]  #保留label为apiserver实例
          regex: "apiserver"
          action: "keep"
        - source_labels: ["__meta_kubernetes_namespace"]        #匹配__meta_kubernetes_namespace值,并赋值给namespace
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_service_name"]    #__meta_kubernetes_service_name值并赋值给service_name
          regex: "(.*)"
          replacement: $1
          target_label: service_name
        - regex: __meta_kubernetes_service_label_(.*)        #通过值映射获取标签
          replacement: $1
          action: labelmap
          
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml

2、Verify that the configuration was updated

[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
  scrape_interval: 15s
  scrape_timeout:  15s

# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
  alertmanagers:
  - static_configs:
    - targets: ["alertmanager-svc:9093"]

# 告警规则文件
rule_files:
  - "/etc/prometheus/rules/*.yml"

scrape_configs:
  - job_name: "prometheus"
    metrics_path: "/metrics"
    static_configs:
    - targets: ["localhost:9090"]

  # 监控Kubernetes的节点
  - job_name: "kube-nodes"
    metrics_path: "/metrics"
    kubernetes_sd_configs:
    - role: node
    relabel_configs:
    - source_labels: ["__address__"]
      regex: "(.*):10250"
      replacement: "$1:9100"
      target_label: __address__
      action: replace
    - regex: __meta_kubernetes_node_label_(.*)
      replacement: $1
      action: labelmap

  # 监控APIServer
  - job_name: "kube-apiserver"
    metrics_path: "/metrics"
    kubernetes_sd_configs:
    - role: endpoints
    scheme: https
    tls_config:
      insecure_skip_verify: true   # 跳过证书验证
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    # 标签重写
    relabel_configs:
    - source_labels: ["__meta_kubernetes_service_label_component"]  #保留label为apiserver实例
      regex: "apiserver"
      action: "keep"
    - source_labels: ["__meta_kubernetes_namespace"]        #匹配__meta_kubernetes_namespace值,并赋值给namespace
      regex: "(.*)"
      replacement: $1
      target_label: namespace
    - source_labels: ["__meta_kubernetes_service_name"]    #__meta_kubernetes_service_name值并赋值给service_name
      regex: "(.*)"
      replacement: $1
      target_label: service_name
    - regex: __meta_kubernetes_service_label_(.*)        #通过值映射获取标签
      replacement: $1
      action: labelmap

3、Reload Prometheus, then check the targets; the APIServer targets are scraped correctly

[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload

4、Check the final result


# 7.3.3 APIServer Alerting Rules

1、Edit the Prometheus alerting-rules file

  kube_apiserver_rules.yml: |-
    groups:
    - name: APIServer告警规则
      rules:
      - alert: APIServer请求错误率过高
        expr: |
          sum by (instance, namespace, job, group, code, resource, verb,subresource) (rate(apiserver_request_total{code=~"5..|4.."}[5m])) 
          /
          sum by (instance, namespace, job, group, code, resource, verb,subresource) (rate(apiserver_request_total[5m])) * 100  > 10
        for: 5m 
        labels:
          severity: critical
        annotations:
          summary: "APIServer请求错误率超过10%"
          description: "APIServer实例 {{ $labels.instance }} 在命名空间 {{ $labels.namespace }} 中的 {{ $labels.group }} 组中 {{ $labels.resource }} 类型请求错误率超过10%。当前错误率: {{ $value }}%,请求类型: {{ $labels.verb }},状态码: {{ $labels.code }}。"
    
      - alert: APIServer Mutating请求负载过高
        expr: avg_over_time(apiserver_current_inflight_requests{request_kind="mutating"}[5m]) > (400 * 0.8)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "APIServer Mutating请求负载过高"
          description: "APIServer处理变更性请求的平均负载超过了最大限制的80%。当前负载: {{ $value }}。"
    
      - alert: APIServer ReadOnly请求负载过高
        expr: avg_over_time(apiserver_current_inflight_requests{request_kind="readOnly"}[5m]) > (800 * 0.8)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "APIServer ReadOnly请求负载过高"
          description: "APIServer处理只读请求的平均负载超过了最大限制的80%。当前负载: {{ $value }},实例: {{ $labels.instance }},命名空间: {{ $labels.namespace }}。"
      
      - alert: APIServer平均延迟过高
        expr: |
          rate(apiserver_request_duration_seconds_sum{verb!="WATCH"}[5m])
          /
          rate(apiserver_request_duration_seconds_count{verb!="WATCH"}[5m]) > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "APIServer平均延迟过高"
          description: "APIServer实例 {{ $labels.instance }} 对资源 {{ $labels.resource }} 的 {{ $labels.verb }} 请求的平均延迟超过5秒。当前平均延迟: {{ $value }}秒。"
          
[root@k8s-master01 04-prometheus]# kubectl apply -f 02-prom-rules-configmap.yaml 

2、Verify that the configuration was updated

[root@k8s-master01 04-prometheus]#  kubectl exec -it prometheus-0 -n monitoring -- ls /etc/prometheus/rules/
[root@k8s-master01 04-prometheus]#  kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/rules/kube_apiserver_rules.yml
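
Optionally, the rule file can be validated with promtool before reloading (the official prom/prometheus image ships promtool; adjust the command if your image differs):

[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- promtool check rules /etc/prometheus/rules/kube_apiserver_rules.yml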

3、Reload Prometheus

[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload

4、Check the alerting rules


# 7.4 Monitoring Core Components - ControllerManager
# 7.4.1 Obtaining the ControllerManager Metrics

The Controller Manager serves metrics on port 10257 at /metrics. By default its Metrics interface listens only on the local address (127.0.0.1), so the bind address must be changed from --bind-address=127.0.0.1 to --bind-address=0.0.0.0.
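
One way to apply the change (a sketch; run it on one master at a time, and note that the kubelet recreates the static Pod automatically once the manifest changes):

[root@k8s-master01 ~]# sed -i 's/--bind-address=127.0.0.1/--bind-address=0.0.0.0/' /etc/kubernetes/manifests/kube-controller-manager.yaml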

1、Edit the ControllerManager Pod manifest. All master nodes must be changed; in production change them one at a time to avoid making the service unavailable.

[root@k8s-master01 ~]#  cat /etc/kubernetes/manifests/kube-controller-manager.yaml
apiVersion: v1
kind: Pod
metadata:
  creationTimestamp: null
  labels:
    component: kube-controller-manager
    tier: control-plane
  name: kube-controller-manager
  namespace: kube-system
spec:
  containers:
  - command:
    - kube-controller-manager
    - --allocate-node-cidrs=true
    - --authentication-kubeconfig=/etc/kubernetes/controller-manager.conf
    - --authorization-kubeconfig=/etc/kubernetes/controller-manager.conf
    - --bind-address=0.0.0.0
    - --client-ca-file=/etc/kubernetes/pki/ca.crt
...

2、In Kubernetes 1.28, the default configuration no longer allows HTTP access to the ControllerManager metrics endpoint, so HTTPS must be used. To test access to the Controller Manager Metrics locally, we can use the existing prometheus-sa service account to create a token and then query the Metrics.

[root@k8s-master01 ~]# TOKEN=$(kubectl create token -n monitoring prometheus-sa)
[root@k8s-master01 ~]# curl -s -k -H "Authorization: Bearer $TOKEN" https://192.168.40.101:10257/metrics |grep kube-controller
leader_election_master_status{name="kube-controller-manager"} 1
running_managed_controllers{manager="kube-controller-manager",name="nodeipam"} 1

3、The Controller Manager has no Service, so we match on the labels of the corresponding Pods directly, letting Prometheus scrape only the Pod instances serving the Controller Manager.

# Check the labels of kube-controller-manager
[root@k8s-master01 ~]# kubectl describe pod -n kube-system kube-controller-manager-k8s-master01|grep -i label
Labels:               component=kube-controller-manager
# 7.4.2 Configure Prometheus to Monitor the ControllerManager
  • 1、Add a new job named kube-controller; the metrics path is /metrics and the scheme is https
  • 2、Use Kubernetes Pod-based automatic discovery; because the ControllerManager uses HTTPS, TLS settings must also be supplied;
  • 3、Use relabel_configs to keep only targets whose __meta_kubernetes_pod_label_component label has the value kube-controller-manager

1、Modify the Prometheus configuration

[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-configs
  namespace: monitoring
data:
  prometheus.yml: |-
    global:
      scrape_interval: 15s
      scrape_timeout:  15s
    
    # 告警地址(填写AlertManager的负载均衡地址即可)
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["alertmanager-svc:9093"]
    
    # 告警规则文件
    rule_files:
      - "/etc/prometheus/rules/*.yml"

    scrape_configs:
      - job_name: "prometheus"
        metrics_path: "/metrics"
        static_configs:
        - targets: ["localhost:9090"]

      # 监控Kubernetes的节点
      - job_name: "kube-nodes"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: node
        relabel_configs:
        - source_labels: ["__address__"]
          regex: "(.*):10250"
          replacement: "$1:9100"
          target_label: __address__
          action: replace
        - regex: __meta_kubernetes_node_label_(.*)
          replacement: $1
          action: labelmap

      # 监控APIServer
      - job_name: "kube-apiserver"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: endpoints
        scheme: https
        tls_config:
          insecure_skip_verify: true   # 跳过证书验证
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        # 标签重写
        relabel_configs:
        - source_labels: ["__meta_kubernetes_service_label_component"]  #保留label为apiserver实例
          regex: "apiserver"
          action: "keep"
        - source_labels: ["__meta_kubernetes_namespace"]        #匹配__meta_kubernetes_namespace值,并赋值给namespace
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_service_name"]    #__meta_kubernetes_service_name值并赋值给service_name
          regex: "(.*)"
          replacement: $1
          target_label: service_name
        - regex: __meta_kubernetes_service_label_(.*)        #通过标签映射获取标签
          replacement: $1
          action: labelmap

      # 监控controllerManager
      - job_name: "kube-controller"
        metrics_path: "/metrics"
        scheme: https
        kubernetes_sd_configs:
        - role: pod
        tls_config:
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        # 仅保留标签名是component 值为kube-controller-manager
        relabel_configs:
        - source_labels: ["__meta_kubernetes_pod_label_component"]
          regex: "kube-controller-manager"
          action: keep
       
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml

2、Verify that the configuration was updated

[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
  scrape_interval: 15s
  scrape_timeout:  15s

# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
  alertmanagers:
  - static_configs:
    - targets: ["alertmanager-svc:9093"]

# 告警规则文件
rule_files:
  - "/etc/prometheus/rules/*.yml"

scrape_configs:
  - job_name: "prometheus"
    metrics_path: "/metrics"
    static_configs:
    - targets: ["localhost:9090"]

  # 监控Kubernetes的节点
  - job_name: "kube-nodes"
    metrics_path: "/metrics"
    kubernetes_sd_configs:
    - role: node
    relabel_configs:
    - source_labels: ["__address__"]
      regex: "(.*):10250"
      replacement: "$1:9100"
      target_label: __address__
      action: replace
    - regex: __meta_kubernetes_node_label_(.*)
      replacement: $1
      action: labelmap

  # 监控APIServer
  - job_name: "kube-apiserver"
    metrics_path: "/metrics"
    kubernetes_sd_configs:
    - role: endpoints
    scheme: https
    tls_config:
      insecure_skip_verify: true   # 跳过证书验证
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    # 标签重写
    relabel_configs:
    - source_labels: ["__meta_kubernetes_service_label_component"]  #保留label为apiserver实例
      regex: "apiserver"
      action: "keep"
    - source_labels: ["__meta_kubernetes_namespace"]        #匹配__meta_kubernetes_namespace值,并赋值给namespace
      regex: "(.*)"
      replacement: $1
      target_label: namespace
    - source_labels: ["__meta_kubernetes_service_name"]    #__meta_kubernetes_service_name值并赋值给service_name
      regex: "(.*)"
      replacement: $1
      target_label: service_name
    - regex: __meta_kubernetes_service_label_(.*)        #通过标签映射获取标签
      replacement: $1
      action: labelmap


  # 监控controllerManager
  - job_name: "kube-controller"
    metrics_path: "/metrics"
    scheme: https
    kubernetes_sd_configs:
    - role: pod
    tls_config:
      insecure_skip_verify: true
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    # 仅保留标签名是component 值为kube-controller-manager
    relabel_configs:
    - source_labels: ["__meta_kubernetes_pod_label_component"]
      regex: "kube-controller-manager"
      action: keep

3、Reload Prometheus, then check the targets

[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload

4、Check the final result

The scraped ControllerManager addresses are correct, but the port is 443, the HTTPS default, rather than 10257, so we need to rewrite the port with another relabel rule (next section).
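
The reason: with role: pod, __address__ is built from the Pod IP plus a declared containerPort, and static Pods declare no ports, so the bare IP falls back to the scheme's default port at scrape time. Illustratively (addresses taken from this cluster; behavior as described in the Prometheus relabeling documentation):

# before the relabel rule added in 7.4.3:
__address__ = 192.168.40.101          -> scraped as https://192.168.40.101:443/metrics
# after replacing with $1:10257:
__address__ = 192.168.40.101:10257    -> scraped as https://192.168.40.101:10257/metrics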


# 7.4.3 Using relabel to Change the Scraped Pod Port

1、Modify the Prometheus configuration

[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-configs
  namespace: monitoring
data:
  prometheus.yml: |-
    global:
      scrape_interval: 15s
      scrape_timeout:  15s
    
    # 告警地址(填写AlertManager的负载均衡地址即可)
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["alertmanager-svc:9093"]
    
    # 告警规则文件
    rule_files:
      - "/etc/prometheus/rules/*.yml"

    scrape_configs:
      - job_name: "prometheus"
        metrics_path: "/metrics"
        static_configs:
        - targets: ["localhost:9090"]

      # 监控Kubernetes的节点
      - job_name: "kube-nodes"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: node
        relabel_configs:
        - source_labels: ["__address__"]
          regex: "(.*):10250"
          replacement: "$1:9100"
          target_label: __address__
          action: replace
        - regex: __meta_kubernetes_node_label_(.*)
          replacement: $1
          action: labelmap

      # 监控APIServer
      - job_name: "kube-apiserver"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: endpoints
        scheme: https
        tls_config:
          insecure_skip_verify: true   # 跳过证书验证
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        # 标签重写
        relabel_configs:
        - source_labels: ["__meta_kubernetes_service_label_component"]  #保留label为apiserver实例
          regex: "apiserver"
          action: "keep"
        - source_labels: ["__meta_kubernetes_namespace"]        #匹配__meta_kubernetes_namespace值,并赋值给namespace
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_service_name"]    #__meta_kubernetes_service_name值并赋值给service_name
          regex: "(.*)"
          replacement: $1
          target_label: service_name
        - regex: __meta_kubernetes_service_label_(.*)        #通过标签映射获取标签
          replacement: $1
          action: labelmap


      # 监控controllerManager
      - job_name: "kube-controller"
        metrics_path: "/metrics"
        scheme: https
        kubernetes_sd_configs:
        - role: pod
        tls_config:
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        # 仅保留标签名是component 值为kube-controller-manager
        relabel_configs:
        - source_labels: ["__meta_kubernetes_pod_label_component"]
          regex: "kube-controller-manager"
          action: keep
        # 替换抓取的实例端口为10257
        - source_labels: [__address__]
          regex: (.*)
          replacement: $1:10257
          target_label: __address__
          
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml     

2、Verify that the configuration was updated

[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
  scrape_interval: 15s
  scrape_timeout:  15s

# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
  alertmanagers:
  - static_configs:
    - targets: ["alertmanager-svc:9093"]

# 告警规则文件
rule_files:
  - "/etc/prometheus/rules/*.yml"

scrape_configs:
  - job_name: "prometheus"
    metrics_path: "/metrics"
    static_configs:
    - targets: ["localhost:9090"]

  # 监控Kubernetes的节点
  - job_name: "kube-nodes"
    metrics_path: "/metrics"
    kubernetes_sd_configs:
    - role: node
    relabel_configs:
    - source_labels: ["__address__"]
      regex: "(.*):10250"
      replacement: "$1:9100"
      target_label: __address__
      action: replace
    - regex: __meta_kubernetes_node_label_(.*)
      replacement: $1
      action: labelmap

  # 监控APIServer
  - job_name: "kube-apiserver"
    metrics_path: "/metrics"
    kubernetes_sd_configs:
    - role: endpoints
    scheme: https
    tls_config:
      insecure_skip_verify: true   # 跳过证书验证
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    # 标签重写
    relabel_configs:
    - source_labels: ["__meta_kubernetes_service_label_component"]  #保留label为apiserver实例
      regex: "apiserver"
      action: "keep"
    - source_labels: ["__meta_kubernetes_namespace"]        #匹配__meta_kubernetes_namespace值,并赋值给namespace
      regex: "(.*)"
      replacement: $1
      target_label: namespace
    - source_labels: ["__meta_kubernetes_service_name"]    #__meta_kubernetes_service_name值并赋值给service_name
      regex: "(.*)"
      replacement: $1
      target_label: service_name
    - regex: __meta_kubernetes_service_label_(.*)        #通过标签映射获取标签
      replacement: $1
      action: labelmap


  # 监控controllerManager
  - job_name: "kube-controller"
    metrics_path: "/metrics"
    scheme: https
    kubernetes_sd_configs:
    - role: pod
    tls_config:
      insecure_skip_verify: true
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    # 仅保留标签名是component 值为kube-controller-manager
    relabel_configs:
    - source_labels: ["__meta_kubernetes_pod_label_component"]
      regex: "kube-controller-manager"
      action: keep
    # 替换抓取的实例端口为10257
    - source_labels: [__address__]
      regex: (.*)
      replacement: $1:10257
      target_label: __address__

3、Reload Prometheus, then check the targets; the ControllerManager is now scraped correctly

[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload

4、Check the final result


# 7.4.4 Using relabel to Add New Labels to Pods

From the Pod metadata labels we want to keep two dimensions: __meta_kubernetes_namespace and __meta_kubernetes_pod_name; an example of the resulting series labels is sketched below.
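
With these rules in place, the scraped series should carry labels roughly like the following (a hypothetical example for one master, using the names from this cluster):

up{job="kube-controller", instance="192.168.40.101:10257", namespace="kube-system", pod_name="kube-controller-manager-k8s-master01"}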

1、Modify the Prometheus configuration

[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-configs
  namespace: monitoring
data:
  prometheus.yml: |-
    global:
      scrape_interval: 15s
      scrape_timeout:  15s
    
    # 告警地址(填写AlertManager的负载均衡地址即可)
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["alertmanager-svc:9093"]
    
    # 告警规则文件
    rule_files:
      - "/etc/prometheus/rules/*.yml"

    scrape_configs:
      - job_name: "prometheus"
        metrics_path: "/metrics"
        static_configs:
        - targets: ["localhost:9090"]

      # 监控Kubernetes的节点
      - job_name: "kube-nodes"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: node
        relabel_configs:
        - source_labels: ["__address__"]
          regex: "(.*):10250"
          replacement: "$1:9100"
          target_label: __address__
          action: replace
        - regex: __meta_kubernetes_node_label_(.*)
          replacement: $1
          action: labelmap

      # 监控APIServer
      - job_name: "kube-apiserver"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: endpoints
        scheme: https
        tls_config:
          insecure_skip_verify: true   # 跳过证书验证
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        # 标签重写
        relabel_configs:
        - source_labels: ["__meta_kubernetes_service_label_component"]  #保留label为apiserver实例
          regex: "apiserver"
          action: "keep"
        - source_labels: ["__meta_kubernetes_namespace"]        #匹配__meta_kubernetes_namespace值,并赋值给namespace
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_service_name"]    #__meta_kubernetes_service_name值并赋值给service_name
          regex: "(.*)"
          replacement: $1
          target_label: service_name
        - regex: __meta_kubernetes_service_label_(.*)        #通过标签映射获取标签
          replacement: $1
          action: labelmap


      # 监控controllerManager
      - job_name: "kube-controller"
        metrics_path: "/metrics"
        scheme: https
        kubernetes_sd_configs:
        - role: pod
        tls_config:
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        # 仅保留标签名是component 值为kube-controller-manager
        relabel_configs:
        - source_labels: ["__meta_kubernetes_pod_label_component"]
          regex: "kube-controller-manager"
          action: keep
        # 替换抓取的实例端口为10257
        - source_labels: [__address__]
          regex: (.*)
          replacement: $1:10257
          target_label: __address__
        # 保留特定标签
        - source_labels: [__meta_kubernetes_namespace]
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: [__meta_kubernetes_pod_name]
          regex: "(.*)"
          replacement: $1
          target_label: pod_name


[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml

2、Verify that the configuration was updated

[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
  scrape_interval: 15s
  scrape_timeout:  15s

# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
  alertmanagers:
  - static_configs:
    - targets: ["alertmanager-svc:9093"]

# 告警规则文件
rule_files:
  - "/etc/prometheus/rules/*.yml"

scrape_configs:
  - job_name: "prometheus"
    metrics_path: "/metrics"
    static_configs:
    - targets: ["localhost:9090"]

  # 监控Kubernetes的节点
  - job_name: "kube-nodes"
    metrics_path: "/metrics"
    kubernetes_sd_configs:
    - role: node
    relabel_configs:
    - source_labels: ["__address__"]
      regex: "(.*):10250"
      replacement: "$1:9100"
      target_label: __address__
      action: replace
    - regex: __meta_kubernetes_node_label_(.*)
      replacement: $1
      action: labelmap

  # 监控APIServer
  - job_name: "kube-apiserver"
    metrics_path: "/metrics"
    kubernetes_sd_configs:
    - role: endpoints
    scheme: https
    tls_config:
      insecure_skip_verify: true   # 跳过证书验证
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    # 标签重写
    relabel_configs:
    - source_labels: ["__meta_kubernetes_service_label_component"]  #保留label为apiserver实例
      regex: "apiserver"
      action: "keep"
    - source_labels: ["__meta_kubernetes_namespace"]        #匹配__meta_kubernetes_namespace值,并赋值给namespace
      regex: "(.*)"
      replacement: $1
      target_label: namespace
    - source_labels: ["__meta_kubernetes_service_name"]    #__meta_kubernetes_service_name值并赋值给service_name
      regex: "(.*)"
      replacement: $1
      target_label: service_name
    - regex: __meta_kubernetes_service_label_(.*)        #通过标签映射获取标签
      replacement: $1
      action: labelmap


  # 监控controllerManager
  - job_name: "kube-controller"
    metrics_path: "/metrics"
    scheme: https
    kubernetes_sd_configs:
    - role: pod
    tls_config:
      insecure_skip_verify: true
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    # 仅保留标签名是component 值为kube-controller-manager
    relabel_configs:
    - source_labels: ["__meta_kubernetes_pod_label_component"]
      regex: "kube-controller-manager"
      action: keep
    # 替换抓取的实例端口为10257
    - source_labels: [__address__]
      regex: (.*)
      replacement: $1:10257
      target_label: __address__
    # 保留特定标签
    - source_labels: [__meta_kubernetes_namespace]
      regex: "(.*)"
      replacement: $1
      target_label: namespace
    - source_labels: [__meta_kubernetes_pod_name]
      regex: "(.*)"
      replacement: $1
      target_label: pod_name

3、Reload Prometheus, then check the targets; the new labels should now appear

[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload

4、Check the final result


# 7.5 Monitoring Core Components - Scheduler
# 7.5.1 Obtaining the Scheduler Metrics

The Scheduler serves metrics on port 10259 at /metrics. By default its Metrics interface listens only on the local address (127.0.0.1), so the bind address must be changed from --bind-address=127.0.0.1 to --bind-address=0.0.0.0.

[root@k8s-master01 ~]# netstat -lntp|grep 10259
tcp        0      0 127.0.0.1:10259         0.0.0.0:*               LISTEN      1762/kube-scheduler 

1、Edit the Scheduler Pod manifest. All three masters must be changed; in production do not change them all at the same time, to avoid making the service unavailable.

[root@k8s-master01 ~]# cat /etc/kubernetes/manifests/kube-scheduler.yaml 
apiVersion: v1
kind: Pod
metadata:
  creationTimestamp: null
  labels:
    component: kube-scheduler
    tier: control-plane
  name: kube-scheduler
  namespace: kube-system
spec:
  containers:
  - command:
    - kube-scheduler
    - --authentication-kubeconfig=/etc/kubernetes/scheduler.conf
    - --authorization-kubeconfig=/etc/kubernetes/scheduler.conf
    - --bind-address=0.0.0.0
    - --kubeconfig=/etc/kubernetes/scheduler.conf
...

2、To test access to the Scheduler Metrics locally, we can use the existing prometheus-sa service account to create a token and then query the Scheduler Metrics.

[root@k8s-master01 ~]# TOKEN=$(kubectl create token -n monitoring prometheus-sa)
[root@k8s-master01 ~]# curl -s -k -H "Authorization: Bearer $TOKEN" https://192.168.40.101:10259/metrics |grep kube-scheduler
leader_election_master_status{name="kube-scheduler"} 1

3、The Scheduler has no Service, so we match on the labels of the corresponding Pods directly, letting Prometheus scrape only the Pod instances serving the Scheduler.

[root@k8s-master01 ~]# kubectl describe pod kube-scheduler-k8s-master01  -n kube-system
Name:                 kube-scheduler-k8s-master01
Namespace:            kube-system
Priority:             2000001000
Priority Class Name:  system-node-critical
Node:                 k8s-master01/192.168.40.101
Start Time:           Sun, 17 Mar 2024 13:59:17 +0800
Labels:               component=kube-scheduler
                      tier=control-plane
...
# 7.5.2 Configure Prometheus to Monitor the Scheduler
  • 1、Add a new job named kube-scheduler; the metrics path is /metrics and the scheme is https
  • 2、Use Kubernetes Pod-based automatic discovery; because the Scheduler uses HTTPS, TLS settings must also be supplied;
  • 3、Use relabel_configs to keep only targets whose __meta_kubernetes_pod_label_component label has the value kube-scheduler
  • 4、Use relabel_configs to rewrite the scraped Pod port to 10259; by default, automatically discovered Pod targets use port 80 for http and 443 for https;
  • 5、Use relabel_configs to keep the __meta_kubernetes_namespace and __meta_kubernetes_pod_name label dimensions.

1、Modify the Prometheus configuration

[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-configs
  namespace: monitoring
data:
  prometheus.yml: |-
    global:
      scrape_interval: 15s
      scrape_timeout:  15s
    
    # 告警地址(填写AlertManager的负载均衡地址即可)
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["alertmanager-svc:9093"]
    
    # 告警规则文件
    rule_files:
      - "/etc/prometheus/rules/*.yml"

    scrape_configs:
      - job_name: "prometheus"
        metrics_path: "/metrics"
        static_configs:
        - targets: ["localhost:9090"]

      # 监控Kubernetes的节点
      - job_name: "kube-nodes"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: node
        relabel_configs:
        - source_labels: ["__address__"]
          regex: "(.*):10250"
          replacement: "$1:9100"
          target_label: __address__
          action: replace
        - regex: __meta_kubernetes_node_label_(.*)
          replacement: $1
          action: labelmap

      # 监控APIServer
      - job_name: "kube-apiserver"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: endpoints
        scheme: https
        tls_config:
          insecure_skip_verify: true   # 跳过证书验证
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        # 标签重写
        relabel_configs:
        - source_labels: ["__meta_kubernetes_service_label_component"]  #保留label为apiserver实例
          regex: "apiserver"
          action: "keep"
        - source_labels: ["__meta_kubernetes_namespace"]        #匹配__meta_kubernetes_namespace值,并赋值给namespace
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_service_name"]    #__meta_kubernetes_service_name值并赋值给service_name
          regex: "(.*)"
          replacement: $1
          target_label: service_name
        - regex: __meta_kubernetes_service_label_(.*)        #通过标签映射获取标签
          replacement: $1
          action: labelmap


      # 监控controllerManager
      - job_name: "kube-controller"
        metrics_path: "/metrics"
        scheme: https
        kubernetes_sd_configs:
        - role: pod
        tls_config:
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        # 仅保留标签名是component 值为kube-controller-manager
        relabel_configs:
        - source_labels: ["__meta_kubernetes_pod_label_component"]
          regex: "kube-controller-manager"
          action: keep
        # 替换抓取的实例端口为10257
        - source_labels: [__address__]
          regex: (.*)
          replacement: $1:10257
          target_label: __address__
        # 保留特定标签
        - source_labels: [__meta_kubernetes_namespace]
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: [__meta_kubernetes_pod_name]
          regex: "(.*)"
          replacement: $1
          target_label: pod_name

      # 监控Scheduler
      - job_name: "kube-schduler"
        metrics_path: "/metrics"
        scheme: https
        kubernetes_sd_configs:
        - role: pod
        tls_config:
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

        # 基于标签进行过滤
        relabel_configs:
        - source_labels: ["__meta_kubernetes_pod_label_component"]
          regex: "kube-scheduler"
          action: keep

        # 修订抓取的端口
        - source_labels: ["__address__"]
          regex: (.*)
          replacement: $1:10259
          target_label: __address__

        # 添加维度标签
        - source_labels: ["__meta_kubernetes_namespace"]
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_pod_name"]
          regex: "(.*)"
          replacement: $1
          target_label: pod_name

[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml

2、Verify that the configuration was updated

[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
  scrape_interval: 15s
  scrape_timeout:  15s

# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
  alertmanagers:
  - static_configs:
    - targets: ["alertmanager-svc:9093"]

# 告警规则文件
rule_files:
  - "/etc/prometheus/rules/*.yml"

scrape_configs:
  - job_name: "prometheus"
    metrics_path: "/metrics"
    static_configs:
    - targets: ["localhost:9090"]

  # 监控Kubernetes的节点
  - job_name: "kube-nodes"
    metrics_path: "/metrics"
    kubernetes_sd_configs:
    - role: node
    relabel_configs:
    - source_labels: ["__address__"]
      regex: "(.*):10250"
      replacement: "$1:9100"
      target_label: __address__
      action: replace
    - regex: __meta_kubernetes_node_label_(.*)
      replacement: $1
      action: labelmap

  # 监控APIServer
  - job_name: "kube-apiserver"
    metrics_path: "/metrics"
    kubernetes_sd_configs:
    - role: endpoints
    scheme: https
    tls_config:
      insecure_skip_verify: true   # 跳过证书验证
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    # 标签重写
    relabel_configs:
    - source_labels: ["__meta_kubernetes_service_label_component"]  #保留label为apiserver实例
      regex: "apiserver"
      action: "keep"
    - source_labels: ["__meta_kubernetes_namespace"]        #匹配__meta_kubernetes_namespace值,并赋值给namespace
      regex: "(.*)"
      replacement: $1
      target_label: namespace
    - source_labels: ["__meta_kubernetes_service_name"]    #__meta_kubernetes_service_name值并赋值给service_name
      regex: "(.*)"
      replacement: $1
      target_label: service_name
    - regex: __meta_kubernetes_service_label_(.*)        #通过标签映射获取标签
      replacement: $1
      action: labelmap


  # 监控controllerManager
  - job_name: "kube-controller"
    metrics_path: "/metrics"
    scheme: https
    kubernetes_sd_configs:
    - role: pod
    tls_config:
      insecure_skip_verify: true
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    # 仅保留标签名是component 值为kube-controller-manager
    relabel_configs:
    - source_labels: ["__meta_kubernetes_pod_label_component"]
      regex: "kube-controller-manager"
      action: keep
    # 替换抓取的实例端口为10257
    - source_labels: [__address__]
      regex: (.*)
      replacement: $1:10257
      target_label: __address__
    # 保留特定标签
    - source_labels: [__meta_kubernetes_namespace]
      regex: "(.*)"
      replacement: $1
      target_label: namespace
    - source_labels: [__meta_kubernetes_pod_name]
      regex: "(.*)"
      replacement: $1
      target_label: pod_name

  # 监控Scheduler
  - job_name: "kube-schduler"
    metrics_path: "/metrics"
    scheme: https
    kubernetes_sd_configs:
    - role: pod
    tls_config:
      insecure_skip_verify: true
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

    # 基于标签进行过滤
    relabel_configs:
    - source_labels: ["__meta_kubernetes_pod_label_component"]
      regex: "kube-scheduler"
      action: keep

    # 修订抓取的端口
    - source_labels: ["__address__"]
      regex: (.*)
      replacement: $1:10259
      target_label: __address__

    # 添加维度标签
    - source_labels: ["__meta_kubernetes_namespace"]
      regex: "(.*)"
      replacement: $1
      target_label: namespace
    - source_labels: ["__meta_kubernetes_pod_name"]
      regex: "(.*)"
      replacement: $1
      target_label: pod_name

3、Reload Prometheus, then check the targets; the Scheduler is scraped correctly

[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload

4、Check the Prometheus scrape results

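A quick PromQL check that every control-plane job configured so far is healthy (run it in the Prometheus UI; job names as defined above):

# one series per instance, value 1 when the scrape succeeds
up{job=~"kube-apiserver|kube-controller|kube-scheduler"}
# or, aggregated per job:
count by (job) (up == 1)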

# 7.5.3 Scheduler Alerting Rules

1、Write the alerting-rules file

  kube_scheduler_rules.yml: |-
    groups:
    - name: scheduler告警规则文件
      rules:
      - alert: 调度器每秒调度Pod次数过高
        expr: rate(scheduler_pod_scheduling_attempts_sum[1m]) > 20
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "调度器每秒调度Pod次数过高 (当前值: {{ $value }}次)"
          description: "调度器实例 {{ $labels.instance }} 在过去的一分钟内每秒调度的Pod次数超过了20次,当前值为 {{ $value }}次。"
    
    
      - alert: Pending状态的Pod数量过多
        expr: avg_over_time(scheduler_pending_pods{queue!="active"}[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Pending状态的Pod数量过多 (当前值: {{ $value }}个)"
          description: "调度器实例 {{ $labels.instance }} 在过去五分钟内处于Pending状态的Pod数量平均超过了10个,当前值为 {{ $value }}个。"
    
    
      - alert: 'Pod平均调度尝试次数过多'
        expr: avg(rate(scheduler_pod_scheduling_attempts_sum[5m])) by (instance, job, pod_name) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Pod平均调度尝试次数过多 (当前值: {{ $value }}次)"
          description: "调度器实例 `{{ $labels.instance }}` 的Pod在过去五分钟内平均尝试调度次数超过5次,当前值为 {{ $value }}次。"
    
      - alert: '调度器扩展点平均延迟过高'
        expr: | 
          rate(scheduler_framework_extension_point_duration_seconds_sum[5m])
          /
          rate(scheduler_framework_extension_point_duration_seconds_count[5m]) > 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "调度器扩展点平均延迟过高 (当前值: {{ $value }}秒)"
          description: "调度器实例 `{{ $labels.instance }}` 的扩展点 `{{ $labels.extension_point }}` 在过去五分钟内平均延迟超过了1秒,当前值为 {{ $value }}秒。
# 7.6 Monitoring Core Components - Etcd
# 7.6.1 Obtaining the Etcd Metrics

etcd serves metrics on port 2381 at /metrics. By default its Metrics interface listens only on the local address (127.0.0.1), so the listen parameter must be changed from --listen-metrics-urls=http://127.0.0.1:2381 to --listen-metrics-urls=http://0.0.0.0:2381.

[root@k8s-master01 04-prometheus]# netstat -lntp|grep 2381
tcp        0      0 127.0.0.1:2381          0.0.0.0:*               LISTEN      1808/etcd 

1、Edit the etcd Pod manifest. All three masters must be changed; in production do not change them all at the same time, to avoid making the service unavailable.

[root@k8s-master01 04-prometheus]# cat /etc/kubernetes/manifests/etcd.yaml 
apiVersion: v1
kind: Pod
metadata:
  annotations:
    kubeadm.kubernetes.io/etcd.advertise-client-urls: https://192.168.40.101:2379
  creationTimestamp: null
  labels:
    component: etcd
    tier: control-plane
  name: etcd
  namespace: kube-system
spec:
  containers:
  - command:
    - etcd
    - --advertise-client-urls=https://192.168.40.101:2379
    - --cert-file=/etc/kubernetes/pki/etcd/server.crt
    - --client-cert-auth=true
    - --data-dir=/var/lib/etcd
    - --experimental-initial-corrupt-check=true
    - --experimental-watch-progress-notify-interval=5s
    - --initial-advertise-peer-urls=https://192.168.40.101:2380
    - --initial-cluster=k8s-master01=https://192.168.40.101:2380
    - --key-file=/etc/kubernetes/pki/etcd/server.key
    - --listen-client-urls=https://127.0.0.1:2379,https://192.168.40.101:2379
    - --listen-metrics-urls=http://0.0.0.0:2381
    - --listen-peer-urls=https://192.168.40.101:2380
...

2、Query the etcd Metrics locally

[root@k8s-master01 04-prometheus]#  curl -s http://192.168.40.101:2381/metrics |grep etcd|head -n 10
# HELP etcd_cluster_version Which version is running. 1 for 'cluster_version' label with current cluster version
# TYPE etcd_cluster_version gauge
etcd_cluster_version{cluster_version="3.5"} 1
# HELP etcd_debugging_auth_revision The current revision of auth store.
# TYPE etcd_debugging_auth_revision gauge
etcd_debugging_auth_revision 1
# HELP etcd_debugging_disk_backend_commit_rebalance_duration_seconds The latency distributions of commit.rebalance called by bboltdb backend.
# TYPE etcd_debugging_disk_backend_commit_rebalance_duration_seconds histogram
etcd_debugging_disk_backend_commit_rebalance_duration_seconds_bucket{le="0.001"} 933
etcd_debugging_disk_backend_commit_rebalance_duration_seconds_bucket{le="0.002"} 933

3、etcd has no Service, so we match on the labels of the corresponding Pods directly, letting Prometheus scrape only the Pod instances serving etcd.

[root@k8s-master01 04-prometheus]# kubectl describe pods etcd-k8s-master01 -n kube-system
Name:                 etcd-k8s-master01
Namespace:            kube-system
Priority:             2000001000
Priority Class Name:  system-node-critical
Node:                 k8s-master01/192.168.40.101
Start Time:           Sun, 17 Mar 2024 13:48:58 +0800
Labels:               component=etcd
                      tier=control-plane
...
# 7.6.2 Configure Prometheus to Monitor Etcd
  • 1、Add a new job named kube-etcd; the metrics path is /metrics and the scheme is http
  • 2、Use Kubernetes Pod-based automatic discovery;
  • 3、Use relabel_configs to keep only targets whose __meta_kubernetes_pod_label_component label has the value etcd
  • 4、Use relabel_configs to rewrite the scraped Pod port to 2381; by default, automatically discovered Pod targets use port 80 for http and 443 for https;
  • 5、Use relabel_configs to keep the __meta_kubernetes_namespace and __meta_kubernetes_pod_name label dimensions.

1、Modify the Prometheus configuration

[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-configs
  namespace: monitoring
data:
  prometheus.yml: |-
    global:
      scrape_interval: 15s
      scrape_timeout:  15s
    
    # 告警地址(填写AlertManager的负载均衡地址即可)
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["alertmanager-svc:9093"]
    
    # 告警规则文件
    rule_files:
      - "/etc/prometheus/rules/*.yml"

    scrape_configs:
      - job_name: "prometheus"
        metrics_path: "/metrics"
        static_configs:
        - targets: ["localhost:9090"]

      # 监控Kubernetes的节点
      - job_name: "kube-nodes"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: node
        relabel_configs:
        - source_labels: ["__address__"]
          regex: "(.*):10250"
          replacement: "$1:9100"
          target_label: __address__
          action: replace
        - regex: __meta_kubernetes_node_label_(.*)
          replacement: $1
          action: labelmap

      # 监控APIServer
      - job_name: "kube-apiserver"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: endpoints
        scheme: https
        tls_config:
          insecure_skip_verify: true   # 跳过证书验证
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        # 标签重写
        relabel_configs:
        - source_labels: ["__meta_kubernetes_service_label_component"]  #保留label为apiserver实例
          regex: "apiserver"
          action: "keep"
        - source_labels: ["__meta_kubernetes_namespace"]        #匹配__meta_kubernetes_namespace值,并赋值给namespace
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_service_name"]    #__meta_kubernetes_service_name值并赋值给service_name
          regex: "(.*)"
          replacement: $1
          target_label: service_name
        - regex: __meta_kubernetes_service_label_(.*)        #通过标签映射获取标签
          replacement: $1
          action: labelmap


      # 监控controllerManager
      - job_name: "kube-controller"
        metrics_path: "/metrics"
        scheme: https
        kubernetes_sd_configs:
        - role: pod
        tls_config:
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        # 仅保留标签名是component 值为kube-controller-manager
        relabel_configs:
        - source_labels: ["__meta_kubernetes_pod_label_component"]
          regex: "kube-controller-manager"
          action: keep
        # 替换抓取的实例端口为10257
        - source_labels: [__address__]
          regex: (.*)
          replacement: $1:10257
          target_label: __address__
        # 保留特定标签
        - source_labels: [__meta_kubernetes_namespace]
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: [__meta_kubernetes_pod_name]
          regex: "(.*)"
          replacement: $1
          target_label: pod_name

      # 监控Scheduler
      - job_name: "kube-schduler"
        metrics_path: "/metrics"
        scheme: https
        kubernetes_sd_configs:
        - role: pod
        tls_config:
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

        # 基于标签进行过滤
        relabel_configs:
        - source_labels: ["__meta_kubernetes_pod_label_component"]
          regex: "kube-scheduler"
          action: keep

        # 修订抓取的端口
        - source_labels: ["__address__"]
          regex: (.*)
          replacement: $1:10259
          target_label: __address__

        # 添加维度标签
        - source_labels: ["__meta_kubernetes_namespace"]
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_pod_name"]
          regex: "(.*)"
          replacement: $1
          target_label: pod_name

      # 监控etcd
      - job_name: "kube-etcd"
        metrics_path: "/metrics"
        scheme: http
        kubernetes_sd_configs:
        - role: pod

        # 保留对应标签的Pod
        relabel_configs:
        - source_labels: ["__meta_kubernetes_pod_label_component"]
          regex: "etcd"
          action: keep

        # 修订端口
        - source_labels: ["__address__"]
          regex: (.*)
          replacement: $1:2381
          target_label: __address__

        # 添加维度标签
        - source_labels: ["__meta_kubernetes_namespace"]
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_pod_name"]
          regex: "(.*)"
          replacement: $1
          target_label: pod_name

[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml

2、Verify that the configuration was updated

[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
  scrape_interval: 15s
  scrape_timeout:  15s

# 告警地址(填写AlertManager的负载均衡地址即可)
alerting:
  alertmanagers:
  - static_configs:
    - targets: ["alertmanager-svc:9093"]

# 告警规则文件
rule_files:
  - "/etc/prometheus/rules/*.yml"

scrape_configs:
  - job_name: "prometheus"
    metrics_path: "/metrics"
    static_configs:
    - targets: ["localhost:9090"]

  # 监控Kubernetes的节点
  - job_name: "kube-nodes"
    metrics_path: "/metrics"
    kubernetes_sd_configs:
    - role: node
    relabel_configs:
    - source_labels: ["__address__"]
      regex: "(.*):10250"
      replacement: "$1:9100"
      target_label: __address__
      action: replace
    - regex: __meta_kubernetes_node_label_(.*)
      replacement: $1
      action: labelmap

  # 监控APIServer
  - job_name: "kube-apiserver"
    metrics_path: "/metrics"
    kubernetes_sd_configs:
    - role: endpoints
    scheme: https
    tls_config:
      insecure_skip_verify: true   # 跳过证书验证
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    # 标签重写
    relabel_configs:
    - source_labels: ["__meta_kubernetes_service_label_component"]  #保留label为apiserver实例
      regex: "apiserver"
      action: "keep"
    - source_labels: ["__meta_kubernetes_namespace"]        #匹配__meta_kubernetes_namespace值,并赋值给namespace
      regex: "(.*)"
      replacement: $1
      target_label: namespace
    - source_labels: ["__meta_kubernetes_service_name"]    #__meta_kubernetes_service_name值并赋值给service_name
      regex: "(.*)"
      replacement: $1
      target_label: service_name
    - regex: __meta_kubernetes_service_label_(.*)        #通过标签映射获取标签
      replacement: $1
      action: labelmap


  # 监控controllerManager
  - job_name: "kube-controller"
    metrics_path: "/metrics"
    scheme: https
    kubernetes_sd_configs:
    - role: pod
    tls_config:
      insecure_skip_verify: true
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    # 仅保留标签名是component 值为kube-controller-manager
    relabel_configs:
    - source_labels: ["__meta_kubernetes_pod_label_component"]
      regex: "kube-controller-manager"
      action: keep
    # 替换抓取的实例端口为10257
    - source_labels: [__address__]
      regex: (.*)
      replacement: $1:10257
      target_label: __address__
    # 保留特定标签
    - source_labels: [__meta_kubernetes_namespace]
      regex: "(.*)"
      replacement: $1
      target_label: namespace
    - source_labels: [__meta_kubernetes_pod_name]
      regex: "(.*)"
      replacement: $1
      target_label: pod_name

  # 监控Scheduler
  - job_name: "kube-schduler"
    metrics_path: "/metrics"
    scheme: https
    kubernetes_sd_configs:
    - role: pod
    tls_config:
      insecure_skip_verify: true
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

    # 基于标签进行过滤
    relabel_configs:
    - source_labels: ["__meta_kubernetes_pod_label_component"]
      regex: "kube-scheduler"
      action: keep

    # 修订抓取的端口
    - source_labels: ["__address__"]
      regex: (.*)
      replacement: $1:10259
      target_label: __address__

    # 添加维度标签
    - source_labels: ["__meta_kubernetes_namespace"]
      regex: "(.*)"
      replacement: $1
      target_label: namespace
    - source_labels: ["__meta_kubernetes_pod_name"]
      regex: "(.*)"
      replacement: $1
      target_label: pod_name

  # 监控etcd
  - job_name: "kube-etcd"
    metrics_path: "/metrics"
    scheme: http
    kubernetes_sd_configs:
    - role: pod

    # 保留对应标签的Pod
    relabel_configs:
    - source_labels: ["__meta_kubernetes_pod_label_component"]
      regex: "etcd"
      action: keep

    # 修订端口
    - source_labels: ["__address__"]
      regex: (.*)
      replacement: $1:2381
      target_label: __address__

    # 添加维度标签
    - source_labels: ["__meta_kubernetes_namespace"]
      regex: "(.*)"
      replacement: $1
      target_label: namespace
    - source_labels: ["__meta_kubernetes_pod_name"]
      regex: "(.*)"
      replacement: $1
      target_label: pod_name

3. Reload Prometheus, then check the Targets page to confirm the etcd instances are being scraped normally

[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload

4. Check the scrape results on the Prometheus Targets page

# 7.6.3 Etcd Alert Rules

1. Write the alert rules file

  kube_etcd_rules.yml: |-
    groups:
    - name: etcd-alert-rules
      rules:
      - alert: EtcdMemberDown
        expr: count(etcd_server_id) by (job) % 2 == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Etcd member abnormally offline"
          description: "The etcd cluster has an even number of members; a member may have gone offline, leaving the cluster unable to serve requests reliably."

      - alert: EtcdNoLeader
        expr: etcd_server_has_leader == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Etcd communication failure (instance: {{ $labels.instance }})"
          description: "The etcd node on instance {{ $labels.instance }} cannot communicate with the other members of the cluster."

      - alert: EtcdFrequentLeaderChanges
        expr: rate(etcd_server_leader_changes_seen_total[5m]) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Frequent etcd leader changes (instance: {{ $labels.instance }})"
          description: "Instance {{ $labels.instance }} saw more than 5 etcd leader changes in the last 5 minutes, which may affect cluster stability."

      - alert: EtcdSlowBackendCommits
        expr: |
          sum by (instance, job, pod_name) (rate(etcd_disk_backend_commit_duration_seconds_sum[5m]))
          /
          sum by (instance, job, pod_name) (rate(etcd_disk_backend_commit_duration_seconds_count[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow etcd backend commits (instance: {{ $labels.instance }})"
          description: "On instance {{ $labels.instance }}, etcd backend commits to disk averaged more than 2 seconds over the last 5 minutes."

      - alert: EtcdSlowWalFsync
        expr: |
          sum by (instance, job, pod_name) (rate(etcd_disk_wal_fsync_duration_seconds_sum[5m]))
          /
          sum by (instance, job, pod_name) (rate(etcd_disk_wal_fsync_duration_seconds_count[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow etcd WAL fsync (instance: {{ $labels.instance }})"
          description: "On instance {{ $labels.instance }}, WAL fsync calls averaged more than 2 seconds over the last 5 minutes."
# 7.6.4 Import an Etcd Dashboard

Import dashboard ID: 9733

# 7.7 Monitoring K8S Core Components - CoreDNS
# 7.7.1 Getting CoreDNS Metrics

1. CoreDNS exposes metrics on port 9153 at the /metrics path.

[root@k8s-master01 04-prometheus]# kubectl get svc -n kube-system
NAME             TYPE        CLUSTER-IP     EXTERNAL-IP   PORT(S)                  AGE
calico-typha     ClusterIP   10.96.237.85   <none>        5473/TCP                 198d
kube-dns         ClusterIP   10.96.0.10     <none>        53/UDP,53/TCP,9153/TCP   198d
metrics-server   ClusterIP   10.96.23.183   <none>        443/TCP                  198d

[root@k8s-master01 04-prometheus]# curl -s http://10.96.0.10:9153/metrics|head -n 10
# HELP coredns_build_info A metric with a constant '1' value labeled by version, revision, and goversion from which CoreDNS was built.
# TYPE coredns_build_info gauge
coredns_build_info{goversion="go1.20",revision="055b2c3",version="1.10.1"} 1
# HELP coredns_cache_entries The number of elements in the cache.
# TYPE coredns_cache_entries gauge
coredns_cache_entries{server="dns://:53",type="denial",view="",zones="."} 39
coredns_cache_entries{server="dns://:53",type="success",view="",zones="."} 12
# HELP coredns_cache_hits_total The count of cache hits.
# TYPE coredns_cache_hits_total counter
coredns_cache_hits_total{server="dns://:53",type="denial",view="",zones="."} 878
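
Useful signals can already be derived from these series. For example, a hedged sketch of the overall cache hit ratio, using coredns_cache_hits_total (shown above) and the standard coredns_dns_requests_total counter:

# Fraction of DNS queries served from cache over the last 5 minutes
sum(rate(coredns_cache_hits_total[5m]))
  /
sum(rate(coredns_dns_requests_total[5m]))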

2. CoreDNS sits behind a Service, so we look up the labels on the kube-dns Service so that Prometheus scrapes only the Pods that actually serve DNS.

[root@k8s-master01 04-prometheus]# kubectl describe svc kube-dns -n kube-system
Name:              kube-dns
Namespace:         kube-system
Labels:            k8s-app=kube-dns
                   kubernetes.io/cluster-service=true
                   kubernetes.io/name=CoreDNS
...
# 7.7.2 Configuring Prometheus to Monitor DNS
  • 1. Add a new job named kube-dns; the metrics path is /metrics and the scheme is http.
  • 2. Use Kubernetes endpoints-based service discovery;
  • 3. Use relabel_configs to keep only targets whose __meta_kubernetes_service_label_k8s_app label equals kube-dns (Prometheus rewrites the dash in the k8s-app label name to an underscore).
  • 4. Use relabel_configs to rewrite the scrape address, turning __meta_kubernetes_pod_ip into IP:9153; otherwise each discovered port (53 and 9153) would be treated as a separate target.
  • 5. Use relabel_configs to keep the __meta_kubernetes_namespace, __meta_kubernetes_pod_name and __meta_kubernetes_service_name labels.

1. Update the Prometheus configuration

[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-configs
  namespace: monitoring
data:
  prometheus.yml: |-
    global:
      scrape_interval: 15s
      scrape_timeout:  15s

    # Alerting endpoint (the AlertManager load-balanced Service address)
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["alertmanager-svc:9093"]

    # Alert rule files
    rule_files:
      - "/etc/prometheus/rules/*.yml"

    scrape_configs:
      # ... jobs for prometheus, nodes, apiserver, controller-manager, scheduler and etcd unchanged ...

      # Monitor CoreDNS
      - job_name: "kube-dns"
        metrics_path: "/metrics"
        scheme: http
        kubernetes_sd_configs:
        - role: endpoints

        # keep only endpoints whose Service carries the matching label
        relabel_configs:
        - source_labels: ["__meta_kubernetes_service_label_k8s_app"]
          regex: "kube-dns"
          action: keep

        # rewrite the port
        - source_labels: ["__meta_kubernetes_pod_ip"]
          regex: (.*)
          replacement: $1:9153
          target_label: __address__

        # add dimension labels
        - source_labels: ["__meta_kubernetes_namespace"]
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_pod_name"]
          regex: "(.*)"
          replacement: $1
          target_label: pod_name
        - source_labels: ["__meta_kubernetes_service_name"]
          regex: "(.*)"
          replacement: $1
          target_label: service_name

[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml

2. Verify that the configuration was updated

[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
  scrape_interval: 15s
  scrape_timeout:  15s
...
  # Monitor CoreDNS
  - job_name: "kube-dns"
    metrics_path: "/metrics"
    scheme: http
    kubernetes_sd_configs:
    - role: endpoints
...

3. Reload Prometheus, then check the Targets page to confirm CoreDNS is being scraped normally

[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload

4. Check the scrape results on the Prometheus Targets page

# 7.7.3 CoreDNS Alert Rules

1. Write the alert rules file

  kube_coredns_rules.yml: |-
    groups:
    - name: coredns-alert-rules
      rules:
      - alert: CoreDNSHighServfailRate
        expr: |
          sum(rate(coredns_dns_responses_total{rcode="SERVFAIL"}[5m])) by (instance, job, server, pod_name, zone) > 10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High CoreDNS SERVFAIL response rate on instance {{ $labels.instance }}"
          description: "Over the last 5 minutes, instance {{ $labels.instance }} (CoreDNS Pod {{ $labels.pod_name }}, server {{ $labels.server }}, zone {{ $labels.zone }}) returned more than 10 SERVFAIL responses per second; current value: {{ $value }}/s. Check the health of the CoreDNS service."

      - alert: CoreDNSHighResolutionLatency
        expr: |
          sum(rate(coredns_dns_request_duration_seconds_sum[5m])) by (instance, job, server, pod_name, zone)
          /
          sum(rate(coredns_dns_request_duration_seconds_count[5m])) by (instance, job, server, pod_name, zone) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CoreDNS resolution latency above 1 second on instance {{ $labels.instance }}"
          description: "Over the last 5 minutes, instance {{ $labels.instance }} (CoreDNS Pod {{ $labels.pod_name }}, server {{ $labels.server }}, zone {{ $labels.zone }}) had an average DNS resolution latency above 1 second; current average: {{ $value }}s."
# 7.7.4 Import a CoreDNS Dashboard

Import dashboard ID: 15762

# 7.8 Monitoring K8S Core Components - kube-proxy
# 7.8.1 Getting kube-proxy Metrics

1. kube-proxy serves metrics on port 10249 at the /metrics path. By default the metrics endpoint listens only on localhost (127.0.0.1), so we change metricsBindAddress: "" to metricsBindAddress: "0.0.0.0" in its configuration and then restart kube-proxy.

[root@k8s-master01 04-prometheus]#  kubectl edit configmap -n kube-system kube-proxy
...
    kind: KubeProxyConfiguration
    metricsBindAddress: "0.0.0.0"
    mode: "ipvs"
    nodePortAddresses: null
    oomScoreAdj: null
    portRange: ""
    showHiddenMetricsForVersion: ""
    winkernel:
...

# Restart the kube-proxy Pods
[root@k8s-master01 04-prometheus]#  kubectl rollout restart daemonset -n kube-system kube-proxy
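
Optionally, you can wait for the rollout to finish before checking ports (a small, hedged addition):

# Block until every kube-proxy Pod has been recreated
[root@k8s-master01 04-prometheus]# kubectl rollout status daemonset -n kube-system kube-proxy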

2. Check kube-proxy's listening address

[root@k8s-master01 04-prometheus]#  netstat -lntp |grep kube-proxy
tcp6       0      0 :::10256                :::*                    LISTEN      60042/kube-proxy    
tcp6       0      0 :::10249                :::*                    LISTEN      60042/kube-proxy 
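
A quick scrape test confirms the endpoint is reachable; a sketch, assuming 192.168.40.101 is one of the node IPs used elsewhere in this document:

[root@k8s-master01 04-prometheus]# curl -s http://192.168.40.101:10249/metrics | head -n 3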

3. kube-proxy has no Service, so we read the labels directly from its Pods (via the DaemonSet) so that Prometheus scrapes only the Pods providing kube-proxy.

[root@k8s-master01 04-prometheus]#  kubectl describe daemonsets.apps -n kube-system kube-proxy
Name:           kube-proxy
Selector:       k8s-app=kube-proxy
Node-Selector:  kubernetes.io/os=linux
Labels:         k8s-app=kube-proxy
Annotations:    deprecated.daemonset.template.generation: 3
...
# 7.8.2 Configuring Prometheus to Monitor kube-proxy
  • 1. Add a new job named kube-proxy; the metrics path is /metrics and the scheme is http.
  • 2. Use Kubernetes pod-based service discovery;
  • 3. Use relabel_configs to keep only Pods whose __meta_kubernetes_pod_label_k8s_app label equals kube-proxy.
  • 4. Use relabel_configs to rewrite the scrape address, turning __meta_kubernetes_pod_ip into IP:10249; by default auto-discovered Pods are assumed to use port 80 for http and 443 for https.
  • 5. Use relabel_configs to keep the __meta_kubernetes_namespace and __meta_kubernetes_pod_name labels.

1. Update the Prometheus configuration

[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-configs
  namespace: monitoring
data:
  prometheus.yml: |-
    global:
      scrape_interval: 15s
      scrape_timeout:  15s

    # Alerting endpoint (the AlertManager load-balanced Service address)
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["alertmanager-svc:9093"]

    # Alert rule files
    rule_files:
      - "/etc/prometheus/rules/*.yml"

    scrape_configs:
      # ... jobs for prometheus, nodes, apiserver, controller-manager, scheduler, etcd and CoreDNS unchanged ...

      # Monitor kube-proxy
      - job_name: "kube-proxy"
        metrics_path: "/metrics"
        scheme: http
        kubernetes_sd_configs:
        - role: pod

        # keep only Pods with the matching label
        relabel_configs:
        - source_labels: ["__meta_kubernetes_pod_label_k8s_app"]
          regex: "kube-proxy"
          action: keep

        # rewrite the port
        - source_labels: ["__meta_kubernetes_pod_ip"]
          regex: (.*)
          replacement: $1:10249
          target_label: __address__

        # add dimension labels
        - source_labels: ["__meta_kubernetes_namespace"]
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_pod_name"]
          regex: "(.*)"
          replacement: $1
          target_label: pod_name

[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml

2. Verify that the configuration was updated

[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
  scrape_interval: 15s
  scrape_timeout:  15s
...
  # Monitor kube-proxy
  - job_name: "kube-proxy"
    metrics_path: "/metrics"
    scheme: http
    kubernetes_sd_configs:
    - role: pod
...

3. Reload Prometheus, then check the Targets page to confirm kube-proxy is being scraped normally

[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload

4. Check the scrape results on the Prometheus Targets page

# 7.8.3 kube-proxy Alert Rules

1. Write the alert rules file

  kube_proxy_rules.yml: |-
    groups:
    - name: kube-proxy-alert-rules
      rules:
      - alert: KubeProxySlowRuleSync
        expr: |
          rate(kubeproxy_sync_proxy_rules_duration_seconds_sum[5m]) /
          rate(kubeproxy_sync_proxy_rules_duration_seconds_count[5m]) > 3
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "kube-proxy rule sync is slow (instance: {{ $labels.instance }})"
          description: "On instance {{ $labels.instance }}, kube-proxy sync operations averaged more than 3 seconds. Pod {{ $labels.pod_name }} current sync latency: {{ $value }}s"

      - alert: KubeProxyIptablesRestoreFailures
        expr: rate(kubeproxy_sync_proxy_rules_iptables_restore_failures_total[5m]) > 10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "iptables sync failures (instance: {{ $labels.instance }})"
          description: "On instance {{ $labels.instance }}, iptables rule sync failed more than 10 times. Pod {{ $labels.pod_name }} current failure count: {{ $value }}"
# 7.9 Monitoring Kubernetes Cluster Resource State
# 7.9.1 What Is Cluster Resource State

Cluster resource state refers to all of the resource objects in a Kubernetes cluster, together with their current status. These objects include Pods, Deployments, DaemonSets, StatefulSets, Jobs, CronJobs, and so on, and their state gives detailed information such as:

  • 1. How many of each resource currently exist in the cluster;
  • 2. How many Pods there are in total, and what state each is in (e.g. Running, Stopped, Terminated);
  • 3. How many Deployments are running, and whether the number of Pod replicas actually running matches the desired count;
  • 4. Whether the Pods controlled by each DaemonSet are running on all (or the specified) nodes;
  • 5. Whether Jobs and CronJobs run on schedule, and whether they succeed;
  • ...

The Kubernetes components themselves do not expose metrics about resource state, so we use kube-state-metrics: it actively collects state information about the various resources in the cluster (Pods, Deployments, Jobs, their counts, health, and so on) and converts it into a Prometheus-compatible metrics format, so Prometheus can scrape, analyze, and display it. A couple of example queries over these metrics are sketched below.
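
For instance, using two metric families that kube-state-metrics exports (kube_pod_status_phase and kube_deployment_status_replicas_available, both of which also appear in the alert rules later in this section):

# Number of Pods in each phase, cluster-wide
sum(kube_pod_status_phase) by (phase)

# Deployments whose available replicas lag behind the desired count
kube_deployment_spec_replicas - kube_deployment_status_replicas_available > 0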

# 7.9.2 Installing Kube-State-Metrics

Version compatibility between kube-state-metrics and Kubernetes (make sure the versions match):

| kube-state-metrics | Kubernetes client-go Version |
| ------------------ | ---------------------------- |
| v2.6.0             | v1.24                        |
| v2.7.0             | v1.25                        |
| v2.8.2             | v1.26                        |
| v2.9.2             | v1.26                        |
| v2.10.0            | v1.27                        |
| main               | v1.28                        |
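
To pick the right row, check the cluster's server version first (a quick sketch; kubectl version output formats vary slightly across releases):

[root@k8s-master01 ~]# kubectl version | grep -i server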

1. To install kube-state-metrics, first clone the source code of the latest branch

[root@k8s-master01 ~]# yum install git -y
[root@k8s-master01 ~]#  git clone https://github.com/kubernetes/kube-state-metrics.git
# mirror URL for faster cloning
[root@k8s-master01 ~]# git clone https://mirror.ghproxy.com/https://github.com/kubernetes/kube-state-metrics.git

2. Edit kube-state-metrics/examples/standard/deployment.yaml to point at a domestic mirror of the image

[root@k8s-master01 ~]# cat kube-state-metrics/examples/standard/deployment.yaml 
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app.kubernetes.io/component: exporter
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/version: 2.11.0
  name: kube-state-metrics
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: kube-state-metrics
  template:
    metadata:
      labels:
        app.kubernetes.io/component: exporter
        app.kubernetes.io/name: kube-state-metrics
        app.kubernetes.io/version: 2.11.0
    spec:
      automountServiceAccountToken: true
      containers:
#      - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.11.0
      - image: uhub.service.ucloud.cn/oldxu/kube-state-metrics:v2.10.0
...

3. Apply the manifests

[root@k8s-master01 ~]#  kubectl apply -f kube-state-metrics/examples/standard/
clusterrolebinding.rbac.authorization.k8s.io/kube-state-metrics created
clusterrole.rbac.authorization.k8s.io/kube-state-metrics created
deployment.apps/kube-state-metrics created
serviceaccount/kube-state-metrics created
service/kube-state-metrics created

4. The kube-state-metrics Pod runs in the kube-system namespace

[root@k8s-master01 standard]# kubectl get pods,svc -n kube-system -l app.kubernetes.io/name=kube-state-metrics
NAME                                      READY   STATUS    RESTARTS   AGE
pod/kube-state-metrics-5864c7d699-mwbp4   1/1     Running   0          2m15s

NAME                         TYPE        CLUSTER-IP   EXTERNAL-IP   PORT(S)             AGE
service/kube-state-metrics   ClusterIP   None         <none>        8080/TCP,8081/TCP   2m15s

5. kube-state-metrics ships with a Service (headless, note the CLUSTER-IP of None above, which is why the curl below goes straight to the Pod IP), so we grab that Service's labels so that Prometheus scrapes only the instances serving kube-state-metrics.

[root@k8s-master01 standard]# curl -s http://172.16.85.199:8080/metrics | head -n 10
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 2.1866e-05
go_gc_duration_seconds{quantile="0.25"} 8.7894e-05
go_gc_duration_seconds{quantile="0.5"} 0.000165906
go_gc_duration_seconds{quantile="0.75"} 0.000179164
go_gc_duration_seconds{quantile="1"} 0.000420888
go_gc_duration_seconds_sum 0.001599982
go_gc_duration_seconds_count 10
# HELP go_goroutines Number of goroutines that currently exist.

[root@k8s-master01 standard]#  kubectl describe service -n kube-system kube-state-metrics
Name:              kube-state-metrics
Namespace:         kube-system
Labels:            app.kubernetes.io/component=exporter
                   app.kubernetes.io/name=kube-state-metrics
                   app.kubernetes.io/version=2.11.0
...
# 7.9.3 Configuring Prometheus to Monitor KSM
  • 1. Add a new job named kube-state-metrics; the metrics path is /metrics and the scheme is http.
  • 2. Use Kubernetes endpoints-based discovery to find all endpoints;
  • 3. Use relabel_configs to keep only instances whose __meta_kubernetes_service_label_app_kubernetes_io_name label equals kube-state-metrics.
  • 4. Use relabel_configs to rewrite the scrape address, turning __meta_kubernetes_pod_ip into IP:8080.
  • 5. Use relabel_configs to labelmap __meta_kubernetes_service_label_(.*), carrying over all Service labels and their values.

1. Configure Prometheus

[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-configs
  namespace: monitoring
data:
  prometheus.yml: |-
    global:
      scrape_interval: 15s
      scrape_timeout:  15s

    # Alerting endpoint (the AlertManager load-balanced Service address)
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["alertmanager-svc:9093"]

    # Alert rule files
    rule_files:
      - "/etc/prometheus/rules/*.yml"

    scrape_configs:
      # ... jobs for prometheus, nodes, apiserver, controller-manager, scheduler, etcd, CoreDNS and kube-proxy unchanged ...

      # Monitor kube-state-metrics
      - job_name: "kube-state-metrics"
        metrics_path: "/metrics"
        scheme: http
        kubernetes_sd_configs:
        - role: endpoints

        # keep only endpoints whose Service carries the matching label
        relabel_configs:
        - source_labels: ["__meta_kubernetes_service_label_app_kubernetes_io_name"]
          regex: "kube-state-metrics"
          action: keep

        # rewrite the port
        - source_labels: ["__meta_kubernetes_pod_ip"]
          regex: (.*)
          replacement: $1:8080
          target_label: __address__

        # add dimension labels
        - regex: __meta_kubernetes_service_label_(.*)
          action: labelmap

[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml

2. Verify that the configuration was updated

[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
  scrape_interval: 15s
  scrape_timeout:  15s
...
  # Monitor kube-state-metrics
  - job_name: "kube-state-metrics"
    metrics_path: "/metrics"
    scheme: http
    kubernetes_sd_configs:
    - role: endpoints
...

3. Reload Prometheus, then check the Targets page to confirm kube-state-metrics is being scraped normally

[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload

4. Check the scrape results on the Prometheus Targets page

# 7.9.4 Resource-State Alert Rules

1. Write the alert rules file

  kube_state_metrics_rules.yml: |-
    groups:
    - name: ksm-alert-rules
      rules:
      - alert: NodeKubeletNotReady
        expr: kube_node_status_condition{condition="Ready", status="true"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Kubelet on node {{ $labels.node }} is not ready"
          description: "The kubelet on node {{ $labels.node }} has been out of the Ready state for more than 5 minutes; investigate immediately."

      - alert: NodeMemoryPressure
        expr: kube_node_status_condition{condition="MemoryPressure", status="true"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Node {{ $labels.node }} is under memory pressure"
          description: "Node {{ $labels.node }} is experiencing memory pressure; it may need more memory or a lighter workload."

      - alert: NodeNetworkUnavailable
        expr: kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Node {{ $labels.node }} network is unavailable"
          description: "The network on node {{ $labels.node }} is reported unavailable; there may be a network fault or bottleneck."

      - alert: NodeDiskPressure
        expr: kube_node_status_condition{condition="DiskPressure", status="true"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Node {{ $labels.node }} is under disk pressure"
          description: "Node {{ $labels.node }} is experiencing disk pressure; disk space or inodes may be running low."

      - alert: NodePIDPressure
        expr: kube_node_status_condition{condition="PIDPressure", status="true"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Node {{ $labels.node }} is under PID pressure"
          description: "The number of processes on node {{ $labels.node }} may have reached its limit."

      - alert: PodStartupFailed
        expr: sum (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) by (job,namespace, pod) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Pod failed to start"
          description: "Pod '{{ $labels.pod }}' in namespace '{{ $labels.namespace }}' failed to start."

      - alert: PodRestartedByOOM
        expr: |
          (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1)
          and ignoring (reason)
          min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) >= 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Pod {{ $labels.pod }} restarted due to OOM"
          description: "Pod '{{ $labels.pod }}' in namespace '{{ $labels.namespace }}' was restarted by an OOM kill; the container that triggered it was '{{ $labels.container }}'."

      - alert: DeploymentReplicaMismatch
        expr: kube_deployment_spec_replicas - kube_deployment_status_replicas_available > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Deployment replica count mismatch"
          description: "Deployment {{ $labels.deployment }} in namespace '{{ $labels.namespace }}' is short of its desired replica count by {{ $value }} replicas."

      - alert: DaemonSetReplicaMismatch
        expr: |
          kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100
          or
          kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "DaemonSet replica count mismatch"
          description: "DaemonSet {{ $labels.daemonset }} in namespace '{{ $labels.namespace }}' has fewer running replicas than desired."

      - alert: DaemonSetMisscheduled
        expr: kube_daemonset_status_number_misscheduled > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "DaemonSet scheduling error"
          description: "DaemonSet {{ $labels.daemonset }} in namespace '{{ $labels.namespace }}' has Pods scheduled onto the wrong nodes."

      - alert: StatefulSetReplicaMismatch
        expr: |
          kube_statefulset_status_replicas_ready  / kube_statefulset_replicas  * 100 < 100
          or
          kube_statefulset_replicas - kube_statefulset_status_replicas_current > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "StatefulSet replica count mismatch"
          description: "StatefulSet {{ $labels.statefulset }} in namespace '{{ $labels.namespace }}' has fewer running replicas than desired."

      - alert: PersistentVolumeFailed
        expr: kube_persistentvolume_status_phase{phase="Failed"} > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "PV in Failed state"
          description: "Persistent volume '{{ $labels.persistentvolume }}' is in the Failed state."

      - alert: PersistentVolumeClaimAbnormal
        expr: kube_persistentvolumeclaim_status_phase{phase=~"Lost|Pending"} > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PVC in abnormal state"
          description: "PVC {{ $labels.persistentvolumeclaim }} in namespace '{{ $labels.namespace }}' is in the {{ $labels.phase }} state."

      - alert: JobCompletionLow
        expr: kube_job_status_succeeded / kube_job_spec_completions * 100 < 75
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Job completion below 75%"
          description: "Job {{ $labels.job_name }} in namespace '{{ $labels.namespace }}' is below 75% of its expected completions."

      - alert: JobFailureCountHigh
        expr: kube_job_status_failed > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Job failure count too high"
          description: "Job {{ $labels.job_name }} in namespace '{{ $labels.namespace }}' has failed more than 5 times; current failures: {{ $value }}."
# 7.9.5 Import Resource-State Dashboards

Import dashboard ID: 13332 or 15757

# 7.10 Monitoring Kubernetes Pod Resources
# 7.10.1 What Are Pod Resources

"Pod resources" means the compute resources used by the containers running inside Pods: CPU, memory, network, disk I/O, and related metrics. Previously we monitored container resources with the cAdvisor tool; in Kubernetes, cAdvisor is built into the kubelet component, so we can collect Pod metrics by scraping each node's kubelet directly. A representative query over these cAdvisor metrics is sketched below.
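
A hedged sketch using container_cpu_usage_seconds_total, one of the standard cAdvisor series (label names can vary slightly across kubelet versions):

# CPU cores consumed per Pod over the last 5 minutes (cAdvisor series)
sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (namespace, pod)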

Kubelet metrics can be accessed as follows:

  • Port: 10250, the kubelet's metrics port.
  • Protocol: HTTPS, which secures the transport (so authentication is required to scrape).
  • Path: /metrics/cadvisor, the URL path from which the cAdvisor-provided metrics are fetched.
# 7.10.2 Configuring Prometheus to Monitor Pods
  • 1. Add a new job named kube-kubelet; the metrics path is /metrics/cadvisor and the scheme is https.
  • 2. Use Kubernetes node-based discovery to find all host instances; since the kubelet serves HTTPS, the TLS settings must also be specified;
  • 3. Use relabel_configs to labelmap __meta_kubernetes_node_label_(.*), carrying over all node labels and their values.

We can first verify that authenticated access to the kubelet works:

[root@k8s-master01 04-prometheus]# TOKEN=$(kubectl create token -n monitoring prometheus-sa)
[root@k8s-master01 04-prometheus]# curl -s -k -H "Authorization: Bearer $TOKEN" https://192.168.40.101:10250/metrics|head -n 10
# HELP aggregator_discovery_aggregation_count_total [ALPHA] Counter of number of times discovery was aggregated
# TYPE aggregator_discovery_aggregation_count_total counter
aggregator_discovery_aggregation_count_total 0
# HELP apiserver_audit_event_total [ALPHA] Counter of audit events generated and sent to the audit backend.
# TYPE apiserver_audit_event_total counter
apiserver_audit_event_total 0
# HELP apiserver_audit_requests_rejected_total [ALPHA] Counter of apiserver requests rejected due to an error in audit logging backend.
# TYPE apiserver_audit_requests_rejected_total counter
apiserver_audit_requests_rejected_total 0
# HELP apiserver_client_certificate_expiration_seconds [ALPHA] Distribution of the remaining lifetime on the certificate used to authenticate a request.
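
The same ServiceAccount token also works against the cAdvisor path; the returned series should be container-level metrics. A quick sanity check, reusing the TOKEN variable and node address from above:

[root@k8s-master01 04-prometheus]# curl -s -k -H "Authorization: Bearer $TOKEN" \
    https://192.168.40.101:10250/metrics/cadvisor | head -n 5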

1. Configure Prometheus

[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-configs
  namespace: monitoring
data:
  prometheus.yml: |-
    global:
      scrape_interval: 15s
      scrape_timeout:  15s

    # Alerting target (use the AlertManager load-balancing address)
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["alertmanager-svc:9093"]

    # Alert rule files
    rule_files:
      - "/etc/prometheus/rules/*.yml"

    scrape_configs:
      - job_name: "prometheus"
        metrics_path: "/metrics"
        static_configs:
        - targets: ["localhost:9090"]

      # Monitor the Kubernetes nodes
      - job_name: "kube-nodes"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: node
        relabel_configs:
        - source_labels: ["__address__"]
          regex: "(.*):10250"
          replacement: "$1:9100"
          target_label: __address__
          action: replace
        - regex: __meta_kubernetes_node_label_(.*)
          replacement: $1
          action: labelmap

      # Monitor the APIServer
      - job_name: "kube-apiserver"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: endpoints
        scheme: https
        tls_config:
          insecure_skip_verify: true   # skip certificate verification
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        # Label rewriting
        relabel_configs:
        - source_labels: ["__meta_kubernetes_service_label_component"]  # keep only targets whose component label is apiserver
          regex: "apiserver"
          action: "keep"
        - source_labels: ["__meta_kubernetes_namespace"]        # copy the __meta_kubernetes_namespace value into the namespace label
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_service_name"]    # copy the __meta_kubernetes_service_name value into the service_name label
          regex: "(.*)"
          replacement: $1
          target_label: service_name
        - regex: __meta_kubernetes_service_label_(.*)        # map the service labels onto the target
          replacement: $1
          action: labelmap


      # Monitor the controller-manager
      - job_name: "kube-controller"
        metrics_path: "/metrics"
        scheme: https
        kubernetes_sd_configs:
        - role: pod
        tls_config:
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        # Keep only Pods whose component label equals kube-controller-manager
        relabel_configs:
        - source_labels: ["__meta_kubernetes_pod_label_component"]
          regex: "kube-controller-manager"
          action: keep
        # Rewrite the scrape port to 10257
        - source_labels: [__address__]
          regex: (.*)
          replacement: $1:10257
          target_label: __address__
        # Keep specific labels
        - source_labels: [__meta_kubernetes_namespace]
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: [__meta_kubernetes_pod_name]
          regex: "(.*)"
          replacement: $1
          target_label: pod_name

      # Monitor the Scheduler
      - job_name: "kube-scheduler"
        metrics_path: "/metrics"
        scheme: https
        kubernetes_sd_configs:
        - role: pod
        tls_config:
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

        # Filter targets by label
        relabel_configs:
        - source_labels: ["__meta_kubernetes_pod_label_component"]
          regex: "kube-scheduler"
          action: keep

        # Rewrite the scrape port
        - source_labels: ["__address__"]
          regex: (.*)
          replacement: $1:10259
          target_label: __address__

        # Add dimension labels
        - source_labels: ["__meta_kubernetes_namespace"]
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_pod_name"]
          regex: "(.*)"
          replacement: $1
          target_label: pod_name

      # Monitor etcd
      - job_name: "kube-etcd"
        metrics_path: "/metrics"
        scheme: http
        kubernetes_sd_configs:
        - role: pod

        # Keep only Pods carrying the matching label
        relabel_configs:
        - source_labels: ["__meta_kubernetes_pod_label_component"]
          regex: "etcd"
          action: keep

        # Rewrite the port
        - source_labels: ["__address__"]
          regex: (.*)
          replacement: $1:2381
          target_label: __address__

        # Add dimension labels
        - source_labels: ["__meta_kubernetes_namespace"]
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_pod_name"]
          regex: "(.*)"
          replacement: $1
          target_label: pod_name

      # Monitor CoreDNS
      - job_name: "kube-dns"
        metrics_path: "/metrics"
        scheme: http
        kubernetes_sd_configs:
        - role: endpoints

        # Keep only Pods carrying the matching label
        relabel_configs:
        - source_labels: ["__meta_kubernetes_service_label_k8s_app"]
          regex: "kube-dns"
          action: keep

        # Rewrite the port
        - source_labels: ["__meta_kubernetes_pod_ip"]
          regex: (.*)
          replacement: $1:9153
          target_label: __address__

        # Add dimension labels
        - source_labels: ["__meta_kubernetes_namespace"]
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_pod_name"]
          regex: "(.*)"
          replacement: $1
          target_label: pod_name
        - source_labels: ["__meta_kubernetes_service_name"]
          regex: "(.*)"
          replacement: $1
          target_label: service_name

      # Monitor kube-proxy
      - job_name: "kube-proxy"
        metrics_path: "/metrics"
        scheme: http
        kubernetes_sd_configs:
        - role: pod

        # Keep only Pods carrying the matching label
        relabel_configs:
        - source_labels: ["__meta_kubernetes_pod_label_k8s_app"]
          regex: "kube-proxy"
          action: keep

        # Rewrite the port
        - source_labels: ["__meta_kubernetes_pod_ip"]
          regex: (.*)
          replacement: $1:10249
          target_label: __address__

        # Add dimension labels
        - source_labels: ["__meta_kubernetes_namespace"]
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_pod_name"]
          regex: "(.*)"
          replacement: $1
          target_label: pod_name

      # Monitor kube-state-metrics
      - job_name: "kube-state-metrics"
        metrics_path: "/metrics"
        scheme: http
        kubernetes_sd_configs:
        - role: endpoints

        # Keep only Pods carrying the matching label
        relabel_configs:
        - source_labels: ["__meta_kubernetes_service_label_app_kubernetes_io_name"]
          regex: "kube-state-metrics"
          action: keep

        # Rewrite the port
        - source_labels: ["__meta_kubernetes_pod_ip"]
          regex: (.*)
          replacement: $1:8080
          target_label: __address__

        # Map the service labels onto the target
        - regex: __meta_kubernetes_service_label_(.*)
          action: labelmap

      # Monitor the kubelet (Pod metrics)
      - job_name: "kube-kubelet"
        metrics_path: "/metrics/cadvisor"
        scheme: https
        kubernetes_sd_configs:
        - role: node
        tls_config:
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

        # Map the node labels onto the target
        relabel_configs:
        - regex: __meta_kubernetes_node_label_(.*)
          action: labelmap

[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml

2. Verify the configuration was updated

[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
  scrape_interval: 15s
  scrape_timeout:  15s
... (output matches the ConfigMap applied above; unchanged jobs omitted) ...
  # Monitor the kubelet (Pod metrics)
  - job_name: "kube-kubelet"
    metrics_path: "/metrics/cadvisor"
    scheme: https
    kubernetes_sd_configs:
    - role: node
    tls_config:
      insecure_skip_verify: true
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

    # Map the node labels onto the target
    relabel_configs:
    - regex: __meta_kubernetes_node_label_(.*)
      action: labelmap
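
Besides eyeballing the rendered file, its syntax can be validated with promtool from inside the Pod (a sketch; promtool check config also validates the referenced rule files):

[root@k8s-master01 04-prometheus]# kubectl exec -n monitoring prometheus-0 -- \
    promtool check config /etc/prometheus/prometheus.yml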

3. Reload Prometheus, then check the targets to confirm the Kubernetes nodes' kubelets are being scraped correctly

[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload

4. Check the scrape results in Prometheus

[screenshot: Prometheus targets page]
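
Target health can also be confirmed through the HTTP API; a minimal sketch reusing the domain from the reload step above — each kube-kubelet target should report up == 1:

[root@k8s-master01 04-prometheus]# curl -s 'http://k8s-prom.hmallleasing.com/api/v1/query' \
    --data-urlencode 'query=up{job="kube-kubelet"}'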

5. Review the kube_pods_rules.yml alert-rule file created earlier

  kube_pods_rules.yml: |-
    groups:
    - name: Pod alert rules
      rules:
      - alert: PodContainerCPUUsageHigh
        expr: sum (rate(container_cpu_usage_seconds_total{image!=""}[5m])) by (instance,job,pod,namespace) * 100 > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Pod '{{ $labels.pod }}' on node '{{ $labels.instance }}' has high CPU usage"
          description: "CPU usage of Pod '{{ $labels.pod }}' in namespace '{{ $labels.namespace }}' is currently {{ $value }}%, above the 80% threshold."

      - alert: PodContainerMemoryUsageHigh
        expr: |
          sum(container_memory_working_set_bytes{name!=""}) by (instance,job,pod,namespace)
          /
          sum(container_spec_memory_limit_bytes{name!=""} > 0) by (instance,job,pod,namespace) * 100 > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Pod '{{ $labels.pod }}' on node '{{ $labels.instance }}' has high memory usage"
          description: Pod '{{ $labels.pod }}' in namespace '{{ $labels.namespace }}' has a memory limit of {{ printf `sum (container_spec_memory_limit_bytes{namespace="%s",pod="%s"} > 0 ) /1024 /1024` $labels.namespace $labels.pod | query | first | value }}MB; usage has reached {{ $value }}%, above 80% of the limit.

      - alert: PodNetworkTransmitRateHigh
        expr: sum(rate(container_network_transmit_bytes_total{image!=""}[1m])) by (instance,job,pod,namespace) * 8 /1024 /1024 > 50
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Pod '{{ $labels.pod }}' on node '{{ $labels.instance }}' has a high network transmit rate"
          description: "Network transmit rate of Pod '{{ $labels.pod }}' in namespace '{{ $labels.namespace }}' has reached {{ $value }}Mbps, above the 50Mbps threshold."

      - alert: PodNetworkReceiveRateHigh
        expr: sum(rate(container_network_receive_bytes_total{image!=""}[1m])) by (instance,job,pod,namespace) * 8 /1024 /1024 > 50
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Pod '{{ $labels.pod }}' on node '{{ $labels.instance }}' has a high network receive rate"
          description: "Network receive rate of Pod '{{ $labels.pod }}' in namespace '{{ $labels.namespace }}' has reached {{ $value }}Mbps, above the 50Mbps threshold."

      - alert: PodDiskWriteThroughputHigh
        expr: sum (rate(container_fs_writes_bytes_total{name!=""}[1m])) by (instance,job,pod,namespace) /1024 /1024 > 20
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Pod '{{ $labels.pod }}' on node '{{ $labels.instance }}' has high disk write throughput"
          description: "Disk write throughput of Pod '{{ $labels.pod }}' in namespace '{{ $labels.namespace }}' has reached {{ $value }}MB/s, above the 20MB/s threshold."

      - alert: PodDiskReadThroughputHigh
        expr: sum (rate(container_fs_reads_bytes_total{name!=""}[1m])) by (instance,job,pod,namespace) /1024 /1024 > 20
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Pod '{{ $labels.pod }}' on node '{{ $labels.instance }}' has high disk read throughput"
          description: "Disk read throughput of Pod '{{ $labels.pod }}' in namespace '{{ $labels.namespace }}' has reached {{ $value }}MB/s, above the 20MB/s threshold."
# 7.11 Monitoring Kubernetes cluster Service resources
# 7.11.1 Why monitor Service resources

We monitor Service resources to make sure our services stay continuously available, usually along two dimensions:

● 1. Availability: make sure the Service can always be reached, guaranteeing service continuity.

● 2. Performance: monitor the Service's response time, making sure it handles requests at a consistently stable speed.

To keep Services available, we typically use Blackbox TCP probing.

The Blackbox setup used earlier was static and inflexible. To improve on that, when monitoring Services we can make the probe targets auto-discovered: whenever a new Service appears or an existing one changes, the monitoring system picks it up automatically, with no manual edits to the Prometheus configuration.

  - job_name: 'blackbox_http'
    metrics_path: /probe # the metrics path this time is /probe, not /metrics
    params: # parameters passed to the exporter
      module: [http_2xx] # which module to use for the probe
    static_configs:
    - targets: ["https://www.xuliangwei.com","http://www.oldxu.net","https://www.baidu.com","http://httpbin.org/status/400","https://httpstat.us/500","https://httpstat.us/502"]
    relabel_configs:
    - source_labels: [__address__]
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: instance
    - target_label: __address__
      replacement: prom-node04.oldxu.net:9115

# relabel_configs rewrites labels; three operations happen here:
# 1. Copy the target address (__address__) into __param_target, the "target" parameter Blackbox Exporter expects.
# 2. Copy __param_target into the instance label, so the instance shown in the Prometheus UI is the probed site rather than the Blackbox address.
# 3. Finally, set the address the probe request is actually sent to (__address__) to the node running Blackbox Exporter (prom-node04.oldxu.net:9115), so Prometheus directs its probe requests there.
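
The net effect is equivalent to the following hand-built probe request against the exporter, which is also a quick way to verify a module works (a sketch using the addresses above):

curl -s 'http://prom-node04.oldxu.net:9115/probe?module=http_2xx&target=https://www.baidu.com' | grep probe_success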
# 7.11.2 Configuring Prometheus to monitor Services

1. Add a new Job named kube-blackbox-tcp, probing over TCP with the metrics path /probe.

2. Auto-discover all Services via the Kubernetes service role, then monitor them automatically.

3. Use relabel_configs to keep the __meta_kubernetes_namespace and __meta_kubernetes_service_name dimension labels.

1. Configure Prometheus

[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-configs
  namespace: monitoring
data:
  prometheus.yml: |-
    global:
      scrape_interval: 15s
      scrape_timeout:  15s

    # Alerting target (use the AlertManager load-balancing address)
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["alertmanager-svc:9093"]

    # Alert rule files
    rule_files:
      - "/etc/prometheus/rules/*.yml"

    scrape_configs:
      # ... all jobs already shown in section 7.10.2 are unchanged and omitted here ...

      # Monitor Services
      - job_name: "kube-blackbox-tcp"
        metrics_path: "/probe"
        params:
          module: [tcp_connect]         # use the tcp_connect module
        kubernetes_sd_configs:
        - role: service
        relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: blackbox-svc:9115
        # Keep specific labels
        - source_labels: [__meta_kubernetes_namespace]
          regex: (.*)
          replacement: $1
          target_label: namespace
        - source_labels: [__meta_kubernetes_service_name]
          regex: (.*)
          replacement: $1
          target_label: service_name

[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml

2. Verify the configuration was updated

[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
  scrape_interval: 15s
  scrape_timeout:  15s
... (output matches the ConfigMap applied above; unchanged jobs omitted) ...
  # Monitor Services
  - job_name: "kube-blackbox-tcp"
    metrics_path: "/probe"
    params:
      module: [tcp_connect]         # use the tcp_connect module
    kubernetes_sd_configs:
    - role: service
    relabel_configs:
    - source_labels: [__address__]
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: instance
    - target_label: __address__
      replacement: blackbox-svc:9115
    # Keep specific labels
    - source_labels: [__meta_kubernetes_namespace]
      regex: (.*)
      replacement: $1
      target_label: namespace
    - source_labels: [__meta_kubernetes_service_name]
      regex: (.*)
      replacement: $1
      target_label: service_name

3. Reload Prometheus, then check the targets to confirm the Service resources are being monitored correctly

[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload

4. Check the scrape results in Prometheus

[screenshot: Prometheus targets page]
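
Probe health for the discovered Services can also be pulled from the HTTP API (a sketch); every reachable Service should report probe_success == 1:

[root@k8s-master01 04-prometheus]# curl -s 'http://k8s-prom.hmallleasing.com/api/v1/query' \
    --data-urlencode 'query=probe_success{job="kube-blackbox-tcp"}'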

5. Review the blackbox_tcp_rules.yml alert-rule file created earlier

  blackbox_tcp_rules.yml: |-
    groups:
    - name: Blackbox TCP alert rules
      rules:
      - alert: ServiceTCPProbeFailed
        expr: sum(probe_success{job=~".*tcp"}) by (instance,job,namespace,service_name) == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "TCP probe of Service '{{ $labels.instance }}' failed."
          description: "Probing address '{{ $labels.instance }}' of Service '{{ $labels.service_name }}' in namespace '{{ $labels.namespace }}' failed."

      - alert: ServiceTCPResponseSlow
        expr: probe_duration_seconds{job=~".*tcp"} > 0.500
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "TCP response time of Service '{{ $labels.instance }}' exceeded 500ms."
          description: "Address '{{ $labels.instance }}' of Service '{{ $labels.service_name }}' in namespace '{{ $labels.namespace }}' currently responds in {{ $value }} seconds."

      - alert: ServiceDNSLookupSlow
        expr: probe_dns_lookup_time_seconds{job=~".*tcp"} > 0.500
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "DNS lookup time for Service '{{ $labels.instance }}' exceeded 500ms."
          description: "Address '{{ $labels.instance }}' of Service '{{ $labels.service_name }}' in namespace '{{ $labels.namespace }}' currently resolves in {{ $value }} seconds."
# 7.12 Monitoring Kubernetes cluster Ingress resources
# 7.12.1 Why monitor Ingress resources

We monitor the domains behind Ingress resources mainly to make sure users can always reach the services those domains provide. The dimensions watched are:

  1. HTTP request latency: monitor how long the site takes to handle requests; if latency climbs too high, an alert can be pushed so the problem is handled immediately.

  2. Certificate expiry: monitor the validity window of TLS/SSL certificates so they can be renewed in time, avoiding outages caused by expired certificates.

  3. Availability: continuously check that the domains bound to each Ingress stay reachable, so user access is never interrupted.

To implement this, the Blackbox Exporter's HTTP probes are typically used to check the health and response time of each Ingress domain.

The Blackbox setup used earlier was static and inflexible. To improve on that, when monitoring Ingress we can make the probe targets auto-discovered: whenever a new Ingress appears or an existing one changes, the monitoring system picks it up automatically, with no manual edits to the Prometheus configuration.

The static blackbox_http job and the three relabel rewrites it relies on are the same as those shown in section 7.11.1 above, so they are not repeated here; the only change below is swapping static_configs for ingress-based service discovery.
# 7.12.2 Configuring Prometheus to monitor Ingress
  • 1. Add a new Job named kube-blackbox-http, probing over HTTP;
  • 2. Auto-discover all Ingress resources via the Kubernetes ingress role, then monitor them;
  • 3. Use relabel_configs to keep the __meta_kubernetes_namespace, __meta_kubernetes_ingress_name, and __meta_kubernetes_ingress_class_name dimension labels (see the rewrite sketch right after this list).
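
For the ingress role, the discovered scheme and address must be joined into a single probe URL before being handed to the exporter; a minimal sketch of that rewrite, using hypothetical values built from a host name that appears elsewhere in this document:

# Discovered by the ingress role (hypothetical values for illustration):
#   __meta_kubernetes_ingress_scheme = "http"
#   __address__                      = "k8s-prom.hmallleasing.com"
# The rule regex (.*);(.*) with replacement $1://$2 then produces:
#   __param_target = "http://k8s-prom.hmallleasing.com"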

1. Configure Prometheus

[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-configs
  namespace: monitoring
data:
  prometheus.yml: |-
    global:
      scrape_interval: 15s
      scrape_timeout:  15s

    # Alerting target (use the AlertManager load-balancing address)
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["alertmanager-svc:9093"]

    # Alert rule files
    rule_files:
      - "/etc/prometheus/rules/*.yml"

    scrape_configs:
      # ... all jobs already shown in sections 7.10.2 and 7.11.2 are unchanged and omitted here ...

      # Monitor Ingress
      - job_name: "kube-blackbox-http"
        metrics_path: "/probe"
        params:
          module: [http_2xx]            # use the HTTP module for probing
        kubernetes_sd_configs:
        - role: ingress
        relabel_configs:
        # The scheme may be http or https, so the discovered scheme and port are joined into the concrete probe instance
        - source_labels: [__meta_kubernetes_ingress_scheme,__address__]
          regex: (.*);(.*)
          replacement: $1://$2
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: blackbox-svc:9115
        # Keep specific labels
        - source_labels: [__meta_kubernetes_namespace]
          target_label: namespace
        - source_labels: [__meta_kubernetes_ingress_name]
          target_label: ingress_name
        - source_labels: [__meta_kubernetes_ingress_class_name]
          target_label: ingress_class_name
[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml

2. Verify the configuration was updated

[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
global:
  scrape_interval: 15s
  scrape_timeout:  15s
... (output matches the ConfigMap applied above; unchanged jobs omitted) ...
  # Monitor Ingress
  - job_name: "kube-blackbox-http"
    metrics_path: "/probe"
    params:
      module: [http_2xx]            # use the HTTP module for probing
    kubernetes_sd_configs:
    - role: ingress
    relabel_configs:
    # The scheme may be http or https, so the discovered scheme and port are joined into the concrete probe instance
    - source_labels: [__meta_kubernetes_ingress_scheme,__address__]
      regex: (.*);(.*)
      replacement: $1://$2
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: instance
    - target_label: __address__
      replacement: blackbox-svc:9115
    # Keep specific labels
    - source_labels: [__meta_kubernetes_namespace]
      target_label: namespace
    - source_labels: [__meta_kubernetes_ingress_name]
      target_label: ingress_name
    - source_labels: [__meta_kubernetes_ingress_class_name]
      target_label: ingress_class_name

3. Reload Prometheus, then check the targets to confirm the Ingress resources are being monitored correctly

[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload

4. Check the scrape results in Prometheus

[screenshot: Prometheus targets page]
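
The remaining certificate lifetime (in days) per probed domain can be inspected with the same expression the rules below use, evaluated in the expression browser (a sketch):

(probe_ssl_earliest_cert_expiry{job="kube-blackbox-http"} - time()) / 86400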

5. Review the blackbox_http_rules.yml alert-rule file created earlier

 blackbox_http_rules.yml: |-
    groups:
    - name: Blackbox_http告警规则文件
      rules:
      - alert: 站点平均请求过长
        expr: sum (avg_over_time(probe_http_duration_seconds[1m])) by (instance,job,namespace,ingress_name) > 3
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "'{{ $labels.instance }}' 域名整体请求时间超过了3秒。"
          description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名最近1分钟的平均请求时间超过3秒。当前平均请求时间:{{ $value }}秒。"

      - alert: 站点阶段耗时过长
        expr: |
          (
            probe_http_duration_seconds{phase="connect"} > 1 or
            probe_http_duration_seconds{phase="processing"} > 1 or
            probe_http_duration_seconds{phase="resolve"} > 1 or
            probe_http_duration_seconds{phase="tls"} > 1 or
            probe_http_duration_seconds{phase="transfer"} > 1
          )
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "'{{ $labels.instance }}' 域名在 '{{ $labels.phase }}' 阶段耗时过长"
          description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名在阶段 '{{ $labels.phase }}' 的耗时超过1秒。当前耗时:{{ $value }}秒。"

      - alert: 站点响应状态码异常
        expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "'{{ $labels.instance }}' 域名返回异常状态码"
          description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名返回的状态码为 {{ $value }},表明请求可能存在问题。"
    
      - alert: 重定向次数过多
        expr: probe_http_redirects > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "'{{ $labels.instance }}' 域名重定向次数过多"
          description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名在最近的探测中重定向次数超过5次。当前次数:{{ $value }}次。"

      - alert: 证书即将过期<30
        expr: (probe_ssl_earliest_cert_expiry - time()) /86400 < 30
        for: 24h
        labels:
          severity: critical
        annotations:
          summary: "'{{ $labels.instance }}' 域名的 SSL 证书即将过期"
          description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名的 SSL 证书将在 {{ $value }} 天内过期。"
    
      - alert: 证书即将过期<7
        expr: (probe_ssl_earliest_cert_expiry - time()) /86400 < 7
        for: 24h
        labels:
          severity: critical
        annotations:
          summary: "'{{ $labels.instance }}' 域名的 SSL 证书即将过期"
          description: "{{ $labels.namespace }} 名称空间 '{{ $labels.instance }}' 域名的 SSL 证书将在 {{ $value }} 天内过期。"
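
Before reloading, the rules file can be validated with promtool (a sketch; the official prom/prometheus image ships promtool, and the rule files are assumed to be mounted at /etc/prometheus/rules/ as configured earlier):

[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- \
  promtool check rules /etc/prometheus/rules/blackbox_http_rules.yml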

# 8. Monitoring a Redis Application Pod with Prometheus

# 8.1 Redis Monitoring Scenario

Run a Redis Pod and create a Service for it. We want to monitor the Pod's status, its resource usage, the Redis metrics themselves, and the latency and liveness of the Redis Service.

  • 1. Redis application metrics: inject a redis_exporter sidecar into the Pod to scrape the Redis metrics and expose them to Prometheus;
  • 2. Pod status: kube-state-metrics collects this automatically, so nothing extra is needed;
  • 3. Pod resource usage: the kubelet's cAdvisor collects per-Pod resource usage automatically, so nothing extra is needed;
  • 4. Pod liveness and latency: Blackbox can probe ports 6379 and 9121; the kube-blackbox-tcp job created earlier picks these Service ports up automatically, so nothing extra is needed.
# 8.2 Run the Redis Service Pod

1. Run Redis and redis_exporter in the same Pod; the manifest is as follows:

[root@k8s-master01 08-redis-exporter]# cat 01-redis-deployment.yaml 
apiVersion: apps/v1
kind: Deployment
metadata:
  name: redis
  namespace: default
spec:
  selector:
    matchLabels:
      app: redis
  template:
    metadata:
      labels:
        app: redis
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/scheme: "http"
        prometheus.io/path: "/metrics"
        prometheus.io/port: "9121"
    spec:
      containers:
      - name: redis
        image: redis:6
        # Cap Redis memory (maxmemory defaults to 0, i.e. unlimited)
        command: ["redis-server"]
        args: ["--maxmemory", "200mb"]
        ports:
        - containerPort: 6379
      - name: redis-exporter
        image: oliver006/redis_exporter:v1.57.0
        ports:
        - containerPort: 9121

2. Create the Service, exposing both ports

[root@k8s-master01 08-redis-exporter]# cat 02-redis-service.yaml 
kind: Service
apiVersion: v1
metadata:
  name: redis-svc
  namespace: default
  labels:
    app: redis
spec:
  selector:
    app: redis
  ports:
  - name: redis
    port: 6379
    targetPort: 6379
  - name: exporter
    port: 9121
    targetPort: 9121

3. Inspect the Service

[root@k8s-master01 08-redis-exporter]# kubectl apply -f .
[root@k8s-master01 08-redis-exporter]#  kubectl describe service redis-svc
Name:              redis-svc
Namespace:         default
Labels:            app=redis
Annotations:       <none>
Selector:          app=redis
Type:              ClusterIP
IP Family Policy:  SingleStack
IP Families:       IPv4
IP:                10.96.140.137
IPs:               10.96.140.137
# endpoint 1
Port:              redis  6379/TCP
TargetPort:        6379/TCP
Endpoints:         172.16.85.208:6379
# endpoint 2
Port:              exporter  9121/TCP
TargetPort:        9121/TCP
Endpoints:         172.16.85.208:9121
Session Affinity:  None
Events:            <none>
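
Before wiring the exporter into Prometheus, it is worth confirming the sidecar actually serves metrics. A quick check against the ClusterIP shown above (the IP, and therefore the command, will differ in your environment):

[root@k8s-master01 08-redis-exporter]# curl -s http://10.96.140.137:9121/metrics | grep ^redis_up
redis_up 1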
# 8.3 Configure Prometheus to Monitor Redis
  • 1. Add a new job named kube-redis;
  • 2. Use Kubernetes endpoints-based service discovery;
  • 3. Use relabel_configs to keep only instances whose __address__ port is 9121 (see the preview after this list);
  • 4. Use relabel_configs to carry over the __meta_kubernetes_namespace, __meta_kubernetes_service_name, and __meta_kubernetes_pod_name labels.
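
Since the endpoints role generates one candidate target per address/port pair, the Service above yields both a :6379 and a :9121 target, and the keep rule filters these down to the exporter port. The discovered pairs can be previewed directly (the addresses below come from the Service described above and will differ in your cluster):

[root@k8s-master01 08-redis-exporter]# kubectl get endpoints redis-svc
NAME        ENDPOINTS                                AGE
redis-svc   172.16.85.208:9121,172.16.85.208:6379    5m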

1. Configure Prometheus

[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-configs
  namespace: monitoring
data:
  prometheus.yml: |-
    global:
      scrape_interval: 15s
      scrape_timeout:  15s
    
    # Alerting endpoint (the AlertManager load-balanced Service address)
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["alertmanager-svc:9093"]

    # Alert rule files
    rule_files:
      - "/etc/prometheus/rules/*.yml"

    scrape_configs:
      - job_name: "prometheus"
        metrics_path: "/metrics"
        static_configs:
        - targets: ["localhost:9090"]

      # Monitor the Kubernetes nodes (node_exporter)
      - job_name: "kube-nodes"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: node
        relabel_configs:
        - source_labels: ["__address__"]
          regex: "(.*):10250"
          replacement: "$1:9100"
          target_label: __address__
          action: replace
        - regex: __meta_kubernetes_node_label_(.*)
          replacement: $1
          action: labelmap

      # Monitor the APIServer
      - job_name: "kube-apiserver"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: endpoints
        scheme: https
        tls_config:
          insecure_skip_verify: true   # skip certificate verification
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        # Relabeling
        relabel_configs:
        - source_labels: ["__meta_kubernetes_service_label_component"]  # keep only instances whose component label is "apiserver"
          regex: "apiserver"
          action: "keep"
        - source_labels: ["__meta_kubernetes_namespace"]        # copy __meta_kubernetes_namespace into namespace
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_service_name"]    # copy __meta_kubernetes_service_name into service_name
          regex: "(.*)"
          replacement: $1
          target_label: service_name
        - regex: __meta_kubernetes_service_label_(.*)        # map service labels onto the target via labelmap
          replacement: $1
          action: labelmap


      # Monitor the controller-manager
      - job_name: "kube-controller"
        metrics_path: "/metrics"
        scheme: https
        kubernetes_sd_configs:
        - role: pod
        tls_config:
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        # Keep only Pods whose component label equals kube-controller-manager
        relabel_configs:
        - source_labels: ["__meta_kubernetes_pod_label_component"]
          regex: "kube-controller-manager"
          action: keep
        # Rewrite the scrape port to 10257
        - source_labels: [__address__]
          regex: (.*)
          replacement: $1:10257
          target_label: __address__
        # Carry over specific labels
        - source_labels: [__meta_kubernetes_namespace]
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: [__meta_kubernetes_pod_name]
          regex: "(.*)"
          replacement: $1
          target_label: pod_name

      # Monitor the Scheduler
      - job_name: "kube-schduler"
        metrics_path: "/metrics"
        scheme: https
        kubernetes_sd_configs:
        - role: pod
        tls_config:
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

        # Filter by label
        relabel_configs:
        - source_labels: ["__meta_kubernetes_pod_label_component"]
          regex: "kube-scheduler"
          action: keep

        # Rewrite the scrape port
        - source_labels: ["__address__"]
          regex: (.*)
          replacement: $1:10259
          target_label: __address__

        # Add dimension labels
        - source_labels: ["__meta_kubernetes_namespace"]
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_pod_name"]
          regex: "(.*)"
          replacement: $1
          target_label: pod_name

      # Monitor etcd
      - job_name: "kube-etcd"
        metrics_path: "/metrics"
        scheme: http
        kubernetes_sd_configs:
        - role: pod

        # Keep only Pods with the matching label
        relabel_configs:
        - source_labels: ["__meta_kubernetes_pod_label_component"]
          regex: "etcd"
          action: keep

        # Rewrite the scrape port
        - source_labels: ["__address__"]
          regex: (.*)
          replacement: $1:2381
          target_label: __address__

        # Add dimension labels
        - source_labels: ["__meta_kubernetes_namespace"]
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_pod_name"]
          regex: "(.*)"
          replacement: $1
          target_label: pod_name

      # Monitor CoreDNS
      - job_name: "kube-dns"
        metrics_path: "/metrics"
        scheme: http
        kubernetes_sd_configs:
        - role: endpoints

        # Keep only targets with the matching label
        relabel_configs:
        - source_labels: ["__meta_kubernetes_service_label_k8s_app"]
          regex: "kube-dns"
          action: keep

        # Rewrite the scrape port
        - source_labels: ["__meta_kubernetes_pod_ip"]
          regex: (.*)
          replacement: $1:9153
          target_label: __address__

        # Add dimension labels
        - source_labels: ["__meta_kubernetes_namespace"]
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_pod_name"]
          regex: "(.*)"
          replacement: $1
          target_label: pod_name
        - source_labels: ["__meta_kubernetes_service_name"]
          regex: "(.*)"
          replacement: $1
          target_label: service_name

      # Monitor kube-proxy
      - job_name: "kube-proxy"
        metrics_path: "/metrics"
        scheme: http
        kubernetes_sd_configs:
        - role: pod

        # Keep only Pods with the matching label
        relabel_configs:
        - source_labels: ["__meta_kubernetes_pod_label_k8s_app"]
          regex: "kube-proxy"
          action: keep

        # Rewrite the scrape port
        - source_labels: ["__meta_kubernetes_pod_ip"]
          regex: (.*)
          replacement: $1:10249
          target_label: __address__

        # Add dimension labels
        - source_labels: ["__meta_kubernetes_namespace"]
          regex: "(.*)"
          replacement: $1
          target_label: namespace
        - source_labels: ["__meta_kubernetes_pod_name"]
          regex: "(.*)"
          replacement: $1
          target_label: pod_name

      # Monitor kube-state-metrics
      - job_name: "kube-state-metrics"
        metrics_path: "/metrics"
        scheme: http
        kubernetes_sd_configs:
        - role: endpoints

        # Keep only targets with the matching label
        relabel_configs:
        - source_labels: ["__meta_kubernetes_service_label_app_kubernetes_io_name"]
          regex: "kube-state-metrics"
          action: keep

        # Rewrite the scrape port
        - source_labels: ["__meta_kubernetes_pod_ip"]
          regex: (.*)
          replacement: $1:8080
          target_label: __address__

        # Add dimension labels via labelmap
        - regex: __meta_kubernetes_service_label_(.*)
          action: labelmap

      # Monitor kubelet (cAdvisor, per-Pod metrics)
      - job_name: "kube-kubelet"
        metrics_path: "/metrics/cadvisor"
        scheme: https
        kubernetes_sd_configs:
        - role: node
        tls_config:
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

        # Map node labels onto the target
        relabel_configs:
        - regex: __meta_kubernetes_node_label_(.*)
          action: labelmap

      # Monitor Services (TCP probe via Blackbox)
      - job_name: "kube-blackbox-tcp"
        metrics_path: "/probe"
        params:
          module: [tcp_connect]         # use the tcp_connect module
        kubernetes_sd_configs:
        - role: service
        relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: blackbox-svc:9115
        # Carry over specific labels
        - source_labels: [__meta_kubernetes_namespace]
          regex: (.*)
          replacement: $1
          target_label: namespace
        - source_labels: [__meta_kubernetes_service_name]
          regex: (.*)
          replacement: $1
          target_label: service_name

      # Monitor Ingresses (HTTP probe via Blackbox)
      - job_name: "kube-blackbox-http"
        metrics_path: "/probe"
        params:
          module: [http_2xx]            # probe with the http_2xx module
        kubernetes_sd_configs:
        - role: ingress
        relabel_configs:
        # The scheme may be http or https, so build the probe target from the discovered scheme + address
        - source_labels: [__meta_kubernetes_ingress_scheme,__address__]
          regex: (.*);(.*)
          replacement: $1://$2
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: blackbox-svc:9115
        # Carry over specific labels
        - source_labels: [__meta_kubernetes_namespace]
          target_label: namespace
        - source_labels: [__meta_kubernetes_ingress_name]
          target_label: ingress_name
        - source_labels: [__meta_kubernetes_ingress_class_name]
          target_label: ingress_class_name

      # Monitor Redis
      - job_name: "kube-redis"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: endpoints

        # Keep only instances whose address port is 9121
        relabel_configs:
        - source_labels: [__address__]
          regex: (.*):9121
          action: keep

        # Carry over specific dimension labels
        - source_labels: [__meta_kubernetes_namespace]
          target_label: namespace
        - source_labels: [__meta_kubernetes_service_name]
          target_label: service_name
        - source_labels: [__meta_kubernetes_pod_name]
          target_label: pod_name

[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml
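
A useful extra check, once the ConfigMap change has propagated into the Pod (step 2 below), is to validate the rendered file with promtool before reloading (a sketch; the official Prometheus image ships promtool):

[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- \
  promtool check config /etc/prometheus/prometheus.yml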

2. Verify the config was updated

[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml
...(output matches the prometheus.yml defined in the ConfigMap above; truncated)...

3. Reload Prometheus, then check the targets page to confirm the kube-redis targets are being scraped correctly

[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload

4. Check the scrape results in Prometheus

(screenshot: Prometheus targets page showing the kube-redis job)
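
The same thing can be confirmed with an instant query against the Prometheus API; redis_up should return 1 for the discovered Pod (a sketch, assuming jq is installed):

[root@k8s-master01 04-prometheus]# curl -s "http://k8s-prom.hmallleasing.com/api/v1/query?query=redis_up" \
  | jq '.data.result[0].value'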

5. Review the previously created redis_rules.yml alert rules file

  redis_rules.yml: |-
    groups:
    - name: redis告警规则
      rules:
      - alert: Redis实例宕机
        expr: redis_up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例宕机"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod在过去5分钟内无法连接。"

      - alert: Redis连接数过高
        expr: redis_connected_clients / redis_config_maxclients * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例连接数超过80%"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod当前连接数占最大连接数的比率超过80%。当前比率: {{ $value }}%。"

      - alert: Redis连接被拒绝
        expr: increase(redis_rejected_connections_total[1h]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例有连接被拒绝"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod在过去1小时内有连接被拒绝。当前被拒绝的连接数: {{ $value }}。"

      - alert: Redis内存使用率过高
        expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例内存使用率超过80%"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod的内存使用率超过配置的最大内存值的80%。当前内存使用率: {{ $value }}%。"

      - alert: Redis缓存命中率低
        expr: |
          irate(redis_keyspace_hits_total[5m])
          / 
          (irate(redis_keyspace_hits_total[5m]) + irate(redis_keyspace_misses_total[5m])) * 100 < 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例缓存命中率低于90%"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod最近5分钟内的缓存命中率低于90%。当前命中率: {{ $value }}%。"

      - alert: Redis即将过期的Key数量过多
        expr: |
          sum(redis_db_keys_expiring) by (instance, job, namespace,pod_name,db)
          / 
          sum(redis_db_keys) by (instance, job, namespace,pod_name,db) * 100 > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例中的 '{{ $labels.db }}' 数据库有大量即将过期的Key"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod中的 '{{ $labels.db }}' 数据库有超过50%的Key即将过期。当前过期比率: {{ $value }}%。"

      - alert: RedisRDB备份失败
        expr: redis_rdb_last_bgsave_status == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例 RDB备份失败"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod最近的RDB备份失败。"

      - alert: RedisRDB备份时间过长
        expr: redis_rdb_last_bgsave_duration_sec > 3 and redis_rdb_last_bgsave_status == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例 RDB备份成功但耗时超过3秒"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod, RDB备份成功但耗时超过了3秒。持续时间: {{ $value }}秒。"

      - alert: RedisRDB备份过期
        expr: (time() - redis_rdb_last_save_timestamp_seconds) > 36000
        for: 24h
        labels:
          severity: critical
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例超过10小时未进行RDB备份"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod已超过10小时没有生成新的RDB备份文件。"

      - alert: Redis命令拒绝率过高
        expr: |
          sum(irate(redis_commands_rejected_calls_total[5m])) by (instance,job,namespace,pod_name)
          / 
          sum(irate(redis_commands_total[5m])) by (instance,job,namespace,pod_name) * 100 > 25
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例命令拒绝率超过25%"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod的命令拒绝率超过了25%。当前拒绝率: {{ $value }}%。"

      - alert: Redis命令平均响应时间过长
        expr: |
          sum(rate(redis_commands_duration_seconds_total[5m])) by (instance,job,namespace,pod_name)
          / 
          sum(rate(redis_commands_processed_total[5m])) by (instance,job,namespace,pod_name) > 0.250
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "'{{ $labels.instance }}' Redis实例命令平均响应时间超过250ms"
          description: "'{{ $labels.namespace }}' 名称空间中 '{{ $labels.pod_name }}' Pod的执行命令平均响应时间超过了250毫秒。当前平均响应时间: {{ $value }}秒。"
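
Expressions like the cache hit rate above can be tried interactively before trusting the alert (a sketch; --data-urlencode avoids having to escape the PromQL by hand):

[root@k8s-master01 04-prometheus]# curl -s -G "http://k8s-prom.hmallleasing.com/api/v1/query" \
  --data-urlencode 'query=irate(redis_keyspace_hits_total[5m]) / (irate(redis_keyspace_hits_total[5m]) + irate(redis_keyspace_misses_total[5m])) * 100'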

# 9. Monitoring Java Business Application Pods with Prometheus

# 9.1 Application Monitoring Scenario

Run the javaapp business Pod, then create a Service and an Ingress for it. We want to monitor the Pod status, Pod resource usage, JVM memory metrics, TCP checks on the Service, and the response status, latency, and liveness of the domain.

  • 1. Java application metrics: run a jmx_exporter init container that shares the agent jar and config.yaml with the main container; the main container passes the agent startup flags via a JVM environment variable, which enables JVM monitoring;

  • 2. Pod status: kube-state-metrics collects this automatically, so nothing extra is needed;

  • 3. Pod resource usage: the kubelet's cAdvisor collects per-Pod resource usage automatically, so nothing extra is needed;

  • 4. Service liveness and latency: the existing kube-blackbox-tcp job automatically probes the Service's ports 8080 and 12345;

  • 5. Ingress HTTP status: the existing kube-blackbox-http job automatically monitors the Ingress domain status;

# 9.2 Run the Application Pod

1. There is no official jmx_prometheus image, so build one yourself (or use my prebuilt image oldxu3957/jmx_prometheus:v0.20.0).

[root@k8s-master01 dockerfile]# wget https://repo.maven.apache.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/0.20.0/jmx_prometheus_javaagent-0.20.0.jar

[root@k8s-master01 dockerfile]# cat config.yaml
rules:
- pattern: ".*"

# Dockerfile
[root@prom-node03 jmx_exporter]# cat Dockerfile
FROM alpine:latest
ENV VERSION="0.20.0"
ENV DIR=/jmx
COPY ./config.yaml ${DIR}/config.yaml
COPY ./jmx_prometheus_javaagent-${VERSION}.jar ${DIR}/jmx_prometheus.jar
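
The image can then be built and pushed (substitute your own registry namespace for oldxu3957 if you build it yourself):

[root@prom-node03 jmx_exporter]# docker build -t oldxu3957/jmx_prometheus:v0.20.0 .
[root@prom-node03 jmx_exporter]# docker push oldxu3957/jmx_prometheus:v0.20.0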

2. Run the Java application Pod

[root@k8s-master01 09-java-exporter]# cat 01-javaapp-deployment.yaml 
apiVersion: apps/v1
kind: Deployment
metadata:
  name: java-app
  namespace: default
spec:
  replicas: 2
  selector:
    matchLabels:
      app: java
  template:
    metadata:
      labels:
        app: java
    spec:
      volumes:
      - name: javaagent
        emptyDir: {}
      initContainers:                # init container: copies the whole /jmx directory into the shared javaagent volume
      - name: jmx-prometheus
        image: oldxu3957/jmx_prometheus:v0.20.0
        command: ["sh","-c","cp -rp /jmx /data/"]
        volumeMounts:
        - name: javaagent
          mountPath: /data
      containers:
      - name: javaapp
        image: oldxu3957/javaapp:v1.0
        env:
        - name: JAVA_TOOL_OPTIONS     # pass the JVM flags via the JAVA_TOOL_OPTIONS env var
          value: "-Xms100m -Xmx100m \
                 -javaagent:/agent/jmx/jmx_prometheus.jar=12345:/agent/jmx/config.yaml"
        volumeMounts:
        - name: javaagent
          mountPath: /agent
        ports:
        - name: java
          containerPort: 8080
        - name: jmx
          containerPort: 12345
        resources:
          requests:
            cpu: 100m
            memory: 200Mi
          limits:
            cpu: 100m
            memory: 200Mi
# 9.3 Expose the Java Application

1. Create the Service, exposing ports 8080 and 12345

[root@k8s-master01 09-java-exporter]# cat 02-javaapp-service.yaml 
kind: Service
apiVersion: v1
metadata:
  name: javaapp-svc
  namespace: default
  labels:
    app: java
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/scheme: "http"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "12345"
spec:
  selector:
    app: java
  ports:
  - name: javaapp
    port: 8080
    targetPort: 8080
  - name: jmx
    port: 12345
    targetPort: 12345

2. Create the Ingress

[root@k8s-master01 09-java-exporter]# cat 03-javaapp-ingress.yaml 
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: javaapp-ingress
  namespace: default
spec:
  ingressClassName: "nginx"
  rules:
  - host: "javaapp.hmallleasing.com"
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: javaapp-svc
            port:
              number: 8080

3. Apply the manifests

[root@k8s-master01 09-java-exporter]# kubectl apply -f .
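
Two quick sanity checks once the Pods start. First, the JVM prints a "Picked up" line whenever JAVA_TOOL_OPTIONS is set, which confirms the agent flags reached the process; second, the agent should serve metrics on port 12345 (port-forward is used here so curl is not needed inside the image):

[root@k8s-master01 09-java-exporter]# kubectl logs deploy/java-app -c javaapp | grep "Picked up JAVA_TOOL_OPTIONS"
Picked up JAVA_TOOL_OPTIONS: -Xms100m -Xmx100m -javaagent:/agent/jmx/jmx_prometheus.jar=12345:/agent/jmx/config.yaml

[root@k8s-master01 09-java-exporter]# kubectl port-forward svc/javaapp-svc 12345:12345 &
[root@k8s-master01 09-java-exporter]# curl -s http://127.0.0.1:12345/metrics | grep ^jvm_memory | head -3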
# 9.4 Configure Prometheus to Monitor the Application
  • 1. Add a new job named kube-java-pod;
  • 2. Use Kubernetes endpoints-based service discovery;
  • 3. Use relabel_configs to scrape only instances whose __address__ port is 12345;
  • 4. Use relabel_configs to carry over the __meta_kubernetes_namespace, __meta_kubernetes_service_name, and __meta_kubernetes_pod_name labels.

1. Configure Prometheus

[root@k8s-master01 04-prometheus]# cat 01-prom-configs-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-configs
  namespace: monitoring
data:
  prometheus.yml: |-
    global:
      scrape_interval: 15s
      scrape_timeout:  15s
    
    # Alerting endpoint (the AlertManager load-balanced Service address)
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["alertmanager-svc:9093"]

    # Alert rule files
    rule_files:
      - "/etc/prometheus/rules/*.yml"

    scrape_configs:
      # ... (all scrape jobs from section 8.3 are unchanged and omitted here for brevity) ...

      # Monitor the Java app
      - job_name: "kube-java-pod"
        metrics_path: "/metrics"
        kubernetes_sd_configs:
        - role: endpoints

        # Keep only instances whose address port is 12345
        relabel_configs:
        - source_labels: [__address__]
          regex: (.*):12345
          action: keep

        # Carry over specific dimension labels
        - source_labels: [__meta_kubernetes_namespace]
          target_label: namespace
        - source_labels: [__meta_kubernetes_service_name]
          target_label: service_name
        - source_labels: [__meta_kubernetes_pod_name]
          target_label: pod_name

[root@k8s-master01 04-prometheus]# kubectl apply -f 01-prom-configs-configmap.yaml

2. Verify the config was updated

[root@k8s-master01 04-prometheus]# kubectl exec -it prometheus-0 -n monitoring -- cat /etc/prometheus/prometheus.yml

3. Reload Prometheus, then check the targets page to confirm the kube-java-pod targets are being scraped correctly

[root@k8s-master01 04-prometheus]# curl -X POST http://k8s-prom.hmallleasing.com/-/reload

4. Check the scrape results in Prometheus
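
An instant query for a JVM metric confirms the kube-java-pod job is delivering data (a sketch, assuming jq is installed):

[root@k8s-master01 04-prometheus]# curl -s -G "http://k8s-prom.hmallleasing.com/api/v1/query" \
  --data-urlencode 'query=jvm_memory_bytes_used{area="heap"}' | jq '.data.result | length'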

# 9.5 Review the Previously Created jvm_rules.yml Alert Rules File
  jvm_rules.yml: |-
    groups:
    - name: "JVM告警规则"
      rules:
      - alert: JVM堆内存使用率过高
        expr: jvm_memory_bytes_used{area="heap",} / jvm_memory_bytes_max{area="heap",} * 100 > 90
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "'{{ $labels.instance }}' 实例的JVM 堆内存使用率超过90%"
          description: "'{{ $labels.namespace }}' 名称空间下的 '{{ $labels.pod_name }}' Pod JVM堆内存使用率超过90%, 当前使用率是 {{ $value }}%"

      - alert: JVMGC时间过长
        expr: sum (rate(jvm_gc_collection_seconds_sum[5m]) / rate(jvm_gc_collection_seconds_count[5m])) by (instance,job,gc,namespace,pod_name) > 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "'{{ $labels.instance }}' 实例的JVM  GC时间超过了1秒。"
          description: "'{{ $labels.namespace }}' 名称空间下的 '{{ $labels.pod_name }}' Pod使用 {{ $labels.gc }} GC垃圾回收算法时间超过1s,当前值 {{ $value }}秒"

      - alert: JVM死锁线程过多
        expr: min_over_time(jvm_threads_deadlocked[5m]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "JVM检测到'{{ $labels.instance }}' 实例有死锁线程"
          description: "'{{ $labels.namespace }}' 名称空间下的 '{{ $labels.pod_name }}' Pod,在过去5分钟检测到死锁线程, 当前死锁线程数是 {{ $value }}。"