# 06、cilium监控告警模板 - Powered by MinDoc

---
# PrometheusRule (monitoring.coreos.com/v1) that declares the
# `cilium-health-check` alert group in the `kuboard` namespace.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: cilium-health-check
  namespace: kuboard
  # NOTE(review): presumably these labels have to match the `ruleSelector`
  # of the Prometheus resource that should load this rule - confirm.
  labels:
    prometheus: k8s
    role: alert-rules
spec:
  groups:
    # Rule group for the Cilium health alerts listed under `rules`.
    - name: cilium-health-check
      rules:
        # Fires when `cilium_unreachable_nodes` stays above 0 for one minute.
        - alert: unreachable-cilium-endpoints
          annotations:
            message: >-
              ns:{{ $labels.namespace }} pod:{{ $labels.pod }}
              instance: {{ $labels.instance }} 存在无法到达的cilium端点
            summary: '{{ $labels.app }} 存在无法到达的cilium端点'
          # Aggregation keeps only the labels listed in `by (...)`, so every
          # label the annotations render has to appear there; otherwise the
          # template expands to an empty string.
          # NOTE(review): assumes the scraped series carry `instance` and
          # `app` labels - confirm against the scrape/relabel configuration.
          expr: >-
            max(cilium_unreachable_nodes)
            by (namespace, pod, instance, app) > 0
          for: 1m
          labels:
            severity: critical
        # Fires when `cilium_unreachable_health_endpoints` stays above 0 for
        # one minute.
        - alert: unreachable-health-cilium-endpoints
          annotations:
            message: >-
              ns:{{ $labels.namespace }} pod:{{ $labels.pod }}
              instance: {{ $labels.instance }} 与健康端点连接不正常
            summary: '{{ $labels.app }} 与健康端点连接有问题'
          # The metric name needs the `cilium_` prefix (as used by
          # `cilium_unreachable_nodes`); the unprefixed name selects no
          # series, so the alert could never fire.
          # Aggregation keeps only the labels listed in `by (...)`, so every
          # label the annotations render has to appear there.
          # NOTE(review): assumes the scraped series carry `instance` and
          # `app` labels - confirm against the scrape/relabel configuration.
          expr: >-
            max(cilium_unreachable_health_endpoints)
            by (namespace, pod, instance, app) > 0
          for: 1m
          labels:
            severity: critical
        # Fires when `cilium_bpf_maps_virtual_memory_max_bytes` stays above
        # the threshold for one minute.
        - alert: cilium-memory-used
          annotations:
            message: >-
              ns:{{ $labels.namespace }} pod:{{ $labels.pod }}
              系统中安装的 eBPF 映射使用的最大内存大于 200M
            summary: '{{ $labels.app }}  系统中安装的 eBPF 映射使用的最大内存大于 200M'
          # 209715200 bytes = 200 * 1024 * 1024.
          # Aggregation keeps only the labels listed in `by (...)`, so the
          # `namespace` and `app` labels rendered by the annotations have to
          # appear there; with `by (pod)` alone they expand to empty strings.
          # NOTE(review): assumes the scraped series carry an `app` label -
          # confirm against the scrape/relabel configuration.
          expr: >-
            max(cilium_bpf_maps_virtual_memory_max_bytes)
            by (namespace, pod, app) > 209715200
          for: 1m
          labels:
            severity: warning
# 文档更新时间: 2023-04-24 10:53 作者：张尚