---
# PrometheusRule (Prometheus Operator CRD) that defines three alerts on
# Cilium metrics: unreachable nodes, unreachable health endpoints, and
# eBPF map memory usage.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  # NOTE(review): these labels are presumably what the Prometheus CR's
  # ruleSelector matches on — confirm against the cluster's Prometheus object.
  labels:
    prometheus: k8s
    role: alert-rules
  name: cilium-health-check
  namespace: kuboard
spec:
  groups:
    - name: cilium-health-check
      rules:
        # Fires when cilium_unreachable_nodes stays above 0 for 1 minute.
        # Aggregation drops every label not listed in `by (...)`, so each
        # label referenced in the annotations (namespace, pod, instance, app)
        # is kept in the grouping; otherwise the template renders it empty.
        # NOTE(review): `app` is only rendered if the scraped series carries
        # an `app` label (e.g. added by relabeling) — confirm.
        - alert: unreachable-cilium-endpoints
          annotations:
            message: >-
              ns:{{ $labels.namespace }} pod:{{ $labels.pod }} instance: {{
              $labels.instance }} 存在无法到达的cilium端点
            summary: '{{ $labels.app }} 存在无法到达的cilium端点'
          expr: 'max(cilium_unreachable_nodes) by (namespace, pod, instance, app) > 0'
          for: 1m
          labels:
            severity: critical
        # Fires when cilium_unreachable_health_endpoints stays above 0 for
        # 1 minute. The metric name carries the `cilium_` prefix like the
        # other Cilium metrics in this group; without the prefix the
        # selector matches no series and the alert can never fire.
        - alert: unreachable-health-cilium-endpoints
          annotations:
            message: >-
              ns:{{ $labels.namespace }} pod:{{ $labels.pod }} instance: {{
              $labels.instance }} 与健康端点连接不正常
            summary: '{{ $labels.app }} 与健康端点连接有问题'
          expr: 'max(cilium_unreachable_health_endpoints) by (namespace, pod, instance, app) > 0'
          for: 1m
          labels:
            severity: critical
        # Fires when cilium_bpf_maps_virtual_memory_max_bytes stays above
        # 209715200 bytes (200 * 1024 * 1024) for 1 minute.
        # `namespace` and `app` are kept in the grouping because the
        # annotations reference them.
        - alert: cilium-memory-used
          annotations:
            message: >-
              ns:{{ $labels.namespace }} pod:{{ $labels.pod }} 系统中安装的 eBPF
              映射使用的最大内存大于 200M
            summary: '{{ $labels.app }} 系统中安装的 eBPF 映射使用的最大内存大于 200M'
          expr: 'max(cilium_bpf_maps_virtual_memory_max_bytes) by (namespace, pod, app) > 209715200'
          for: 1m
          labels:
            severity: warning
# 文档更新时间: 2023-04-24 10:53   作者: 张尚