Commit 33ceea4c authored by Kevin Di Lallo
Browse files

fixed faulty alert triggers

parent ca679995
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -14,7 +14,7 @@ additionalPrometheusRulesMap:
    - name: Node Alerts
      rules:
      - alert: HostRebooted
        expr: changes(node_boot_time_seconds[10m]) > 0
        expr: changes(node_boot_time_seconds[10m]) > 10
        for: 0m
        labels:
          severity: warning
@@ -96,13 +96,13 @@ additionalPrometheusRulesMap:
          summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
          description: Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}
      - alert: KubernetesPodNotHealthy
        expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[15m:]) > 0
        for: 5m
        expr: sum by (namespace, pod) (sum_over_time(kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}[5m])) >= 10
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
          description: Pod has been in a non-ready state for more than 5 minutes in the last 15 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}
          description: Pod has been in a non-ready state for more than 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}

alertmanager:
  enabled: true