Commit ac1c1563 authored by Nikhil Doifode
Browse files

Added prometheus config file

- Added prometheus config file with alerts
- Added code for copying user config file
- Incremented the meepctl minor version
parent d7e2f5cd
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
version: 1.6.6
version: 1.6.7
repo:
  name: AdvantEDGE

+147 −0
Original line number Diff line number Diff line
# Custom alerting rules passed to the Prometheus chart.
# NOTE(review): this value is written as a one-element list carrying a
# `rule-name` field, although the key name suggests a mapping keyed by rule
# name -- TODO confirm against the values.yaml of the chart in use.
additionalPrometheusRulesMap:
  - rule-name: PrometheusAlerts
    groups:
    - name: MEC Sandbox Alerts
      rules:
      # auth_svc_session_active has been at or above 8 for 2 minutes.
      - alert: NoOfActiveSessions
        expr: auth_svc_session_active >= 8
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Active sessions more than or equal to 8 (instance {{ $labels.instance }})
          # Double-quoted so that \n is a real line break; in a plain scalar
          # the backslash sequence would be emitted literally.
          description: "Active sessions at MEC Sandbox >= 8\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # Host-level alerts built on node_* metrics.
    # All descriptions are double-quoted so that \n is a real line break; in a
    # plain scalar the backslash sequence would be emitted literally.
    - name: Node Alerts
      rules:
      # Boot time changed at least once within the last day.
      - alert: HostRebooted
        expr: changes(node_boot_time_seconds[1d]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host rebooted (instance {{ $labels.instance }})
          description: "Node just rebooted\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      # Available memory below 10% of total for 2 minutes.
      - alert: HostOutOfMemory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of memory (instance {{ $labels.instance }})
          description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      # Less than 10% space available on a filesystem that is not read-only.
      - alert: HostOutOfDiskSpace
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of disk space (instance {{ $labels.instance }})
          description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      # More than 1000 major page faults per second (1m rate) for 2 minutes.
      - alert: HostMemoryUnderMemoryPressure
        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host memory under memory pressure (instance {{ $labels.instance }})
          description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      # Per-instance receive rate above 100 (bytes / 1024 / 1024 per second).
      - alert: HostUnusualNetworkThroughputIn
        expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput in (instance {{ $labels.instance }})
          description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      # Per-instance transmit rate above 100 (bytes / 1024 / 1024 per second).
      - alert: HostUnusualNetworkThroughputOut
        expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput out (instance {{ $labels.instance }})
          description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      # Per-instance disk read rate above 50 (bytes / 1024 / 1024 per second).
      - alert: HostUnusualDiskReadRate
        expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read rate (instance {{ $labels.instance }})
          description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      # Per-instance disk write rate above 50 (bytes / 1024 / 1024 per second).
      - alert: HostUnusualDiskWriteRate
        expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write rate (instance {{ $labels.instance }})
          description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      # Non-idle CPU share, averaged per instance over 2m, above 80%.
      - alert: HostHighCpuLoad
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host high CPU load (instance {{ $labels.instance }})
          description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # Pod-level alerts built on kube_pod_* metrics.
    # Descriptions are double-quoted so that \n is a real line break; in a
    # plain scalar the backslash sequence would be emitted literally.
    - name: Kubernetes Alerts
      rules:
      # Container restart counter increased by more than 3 within 1 minute.
      - alert: KubernetesPodCrashLooping
        expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
          description: "Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      # Pod stayed in Pending/Unknown/Failed for the whole 1h subquery window.
      # The expression aggregates by (namespace, pod), so those are the only
      # labels available to the annotations; `instance` would render empty.
      - alert: KubernetesPodNotHealthy
        expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[1h:]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
          description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than an hour.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

alertmanager:
  enabled: true

  config:
    # Alertmanager-wide defaults.
    global:
      resolve_timeout: 5m
      # NOTE(review): SECURITY -- this Slack incoming-webhook URL is a
      # credential committed in plain text. Anyone with read access to the
      # repository can post to the channel. Rotate the webhook and supply the
      # value from a secret (e.g. Alertmanager's slack_api_url_file or a
      # deploy-time override) instead of keeping it in this file -- TODO.
      slack_api_url: 'https://hooks.slack.com/services/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'
    # Single top-level route: every alert is delivered to the Slack receiver.
    route:
      # Label names are case-sensitive; the built-in alert-name label is
      # `alertname`. The previous `Alertname` matched no label, so every alert
      # collapsed into one notification group.
      group_by: [alertname]
      receiver: slack-alerts
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 24h
    receivers:
    # Slack receiver referenced by route.receiver.
    - name: slack-alerts
      slack_configs:
      - channel: '#sandbox_alerts'
        send_resolved: true
        icon_url: https://avatars3.githubusercontent.com/u/3380462
        # Title: "[STATUS:firing-count] alertname for job", followed by any
        # common labels that are not already grouping labels.
        title: |-
          [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
          {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
            {{" "}}(
            {{- with .CommonLabels.Remove .GroupLabels.Names }}
              {{- range $index, $label := .SortedPairs -}}
                {{ if $index }}, {{ end }}
                {{- $label.Name }}="{{ $label.Value -}}"
              {{- end }}
            {{- end -}}
            )
          {{- end }}
        # Body: one section per alert. The rules in this file define the
        # `summary` and `description` annotations (there is no `title`
        # annotation), so the headline is taken from `summary`.
        text: >-
          {{ range .Alerts -}}
          *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}

          *Description:* {{ .Annotations.description }}

          *Details:*
            {{ range .Labels.SortedPairs }}  *{{ .Name }}:* `{{ .Value }}`
            {{ end }}
          {{ end }}
+8 −8
Original line number Diff line number Diff line
@@ -44,14 +44,14 @@ mkdir -p $SWAGGERDIR
# Replace the deployed Swagger UI assets with the ones from js-apps/swagger-ui.
# ${SWAGGERDIR:?} aborts if the variable is unset or empty, so the glob can
# never expand to "/*"; -f makes an already-empty directory a non-error.
rm -rf "${SWAGGERDIR:?}"/* 2> /dev/null
cp -r js-apps/swagger-ui/* "$SWAGGERDIR"

# echo ""
# echo "+ Updating configuration user values"
# echo "$VALUESDIR"
# mkdir -p $VALUESDIR
# rm -r $VALUESDIR-bak 2> /dev/null
# mkdir -p $VALUESDIR-bak
# cp -r  $VALUESDIR/* $VALUESDIR-bak
# cp -r config/values/* $VALUESDIR
# Refresh the user configuration values in $VALUESDIR from config/values,
# keeping a copy of the previous contents in $VALUESDIR-bak.
# Abort early if VALUESDIR is unset or empty: the rm/cp commands below would
# otherwise operate on paths such as "-bak" or "/".
: "${VALUESDIR:?VALUESDIR must be set}"
echo ""
echo "+ Updating configuration user values"
echo "$VALUESDIR"
mkdir -p "$VALUESDIR"
# -f: a missing backup directory (first run) is not an error.
rm -rf "$VALUESDIR-bak"
mkdir -p "$VALUESDIR-bak"
# "dir/." copies the directory's contents (dotfiles included) and, unlike an
# unmatched "dir/*" glob, does not fail when the directory is still empty.
cp -r "$VALUESDIR/." "$VALUESDIR-bak"
cp -r config/values/* "$VALUESDIR"

# Final status banner: an empty line followed by the completion message.
printf '\n%s\n' ">>> MEC Sandbox deployment completed"