Added prometheus config file (ac1c1563) · Commits · MEC - Multi-access Edge Computing / ETSI MEC Sandbox Frontend

config/.meepctl-repocfg.yaml

+1 −1

Original line number	Diff line number	Diff line
		version: 1.6.6
		version: 1.6.7
		repo:
		name: AdvantEDGE

config/values/meep-prometheus.yaml

0 → 100644

+147 −0

Original line number	Diff line number	Diff line
		# Additional Prometheus alerting rules, organised into three groups:
		# MEC Sandbox application alerts, node (host) alerts and Kubernetes alerts.
		# Every rule carries a `severity` label plus `summary` and `description`
		# annotations. Descriptions are double-quoted so that "\n" is a real line
		# break rather than a literal backslash followed by "n".
		additionalPrometheusRulesMap:
		  - rule-name: PrometheusAlerts
		    groups:
		      # Application-level alerts for the MEC Sandbox.
		      - name: MEC Sandbox Alerts
		        rules:
		          # Active session gauge has stayed at 8 or more for 2 minutes.
		          - alert: NoOfActiveSessions
		            expr: auth_svc_session_active >= 8
		            for: 2m
		            labels:
		              severity: warning
		            annotations:
		              summary: Active sessions more than or equal to 8 (instance {{ $labels.instance }})
		              description: "Active sessions at MEC Sandbox >= 8\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

		      # Host-level alerts computed from node_* metrics.
		      - name: Node Alerts
		        rules:
		          # Boot time changed within the last day; no pending period (for: 0m).
		          - alert: HostRebooted
		            expr: changes(node_boot_time_seconds[1d]) > 0
		            for: 0m
		            labels:
		              severity: warning
		            annotations:
		              summary: Host rebooted (instance {{ $labels.instance }})
		              description: "Node just rebooted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
		          # Less than 10% of total memory is available.
		          - alert: HostOutOfMemory
		            expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
		            for: 2m
		            labels:
		              severity: warning
		            annotations:
		              summary: Host out of memory (instance {{ $labels.instance }})
		              description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
		          # Less than 10% of a filesystem is free; read-only filesystems are excluded.
		          - alert: HostOutOfDiskSpace
		            expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
		            for: 2m
		            labels:
		              severity: warning
		            annotations:
		              summary: Host out of disk space (instance {{ $labels.instance }})
		              description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
		          # More than 1000 major page faults per second.
		          - alert: HostMemoryUnderMemoryPressure
		            expr: rate(node_vmstat_pgmajfault[1m]) > 1000
		            for: 2m
		            labels:
		              severity: warning
		            annotations:
		              summary: Host memory under memory pressure (instance {{ $labels.instance }})
		              description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
		          # Per-instance receive rate; bytes/s divided by 1024 twice and compared to 100.
		          - alert: HostUnusualNetworkThroughputIn
		            expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
		            for: 5m
		            labels:
		              severity: warning
		            annotations:
		              summary: Host unusual network throughput in (instance {{ $labels.instance }})
		              description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
		          # Per-instance transmit rate, same scaling and threshold as the receive rule.
		          - alert: HostUnusualNetworkThroughputOut
		            expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
		            for: 5m
		            labels:
		              severity: warning
		            annotations:
		              summary: Host unusual network throughput out (instance {{ $labels.instance }})
		              description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
		          # Per-instance disk read rate; bytes/s divided by 1024 twice and compared to 50.
		          - alert: HostUnusualDiskReadRate
		            expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
		            for: 5m
		            labels:
		              severity: warning
		            annotations:
		              summary: Host unusual disk read rate (instance {{ $labels.instance }})
		              description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
		          # Per-instance disk write rate, same scaling and threshold as the read rule.
		          - alert: HostUnusualDiskWriteRate
		            expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
		            for: 2m
		            labels:
		              severity: warning
		            annotations:
		              summary: Host unusual disk write rate (instance {{ $labels.instance }})
		              description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
		          # 100 minus the average idle CPU percentage exceeds 80; no pending period.
		          - alert: HostHighCpuLoad
		            expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
		            for: 0m
		            labels:
		              severity: warning
		            annotations:
		              summary: Host high CPU load (instance {{ $labels.instance }})
		              description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

		      # Workload alerts computed from kube_pod_* metrics.
		      - name: Kubernetes Alerts
		        rules:
		          # Container restart counter increased by more than 3 within one minute.
		          - alert: KubernetesPodCrashLooping
		            expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
		            for: 2m
		            labels:
		              severity: warning
		            annotations:
		              summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
		              description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
		          # Pod was in Pending, Unknown or Failed phase for the whole last hour.
		          # The regex uses plain "|" alternation; an escaped "\|" would not
		          # express alternation.
		          - alert: KubernetesPodNotHealthy
		            expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[1h:]) > 0
		            for: 0m
		            labels:
		              severity: critical
		            annotations:
		              summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
		              description: "Pod has been in a non-ready state for longer than an hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

		# Alertmanager: routes every alert to a single Slack receiver.
		alertmanager:
		  enabled: true

		  config:
		    global:
		      resolve_timeout: 5m
		      # NOTE(review): this webhook URL is a credential committed in clear text.
		      # Rotate it and inject it from a secret (e.g. a mounted file or a
		      # deploy-time value) instead of keeping it in version control.
		      slack_api_url: 'https://hooks.slack.com/services/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'
		    route:
		      # Label names are case-sensitive; the alert name label is `alertname`.
		      group_by: [alertname]
		      receiver: slack-alerts
		      group_wait: 30s
		      group_interval: 5m
		      repeat_interval: 24h
		    receivers:
		      - name: slack-alerts
		        slack_configs:
		          - channel: '#sandbox_alerts'
		            send_resolved: true
		            icon_url: https://avatars3.githubusercontent.com/u/3380462
		            # Title: "[STATUS:firing-count] alertname for job", followed by any
		            # common labels that are not group labels, in parentheses.
		            # The {{- / -}} trim markers swallow the template's own indentation.
		            title: |-
		              [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
		              {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
		                {{" "}}(
		                {{- with .CommonLabels.Remove .GroupLabels.Names }}
		                  {{- range $index, $label := .SortedPairs -}}
		                    {{ if $index }}, {{ end }}
		                    {{- $label.Name }}="{{ $label.Value -}}"
		                  {{- end }}
		                {{- end -}}
		                )
		              {{- end }}
		            # Body: one section per alert. The rules in this file define `summary`
		            # and `description` annotations (no `title`), so the summary is shown.
		            # The label lines are indented deeper than the rest so the folded
		            # scalar keeps their line breaks (one label per line).
		            text: >-
		              {{ range .Alerts -}}
		              Alert: {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}

		              Description: {{ .Annotations.description }}

		              Details:
		                {{ range .Labels.SortedPairs }} {{ .Name }}: `{{ .Value }}`
		                {{ end }}
		              {{ end }}

deploy.sh

+8 −8

Original line number	Diff line number	Diff line
		@@ -44,14 +44,14 @@ mkdir -p $SWAGGERDIR
		# Refresh the published Swagger UI: empty the target directory, then copy
		# the bundled swagger-ui files into it.
		# ${SWAGGERDIR:?} aborts when the variable is unset or empty, so the glob
		# can never expand to /*. Errors from rm (e.g. nothing to delete) stay
		# silenced as before.
		rm -r "${SWAGGERDIR:?}"/* 2> /dev/null
		cp -r js-apps/swagger-ui/* "$SWAGGERDIR"

		# echo ""
		# echo "+ Updating configuration user values"
		# echo "$VALUESDIR"
		# mkdir -p $VALUESDIR
		# rm -r $VALUESDIR-bak 2> /dev/null
		# mkdir -p $VALUESDIR-bak
		# cp -r $VALUESDIR/* $VALUESDIR-bak
		# cp -r config/values/* $VALUESDIR
		# Install the packaged user values into $VALUESDIR, first saving the
		# current contents to "$VALUESDIR-bak".
		echo ""
		echo "+ Updating configuration user values"
		echo "$VALUESDIR"
		# ${VALUESDIR:?} stops the script if the variable is unset or empty,
		# before any directory is created or removed.
		mkdir -p "${VALUESDIR:?}"
		# Drop the previous backup; rm errors (e.g. no earlier backup) are ignored.
		rm -r "${VALUESDIR}-bak" 2> /dev/null
		mkdir -p "${VALUESDIR}-bak"
		# Expansions are quoted so paths containing spaces or glob characters stay
		# intact; the trailing /* is left outside the quotes so it still expands.
		cp -r "$VALUESDIR"/* "${VALUESDIR}-bak"
		cp -r config/values/* "$VALUESDIR"

		# Final status line: a blank line followed by the completion banner.
		printf '\n%s\n' ">>> MEC Sandbox deployment completed"

Admin message