Commit 9c199be4 authored by Lluis Gifre Renom's avatar Lluis Gifre Renom
Browse files

TFS component performance dashboards:

- Moved from common/method_wrappers to WebUI
- Generalized definition of dashboards
- Minor corrections in dashboards
- Corrected accounting of "not-found"/"already exists" exceptions in decorator
- Corrected type hints in decorator
- Added deployment of dashboards in deploy script
parent d425a5a4
Loading
Loading
Loading
Loading
+42 −6
Original line number Original line Diff line number Diff line
@@ -428,7 +428,7 @@ if [[ "$TFS_COMPONENTS" == *"webui"* ]]; then
    curl -X POST -H "Content-Type: application/json" -H "Accept: application/json" -d '{
    curl -X POST -H "Content-Type: application/json" -H "Accept: application/json" -d '{
        "access"   : "proxy",
        "access"   : "proxy",
        "type"     : "prometheus",
        "type"     : "prometheus",
        "name"     : "Prometheus",
        "name"     : "prometheus",
        "url"      : "http://prometheus-k8s.monitoring.svc:9090",
        "url"      : "http://prometheus-k8s.monitoring.svc:9090",
        "basicAuth": false,
        "basicAuth": false,
        "isDefault": false,
        "isDefault": false,
@@ -438,23 +438,59 @@ if [[ "$TFS_COMPONENTS" == *"webui"* ]]; then
    }' ${GRAFANA_URL_UPDATED}/api/datasources
    }' ${GRAFANA_URL_UPDATED}/api/datasources
    printf "\n\n"
    printf "\n\n"


    echo ">> Creating dashboards..."
    echo ">> Creating and staring dashboards..."
    # Ref: https://grafana.com/docs/grafana/latest/http_api/dashboard/
    # Ref: https://grafana.com/docs/grafana/latest/http_api/dashboard/

    # Dashboard: L3 Monitoring KPIs
    curl -X POST -H "Content-Type: application/json" -d '@src/webui/grafana_db_mon_kpis_psql.json' \
    curl -X POST -H "Content-Type: application/json" -d '@src/webui/grafana_db_mon_kpis_psql.json' \
        ${GRAFANA_URL_UPDATED}/api/dashboards/db
        ${GRAFANA_URL_UPDATED}/api/dashboards/db
    echo
    echo
    DASHBOARD_URL="${GRAFANA_URL_UPDATED}/api/dashboards/uid/tfs-l3-monit"
    DASHBOARD_ID=$(curl -s "${DASHBOARD_URL}" | jq '.dashboard.id')
    curl -X POST ${GRAFANA_URL_UPDATED}/api/user/stars/dashboard/${DASHBOARD_ID}
    echo


    # Dashboard: Slice Grouping
    curl -X POST -H "Content-Type: application/json" -d '@src/webui/grafana_db_slc_grps_psql.json' \
    curl -X POST -H "Content-Type: application/json" -d '@src/webui/grafana_db_slc_grps_psql.json' \
        ${GRAFANA_URL_UPDATED}/api/dashboards/db
        ${GRAFANA_URL_UPDATED}/api/dashboards/db
    printf "\n\n"
    echo
    DASHBOARD_URL="${GRAFANA_URL_UPDATED}/api/dashboards/uid/tfs-slice-grps"
    DASHBOARD_ID=$(curl -s "${DASHBOARD_URL}" | jq '.dashboard.id')
    curl -X POST ${GRAFANA_URL_UPDATED}/api/user/stars/dashboard/${DASHBOARD_ID}
    echo


    echo ">> Staring dashboards..."
    # Dashboard: Component RPCs
    DASHBOARD_URL="${GRAFANA_URL_UPDATED}/api/dashboards/uid/tfs-l3-monit"
    curl -X POST -H "Content-Type: application/json" -d '@src/webui/grafana_prom_component_rpc.json' \
        ${GRAFANA_URL_UPDATED}/api/dashboards/db
    echo
    DASHBOARD_URL="${GRAFANA_URL_UPDATED}/api/dashboards/uid/tfs-comp-rpc"
    DASHBOARD_ID=$(curl -s "${DASHBOARD_URL}" | jq '.dashboard.id')
    DASHBOARD_ID=$(curl -s "${DASHBOARD_URL}" | jq '.dashboard.id')
    curl -X POST ${GRAFANA_URL_UPDATED}/api/user/stars/dashboard/${DASHBOARD_ID}
    curl -X POST ${GRAFANA_URL_UPDATED}/api/user/stars/dashboard/${DASHBOARD_ID}
    echo
    echo


    DASHBOARD_URL="${GRAFANA_URL_UPDATED}/api/dashboards/uid/tfs-slice-grps"
    # Dashboard: Device Drivers
    curl -X POST -H "Content-Type: application/json" -d '@src/webui/grafana_prom_device_driver.json' \
        ${GRAFANA_URL_UPDATED}/api/dashboards/db
    echo
    DASHBOARD_URL="${GRAFANA_URL_UPDATED}/api/dashboards/uid/tfs-dev-drv"
    DASHBOARD_ID=$(curl -s "${DASHBOARD_URL}" | jq '.dashboard.id')
    curl -X POST ${GRAFANA_URL_UPDATED}/api/user/stars/dashboard/${DASHBOARD_ID}
    echo

    # Dashboard: Service Handlers
    curl -X POST -H "Content-Type: application/json" -d '@src/webui/grafana_prom_service_handler.json' \
        ${GRAFANA_URL_UPDATED}/api/dashboards/db
    echo
    DASHBOARD_URL="${GRAFANA_URL_UPDATED}/api/dashboards/uid/tfs-svc-hdlr"
    DASHBOARD_ID=$(curl -s "${DASHBOARD_URL}" | jq '.dashboard.id')
    curl -X POST ${GRAFANA_URL_UPDATED}/api/user/stars/dashboard/${DASHBOARD_ID}
    echo

    # Dashboard: Device ConfigureDevice Details
    curl -X POST -H "Content-Type: application/json" -d '@src/webui/grafana_prom_device_config_exec_details.json' \
        ${GRAFANA_URL_UPDATED}/api/dashboards/db
    echo
    DASHBOARD_URL="${GRAFANA_URL_UPDATED}/api/dashboards/uid/tfs-dev-confdev"
    DASHBOARD_ID=$(curl -s "${DASHBOARD_URL}" | jq '.dashboard.id')
    DASHBOARD_ID=$(curl -s "${DASHBOARD_URL}" | jq '.dashboard.id')
    curl -X POST ${GRAFANA_URL_UPDATED}/api/user/stars/dashboard/${DASHBOARD_ID}
    curl -X POST ${GRAFANA_URL_UPDATED}/api/user/stars/dashboard/${DASHBOARD_ID}
    echo
    echo
+3 −1
Original line number Original line Diff line number Diff line
@@ -76,7 +76,7 @@ class MetricsPool:


    def get_metrics(
    def get_metrics(
        self, method : str
        self, method : str
    ) -> Tuple[MetricWrapperBase, MetricWrapperBase, MetricWrapperBase, MetricWrapperBase]:
    ) -> Tuple[Histogram, Counter, Counter, Counter]:
        histogram_duration : Histogram = self.get_or_create(method, MetricTypeEnum.HISTOGRAM_DURATION)
        histogram_duration : Histogram = self.get_or_create(method, MetricTypeEnum.HISTOGRAM_DURATION)
        counter_started    : Counter   = self.get_or_create(method, MetricTypeEnum.COUNTER_STARTED)
        counter_started    : Counter   = self.get_or_create(method, MetricTypeEnum.COUNTER_STARTED)
        counter_completed  : Counter   = self.get_or_create(method, MetricTypeEnum.COUNTER_COMPLETED)
        counter_completed  : Counter   = self.get_or_create(method, MetricTypeEnum.COUNTER_COMPLETED)
@@ -200,6 +200,8 @@ def safe_and_metered_rpc_method(metrics_pool : MetricsPool, logger : logging.Log
                    # Assume not found or already exists is just a condition, not an error
                    # Assume not found or already exists is just a condition, not an error
                    logger.exception('{:s} exception'.format(method_name))
                    logger.exception('{:s} exception'.format(method_name))
                    counter_failed.inc()
                    counter_failed.inc()
                else:
                    counter_completed.inc()
                grpc_context.abort(e.code, e.details)
                grpc_context.abort(e.code, e.details)
            except Exception as e:          # pragma: no cover, pylint: disable=broad-except
            except Exception as e:          # pragma: no cover, pylint: disable=broad-except
                logger.exception('{:s} exception'.format(method_name))
                logger.exception('{:s} exception'.format(method_name))
+0 −426
Original line number Original line Diff line number Diff line
{
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": "-- Grafana --",
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "editable": true,
  "gnetId": null,
  "graphTooltip": 0,
  "id": 25,
  "iteration": 1671297223428,
  "links": [],
  "panels": [
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": "prometheus",
      "fieldConfig": {
        "defaults": {},
        "overrides": []
      },
      "fill": 1,
      "fillGradient": 0,
      "gridPos": {
        "h": 6,
        "w": 24,
        "x": 0,
        "y": 0
      },
      "hiddenSeries": false,
      "id": 4,
      "legend": {
        "alignAsTable": false,
        "avg": false,
        "current": false,
        "max": false,
        "min": false,
        "rightSide": false,
        "show": false,
        "total": false,
        "values": false
      },
      "lines": true,
      "linewidth": 1,
      "nullPointMode": "null",
      "options": {
        "alertThreshold": true
      },
      "percentage": false,
      "pluginVersion": "7.5.4",
      "pointradius": 2,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "exemplar": true,
          "expr": "sum(tfs_[[component]]_rpc_[[method]]_counter_requests_started_total{pod=~\"[[pod]]\"})",
          "interval": "",
          "legendFormat": "started",
          "queryType": "randomWalk",
          "refId": "A"
        },
        {
          "exemplar": true,
          "expr": "sum(tfs_[[component]]_rpc_[[method]]_counter_requests_completed_total{pod=~\"[[pod]]\"})",
          "hide": false,
          "interval": "",
          "legendFormat": "completed",
          "refId": "B"
        },
        {
          "exemplar": true,
          "expr": "sum(tfs_[[component]]_rpc_[[method]]_counter_requests_started_total{pod=~\"[[pod]]\"})",
          "hide": false,
          "interval": "",
          "legendFormat": "failed",
          "refId": "C"
        }
      ],
      "thresholds": [],
      "timeFrom": null,
      "timeRegions": [],
      "timeShift": null,
      "title": "Requests",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "transformations": [],
      "type": "graph",
      "xaxis": {
        "buckets": null,
        "mode": "time",
        "name": null,
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "$$hashKey": "object:935",
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": "0",
          "show": true
        },
        {
          "$$hashKey": "object:936",
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        }
      ],
      "yaxis": {
        "align": false,
        "alignLevel": null
      }
    },
    {
      "cards": {
        "cardPadding": null,
        "cardRound": null
      },
      "color": {
        "cardColor": "#b4ff00",
        "colorScale": "linear",
        "colorScheme": "interpolateRdYlGn",
        "exponent": 0.5,
        "max": null,
        "min": 0,
        "mode": "opacity"
      },
      "dataFormat": "tsbuckets",
      "datasource": "prometheus",
      "fieldConfig": {
        "defaults": {},
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 24,
        "x": 0,
        "y": 6
      },
      "heatmap": {},
      "hideZeroBuckets": true,
      "highlightCards": true,
      "id": 2,
      "interval": "60s",
      "legend": {
        "show": true
      },
      "pluginVersion": "7.5.4",
      "reverseYBuckets": false,
      "targets": [
        {
          "exemplar": true,
          "expr": "sum(\r\n    max_over_time(tfs_[[component]]_rpc_[[method]]_histogram_duration_bucket{pod=~\"[[pod]]\"}[1m]) -\r\n    min_over_time(tfs_[[component]]_rpc_[[method]]_histogram_duration_bucket{pod=~\"[[pod]]\"}[1m])\r\n) by (le)",
          "format": "heatmap",
          "instant": false,
          "interval": "1m",
          "intervalFactor": 1,
          "legendFormat": "{{le}}",
          "queryType": "randomWalk",
          "refId": "A"
        }
      ],
      "title": "Histogram",
      "tooltip": {
        "show": true,
        "showHistogram": true
      },
      "type": "heatmap",
      "xAxis": {
        "show": true
      },
      "xBucketNumber": null,
      "xBucketSize": null,
      "yAxis": {
        "decimals": null,
        "format": "s",
        "logBase": 1,
        "max": null,
        "min": null,
        "show": true,
        "splitFactor": null
      },
      "yBucketBound": "auto",
      "yBucketNumber": null,
      "yBucketSize": null
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": "prometheus",
      "fieldConfig": {
        "defaults": {},
        "overrides": []
      },
      "fill": 1,
      "fillGradient": 0,
      "gridPos": {
        "h": 6,
        "w": 24,
        "x": 0,
        "y": 14
      },
      "hiddenSeries": false,
      "id": 5,
      "legend": {
        "alignAsTable": false,
        "avg": false,
        "current": false,
        "max": false,
        "min": false,
        "rightSide": false,
        "show": false,
        "total": false,
        "values": false
      },
      "lines": true,
      "linewidth": 1,
      "nullPointMode": "null",
      "options": {
        "alertThreshold": true
      },
      "percentage": false,
      "pluginVersion": "7.5.4",
      "pointradius": 2,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "exemplar": true,
          "expr": "sum(tfs_[[component]]_rpc_[[method]]_histogram_duration_sum{pod=~\"[[pod]]\"})",
          "hide": false,
          "interval": "",
          "legendFormat": "total time",
          "refId": "B"
        }
      ],
      "thresholds": [],
      "timeFrom": null,
      "timeRegions": [],
      "timeShift": null,
      "title": "Total Exec Time",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "transformations": [],
      "type": "graph",
      "xaxis": {
        "buckets": null,
        "mode": "time",
        "name": null,
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "$$hashKey": "object:407",
          "format": "s",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": "0",
          "show": true
        },
        {
          "$$hashKey": "object:408",
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        }
      ],
      "yaxis": {
        "align": false,
        "alignLevel": null
      }
    }
  ],
  "refresh": "5s",
  "schemaVersion": 27,
  "style": "dark",
  "tags": [],
  "templating": {
    "list": [
      {
        "allValue": null,
        "current": {
          "selected": false,
          "text": "context",
          "value": "context"
        },
        "datasource": "prometheus",
        "definition": "metrics(tfs_)",
        "description": null,
        "error": null,
        "hide": 0,
        "includeAll": false,
        "label": "Component",
        "multi": false,
        "name": "component",
        "options": [],
        "query": {
          "query": "metrics(tfs_)",
          "refId": "StandardVariableQuery"
        },
        "refresh": 2,
        "regex": "/tfs_(.+)_rpc_.*/",
        "skipUrlSync": false,
        "sort": 0,
        "tagValuesQuery": "",
        "tags": [],
        "tagsQuery": "",
        "type": "query",
        "useTags": false
      },
      {
        "allValue": "",
        "current": {
          "selected": false,
          "text": "getcontext",
          "value": "getcontext"
        },
        "datasource": "prometheus",
        "definition": "metrics(tfs_[[component]]_rpc_)",
        "description": null,
        "error": null,
        "hide": 0,
        "includeAll": false,
        "label": "Method",
        "multi": false,
        "name": "method",
        "options": [],
        "query": {
          "query": "metrics(tfs_[[component]]_rpc_)",
          "refId": "StandardVariableQuery"
        },
        "refresh": 2,
        "regex": "/tfs_[[component]]_rpc_(.+)_histogram_duration_bucket/",
        "skipUrlSync": false,
        "sort": 0,
        "tagValuesQuery": "",
        "tags": [],
        "tagsQuery": "",
        "type": "query",
        "useTags": false
      },
      {
        "allValue": ".*",
        "current": {
          "selected": true,
          "text": [
            "All"
          ],
          "value": [
            "$__all"
          ]
        },
        "datasource": "prometheus",
        "definition": "label_values(tfs_[[component]]_rpc_[[method]]_histogram_duration_bucket, pod)",
        "description": null,
        "error": null,
        "hide": 0,
        "includeAll": true,
        "label": "Pod",
        "multi": true,
        "name": "pod",
        "options": [],
        "query": {
          "query": "label_values(tfs_[[component]]_rpc_[[method]]_histogram_duration_bucket, pod)",
          "refId": "StandardVariableQuery"
        },
        "refresh": 2,
        "regex": "",
        "skipUrlSync": false,
        "sort": 0,
        "tagValuesQuery": "",
        "tags": [],
        "tagsQuery": "",
        "type": "query",
        "useTags": false
      }
    ]
  },
  "time": {
    "from": "now-15m",
    "to": "now"
  },
  "timepicker": {},
  "timezone": "",
  "title": "TFS / Component RPCs",
  "uid": "KKxzxIFVz",
  "version": 21
}
 No newline at end of file
+0 −185
Original line number Original line Diff line number Diff line
{
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": "-- Grafana --",
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "editable": true,
  "gnetId": null,
  "graphTooltip": 0,
  "id": 30,
  "iteration": 1682003744753,
  "links": [],
  "panels": [
    {
      "cards": {
        "cardPadding": null,
        "cardRound": null
      },
      "color": {
        "cardColor": "#b4ff00",
        "colorScale": "linear",
        "colorScheme": "interpolateRdYlGn",
        "exponent": 0.5,
        "max": null,
        "min": 0,
        "mode": "opacity"
      },
      "dataFormat": "tsbuckets",
      "datasource": "prometheus",
      "fieldConfig": {
        "defaults": {},
        "overrides": []
      },
      "gridPos": {
        "h": 22,
        "w": 24,
        "x": 0,
        "y": 0
      },
      "heatmap": {},
      "hideZeroBuckets": true,
      "highlightCards": true,
      "id": 2,
      "interval": "60s",
      "legend": {
        "show": true
      },
      "pluginVersion": "7.5.4",
      "reverseYBuckets": false,
      "targets": [
        {
          "exemplar": true,
          "expr": "sum(\r\n    max_over_time(tfs_device_exec_details_configuredevice_histogram_duration_bucket{pod=~\"[[pod]]\", step_name=~\"[[step_name]]\"}[1m]) -\r\n    min_over_time(tfs_device_exec_details_configuredevice_histogram_duration_bucket{pod=~\"[[pod]]\", step_name=~\"[[step_name]]\"}[1m])\r\n) by (le)",
          "format": "heatmap",
          "instant": false,
          "interval": "1m",
          "intervalFactor": 1,
          "legendFormat": "{{le}}",
          "queryType": "randomWalk",
          "refId": "A"
        }
      ],
      "title": "Histogram",
      "tooltip": {
        "show": true,
        "showHistogram": true
      },
      "type": "heatmap",
      "xAxis": {
        "show": true
      },
      "xBucketNumber": null,
      "xBucketSize": null,
      "yAxis": {
        "decimals": null,
        "format": "s",
        "logBase": 1,
        "max": null,
        "min": null,
        "show": true,
        "splitFactor": null
      },
      "yBucketBound": "auto",
      "yBucketNumber": null,
      "yBucketSize": null
    }
  ],
  "refresh": false,
  "schemaVersion": 27,
  "style": "dark",
  "tags": [],
  "templating": {
    "list": [
      {
        "allValue": ".*",
        "current": {
          "selected": true,
          "text": [
            "All"
          ],
          "value": [
            "$__all"
          ]
        },
        "datasource": "prometheus",
        "definition": "label_values(tfs_device_exec_details_configuredevice_histogram_duration_bucket, pod)",
        "description": null,
        "error": null,
        "hide": 0,
        "includeAll": true,
        "label": "Pod",
        "multi": true,
        "name": "pod",
        "options": [],
        "query": {
          "query": "label_values(tfs_device_exec_details_configuredevice_histogram_duration_bucket, pod)",
          "refId": "StandardVariableQuery"
        },
        "refresh": 2,
        "regex": "",
        "skipUrlSync": false,
        "sort": 0,
        "tagValuesQuery": "",
        "tags": [],
        "tagsQuery": "",
        "type": "query",
        "useTags": false
      },
      {
        "allValue": ".*",
        "current": {
          "selected": true,
          "text": [
            "get_device",
            "set_device"
          ],
          "value": [
            "get_device",
            "set_device"
          ]
        },
        "datasource": "prometheus",
        "definition": "label_values(tfs_device_exec_details_configuredevice_histogram_duration_bucket, step_name)",
        "description": null,
        "error": null,
        "hide": 0,
        "includeAll": true,
        "label": "Step Name",
        "multi": true,
        "name": "step_name",
        "options": [],
        "query": {
          "query": "label_values(tfs_device_exec_details_configuredevice_histogram_duration_bucket, step_name)",
          "refId": "StandardVariableQuery"
        },
        "refresh": 2,
        "regex": "",
        "skipUrlSync": false,
        "sort": 0,
        "tagValuesQuery": "",
        "tags": [],
        "tagsQuery": "",
        "type": "query",
        "useTags": false
      }
    ]
  },
  "time": {
    "from": "2023-04-20T15:11:48.272Z",
    "to": "2023-04-20T15:18:10.464Z"
  },
  "timepicker": {},
  "timezone": "",
  "title": "TFS / ConfigureDevice Details",
  "uid": "GfzKHbPVk",
  "version": 4
}
 No newline at end of file
+0 −431

File deleted.

Preview size limit exceeded, changes collapsed.

Loading