From 5f31fb8bb758f3a39850f37d178f4345126a9ed4 Mon Sep 17 00:00:00 2001 From: gifrerenom Date: Fri, 28 Apr 2023 14:27:15 +0000 Subject: [PATCH 1/6] Device component: - Add driver name attribute to drivers - Improve instantiation of per-driver MetricsPool - Added detailed metrics for AddDevice RPC method --- .../service/DeviceServiceServicerImpl.py | 77 ++++++++++++++++--- src/device/service/driver_api/_Driver.py | 19 ++++- .../drivers/emulated/EmulatedDriver.py | 8 +- .../drivers/ietf_l2vpn/IetfL2VpnDriver.py | 16 ++-- .../drivers/microwave/IETFApiDriver.py | 18 +++-- .../drivers/openconfig/OpenConfigDriver.py | 12 +-- src/device/service/drivers/p4/p4_driver.py | 3 +- .../transport_api/TransportApiDriver.py | 16 ++-- src/device/service/drivers/xr/XrDriver.py | 24 +++--- 9 files changed, 140 insertions(+), 53 deletions(-) diff --git a/src/device/service/DeviceServiceServicerImpl.py b/src/device/service/DeviceServiceServicerImpl.py index e7fec0418..d3d92d3a3 100644 --- a/src/device/service/DeviceServiceServicerImpl.py +++ b/src/device/service/DeviceServiceServicerImpl.py @@ -38,7 +38,7 @@ LOGGER = logging.getLogger(__name__) METRICS_POOL = MetricsPool('Device', 'RPC') METRICS_POOL_DETAILS = MetricsPool('Device', 'exec_details', labels={ - 'step_name': '', + 'driver': '', 'operation': '', 'step': '', }) class DeviceServiceServicerImpl(DeviceServiceServicer): @@ -51,11 +51,15 @@ class DeviceServiceServicerImpl(DeviceServiceServicer): @safe_and_metered_rpc_method(METRICS_POOL, LOGGER) def AddDevice(self, request : Device, context : grpc.ServicerContext) -> DeviceId: + t0 = time.time() + device_uuid = request.device_id.device_uuid.uuid connection_config_rules = check_connect_rules(request.device_config) check_no_endpoints(request.device_endpoints) + t1 = time.time() + context_client = ContextClient() device = get_device(context_client, device_uuid, rw_copy=True) if device is None: @@ -73,10 +77,15 @@ class DeviceServiceServicerImpl(DeviceServiceServicer): # update device_uuid to honor UUID provided by Context device_uuid = device.device_id.device_uuid.uuid + t2 = time.time() + self.mutex_queues.wait_my_turn(device_uuid) + t3 = time.time() try: driver : _Driver = get_driver(self.driver_instance_cache, device) + t4 = time.time() + errors = [] # Sub-devices and sub-links are exposed by intermediate controllers or represent mgmt links. @@ -86,13 +95,23 @@ class DeviceServiceServicerImpl(DeviceServiceServicer): new_sub_links : Dict[str, Link] = dict() if len(device.device_endpoints) == 0: + t5 = time.time() # created from request, populate endpoints using driver errors.extend(populate_endpoints( device, driver, self.monitoring_loops, new_sub_devices, new_sub_links)) + t6 = time.time() + t_pop_endpoints = t6 - t5 + else: + t_pop_endpoints = None if len(device.device_config.config_rules) == len(connection_config_rules): # created from request, populate config rules using driver + t7 = time.time() errors.extend(populate_config_rules(device, driver)) + t8 = time.time() + t_pop_config_rules = t8 - t7 + else: + t_pop_config_rules = None # TODO: populate components @@ -100,22 +119,60 @@ class DeviceServiceServicerImpl(DeviceServiceServicer): for error in errors: LOGGER.error(error) raise OperationFailedException('AddDevice', extra_details=errors) + t9 = time.time() + device.device_operational_status = DeviceOperationalStatusEnum.DEVICEOPERATIONALSTATUS_DISABLED device_id = context_client.SetDevice(device) + t10 = time.time() + for sub_device in new_sub_devices.values(): context_client.SetDevice(sub_device) + t11 = time.time() + for sub_links in new_sub_links.values(): context_client.SetLink(sub_links) + t12 = time.time() + # Update endpoint monitoring resources with UUIDs device_with_uuids = get_device( context_client, device_id.device_uuid.uuid, rw_copy=False, include_endpoints=True, include_components=False, include_config_rules=False) populate_endpoint_monitoring_resources(device_with_uuids, self.monitoring_loops) + t13 = time.time() + context_client.close() + + t14 = time.time() + + metrics_labels = dict(driver=driver.name, operation='add_device') + + histogram_duration : Histogram = METRICS_POOL_DETAILS.get_or_create( + 'AddDevice', MetricTypeEnum.HISTOGRAM_DURATION) + histogram_duration.labels(step='total' , **metrics_labels).observe(t14-t0) + histogram_duration.labels(step='execution' , **metrics_labels).observe(t14-t3) + histogram_duration.labels(step='endpoint_checks' , **metrics_labels).observe(t1-t0) + histogram_duration.labels(step='get_device' , **metrics_labels).observe(t2-t1) + histogram_duration.labels(step='wait_queue' , **metrics_labels).observe(t3-t2) + histogram_duration.labels(step='get_driver' , **metrics_labels).observe(t4-t3) + histogram_duration.labels(step='set_device' , **metrics_labels).observe(t10-t9) + histogram_duration.labels(step='populate_monit_rsrc', **metrics_labels).observe(t13-t12) + + if t_pop_endpoints is not None: + histogram_duration.labels(step='populate_endpoints', **metrics_labels).observe(t_pop_endpoints) + + if t_pop_config_rules is not None: + histogram_duration.labels(step='populate_config_rules', **metrics_labels).observe(t_pop_config_rules) + + if len(new_sub_devices) > 0: + histogram_duration.labels(step='set_sub_devices', **metrics_labels).observe(t11-t10) + + if len(new_sub_links) > 0: + histogram_duration.labels(step='set_sub_links', **metrics_labels).observe(t12-t11) + return device_id finally: self.mutex_queues.signal_done(device_uuid) @@ -195,16 +252,18 @@ class DeviceServiceServicerImpl(DeviceServiceServicer): t9 = time.time() + metrics_labels = dict(driver=driver.name, operation='configure_device') + histogram_duration : Histogram = METRICS_POOL_DETAILS.get_or_create( 'ConfigureDevice', MetricTypeEnum.HISTOGRAM_DURATION) - histogram_duration.labels(step_name='total' ).observe(t9-t0) - histogram_duration.labels(step_name='wait_queue' ).observe(t1-t0) - histogram_duration.labels(step_name='execution' ).observe(t9-t1) - histogram_duration.labels(step_name='get_device' ).observe(t3-t2) - histogram_duration.labels(step_name='split_rules' ).observe(t5-t4) - histogram_duration.labels(step_name='configure_rules' ).observe(t6-t5) - histogram_duration.labels(step_name='deconfigure_rules').observe(t7-t6) - histogram_duration.labels(step_name='set_device' ).observe(t9-t8) + histogram_duration.labels(step='total' , **metrics_labels).observe(t9-t0) + histogram_duration.labels(step='wait_queue' , **metrics_labels).observe(t1-t0) + histogram_duration.labels(step='execution' , **metrics_labels).observe(t9-t1) + histogram_duration.labels(step='get_device' , **metrics_labels).observe(t3-t2) + histogram_duration.labels(step='split_rules' , **metrics_labels).observe(t5-t4) + histogram_duration.labels(step='configure_rules' , **metrics_labels).observe(t6-t5) + histogram_duration.labels(step='deconfigure_rules', **metrics_labels).observe(t7-t6) + histogram_duration.labels(step='set_device' , **metrics_labels).observe(t9-t8) return device_id finally: diff --git a/src/device/service/driver_api/_Driver.py b/src/device/service/driver_api/_Driver.py index 947bc8570..7adaec79d 100644 --- a/src/device/service/driver_api/_Driver.py +++ b/src/device/service/driver_api/_Driver.py @@ -27,7 +27,7 @@ RESOURCE_ACL = '__acl__' class _Driver: - def __init__(self, address: str, port: int, **settings) -> None: + def __init__(self, name : str, address: str, port: int, **settings) -> None: """ Initialize Driver. Parameters: address : str @@ -37,7 +37,22 @@ class _Driver: **settings Extra settings required by the driver. """ - raise NotImplementedError() + self._name = name + self._address = address + self._port = port + self._settings = settings + + @property + def name(self): return self._name + + @property + def address(self): return self._address + + @property + def port(self): return self._port + + @property + def settings(self): return self._settings def Connect(self) -> bool: """ Connect to the Device. diff --git a/src/device/service/drivers/emulated/EmulatedDriver.py b/src/device/service/drivers/emulated/EmulatedDriver.py index 2acb28878..8f9453574 100644 --- a/src/device/service/drivers/emulated/EmulatedDriver.py +++ b/src/device/service/drivers/emulated/EmulatedDriver.py @@ -31,16 +31,18 @@ LOGGER = logging.getLogger(__name__) RE_GET_ENDPOINT_FROM_INTERFACE = re.compile(r'^\/interface\[([^\]]+)\].*') -METRICS_POOL = MetricsPool('Device', 'Driver', labels={'driver': 'emulated'}) +DRIVER_NAME = 'emulated' +METRICS_POOL = MetricsPool('Device', 'Driver', labels={'driver': DRIVER_NAME}) class EmulatedDriver(_Driver): - def __init__(self, address : str, port : int, **settings) -> None: # pylint: disable=super-init-not-called + def __init__(self, address : str, port : int, **settings) -> None: + super().__init__(DRIVER_NAME, address, port, **settings) self.__lock = threading.Lock() self.__initial = TreeNode('.') self.__running = TreeNode('.') self.__subscriptions = TreeNode('.') - endpoints = settings.get('endpoints', []) + endpoints = self.settings.get('endpoints', []) endpoint_resources = [] for endpoint in endpoints: endpoint_resource = compose_resource_endpoint(endpoint) diff --git a/src/device/service/drivers/ietf_l2vpn/IetfL2VpnDriver.py b/src/device/service/drivers/ietf_l2vpn/IetfL2VpnDriver.py index 96dfd2c15..9498dc84c 100644 --- a/src/device/service/drivers/ietf_l2vpn/IetfL2VpnDriver.py +++ b/src/device/service/drivers/ietf_l2vpn/IetfL2VpnDriver.py @@ -39,21 +39,23 @@ ALL_RESOURCE_KEYS = [ SERVICE_TYPE = 'ELINE' -METRICS_POOL = MetricsPool('Device', 'Driver', labels={'driver': 'ietf_l2vpn'}) +DRIVER_NAME = 'ietf_l2vpn' +METRICS_POOL = MetricsPool('Device', 'Driver', labels={'driver': DRIVER_NAME}) class IetfL2VpnDriver(_Driver): - def __init__(self, address: str, port: int, **settings) -> None: # pylint: disable=super-init-not-called + def __init__(self, address: str, port: int, **settings) -> None: + super().__init__(DRIVER_NAME, address, port, **settings) self.__lock = threading.Lock() self.__started = threading.Event() self.__terminate = threading.Event() - username = settings.get('username') - password = settings.get('password') - scheme = settings.get('scheme', 'http') - wim = {'wim_url': '{:s}://{:s}:{:d}'.format(scheme, address, int(port))} + username = self.settings.get('username') + password = self.settings.get('password') + scheme = self.settings.get('scheme', 'http') + wim = {'wim_url': '{:s}://{:s}:{:d}'.format(scheme, self.address, int(self.port))} wim_account = {'user': username, 'password': password} # Mapping updated dynamically with each request config = {'mapping_not_needed': False, 'service_endpoint_mapping': []} - self.dac = TfsDebugApiClient(address, int(port), scheme=scheme, username=username, password=password) + self.dac = TfsDebugApiClient(self.address, int(self.port), scheme=scheme, username=username, password=password) self.wim = WimconnectorIETFL2VPN(wim, wim_account, config=config) self.conn_info = {} # internal database emulating OSM storage provided to WIM Connectors diff --git a/src/device/service/drivers/microwave/IETFApiDriver.py b/src/device/service/drivers/microwave/IETFApiDriver.py index fad7cd073..a8ef90946 100644 --- a/src/device/service/drivers/microwave/IETFApiDriver.py +++ b/src/device/service/drivers/microwave/IETFApiDriver.py @@ -23,20 +23,22 @@ from .Tools import create_connectivity_service, find_key, config_getter, delete_ LOGGER = logging.getLogger(__name__) -METRICS_POOL = MetricsPool('Device', 'Driver', labels={'driver': 'microwave'}) +DRIVER_NAME = 'microwave' +METRICS_POOL = MetricsPool('Device', 'Driver', labels={'driver': DRIVER_NAME}) class IETFApiDriver(_Driver): - def __init__(self, address: str, port: int, **settings) -> None: # pylint: disable=super-init-not-called + def __init__(self, address: str, port: int, **settings) -> None: + super().__init__(DRIVER_NAME, address, port, **settings) self.__lock = threading.Lock() self.__started = threading.Event() self.__terminate = threading.Event() - username = settings.get('username') - password = settings.get('password') + username = self.settings.get('username') + password = self.settings.get('password') self.__auth = HTTPBasicAuth(username, password) if username is not None and password is not None else None - scheme = settings.get('scheme', 'http') - self.__ietf_root = '{:s}://{:s}:{:d}'.format(scheme, address, int(port)) - self.__timeout = int(settings.get('timeout', 120)) - self.__node_ids = set(settings.get('node_ids', [])) + scheme = self.settings.get('scheme', 'http') + self.__ietf_root = '{:s}://{:s}:{:d}'.format(scheme, self.address, int(self.port)) + self.__timeout = int(self.settings.get('timeout', 120)) + self.__node_ids = set(self.settings.get('node_ids', [])) def Connect(self) -> bool: url = self.__ietf_root + '/nmswebs/restconf/data/ietf-network:networks' diff --git a/src/device/service/drivers/openconfig/OpenConfigDriver.py b/src/device/service/drivers/openconfig/OpenConfigDriver.py index 2399b9ac0..ac67c4ab0 100644 --- a/src/device/service/drivers/openconfig/OpenConfigDriver.py +++ b/src/device/service/drivers/openconfig/OpenConfigDriver.py @@ -235,11 +235,13 @@ def edit_config( results = [e for _ in resources] # if commit fails, set exception in each resource return results -METRICS_POOL = MetricsPool('Device', 'Driver', labels={'driver': 'openconfig'}) +DRIVER_NAME = 'openconfig' +METRICS_POOL = MetricsPool('Device', 'Driver', labels={'driver': DRIVER_NAME}) class OpenConfigDriver(_Driver): - def __init__(self, address : str, port : int, **settings) -> None: # pylint: disable=super-init-not-called - self.__logger = logging.getLogger('{:s}:[{:s}:{:s}]'.format(str(__name__), str(address), str(port))) + def __init__(self, address : str, port : int, **settings) -> None: + super().__init__(DRIVER_NAME, address, port, **settings) + self.__logger = logging.getLogger('{:s}:[{:s}:{:s}]'.format(str(__name__), str(self.address), str(self.port))) self.__lock = threading.Lock() #self.__initial = TreeNode('.') #self.__running = TreeNode('.') @@ -249,11 +251,11 @@ class OpenConfigDriver(_Driver): self.__scheduler = BackgroundScheduler(daemon=True) # scheduler used to emulate sampling events self.__scheduler.configure( jobstores = {'default': MemoryJobStore()}, - executors = {'default': ThreadPoolExecutor(max_workers=1)}, + executors = {'default': ThreadPoolExecutor(max_workers=1)}, # important! 1 = avoid concurrent requests job_defaults = {'coalesce': False, 'max_instances': 3}, timezone=pytz.utc) self.__out_samples = queue.Queue() - self.__netconf_handler : NetconfSessionHandler = NetconfSessionHandler(address, port, **settings) + self.__netconf_handler = NetconfSessionHandler(self.address, self.port, **(self.settings)) self.__samples_cache = SamplesCache(self.__netconf_handler, self.__logger) def Connect(self) -> bool: diff --git a/src/device/service/drivers/p4/p4_driver.py b/src/device/service/drivers/p4/p4_driver.py index de47f49c0..49e6ed246 100644 --- a/src/device/service/drivers/p4/p4_driver.py +++ b/src/device/service/drivers/p4/p4_driver.py @@ -41,7 +41,8 @@ except ImportError: LOGGER = logging.getLogger(__name__) -METRICS_POOL = MetricsPool('Device', 'Driver', labels={'driver': 'p4'}) +DRIVER_NAME = 'p4' +METRICS_POOL = MetricsPool('Device', 'Driver', labels={'driver': DRIVER_NAME}) class P4Driver(_Driver): """ diff --git a/src/device/service/drivers/transport_api/TransportApiDriver.py b/src/device/service/drivers/transport_api/TransportApiDriver.py index 1991a34d0..98ed8e6aa 100644 --- a/src/device/service/drivers/transport_api/TransportApiDriver.py +++ b/src/device/service/drivers/transport_api/TransportApiDriver.py @@ -23,19 +23,21 @@ from .Tools import create_connectivity_service, find_key, config_getter, delete_ LOGGER = logging.getLogger(__name__) -METRICS_POOL = MetricsPool('Device', 'Driver', labels={'driver': 'transport_api'}) +DRIVER_NAME = 'transport_api' +METRICS_POOL = MetricsPool('Device', 'Driver', labels={'driver': DRIVER_NAME}) class TransportApiDriver(_Driver): - def __init__(self, address: str, port: int, **settings) -> None: # pylint: disable=super-init-not-called + def __init__(self, address: str, port: int, **settings) -> None: + super().__init__(DRIVER_NAME, address, port, **settings) self.__lock = threading.Lock() self.__started = threading.Event() self.__terminate = threading.Event() - username = settings.get('username') - password = settings.get('password') + username = self.settings.get('username') + password = self.settings.get('password') self.__auth = HTTPBasicAuth(username, password) if username is not None and password is not None else None - scheme = settings.get('scheme', 'http') - self.__tapi_root = '{:s}://{:s}:{:d}'.format(scheme, address, int(port)) - self.__timeout = int(settings.get('timeout', 120)) + scheme = self.settings.get('scheme', 'http') + self.__tapi_root = '{:s}://{:s}:{:d}'.format(scheme, self.address, int(self.port)) + self.__timeout = int(self.settings.get('timeout', 120)) def Connect(self) -> bool: url = self.__tapi_root + '/restconf/data/tapi-common:context' diff --git a/src/device/service/drivers/xr/XrDriver.py b/src/device/service/drivers/xr/XrDriver.py index c1471a813..46269ff89 100644 --- a/src/device/service/drivers/xr/XrDriver.py +++ b/src/device/service/drivers/xr/XrDriver.py @@ -33,21 +33,23 @@ urllib3.disable_warnings() LOGGER = logging.getLogger(__name__) -METRICS_POOL = MetricsPool('Device', 'Driver', labels={'driver': 'xr'}) +DRIVER_NAME = 'xr' +METRICS_POOL = MetricsPool('Device', 'Driver', labels={'driver': DRIVER_NAME}) class XrDriver(_Driver): - def __init__(self, address: str, port: int, **settings) -> None: # pylint: disable=super-init-not-called + def __init__(self, address: str, port: int, **settings) -> None: + super().__init__(DRIVER_NAME, address, port, **settings) self.__lock = threading.Lock() self.__started = threading.Event() self.__terminate = threading.Event() - self.__timeout = int(settings.get('timeout', 120)) - self.__cm_address = address + self.__timeout = int(self.settings.get('timeout', 120)) + self.__cm_address = self.address # Mandatory key, an exception will get thrown if missing - self.__hub_module_name = settings["hub_module_name"] + self.__hub_module_name = self.settings["hub_module_name"] tls_verify = False # Currently using self signed certificates - username = settings.get("username", "xr-user-1") - password = settings.get("password", "xr-user-1") + username = self.settings.get("username", "xr-user-1") + password = self.settings.get("password", "xr-user-1") # Options are: # disabled --> just import endpoints as usual @@ -55,7 +57,7 @@ class XrDriver(_Driver): # (a remotely-controlled transport domain might exist between them) # topology --> imports sub-devices and links connecting them. # (not supported by XR driver) - self.__import_topology = get_import_topology(settings, default=ImportTopologyEnum.DISABLED) + self.__import_topology = get_import_topology(self.settings, default=ImportTopologyEnum.DISABLED) # Options are: # asynchronous --> operation considered complete when IPM responds with suitable status code, @@ -64,12 +66,12 @@ class XrDriver(_Driver): # lifecycle --> operation is considered successfull once IPM has completed pluggaable configuration # or failed in it. This is typically unsuitable for production use # (as some optics may be transiently unreachable), but is convenient for demos and testin. - consistency_mode = ConsistencyMode.from_str(settings.get("consistency-mode", "asynchronous")) + consistency_mode = ConsistencyMode.from_str(self.settings.get("consistency-mode", "asynchronous")) - self.__cm_connection = CmConnection(address, int(port), username, password, self.__timeout, tls_verify = tls_verify, consistency_mode=consistency_mode) + self.__cm_connection = CmConnection(self.address, int(self.port), username, password, self.__timeout, tls_verify = tls_verify, consistency_mode=consistency_mode) self.__constellation = None - LOGGER.info(f"XrDriver instantiated, cm {address}:{port}, consistency mode {str(consistency_mode)}, {settings=}") + LOGGER.info(f"XrDriver instantiated, cm {self.address}:{self.port}, consistency mode {str(consistency_mode)}, {self.settings=}") def __str__(self): return f"{self.__hub_module_name}@{self.__cm_address}" -- GitLab From 7e2d23369d5be082fbd6853a916d385d699b686f Mon Sep 17 00:00:00 2001 From: gifrerenom Date: Fri, 28 Apr 2023 16:16:02 +0000 Subject: [PATCH 2/6] WebUI - Grafana: - Updated device details dashboard --- .../grafana_prom_device_exec_details.json | 235 ++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 src/webui/grafana_prom_device_exec_details.json diff --git a/src/webui/grafana_prom_device_exec_details.json b/src/webui/grafana_prom_device_exec_details.json new file mode 100644 index 000000000..1b58253d8 --- /dev/null +++ b/src/webui/grafana_prom_device_exec_details.json @@ -0,0 +1,235 @@ +{"overwrite": true, "folderId": 0, "dashboard": + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "iteration": 1682697941779, + "links": [], + "liveNow": false, + "panels": [ + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "linear", + "colorScheme": "interpolateRdYlGn", + "exponent": 0.5, + "min": 0, + "mode": "opacity" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 22, + "w": 24, + "x": 0, + "y": 0 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 2, + "interval": "60s", + "legend": { + "show": true + }, + "pluginVersion": "7.5.4", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "exemplar": true, + "expr": "sum(\r\n max_over_time(tfs_device_exec_details_[[method]]_histogram_duration_bucket{pod=~\"[[pod]]\", step=~\"[[step]]\"}[1m]) -\r\n min_over_time(tfs_device_exec_details_[[method]]_histogram_duration_bucket{pod=~\"[[pod]]\", step=~\"[[step]]\"}[1m])\r\n) by (le)", + "format": "heatmap", + "instant": false, + "interval": "1m", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "Histogram", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "format": "s", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + } + ], + "refresh": "5s", + "schemaVersion": 36, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": { + "selected": true, + "text": "adddevice", + "value": "adddevice" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "metrics(tfs_device_exec_details_)", + "hide": 0, + "includeAll": false, + "label": "Method", + "multi": false, + "name": "method", + "options": [], + "query": { + "query": "metrics(tfs_device_exec_details_)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "/tfs_device_exec_details_(.+)_histogram_duration_bucket/", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "endpoint_checks", + "execution", + "get_device", + "get_driver", + "populate_config_rules", + "populate_endpoints", + "populate_monit_rsrc", + "set_device", + "total", + "wait_queue" + ], + "value": [ + "endpoint_checks", + "execution", + "get_device", + "get_driver", + "populate_config_rules", + "populate_endpoints", + "populate_monit_rsrc", + "set_device", + "total", + "wait_queue" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(tfs_device_exec_details_[[method]]_histogram_duration_bucket, step)", + "hide": 0, + "includeAll": true, + "label": "Step", + "multi": true, + "name": "step", + "options": [], + "query": { + "query": "label_values(tfs_device_exec_details_[[method]]_histogram_duration_bucket, step)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(tfs_device_exec_details_[[method]]_histogram_duration_bucket, pod)", + "hide": 0, + "includeAll": true, + "label": "Pod", + "multi": true, + "name": "pod", + "options": [], + "query": { + "query": "label_values(tfs_device_exec_details_[[method]]_histogram_duration_bucket, pod)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "TFS / Device Details", + "uid": "tfs-dev-confdev", + "version": 6, + "weekStart": "" + } +} -- GitLab From 56a1e4d4de852e2f9daebfbb3917761da257417a Mon Sep 17 00:00:00 2001 From: gifrerenom Date: Fri, 28 Apr 2023 16:22:06 +0000 Subject: [PATCH 3/6] WebUI - Grafana: - Updated device details dashboard (to be completed) --- .../grafana_prom_device_exec_details.json | 92 +++++++------------ 1 file changed, 35 insertions(+), 57 deletions(-) diff --git a/src/webui/grafana_prom_device_exec_details.json b/src/webui/grafana_prom_device_exec_details.json index 1b58253d8..5e9e9f28a 100644 --- a/src/webui/grafana_prom_device_exec_details.json +++ b/src/webui/grafana_prom_device_exec_details.json @@ -25,8 +25,8 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": null, - "iteration": 1682697941779, + "id": 6, + "iteration": 1682697941785, "links": [], "liveNow": false, "panels": [ @@ -43,7 +43,7 @@ "dataFormat": "tsbuckets", "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "b1BgiIsVz" }, "gridPos": { "h": 22, @@ -65,7 +65,7 @@ { "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "b1BgiIsVz" }, "exemplar": true, "expr": "sum(\r\n max_over_time(tfs_device_exec_details_[[method]]_histogram_duration_bucket{pod=~\"[[pod]]\", step=~\"[[step]]\"}[1m]) -\r\n min_over_time(tfs_device_exec_details_[[method]]_histogram_duration_bucket{pod=~\"[[pod]]\", step=~\"[[step]]\"}[1m])\r\n) by (le)", @@ -94,7 +94,7 @@ "yBucketBound": "auto" } ], - "refresh": "5s", + "refresh": "", "schemaVersion": 36, "style": "dark", "tags": [], @@ -104,26 +104,30 @@ "allValue": ".*", "current": { "selected": true, - "text": "adddevice", - "value": "adddevice" + "text": [ + "All" + ], + "value": [ + "$__all" + ] }, "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "b1BgiIsVz" }, - "definition": "metrics(tfs_device_exec_details_)", + "definition": "label_values(tfs_device_execution_details_histogram_duration_bucket, pod)", "hide": 0, - "includeAll": false, - "label": "Method", - "multi": false, - "name": "method", + "includeAll": true, + "label": "Pod", + "multi": true, + "name": "pod", "options": [], "query": { - "query": "metrics(tfs_device_exec_details_)", + "query": "label_values(tfs_device_execution_details_histogram_duration_bucket, pod)", "refId": "StandardVariableQuery" }, "refresh": 2, - "regex": "/tfs_device_exec_details_(.+)_histogram_duration_bucket/", + "regex": "", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", @@ -134,37 +138,15 @@ { "allValue": ".*", "current": { - "selected": true, - "text": [ - "endpoint_checks", - "execution", - "get_device", - "get_driver", - "populate_config_rules", - "populate_endpoints", - "populate_monit_rsrc", - "set_device", - "total", - "wait_queue" - ], - "value": [ - "endpoint_checks", - "execution", - "get_device", - "get_driver", - "populate_config_rules", - "populate_endpoints", - "populate_monit_rsrc", - "set_device", - "total", - "wait_queue" - ] + "selected": false, + "text": "All", + "value": "$__all" }, "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "b1BgiIsVz" }, - "definition": "label_values(tfs_device_exec_details_[[method]]_histogram_duration_bucket, step)", + "definition": "label_values(tfs_device_execution_details_histogram_duration_bucket, step)", "hide": 0, "includeAll": true, "label": "Step", @@ -172,7 +154,7 @@ "name": "step", "options": [], "query": { - "query": "label_values(tfs_device_exec_details_[[method]]_histogram_duration_bucket, step)", + "query": "label_values(tfs_device_execution_details_histogram_duration_bucket, step)", "refId": "StandardVariableQuery" }, "refresh": 2, @@ -187,27 +169,23 @@ { "allValue": ".*", "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] + "selected": false, + "text": "All", + "value": "$__all" }, "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "b1BgiIsVz" }, - "definition": "label_values(tfs_device_exec_details_[[method]]_histogram_duration_bucket, pod)", + "definition": "label_values(tfs_device_execution_details_histogram_duration_bucket, operation)", "hide": 0, "includeAll": true, - "label": "Pod", + "label": "Operation", "multi": true, - "name": "pod", + "name": "operation", "options": [], "query": { - "query": "label_values(tfs_device_exec_details_[[method]]_histogram_duration_bucket, pod)", + "query": "label_values(tfs_device_execution_details_histogram_duration_bucket, operation)", "refId": "StandardVariableQuery" }, "refresh": 2, @@ -222,14 +200,14 @@ ] }, "time": { - "from": "now-15m", + "from": "now-2d", "to": "now" }, "timepicker": {}, "timezone": "", "title": "TFS / Device Details", "uid": "tfs-dev-confdev", - "version": 6, + "version": 7, "weekStart": "" } } -- GitLab From 35596ca1562ab21b2fb04e927b44e80c08724c1e Mon Sep 17 00:00:00 2001 From: gifrerenom Date: Fri, 28 Apr 2023 16:22:29 +0000 Subject: [PATCH 4/6] Device component: - Updated detailed metrics for RPC methods (to be completed) --- src/device/service/DeviceServiceServicerImpl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/device/service/DeviceServiceServicerImpl.py b/src/device/service/DeviceServiceServicerImpl.py index d3d92d3a3..38a6b735b 100644 --- a/src/device/service/DeviceServiceServicerImpl.py +++ b/src/device/service/DeviceServiceServicerImpl.py @@ -37,7 +37,7 @@ LOGGER = logging.getLogger(__name__) METRICS_POOL = MetricsPool('Device', 'RPC') -METRICS_POOL_DETAILS = MetricsPool('Device', 'exec_details', labels={ +METRICS_POOL_DETAILS = MetricsPool('Device', 'execution', labels={ 'driver': '', 'operation': '', 'step': '', }) @@ -151,7 +151,7 @@ class DeviceServiceServicerImpl(DeviceServiceServicer): metrics_labels = dict(driver=driver.name, operation='add_device') histogram_duration : Histogram = METRICS_POOL_DETAILS.get_or_create( - 'AddDevice', MetricTypeEnum.HISTOGRAM_DURATION) + 'details', MetricTypeEnum.HISTOGRAM_DURATION) histogram_duration.labels(step='total' , **metrics_labels).observe(t14-t0) histogram_duration.labels(step='execution' , **metrics_labels).observe(t14-t3) histogram_duration.labels(step='endpoint_checks' , **metrics_labels).observe(t1-t0) @@ -255,7 +255,7 @@ class DeviceServiceServicerImpl(DeviceServiceServicer): metrics_labels = dict(driver=driver.name, operation='configure_device') histogram_duration : Histogram = METRICS_POOL_DETAILS.get_or_create( - 'ConfigureDevice', MetricTypeEnum.HISTOGRAM_DURATION) + 'details', MetricTypeEnum.HISTOGRAM_DURATION) histogram_duration.labels(step='total' , **metrics_labels).observe(t9-t0) histogram_duration.labels(step='wait_queue' , **metrics_labels).observe(t1-t0) histogram_duration.labels(step='execution' , **metrics_labels).observe(t9-t1) -- GitLab From 14042fec5664da9dd139207fbea84ff00cd7faed Mon Sep 17 00:00:00 2001 From: gifrerenom Date: Tue, 2 May 2023 14:32:44 +0000 Subject: [PATCH 5/6] WebUI - Grafana: - Updated device details dashboard --- deploy/tfs.sh | 6 +- .../grafana_prom_device_exec_details.json | 99 ++++++++++++++----- 2 files changed, 75 insertions(+), 30 deletions(-) diff --git a/deploy/tfs.sh b/deploy/tfs.sh index 399d3a248..fd8625cf2 100755 --- a/deploy/tfs.sh +++ b/deploy/tfs.sh @@ -486,11 +486,11 @@ if [[ "$TFS_COMPONENTS" == *"webui"* ]]; then curl -X POST ${GRAFANA_URL_UPDATED}/api/user/stars/dashboard/${DASHBOARD_ID} echo - # Dashboard: Device ConfigureDevice Details - curl -X POST -H "Content-Type: application/json" -d '@src/webui/grafana_prom_device_config_exec_details.json' \ + # Dashboard: Device Execution Details + curl -X POST -H "Content-Type: application/json" -d '@src/webui/grafana_prom_device_exec_details.json' \ ${GRAFANA_URL_UPDATED}/api/dashboards/db echo - DASHBOARD_URL="${GRAFANA_URL_UPDATED}/api/dashboards/uid/tfs-dev-confdev" + DASHBOARD_URL="${GRAFANA_URL_UPDATED}/api/dashboards/uid/tfs-dev-exec" DASHBOARD_ID=$(curl -s "${DASHBOARD_URL}" | jq '.dashboard.id') curl -X POST ${GRAFANA_URL_UPDATED}/api/user/stars/dashboard/${DASHBOARD_ID} echo diff --git a/src/webui/grafana_prom_device_exec_details.json b/src/webui/grafana_prom_device_exec_details.json index 5e9e9f28a..a18c2b91c 100644 --- a/src/webui/grafana_prom_device_exec_details.json +++ b/src/webui/grafana_prom_device_exec_details.json @@ -25,8 +25,8 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 6, - "iteration": 1682697941785, + "id": null, + "iteration": 1683036452435, "links": [], "liveNow": false, "panels": [ @@ -43,7 +43,7 @@ "dataFormat": "tsbuckets", "datasource": { "type": "prometheus", - "uid": "b1BgiIsVz" + "uid": "prometheus" }, "gridPos": { "h": 22, @@ -65,15 +65,17 @@ { "datasource": { "type": "prometheus", - "uid": "b1BgiIsVz" + "uid": "prometheus" }, + "editorMode": "code", "exemplar": true, - "expr": "sum(\r\n max_over_time(tfs_device_exec_details_[[method]]_histogram_duration_bucket{pod=~\"[[pod]]\", step=~\"[[step]]\"}[1m]) -\r\n min_over_time(tfs_device_exec_details_[[method]]_histogram_duration_bucket{pod=~\"[[pod]]\", step=~\"[[step]]\"}[1m])\r\n) by (le)", + "expr": "sum(\r\n max_over_time(tfs_device_execution_details_histogram_duration_bucket{driver=~\"[[driver]]\", operation=~\"[[operation]]\", step=~\"[[step]]\"}[1m]) -\r\n min_over_time(tfs_device_execution_details_histogram_duration_bucket{driver=~\"[[driver]]\", operation=~\"[[operation]]\", step=~\"[[step]]\"}[1m])\r\n) by (le)", "format": "heatmap", "instant": false, "interval": "1m", "intervalFactor": 1, "legendFormat": "{{le}}", + "range": true, "refId": "A" } ], @@ -94,7 +96,7 @@ "yBucketBound": "auto" } ], - "refresh": "", + "refresh": "5s", "schemaVersion": 36, "style": "dark", "tags": [], @@ -113,17 +115,52 @@ }, "datasource": { "type": "prometheus", - "uid": "b1BgiIsVz" + "uid": "prometheus" }, - "definition": "label_values(tfs_device_execution_details_histogram_duration_bucket, pod)", + "definition": "label_values(tfs_device_execution_details_histogram_duration_bucket, operation)", "hide": 0, "includeAll": true, - "label": "Pod", + "label": "Operation", "multi": true, - "name": "pod", + "name": "operation", "options": [], "query": { - "query": "label_values(tfs_device_execution_details_histogram_duration_bucket, pod)", + "query": "label_values(tfs_device_execution_details_histogram_duration_bucket, operation)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(tfs_device_execution_details_histogram_duration_bucket, driver)", + "hide": 0, + "includeAll": true, + "label": "Driver", + "multi": true, + "name": "driver", + "options": [], + "query": { + "query": "label_values(tfs_device_execution_details_histogram_duration_bucket, driver)", "refId": "StandardVariableQuery" }, "refresh": 2, @@ -138,13 +175,17 @@ { "allValue": ".*", "current": { - "selected": false, - "text": "All", - "value": "$__all" + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] }, "datasource": { "type": "prometheus", - "uid": "b1BgiIsVz" + "uid": "prometheus" }, "definition": "label_values(tfs_device_execution_details_histogram_duration_bucket, step)", "hide": 0, @@ -169,23 +210,27 @@ { "allValue": ".*", "current": { - "selected": false, - "text": "All", - "value": "$__all" + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] }, "datasource": { "type": "prometheus", - "uid": "b1BgiIsVz" + "uid": "prometheus" }, - "definition": "label_values(tfs_device_execution_details_histogram_duration_bucket, operation)", + "definition": "label_values(tfs_device_execution_details_histogram_duration_bucket, pod)", "hide": 0, "includeAll": true, - "label": "Operation", + "label": "Pod", "multi": true, - "name": "operation", + "name": "pod", "options": [], "query": { - "query": "label_values(tfs_device_execution_details_histogram_duration_bucket, operation)", + "query": "label_values(tfs_device_execution_details_histogram_duration_bucket, pod)", "refId": "StandardVariableQuery" }, "refresh": 2, @@ -200,14 +245,14 @@ ] }, "time": { - "from": "now-2d", + "from": "now-15m", "to": "now" }, "timepicker": {}, "timezone": "", - "title": "TFS / Device Details", - "uid": "tfs-dev-confdev", - "version": 7, + "title": "TFS / Device Execution Details", + "uid": "tfs-dev-exec", + "version": 4, "weekStart": "" } } -- GitLab From 585b21c22478fdb693cb5fcf5a78c8f3b8360d83 Mon Sep 17 00:00:00 2001 From: gifrerenom Date: Tue, 2 May 2023 15:04:15 +0000 Subject: [PATCH 6/6] WebUI - Grafana: - Temporarily moved old dashboard to separate folder --- src/webui/{ => old}/grafana_prom_device_config_exec_details.json | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/webui/{ => old}/grafana_prom_device_config_exec_details.json (100%) diff --git a/src/webui/grafana_prom_device_config_exec_details.json b/src/webui/old/grafana_prom_device_config_exec_details.json similarity index 100% rename from src/webui/grafana_prom_device_config_exec_details.json rename to src/webui/old/grafana_prom_device_config_exec_details.json -- GitLab