src/opticalattackmanager/service/__main__.py +21 −20

````diff
@@ -67,7 +67,7 @@ SERVICE_LIST_MODE = int(
     get_setting("OPTICALATTACKMANAGER_SERVICE_LIST_MODE", default=1)
 )
 SERVICE_LIST_KEY = get_setting(
-    "OPTICALATTACKMANAGER_SERVICE_LIST_KEY", default="opt-sec:active-services"
+    "OPTICALATTACKMANAGER_SERVICE_LIST_KEY", default="opt-sec-active-services"
 )
 MIN_NUMBER_WORKERS = int(
     get_setting("OPTICALATTACKMANAGERSERVICE_LOOP_MIN_WORKERS", default=2)
@@ -295,6 +295,7 @@ async def monitor_services(terminate, service_list=None, cache=None):
     current_list = []
     if SERVICE_LIST_MODE == LIST_REDIS_MODE:
+        LOGGER.debug(f"Services at the Redis DB: {cache.llen(SERVICE_LIST_KEY)}")
         current_list.extend(
             [
                 pickle.loads(service)
@@ -385,7 +386,6 @@ async def monitor_services(terminate, service_list=None, cache=None):
                 (i + 1) * k + min(i + 1, m),  # last index
                 host,
                 port,
-                DROP_COUNTER,
                 desired_monitoring_interval * 0.9,
             )
             for i in range(cur_number_workers)
@@ -434,22 +434,22 @@ def main():
     logging.getLogger("hpack").setLevel(logging.CRITICAL)
 
-    wait_for_environment_variables(
-        [
-            get_env_var_name(ServiceNameEnum.CONTEXT, ENVVAR_SUFIX_SERVICE_HOST),
-            get_env_var_name(ServiceNameEnum.CONTEXT, ENVVAR_SUFIX_SERVICE_PORT_GRPC),
-            get_env_var_name(ServiceNameEnum.MONITORING, ENVVAR_SUFIX_SERVICE_HOST),
-            get_env_var_name(
-                ServiceNameEnum.MONITORING, ENVVAR_SUFIX_SERVICE_PORT_GRPC
-            ),
-            get_env_var_name(
-                ServiceNameEnum.OPTICALATTACKDETECTOR, ENVVAR_SUFIX_SERVICE_HOST
-            ),
-            get_env_var_name(
-                ServiceNameEnum.OPTICALATTACKDETECTOR, ENVVAR_SUFIX_SERVICE_PORT_GRPC
-            ),
-        ]
-    )
+    # wait_for_environment_variables(
+    #     [
+    #         get_env_var_name(ServiceNameEnum.CONTEXT, ENVVAR_SUFIX_SERVICE_HOST),
+    #         get_env_var_name(ServiceNameEnum.CONTEXT, ENVVAR_SUFIX_SERVICE_PORT_GRPC),
+    #         get_env_var_name(ServiceNameEnum.MONITORING, ENVVAR_SUFIX_SERVICE_HOST),
+    #         get_env_var_name(
+    #             ServiceNameEnum.MONITORING, ENVVAR_SUFIX_SERVICE_PORT_GRPC
+    #         ),
+    #         get_env_var_name(
+    #             ServiceNameEnum.OPTICALATTACKDETECTOR, ENVVAR_SUFIX_SERVICE_HOST
+    #         ),
+    #         get_env_var_name(
+    #             ServiceNameEnum.OPTICALATTACKDETECTOR, ENVVAR_SUFIX_SERVICE_PORT_GRPC
+    #         ),
+    #     ]
+    # )
 
     signal.signal(signal.SIGINT, signal_handler)
     signal.signal(signal.SIGTERM, signal_handler)
@@ -476,9 +476,10 @@ def main():
     if SERVICE_LIST_MODE == LIST_REDIS_MODE:
         cache = redis.Redis(host=redis_host, port=redis_port, password=redis_password)
         cache.ping()
+        LOGGER.info(f"Connecting to Redis: host={redis_host}, port={redis_port}, password={redis_password}")
 
         # clean the existing list that will be populated later on in this function
-        cache.delete(SERVICE_LIST_KEY)
+        # cache.delete(SERVICE_LIST_KEY)
     elif SERVICE_LIST_MODE == LIST_SHARED_MODE:
         # creating a thread-safe list to be shared among threads
         service_list = Manager().list()
@@ -544,7 +545,7 @@ def main():
     # asyncio.create_task(monitor_services(service_list))
 
     # Wait for Ctrl+C or termination signal
-    while not terminate.wait(timeout=1):
+    while not terminate.wait(timeout=10):
         pass
 
     LOGGER.info("Terminating...")
````
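The hunks above treat a Redis list as the shared registry of monitored services: the new debug line reports `cache.llen(SERVICE_LIST_KEY)`, and entries are read back with `pickle.loads`. A minimal sketch of that round trip, assuming a reachable Redis instance; host, port, and the descriptor field names (taken from `monitor.py` below) are illustrative, not part of the PR:

```python
# Sketch of the Redis-backed service list used above (not the PR's code).
# Host/port are placeholders; descriptor fields mirror monitor.py below.
import pickle

import redis

SERVICE_LIST_KEY = "opt-sec-active-services"  # new default from the diff
cache = redis.Redis(host="localhost", port=6379, password=None)

# a producer pushes one pickled descriptor per monitored service
service = {"context": "admin", "service": "service-1", "kpi": "kpi-1"}
cache.rpush(SERVICE_LIST_KEY, pickle.dumps(service))

# the monitoring loop logs the length and unpickles the whole list
print(f"Services at the Redis DB: {cache.llen(SERVICE_LIST_KEY)}")
current_list = [pickle.loads(s) for s in cache.lrange(SERVICE_LIST_KEY, 0, -1)]
```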
LOGGER.debug("Monitoring finished for {}/{}".format(service_id, kpi_id)) except Exception as e: LOGGER.warning( "Exception while processing service_id {}/{}".format(service_id, kpi_id) "Exception while processing service_id {}/{}: {}".format(service_id, kpi_id, e) ) # LOGGER.exception(e) drop_counter.inc() traceback.print_exc() def delegate_services( Loading @@ -61,7 +60,6 @@ def delegate_services( end_index: int, host: str, port: str, drop_counter: Counter, monitoring_interval: float, ): async def run_internal_loop(): Loading @@ -73,7 +71,6 @@ def delegate_services( service["context"], service["service"], service["kpi"], drop_counter, # allow at most 90% of the monitoring interval to succeed monitoring_interval * 0.9, ) Loading src/tests/scenario3/optical/deploy_specs.sh +2 −0 Original line number Diff line number Diff line Loading @@ -21,6 +21,8 @@ export TFS_COMPONENTS="context device automation monitoring pathcomp service sli # addition for the optical cybersecurity component export TFS_COMPONENTS="${TFS_COMPONENTS} dbscanserving opticalattackmitigator opticalattackdetector opticalattackmanager" export TFS_EXTRA_MANIFESTS="manifests/nginx_ingress_http.yaml manifests/servicemonitors.yaml" export TFS_EXTRA_MANIFESTS="${TFS_EXTRA_MANIFESTS} manifests/cachingservice.yaml" Loading src/tests/scenario3/optical/jocn/run_experiment.py +17 −184 Original line number Diff line number Diff line Loading @@ -29,15 +29,24 @@ from kubernetes import client, config from common.Constants import ServiceNameEnum from common.Settings import get_service_host, get_setting, wait_for_environment_variables from configs import base_results_folder, datetime_format, hpa_data LOGGER = None SERVICE_LIST_KEY = get_setting( "OPTICALATTACKMANAGER_SERVICE_LIST_KEY", default="opt-sec:active-services" ) # Configs can be set in Configuration class directly or using helper utility namespace = get_setting("TFS_K8S_NAMESPACE", default="tfs") config.load_kube_config() v1 = client.CoreV1Api() ret = v1.list_namespaced_endpoints(namespace=namespace, watch=False) for item in ret.items: if "caching" in item.metadata.name: for subset in item.subsets: for port in subset.ports: if "redis" in port.name: # endpoint is ready for being scraped CACHING_HOST = subset.addresses[0].ip CACHING_PORT = port.port logging.getLogger("kubernetes").setLevel(logging.INFO) # avoid lengthy messages Loading @@ -53,17 +62,11 @@ def signal_handler(signal, frame): # pylint: disable=redefined-outer-name def manage_number_services(terminate, folder): # connecting with Redis redis_host = get_service_host(ServiceNameEnum.CACHING) redis_password = None if redis_host is not None: redis_port = int(get_setting("CACHINGSERVICE_SERVICE_PORT_REDIS")) redis_password = get_setting("REDIS_PASSWORD") else: LOGGER.fatal("No environment variables set for Redis") cache = None try: cache = redis.Redis(host=redis_host, port=redis_port, password=redis_password) cache = redis.Redis(host=CACHING_HOST, port=CACHING_PORT, password=redis_password) except Exception as e: LOGGER.exception(e) Loading @@ -76,40 +79,11 @@ def manage_number_services(terminate, folder): # connecting to the HPA API autoscaling = client.AutoscalingV1Api() # connecting to the custom objects API api = client.CustomObjectsApi() # open the file that will store the information services_file = open(os.path.join(folder, "services.csv"), "wt", encoding="utf-8") services_file.write("# file with number of services\n") services_file.write("timestamp,number_services") hpa_file = open(os.path.join(folder, "hpas.csv"), 
"wt", encoding="utf-8") # writing headers for the HPA columns hpas = autoscaling.list_namespaced_horizontal_pod_autoscaler(namespace="tfs") for hpa in hpas.items: hpa_file.write(hpa.metadata.name + "\n") for d in hpa_data: services_file.write(f",{hpa.metadata.name}_{d}") # monitoring CPU and RAM usage of the single Pods for s in ["cache", "manager"]: for k in ["cpu", "ram"]: services_file.write(f",{s}_{k}") services_file.write("\n") services_file.flush() hpa_file.flush() hpa_file.close() # define number of services # 6 values followed by two zeros number_services = [0, 10] loads = [120, 240, 480, 960, 1440, 1920, 1922] loads = [120, 240, 480, 960, 1440, 1920, 1921] for load in loads: number_services.append(int(load/2)) for _ in range(5): Loading @@ -125,41 +99,6 @@ def manage_number_services(terminate, folder): if cur_tick % ticks == 0: LOGGER.debug("go load!") # getting data from autoscaler hpas = autoscaling.list_namespaced_horizontal_pod_autoscaler(namespace="tfs") # - "cur_utilization" # - "target_utilization" # - "cur_replicas" # - "desired_replicas" hpa_string = "" for hpa in hpas.items: hpa_string += f",{hpa.status.current_cpu_utilization_percentage}" hpa_string += f",{hpa.spec.target_cpu_utilization_percentage}" hpa_string += f",{hpa.status.current_replicas}" hpa_string += f",{hpa.status.desired_replicas}" # monitoring resource usage k8s_pods = api.list_cluster_custom_object( "metrics.k8s.io", "v1beta1", "namespaces/tfs/pods" ) # - "cache_cpu" # - "cache_ram" # - "manager_cpu" # - "manager_ram" resource_string = "" # we use two loops to ensure the same order for stats in k8s_pods["items"]: if "caching" in stats['metadata']['name']: resource_string += f",{stats['containers'][0]['usage']['cpu']}" resource_string += f",{stats['containers'][0]['usage']['memory']}" break for stats in k8s_pods["items"]: if "opticalattackmanager" in stats['metadata']['name']: resource_string += f",{stats['containers'][0]['usage']['cpu']}" resource_string += f",{stats['containers'][0]['usage']['memory']}" break # calculate the difference between current and expected cur_services = cache.llen(SERVICE_LIST_KEY) diff_services = cur_services - number_services[cur_tick % len(number_services)] Loading @@ -169,12 +108,6 @@ def manage_number_services(terminate, folder): LOGGER.info("Setting monitoring interval to 60") set_to_60 = True # write current number with one second difference cur_datetime = datetime.datetime.now() reported_datetime = cur_datetime - datetime.timedelta(seconds=1) reported_datetime_str = reported_datetime.strftime(datetime_format) services_file.write(f"{reported_datetime_str},{cur_services}{hpa_string}\n") if diff_services < 0: # current is lower than expected LOGGER.debug(f"inserting <{-diff_services}> services") for _ in range(-diff_services): Loading @@ -193,97 +126,20 @@ def manage_number_services(terminate, folder): LOGGER.debug(f"deleting <{diff_services}> services") cache.lpop(SERVICE_LIST_KEY, diff_services) # writing the new number with the current time services_file.write( f"{datetime.datetime.now().strftime(datetime_format)}," f"{number_services[cur_tick % len(number_services)]}" f"{hpa_string}{resource_string}\n" ) assert number_services[cur_tick % len(number_services)] == cache.llen(SERVICE_LIST_KEY) services_file.flush() else: LOGGER.debug("tick load!") cur_tick += 1 if cur_tick > len(number_services) + 1: break services_file.flush() services_file.close() # make sure we have the correct loop time cache.set("MONITORING_INTERVAL", 30) LOGGER.info("Finished load!") def 
src/tests/scenario3/optical/jocn/run_experiment.py +17 −184

````diff
@@ -29,15 +29,24 @@
 from kubernetes import client, config
 
 from common.Constants import ServiceNameEnum
 from common.Settings import get_service_host, get_setting, wait_for_environment_variables
 from configs import base_results_folder, datetime_format, hpa_data
 
 LOGGER = None
 
 SERVICE_LIST_KEY = get_setting(
     "OPTICALATTACKMANAGER_SERVICE_LIST_KEY", default="opt-sec:active-services"
 )
 
+# Configs can be set in Configuration class directly or using helper utility
+namespace = get_setting("TFS_K8S_NAMESPACE", default="tfs")
+config.load_kube_config()
+v1 = client.CoreV1Api()
+ret = v1.list_namespaced_endpoints(namespace=namespace, watch=False)
+for item in ret.items:
+    if "caching" in item.metadata.name:
+        for subset in item.subsets:
+            for port in subset.ports:
+                if "redis" in port.name:
+                    # endpoint is ready for being scraped
+                    CACHING_HOST = subset.addresses[0].ip
+                    CACHING_PORT = port.port
+
 logging.getLogger("kubernetes").setLevel(logging.INFO)  # avoid lengthy messages
@@ -53,17 +62,11 @@ def signal_handler(signal, frame):  # pylint: disable=redefined-outer-name
 def manage_number_services(terminate, folder):
     # connecting with Redis
-    redis_host = get_service_host(ServiceNameEnum.CACHING)
     redis_password = None
-    if redis_host is not None:
-        redis_port = int(get_setting("CACHINGSERVICE_SERVICE_PORT_REDIS"))
-        redis_password = get_setting("REDIS_PASSWORD")
-    else:
-        LOGGER.fatal("No environment variables set for Redis")
+    redis_password = get_setting("REDIS_PASSWORD")
 
     cache = None
     try:
-        cache = redis.Redis(host=redis_host, port=redis_port, password=redis_password)
+        cache = redis.Redis(host=CACHING_HOST, port=CACHING_PORT, password=redis_password)
     except Exception as e:
         LOGGER.exception(e)
@@ -76,40 +79,11 @@ def manage_number_services(terminate, folder):
     # connecting to the HPA API
     autoscaling = client.AutoscalingV1Api()
 
-    # connecting to the custom objects API
-    api = client.CustomObjectsApi()
-
-    # open the file that will store the information
-    services_file = open(os.path.join(folder, "services.csv"), "wt", encoding="utf-8")
-    services_file.write("# file with number of services\n")
-    services_file.write("timestamp,number_services")
-    hpa_file = open(os.path.join(folder, "hpas.csv"), "wt", encoding="utf-8")
-
-    # writing headers for the HPA columns
-    hpas = autoscaling.list_namespaced_horizontal_pod_autoscaler(namespace="tfs")
-    for hpa in hpas.items:
-        hpa_file.write(hpa.metadata.name + "\n")
-        for d in hpa_data:
-            services_file.write(f",{hpa.metadata.name}_{d}")
-
-    # monitoring CPU and RAM usage of the single Pods
-    for s in ["cache", "manager"]:
-        for k in ["cpu", "ram"]:
-            services_file.write(f",{s}_{k}")
-    services_file.write("\n")
-    services_file.flush()
-    hpa_file.flush()
-    hpa_file.close()
 
     # define number of services
     # 6 values followed by two zeros
     number_services = [0, 10]
-    loads = [120, 240, 480, 960, 1440, 1920, 1922]
+    loads = [120, 240, 480, 960, 1440, 1920, 1921]
     for load in loads:
         number_services.append(int(load/2))
         for _ in range(5):
@@ -125,41 +99,6 @@ def manage_number_services(terminate, folder):
             if cur_tick % ticks == 0:
                 LOGGER.debug("go load!")
 
-                # getting data from autoscaler
-                hpas = autoscaling.list_namespaced_horizontal_pod_autoscaler(namespace="tfs")
-                # - "cur_utilization"
-                # - "target_utilization"
-                # - "cur_replicas"
-                # - "desired_replicas"
-                hpa_string = ""
-                for hpa in hpas.items:
-                    hpa_string += f",{hpa.status.current_cpu_utilization_percentage}"
-                    hpa_string += f",{hpa.spec.target_cpu_utilization_percentage}"
-                    hpa_string += f",{hpa.status.current_replicas}"
-                    hpa_string += f",{hpa.status.desired_replicas}"
-
-                # monitoring resource usage
-                k8s_pods = api.list_cluster_custom_object(
-                    "metrics.k8s.io", "v1beta1", "namespaces/tfs/pods"
-                )
-                # - "cache_cpu"
-                # - "cache_ram"
-                # - "manager_cpu"
-                # - "manager_ram"
-                resource_string = ""
-                # we use two loops to ensure the same order
-                for stats in k8s_pods["items"]:
-                    if "caching" in stats['metadata']['name']:
-                        resource_string += f",{stats['containers'][0]['usage']['cpu']}"
-                        resource_string += f",{stats['containers'][0]['usage']['memory']}"
-                        break
-                for stats in k8s_pods["items"]:
-                    if "opticalattackmanager" in stats['metadata']['name']:
-                        resource_string += f",{stats['containers'][0]['usage']['cpu']}"
-                        resource_string += f",{stats['containers'][0]['usage']['memory']}"
-                        break
 
                 # calculate the difference between current and expected
                 cur_services = cache.llen(SERVICE_LIST_KEY)
                 diff_services = cur_services - number_services[cur_tick % len(number_services)]
@@ -169,12 +108,6 @@ def manage_number_services(terminate, folder):
                     LOGGER.info("Setting monitoring interval to 60")
                     set_to_60 = True
 
-                # write current number with one second difference
-                cur_datetime = datetime.datetime.now()
-                reported_datetime = cur_datetime - datetime.timedelta(seconds=1)
-                reported_datetime_str = reported_datetime.strftime(datetime_format)
-                services_file.write(f"{reported_datetime_str},{cur_services}{hpa_string}\n")
 
                 if diff_services < 0:  # current is lower than expected
                     LOGGER.debug(f"inserting <{-diff_services}> services")
                     for _ in range(-diff_services):
@@ -193,97 +126,20 @@ def manage_number_services(terminate, folder):
                     LOGGER.debug(f"deleting <{diff_services}> services")
                     cache.lpop(SERVICE_LIST_KEY, diff_services)
 
-                # writing the new number with the current time
-                services_file.write(
-                    f"{datetime.datetime.now().strftime(datetime_format)},"
-                    f"{number_services[cur_tick % len(number_services)]}"
-                    f"{hpa_string}{resource_string}\n"
-                )
+                assert number_services[cur_tick % len(number_services)] == cache.llen(SERVICE_LIST_KEY)
 
-                services_file.flush()
             else:
                 LOGGER.debug("tick load!")
             cur_tick += 1
             if cur_tick > len(number_services) + 1:
                 break
 
-    services_file.flush()
-    services_file.close()
 
     # make sure we have the correct loop time
     cache.set("MONITORING_INTERVAL", 30)
 
     LOGGER.info("Finished load!")
 
 
-def monitor_endpoints(terminate):
-    LOGGER.info("starting experiment!")
-    v1 = client.CoreV1Api()
-    while not terminate.wait(timeout=30):
-        # load base yaml
-        with open("/home/carda/projects/prometheus/prometheus.yml.backup", "rt") as file:
-            current_version = yaml.load(file, Loader=yaml.FullLoader)
-
-        # checking endpoints
-        ret = v1.list_namespaced_endpoints(namespace="tfs", watch=False)
-        for item in ret.items:
-            found = False
-            for subset in item.subsets:
-                for p, q in enumerate(subset.ports):
-                    if q.name == "metrics":
-                        # endpoint is ready for being scraped
-                        found = True
-            if not found:
-                continue  # if no `metrics` port, jump!
-            found = False
-            # now look for existing configuration
-            for i in range(len(current_version["scrape_configs"])):
-                if current_version["scrape_configs"][i]["job_name"] == item.metadata.name:
-                    found = True
-                    break  # found it! `i` will contain the correct index
-            if not found:  # write it from zero!
-                current_version["scrape_configs"].append({})
-                current_version["scrape_configs"][-1]["job_name"] = item.metadata.name
-                # set the correct `i` value
-                i = len(current_version["scrape_configs"]) - 1
-            if "static_configs" not in current_version["scrape_configs"][i]:
-                current_version["scrape_configs"][i]["static_configs"] = [{"targets": []}]
-            # reset IPs
-            current_version["scrape_configs"][i]["static_configs"][0]["targets"] = []
-            for subset in item.subsets:
-                for p, q in enumerate(subset.ports):
-                    if q.name == "metrics":
-                        for c, a in enumerate(subset.addresses):
-                            print(f"{item.metadata.name}\t{a.ip}:{q.port}")
-                            current_version["scrape_configs"][i]["static_configs"][0]["targets"].append(f"{a.ip}:9192")
-
-        # write yaml
-        with open("/home/carda/projects/prometheus/prometheus.yml", "wt") as file:
-            yaml.dump(current_version, file)
-
-        # reloading configuration
-        # docs: https://www.robustperception.io/reloading-prometheus-configuration/
-        requests.post("http://127.0.0.1:9090/-/reload")
-
-    # resetting prometheus to the original state
-    # load base yaml
-    with open("/home/carda/projects/prometheus/prometheus.yml.backup", "rt") as file:
-        current_version = yaml.load(file, Loader=yaml.FullLoader)
-    # write yaml
-    with open("/home/carda/projects/prometheus/prometheus.yml", "wt") as file:
-        yaml.dump(current_version, file)
-    # reloading configuration
-    # docs: https://www.robustperception.io/reloading-prometheus-configuration/
-    requests.post("http://127.0.0.1:9090/-/reload")
-
-    LOGGER.info("Finished experiment!")
 
 if __name__ == "__main__":
     # logging.basicConfig(level="DEBUG")
     logging.basicConfig(
@@ -294,41 +150,18 @@ if __name__ == "__main__":
     LOGGER = logging.getLogger(__name__)
 
     wait_for_environment_variables(
-        ["CACHINGSERVICE_SERVICE_PORT_REDIS", "REDIS_PASSWORD"]
+        ["REDIS_PASSWORD"]
     )
 
     signal.signal(signal.SIGINT, signal_handler)
     signal.signal(signal.SIGTERM, signal_handler)
 
-    # generate results folder
-    output_folder = os.path.join(base_results_folder, "jocn_" + datetime.datetime.now(
-        datetime.timezone.utc
-    ).strftime("%Y%m%dT%H%M%S.%fUTC"))
-    os.makedirs(output_folder)
 
-    # start load handler
-    proc_load = multiprocessing.Process(
-        target=manage_number_services,
-        args=(
-            terminate,
-            output_folder,
-        ),
-    )
-    proc_load.start()
 
-    # start experiment monitoring
-    proc_experiment = multiprocessing.Process(
-        target=monitor_endpoints, args=(terminate,)
-    )
-    proc_experiment.start()
+    manage_number_services(terminate=terminate)
 
     # Wait for Ctrl+C or termination signal
     while not terminate.wait(timeout=0.1):
         pass
 
-    # waits for the processes to finish
-    proc_load.join()
-    proc_experiment.join()
 
     # exits
     LOGGER.info("Bye!")
````
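With the CSV and HPA bookkeeping stripped out, the load loop reduces to reconciling the Redis list length against the target. A compact sketch of that step — `make_service` is a hypothetical factory and the key name is illustrative; only the push/pop/assert logic is taken from the diff:

```python
# Core reconciliation step of the slimmed-down load generator (a sketch;
# make_service is a hypothetical factory for pickled service descriptors).
import pickle
import uuid

import redis

def make_service() -> dict:
    return {"context": "admin", "service": str(uuid.uuid4()), "kpi": str(uuid.uuid4())}

def reconcile(cache: redis.Redis, key: str, target: int) -> None:
    diff = cache.llen(key) - target
    if diff < 0:  # fewer services than expected: insert the missing ones
        for _ in range(-diff):
            cache.rpush(key, pickle.dumps(make_service()))
    elif diff > 0:  # more than expected: pop the surplus from the head
        cache.lpop(key, diff)
    # same invariant the diff now asserts after every adjustment
    assert cache.llen(key) == target

# usage (requires a reachable Redis):
# cache = redis.Redis(host="localhost", port=6379)
# reconcile(cache, "opt-sec:active-services", target=60)
```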
src/tests/scenario3/optical/ofc23/README.md +8 −0

````diff
@@ -4,14 +4,22 @@ __Authors__: [Carlos Natalino](https://www.chalmers.se/en/persons/carda/), Lluis
 
 ## Executing
 
+All the commands here assume you are in the TFS home folder.
+
+First, we need to load the TFS deploy specifications:
+
+```bash
+source src/tests/scenario3/optical/deploy_specs.sh
+```
+
 Then, we load the environment variables that identify the TFS deployment:
 
 ```bash
 source tfs_runtime_env_vars.sh
 ```
 
 Then, we are able to execute the load generator:
 
 ```bash
 python src/tests/scenario3/optical/ofc23/run_experiment_demo.py
 ```
\ No newline at end of file
````
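Before launching the load generator, it can help to confirm that the caching service is reachable. A small pre-flight check along these lines — the host variable name is an assumption; only `CACHINGSERVICE_SERVICE_PORT_REDIS` and `REDIS_PASSWORD` appear in the experiment scripts:

```python
# Optional pre-flight check (not part of the PR): verify Redis connectivity
# using the environment variables loaded by the two `source` commands above.
# CACHINGSERVICE_SERVICE_HOST is an assumed variable name.
import os

import redis

cache = redis.Redis(
    host=os.environ.get("CACHINGSERVICE_SERVICE_HOST", "localhost"),
    port=int(os.environ.get("CACHINGSERVICE_SERVICE_PORT_REDIS", "6379")),
    password=os.environ.get("REDIS_PASSWORD"),
)
print("Redis reachable:", cache.ping())  # True if the cache is up
```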