#!/usr/bin/env python3
"""
Description: Wakurtosis simulation analysis
"""

""" Dependencies """
import sys, logging, json, argparse, tomllib, glob, re, requests
from datetime import datetime
from tqdm_loggable.auto import tqdm
from tqdm_loggable.tqdm_logging import tqdm_logging
import matplotlib.pyplot as plt
from scipy import stats
from prometheus_api_client import PrometheusConnect

""" Globals """
G_APP_NAME = 'WLS-ANALYSIS'
G_LOG_LEVEL = 'DEBUG'
G_DEFAULT_CONFIG_FILE = './config/config.json'
G_DEFAULT_TOPOLOGY_PATH = './config/topology_generated'
G_DEFAULT_SIMULATION_PATH = './wakurtosis_logs'
G_DEFAULT_FIG_FILENAME = 'analysis.pdf'
G_DEFAULT_SUMMARY_FILENAME = 'summary.json'
G_LOGGER = None

""" Custom logging formatter """
class CustomFormatter(logging.Formatter):

    # Set a different format for every logging level
    time_name_stamp = "[%(asctime)s.%(msecs)03d] [" + G_APP_NAME + "]"
    FORMATS = {
        logging.ERROR: time_name_stamp + " ERROR in %(module)s.py %(funcName)s() %(lineno)d - %(msg)s",
        logging.WARNING: time_name_stamp + " WARNING - %(msg)s",
        logging.CRITICAL: time_name_stamp + " CRITICAL in %(module)s.py %(funcName)s() %(lineno)d - %(msg)s",
        logging.INFO: time_name_stamp + " %(msg)s",
        logging.DEBUG: time_name_stamp + " %(funcName)s() %(msg)s",
        'DEFAULT': time_name_stamp + " %(msg)s",
    }

    def format(self, record):
        log_fmt = self.FORMATS.get(record.levelno, self.FORMATS['DEFAULT'])
        formatter = logging.Formatter(log_fmt, '%d-%m-%Y %H:%M:%S')
        return formatter.format(record)

def generate_summary():

    # NOTE: Summary generation is not wired in yet; the inputs referenced below
    # (config, topics_msg_cnt, elapsed_s) are not available in this script.
    # summary = {
    #     "end_ts" : time.time(),
    #     "params" : config['general'],
    #     "topics" : list(topics_msg_cnt.keys()),
    #     "topics_msg_cnt" : topics_msg_cnt,
    #     "simulation_time" : elapsed_s,
    #     "total_messages" : len()
    # }

    # with open('./summary.json', 'w') as summary_file:
    #     summary_file.write(json.dumps(summary, indent=4))

    G_LOGGER.info('Analysis summary saved in %s' %G_DEFAULT_SUMMARY_FILENAME)

def plot_figure(msg_propagation_times, cpu_usage, memory_usage):

    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 10))

    ax1.violinplot(msg_propagation_times, showmedians=True)
    ax1.set_title('Message propagation times \n(sample size: %d messages)' %len(msg_propagation_times))
    ax1.set_ylabel('Propagation Time (ms)')
    ax1.spines[['right', 'top']].set_visible(False)
    ax1.axes.xaxis.set_visible(False)

    ax2.violinplot(cpu_usage, showmedians=True)
    ax2.set_title('Maximum CPU usage per Waku node \n(sample size: %d nodes)' %len(cpu_usage))
    ax2.set_ylabel('CPU Cycles')
    ax2.spines[['right', 'top']].set_visible(False)
    ax2.axes.xaxis.set_visible(False)

    ax3.violinplot(memory_usage, showmedians=True)
    ax3.set_title('Maximum memory usage per Waku node \n(sample size: %d nodes)' %len(memory_usage))
    ax3.set_ylabel('Bytes')
    ax3.spines[['right', 'top']].set_visible(False)
    ax3.axes.xaxis.set_visible(False)

    plt.tight_layout()

    figure_path = '%s/%s' %(G_DEFAULT_SIMULATION_PATH, G_DEFAULT_FIG_FILENAME)
    plt.savefig(figure_path, format="pdf", bbox_inches="tight")

    G_LOGGER.info('Figure saved in %s' %figure_path)

# def fetch_cadvisor_stats_from_container(container_id, start_ts, end_ts, prometheus_port=52118):

#     url = 'http://localhost:%d' %prometheus_port

#     try:
#         G_LOGGER.debug('Connecting to Prometheus server in %s' %url)
#         prometheus = PrometheusConnect(url, disable_ssl=True, container_label="container_label_com_docker_container_id=%s" %container_id)
#         print(prometheus)
#     except Exception as e:
#         G_LOGGER.error('%s: %s' % (e.__doc__, e))
#         return None

#     metrics = prometheus.get_label_values("__name__")
#     print(metrics)
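# The commented-out function above sketches pulling container stats from Prometheus
# instead of cAdvisor. Below is a minimal, hedged sketch of what such a query could
# look like with prometheus_api_client; the port, metric name and container label are
# assumptions and would need to match the actual deployment. It is not called anywhere,
# so the cAdvisor-based functions further down remain the code path actually in use.
def fetch_cpu_seconds_from_prometheus(container_id, prometheus_port=9090):

    url = 'http://localhost:%d' %prometheus_port

    try:
        prometheus = PrometheusConnect(url=url, disable_ssl=True)
        # Assumed metric/label names, as typically exported by cAdvisor into Prometheus
        query = 'container_cpu_usage_seconds_total{container_label_com_docker_container_id="%s"}' %container_id
        result = prometheus.custom_query(query)
    except Exception as e:
        G_LOGGER.error('%s: %s' % (e.__doc__, e))
        return None

    # custom_query() returns a list of {'metric': ..., 'value': [ts, value]} samples
    return result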
#     try:
#         # query = '100 - (avg by(instance) (irate(container_cpu_usage_seconds_total{container_label_com_docker_container_id="<%s>"}[5m])) * 100)' %container_id
#         # query = "container_file_descriptors{process_cpu_seconds_total=\"<%s>\"}" %container_id
#         # result = prometheus.query(query)
#         query = 'process_cpu_seconds_total'
#         result = prometheus.custom_query(query)
#         G_LOGGER.debug('Querying: %s' %query)
#     except Exception as e:
#         G_LOGGER.error('%s: %s' % (e.__doc__, e))
#         return None

#     print('--->', result)

#     return {'cpu_usage' : 0, 'memory_usage' : 0, 'bandwidth_in' : 0, 'bandwidth_out' : 0}

def fetch_cadvisor_summary_from_container(container_id):

    # cAdvisor API URL endpoint
    url = 'http://localhost:8080/api/v2.1/summary/docker/%s' %container_id
    # Note: We can also use the /stats endpoint instead of /summary to get individual timepoints
    G_LOGGER.debug('Fetching summary stats from %s ...' %url)

    # Make an HTTP request to the cAdvisor API to get the summary stats of the container
    try:
        response = requests.get(url)
    except Exception as e:
        G_LOGGER.error('%s: %s' % (e.__doc__, e))
        return

    # Parse the response as JSON
    summary_stats = json.loads(response.text)
    # G_LOGGER.debug(summary_stats)

    return summary_stats

def fetch_cadvisor_stats_from_container(container_id, start_ts, end_ts):

    # cAdvisor API URL endpoint
    url = 'http://localhost:8080/api/v2.1/stats/docker/%s?count=1000' %container_id
    # Note: The /stats endpoint returns individual timepoints, unlike the /summary endpoint used above
    G_LOGGER.debug('Fetching cAdvisor stats from %s ...' %url)

    # Make an HTTP request to the cAdvisor API to get the stats of the container
    try:
        response = requests.get(url)
    except Exception as e:
        G_LOGGER.error('%s: %s' % (e.__doc__, e))
        return

    # Parse the response as JSON
    stats_dict = json.loads(response.text)

    cpu_usage = []
    memory_usage = []
    for stats_obj in stats_dict.values():
        for data_point in stats_obj['stats']:

            # Only take into account data points within the simulation time
            datetime_str = data_point['timestamp']
            datetime_obj = datetime.fromisoformat(datetime_str[:-1])

            # Convert to nanoseconds since the Unix epoch
            # (total_seconds() already includes the fractional, sub-second part)
            unix_seconds = (datetime_obj - datetime(1970, 1, 1)).total_seconds()
            timestamp_ns = int(unix_seconds * 1e9)

            # if timestamp_ns < start_ts or timestamp_ns > end_ts:
            #     G_LOGGER.debug('Data point %d out of the time window [%d-%d]' %(timestamp_ns, start_ts, end_ts))
            #     continue

            G_LOGGER.debug('Data point %d' %timestamp_ns)

            # NOTE: data_point['diskio'] comes back empty; check on Ubuntu

            cpu_usage.append(data_point['cpu']['usage']['user'])
            memory_usage.append(data_point['memory']['usage'])

    G_LOGGER.debug('Collected %d data point(s) from container %s' %(len(cpu_usage), container_id))

    return {'cpu_usage' : cpu_usage, 'memory_usage' : memory_usage}
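# Small helper equivalent to the epoch arithmetic done inside
# fetch_cadvisor_stats_from_container() above, kept here for clarity. It assumes
# cAdvisor returns UTC ISO-8601 timestamps with a trailing 'Z' and at most
# microsecond precision (the most fromisoformat() can parse). It is not wired in,
# so the inline conversion above remains the code path actually exercised.
def iso_utc_to_unix_ns(datetime_str):
    datetime_obj = datetime.fromisoformat(datetime_str[:-1])
    return int((datetime_obj - datetime(1970, 1, 1)).total_seconds() * 1e9)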
def main():

    global G_LOGGER

    """ Init Logging """
    G_LOGGER = logging.getLogger(G_APP_NAME)
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(CustomFormatter())
    G_LOGGER.addHandler(handler)

    tqdm_logging.set_level(logging.INFO)

    # Set loglevel from config
    G_LOGGER.setLevel(G_LOG_LEVEL)
    handler.setLevel(G_LOG_LEVEL)

    G_LOGGER.info('Started')

    """ Parse command line args """
    parser = argparse.ArgumentParser()
    parser.add_argument("-sp", "--simulation_path", help="Simulation results path", default=G_DEFAULT_SIMULATION_PATH)
    args = parser.parse_args()

    simulation_path = args.simulation_path

    """ Load Topics Structure """
    topics = set()
    nodes_topics = []
    try:
        # The index within the sorted list is the node id
        tomls = glob.glob('%s/*.toml' %G_DEFAULT_TOPOLOGY_PATH)
        tomls.sort()
        for toml_file in tomls:
            with open(toml_file, mode='rb') as read_file:
                toml_config = tomllib.load(read_file)
                node_topics_str = toml_config['topics']
                topics_list = list(node_topics_str.split(' '))
                nodes_topics.append(topics_list)
                topics.update(topics_list)
    except Exception as e:
        G_LOGGER.error('%s: %s' % (e.__doc__, e))
        sys.exit()

    G_LOGGER.info('Loaded topic structure with %d topic(s) and %d node(s).' %(len(topics), len(nodes_topics)))
    # G_LOGGER.debug(topics)
    # G_LOGGER.debug(nodes_topics)
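    # For reference, each per-node TOML in G_DEFAULT_TOPOLOGY_PATH is expected to carry a
    # space-separated 'topics' string, which the loop above splits into a per-node list.
    # A hypothetical example (topic names are illustrative only):
    #
    #   topics = "topic_0 topic_3 topic_7"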
    """ Load Simulation Messages """
    injected_msgs_dict = {}
    try:
        with open('%s/messages.json' %simulation_path, 'r') as f:
            injected_msgs_dict = json.load(f)
    except Exception as e:
        G_LOGGER.error('%s: %s' % (e.__doc__, e))
        sys.exit()

    G_LOGGER.info('Loaded %d messages.' %len(injected_msgs_dict))
    # G_LOGGER.debug(injected_msgs_dict)

    node_logs = {}
    msgs_dict = {}

    # Helper list with all the timestamps
    tss = []
    try:
        services_log_paths = glob.glob('%s/*--user-service--*' %simulation_path)
        pbar = tqdm(services_log_paths)
        for log_path in pbar:
            with open('%s/spec.json' %log_path, mode='r') as f:
                spec_json = json.load(f)
                if spec_json['Path'] == '/usr/bin/wakunode':
                    node_id = spec_json['Config']['Labels']['com.kurtosistech.id']
                    # container_id = spec_json['Name'][1:]
                    container_id = spec_json['Id']
                    node_logs[node_id] = {'published' : [], 'received' : [], 'container_id' : container_id}

                    pbar.set_description("Parsing log of node %s" %node_id)

                    with open('%s/output.log' %log_path, mode='r') as log_file:
                        # Process the log line by line as a text string
                        for log_line in log_file:
                            # At this stage we only care about the Waku Relay protocol
                            if 'waku.relay' in log_line:
                                msg_topics = re.search(r'topics="([^"]+)"', log_line).group(1)
                                msg_topic = re.search(r'pubsubTopic=([^ ]+)', log_line).group(1)
                                msg_hash = re.search(r'hash=([^ ]+)', log_line).group(1)

                                if 'published' in log_line:
                                    msg_publishTime = int(re.search(r'publishTime=([\d]+)', log_line).group(1))
                                    tss.append(msg_publishTime)

                                    node_logs[node_id]['published'].append([msg_publishTime, msg_topics, msg_topic, msg_hash])

                                    if msg_hash not in msgs_dict:
                                        msgs_dict[msg_hash] = {'published' : [{'ts' : msg_publishTime, 'node_id' : node_id}], 'received' : []}
                                    else:
                                        msgs_dict[msg_hash]['published'].append({'ts' : msg_publishTime, 'node_id' : node_id})

                                elif 'received' in log_line:
                                    msg_receivedTime = int(re.search(r'receivedTime=([\d]+)', log_line).group(1))
                                    tss.append(msg_receivedTime)

                                    node_logs[node_id]['received'].append([msg_receivedTime, msg_topics, msg_topic, msg_hash])

                                    if msg_hash not in msgs_dict:
                                        msgs_dict[msg_hash] = {'published' : [], 'received' : [{'ts' : msg_receivedTime, 'node_id' : node_id}]}
                                    else:
                                        msgs_dict[msg_hash]['received'].append({'ts' : msg_receivedTime, 'node_id' : node_id})

                    G_LOGGER.debug('Parsed node \"%s\" log in %s/output.log' %(node_id, log_path))
    except Exception as e:
        G_LOGGER.error('%s: %s' % (e.__doc__, e))
        sys.exit()

    # Compute the simulation time window
    simulation_start_ts = min(tss)
    simulation_end_ts = max(tss)
    simulation_time_ms = round((simulation_end_ts - simulation_start_ts) / 1000000)
    G_LOGGER.info('Simulation started at %d and ended at %d. Effective simulation time was %d ms.' %(simulation_start_ts, simulation_end_ts, simulation_time_ms))

    # Compute message delivery
    total_messages = len(injected_msgs_dict)
    delivered_messages = len(msgs_dict)
    lost_messages = total_messages - delivered_messages
    delivery_rate = delivered_messages * 100 / total_messages
    G_LOGGER.info('%d of %d messages delivered. Lost: %d. Delivery rate: %.2f%%' %(delivered_messages, total_messages, lost_messages, delivery_rate))

    # Compute message latencies and propagation times throughout the network
    pbar = tqdm(msgs_dict.items())
    for msg_id, msg_data in pbar:

        # NOTE: Careful here, as this assumes that every message is published exactly once ...
        if len(msg_data['published']) > 1:
            G_LOGGER.warning('Several publishers of message %s' %msg_id)

        published_ts = int(msg_data['published'][0]['ts'])
        node_id = msg_data['published'][0]['node_id']

        pbar.set_description('Computing latencies of message %s' %msg_id)

        # Compute latencies
        latencies = []
        for received_data in msg_data['received']:
            # Skip reception by the publishing node itself
            if received_data['node_id'] == node_id:
                continue
            # NOTE: We are getting some negative latencies, i.e. the message appears to be received
            # before it was sent. I assume these come from the nodes that got the message injected
            # in the first place. TLDR: it should be safe to ignore the negative latencies.
            latency = int(received_data['ts']) - published_ts
            latencies.append(latency)

        msgs_dict[msg_id]['latencies'] = latencies
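    # As per the NOTE above, negative latencies can show up for the node(s) where a message was
    # originally injected; they are currently kept. If one wanted to drop them before computing
    # propagation times, a minimal (not enabled) sketch could be:
    #
    #   for msg_id, msg_data in msgs_dict.items():
    #       msgs_dict[msg_id]['latencies'] = [l for l in msg_data['latencies'] if l >= 0]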
    msg_propagation_times = []
    pbar = tqdm(msgs_dict.items())
    for msg_id, msg_data in pbar:
        pbar.set_description('Computing propagation time of message %s' %msg_id)
        msg_propagation_times.append(round(max(msg_data['latencies'])/1000000))

    # Fetch hardware metrics from the node containers
    cpu_usage = []
    memory_usage = []
    pbar = tqdm(node_logs.items())
    for node in pbar:
        pbar.set_description('Fetching hardware stats from container %s' %node[1]['container_id'])
        container_stats = fetch_cadvisor_stats_from_container(node[1]['container_id'], simulation_start_ts, simulation_end_ts)
        # NOTE: Here we could also choose a different statistic, such as the mean, instead of the max
        cpu_usage.append(max(container_stats['cpu_usage']))
        memory_usage.append(max(container_stats['memory_usage']))

    # Generate Figure
    plot_figure(msg_propagation_times, cpu_usage, memory_usage)

    # Generate summary
    # generate_summary()

    """ We are done """
    G_LOGGER.info('Ended')

if __name__ == "__main__":

    main()
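# Typical invocation (assuming this file is saved as analysis.py and the default paths
# above point at an existing generated topology and Kurtosis log dump):
#
#   python3 analysis.py --simulation_path ./wakurtosis_logs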