wakurtosis/analysis.py

#!/usr/bin/env python3
"""
Description: Wakurtosis simulation analysis

"""

""" Dependencies """
import sys, logging, json, argparse, tomllib, glob, re, requests
from datetime import datetime
from tqdm_loggable.auto import tqdm
from tqdm_loggable.tqdm_logging import tqdm_logging
import matplotlib.pyplot as plt
from scipy import stats

from prometheus_api_client import PrometheusConnect

""" Globals """
G_APP_NAME = 'WLS-ANALYSIS'
G_LOG_LEVEL = 'DEBUG'
G_DEFAULT_CONFIG_FILE = './config/config.json'
G_DEFAULT_CONFIG_FILE = './config/config.json'
G_DEFAULT_TOPOLOGY_PATH = './config/topology_generated'
G_DEFAULT_SIMULATION_PATH = './wakurtosis_logs'
G_DEFAULT_FIG_FILENAME = 'analysis.pdf'
G_DEFAULT_SUMMARY_FILENAME = 'summary.json'
G_LOGGER = None

""" Custom logging formatter """
class CustomFormatter(logging.Formatter):

    # Set different formats for every logging level
    time_name_stamp = "[%(asctime)s.%(msecs)03d] [" + G_APP_NAME + "]"
    FORMATS = {
        logging.ERROR: time_name_stamp + " ERROR in %(module)s.py %(funcName)s() %(lineno)d - %(msg)s",
        logging.WARNING: time_name_stamp + " WARNING - %(msg)s",
        logging.CRITICAL: time_name_stamp + " CRITICAL in %(module)s.py %(funcName)s() %(lineno)d - %(msg)s",
        logging.INFO:  time_name_stamp + " %(msg)s",
        logging.DEBUG: time_name_stamp + " %(funcName)s() %(msg)s",
        'DEFAULT': time_name_stamp + " %(msg)s",
    }

    def format(self, record):
        log_fmt = self.FORMATS.get(record.levelno, self.FORMATS['DEFAULT'])
        formatter = logging.Formatter(log_fmt, '%d-%m-%Y %H:%M:%S')
        return formatter.format(record)

def generate_summary():

    # summary = {
    #     "end_ts" : time.time(),
    #     "params" : config['general'],
    #     "topics" : list(topics_msg_cnt.keys()),
    #     "topics_msg_cnt" : topics_msg_cnt,
    #     "simulation_time" : elapsed_s,
    #     "total_messages" : len()
    # }


    # with open('./summary.json', 'w') as summary_file:
    #     summary_file.write(json.dumps(summary, indent=4))

    G_LOGGER.info('Analsysis sumnmary saved in  %s' %summary)

def plot_figure(msg_propagation_times, cpu_usage, memory_usage):

    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 10))

    ax1.violinplot(msg_propagation_times, showmedians=True)
    ax1.set_title('Message propagation times \n(sample size: %d messages)' %len(msg_propagation_times))
    ax1.set_ylabel('Propagation Time (ms)')
    ax1.spines[['right', 'top']].set_visible(False)
    ax1.axes.xaxis.set_visible(False)

    ax2.violinplot(cpu_usage, showmedians=True)
    ax2.set_title('Maximum CPU usage per Waku node \n(sample size: %d nodes)' %len(cpu_usage))
    ax2.set_ylabel('CPU Cycles')
    ax2.spines[['right', 'top']].set_visible(False)
    ax2.axes.xaxis.set_visible(False)

    ax3.violinplot(memory_usage, showmedians=True)
    ax3.set_title('Maximum memory usage per Waku node \n(sample size: %d nodes)' %len(memory_usage))
    ax3.set_ylabel('Bytes')
    ax3.spines[['right', 'top']].set_visible(False)
    ax3.axes.xaxis.set_visible(False)

    plt.tight_layout()

    figure_path = '%s/%s' %(G_DEFAULT_SIMULATION_PATH, G_DEFAULT_FIG_FILENAME)
    plt.savefig(figure_path, format="pdf", bbox_inches="tight")

    G_LOGGER.info('Figure saved in %s' %figure_path)

# def fetch_cadvisor_stats_from_container(container_id, start_ts, end_ts, prometheus_port=52118):

#     url='http://localhost:%d' %52118

#     try:
#         G_LOGGER.debug('Connecting to Prometheus server in %s' %url)
#         prometheus = PrometheusConnect(url, disable_ssl=True, container_label="container_label_com_docker_container_id=%s" %container_id)
#         print(prometheus)
#     except Exception as e:
#         G_LOGGER.error('%s: %s' % (e.__doc__, e))
#         return None

#     metrics = prometheus.get_label_values("__name__")
#     print(metrics)

#     try:
#         # query = '100 - (avg by(instance) (irate(container_cpu_usage_seconds_total{container_label_com_docker_container_id="<%s>"}[5m])) * 100)' %container_id
#         # query = "container_file_descriptors{process_cpu_seconds_total=\"<%s>\"}" %container_id
#         # result = prometheus.query(query)
#         query = 'process_cpu_seconds_total'
#         result = prometheus.custom_query(query)
#         G_LOGGER.debug('Querying: %s' %query)
#     except Exception as e:
#         G_LOGGER.error('%s: %s' % (e.__doc__, e))
#         return None


#     print('--->', result)

#     return {'cpu_usage' : 0, 'memory_usage' : 0, 'bandwidth_in' : 0, 'bandwidth_out' : 0}

def fetch_cadvisor_summary_from_container(container_id):

    # cAdvisor API URL endpoint
    url = 'http://localhost:8080/api/v2.1/summary/docker/%s' %container_id
    # Note: We can also use the endpoint /stats instead of summary to get timepoints
    G_LOGGER.debug('Fetching summary stats from %s ...' %url)

    # Make an HTTP request to the cAdvisor API to get the summary stats of the container
    try:
        response = requests.get(url)
    except Exception as e:
        G_LOGGER.error('%s: %s' % (e.__doc__, e))
        return

    # Parse the response as JSON
    summary_stats = json.loads(response.text)
    # G_LOGGER.debug(summary_stats)

    return summary_stats

def fetch_cadvisor_stats_from_container(container_id, start_ts, end_ts):

    # cAdvisor API URL endpoint
    url = 'http://localhost:8080/api/v2.1/stats/docker/%s?count=1000' %(container_id)
    # Note: We can also use the endpoint /stats instead of summary to get timepoints
    G_LOGGER.debug('Fetching cAdvisor stats from %s ...' %url)

    # Make an HTTP request to the cAdvisor API to get the summary stats of the container
    try:
        response = requests.get(url)
    except Exception as e:
        G_LOGGER.error('%s: %s' % (e.__doc__, e))
        return

    # Parse the response as JSON
    stats_dict = json.loads(response.text)

    cpu_usage = []
    memory_usage = []
    for stats_obj in stats_dict.values():

        for data_point in stats_obj['stats']:

            # Only take into account data points wihtin the simulation time
            datetime_str = data_point['timestamp']
            # print(datetime_str)
            datetime_obj = datetime.fromisoformat(datetime_str[:-1])
            # print(datetime_obj)
            # timestamp_ns = int(datetime_obj.timestamp() * 1e9)
            # Calculate the total number of seconds and microseconds since the Unix epoch
            unix_seconds = (datetime_obj - datetime(1970, 1, 1)).total_seconds()
            microseconds = datetime_obj.microsecond

            # Convert to nanoseconds
            timestamp_ns = int((unix_seconds * 1e9) + (microseconds * 1e3))

            # if timestamp_ns < start_ts or timestamp_ns > end_ts:
            #     G_LOGGER.debug('Data point %d out of the time window [%d-%d]' %(timestamp_ns, start_ts, end_ts))
            #     continue

            G_LOGGER.debug('Data point %d' %(timestamp_ns))

            # print(data_point['timestamp'])
            # NOTE: This is comes empty. Check in Ubuntu
            # print(data_point['diskio'])
            # print('CPU:', data_point['cpu']['usage']['user'])
            # print('Memory:', data_point['memory']['usage'])
            cpu_usage.append(data_point['cpu']['usage']['user'])
            memory_usage.append(data_point['memory']['usage'])

    print(len(cpu_usage))

    return {'cpu_usage' : cpu_usage, 'memory_usage' : memory_usage}

def main():

    global G_LOGGER

    """ Init Logging """
    G_LOGGER = logging.getLogger(G_APP_NAME)
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(CustomFormatter())
    G_LOGGER.addHandler(handler)

    tqdm_logging.set_level(logging.INFO)

    # Set loglevel from config
    G_LOGGER.setLevel(G_LOG_LEVEL)
    handler.setLevel(G_LOG_LEVEL)

    G_LOGGER.info('Started')

    """ Parse command line args """
    parser = argparse.ArgumentParser()
    parser.add_argument("-sp", "--simulation_path", help="Simulation results path", action="store_true", default=G_DEFAULT_SIMULATION_PATH)
    args = parser.parse_args()

    simulation_path = args.simulation_path

    """ Load Topics Structure """
    topics = set()
    nodes_topics = []
    try:
        tomls = glob.glob('%s/*.toml' %G_DEFAULT_TOPOLOGY_PATH)
        # Index is the node id
        tomls.sort()
        for toml_file in tomls:

            with open(toml_file, mode='rb') as read_file:
                toml_config = tomllib.load(read_file)
                node_topics_str = toml_config['topics']
                topics_list = list(node_topics_str.split(' '))
                nodes_topics.append(topics_list)
                topics.update(topics_list)
    except Exception as e:
        G_LOGGER.error('%s: %s' % (e.__doc__, e))
        sys.exit()

    G_LOGGER.info('Loaded topic structure with %d topic(s) and %d node(s).' %(len(topics), len(nodes_topics)))
    # G_LOGGER.debug(topics)
    # G_LOGGER.debug(nodes_topics)

    """ Load Simulation Messages """
    injected_msgs_dict = {}
    try:
        with open('%s/messages.json' %simulation_path, 'r') as f:
            injected_msgs_dict = json.load(f)
    except Exception as e:
        G_LOGGER.error('%s: %s' % (e.__doc__, e))
        sys.exit()

    G_LOGGER.info('Loaded %d messages.' %len(injected_msgs_dict))
    # G_LOGGER.debug(injected_msgs_dict)

    node_logs = {}
    msgs_dict = {}

    # Helper list with all the timestamps
    tss = []
    try:
        services_log_paths = glob.glob('%s/*--user-service--*' %simulation_path)

        pbar = tqdm(services_log_paths)
        for log_path in pbar:
            with open('%s/spec.json' %log_path, mode='r') as f:
                spec_json = json.load(f)
                if spec_json['Path'] == '/usr/bin/wakunode':
                    node_id = spec_json['Config']['Labels']['com.kurtosistech.id']

                    # container_id = spec_json['Name'][1:]
                    container_id = spec_json['Id']
                    node_logs[node_id] = {'published' : [], 'received' : [], 'container_id' : container_id}

                    pbar.set_description("Parsing log of node %s" %node_id)

                    with open('%s/output.log' %log_path, mode='r') as f:

                        # Process log line by line as a text string
                        for log_line in f:

                            # At this stage we only care about Waku Relay protocol
                            if 'waku.relay' in log_line:

                                msg_topics = re.search(r'topics="([^"]+)"', log_line).group(1)
                                msg_topic = re.search(r'pubsubTopic=([^ ]+)', log_line).group(1)
                                msg_hash = re.search(r'hash=([^ ]+)', log_line).group(1)

                                if 'published' in log_line:
                                    msg_publishTime = int(re.search(r'publishTime=([\d]+)', log_line).group(1))
                                    tss.append(msg_publishTime)

                                    node_logs[node_id]['published'].append([msg_publishTime, msg_topics, msg_topic, msg_hash])

                                    if msg_hash not in msgs_dict:
                                        msgs_dict[msg_hash] = {'published' : [{'ts' : msg_publishTime, 'node_id' : node_id}], 'received' : []}
                                    else:
                                        msgs_dict[msg_hash]['published'].append({'ts' : msg_publishTime, 'node_id' : node_id})

                                elif 'received' in log_line:
                                    msg_receivedTime = int(re.search(r'receivedTime=([\d]+)', log_line).group(1))
                                    tss.append(msg_receivedTime)

                                    node_logs[node_id]['received'].append([msg_receivedTime, msg_topics, msg_topic, msg_hash])

                                    if msg_hash not in msgs_dict:
                                        msgs_dict[msg_hash] = {'published' : [], 'received' : [{'ts' : msg_receivedTime, 'node_id' : node_id}]}
                                    else:
                                        msgs_dict[msg_hash]['received'].append({'ts' : msg_receivedTime, 'node_id' : node_id})

                    G_LOGGER.debug('Parsed node \"%s\" log in %s/output.log' %(node_id, log_path))
    except Exception as e:
        G_LOGGER.error('%s: %s' % (e.__doc__, e))
        sys.exit()

    # Compute simulation time window
    simulation_start_ts = min(tss)
    simulation_end_ts = max(tss)
    simulation_time_ms = round((simulation_end_ts - simulation_start_ts) / 1000000)
    G_LOGGER.info('Simulation started at %d, ended at %d Effective simulation time was %d ms. ' %(simulation_start_ts, simulation_end_ts, simulation_time_ms))

    # Compute message delivery
    total_messages = len(injected_msgs_dict)
    delivered_messages = len(msgs_dict)
    lost_messages = total_messages - delivered_messages
    delivery_rate = delivered_messages * 100 / total_messages

    G_LOGGER.info('%d of %d messages delivered. Lost: %d Delivery rate %.2f%%' %(delivered_messages, total_messages, lost_messages, delivery_rate))

    # Compute message latencies and propagation times througout the network
    pbar = tqdm(msgs_dict.items())
    for msg_id, msg_data in pbar:
        # NOTE: Carefull here as I am assuming that every message is published once ...
        if len(msg_data['published']) > 1:
            G_LOGGER.warning('Several publishers of message %s')

        published_ts = int(msg_data['published'][0]['ts'])
        node_id = msg_data['published'][0]['node_id']

        pbar.set_description('Computing latencies of message %s' %msg_id)

        # Compute latencies
        latencies = []
        for received_data in msg_data['received']:
            # Skip self
            if received_data['node_id'] == node_id:
                continue
            # NOTE: We are getting some negative latencies meaning that the message appears to be received before it was sent ... I assume this must be because those are the nodes that got the message injected in the first place
            #  TLDR: Should be safe to ignore all the negative latencies
            latency = int(received_data['ts']) - published_ts
            node_id = msg_data['published'][0]['node_id']
            latencies.append(latency)

        msgs_dict[msg_id]['latencies'] = latencies

    msg_propagation_times = []
    pbar = tqdm(msgs_dict.items())
    for msg_id, msg_data in pbar:
        pbar.set_description('Computing propagation time of message %s' %msg_id)
        msg_propagation_times.append(round(max(msg_data['latencies'])/1000000))

    # Fetch Hardware metrics from Node containers
    cpu_usage = []
    memory_usage = []
    pbar = tqdm(node_logs.items())
    for node in pbar:
        pbar.set_description('Fetching hardware stats from container %s' %node[1]['container_id'])
        container_stats = fetch_cadvisor_stats_from_container(node[1]['container_id'], simulation_start_ts, simulation_end_ts)
        # NOTE: Here we could also chose a different statistic such as mean or average instead of max
        cpu_usage.append(max(container_stats['cpu_usage']))
        memory_usage.append(max(container_stats['memory_usage']))

    # Generate Figure
    plot_figure(msg_propagation_times, cpu_usage, memory_usage)

    # Generate summary
    # generate_summary()

    """ We are done """
    G_LOGGER.info('Ended')

if __name__ == "__main__":

    main()