logos-blockchain-specs/mixnet/v2/sim/analysis.py

import itertools
import multiprocessing
import sys
from collections import Counter
from typing import TYPE_CHECKING

import pandas as pd
import seaborn
from matplotlib import pyplot as plt

from adversary import NodeState
from config import Config
from environment import Time
from simulation import Simulation

if TYPE_CHECKING:
    from node import Node

COL_TIME = "Time"
COL_NODE_ID = "Node ID"
COL_MSG_CNT = "Message Count"
COL_SENDER_CNT = "Sender Count"
COL_NODE_STATE = "Node State"
COL_HOPS = "Hops"
COL_EXPECTED = "Expected"
COL_MSG_SIZE = "Message Size"
COL_EGRESS = "Egress"
COL_INGRESS = "Ingress"
COL_SUCCESS_RATE = "Success Rate (%)"


class Analysis:
    def __init__(self, sim: Simulation, config: Config, show_plots: bool = True):
        self.sim = sim
        self.config = config
        self.show_plots = show_plots

    def run(self):
        message_size_df = self.message_size_distribution()
        self.bandwidth(message_size_df)
        self.messages_emitted_around_interval()
        self.messages_in_node_over_time()
        # self.node_states()
        median_hops = self.message_hops()
        self.timing_attack(median_hops)

    def bandwidth(self, message_size_df: pd.DataFrame):
        if not self.show_plots:
            return

        dataframes = []
        nonzero_egresses = []
        nonzero_ingresses = []
        for egress_bandwidths, ingress_bandwidths in zip(self.sim.p2p.measurement.egress_bandwidth_per_sec,
                                                         self.sim.p2p.measurement.ingress_bandwidth_per_sec):
            rows = []
            for node in self.sim.p2p.nodes:
                egress = egress_bandwidths[node] / 1024.0
                ingress = ingress_bandwidths[node] / 1024.0
                rows.append((node.id, egress, ingress))
                if egress > 0:
                    nonzero_egresses.append(egress)
                if ingress > 0:
                    nonzero_ingresses.append(ingress)
            df = pd.DataFrame(rows, columns=[COL_NODE_ID, COL_EGRESS, COL_INGRESS])
            dataframes.append(df)

        times = range(len(dataframes))
        df = pd.concat([df.assign(Time=time) for df, time in zip(dataframes, times)], ignore_index=True)
        df = df.pivot(index=COL_TIME, columns=COL_NODE_ID, values=[COL_EGRESS, COL_INGRESS])
        plt.figure(figsize=(12, 6))
        for column in df.columns:
            marker = "x" if column[0] == COL_INGRESS else "o"
            plt.plot(df.index, df[column], marker=marker, label=column[0])
        plt.title("Egress/ingress bandwidth of each node over time")
        plt.xlabel(COL_TIME)
        plt.ylabel("Bandwidth (KiB/s)")
        plt.ylim(bottom=0)
        # Customize the legend to show only "egress" and "ingress" regardless of node_id
        handles, labels = plt.gca().get_legend_handles_labels()
        by_label = dict(zip(labels, handles))
        plt.legend(by_label.values(), by_label.keys())
        plt.grid(True)

        # Adding descriptions on the right size of the plot
        egress_series = pd.Series(nonzero_egresses)
        ingress_series = pd.Series(nonzero_ingresses)
        desc = (
            f"message: {message_size_df[COL_MSG_SIZE].mean():.0f} bytes\n"
            f"{self.config.description()}\n\n"
            f"[egress(>0)]\nmean: {egress_series.mean():.2f} KiB/s\nmax: {egress_series.max():.2f} KiB/s\n\n"
            f"[ingress(>0)]\nmean: {ingress_series.mean():.2f} KiB/s\nmax: {ingress_series.max():.2f} KiB/s"
        )
        plt.text(1.02, 0.5, desc, transform=plt.gca().transAxes, verticalalignment="center", fontsize=12)
        plt.subplots_adjust(right=0.8)  # Adjust layout to make room for the text

        plt.show()

    def message_size_distribution(self) -> pd.DataFrame:
        df = pd.DataFrame(self.sim.p2p.adversary.message_sizes, columns=[COL_MSG_SIZE])
        print(df.describe())
        return df

    def messages_emitted_around_interval(self) -> (float, float, float):
        # A ground truth that shows how many times each node sent a real message
        truth_df = pd.DataFrame(
            [(node.id, count) for node, count in self.sim.p2p.measurement.original_senders.items()],
            columns=[COL_NODE_ID, COL_MSG_CNT])
        # A result of observing nodes who have sent messages around the promised message interval
        suspected_df = pd.DataFrame(
            [(node.id, self.sim.p2p.adversary.senders_around_interval[node]) for node in
             self.sim.p2p.measurement.original_senders.keys()],
            columns=[COL_NODE_ID, COL_MSG_CNT]
        )

        if self.show_plots:
            width = 0.4
            fig, ax = plt.subplots(figsize=(12, 8))
            ax.bar(truth_df[COL_NODE_ID] - width / 2, truth_df[COL_MSG_CNT], width, label="Ground Truth", color="b")
            ax.bar(truth_df[COL_NODE_ID] + width / 2, suspected_df[COL_MSG_CNT], width, label="Adversary's Inference",
                   color="r")
            ax.set_title("Nodes who generated real messages")
            ax.set_xlabel(COL_NODE_ID)
            ax.set_ylabel(COL_MSG_CNT)
            ax.set_xlim(-1, len(truth_df[COL_NODE_ID]))
            ax.legend()
            plt.tight_layout()
            plt.show()

        # Calculate precision, recall, and F1 score
        truth = set(truth_df[truth_df[COL_MSG_CNT] > 0][COL_NODE_ID])
        suspected = set(suspected_df[suspected_df[COL_MSG_CNT] > 0][COL_NODE_ID])
        true_positives = truth.intersection(suspected)
        precision = len(true_positives) / len(suspected) * 100.0 if len(suspected) > 0 else 0.0
        recall = len(true_positives) / len(truth) * 100.0 if len(truth) > 0 else 0.0
        f1_score = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
        print(f"Precision: {precision:.2f}%, Recall: {recall:.2f}%, F1 Score: {f1_score:.2f}%")
        return precision, recall, f1_score

    def messages_in_node_over_time(self):
        if not self.show_plots:
            return

        dataframes = []
        for time, msg_pools in enumerate(self.sim.p2p.adversary.msg_pools_per_time):
            data = []
            for receiver, msg_pool in msg_pools.items():
                senders = self.sim.p2p.adversary.msgs_received_per_time[time][receiver].keys()
                data.append((time, receiver.id, len(msg_pool), len(senders)))
            df = pd.DataFrame(data, columns=[COL_TIME, COL_NODE_ID, COL_MSG_CNT, COL_SENDER_CNT])
            if not df.empty:
                dataframes.append(df)
        df = pd.concat(dataframes, ignore_index=True)

        msg_cnt_df = df.pivot(index=COL_TIME, columns=COL_NODE_ID, values=COL_MSG_CNT)
        plt.figure(figsize=(12, 6))
        for column in msg_cnt_df.columns:
            plt.plot(msg_cnt_df.index, msg_cnt_df[column], marker=None, label=column)
        plt.title("Messages within each node over time")
        plt.xlabel(COL_TIME)
        plt.ylabel(COL_MSG_CNT)
        plt.ylim(bottom=0)
        plt.grid(True)
        plt.tight_layout()
        plt.show()

        sender_cnt_df = df.pivot(index=COL_TIME, columns=COL_NODE_ID, values=COL_SENDER_CNT)
        plt.figure(figsize=(12, 6))
        for column in sender_cnt_df.columns:
            plt.plot(sender_cnt_df.index, sender_cnt_df[column], marker=None, label=column)
        plt.title("Diversity of senders of messages received by each node over time")
        plt.xlabel(COL_TIME)
        plt.ylabel("# of senders of messages received by each node")
        plt.ylim(bottom=0)
        plt.grid(True)
        plt.tight_layout()
        plt.show()

        plt.figure(figsize=(12, 6))
        df.boxplot(column=COL_SENDER_CNT, by=COL_TIME, medianprops={"color": "red", "linewidth": 2.5})
        plt.title("Diversity of senders of messages received by each node over time")
        plt.suptitle("")
        plt.xticks([])
        plt.xlabel(COL_TIME)
        plt.ylabel("# of senders of messages received by each node")
        plt.ylim(bottom=0)
        plt.grid(axis="x")
        plt.tight_layout()
        plt.show()

    def node_states(self):
        if not self.show_plots:
            return

        rows = []
        for time, node_states in self.sim.p2p.adversary.node_states.items():
            for node, state in node_states.items():
                rows.append((time, node.id, state))
        df = pd.DataFrame(rows, columns=[COL_TIME, COL_NODE_ID, COL_NODE_STATE])

        plt.figure(figsize=(10, 6))
        seaborn.scatterplot(data=df, x=COL_TIME, y=COL_NODE_ID, hue=COL_NODE_STATE,
                            palette={NodeState.SENDING: "red", NodeState.RECEIVING: "blue"})
        plt.title("Node states over time")
        plt.xlabel(COL_TIME)
        plt.ylabel(COL_NODE_ID)
        plt.legend(title=COL_NODE_STATE)
        plt.show()

    def message_hops(self) -> int:
        df = pd.DataFrame(self.sim.p2p.measurement.message_hops.values(), columns=[COL_HOPS])
        print(df.describe())
        if self.show_plots:
            plt.figure(figsize=(6, 6))
            seaborn.boxplot(data=df, y=COL_HOPS, medianprops={"color": "red", "linewidth": 2.5})
            plt.ylim(bottom=0)
            plt.title("The distribution of max hops of single broadcasting")
            plt.show()
        return int(df.median().iloc[0])

    def timing_attack(self, hops_between_layers: int) -> pd.DataFrame:
        success_rates = self.spawn_timing_attacks(hops_between_layers)
        df = pd.DataFrame(success_rates, columns=[COL_SUCCESS_RATE])
        print(df.describe())

        if self.show_plots:
            plt.figure(figsize=(6, 6))
            plt.boxplot(df[COL_SUCCESS_RATE], vert=True, patch_artist=True, boxprops=dict(facecolor="lightblue"),
                        medianprops=dict(color="orange"))
            mean = df[COL_SUCCESS_RATE].mean()
            median = df[COL_SUCCESS_RATE].median()
            plt.axhline(mean, color="red", linestyle="--", linewidth=1, label=f"Mean: {mean:.2f}%")
            plt.axhline(median, color="orange", linestyle="-", linewidth=1, label=f"Median: {median:.2f}%")
            plt.ylabel(COL_SUCCESS_RATE)
            plt.ylim(-5, 105)
            plt.title("Timing attack success rate distribution")
            plt.legend()
            plt.grid(True)
            plt.show()

        return df

    def spawn_timing_attacks(self, hops_between_layers: int) -> list[float]:
        tasks = self.prepare_timing_attack_tasks(hops_between_layers)
        print(f"{len(tasks)} TASKS")

        processes = []
        results = multiprocessing.Manager().list()
        for task in tasks:
            process = multiprocessing.Process(target=self.spawn_timing_attack, args=(task, results))
            process.start()
            processes.append(process)

        for process in processes:
            process.join()

        return list(results)

    def spawn_timing_attack(self, task, results):
        origin_id, receiver, time_received, remaining_hops, observed_hops, suspected_origins, senders = task
        result = self.run_and_evaluate_timing_attack(
            origin_id, receiver, time_received, remaining_hops, observed_hops, suspected_origins, senders
        )
        results.append(result)
        print(f"{len(results)} PROCESSES DONE")

    def prepare_timing_attack_tasks(self, hops_between_layers: int) -> list:
        hops_to_observe = hops_between_layers * (self.config.mixnet.num_mix_layers + 1)
        tasks = []

        for receiver, times_and_msgs in self.sim.p2p.adversary.final_msgs_received.items():
            for time_received, msgs in times_and_msgs.items():
                for sender, time_sent, origin_id in msgs:
                    tasks.append((
                        origin_id, receiver, time_received, hops_to_observe, 0, Counter(), {sender: [time_sent]}
                    ))
                    if len(tasks) >= self.config.adversary.timing_attack_max_targets:
                        return tasks

        return tasks

    def run_and_evaluate_timing_attack(self, origin_id: int, receiver: "Node", time_received: Time,
                                       remaining_hops: int, observed_hops: int, suspected_origins: Counter,
                                       senders: dict["Node", list[Time]] = None) -> float:
        suspected_origins = self.timing_attack_from_receiver(
            receiver, time_received, remaining_hops, observed_hops, suspected_origins, senders
        )
        suspected_origin_ids = {node.id for node in suspected_origins}
        if origin_id in suspected_origin_ids:
            return 1 / len(suspected_origin_ids) * 100.0
        else:
            return 0.0

    def timing_attack_from_receiver(self, receiver: "Node", time_received: Time,
                                    remaining_hops: int, observed_hops: int, suspected_origins: Counter,
                                    senders: dict["Node", list[Time]] = None) -> Counter:
        if remaining_hops <= 0:
            return suspected_origins

        # If all nodes are already suspected, no need to inspect further.
        if len(suspected_origins) == self.config.mixnet.num_nodes:
            return suspected_origins

        # Start inspecting senders who sent messages that were arrived in the receiver at the given time.
        # If the specific sender is given, inspect only that sender to maximize the success rate.
        if senders is None:
            senders = self.sim.p2p.adversary.msgs_received_per_time[time_received][receiver]

        senders = dict(itertools.islice(senders.items(), self.config.adversary.timing_attack_max_pool_size))

        # Inspect each sender who sent messages to the receiver
        for sender, times_sent in senders.items():
            # Calculate the time range where the sender might have received any messages
            # related to the message being traced.
            min_time, max_time = sys.maxsize, 0
            for time_sent in times_sent:
                min_time = min(min_time, time_sent - self.config.mixnet.max_mix_delay)
                max_time = max(max_time, time_sent - self.config.mixnet.min_mix_delay)
                # If the sender is sent the message around the message interval, suspect the sender as the origin.
                if (self.sim.p2p.adversary.is_around_message_interval(time_sent)
                        and observed_hops + 1 >= self.min_hops_to_observe_for_timing_attack()):
                    suspected_origins.update({sender})

            # Track back to each time when that sender might have received any messages.
            for time_sender_received in range(max_time, min_time - 1, -1):
                if time_sender_received < 0:
                    break
                self.timing_attack_from_receiver(
                    sender, time_sender_received, remaining_hops - 1, observed_hops + 1, suspected_origins
                )

        return suspected_origins

    def min_hops_to_observe_for_timing_attack(self) -> int:
        return self.config.mixnet.num_mix_layers + 1