In [None]:
%load_ext autotime
%matplotlib inline
import string
import sqlite3
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import snappy
from scipy.interpolate import make_interp_spline
from pathlib import Path
from io import StringIO

**Database connection:**

In [None]:
# Path to the SQLite database file that idx() queries for validator indices.
database_dir = "../build/data/mainnetValidatorDb/validatorDb.sqlite3"

# sqlite3.connect() does not complain about a missing file: it creates a new,
# empty database, and the mistake only surfaces later as a confusing
# "no such table" error. Fail early with a clear message instead.
if not Path(database_dir).is_file():
    raise FileNotFoundError(f"The validators database file '{database_dir}' does not exist.")

connection = sqlite3.connect(database_dir)

**Rewards and penalties components:**

In [None]:
# Names of the rewards and penalties components.
SOURCE = "source"
TARGET = "target"
HEAD = "head"
INCLUSION_DELAY = "inclusion_delay"
SYNC_COMMITTEE = "sync_committee"

# Names of the CSV columns, in the order in which they are passed to the CSV
# reader. Every component contributes an "<component>_outcome" column
# immediately followed by a "max_<component>_reward" column; the remaining
# columns come after these pairs.
CSV_DATA_COLUMNS_NAMES = [
    column_name
    for component in (SOURCE, TARGET, HEAD, INCLUSION_DELAY, SYNC_COMMITTEE)
    for column_name in (f"{component}_outcome", f"max_{component}_reward")
] + [
    "proposer_outcome",
    "inactivity_penalty",
    "slashing_outcome",
    "deposits",
    "inclusion_delay",
]

**Helper functions:**

In [None]:
def valid_public_key(public_key):
    """Tells whether a string is a hex representation of an Eth2 validator's public key.

    The string must consist of exactly 96 hexadecimal digits (in any letter
    case) and must not carry a "0x" prefix.
    """
    has_expected_length = len(public_key) == 96
    return has_expected_length and all(digit in string.hexdigits for digit in public_key)

def idx(public_key):
    """Returns validator's index by its public key.

    Args:
        public_key: hex representation of the public key (96 hex digits),
            optionally prefixed with "0x".

    Returns:
        The `validator_index` value of the only row of the `validators_raw`
        table whose `pubkey` equals the given key.

    Raises:
        ValueError: if the string is not a valid public key, or if the
            database contains no validator or several validators with it.
    """
    if public_key.startswith("0x"):
        public_key = public_key[2:]

    if not valid_public_key(public_key):
        raise ValueError(f"The string '{public_key}' is not a valid public key of a validator.")

    QUERY_FIELD = "validator_index"
    # The key is bound as a BLOB parameter rather than spliced into the SQL
    # text as an x'...' literal, so the statement never contains user input.
    query = f"SELECT {QUERY_FIELD} FROM validators_raw WHERE pubkey = ?;"
    query_result = pd.read_sql_query(
        query, connection, params = (bytes.fromhex(public_key),))

    matches = query_result[QUERY_FIELD]

    if len(matches) == 0:
        raise ValueError(f"Not found a validator with a public key '{public_key}'.")

    if len(matches) > 1:
        raise ValueError(f"Found multiple validators with a public key '{public_key}'.")

    return matches[0]

**Input parameters:**

In [None]:
# Requested range of epochs. Both bounds are overwritten later by
# adjust_constraints() with the bounds of the files that were actually found.
start_epoch = 1
end_epoch = 94275
# Directory with the compacted epoch info files.
files_dir = "../build/data/mainnetCompactedValidatorDb/"
# Components that are taken into account when the losses are calculated.
rewards = [SOURCE, TARGET, HEAD, INCLUSION_DELAY, SYNC_COMMITTEE]
# Named sets of validator indices to compare. A validator can be given
# directly by its index or by its public key, which idx() resolves to an index.
validators_sets = {
    "set1": list(range(10)),
    "set2": [idx(public_key) for public_key in (
        "0x8efba2238a00d678306c6258105b058e3c8b0c1f36e821de42da7319c4221b77aa74135dab1860235e19d6515575c381",
        "0xa2dce641f347a9e46f58458390e168fa4b3a0166d74fc495457cb00c8e4054b5d284c62aa0d9578af1996c2e08e36fb6",
        "0x98b7d0eac7ab95d34dbf2b7baa39a8ec451671328c063ab1207c2055d9d5d6f1115403dc5ea19a1111a404823bd9a6e9",
        "0xb0fd08e2e06d1f4d90d0d6843feb543ebeca684cde397fe230e6cdf6f255d234f2c26f4b36c07170dfdfcbbe355d0848",
        "0xab7a5aa955382906be3d76e322343bd439e8690f286ecf2f2a7646363b249f5c133d0501d766ccf1aa1640f0283047b3",
        "0x980c0c001645a00b71c720935ce193e1ed0e917782c4cb07dd476a4fdb7decb8d91daf2770eb413055f0c1d14b5ed6df",
        "0xac7cbdc535ce8254eb9cdedf10d5b1e75de4cd5e91756c3467d0492b01b70b5c6a81530e9849c6b696c8bc157861d0c3",
        "0x98ea289db7ea9714699ec93701a3b6db43900e04ae5497be01fa8cc5a56754c23589eaf1f674de718e291376f452d68c",
        "0x92451d4c099e51f54ab20f5c1a4edf405595c60122ccfb0f39250b7e80986fe0fe457bacd8a887e9087cd6fc323f492c",
        "0xa06f6c678f0129aec056df309a4fe18760116ecaea2292947c5a9cc997632ff437195309783c269ffca7bb2704e675a0",
    )],
    "set3": list(range(20, 30)),
}

**Loading the data and losses calculation:**

In [None]:
# Names of the compacted epoch info files: "<first epoch>_<last epoch>.epoch",
# where both epochs are written with exactly 8 decimal digits.
COMPACTED_EPOCH_INFO_FILE_PATTERN = re.compile(r"(\d{8})\_(\d{8})\.epoch")

def get_first_and_last_epoch(file_name):
    """Extracts the first and the last epoch from the name of a compacted epoch info file.

    Only the beginning of the name is matched against the pattern, so any
    characters that follow ".epoch" are ignored.

    Returns:
        A `(first_epoch, last_epoch)` tuple of integers, or `None` when the
        name does not start with the expected pattern.
    """
    m = COMPACTED_EPOCH_INFO_FILE_PATTERN.match(file_name)
    if m is None:
        return None
    return int(m.group(1)), int(m.group(2))

def isEpochInfoFile(file_name):
    """Checks whether a file is a compacted epoch info file relevant to the analysis.

    A file is accepted when its name follows the compacted epoch info file
    pattern, its first epoch is not greater than its last epoch, and its
    range of epochs overlaps the global `[start_epoch, end_epoch]` range.
    """
    epochs = get_first_and_last_epoch(file_name)
    if epochs is None:
        return False
    file_start_epoch, file_end_epoch = epochs
    if file_start_epoch > file_end_epoch:
        return False
    # The two ranges overlap unless the file ends before the requested range
    # begins or starts after it ends.
    return file_end_epoch >= start_epoch and file_start_epoch <= end_epoch

def adjust_constraints(sorted_file_names):
    """Aligns the global epoch constraints with the files that are going to be processed.

    Sets the globals:
      * `start_epoch` - the first epoch of the first file;
      * `end_epoch` - the last epoch of the last file;
      * `resolution` - the number of epochs covered by the first file.

    Args:
        sorted_file_names: names of compacted epoch info files in ascending order.

    Raises:
        ValueError: if the list of file names is empty.
    """
    global start_epoch, end_epoch, resolution

    if len(sorted_file_names) == 0:
        # Without this check the failure would be a bare "list index out of
        # range", which says nothing about the actual cause.
        raise ValueError("No compacted epoch info files were found for the requested range of epochs.")

    first_start_epoch, first_end_epoch = get_first_and_last_epoch(sorted_file_names[0])
    _, last_end_epoch = get_first_and_last_epoch(sorted_file_names[-1])
    start_epoch = first_start_epoch
    end_epoch = last_end_epoch
    resolution = first_end_epoch - first_start_epoch + 1

def read_csv(file_path):
    """Loads a Snappy-compressed CSV file into a DataFrame.

    The file is read as bytes, decompressed and decoded as UTF-8 text. The
    columns are named according to `CSV_DATA_COLUMNS_NAMES`, and only the
    columns at positions 0-9 are loaded.

    Args:
        file_path: path object of the file; it must provide `read_bytes()`.
    """
    compressed_bytes = file_path.read_bytes()
    csv_text = snappy.decompress(compressed_bytes).decode("utf-8")
    return pd.read_csv(
        StringIO(csv_text),
        names = CSV_DATA_COLUMNS_NAMES,
        usecols = set(range(0, 10)))

def get_outcome_var(component):
    """Returns the name of the column with the actual outcome of a component."""
    suffix = "_outcome"
    return component + suffix

def get_max_reward_var(component):
    """Returns the name of the column with the maximal reward of a component."""
    prefix, suffix = "max_", "_reward"
    return prefix + component + suffix

# Column names of the maximal rewards and of the actual outcomes of the
# components selected in `rewards`, in the same order as `rewards`.
max_reward_vars = list(map(get_max_reward_var, rewards))
outcome_vars = list(map(get_outcome_var, rewards))

def sum_max_values(t):
    """Adds up the attributes of `t` that are named in `max_reward_vars`."""
    total = 0
    for column_name in max_reward_vars:
        total += getattr(t, column_name)
    return total

def sum_actual_values(t):
    """Adds up the attributes of `t` that are named in `outcome_vars`."""
    total = 0
    for column_name in outcome_vars:
        total += getattr(t, column_name)
    return total

def compute_losses_median(data):
    """Returns the median of the per-row losses of a DataFrame.

    The loss of a row is the sum of its `max_reward_vars` columns minus the
    sum of its `outcome_vars` columns.
    """
    losses = data[max_reward_vars].sum(axis = 1) - data[outcome_vars].sum(axis = 1)
    return losses.median(axis = 0)

# Results of the calculation, filled in by the loop below. The per-set
# dictionaries map a set name to a dictionary keyed by epoch point (the
# position of a file on the epoch axis, in units of `resolution`).
total_losses_per_epoch_point = {set_name: {} for set_name in validators_sets}
average_losses_per_epoch_point = {set_name: {} for set_name in validators_sets}
# For every set: the row positions of its validators that are present in the
# files read so far.
validators_sets_queries = {set_name: [] for set_name in validators_sets}
# Median loss over all the rows of a file, keyed by epoch point.
medians = {}

file_names = sorted(file_name for file_name in os.listdir(files_dir)
                    if isEpochInfoFile(file_name))
adjust_constraints(file_names)

# Sorted, duplicate-free validator indices of every set. Scanning these short
# lists for newly appeared validators is much cheaper than testing every new
# row index of a file for membership in a list.
sorted_validators_sets = {set_name: sorted(set(validators))
                          for set_name, validators in validators_sets.items()}

previous_validators_count = 0
for file_name in file_names:
    # Joining through Path also works when `files_dir` has no trailing slash.
    data = read_csv(Path(files_dir) / file_name)
    file_first_epoch, file_last_epoch = get_first_and_last_epoch(file_name)
    file_epoch_range = file_last_epoch - file_first_epoch + 1
    epoch_point = file_first_epoch // resolution
    validators_count = len(data.index)
    for set_name, validators in sorted_validators_sets.items():
        set_rows = validators_sets_queries[set_name]
        # Rows are selected by position, i.e. a validator index is used as a
        # row number (assumes the files list validators in index order - TODO
        # confirm). Only the validators whose rows first appear in this file
        # are added; the earlier ones are already in the list.
        set_rows.extend(validator for validator in validators
                        if previous_validators_count <= validator < validators_count)
        sums = data.iloc[set_rows].sum(axis = 0)
        difference = sum_max_values(sums) - sum_actual_values(sums)
        set_validators_count = len(set_rows)
        average_losses_per_epoch_point[set_name][epoch_point] = \
            difference / set_validators_count if set_validators_count > 0 else 0
        # Scaled by the number of epochs covered by the file (presumably the
        # file stores per-epoch values - verify against the file producer).
        total_losses_per_epoch_point[set_name][epoch_point] = difference * file_epoch_range
    medians[epoch_point] = compute_losses_median(data)
    previous_validators_count = validators_count


**Average losses graph:** 

In [None]:
plt.subplots(figsize = (20, 5))
plt.title("Average losses per epoch")
plt.xlabel("Epoch")
plt.ylabel("Gwei")

# 100 interpolated samples for every `resolution` epochs of the range.
num_samples = (end_epoch - start_epoch + 1) // resolution * 100

# The default (cubic) spline built by make_interp_spline needs at least this
# many data points.
MIN_SPLINE_POINTS = 4

def plot(set_name, set_values):
    """Draws the curve of one series on the current axes.

    Args:
        set_name: label of the curve in the legend.
        set_values: dictionary mapping an epoch point to the value at it.

    Every epoch point is placed at the middle of its range of epochs. The
    values are smoothed with an interpolating spline; when there are too few
    points to build it, the raw points are drawn instead.
    """
    epochs = np.array([ep * resolution + resolution // 2 for ep in set_values.keys()])
    values = np.array(list(set_values.values()))
    if len(epochs) < MIN_SPLINE_POINTS:
        # Too few points for a cubic spline: show the data as is rather than
        # failing inside make_interp_spline.
        plt.plot(epochs, values, marker = "o", label = set_name)
        return
    spline = make_interp_spline(epochs, values)
    x = np.linspace(epochs.min(), epochs.max(), num_samples)
    y = spline(x)
    plt.plot(x, y, label=set_name)

for name, value in average_losses_per_epoch_point.items():
    plot(name, value)

plot("median", medians)

# The trailing semicolon keeps the Legend repr out of the cell output.
plt.legend(loc="best");

**Total losses:**

In [None]:
# Total losses of every set: the sum of its losses over all the epoch points.
sets_total_losses = {
    set_name: sum(epoch_points.values())
    for set_name, epoch_points in total_losses_per_epoch_point.items()
}

set_names = list(sets_total_losses.keys())
# Scaled by 1e-9 to match the "Ethers" label of the axis.
losses_in_ethers = [loss * 1e-9 for loss in sets_total_losses.values()]

plt.title("Total losses")
plt.xlabel("Set")
plt.ylabel("Ethers")
plt.bar(set_names, losses_in_ethers)
print(sets_total_losses)