Script for comparing csv outputs from block import
This commit is contained in:
parent
1e65093b3e
commit
0c6c84f2ce
|
@ -0,0 +1,40 @@
|
||||||
|
# Utility scripts
|
||||||
|
|
||||||
|
## block-import-stats.py
|
||||||
|
|
||||||
|
This script compares outputs from two `nimbus import --debug-csv-stats`, a
|
||||||
|
baseline and a contender.
|
||||||
|
|
||||||
|
To use it, set up a virtual environment:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create a venv for the tool
|
||||||
|
python -m venv stats
|
||||||
|
. stats/bin/activate
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
python block-import-stats.py
|
||||||
|
```
|
||||||
|
|
||||||
|
* Generate a baseline version by processing a long range of blocks using
|
||||||
|
`nimbus import`
|
||||||
|
* Modify your code and commit to git (to generate a unique identifier for the code)
|
||||||
|
* Re-run the same import over the range of blocks of interest, saving the import
|
||||||
|
statistics to a new CSV
|
||||||
|
* Pass the two CSV files to the script
|
||||||
|
|
||||||
|
By default, the script will skip block numbers below 500k since these are mostly
|
||||||
|
uninteresting.
|
||||||
|
|
||||||
|
See `-h` for help text on running the script.
|
||||||
|
|
||||||
|
### Testing a particular range of blocks
|
||||||
|
|
||||||
|
As long as block import is run on similar hardware, each run can be saved for
|
||||||
|
future reference using the git hash.
|
||||||
|
|
||||||
|
The block import can be run repeatedly with `--max-blocks` to stop after
|
||||||
|
processing a number of blocks - by copying the state at that point, one can
|
||||||
|
resume or replay the import of a particular block range.
|
||||||
|
|
||||||
|
See `make_states.sh` for such an example.
|
|
@ -0,0 +1,137 @@
|
||||||
|
# Third-party analysis/plotting dependencies (see requirements.txt).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

import argparse

# Large default canvas so the four comparison subplots stay readable.
plt.rcParams["figure.figsize"] = [40, 30]

from pandas.plotting import register_matplotlib_converters

# Let matplotlib plot pandas datetime-like values without warnings.
register_matplotlib_converters()
|
||||||
|
|
||||||
|
|
||||||
|
def readStats(name: str, min_block_number: int):
    """Load one `nimbus import --debug-csv-stats` CSV and derive rates.

    Rows below ``min_block_number`` are dropped, ``block_number`` becomes
    the index, ``time`` is converted from nanoseconds to seconds and two
    derived columns are added: ``bps`` (blocks/s) and ``tps`` (txs/s).
    """
    stats = pd.read_csv(name).convert_dtypes()
    stats = stats[stats.block_number >= min_block_number]
    stats.set_index("block_number", inplace=True)
    # Raw measurements are in nanoseconds.
    stats["time"] = stats["time"] / 1000000000
    # Gas is not used by any downstream comparison.
    stats = stats.drop(columns=["gas"])
    stats["bps"] = stats["blocks"] / stats["time"]
    stats["tps"] = stats["txs"] / stats["time"]
    return stats
|
||||||
|
|
||||||
|
|
||||||
|
def prettySecs(s: float):
    """Format a duration in seconds as a compact [-]XhYmZs string.

    Fractions of a second are truncated; a leading "-" marks negative
    durations. Leading zero-valued units are omitted (e.g. "5s", "1m1s").
    """
    sign = "-" if s < 0 else ""
    total = abs(int(s))
    minutes, seconds = divmod(total, 60)
    hours, minutes = divmod(minutes, 60)

    if hours:
        return f"{sign}{hours}h{minutes}m{seconds}s"
    if minutes:
        return f"{sign}{minutes}m{seconds}s"
    return f"{sign}{seconds}s"
|
||||||
|
|
||||||
|
|
||||||
|
def formatBins(df: pd.DataFrame, bins: int):
    """Group ``df`` into ``bins`` block-number ranges for the overview table.

    Requires a ``block_number`` column (i.e. call after ``reset_index``).
    When ``bins`` is 0 (or negative) the frame is returned ungrouped so
    every row appears in the report.
    """
    if bins <= 0:
        return df
    # Evenly spaced integer edges spanning the observed block range.
    edges = np.linspace(
        df.block_number.iloc[0], df.block_number.iloc[-1], bins, dtype=int
    )
    # observed=True: only emit bins that actually contain rows.
    return df.groupby(pd.cut(df["block_number"], edges), observed=True)
|
||||||
|
|
||||||
|
|
||||||
|
# Command-line interface: compare a baseline stats CSV against a contender.
parser = argparse.ArgumentParser()
parser.add_argument("baseline")
parser.add_argument("contender")
parser.add_argument("--plot", action="store_true")
parser.add_argument(
    "--bins",
    default=10,
    type=int,
    help="Number of bins to group block ranges into in overview, 0=all rows",
)
parser.add_argument(
    "--min-block-number",
    default=500000,
    type=int,
    # Fixed typo in help text ("Skip block blocks").
    help="Skip blocks below the given number",
)
args = parser.parse_args()

baseline = readStats(args.baseline, args.min_block_number)
contender = readStats(args.contender, args.min_block_number)

# Pick out the rows to match - a more sophisticated version of this would
# interpolate, perhaps - also, maybe should check for non-matching block/tx counts
df = baseline.merge(contender, on=("block_number", "blocks", "txs"))

# Relative differences (contender vs baseline): bpsd/tpsd > 0 and timed < 0
# mean the contender is faster.
df["bpsd"] = (df.bps_y - df.bps_x) / df.bps_x
df["tpsd"] = (df.tps_y - df.tps_x) / df.tps_x
df["timed"] = (df.time_y - df.time_x) / df.time_x

# Turn block_number back into a column - formatBins expects it there.
df.reset_index(inplace=True)

if args.plot:
    plt.rcParams["axes.grid"] = True

    fig = plt.figure()
    bps = fig.add_subplot(2, 2, 1, title="Blocks per second (more is better)")
    bpsd = fig.add_subplot(2, 2, 2, title="Difference (>0 is better)")
    tps = fig.add_subplot(2, 2, 3, title="Transactions per second (more is better)")
    tpsd = fig.add_subplot(2, 2, 4, title="Difference (>0 is better)")

    # Rolling mean smooths per-measurement noise in the plotted curves.
    bps.plot(df.block_number, df.bps_x.rolling(3).mean(), label="baseline")
    bps.plot(df.block_number, df.bps_y.rolling(3).mean(), label="contender")

    bpsd.plot(df.block_number, df.bpsd.rolling(3).mean())

    tps.plot(df.block_number, df.tps_x.rolling(3).mean(), label="baseline")
    tps.plot(df.block_number, df.tps_y.rolling(3).mean(), label="contender")

    tpsd.plot(df.block_number, df.tpsd.rolling(3).mean())

    bps.legend()
    tps.legend()

    fig.subplots_adjust(bottom=0.05, right=0.95, top=0.95, left=0.05)
    plt.show()

# Overview table: per-bin means of absolute rates and relative differences.
print(f"{os.path.basename(args.baseline)} vs {os.path.basename(args.contender)}")
print(
    formatBins(df, args.bins)
    .agg(
        dict.fromkeys(
            ["bps_x", "bps_y", "tps_x", "tps_y", "bpsd", "tpsd", "timed"], "mean"
        ),
    )
    .to_string(
        formatters=dict(
            dict.fromkeys(["bpsd", "tpsd", "timed"], "{:,.2%}".format),
            # Include tps_y, which was missing from the fixed-point
            # formatters (all four rate columns are formatted alike now).
            **dict.fromkeys(["bps_x", "bps_y", "tps_x", "tps_y"], "{:,.2f}".format),
        )
    )
)

print(
    f"\nblocks: {df.blocks.sum()}, baseline: {prettySecs(df.time_x.sum())}, contender: {prettySecs(df.time_y.sum())}"
)
print(f"bpsd (mean): {df.bpsd.mean():.2%}")
print(f"tpsd (mean): {df.tpsd.mean():.2%}")
print(
    f"Time (sum): {prettySecs(df.time_y.sum()-df.time_x.sum())}, {df.timed.mean():.2%}"
)

print()
print(
    "bpsd = blocks per sec diff (+), tpsd = txs per sec diff, timed = time to process diff (-)"
)
print("+ = more is better, - = less is better")
|
|
@ -0,0 +1,30 @@
|
||||||
|
#!/bin/bash

# Create a set of states, each advanced by 100k blocks

# Abort the script on the first failing command.
set -e

# Exit the whole loop (not just the current nimbus run) on Ctrl-C.
trap "exit" INT

# All three positional directory arguments are required.
if [ -z "$3" ]
then
    echo "Syntax: make_states.sh datadir era1dir statsdir"
    exit 1;
fi

counter=0

# Directory containing this script, resolved regardless of invocation cwd.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Timestamp + short git revision uniquely name this run's data dir and CSV.
DATE="$(date -u +%Y%m%d_%H%M)"
REV=$(git rev-parse --short=8 HEAD)

# Repeatedly import 100k blocks, snapshotting the data dir after each batch;
# runs until interrupted (or until nimbus fails, via `set -e`).
while true;
do
    "$SCRIPT_DIR/../build/nimbus" import \
        --data-dir:"$1/${DATE}-${REV}" \
        --era1-dir:"$2" \
        --debug-csv-stats:"$3/stats-${DATE}-${REV}.csv" \
        --max-blocks:100000
    # Keep a numbered copy of the state reached after this batch.
    cp -ar "$1/${DATE}-${REV}" "$1/${DATE}-${REV}"-$(printf "%04d" $counter)
    counter=$((counter+1))
done
|
|
@ -0,0 +1,3 @@
|
||||||
|
pandas
|
||||||
|
matplotlib
|
||||||
|
|
|
@ -0,0 +1,41 @@
|
||||||
|
alabaster==0.7.16
|
||||||
|
attrs==23.2.0
|
||||||
|
Babel==2.15.0
|
||||||
|
cattrs==23.2.3
|
||||||
|
certifi==2024.2.2
|
||||||
|
charset-normalizer==3.3.2
|
||||||
|
contourpy==1.2.1
|
||||||
|
cycler==0.12.1
|
||||||
|
docutils==0.20.1
|
||||||
|
esbonio==0.16.4
|
||||||
|
fonttools==4.53.0
|
||||||
|
idna==3.7
|
||||||
|
imagesize==1.4.1
|
||||||
|
Jinja2==3.1.3
|
||||||
|
kiwisolver==1.4.5
|
||||||
|
lsprotocol==2023.0.1
|
||||||
|
MarkupSafe==2.1.5
|
||||||
|
matplotlib==3.9.0
|
||||||
|
numpy==1.26.4
|
||||||
|
packaging==24.0
|
||||||
|
pandas==2.2.2
|
||||||
|
pillow==10.3.0
|
||||||
|
platformdirs==4.2.1
|
||||||
|
pygls==1.3.1
|
||||||
|
Pygments==2.18.0
|
||||||
|
pyparsing==3.1.2
|
||||||
|
pyspellchecker==0.8.1
|
||||||
|
python-dateutil==2.9.0.post0
|
||||||
|
pytz==2024.1
|
||||||
|
requests==2.31.0
|
||||||
|
six==1.16.0
|
||||||
|
snowballstemmer==2.2.0
|
||||||
|
Sphinx==7.3.7
|
||||||
|
sphinxcontrib-applehelp==1.0.8
|
||||||
|
sphinxcontrib-devhelp==1.0.6
|
||||||
|
sphinxcontrib-htmlhelp==2.0.5
|
||||||
|
sphinxcontrib-jsmath==1.0.1
|
||||||
|
sphinxcontrib-qthelp==1.0.7
|
||||||
|
sphinxcontrib-serializinghtml==1.1.10
|
||||||
|
tzdata==2024.1
|
||||||
|
urllib3==2.2.1
|
Loading…
Reference in New Issue