Script for comparing csv outputs from block import
parent 1e65093b3e, commit 0c6c84f2ce
@ -0,0 +1,40 @@
# Utility scripts

## block-import-stats.py

This script compares the outputs of two `nimbus import --debug-csv-stats`
runs, a baseline and a contender.

To use it, set up a virtual environment:

```bash
# Create a venv for the tool
python -m venv stats
. stats/bin/activate
pip install -r requirements.txt

python block-import-stats.py
```

* Generate a baseline version by processing a long range of blocks using
  `nimbus import`
* Modify your code and commit it to git (to generate a unique identifier for
  the code)
* Re-run the same import over the range of blocks of interest, saving the
  import statistics to a new CSV
* Pass the two CSV files to the script, which compares them as sketched below
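
At its core, the script lines up matching rows from the two files and looks at
the relative difference in blocks and transactions per second. Below is a
minimal sketch of that comparison, assuming two stats files (the file names
are hypothetical) with the `block_number`, `blocks`, `txs` and `time` columns
that `--debug-csv-stats` writes:

```python
import pandas as pd

def load(name: str) -> pd.DataFrame:
    df = pd.read_csv(name).set_index("block_number")
    df["bps"] = df.blocks / (df.time / 1e9)  # time is recorded in nanoseconds
    return df

baseline = load("baseline.csv")  # hypothetical file names
contender = load("contender.csv")

# Match rows on block range and compare blocks per second side by side;
# a positive mean difference means the contender was faster
df = baseline.merge(contender, on=("block_number", "blocks", "txs"))
print(f"bpsd (mean): {((df.bps_y - df.bps_x) / df.bps_x).mean():.2%}")
```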

By default, the script skips block numbers below 500k since these are mostly
uninteresting.

See `-h` for help text on running the script.

### Testing a particular range of blocks

As long as block import is run on similar hardware, each run can be saved for
future reference using the git hash.

The block import can be run repeatedly with `--max-blocks` to stop after
processing a number of blocks - by copying the state at that point, one can
resume or replay the import of a particular block range.

See `make_states.sh` for such an example.
@ -0,0 +1,137 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

import argparse

plt.rcParams["figure.figsize"] = [40, 30]

from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters()


def readStats(name: str, min_block_number: int):
    df = pd.read_csv(name).convert_dtypes()
    df = df[df.block_number >= min_block_number]
    df.set_index("block_number", inplace=True)
    df.time /= 1000000000  # convert time from nanoseconds to seconds
    df.drop(columns=["gas"], inplace=True)
    df["bps"] = df.blocks / df.time
    df["tps"] = df.txs / df.time
    return df
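
# For reference, the stats CSV is expected to provide at least the
# block_number, blocks, txs, gas and time columns used above.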


def prettySecs(s: float):
    sa = abs(int(s))
    ss = sa % 60
    m = sa // 60 % 60
    h = sa // (60 * 60)
    sign = "" if s >= 0 else "-"

    if h > 0:
        return f"{sign}{h}h{m}m{ss}s"
    elif m > 0:
        return f"{sign}{m}m{ss}s"
    else:
        return f"{sign}{ss}s"
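
# A quick illustration of the formatting:
#   prettySecs(3661) == "1h1m1s"; prettySecs(-61) == "-1m1s"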


def formatBins(df: pd.DataFrame, bins: int):
    # Group rows into evenly spaced block-number bins for the overview table
    if bins > 0:
        bins = np.linspace(
            df.block_number.iloc[0], df.block_number.iloc[-1], bins, dtype=int
        )
        return df.groupby(pd.cut(df["block_number"], bins), observed=True)
    else:
        return df


parser = argparse.ArgumentParser()
parser.add_argument("baseline")
parser.add_argument("contender")
parser.add_argument("--plot", action="store_true")
parser.add_argument(
    "--bins",
    default=10,
    type=int,
    help="Number of bins to group block ranges into in overview, 0=all rows",
)
parser.add_argument(
    "--min-block-number",
    default=500000,
    type=int,
    help="Skip blocks below the given number",
)
args = parser.parse_args()
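
# Example invocation (file names are hypothetical):
#   python block-import-stats.py baseline.csv contender.csv --plot --bins 20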

baseline = readStats(args.baseline, args.min_block_number)
contender = readStats(args.contender, args.min_block_number)

# Pick out the rows to match - a more sophisticated version of this would
# interpolate, perhaps - also, maybe should check for non-matching block/tx counts
df = baseline.merge(contender, on=("block_number", "blocks", "txs"))

# Relative differences between contender (_y) and baseline (_x) columns
df["bpsd"] = (df.bps_y - df.bps_x) / df.bps_x
df["tpsd"] = (df.tps_y - df.tps_x) / df.tps_x
df["timed"] = (df.time_y - df.time_x) / df.time_x

df.reset_index(inplace=True)

if args.plot:
    plt.rcParams["axes.grid"] = True

    fig = plt.figure()
    bps = fig.add_subplot(2, 2, 1, title="Blocks per second (more is better)")
    bpsd = fig.add_subplot(2, 2, 2, title="Difference (>0 is better)")
    tps = fig.add_subplot(2, 2, 3, title="Transactions per second (more is better)")
    tpsd = fig.add_subplot(2, 2, 4, title="Difference (>0 is better)")

    # Rolling means smooth out per-sample noise in the plots
    bps.plot(df.block_number, df.bps_x.rolling(3).mean(), label="baseline")
    bps.plot(df.block_number, df.bps_y.rolling(3).mean(), label="contender")

    bpsd.plot(df.block_number, df.bpsd.rolling(3).mean())

    tps.plot(df.block_number, df.tps_x.rolling(3).mean(), label="baseline")
    tps.plot(df.block_number, df.tps_y.rolling(3).mean(), label="contender")

    tpsd.plot(df.block_number, df.tpsd.rolling(3).mean())

    bps.legend()
    tps.legend()

    fig.subplots_adjust(bottom=0.05, right=0.95, top=0.95, left=0.05)
    plt.show()
print(f"{os.path.basename(args.baseline)} vs {os.path.basename(args.contender)}")
|
||||
print(
|
||||
formatBins(df, args.bins)
|
||||
.agg(
|
||||
dict.fromkeys(
|
||||
["bps_x", "bps_y", "tps_x", "tps_y", "bpsd", "tpsd", "timed"], "mean"
|
||||
),
|
||||
)
|
||||
.to_string(
|
||||
formatters=dict(
|
||||
dict.fromkeys(["bpsd", "tpsd", "timed"], "{:,.2%}".format),
|
||||
**dict.fromkeys(["bps_x", "bps_y", "tps_x"], "{:,.2f}".format),
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
print(
|
||||
f"\nblocks: {df.blocks.sum()}, baseline: {prettySecs(df.time_x.sum())}, contender: {prettySecs(df.time_y.sum())}"
|
||||
)
|
||||
print(f"bpsd (mean): {df.bpsd.mean():.2%}")
|
||||
print(f"tpsd (mean): {df.tpsd.mean():.2%}")
|
||||
print(
|
||||
f"Time (sum): {prettySecs(df.time_y.sum()-df.time_x.sum())}, {df.timed.mean():.2%}"
|
||||
)
|
||||
|
||||
print()
|
||||
print(
|
||||
"bpsd = blocks per sec diff (+), tpsd = txs per sec diff, timed = time to process diff (-)"
|
||||
)
|
||||
print("+ = more is better, - = less is better")
|
|
@ -0,0 +1,30 @@
#!/bin/bash

# Create a set of states, each advanced by 100k blocks
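# Hypothetical example, run from the directory containing this script
# (see the Syntax message below for the argument order):
#   ./make_states.sh data era1 stats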

set -e

trap "exit" INT

if [ -z "$3" ]
then
    echo "Syntax: make_states.sh datadir era1dir statsdir"
    exit 1;
fi

counter=0

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
DATE="$(date -u +%Y%m%d_%H%M)"
REV=$(git rev-parse --short=8 HEAD)

while true;
do
    "$SCRIPT_DIR/../build/nimbus" import \
        --data-dir:"$1/${DATE}-${REV}" \
        --era1-dir:"$2" \
        --debug-csv-stats:"$3/stats-${DATE}-${REV}.csv" \
        --max-blocks:100000
    cp -ar "$1/${DATE}-${REV}" "$1/${DATE}-${REV}"-$(printf "%04d" $counter)
    counter=$((counter+1))
done
@ -0,0 +1,3 @@
pandas
matplotlib
@ -0,0 +1,41 @@
alabaster==0.7.16
attrs==23.2.0
Babel==2.15.0
cattrs==23.2.3
certifi==2024.2.2
charset-normalizer==3.3.2
contourpy==1.2.1
cycler==0.12.1
docutils==0.20.1
esbonio==0.16.4
fonttools==4.53.0
idna==3.7
imagesize==1.4.1
Jinja2==3.1.3
kiwisolver==1.4.5
lsprotocol==2023.0.1
MarkupSafe==2.1.5
matplotlib==3.9.0
numpy==1.26.4
packaging==24.0
pandas==2.2.2
pillow==10.3.0
platformdirs==4.2.1
pygls==1.3.1
Pygments==2.18.0
pyparsing==3.1.2
pyspellchecker==0.8.1
python-dateutil==2.9.0.post0
pytz==2024.1
requests==2.31.0
six==1.16.0
snowballstemmer==2.2.0
Sphinx==7.3.7
sphinxcontrib-applehelp==1.0.8
sphinxcontrib-devhelp==1.0.6
sphinxcontrib-htmlhelp==2.0.5
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.7
sphinxcontrib-serializinghtml==1.1.10
tzdata==2024.1
urllib3==2.2.1