diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 000000000..84deb2867 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,40 @@ +# Utility scripts + +## block-import-stats.py + +This script compares outputs from two `nimbus import --debug-csv-stats`, a +baseline and a contender. + +To use it, set up a virtual environment: + +```bash +# Create a venv for the tool +python -m venv stats +. stats/bin/activate +pip install -r requirements.txt + +python block-import-stats.py +``` + +* Generate a baseline version by processing a long range of blocks using + `nimbus import` +* Modify your code and commit to git (to generate a unique identifier for the code) +* Re-run the same import over the range of blocks of interest, saving the import + statistics to a new CSV +* Pass the two CSV files to the script + +By default, the script will skip block numbers below 500k since these are mostly +uninteresting. + +See `-h` for help text on running the script. + +### Testing a particular range of blocks + +As long as block import is run on similar hardware, each run can be saved for +future reference using the git hash. + +The block import can be run repeatedly with `--max-blocks` to stop after +processing a number of blocks - by copying the state at that point, one can +resume or replay the import of a particular block range. + +See `make_states.sh` for such an example. 
diff --git a/scripts/block-import-stats.py b/scripts/block-import-stats.py new file mode 100644 index 000000000..fc3c6028c --- /dev/null +++ b/scripts/block-import-stats.py @@ -0,0 +1,137 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import os + +import argparse + +plt.rcParams["figure.figsize"] = [40, 30] + +from pandas.plotting import register_matplotlib_converters + +register_matplotlib_converters() + + +def readStats(name: str, min_block_number: int): + df = pd.read_csv(name).convert_dtypes() + df = df[df.block_number >= min_block_number] + df.set_index("block_number", inplace=True) + df.time /= 1000000000 + df.drop(columns=["gas"], inplace=True) + df["bps"] = df.blocks / df.time + df["tps"] = df.txs / df.time + return df + + +def prettySecs(s: float): + sa = abs(int(s)) + ss = sa % 60 + m = sa // 60 % 60 + h = sa // (60 * 60) + sign = "" if s >= 0 else "-" + + if h > 0: + return f"{sign}{h}h{m}m{ss}s" + elif m > 0: + return f"{sign}{m}m{ss}s" + else: + return f"{sign}{ss}s" + + +def formatBins(df: pd.DataFrame, bins: int): + if bins > 0: + bins = np.linspace( + df.block_number.iloc[0], df.block_number.iloc[-1], bins, dtype=int + ) + return df.groupby(pd.cut(df["block_number"], bins), observed=True) + else: + return df + + +parser = argparse.ArgumentParser() +parser.add_argument("baseline") +parser.add_argument("contender") +parser.add_argument("--plot", action="store_true") +parser.add_argument( + "--bins", + default=10, + type=int, + help="Number of bins to group block ranges into in overview, 0=all rows", +) +parser.add_argument( + "--min-block-number", + default=500000, + type=int, + help="Skip blocks below the given number", +) +args = parser.parse_args() + +baseline = readStats(args.baseline, args.min_block_number) +contender = readStats(args.contender, args.min_block_number) + +# Pick out the rows to match - a more sophisticated version of this would +# interpolate, perhaps - also, maybe should check for non-matching 
block/tx counts +df = baseline.merge(contender, on=("block_number", "blocks", "txs")) + +df["bpsd"] = (df.bps_y - df.bps_x) / df.bps_x +df["tpsd"] = (df.tps_y - df.tps_x) / df.tps_x +df["timed"] = (df.time_y - df.time_x) / df.time_x + +df.reset_index(inplace=True) + +if args.plot: + plt.rcParams["axes.grid"] = True + + fig = plt.figure() + bps = fig.add_subplot(2, 2, 1, title="Blocks per second (more is better)") + bpsd = fig.add_subplot(2, 2, 2, title="Difference (>0 is better)") + tps = fig.add_subplot(2, 2, 3, title="Transactions per second (more is better)") + tpsd = fig.add_subplot(2, 2, 4, title="Difference (>0 is better)") + + bps.plot(df.block_number, df.bps_x.rolling(3).mean(), label="baseline") + bps.plot(df.block_number, df.bps_y.rolling(3).mean(), label="contender") + + bpsd.plot(df.block_number, df.bpsd.rolling(3).mean()) + + tps.plot(df.block_number, df.tps_x.rolling(3).mean(), label="baseline") + tps.plot(df.block_number, df.tps_y.rolling(3).mean(), label="contender") + + tpsd.plot(df.block_number, df.tpsd.rolling(3).mean()) + + bps.legend() + tps.legend() + + fig.subplots_adjust(bottom=0.05, right=0.95, top=0.95, left=0.05) + plt.show() + + +print(f"{os.path.basename(args.baseline)} vs {os.path.basename(args.contender)}") +print( + formatBins(df, args.bins) + .agg( + dict.fromkeys( + ["bps_x", "bps_y", "tps_x", "tps_y", "bpsd", "tpsd", "timed"], "mean" + ), + ) + .to_string( + formatters=dict( + dict.fromkeys(["bpsd", "tpsd", "timed"], "{:,.2%}".format), + **dict.fromkeys(["bps_x", "bps_y", "tps_x"], "{:,.2f}".format), + ) + ) +) + +print( + f"\nblocks: {df.blocks.sum()}, baseline: {prettySecs(df.time_x.sum())}, contender: {prettySecs(df.time_y.sum())}" +) +print(f"bpsd (mean): {df.bpsd.mean():.2%}") +print(f"tpsd (mean): {df.tpsd.mean():.2%}") +print( + f"Time (sum): {prettySecs(df.time_y.sum()-df.time_x.sum())}, {df.timed.mean():.2%}" +) + +print() +print( + "bpsd = blocks per sec diff (+), tpsd = txs per sec diff, timed = time to process diff 
(-)" +) +print("+ = more is better, - = less is better") diff --git a/scripts/make_states.sh b/scripts/make_states.sh new file mode 100755 index 000000000..90cb42c25 --- /dev/null +++ b/scripts/make_states.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Create a set of states, each advanced by 100k blocks + +set -e + +trap "exit" INT + +if [ -z "$3" ] + then + echo "Syntax: make_states.sh datadir era1dir statsdir" + exit 1; +fi + +counter=0 + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +DATE="$(date -u +%Y%m%d_%H%M)" +REV=$(git rev-parse --short=8 HEAD) + +while true; +do + "$SCRIPT_DIR/../build/nimbus" import \ + --data-dir:"$1/${DATE}-${REV}" \ + --era1-dir:"$2" \ + --debug-csv-stats:"$3/stats-${DATE}-${REV}.csv" \ + --max-blocks:100000 + cp -ar "$1/${DATE}-${REV}" "$1/${DATE}-${REV}"-$(printf "%04d" $counter) + counter=$((counter+1)) +done diff --git a/scripts/requirements.in b/scripts/requirements.in new file mode 100644 index 000000000..6182dc317 --- /dev/null +++ b/scripts/requirements.in @@ -0,0 +1,3 @@ +pandas +matplotlib + diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 000000000..d8142bbc8 --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1,41 @@ +alabaster==0.7.16 +attrs==23.2.0 +Babel==2.15.0 +cattrs==23.2.3 +certifi==2024.2.2 +charset-normalizer==3.3.2 +contourpy==1.2.1 +cycler==0.12.1 +docutils==0.20.1 +esbonio==0.16.4 +fonttools==4.53.0 +idna==3.7 +imagesize==1.4.1 +Jinja2==3.1.3 +kiwisolver==1.4.5 +lsprotocol==2023.0.1 +MarkupSafe==2.1.5 +matplotlib==3.9.0 +numpy==1.26.4 +packaging==24.0 +pandas==2.2.2 +pillow==10.3.0 +platformdirs==4.2.1 +pygls==1.3.1 +Pygments==2.18.0 +pyparsing==3.1.2 +pyspellchecker==0.8.1 +python-dateutil==2.9.0.post0 +pytz==2024.1 +requests==2.31.0 +six==1.16.0 +snowballstemmer==2.2.0 +Sphinx==7.3.7 +sphinxcontrib-applehelp==1.0.8 +sphinxcontrib-devhelp==1.0.6 +sphinxcontrib-htmlhelp==2.0.5 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==1.0.7 
+sphinxcontrib-serializinghtml==1.1.10 +tzdata==2024.1 +urllib3==2.2.1