Script for comparing csv outputs from block import

This commit is contained in:
Jacek Sieka 2024-06-06 14:33:49 +02:00
parent 1e65093b3e
commit 0c6c84f2ce
No known key found for this signature in database
GPG Key ID: A1B09461ABB656B8
5 changed files with 251 additions and 0 deletions

40
scripts/README.md Normal file
View File

@ -0,0 +1,40 @@
# Utility scripts
## block-import-stats.py
This script compares the outputs of two `nimbus import --debug-csv-stats` runs, a
baseline and a contender.
To use it, set up a virtual environment:
```bash
# Create a venv for the tool
python -m venv stats
. stats/bin/activate
pip install -r requirements.txt
python block-import-stats.py
```
* Generate a baseline version by processing a long range of blocks using
`nimbus import`
* Modify your code and commit to git (to generate a unique identifier for the code)
* Re-run the same import over the range of blocks of interest, saving the import
statistics to a new CSV
* Pass the two CSV files to the script
By default, the script will skip block numbers below 500k since these are mostly
uninteresting.
See `-h` for help text on running the script.
### Testing a particular range of blocks
As long as block import is run on similar hardware, each run can be saved for
future reference using the git hash.
The block import can be run repeatedly with `--max-blocks` to stop after
processing a number of blocks - by copying the state at that point, one can
resume or replay the import of a particular block range.
See `make_states.sh` for such an example.

View File

@ -0,0 +1,137 @@
# Compare two CSV outputs from `nimbus import --debug-csv-stats`
# (a baseline and a contender) and summarize per-block-range differences.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import argparse

# Large default canvas so the 2x2 subplot grid shown by --plot stays readable.
plt.rcParams["figure.figsize"] = [40, 30]

from pandas.plotting import register_matplotlib_converters

# Teach matplotlib about pandas' datetime/period types before any plotting.
register_matplotlib_converters()
def readStats(name: str, min_block_number: int):
    """Load one `--debug-csv-stats` CSV and derive throughput columns.

    Rows with a block number below `min_block_number` are dropped, the
    frame is indexed by block number, the `time` column is scaled down by
    1e9 (presumably nanoseconds -> seconds), the `gas` column is removed,
    and blocks-per-second (`bps`) / transactions-per-second (`tps`)
    columns are added.
    """
    stats = pd.read_csv(name).convert_dtypes()
    stats = stats[stats.block_number >= min_block_number].set_index("block_number")
    stats["time"] = stats["time"] / 1000000000
    stats = stats.drop(columns=["gas"])
    stats["bps"] = stats["blocks"] / stats["time"]
    stats["tps"] = stats["txs"] / stats["time"]
    return stats
def prettySecs(s: float):
    """Format a duration in seconds as e.g. "1h2m3s", "2m3s" or "3s".

    Fractional seconds are truncated; a negative duration keeps a single
    leading "-" sign.
    """
    total = abs(int(s))
    secs = total % 60
    mins = total // 60 % 60
    hours = total // 3600
    prefix = "-" if s < 0 else ""
    if hours > 0:
        return f"{prefix}{hours}h{mins}m{secs}s"
    if mins > 0:
        return f"{prefix}{mins}m{secs}s"
    return f"{prefix}{secs}s"
def formatBins(df: pd.DataFrame, bins: int):
    """Group `df` into `bins` equal-width block-number ranges.

    With `bins` <= 0 the frame is returned untouched (one row per block
    range); otherwise a GroupBy over `pd.cut` intervals is returned,
    keeping only intervals that actually contain rows.
    """
    if bins <= 0:
        return df
    edges = np.linspace(
        df.block_number.iloc[0], df.block_number.iloc[-1], bins, dtype=int
    )
    return df.groupby(pd.cut(df["block_number"], edges), observed=True)
# Command-line interface: two positional CSV files (baseline and contender),
# as produced by `nimbus import --debug-csv-stats`, plus display options.
parser = argparse.ArgumentParser()
parser.add_argument("baseline")
parser.add_argument("contender")
# Show a 2x2 matplotlib figure in addition to the text summary.
parser.add_argument("--plot", action="store_true")
parser.add_argument(
    "--bins",
    default=10,
    type=int,
    help="Number of bins to group block ranges into in overview, 0=all rows",
)
parser.add_argument(
    "--min-block-number",
    default=500000,
    type=int,
    # Fixed doubled word in the original help text ("Skip block blocks").
    help="Skip blocks below the given number",
)
args = parser.parse_args()
# Load both runs, dropping the uninteresting low block numbers.
baseline = readStats(args.baseline, args.min_block_number)
contender = readStats(args.contender, args.min_block_number)

# Pick out the rows to match - a more sophisticated version of this would
# interpolate, perhaps - also, maybe should check for non-matching block/tx counts
# Joining on block_number (the index) plus blocks/txs keeps only rows where
# both runs covered the same block span with the same transaction count;
# shared columns get the usual _x (baseline) / _y (contender) suffixes.
df = baseline.merge(contender, on=("block_number", "blocks", "txs"))

# Relative differences, contender vs baseline: >0 means the contender
# processed more blocks/txs per second, or spent more time, respectively.
df["bpsd"] = (df.bps_y - df.bps_x) / df.bps_x
df["tpsd"] = (df.tps_y - df.tps_x) / df.tps_x
df["timed"] = (df.time_y - df.time_x) / df.time_x

# Turn block_number back into a regular column for plotting/binning below.
df.reset_index(inplace=True)
if args.plot:
    plt.rcParams["axes.grid"] = True

    fig = plt.figure()

    # 2x2 grid: absolute throughput on the left column, relative
    # contender-vs-baseline differences on the right.
    bps = fig.add_subplot(2, 2, 1, title="Blocks per second (more is better)")
    bpsd = fig.add_subplot(2, 2, 2, title="Difference (>0 is better)")
    tps = fig.add_subplot(2, 2, 3, title="Transactions per second (more is better)")
    tpsd = fig.add_subplot(2, 2, 4, title="Difference (>0 is better)")

    # A 3-row rolling mean smooths per-sample noise in the curves.
    bps.plot(df.block_number, df.bps_x.rolling(3).mean(), label="baseline")
    bps.plot(df.block_number, df.bps_y.rolling(3).mean(), label="contender")

    bpsd.plot(df.block_number, df.bpsd.rolling(3).mean())

    tps.plot(df.block_number, df.tps_x.rolling(3).mean(), label="baseline")
    tps.plot(df.block_number, df.tps_y.rolling(3).mean(), label="contender")

    tpsd.plot(df.block_number, df.tpsd.rolling(3).mean())

    bps.legend()
    tps.legend()

    # Shrink the default margins so the large figure is mostly plot area.
    fig.subplots_adjust(bottom=0.05, right=0.95, top=0.95, left=0.05)
    plt.show()
# Text summary: per-bin means of throughput and relative-difference columns,
# followed by whole-range totals.
print(f"{os.path.basename(args.baseline)} vs {os.path.basename(args.contender)}")
print(
    formatBins(df, args.bins)
    .agg(
        dict.fromkeys(
            ["bps_x", "bps_y", "tps_x", "tps_y", "bpsd", "tpsd", "timed"], "mean"
        ),
    )
    .to_string(
        formatters=dict(
            # Relative-difference columns as percentages...
            dict.fromkeys(["bpsd", "tpsd", "timed"], "{:,.2%}".format),
            # ...and absolute throughput as plain floats. tps_y was missing
            # from this list, leaving one aggregated column unformatted.
            **dict.fromkeys(["bps_x", "bps_y", "tps_x", "tps_y"], "{:,.2f}".format),
        )
    )
)
print(
    f"\nblocks: {df.blocks.sum()}, baseline: {prettySecs(df.time_x.sum())}, contender: {prettySecs(df.time_y.sum())}"
)
print(f"bpsd (mean): {df.bpsd.mean():.2%}")
print(f"tpsd (mean): {df.tpsd.mean():.2%}")
print(
    f"Time (sum): {prettySecs(df.time_y.sum()-df.time_x.sum())}, {df.timed.mean():.2%}"
)
print()
# Legend for the abbreviations used above.
print(
    "bpsd = blocks per sec diff (+), tpsd = txs per sec diff, timed = time to process diff (-)"
)
print("+ = more is better, - = less is better")

30
scripts/make_states.sh Executable file
View File

@ -0,0 +1,30 @@
#!/bin/bash
# Create a set of states, each advanced by 100k blocks
#
# Usage: make_states.sh datadir era1dir statsdir
#   datadir  - base directory for the nimbus data dirs (one per snapshot)
#   era1dir  - directory holding era1 files to import from
#   statsdir - directory where the --debug-csv-stats CSV is written
#
# Each loop iteration imports another 100k blocks into the same data dir,
# then copies the resulting state to a numbered snapshot directory so a
# particular block range can later be resumed or replayed.
set -e
trap "exit" INT

# All three positional arguments are required.
if [ -z "$3" ]
then
  echo "Syntax: make_states.sh datadir era1dir statsdir"
  exit 1;
fi

counter=0
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Date + short git revision make the data dir / CSV names unique per code version.
DATE="$(date -u +%Y%m%d_%H%M)"
REV=$(git rev-parse --short=8 HEAD)

# Runs until interrupted (Ctrl-C); each pass advances the state by 100k blocks.
while true;
do
  "$SCRIPT_DIR/../build/nimbus" import \
    --data-dir:"$1/${DATE}-${REV}" \
    --era1-dir:"$2" \
    --debug-csv-stats:"$3/stats-${DATE}-${REV}.csv" \
    --max-blocks:100000
  # Archive-copy (-a preserves attributes) the state at this block height.
  cp -ar "$1/${DATE}-${REV}" "$1/${DATE}-${REV}"-$(printf "%04d" $counter)
  counter=$((counter+1))
done

3
scripts/requirements.in Normal file
View File

@ -0,0 +1,3 @@
pandas
matplotlib

41
scripts/requirements.txt Normal file
View File

@ -0,0 +1,41 @@
alabaster==0.7.16
attrs==23.2.0
Babel==2.15.0
cattrs==23.2.3
certifi==2024.2.2
charset-normalizer==3.3.2
contourpy==1.2.1
cycler==0.12.1
docutils==0.20.1
esbonio==0.16.4
fonttools==4.53.0
idna==3.7
imagesize==1.4.1
Jinja2==3.1.3
kiwisolver==1.4.5
lsprotocol==2023.0.1
MarkupSafe==2.1.5
matplotlib==3.9.0
numpy==1.26.4
packaging==24.0
pandas==2.2.2
pillow==10.3.0
platformdirs==4.2.1
pygls==1.3.1
Pygments==2.18.0
pyparsing==3.1.2
pyspellchecker==0.8.1
python-dateutil==2.9.0.post0
pytz==2024.1
requests==2.31.0
six==1.16.0
snowballstemmer==2.2.0
Sphinx==7.3.7
sphinxcontrib-applehelp==1.0.8
sphinxcontrib-devhelp==1.0.6
sphinxcontrib-htmlhelp==2.0.5
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.7
sphinxcontrib-serializinghtml==1.1.10
tzdata==2024.1
urllib3==2.2.1