nimbus-eth1/fluffy/scripts/launch_local_testnet.sh
Kim De Mey edc87b702b
Launch Fluffy builds directly from make to avoid compile issue (#1646)
* Launch Fluffy builds directly from make to avoid compile issue

Without this change, builds on latest macos fails when ulimit is
not set to 1024. But it will still cause libbacktrace error to occur
when launching the binaries so it would be still advised to
set it to 1024.

* Fix fluffy local testnet for some macOS systems

And some additional improvements to the script + run the fluffy
nodes at INFO log-level to speed-up the testing time.

* Split up fluffy tests in separate targets

This way the two test binaries can be build and ran
concurrently.
2023-07-21 16:30:22 +02:00

343 lines
9.6 KiB
Bash
Executable File

#!/usr/bin/env bash
# Copyright (c) 2021-2023 Status Research & Development GmbH. Licensed under
# either of:
# - Apache License, version 2.0
# - MIT license
# at your option. This file may not be copied, modified, or distributed except
# according to those terms.
# This script is for a big part a copy of the nimbus-eth2 launch_local_testnet
# script. This script however does not expect fluffy nodes to exit 0 in the good
# case, but instead the json-rpc interface is used to check whether certain
# values are what we expect them to be.
set -e
cd "$(dirname "${BASH_SOURCE[0]}")"/../..
####################
# argument parsing #
####################
GETOPT_BINARY="getopt"
if uname | grep -qi darwin; then
# macOS
GETOPT_BINARY=$(find /opt/homebrew/opt/gnu-getopt/bin/getopt /usr/local/opt/gnu-getopt/bin/getopt 2> /dev/null | head -n1 || true)
[[ -f "$GETOPT_BINARY" ]] || { echo "GNU getopt not installed. Please run 'brew install gnu-getopt'. Aborting."; exit 1; }
fi
! ${GETOPT_BINARY} --test > /dev/null
if [ ${PIPESTATUS[0]} != 4 ]; then
echo '`getopt --test` failed in this environment.'
exit 1
fi
OPTS="h:n:d"
LONGOPTS="help,nodes:,data-dir:,enable-htop,log-level:,base-port:,base-rpc-port:,base-metrics-port:,reuse-existing-data-dir,timeout:,kill-old-processes"
# default values
NUM_NODES="64"
DATA_DIR="local_testnet_data"
USE_HTOP="0"
LOG_LEVEL="INFO"
BASE_PORT="9000"
BASE_METRICS_PORT="8008"
BASE_RPC_PORT="10000"
REUSE_EXISTING_DATA_DIR="0"
TIMEOUT_DURATION="0"
KILL_OLD_PROCESSES="0"
SCRIPTS_DIR="fluffy/scripts/"
print_help() {
cat <<EOF
Usage: $(basename "$0") [OPTIONS] -- [FLUFFY OPTIONS]
E.g.: $(basename "$0") --nodes ${NUM_NODES} --data-dir "${DATA_DIR}" # defaults
-h, --help this help message
-n, --nodes number of nodes to launch (default: ${NUM_NODES})
-d, --data-dir directory where all the node data and logs will end up
(default: "${DATA_DIR}")
--base-port bootstrap node's discv5 port (default: ${BASE_PORT})
--base-rpc-port bootstrap node's RPC port (default: ${BASE_RPC_PORT})
--base-metrics-port bootstrap node's metrics server port (default: ${BASE_METRICS_PORT})
--enable-htop use "htop" to see the fluffy processes without doing any tests
--log-level set the log level (default: ${LOG_LEVEL})
--reuse-existing-data-dir instead of deleting and recreating the data dir, keep it and reuse everything we can from it
--timeout timeout in seconds (default: ${TIMEOUT_DURATION} - no timeout)
--kill-old-processes if any process is found listening on a port we use, kill it (default: disabled)
EOF
}
! PARSED=$(${GETOPT_BINARY} --options=${OPTS} --longoptions=${LONGOPTS} --name "$0" -- "$@")
if [ ${PIPESTATUS[0]} != 0 ]; then
# getopt has complained about wrong arguments to stdout
exit 1
fi
# read getopt's output this way to handle the quoting right
eval set -- "$PARSED"
while true; do
case "$1" in
-h|--help)
print_help
exit
;;
-n|--nodes)
NUM_NODES="$2"
shift 2
;;
-d|--data-dir)
DATA_DIR="$2"
shift 2
;;
--enable-htop)
USE_HTOP="1"
shift
;;
--log-level)
LOG_LEVEL="$2"
shift 2
;;
--base-port)
BASE_PORT="$2"
shift 2
;;
--base-rpc-port)
BASE_RPC_PORT="$2"
shift 2
;;
--base-metrics-port)
BASE_METRICS_PORT="$2"
shift 2
;;
--reuse-existing-data-dir)
REUSE_EXISTING_DATA_DIR="1"
shift
;;
--timeout)
TIMEOUT_DURATION="$2"
shift 2
;;
--kill-old-processes)
KILL_OLD_PROCESSES="1"
shift
;;
--)
shift
break
;;
*)
echo "argument parsing error"
print_help
exit 1
esac
done
# when sourcing env.sh, it will try to execute $@, so empty it
EXTRA_ARGS="$@"
if [[ $# != 0 ]]; then
shift $#
fi
if [[ "$REUSE_EXISTING_DATA_DIR" == "0" ]]; then
rm -rf "${DATA_DIR}"
fi
"${SCRIPTS_DIR}"/makedir.sh "${DATA_DIR}"
HAVE_LSOF=0
# Windows detection
if uname | grep -qiE "mingw|msys"; then
MAKE="mingw32-make"
else
MAKE="make"
which lsof &>/dev/null && HAVE_LSOF=1 || { echo "'lsof' not installed and we need it to check for ports already in use. Aborting."; exit 1; }
fi
# number of CPU cores
if uname | grep -qi darwin; then
NPROC="$(sysctl -n hw.logicalcpu)"
else
NPROC="$(nproc)"
fi
# kill lingering processes from a previous run
if [[ "${HAVE_LSOF}" == "1" ]]; then
for NUM_NODE in $(seq 0 $(( NUM_NODES - 1 ))); do
for PORT in $(( BASE_PORT + NUM_NODE )) $(( BASE_METRICS_PORT + NUM_NODE )) $(( BASE_RPC_PORT + NUM_NODE )); do
for PID in $(lsof -n -i tcp:${PORT} -sTCP:LISTEN -t); do
echo -n "Found old process listening on port ${PORT}, with PID ${PID}. "
if [[ "${KILL_OLD_PROCESSES}" == "1" ]]; then
echo "Killing it."
kill -9 ${PID} || true
else
echo "Aborting."
exit 1
fi
done
done
done
fi
# Build the binaries
BINARIES="fluffy"
TEST_BINARIES="test_portal_testnet"
$MAKE -j ${NPROC} LOG_LEVEL=TRACE ${BINARIES}
$MAKE -j ${NPROC} LOG_LEVEL=INFO ${TEST_BINARIES}
# Kill child processes on Ctrl-C/SIGTERM/exit, passing the PID of this shell
# instance as the parent and the target process name as a pattern to the
# "pkill" command.
cleanup() {
pkill -f -P $$ fluffy &>/dev/null || true
sleep 2
pkill -f -9 -P $$ fluffy &>/dev/null || true
# Delete the binaries we just built, because these are with none default logs.
# TODO: When fluffy gets run time log options a la nimbus-eth2 we can keep
# the binaries around.
for BINARY in ${BINARIES}; do
rm build/${BINARY}
done
}
trap 'cleanup' SIGINT SIGTERM EXIT
# timeout - implemented with a background job
timeout_reached() {
echo -e "\nTimeout reached. Aborting.\n"
cleanup
}
trap 'timeout_reached' SIGALRM
# TODO: This doesn't seem to work in Windows CI as it can't find the process
# with WATCHER_PID when doing the taskkill later on.
if [[ "${TIMEOUT_DURATION}" != "0" ]]; then
export PARENT_PID=$$
( sleep ${TIMEOUT_DURATION} && kill -ALRM ${PARENT_PID} ) 2>/dev/null & WATCHER_PID=$!
fi
PIDS=""
NUM_JOBS=${NUM_NODES}
dump_logs() {
LOG_LINES=20
for LOG in "${DATA_DIR}"/log*.txt; do
echo "Last ${LOG_LINES} lines of ${LOG}:"
tail -n ${LOG_LINES} "${LOG}"
echo "======"
done
}
BOOTSTRAP_NODE=0
BOOTSTRAP_TIMEOUT=5 # in seconds
BOOTSTRAP_ENR_FILE="${DATA_DIR}/node${BOOTSTRAP_NODE}/fluffy_node.enr"
for NUM_NODE in $(seq 0 $(( NUM_NODES - 1 ))); do
NODE_DATA_DIR="${DATA_DIR}/node${NUM_NODE}"
rm -rf "${NODE_DATA_DIR}"
"${SCRIPTS_DIR}"/makedir.sh "${NODE_DATA_DIR}" 2>&1
done
echo "Starting ${NUM_NODES} nodes."
for NUM_NODE in $(seq 0 $(( NUM_NODES - 1 ))); do
# Reset arguments
BOOTSTRAP_ARG=""
NODE_DATA_DIR="${DATA_DIR}/node${NUM_NODE}"
if [[ ${NUM_NODE} != ${BOOTSTRAP_NODE} ]]; then
BOOTSTRAP_ARG="--bootstrap-file=${BOOTSTRAP_ENR_FILE}"
# All nodes but bootstrap node run with log. radius of 254 which should
# result in ~1/4th of the data set stored.
RADIUS_ARG="--radius=static:254"
# Wait for the bootstrap node to write out its enr file
START_TIMESTAMP=$(date +%s)
while [[ ! -f "${BOOTSTRAP_ENR_FILE}" ]]; do
sleep 0.1
NOW_TIMESTAMP=$(date +%s)
if [[ "$(( NOW_TIMESTAMP - START_TIMESTAMP - GENESIS_OFFSET ))" -ge "$BOOTSTRAP_TIMEOUT" ]]; then
echo "Bootstrap node failed to start in ${BOOTSTRAP_TIMEOUT} seconds. Aborting."
dump_logs
exit 1
fi
done
fi
# Running with bits-per-hop of 1 to make the lookups more likely requiring
# to request to nodes over the network instead of having most of them in the
# own routing table.
./build/fluffy \
--listen-address:127.0.0.1 \
--nat:extip:127.0.0.1 \
--log-level="${LOG_LEVEL}" \
--udp-port=$(( BASE_PORT + NUM_NODE )) \
--data-dir="${NODE_DATA_DIR}" \
${BOOTSTRAP_ARG} \
--rpc \
--rpc-address="127.0.0.1" \
--rpc-port="$(( BASE_RPC_PORT + NUM_NODE ))" \
--metrics \
--metrics-address="127.0.0.1" \
--metrics-port="$(( BASE_METRICS_PORT + NUM_NODE ))" \
--table-ip-limit=1024 \
--bucket-ip-limit=24 \
--bits-per-hop=1 \
${RADIUS_ARG} \
${EXTRA_ARGS} \
> "${DATA_DIR}/log${NUM_NODE}.txt" 2>&1 &
if [[ "${PIDS}" == "" ]]; then
PIDS="$!"
else
PIDS="${PIDS},$!"
fi
done
# give the regular nodes time to crash
sleep 5
BG_JOBS="$(jobs | wc -l | tr -d ' ')"
if [[ "${TIMEOUT_DURATION}" != "0" ]]; then
BG_JOBS=$(( BG_JOBS - 1 )) # minus the timeout bg job
fi
if [[ "$BG_JOBS" != "$NUM_JOBS" ]]; then
echo "$(( NUM_JOBS - BG_JOBS )) fluffy instance(s) exited early. Aborting."
dump_logs
exit 1
fi
# launch htop and run until `TIMEOUT_DURATION` or check the nodes and quit.
if [[ "$USE_HTOP" == "1" ]]; then
htop -p "$PIDS"
cleanup
else
# Need to let to settle the network a bit, as currently at start discv5 and
# the Portal networks all send messages at once to the same nodes, causing
# messages to drop when handshakes are going on.
sleep 5
./build/test_portal_testnet --node-count:${NUM_NODES}
FAILED=$?
if [[ "$FAILED" != "0" ]]; then
dump_logs
if [[ "${TIMEOUT_DURATION}" != "0" ]]; then
if uname | grep -qiE "mingw|msys"; then
echo ${WATCHER_PID}
taskkill //F //PID ${WATCHER_PID}
else
pkill -HUP -P ${WATCHER_PID}
fi
fi
exit 1
fi
fi
if [[ "${TIMEOUT_DURATION}" != "0" ]]; then
if uname | grep -qiE "mingw|msys"; then
taskkill //F //PID ${WATCHER_PID}
else
pkill -HUP -P ${WATCHER_PID}
fi
fi