mirror of
https://github.com/status-im/nimbus-eth1.git
synced 2025-01-25 19:50:30 +00:00
edc87b702b
* Launch Fluffy builds directly from make to avoid compile issue Without this change, builds on latest macos fails when ulimit is not set to 1024. But it will still cause libbacktrace error to occur when launching the binaries so it would be still advised to set it to 1024. * Fix fluffy local testnet for some macOS systems And some additional improvements to the script + run the fluffy nodes at INFO log-level to speed-up the testing time. * Split up fluffy tests in separate targets This way the two test binaries can be build and ran concurrently.
343 lines
9.6 KiB
Bash
Executable File
343 lines
9.6 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
|
|
# Copyright (c) 2021-2023 Status Research & Development GmbH. Licensed under
|
|
# either of:
|
|
# - Apache License, version 2.0
|
|
# - MIT license
|
|
# at your option. This file may not be copied, modified, or distributed except
|
|
# according to those terms.
|
|
|
|
# This script is for a big part a copy of the nimbus-eth2 launch_local_testnet
|
|
# script. This script however does not expect fluffy nodes to exit 0 in the good
|
|
# case, but instead the json-rpc interface is used to check whether certain
|
|
# values are what we expect them to be.
|
|
|
|
set -e
|
|
|
|
cd "$(dirname "${BASH_SOURCE[0]}")"/../..
|
|
|
|
####################
|
|
# argument parsing #
|
|
####################
|
|
|
|
GETOPT_BINARY="getopt"
|
|
if uname | grep -qi darwin; then
|
|
# macOS
|
|
GETOPT_BINARY=$(find /opt/homebrew/opt/gnu-getopt/bin/getopt /usr/local/opt/gnu-getopt/bin/getopt 2> /dev/null | head -n1 || true)
|
|
[[ -f "$GETOPT_BINARY" ]] || { echo "GNU getopt not installed. Please run 'brew install gnu-getopt'. Aborting."; exit 1; }
|
|
fi
|
|
|
|
! ${GETOPT_BINARY} --test > /dev/null
|
|
if [ ${PIPESTATUS[0]} != 4 ]; then
|
|
echo '`getopt --test` failed in this environment.'
|
|
exit 1
|
|
fi
|
|
|
|
OPTS="h:n:d"
|
|
LONGOPTS="help,nodes:,data-dir:,enable-htop,log-level:,base-port:,base-rpc-port:,base-metrics-port:,reuse-existing-data-dir,timeout:,kill-old-processes"
|
|
|
|
# default values
|
|
NUM_NODES="64"
|
|
DATA_DIR="local_testnet_data"
|
|
USE_HTOP="0"
|
|
LOG_LEVEL="INFO"
|
|
BASE_PORT="9000"
|
|
BASE_METRICS_PORT="8008"
|
|
BASE_RPC_PORT="10000"
|
|
REUSE_EXISTING_DATA_DIR="0"
|
|
TIMEOUT_DURATION="0"
|
|
KILL_OLD_PROCESSES="0"
|
|
SCRIPTS_DIR="fluffy/scripts/"
|
|
|
|
print_help() {
|
|
cat <<EOF
|
|
Usage: $(basename "$0") [OPTIONS] -- [FLUFFY OPTIONS]
|
|
E.g.: $(basename "$0") --nodes ${NUM_NODES} --data-dir "${DATA_DIR}" # defaults
|
|
|
|
-h, --help this help message
|
|
-n, --nodes number of nodes to launch (default: ${NUM_NODES})
|
|
-d, --data-dir directory where all the node data and logs will end up
|
|
(default: "${DATA_DIR}")
|
|
--base-port bootstrap node's discv5 port (default: ${BASE_PORT})
|
|
--base-rpc-port bootstrap node's RPC port (default: ${BASE_RPC_PORT})
|
|
--base-metrics-port bootstrap node's metrics server port (default: ${BASE_METRICS_PORT})
|
|
--enable-htop use "htop" to see the fluffy processes without doing any tests
|
|
--log-level set the log level (default: ${LOG_LEVEL})
|
|
--reuse-existing-data-dir instead of deleting and recreating the data dir, keep it and reuse everything we can from it
|
|
--timeout timeout in seconds (default: ${TIMEOUT_DURATION} - no timeout)
|
|
--kill-old-processes if any process is found listening on a port we use, kill it (default: disabled)
|
|
EOF
|
|
}
|
|
|
|
! PARSED=$(${GETOPT_BINARY} --options=${OPTS} --longoptions=${LONGOPTS} --name "$0" -- "$@")
|
|
if [ ${PIPESTATUS[0]} != 0 ]; then
|
|
# getopt has complained about wrong arguments to stdout
|
|
exit 1
|
|
fi
|
|
|
|
# read getopt's output this way to handle the quoting right
|
|
eval set -- "$PARSED"
|
|
while true; do
|
|
case "$1" in
|
|
-h|--help)
|
|
print_help
|
|
exit
|
|
;;
|
|
-n|--nodes)
|
|
NUM_NODES="$2"
|
|
shift 2
|
|
;;
|
|
-d|--data-dir)
|
|
DATA_DIR="$2"
|
|
shift 2
|
|
;;
|
|
--enable-htop)
|
|
USE_HTOP="1"
|
|
shift
|
|
;;
|
|
--log-level)
|
|
LOG_LEVEL="$2"
|
|
shift 2
|
|
;;
|
|
--base-port)
|
|
BASE_PORT="$2"
|
|
shift 2
|
|
;;
|
|
--base-rpc-port)
|
|
BASE_RPC_PORT="$2"
|
|
shift 2
|
|
;;
|
|
--base-metrics-port)
|
|
BASE_METRICS_PORT="$2"
|
|
shift 2
|
|
;;
|
|
--reuse-existing-data-dir)
|
|
REUSE_EXISTING_DATA_DIR="1"
|
|
shift
|
|
;;
|
|
--timeout)
|
|
TIMEOUT_DURATION="$2"
|
|
shift 2
|
|
;;
|
|
--kill-old-processes)
|
|
KILL_OLD_PROCESSES="1"
|
|
shift
|
|
;;
|
|
--)
|
|
shift
|
|
break
|
|
;;
|
|
*)
|
|
echo "argument parsing error"
|
|
print_help
|
|
exit 1
|
|
esac
|
|
done
|
|
|
|
# when sourcing env.sh, it will try to execute $@, so empty it
|
|
EXTRA_ARGS="$@"
|
|
if [[ $# != 0 ]]; then
|
|
shift $#
|
|
fi
|
|
|
|
if [[ "$REUSE_EXISTING_DATA_DIR" == "0" ]]; then
|
|
rm -rf "${DATA_DIR}"
|
|
fi
|
|
|
|
"${SCRIPTS_DIR}"/makedir.sh "${DATA_DIR}"
|
|
|
|
HAVE_LSOF=0
|
|
|
|
# Windows detection
|
|
if uname | grep -qiE "mingw|msys"; then
|
|
MAKE="mingw32-make"
|
|
else
|
|
MAKE="make"
|
|
which lsof &>/dev/null && HAVE_LSOF=1 || { echo "'lsof' not installed and we need it to check for ports already in use. Aborting."; exit 1; }
|
|
fi
|
|
|
|
# number of CPU cores
|
|
if uname | grep -qi darwin; then
|
|
NPROC="$(sysctl -n hw.logicalcpu)"
|
|
else
|
|
NPROC="$(nproc)"
|
|
fi
|
|
|
|
# kill lingering processes from a previous run
|
|
if [[ "${HAVE_LSOF}" == "1" ]]; then
|
|
for NUM_NODE in $(seq 0 $(( NUM_NODES - 1 ))); do
|
|
for PORT in $(( BASE_PORT + NUM_NODE )) $(( BASE_METRICS_PORT + NUM_NODE )) $(( BASE_RPC_PORT + NUM_NODE )); do
|
|
for PID in $(lsof -n -i tcp:${PORT} -sTCP:LISTEN -t); do
|
|
echo -n "Found old process listening on port ${PORT}, with PID ${PID}. "
|
|
if [[ "${KILL_OLD_PROCESSES}" == "1" ]]; then
|
|
echo "Killing it."
|
|
kill -9 ${PID} || true
|
|
else
|
|
echo "Aborting."
|
|
exit 1
|
|
fi
|
|
done
|
|
done
|
|
done
|
|
fi
|
|
|
|
# Build the binaries
|
|
BINARIES="fluffy"
|
|
TEST_BINARIES="test_portal_testnet"
|
|
$MAKE -j ${NPROC} LOG_LEVEL=TRACE ${BINARIES}
|
|
$MAKE -j ${NPROC} LOG_LEVEL=INFO ${TEST_BINARIES}
|
|
|
|
# Kill child processes on Ctrl-C/SIGTERM/exit, passing the PID of this shell
|
|
# instance as the parent and the target process name as a pattern to the
|
|
# "pkill" command.
|
|
cleanup() {
|
|
pkill -f -P $$ fluffy &>/dev/null || true
|
|
sleep 2
|
|
pkill -f -9 -P $$ fluffy &>/dev/null || true
|
|
|
|
# Delete the binaries we just built, because these are with none default logs.
|
|
# TODO: When fluffy gets run time log options a la nimbus-eth2 we can keep
|
|
# the binaries around.
|
|
for BINARY in ${BINARIES}; do
|
|
rm build/${BINARY}
|
|
done
|
|
}
|
|
trap 'cleanup' SIGINT SIGTERM EXIT
|
|
|
|
# timeout - implemented with a background job
|
|
timeout_reached() {
|
|
echo -e "\nTimeout reached. Aborting.\n"
|
|
cleanup
|
|
}
|
|
trap 'timeout_reached' SIGALRM
|
|
|
|
# TODO: This doesn't seem to work in Windows CI as it can't find the process
|
|
# with WATCHER_PID when doing the taskkill later on.
|
|
if [[ "${TIMEOUT_DURATION}" != "0" ]]; then
|
|
export PARENT_PID=$$
|
|
( sleep ${TIMEOUT_DURATION} && kill -ALRM ${PARENT_PID} ) 2>/dev/null & WATCHER_PID=$!
|
|
fi
|
|
|
|
PIDS=""
|
|
NUM_JOBS=${NUM_NODES}
|
|
|
|
dump_logs() {
|
|
LOG_LINES=20
|
|
for LOG in "${DATA_DIR}"/log*.txt; do
|
|
echo "Last ${LOG_LINES} lines of ${LOG}:"
|
|
tail -n ${LOG_LINES} "${LOG}"
|
|
echo "======"
|
|
done
|
|
}
|
|
|
|
BOOTSTRAP_NODE=0
|
|
BOOTSTRAP_TIMEOUT=5 # in seconds
|
|
BOOTSTRAP_ENR_FILE="${DATA_DIR}/node${BOOTSTRAP_NODE}/fluffy_node.enr"
|
|
|
|
for NUM_NODE in $(seq 0 $(( NUM_NODES - 1 ))); do
|
|
NODE_DATA_DIR="${DATA_DIR}/node${NUM_NODE}"
|
|
rm -rf "${NODE_DATA_DIR}"
|
|
"${SCRIPTS_DIR}"/makedir.sh "${NODE_DATA_DIR}" 2>&1
|
|
done
|
|
|
|
echo "Starting ${NUM_NODES} nodes."
|
|
for NUM_NODE in $(seq 0 $(( NUM_NODES - 1 ))); do
|
|
# Reset arguments
|
|
BOOTSTRAP_ARG=""
|
|
|
|
NODE_DATA_DIR="${DATA_DIR}/node${NUM_NODE}"
|
|
|
|
if [[ ${NUM_NODE} != ${BOOTSTRAP_NODE} ]]; then
|
|
BOOTSTRAP_ARG="--bootstrap-file=${BOOTSTRAP_ENR_FILE}"
|
|
# All nodes but bootstrap node run with log. radius of 254 which should
|
|
# result in ~1/4th of the data set stored.
|
|
RADIUS_ARG="--radius=static:254"
|
|
|
|
# Wait for the bootstrap node to write out its enr file
|
|
START_TIMESTAMP=$(date +%s)
|
|
while [[ ! -f "${BOOTSTRAP_ENR_FILE}" ]]; do
|
|
sleep 0.1
|
|
NOW_TIMESTAMP=$(date +%s)
|
|
if [[ "$(( NOW_TIMESTAMP - START_TIMESTAMP - GENESIS_OFFSET ))" -ge "$BOOTSTRAP_TIMEOUT" ]]; then
|
|
echo "Bootstrap node failed to start in ${BOOTSTRAP_TIMEOUT} seconds. Aborting."
|
|
dump_logs
|
|
exit 1
|
|
fi
|
|
done
|
|
fi
|
|
|
|
# Running with bits-per-hop of 1 to make the lookups more likely requiring
|
|
# to request to nodes over the network instead of having most of them in the
|
|
# own routing table.
|
|
./build/fluffy \
|
|
--listen-address:127.0.0.1 \
|
|
--nat:extip:127.0.0.1 \
|
|
--log-level="${LOG_LEVEL}" \
|
|
--udp-port=$(( BASE_PORT + NUM_NODE )) \
|
|
--data-dir="${NODE_DATA_DIR}" \
|
|
${BOOTSTRAP_ARG} \
|
|
--rpc \
|
|
--rpc-address="127.0.0.1" \
|
|
--rpc-port="$(( BASE_RPC_PORT + NUM_NODE ))" \
|
|
--metrics \
|
|
--metrics-address="127.0.0.1" \
|
|
--metrics-port="$(( BASE_METRICS_PORT + NUM_NODE ))" \
|
|
--table-ip-limit=1024 \
|
|
--bucket-ip-limit=24 \
|
|
--bits-per-hop=1 \
|
|
${RADIUS_ARG} \
|
|
${EXTRA_ARGS} \
|
|
> "${DATA_DIR}/log${NUM_NODE}.txt" 2>&1 &
|
|
|
|
if [[ "${PIDS}" == "" ]]; then
|
|
PIDS="$!"
|
|
else
|
|
PIDS="${PIDS},$!"
|
|
fi
|
|
done
|
|
|
|
# give the regular nodes time to crash
|
|
sleep 5
|
|
BG_JOBS="$(jobs | wc -l | tr -d ' ')"
|
|
if [[ "${TIMEOUT_DURATION}" != "0" ]]; then
|
|
BG_JOBS=$(( BG_JOBS - 1 )) # minus the timeout bg job
|
|
fi
|
|
if [[ "$BG_JOBS" != "$NUM_JOBS" ]]; then
|
|
echo "$(( NUM_JOBS - BG_JOBS )) fluffy instance(s) exited early. Aborting."
|
|
dump_logs
|
|
exit 1
|
|
fi
|
|
|
|
# launch htop and run until `TIMEOUT_DURATION` or check the nodes and quit.
|
|
if [[ "$USE_HTOP" == "1" ]]; then
|
|
htop -p "$PIDS"
|
|
cleanup
|
|
else
|
|
# Need to let to settle the network a bit, as currently at start discv5 and
|
|
# the Portal networks all send messages at once to the same nodes, causing
|
|
# messages to drop when handshakes are going on.
|
|
sleep 5
|
|
./build/test_portal_testnet --node-count:${NUM_NODES}
|
|
FAILED=$?
|
|
if [[ "$FAILED" != "0" ]]; then
|
|
dump_logs
|
|
if [[ "${TIMEOUT_DURATION}" != "0" ]]; then
|
|
if uname | grep -qiE "mingw|msys"; then
|
|
echo ${WATCHER_PID}
|
|
taskkill //F //PID ${WATCHER_PID}
|
|
else
|
|
pkill -HUP -P ${WATCHER_PID}
|
|
fi
|
|
fi
|
|
exit 1
|
|
fi
|
|
fi
|
|
|
|
if [[ "${TIMEOUT_DURATION}" != "0" ]]; then
|
|
if uname | grep -qiE "mingw|msys"; then
|
|
taskkill //F //PID ${WATCHER_PID}
|
|
else
|
|
pkill -HUP -P ${WATCHER_PID}
|
|
fi
|
|
fi
|