#!/usr/bin/env bash # Copyright (c) 2021-2023 Status Research & Development GmbH. Licensed under # either of: # - Apache License, version 2.0 # - MIT license # at your option. This file may not be copied, modified, or distributed except # according to those terms. # This script is for a big part a copy of the nimbus-eth2 launch_local_testnet # script. This script however does not expect fluffy nodes to exit 0 in the good # case, but instead the json-rpc interface is used to check whether certain # values are what we expect them to be. set -e cd "$(dirname "${BASH_SOURCE[0]}")"/../.. #################### # argument parsing # #################### GETOPT_BINARY="getopt" if uname | grep -qi darwin; then # macOS GETOPT_BINARY=$(find /opt/homebrew/opt/gnu-getopt/bin/getopt /usr/local/opt/gnu-getopt/bin/getopt 2> /dev/null | head -n1 || true) [[ -f "$GETOPT_BINARY" ]] || { echo "GNU getopt not installed. Please run 'brew install gnu-getopt'. Aborting."; exit 1; } fi ! ${GETOPT_BINARY} --test > /dev/null if [ ${PIPESTATUS[0]} != 4 ]; then echo '`getopt --test` failed in this environment.' exit 1 fi OPTS="h:n:d" LONGOPTS="help,nodes:,data-dir:,run-tests,log-level:,base-port:,base-rpc-port:,trusted-block-root:,portal-bridge,base-metrics-port:,reuse-existing-data-dir,timeout:,kill-old-processes" # default values NUM_NODES="64" DATA_DIR="local_testnet_data" RUN_TESTS="0" LOG_LEVEL="INFO" BASE_PORT="9000" BASE_METRICS_PORT="8008" BASE_RPC_PORT="10000" REUSE_EXISTING_DATA_DIR="0" TIMEOUT_DURATION="0" KILL_OLD_PROCESSES="0" SCRIPTS_DIR="fluffy/scripts/" PORTAL_BRIDGE="0" TRUSTED_BLOCK_ROOT="" # REST_URL="http://127.0.0.1:5052" REST_URL="http://testing.mainnet.beacon-api.nimbus.team" print_help() { cat </dev/null && HAVE_LSOF=1 || { echo "'lsof' not installed and we need it to check for ports already in use. 
Aborting."; exit 1; }
fi

# Number of CPU cores, used as the parallelism level for the builds below.
if uname | grep -qi darwin; then
  NPROC="$(sysctl -n hw.logicalcpu)"
else
  NPROC="$(nproc)"
fi

# Look for lingering processes from a previous run that still listen on any of
# the UDP/metrics/RPC ports this run is going to use. Depending on
# KILL_OLD_PROCESSES they are either killed or the script aborts.
if [[ "${HAVE_LSOF}" == "1" ]]; then
  for NUM_NODE in $(seq 0 $(( NUM_NODES - 1 ))); do
    for PORT in $(( BASE_PORT + NUM_NODE )) $(( BASE_METRICS_PORT + NUM_NODE )) $(( BASE_RPC_PORT + NUM_NODE )); do
      for PID in $(lsof -n -i tcp:${PORT} -sTCP:LISTEN -t); do
        echo -n "Found old process listening on port ${PORT}, with PID ${PID}. "
        if [[ "${KILL_OLD_PROCESSES}" == "1" ]]; then
          echo "Killing it."
          kill -9 ${PID} || true
        else
          echo "Aborting."
          exit 1
        fi
      done
    done
  done
fi

# Build the binaries
BINARIES="fluffy"
if [[ "${PORTAL_BRIDGE}" == "1" ]]; then
  BINARIES="${BINARIES} portal_bridge"
fi
$MAKE -j ${NPROC} LOG_LEVEL=TRACE ${BINARIES}

if [[ "$RUN_TESTS" == "1" ]]; then
  TEST_BINARIES="test_portal_testnet"
  $MAKE -j ${NPROC} LOG_LEVEL=INFO ${TEST_BINARIES}
fi

# Kill child processes on Ctrl-C/SIGTERM/exit, passing the PID of this shell
# instance as the parent and the target process name as a pattern to the
# "pkill" command.
#
# This function can run more than once in a single invocation (it is called
# explicitly after htop and again from the EXIT trap), so every step has to
# tolerate being repeated.
cleanup() {
  # First ask politely ...
  for BINARY in ${BINARIES}; do
    pkill -f -P $$ ${BINARY} &>/dev/null || true
  done
  sleep 2
  # ... then force-kill whatever is still around.
  for BINARY in ${BINARIES}; do
    pkill -f -9 -P $$ ${BINARY} &>/dev/null || true
  done

  # Delete the binaries we just built, because these are built with
  # non-default log levels.
  # TODO: When fluffy gets run time log options a la nimbus-eth2 we can keep
  # the binaries around.
  for BINARY in ${BINARIES}; do
    # -f: the binary is already gone on a repeated cleanup; without it the
    # failing rm would turn a successful run into a non-zero exit.
    rm -f "build/${BINARY}"
  done
}
trap 'cleanup' SIGINT SIGTERM EXIT

# timeout - implemented with a background job
# SIGALRM handler: report the timeout and tear down the started processes.
timeout_reached() {
  echo -e "\nTimeout reached. Aborting.\n"
  cleanup
}
trap 'timeout_reached' SIGALRM

# TODO: This doesn't seem to work in Windows CI as it can't find the process
# with WATCHER_PID when doing the taskkill later on.
if [[ "${TIMEOUT_DURATION}" != "0" ]]; then
  export PARENT_PID=$$
  # Background watcher: sleeps for the timeout and then signals this shell.
  ( sleep ${TIMEOUT_DURATION} && kill -ALRM ${PARENT_PID} ) 2>/dev/null & WATCHER_PID=$!
fi

# Comma-separated list of the PIDs of all started nodes (handed to htop).
PIDS=""
# Number of background jobs expected to be alive once everything is started.
NUM_JOBS=$(( NUM_NODES + PORTAL_BRIDGE ))

# Print the tail of every log file found in the data directory.
dump_logs() {
  LOG_LINES=20
  for LOG in "${DATA_DIR}"/log*.txt; do
    # Without a match the glob stays a literal pattern; skip it instead of
    # letting tail fail (which would abort the script under `set -e`).
    [[ -e "${LOG}" ]] || continue
    echo "Last ${LOG_LINES} lines of ${LOG}:"
    tail -n ${LOG_LINES} "${LOG}"
    echo "======"
  done
}

BOOTSTRAP_NODE=0
BOOTSTRAP_TIMEOUT=5 # in seconds
BOOTSTRAP_ENR_FILE="${DATA_DIR}/node${BOOTSTRAP_NODE}/fluffy_node.enr"

TRUSTED_BLOCK_ROOT_ARG=""
if [[ -n ${TRUSTED_BLOCK_ROOT} ]]; then
  TRUSTED_BLOCK_ROOT_ARG="--trusted-block-root=${TRUSTED_BLOCK_ROOT}"
fi

# Start every node from a fresh, empty data directory.
for NUM_NODE in $(seq 0 $(( NUM_NODES - 1 ))); do
  NODE_DATA_DIR="${DATA_DIR}/node${NUM_NODE}"
  rm -rf "${NODE_DATA_DIR}"
  "${SCRIPTS_DIR}"/makedir.sh "${NODE_DATA_DIR}" 2>&1
done

echo "Starting ${NUM_NODES} nodes."
for NUM_NODE in $(seq 0 $(( NUM_NODES - 1 ))); do
  # Reset arguments, so that nothing leaks in from a previous iteration or
  # from the calling environment.
  BOOTSTRAP_ARG=""
  RADIUS_ARG=""
  NODE_DATA_DIR="${DATA_DIR}/node${NUM_NODE}"

  if [[ ${NUM_NODE} != ${BOOTSTRAP_NODE} ]]; then
    BOOTSTRAP_ARG="--bootstrap-file=${BOOTSTRAP_ENR_FILE}"

    # All nodes but bootstrap node run with log. radius of 254 which should
    # result in ~1/4th of the data set stored.
    RADIUS_ARG="--radius=static:254"

    # Wait for the bootstrap node to write out its enr file
    START_TIMESTAMP=$(date +%s)
    while [[ ! -f "${BOOTSTRAP_ENR_FILE}" ]]; do
      sleep 0.1
      NOW_TIMESTAMP=$(date +%s)
      # NOTE(review): GENESIS_OFFSET is not assigned in the visible part of
      # this script; an unset name evaluates to 0 in arithmetic context.
      # Confirm whether it is still needed.
      if [[ "$(( NOW_TIMESTAMP - START_TIMESTAMP - GENESIS_OFFSET ))" -ge "$BOOTSTRAP_TIMEOUT" ]]; then
        echo "Bootstrap node failed to start in ${BOOTSTRAP_TIMEOUT} seconds. Aborting."
        dump_logs
        exit 1
      fi
    done
  fi

  # Running with bits-per-hop of 1 to make the lookups more likely requiring
  # to request to nodes over the network instead of having most of them in the
  # own routing table.
  ./build/fluffy \
    --listen-address:127.0.0.1 \
    --nat:extip:127.0.0.1 \
    --log-level="${LOG_LEVEL}" \
    --udp-port=$(( BASE_PORT + NUM_NODE )) \
    --data-dir="${NODE_DATA_DIR}" \
    --network="none" \
    ${BOOTSTRAP_ARG} \
    --rpc \
    --rpc-address="127.0.0.1" \
    --rpc-port="$(( BASE_RPC_PORT + NUM_NODE ))" \
    --metrics \
    --metrics-address="127.0.0.1" \
    --metrics-port="$(( BASE_METRICS_PORT + NUM_NODE ))" \
    --table-ip-limit=1024 \
    --bucket-ip-limit=24 \
    --bits-per-hop=1 \
    ${TRUSTED_BLOCK_ROOT_ARG} \
    ${RADIUS_ARG} \
    ${EXTRA_ARGS} \
    > "${DATA_DIR}/log${NUM_NODE}.txt" 2>&1 &

  if [[ "${PIDS}" == "" ]]; then
    PIDS="$!"
  else
    PIDS="${PIDS},$!"
  fi
done

if [[ "$PORTAL_BRIDGE" == "1" ]]; then
  # Give the nodes time to connect before the bridge (node 0) starts gossip
  sleep 10
  echo "Starting portal bridge for beacon network."
  ./build/portal_bridge beacon \
    --rest-url="${REST_URL}" \
    --rpc-address="127.0.0.1" \
    --rpc-port="${BASE_RPC_PORT}" \
    --backfill-amount=128 \
    ${TRUSTED_BLOCK_ROOT_ARG} \
    > "${DATA_DIR}/log_portal_bridge.txt" 2>&1 &

  PIDS="${PIDS},$!"
fi

# give the regular nodes time to crash
sleep 5
BG_JOBS="$(jobs | wc -l | tr -d ' ')"
if [[ "${TIMEOUT_DURATION}" != "0" ]]; then
  BG_JOBS=$(( BG_JOBS - 1 )) # minus the timeout bg job
fi
if [[ "$BG_JOBS" != "$NUM_JOBS" ]]; then
  echo "$(( NUM_JOBS - BG_JOBS )) fluffy instance(s) exited early. Aborting."
  dump_logs
  exit 1
fi

# Launch htop and run until `TIMEOUT_DURATION`, or check the nodes and quit.
if [[ "$RUN_TESTS" == "0" ]]; then
  htop -p "$PIDS"
  cleanup
else
  # The network needs a bit of time to settle: currently at start discv5 and
  # the Portal networks all send messages at once to the same nodes, causing
  # messages to drop when handshakes are going on.
  sleep 5

  # Capture the exit status with `||`: the script runs under `set -e`, so a
  # plain failing command would abort right here and the log dump and watcher
  # teardown below would never be reached.
  FAILED=0
  ./build/test_portal_testnet --node-count:${NUM_NODES} || FAILED=$?
  if [[ "$FAILED" != "0" ]]; then
    dump_logs
    if [[ "${TIMEOUT_DURATION}" != "0" ]]; then
      if uname | grep -qiE "mingw|msys"; then
        echo ${WATCHER_PID}
        taskkill //F //PID ${WATCHER_PID}
      else
        pkill -HUP -P ${WATCHER_PID}
      fi
    fi
    exit 1
  fi
fi

# Stop the timeout watcher, it is no longer needed.
if [[ "${TIMEOUT_DURATION}" != "0" ]]; then
  if uname | grep -qiE "mingw|msys"; then
    taskkill //F //PID ${WATCHER_PID}
  else
    pkill -HUP -P ${WATCHER_PID}
  fi
fi