#!/usr/bin/env bash

# Copyright (c) 2021 Status Research & Development GmbH. Licensed under
# either of:
# - Apache License, version 2.0
# - MIT license
# at your option. This file may not be copied, modified, or distributed except
# according to those terms.

# This script is for a big part a copy of the nimbus-eth2 launch_local_testnet
# script. This script however does not expect fluffy nodes to exit 0 in the good
# case, but instead the json-rpc interface is used to check whether certain
# values are what we expect them to be.
#
# Flow: parse options, check that the ports are free, build the binaries,
# start NUM_NODES fluffy nodes in the background (node 0 is the bootstrap
# node), then either show them in htop or run the test binary against them.

set -e

cd "$(dirname "${BASH_SOURCE[0]}")"/../..

####################
# argument parsing #
####################

GETOPT_BINARY="getopt"
if uname | grep -qi darwin; then
  # macOS
  GETOPT_BINARY="/usr/local/opt/gnu-getopt/bin/getopt"
  [[ -f "$GETOPT_BINARY" ]] || { echo "GNU getopt not installed. Please run 'brew install gnu-getopt'. Aborting."; exit 1; }
fi

# `getopt --test` is expected to exit with status 4; capture the status
# without letting `set -e` abort on the non-zero exit.
GETOPT_TEST_STATUS=0
"${GETOPT_BINARY}" --test > /dev/null || GETOPT_TEST_STATUS=$?
if [[ "${GETOPT_TEST_STATUS}" != "4" ]]; then
  echo '`getopt --test` failed in this environment.'
  exit 1
fi

# Short options: -h (flag), -n <nodes>, -d <data dir>. A trailing colon marks
# an option that takes an argument, matching "help,nodes:,data-dir:" below.
OPTS="hn:d:"
LONGOPTS="help,nodes:,data-dir:,enable-htop,log-level:,base-port:,base-rpc-port:,base-metrics-port:,reuse-existing-data-dir,timeout:,kill-old-processes"

# default values
NUM_NODES="64"
DATA_DIR="local_testnet_data"
USE_HTOP="0"
LOG_LEVEL="TRACE"
BASE_PORT="9000"
BASE_METRICS_PORT="8008"
BASE_RPC_PORT="7000"
REUSE_EXISTING_DATA_DIR="0"
TIMEOUT_DURATION="0"
KILL_OLD_PROCESSES="0"
SCRIPTS_DIR="fluffy/scripts/"

# NOTE(review): in the copy of this file that was reviewed, everything between
# the start of the help heredoc and the `lsof` availability check was lost.
# The help text, the option loop, EXTRA_ARGS, the data dir preparation and the
# MAKE / HAVE_LSOF setup below were rebuilt from OPTS/LONGOPTS, the defaults
# above and the variables used further down. Confirm against the upstream
# script before merging.

# Print the usage text, including the current default values, to stdout.
print_help() {
  cat <<EOF
Usage: $(basename "$0") [OPTIONS] -- [FLUFFY OPTIONS]
E.g.: $(basename "$0") --nodes ${NUM_NODES} --data-dir "${DATA_DIR}" # defaults

  -h, --help                  this help message
  -n, --nodes                 number of nodes to launch (default: ${NUM_NODES})
  -d, --data-dir              directory where all the node data and logs will end up
                              (default: "${DATA_DIR}")
  --base-port                 bootstrap node's discv5 port (default: ${BASE_PORT})
  --base-rpc-port             bootstrap node's RPC port (default: ${BASE_RPC_PORT})
  --base-metrics-port         bootstrap node's metrics server port (default: ${BASE_METRICS_PORT})
  --enable-htop               use "htop" to see the fluffy processes without doing any tests
  --log-level                 set the log level (default: ${LOG_LEVEL})
  --reuse-existing-data-dir   instead of deleting and recreating the data dir, keep it and reuse everything we can from it
  --timeout                   timeout in seconds (default: ${TIMEOUT_DURATION} - no timeout)
  --kill-old-processes        if any process is found listening on a port we use, kill it (default: disabled)
EOF
}

# getopt reports bad arguments itself; just stop in that case.
if ! PARSED="$("${GETOPT_BINARY}" --options="${OPTS}" --longoptions="${LONGOPTS}" --name "$0" -- "$@")"; then
  exit 1
fi

# Read getopt's output this way to handle the quoting right.
eval set -- "${PARSED}"
while true; do
  case "$1" in
    -h|--help)
      print_help
      exit
      ;;
    -n|--nodes)
      NUM_NODES="$2"
      shift 2
      ;;
    -d|--data-dir)
      DATA_DIR="$2"
      shift 2
      ;;
    --enable-htop)
      USE_HTOP="1"
      shift
      ;;
    --log-level)
      LOG_LEVEL="$2"
      shift 2
      ;;
    --base-port)
      BASE_PORT="$2"
      shift 2
      ;;
    --base-rpc-port)
      BASE_RPC_PORT="$2"
      shift 2
      ;;
    --base-metrics-port)
      BASE_METRICS_PORT="$2"
      shift 2
      ;;
    --reuse-existing-data-dir)
      REUSE_EXISTING_DATA_DIR="1"
      shift
      ;;
    --timeout)
      TIMEOUT_DURATION="$2"
      shift 2
      ;;
    --kill-old-processes)
      KILL_OLD_PROCESSES="1"
      shift
      ;;
    --)
      shift
      break
      ;;
    *)
      echo "argument parsing error"
      print_help
      exit 1
      ;;
  esac
done

# Whatever follows `--` is handed to every fluffy node unchanged.
EXTRA_ARGS="$*"
if [[ $# != 0 ]]; then
  shift $#
fi

# NOTE(review): presumably the data dir is wiped unless
# --reuse-existing-data-dir was given; this step is part of the lost span.
if [[ "${REUSE_EXISTING_DATA_DIR}" == "0" ]]; then
  rm -rf -- "${DATA_DIR:?}"
fi

"${SCRIPTS_DIR}"/makedir.sh "${DATA_DIR}"

HAVE_LSOF=0

# Windows detection
# NOTE(review): the make program names are part of the lost span - confirm.
if uname | grep -qiE "mingw|msys"; then
  MAKE="mingw32-make"
else
  MAKE="make"
  if command -v lsof &>/dev/null; then
    HAVE_LSOF=1
  else
    echo "'lsof' not installed and we need it to check for ports already in use. Aborting."
    exit 1
  fi
fi

# number of CPU cores
if uname | grep -qi darwin; then
  NPROC="$(sysctl -n hw.logicalcpu)"
else
  NPROC="$(nproc)"
fi

# kill lingering processes from a previous run
if [[ "${HAVE_LSOF}" == "1" ]]; then
  for (( NUM_NODE = 0; NUM_NODE < NUM_NODES; NUM_NODE++ )); do
    for PORT in $(( BASE_PORT + NUM_NODE )) $(( BASE_METRICS_PORT + NUM_NODE )) $(( BASE_RPC_PORT + NUM_NODE )); do
      # Word splitting of the lsof output (one PID per word) is intended.
      for PID in $(lsof -n -i "tcp:${PORT}" -sTCP:LISTEN -t); do
        echo -n "Found old process listening on port ${PORT}, with PID ${PID}. "
        if [[ "${KILL_OLD_PROCESSES}" == "1" ]]; then
          echo "Killing it."
          kill -9 "${PID}" || true
        else
          echo "Aborting."
          exit 1
        fi
      done
    done
  done
fi

# Build the binaries
BINARIES="fluffy"
TEST_BINARIES="fluffy-test-portal-testnet"
# BINARIES / TEST_BINARIES are space-separated target lists, so they are left
# unquoted on purpose.
"${MAKE}" -j "${NPROC}" LOG_LEVEL=TRACE ${BINARIES} NIMFLAGS="-d:chronicles_colors=off -d:chronicles_sinks=textlines"
"${MAKE}" -j "${NPROC}" LOG_LEVEL=INFO ${TEST_BINARIES} NIMFLAGS="-d:chronicles_sinks=textlines"

# Kill child processes on Ctrl-C/SIGTERM/exit, passing the PID of this shell
# instance as the parent and the target process name as a pattern to the
# "pkill" command.
cleanup() {
  pkill -f -P $$ fluffy &>/dev/null || true
  sleep 2
  pkill -f -9 -P $$ fluffy &>/dev/null || true

  # Delete the binaries we just built, because they are built with
  # non-default log settings.
  # TODO: When fluffy gets run time log options a la nimbus-eth2 we can keep
  # the binaries around.
  # cleanup can run more than once (explicit call plus the EXIT trap), so a
  # binary that is already gone must not be an error.
  for BINARY in ${BINARIES}; do
    rm -f "build/${BINARY}"
  done
}
trap 'cleanup' SIGINT SIGTERM EXIT

# timeout - implemented with a background job
timeout_reached() {
  echo -e "\nTimeout reached. Aborting.\n"
  cleanup
}
trap 'timeout_reached' SIGALRM

# TODO: This doesn't seem to work in Windows CI as it can't find the process
# with WATCHER_PID when doing the taskkill later on.
if [[ "${TIMEOUT_DURATION}" != "0" ]]; then
  export PARENT_PID=$$
  ( sleep "${TIMEOUT_DURATION}" && kill -ALRM "${PARENT_PID}" ) 2>/dev/null & WATCHER_PID=$!
fi

PIDS=""
NUM_JOBS=${NUM_NODES}

# Print the last lines of every node log in the data dir.
dump_logs() {
  LOG_LINES=20
  for LOG in "${DATA_DIR}"/log*.txt; do
    # Without any log file the glob stays literal; skip it.
    [[ -e "${LOG}" ]] || continue
    echo "Last ${LOG_LINES} lines of ${LOG}:"
    tail -n "${LOG_LINES}" "${LOG}"
    echo "======"
  done
}

BOOTSTRAP_NODE=0
BOOTSTRAP_TIMEOUT=5 # in seconds
BOOTSTRAP_ENR_FILE="${DATA_DIR}/node${BOOTSTRAP_NODE}/fluffy_node.enr"

# Start every node from an empty per-node data dir.
for (( NUM_NODE = 0; NUM_NODE < NUM_NODES; NUM_NODE++ )); do
  NODE_DATA_DIR="${DATA_DIR}/node${NUM_NODE}"
  rm -rf "${NODE_DATA_DIR}"
  "${SCRIPTS_DIR}"/makedir.sh "${NODE_DATA_DIR}" 2>&1
done

echo "Starting ${NUM_NODES} nodes."
for (( NUM_NODE = 0; NUM_NODE < NUM_NODES; NUM_NODE++ )); do
  NODE_DATA_DIR="${DATA_DIR}/node${NUM_NODE}"

  # The bootstrap node runs without these two arguments.
  BOOTSTRAP_ARG=""
  RADIUS_ARG=""

  if [[ "${NUM_NODE}" != "${BOOTSTRAP_NODE}" ]]; then
    BOOTSTRAP_ARG="--bootstrap-file=${BOOTSTRAP_ENR_FILE}"

    # All nodes but bootstrap node run with log. radius of 254 which should
    # result in ~1/4th of the data set stored.
    RADIUS_ARG="--radius=254"

    # Wait for the bootstrap node to write out its enr file
    START_TIMESTAMP=$(date +%s)
    while [[ ! -f "${BOOTSTRAP_ENR_FILE}" ]]; do
      sleep 0.1
      NOW_TIMESTAMP=$(date +%s)
      if (( NOW_TIMESTAMP - START_TIMESTAMP >= BOOTSTRAP_TIMEOUT )); then
        echo "Bootstrap node failed to start in ${BOOTSTRAP_TIMEOUT} seconds. Aborting."
        dump_logs
        exit 1
      fi
    done
  fi

  # Running with bits-per-hop of 1 to make the lookups more likely requiring
  # to request to nodes over the network instead of having most of them in the
  # own routing table.
  #
  # BOOTSTRAP_ARG, RADIUS_ARG and EXTRA_ARGS stay unquoted so that an empty
  # value adds no argument and EXTRA_ARGS splits into separate arguments.
  ./build/fluffy \
    --listen-address:127.0.0.1 \
    --nat:extip:127.0.0.1 \
    --log-level="${LOG_LEVEL}" \
    --udp-port=$(( BASE_PORT + NUM_NODE )) \
    --data-dir="${NODE_DATA_DIR}" \
    ${BOOTSTRAP_ARG} \
    --rpc \
    --rpc-address="127.0.0.1" \
    --rpc-port="$(( BASE_RPC_PORT + NUM_NODE ))" \
    --metrics \
    --metrics-address="127.0.0.1" \
    --metrics-port="$(( BASE_METRICS_PORT + NUM_NODE ))" \
    --table-ip-limit=1024 \
    --bucket-ip-limit=24 \
    --bits-per-hop=1 \
    ${RADIUS_ARG} \
    ${EXTRA_ARGS} \
    > "${DATA_DIR}/log${NUM_NODE}.txt" 2>&1 &

  # Comma-separated PID list, in the form `htop -p` takes below.
  if [[ "${PIDS}" == "" ]]; then
    PIDS="$!"
  else
    PIDS="${PIDS},$!"
  fi
done

# give the regular nodes time to crash
sleep 5
BG_JOBS="$(jobs | wc -l | tr -d ' ')"
if [[ "${TIMEOUT_DURATION}" != "0" ]]; then
  BG_JOBS=$(( BG_JOBS - 1 )) # minus the timeout bg job
fi
if [[ "$BG_JOBS" != "$NUM_JOBS" ]]; then
  echo "$(( NUM_JOBS - BG_JOBS )) fluffy instance(s) exited early. Aborting."
  dump_logs
  exit 1
fi

# launch htop and run until `TIMEOUT_DURATION` or check the nodes and quit.
if [[ "$USE_HTOP" == "1" ]]; then
  htop -p "$PIDS"
  cleanup
else
  # Let the network settle a bit, as currently at start discv5 and
  # the Portal networks all send messages at once to the same nodes, causing
  # messages to drop when handshakes are going on.
  sleep 5

  # Capture the exit status with `||`: under `set -e` a plain failing command
  # would end the script before the logs are dumped.
  FAILED=0
  ./build/test_portal_testnet "--node-count:${NUM_NODES}" || FAILED=$?
  if [[ "$FAILED" != "0" ]]; then
    dump_logs
    if [[ "${TIMEOUT_DURATION}" != "0" ]]; then
      if uname | grep -qiE "mingw|msys"; then
        echo "${WATCHER_PID}"
        taskkill //F //PID "${WATCHER_PID}"
      else
        pkill -HUP -P "${WATCHER_PID}"
      fi
    fi
    exit 1
  fi
fi

# Stop the timeout watcher so that it cannot fire after we are done.
if [[ "${TIMEOUT_DURATION}" != "0" ]]; then
  if uname | grep -qiE "mingw|msys"; then
    taskkill //F //PID "${WATCHER_PID}"
  else
    pkill -HUP -P "${WATCHER_PID}"
  fi
fi