logos-storage-nim/.github/workflows/bootstrap-health-check.yml

101 lines
3.4 KiB
YAML

name: Bootstrap nodes health check

# Scheduled reachability probe of the preset bootstrap nodes. The job runs on
# a GitHub-hosted runner, i.e. from the public internet, so a node that
# advertises a private or cloud-internal IP is correctly reported as
# unreachable. When any node cannot be reached the job fails and a tracking
# issue labelled `bootstrap-health` is opened or updated.
on:
  schedule:
    - cron: '0 6 * * *'  # daily 06:00 UTC
  workflow_dispatch:

# Nim toolchain version handed to the Nimbus build-system setup action.
env:
  nim_version: v2.2.10

# All runs of this workflow share one concurrency group; a run that is
# already in progress is never cancelled by a newer one.
concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: false

# Read access for the checkout, write access to issues for the tracking
# issue and its label.
permissions:
  contents: read
  issues: write
jobs:
  # Single job: run the `bootstrapHealthCheck` make target, turn its JSON
  # report into a Markdown summary, open or refresh the tracking issue, and
  # finally fail the run if a node was unreachable or no report was produced.
  ping:
    name: Ping preset bootstrap nodes
    runs-on: ubuntu-latest
    steps:
      - name: Checkout sources
        uses: actions/checkout@v6
        with:
          submodules: recursive

      - name: Setup Nimbus Build System
        uses: ./.github/actions/nimbus-build-system
        with:
          os: linux
          nim_version: ${{ env.nim_version }}

      # A failure here must not end the job: the following steps still have to
      # build the report and raise the issue. The last step fails the run.
      - name: Ping bootstrap nodes
        id: ping
        continue-on-error: true
        run: make CI=true bootstrapHealthCheck
        shell: bash

      # Step outputs:
      #   dead       number of entries rendered as DEAD (set when a report exists)
      #   no_output  "true" when the report file is missing
      # Also writes report.md, reused as the body of the tracking issue.
      - name: Build report
        id: report
        if: always()
        run: |
          json=build/bootstrap-health-report.json
          if [ ! -f "$json" ]; then
            # Exit 0 on purpose: the final step turns no_output into the failure.
            echo "no_output=true" >> "$GITHUB_OUTPUT"
            echo "::error::check_spr produced no output file"
            exit 0
          fi
          # Count every entry whose `alive` is falsy (false, null or missing):
          # exactly the set the table below renders as DEAD, so the headline
          # numbers and the rows can never disagree.
          dead=$(jq '[.[] | select(.alive | not)] | length' "$json")
          total=$(jq 'length' "$json")
          echo "dead=$dead" >> "$GITHUB_OUTPUT"
          {
            echo "## Bootstrap node liveness ($((total - dead))/$total reachable)"
            echo
            echo "| Network | Result | Address | Reason |"
            echo "|---|---|---|---|"
            # `cell` keeps a value on one table row: newlines become spaces and
            # literal pipes are escaped so they do not start a new column.
            jq -r '
              def cell: tostring | gsub("[\r\n]+"; " ") | gsub("\\|"; "\\|");
              .[]
              | "| \(.network | cell) | \(if .alive then "✅ ALIVE" else "❌ DEAD" end) | \(.address | cell) | \(.reason | cell) |"
            ' "$json"
          } | tee report.md >> "$GITHUB_STEP_SUMMARY"
        shell: bash

      # Runs only when a report exists and it lists at least one dead node.
      - name: Open or update tracking issue
        if: always() && steps.report.outputs.dead != '0' && steps.report.outputs.dead != ''
        env:
          GH_TOKEN: ${{ github.token }}
          DEAD: ${{ steps.report.outputs.dead }}
          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
        run: |
          # Best effort: creating the label fails harmlessly if it already exists.
          gh label create bootstrap-health --color B60205 \
            --description "Automated bootstrap-node liveness alerts" 2>/dev/null || true
          {
            echo "Scheduled bootstrap-node liveness check found **${DEAD}** unreachable node(s)."
            echo
            echo "Run: ${RUN_URL}"
            echo
            cat report.md
          } > issue-body.md
          # `// empty` yields no output at all when no matching issue is open,
          # so the -n test below does not depend on how a null is printed.
          existing=$(gh issue list --label bootstrap-health --state open \
            --json number --jq '.[0].number // empty')
          if [ -n "$existing" ]; then
            gh issue comment "$existing" --body-file issue-body.md
          else
            gh issue create --title "Bootstrap nodes unreachable" \
              --label bootstrap-health --body-file issue-body.md
          fi
        shell: bash

      # Turns the collected result into the job status: fail when at least one
      # node is dead, or when the report file was never produced.
      - name: Fail if any node is unreachable
        if: >-
          always() &&
          ((steps.report.outputs.dead != '0' && steps.report.outputs.dead != '')
          || steps.report.outputs.no_output == 'true')
        env:
          DEAD: ${{ steps.report.outputs.dead }}
          NO_OUTPUT: ${{ steps.report.outputs.no_output }}
        run: |
          if [ "$NO_OUTPUT" = "true" ]; then
            # No report means there is no count to print.
            echo "Bootstrap liveness check failed: no report was produced."
          else
            echo "Bootstrap liveness check failed: ${DEAD} unreachable node(s)."
          fi
          exit 1
        shell: bash