From 20caa4f7446aa29994142094dffe77d43573bf1a Mon Sep 17 00:00:00 2001 From: "R.B. Boyer" Date: Wed, 10 Jul 2019 15:58:25 -0500 Subject: [PATCH] test: for envoy integration tests, wait until 's2' is healthy in consul before interrogating envoy (#6108) When the envoy healthy panic threshold was explicitly disabled as part of L7 traffic management it changed how envoy decided to load balance to endpoints in a cluster. This only matters when envoy is in "panic mode" aka "when you have a bunch of unhealthy endpoints". Panic mode sends traffic to unhealthy instances in certain circumstances. Note: Prior to explicitly disabling the healthy panic threshold, the default value is 50%. What was happening is that the test harness was bringing up consul the sidecars, and the service instances all at once and sometimes the proxies wouldn't have time to be checked by consul to be labeled as 'passing' in the catalog before a round of EDS happened. The xDS server in consul effectively queries /v1/health/connect/s2 and gets 1 result, but that one result has a 'critical' check so the xDS server sends back that endpoint labeled as UNHEALTHY. Envoy sees that 100% of the endpoints in the cluster are unhealthy and would enter panic mode and still send traffic to s2. This is why the test suites PRIOR to disabling the healthy panic threshold worked. They were _incorrectly_ passing. When the healthy panic threshol is disabled, envoy never enters panic mode in this situation and thus the cluster has zero healthy endpoints so load balancing goes nowhere and the tests fail. Why does this only affect the test suites for envoy 1.8.0? My guess is that https://github.com/envoyproxy/envoy/pull/4442 was merged into the 1.9.x series and somehow that plays a role. This PR modifies the bats scripts to explicitly wait until the upstream sidecar is healthy as measured by /v1/health/connect/s2?passing BEFORE trying to interrogate envoy which should make the tests less racy. --- .../connect/envoy/case-badauthz/verify.bats | 4 ++++ .../connect/envoy/case-basic/verify.bats | 4 ++++ .../envoy/case-centralconf/verify.bats | 7 ++++-- .../envoy/case-dogstatsd-udp/verify.bats | 6 ++++- .../connect/envoy/case-grpc/verify.bats | 4 ++++ .../envoy/case-http-badauthz/verify.bats | 4 ++++ .../connect/envoy/case-http/verify.bats | 7 ++++-- .../connect/envoy/case-http2/verify.bats | 7 ++++-- .../connect/envoy/case-prometheus/verify.bats | 7 ++++-- .../connect/envoy/case-statsd-udp/verify.bats | 6 ++++- .../connect/envoy/case-zipkin/verify.bats | 7 ++++-- test/integration/connect/envoy/helpers.bash | 24 +++++++++++++++++++ 12 files changed, 75 insertions(+), 12 deletions(-) diff --git a/test/integration/connect/envoy/case-badauthz/verify.bats b/test/integration/connect/envoy/case-badauthz/verify.bats index ef632638bb..1a07f48d35 100644 --- a/test/integration/connect/envoy/case-badauthz/verify.bats +++ b/test/integration/connect/envoy/case-badauthz/verify.bats @@ -18,6 +18,10 @@ load helpers assert_proxy_presents_cert_uri localhost:21001 s2 } +@test "s2 proxy should be healthy" { + assert_service_has_healthy_instances s2 1 +} + @test "s1 upstream should NOT be able to connect to s2" { run retry_default must_fail_tcp_connection localhost:5000 diff --git a/test/integration/connect/envoy/case-basic/verify.bats b/test/integration/connect/envoy/case-basic/verify.bats index 6e915f947d..bb31749524 100644 --- a/test/integration/connect/envoy/case-basic/verify.bats +++ b/test/integration/connect/envoy/case-basic/verify.bats @@ -22,6 +22,10 @@ load helpers assert_proxy_presents_cert_uri localhost:21001 s2 } +@test "s2 proxy should be healthy" { + assert_service_has_healthy_instances s2 1 +} + @test "s1 upstream should be able to connect to s2" { run retry_default curl -s -f -d hello localhost:5000 [ "$status" -eq 0 ] diff --git a/test/integration/connect/envoy/case-centralconf/verify.bats b/test/integration/connect/envoy/case-centralconf/verify.bats index ca2f2f274a..b9af6d7112 100644 --- a/test/integration/connect/envoy/case-centralconf/verify.bats +++ b/test/integration/connect/envoy/case-centralconf/verify.bats @@ -12,13 +12,16 @@ load helpers @test "s1 proxy listener should be up and have right cert" { assert_proxy_presents_cert_uri localhost:21000 s1 - } @test "s2 proxy listener should be up and have right cert" { assert_proxy_presents_cert_uri localhost:21001 s2 } +@test "s2 proxy should be healthy" { + assert_service_has_healthy_instances s2 1 +} + @test "s1 upstream should be able to connect to s2 with http/1.1" { run retry_default curl --http1.1 -s -f -d hello localhost:5000 [ "$status" -eq 0 ] @@ -46,4 +49,4 @@ load helpers retry_default \ must_match_in_prometheus_response localhost:1234 \ '[\{,]envoy_http_conn_manager_prefix="upstream_s2_http"[,}]' -} \ No newline at end of file +} diff --git a/test/integration/connect/envoy/case-dogstatsd-udp/verify.bats b/test/integration/connect/envoy/case-dogstatsd-udp/verify.bats index f8d7c06365..eb4ca3bad7 100644 --- a/test/integration/connect/envoy/case-dogstatsd-udp/verify.bats +++ b/test/integration/connect/envoy/case-dogstatsd-udp/verify.bats @@ -10,6 +10,10 @@ load helpers retry_default curl -f -s localhost:19001/stats -o /dev/null } +@test "s2 proxy should be healthy" { + assert_service_has_healthy_instances s2 1 +} + @test "s1 upstream should be able to connect to s2" { run retry_default curl -s -f -d hello localhost:5000 @@ -60,4 +64,4 @@ load helpers echo "INTERVAL = $INTERVAL" [ "$INTERVAL" == "1s" ] -} \ No newline at end of file +} diff --git a/test/integration/connect/envoy/case-grpc/verify.bats b/test/integration/connect/envoy/case-grpc/verify.bats index 33a0d60e3d..bccc27f238 100644 --- a/test/integration/connect/envoy/case-grpc/verify.bats +++ b/test/integration/connect/envoy/case-grpc/verify.bats @@ -10,6 +10,10 @@ load helpers retry_default curl -f -s localhost:19001/stats -o /dev/null } +@test "s2 proxy should be healthy" { + assert_service_has_healthy_instances s2 1 +} + @test "s1 upstream should be able to connect to s2 via grpc" { run fortio grpcping localhost:5000 diff --git a/test/integration/connect/envoy/case-http-badauthz/verify.bats b/test/integration/connect/envoy/case-http-badauthz/verify.bats index 9df381648b..597dfbb519 100644 --- a/test/integration/connect/envoy/case-http-badauthz/verify.bats +++ b/test/integration/connect/envoy/case-http-badauthz/verify.bats @@ -18,6 +18,10 @@ load helpers assert_proxy_presents_cert_uri localhost:21001 s2 } +@test "s2 proxy should be healthy" { + assert_service_has_healthy_instances s2 1 +} + @test "s1 upstream should NOT be able to connect to s2" { run retry_default must_fail_http_connection localhost:5000 diff --git a/test/integration/connect/envoy/case-http/verify.bats b/test/integration/connect/envoy/case-http/verify.bats index 55cbcd9409..2c7d2b8e45 100644 --- a/test/integration/connect/envoy/case-http/verify.bats +++ b/test/integration/connect/envoy/case-http/verify.bats @@ -12,13 +12,16 @@ load helpers @test "s1 proxy listener should be up and have right cert" { assert_proxy_presents_cert_uri localhost:21000 s1 - } @test "s2 proxy listener should be up and have right cert" { assert_proxy_presents_cert_uri localhost:21001 s2 } +@test "s2 proxy should be healthy" { + assert_service_has_healthy_instances s2 1 +} + @test "s1 upstream should be able to connect to s2 with http/1.1" { run retry_default curl --http1.1 -s -f -d hello localhost:5000 [ "$status" -eq 0 ] @@ -36,4 +39,4 @@ load helpers [ "$PUB" = "envoy.ext_authz,envoy.http_connection_manager" ] [ "$UPS" = "envoy.http_connection_manager" ] -} \ No newline at end of file +} diff --git a/test/integration/connect/envoy/case-http2/verify.bats b/test/integration/connect/envoy/case-http2/verify.bats index cc4cd3247f..00c1a47192 100644 --- a/test/integration/connect/envoy/case-http2/verify.bats +++ b/test/integration/connect/envoy/case-http2/verify.bats @@ -12,13 +12,16 @@ load helpers @test "s1 proxy listener should be up and have right cert" { assert_proxy_presents_cert_uri localhost:21000 s1 - } @test "s2 proxy listener should be up and have right cert" { assert_proxy_presents_cert_uri localhost:21001 s2 } +@test "s2 proxy should be healthy" { + assert_service_has_healthy_instances s2 1 +} + @test "s1 upstream should be able to connect to s2 via http2" { # We use grpc here because it's the easiest way to test http2. The server # needs to support h2c since the proxy doesn't talk TLS to the local app. @@ -28,4 +31,4 @@ load helpers echo "OUTPUT: $output" [ "$status" == 0 ] -} \ No newline at end of file +} diff --git a/test/integration/connect/envoy/case-prometheus/verify.bats b/test/integration/connect/envoy/case-prometheus/verify.bats index 9d8d8665f3..b076022441 100644 --- a/test/integration/connect/envoy/case-prometheus/verify.bats +++ b/test/integration/connect/envoy/case-prometheus/verify.bats @@ -12,13 +12,16 @@ load helpers @test "s1 proxy listener should be up and have right cert" { assert_proxy_presents_cert_uri localhost:21000 s1 - } @test "s2 proxy listener should be up and have right cert" { assert_proxy_presents_cert_uri localhost:21001 s2 } +@test "s2 proxy should be healthy" { + assert_service_has_healthy_instances s2 1 +} + @test "s1 upstream should be able to connect to s2 with http/1.1" { run retry_default curl --http1.1 -s -f -d hello localhost:5000 [ "$status" -eq 0 ] @@ -41,4 +44,4 @@ load helpers retry_default \ must_match_in_prometheus_response localhost:1234 \ '[\{,]envoy_http_conn_manager_prefix="public_listener_http"[,}]' -} \ No newline at end of file +} diff --git a/test/integration/connect/envoy/case-statsd-udp/verify.bats b/test/integration/connect/envoy/case-statsd-udp/verify.bats index 179efe4152..57d1445033 100644 --- a/test/integration/connect/envoy/case-statsd-udp/verify.bats +++ b/test/integration/connect/envoy/case-statsd-udp/verify.bats @@ -10,6 +10,10 @@ load helpers retry_default curl -f -s localhost:19001/stats -o /dev/null } +@test "s2 proxy should be healthy" { + assert_service_has_healthy_instances s2 1 +} + @test "s1 upstream should be able to connect to s2" { run retry_default curl -s -f -d hello localhost:5000 [ "$status" == 0 ] @@ -22,4 +26,4 @@ load helpers echo "OUTPUT: $output" [ "$status" == 0 ] -} \ No newline at end of file +} diff --git a/test/integration/connect/envoy/case-zipkin/verify.bats b/test/integration/connect/envoy/case-zipkin/verify.bats index 576c76ec93..88743051b6 100644 --- a/test/integration/connect/envoy/case-zipkin/verify.bats +++ b/test/integration/connect/envoy/case-zipkin/verify.bats @@ -12,13 +12,16 @@ load helpers @test "s1 proxy listener should be up and have right cert" { assert_proxy_presents_cert_uri localhost:21000 s1 - } @test "s2 proxy listener should be up and have right cert" { assert_proxy_presents_cert_uri localhost:21001 s2 } +@test "s2 proxy should be healthy" { + assert_service_has_healthy_instances s2 1 +} + @test "s1 upstream should be able to connect to s2" { run retry_default curl -s -f -d hello localhost:5000 [ "$status" == "0" ] @@ -41,4 +44,4 @@ load helpers # Get the trace from Jaeger. Won't bother parsing it just seeing it show up # there is enough to know that the tracing config worked. run retry_default curl -s -f "localhost:16686/api/traces/$TRACEID" -} \ No newline at end of file +} diff --git a/test/integration/connect/envoy/helpers.bash b/test/integration/connect/envoy/helpers.bash index 4f9eefbed7..1304575359 100755 --- a/test/integration/connect/envoy/helpers.bash +++ b/test/integration/connect/envoy/helpers.bash @@ -120,6 +120,30 @@ function snapshot_envoy_admin { docker_wget "http://${HOSTPORT}/clusters" -q -O - > "./workdir/envoy/${ENVOY_NAME}-clusters.out" } +function get_healthy_service_count { + local SERVICE_NAME=$1 + run retry_default curl -s -f 127.0.0.1:8500/v1/health/connect/${SERVICE_NAME}?passing + [ "$status" -eq 0 ] + echo "$output" | jq --raw-output '. | length' +} + +function health_service_count_matches { + local SERVICE_NAME=$1 + local EXPECT_COUNT=$2 + + GOT_COUNT=$(get_healthy_service_count $SERVICE_NAME) + + [ "$GOT_COUNT" -eq $EXPECT_COUNT ] +} + +function assert_service_has_healthy_instances { + local SERVICE_NAME=$1 + local EXPECT_COUNT=$2 + + run retry 10 2 health_service_count_matches $SERVICE_NAME $EXPECT_COUNT + [ "$status" -eq 0 ] +} + function docker_consul { docker run -ti --rm --network container:envoy_consul_1 consul-dev $@ }