Address flakiness in command/exec tests (#4517)

* Add fn to wait for TestAgent node and check registration * Add waits for TestAgent and retries before timeouts in exec_test
2018-08-10 15:04:07 -04:00 · 2018-08-10 15:04:07 -04:00 · e305443db4
parent a03512496f
commit e305443db4
2 changed files with 92 additions and 37 deletions
--- a/command/exec/exec_test.go
+++ b/command/exec/exec_test.go
@ -5,6 +5,8 @@ import (
 	"testing"
 	"time"

+	"github.com/hashicorp/consul/testrpc"
+
 	"github.com/hashicorp/consul/agent"
 	consulapi "github.com/hashicorp/consul/api"
 	"github.com/hashicorp/consul/testutil/retry"
@ -25,6 +27,8 @@ func TestExecCommand(t *testing.T) {
 	`)
 	defer a.Shutdown()

+	testrpc.WaitForTestAgent(t, a.RPC, "dc1")
+
 	ui := cli.NewMockUi()
 	c := New(ui, nil)
 	args := []string{"-http-addr=" + a.HTTPAddr(), "-wait=1s", "uptime"}
@ -46,6 +50,8 @@ func TestExecCommand_NoShell(t *testing.T) {
 	`)
 	defer a.Shutdown()

+	testrpc.WaitForTestAgent(t, a.RPC, "dc1")
+
 	ui := cli.NewMockUi()
 	c := New(ui, nil)
 	args := []string{"-http-addr=" + a.HTTPAddr(), "-shell=false", "-wait=1s", "uptime"}
@ -67,26 +73,28 @@ func TestExecCommand_CrossDC(t *testing.T) {
 	`)
 	defer a1.Shutdown()

+	testrpc.WaitForTestAgent(t, a1.RPC, "dc1")
+
 	a2 := agent.NewTestAgent(t.Name(), `
 		datacenter = "dc2"
 		disable_remote_exec = false
 	`)
 	defer a2.Shutdown()

+	testrpc.WaitForTestAgent(t, a2.RPC, "dc2")
+
 	// Join over the WAN
 	_, err := a2.JoinWAN([]string{a1.Config.SerfBindAddrWAN.String()})
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}

-	retry.Run(t, func(r *retry.R) {
 	if got, want := len(a1.WANMembers()), 2; got != want {
-			r.Fatalf("got %d WAN members on a1 want %d", got, want)
+		t.Fatalf("got %d WAN members on a1 want %d", got, want)
 	}
 	if got, want := len(a2.WANMembers()), 2; got != want {
-			r.Fatalf("got %d WAN members on a2 want %d", got, want)
+		t.Fatalf("got %d WAN members on a2 want %d", got, want)
 	}
-	})

 	ui := cli.NewMockUi()
 	c := New(ui, nil)
@ -145,6 +153,8 @@ func TestExecCommand_Sessions(t *testing.T) {
 	`)
 	defer a.Shutdown()

+	testrpc.WaitForTestAgent(t, a.RPC, "dc1")
+
 	ui := cli.NewMockUi()
 	c := New(ui, nil)
 	c.apiclient = a.Client()
@ -233,6 +243,8 @@ func TestExecCommand_UploadDestroy(t *testing.T) {
 	`)
 	defer a.Shutdown()

+	testrpc.WaitForTestAgent(t, a.RPC, "dc1")
+
 	ui := cli.NewMockUi()

 	c := New(ui, nil)
@ -288,6 +300,8 @@ func TestExecCommand_StreamResults(t *testing.T) {
 	`)
 	defer a.Shutdown()

+	testrpc.WaitForTestAgent(t, a.RPC, "dc1")
+
 	ui := cli.NewMockUi()
 	c := New(ui, nil)
 	c.apiclient = a.Client()
@ -320,14 +334,16 @@ func TestExecCommand_StreamResults(t *testing.T) {
 		t.Fatalf("should be ok bro")
 	}

+	retry.Run(t, func(r *retry.R) {
 		select {
 		case a := <-ackCh:
 			if a.Node != "foo" {
-			t.Fatalf("bad: %#v", a)
+				r.Fatalf("bad: %#v", a)
 			}
 		case <-time.After(50 * time.Millisecond):
-		t.Fatalf("timeout")
+			r.Fatalf("timeout")
 		}
+	})

 	ok, _, err = a.Client().KV().Acquire(&consulapi.KVPair{
 		Key:     prefix + "foo/exit",
@ -341,14 +357,16 @@ func TestExecCommand_StreamResults(t *testing.T) {
 		t.Fatalf("should be ok bro")
 	}

+	retry.Run(t, func(r *retry.R) {
 		select {
 		case e := <-exitCh:
 			if e.Node != "foo" || e.Code != 127 {
-			t.Fatalf("bad: %#v", e)
+				r.Fatalf("bad: %#v", e)
 			}
 		case <-time.After(50 * time.Millisecond):
-		t.Fatalf("timeout")
+			r.Fatalf("timeout")
 		}
+	})

 	// Random key, should ignore
 	ok, _, err = a.Client().KV().Acquire(&consulapi.KVPair{
@ -374,14 +392,16 @@ func TestExecCommand_StreamResults(t *testing.T) {
 		t.Fatalf("should be ok bro")
 	}

+	retry.Run(t, func(r *retry.R) {
 		select {
 		case h := <-heartCh:
 			if h.Node != "foo" {
-			t.Fatalf("bad: %#v", h)
+				r.Fatalf("bad: %#v", h)
 			}
 		case <-time.After(50 * time.Millisecond):
-		t.Fatalf("timeout")
+			r.Fatalf("timeout")
 		}
+	})

 	// Output value
 	ok, _, err = a.Client().KV().Acquire(&consulapi.KVPair{
@ -396,12 +416,14 @@ func TestExecCommand_StreamResults(t *testing.T) {
 		t.Fatalf("should be ok bro")
 	}

+	retry.Run(t, func(r *retry.R) {
 		select {
 		case o := <-outputCh:
 			if o.Node != "foo" || string(o.Output) != "test" {
-			t.Fatalf("bad: %#v", o)
+				r.Fatalf("bad: %#v", o)
 			}
 		case <-time.After(50 * time.Millisecond):
-		t.Fatalf("timeout")
+			r.Fatalf("timeout")
 		}
+	})
 }
--- a/testrpc/wait.go
+++ b/testrpc/wait.go
@ -9,10 +9,10 @@ import (

 type rpcFn func(string, interface{}, interface{}) error

+// WaitForLeader ensures we have a leader and a node registration.
 func WaitForLeader(t *testing.T, rpc rpcFn, dc string) {
 	var out structs.IndexedNodes
 	retry.Run(t, func(r *retry.R) {
-		// Ensure we have a leader and a node registration.
 		args := &structs.DCSpecificRequest{Datacenter: dc}
 		if err := rpc("Catalog.ListNodes", args, &out); err != nil {
 			r.Fatalf("Catalog.ListNodes failed: %v", err)
@ -25,3 +25,36 @@ func WaitForLeader(t *testing.T, rpc rpcFn, dc string) {
 		}
 	})
 }
+
+// WaitForTestAgent ensures we have a node with serfHealth check registered
+func WaitForTestAgent(t *testing.T, rpc rpcFn, dc string) {
+	var nodes structs.IndexedNodes
+	var checks structs.IndexedHealthChecks
+
+	retry.Run(t, func(r *retry.R) {
+		dcReq := &structs.DCSpecificRequest{Datacenter: dc}
+		if err := rpc("Catalog.ListNodes", dcReq, &nodes); err != nil {
+			r.Fatalf("Catalog.ListNodes failed: %v", err)
+		}
+		if len(nodes.Nodes) == 0 {
+			r.Fatalf("No registered nodes")
+		}
+
+		// This assumes that there is a single agent per dc, typically a TestAgent
+		nodeReq := &structs.NodeSpecificRequest{Datacenter: dc, Node: nodes.Nodes[0].Node}
+		if err := rpc("Health.NodeChecks", nodeReq, &checks); err != nil {
+			r.Fatalf("Health.NodeChecks failed: %v", err)
+		}
+
+		var found bool
+		for _, check := range checks.HealthChecks {
+			if check.CheckID == "serfHealth" {
+				found = true
+				break
+			}
+		}
+		if !found {
+			r.Fatalf("serfHealth check not found")
+		}
+	})
+}