Mark its own cluster as healthy when rebalancing. (#8406)

This code started as an optimization to avoid doing an RPC Ping to
itself. But in a single-server cluster, rebalancing was led to believe
that there were no healthy servers because foundHealthyServer was never
set. Now it is set properly.

Fixes #8401 and #8403.
Hans Hasselberg 2020-08-06 10:42:09 +02:00 committed by hashicorp-ci
parent 9d5f9c6bec
commit 1a351d53bb
2 changed files with 72 additions and 14 deletions
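For context, the bug is easiest to see in a condensed, runnable sketch of the pre-fix loop. The names serverName and foundHealthyServer come from the diff below; everything else is simplified for illustration and is not the literal source:

package main

import "fmt"

type server struct{ Name string }

// rebalanceOld condenses the pre-fix control flow: the self-check hits
// `continue`, so a single-server cluster exits the loop without ever
// recording a healthy server and the manager reports itself offline.
func rebalanceOld(serverName string, servers []server) (foundHealthyServer bool) {
	for i := 0; i < len(servers); i++ {
		srv := servers[0]
		if serverName != "" && srv.Name == serverName {
			continue // skips the ping, but also skips the "healthy" verdict
		}
		foundHealthyServer = true // stand-in for a successful Ping
		break
	}
	return foundHealthyServer
}

func main() {
	// Single-server cluster where the only server is the manager itself:
	fmt.Println(rebalanceOld("s1", []server{{Name: "s1"}})) // false -> wrongly marked offline
}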

agent/router/manager.go

@@ -321,6 +321,25 @@ func (m *Manager) NumServers() int {
 	return len(l.servers)
 }
 
+func (m *Manager) healthyServer(server *metadata.Server) bool {
+	// Check to see if the manager is trying to ping itself. This
+	// is a small optimization to avoid performing an unnecessary
+	// RPC call.
+	// If this is true, we know there are healthy servers for this
+	// manager and we don't need to continue.
+	if m.serverName != "" && server.Name == m.serverName {
+		return true
+	}
+	if ok, err := m.connPoolPinger.Ping(server.Datacenter, server.ShortName, server.Addr); !ok {
+		m.logger.Debug("pinging server failed",
+			"server", server.String(),
+			"error", err,
+		)
+		return false
+	}
+	return true
+}
+
 // RebalanceServers shuffles the list of servers on this metadata. The server
 // at the front of the list is selected for the next RPC. RPC calls that
 // fail for a particular server are rotated to the end of the list. This
@@ -345,24 +364,12 @@ func (m *Manager) RebalanceServers() {
 	// this loop mutates the server list in-place.
 	var foundHealthyServer bool
 	for i := 0; i < len(l.servers); i++ {
 		// Always test the first server. Failed servers are cycled
 		// while Serf detects the node has failed.
-		srv := l.servers[0]
-		// check to see if the manager is trying to ping itself,
-		// continue if that is the case.
-		if m.serverName != "" && srv.Name == m.serverName {
-			continue
-		}
-		ok, err := m.connPoolPinger.Ping(srv.Datacenter, srv.ShortName, srv.Addr)
-		if ok {
+		if m.healthyServer(l.servers[0]) {
 			foundHealthyServer = true
 			break
 		}
-		m.logger.Debug("pinging server failed",
-			"server", srv.String(),
-			"error", err,
-		)
 		l.servers = l.cycleServer()
 	}
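The loop above leans on cycleServer, which is not part of this diff: a failed front server is rotated to the back so the next pass pings a different candidate. A minimal sketch of that rotation, reconstructed here for context and assuming serverList holds a servers slice (the in-tree version may differ in detail):

// Sketch, not part of this commit: rotate the front (just-failed) server
// to the end of the list, leaving single-element lists untouched.
// Assumes: type serverList struct{ servers []*metadata.Server }
func (l serverList) cycleServer() []*metadata.Server {
	numServers := len(l.servers)
	if numServers < 2 {
		return l.servers // nothing to rotate
	}
	newServers := make([]*metadata.Server, 0, numServers)
	newServers = append(newServers, l.servers[1:]...)
	newServers = append(newServers, l.servers[0])
	return newServers
}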

agent/router/manager_internal_test.go

@@ -10,6 +10,7 @@ import (
 	"github.com/hashicorp/consul/agent/metadata"
 	"github.com/hashicorp/go-hclog"
+	"github.com/stretchr/testify/require"
 )
 
 var (
@@ -350,3 +351,53 @@ func TestManagerInternal_saveServerList(t *testing.T) {
 		}
 	}()
 }
+
+func TestManager_healthyServer(t *testing.T) {
+	t.Run("checking itself", func(t *testing.T) {
+		m := testManager()
+		m.serverName = "s1"
+		server := metadata.Server{Name: m.serverName}
+		require.True(t, m.healthyServer(&server))
+	})
+	t.Run("checking another server with successful ping", func(t *testing.T) {
+		m := testManager()
+		server := metadata.Server{Name: "s1"}
+		require.True(t, m.healthyServer(&server))
+	})
+	t.Run("checking another server with failed ping", func(t *testing.T) {
+		m := testManagerFailProb(1)
+		server := metadata.Server{Name: "s1"}
+		require.False(t, m.healthyServer(&server))
+	})
+}
+
+func TestManager_Rebalance(t *testing.T) {
+	t.Run("single server cluster checking itself", func(t *testing.T) {
+		m := testManager()
+		m.serverName = "s1"
+		m.AddServer(&metadata.Server{Name: m.serverName})
+		m.RebalanceServers()
+		require.False(t, m.IsOffline())
+	})
+	t.Run("multi server cluster is unhealthy when pings always fail", func(t *testing.T) {
+		m := testManagerFailProb(1)
+		m.AddServer(&metadata.Server{Name: "s1"})
+		m.AddServer(&metadata.Server{Name: "s2"})
+		m.AddServer(&metadata.Server{Name: "s3"})
+		for i := 0; i < 100; i++ {
+			m.RebalanceServers()
+			require.True(t, m.IsOffline())
+		}
+	})
+	t.Run("multi server cluster checking itself remains healthy despite pings always failing", func(t *testing.T) {
+		m := testManagerFailProb(1)
+		m.serverName = "s1"
+		m.AddServer(&metadata.Server{Name: m.serverName})
+		m.AddServer(&metadata.Server{Name: "s2"})
+		m.AddServer(&metadata.Server{Name: "s3"})
+		for i := 0; i < 100; i++ {
+			m.RebalanceServers()
+			require.False(t, m.IsOffline())
+		}
+	})
+}
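The new tests rely on the testManager and testManagerFailProb helpers defined elsewhere in this file, which wire a fake pinger into the Manager. A sketch of what such a fake plausibly looks like, reconstructed for context (the type name and details are assumptions, not part of this commit); its Ping signature mirrors the three-argument call in the diff above:

import (
	"fmt"
	"math/rand"
	"net"
)

// fauxConnPool is an assumed stand-in for the Pinger the test helpers
// use: it fails a configurable fraction of pings, so a failPct of 1
// (as in testManagerFailProb(1)) makes every Ping fail.
type fauxConnPool struct {
	failPct float64 // 0 == never fail, 1 == always fail
}

func (cp *fauxConnPool) Ping(dc string, nodeName string, addr net.Addr) (bool, error) {
	if rand.Float64() > cp.failPct {
		return true, nil
	}
	return false, fmt.Errorf("bad server")
}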