From 1a351d53bbf02b4dd8b309d9bd0dbdd11007d585 Mon Sep 17 00:00:00 2001 From: Hans Hasselberg Date: Thu, 6 Aug 2020 10:42:09 +0200 Subject: [PATCH] Mark its own cluster as healthy when rebalancing. (#8406) This code started as an optimization to avoid doing an RPC Ping to itself. But in a single server cluster the rebalancing was led to believe that there were no healthy servers because foundHealthyServer was not set. Now this is being set properly. Fixes #8401 and #8403. --- agent/router/manager.go | 35 ++++++++++-------- agent/router/manager_internal_test.go | 51 +++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 14 deletions(-) diff --git a/agent/router/manager.go b/agent/router/manager.go index 470064945a..1e42d36e79 100644 --- a/agent/router/manager.go +++ b/agent/router/manager.go @@ -321,6 +321,25 @@ func (m *Manager) NumServers() int { return len(l.servers) } +func (m *Manager) healthyServer(server *metadata.Server) bool { + // Check to see if the manager is trying to ping itself. This + // is a small optimization to avoid performing an unnecessary + // RPC call. + // If this is true, we know there are healthy servers for this + // manager and we don't need to continue. + if m.serverName != "" && server.Name == m.serverName { + return true + } + if ok, err := m.connPoolPinger.Ping(server.Datacenter, server.ShortName, server.Addr); !ok { + m.logger.Debug("pinging server failed", + "server", server.String(), + "error", err, + ) + return false + } + return true +} + // RebalanceServers shuffles the list of servers on this metadata. The server // at the front of the list is selected for the next RPC. RPC calls that // fail for a particular server are rotated to the end of the list. This @@ -345,24 +364,12 @@ func (m *Manager) RebalanceServers() { // this loop mutates the server list in-place. var foundHealthyServer bool for i := 0; i < len(l.servers); i++ { - // Always test the first server. Failed servers are cycled + // Always test the first server. Failed servers are cycled // while Serf detects the node has failed. - srv := l.servers[0] - - // check to see if the manager is trying to ping itself, - // continue if that is the case. - if m.serverName != "" && srv.Name == m.serverName { - continue - } - ok, err := m.connPoolPinger.Ping(srv.Datacenter, srv.ShortName, srv.Addr) - if ok { + if m.healthyServer(l.servers[0]) { foundHealthyServer = true break } - m.logger.Debug("pinging server failed", - "server", srv.String(), - "error", err, - ) l.servers = l.cycleServer() } diff --git a/agent/router/manager_internal_test.go b/agent/router/manager_internal_test.go index 10f39fbf9c..76d9512168 100644 --- a/agent/router/manager_internal_test.go +++ b/agent/router/manager_internal_test.go @@ -10,6 +10,7 @@ import ( "github.com/hashicorp/consul/agent/metadata" "github.com/hashicorp/go-hclog" + "github.com/stretchr/testify/require" ) var ( @@ -350,3 +351,53 @@ func TestManagerInternal_saveServerList(t *testing.T) { } }() } + +func TestManager_healthyServer(t *testing.T) { + t.Run("checking itself", func(t *testing.T) { + m := testManager() + m.serverName = "s1" + server := metadata.Server{Name: m.serverName} + require.True(t, m.healthyServer(&server)) + }) + t.Run("checking another server with successful ping", func(t *testing.T) { + m := testManager() + server := metadata.Server{Name: "s1"} + require.True(t, m.healthyServer(&server)) + }) + t.Run("checking another server with failed ping", func(t *testing.T) { + m := testManagerFailProb(1) + server := metadata.Server{Name: "s1"} + require.False(t, m.healthyServer(&server)) + }) +} + +func TestManager_Rebalance(t *testing.T) { + t.Run("single server cluster checking itself", func(t *testing.T) { + m := testManager() + m.serverName = "s1" + m.AddServer(&metadata.Server{Name: m.serverName}) + m.RebalanceServers() + require.False(t, m.IsOffline()) + }) + t.Run("multi server cluster is unhealthy when pings always fail", func(t *testing.T) { + m := testManagerFailProb(1) + m.AddServer(&metadata.Server{Name: "s1"}) + m.AddServer(&metadata.Server{Name: "s2"}) + m.AddServer(&metadata.Server{Name: "s3"}) + for i := 0; i < 100; i++ { + m.RebalanceServers() + require.True(t, m.IsOffline()) + } + }) + t.Run("multi server cluster checking itself remains healthy despite pings always fail", func(t *testing.T) { + m := testManagerFailProb(1) + m.serverName = "s1" + m.AddServer(&metadata.Server{Name: m.serverName}) + m.AddServer(&metadata.Server{Name: "s2"}) + m.AddServer(&metadata.Server{Name: "s3"}) + for i := 0; i < 100; i++ { + m.RebalanceServers() + require.False(t, m.IsOffline()) + } + }) +}