mirror of https://github.com/status-im/consul.git
Mark its own cluster as healthy when rebalancing. (#8406)
This code started as an optimization to avoid doing an RPC Ping to itself. But in a single-server cluster, rebalancing was led to believe that there were no healthy servers because foundHealthyServer was never set. It is now set properly. Fixes #8401 and #8403.
This commit is contained in:
parent 9d5f9c6bec
commit 1a351d53bb
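For context, the pre-fix control flow can be reduced to the following minimal, standalone sketch (simplified types, no real RPC; this is not the actual Manager code). With only the local server in the list, the self-check hits continue without marking anything healthy, so foundHealthyServer stays false and the manager reports the cluster as offline:

// Minimal standalone sketch of the pre-fix loop (not the actual Manager code;
// the real loop also pings remote servers over the connection pool).
package main

import "fmt"

type server struct{ Name string }

func main() {
	const localName = "s1"                  // this manager's own server name
	servers := []*server{{Name: localName}} // single-server cluster: only ourselves

	var foundHealthyServer bool
	for i := 0; i < len(servers); i++ {
		srv := servers[0]
		// Pre-fix behavior: skip pinging ourselves, but also skip
		// marking the cluster healthy.
		if srv.Name == localName {
			continue
		}
		// (a successful ping of a remote server would land here)
		foundHealthyServer = true
		break
	}

	// With the fix, the self-check counts as healthy instead of continuing,
	// so foundHealthyServer becomes true and the manager stays online.
	fmt.Println("foundHealthyServer:", foundHealthyServer) // prints: false
}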
@@ -321,6 +321,25 @@ func (m *Manager) NumServers() int {
 	return len(l.servers)
 }
 
+func (m *Manager) healthyServer(server *metadata.Server) bool {
+	// Check to see if the manager is trying to ping itself. This
+	// is a small optimization to avoid performing an unnecessary
+	// RPC call.
+	// If this is true, we know there are healthy servers for this
+	// manager and we don't need to continue.
+	if m.serverName != "" && server.Name == m.serverName {
+		return true
+	}
+	if ok, err := m.connPoolPinger.Ping(server.Datacenter, server.ShortName, server.Addr); !ok {
+		m.logger.Debug("pinging server failed",
+			"server", server.String(),
+			"error", err,
+		)
+		return false
+	}
+	return true
+}
+
 // RebalanceServers shuffles the list of servers on this metadata. The server
 // at the front of the list is selected for the next RPC. RPC calls that
 // fail for a particular server are rotated to the end of the list. This
@@ -345,24 +364,12 @@ func (m *Manager) RebalanceServers() {
 	// this loop mutates the server list in-place.
 	var foundHealthyServer bool
 	for i := 0; i < len(l.servers); i++ {
 		// Always test the first server. Failed servers are cycled
 		// while Serf detects the node has failed.
-		srv := l.servers[0]
-
-		// check to see if the manager is trying to ping itself,
-		// continue if that is the case.
-		if m.serverName != "" && srv.Name == m.serverName {
-			continue
-		}
-		ok, err := m.connPoolPinger.Ping(srv.Datacenter, srv.ShortName, srv.Addr)
-		if ok {
+		if m.healthyServer(l.servers[0]) {
 			foundHealthyServer = true
 			break
 		}
-		m.logger.Debug("pinging server failed",
-			"server", srv.String(),
-			"error", err,
-		)
 		l.servers = l.cycleServer()
 	}
 
@@ -10,6 +10,7 @@ import (
 
 	"github.com/hashicorp/consul/agent/metadata"
 	"github.com/hashicorp/go-hclog"
+	"github.com/stretchr/testify/require"
 )
 
 var (
@@ -350,3 +351,53 @@ func TestManagerInternal_saveServerList(t *testing.T) {
 		}
 	}()
 }
+
+func TestManager_healthyServer(t *testing.T) {
+	t.Run("checking itself", func(t *testing.T) {
+		m := testManager()
+		m.serverName = "s1"
+		server := metadata.Server{Name: m.serverName}
+		require.True(t, m.healthyServer(&server))
+	})
+	t.Run("checking another server with successful ping", func(t *testing.T) {
+		m := testManager()
+		server := metadata.Server{Name: "s1"}
+		require.True(t, m.healthyServer(&server))
+	})
+	t.Run("checking another server with failed ping", func(t *testing.T) {
+		m := testManagerFailProb(1)
+		server := metadata.Server{Name: "s1"}
+		require.False(t, m.healthyServer(&server))
+	})
+}
+
+func TestManager_Rebalance(t *testing.T) {
+	t.Run("single server cluster checking itself", func(t *testing.T) {
+		m := testManager()
+		m.serverName = "s1"
+		m.AddServer(&metadata.Server{Name: m.serverName})
+		m.RebalanceServers()
+		require.False(t, m.IsOffline())
+	})
+	t.Run("multi server cluster is unhealthy when pings always fail", func(t *testing.T) {
+		m := testManagerFailProb(1)
+		m.AddServer(&metadata.Server{Name: "s1"})
+		m.AddServer(&metadata.Server{Name: "s2"})
+		m.AddServer(&metadata.Server{Name: "s3"})
+		for i := 0; i < 100; i++ {
+			m.RebalanceServers()
+			require.True(t, m.IsOffline())
+		}
+	})
+	t.Run("multi server cluster checking itself remains healthy despite pings always fail", func(t *testing.T) {
+		m := testManagerFailProb(1)
+		m.serverName = "s1"
+		m.AddServer(&metadata.Server{Name: m.serverName})
+		m.AddServer(&metadata.Server{Name: "s2"})
+		m.AddServer(&metadata.Server{Name: "s3"})
+		for i := 0; i < 100; i++ {
+			m.RebalanceServers()
+			require.False(t, m.IsOffline())
+		}
+	})
+}
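The new tests can be run locally with standard Go tooling; assuming the touched files live under agent/router (the package that holds Manager in recent Consul trees), something like:

	go test ./agent/router -run 'TestManager_healthyServer|TestManager_Rebalance' -v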