autopilot: fix dead server removal condition to use correct failure tolerance (#4017)
* Make dead server removal condition in autopilot use correct failure tolerance rules
* Introduce func with explanation
Parent: 4f5d5020b8
Commit: c47dbffe1c
@@ -174,6 +174,20 @@ func (a *Autopilot) RemoveDeadServers() {
     }
 }
 
+func canRemoveServers(peers, minQuorum, deadServers int) (bool, string) {
+    if peers-deadServers < int(minQuorum) {
+        return false, fmt.Sprintf("denied, because removing %d/%d servers would leave less then minimal allowed quorum of %d servers", deadServers, peers, minQuorum)
+    }
+
+    // Only do removals if a minority of servers will be affected.
+    // For failure tolerance of F we need n = 2F+1 servers.
+    // This means we can safely remove up to (n-1)/2 servers.
+    if deadServers > (peers-1)/2 {
+        return false, fmt.Sprintf("denied, because removing the majority of servers %d/%d is not safe", deadServers, peers)
+    }
+    return true, fmt.Sprintf("allowed, because removing %d/%d servers leaves a majority of servers above the minimal allowed quorum %d", deadServers, peers, minQuorum)
+}
+
 // pruneDeadServers removes up to numPeers/2 failed servers
 func (a *Autopilot) pruneDeadServers() error {
     conf := a.delegate.AutopilotConfig()
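The comment in canRemoveServers carries the core reasoning of the fix: a cluster of n = 2F+1 servers tolerates F failures, so at most (n-1)/2 servers can be removed safely, and never so many that the configured minimum quorum is breached. The standalone Go sketch below is illustrative only (the canRemove helper, the main wrapper, and the minQuorum value of 3 are assumptions, not part of this commit); it restates the two checks and prints which removals they would permit for a few cluster sizes.

package main

import "fmt"

// canRemove restates, for illustration, the two checks made by canRemoveServers:
// keep at least minQuorum servers, and never remove more than (n-1)/2 of them,
// because n = 2F+1 servers tolerate only F failures.
func canRemove(peers, minQuorum, deadServers int) bool {
    if peers-deadServers < minQuorum {
        return false // would leave fewer servers than the minimum quorum
    }
    if deadServers > (peers-1)/2 {
        return false // would exceed the cluster's failure tolerance
    }
    return true
}

func main() {
    // A few cluster sizes with minQuorum = 3, mirroring the arithmetic above.
    for _, tc := range []struct{ peers, dead int }{
        {3, 1}, {5, 1}, {5, 2}, {5, 3}, {9, 4}, {9, 5},
    } {
        fmt.Printf("peers=%d dead=%d allowed=%v\n", tc.peers, tc.dead, canRemove(tc.peers, 3, tc.dead))
    }
}

With minQuorum = 3 this prints allowed=true only for (5,1), (5,2), and (9,4), which is consistent with the table-driven test added later in this commit.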
@@ -226,15 +240,18 @@ func (a *Autopilot) pruneDeadServers() error {
         }
     }
 
-    // We can bail early if there's nothing to do.
-    removalCount := len(failed) + len(staleRaftServers)
-    if removalCount == 0 {
-        return nil
-    }
-
-    // Only do removals if a minority of servers will be affected.
-    peers := NumPeers(raftConfig)
-    if peers-removalCount >= int(conf.MinQuorum) && removalCount < peers/2 {
+    deadServers := len(failed) + len(staleRaftServers)
+
+    // nothing to do
+    if deadServers == 0 {
+        return nil
+    }
+
+    if ok, msg := canRemoveServers(NumPeers(raftConfig), int(conf.MinQuorum), deadServers); !ok {
+        a.logger.Printf("[DEBUG] autopilot: Failed to remove dead servers: %s.", msg)
+        return nil
+    }
+
     for _, node := range failed {
         a.logger.Printf("[INFO] autopilot: Attempting removal of failed server node %q", node.Name)
         go serfLAN.RemoveFailedNode(node.Name)
@@ -260,9 +277,6 @@ func (a *Autopilot) pruneDeadServers() error {
             return err
         }
     }
-    } else {
-        a.logger.Printf("[DEBUG] autopilot: Failed to remove dead servers: too many dead servers: %d/%d", removalCount, peers)
-    }
 
     return nil
 }
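The behavioral change is easiest to see on a five-server cluster with two failed servers and MinQuorum set to 3. The old gate required removalCount < peers/2; with integer division 5/2 = 2, the check 2 < 2 fails, so autopilot left both dead servers in place even though a cluster of n = 2F+1 = 5 servers tolerates F = 2 failures. The new canRemoveServers gate only denies removal when deadServers > (peers-1)/2 = 2 or when the survivors (5-2 = 3) would fall below MinQuorum, so reaping both dead servers is now allowed.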
@@ -6,6 +6,7 @@ import (
     "testing"
 
     "github.com/hashicorp/serf/serf"
+    "github.com/stretchr/testify/require"
 )
 
 func TestMinRaftProtocol(t *testing.T) {
@@ -84,3 +85,27 @@ func TestMinRaftProtocol(t *testing.T) {
         }
     }
 }
+
+func TestAutopilot_canRemoveServers(t *testing.T) {
+    type test struct {
+        peers       int
+        minQuorum   int
+        deadServers int
+        ok          bool
+    }
+
+    tests := []test{
+        {1, 1, 1, false},
+        {3, 3, 1, false},
+        {4, 3, 3, false},
+        {5, 3, 3, false},
+        {5, 3, 2, true},
+        {5, 3, 1, true},
+        {9, 3, 5, false},
+    }
+    for _, test := range tests {
+        ok, msg := canRemoveServers(test.peers, test.minQuorum, test.deadServers)
+        require.Equal(t, test.ok, ok)
+        t.Logf("%+v: %s", test, msg)
+    }
+}
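The table rows cover both denial branches: {1, 1, 1}, {3, 3, 1}, {4, 3, 3}, and {5, 3, 3} are rejected by the quorum check (the survivors would number fewer than minQuorum), {9, 3, 5} is rejected by the failure-tolerance check (5 > (9-1)/2 = 4), and {5, 3, 2} and {5, 3, 1} satisfy both limits and are allowed.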
@@ -9,6 +9,7 @@ import (
     "github.com/hashicorp/consul/testrpc"
     "github.com/hashicorp/raft"
     "github.com/hashicorp/serf/serf"
+    "github.com/stretchr/testify/require"
 )
 
 func TestAutopilot_IdempotentShutdown(t *testing.T) {
@@ -37,7 +38,7 @@ func testCleanupDeadServer(t *testing.T, raftVersion int) {
     conf := func(c *Config) {
         c.Datacenter = dc
         c.Bootstrap = false
-        c.BootstrapExpect = 3
+        c.BootstrapExpect = 5
         c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(raftVersion)
     }
     dir1, s1 := testServerWithConfig(t, conf)
@@ -52,44 +53,69 @@ func testCleanupDeadServer(t *testing.T, raftVersion int) {
     defer os.RemoveAll(dir3)
     defer s3.Shutdown()
 
-    servers := []*Server{s1, s2, s3}
-
-    // Try to join
-    joinLAN(t, s2, s1)
-    joinLAN(t, s3, s1)
-
-    for _, s := range servers {
-        testrpc.WaitForLeader(t, s.RPC, dc)
-        retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
-    }
-
-    // Bring up a new server
     dir4, s4 := testServerWithConfig(t, conf)
     defer os.RemoveAll(dir4)
     defer s4.Shutdown()
 
-    // Kill a non-leader server
-    s3.Shutdown()
+    dir5, s5 := testServerWithConfig(t, conf)
+    defer os.RemoveAll(dir5)
+    defer s5.Shutdown()
+
+    servers := []*Server{s1, s2, s3, s4, s5}
+
+    // Try to join
+    joinLAN(t, s2, s1)
+    joinLAN(t, s3, s1)
+    joinLAN(t, s4, s1)
+    joinLAN(t, s5, s1)
+
+    for _, s := range servers {
+        testrpc.WaitForLeader(t, s.RPC, dc)
+        retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 5)) })
+    }
+
+    require := require.New(t)
+    testrpc.WaitForLeader(t, s1.RPC, "dc1")
+    leaderIndex := -1
+    for i, s := range servers {
+        if s.IsLeader() {
+            leaderIndex = i
+            break
+        }
+    }
+    require.NotEqual(leaderIndex, -1)
+
+    // Shutdown two non-leader servers
+    killed := make(map[string]struct{})
+    for i, s := range servers {
+        if i != leaderIndex {
+            s.Shutdown()
+            killed[string(s.config.NodeID)] = struct{}{}
+        }
+        if len(killed) == 2 {
+            break
+        }
+    }
+
     retry.Run(t, func(r *retry.R) {
         alive := 0
-        for _, m := range s1.LANMembers() {
+        for _, m := range servers[leaderIndex].LANMembers() {
             if m.Status == serf.StatusAlive {
                 alive++
             }
         }
-        if alive != 2 {
-            r.Fatal(nil)
+        if alive != 3 {
+            r.Fatalf("Expected three alive servers instead of %d", alive)
         }
     })
 
-    // Join the new server
-    joinLAN(t, s4, s1)
-    servers[2] = s4
-
-    // Make sure the dead server is removed and we're back to 3 total peers
+    // Make sure the dead servers are removed and we're back to 3 total peers
     for _, s := range servers {
-        retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
+        _, killed := killed[string(s.config.NodeID)]
+        if !killed {
+            retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
+        }
     }
 }
 
 func TestAutopilot_CleanupDeadNonvoter(t *testing.T) {
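The rewritten test now builds a five-server cluster (BootstrapExpect = 5) and shuts down two non-leader servers, which is exactly the failure tolerance of that cluster size ((5-1)/2 = 2). Under the old removalCount < peers/2 condition (2 < 2 is false) neither dead server would have been reaped, so the retry loops asserting three alive LAN members and three remaining peers only pass with the corrected canRemoveServers check.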