diff --git a/consul/config.go b/consul/config.go index a11f73f454..e07500f5c7 100644 --- a/consul/config.go +++ b/consul/config.go @@ -274,6 +274,10 @@ type Config struct { // This period is meant to be long enough for a leader election to take // place, and a small jitter is applied to avoid a thundering herd. RPCHoldTimeout time.Duration + + // AutopilotServerCleanup controls whether to remove dead servers when a new + // server is added to the Raft peers + AutopilotServerCleanup bool } // CheckVersion is used to check if the ProtocolVersion is valid @@ -346,6 +350,8 @@ func DefaultConfig() *Config { RPCHoldTimeout: 7 * time.Second, TLSMinVersion: "tls10", + + AutopilotServerCleanup: true, } // Increase our reap interval to 3 days instead of 24h. diff --git a/consul/leader.go b/consul/leader.go index 5cfcec4632..fc0a32bf28 100644 --- a/consul/leader.go +++ b/consul/leader.go @@ -567,6 +567,20 @@ func (s *Server) joinConsulServer(m serf.Member, parts *agent.Server) error { s.logger.Printf("[ERR] consul: failed to add raft peer: %v", err) return err } + + // Look for dead servers to clean up + if s.config.AutopilotServerCleanup { + for _, member := range s.serfLAN.Members() { + valid, _ := agent.IsConsulServer(member) + if valid && member.Name != m.Name && member.Status == serf.StatusFailed { + if err := s.handleDeregisterMember("Removing failed server", member); err != nil { + return fmt.Errorf("[ERROR] consul: Couldn't deregister failed server (%s): %v", member.Name, err) + } + s.logger.Printf("[INFO] consul: Removed failed server: %v", member.Name) + } + } + } + return nil } diff --git a/consul/leader_test.go b/consul/leader_test.go index 329bf41a35..b85acee058 100644 --- a/consul/leader_test.go +++ b/consul/leader_test.go @@ -622,3 +622,72 @@ func TestLeader_ReapTombstones(t *testing.T) { t.Fatalf("err: %v", err) }) } + +func TestLeader_DeadServerCleanup(t *testing.T) { + dir1, s1 := testServerDCExpect(t, "dc1", 3) + defer os.RemoveAll(dir1) + defer s1.Shutdown() + + dir2, s2 := testServerDCExpect(t, "dc1", 3) + defer os.RemoveAll(dir2) + defer s2.Shutdown() + + dir3, s3 := testServerDCExpect(t, "dc1", 3) + defer os.RemoveAll(dir3) + defer s3.Shutdown() + + servers := []*Server{s1, s2, s3} + + // Try to join + addr := fmt.Sprintf("127.0.0.1:%d", + s1.config.SerfLANConfig.MemberlistConfig.BindPort) + if _, err := s2.JoinLAN([]string{addr}); err != nil { + t.Fatalf("err: %v", err) + } + if _, err := s3.JoinLAN([]string{addr}); err != nil { + t.Fatalf("err: %v", err) + } + + for _, s := range servers { + testutil.WaitForResult(func() (bool, error) { + peers, _ := s.numPeers() + return peers == 3, nil + }, func(err error) { + t.Fatalf("should have 3 peers") + }) + } + + // Kill a non-leader server (s2 or s3, so s4 can still join s1) + var nonLeader *Server + var removedIndex int + for i, s := range servers { + if !s.IsLeader() && i > 0 { + nonLeader = s + removedIndex = i + break + } + } + nonLeader.Shutdown() + + time.Sleep(1 * time.Second) + + // Bring up and join a new server + dir4, s4 := testServerDCExpect(t, "dc1", 3) + defer os.RemoveAll(dir4) + defer s4.Shutdown() + + if _, err := s4.JoinLAN([]string{addr}); err != nil { + t.Fatalf("err: %v", err) + } + + // Make sure the dead server is removed and we're back to 3 total peers + servers[removedIndex] = s4 + for _, s := range servers { + testutil.WaitForResult(func() (bool, error) { + peers, _ := s.numPeers() + return peers == 3, nil + }, func(err error) { + t.Fatalf("should have 3 peers") + }) + } +} diff --git a/consul/server.go b/consul/server.go index 13d3088052..d58d0d4a4f 100644 --- a/consul/server.go +++ b/consul/server.go @@ -317,6 +317,7 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string, w conf.Tags["vsn"] = fmt.Sprintf("%d", s.config.ProtocolVersion) conf.Tags["vsn_min"] = fmt.Sprintf("%d", ProtocolVersionMin) conf.Tags["vsn_max"] = fmt.Sprintf("%d", ProtocolVersionMax) + conf.Tags["raft_vsn"] = fmt.Sprintf("%d", s.config.RaftConfig.ProtocolVersion) conf.Tags["build"] = s.config.Build conf.Tags["port"] = fmt.Sprintf("%d", addr.Port) if s.config.Bootstrap {