From 0cc024399d0b1e2dd6b5b0a9c4d09a8f8072a291 Mon Sep 17 00:00:00 2001 From: Kyle Havlovitz Date: Wed, 29 Mar 2017 12:52:00 -0700 Subject: [PATCH 1/2] Clean up raft servers without a corresponding serf entry --- consul/autopilot.go | 36 +++++++++++++++++++---- consul/autopilot_test.go | 63 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 6 deletions(-) diff --git a/consul/autopilot.go b/consul/autopilot.go index fb28d01021..07be2fc151 100644 --- a/consul/autopilot.go +++ b/consul/autopilot.go @@ -78,17 +78,37 @@ func (s *Server) pruneDeadServers() error { // Find any failed servers var failed []string + staleRaftServers := make(map[string]raft.Server) if autopilotConf.CleanupDeadServers { + future := s.raft.GetConfiguration() + if future.Error() != nil { + return err + } + + for _, server := range future.Configuration().Servers { + staleRaftServers[string(server.Address)] = server + } + for _, member := range s.serfLAN.Members() { - valid, _ := agent.IsConsulServer(member) - if valid && member.Status == serf.StatusFailed { - failed = append(failed, member.Name) + valid, parts := agent.IsConsulServer(member) + + if valid { + // Remove this server from the stale list; it has a serf entry + if _, ok := staleRaftServers[parts.Addr.String()]; ok { + delete(staleRaftServers, parts.Addr.String()) + } + + if member.Status == serf.StatusFailed { + failed = append(failed, member.Name) + } } } } + removalCount := len(failed) + len(staleRaftServers) + // Nothing to remove, return early - if len(failed) == 0 { + if removalCount == 0 { return nil } @@ -98,13 +118,17 @@ func (s *Server) pruneDeadServers() error { } // Only do removals if a minority of servers will be affected - if len(failed) < peers/2 { + if removalCount < peers/2 { for _, server := range failed { s.logger.Printf("[INFO] consul: Attempting removal of failed server: %v", server) go s.serfLAN.RemoveFailedNode(server) } + for _, raftServer := range staleRaftServers { + s.logger.Printf("[INFO] consul: Attempting removal of stale raft server : %v", raftServer.ID) + s.raft.RemoveServer(raftServer.ID, 0, 0) + } } else { - s.logger.Printf("[DEBUG] consul: Failed to remove dead servers: too many dead servers: %d/%d", len(failed), peers) + s.logger.Printf("[DEBUG] consul: Failed to remove dead servers: too many dead servers: %d/%d", removalCount, peers) } return nil diff --git a/consul/autopilot_test.go b/consul/autopilot_test.go index c171eabd13..e2811a49bb 100644 --- a/consul/autopilot_test.go +++ b/consul/autopilot_test.go @@ -153,6 +153,69 @@ func TestAutopilot_CleanupDeadServerPeriodic(t *testing.T) { } } +func TestAutopilot_CleanupStaleRaftServer(t *testing.T) { + dir1, s1 := testServerDCBootstrap(t, "dc1", true) + defer os.RemoveAll(dir1) + defer s1.Shutdown() + + dir2, s2 := testServerDCBootstrap(t, "dc1", false) + defer os.RemoveAll(dir2) + defer s2.Shutdown() + + dir3, s3 := testServerDCBootstrap(t, "dc1", false) + defer os.RemoveAll(dir3) + defer s3.Shutdown() + + dir4, s4 := testServerDCBootstrap(t, "dc1", false) + defer os.RemoveAll(dir4) + defer s4.Shutdown() + + servers := []*Server{s1, s2, s3} + + // Join the servers to s1 + addr := fmt.Sprintf("127.0.0.1:%d", + s1.config.SerfLANConfig.MemberlistConfig.BindPort) + + for _, s := range servers[1:] { + if _, err := s.JoinLAN([]string{addr}); err != nil { + t.Fatalf("err: %v", err) + } + } + + for _, s := range servers { + if err := testutil.WaitForResult(func() (bool, error) { + peers, _ := s.numPeers() + return peers == 3, nil + }); err != nil { + t.Fatal(err) + } + } + + // Add s4 to peers directly + s4addr := fmt.Sprintf("127.0.0.1:%d", + s4.config.SerfLANConfig.MemberlistConfig.BindPort) + s1.raft.AddVoter(raft.ServerID(s4.config.NodeID), raft.ServerAddress(s4addr),0, 0) + + // Verify we have 4 peers + peers, err := s1.numPeers() + if err != nil { + t.Fatal(err) + } + if peers != 4 { + t.Fatalf("bad: %v", peers) + } + + // Wait for s4 to be removed + for _, s := range []*Server{s1, s2, s3} { + if err := testutil.WaitForResult(func() (bool, error) { + peers, _ := s.numPeers() + return peers == 3, nil + }); err != nil { + t.Fatal(err) + } + } +} + func TestAutopilot_PromoteNonVoter(t *testing.T) { dir1, s1 := testServerWithConfig(t, func(c *Config) { c.Datacenter = "dc1" From e74e83fc6c0423d0c0c609d493ab94a5c140d459 Mon Sep 17 00:00:00 2001 From: Kyle Havlovitz Date: Wed, 29 Mar 2017 13:38:40 -0700 Subject: [PATCH 2/2] Remove stale raft servers differently depending on minRaftVersion --- consul/autopilot.go | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/consul/autopilot.go b/consul/autopilot.go index 07be2fc151..e1a87479b5 100644 --- a/consul/autopilot.go +++ b/consul/autopilot.go @@ -123,9 +123,23 @@ func (s *Server) pruneDeadServers() error { s.logger.Printf("[INFO] consul: Attempting removal of failed server: %v", server) go s.serfLAN.RemoveFailedNode(server) } + + minRaftProtocol, err := ServerMinRaftProtocol(s.serfLAN.Members()) + if err != nil { + return err + } for _, raftServer := range staleRaftServers { - s.logger.Printf("[INFO] consul: Attempting removal of stale raft server : %v", raftServer.ID) - s.raft.RemoveServer(raftServer.ID, 0, 0) + var future raft.Future + if minRaftProtocol >= 2 { + s.logger.Printf("[INFO] consul: Attempting removal of stale raft server : %v", raftServer.ID) + future = s.raft.RemoveServer(raftServer.ID, 0, 0) + } else { + s.logger.Printf("[INFO] consul: Attempting removal of stale raft server : %v", raftServer.ID) + future = s.raft.RemovePeer(raftServer.Address) + } + if err := future.Error(); err != nil { + return err + } } } else { s.logger.Printf("[DEBUG] consul: Failed to remove dead servers: too many dead servers: %d/%d", removalCount, peers)