mirror of
https://github.com/status-im/consul.git
synced 2025-01-26 13:40:20 +00:00
Merge pull request #2845 from hashicorp/stale-raft-servers
Clean up raft servers without a corresponding serf entry
This commit is contained in:
commit
73f0e6f8f6
@ -78,17 +78,37 @@ func (s *Server) pruneDeadServers() error {
|
||||
|
||||
// Find any failed servers
|
||||
var failed []string
|
||||
staleRaftServers := make(map[string]raft.Server)
|
||||
if autopilotConf.CleanupDeadServers {
|
||||
future := s.raft.GetConfiguration()
|
||||
if future.Error() != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, server := range future.Configuration().Servers {
|
||||
staleRaftServers[string(server.Address)] = server
|
||||
}
|
||||
|
||||
for _, member := range s.serfLAN.Members() {
|
||||
valid, _ := agent.IsConsulServer(member)
|
||||
if valid && member.Status == serf.StatusFailed {
|
||||
failed = append(failed, member.Name)
|
||||
valid, parts := agent.IsConsulServer(member)
|
||||
|
||||
if valid {
|
||||
// Remove this server from the stale list; it has a serf entry
|
||||
if _, ok := staleRaftServers[parts.Addr.String()]; ok {
|
||||
delete(staleRaftServers, parts.Addr.String())
|
||||
}
|
||||
|
||||
if member.Status == serf.StatusFailed {
|
||||
failed = append(failed, member.Name)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
removalCount := len(failed) + len(staleRaftServers)
|
||||
|
||||
// Nothing to remove, return early
|
||||
if len(failed) == 0 {
|
||||
if removalCount == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -98,13 +118,31 @@ func (s *Server) pruneDeadServers() error {
|
||||
}
|
||||
|
||||
// Only do removals if a minority of servers will be affected
|
||||
if len(failed) < peers/2 {
|
||||
if removalCount < peers/2 {
|
||||
for _, server := range failed {
|
||||
s.logger.Printf("[INFO] consul: Attempting removal of failed server: %v", server)
|
||||
go s.serfLAN.RemoveFailedNode(server)
|
||||
}
|
||||
|
||||
minRaftProtocol, err := ServerMinRaftProtocol(s.serfLAN.Members())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, raftServer := range staleRaftServers {
|
||||
var future raft.Future
|
||||
if minRaftProtocol >= 2 {
|
||||
s.logger.Printf("[INFO] consul: Attempting removal of stale raft server : %v", raftServer.ID)
|
||||
future = s.raft.RemoveServer(raftServer.ID, 0, 0)
|
||||
} else {
|
||||
s.logger.Printf("[INFO] consul: Attempting removal of stale raft server : %v", raftServer.ID)
|
||||
future = s.raft.RemovePeer(raftServer.Address)
|
||||
}
|
||||
if err := future.Error(); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
} else {
|
||||
s.logger.Printf("[DEBUG] consul: Failed to remove dead servers: too many dead servers: %d/%d", len(failed), peers)
|
||||
s.logger.Printf("[DEBUG] consul: Failed to remove dead servers: too many dead servers: %d/%d", removalCount, peers)
|
||||
}
|
||||
|
||||
return nil
|
||||
|
@ -153,6 +153,69 @@ func TestAutopilot_CleanupDeadServerPeriodic(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestAutopilot_CleanupStaleRaftServer(t *testing.T) {
|
||||
dir1, s1 := testServerDCBootstrap(t, "dc1", true)
|
||||
defer os.RemoveAll(dir1)
|
||||
defer s1.Shutdown()
|
||||
|
||||
dir2, s2 := testServerDCBootstrap(t, "dc1", false)
|
||||
defer os.RemoveAll(dir2)
|
||||
defer s2.Shutdown()
|
||||
|
||||
dir3, s3 := testServerDCBootstrap(t, "dc1", false)
|
||||
defer os.RemoveAll(dir3)
|
||||
defer s3.Shutdown()
|
||||
|
||||
dir4, s4 := testServerDCBootstrap(t, "dc1", false)
|
||||
defer os.RemoveAll(dir4)
|
||||
defer s4.Shutdown()
|
||||
|
||||
servers := []*Server{s1, s2, s3}
|
||||
|
||||
// Join the servers to s1
|
||||
addr := fmt.Sprintf("127.0.0.1:%d",
|
||||
s1.config.SerfLANConfig.MemberlistConfig.BindPort)
|
||||
|
||||
for _, s := range servers[1:] {
|
||||
if _, err := s.JoinLAN([]string{addr}); err != nil {
|
||||
t.Fatalf("err: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
for _, s := range servers {
|
||||
if err := testutil.WaitForResult(func() (bool, error) {
|
||||
peers, _ := s.numPeers()
|
||||
return peers == 3, nil
|
||||
}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
// Add s4 to peers directly
|
||||
s4addr := fmt.Sprintf("127.0.0.1:%d",
|
||||
s4.config.SerfLANConfig.MemberlistConfig.BindPort)
|
||||
s1.raft.AddVoter(raft.ServerID(s4.config.NodeID), raft.ServerAddress(s4addr),0, 0)
|
||||
|
||||
// Verify we have 4 peers
|
||||
peers, err := s1.numPeers()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if peers != 4 {
|
||||
t.Fatalf("bad: %v", peers)
|
||||
}
|
||||
|
||||
// Wait for s4 to be removed
|
||||
for _, s := range []*Server{s1, s2, s3} {
|
||||
if err := testutil.WaitForResult(func() (bool, error) {
|
||||
peers, _ := s.numPeers()
|
||||
return peers == 3, nil
|
||||
}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestAutopilot_PromoteNonVoter(t *testing.T) {
|
||||
dir1, s1 := testServerWithConfig(t, func(c *Config) {
|
||||
c.Datacenter = "dc1"
|
||||
|
Loading…
x
Reference in New Issue
Block a user