Add logic to allow changing a failed node's ID

This commit is contained in:
Kyle Havlovitz 2019-03-07 22:42:54 -08:00
parent f1864b128e
commit 1e4523f55e
3 changed files with 94 additions and 2 deletions

View File

@ -1335,8 +1335,13 @@ AFTER_CHECK:
// If there's existing information about the node, do not
// clobber it.
SkipNodeUpdate: true,
//SkipNodeUpdate: true,
}
if node != nil {
req.TaggedAddresses = node.TaggedAddresses
req.NodeMeta = node.Meta
}
_, err = s.raftApply(structs.RegisterRequestType, &req)
return err
}

View File

@ -953,6 +953,78 @@ func TestLeader_ChangeServerID(t *testing.T) {
})
}
func TestLeader_ChangeNodeID(t *testing.T) {
t.Parallel()
conf := func(c *Config) {
c.Bootstrap = false
c.BootstrapExpect = 3
c.Datacenter = "dc1"
}
dir1, s1 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir1)
defer s1.Shutdown()
dir2, s2 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir2)
defer s2.Shutdown()
dir3, s3 := testServerWithConfig(t, conf)
defer os.RemoveAll(dir3)
defer s3.Shutdown()
servers := []*Server{s1, s2, s3}
// Try to join and wait for all servers to get promoted
joinLAN(t, s2, s1)
joinLAN(t, s3, s1)
for _, s := range servers {
testrpc.WaitForTestAgent(t, s.RPC, "dc1")
retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
}
// Shut down a server, freeing up its address/port
s3.Shutdown()
retry.Run(t, func(r *retry.R) {
alive := 0
for _, m := range s1.LANMembers() {
if m.Status == serf.StatusAlive {
alive++
}
}
if got, want := alive, 2; got != want {
r.Fatalf("got %d alive members want %d", got, want)
}
})
// Bring up a new server with s3's address that will get a different ID
dir4, s4 := testServerWithConfig(t, func(c *Config) {
c.Bootstrap = false
c.Datacenter = "dc1"
c.NodeName = s3.config.NodeName
})
defer os.RemoveAll(dir4)
defer s4.Shutdown()
joinLAN(t, s4, s1)
servers[2] = s4
// Make sure the dead server is removed and we're back to 3 total peers
retry.Run(t, func(r *retry.R) {
r.Check(wantRaft(servers))
for _, s := range servers {
r.Check(wantPeers(s, 3))
}
})
retry.Run(t, func(r *retry.R) {
for _, m := range s1.LANMembers() {
if m.Status != serf.StatusAlive {
r.Fatalf("bad status: %v", m)
}
}
})
}
func TestLeader_ACL_Initialization(t *testing.T) {
t.Parallel()

View File

@ -369,7 +369,22 @@ func (s *Store) ensureNoNodeWithSimilarNameTxn(tx *memdb.Txn, node *structs.Node
for nodeIt := enodes.Next(); nodeIt != nil; nodeIt = enodes.Next() {
enode := nodeIt.(*structs.Node)
if strings.EqualFold(node.Node, enode.Node) && node.ID != enode.ID {
if !(enode.ID == "" && allowClashWithoutID) {
// Look up the existing node's Serf health check to see if it's failed.
// If it is, the node can be renamed.
enodeCheck, err := tx.First("checks", "id", enode.Node, string(structs.SerfCheckID))
if err != nil {
return fmt.Errorf("Cannot get status of node %s: %s", enode.Node, err)
}
if enodeCheck == nil {
return fmt.Errorf("Cannot rename node %s: Serf health check not found for existing node", enode.Node)
}
enodeSerfCheck, ok := enodeCheck.(*structs.HealthCheck)
if !ok {
return fmt.Errorf("Existing node %q's Serf health check has type %T", enode.Node, enodeSerfCheck)
}
if !((enode.ID == "" || enodeSerfCheck.Status == api.HealthCritical) && allowClashWithoutID) {
return fmt.Errorf("Node name %s is reserved by node %s with name %s", node.Node, enode.ID, enode.Node)
}
}