Merge pull request #2319 from hashicorp/f-bootstrap-abort

Adds check that aborts bootstrap mode if there's an existing cluster.
This commit is contained in:
James Phillips 2016-09-01 09:49:03 -07:00 committed by GitHub
commit 1488af4277
2 changed files with 75 additions and 19 deletions

View File

@ -1,7 +1,6 @@
package consul
import (
"net"
"strings"
"github.com/hashicorp/consul/consul/agent"
@ -192,7 +191,7 @@ func (s *Server) wanNodeJoin(me serf.MemberEvent) {
}
}
// maybeBootsrap is used to handle bootstrapping when a new consul server joins
// maybeBootstrap is used to handle bootstrapping when a new consul server joins.
func (s *Server) maybeBootstrap() {
// Bootstrap can only be done if there are no committed logs, remove our
// expectations of bootstrapping. This is slightly cheaper than the full
@ -203,13 +202,14 @@ func (s *Server) maybeBootstrap() {
return
}
if index != 0 {
s.logger.Printf("[INFO] consul: Raft data found, disabling bootstrap mode")
s.config.BootstrapExpect = 0
return
}
// Scan for all the known servers.
members := s.serfLAN.Members()
addrs := make([]string, 0)
var servers []agent.Server
for _, member := range members {
valid, p := agent.IsConsulServer(member)
if !valid {
@ -227,34 +227,62 @@ func (s *Server) maybeBootstrap() {
s.logger.Printf("[ERR] consul: Member %v has bootstrap mode. Expect disabled.", member)
return
}
addr := &net.TCPAddr{IP: member.Addr, Port: p.Port}
addrs = append(addrs, addr.String())
servers = append(servers, *p)
}
// Skip if we haven't met the minimum expect count.
if len(addrs) < s.config.BootstrapExpect {
if len(servers) < s.config.BootstrapExpect {
return
}
// Query each of the servers and make sure they report no Raft peers.
for _, server := range servers {
var peers []string
if err := s.connPool.RPC(s.config.Datacenter, server.Addr, server.Version,
"Status.Peers", &struct{}{}, &peers); err != nil {
s.logger.Printf("[ERR] consul: Failed to confirm peer status for %s: %v", server.Name, err)
return
}
// Found a node with some Raft peers, stop bootstrap since there's
// evidence of an existing cluster. We should get folded in by the
// existing servers if that's the case, so it's cleaner to sit as a
// candidate with no peers so we don't cause spurious elections.
// It's OK that this is racy: for an initial bootstrap, things will work
// as long as at least one peer runs bootstrap, and it's fine if multiple
// peers bootstrap in the same way. We
// just don't want a server added much later to do a live bootstrap
// and interfere with the cluster. This isn't required for Raft's
// correctness because no server in the existing cluster will vote
// for this server, but it makes things much more stable.
if len(peers) > 0 {
s.logger.Printf("[INFO] consul: Existing Raft peers reported by %s, disabling bootstrap mode", server.Name)
s.config.BootstrapExpect = 0
return
}
}
// Attempt a live bootstrap!
var configuration raft.Configuration
for _, addr := range addrs {
// TODO (slackpad) - This will need to be updated once we support
// node IDs.
server := raft.Server{
var addrs []string
for _, server := range servers {
addr := server.Addr.String()
addrs = append(addrs, addr)
peer := raft.Server{
ID: raft.ServerID(addr),
Address: raft.ServerAddress(addr),
}
configuration.Servers = append(configuration.Servers, server)
configuration.Servers = append(configuration.Servers, peer)
}
s.logger.Printf("[INFO] consul: Found expected number of peers (%s), attempting to bootstrap cluster...",
s.logger.Printf("[INFO] consul: Found expected number of peers, attempting bootstrap: %s",
strings.Join(addrs, ","))
future := s.raft.BootstrapCluster(configuration)
if err := future.Error(); err != nil {
s.logger.Printf("[ERR] consul: Failed to bootstrap cluster: %v", err)
}
// Bootstrapping complete, don't enter this again.
// Bootstrapping complete, or failed for some reason, don't enter this
// again.
s.config.BootstrapExpect = 0
}

View File

@ -500,7 +500,9 @@ func TestServer_JoinLAN_TLS(t *testing.T) {
}
func TestServer_Expect(t *testing.T) {
// all test servers should be in expect=3 mode
// All test servers should be in expect=3 mode, except for the 3rd one.
// As currently implemented, a server with expect=0 can still cause the
// other servers to bootstrap.
dir1, s1 := testServerDCExpect(t, "dc1", 3)
defer os.RemoveAll(dir1)
defer s1.Shutdown()
@ -513,7 +515,11 @@ func TestServer_Expect(t *testing.T) {
defer os.RemoveAll(dir3)
defer s3.Shutdown()
// Try to join
dir4, s4 := testServerDCExpect(t, "dc1", 3)
defer os.RemoveAll(dir4)
defer s4.Shutdown()
// Join the first two servers.
addr := fmt.Sprintf("127.0.0.1:%d",
s1.config.SerfLANConfig.MemberlistConfig.BindPort)
if _, err := s2.JoinLAN([]string{addr}); err != nil {
@ -523,7 +529,7 @@ func TestServer_Expect(t *testing.T) {
var p1 int
var p2 int
// should have no peers yet
// Should have no peers yet since the bootstrap didn't occur.
testutil.WaitForResult(func() (bool, error) {
p1, _ = s1.numPeers()
return p1 == 0, errors.New(fmt.Sprintf("%d", p1))
@ -538,14 +544,14 @@ func TestServer_Expect(t *testing.T) {
t.Fatalf("should have 0 peers: %v", err)
})
// join the third node
// Join the third node.
if _, err := s3.JoinLAN([]string{addr}); err != nil {
t.Fatalf("err: %v", err)
}
var p3 int
// should now have all three peers
// Now we have three servers so we should bootstrap.
testutil.WaitForResult(func() (bool, error) {
p1, _ = s1.numPeers()
return p1 == 3, errors.New(fmt.Sprintf("%d", p1))
@ -567,8 +573,30 @@ func TestServer_Expect(t *testing.T) {
t.Fatalf("should have 3 peers: %v", err)
})
// check if there is one leader now
// Make sure a leader is elected, grab the current term and then add in
// the fourth server.
testutil.WaitForLeader(t, s1.RPC, "dc1")
termBefore := s1.raft.Stats()["last_log_term"]
if _, err := s4.JoinLAN([]string{addr}); err != nil {
t.Fatalf("err: %v", err)
}
// Wait for the new server to see itself added to the cluster.
var p4 int
testutil.WaitForResult(func() (bool, error) {
p4, _ = s4.numPeers()
return p4 == 4, errors.New(fmt.Sprintf("%d", p4))
}, func(err error) {
t.Fatalf("should have 4 peers: %v", err)
})
// Make sure there's still a leader and that the term didn't change,
// so we know an election didn't occur.
testutil.WaitForLeader(t, s1.RPC, "dc1")
termAfter := s1.raft.Stats()["last_log_term"]
if termAfter != termBefore {
t.Fatalf("looks like an election took place")
}
}
func TestServer_BadExpect(t *testing.T) {