Bump raft-autopilot version to the latest. (#10306)

2025-02-25 11:55:41 +00:00 · 2021-05-27 12:59:14 -04:00 · 2021-05-27 12:59:14 -04:00 · 3a0007b158
commit 3a0007b158
parent 1802f47f5c
8 changed files with 91 additions and 46 deletions
--- a/.changelog/10306.txt
+++ b/.changelog/10306.txt
@ -0,0 +1,4 @@
+```release-note:bug
+autopilot: **(Enterprise only)** Fixed an issue where autopilot could cause a new leader to demote the wrong voter when redundancy zones are in use and the previous leader failed.
+```
+
--- a/go.mod
+++ b/go.mod
@ -52,7 +52,7 @@ require (
 	github.com/hashicorp/memberlist v0.2.4
 	github.com/hashicorp/net-rpc-msgpackrpc v0.0.0-20151116020338-a14192a58a69
 	github.com/hashicorp/raft v1.3.1
-	github.com/hashicorp/raft-autopilot v0.1.2
+	github.com/hashicorp/raft-autopilot v0.1.5
 	github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea
 	github.com/hashicorp/serf v0.9.5
 	github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086
--- a/go.sum
+++ b/go.sum
@ -282,8 +282,8 @@ github.com/hashicorp/raft v1.1.1/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7
 github.com/hashicorp/raft v1.2.0/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8=
 github.com/hashicorp/raft v1.3.1 h1:zDT8ke8y2aP4wf9zPTB2uSIeavJ3Hx/ceY4jxI2JxuY=
 github.com/hashicorp/raft v1.3.1/go.mod h1:4Ak7FSPnuvmb0GV6vgIAJ4vYT4bek9bb6Q+7HVbyzqM=
-github.com/hashicorp/raft-autopilot v0.1.2 h1:yeqdUjWLjVJkBM+mcVxqwxi+w+aHsb9cEON2dz69OCs=
-github.com/hashicorp/raft-autopilot v0.1.2/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw=
+github.com/hashicorp/raft-autopilot v0.1.5 h1:onEfMH5uHVdXQqtas36zXUHEZxLdsJVu/nXHLcLdL1I=
+github.com/hashicorp/raft-autopilot v0.1.5/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw=
 github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea h1:xykPFhrBAS2J0VBzVa5e80b5ZtYuNQtgXjN40qBZlD4=
 github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk=
 github.com/hashicorp/serf v0.9.5 h1:EBWvyu9tcRszt3Bxp3KNssBMP1KuHWyO51lz9+786iM=
--- a/vendor/github.com/hashicorp/raft-autopilot/autopilot.go
+++ b/vendor/github.com/hashicorp/raft-autopilot/autopilot.go
@ -147,16 +147,6 @@ type Autopilot struct {
 	// racing.
 	stateLock sync.RWMutex

-	// startTime is recorded so that we can make better determinations about server
-	// stability during the initial period of time after autopilot first starts.
-	// If autopilot has just started the default behavior to check if a server is
-	// stable will not work as it will ensure the server has been healthy for
-	// the configured server stabilization time. If that configure time is longer
-	// than the amount of time autopilot has been running you can run into issues
-	// with leadership flapping during some scenarios where a cluster is being
-	// brought up.
-	startTime time.Time
-
 	// removeDeadCh is used to trigger the running autopilot go routines to
 	// find and remove any dead/failed servers
 	removeDeadCh chan struct{}
--- a/vendor/github.com/hashicorp/raft-autopilot/run.go
+++ b/vendor/github.com/hashicorp/raft-autopilot/run.go
@ -18,7 +18,6 @@ func (a *Autopilot) Start(ctx context.Context) {
 	}

 	ctx, shutdown := context.WithCancel(ctx)
-	a.startTime = a.time.Now()

 	exec := &execInfo{
 		status:   Running,
@ -128,6 +127,21 @@ func (a *Autopilot) beginExecution(ctx context.Context, exec *execInfo) {

 		a.logger.Debug("autopilot is now stopped")

+		// We need to gain this lock so that we can zero out the previous state.
+		// This prevents us from accidentally tracking stale state in the event
+		// that we used to be the leader at some point in time, then weren't
+		// and now are again. In particular this will ensure that that we forget
+		// about our tracking of the firstStateTime so that once restarted, we
+		// will ignore server stabilization time just like we do the very
+		// first time this process ever was the leader.
+		//
+		// This isn't included in finishExecution so that we don't perform it
+		// if we fail to gain the leaderLock before the context gets cancelled
+		// back at the beginning of this function.
+		a.stateLock.Lock()
+		defer a.stateLock.Unlock()
+		a.state = &State{}
+
 		a.finishExecution(exec)
 		a.leaderLock.Unlock()
 	}()
--- a/vendor/github.com/hashicorp/raft-autopilot/state.go
+++ b/vendor/github.com/hashicorp/raft-autopilot/state.go
@ -27,15 +27,15 @@ func aliveServers(servers map[raft.ServerID]*Server) map[raft.ServerID]*Server {
 // nextStateInputs is the collection of values that can influence
 // creation of the next State.
 type nextStateInputs struct {
-	Now          time.Time
-	StartTime    time.Time
-	Config       *Config
-	RaftConfig   *raft.Configuration
-	KnownServers map[raft.ServerID]*Server
-	LatestIndex  uint64
-	LastTerm     uint64
-	FetchedStats map[raft.ServerID]*ServerStats
-	LeaderID     raft.ServerID
+	Now            time.Time
+	FirstStateTime time.Time
+	Config         *Config
+	RaftConfig     *raft.Configuration
+	KnownServers   map[raft.ServerID]*Server
+	LatestIndex    uint64
+	LastTerm       uint64
+	FetchedStats   map[raft.ServerID]*ServerStats
+	LeaderID       raft.ServerID
 }

 // gatherNextStateInputs gathers all the information that would be used to
@ -52,9 +52,34 @@ type nextStateInputs struct {
 func (a *Autopilot) gatherNextStateInputs(ctx context.Context) (*nextStateInputs, error) {
 	// there are a lot of inputs to computing the next state so they get put into a
 	// struct so that we don't have to return 8 values.
+
+	now := a.time.Now()
+
+	// We need to pull the previous states knowledge of the first time a state was generated.
+	// This is really only important for when autopilot is first started. We will use the
+	// first state's time when determining if a server is stable. Under normal circumstances
+	// we need to just check that the current time - the servers StableSince time is greater
+	// than the configured stabilization time. However while autopilot has been running for
+	// less time than the stabilization time we need to consider all servers as stable
+	// to prevent unnecessary leader elections. Therefore its important to track the first
+	// time a state was generated so we know if we have a state old enough where there is
+	// any chance of seeing servers as stable based off that configured threshold.
+	var firstStateTime time.Time
+	a.stateLock.Lock()
+	if a.state != nil {
+		firstStateTime = a.state.firstStateTime
+	}
+	a.stateLock.Unlock()
+
+	// firstStateTime will be the zero value if we are in the process of generating
+	// the first state. In that case we set it to the now time.
+	if firstStateTime.IsZero() {
+		firstStateTime = now
+	}
+
 	inputs := &nextStateInputs{
-		Now:       a.time.Now(),
-		StartTime: a.startTime,
+		Now:            now,
+		FirstStateTime: firstStateTime,
 	}

 	// grab the latest autopilot configuration
@ -71,16 +96,30 @@ func (a *Autopilot) gatherNextStateInputs(ctx context.Context) (*nextStateInputs
 	}
 	inputs.RaftConfig = raftConfig

-	leader := a.raft.Leader()
-	for _, s := range inputs.RaftConfig.Servers {
-		if s.Address == leader {
-			inputs.LeaderID = s.ID
+	// get the known servers which may include left/failed ones
+	inputs.KnownServers = a.delegate.KnownServers()
+
+	// Try to retrieve leader id from the delegate.
+	for id, srv := range inputs.KnownServers {
+		if srv.IsLeader {
+			inputs.LeaderID = id
 			break
 		}
 	}

+	// Delegate setting the leader information is optional. If leader detection is
+	// not successful, fallback on raft config to do the same.
 	if inputs.LeaderID == "" {
-		return nil, fmt.Errorf("cannot detect the current leader server id from its address: %s", leader)
+		leader := a.raft.Leader()
+		for _, s := range inputs.RaftConfig.Servers {
+			if s.Address == leader {
+				inputs.LeaderID = s.ID
+				break
+			}
+		}
+		if inputs.LeaderID == "" {
+			return nil, fmt.Errorf("cannot detect the current leader server id from its address: %s", leader)
+		}
 	}

 	// get the latest Raft index - this should be kept close to the call to
@ -101,9 +140,6 @@ func (a *Autopilot) gatherNextStateInputs(ctx context.Context) (*nextStateInputs
 		return nil, ctx.Err()
 	}

-	// get the known servers which may include left/failed ones
-	inputs.KnownServers = a.delegate.KnownServers()
-
 	// in most cases getting the known servers should be quick but as we cannot
 	// account for every potential delegate and prevent them from making
 	// blocking network requests we should probably check the context again.
@ -146,10 +182,13 @@ func (a *Autopilot) nextState(ctx context.Context) (*State, error) {
 func (a *Autopilot) nextStateWithInputs(inputs *nextStateInputs) *State {
 	nextServers := a.nextServers(inputs)

+	// we record the firstStateTime so that we can ignore the server stabilization
+	// time up until the time we generated the first state becomes far enough
+	// in the past. Until that point in time all servers are considered stable.
 	newState := &State{
-		startTime: inputs.StartTime,
-		Healthy:   true,
-		Servers:   nextServers,
+		firstStateTime: inputs.FirstStateTime,
+		Healthy:        true,
+		Servers:        nextServers,
 	}

 	voterCount := 0
--- a/vendor/github.com/hashicorp/raft-autopilot/types.go
+++ b/vendor/github.com/hashicorp/raft-autopilot/types.go
@ -85,6 +85,7 @@ type Server struct {
 	Version     string
 	Meta        map[string]string
 	RaftVersion int
+	IsLeader    bool

 	// The remaining fields are those that the promoter
 	// will fill in
@ -166,7 +167,7 @@ type ServerStats struct {
 }

 type State struct {
-	startTime        time.Time
+	firstStateTime   time.Time
 	Healthy          bool
 	FailureTolerance int
 	Servers          map[raft.ServerID]*ServerState
@ -177,14 +178,11 @@ type State struct {

 func (s *State) ServerStabilizationTime(c *Config) time.Duration {
 	// Only use the configured stabilization time when autopilot has
-	// been running for 110% of the configured stabilization time.
-	// Before that time we haven't been running long enough to
-	// be able to take these values into account. 110% is pretty
-	// arbitrary but with the default config would prevent the
-	// stabilization time from mattering for an extra second. This
-	// allows for leeway in how quickly we get the healthy RPC responses
-	// after autopilot is started.
-	if time.Since(s.startTime) > (c.ServerStabilizationTime*110)/100 {
+	// been running for at least as long as when the first state was
+	// generated. If it hasn't been running that long then we would
+	// guarantee that all checks against the stabilization time will
+	// fail which will result in excessive leader elections.
+	if time.Since(s.firstStateTime) > c.ServerStabilizationTime {
 		return c.ServerStabilizationTime
 	}

--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@ -483,7 +483,7 @@ github.com/hashicorp/memberlist
 github.com/hashicorp/net-rpc-msgpackrpc
 # github.com/hashicorp/raft v1.3.1
 github.com/hashicorp/raft
-# github.com/hashicorp/raft-autopilot v0.1.2
+# github.com/hashicorp/raft-autopilot v0.1.5
 github.com/hashicorp/raft-autopilot
 # github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea
 github.com/hashicorp/raft-boltdb