Bump raft-autopilot to the latest version (#10310)

This commit is contained in:
Matt Keeler 2021-05-27 13:23:18 -04:00 committed by GitHub
parent 5beeb44eee
commit ada4d21285
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 90 additions and 55 deletions

3
.changelog/10306.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
autopilot: **(Enterprise only)** Fixed an issue where autopilot could cause a new leader to demote the wrong voter when redundancy zones are in use and the previous leader failed.
```

2
go.mod
View File

@ -53,7 +53,7 @@ require (
github.com/hashicorp/memberlist v0.2.2
github.com/hashicorp/net-rpc-msgpackrpc v0.0.0-20151116020338-a14192a58a69
github.com/hashicorp/raft v1.2.0
github.com/hashicorp/raft-autopilot v0.1.2
github.com/hashicorp/raft-autopilot v0.1.5
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea
github.com/hashicorp/serf v0.9.5
github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086

13
go.sum
View File

@ -32,7 +32,6 @@ github.com/Azure/go-autorest/logger v0.1.0 h1:ruG4BSDXONFRrZZJ2GUXDiUyVpayPmb1Gn
github.com/Azure/go-autorest/logger v0.1.0/go.mod h1:oExouG+K6PryycPJfVSxi/koC6LSNgds39diKLz7Vrc=
github.com/Azure/go-autorest/tracing v0.5.0 h1:TRn4WjSnkcSy5AEG3pnbtFSwNtwzjr4VYyQflFE619k=
github.com/Azure/go-autorest/tracing v0.5.0/go.mod h1:r/s2XiOKccPW3HrqB+W0TQzfbtp2fGCgRFtBroKn4Dk=
github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/DataDog/datadog-go v2.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ=
github.com/DataDog/datadog-go v3.2.0+incompatible h1:qSG2N4FghB1He/r2mFrWKCaL7dXCilEuNEeAn20fdD4=
@ -68,7 +67,6 @@ github.com/aws/aws-sdk-go v1.25.37/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpi
github.com/aws/aws-sdk-go v1.25.41 h1:/hj7nZ0586wFqpwjNpzWiUTwtaMgxAZNZKHay80MdXw=
github.com/aws/aws-sdk-go v1.25.41/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
github.com/beorn7/perks v1.0.0 h1:HWo1m869IqiPhD389kmkxeTalrjNbbJTC8LXupb+sl0=
github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
@ -139,7 +137,6 @@ github.com/frankban/quicktest v1.11.0 h1:Yyrghcw93e1jKo4DTZkRFTTFvBsVhzbblBUPNU1
github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s=
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
github.com/ghodss/yaml v0.0.0-20150909031657-73d445a93680/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk=
github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
github.com/go-asn1-ber/asn1-ber v1.3.1/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0=
github.com/go-check/check v0.0.0-20140225173054-eb6ee6f84d0a/go.mod h1:9ES+weclKsC9YodN5RgxqK/VD9HM9JsCSh7rNhMZE98=
@ -171,7 +168,6 @@ github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfb
github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
github.com/golang/protobuf v0.0.0-20161109072736-4bd1920723d7/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.5 h1:F768QJ1E9tib+q5Sc8MkdJi1RxLTbRcTf8LJV56aRls=
@ -182,10 +178,8 @@ github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Z
github.com/google/btree v1.0.0 h1:0udJVsspx3VBr5FwtLhQQtuAsVc79tTq0ocGIPAU6qo=
github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
github.com/google/go-cmp v0.3.0 h1:crn/baboCvb5fXaQ0IJ1SGTsTVrWpDsCWC8EGETZijY=
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.2 h1:X2ev0eStA3AbceY54o37/0PQ/UWqKEiiO2dKL5OPaFM=
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
@ -193,7 +187,6 @@ github.com/google/go-querystring v0.0.0-20170111101155-53e6ce116135/go.mod h1:od
github.com/google/go-querystring v1.0.0 h1:Xkwi/a1rcvNg1PPYe5vI8GbeBY/jrVuDX5ASuANWTrk=
github.com/google/go-querystring v1.0.0/go.mod h1:odCYkC5MyYFN7vkCjXpyrEuKhc/BUO6wN/zVPAxq5ck=
github.com/google/gofuzz v0.0.0-20161122191042-44d81051d367/go.mod h1:HP5RmnzzSNb993RKQDq4+1A4ia9nllfqcQFTQJedwGI=
github.com/google/gofuzz v1.0.0 h1:A8PeW59pxE9IoFRqBp37U+mSNaQoZ46F1f0f863XSXw=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
@ -232,7 +225,6 @@ github.com/hashicorp/go-discover v0.0.0-20200501174627-ad1e96bde088/go.mod h1:vZ
github.com/hashicorp/go-hclog v0.0.0-20180709165350-ff2cf002a8dd/go.mod h1:9bjs9uLqI8l75knNv3lV1kA55veR+WUPSiKIWcQHudI=
github.com/hashicorp/go-hclog v0.9.1/go.mod h1:5CU+agLiy3J7N7QjHK5d05KxGsuXiQLrjA0H7acj2lQ=
github.com/hashicorp/go-hclog v0.9.2/go.mod h1:5CU+agLiy3J7N7QjHK5d05KxGsuXiQLrjA0H7acj2lQ=
github.com/hashicorp/go-hclog v0.12.0 h1:d4QkX8FRTYaKaCZBoXYY8zJX2BXjWxurN/GA2tkrmZM=
github.com/hashicorp/go-hclog v0.12.0/go.mod h1:whpDNt7SSdeAju8AWKIWsul05p54N/39EeqMAyrmvFQ=
github.com/hashicorp/go-hclog v0.14.1 h1:nQcJDQwIAGnmoUWp8ubocEX40cCml/17YkF6csQLReU=
github.com/hashicorp/go-hclog v0.14.1/go.mod h1:whpDNt7SSdeAju8AWKIWsul05p54N/39EeqMAyrmvFQ=
@ -278,7 +270,6 @@ github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T
github.com/hashicorp/hil v0.0.0-20200423225030-a18a1cd20038 h1:n9J0rwVWXDpNd5iZnwY7w4WZyq53/rROeI7OVvLW8Ok=
github.com/hashicorp/hil v0.0.0-20200423225030-a18a1cd20038/go.mod h1:n2TSygSNwsLJ76m8qFXTSc7beTb+auJxYdqrnoqwZWE=
github.com/hashicorp/logutils v1.0.0/go.mod h1:QIAnNjmIWmVIIkWDTG1z5v++HQmx9WQRO+LraFDTW64=
github.com/hashicorp/mdns v1.0.1 h1:XFSOubp8KWB+Jd2PDyaX5xUd5bhSP/+pTDZVDMzZJM8=
github.com/hashicorp/mdns v1.0.1/go.mod h1:4gW7WsVCke5TE7EPeYliwHlRUyBtfCwuFwuMg2DmyNY=
github.com/hashicorp/mdns v1.0.4 h1:sY0CMhFmjIPDMlTB+HfymFHCaYLhgifZ0QhjaYKD/UQ=
github.com/hashicorp/mdns v1.0.4/go.mod h1:mtBihi+LeNXGtG8L9dX59gAEa12BDtBQSp4v/YAJqrc=
@ -289,8 +280,8 @@ github.com/hashicorp/net-rpc-msgpackrpc v0.0.0-20151116020338-a14192a58a69/go.mo
github.com/hashicorp/raft v1.1.1/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8=
github.com/hashicorp/raft v1.2.0 h1:mHzHIrF0S91d3A7RPBvuqkgB4d/7oFJZyvf1Q4m7GA0=
github.com/hashicorp/raft v1.2.0/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8=
github.com/hashicorp/raft-autopilot v0.1.2 h1:yeqdUjWLjVJkBM+mcVxqwxi+w+aHsb9cEON2dz69OCs=
github.com/hashicorp/raft-autopilot v0.1.2/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw=
github.com/hashicorp/raft-autopilot v0.1.5 h1:onEfMH5uHVdXQqtas36zXUHEZxLdsJVu/nXHLcLdL1I=
github.com/hashicorp/raft-autopilot v0.1.5/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw=
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea h1:xykPFhrBAS2J0VBzVa5e80b5ZtYuNQtgXjN40qBZlD4=
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk=
github.com/hashicorp/serf v0.9.5 h1:EBWvyu9tcRszt3Bxp3KNssBMP1KuHWyO51lz9+786iM=

View File

@ -147,16 +147,6 @@ type Autopilot struct {
// racing.
stateLock sync.RWMutex
// startTime is recorded so that we can make better determinations about server
// stability during the initial period of time after autopilot first starts.
// If autopilot has just started the default behavior to check if a server is
// stable will not work as it will ensure the server has been healthy for
// the configured server stabilization time. If that configured time is longer
// than the amount of time autopilot has been running you can run into issues
// with leadership flapping during some scenarios where a cluster is being
// brought up.
startTime time.Time
// removeDeadCh is used to trigger the running autopilot go routines to
// find and remove any dead/failed servers
removeDeadCh chan struct{}

View File

@ -18,7 +18,6 @@ func (a *Autopilot) Start(ctx context.Context) {
}
ctx, shutdown := context.WithCancel(ctx)
a.startTime = a.time.Now()
exec := &execInfo{
status: Running,
@ -128,6 +127,21 @@ func (a *Autopilot) beginExecution(ctx context.Context, exec *execInfo) {
a.logger.Debug("autopilot is now stopped")
// We need to gain this lock so that we can zero out the previous state.
// This prevents us from accidentally tracking stale state in the event
// that we used to be the leader at some point in time, then weren't
// and now are again. In particular this will ensure that we forget
// about our tracking of the firstStateTime so that once restarted, we
// will ignore server stabilization time just like we do the very
// first time this process ever was the leader.
//
// This isn't included in finishExecution so that we don't perform it
// if we fail to gain the leaderLock before the context gets cancelled
// back at the beginning of this function.
a.stateLock.Lock()
defer a.stateLock.Unlock()
a.state = &State{}
a.finishExecution(exec)
a.leaderLock.Unlock()
}()

View File

@ -27,15 +27,15 @@ func aliveServers(servers map[raft.ServerID]*Server) map[raft.ServerID]*Server {
// nextStateInputs is the collection of values that can influence
// creation of the next State.
type nextStateInputs struct {
Now time.Time
StartTime time.Time
Config *Config
RaftConfig *raft.Configuration
KnownServers map[raft.ServerID]*Server
LatestIndex uint64
LastTerm uint64
FetchedStats map[raft.ServerID]*ServerStats
LeaderID raft.ServerID
Now time.Time
FirstStateTime time.Time
Config *Config
RaftConfig *raft.Configuration
KnownServers map[raft.ServerID]*Server
LatestIndex uint64
LastTerm uint64
FetchedStats map[raft.ServerID]*ServerStats
LeaderID raft.ServerID
}
// gatherNextStateInputs gathers all the information that would be used to
@ -52,9 +52,34 @@ type nextStateInputs struct {
func (a *Autopilot) gatherNextStateInputs(ctx context.Context) (*nextStateInputs, error) {
// there are a lot of inputs to computing the next state so they get put into a
// struct so that we don't have to return 8 values.
now := a.time.Now()
// We need to pull the previous state's knowledge of the first time a state was generated.
// This is really only important for when autopilot is first started. We will use the
// first state's time when determining if a server is stable. Under normal circumstances
// we need to just check that the current time - the server's StableSince time is greater
// than the configured stabilization time. However while autopilot has been running for
// less time than the stabilization time we need to consider all servers as stable
// to prevent unnecessary leader elections. Therefore it's important to track the first
// time a state was generated so we know if we have a state old enough where there is
// any chance of seeing servers as stable based off that configured threshold.
var firstStateTime time.Time
a.stateLock.Lock()
if a.state != nil {
firstStateTime = a.state.firstStateTime
}
a.stateLock.Unlock()
// firstStateTime will be the zero value if we are in the process of generating
// the first state. In that case we set it to the now time.
if firstStateTime.IsZero() {
firstStateTime = now
}
inputs := &nextStateInputs{
Now: a.time.Now(),
StartTime: a.startTime,
Now: now,
FirstStateTime: firstStateTime,
}
// grab the latest autopilot configuration
@ -71,16 +96,30 @@ func (a *Autopilot) gatherNextStateInputs(ctx context.Context) (*nextStateInputs
}
inputs.RaftConfig = raftConfig
leader := a.raft.Leader()
for _, s := range inputs.RaftConfig.Servers {
if s.Address == leader {
inputs.LeaderID = s.ID
// get the known servers which may include left/failed ones
inputs.KnownServers = a.delegate.KnownServers()
// Try to retrieve leader id from the delegate.
for id, srv := range inputs.KnownServers {
if srv.IsLeader {
inputs.LeaderID = id
break
}
}
// Delegate setting the leader information is optional. If leader detection is
// not successful, fallback on raft config to do the same.
if inputs.LeaderID == "" {
return nil, fmt.Errorf("cannot detect the current leader server id from its address: %s", leader)
leader := a.raft.Leader()
for _, s := range inputs.RaftConfig.Servers {
if s.Address == leader {
inputs.LeaderID = s.ID
break
}
}
if inputs.LeaderID == "" {
return nil, fmt.Errorf("cannot detect the current leader server id from its address: %s", leader)
}
}
// get the latest Raft index - this should be kept close to the call to
@ -101,9 +140,6 @@ func (a *Autopilot) gatherNextStateInputs(ctx context.Context) (*nextStateInputs
return nil, ctx.Err()
}
// get the known servers which may include left/failed ones
inputs.KnownServers = a.delegate.KnownServers()
// in most cases getting the known servers should be quick but as we cannot
// account for every potential delegate and prevent them from making
// blocking network requests we should probably check the context again.
@ -146,10 +182,13 @@ func (a *Autopilot) nextState(ctx context.Context) (*State, error) {
func (a *Autopilot) nextStateWithInputs(inputs *nextStateInputs) *State {
nextServers := a.nextServers(inputs)
// we record the firstStateTime so that we can ignore the server stabilization
// time up until the time we generated the first state becomes far enough
// in the past. Until that point in time all servers are considered stable.
newState := &State{
startTime: inputs.StartTime,
Healthy: true,
Servers: nextServers,
firstStateTime: inputs.FirstStateTime,
Healthy: true,
Servers: nextServers,
}
voterCount := 0

View File

@ -85,6 +85,7 @@ type Server struct {
Version string
Meta map[string]string
RaftVersion int
IsLeader bool
// The remaining fields are those that the promoter
// will fill in
@ -166,7 +167,7 @@ type ServerStats struct {
}
type State struct {
startTime time.Time
firstStateTime time.Time
Healthy bool
FailureTolerance int
Servers map[raft.ServerID]*ServerState
@ -177,14 +178,11 @@ type State struct {
func (s *State) ServerStabilizationTime(c *Config) time.Duration {
// Only use the configured stabilization time when autopilot has
// been running for 110% of the configured stabilization time.
// Before that time we haven't been running long enough to
// be able to take these values into account. 110% is pretty
// arbitrary but with the default config would prevent the
// stabilization time from mattering for an extra second. This
// allows for leeway in how quickly we get the healthy RPC responses
// after autopilot is started.
if time.Since(s.startTime) > (c.ServerStabilizationTime*110)/100 {
// been running for longer than that stabilization time, measured from
// when the first state was generated. If it hasn't been running that long
// then every check against the stabilization time is guaranteed to
// fail, which would result in excessive leader elections.
if time.Since(s.firstStateTime) > c.ServerStabilizationTime {
return c.ServerStabilizationTime
}

2
vendor/modules.txt vendored
View File

@ -280,7 +280,7 @@ github.com/hashicorp/memberlist
github.com/hashicorp/net-rpc-msgpackrpc
# github.com/hashicorp/raft v1.2.0
github.com/hashicorp/raft
# github.com/hashicorp/raft-autopilot v0.1.2
# github.com/hashicorp/raft-autopilot v0.1.5
github.com/hashicorp/raft-autopilot
# github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea
github.com/hashicorp/raft-boltdb