From ada4d21285888af51c74e12b4fb9a7f0448acb6d Mon Sep 17 00:00:00 2001 From: Matt Keeler Date: Thu, 27 May 2021 13:23:18 -0400 Subject: [PATCH] Bump raft-autopilot to the latest version (#10310) --- .changelog/10306.txt | 3 + go.mod | 2 +- go.sum | 13 +-- .../hashicorp/raft-autopilot/autopilot.go | 10 --- .../hashicorp/raft-autopilot/run.go | 16 +++- .../hashicorp/raft-autopilot/state.go | 83 ++++++++++++++----- .../hashicorp/raft-autopilot/types.go | 16 ++-- vendor/modules.txt | 2 +- 8 files changed, 90 insertions(+), 55 deletions(-) create mode 100644 .changelog/10306.txt diff --git a/.changelog/10306.txt b/.changelog/10306.txt new file mode 100644 index 0000000000..3a0154b924 --- /dev/null +++ b/.changelog/10306.txt @@ -0,0 +1,3 @@ +```release-note:bug + autopilot: **(Enterprise only)** Fixed an issue where autopilot could cause a new leader to demote the wrong voter when redundancy zones are in use and the previous leader failed. + ``` diff --git a/go.mod b/go.mod index 40ebe74479..b193dc6d9c 100644 --- a/go.mod +++ b/go.mod @@ -53,7 +53,7 @@ require ( github.com/hashicorp/memberlist v0.2.2 github.com/hashicorp/net-rpc-msgpackrpc v0.0.0-20151116020338-a14192a58a69 github.com/hashicorp/raft v1.2.0 - github.com/hashicorp/raft-autopilot v0.1.2 + github.com/hashicorp/raft-autopilot v0.1.5 github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea github.com/hashicorp/serf v0.9.5 github.com/hashicorp/vault/api v1.0.5-0.20200717191844-f687267c8086 diff --git a/go.sum b/go.sum index c02b31c71d..ef46f0235f 100644 --- a/go.sum +++ b/go.sum @@ -32,7 +32,6 @@ github.com/Azure/go-autorest/logger v0.1.0 h1:ruG4BSDXONFRrZZJ2GUXDiUyVpayPmb1Gn github.com/Azure/go-autorest/logger v0.1.0/go.mod h1:oExouG+K6PryycPJfVSxi/koC6LSNgds39diKLz7Vrc= github.com/Azure/go-autorest/tracing v0.5.0 h1:TRn4WjSnkcSy5AEG3pnbtFSwNtwzjr4VYyQflFE619k= github.com/Azure/go-autorest/tracing v0.5.0/go.mod h1:r/s2XiOKccPW3HrqB+W0TQzfbtp2fGCgRFtBroKn4Dk= -github.com/BurntSushi/toml v0.3.1 
h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/DataDog/datadog-go v2.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ= github.com/DataDog/datadog-go v3.2.0+incompatible h1:qSG2N4FghB1He/r2mFrWKCaL7dXCilEuNEeAn20fdD4= @@ -68,7 +67,6 @@ github.com/aws/aws-sdk-go v1.25.37/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpi github.com/aws/aws-sdk-go v1.25.41 h1:/hj7nZ0586wFqpwjNpzWiUTwtaMgxAZNZKHay80MdXw= github.com/aws/aws-sdk-go v1.25.41/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= -github.com/beorn7/perks v1.0.0 h1:HWo1m869IqiPhD389kmkxeTalrjNbbJTC8LXupb+sl0= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= @@ -139,7 +137,6 @@ github.com/frankban/quicktest v1.11.0 h1:Yyrghcw93e1jKo4DTZkRFTTFvBsVhzbblBUPNU1 github.com/frankban/quicktest v1.11.0/go.mod h1:K+q6oSqb0W0Ininfk863uOk1lMy69l/P6txr3mVT54s= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/ghodss/yaml v0.0.0-20150909031657-73d445a93680/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= -github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/go-asn1-ber/asn1-ber v1.3.1/go.mod h1:hEBeB/ic+5LoWskz+yKT7vGhhPYkProFKoKdwZRWMe0= github.com/go-check/check v0.0.0-20140225173054-eb6ee6f84d0a/go.mod h1:9ES+weclKsC9YodN5RgxqK/VD9HM9JsCSh7rNhMZE98= @@ -171,7 +168,6 @@ github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfb github.com/golang/mock v1.2.0/go.mod 
h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/protobuf v0.0.0-20161109072736-4bd1920723d7/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.5 h1:F768QJ1E9tib+q5Sc8MkdJi1RxLTbRcTf8LJV56aRls= @@ -182,10 +178,8 @@ github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Z github.com/google/btree v1.0.0 h1:0udJVsspx3VBr5FwtLhQQtuAsVc79tTq0ocGIPAU6qo= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= -github.com/google/go-cmp v0.3.0 h1:crn/baboCvb5fXaQ0IJ1SGTsTVrWpDsCWC8EGETZijY= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= -github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.2 h1:X2ev0eStA3AbceY54o37/0PQ/UWqKEiiO2dKL5OPaFM= github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= @@ -193,7 +187,6 @@ github.com/google/go-querystring v0.0.0-20170111101155-53e6ce116135/go.mod h1:od github.com/google/go-querystring v1.0.0 h1:Xkwi/a1rcvNg1PPYe5vI8GbeBY/jrVuDX5ASuANWTrk= github.com/google/go-querystring v1.0.0/go.mod h1:odCYkC5MyYFN7vkCjXpyrEuKhc/BUO6wN/zVPAxq5ck= github.com/google/gofuzz v0.0.0-20161122191042-44d81051d367/go.mod h1:HP5RmnzzSNb993RKQDq4+1A4ia9nllfqcQFTQJedwGI= -github.com/google/gofuzz v1.0.0 
h1:A8PeW59pxE9IoFRqBp37U+mSNaQoZ46F1f0f863XSXw= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= @@ -232,7 +225,6 @@ github.com/hashicorp/go-discover v0.0.0-20200501174627-ad1e96bde088/go.mod h1:vZ github.com/hashicorp/go-hclog v0.0.0-20180709165350-ff2cf002a8dd/go.mod h1:9bjs9uLqI8l75knNv3lV1kA55veR+WUPSiKIWcQHudI= github.com/hashicorp/go-hclog v0.9.1/go.mod h1:5CU+agLiy3J7N7QjHK5d05KxGsuXiQLrjA0H7acj2lQ= github.com/hashicorp/go-hclog v0.9.2/go.mod h1:5CU+agLiy3J7N7QjHK5d05KxGsuXiQLrjA0H7acj2lQ= -github.com/hashicorp/go-hclog v0.12.0 h1:d4QkX8FRTYaKaCZBoXYY8zJX2BXjWxurN/GA2tkrmZM= github.com/hashicorp/go-hclog v0.12.0/go.mod h1:whpDNt7SSdeAju8AWKIWsul05p54N/39EeqMAyrmvFQ= github.com/hashicorp/go-hclog v0.14.1 h1:nQcJDQwIAGnmoUWp8ubocEX40cCml/17YkF6csQLReU= github.com/hashicorp/go-hclog v0.14.1/go.mod h1:whpDNt7SSdeAju8AWKIWsul05p54N/39EeqMAyrmvFQ= @@ -278,7 +270,6 @@ github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T github.com/hashicorp/hil v0.0.0-20200423225030-a18a1cd20038 h1:n9J0rwVWXDpNd5iZnwY7w4WZyq53/rROeI7OVvLW8Ok= github.com/hashicorp/hil v0.0.0-20200423225030-a18a1cd20038/go.mod h1:n2TSygSNwsLJ76m8qFXTSc7beTb+auJxYdqrnoqwZWE= github.com/hashicorp/logutils v1.0.0/go.mod h1:QIAnNjmIWmVIIkWDTG1z5v++HQmx9WQRO+LraFDTW64= -github.com/hashicorp/mdns v1.0.1 h1:XFSOubp8KWB+Jd2PDyaX5xUd5bhSP/+pTDZVDMzZJM8= github.com/hashicorp/mdns v1.0.1/go.mod h1:4gW7WsVCke5TE7EPeYliwHlRUyBtfCwuFwuMg2DmyNY= github.com/hashicorp/mdns v1.0.4 h1:sY0CMhFmjIPDMlTB+HfymFHCaYLhgifZ0QhjaYKD/UQ= github.com/hashicorp/mdns v1.0.4/go.mod h1:mtBihi+LeNXGtG8L9dX59gAEa12BDtBQSp4v/YAJqrc= @@ -289,8 +280,8 @@ github.com/hashicorp/net-rpc-msgpackrpc v0.0.0-20151116020338-a14192a58a69/go.mo github.com/hashicorp/raft v1.1.1/go.mod 
h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8= github.com/hashicorp/raft v1.2.0 h1:mHzHIrF0S91d3A7RPBvuqkgB4d/7oFJZyvf1Q4m7GA0= github.com/hashicorp/raft v1.2.0/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8= -github.com/hashicorp/raft-autopilot v0.1.2 h1:yeqdUjWLjVJkBM+mcVxqwxi+w+aHsb9cEON2dz69OCs= -github.com/hashicorp/raft-autopilot v0.1.2/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw= +github.com/hashicorp/raft-autopilot v0.1.5 h1:onEfMH5uHVdXQqtas36zXUHEZxLdsJVu/nXHLcLdL1I= +github.com/hashicorp/raft-autopilot v0.1.5/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw= github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea h1:xykPFhrBAS2J0VBzVa5e80b5ZtYuNQtgXjN40qBZlD4= github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk= github.com/hashicorp/serf v0.9.5 h1:EBWvyu9tcRszt3Bxp3KNssBMP1KuHWyO51lz9+786iM= diff --git a/vendor/github.com/hashicorp/raft-autopilot/autopilot.go b/vendor/github.com/hashicorp/raft-autopilot/autopilot.go index d0640fc85a..14c3f81419 100644 --- a/vendor/github.com/hashicorp/raft-autopilot/autopilot.go +++ b/vendor/github.com/hashicorp/raft-autopilot/autopilot.go @@ -147,16 +147,6 @@ type Autopilot struct { // racing. stateLock sync.RWMutex - // startTime is recorded so that we can make better determinations about server - // stability during the initial period of time after autopilot first starts. - // If autopilot has just started the default behavior to check if a server is - // stable will not work as it will ensure the server has been healthy for - // the configured server stabilization time. If that configure time is longer - // than the amount of time autopilot has been running you can run into issues - // with leadership flapping during some scenarios where a cluster is being - // brought up. 
- startTime time.Time - // removeDeadCh is used to trigger the running autopilot go routines to // find and remove any dead/failed servers removeDeadCh chan struct{} diff --git a/vendor/github.com/hashicorp/raft-autopilot/run.go b/vendor/github.com/hashicorp/raft-autopilot/run.go index 382fd11464..970f616900 100644 --- a/vendor/github.com/hashicorp/raft-autopilot/run.go +++ b/vendor/github.com/hashicorp/raft-autopilot/run.go @@ -18,7 +18,6 @@ func (a *Autopilot) Start(ctx context.Context) { } ctx, shutdown := context.WithCancel(ctx) - a.startTime = a.time.Now() exec := &execInfo{ status: Running, @@ -128,6 +127,21 @@ func (a *Autopilot) beginExecution(ctx context.Context, exec *execInfo) { a.logger.Debug("autopilot is now stopped") + // We need to gain this lock so that we can zero out the previous state. + // This prevents us from accidentally tracking stale state in the event + // that we used to be the leader at some point in time, then weren't + // and now are again. In particular this will ensure that we forget + // about our tracking of the firstStateTime so that once restarted, we + // will ignore server stabilization time just like we do the very + // first time this process ever was the leader. + // + // This isn't included in finishExecution so that we don't perform it + // if we fail to gain the leaderLock before the context gets cancelled + // back at the beginning of this function. 
+ a.stateLock.Lock() + defer a.stateLock.Unlock() + a.state = &State{} + a.finishExecution(exec) a.leaderLock.Unlock() }() diff --git a/vendor/github.com/hashicorp/raft-autopilot/state.go b/vendor/github.com/hashicorp/raft-autopilot/state.go index 035357bea1..d44cdbc26d 100644 --- a/vendor/github.com/hashicorp/raft-autopilot/state.go +++ b/vendor/github.com/hashicorp/raft-autopilot/state.go @@ -27,15 +27,15 @@ func aliveServers(servers map[raft.ServerID]*Server) map[raft.ServerID]*Server { // nextStateInputs is the collection of values that can influence // creation of the next State. type nextStateInputs struct { - Now time.Time - StartTime time.Time - Config *Config - RaftConfig *raft.Configuration - KnownServers map[raft.ServerID]*Server - LatestIndex uint64 - LastTerm uint64 - FetchedStats map[raft.ServerID]*ServerStats - LeaderID raft.ServerID + Now time.Time + FirstStateTime time.Time + Config *Config + RaftConfig *raft.Configuration + KnownServers map[raft.ServerID]*Server + LatestIndex uint64 + LastTerm uint64 + FetchedStats map[raft.ServerID]*ServerStats + LeaderID raft.ServerID } // gatherNextStateInputs gathers all the information that would be used to @@ -52,9 +52,34 @@ type nextStateInputs struct { func (a *Autopilot) gatherNextStateInputs(ctx context.Context) (*nextStateInputs, error) { // there are a lot of inputs to computing the next state so they get put into a // struct so that we don't have to return 8 values. + + now := a.time.Now() + + // We need to pull the previous state's knowledge of the first time a state was generated. + // This is really only important for when autopilot is first started. We will use the + // first state's time when determining if a server is stable. Under normal circumstances + // we need to just check that the current time - the server's StableSince time is greater + // than the configured stabilization time. 
However while autopilot has been running for + // less time than the stabilization time we need to consider all servers as stable + // to prevent unnecessary leader elections. Therefore it's important to track the first + // time a state was generated so we know if we have a state old enough where there is + // any chance of seeing servers as stable based off that configured threshold. + var firstStateTime time.Time + a.stateLock.Lock() + if a.state != nil { + firstStateTime = a.state.firstStateTime + } + a.stateLock.Unlock() + + // firstStateTime will be the zero value if we are in the process of generating + // the first state. In that case we set it to the now time. + if firstStateTime.IsZero() { + firstStateTime = now + } + inputs := &nextStateInputs{ - Now: a.time.Now(), - StartTime: a.startTime, + Now: now, + FirstStateTime: firstStateTime, } // grab the latest autopilot configuration @@ -71,16 +96,30 @@ func (a *Autopilot) gatherNextStateInputs(ctx context.Context) (*nextStateInputs } inputs.RaftConfig = raftConfig - leader := a.raft.Leader() - for _, s := range inputs.RaftConfig.Servers { - if s.Address == leader { - inputs.LeaderID = s.ID + // get the known servers which may include left/failed ones + inputs.KnownServers = a.delegate.KnownServers() + + // Try to retrieve leader id from the delegate. + for id, srv := range inputs.KnownServers { + if srv.IsLeader { + inputs.LeaderID = id break } } + // Delegate setting the leader information is optional. If leader detection is + // not successful, fall back on raft config to do the same. 
if inputs.LeaderID == "" { - return nil, fmt.Errorf("cannot detect the current leader server id from its address: %s", leader) + leader := a.raft.Leader() + for _, s := range inputs.RaftConfig.Servers { + if s.Address == leader { + inputs.LeaderID = s.ID + break + } + } + if inputs.LeaderID == "" { + return nil, fmt.Errorf("cannot detect the current leader server id from its address: %s", leader) + } } // get the latest Raft index - this should be kept close to the call to @@ -101,9 +140,6 @@ func (a *Autopilot) gatherNextStateInputs(ctx context.Context) (*nextStateInputs return nil, ctx.Err() } - // get the known servers which may include left/failed ones - inputs.KnownServers = a.delegate.KnownServers() - // in most cases getting the known servers should be quick but as we cannot // account for every potential delegate and prevent them from making // blocking network requests we should probably check the context again. @@ -146,10 +182,13 @@ func (a *Autopilot) nextState(ctx context.Context) (*State, error) { func (a *Autopilot) nextStateWithInputs(inputs *nextStateInputs) *State { nextServers := a.nextServers(inputs) + // we record the firstStateTime so that we can ignore the server stabilization + // time up until the time we generated the first state becomes far enough + // in the past. Until that point in time all servers are considered stable. 
newState := &State{ - startTime: inputs.StartTime, - Healthy: true, - Servers: nextServers, + firstStateTime: inputs.FirstStateTime, + Healthy: true, + Servers: nextServers, } voterCount := 0 diff --git a/vendor/github.com/hashicorp/raft-autopilot/types.go b/vendor/github.com/hashicorp/raft-autopilot/types.go index c96fb9a31d..021fd8b06b 100644 --- a/vendor/github.com/hashicorp/raft-autopilot/types.go +++ b/vendor/github.com/hashicorp/raft-autopilot/types.go @@ -85,6 +85,7 @@ type Server struct { Version string Meta map[string]string RaftVersion int + IsLeader bool // The remaining fields are those that the promoter // will fill in @@ -166,7 +167,7 @@ type ServerStats struct { } type State struct { - startTime time.Time + firstStateTime time.Time Healthy bool FailureTolerance int Servers map[raft.ServerID]*ServerState @@ -177,14 +178,11 @@ type State struct { func (s *State) ServerStabilizationTime(c *Config) time.Duration { // Only use the configured stabilization time when autopilot has - // been running for 110% of the configured stabilization time. - // Before that time we haven't been running long enough to - // be able to take these values into account. 110% is pretty - // arbitrary but with the default config would prevent the - // stabilization time from mattering for an extra second. This - // allows for leeway in how quickly we get the healthy RPC responses - // after autopilot is started. - if time.Since(s.startTime) > (c.ServerStabilizationTime*110)/100 { + // been running for at least as long as when the first state was + // generated. If it hasn't been running that long then we would + // guarantee that all checks against the stabilization time will + // fail which will result in excessive leader elections. 
+ if time.Since(s.firstStateTime) > c.ServerStabilizationTime { return c.ServerStabilizationTime } diff --git a/vendor/modules.txt b/vendor/modules.txt index 0ddb68f70e..baa4e91dd0 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -280,7 +280,7 @@ github.com/hashicorp/memberlist github.com/hashicorp/net-rpc-msgpackrpc # github.com/hashicorp/raft v1.2.0 github.com/hashicorp/raft -# github.com/hashicorp/raft-autopilot v0.1.2 +# github.com/hashicorp/raft-autopilot v0.1.5 github.com/hashicorp/raft-autopilot # github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea github.com/hashicorp/raft-boltdb