mirror of https://github.com/status-im/consul.git
Set MinQuorum variable in Autopilot (#6654)
* Add MinQuorum to Autopilot
This commit is contained in:
parent
64d099c020
commit
5e1c6e907b
|
@ -1204,6 +1204,7 @@ func (a *Agent) consulConfig() (*consul.Config, error) {
|
|||
base.AutopilotConfig.CleanupDeadServers = a.config.AutopilotCleanupDeadServers
|
||||
base.AutopilotConfig.LastContactThreshold = a.config.AutopilotLastContactThreshold
|
||||
base.AutopilotConfig.MaxTrailingLogs = uint64(a.config.AutopilotMaxTrailingLogs)
|
||||
base.AutopilotConfig.MinQuorum = a.config.AutopilotMinQuorum
|
||||
base.AutopilotConfig.ServerStabilizationTime = a.config.AutopilotServerStabilizationTime
|
||||
base.AutopilotConfig.RedundancyZoneTag = a.config.AutopilotRedundancyZoneTag
|
||||
base.AutopilotConfig.DisableUpgradeMigration = a.config.AutopilotDisableUpgradeMigration
|
||||
|
|
|
@ -728,6 +728,7 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) {
|
|||
AutopilotDisableUpgradeMigration: b.boolVal(c.Autopilot.DisableUpgradeMigration),
|
||||
AutopilotLastContactThreshold: b.durationVal("autopilot.last_contact_threshold", c.Autopilot.LastContactThreshold),
|
||||
AutopilotMaxTrailingLogs: b.intVal(c.Autopilot.MaxTrailingLogs),
|
||||
AutopilotMinQuorum: b.uintVal(c.Autopilot.MinQuorum),
|
||||
AutopilotRedundancyZoneTag: b.stringVal(c.Autopilot.RedundancyZoneTag),
|
||||
AutopilotServerStabilizationTime: b.durationVal("autopilot.server_stabilization_time", c.Autopilot.ServerStabilizationTime),
|
||||
AutopilotUpgradeVersionTag: b.stringVal(c.Autopilot.UpgradeVersionTag),
|
||||
|
@ -1444,6 +1445,17 @@ func (b *Builder) intVal(v *int) int {
|
|||
return b.intValWithDefault(v, 0)
|
||||
}
|
||||
|
||||
func (b *Builder) uintVal(v *uint) uint {
|
||||
return b.uintValWithDefault(v, 0)
|
||||
}
|
||||
|
||||
func (b *Builder) uintValWithDefault(v *uint, defaultVal uint) uint {
|
||||
if v == nil {
|
||||
return defaultVal
|
||||
}
|
||||
return *v
|
||||
}
|
||||
|
||||
func (b *Builder) uint64ValWithDefault(v *uint64, defaultVal uint64) uint64 {
|
||||
if v == nil {
|
||||
return defaultVal
|
||||
|
|
|
@ -362,6 +362,7 @@ type Autopilot struct {
|
|||
DisableUpgradeMigration *bool `json:"disable_upgrade_migration,omitempty" hcl:"disable_upgrade_migration" mapstructure:"disable_upgrade_migration"`
|
||||
LastContactThreshold *string `json:"last_contact_threshold,omitempty" hcl:"last_contact_threshold" mapstructure:"last_contact_threshold"`
|
||||
MaxTrailingLogs *int `json:"max_trailing_logs,omitempty" hcl:"max_trailing_logs" mapstructure:"max_trailing_logs"`
|
||||
MinQuorum *uint `json:"min_quorum,omitempty" hcl:"min_quorum" mapstructure:"min_quorum"`
|
||||
RedundancyZoneTag *string `json:"redundancy_zone_tag,omitempty" hcl:"redundancy_zone_tag" mapstructure:"redundancy_zone_tag"`
|
||||
ServerStabilizationTime *string `json:"server_stabilization_time,omitempty" hcl:"server_stabilization_time" mapstructure:"server_stabilization_time"`
|
||||
UpgradeVersionTag *string `json:"upgrade_version_tag,omitempty" hcl:"upgrade_version_tag" mapstructure:"upgrade_version_tag"`
|
||||
|
|
|
@ -196,6 +196,12 @@ type RuntimeConfig struct {
|
|||
// hcl: autopilot { max_trailing_logs = int }
|
||||
AutopilotMaxTrailingLogs int
|
||||
|
||||
// AutopilotMinQuorum sets the minimum number of servers required in a cluster
|
||||
// before autopilot can prune dead servers.
|
||||
//
|
||||
//hcl: autopilot { min_quorum = int }
|
||||
AutopilotMinQuorum uint
|
||||
|
||||
// AutopilotRedundancyZoneTag is the Meta tag to use for separating servers
|
||||
// into zones for redundancy. If left blank, this feature will be disabled.
|
||||
// (Enterprise-only)
|
||||
|
|
|
@ -3520,6 +3520,7 @@ func TestFullConfig(t *testing.T) {
|
|||
"disable_upgrade_migration": true,
|
||||
"last_contact_threshold": "12705s",
|
||||
"max_trailing_logs": 17849,
|
||||
"min_quorum": 3,
|
||||
"redundancy_zone_tag": "3IsufDJf",
|
||||
"server_stabilization_time": "23057s",
|
||||
"upgrade_version_tag": "W9pDwFAL"
|
||||
|
@ -4117,6 +4118,7 @@ func TestFullConfig(t *testing.T) {
|
|||
disable_upgrade_migration = true
|
||||
last_contact_threshold = "12705s"
|
||||
max_trailing_logs = 17849
|
||||
min_quorum = 3
|
||||
redundancy_zone_tag = "3IsufDJf"
|
||||
server_stabilization_time = "23057s"
|
||||
upgrade_version_tag = "W9pDwFAL"
|
||||
|
@ -4819,6 +4821,7 @@ func TestFullConfig(t *testing.T) {
|
|||
AutopilotDisableUpgradeMigration: true,
|
||||
AutopilotLastContactThreshold: 12705 * time.Second,
|
||||
AutopilotMaxTrailingLogs: 17849,
|
||||
AutopilotMinQuorum: 3,
|
||||
AutopilotRedundancyZoneTag: "3IsufDJf",
|
||||
AutopilotServerStabilizationTime: 23057 * time.Second,
|
||||
AutopilotUpgradeVersionTag: "W9pDwFAL",
|
||||
|
@ -5703,6 +5706,7 @@ func TestSanitize(t *testing.T) {
|
|||
"AutopilotDisableUpgradeMigration": false,
|
||||
"AutopilotLastContactThreshold": "0s",
|
||||
"AutopilotMaxTrailingLogs": 0,
|
||||
"AutopilotMinQuorum": 0,
|
||||
"AutopilotRedundancyZoneTag": "",
|
||||
"AutopilotServerStabilizationTime": "0s",
|
||||
"AutopilotUpgradeVersionTag": "",
|
||||
|
|
|
@ -234,7 +234,7 @@ func (a *Autopilot) pruneDeadServers() error {
|
|||
|
||||
// Only do removals if a minority of servers will be affected.
|
||||
peers := NumPeers(raftConfig)
|
||||
if removalCount < peers/2 {
|
||||
if peers-removalCount >= int(conf.MinQuorum) && removalCount < peers/2 {
|
||||
for _, node := range failed {
|
||||
a.logger.Printf("[INFO] autopilot: Attempting removal of failed server node %q", node.Name)
|
||||
go serfLAN.RemoveFailedNode(node.Name)
|
||||
|
|
|
@ -20,6 +20,10 @@ type Config struct {
|
|||
// be behind before being considered unhealthy.
|
||||
MaxTrailingLogs uint64
|
||||
|
||||
// MinQuorum sets the minimum number of servers required in a cluster
|
||||
// before autopilot can prune dead servers.
|
||||
MinQuorum uint
|
||||
|
||||
// ServerStabilizationTime is the minimum amount of time a server must be
|
||||
// in a stable, healthy state before it can be added to the cluster. Only
|
||||
// applicable with Raft protocol version 3 or higher.
|
||||
|
|
|
@ -369,3 +369,97 @@ func TestAutopilot_PromoteNonVoter(t *testing.T) {
|
|||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestAutopilot_BootstrapExpect(t *testing.T) {
|
||||
dc := "dc1"
|
||||
closeMap := make(map[string]chan struct{})
|
||||
conf := func(c *Config) {
|
||||
c.Datacenter = dc
|
||||
c.Bootstrap = false
|
||||
c.BootstrapExpect = 3
|
||||
c.AutopilotConfig.MinQuorum = 3
|
||||
c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(2)
|
||||
c.AutopilotInterval = 100 * time.Millisecond
|
||||
//Let us know when a server is actually gone
|
||||
ch := make(chan struct{})
|
||||
c.NotifyShutdown = func() {
|
||||
t.Logf("%v is shutdown", c.NodeName)
|
||||
close(ch)
|
||||
}
|
||||
closeMap[c.NodeName] = ch
|
||||
}
|
||||
dir1, s1 := testServerWithConfig(t, conf)
|
||||
defer os.RemoveAll(dir1)
|
||||
defer s1.Shutdown()
|
||||
|
||||
dir2, s2 := testServerWithConfig(t, conf)
|
||||
defer os.RemoveAll(dir2)
|
||||
defer s2.Shutdown()
|
||||
|
||||
dir3, s3 := testServerWithConfig(t, conf)
|
||||
defer os.RemoveAll(dir3)
|
||||
defer s3.Shutdown()
|
||||
|
||||
dir4, s4 := testServerWithConfig(t, conf)
|
||||
defer os.RemoveAll(dir4)
|
||||
defer s4.Shutdown()
|
||||
|
||||
servers := map[string]*Server{s1.config.NodeName: s1,
|
||||
s2.config.NodeName: s2,
|
||||
s3.config.NodeName: s3,
|
||||
s4.config.NodeName: s4}
|
||||
|
||||
// Try to join
|
||||
joinLAN(t, s2, s1)
|
||||
joinLAN(t, s3, s1)
|
||||
joinLAN(t, s4, s1)
|
||||
|
||||
//Differentiate between leader and server
|
||||
findStatus := func(leader bool) *Server {
|
||||
for _, mem := range servers {
|
||||
if mem.IsLeader() && leader {
|
||||
return mem
|
||||
}
|
||||
if !leader && !mem.IsLeader() {
|
||||
return mem
|
||||
}
|
||||
}
|
||||
|
||||
t.Fatalf("no members set")
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, s := range servers {
|
||||
testrpc.WaitForLeader(t, s.RPC, dc)
|
||||
retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 4)) })
|
||||
}
|
||||
|
||||
// Have autopilot take one into left
|
||||
dead := findStatus(false)
|
||||
dead.Shutdown()
|
||||
<-closeMap[dead.config.NodeName]
|
||||
retry.Run(t, func(r *retry.R) {
|
||||
leader := findStatus(true)
|
||||
for _, m := range leader.LANMembers() {
|
||||
if m.Name == dead.config.NodeName && m.Status != serf.StatusLeft {
|
||||
r.Fatalf("%v should be left, got %v", m.Name, m.Status.String())
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
delete(servers, dead.config.NodeName)
|
||||
//Autopilot should not take this one into left
|
||||
dead = findStatus(false)
|
||||
dead.Shutdown()
|
||||
<-closeMap[dead.config.NodeName]
|
||||
|
||||
retry.Run(t, func(r *retry.R) {
|
||||
leader := findStatus(true)
|
||||
for _, m := range leader.LANMembers() {
|
||||
if m.Name == dead.config.NodeName && m.Status != serf.StatusFailed {
|
||||
r.Fatalf("%v should be failed, got %v", m.Name, m.Status.String())
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
}
|
||||
|
|
|
@ -11,7 +11,7 @@ import (
|
|||
"github.com/hashicorp/consul/agent/structs"
|
||||
"github.com/hashicorp/consul/sdk/testutil/retry"
|
||||
"github.com/hashicorp/consul/testrpc"
|
||||
"github.com/hashicorp/net-rpc-msgpackrpc"
|
||||
msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"
|
||||
"github.com/hashicorp/raft"
|
||||
)
|
||||
|
||||
|
@ -116,6 +116,7 @@ func TestOperator_Autopilot_SetConfiguration(t *testing.T) {
|
|||
Datacenter: "dc1",
|
||||
Config: autopilot.Config{
|
||||
CleanupDeadServers: true,
|
||||
MinQuorum: 3,
|
||||
},
|
||||
}
|
||||
var reply *bool
|
||||
|
@ -130,7 +131,7 @@ func TestOperator_Autopilot_SetConfiguration(t *testing.T) {
|
|||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !config.CleanupDeadServers {
|
||||
if !config.CleanupDeadServers && config.MinQuorum != 3 {
|
||||
t.Fatalf("bad: %#v", config)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -201,6 +201,7 @@ func (s *HTTPServer) OperatorAutopilotConfiguration(resp http.ResponseWriter, re
|
|||
CleanupDeadServers: reply.CleanupDeadServers,
|
||||
LastContactThreshold: api.NewReadableDuration(reply.LastContactThreshold),
|
||||
MaxTrailingLogs: reply.MaxTrailingLogs,
|
||||
MinQuorum: reply.MinQuorum,
|
||||
ServerStabilizationTime: api.NewReadableDuration(reply.ServerStabilizationTime),
|
||||
RedundancyZoneTag: reply.RedundancyZoneTag,
|
||||
DisableUpgradeMigration: reply.DisableUpgradeMigration,
|
||||
|
@ -226,6 +227,7 @@ func (s *HTTPServer) OperatorAutopilotConfiguration(resp http.ResponseWriter, re
|
|||
CleanupDeadServers: conf.CleanupDeadServers,
|
||||
LastContactThreshold: conf.LastContactThreshold.Duration(),
|
||||
MaxTrailingLogs: conf.MaxTrailingLogs,
|
||||
MinQuorum: conf.MinQuorum,
|
||||
ServerStabilizationTime: conf.ServerStabilizationTime.Duration(),
|
||||
RedundancyZoneTag: conf.RedundancyZoneTag,
|
||||
DisableUpgradeMigration: conf.DisableUpgradeMigration,
|
||||
|
|
|
@ -25,6 +25,10 @@ type AutopilotConfiguration struct {
|
|||
// be behind before being considered unhealthy.
|
||||
MaxTrailingLogs uint64
|
||||
|
||||
// MinQuorum sets the minimum number of servers allowed in a cluster before
|
||||
// autopilot can prune dead servers.
|
||||
MinQuorum uint
|
||||
|
||||
// ServerStabilizationTime is the minimum amount of time a server must be
|
||||
// in a stable, healthy state before it can be added to the cluster. Only
|
||||
// applicable with Raft protocol version 3 or higher.
|
||||
|
|
|
@ -58,6 +58,7 @@ func (c *cmd) Run(args []string) int {
|
|||
c.UI.Output(fmt.Sprintf("CleanupDeadServers = %v", config.CleanupDeadServers))
|
||||
c.UI.Output(fmt.Sprintf("LastContactThreshold = %v", config.LastContactThreshold.String()))
|
||||
c.UI.Output(fmt.Sprintf("MaxTrailingLogs = %v", config.MaxTrailingLogs))
|
||||
c.UI.Output(fmt.Sprintf("MinQuorum = %v", config.MinQuorum))
|
||||
c.UI.Output(fmt.Sprintf("ServerStabilizationTime = %v", config.ServerStabilizationTime.String()))
|
||||
c.UI.Output(fmt.Sprintf("RedundancyZoneTag = %q", config.RedundancyZoneTag))
|
||||
c.UI.Output(fmt.Sprintf("DisableUpgradeMigration = %v", config.DisableUpgradeMigration))
|
||||
|
|
|
@ -25,6 +25,7 @@ type cmd struct {
|
|||
// flags
|
||||
cleanupDeadServers flags.BoolValue
|
||||
maxTrailingLogs flags.UintValue
|
||||
minQuorum flags.UintValue
|
||||
lastContactThreshold flags.DurationValue
|
||||
serverStabilizationTime flags.DurationValue
|
||||
redundancyZoneTag flags.StringValue
|
||||
|
@ -40,6 +41,9 @@ func (c *cmd) init() {
|
|||
c.flags.Var(&c.maxTrailingLogs, "max-trailing-logs",
|
||||
"Controls the maximum number of log entries that a server can trail the "+
|
||||
"leader by before being considered unhealthy.")
|
||||
c.flags.Var(&c.minQuorum, "min-quorum",
|
||||
"Sets the minimum number of of servers required in a cluster before autopilot "+
|
||||
"is allowed to prune dead servers.")
|
||||
c.flags.Var(&c.lastContactThreshold, "last-contact-threshold",
|
||||
"Controls the maximum amount of time a server can go without contact "+
|
||||
"from the leader before being considered unhealthy. Must be a duration value "+
|
||||
|
@ -94,6 +98,7 @@ func (c *cmd) Run(args []string) int {
|
|||
c.redundancyZoneTag.Merge(&conf.RedundancyZoneTag)
|
||||
c.disableUpgradeMigration.Merge(&conf.DisableUpgradeMigration)
|
||||
c.upgradeVersionTag.Merge(&conf.UpgradeVersionTag)
|
||||
c.minQuorum.Merge(&conf.MinQuorum)
|
||||
|
||||
trailing := uint(conf.MaxTrailingLogs)
|
||||
c.maxTrailingLogs.Merge(&trailing)
|
||||
|
|
|
@ -34,6 +34,7 @@ func TestOperatorAutopilotSetConfigCommand(t *testing.T) {
|
|||
"-max-trailing-logs=99",
|
||||
"-last-contact-threshold=123ms",
|
||||
"-server-stabilization-time=123ms",
|
||||
"-min-quorum=3",
|
||||
}
|
||||
|
||||
code := c.Run(args)
|
||||
|
@ -65,4 +66,7 @@ func TestOperatorAutopilotSetConfigCommand(t *testing.T) {
|
|||
if reply.ServerStabilizationTime != 123*time.Millisecond {
|
||||
t.Fatalf("bad: %#v", reply)
|
||||
}
|
||||
if reply.MinQuorum != 3 {
|
||||
t.Fatalf("bad: %#v", reply)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -108,6 +108,9 @@ The table below shows this endpoint's support for
|
|||
- `MaxTrailingLogs` `(int: 250)` specifies the maximum number of log entries
|
||||
that a server can trail the leader by before being considered unhealthy.
|
||||
|
||||
- `MinQuorum` `int: 0` - specifies the minimum number of servers needed before
|
||||
Autopilot can prune dead servers.
|
||||
|
||||
- `ServerStabilizationTime` `(string: "10s")` - Specifies the minimum amount of
|
||||
time a server must be stable in the 'healthy' state before being added to the
|
||||
cluster. Only takes effect if all servers are running Raft protocol version 3
|
||||
|
@ -134,6 +137,7 @@ The table below shows this endpoint's support for
|
|||
"CleanupDeadServers": true,
|
||||
"LastContactThreshold": "200ms",
|
||||
"MaxTrailingLogs": 250,
|
||||
"MinQuorum": 3,
|
||||
"ServerStabilizationTime": "10s",
|
||||
"RedundancyZoneTag": "",
|
||||
"DisableUpgradeMigration": false,
|
||||
|
|
|
@ -782,6 +782,9 @@ default will automatically work with some tooling.
|
|||
the maximum number of log entries that a server can trail the leader by before being considered unhealthy. Defaults
|
||||
to 250.
|
||||
|
||||
* <a name="min_quorum"></a><a href="#min_quorum">`min_quorum`</a> - Sets the minimum number of servers necessary in a cluster
|
||||
before autopilot can prune dead servers. There is no default.
|
||||
|
||||
* <a name="server_stabilization_time"></a><a href="#server_stabilization_time">`server_stabilization_time`</a> -
|
||||
Controls the minimum amount of time a server must be stable in the 'healthy' state before being added to the
|
||||
cluster. Only takes effect if all servers are running Raft protocol version 3 or higher. Must be a duration value
|
||||
|
|
Loading…
Reference in New Issue