From cf00c11221b5f4938a8fcf40825812a8582234de Mon Sep 17 00:00:00 2001 From: James Phillips Date: Sun, 10 Apr 2016 23:31:16 -0700 Subject: [PATCH] Sets an anti-footgun floor for the configurable reap time. --- command/agent/agent_test.go | 8 +++--- command/agent/config.go | 8 +++++- command/agent/config_test.go | 28 +++++++++++++------ .../source/docs/agent/options.html.markdown | 8 ++++-- 4 files changed, 35 insertions(+), 17 deletions(-) diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index 32e5fc4a53..baf5c5dc53 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -194,20 +194,20 @@ func TestAgent_ReconnectConfigSettings(t *testing.T) { } }() - c.ReconnectTimeoutLan = 2 * time.Hour - c.ReconnectTimeoutWan = 3 * time.Hour + c.ReconnectTimeoutLan = 24 * time.Hour + c.ReconnectTimeoutWan = 36 * time.Hour func() { dir, agent := makeAgent(t, c) defer os.RemoveAll(dir) defer agent.Shutdown() lan := agent.consulConfig().SerfLANConfig.ReconnectTimeout - if lan != 2*time.Hour { + if lan != 24*time.Hour { t.Fatalf("bad: %s", lan.String()) } wan := agent.consulConfig().SerfWANConfig.ReconnectTimeout - if wan != 3*time.Hour { + if wan != 36*time.Hour { t.Fatalf("bad: %s", wan.String()) } }() diff --git a/command/agent/config.go b/command/agent/config.go index ce8287ce95..21a3fa454b 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -786,19 +786,25 @@ func DecodeConfig(r io.Reader) (*Config, error) { result.RetryIntervalWan = dur } + const reconnectTimeoutMin = 8 * time.Hour if raw := result.ReconnectTimeoutLanRaw; raw != "" { dur, err := time.ParseDuration(raw) if err != nil { return nil, fmt.Errorf("ReconnectTimeoutLan invalid: %v", err) } + if dur < reconnectTimeoutMin { + return nil, fmt.Errorf("ReconnectTimeoutLan must be >= %s", reconnectTimeoutMin.String()) + } result.ReconnectTimeoutLan = dur } - if raw := result.ReconnectTimeoutWanRaw; raw != "" { dur, err := time.ParseDuration(raw) if err != nil { return nil, fmt.Errorf("ReconnectTimeoutWan invalid: %v", err) } + if dur < reconnectTimeoutMin { + return nil, fmt.Errorf("ReconnectTimeoutWan must be >= %s", reconnectTimeoutMin.String()) + } result.ReconnectTimeoutWan = dur } diff --git a/command/agent/config_test.go b/command/agent/config_test.go index 09dad36b8b..3fc0a6880d 100644 --- a/command/agent/config_test.go +++ b/command/agent/config_test.go @@ -463,17 +463,27 @@ func TestDecodeConfig(t *testing.T) { } // Reconnect timeout LAN and WAN - input = `{"reconnect_timeout": "1m", "reconnect_timeout_wan": "2m"}` + input = `{"reconnect_timeout": "8h", "reconnect_timeout_wan": "10h"}` config, err = DecodeConfig(bytes.NewReader([]byte(input))) if err != nil { t.Fatalf("err: %s", err) } - if config.ReconnectTimeoutLanRaw != "1m" || - config.ReconnectTimeoutLan.String() != "1m0s" || - config.ReconnectTimeoutWanRaw != "2m" || - config.ReconnectTimeoutWan.String() != "2m0s" { + if config.ReconnectTimeoutLanRaw != "8h" || + config.ReconnectTimeoutLan.String() != "8h0m0s" || + config.ReconnectTimeoutWanRaw != "10h" || + config.ReconnectTimeoutWan.String() != "10h0m0s" { t.Fatalf("bad: %#v", config) } + input = `{"reconnect_timeout": "7h"}` + config, err = DecodeConfig(bytes.NewReader([]byte(input))) + if err == nil { + t.Fatalf("decode should have failed") + } + input = `{"reconnect_timeout_wan": "7h"}` + config, err = DecodeConfig(bytes.NewReader([]byte(input))) + if err == nil { + t.Fatalf("decode should have failed") + } // Static UI server input = `{"ui": true}` @@ -1364,10 +1374,10 @@ func TestMergeConfig(t *testing.T) { RetryJoinWan: []string{"1.1.1.1"}, RetryIntervalWanRaw: "10s", RetryIntervalWan: 10 * time.Second, - ReconnectTimeoutLanRaw: "1s", - ReconnectTimeoutLan: 1 * time.Second, - ReconnectTimeoutWanRaw: "2s", - ReconnectTimeoutWan: 2 * time.Second, + ReconnectTimeoutLanRaw: "24h", + ReconnectTimeoutLan: 24 * time.Hour, + ReconnectTimeoutWanRaw: "36h", + ReconnectTimeoutWan: 36 * time.Hour, CheckUpdateInterval: 8 * time.Minute, CheckUpdateIntervalRaw: "8m", ACLToken: "1234", diff --git a/website/source/docs/agent/options.html.markdown b/website/source/docs/agent/options.html.markdown index fc21a7e2e8..06c1e404d0 100644 --- a/website/source/docs/agent/options.html.markdown +++ b/website/source/docs/agent/options.html.markdown @@ -583,13 +583,15 @@ Consul will not enable TLS for the HTTP API unless the `https` port has been ass * `reconnect_timeout` This controls how long it takes for a failed node to be completely removed from the cluster. This defaults to 72 hours and it is recommended that this is set to at least double the maximum expected recoverable - outage time for a node or network partition. The value is a time with a unit suffix, which can be - "s", "m", "h" for seconds, minutes, or hours. + outage time for a node or network partition. WARNING: Setting this time too low could cause Consul + servers to be removed from quorum during an extended node failure or partition, which could complicate + recovery of the cluster. The value is a time with a unit suffix, which can be "s", "m", "h" for seconds, + minutes, or hours. The value must be >= 8 hours. * `reconnect_timeout_wan` This is the WAN equivalent of the `reconnect_timeout` parameter, which controls how long it takes for a failed server to be completely removed from the WAN pool. This also - defaults to 72 hours. + defaults to 72 hours, and must be >= 8 hours. * `recursor` Provides a single recursor address. This has been deprecated, and the value is appended to the [`recursors`](#recursors) list for