From 94f58199b19fb70f2e8dc7b03efbca820122fcbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Frank=20Schr=C3=B6der?= Date: Wed, 11 Oct 2017 02:04:52 +0200 Subject: [PATCH] agent: add option to discard health output (#3562) * agent: add option to discard health output In high volatile environments consul will have checks with "noisy" output which changes every time even though the status does not change. Since the output is stored in the raft log every health check update unblocks a blocking call on health checks since the raft index has changed even though the status of the health checks may not have changed at all. By discarding the output of the health checks the users can choose a different tradeoff. Less visibility on why a check failed in exchange for a reduced change rate on the raft log. * agent: discard output also when adding a check * agent: add test for discard check output * agent: update docs * go vet * Adds discard_check_output to reloadable config table. * Updates the change log. --- CHANGELOG.md | 2 +- agent/agent.go | 2 + agent/config/builder.go | 1 + agent/config/config.go | 1 + agent/config/runtime.go | 1 + agent/config/runtime_test.go | 4 ++ agent/local.go | 24 +++++++++-- agent/local_test.go | 50 +++++++++++++++++++++++ website/source/docs/agent/options.html.md | 8 +++- 9 files changed, 88 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index df97d856e1..101c70c0b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -134,7 +134,6 @@ FEATURES: * **Support for Running Subproccesses Directly Without a Shell:** Consul agent checks and watches now support an `args` configuration which is a list of arguments to run for the subprocess, which runs the subprocess directly without a shell. The old `script` and `handler` configurations are now deprecated (specify a shell explicitly if you require one). A `-shell=false` option is also available on `consul lock`, `consul watch`, and `consul exec` to run the subprocesses associated with those without a shell. [[GH-3509](https://github.com/hashicorp/consul/issues/3509)] * **Sentinel Integration:** (Consul Enterprise) Consul's ACL system integrates with [Sentinel](https://www.consul.io/docs/guides/sentinel.html) to enable code policies that apply to KV writes. - IMPROVEMENTS: * agent: Added support to detect public IPv4 and IPv6 addresses on AWS. [[GH-3471](https://github.com/hashicorp/consul/issues/3471)] @@ -142,6 +141,7 @@ IMPROVEMENTS: * agent: Improved ACL system for the KV store to support list permissions. This behavior can be opted in. For more information, see the [ACL Guide](https://www.consul.io/docs/guides/acl.html#list-policy-for-keys). [[GH-3511](https://github.com/hashicorp/consul/issues/3511)] * agent: Updates miekg/dns library to later version to pick up bug fixes and improvements. [[GH-3547](https://github.com/hashicorp/consul/issues/3547)] * agent: Added automatic retries to the RPC path, and a brief RPC drain time when servers leave. These changes make Consul more robust during graceful leaves of Consul servers, such as during upgrades, and help shield applications from "no leader" errors. These are configured with new [`performance`](https://www.consul.io/docs/agent/options.html#performance) options. [[GH-5414](https://github.com/hashicorp/consul/issues/5414)] +* agent: Added a new `discard_check_output` agent-level configuration option that can be used to trade off write load to the Consul servers vs. visibility of health check output. This is reloadable so it can be toggled without fully restarting the agent. [[GH-3562](https://github.com/hashicorp/consul/issues/3562)] * build: Updated Go toolchain to version 1.9.1. [[GH-3537](https://github.com/hashicorp/consul/issues/3537)] * cli: `consul lock` and `consul watch` commands will forward `TERM` and `KILL` signals to their child subprocess. [[GH-3509](https://github.com/hashicorp/consul/issues/3509)] * cli: Added support for [autocompletion](https://www.consul.io/docs/commands/index.html#autocompletion). [[GH-3412](https://github.com/hashicorp/consul/issues/3412)] diff --git a/agent/agent.go b/agent/agent.go index b9479c0e46..6eec5d27e7 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -2386,5 +2386,7 @@ func (a *Agent) ReloadConfig(newCfg *config.RuntimeConfig) error { // Update filtered metrics metrics.UpdateFilter(newCfg.TelemetryAllowedPrefixes, newCfg.TelemetryBlockedPrefixes) + a.state.SetDiscardCheckOutput(newCfg.DiscardCheckOutput) + return nil } diff --git a/agent/config/builder.go b/agent/config/builder.go index 414fe778f4..890074282c 100644 --- a/agent/config/builder.go +++ b/agent/config/builder.go @@ -579,6 +579,7 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) { DisableKeyringFile: b.boolVal(c.DisableKeyringFile), DisableRemoteExec: b.boolVal(c.DisableRemoteExec), DisableUpdateCheck: b.boolVal(c.DisableUpdateCheck), + DiscardCheckOutput: b.boolVal(c.DiscardCheckOutput), EnableDebug: b.boolVal(c.EnableDebug), EnableScriptChecks: b.boolVal(c.EnableScriptChecks), EnableSyslog: b.boolVal(c.EnableSyslog), diff --git a/agent/config/config.go b/agent/config/config.go index ab98118ecc..03c0d6994c 100644 --- a/agent/config/config.go +++ b/agent/config/config.go @@ -173,6 +173,7 @@ type Config struct { DisableKeyringFile *bool `json:"disable_keyring_file,omitempty" hcl:"disable_keyring_file" mapstructure:"disable_keyring_file"` DisableRemoteExec *bool `json:"disable_remote_exec,omitempty" hcl:"disable_remote_exec" mapstructure:"disable_remote_exec"` DisableUpdateCheck *bool `json:"disable_update_check,omitempty" hcl:"disable_update_check" mapstructure:"disable_update_check"` + DiscardCheckOutput *bool `json:"discard_check_output" hcl:"discard_check_output" mapstructure:"discard_check_output"` EnableACLReplication *bool `json:"enable_acl_replication,omitempty" hcl:"enable_acl_replication" mapstructure:"enable_acl_replication"` EnableDebug *bool `json:"enable_debug,omitempty" hcl:"enable_debug" mapstructure:"enable_debug"` EnableScriptChecks *bool `json:"enable_script_checks,omitempty" hcl:"enable_script_checks" mapstructure:"enable_script_checks"` diff --git a/agent/config/runtime.go b/agent/config/runtime.go index 5fc2ab3618..e3ef7ad2b7 100644 --- a/agent/config/runtime.go +++ b/agent/config/runtime.go @@ -133,6 +133,7 @@ type RuntimeConfig struct { DisableKeyringFile bool DisableRemoteExec bool DisableUpdateCheck bool + DiscardCheckOutput bool EnableACLReplication bool EnableDebug bool EnableScriptChecks bool diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go index 77f50ddf77..3076d97fc6 100644 --- a/agent/config/runtime_test.go +++ b/agent/config/runtime_test.go @@ -2116,6 +2116,7 @@ func TestFullConfig(t *testing.T) { "disable_keyring_file": true, "disable_remote_exec": true, "disable_update_check": true, + "discard_check_output": true, "domain": "7W1xXSqd", "dns_config": { "allow_stale": true, @@ -2549,6 +2550,7 @@ func TestFullConfig(t *testing.T) { disable_keyring_file = true disable_remote_exec = true disable_update_check = true + discard_check_output = true domain = "7W1xXSqd" dns_config { allow_stale = true @@ -3132,6 +3134,7 @@ func TestFullConfig(t *testing.T) { DisableKeyringFile: true, DisableRemoteExec: true, DisableUpdateCheck: true, + DiscardCheckOutput: true, EnableACLReplication: true, EnableDebug: true, EnableScriptChecks: true, @@ -3808,6 +3811,7 @@ func TestSanitize(t *testing.T) { "DisableKeyringFile": false, "DisableRemoteExec": false, "DisableUpdateCheck": false, + "DiscardCheckOutput": false, "EnableACLReplication": false, "EnableDebug": false, "EnableScriptChecks": false, diff --git a/agent/local.go b/agent/local.go index 2fe90a1a5f..2a29a9c6c0 100644 --- a/agent/local.go +++ b/agent/local.go @@ -88,6 +88,10 @@ type localState struct { // triggerCh is used to inform of a change to local state // that requires anti-entropy with the server triggerCh chan struct{} + + // discardCheckOutput stores whether the output of health checks + // is stored in the raft log. + discardCheckOutput atomic.Value // bool } // NewLocalState creates a is used to initialize the local state @@ -106,7 +110,7 @@ func NewLocalState(c *config.RuntimeConfig, lg *log.Logger, tokens *token.Store) lc.TaggedAddresses[k] = v } - return &localState{ + l := &localState{ config: lc, logger: lg, services: make(map[string]*structs.NodeService), @@ -121,6 +125,8 @@ func NewLocalState(c *config.RuntimeConfig, lg *log.Logger, tokens *token.Store) consulCh: make(chan struct{}, 1), triggerCh: make(chan struct{}, 1), } + l.discardCheckOutput.Store(c.DiscardCheckOutput) + return l } // changeMade is used to trigger an anti-entropy run @@ -161,6 +167,10 @@ func (l *localState) isPaused() bool { return atomic.LoadInt32(&l.paused) > 0 } +func (l *localState) SetDiscardCheckOutput(b bool) { + l.discardCheckOutput.Store(b) +} + // ServiceToken returns the configured ACL token for the given // service ID. If none is present, the agent's token is returned. func (l *localState) ServiceToken(id string) string { @@ -249,11 +259,15 @@ func (l *localState) checkToken(checkID types.CheckID) string { // This entry is persistent and the agent will make a best effort to // ensure it is registered func (l *localState) AddCheck(check *structs.HealthCheck, token string) error { + l.Lock() + defer l.Unlock() + // Set the node name check.Node = l.config.NodeName - l.Lock() - defer l.Unlock() + if l.discardCheckOutput.Load().(bool) { + check.Output = "" + } // if there is a serviceID associated with the check, make sure it exists before adding it // NOTE - This logic may be moved to be handled within the Agent's Addcheck method after a refactor @@ -293,6 +307,10 @@ func (l *localState) UpdateCheck(checkID types.CheckID, status, output string) { return } + if l.discardCheckOutput.Load().(bool) { + output = "" + } + // Update the critical time tracking (this doesn't cause a server updates // so we can always keep this up to date). if status == api.HealthCritical { diff --git a/agent/local_test.go b/agent/local_test.go index 2b2d78d3d2..2d0d2fb022 100644 --- a/agent/local_test.go +++ b/agent/local_test.go @@ -1151,6 +1151,56 @@ func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) { } } +func TestAgent_UpdateCheck_DiscardOutput(t *testing.T) { + t.Parallel() + a := NewTestAgent(t.Name(), ` + discard_check_output = true + check_update_interval = "0s" # set to "0s" since otherwise output checks are deferred + `) + defer a.Shutdown() + + inSync := func(id string) bool { + a.state.Lock() + defer a.state.Unlock() + return a.state.checkStatus[types.CheckID(id)].inSync + } + + // register a check + check := &structs.HealthCheck{ + Node: a.Config.NodeName, + CheckID: "web", + Name: "web", + Status: api.HealthPassing, + Output: "first output", + } + if err := a.state.AddCheck(check, ""); err != nil { + t.Fatalf("bad: %s", err) + } + + // wait until the check is in sync + retry.Run(t, func(r *retry.R) { + if inSync("web") { + return + } + r.FailNow() + }) + + // update the check with the same status but different output + // and the check should still be in sync. + a.state.UpdateCheck(check.CheckID, api.HealthPassing, "second output") + if !inSync("web") { + t.Fatal("check should be in sync") + } + + // disable discarding of check output and update the check again with different + // output. Then the check should be out of sync. + a.state.SetDiscardCheckOutput(false) + a.state.UpdateCheck(check.CheckID, api.HealthPassing, "third output") + if inSync("web") { + t.Fatal("check should be out of sync") + } +} + func TestAgentAntiEntropy_Check_DeferSync(t *testing.T) { t.Parallel() a := &TestAgent{Name: t.Name(), HCL: ` diff --git a/website/source/docs/agent/options.html.md b/website/source/docs/agent/options.html.md index 7eeaae208c..346d1404a5 100644 --- a/website/source/docs/agent/options.html.md +++ b/website/source/docs/agent/options.html.md @@ -771,6 +771,11 @@ Consul will not enable TLS for the HTTP API unless the `https` port has been ass * `disable_update_check` Disables automatic checking for security bulletins and new version releases. +* `discard_check_output` + Discards the output of health checks before storing them. This reduces the number of writes + to the Consul raft log in environments where health checks have volatile output like + timestamps, process ids, ... + * `dns_config` This object allows a number of sub-keys to be set which can tune how DNS queries are serviced. See this guide on [DNS caching](/docs/guides/dns-cache.html) for more detail. @@ -1331,4 +1336,5 @@ items which are reloaded include: * Watches * HTTP Client Address * Node Metadata -* Metric Prefix Filter +* Metric Prefix Filter +* Discard Check Output