From 7597d3d79880ce5cca079c82f8c612b80be114ba Mon Sep 17 00:00:00 2001 From: Ryan Uber Date: Fri, 5 Jun 2015 16:17:07 -0700 Subject: [PATCH 1/7] agent: first stab at persisting check state --- command/agent/agent.go | 80 +++++++++++++++++++++++++++++++++++++++++- command/agent/check.go | 11 ++++++ 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index ea3c515079..19a0a6303b 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -12,6 +12,7 @@ import ( "regexp" "strconv" "sync" + "time" "github.com/hashicorp/consul/consul" "github.com/hashicorp/consul/consul/structs" @@ -23,7 +24,8 @@ const ( servicesDir = "services" // Path to save local agent checks - checksDir = "checks" + checksDir = "checks" + checkStateDir = "checks/state" // The ID of the faux health checks for maintenance mode serviceMaintCheckPrefix = "_service_maintenance" @@ -757,6 +759,13 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist TTL: chkType.TTL, Logger: a.logger, } + + // Restore persisted state, if any + if err := a.recallCheckState(check); err != nil { + a.logger.Printf("[WARN] agent: failed restoring state for check %q: %s", + check.CheckID, err) + } + ttl.Start() a.checkTTLs[check.CheckID] = ttl @@ -861,6 +870,75 @@ func (a *Agent) UpdateCheck(checkID, status, output string) error { // Set the status through CheckTTL to reset the TTL check.SetStatus(status, output) + + // Always persist the state for TTL checks + if err := a.persistCheckState(check, status, output); err != nil { + return fmt.Errorf("failed persisting state for check %q: %s", checkID, err) + } + + return nil +} + +// persistCheckState is used to record the check status into the data dir. +// This allows the state to be restored on a later agent start. Currently +// only useful for TTL based checks. +func (a *Agent) persistCheckState(check *CheckTTL, status, output string) error { + // Create the persisted state + state := persistedCheckState{ + CheckID: check.CheckID, + Status: status, + Output: output, + Expires: time.Now().Add(check.TTL).Unix(), + } + + // Encode the state + buf, err := json.Marshal(state) + if err != nil { + return err + } + + // Create the state dir if it doesn't exist + dir := filepath.Join(a.config.DataDir, checkStateDir) + if err := os.MkdirAll(dir, 0700); err != nil { + return fmt.Errorf("failed creating check state dir %q: %s", dir, err) + } + + // Write the state to the file + file := filepath.Join(dir, stringHash(check.CheckID)) + if err := ioutil.WriteFile(file, buf, 0600); err != nil { + return fmt.Errorf("failed writing file %q: %s", file, err) + } + + return nil +} + +// recallCheckState is used to restore the persisted state of a check. +func (a *Agent) recallCheckState(check *structs.HealthCheck) error { + // Try to read the persisted state for this check + file := filepath.Join(a.config.DataDir, checkStateDir, stringHash(check.CheckID)) + buf, err := ioutil.ReadFile(file) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return fmt.Errorf("failed reading file %q: %s", file, err) + } + + // Decode the state data + var p persistedCheckState + if err := json.Unmarshal(buf, &p); err != nil { + return fmt.Errorf("failed decoding check state: %s", err) + } + + // Check if the state has expired + if time.Now().Unix() > p.Expires { + a.logger.Printf("[DEBUG] agent: check state expired for %q, not restoring", check.CheckID) + return nil + } + + // Restore the fields from the state + check.Output = p.Output + check.Status = p.Status return nil } diff --git a/command/agent/check.go b/command/agent/check.go index 66578db8e7..6677483886 100644 --- a/command/agent/check.go +++ b/command/agent/check.go @@ -266,6 +266,17 @@ type persistedCheck struct { Token string } +// persistedCheckState is used to persist the current state of a given +// check. This is different from the check definition, and includes an +// expiration timestamp which is used to determine staleness on later +// agent restarts. +type persistedCheckState struct { + CheckID string + Output string + Status string + Expires int64 +} + // CheckHTTP is used to periodically make an HTTP request to // determine the health of a given check. // The check is passing if the response code is 2XX. From 7e6e8613941672d4e67af676e96f2d326eff32d2 Mon Sep 17 00:00:00 2001 From: Ryan Uber Date: Fri, 5 Jun 2015 16:45:05 -0700 Subject: [PATCH 2/7] agent: testing state persistence, recovery, and expiration --- command/agent/agent.go | 2 +- command/agent/agent_test.go | 104 ++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 1 deletion(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index 19a0a6303b..f6556f0db6 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -931,7 +931,7 @@ func (a *Agent) recallCheckState(check *structs.HealthCheck) error { } // Check if the state has expired - if time.Now().Unix() > p.Expires { + if time.Now().Unix() >= p.Expires { a.logger.Printf("[DEBUG] agent: check state expired for %q, not restoring", check.CheckID) return nil } diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index 7a2b43f764..db35adde4b 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -1349,3 +1349,107 @@ func TestAgent_loadChecks_checkFails(t *testing.T) { t.Fatalf("should have purged check") } } + +func TestAgent_persistCheckStatus(t *testing.T) { + config := nextConfig() + dir, agent := makeAgent(t, config) + defer os.RemoveAll(dir) + defer agent.Shutdown() + + // Create the TTL check to persist + check := &CheckTTL{ + CheckID: "check1", + TTL: 10 * time.Minute, + } + + // Persist some check state for the check + err := agent.persistCheckState(check, structs.HealthCritical, "nope") + if err != nil { + t.Fatalf("err: %s", err) + } + + // Check the persisted file exists and has the content + file := filepath.Join(agent.config.DataDir, checkStateDir, stringHash("check1")) + buf, err := ioutil.ReadFile(file) + if err != nil { + t.Fatalf("err: %s", err) + } + + // Decode the state + var p persistedCheckState + if err := json.Unmarshal(buf, &p); err != nil { + t.Fatalf("err: %s", err) + } + + // Check the fields + if p.CheckID != "check1" { + t.Fatalf("bad: %#v", p) + } + if p.Output != "nope" { + t.Fatalf("bad: %#v", p) + } + if p.Status != structs.HealthCritical { + t.Fatalf("bad: %#v", p) + } + + // Check the expiration time was set + if p.Expires < time.Now().Unix() { + t.Fatalf("bad: %#v", p) + } +} + +func TestAgent_recallCheckState(t *testing.T) { + config := nextConfig() + dir, agent := makeAgent(t, config) + defer os.RemoveAll(dir) + defer agent.Shutdown() + + // Create a check whose state will expire immediately + check := &CheckTTL{ + CheckID: "check1", + TTL: 0, + } + + // Persist the check state + err := agent.persistCheckState(check, structs.HealthPassing, "yup") + if err != nil { + t.Fatalf("err: %s", err) + } + + // Try to recall the state + health := &structs.HealthCheck{ + CheckID: "check1", + Status: structs.HealthCritical, + } + if err := agent.recallCheckState(health); err != nil { + t.Fatalf("err: %s", err) + } + + // Should not have restored the status due to expiration + if health.Status != structs.HealthCritical { + t.Fatalf("bad: %#v", health) + } + if health.Output != "" { + t.Fatalf("bad: %#v", health) + } + + // Set a TTL which will not expire before we check it + check.TTL = time.Minute + err = agent.persistCheckState(check, structs.HealthPassing, "yup") + if err != nil { + t.Fatalf("err: %s", err) + } + + // Try to recall + if err := agent.recallCheckState(health); err != nil { + t.Fatalf("err: %s", err) + } + + // Should have restored + if health.Status != structs.HealthPassing { + t.Fatalf("bad: %#v", health) + } + if health.Output != "yup" { + t.Fatalf("bad: %#v", health) + } +} From 2ee8fa8e156ccef99ffcfb0f25552f8c584bff5c Mon Sep 17 00:00:00 2001 From: Ryan Uber Date: Fri, 5 Jun 2015 16:57:14 -0700 Subject: [PATCH 3/7] agent: purge check state when checks are deregistered --- command/agent/agent.go | 17 ++++++++++++++++- command/agent/agent_test.go | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index f6556f0db6..7e4ce4c2b3 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -851,7 +851,12 @@ func (a *Agent) RemoveCheck(checkID string, persist bool) error { delete(a.checkTTLs, checkID) } if persist { - return a.purgeCheck(checkID) + if err := a.purgeCheck(checkID); err != nil { + return err + } + if err := a.purgeCheckState(checkID); err != nil { + return err + } } log.Printf("[DEBUG] agent: removed check %q", checkID) return nil @@ -942,6 +947,16 @@ func (a *Agent) recallCheckState(check *structs.HealthCheck) error { return nil } +// purgeCheckState is used to purge the state of a check from the data dir +func (a *Agent) purgeCheckState(checkID string) error { + file := filepath.Join(a.config.DataDir, checkStateDir, stringHash(checkID)) + err := os.Remove(file) + if os.IsNotExist(err) { + return nil + } + return err +} + // Stats is used to get various debugging state from the sub-systems func (a *Agent) Stats() map[string]map[string]string { toString := func(v uint64) string { diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index db35adde4b..c5439383a2 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -1453,3 +1453,36 @@ func TestAgent_recallCheckState(t *testing.T) { t.Fatalf("bad: %#v", health) } } + +func TestAgent_purgeCheckState(t *testing.T) { + config := nextConfig() + dir, agent := makeAgent(t, config) + defer os.RemoveAll(dir) + defer agent.Shutdown() + + // No error if the state does not exist + if err := agent.purgeCheckState("check1"); err != nil { + t.Fatalf("err: %s", err) + } + + // Persist some state to the data dir + check := &CheckTTL{ + CheckID: "check1", + TTL: time.Minute, + } + err := agent.persistCheckState(check, structs.HealthPassing, "yup") + if err != nil { + t.Fatalf("err: %s", err) + } + + // Purge the check state + if err := agent.purgeCheckState("check1"); err != nil { + t.Fatalf("err: %s", err) + } + + // Removed the file + file := filepath.Join(agent.config.DataDir, checkStateDir, stringHash("check1")) + if _, err := os.Stat(file); !os.IsNotExist(err) { + t.Fatalf("should have removed file") + } +} From 1636a352891fac37e5083ed29ff9a9f2b66c75b8 Mon Sep 17 00:00:00 2001 From: Ryan Uber Date: Fri, 5 Jun 2015 16:59:41 -0700 Subject: [PATCH 4/7] agent: check state is purged if expired --- command/agent/agent.go | 2 +- command/agent/agent_test.go | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index 7e4ce4c2b3..16d1b741a5 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -938,7 +938,7 @@ func (a *Agent) recallCheckState(check *structs.HealthCheck) error { // Check if the state has expired if time.Now().Unix() >= p.Expires { a.logger.Printf("[DEBUG] agent: check state expired for %q, not restoring", check.CheckID) - return nil + return a.purgeCheckState(check.CheckID) } // Restore the fields from the state diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index c5439383a2..97cfa98684 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -1350,7 +1350,7 @@ func TestAgent_loadChecks_checkFails(t *testing.T) { } } -func TestAgent_persistCheckStatus(t *testing.T) { +func TestAgent_persistCheckState(t *testing.T) { config := nextConfig() dir, agent := makeAgent(t, config) defer os.RemoveAll(dir) @@ -1433,6 +1433,12 @@ func TestAgent_recallCheckState(t *testing.T) { t.Fatalf("bad: %#v", health) } + // Should have purged the state + file := filepath.Join(agent.config.DataDir, checksDir, stringHash("check1")) + if _, err := os.Stat(file); !os.IsNotExist(err) { + t.Fatalf("should have purged state") + } + // Set a TTL which will not expire before we check it check.TTL = time.Minute err = agent.persistCheckState(check, structs.HealthPassing, "yup") From e872587e195a94f41b32bff356548750f84de8c3 Mon Sep 17 00:00:00 2001 From: Ryan Uber Date: Fri, 5 Jun 2015 17:15:57 -0700 Subject: [PATCH 5/7] website: document TTL check persistence --- website/source/docs/agent/checks.html.markdown | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/website/source/docs/agent/checks.html.markdown b/website/source/docs/agent/checks.html.markdown index 324661b249..d9b0c0f17b 100644 --- a/website/source/docs/agent/checks.html.markdown +++ b/website/source/docs/agent/checks.html.markdown @@ -37,7 +37,10 @@ There are three different kinds of checks: set to the failed state. This mechanism, conceptually similar to a dead man's switch, relies on the application to directly report its health. For example, a healthy app can periodically `PUT` a status update to the HTTP endpoint; if the app fails, the TTL will - expire and the health check enters a critical state. + expire and the health check enters a critical state. TTL checks also persist + their last known status to disk. This allows the Consul agent to restore the + last known status of the check across restarts. Persisted check status is + valid through the end of the TTL from the time of the last check. ## Check Definition From 2d1b873e4b1a0aed05e4e65408223055299e167b Mon Sep 17 00:00:00 2001 From: Ryan Uber Date: Fri, 5 Jun 2015 17:33:34 -0700 Subject: [PATCH 6/7] agent: test check state restoration from AddCheck --- command/agent/agent_test.go | 43 +++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index 97cfa98684..bb21464c72 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -459,6 +459,49 @@ func TestAgent_AddCheck_MissingService(t *testing.T) { } } +func TestAgent_AddCheck_RestoreState(t *testing.T) { + dir, agent := makeAgent(t, nextConfig()) + defer os.RemoveAll(dir) + defer agent.Shutdown() + + // Create some state and persist it + ttl := &CheckTTL{ + CheckID: "baz", + TTL: time.Minute, + } + err := agent.persistCheckState(ttl, structs.HealthPassing, "yup") + if err != nil { + t.Fatalf("err: %s", err) + } + + // Build and register the check definition and initial state + health := &structs.HealthCheck{ + Node: "foo", + CheckID: "baz", + Name: "baz check 1", + } + chk := &CheckType{ + TTL: time.Minute, + } + err = agent.AddCheck(health, chk, false, "") + if err != nil { + t.Fatalf("err: %s", err) + } + + // Ensure the check status was restored during registration + checks := agent.state.Checks() + check, ok := checks["baz"] + if !ok { + t.Fatalf("missing check") + } + if check.Status != structs.HealthPassing { + t.Fatalf("bad: %#v", check) + } + if check.Output != "yup" { + t.Fatalf("bad: %#v", check) + } +} + func TestAgent_RemoveCheck(t *testing.T) { dir, agent := makeAgent(t, nextConfig()) defer os.RemoveAll(dir) From 69921808ee3158a0f14fefc6260cf5fe3077c36b Mon Sep 17 00:00:00 2001 From: Ryan Uber Date: Mon, 8 Jun 2015 09:35:10 -0700 Subject: [PATCH 7/7] agent: use persist/load/purge convention for function names --- command/agent/agent.go | 6 +++--- command/agent/agent_test.go | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index 16d1b741a5..d1d9e92915 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -761,7 +761,7 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist } // Restore persisted state, if any - if err := a.recallCheckState(check); err != nil { + if err := a.loadCheckState(check); err != nil { a.logger.Printf("[WARN] agent: failed restoring state for check %q: %s", check.CheckID, err) } @@ -917,8 +917,8 @@ func (a *Agent) persistCheckState(check *CheckTTL, status, output string) error return nil } -// recallCheckState is used to restore the persisted state of a check. -func (a *Agent) recallCheckState(check *structs.HealthCheck) error { +// loadCheckState is used to restore the persisted state of a check. +func (a *Agent) loadCheckState(check *structs.HealthCheck) error { // Try to read the persisted state for this check file := filepath.Join(a.config.DataDir, checkStateDir, stringHash(check.CheckID)) buf, err := ioutil.ReadFile(file) diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index bb21464c72..b36a8a2741 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -1441,7 +1441,7 @@ func TestAgent_persistCheckState(t *testing.T) { } } -func TestAgent_recallCheckState(t *testing.T) { +func TestAgent_loadCheckState(t *testing.T) { config := nextConfig() dir, agent := makeAgent(t, config) defer os.RemoveAll(dir) @@ -1459,12 +1459,12 @@ func TestAgent_recallCheckState(t *testing.T) { t.Fatalf("err: %s", err) } - // Try to recall the state + // Try to load the state health := &structs.HealthCheck{ CheckID: "check1", Status: structs.HealthCritical, } - if err := agent.recallCheckState(health); err != nil { + if err := agent.loadCheckState(health); err != nil { t.Fatalf("err: %s", err) } @@ -1489,8 +1489,8 @@ func TestAgent_recallCheckState(t *testing.T) { t.Fatalf("err: %s", err) } - // Try to recall - if err := agent.recallCheckState(health); err != nil { + // Try to load + if err := agent.loadCheckState(health); err != nil { t.Fatalf("err: %s", err) }