diff --git a/command/agent/agent.go b/command/agent/agent.go
index ea3c515079..d1d9e92915 100644
--- a/command/agent/agent.go
+++ b/command/agent/agent.go
@@ -12,6 +12,7 @@ import (
 	"regexp"
 	"strconv"
 	"sync"
+	"time"
 
 	"github.com/hashicorp/consul/consul"
 	"github.com/hashicorp/consul/consul/structs"
@@ -23,7 +24,8 @@ const (
 	servicesDir = "services"
 
 	// Path to save local agent checks
-	checksDir = "checks"
+	checksDir     = "checks"
+	checkStateDir = "checks/state"
 
 	// The ID of the faux health checks for maintenance mode
 	serviceMaintCheckPrefix = "_service_maintenance"
@@ -757,6 +759,13 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist
 				TTL:     chkType.TTL,
 				Logger:  a.logger,
 			}
+
+			// Restore persisted state, if any
+			if err := a.loadCheckState(check); err != nil {
+				a.logger.Printf("[WARN] agent: failed restoring state for check %q: %s",
+					check.CheckID, err)
+			}
+
 			ttl.Start()
 			a.checkTTLs[check.CheckID] = ttl
 
@@ -842,7 +851,12 @@ func (a *Agent) RemoveCheck(checkID string, persist bool) error {
 		delete(a.checkTTLs, checkID)
 	}
 	if persist {
-		return a.purgeCheck(checkID)
+		if err := a.purgeCheck(checkID); err != nil {
+			return err
+		}
+		if err := a.purgeCheckState(checkID); err != nil {
+			return err
+		}
 	}
 	log.Printf("[DEBUG] agent: removed check %q", checkID)
 	return nil
@@ -861,9 +875,88 @@ func (a *Agent) UpdateCheck(checkID, status, output string) error {
 
 	// Set the status through CheckTTL to reset the TTL
 	check.SetStatus(status, output)
+
+	// Always persist the state for TTL checks
+	if err := a.persistCheckState(check, status, output); err != nil {
+		return fmt.Errorf("failed persisting state for check %q: %s", checkID, err)
+	}
+
 	return nil
 }
 
+// persistCheckState is used to record the check status into the data dir.
+// This allows the state to be restored on a later agent start. Currently
+// only useful for TTL based checks.
+func (a *Agent) persistCheckState(check *CheckTTL, status, output string) error {
+	// Create the persisted state
+	state := persistedCheckState{
+		CheckID: check.CheckID,
+		Status:  status,
+		Output:  output,
+		Expires: time.Now().Add(check.TTL).Unix(),
+	}
+
+	// Encode the state
+	buf, err := json.Marshal(state)
+	if err != nil {
+		return err
+	}
+
+	// Create the state dir if it doesn't exist
+	dir := filepath.Join(a.config.DataDir, checkStateDir)
+	if err := os.MkdirAll(dir, 0700); err != nil {
+		return fmt.Errorf("failed creating check state dir %q: %s", dir, err)
+	}
+
+	// Write the state to the file
+	file := filepath.Join(dir, stringHash(check.CheckID))
+	if err := ioutil.WriteFile(file, buf, 0600); err != nil {
+		return fmt.Errorf("failed writing file %q: %s", file, err)
+	}
+
+	return nil
+}
+
+// loadCheckState is used to restore the persisted state of a check.
+func (a *Agent) loadCheckState(check *structs.HealthCheck) error {
+	// Try to read the persisted state for this check
+	file := filepath.Join(a.config.DataDir, checkStateDir, stringHash(check.CheckID))
+	buf, err := ioutil.ReadFile(file)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil
+		}
+		return fmt.Errorf("failed reading file %q: %s", file, err)
+	}
+
+	// Decode the state data
+	var p persistedCheckState
+	if err := json.Unmarshal(buf, &p); err != nil {
+		return fmt.Errorf("failed decoding check state: %s", err)
+	}
+
+	// Check if the state has expired
+	if time.Now().Unix() >= p.Expires {
+		a.logger.Printf("[DEBUG] agent: check state expired for %q, not restoring", check.CheckID)
+		return a.purgeCheckState(check.CheckID)
+	}
+
+	// Restore the fields from the state
+	check.Output = p.Output
+	check.Status = p.Status
+	return nil
+}
+
+// purgeCheckState is used to purge the state of a check from the data dir
+func (a *Agent) purgeCheckState(checkID string) error {
+	file := filepath.Join(a.config.DataDir, checkStateDir, stringHash(checkID))
+	err := os.Remove(file)
+	if os.IsNotExist(err) {
+		return nil
+	}
+	return err
+}
+
 // Stats is used to get various debugging state from the sub-systems
 func (a *Agent) Stats() map[string]map[string]string {
 	toString := func(v uint64) string {
diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go
index 7a2b43f764..b36a8a2741 100644
--- a/command/agent/agent_test.go
+++ b/command/agent/agent_test.go
@@ -459,6 +459,49 @@ func TestAgent_AddCheck_MissingService(t *testing.T) {
 	}
 }
 
+func TestAgent_AddCheck_RestoreState(t *testing.T) {
+	dir, agent := makeAgent(t, nextConfig())
+	defer os.RemoveAll(dir)
+	defer agent.Shutdown()
+
+	// Create some state and persist it
+	ttl := &CheckTTL{
+		CheckID: "baz",
+		TTL:     time.Minute,
+	}
+	err := agent.persistCheckState(ttl, structs.HealthPassing, "yup")
+	if err != nil {
+		t.Fatalf("err: %s", err)
+	}
+
+	// Build and register the check definition and initial state
+	health := &structs.HealthCheck{
+		Node:    "foo",
+		CheckID: "baz",
+		Name:    "baz check 1",
+	}
+	chk := &CheckType{
+		TTL: time.Minute,
+	}
+	err = agent.AddCheck(health, chk, false, "")
+	if err != nil {
+		t.Fatalf("err: %s", err)
+	}
+
+	// Ensure the check status was restored during registration
+	checks := agent.state.Checks()
+	check, ok := checks["baz"]
+	if !ok {
+		t.Fatalf("missing check")
+	}
+	if check.Status != structs.HealthPassing {
+		t.Fatalf("bad: %#v", check)
+	}
+	if check.Output != "yup" {
+		t.Fatalf("bad: %#v", check)
+	}
+}
+
 func TestAgent_RemoveCheck(t *testing.T) {
 	dir, agent := makeAgent(t, nextConfig())
 	defer os.RemoveAll(dir)
@@ -1349,3 +1392,146 @@ func TestAgent_loadChecks_checkFails(t *testing.T) {
 		t.Fatalf("should have purged check")
 	}
 }
+
+func TestAgent_persistCheckState(t *testing.T) {
+	config := nextConfig()
+	dir, agent := makeAgent(t, config)
+	defer os.RemoveAll(dir)
+	defer agent.Shutdown()
+
+	// Create the TTL check to persist
+	check := &CheckTTL{
+		CheckID: "check1",
+		TTL:     10 * time.Minute,
+	}
+
+	// Persist some check state for the check
+	err := agent.persistCheckState(check, structs.HealthCritical, "nope")
+	if err != nil {
+		t.Fatalf("err: %s", err)
+	}
+
+	// Check the persisted file exists and has the content
+	file := filepath.Join(agent.config.DataDir, checkStateDir, stringHash("check1"))
+	buf, err := ioutil.ReadFile(file)
+	if err != nil {
+		t.Fatalf("err: %s", err)
+	}
+
+	// Decode the state
+	var p persistedCheckState
+	if err := json.Unmarshal(buf, &p); err != nil {
+		t.Fatalf("err: %s", err)
+	}
+
+	// Check the fields
+	if p.CheckID != "check1" {
+		t.Fatalf("bad: %#v", p)
+	}
+	if p.Output != "nope" {
+		t.Fatalf("bad: %#v", p)
+	}
+	if p.Status != structs.HealthCritical {
+		t.Fatalf("bad: %#v", p)
+	}
+
+	// Check the expiration time was set
+	if p.Expires < time.Now().Unix() {
+		t.Fatalf("bad: %#v", p)
+	}
+}
+
+func TestAgent_loadCheckState(t *testing.T) {
+	config := nextConfig()
+	dir, agent := makeAgent(t, config)
+	defer os.RemoveAll(dir)
+	defer agent.Shutdown()
+
+	// Create a check whose state will expire immediately
+	check := &CheckTTL{
+		CheckID: "check1",
+		TTL:     0,
+	}
+
+	// Persist the check state
+	err := agent.persistCheckState(check, structs.HealthPassing, "yup")
+	if err != nil {
+		t.Fatalf("err: %s", err)
+	}
+
+	// Try to load the state
+	health := &structs.HealthCheck{
+		CheckID: "check1",
+		Status:  structs.HealthCritical,
+	}
+	if err := agent.loadCheckState(health); err != nil {
+		t.Fatalf("err: %s", err)
+	}
+
+	// Should not have restored the status due to expiration
+	if health.Status != structs.HealthCritical {
+		t.Fatalf("bad: %#v", health)
+	}
+	if health.Output != "" {
+		t.Fatalf("bad: %#v", health)
+	}
+
+	// Should have purged the expired state file from the check state dir
+	file := filepath.Join(agent.config.DataDir, checkStateDir, stringHash("check1"))
+	if _, err := os.Stat(file); !os.IsNotExist(err) {
+		t.Fatalf("should have purged state")
+	}
+
+	// Set a TTL which will not expire before we check it
+	check.TTL = time.Minute
+	err = agent.persistCheckState(check, structs.HealthPassing, "yup")
+	if err != nil {
+		t.Fatalf("err: %s", err)
+	}
+
+	// Try to load
+	if err := agent.loadCheckState(health); err != nil {
+		t.Fatalf("err: %s", err)
+	}
+
+	// Should have restored
+	if health.Status != structs.HealthPassing {
+		t.Fatalf("bad: %#v", health)
+	}
+	if health.Output != "yup" {
+		t.Fatalf("bad: %#v", health)
+	}
+}
+
+func TestAgent_purgeCheckState(t *testing.T) {
+	config := nextConfig()
+	dir, agent := makeAgent(t, config)
+	defer os.RemoveAll(dir)
+	defer agent.Shutdown()
+
+	// No error if the state does not exist
+	if err := agent.purgeCheckState("check1"); err != nil {
+		t.Fatalf("err: %s", err)
+	}
+
+	// Persist some state to the data dir
+	check := &CheckTTL{
+		CheckID: "check1",
+		TTL:     time.Minute,
+	}
+	err := agent.persistCheckState(check, structs.HealthPassing, "yup")
+	if err != nil {
+		t.Fatalf("err: %s", err)
+	}
+
+	// Purge the check state
+	if err := agent.purgeCheckState("check1"); err != nil {
+		t.Fatalf("err: %s", err)
+	}
+
+	// Removed the file
+	file := filepath.Join(agent.config.DataDir, checkStateDir, stringHash("check1"))
+	if _, err := os.Stat(file); !os.IsNotExist(err) {
+		t.Fatalf("should have removed file")
+	}
+}
diff --git a/command/agent/check.go b/command/agent/check.go
index 66578db8e7..6677483886 100644
--- a/command/agent/check.go
+++ b/command/agent/check.go
@@ -266,6 +266,17 @@ type persistedCheck struct {
 	Token   string
 }
 
+// persistedCheckState is used to persist the current state of a given
+// check. This is different from the check definition, and includes an
+// expiration timestamp which is used to determine staleness on later
+// agent restarts.
+type persistedCheckState struct {
+	CheckID string
+	Output  string
+	Status  string
+	Expires int64
+}
+
 // CheckHTTP is used to periodically make an HTTP request to
 // determine the health of a given check.
 // The check is passing if the response code is 2XX.
diff --git a/website/source/docs/agent/checks.html.markdown b/website/source/docs/agent/checks.html.markdown index 324661b249..d9b0c0f17b 100644 --- a/website/source/docs/agent/checks.html.markdown +++ b/website/source/docs/agent/checks.html.markdown @@ -37,7 +37,10 @@ There are three different kinds of checks: set to the failed state. This mechanism, conceptually similar to a dead man's switch, relies on the application to directly report its health. For example, a healthy app can periodically `PUT` a status update to the HTTP endpoint; if the app fails, the TTL will - expire and the health check enters a critical state. + expire and the health check enters a critical state. TTL checks also persist + their last known status to disk. This allows the Consul agent to restore the + last known status of the check across restarts. Persisted check status is + valid through the end of the TTL from the time of the last status update. ## Check Definition