agent: first stab at persisting check state

This commit is contained in:
Ryan Uber 2015-06-05 16:17:07 -07:00
parent ebe57a1f65
commit 7597d3d798
2 changed files with 90 additions and 1 deletions

View File

@ -12,6 +12,7 @@ import (
"regexp" "regexp"
"strconv" "strconv"
"sync" "sync"
"time"
"github.com/hashicorp/consul/consul" "github.com/hashicorp/consul/consul"
"github.com/hashicorp/consul/consul/structs" "github.com/hashicorp/consul/consul/structs"
@ -24,6 +25,7 @@ const (
// Path to save local agent checks // Path to save local agent checks
checksDir = "checks" checksDir = "checks"
checkStateDir = "checks/state"
// The ID of the faux health checks for maintenance mode // The ID of the faux health checks for maintenance mode
serviceMaintCheckPrefix = "_service_maintenance" serviceMaintCheckPrefix = "_service_maintenance"
@ -757,6 +759,13 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist
TTL: chkType.TTL, TTL: chkType.TTL,
Logger: a.logger, Logger: a.logger,
} }
// Restore persisted state, if any
if err := a.recallCheckState(check); err != nil {
a.logger.Printf("[WARN] agent: failed restoring state for check %q: %s",
check.CheckID, err)
}
ttl.Start() ttl.Start()
a.checkTTLs[check.CheckID] = ttl a.checkTTLs[check.CheckID] = ttl
@ -861,6 +870,75 @@ func (a *Agent) UpdateCheck(checkID, status, output string) error {
// Set the status through CheckTTL to reset the TTL // Set the status through CheckTTL to reset the TTL
check.SetStatus(status, output) check.SetStatus(status, output)
// Always persist the state for TTL checks
if err := a.persistCheckState(check, status, output); err != nil {
return fmt.Errorf("failed persisting state for check %q: %s", checkID, err)
}
return nil
}
// persistCheckState is used to record the check status into the data dir.
// This allows the state to be restored on a later agent start. Currently
// only useful for TTL based checks.
//
// The state is JSON-encoded and stored under <DataDir>/checks/state in a file
// named after the hash of the check ID. The recorded expiration is the current
// time plus the check's TTL, as a Unix timestamp in seconds.
//
// The file is written to a temporary sibling first and then renamed over the
// final path, so an interrupted write (crash, full disk) cannot leave a
// truncated state file behind for recallCheckState to choke on.
//
// Returns an error if encoding fails, the state dir cannot be created, or the
// file cannot be written or moved into place.
func (a *Agent) persistCheckState(check *CheckTTL, status, output string) error {
	// Create the persisted state
	state := persistedCheckState{
		CheckID: check.CheckID,
		Status:  status,
		Output:  output,
		Expires: time.Now().Add(check.TTL).Unix(),
	}

	// Encode the state
	buf, err := json.Marshal(state)
	if err != nil {
		return err
	}

	// Create the state dir if it doesn't exist
	dir := filepath.Join(a.config.DataDir, checkStateDir)
	if err := os.MkdirAll(dir, 0700); err != nil {
		return fmt.Errorf("failed creating check state dir %q: %s", dir, err)
	}

	// Write the state to a temp file in the same dir so that the rename
	// below stays on one filesystem.
	file := filepath.Join(dir, stringHash(check.CheckID))
	tempFile := file + ".tmp"
	if err := ioutil.WriteFile(tempFile, buf, 0600); err != nil {
		return fmt.Errorf("failed writing file %q: %s", tempFile, err)
	}

	// Move the fully written file into place, replacing any previous state.
	if err := os.Rename(tempFile, file); err != nil {
		// Best-effort cleanup; the rename error is the one worth reporting.
		os.Remove(tempFile)
		return fmt.Errorf("failed renaming file %q to %q: %s", tempFile, file, err)
	}

	return nil
}
// recallCheckState is used to restore the persisted state of a check.
func (a *Agent) recallCheckState(check *structs.HealthCheck) error {
// Try to read the persisted state for this check
file := filepath.Join(a.config.DataDir, checkStateDir, stringHash(check.CheckID))
buf, err := ioutil.ReadFile(file)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return fmt.Errorf("failed reading file %q: %s", file, err)
}
// Decode the state data
var p persistedCheckState
if err := json.Unmarshal(buf, &p); err != nil {
return fmt.Errorf("failed decoding check state: %s", err)
}
// Check if the state has expired
if time.Now().Unix() > p.Expires {
a.logger.Printf("[DEBUG] agent: check state expired for %q, not restoring", check.CheckID)
return nil
}
// Restore the fields from the state
check.Output = p.Output
check.Status = p.Status
return nil return nil
} }

View File

@ -266,6 +266,17 @@ type persistedCheck struct {
Token string Token string
} }
// persistedCheckState is used to persist the current state of a given
// check. This is different from the check definition, and includes an
// expiration timestamp which is used to determine staleness on later
// agent restarts. It is JSON-encoded by persistCheckState, so the field
// names below are what ends up in the state file.
type persistedCheckState struct {
// CheckID is the ID of the check this state was recorded for.
CheckID string
// Output is the check output passed to persistCheckState.
Output string
// Status is the check status passed to persistCheckState.
Status string
// Expires is a Unix timestamp in seconds, computed as the time of the
// write plus the check's TTL. recallCheckState does not restore state
// once the current time is past this value.
Expires int64
}
// CheckHTTP is used to periodically make an HTTP request to // CheckHTTP is used to periodically make an HTTP request to
// determine the health of a given check. // determine the health of a given check.
// The check is passing if the response code is 2XX. // The check is passing if the response code is 2XX.