Merge pull request #1785 from hashicorp/f-check-put-api

Adds a PUT-based API for TTL checks and retains output on timeouts.
This commit is contained in:
James Phillips 2016-03-02 20:17:50 -08:00
commit ad13b34c09
6 changed files with 248 additions and 14 deletions

View File

@ -167,6 +167,58 @@ func (s *HTTPServer) AgentCheckFail(resp http.ResponseWriter, req *http.Request)
return nil, nil
}
// checkUpdate is the payload for a PUT to AgentCheckUpdate. The request body
// is decoded into this struct by AgentCheckUpdate.
type checkUpdate struct {
// Status is one of the structs.Health* states, "passing", "warning", or
// "critical".
Status string
// Output is the information to post to the UI for operators as the
// output of the process that decided to hit the TTL check. This is
// different from the note field that's associated with the check
// itself.
Output string
}
// AgentCheckUpdate is a PUT-based alternative to the GET-based Pass/Warn/Fail
// APIs. The request body is decoded into a checkUpdate and the check ID is
// taken from the URL path following "/v1/agent/check/update/".
//
// Failure responses are written directly to resp, and (nil, nil) is returned
// in those cases:
//   - 405, with an Allow header, if the request method is not PUT;
//   - 400 if the body cannot be decoded, or if Status is not one of
//     structs.HealthPassing, structs.HealthWarning or structs.HealthCritical.
//
// An error from the agent's UpdateCheck is returned to the caller. On success
// s.syncChanges() is called and (nil, nil) is returned.
func (s *HTTPServer) AgentCheckUpdate(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
	if req.Method != "PUT" {
		// A 405 response is required to advertise the methods the resource
		// supports. Headers must be set before WriteHeader is called.
		resp.Header().Set("Allow", "PUT")
		resp.WriteHeader(405)
		return nil, nil
	}

	var update checkUpdate
	if err := decodeBody(req, &update, nil); err != nil {
		resp.WriteHeader(400)
		fmt.Fprintf(resp, "Request decode failed: %v", err)
		return nil, nil
	}

	// Only the three known health states are accepted.
	switch update.Status {
	case structs.HealthPassing, structs.HealthWarning, structs.HealthCritical:
	default:
		resp.WriteHeader(400)
		fmt.Fprintf(resp, "Invalid check status: '%s'", update.Status)
		return nil, nil
	}

	// Cap the output at CheckBufSize bytes and append a note recording how
	// much of the original was kept.
	total := len(update.Output)
	if total > CheckBufSize {
		update.Output = fmt.Sprintf("%s ... (captured %d of %d bytes)",
			update.Output[:CheckBufSize], CheckBufSize, total)
	}

	checkID := strings.TrimPrefix(req.URL.Path, "/v1/agent/check/update/")
	if err := s.agent.UpdateCheck(checkID, update.Status, update.Output); err != nil {
		return nil, err
	}
	s.syncChanges()
	return nil, nil
}
func (s *HTTPServer) AgentRegisterService(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
var args ServiceDefinition
// Fixup the type decode of TTL or Interval if a check is provided

View File

@ -7,6 +7,7 @@ import (
"net/http/httptest"
"os"
"reflect"
"strings"
"testing"
"time"
@ -428,7 +429,6 @@ func TestHTTPAgentPassCheck(t *testing.T) {
t.Fatalf("err: %v", err)
}
// Register node
req, err := http.NewRequest("GET", "/v1/agent/check/pass/test", nil)
if err != nil {
t.Fatalf("err: %v", err)
@ -461,7 +461,6 @@ func TestHTTPAgentWarnCheck(t *testing.T) {
t.Fatalf("err: %v", err)
}
// Register node
req, err := http.NewRequest("GET", "/v1/agent/check/warn/test", nil)
if err != nil {
t.Fatalf("err: %v", err)
@ -494,7 +493,6 @@ func TestHTTPAgentFailCheck(t *testing.T) {
t.Fatalf("err: %v", err)
}
// Register node
req, err := http.NewRequest("GET", "/v1/agent/check/fail/test", nil)
if err != nil {
t.Fatalf("err: %v", err)
@ -515,6 +513,134 @@ func TestHTTPAgentFailCheck(t *testing.T) {
}
}
// TestHTTPAgentUpdateCheck drives AgentCheckUpdate against a registered TTL
// check. It verifies that each valid status is applied together with its
// output, that oversized output is cut down, that an unknown status yields a
// 400, and that a verb other than PUT yields a 405.
func TestHTTPAgentUpdateCheck(t *testing.T) {
	dir, srv := makeHTTPServer(t)
	defer os.RemoveAll(dir)
	defer srv.Shutdown()
	defer srv.agent.Shutdown()

	check := &structs.HealthCheck{Name: "test", CheckID: "test"}
	ttl := &CheckType{TTL: 15 * time.Second}
	if err := srv.agent.AddCheck(check, ttl, false, ""); err != nil {
		t.Fatalf("err: %v", err)
	}

	// call sends one request with the given verb and payload to the update
	// endpoint for the "test" check. It fails the test if the handler
	// returns an error or a non-nil object, and otherwise hands back the
	// recorder so the caller can inspect the response code.
	call := func(verb string, payload checkUpdate) *httptest.ResponseRecorder {
		req, err := http.NewRequest(verb, "/v1/agent/check/update/test", nil)
		if err != nil {
			t.Fatalf("err: %v", err)
		}
		req.Body = encodeReq(payload)

		rec := httptest.NewRecorder()
		obj, err := srv.AgentCheckUpdate(rec, req)
		if err != nil {
			t.Fatalf("err: %v", err)
		}
		if obj != nil {
			t.Fatalf("bad: %v", obj)
		}
		return rec
	}

	// Every valid status should be stored along with its output.
	valid := []checkUpdate{
		{Status: "passing", Output: "hello-passing"},
		{Status: "critical", Output: "hello-critical"},
		{Status: "warning", Output: "hello-warning"},
	}
	for _, want := range valid {
		resp := call("PUT", want)
		if resp.Code != 200 {
			t.Fatalf("expected 200, got %d", resp.Code)
		}

		state := srv.agent.state.Checks()["test"]
		if state.Status != want.Status || state.Output != want.Output {
			t.Fatalf("bad: %v", state)
		}
	}

	// Make sure abusive levels of output are capped.
	{
		resp := call("PUT", checkUpdate{
			Status: "passing",
			Output: strings.Repeat("-= bad -=", 5*CheckBufSize),
		})
		if resp.Code != 200 {
			t.Fatalf("expected 200, got %d", resp.Code)
		}

		// Since we append some notes about truncating, we just do a
		// rough check that the output buffer was cut down so this test
		// isn't super brittle.
		state := srv.agent.state.Checks()["test"]
		if state.Status != structs.HealthPassing || len(state.Output) > 2*CheckBufSize {
			t.Fatalf("bad: %v", state)
		}
	}

	// Check a bogus status.
	{
		resp := call("PUT", checkUpdate{Status: "itscomplicated"})
		if resp.Code != 400 {
			t.Fatalf("expected 400, got %d", resp.Code)
		}
	}

	// Check a bogus verb.
	{
		resp := call("POST", checkUpdate{Status: "passing"})
		if resp.Code != 405 {
			t.Fatalf("expected 405, got %d", resp.Code)
		}
	}
}
func TestHTTPAgentRegisterService(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)

View File

@ -232,6 +232,9 @@ type CheckTTL struct {
timer *time.Timer
lastOutput string
lastOutputLock sync.RWMutex
stop bool
stopCh chan struct{}
stopLock sync.Mutex
@ -265,7 +268,7 @@ func (c *CheckTTL) run() {
case <-c.timer.C:
c.Logger.Printf("[WARN] agent: Check '%v' missed TTL, is now critical",
c.CheckID)
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, "TTL expired")
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, c.getExpiredOutput())
case <-c.stopCh:
return
@ -273,12 +276,31 @@ func (c *CheckTTL) run() {
}
}
// getExpiredOutput builds the output reported when the TTL has expired. It
// returns the bare "TTL expired" marker when no output has been stored, and
// otherwise the marker followed by the last stored output.
func (c *CheckTTL) getExpiredOutput() string {
	const prefix = "TTL expired"

	// Snapshot the stored output under the read lock; the formatting below
	// works on the local copy.
	c.lastOutputLock.RLock()
	last := c.lastOutput
	c.lastOutputLock.RUnlock()

	if last != "" {
		return fmt.Sprintf("%s (last output before timeout follows): %s", prefix, last)
	}
	return prefix
}
// SetStatus is used to update the status of the check,
// and to renew the TTL. If expired, TTL is restarted.
// The given output is forwarded to the notifier together with the status and
// is also remembered as the last output, so it can be reported if the TTL
// expires later.
func (c *CheckTTL) SetStatus(status, output string) {
c.Logger.Printf("[DEBUG] agent: Check '%v' status is now %v",
c.CheckID, status)
// Report the new status and output to the notifier.
c.Notify.UpdateCheck(c.CheckID, status, output)
// Store the last output so we can retain it if the TTL expires.
c.lastOutputLock.Lock()
c.lastOutput = output
c.lastOutputLock.Unlock()
// Restart the TTL timer with the configured TTL duration.
c.timer.Reset(c.TTL)
}

View File

@ -9,6 +9,7 @@ import (
"net/http/httptest"
"os"
"os/exec"
"strings"
"sync"
"testing"
"time"
@ -150,7 +151,7 @@ func TestCheckTTL(t *testing.T) {
defer check.Stop()
time.Sleep(50 * time.Millisecond)
check.SetStatus(structs.HealthPassing, "")
check.SetStatus(structs.HealthPassing, "test-output")
if mock.updates["foo"] != 1 {
t.Fatalf("should have 1 updates %v", mock.updates)
@ -176,6 +177,10 @@ func TestCheckTTL(t *testing.T) {
if mock.state["foo"] != structs.HealthCritical {
t.Fatalf("should be critical %v", mock.state)
}
if !strings.Contains(mock.output["foo"], "test-output") {
t.Fatalf("should have retained output %v", mock.output)
}
}
func mockHTTPServer(responseCode int) *httptest.Server {

View File

@ -232,6 +232,7 @@ func (s *HTTPServer) registerHandlers(enableDebug bool) {
s.mux.HandleFunc("/v1/agent/check/pass/", s.wrap(s.AgentCheckPass))
s.mux.HandleFunc("/v1/agent/check/warn/", s.wrap(s.AgentCheckWarn))
s.mux.HandleFunc("/v1/agent/check/fail/", s.wrap(s.AgentCheckFail))
s.mux.HandleFunc("/v1/agent/check/update/", s.wrap(s.AgentCheckUpdate))
s.mux.HandleFunc("/v1/agent/service/register", s.wrap(s.AgentRegisterService))
s.mux.HandleFunc("/v1/agent/service/deregister/", s.wrap(s.AgentDeregisterService))

View File

@ -25,9 +25,10 @@ The following endpoints are supported:
* [`/v1/agent/force-leave/<node>`](#agent_force_leave) : Forces removal of a node
* [`/v1/agent/check/register`](#agent_check_register) : Registers a new local check
* [`/v1/agent/check/deregister/<checkID>`](#agent_check_deregister) : Deregisters a local check
* [`/v1/agent/check/pass/<checkID>`](#agent_check_pass) : Marks a local test as passing
* [`/v1/agent/check/warn/<checkID>`](#agent_check_warn) : Marks a local test as warning
* [`/v1/agent/check/fail/<checkID>`](#agent_check_fail) : Marks a local test as critical
* [`/v1/agent/check/pass/<checkID>`](#agent_check_pass) : Marks a local check as passing
* [`/v1/agent/check/warn/<checkID>`](#agent_check_warn) : Marks a local check as warning
* [`/v1/agent/check/fail/<checkID>`](#agent_check_fail) : Marks a local check as critical
* [`/v1/agent/check/update/<checkID>`](#agent_check_update) : Updates a local check
* [`/v1/agent/service/register`](#agent_service_register) : Registers a new local service
* [`/v1/agent/service/deregister/<serviceID>`](#agent_service_deregister) : Deregisters a local service
* [`/v1/agent/service/maintenance/<serviceID>`](#agent_service_maintenance) : Manages service maintenance mode
@ -310,8 +311,9 @@ This endpoint is used with a check that is of the [TTL type](/docs/agent/checks.
When this endpoint is accessed via a GET, the status of the check is set to `passing`
and the TTL clock is reset.
The optional "?note=" query parameter can be used to associate a human-readable message
with the status of the check.
The optional "?note=" query parameter can be used to associate a human-readable message
with the status of the check. This will be passed through to the check's `Output` field
in the check endpoints.
The return code is 200 on success.
@ -321,8 +323,9 @@ This endpoint is used with a check that is of the [TTL type](/docs/agent/checks.
When this endpoint is accessed via a GET, the status of the check is set to `warning`,
and the TTL clock is reset.
The optional "?note=" query parameter can be used to associate a human-readable message
with the status of the check.
The optional "?note=" query parameter can be used to associate a human-readable message
with the status of the check. This will be passed through to the check's `Output` field
in the check endpoints.
The return code is 200 on success.
@ -332,8 +335,33 @@ This endpoint is used with a check that is of the [TTL type](/docs/agent/checks.
When this endpoint is accessed via a GET, the status of the check is set to `critical`,
and the TTL clock is reset.
The optional "?note=" query parameter can be used to associate a human-readable message
with the status of the check.
The optional "?note=" query parameter can be used to associate a human-readable message
with the status of the check. This will be passed through to the check's `Output` field
in the check endpoints.
The return code is 200 on success.
### <a name="agent_check_update"></a> /v1/agent/check/update/\<checkId\>
This endpoint is used with a check that is of the [TTL type](/docs/agent/checks.html).
When this endpoint is accessed with a PUT, the status and output of the check are
updated and the TTL clock is reset.
This endpoint expects a JSON request body to be sent with the PUT. The request body must look like:
```javascript
{
"Status": "passing",
"Output": "curl reported a failure:\n\n..."
}
```
The `Status` field is mandatory, and must be set to "passing", "warning", or "critical".
`Output` is an optional field that will associate a human-readable message with the status
of the check, such as the output of the checking script or process. The output will be
truncated if it exceeds 4KB in size, and the result will be passed through to the check's
`Output` field in the check endpoints.
The return code is 200 on success.