From 7ebad899dad1ca061d9810e124870daa61c38a2b Mon Sep 17 00:00:00 2001 From: James Phillips Date: Wed, 2 Mar 2016 17:08:06 -0800 Subject: [PATCH 1/3] Adds a new PUT-based TTL check update endpoint. --- command/agent/agent_endpoint.go | 52 +++++++ command/agent/agent_endpoint_test.go | 132 +++++++++++++++++- command/agent/http.go | 1 + .../docs/agent/http/agent.html.markdown | 46 ++++-- 4 files changed, 219 insertions(+), 12 deletions(-) diff --git a/command/agent/agent_endpoint.go b/command/agent/agent_endpoint.go index 8e4eb559c6..9a64838b64 100644 --- a/command/agent/agent_endpoint.go +++ b/command/agent/agent_endpoint.go @@ -167,6 +167,58 @@ func (s *HTTPServer) AgentCheckFail(resp http.ResponseWriter, req *http.Request) return nil, nil } +// checkUpdate is the payload for a PUT to AgentCheckUpdate. +type checkUpdate struct { + // Status us one of the structs.Health* states, "passing", "warning", or + // "critical". + Status string + + // Output is the information to post to the UI for operators as the + // output of the process that decided to hit the TTL check. This is + // different from the note field that's associated with the check + // itself. + Output string +} + +// AgentCheckUpdate is a PUT-based alternative to the GET-based Pass/Warn/Fail +// APIs. +func (s *HTTPServer) AgentCheckUpdate(resp http.ResponseWriter, req *http.Request) (interface{}, error) { + if req.Method != "PUT" { + resp.WriteHeader(405) + return nil, nil + } + + var update checkUpdate + if err := decodeBody(req, &update, nil); err != nil { + resp.WriteHeader(400) + resp.Write([]byte(fmt.Sprintf("Request decode failed: %v", err))) + return nil, nil + } + + switch update.Status { + case structs.HealthPassing: + case structs.HealthWarning: + case structs.HealthCritical: + default: + resp.WriteHeader(400) + resp.Write([]byte(fmt.Sprintf("Invalid check status: '%s'", update.Status))) + return nil, nil + } + + total := len(update.Output) + if total > CheckBufSize { + update.Output = fmt.Sprintf("%s\n...\nCaptured %d of %d bytes", + update.Output[:CheckBufSize], CheckBufSize, total) + } + + checkID := strings.TrimPrefix(req.URL.Path, "/v1/agent/check/update/") + if err := s.agent.UpdateCheck(checkID, update.Status, update.Output); err != nil { + return nil, err + } + s.syncChanges() + return nil, nil +} + func (s *HTTPServer) AgentRegisterService(resp http.ResponseWriter, req *http.Request) (interface{}, error) { var args ServiceDefinition // Fixup the type decode of TTL or Interval if a check if provided diff --git a/command/agent/agent_endpoint_test.go b/command/agent/agent_endpoint_test.go index 9376f53945..2a3b14844b 100644 --- a/command/agent/agent_endpoint_test.go +++ b/command/agent/agent_endpoint_test.go @@ -7,6 +7,7 @@ import ( "net/http/httptest" "os" "reflect" + "strings" "testing" "time" @@ -428,7 +429,6 @@ func TestHTTPAgentPassCheck(t *testing.T) { t.Fatalf("err: %v", err) } - // Register node req, err := http.NewRequest("GET", "/v1/agent/check/pass/test", nil) if err != nil { t.Fatalf("err: %v", err) @@ -461,7 +461,6 @@ func TestHTTPAgentWarnCheck(t *testing.T) { t.Fatalf("err: %v", err) } - // Register node req, err := http.NewRequest("GET", "/v1/agent/check/warn/test", nil) if err != nil { t.Fatalf("err: %v", err) @@ -494,7 +493,6 @@ func TestHTTPAgentFailCheck(t *testing.T) { t.Fatalf("err: %v", err) } - // Register node req, err := http.NewRequest("GET", "/v1/agent/check/fail/test", nil) if err != nil { t.Fatalf("err: %v", err) @@ -515,6 +513,134 @@ func TestHTTPAgentFailCheck(t *testing.T) { } } +func TestHTTPAgentUpdateCheck(t *testing.T) { + dir, srv := makeHTTPServer(t) + defer os.RemoveAll(dir) + defer srv.Shutdown() + defer srv.agent.Shutdown() + + chk := &structs.HealthCheck{Name: "test", CheckID: "test"} + chkType := &CheckType{TTL: 15 * time.Second} + if err := srv.agent.AddCheck(chk, chkType, false, ""); err != nil { + t.Fatalf("err: %v", err) + } + + cases := []checkUpdate{ + checkUpdate{"passing", "hello-passing"}, + checkUpdate{"critical", "hello-critical"}, + checkUpdate{"warning", "hello-warning"}, + } + + for _, c := range cases { + req, err := http.NewRequest("PUT", "/v1/agent/check/update/test", nil) + if err != nil { + t.Fatalf("err: %v", err) + } + req.Body = encodeReq(c) + + resp := httptest.NewRecorder() + obj, err := srv.AgentCheckUpdate(resp, req) + if err != nil { + t.Fatalf("err: %v", err) + } + if obj != nil { + t.Fatalf("bad: %v", obj) + } + if resp.Code != 200 { + t.Fatalf("expected 200, got %d", resp.Code) + } + + state := srv.agent.state.Checks()["test"] + if state.Status != c.Status || state.Output != c.Output { + t.Fatalf("bad: %v", state) + } + } + + // Make sure abusive levels of output are capped. + { + req, err := http.NewRequest("PUT", "/v1/agent/check/update/test", nil) + if err != nil { + t.Fatalf("err: %v", err) + } + + update := checkUpdate{ + Status: "passing", + Output: strings.Repeat("-= bad -=", 5*CheckBufSize), + } + req.Body = encodeReq(update) + + resp := httptest.NewRecorder() + obj, err := srv.AgentCheckUpdate(resp, req) + if err != nil { + t.Fatalf("err: %v", err) + } + if obj != nil { + t.Fatalf("bad: %v", obj) + } + if resp.Code != 200 { + t.Fatalf("expected 200, got %d", resp.Code) + } + + // Since we append some notes about truncating, we just do a + // rough check that the output buffer was cut down so this test + // isn't super brittle. + state := srv.agent.state.Checks()["test"] + if state.Status != structs.HealthPassing || len(state.Output) > 2*CheckBufSize { + t.Fatalf("bad: %v", state) + } + } + + // Check a bogus status. + { + req, err := http.NewRequest("PUT", "/v1/agent/check/update/test", nil) + if err != nil { + t.Fatalf("err: %v", err) + } + + update := checkUpdate{ + Status: "itscomplicated", + } + req.Body = encodeReq(update) + + resp := httptest.NewRecorder() + obj, err := srv.AgentCheckUpdate(resp, req) + if err != nil { + t.Fatalf("err: %v", err) + } + if obj != nil { + t.Fatalf("bad: %v", obj) + } + if resp.Code != 400 { + t.Fatalf("expected 400, got %d", resp.Code) + } + } + + // Check a bogus verb. + { + req, err := http.NewRequest("POST", "/v1/agent/check/update/test", nil) + if err != nil { + t.Fatalf("err: %v", err) + } + + update := checkUpdate{ + Status: "passing", + } + req.Body = encodeReq(update) + + resp := httptest.NewRecorder() + obj, err := srv.AgentCheckUpdate(resp, req) + if err != nil { + t.Fatalf("err: %v", err) + } + if obj != nil { + t.Fatalf("bad: %v", obj) + } + if resp.Code != 405 { + t.Fatalf("expected 405, got %d", resp.Code) + } + } +} + func TestHTTPAgentRegisterService(t *testing.T) { dir, srv := makeHTTPServer(t) defer os.RemoveAll(dir) diff --git a/command/agent/http.go b/command/agent/http.go index 749ed0ef0e..a6891d0149 100644 --- a/command/agent/http.go +++ b/command/agent/http.go @@ -232,6 +232,7 @@ func (s *HTTPServer) registerHandlers(enableDebug bool) { s.mux.HandleFunc("/v1/agent/check/pass/", s.wrap(s.AgentCheckPass)) s.mux.HandleFunc("/v1/agent/check/warn/", s.wrap(s.AgentCheckWarn)) s.mux.HandleFunc("/v1/agent/check/fail/", s.wrap(s.AgentCheckFail)) + s.mux.HandleFunc("/v1/agent/check/update/", s.wrap(s.AgentCheckUpdate)) s.mux.HandleFunc("/v1/agent/service/register", s.wrap(s.AgentRegisterService)) s.mux.HandleFunc("/v1/agent/service/deregister/", s.wrap(s.AgentDeregisterService)) diff --git a/website/source/docs/agent/http/agent.html.markdown b/website/source/docs/agent/http/agent.html.markdown index 9319c4b18a..2ffb654350 100644 --- a/website/source/docs/agent/http/agent.html.markdown +++ b/website/source/docs/agent/http/agent.html.markdown @@ -25,9 +25,10 @@ The following endpoints are supported: * [`/v1/agent/force-leave/`](#agent_force_leave)>: Forces removal of a node * [`/v1/agent/check/register`](#agent_check_register) : Registers a new local check * [`/v1/agent/check/deregister/`](#agent_check_deregister) : Deregisters a local check -* [`/v1/agent/check/pass/`](#agent_check_pass) : Marks a local test as passing -* [`/v1/agent/check/warn/`](#agent_check_warn) : Marks a local test as warning -* [`/v1/agent/check/fail/`](#agent_check_fail) : Marks a local test as critical +* [`/v1/agent/check/pass/`](#agent_check_pass) : Marks a local check as passing +* [`/v1/agent/check/warn/`](#agent_check_warn) : Marks a local check as warning +* [`/v1/agent/check/fail/`](#agent_check_fail) : Marks a local check as critical +* [`/v1/agent/check/update/`](#agent_check_update) : Updates a local check * [`/v1/agent/service/register`](#agent_service_register) : Registers a new local service * [`/v1/agent/service/deregister/`](#agent_service_deregister) : Deregisters a local service * [`/v1/agent/service/maintenance/`](#agent_service_maintenance) : Manages service maintenance mode @@ -310,8 +311,9 @@ This endpoint is used with a check that is of the [TTL type](/docs/agent/checks. When this endpoint is accessed via a GET, the status of the check is set to `passing` and the TTL clock is reset. -The optional "?note=" query parameter can be used to associate a human-readable message -with the status of the check. +The optional "?note=" query parameter can be used to associate a human-readable message +with the status of the check. This will be passed through to the check's `Output` field +in the check endpoints. The return code is 200 on success. @@ -321,8 +323,9 @@ This endpoint is used with a check that is of the [TTL type](/docs/agent/checks. When this endpoint is accessed via a GET, the status of the check is set to `warning`, and the TTL clock is reset. -The optional "?note=" query parameter can be used to associate a human-readable message -with the status of the check. +The optional "?note=" query parameter can be used to associate a human-readable message +with the status of the check. This will be passed through to the check's `Output` field +in the check endpoints. The return code is 200 on success. @@ -332,8 +335,33 @@ This endpoint is used with a check that is of the [TTL type](/docs/agent/checks. When this endpoint is accessed via a GET, the status of the check is set to `critical`, and the TTL clock is reset. -The optional "?note=" query parameter can be used to associate a human-readable message -with the status of the check. +The optional "?note=" query parameter can be used to associate a human-readable message +with the status of the check. This will be passed through to the check's `Output` field +in the check endpoints. + +The return code is 200 on success. + +### /v1/agent/check/update/\ + +This endpoint is used with a check that is of the [TTL type](/docs/agent/checks.html). +When this endpoint is accessed with a PUT, the status and output of the check are +updated and the TTL clock is reset. + +This endpoint expects a JSON request body to be put. The request body must look like: + +```javascript +{ + "Status": "passing", + "Output": "curl reported a failure:\n\n..." +} +``` + +The `Status` field is mandatory, and must be set to "passing", "warning", or "critical". + +`Output` is an optional field that will associate a human-readable message with the status +of the check, such as the output of the checking script or process. This will be truncated +if it exceeds 4KB in size. This will be passed through to the check's `Output` field in the +check endpoints. The return code is 200 on success. From 70575002d9621bc671e9c75f8d6ee3ac50915d06 Mon Sep 17 00:00:00 2001 From: James Phillips Date: Wed, 2 Mar 2016 17:58:01 -0800 Subject: [PATCH 2/3] Retains the last output when a TTL check times out. --- command/agent/check.go | 24 +++++++++++++++++++++++- command/agent/check_test.go | 7 ++++++- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/command/agent/check.go b/command/agent/check.go index c3210f7b64..b82cd0544d 100644 --- a/command/agent/check.go +++ b/command/agent/check.go @@ -232,6 +232,9 @@ type CheckTTL struct { timer *time.Timer + lastOutput string + lastOutputLock sync.RWMutex + stop bool stopCh chan struct{} stopLock sync.Mutex @@ -265,7 +268,7 @@ func (c *CheckTTL) run() { case <-c.timer.C: c.Logger.Printf("[WARN] agent: Check '%v' missed TTL, is now critical", c.CheckID) - c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, "TTL expired") + c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, c.getExpiredOutput()) case <-c.stopCh: return @@ -273,12 +276,31 @@ func (c *CheckTTL) run() { } } +// getExpiredOutput formats the output for the case when the TTL is expired. +func (c *CheckTTL) getExpiredOutput() string { + c.lastOutputLock.RLock() + defer c.lastOutputLock.RUnlock() + + const prefix = "TTL expired" + if c.lastOutput == "" { + return fmt.Sprintf("%s (no output was available before timeout)", prefix) + } + + return fmt.Sprintf("%s (last output before timeout follows)\n\n%s", prefix, c.lastOutput) +} + // SetStatus is used to update the status of the check, // and to renew the TTL. If expired, TTL is restarted. func (c *CheckTTL) SetStatus(status, output string) { c.Logger.Printf("[DEBUG] agent: Check '%v' status is now %v", c.CheckID, status) c.Notify.UpdateCheck(c.CheckID, status, output) + + // Store the last output so we can retain it if the TTL expires. + c.lastOutputLock.Lock() + c.lastOutput = output + c.lastOutputLock.Unlock() + c.timer.Reset(c.TTL) } diff --git a/command/agent/check_test.go b/command/agent/check_test.go index 75099c8f08..9d0a610134 100644 --- a/command/agent/check_test.go +++ b/command/agent/check_test.go @@ -9,6 +9,7 @@ import ( "net/http/httptest" "os" "os/exec" + "strings" "sync" "testing" "time" @@ -150,7 +151,7 @@ func TestCheckTTL(t *testing.T) { defer check.Stop() time.Sleep(50 * time.Millisecond) - check.SetStatus(structs.HealthPassing, "") + check.SetStatus(structs.HealthPassing, "test-output") if mock.updates["foo"] != 1 { t.Fatalf("should have 1 updates %v", mock.updates) @@ -176,6 +177,10 @@ func TestCheckTTL(t *testing.T) { if mock.state["foo"] != structs.HealthCritical { t.Fatalf("should be critical %v", mock.state) } + + if !strings.Contains(mock.output["foo"], "test-output") { + t.Fatalf("should have retained output %v", mock.output) + } } func mockHTTPServer(responseCode int) *httptest.Server { From f46fa3327811a6a7adaf25c0ef534aa70e0a6f4a Mon Sep 17 00:00:00 2001 From: James Phillips Date: Wed, 2 Mar 2016 19:47:00 -0800 Subject: [PATCH 3/3] Tweaks formatting of inline output messages. --- command/agent/agent_endpoint.go | 2 +- command/agent/check.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/command/agent/agent_endpoint.go b/command/agent/agent_endpoint.go index 9a64838b64..b897d52427 100644 --- a/command/agent/agent_endpoint.go +++ b/command/agent/agent_endpoint.go @@ -207,7 +207,7 @@ func (s *HTTPServer) AgentCheckUpdate(resp http.ResponseWriter, req *http.Reques total := len(update.Output) if total > CheckBufSize { - update.Output = fmt.Sprintf("%s\n...\nCaptured %d of %d bytes", + update.Output = fmt.Sprintf("%s ... (captured %d of %d bytes)", update.Output[:CheckBufSize], CheckBufSize, total) } diff --git a/command/agent/check.go b/command/agent/check.go index b82cd0544d..c35d0320e4 100644 --- a/command/agent/check.go +++ b/command/agent/check.go @@ -283,10 +283,10 @@ func (c *CheckTTL) getExpiredOutput() string { const prefix = "TTL expired" if c.lastOutput == "" { - return fmt.Sprintf("%s (no output was available before timeout)", prefix) + return prefix } - return fmt.Sprintf("%s (last output before timeout follows)\n\n%s", prefix, c.lastOutput) + return fmt.Sprintf("%s (last output before timeout follows): %s", prefix, c.lastOutput) } // SetStatus is used to update the status of the check,