Cleans up check logging.

There were places where we still didn't have the script vs. args sorted
correctly so changed all the logging to be just based on check IDs and
also made everything uniform.

Also removed some annoying debug logging, and moved some of the large output
logging to TRACE level.

Closes #3602
This commit is contained in:
James Phillips 2017-11-10 12:43:59 -08:00
parent b797f465d3
commit 990fbbb86b
No known key found for this signature in database
GPG Key ID: 77183E682AC5FC11
1 changed files with 23 additions and 30 deletions

View File

@ -103,16 +103,13 @@ func (c *CheckMonitor) check() {
// Create the command // Create the command
var cmd *osexec.Cmd var cmd *osexec.Cmd
var err error var err error
var cmdDisplay string
if len(c.ScriptArgs) > 0 { if len(c.ScriptArgs) > 0 {
cmdDisplay = fmt.Sprintf("%v", c.ScriptArgs)
cmd, err = exec.Subprocess(c.ScriptArgs) cmd, err = exec.Subprocess(c.ScriptArgs)
} else { } else {
cmdDisplay = c.Script
cmd, err = exec.Script(c.Script) cmd, err = exec.Script(c.Script)
} }
if err != nil { if err != nil {
c.Logger.Printf("[ERR] agent: failed to setup invoke '%s': %s", cmdDisplay, err) c.Logger.Printf("[ERR] agent: Check %q failed to setup: %s", c.CheckID, err)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error()) c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
return return
} }
@ -129,14 +126,13 @@ func (c *CheckMonitor) check() {
outputStr = fmt.Sprintf("Captured %d of %d bytes\n...\n%s", outputStr = fmt.Sprintf("Captured %d of %d bytes\n...\n%s",
output.Size(), output.TotalWritten(), outputStr) output.Size(), output.TotalWritten(), outputStr)
} }
c.Logger.Printf("[DEBUG] agent: Check '%s' script '%s' output: %s", c.Logger.Printf("[TRACE] agent: Check %q output: %s", c.CheckID, outputStr)
c.CheckID, cmdDisplay, outputStr)
return outputStr return outputStr
} }
// Start the check // Start the check
if err := cmd.Start(); err != nil { if err := cmd.Start(); err != nil {
c.Logger.Printf("[ERR] agent: failed to invoke '%s': %s", cmdDisplay, err) c.Logger.Printf("[ERR] agent: Check %q failed to invoke: %s", c.CheckID, err)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error()) c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
return return
} }
@ -154,11 +150,11 @@ func (c *CheckMonitor) check() {
select { select {
case <-time.After(timeout): case <-time.After(timeout):
if err := exec.KillCommandSubtree(cmd); err != nil { if err := exec.KillCommandSubtree(cmd); err != nil {
c.Logger.Printf("[WARN] Failed to kill check '%s' after timeout: %v", cmdDisplay, err) c.Logger.Printf("[WARN] Check %q failed to kill after timeout: %s", c.CheckID, err)
} }
msg := fmt.Sprintf("Timed out (%s) running check", timeout.String()) msg := fmt.Sprintf("Timed out (%s) running check", timeout.String())
c.Logger.Printf("[WARN] %s '%s'", msg, cmdDisplay) c.Logger.Printf("[WARN] Check %q: %s", c.CheckID, msg)
outputStr := truncateAndLogOutput() outputStr := truncateAndLogOutput()
if len(outputStr) > 0 { if len(outputStr) > 0 {
@ -178,7 +174,7 @@ func (c *CheckMonitor) check() {
// Check if the check passed // Check if the check passed
outputStr := truncateAndLogOutput() outputStr := truncateAndLogOutput()
if err == nil { if err == nil {
c.Logger.Printf("[DEBUG] agent: Check '%v' is passing", c.CheckID) c.Logger.Printf("[DEBUG] agent: Check %q is passing", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, outputStr) c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, outputStr)
return return
} }
@ -189,7 +185,7 @@ func (c *CheckMonitor) check() {
if status, ok := exitErr.Sys().(syscall.WaitStatus); ok { if status, ok := exitErr.Sys().(syscall.WaitStatus); ok {
code := status.ExitStatus() code := status.ExitStatus()
if code == 1 { if code == 1 {
c.Logger.Printf("[WARN] agent: Check '%v' is now warning", c.CheckID) c.Logger.Printf("[WARN] agent: Check %q is now warning", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthWarning, outputStr) c.Notify.UpdateCheck(c.CheckID, api.HealthWarning, outputStr)
return return
} }
@ -197,7 +193,7 @@ func (c *CheckMonitor) check() {
} }
// Set the health as critical // Set the health as critical
c.Logger.Printf("[WARN] agent: Check '%v' is now critical", c.CheckID) c.Logger.Printf("[WARN] agent: Check %q is now critical", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, outputStr) c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, outputStr)
} }
@ -247,7 +243,7 @@ func (c *CheckTTL) run() {
for { for {
select { select {
case <-c.timer.C: case <-c.timer.C:
c.Logger.Printf("[WARN] agent: Check '%v' missed TTL, is now critical", c.Logger.Printf("[WARN] agent: Check %q missed TTL, is now critical",
c.CheckID) c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, c.getExpiredOutput()) c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, c.getExpiredOutput())
@ -273,8 +269,7 @@ func (c *CheckTTL) getExpiredOutput() string {
// SetStatus is used to update the status of the check, // SetStatus is used to update the status of the check,
// and to renew the TTL. If expired, TTL is restarted. // and to renew the TTL. If expired, TTL is restarted.
func (c *CheckTTL) SetStatus(status, output string) { func (c *CheckTTL) SetStatus(status, output string) {
c.Logger.Printf("[DEBUG] agent: Check '%v' status is now %v", c.Logger.Printf("[DEBUG] agent: Check %q status is now %s", c.CheckID, status)
c.CheckID, status)
c.Notify.UpdateCheck(c.CheckID, status, output) c.Notify.UpdateCheck(c.CheckID, status, output)
// Store the last output so we can retain it if the TTL expires. // Store the last output so we can retain it if the TTL expires.
@ -358,7 +353,6 @@ func (c *CheckHTTP) Stop() {
func (c *CheckHTTP) run() { func (c *CheckHTTP) run() {
// Get the randomized initial pause time // Get the randomized initial pause time
initialPauseTime := lib.RandomStagger(c.Interval) initialPauseTime := lib.RandomStagger(c.Interval)
c.Logger.Printf("[DEBUG] agent: pausing %v before first HTTP request of %s", initialPauseTime, c.HTTP)
next := time.After(initialPauseTime) next := time.After(initialPauseTime)
for { for {
select { select {
@ -380,7 +374,7 @@ func (c *CheckHTTP) check() {
req, err := http.NewRequest(method, c.HTTP, nil) req, err := http.NewRequest(method, c.HTTP, nil)
if err != nil { if err != nil {
c.Logger.Printf("[WARN] agent: http request failed '%s': %s", c.HTTP, err) c.Logger.Printf("[WARN] agent: Check %q HTTP request failed: %s", c.CheckID, err)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error()) c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
return return
} }
@ -405,7 +399,7 @@ func (c *CheckHTTP) check() {
resp, err := c.httpClient.Do(req) resp, err := c.httpClient.Do(req)
if err != nil { if err != nil {
c.Logger.Printf("[WARN] agent: http request failed '%s': %s", c.HTTP, err) c.Logger.Printf("[WARN] agent: Check %q HTTP request failed: %s", c.CheckID, err)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error()) c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
return return
} }
@ -414,7 +408,7 @@ func (c *CheckHTTP) check() {
// Read the response into a circular buffer to limit the size // Read the response into a circular buffer to limit the size
output, _ := circbuf.NewBuffer(BufSize) output, _ := circbuf.NewBuffer(BufSize)
if _, err := io.Copy(output, resp.Body); err != nil { if _, err := io.Copy(output, resp.Body); err != nil {
c.Logger.Printf("[WARN] agent: Check '%v': Get error while reading body: %s", c.CheckID, err) c.Logger.Printf("[WARN] agent: Check %q error while reading body: %s", c.CheckID, err)
} }
// Format the response body // Format the response body
@ -422,19 +416,19 @@ func (c *CheckHTTP) check() {
if resp.StatusCode >= 200 && resp.StatusCode <= 299 { if resp.StatusCode >= 200 && resp.StatusCode <= 299 {
// PASSING (2xx) // PASSING (2xx)
c.Logger.Printf("[DEBUG] agent: Check '%v' is passing", c.CheckID) c.Logger.Printf("[DEBUG] agent: Check %q is passing", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, result) c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, result)
} else if resp.StatusCode == 429 { } else if resp.StatusCode == 429 {
// WARNING // WARNING
// 429 Too Many Requests (RFC 6585) // 429 Too Many Requests (RFC 6585)
// The user has sent too many requests in a given amount of time. // The user has sent too many requests in a given amount of time.
c.Logger.Printf("[WARN] agent: Check '%v' is now warning", c.CheckID) c.Logger.Printf("[WARN] agent: Check %q is now warning", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthWarning, result) c.Notify.UpdateCheck(c.CheckID, api.HealthWarning, result)
} else { } else {
// CRITICAL // CRITICAL
c.Logger.Printf("[WARN] agent: Check '%v' is now critical", c.CheckID) c.Logger.Printf("[WARN] agent: Check %q is now critical", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, result) c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, result)
} }
} }
@ -496,7 +490,6 @@ func (c *CheckTCP) Stop() {
func (c *CheckTCP) run() { func (c *CheckTCP) run() {
// Get the randomized initial pause time // Get the randomized initial pause time
initialPauseTime := lib.RandomStagger(c.Interval) initialPauseTime := lib.RandomStagger(c.Interval)
c.Logger.Printf("[DEBUG] agent: pausing %v before first socket connection of %s", initialPauseTime, c.TCP)
next := time.After(initialPauseTime) next := time.After(initialPauseTime)
for { for {
select { select {
@ -513,12 +506,12 @@ func (c *CheckTCP) run() {
func (c *CheckTCP) check() { func (c *CheckTCP) check() {
conn, err := c.dialer.Dial(`tcp`, c.TCP) conn, err := c.dialer.Dial(`tcp`, c.TCP)
if err != nil { if err != nil {
c.Logger.Printf("[WARN] agent: socket connection failed '%s': %s", c.TCP, err) c.Logger.Printf("[WARN] agent: Check %q socket connection failed: %s", c.CheckID, err)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error()) c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
return return
} }
conn.Close() conn.Close()
c.Logger.Printf("[DEBUG] agent: Check '%v' is passing", c.CheckID) c.Logger.Printf("[DEBUG] agent: Check %q is passing", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, fmt.Sprintf("TCP connect %s: Success", c.TCP)) c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, fmt.Sprintf("TCP connect %s: Success", c.TCP))
} }
@ -585,7 +578,7 @@ func (c *CheckDocker) check() {
var out string var out string
status, b, err := c.doCheck() status, b, err := c.doCheck()
if err != nil { if err != nil {
c.Logger.Printf("[DEBUG] agent: Check '%s': %s", c.CheckID, err) c.Logger.Printf("[DEBUG] agent: Check %q: %s", c.CheckID, err)
out = err.Error() out = err.Error()
} else { } else {
// out is already limited to CheckBufSize since we're getting a // out is already limited to CheckBufSize since we're getting a
@ -595,11 +588,11 @@ func (c *CheckDocker) check() {
if int(b.TotalWritten()) > len(out) { if int(b.TotalWritten()) > len(out) {
out = fmt.Sprintf("Captured %d of %d bytes\n...\n%s", len(out), b.TotalWritten(), out) out = fmt.Sprintf("Captured %d of %d bytes\n...\n%s", len(out), b.TotalWritten(), out)
} }
c.Logger.Printf("[DEBUG] agent: Check '%s' script '%s' output: %s", c.CheckID, c.Script, out) c.Logger.Printf("[TRACE] agent: Check %q output: %s", c.CheckID, out)
} }
if status == api.HealthCritical { if status == api.HealthCritical {
c.Logger.Printf("[WARN] agent: Check '%v' is now critical", c.CheckID) c.Logger.Printf("[WARN] agent: Check %q is now critical", c.CheckID)
} }
c.Notify.UpdateCheck(c.CheckID, status, out) c.Notify.UpdateCheck(c.CheckID, status, out)
@ -632,10 +625,10 @@ func (c *CheckDocker) doCheck() (string, *circbuf.Buffer, error) {
case 0: case 0:
return api.HealthPassing, buf, nil return api.HealthPassing, buf, nil
case 1: case 1:
c.Logger.Printf("[DEBUG] Check failed with exit code: %d", exitCode) c.Logger.Printf("[DEBUG] Check %q failed with exit code: %d", c.CheckID, exitCode)
return api.HealthWarning, buf, nil return api.HealthWarning, buf, nil
default: default:
c.Logger.Printf("[DEBUG] Check failed with exit code: %d", exitCode) c.Logger.Printf("[DEBUG] Check %q failed with exit code: %d", c.CheckID, exitCode)
return api.HealthCritical, buf, nil return api.HealthCritical, buf, nil
} }
} }