mirror of https://github.com/status-im/consul.git
Cleans up check logging.
There were places where we still didn't have the script vs. args sorted correctly, so this changes all the logging to be based only on check IDs and makes the message format uniform. It also removes some noisy debug logging and moves some of the large output logging to TRACE level. Closes #3602
This commit is contained in:
parent
b797f465d3
commit
990fbbb86b
|
@ -103,16 +103,13 @@ func (c *CheckMonitor) check() {
|
|||
// Create the command
|
||||
var cmd *osexec.Cmd
|
||||
var err error
|
||||
var cmdDisplay string
|
||||
if len(c.ScriptArgs) > 0 {
|
||||
cmdDisplay = fmt.Sprintf("%v", c.ScriptArgs)
|
||||
cmd, err = exec.Subprocess(c.ScriptArgs)
|
||||
} else {
|
||||
cmdDisplay = c.Script
|
||||
cmd, err = exec.Script(c.Script)
|
||||
}
|
||||
if err != nil {
|
||||
c.Logger.Printf("[ERR] agent: failed to setup invoke '%s': %s", cmdDisplay, err)
|
||||
c.Logger.Printf("[ERR] agent: Check %q failed to setup: %s", c.CheckID, err)
|
||||
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
|
||||
return
|
||||
}
|
||||
|
@ -129,14 +126,13 @@ func (c *CheckMonitor) check() {
|
|||
outputStr = fmt.Sprintf("Captured %d of %d bytes\n...\n%s",
|
||||
output.Size(), output.TotalWritten(), outputStr)
|
||||
}
|
||||
c.Logger.Printf("[DEBUG] agent: Check '%s' script '%s' output: %s",
|
||||
c.CheckID, cmdDisplay, outputStr)
|
||||
c.Logger.Printf("[TRACE] agent: Check %q output: %s", c.CheckID, outputStr)
|
||||
return outputStr
|
||||
}
|
||||
|
||||
// Start the check
|
||||
if err := cmd.Start(); err != nil {
|
||||
c.Logger.Printf("[ERR] agent: failed to invoke '%s': %s", cmdDisplay, err)
|
||||
c.Logger.Printf("[ERR] agent: Check %q failed to invoke: %s", c.CheckID, err)
|
||||
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
|
||||
return
|
||||
}
|
||||
|
@ -154,11 +150,11 @@ func (c *CheckMonitor) check() {
|
|||
select {
|
||||
case <-time.After(timeout):
|
||||
if err := exec.KillCommandSubtree(cmd); err != nil {
|
||||
c.Logger.Printf("[WARN] Failed to kill check '%s' after timeout: %v", cmdDisplay, err)
|
||||
c.Logger.Printf("[WARN] Check %q failed to kill after timeout: %s", c.CheckID, err)
|
||||
}
|
||||
|
||||
msg := fmt.Sprintf("Timed out (%s) running check", timeout.String())
|
||||
c.Logger.Printf("[WARN] %s '%s'", msg, cmdDisplay)
|
||||
c.Logger.Printf("[WARN] Check %q: %s", c.CheckID, msg)
|
||||
|
||||
outputStr := truncateAndLogOutput()
|
||||
if len(outputStr) > 0 {
|
||||
|
@ -178,7 +174,7 @@ func (c *CheckMonitor) check() {
|
|||
// Check if the check passed
|
||||
outputStr := truncateAndLogOutput()
|
||||
if err == nil {
|
||||
c.Logger.Printf("[DEBUG] agent: Check '%v' is passing", c.CheckID)
|
||||
c.Logger.Printf("[DEBUG] agent: Check %q is passing", c.CheckID)
|
||||
c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, outputStr)
|
||||
return
|
||||
}
|
||||
|
@ -189,7 +185,7 @@ func (c *CheckMonitor) check() {
|
|||
if status, ok := exitErr.Sys().(syscall.WaitStatus); ok {
|
||||
code := status.ExitStatus()
|
||||
if code == 1 {
|
||||
c.Logger.Printf("[WARN] agent: Check '%v' is now warning", c.CheckID)
|
||||
c.Logger.Printf("[WARN] agent: Check %q is now warning", c.CheckID)
|
||||
c.Notify.UpdateCheck(c.CheckID, api.HealthWarning, outputStr)
|
||||
return
|
||||
}
|
||||
|
@ -197,7 +193,7 @@ func (c *CheckMonitor) check() {
|
|||
}
|
||||
|
||||
// Set the health as critical
|
||||
c.Logger.Printf("[WARN] agent: Check '%v' is now critical", c.CheckID)
|
||||
c.Logger.Printf("[WARN] agent: Check %q is now critical", c.CheckID)
|
||||
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, outputStr)
|
||||
}
|
||||
|
||||
|
@ -247,7 +243,7 @@ func (c *CheckTTL) run() {
|
|||
for {
|
||||
select {
|
||||
case <-c.timer.C:
|
||||
c.Logger.Printf("[WARN] agent: Check '%v' missed TTL, is now critical",
|
||||
c.Logger.Printf("[WARN] agent: Check %q missed TTL, is now critical",
|
||||
c.CheckID)
|
||||
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, c.getExpiredOutput())
|
||||
|
||||
|
@ -273,8 +269,7 @@ func (c *CheckTTL) getExpiredOutput() string {
|
|||
// SetStatus is used to update the status of the check,
|
||||
// and to renew the TTL. If expired, TTL is restarted.
|
||||
func (c *CheckTTL) SetStatus(status, output string) {
|
||||
c.Logger.Printf("[DEBUG] agent: Check '%v' status is now %v",
|
||||
c.CheckID, status)
|
||||
c.Logger.Printf("[DEBUG] agent: Check %q status is now %s", c.CheckID, status)
|
||||
c.Notify.UpdateCheck(c.CheckID, status, output)
|
||||
|
||||
// Store the last output so we can retain it if the TTL expires.
|
||||
|
@ -358,7 +353,6 @@ func (c *CheckHTTP) Stop() {
|
|||
func (c *CheckHTTP) run() {
|
||||
// Get the randomized initial pause time
|
||||
initialPauseTime := lib.RandomStagger(c.Interval)
|
||||
c.Logger.Printf("[DEBUG] agent: pausing %v before first HTTP request of %s", initialPauseTime, c.HTTP)
|
||||
next := time.After(initialPauseTime)
|
||||
for {
|
||||
select {
|
||||
|
@ -380,7 +374,7 @@ func (c *CheckHTTP) check() {
|
|||
|
||||
req, err := http.NewRequest(method, c.HTTP, nil)
|
||||
if err != nil {
|
||||
c.Logger.Printf("[WARN] agent: http request failed '%s': %s", c.HTTP, err)
|
||||
c.Logger.Printf("[WARN] agent: Check %q HTTP request failed: %s", c.CheckID, err)
|
||||
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
|
||||
return
|
||||
}
|
||||
|
@ -405,7 +399,7 @@ func (c *CheckHTTP) check() {
|
|||
|
||||
resp, err := c.httpClient.Do(req)
|
||||
if err != nil {
|
||||
c.Logger.Printf("[WARN] agent: http request failed '%s': %s", c.HTTP, err)
|
||||
c.Logger.Printf("[WARN] agent: Check %q HTTP request failed: %s", c.CheckID, err)
|
||||
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
|
||||
return
|
||||
}
|
||||
|
@ -414,7 +408,7 @@ func (c *CheckHTTP) check() {
|
|||
// Read the response into a circular buffer to limit the size
|
||||
output, _ := circbuf.NewBuffer(BufSize)
|
||||
if _, err := io.Copy(output, resp.Body); err != nil {
|
||||
c.Logger.Printf("[WARN] agent: Check '%v': Get error while reading body: %s", c.CheckID, err)
|
||||
c.Logger.Printf("[WARN] agent: Check %q error while reading body: %s", c.CheckID, err)
|
||||
}
|
||||
|
||||
// Format the response body
|
||||
|
@ -422,19 +416,19 @@ func (c *CheckHTTP) check() {
|
|||
|
||||
if resp.StatusCode >= 200 && resp.StatusCode <= 299 {
|
||||
// PASSING (2xx)
|
||||
c.Logger.Printf("[DEBUG] agent: Check '%v' is passing", c.CheckID)
|
||||
c.Logger.Printf("[DEBUG] agent: Check %q is passing", c.CheckID)
|
||||
c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, result)
|
||||
|
||||
} else if resp.StatusCode == 429 {
|
||||
// WARNING
|
||||
// 429 Too Many Requests (RFC 6585)
|
||||
// The user has sent too many requests in a given amount of time.
|
||||
c.Logger.Printf("[WARN] agent: Check '%v' is now warning", c.CheckID)
|
||||
c.Logger.Printf("[WARN] agent: Check %q is now warning", c.CheckID)
|
||||
c.Notify.UpdateCheck(c.CheckID, api.HealthWarning, result)
|
||||
|
||||
} else {
|
||||
// CRITICAL
|
||||
c.Logger.Printf("[WARN] agent: Check '%v' is now critical", c.CheckID)
|
||||
c.Logger.Printf("[WARN] agent: Check %q is now critical", c.CheckID)
|
||||
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, result)
|
||||
}
|
||||
}
|
||||
|
@ -496,7 +490,6 @@ func (c *CheckTCP) Stop() {
|
|||
func (c *CheckTCP) run() {
|
||||
// Get the randomized initial pause time
|
||||
initialPauseTime := lib.RandomStagger(c.Interval)
|
||||
c.Logger.Printf("[DEBUG] agent: pausing %v before first socket connection of %s", initialPauseTime, c.TCP)
|
||||
next := time.After(initialPauseTime)
|
||||
for {
|
||||
select {
|
||||
|
@ -513,12 +506,12 @@ func (c *CheckTCP) run() {
|
|||
func (c *CheckTCP) check() {
|
||||
conn, err := c.dialer.Dial(`tcp`, c.TCP)
|
||||
if err != nil {
|
||||
c.Logger.Printf("[WARN] agent: socket connection failed '%s': %s", c.TCP, err)
|
||||
c.Logger.Printf("[WARN] agent: Check %q socket connection failed: %s", c.CheckID, err)
|
||||
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
|
||||
return
|
||||
}
|
||||
conn.Close()
|
||||
c.Logger.Printf("[DEBUG] agent: Check '%v' is passing", c.CheckID)
|
||||
c.Logger.Printf("[DEBUG] agent: Check %q is passing", c.CheckID)
|
||||
c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, fmt.Sprintf("TCP connect %s: Success", c.TCP))
|
||||
}
|
||||
|
||||
|
@ -585,7 +578,7 @@ func (c *CheckDocker) check() {
|
|||
var out string
|
||||
status, b, err := c.doCheck()
|
||||
if err != nil {
|
||||
c.Logger.Printf("[DEBUG] agent: Check '%s': %s", c.CheckID, err)
|
||||
c.Logger.Printf("[DEBUG] agent: Check %q: %s", c.CheckID, err)
|
||||
out = err.Error()
|
||||
} else {
|
||||
// out is already limited to CheckBufSize since we're getting a
|
||||
|
@ -595,11 +588,11 @@ func (c *CheckDocker) check() {
|
|||
if int(b.TotalWritten()) > len(out) {
|
||||
out = fmt.Sprintf("Captured %d of %d bytes\n...\n%s", len(out), b.TotalWritten(), out)
|
||||
}
|
||||
c.Logger.Printf("[DEBUG] agent: Check '%s' script '%s' output: %s", c.CheckID, c.Script, out)
|
||||
c.Logger.Printf("[TRACE] agent: Check %q output: %s", c.CheckID, out)
|
||||
}
|
||||
|
||||
if status == api.HealthCritical {
|
||||
c.Logger.Printf("[WARN] agent: Check '%v' is now critical", c.CheckID)
|
||||
c.Logger.Printf("[WARN] agent: Check %q is now critical", c.CheckID)
|
||||
}
|
||||
|
||||
c.Notify.UpdateCheck(c.CheckID, status, out)
|
||||
|
@ -632,10 +625,10 @@ func (c *CheckDocker) doCheck() (string, *circbuf.Buffer, error) {
|
|||
case 0:
|
||||
return api.HealthPassing, buf, nil
|
||||
case 1:
|
||||
c.Logger.Printf("[DEBUG] Check failed with exit code: %d", exitCode)
|
||||
c.Logger.Printf("[DEBUG] Check %q failed with exit code: %d", c.CheckID, exitCode)
|
||||
return api.HealthWarning, buf, nil
|
||||
default:
|
||||
c.Logger.Printf("[DEBUG] Check failed with exit code: %d", exitCode)
|
||||
c.Logger.Printf("[DEBUG] Check %q failed with exit code: %d", c.CheckID, exitCode)
|
||||
return api.HealthCritical, buf, nil
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue