2014-01-20 16:44:23 -10:00
|
|
|
package agent
|
|
|
|
|
|
|
|
import (
|
2016-11-03 15:17:30 -05:00
|
|
|
"crypto/tls"
|
2014-01-30 13:39:02 -08:00
|
|
|
"fmt"
|
2016-04-14 14:28:07 -07:00
|
|
|
"io"
|
2014-01-20 16:44:23 -10:00
|
|
|
"log"
|
2015-07-23 21:45:08 +10:00
|
|
|
"net"
|
2015-01-09 16:43:24 -06:00
|
|
|
"net/http"
|
2015-10-22 15:29:13 -07:00
|
|
|
"os"
|
2014-01-20 16:44:23 -10:00
|
|
|
"os/exec"
|
|
|
|
"sync"
|
|
|
|
"syscall"
|
|
|
|
"time"
|
2015-01-13 12:18:18 -08:00
|
|
|
|
|
|
|
"github.com/armon/circbuf"
|
2015-10-22 15:29:13 -07:00
|
|
|
docker "github.com/fsouza/go-dockerclient"
|
2017-04-19 16:00:11 -07:00
|
|
|
"github.com/hashicorp/consul/api"
|
2015-01-13 12:18:18 -08:00
|
|
|
"github.com/hashicorp/consul/consul/structs"
|
2016-01-29 11:42:34 -08:00
|
|
|
"github.com/hashicorp/consul/lib"
|
2016-06-06 13:19:31 -07:00
|
|
|
"github.com/hashicorp/consul/types"
|
2015-10-23 17:14:35 -07:00
|
|
|
"github.com/hashicorp/go-cleanhttp"
|
2014-01-20 16:44:23 -10:00
|
|
|
)
|
|
|
|
|
2014-04-21 14:42:42 -07:00
|
|
|
const (
	// MinInterval is the minimal interval between two checks. Do not
	// allow for an interval below this value, otherwise we risk fork
	// bombing a system.
	MinInterval = time.Second

	// CheckBufSize is the maximum size of the captured check output.
	// It prevents an enormous buffer from being captured.
	CheckBufSize = 4 << 10 // 4KB

	// UserAgent is the value of the User-Agent header for HTTP health
	// checks.
	UserAgent = "Consul Health Check"
)
|
|
|
|
|
2016-08-16 00:05:55 -07:00
|
|
|
// CheckType is used to create either the CheckMonitor or the CheckTTL.
|
|
|
|
// Five types are supported: Script, HTTP, TCP, Docker and TTL. Script, HTTP,
|
|
|
|
// Docker and TCP all require Interval. Only one of the types may to be
|
|
|
|
// provided: TTL or Script/Interval or HTTP/Interval or TCP/Interval or
|
|
|
|
// Docker/Interval.
|
2014-01-30 13:18:05 -08:00
|
|
|
type CheckType struct {
|
2017-05-15 21:49:13 +02:00
|
|
|
// fields already embedded in CheckDefinition
|
|
|
|
// Note: CheckType.CheckID == CheckDefinition.ID
|
|
|
|
|
|
|
|
CheckID types.CheckID
|
|
|
|
Name string
|
|
|
|
Status string
|
|
|
|
Notes string
|
|
|
|
|
|
|
|
// fields copied to CheckDefinition
|
|
|
|
// Update CheckDefinition when adding fields here
|
|
|
|
|
2015-10-22 15:29:13 -07:00
|
|
|
Script string
|
|
|
|
HTTP string
|
|
|
|
TCP string
|
|
|
|
Interval time.Duration
|
2015-11-18 07:40:02 -08:00
|
|
|
DockerContainerID string
|
2015-10-22 15:29:13 -07:00
|
|
|
Shell string
|
2016-11-03 15:17:30 -05:00
|
|
|
TLSSkipVerify bool
|
2017-05-15 21:49:13 +02:00
|
|
|
Timeout time.Duration
|
|
|
|
TTL time.Duration
|
2014-11-06 18:24:04 -08:00
|
|
|
|
2016-08-16 00:05:55 -07:00
|
|
|
// DeregisterCriticalServiceAfter, if >0, will cause the associated
|
|
|
|
// service, if any, to be deregistered if this check is critical for
|
|
|
|
// longer than this duration.
|
|
|
|
DeregisterCriticalServiceAfter time.Duration
|
2014-01-30 13:18:05 -08:00
|
|
|
}
|
2015-01-13 17:52:17 -08:00
|
|
|
// CheckTypes is a list of CheckType pointers.
type CheckTypes []*CheckType
|
2014-01-30 13:18:05 -08:00
|
|
|
|
|
|
|
// Valid checks if the CheckType is valid
|
|
|
|
func (c *CheckType) Valid() bool {
|
2015-10-22 15:29:13 -07:00
|
|
|
return c.IsTTL() || c.IsMonitor() || c.IsHTTP() || c.IsTCP() || c.IsDocker()
|
2014-01-30 13:18:05 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
// IsTTL checks if this is a TTL type
|
|
|
|
func (c *CheckType) IsTTL() bool {
|
|
|
|
return c.TTL != 0
|
|
|
|
}
|
|
|
|
|
|
|
|
// IsMonitor checks if this is a Monitor type
|
|
|
|
func (c *CheckType) IsMonitor() bool {
|
2015-11-18 07:40:02 -08:00
|
|
|
return c.Script != "" && c.DockerContainerID == "" && c.Interval != 0
|
2014-01-30 13:18:05 -08:00
|
|
|
}
|
|
|
|
|
2015-01-09 16:43:24 -06:00
|
|
|
// IsHTTP checks if this is a HTTP type
|
|
|
|
func (c *CheckType) IsHTTP() bool {
|
|
|
|
return c.HTTP != "" && c.Interval != 0
|
|
|
|
}
|
|
|
|
|
2015-07-23 21:45:08 +10:00
|
|
|
// IsTCP checks if this is a TCP type
|
|
|
|
func (c *CheckType) IsTCP() bool {
|
|
|
|
return c.TCP != "" && c.Interval != 0
|
|
|
|
}
|
|
|
|
|
2017-04-20 20:14:10 -07:00
|
|
|
// IsDocker returns true when checking a docker container.
|
2015-10-22 15:29:13 -07:00
|
|
|
func (c *CheckType) IsDocker() bool {
|
2015-11-18 07:40:02 -08:00
|
|
|
return c.DockerContainerID != "" && c.Script != "" && c.Interval != 0
|
2015-10-22 15:29:13 -07:00
|
|
|
}
|
|
|
|
|
2014-01-20 16:44:23 -10:00
|
|
|
// CheckNotifier interface is used by the CheckMonitor
|
|
|
|
// to notify when a check has a status update. The update
|
|
|
|
// should take care to be idempotent.
|
|
|
|
type CheckNotifier interface {
|
2016-06-06 13:19:31 -07:00
|
|
|
UpdateCheck(checkID types.CheckID, status, output string)
|
2014-01-20 16:44:23 -10:00
|
|
|
}
|
|
|
|
|
|
|
|
// CheckMonitor is used to periodically invoke a script to
|
|
|
|
// determine the health of a given check. It is compatible with
|
|
|
|
// nagios plugins and expects the output in the same format.
|
|
|
|
type CheckMonitor struct {
|
|
|
|
Notify CheckNotifier
|
2016-06-06 13:19:31 -07:00
|
|
|
CheckID types.CheckID
|
2014-01-20 16:44:23 -10:00
|
|
|
Script string
|
|
|
|
Interval time.Duration
|
2016-02-25 19:18:20 -08:00
|
|
|
Timeout time.Duration
|
2014-01-20 16:44:23 -10:00
|
|
|
Logger *log.Logger
|
|
|
|
|
|
|
|
stop bool
|
|
|
|
stopCh chan struct{}
|
|
|
|
stopLock sync.Mutex
|
|
|
|
}
|
|
|
|
|
|
|
|
// Start is used to start a check monitor.
|
|
|
|
// Monitor runs until stop is called
|
|
|
|
func (c *CheckMonitor) Start() {
|
|
|
|
c.stopLock.Lock()
|
|
|
|
defer c.stopLock.Unlock()
|
|
|
|
c.stop = false
|
|
|
|
c.stopCh = make(chan struct{})
|
|
|
|
go c.run()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Stop is used to stop a check monitor.
|
|
|
|
func (c *CheckMonitor) Stop() {
|
|
|
|
c.stopLock.Lock()
|
|
|
|
defer c.stopLock.Unlock()
|
|
|
|
if !c.stop {
|
|
|
|
c.stop = true
|
|
|
|
close(c.stopCh)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// run is invoked by a goroutine to run until Stop() is called
|
|
|
|
func (c *CheckMonitor) run() {
|
2014-12-17 21:44:12 -05:00
|
|
|
// Get the randomized initial pause time
|
2016-01-29 11:42:34 -08:00
|
|
|
initialPauseTime := lib.RandomStagger(c.Interval)
|
2014-12-18 09:00:51 -05:00
|
|
|
c.Logger.Printf("[DEBUG] agent: pausing %v before first invocation of %s", initialPauseTime, c.Script)
|
2014-12-17 21:44:12 -05:00
|
|
|
next := time.After(initialPauseTime)
|
2014-01-20 16:46:01 -10:00
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-next:
|
|
|
|
c.check()
|
|
|
|
next = time.After(c.Interval)
|
|
|
|
case <-c.stopCh:
|
|
|
|
return
|
|
|
|
}
|
2014-01-20 16:44:23 -10:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// check is invoked periodically to perform the script check
|
|
|
|
func (c *CheckMonitor) check() {
|
|
|
|
// Create the command
|
2014-08-21 14:28:16 -07:00
|
|
|
cmd, err := ExecScript(c.Script)
|
|
|
|
if err != nil {
|
|
|
|
c.Logger.Printf("[ERR] agent: failed to setup invoke '%s': %s", c.Script, err)
|
2017-04-19 16:00:11 -07:00
|
|
|
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
|
2014-08-21 14:28:16 -07:00
|
|
|
return
|
|
|
|
}
|
2014-01-20 16:44:23 -10:00
|
|
|
|
|
|
|
// Collect the output
|
2014-04-29 15:28:56 -07:00
|
|
|
output, _ := circbuf.NewBuffer(CheckBufSize)
|
|
|
|
cmd.Stdout = output
|
|
|
|
cmd.Stderr = output
|
2014-01-20 16:44:23 -10:00
|
|
|
|
|
|
|
// Start the check
|
|
|
|
if err := cmd.Start(); err != nil {
|
|
|
|
c.Logger.Printf("[ERR] agent: failed to invoke '%s': %s", c.Script, err)
|
2017-04-19 16:00:11 -07:00
|
|
|
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
|
2014-01-20 16:44:23 -10:00
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// Wait for the check to complete
|
2014-01-30 13:39:02 -08:00
|
|
|
errCh := make(chan error, 2)
|
|
|
|
go func() {
|
|
|
|
errCh <- cmd.Wait()
|
|
|
|
}()
|
|
|
|
go func() {
|
2016-02-25 19:18:20 -08:00
|
|
|
if c.Timeout > 0 {
|
|
|
|
time.Sleep(c.Timeout)
|
|
|
|
} else {
|
|
|
|
time.Sleep(30 * time.Second)
|
|
|
|
}
|
2014-01-30 13:39:02 -08:00
|
|
|
errCh <- fmt.Errorf("Timed out running check '%s'", c.Script)
|
|
|
|
}()
|
2014-08-21 14:28:16 -07:00
|
|
|
err = <-errCh
|
2014-01-30 13:39:02 -08:00
|
|
|
|
2014-04-29 15:28:56 -07:00
|
|
|
// Get the output, add a message about truncation
|
2014-04-21 16:20:22 -07:00
|
|
|
outputStr := string(output.Bytes())
|
2014-04-29 15:28:56 -07:00
|
|
|
if output.TotalWritten() > output.Size() {
|
|
|
|
outputStr = fmt.Sprintf("Captured %d of %d bytes\n...\n%s",
|
|
|
|
output.Size(), output.TotalWritten(), outputStr)
|
|
|
|
}
|
|
|
|
|
2017-01-05 08:28:25 +00:00
|
|
|
c.Logger.Printf("[DEBUG] agent: Check '%s' script '%s' output: %s",
|
2014-04-21 16:20:22 -07:00
|
|
|
c.CheckID, c.Script, outputStr)
|
2014-01-20 16:44:23 -10:00
|
|
|
|
|
|
|
// Check if the check passed
|
|
|
|
if err == nil {
|
2015-01-13 12:18:18 -08:00
|
|
|
c.Logger.Printf("[DEBUG] agent: Check '%v' is passing", c.CheckID)
|
2017-04-19 16:00:11 -07:00
|
|
|
c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, outputStr)
|
2014-01-20 16:44:23 -10:00
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// If the exit code is 1, set check as warning
|
|
|
|
exitErr, ok := err.(*exec.ExitError)
|
|
|
|
if ok {
|
|
|
|
if status, ok := exitErr.Sys().(syscall.WaitStatus); ok {
|
|
|
|
code := status.ExitStatus()
|
|
|
|
if code == 1 {
|
2015-01-13 12:18:18 -08:00
|
|
|
c.Logger.Printf("[WARN] agent: Check '%v' is now warning", c.CheckID)
|
2017-04-19 16:00:11 -07:00
|
|
|
c.Notify.UpdateCheck(c.CheckID, api.HealthWarning, outputStr)
|
2014-01-20 16:44:23 -10:00
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set the health as critical
|
2015-01-13 12:18:18 -08:00
|
|
|
c.Logger.Printf("[WARN] agent: Check '%v' is now critical", c.CheckID)
|
2017-04-19 16:00:11 -07:00
|
|
|
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, outputStr)
|
2014-01-20 16:44:23 -10:00
|
|
|
}
|
2014-01-20 17:12:40 -10:00
|
|
|
|
|
|
|
// CheckTTL is used to apply a TTL to check status,
|
|
|
|
// and enables clients to set the status of a check
|
|
|
|
// but upon the TTL expiring, the check status is
|
|
|
|
// automatically set to critical.
|
|
|
|
type CheckTTL struct {
|
|
|
|
Notify CheckNotifier
|
2016-06-06 13:19:31 -07:00
|
|
|
CheckID types.CheckID
|
2014-01-20 17:12:40 -10:00
|
|
|
TTL time.Duration
|
|
|
|
Logger *log.Logger
|
|
|
|
|
|
|
|
timer *time.Timer
|
|
|
|
|
2016-03-02 17:58:01 -08:00
|
|
|
lastOutput string
|
|
|
|
lastOutputLock sync.RWMutex
|
|
|
|
|
2014-01-20 17:12:40 -10:00
|
|
|
stop bool
|
|
|
|
stopCh chan struct{}
|
|
|
|
stopLock sync.Mutex
|
|
|
|
}
|
|
|
|
|
|
|
|
// Start is used to start a check ttl, runs until Stop()
|
|
|
|
func (c *CheckTTL) Start() {
|
|
|
|
c.stopLock.Lock()
|
|
|
|
defer c.stopLock.Unlock()
|
|
|
|
c.stop = false
|
|
|
|
c.stopCh = make(chan struct{})
|
|
|
|
c.timer = time.NewTimer(c.TTL)
|
|
|
|
go c.run()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Stop is used to stop a check ttl.
|
|
|
|
func (c *CheckTTL) Stop() {
|
|
|
|
c.stopLock.Lock()
|
|
|
|
defer c.stopLock.Unlock()
|
|
|
|
if !c.stop {
|
|
|
|
c.timer.Stop()
|
|
|
|
c.stop = true
|
|
|
|
close(c.stopCh)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// run is used to handle TTL expiration and to update the check status
|
|
|
|
func (c *CheckTTL) run() {
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-c.timer.C:
|
2015-01-13 12:18:18 -08:00
|
|
|
c.Logger.Printf("[WARN] agent: Check '%v' missed TTL, is now critical",
|
2014-01-20 17:12:40 -10:00
|
|
|
c.CheckID)
|
2017-04-19 16:00:11 -07:00
|
|
|
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, c.getExpiredOutput())
|
2014-01-20 17:12:40 -10:00
|
|
|
|
|
|
|
case <-c.stopCh:
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-03-02 17:58:01 -08:00
|
|
|
// getExpiredOutput formats the output for the case when the TTL is expired.
|
|
|
|
func (c *CheckTTL) getExpiredOutput() string {
|
|
|
|
c.lastOutputLock.RLock()
|
|
|
|
defer c.lastOutputLock.RUnlock()
|
|
|
|
|
|
|
|
const prefix = "TTL expired"
|
|
|
|
if c.lastOutput == "" {
|
2016-03-02 19:47:00 -08:00
|
|
|
return prefix
|
2016-03-02 17:58:01 -08:00
|
|
|
}
|
|
|
|
|
2016-03-02 19:47:00 -08:00
|
|
|
return fmt.Sprintf("%s (last output before timeout follows): %s", prefix, c.lastOutput)
|
2016-03-02 17:58:01 -08:00
|
|
|
}
|
|
|
|
|
2014-01-20 17:12:40 -10:00
|
|
|
// SetStatus is used to update the status of the check,
|
|
|
|
// and to renew the TTL. If expired, TTL is restarted.
|
2014-04-21 16:20:22 -07:00
|
|
|
func (c *CheckTTL) SetStatus(status, output string) {
|
2015-01-13 12:18:18 -08:00
|
|
|
c.Logger.Printf("[DEBUG] agent: Check '%v' status is now %v",
|
2014-01-20 17:12:40 -10:00
|
|
|
c.CheckID, status)
|
2014-04-21 16:20:22 -07:00
|
|
|
c.Notify.UpdateCheck(c.CheckID, status, output)
|
2016-03-02 17:58:01 -08:00
|
|
|
|
|
|
|
// Store the last output so we can retain it if the TTL expires.
|
|
|
|
c.lastOutputLock.Lock()
|
|
|
|
c.lastOutput = output
|
|
|
|
c.lastOutputLock.Unlock()
|
|
|
|
|
2014-01-20 17:12:40 -10:00
|
|
|
c.timer.Reset(c.TTL)
|
|
|
|
}
|
2014-11-29 12:25:01 -08:00
|
|
|
|
|
|
|
// persistedCheck is used to serialize a check and write it to disk
|
|
|
|
// so that it may be restored later on.
|
|
|
|
type persistedCheck struct {
|
|
|
|
Check *structs.HealthCheck
|
|
|
|
ChkType *CheckType
|
2015-04-27 19:01:02 -07:00
|
|
|
Token string
|
2014-11-29 12:25:01 -08:00
|
|
|
}
|
2015-01-09 16:43:24 -06:00
|
|
|
|
2015-06-05 16:17:07 -07:00
|
|
|
// persistedCheckState is used to persist the current state of a given
|
|
|
|
// check. This is different from the check definition, and includes an
|
|
|
|
// expiration timestamp which is used to determine staleness on later
|
|
|
|
// agent restarts.
|
|
|
|
type persistedCheckState struct {
|
2016-06-06 13:19:31 -07:00
|
|
|
CheckID types.CheckID
|
2015-06-05 16:17:07 -07:00
|
|
|
Output string
|
|
|
|
Status string
|
|
|
|
Expires int64
|
|
|
|
}
|
|
|
|
|
2015-01-09 16:43:24 -06:00
|
|
|
// CheckHTTP is used to periodically make an HTTP request to
|
|
|
|
// determine the health of a given check.
|
2015-01-13 12:18:18 -08:00
|
|
|
// The check is passing if the response code is 2XX.
|
|
|
|
// The check is warning if the response code is 429.
|
2015-01-09 16:43:24 -06:00
|
|
|
// The check is critical if the response code is anything else
|
|
|
|
// or if the request returns an error
|
|
|
|
type CheckHTTP struct {
|
2016-11-03 15:17:30 -05:00
|
|
|
Notify CheckNotifier
|
|
|
|
CheckID types.CheckID
|
|
|
|
HTTP string
|
|
|
|
Interval time.Duration
|
|
|
|
Timeout time.Duration
|
|
|
|
Logger *log.Logger
|
|
|
|
TLSSkipVerify bool
|
2015-01-09 16:43:24 -06:00
|
|
|
|
|
|
|
httpClient *http.Client
|
|
|
|
stop bool
|
|
|
|
stopCh chan struct{}
|
|
|
|
stopLock sync.Mutex
|
|
|
|
}
|
|
|
|
|
|
|
|
// Start is used to start an HTTP check.
|
|
|
|
// The check runs until stop is called
|
|
|
|
func (c *CheckHTTP) Start() {
|
|
|
|
c.stopLock.Lock()
|
|
|
|
defer c.stopLock.Unlock()
|
2015-01-12 23:58:25 +00:00
|
|
|
|
|
|
|
if c.httpClient == nil {
|
2015-03-15 13:30:50 -07:00
|
|
|
// Create the transport. We disable HTTP Keep-Alive's to prevent
|
|
|
|
// failing checks due to the keepalive interval.
|
2015-10-22 10:47:50 -04:00
|
|
|
trans := cleanhttp.DefaultTransport()
|
2015-03-15 13:30:50 -07:00
|
|
|
trans.DisableKeepAlives = true
|
|
|
|
|
2016-11-03 15:17:30 -05:00
|
|
|
// Skip SSL certificate verification if TLSSkipVerify is true
|
|
|
|
if trans.TLSClientConfig == nil {
|
|
|
|
trans.TLSClientConfig = &tls.Config{
|
|
|
|
InsecureSkipVerify: c.TLSSkipVerify,
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
trans.TLSClientConfig.InsecureSkipVerify = c.TLSSkipVerify
|
|
|
|
}
|
|
|
|
|
2015-03-15 13:30:50 -07:00
|
|
|
// Create the HTTP client.
|
|
|
|
c.httpClient = &http.Client{
|
|
|
|
Timeout: 10 * time.Second,
|
2015-10-22 10:47:50 -04:00
|
|
|
Transport: trans,
|
2015-03-15 13:30:50 -07:00
|
|
|
}
|
|
|
|
|
2015-01-12 23:58:25 +00:00
|
|
|
// For long (>10s) interval checks the http timeout is 10s, otherwise the
|
|
|
|
// timeout is the interval. This means that a check *should* return
|
|
|
|
// before the next check begins.
|
2015-01-29 13:37:48 +07:00
|
|
|
if c.Timeout > 0 && c.Timeout < c.Interval {
|
2015-03-15 13:30:50 -07:00
|
|
|
c.httpClient.Timeout = c.Timeout
|
2015-01-29 13:37:48 +07:00
|
|
|
} else if c.Interval < 10*time.Second {
|
2015-03-15 13:30:50 -07:00
|
|
|
c.httpClient.Timeout = c.Interval
|
2015-01-12 23:58:25 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-09 16:43:24 -06:00
|
|
|
c.stop = false
|
|
|
|
c.stopCh = make(chan struct{})
|
|
|
|
go c.run()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Stop is used to stop an HTTP check.
|
|
|
|
func (c *CheckHTTP) Stop() {
|
|
|
|
c.stopLock.Lock()
|
|
|
|
defer c.stopLock.Unlock()
|
|
|
|
if !c.stop {
|
|
|
|
c.stop = true
|
|
|
|
close(c.stopCh)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// run is invoked by a goroutine to run until Stop() is called
|
|
|
|
func (c *CheckHTTP) run() {
|
|
|
|
// Get the randomized initial pause time
|
2016-01-29 11:42:34 -08:00
|
|
|
initialPauseTime := lib.RandomStagger(c.Interval)
|
2015-01-09 16:43:24 -06:00
|
|
|
c.Logger.Printf("[DEBUG] agent: pausing %v before first HTTP request of %s", initialPauseTime, c.HTTP)
|
|
|
|
next := time.After(initialPauseTime)
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-next:
|
|
|
|
c.check()
|
|
|
|
next = time.After(c.Interval)
|
|
|
|
case <-c.stopCh:
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// check is invoked periodically to perform the HTTP check
|
|
|
|
func (c *CheckHTTP) check() {
|
2015-05-18 19:12:10 +02:00
|
|
|
req, err := http.NewRequest("GET", c.HTTP, nil)
|
|
|
|
if err != nil {
|
|
|
|
c.Logger.Printf("[WARN] agent: http request failed '%s': %s", c.HTTP, err)
|
2017-04-19 16:00:11 -07:00
|
|
|
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
|
2015-05-18 19:12:10 +02:00
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2017-04-20 17:02:42 -07:00
|
|
|
req.Header.Set("User-Agent", UserAgent)
|
2016-03-09 21:11:20 -08:00
|
|
|
req.Header.Set("Accept", "text/plain, text/*, */*")
|
2015-05-18 19:12:10 +02:00
|
|
|
|
|
|
|
resp, err := c.httpClient.Do(req)
|
2015-01-09 16:43:24 -06:00
|
|
|
if err != nil {
|
|
|
|
c.Logger.Printf("[WARN] agent: http request failed '%s': %s", c.HTTP, err)
|
2017-04-19 16:00:11 -07:00
|
|
|
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
|
2015-01-09 16:43:24 -06:00
|
|
|
return
|
|
|
|
}
|
2015-01-12 22:35:28 +00:00
|
|
|
defer resp.Body.Close()
|
2015-01-09 16:43:24 -06:00
|
|
|
|
2016-04-14 14:28:07 -07:00
|
|
|
// Read the response into a circular buffer to limit the size
|
|
|
|
output, _ := circbuf.NewBuffer(CheckBufSize)
|
|
|
|
if _, err := io.Copy(output, resp.Body); err != nil {
|
2017-01-05 08:28:25 +00:00
|
|
|
c.Logger.Printf("[WARN] agent: Check '%v': Get error while reading body: %s", c.CheckID, err)
|
2015-01-13 12:18:18 -08:00
|
|
|
}
|
2016-04-14 14:28:07 -07:00
|
|
|
|
|
|
|
// Format the response body
|
|
|
|
result := fmt.Sprintf("HTTP GET %s: %s Output: %s", c.HTTP, resp.Status, output.String())
|
2015-01-13 12:18:18 -08:00
|
|
|
|
2015-01-12 21:58:57 +00:00
|
|
|
if resp.StatusCode >= 200 && resp.StatusCode <= 299 {
|
|
|
|
// PASSING (2xx)
|
2017-01-05 08:28:25 +00:00
|
|
|
c.Logger.Printf("[DEBUG] agent: Check '%v' is passing", c.CheckID)
|
2017-04-19 16:00:11 -07:00
|
|
|
c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, result)
|
2015-01-09 16:43:24 -06:00
|
|
|
|
2015-01-12 21:58:57 +00:00
|
|
|
} else if resp.StatusCode == 429 {
|
|
|
|
// WARNING
|
|
|
|
// 429 Too Many Requests (RFC 6585)
|
|
|
|
// The user has sent too many requests in a given amount of time.
|
2017-01-05 08:28:25 +00:00
|
|
|
c.Logger.Printf("[WARN] agent: Check '%v' is now warning", c.CheckID)
|
2017-04-19 16:00:11 -07:00
|
|
|
c.Notify.UpdateCheck(c.CheckID, api.HealthWarning, result)
|
2015-01-09 16:43:24 -06:00
|
|
|
|
2015-01-12 21:58:57 +00:00
|
|
|
} else {
|
|
|
|
// CRITICAL
|
2017-01-05 08:28:25 +00:00
|
|
|
c.Logger.Printf("[WARN] agent: Check '%v' is now critical", c.CheckID)
|
2017-04-19 16:00:11 -07:00
|
|
|
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, result)
|
2015-01-09 16:43:24 -06:00
|
|
|
}
|
|
|
|
}
|
2015-07-23 21:45:08 +10:00
|
|
|
|
|
|
|
// CheckTCP is used to periodically make an TCP/UDP connection to
|
|
|
|
// determine the health of a given check.
|
|
|
|
// The check is passing if the connection succeeds
|
|
|
|
// The check is critical if the connection returns an error
|
|
|
|
type CheckTCP struct {
|
|
|
|
Notify CheckNotifier
|
2016-06-06 13:19:31 -07:00
|
|
|
CheckID types.CheckID
|
2015-07-23 21:45:08 +10:00
|
|
|
TCP string
|
|
|
|
Interval time.Duration
|
|
|
|
Timeout time.Duration
|
|
|
|
Logger *log.Logger
|
|
|
|
|
|
|
|
dialer *net.Dialer
|
|
|
|
stop bool
|
|
|
|
stopCh chan struct{}
|
|
|
|
stopLock sync.Mutex
|
|
|
|
}
|
|
|
|
|
|
|
|
// Start is used to start a TCP check.
|
|
|
|
// The check runs until stop is called
|
|
|
|
func (c *CheckTCP) Start() {
|
|
|
|
c.stopLock.Lock()
|
|
|
|
defer c.stopLock.Unlock()
|
|
|
|
|
|
|
|
if c.dialer == nil {
|
|
|
|
// Create the socket dialer
|
|
|
|
c.dialer = &net.Dialer{DualStack: true}
|
|
|
|
|
|
|
|
// For long (>10s) interval checks the socket timeout is 10s, otherwise
|
|
|
|
// the timeout is the interval. This means that a check *should* return
|
|
|
|
// before the next check begins.
|
|
|
|
if c.Timeout > 0 && c.Timeout < c.Interval {
|
|
|
|
c.dialer.Timeout = c.Timeout
|
|
|
|
} else if c.Interval < 10*time.Second {
|
|
|
|
c.dialer.Timeout = c.Interval
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
c.stop = false
|
|
|
|
c.stopCh = make(chan struct{})
|
|
|
|
go c.run()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Stop is used to stop a TCP check.
|
|
|
|
func (c *CheckTCP) Stop() {
|
|
|
|
c.stopLock.Lock()
|
|
|
|
defer c.stopLock.Unlock()
|
|
|
|
if !c.stop {
|
|
|
|
c.stop = true
|
|
|
|
close(c.stopCh)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// run is invoked by a goroutine to run until Stop() is called
|
|
|
|
func (c *CheckTCP) run() {
|
|
|
|
// Get the randomized initial pause time
|
2016-01-29 11:42:34 -08:00
|
|
|
initialPauseTime := lib.RandomStagger(c.Interval)
|
2015-07-23 21:45:08 +10:00
|
|
|
c.Logger.Printf("[DEBUG] agent: pausing %v before first socket connection of %s", initialPauseTime, c.TCP)
|
|
|
|
next := time.After(initialPauseTime)
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-next:
|
|
|
|
c.check()
|
|
|
|
next = time.After(c.Interval)
|
|
|
|
case <-c.stopCh:
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// check is invoked periodically to perform the TCP check
|
|
|
|
func (c *CheckTCP) check() {
|
|
|
|
conn, err := c.dialer.Dial(`tcp`, c.TCP)
|
|
|
|
if err != nil {
|
|
|
|
c.Logger.Printf("[WARN] agent: socket connection failed '%s': %s", c.TCP, err)
|
2017-04-19 16:00:11 -07:00
|
|
|
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
|
2015-07-23 21:45:08 +10:00
|
|
|
return
|
|
|
|
}
|
|
|
|
conn.Close()
|
2017-01-05 08:28:25 +00:00
|
|
|
c.Logger.Printf("[DEBUG] agent: Check '%v' is passing", c.CheckID)
|
2017-04-19 16:00:11 -07:00
|
|
|
c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, fmt.Sprintf("TCP connect %s: Success", c.TCP))
|
2015-07-23 21:45:08 +10:00
|
|
|
}
|
2015-10-22 15:29:13 -07:00
|
|
|
|
2017-04-20 20:14:10 -07:00
|
|
|
// DockerClient defines an interface for a docker client
|
|
|
|
// which is used for injecting a fake client during tests.
|
2015-10-26 12:59:40 -07:00
|
|
|
type DockerClient interface {
|
|
|
|
CreateExec(docker.CreateExecOptions) (*docker.Exec, error)
|
|
|
|
StartExec(string, docker.StartExecOptions) error
|
|
|
|
InspectExec(string) (*docker.ExecInspect, error)
|
|
|
|
}
|
|
|
|
|
2015-10-22 15:29:13 -07:00
|
|
|
// CheckDocker is used to periodically invoke a script to
|
|
|
|
// determine the health of an application running inside a
|
|
|
|
// Docker Container. We assume that the script is compatible
|
|
|
|
// with nagios plugins and expects the output in the same format.
|
|
|
|
type CheckDocker struct {
|
|
|
|
Notify CheckNotifier
|
2016-06-06 13:19:31 -07:00
|
|
|
CheckID types.CheckID
|
2015-10-22 15:29:13 -07:00
|
|
|
Script string
|
2015-11-18 07:40:02 -08:00
|
|
|
DockerContainerID string
|
2015-10-22 15:29:13 -07:00
|
|
|
Shell string
|
|
|
|
Interval time.Duration
|
|
|
|
Logger *log.Logger
|
|
|
|
|
2015-10-26 12:59:40 -07:00
|
|
|
dockerClient DockerClient
|
2015-10-26 09:44:59 -07:00
|
|
|
cmd []string
|
|
|
|
stop bool
|
|
|
|
stopCh chan struct{}
|
|
|
|
stopLock sync.Mutex
|
2015-10-22 15:29:13 -07:00
|
|
|
}
|
|
|
|
|
2017-04-20 20:14:10 -07:00
|
|
|
// Init initializes the Docker Client
|
2015-10-26 16:45:12 -07:00
|
|
|
func (c *CheckDocker) Init() error {
|
|
|
|
var err error
|
|
|
|
c.dockerClient, err = docker.NewClientFromEnv()
|
|
|
|
if err != nil {
|
2015-10-29 12:45:48 -07:00
|
|
|
c.Logger.Printf("[DEBUG] Error creating the Docker client: %s", err.Error())
|
2015-10-26 16:45:12 -07:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2015-10-22 15:29:13 -07:00
|
|
|
// Start is used to start checks.
|
|
|
|
// Docker Checks runs until stop is called
|
|
|
|
func (c *CheckDocker) Start() {
|
|
|
|
c.stopLock.Lock()
|
|
|
|
defer c.stopLock.Unlock()
|
|
|
|
|
|
|
|
//figure out the shell
|
|
|
|
if c.Shell == "" {
|
2015-10-26 15:00:34 -07:00
|
|
|
c.Shell = shell()
|
2015-10-22 15:29:13 -07:00
|
|
|
}
|
|
|
|
|
2015-10-26 09:44:59 -07:00
|
|
|
c.cmd = []string{c.Shell, "-c", c.Script}
|
2015-10-22 15:29:13 -07:00
|
|
|
|
|
|
|
c.stop = false
|
|
|
|
c.stopCh = make(chan struct{})
|
|
|
|
go c.run()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Stop is used to stop a docker check.
|
|
|
|
func (c *CheckDocker) Stop() {
|
|
|
|
c.stopLock.Lock()
|
|
|
|
defer c.stopLock.Unlock()
|
|
|
|
if !c.stop {
|
|
|
|
c.stop = true
|
|
|
|
close(c.stopCh)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// run is invoked by a goroutine to run until Stop() is called
|
|
|
|
func (c *CheckDocker) run() {
|
|
|
|
// Get the randomized initial pause time
|
2016-01-29 11:42:34 -08:00
|
|
|
initialPauseTime := lib.RandomStagger(c.Interval)
|
2015-11-18 07:40:02 -08:00
|
|
|
c.Logger.Printf("[DEBUG] agent: pausing %v before first invocation of %s -c %s in container %s", initialPauseTime, c.Shell, c.Script, c.DockerContainerID)
|
2015-10-22 15:29:13 -07:00
|
|
|
next := time.After(initialPauseTime)
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-next:
|
|
|
|
c.check()
|
|
|
|
next = time.After(c.Interval)
|
|
|
|
case <-c.stopCh:
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *CheckDocker) check() {
|
2015-10-26 09:44:59 -07:00
|
|
|
//Set up the Exec since
|
|
|
|
execOpts := docker.CreateExecOptions{
|
|
|
|
AttachStdin: false,
|
|
|
|
AttachStdout: true,
|
|
|
|
AttachStderr: true,
|
|
|
|
Tty: false,
|
|
|
|
Cmd: c.cmd,
|
2015-11-18 07:40:02 -08:00
|
|
|
Container: c.DockerContainerID,
|
2015-10-26 09:44:59 -07:00
|
|
|
}
|
|
|
|
var (
|
|
|
|
exec *docker.Exec
|
|
|
|
err error
|
|
|
|
)
|
2015-10-26 15:00:34 -07:00
|
|
|
if exec, err = c.dockerClient.CreateExec(execOpts); err != nil {
|
2015-10-26 09:44:59 -07:00
|
|
|
c.Logger.Printf("[DEBUG] agent: Error while creating Exec: %s", err.Error())
|
2017-04-19 16:00:11 -07:00
|
|
|
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, fmt.Sprintf("Unable to create Exec, error: %s", err.Error()))
|
2015-10-26 11:16:11 -07:00
|
|
|
return
|
2015-10-26 09:44:59 -07:00
|
|
|
}
|
|
|
|
|
2015-10-26 15:19:35 -07:00
|
|
|
// Collect the output
|
|
|
|
output, _ := circbuf.NewBuffer(CheckBufSize)
|
|
|
|
|
|
|
|
err = c.dockerClient.StartExec(exec.ID, docker.StartExecOptions{Detach: false, Tty: false, OutputStream: output, ErrorStream: output})
|
2015-10-22 15:29:13 -07:00
|
|
|
if err != nil {
|
|
|
|
c.Logger.Printf("[DEBUG] Error in executing health checks: %s", err.Error())
|
2017-04-19 16:00:11 -07:00
|
|
|
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, fmt.Sprintf("Unable to start Exec: %s", err.Error()))
|
2015-10-22 15:29:13 -07:00
|
|
|
return
|
|
|
|
}
|
2015-10-26 09:44:59 -07:00
|
|
|
|
2015-10-26 15:19:35 -07:00
|
|
|
// Get the output, add a message about truncation
|
|
|
|
outputStr := string(output.Bytes())
|
|
|
|
if output.TotalWritten() > output.Size() {
|
|
|
|
outputStr = fmt.Sprintf("Captured %d of %d bytes\n...\n%s",
|
|
|
|
output.Size(), output.TotalWritten(), outputStr)
|
|
|
|
}
|
|
|
|
|
2017-01-05 08:28:25 +00:00
|
|
|
c.Logger.Printf("[DEBUG] agent: Check '%s' script '%s' output: %s",
|
2015-10-26 15:19:35 -07:00
|
|
|
c.CheckID, c.Script, outputStr)
|
|
|
|
|
2015-10-26 09:44:59 -07:00
|
|
|
execInfo, err := c.dockerClient.InspectExec(exec.ID)
|
|
|
|
if err != nil {
|
|
|
|
c.Logger.Printf("[DEBUG] Error in inspecting check result : %s", err.Error())
|
2017-04-19 16:00:11 -07:00
|
|
|
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, fmt.Sprintf("Unable to inspect Exec: %s", err.Error()))
|
2015-10-26 09:44:59 -07:00
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2015-10-26 15:19:35 -07:00
|
|
|
// Sets the status of the check to healthy if exit code is 0
|
2015-10-26 09:44:59 -07:00
|
|
|
if execInfo.ExitCode == 0 {
|
2017-04-19 16:00:11 -07:00
|
|
|
c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, outputStr)
|
2015-10-26 15:19:35 -07:00
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set the status of the check to Warning if exit code is 1
|
|
|
|
if execInfo.ExitCode == 1 {
|
2015-10-26 10:35:51 -07:00
|
|
|
c.Logger.Printf("[DEBUG] Check failed with exit code: %d", execInfo.ExitCode)
|
2017-04-19 16:00:11 -07:00
|
|
|
c.Notify.UpdateCheck(c.CheckID, api.HealthWarning, outputStr)
|
2015-10-26 15:19:35 -07:00
|
|
|
return
|
2015-10-26 09:44:59 -07:00
|
|
|
}
|
|
|
|
|
2015-10-26 15:19:35 -07:00
|
|
|
// Set the health as critical
|
|
|
|
c.Logger.Printf("[WARN] agent: Check '%v' is now critical", c.CheckID)
|
2017-04-19 16:00:11 -07:00
|
|
|
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, outputStr)
|
2015-10-22 15:29:13 -07:00
|
|
|
}
|
2015-10-26 15:00:34 -07:00
|
|
|
|
|
|
|
// shell returns the shell to use for checks: the value of the SHELL
// environment variable when it is non-empty, otherwise "/bin/sh".
func shell() string {
	const fallback = "/bin/sh"

	fromEnv := os.Getenv("SHELL")
	if fromEnv == "" {
		return fallback
	}
	return fromEnv
}
|