consul/command/agent/check.go
Frank Schroeder eddb1af603 Remove duplicate constants
This patch removes duplicate internal copies of constants in the structs
package which are also defined in the api package. The api.KVOp type
with all its values for the TXN endpoint and the api.HealthXXX constants
are now used throughout the codebase.

This resulted in some circular dependencies in the testutil package
which have been resolved by copying code and constants and moving the
WaitForLeader function into a separate testrpc package.
2017-04-20 09:54:49 -07:00

710 lines
19 KiB
Go

package agent
import (
"crypto/tls"
"fmt"
"io"
"log"
"net"
"net/http"
"os"
"os/exec"
"sync"
"syscall"
"time"
"github.com/armon/circbuf"
docker "github.com/fsouza/go-dockerclient"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/consul/lib"
"github.com/hashicorp/consul/types"
"github.com/hashicorp/go-cleanhttp"
)
const (
// Do not allow for a interval below this value.
// Otherwise we risk fork bombing a system.
MinInterval = time.Second
// Limit the size of a check's output to the
// last CheckBufSize. Prevents an enormous buffer
// from being captured
CheckBufSize = 4 * 1024 // 4KB
// Use this user agent when doing requests for
// HTTP health checks.
HttpUserAgent = "Consul Health Check"
)
// CheckType is used to create either the CheckMonitor or the CheckTTL.
// Five types are supported: Script, HTTP, TCP, Docker and TTL. Script, HTTP,
// Docker and TCP all require Interval. Only one of the types may to be
// provided: TTL or Script/Interval or HTTP/Interval or TCP/Interval or
// Docker/Interval.
type CheckType struct {
Script string
HTTP string
TCP string
Interval time.Duration
DockerContainerID string
Shell string
TLSSkipVerify bool
Timeout time.Duration
TTL time.Duration
// DeregisterCriticalServiceAfter, if >0, will cause the associated
// service, if any, to be deregistered if this check is critical for
// longer than this duration.
DeregisterCriticalServiceAfter time.Duration
Status string
Notes string
}
type CheckTypes []*CheckType
// Valid checks if the CheckType is valid
func (c *CheckType) Valid() bool {
return c.IsTTL() || c.IsMonitor() || c.IsHTTP() || c.IsTCP() || c.IsDocker()
}
// IsTTL checks if this is a TTL type
func (c *CheckType) IsTTL() bool {
return c.TTL != 0
}
// IsMonitor checks if this is a Monitor type
func (c *CheckType) IsMonitor() bool {
return c.Script != "" && c.DockerContainerID == "" && c.Interval != 0
}
// IsHTTP checks if this is a HTTP type
func (c *CheckType) IsHTTP() bool {
return c.HTTP != "" && c.Interval != 0
}
// IsTCP checks if this is a TCP type
func (c *CheckType) IsTCP() bool {
return c.TCP != "" && c.Interval != 0
}
func (c *CheckType) IsDocker() bool {
return c.DockerContainerID != "" && c.Script != "" && c.Interval != 0
}
// CheckNotifier interface is used by the CheckMonitor
// to notify when a check has a status update. The update
// should take care to be idempotent.
type CheckNotifier interface {
UpdateCheck(checkID types.CheckID, status, output string)
}
// CheckMonitor is used to periodically invoke a script to
// determine the health of a given check. It is compatible with
// nagios plugins and expects the output in the same format.
type CheckMonitor struct {
Notify CheckNotifier
CheckID types.CheckID
Script string
Interval time.Duration
Timeout time.Duration
Logger *log.Logger
stop bool
stopCh chan struct{}
stopLock sync.Mutex
}
// Start is used to start a check monitor.
// Monitor runs until stop is called
func (c *CheckMonitor) Start() {
c.stopLock.Lock()
defer c.stopLock.Unlock()
c.stop = false
c.stopCh = make(chan struct{})
go c.run()
}
// Stop is used to stop a check monitor.
func (c *CheckMonitor) Stop() {
c.stopLock.Lock()
defer c.stopLock.Unlock()
if !c.stop {
c.stop = true
close(c.stopCh)
}
}
// run is invoked by a goroutine to run until Stop() is called
func (c *CheckMonitor) run() {
// Get the randomized initial pause time
initialPauseTime := lib.RandomStagger(c.Interval)
c.Logger.Printf("[DEBUG] agent: pausing %v before first invocation of %s", initialPauseTime, c.Script)
next := time.After(initialPauseTime)
for {
select {
case <-next:
c.check()
next = time.After(c.Interval)
case <-c.stopCh:
return
}
}
}
// check is invoked periodically to perform the script check
func (c *CheckMonitor) check() {
// Create the command
cmd, err := ExecScript(c.Script)
if err != nil {
c.Logger.Printf("[ERR] agent: failed to setup invoke '%s': %s", c.Script, err)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
return
}
// Collect the output
output, _ := circbuf.NewBuffer(CheckBufSize)
cmd.Stdout = output
cmd.Stderr = output
// Start the check
if err := cmd.Start(); err != nil {
c.Logger.Printf("[ERR] agent: failed to invoke '%s': %s", c.Script, err)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
return
}
// Wait for the check to complete
errCh := make(chan error, 2)
go func() {
errCh <- cmd.Wait()
}()
go func() {
if c.Timeout > 0 {
time.Sleep(c.Timeout)
} else {
time.Sleep(30 * time.Second)
}
errCh <- fmt.Errorf("Timed out running check '%s'", c.Script)
}()
err = <-errCh
// Get the output, add a message about truncation
outputStr := string(output.Bytes())
if output.TotalWritten() > output.Size() {
outputStr = fmt.Sprintf("Captured %d of %d bytes\n...\n%s",
output.Size(), output.TotalWritten(), outputStr)
}
c.Logger.Printf("[DEBUG] agent: Check '%s' script '%s' output: %s",
c.CheckID, c.Script, outputStr)
// Check if the check passed
if err == nil {
c.Logger.Printf("[DEBUG] agent: Check '%v' is passing", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, outputStr)
return
}
// If the exit code is 1, set check as warning
exitErr, ok := err.(*exec.ExitError)
if ok {
if status, ok := exitErr.Sys().(syscall.WaitStatus); ok {
code := status.ExitStatus()
if code == 1 {
c.Logger.Printf("[WARN] agent: Check '%v' is now warning", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthWarning, outputStr)
return
}
}
}
// Set the health as critical
c.Logger.Printf("[WARN] agent: Check '%v' is now critical", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, outputStr)
}
// CheckTTL is used to apply a TTL to check status,
// and enables clients to set the status of a check
// but upon the TTL expiring, the check status is
// automatically set to critical.
type CheckTTL struct {
Notify CheckNotifier
CheckID types.CheckID
TTL time.Duration
Logger *log.Logger
timer *time.Timer
lastOutput string
lastOutputLock sync.RWMutex
stop bool
stopCh chan struct{}
stopLock sync.Mutex
}
// Start is used to start a check ttl, runs until Stop()
func (c *CheckTTL) Start() {
c.stopLock.Lock()
defer c.stopLock.Unlock()
c.stop = false
c.stopCh = make(chan struct{})
c.timer = time.NewTimer(c.TTL)
go c.run()
}
// Stop is used to stop a check ttl.
func (c *CheckTTL) Stop() {
c.stopLock.Lock()
defer c.stopLock.Unlock()
if !c.stop {
c.timer.Stop()
c.stop = true
close(c.stopCh)
}
}
// run is used to handle TTL expiration and to update the check status
func (c *CheckTTL) run() {
for {
select {
case <-c.timer.C:
c.Logger.Printf("[WARN] agent: Check '%v' missed TTL, is now critical",
c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, c.getExpiredOutput())
case <-c.stopCh:
return
}
}
}
// getExpiredOutput formats the output for the case when the TTL is expired.
func (c *CheckTTL) getExpiredOutput() string {
c.lastOutputLock.RLock()
defer c.lastOutputLock.RUnlock()
const prefix = "TTL expired"
if c.lastOutput == "" {
return prefix
}
return fmt.Sprintf("%s (last output before timeout follows): %s", prefix, c.lastOutput)
}
// SetStatus is used to update the status of the check,
// and to renew the TTL. If expired, TTL is restarted.
func (c *CheckTTL) SetStatus(status, output string) {
c.Logger.Printf("[DEBUG] agent: Check '%v' status is now %v",
c.CheckID, status)
c.Notify.UpdateCheck(c.CheckID, status, output)
// Store the last output so we can retain it if the TTL expires.
c.lastOutputLock.Lock()
c.lastOutput = output
c.lastOutputLock.Unlock()
c.timer.Reset(c.TTL)
}
// persistedCheck is used to serialize a check and write it to disk
// so that it may be restored later on.
type persistedCheck struct {
Check *structs.HealthCheck
ChkType *CheckType
Token string
}
// persistedCheckState is used to persist the current state of a given
// check. This is different from the check definition, and includes an
// expiration timestamp which is used to determine staleness on later
// agent restarts.
type persistedCheckState struct {
CheckID types.CheckID
Output string
Status string
Expires int64
}
// CheckHTTP is used to periodically make an HTTP request to
// determine the health of a given check.
// The check is passing if the response code is 2XX.
// The check is warning if the response code is 429.
// The check is critical if the response code is anything else
// or if the request returns an error
type CheckHTTP struct {
Notify CheckNotifier
CheckID types.CheckID
HTTP string
Interval time.Duration
Timeout time.Duration
Logger *log.Logger
TLSSkipVerify bool
httpClient *http.Client
stop bool
stopCh chan struct{}
stopLock sync.Mutex
}
// Start is used to start an HTTP check.
// The check runs until stop is called
func (c *CheckHTTP) Start() {
c.stopLock.Lock()
defer c.stopLock.Unlock()
if c.httpClient == nil {
// Create the transport. We disable HTTP Keep-Alive's to prevent
// failing checks due to the keepalive interval.
trans := cleanhttp.DefaultTransport()
trans.DisableKeepAlives = true
// Skip SSL certificate verification if TLSSkipVerify is true
if trans.TLSClientConfig == nil {
trans.TLSClientConfig = &tls.Config{
InsecureSkipVerify: c.TLSSkipVerify,
}
} else {
trans.TLSClientConfig.InsecureSkipVerify = c.TLSSkipVerify
}
// Create the HTTP client.
c.httpClient = &http.Client{
Timeout: 10 * time.Second,
Transport: trans,
}
// For long (>10s) interval checks the http timeout is 10s, otherwise the
// timeout is the interval. This means that a check *should* return
// before the next check begins.
if c.Timeout > 0 && c.Timeout < c.Interval {
c.httpClient.Timeout = c.Timeout
} else if c.Interval < 10*time.Second {
c.httpClient.Timeout = c.Interval
}
}
c.stop = false
c.stopCh = make(chan struct{})
go c.run()
}
// Stop is used to stop an HTTP check.
func (c *CheckHTTP) Stop() {
c.stopLock.Lock()
defer c.stopLock.Unlock()
if !c.stop {
c.stop = true
close(c.stopCh)
}
}
// run is invoked by a goroutine to run until Stop() is called
func (c *CheckHTTP) run() {
// Get the randomized initial pause time
initialPauseTime := lib.RandomStagger(c.Interval)
c.Logger.Printf("[DEBUG] agent: pausing %v before first HTTP request of %s", initialPauseTime, c.HTTP)
next := time.After(initialPauseTime)
for {
select {
case <-next:
c.check()
next = time.After(c.Interval)
case <-c.stopCh:
return
}
}
}
// check is invoked periodically to perform the HTTP check
func (c *CheckHTTP) check() {
req, err := http.NewRequest("GET", c.HTTP, nil)
if err != nil {
c.Logger.Printf("[WARN] agent: http request failed '%s': %s", c.HTTP, err)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
return
}
req.Header.Set("User-Agent", HttpUserAgent)
req.Header.Set("Accept", "text/plain, text/*, */*")
resp, err := c.httpClient.Do(req)
if err != nil {
c.Logger.Printf("[WARN] agent: http request failed '%s': %s", c.HTTP, err)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
return
}
defer resp.Body.Close()
// Read the response into a circular buffer to limit the size
output, _ := circbuf.NewBuffer(CheckBufSize)
if _, err := io.Copy(output, resp.Body); err != nil {
c.Logger.Printf("[WARN] agent: Check '%v': Get error while reading body: %s", c.CheckID, err)
}
// Format the response body
result := fmt.Sprintf("HTTP GET %s: %s Output: %s", c.HTTP, resp.Status, output.String())
if resp.StatusCode >= 200 && resp.StatusCode <= 299 {
// PASSING (2xx)
c.Logger.Printf("[DEBUG] agent: Check '%v' is passing", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, result)
} else if resp.StatusCode == 429 {
// WARNING
// 429 Too Many Requests (RFC 6585)
// The user has sent too many requests in a given amount of time.
c.Logger.Printf("[WARN] agent: Check '%v' is now warning", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthWarning, result)
} else {
// CRITICAL
c.Logger.Printf("[WARN] agent: Check '%v' is now critical", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, result)
}
}
// CheckTCP is used to periodically make an TCP/UDP connection to
// determine the health of a given check.
// The check is passing if the connection succeeds
// The check is critical if the connection returns an error
type CheckTCP struct {
Notify CheckNotifier
CheckID types.CheckID
TCP string
Interval time.Duration
Timeout time.Duration
Logger *log.Logger
dialer *net.Dialer
stop bool
stopCh chan struct{}
stopLock sync.Mutex
}
// Start is used to start a TCP check.
// The check runs until stop is called
func (c *CheckTCP) Start() {
c.stopLock.Lock()
defer c.stopLock.Unlock()
if c.dialer == nil {
// Create the socket dialer
c.dialer = &net.Dialer{DualStack: true}
// For long (>10s) interval checks the socket timeout is 10s, otherwise
// the timeout is the interval. This means that a check *should* return
// before the next check begins.
if c.Timeout > 0 && c.Timeout < c.Interval {
c.dialer.Timeout = c.Timeout
} else if c.Interval < 10*time.Second {
c.dialer.Timeout = c.Interval
}
}
c.stop = false
c.stopCh = make(chan struct{})
go c.run()
}
// Stop is used to stop a TCP check.
func (c *CheckTCP) Stop() {
c.stopLock.Lock()
defer c.stopLock.Unlock()
if !c.stop {
c.stop = true
close(c.stopCh)
}
}
// run is invoked by a goroutine to run until Stop() is called
func (c *CheckTCP) run() {
// Get the randomized initial pause time
initialPauseTime := lib.RandomStagger(c.Interval)
c.Logger.Printf("[DEBUG] agent: pausing %v before first socket connection of %s", initialPauseTime, c.TCP)
next := time.After(initialPauseTime)
for {
select {
case <-next:
c.check()
next = time.After(c.Interval)
case <-c.stopCh:
return
}
}
}
// check is invoked periodically to perform the TCP check
func (c *CheckTCP) check() {
conn, err := c.dialer.Dial(`tcp`, c.TCP)
if err != nil {
c.Logger.Printf("[WARN] agent: socket connection failed '%s': %s", c.TCP, err)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
return
}
conn.Close()
c.Logger.Printf("[DEBUG] agent: Check '%v' is passing", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, fmt.Sprintf("TCP connect %s: Success", c.TCP))
}
// A custom interface since go-dockerclient doesn't have one
// We will use this interface in our test to inject a fake client
type DockerClient interface {
CreateExec(docker.CreateExecOptions) (*docker.Exec, error)
StartExec(string, docker.StartExecOptions) error
InspectExec(string) (*docker.ExecInspect, error)
}
// CheckDocker is used to periodically invoke a script to
// determine the health of an application running inside a
// Docker Container. We assume that the script is compatible
// with nagios plugins and expects the output in the same format.
type CheckDocker struct {
Notify CheckNotifier
CheckID types.CheckID
Script string
DockerContainerID string
Shell string
Interval time.Duration
Logger *log.Logger
dockerClient DockerClient
cmd []string
stop bool
stopCh chan struct{}
stopLock sync.Mutex
}
//Initializes the Docker Client
func (c *CheckDocker) Init() error {
//create the docker client
var err error
c.dockerClient, err = docker.NewClientFromEnv()
if err != nil {
c.Logger.Printf("[DEBUG] Error creating the Docker client: %s", err.Error())
return err
}
return nil
}
// Start is used to start checks.
// Docker Checks runs until stop is called
func (c *CheckDocker) Start() {
c.stopLock.Lock()
defer c.stopLock.Unlock()
//figure out the shell
if c.Shell == "" {
c.Shell = shell()
}
c.cmd = []string{c.Shell, "-c", c.Script}
c.stop = false
c.stopCh = make(chan struct{})
go c.run()
}
// Stop is used to stop a docker check.
func (c *CheckDocker) Stop() {
c.stopLock.Lock()
defer c.stopLock.Unlock()
if !c.stop {
c.stop = true
close(c.stopCh)
}
}
// run is invoked by a goroutine to run until Stop() is called
func (c *CheckDocker) run() {
// Get the randomized initial pause time
initialPauseTime := lib.RandomStagger(c.Interval)
c.Logger.Printf("[DEBUG] agent: pausing %v before first invocation of %s -c %s in container %s", initialPauseTime, c.Shell, c.Script, c.DockerContainerID)
next := time.After(initialPauseTime)
for {
select {
case <-next:
c.check()
next = time.After(c.Interval)
case <-c.stopCh:
return
}
}
}
func (c *CheckDocker) check() {
//Set up the Exec since
execOpts := docker.CreateExecOptions{
AttachStdin: false,
AttachStdout: true,
AttachStderr: true,
Tty: false,
Cmd: c.cmd,
Container: c.DockerContainerID,
}
var (
exec *docker.Exec
err error
)
if exec, err = c.dockerClient.CreateExec(execOpts); err != nil {
c.Logger.Printf("[DEBUG] agent: Error while creating Exec: %s", err.Error())
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, fmt.Sprintf("Unable to create Exec, error: %s", err.Error()))
return
}
// Collect the output
output, _ := circbuf.NewBuffer(CheckBufSize)
err = c.dockerClient.StartExec(exec.ID, docker.StartExecOptions{Detach: false, Tty: false, OutputStream: output, ErrorStream: output})
if err != nil {
c.Logger.Printf("[DEBUG] Error in executing health checks: %s", err.Error())
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, fmt.Sprintf("Unable to start Exec: %s", err.Error()))
return
}
// Get the output, add a message about truncation
outputStr := string(output.Bytes())
if output.TotalWritten() > output.Size() {
outputStr = fmt.Sprintf("Captured %d of %d bytes\n...\n%s",
output.Size(), output.TotalWritten(), outputStr)
}
c.Logger.Printf("[DEBUG] agent: Check '%s' script '%s' output: %s",
c.CheckID, c.Script, outputStr)
execInfo, err := c.dockerClient.InspectExec(exec.ID)
if err != nil {
c.Logger.Printf("[DEBUG] Error in inspecting check result : %s", err.Error())
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, fmt.Sprintf("Unable to inspect Exec: %s", err.Error()))
return
}
// Sets the status of the check to healthy if exit code is 0
if execInfo.ExitCode == 0 {
c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, outputStr)
return
}
// Set the status of the check to Warning if exit code is 1
if execInfo.ExitCode == 1 {
c.Logger.Printf("[DEBUG] Check failed with exit code: %d", execInfo.ExitCode)
c.Notify.UpdateCheck(c.CheckID, api.HealthWarning, outputStr)
return
}
// Set the health as critical
c.Logger.Printf("[WARN] agent: Check '%v' is now critical", c.CheckID)
c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, outputStr)
}
func shell() string {
if otherShell := os.Getenv("SHELL"); otherShell != "" {
return otherShell
} else {
return "/bin/sh"
}
}