2017-10-25 11:18:07 +02:00
package checks
import (
"crypto/tls"
"fmt"
"io"
"io/ioutil"
"log"
"net"
"net/http"
"os"
osexec "os/exec"
"sync"
"syscall"
"time"
2019-10-14 22:49:49 +02:00
"github.com/hashicorp/consul/agent/structs"
2017-10-25 11:18:07 +02:00
"github.com/armon/circbuf"
"github.com/hashicorp/consul/agent/exec"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/consul/lib"
"github.com/hashicorp/go-cleanhttp"
)
// Limits and defaults shared by the check implementations in this package.
const (
	// MinInterval is the minimal interval between two checks. Intervals
	// below this value must not be allowed, otherwise we risk fork
	// bombing a system.
	MinInterval = time.Second

	// DefaultBufSize is the maximum size of the captured check output by
	// default. It prevents an enormous buffer from being captured.
	DefaultBufSize = 4 << 10 // 4KB

	// UserAgent is the value of the User-Agent header for HTTP health
	// checks.
	UserAgent = "Consul Health Check"
)
2018-06-29 18:15:48 -07:00
// RPC is the interface an RPC client has to satisfy. It is a helper
// interface, implemented by the agent delegate, for checks that need to
// make RPC calls.
type RPC interface {
	RPC(method string, args, reply interface{}) error
}
2017-10-25 11:18:07 +02:00
// CheckNotifier interface is used by the CheckMonitor
// to notify when a check has a status update. The update
// should take care to be idempotent.
type CheckNotifier interface {
2019-12-09 21:26:41 -05:00
UpdateCheck ( checkID structs . CheckID , status , output string )
2017-10-25 11:18:07 +02:00
}
// CheckMonitor is used to periodically invoke a script to
// determine the health of a given check. It is compatible with
// nagios plugins and expects the output in the same format.
2019-10-14 22:49:49 +02:00
// Supports failures_before_critical and success_before_passing.
2017-10-25 11:18:07 +02:00
type CheckMonitor struct {
2019-06-26 17:43:25 +02:00
Notify CheckNotifier
2019-12-09 21:26:41 -05:00
CheckID structs . CheckID
ServiceID structs . ServiceID
2019-06-26 17:43:25 +02:00
Script string
ScriptArgs [ ] string
Interval time . Duration
Timeout time . Duration
Logger * log . Logger
OutputMaxSize int
2019-10-14 22:49:49 +02:00
StatusHandler * StatusHandler
2017-10-25 11:18:07 +02:00
stop bool
stopCh chan struct { }
stopLock sync . Mutex
}
// Start launches the monitor goroutine. The monitor keeps running until
// Stop is called.
func (c *CheckMonitor) Start() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()

	// A fresh stop channel is created on every Start.
	c.stopCh = make(chan struct{})
	c.stop = false

	go c.run()
}
// Stop signals the monitor goroutine to exit. Calling it again after the
// monitor has been stopped is a no-op.
func (c *CheckMonitor) Stop() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()

	if c.stop {
		return
	}
	c.stop = true
	close(c.stopCh)
}
// run is the body of the monitor goroutine. It performs the check after a
// randomized initial pause and then once per Interval, until Stop() closes
// the stop channel.
func (c *CheckMonitor) run() {
	// Stagger the first run by a random amount derived from the interval.
	due := time.After(lib.RandomStagger(c.Interval))

	for {
		select {
		case <-c.stopCh:
			return
		case <-due:
			c.check()
			due = time.After(c.Interval)
		}
	}
}
// check is invoked periodically to perform the script check
func ( c * CheckMonitor ) check ( ) {
// Create the command
var cmd * osexec . Cmd
var err error
if len ( c . ScriptArgs ) > 0 {
cmd , err = exec . Subprocess ( c . ScriptArgs )
} else {
cmd , err = exec . Script ( c . Script )
}
if err != nil {
2019-12-18 13:46:53 -05:00
c . Logger . Printf ( "[ERR] agent: Check %q failed to setup: %s" , c . CheckID . String ( ) , err )
2017-10-25 11:18:07 +02:00
c . Notify . UpdateCheck ( c . CheckID , api . HealthCritical , err . Error ( ) )
return
}
// Collect the output
2019-06-26 17:43:25 +02:00
output , _ := circbuf . NewBuffer ( int64 ( c . OutputMaxSize ) )
2017-10-25 11:18:07 +02:00
cmd . Stdout = output
cmd . Stderr = output
exec . SetSysProcAttr ( cmd )
truncateAndLogOutput := func ( ) string {
outputStr := string ( output . Bytes ( ) )
if output . TotalWritten ( ) > output . Size ( ) {
outputStr = fmt . Sprintf ( "Captured %d of %d bytes\n...\n%s" ,
output . Size ( ) , output . TotalWritten ( ) , outputStr )
}
2019-12-18 13:46:53 -05:00
c . Logger . Printf ( "[TRACE] agent: Check %q output: %s" , c . CheckID . String ( ) , outputStr )
2017-10-25 11:18:07 +02:00
return outputStr
}
// Start the check
if err := cmd . Start ( ) ; err != nil {
2019-12-18 13:46:53 -05:00
c . Logger . Printf ( "[ERR] agent: Check %q failed to invoke: %s" , c . CheckID . String ( ) , err )
2017-10-25 11:18:07 +02:00
c . Notify . UpdateCheck ( c . CheckID , api . HealthCritical , err . Error ( ) )
return
}
// Wait for the check to complete
waitCh := make ( chan error , 1 )
go func ( ) {
waitCh <- cmd . Wait ( )
} ( )
timeout := 30 * time . Second
if c . Timeout > 0 {
timeout = c . Timeout
}
select {
case <- time . After ( timeout ) :
if err := exec . KillCommandSubtree ( cmd ) ; err != nil {
2019-12-18 13:46:53 -05:00
c . Logger . Printf ( "[WARN] agent: Check %q failed to kill after timeout: %s" , c . CheckID . String ( ) , err )
2017-10-25 11:18:07 +02:00
}
msg := fmt . Sprintf ( "Timed out (%s) running check" , timeout . String ( ) )
2019-12-18 13:46:53 -05:00
c . Logger . Printf ( "[WARN] agent: Check %q: %s" , c . CheckID . String ( ) , msg )
2017-10-25 11:18:07 +02:00
outputStr := truncateAndLogOutput ( )
if len ( outputStr ) > 0 {
msg += "\n\n" + outputStr
}
c . Notify . UpdateCheck ( c . CheckID , api . HealthCritical , msg )
// Now wait for the process to exit so we never start another
// instance concurrently.
<- waitCh
return
case err = <- waitCh :
// The process returned before the timeout, proceed normally
}
// Check if the check passed
outputStr := truncateAndLogOutput ( )
if err == nil {
2019-10-14 22:49:49 +02:00
c . StatusHandler . updateCheck ( c . CheckID , api . HealthPassing , outputStr )
2017-10-25 11:18:07 +02:00
return
}
// If the exit code is 1, set check as warning
exitErr , ok := err . ( * osexec . ExitError )
if ok {
if status , ok := exitErr . Sys ( ) . ( syscall . WaitStatus ) ; ok {
code := status . ExitStatus ( )
if code == 1 {
2019-10-14 22:49:49 +02:00
c . StatusHandler . updateCheck ( c . CheckID , api . HealthWarning , outputStr )
2017-10-25 11:18:07 +02:00
return
}
}
}
// Set the health as critical
2019-10-14 22:49:49 +02:00
c . StatusHandler . updateCheck ( c . CheckID , api . HealthCritical , outputStr )
2017-10-25 11:18:07 +02:00
}
// CheckTTL is used to apply a TTL to check status,
// and enables clients to set the status of a check
// but upon the TTL expiring, the check status is
// automatically set to critical.
type CheckTTL struct {
2019-09-25 20:55:52 -06:00
Notify CheckNotifier
2019-12-09 21:26:41 -05:00
CheckID structs . CheckID
ServiceID structs . ServiceID
2019-09-25 20:55:52 -06:00
TTL time . Duration
Logger * log . Logger
2017-10-25 11:18:07 +02:00
timer * time . Timer
lastOutput string
lastOutputLock sync . RWMutex
stop bool
stopCh chan struct { }
stopLock sync . Mutex
2019-06-26 17:43:25 +02:00
OutputMaxSize int
2017-10-25 11:18:07 +02:00
}
// Start is used to start a check ttl, runs until Stop()
func ( c * CheckTTL ) Start ( ) {
c . stopLock . Lock ( )
defer c . stopLock . Unlock ( )
2019-06-26 17:43:25 +02:00
if c . OutputMaxSize < 1 {
c . OutputMaxSize = DefaultBufSize
}
2017-10-25 11:18:07 +02:00
c . stop = false
c . stopCh = make ( chan struct { } )
c . timer = time . NewTimer ( c . TTL )
go c . run ( )
}
// Stop halts the TTL timer and signals the watching goroutine to exit.
// Calling it again after the check has been stopped is a no-op.
func (c *CheckTTL) Stop() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()

	if c.stop {
		return
	}
	c.timer.Stop()
	c.stop = true
	close(c.stopCh)
}
// run is used to handle TTL expiration and to update the check status
func ( c * CheckTTL ) run ( ) {
for {
select {
case <- c . timer . C :
2017-11-10 12:43:59 -08:00
c . Logger . Printf ( "[WARN] agent: Check %q missed TTL, is now critical" ,
2019-12-18 13:46:53 -05:00
c . CheckID . String ( ) )
2017-10-25 11:18:07 +02:00
c . Notify . UpdateCheck ( c . CheckID , api . HealthCritical , c . getExpiredOutput ( ) )
case <- c . stopCh :
return
}
}
}
// getExpiredOutput builds the output reported when the TTL has expired.
// The last output stored by SetStatus is appended when there is one.
func (c *CheckTTL) getExpiredOutput() string {
	const expired = "TTL expired"

	c.lastOutputLock.RLock()
	last := c.lastOutput
	c.lastOutputLock.RUnlock()

	if last == "" {
		return expired
	}
	return fmt.Sprintf("%s (last output before timeout follows): %s", expired, last)
}
// SetStatus is used to update the status of the check,
// and to renew the TTL. If expired, TTL is restarted.
2019-06-26 17:43:25 +02:00
// output is returned (might be truncated)
func ( c * CheckTTL ) SetStatus ( status , output string ) string {
2019-12-18 13:46:53 -05:00
c . Logger . Printf ( "[DEBUG] agent: Check %q status is now %s" , c . CheckID . String ( ) , status )
2019-06-26 17:43:25 +02:00
total := len ( output )
if total > c . OutputMaxSize {
output = fmt . Sprintf ( "%s ... (captured %d of %d bytes)" ,
output [ : c . OutputMaxSize ] , c . OutputMaxSize , total )
}
2017-10-25 11:18:07 +02:00
c . Notify . UpdateCheck ( c . CheckID , status , output )
// Store the last output so we can retain it if the TTL expires.
c . lastOutputLock . Lock ( )
c . lastOutput = output
c . lastOutputLock . Unlock ( )
c . timer . Reset ( c . TTL )
2019-06-26 17:43:25 +02:00
return output
2017-10-25 11:18:07 +02:00
}
// CheckHTTP is used to periodically make an HTTP request to
// determine the health of a given check.
// The check is passing if the response code is 2XX.
// The check is warning if the response code is 429.
// The check is critical if the response code is anything else
// or if the request returns an error
2019-10-14 22:49:49 +02:00
// Supports failures_before_critical and success_before_passing.
2017-10-25 11:18:07 +02:00
type CheckHTTP struct {
2019-12-09 21:26:41 -05:00
CheckID structs . CheckID
ServiceID structs . ServiceID
2017-11-07 18:22:09 -08:00
HTTP string
Header map [ string ] [ ] string
Method string
Interval time . Duration
Timeout time . Duration
Logger * log . Logger
TLSClientConfig * tls . Config
2019-06-26 17:43:25 +02:00
OutputMaxSize int
2019-10-14 22:49:49 +02:00
StatusHandler * StatusHandler
2017-10-25 11:18:07 +02:00
httpClient * http . Client
stop bool
stopCh chan struct { }
stopLock sync . Mutex
2019-09-25 20:55:52 -06:00
// Set if checks are exposed through Connect proxies
// If set, this is the target of check()
ProxyHTTP string
}
func ( c * CheckHTTP ) CheckType ( ) structs . CheckType {
return structs . CheckType {
2019-12-09 21:26:41 -05:00
CheckID : c . CheckID . ID ,
2019-09-25 20:55:52 -06:00
HTTP : c . HTTP ,
Method : c . Method ,
Header : c . Header ,
Interval : c . Interval ,
ProxyHTTP : c . ProxyHTTP ,
Timeout : c . Timeout ,
OutputMaxSize : c . OutputMaxSize ,
}
2017-10-25 11:18:07 +02:00
}
// Start is used to start an HTTP check.
// The check runs until stop is called
func ( c * CheckHTTP ) Start ( ) {
c . stopLock . Lock ( )
defer c . stopLock . Unlock ( )
if c . httpClient == nil {
// Create the transport. We disable HTTP Keep-Alive's to prevent
// failing checks due to the keepalive interval.
trans := cleanhttp . DefaultTransport ( )
trans . DisableKeepAlives = true
2017-11-07 18:22:09 -08:00
// Take on the supplied TLS client config.
trans . TLSClientConfig = c . TLSClientConfig
2017-10-25 11:18:07 +02:00
// Create the HTTP client.
c . httpClient = & http . Client {
Timeout : 10 * time . Second ,
Transport : trans ,
}
2019-07-16 15:13:26 -07:00
if c . Timeout > 0 {
2017-10-25 11:18:07 +02:00
c . httpClient . Timeout = c . Timeout
}
2019-07-16 15:13:26 -07:00
2019-06-26 17:43:25 +02:00
if c . OutputMaxSize < 1 {
c . OutputMaxSize = DefaultBufSize
}
2017-10-25 11:18:07 +02:00
}
c . stop = false
c . stopCh = make ( chan struct { } )
go c . run ( )
}
// Stop signals the HTTP check goroutine to exit. Calling it again after
// the check has been stopped is a no-op.
func (c *CheckHTTP) Stop() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()

	if c.stop {
		return
	}
	c.stop = true
	close(c.stopCh)
}
// run is the body of the HTTP check goroutine. It performs the check
// after a randomized initial pause and then once per Interval, until
// Stop() closes the stop channel.
func (c *CheckHTTP) run() {
	// Stagger the first request by a random amount derived from the
	// interval.
	due := time.After(lib.RandomStagger(c.Interval))

	for {
		select {
		case <-c.stopCh:
			return
		case <-due:
			c.check()
			due = time.After(c.Interval)
		}
	}
}
// check is invoked periodically to perform the HTTP check
func ( c * CheckHTTP ) check ( ) {
method := c . Method
if method == "" {
method = "GET"
}
2019-09-25 20:55:52 -06:00
target := c . HTTP
if c . ProxyHTTP != "" {
target = c . ProxyHTTP
}
req , err := http . NewRequest ( method , target , nil )
2017-10-25 11:18:07 +02:00
if err != nil {
2019-10-14 22:49:49 +02:00
c . StatusHandler . updateCheck ( c . CheckID , api . HealthCritical , err . Error ( ) )
2017-10-25 11:18:07 +02:00
return
}
req . Header = http . Header ( c . Header )
// this happens during testing but not in prod
if req . Header == nil {
req . Header = make ( http . Header )
}
if host := req . Header . Get ( "Host" ) ; host != "" {
req . Host = host
}
if req . Header . Get ( "User-Agent" ) == "" {
req . Header . Set ( "User-Agent" , UserAgent )
}
if req . Header . Get ( "Accept" ) == "" {
req . Header . Set ( "Accept" , "text/plain, text/*, */*" )
}
resp , err := c . httpClient . Do ( req )
if err != nil {
2019-10-14 22:49:49 +02:00
c . StatusHandler . updateCheck ( c . CheckID , api . HealthCritical , err . Error ( ) )
2017-10-25 11:18:07 +02:00
return
}
defer resp . Body . Close ( )
// Read the response into a circular buffer to limit the size
2019-06-26 17:43:25 +02:00
output , _ := circbuf . NewBuffer ( int64 ( c . OutputMaxSize ) )
2017-10-25 11:18:07 +02:00
if _ , err := io . Copy ( output , resp . Body ) ; err != nil {
2019-12-18 13:46:53 -05:00
c . Logger . Printf ( "[WARN] agent: Check %q error while reading body: %s" , c . CheckID . String ( ) , err )
2017-10-25 11:18:07 +02:00
}
// Format the response body
2019-09-25 20:55:52 -06:00
result := fmt . Sprintf ( "HTTP %s %s: %s Output: %s" , method , target , resp . Status , output . String ( ) )
2017-10-25 11:18:07 +02:00
if resp . StatusCode >= 200 && resp . StatusCode <= 299 {
// PASSING (2xx)
2019-10-14 22:49:49 +02:00
c . StatusHandler . updateCheck ( c . CheckID , api . HealthPassing , result )
2017-10-25 11:18:07 +02:00
} else if resp . StatusCode == 429 {
// WARNING
// 429 Too Many Requests (RFC 6585)
// The user has sent too many requests in a given amount of time.
2019-10-14 22:49:49 +02:00
c . StatusHandler . updateCheck ( c . CheckID , api . HealthWarning , result )
2017-10-25 11:18:07 +02:00
} else {
// CRITICAL
2019-10-14 22:49:49 +02:00
c . StatusHandler . updateCheck ( c . CheckID , api . HealthCritical , result )
2017-10-25 11:18:07 +02:00
}
}
// CheckTCP is used to periodically make an TCP/UDP connection to
// determine the health of a given check.
// The check is passing if the connection succeeds
// The check is critical if the connection returns an error
2019-10-14 22:49:49 +02:00
// Supports failures_before_critical and success_before_passing.
2017-10-25 11:18:07 +02:00
type CheckTCP struct {
2019-12-09 21:26:41 -05:00
CheckID structs . CheckID
ServiceID structs . ServiceID
2019-10-14 22:49:49 +02:00
TCP string
Interval time . Duration
Timeout time . Duration
Logger * log . Logger
StatusHandler * StatusHandler
2017-10-25 11:18:07 +02:00
dialer * net . Dialer
stop bool
stopCh chan struct { }
stopLock sync . Mutex
}
// Start is used to start a TCP check.
// The check runs until stop is called
func ( c * CheckTCP ) Start ( ) {
c . stopLock . Lock ( )
defer c . stopLock . Unlock ( )
if c . dialer == nil {
// Create the socket dialer
2019-07-16 15:13:26 -07:00
c . dialer = & net . Dialer {
Timeout : 10 * time . Second ,
DualStack : true ,
}
if c . Timeout > 0 {
2017-10-25 11:18:07 +02:00
c . dialer . Timeout = c . Timeout
}
}
c . stop = false
c . stopCh = make ( chan struct { } )
go c . run ( )
}
// Stop signals the TCP check goroutine to exit. Calling it again after
// the check has been stopped is a no-op.
func (c *CheckTCP) Stop() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()

	if c.stop {
		return
	}
	c.stop = true
	close(c.stopCh)
}
// run is the body of the TCP check goroutine. It performs the check after
// a randomized initial pause and then once per Interval, until Stop()
// closes the stop channel.
func (c *CheckTCP) run() {
	// Stagger the first attempt by a random amount derived from the
	// interval.
	due := time.After(lib.RandomStagger(c.Interval))

	for {
		select {
		case <-c.stopCh:
			return
		case <-due:
			c.check()
			due = time.After(c.Interval)
		}
	}
}
// check is invoked periodically to perform the TCP check
func ( c * CheckTCP ) check ( ) {
conn , err := c . dialer . Dial ( ` tcp ` , c . TCP )
if err != nil {
2019-12-18 13:46:53 -05:00
c . Logger . Printf ( "[WARN] agent: Check %q socket connection failed: %s" , c . CheckID . String ( ) , err )
2019-10-14 22:49:49 +02:00
c . StatusHandler . updateCheck ( c . CheckID , api . HealthCritical , err . Error ( ) )
2017-10-25 11:18:07 +02:00
return
}
conn . Close ( )
2019-10-14 22:49:49 +02:00
c . StatusHandler . updateCheck ( c . CheckID , api . HealthPassing , fmt . Sprintf ( "TCP connect %s: Success" , c . TCP ) )
2017-10-25 11:18:07 +02:00
}
// CheckDocker is used to periodically invoke a script to
// determine the health of an application running inside a
// Docker Container. We assume that the script is compatible
// with nagios plugins and expects the output in the same format.
2019-10-14 22:49:49 +02:00
// Supports failures_before_critical and success_before_passing.
2017-10-25 11:18:07 +02:00
type CheckDocker struct {
2019-12-09 21:26:41 -05:00
CheckID structs . CheckID
ServiceID structs . ServiceID
2017-10-25 11:18:07 +02:00
Script string
ScriptArgs [ ] string
DockerContainerID string
Shell string
Interval time . Duration
Logger * log . Logger
Client * DockerClient
2019-10-14 22:49:49 +02:00
StatusHandler * StatusHandler
2017-10-25 11:18:07 +02:00
stop chan struct { }
}
// Start fills in the defaults for Logger and Shell and launches the
// Docker check goroutine. It panics when the check was already started.
func (c *CheckDocker) Start() {
	if c.stop != nil {
		panic("Docker check already started")
	}

	// Without a logger, discard all log output.
	if c.Logger == nil {
		c.Logger = log.New(ioutil.Discard, "", 0)
	}

	// Prefer the configured shell, then $SHELL, then /bin/sh.
	if c.Shell == "" {
		if envShell := os.Getenv("SHELL"); envShell != "" {
			c.Shell = envShell
		} else {
			c.Shell = "/bin/sh"
		}
	}

	c.stop = make(chan struct{})
	go c.run()
}
// Stop closes the stop channel, which makes the Docker check goroutine
// exit. It panics when called before Start.
func (c *CheckDocker) Stop() {
	if started := c.stop != nil; !started {
		panic("Stop called before start")
	}
	close(c.stop)
}
func ( c * CheckDocker ) run ( ) {
2017-10-26 11:57:18 +02:00
defer c . Client . Close ( )
2017-10-25 11:18:07 +02:00
firstWait := lib . RandomStagger ( c . Interval )
next := time . After ( firstWait )
for {
select {
case <- next :
c . check ( )
next = time . After ( c . Interval )
case <- c . stop :
return
}
}
}
func ( c * CheckDocker ) check ( ) {
var out string
status , b , err := c . doCheck ( )
if err != nil {
2019-12-18 13:46:53 -05:00
c . Logger . Printf ( "[DEBUG] agent: Check %q: %s" , c . CheckID . String ( ) , err )
2017-10-25 11:18:07 +02:00
out = err . Error ( )
} else {
// out is already limited to CheckBufSize since we're getting a
// limited buffer. So we don't need to truncate it just report
// that it was truncated.
out = string ( b . Bytes ( ) )
if int ( b . TotalWritten ( ) ) > len ( out ) {
out = fmt . Sprintf ( "Captured %d of %d bytes\n...\n%s" , len ( out ) , b . TotalWritten ( ) , out )
}
2019-12-18 13:46:53 -05:00
c . Logger . Printf ( "[TRACE] agent: Check %q output: %s" , c . CheckID . String ( ) , out )
2017-10-25 11:18:07 +02:00
}
2019-10-14 22:49:49 +02:00
c . StatusHandler . updateCheck ( c . CheckID , status , out )
2017-10-25 11:18:07 +02:00
}
func ( c * CheckDocker ) doCheck ( ) ( string , * circbuf . Buffer , error ) {
var cmd [ ] string
if len ( c . ScriptArgs ) > 0 {
cmd = c . ScriptArgs
} else {
cmd = [ ] string { c . Shell , "-c" , c . Script }
}
execID , err := c . Client . CreateExec ( c . DockerContainerID , cmd )
if err != nil {
return api . HealthCritical , nil , err
}
buf , err := c . Client . StartExec ( c . DockerContainerID , execID )
if err != nil {
return api . HealthCritical , nil , err
}
exitCode , err := c . Client . InspectExec ( c . DockerContainerID , execID )
if err != nil {
return api . HealthCritical , nil , err
}
switch exitCode {
case 0 :
return api . HealthPassing , buf , nil
case 1 :
2019-12-18 13:46:53 -05:00
c . Logger . Printf ( "[DEBUG] agent: Check %q failed with exit code: %d" , c . CheckID . String ( ) , exitCode )
2017-10-25 11:18:07 +02:00
return api . HealthWarning , buf , nil
default :
2019-12-18 13:46:53 -05:00
c . Logger . Printf ( "[DEBUG] agent: Check %q failed with exit code: %d" , c . CheckID . String ( ) , exitCode )
2017-10-25 11:18:07 +02:00
return api . HealthCritical , buf , nil
}
}
2017-12-26 23:35:22 -05:00
// CheckGRPC is used to periodically send request to a gRPC server
// application that implements gRPC health-checking protocol.
// The check is passing if returned status is SERVING.
// The check is critical if connection fails or returned status is
// not SERVING.
2019-10-14 22:49:49 +02:00
// Supports failures_before_critical and success_before_passing.
2017-12-26 23:35:22 -05:00
type CheckGRPC struct {
2019-12-09 21:26:41 -05:00
CheckID structs . CheckID
ServiceID structs . ServiceID
2017-12-26 23:35:22 -05:00
GRPC string
Interval time . Duration
Timeout time . Duration
TLSClientConfig * tls . Config
Logger * log . Logger
2019-10-14 22:49:49 +02:00
StatusHandler * StatusHandler
2017-12-26 23:35:22 -05:00
probe * GrpcHealthProbe
stop bool
stopCh chan struct { }
stopLock sync . Mutex
2019-09-25 20:55:52 -06:00
// Set if checks are exposed through Connect proxies
// If set, this is the target of check()
ProxyGRPC string
}
func ( c * CheckGRPC ) CheckType ( ) structs . CheckType {
return structs . CheckType {
2019-12-09 21:26:41 -05:00
CheckID : c . CheckID . ID ,
2019-09-25 20:55:52 -06:00
GRPC : c . GRPC ,
ProxyGRPC : c . ProxyGRPC ,
Interval : c . Interval ,
Timeout : c . Timeout ,
}
2017-12-26 23:35:22 -05:00
}
// Start creates the health probe for the GRPC target and launches the
// gRPC check goroutine. The probe timeout is Timeout, or 10 seconds when
// Timeout is not positive.
func (c *CheckGRPC) Start() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()

	probeTimeout := c.Timeout
	if probeTimeout <= 0 {
		probeTimeout = 10 * time.Second
	}
	c.probe = NewGrpcHealthProbe(c.GRPC, probeTimeout, c.TLSClientConfig)

	c.stopCh = make(chan struct{})
	c.stop = false

	go c.run()
}
// run is the body of the gRPC check goroutine. It performs the check
// after a randomized initial pause and then once per Interval, until
// Stop() closes the stop channel.
func (c *CheckGRPC) run() {
	// Stagger the first probe by a random amount derived from the
	// interval.
	due := time.After(lib.RandomStagger(c.Interval))

	for {
		select {
		case <-c.stopCh:
			return
		case <-due:
			c.check()
			due = time.After(c.Interval)
		}
	}
}
func ( c * CheckGRPC ) check ( ) {
2019-09-25 20:55:52 -06:00
target := c . GRPC
if c . ProxyGRPC != "" {
target = c . ProxyGRPC
}
err := c . probe . Check ( target )
2017-12-26 23:35:22 -05:00
if err != nil {
2019-10-14 22:49:49 +02:00
c . StatusHandler . updateCheck ( c . CheckID , api . HealthCritical , err . Error ( ) )
2017-12-26 23:35:22 -05:00
} else {
2019-10-14 22:49:49 +02:00
c . StatusHandler . updateCheck ( c . CheckID , api . HealthPassing , fmt . Sprintf ( "gRPC check %s: success" , target ) )
2017-12-26 23:35:22 -05:00
}
}
// Stop signals the gRPC check goroutine to exit. Calling it again after
// the check has been stopped is a no-op.
func (c *CheckGRPC) Stop() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()

	if c.stop {
		return
	}
	c.stop = true
	close(c.stopCh)
}
2019-10-14 22:49:49 +02:00
// StatusHandler keeps track of successive error/success counts and
// ensures that the status is only set to critical/passing once the number
// of successive events reaches the given threshold.
type StatusHandler struct {
	// inner is the notifier an update is forwarded to once its threshold
	// has been reached.
	inner CheckNotifier

	// logger receives a message for every update, forwarded or not.
	logger *log.Logger

	// successBeforePassing is the threshold for passing/warning results
	// and successCounter the current count of successive ones.
	successBeforePassing int
	successCounter       int

	// failuresBeforeCritical is the threshold for all other results and
	// failuresCounter the current count of successive ones.
	failuresBeforeCritical int
	failuresCounter        int
}
// NewStatusHandler returns a StatusHandler that forwards updates to inner
// and logs to logger. Both counters start out at their threshold so that
// the status is updated immediately after the first check.
func NewStatusHandler(inner CheckNotifier, logger *log.Logger, successBeforePassing, failuresBeforeCritical int) *StatusHandler {
	h := &StatusHandler{inner: inner, logger: logger}

	h.successBeforePassing = successBeforePassing
	h.successCounter = successBeforePassing

	h.failuresBeforeCritical = failuresBeforeCritical
	h.failuresCounter = failuresBeforeCritical

	return h
}
2019-12-09 21:26:41 -05:00
func ( s * StatusHandler ) updateCheck ( checkID structs . CheckID , status , output string ) {
2019-10-14 22:49:49 +02:00
if status == api . HealthPassing || status == api . HealthWarning {
s . successCounter ++
s . failuresCounter = 0
if s . successCounter >= s . successBeforePassing {
2019-12-18 13:46:53 -05:00
s . logger . Printf ( "[DEBUG] agent: Check %q is %q" , checkID . String ( ) , status )
2019-10-14 22:49:49 +02:00
s . inner . UpdateCheck ( checkID , status , output )
return
}
2019-12-18 13:46:53 -05:00
s . logger . Printf ( "[WARN] agent: Check %q was %q but has not reached success threshold %d/%d" , checkID . String ( ) , status , s . successCounter , s . successBeforePassing )
2019-10-14 22:49:49 +02:00
} else {
s . failuresCounter ++
s . successCounter = 0
if s . failuresCounter >= s . failuresBeforeCritical {
2019-12-18 13:46:53 -05:00
s . logger . Printf ( "[WARN] agent: Check %q is now critical" , checkID . String ( ) )
2019-10-14 22:49:49 +02:00
s . inner . UpdateCheck ( checkID , status , output )
return
}
2019-12-18 13:46:53 -05:00
s . logger . Printf ( "[WARN] agent: Check %q failed but has not reached failure threshold %d/%d" , checkID . String ( ) , s . failuresCounter , s . failuresBeforeCritical )
2019-10-14 22:49:49 +02:00
}
}