Checks to passing/critical only after reaching a consecutive success/failure threshold (#5739)

A check may be configured to become passing/critical only after a specified number
of consecutive checks return passing/critical. The reported status stays unchanged
until the threshold is reached.
This feature is available for HTTP, TCP, gRPC, Docker & Monitor checks.
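For example, with failures_before_critical = 3, the result sequence PASS, FAIL, FAIL, FAIL
only flips the reported status to critical on the third consecutive failure, while
PASS, FAIL, FAIL, PASS leaves the reported status passing throughout, because a single
success resets the failure counter.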
PHBourquin 2019-10-14 22:49:49 +02:00 committed by Paul Banks
parent b0310364c6
commit 039615641e
10 changed files with 326 additions and 117 deletions

View File

@@ -2627,6 +2627,8 @@ func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType,
 		}
 	}
 
+	statusHandler := checks.NewStatusHandler(a.State, a.logger, chkType.SuccessBeforePassing, chkType.FailuresBeforeCritical)
+
 	switch {
 	case chkType.IsTTL():
@@ -2667,7 +2669,6 @@ func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType,
 		tlsClientConfig := a.tlsConfigurator.OutgoingTLSConfigForCheck(chkType.TLSSkipVerify)
 
 		http := &checks.CheckHTTP{
-			Notify:          a.State,
 			CheckID:         check.CheckID,
 			ServiceID:       check.ServiceID,
 			HTTP:            chkType.HTTP,
@@ -2678,6 +2679,7 @@ func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType,
 			Logger:          a.logger,
 			OutputMaxSize:   maxOutputSize,
 			TLSClientConfig: tlsClientConfig,
+			StatusHandler:   statusHandler,
 		}
 
 		if proxy != nil && proxy.Proxy.Expose.Checks {
@@ -2704,13 +2706,13 @@ func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType,
 		}
 
 		tcp := &checks.CheckTCP{
-			Notify:    a.State,
-			CheckID:   check.CheckID,
-			ServiceID: check.ServiceID,
-			TCP:       chkType.TCP,
-			Interval:  chkType.Interval,
-			Timeout:   chkType.Timeout,
-			Logger:    a.logger,
+			CheckID:       check.CheckID,
+			ServiceID:     check.ServiceID,
+			TCP:           chkType.TCP,
+			Interval:      chkType.Interval,
+			Timeout:       chkType.Timeout,
+			Logger:        a.logger,
+			StatusHandler: statusHandler,
 		}
 		tcp.Start()
 		a.checkTCPs[check.CheckID] = tcp
@@ -2732,7 +2734,6 @@ func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType,
 		}
 
 		grpc := &checks.CheckGRPC{
-			Notify:          a.State,
 			CheckID:         check.CheckID,
 			ServiceID:       check.ServiceID,
 			GRPC:            chkType.GRPC,
@@ -2740,6 +2741,7 @@ func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType,
 			Timeout:         chkType.Timeout,
 			Logger:          a.logger,
 			TLSClientConfig: tlsClientConfig,
+			StatusHandler:   statusHandler,
 		}
 
 		if proxy != nil && proxy.Proxy.Expose.Checks {
@@ -2776,7 +2778,6 @@ func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType,
 		}
 
 		dockerCheck := &checks.CheckDocker{
-			Notify:            a.State,
 			CheckID:           check.CheckID,
 			ServiceID:         check.ServiceID,
 			DockerContainerID: chkType.DockerContainerID,
@@ -2785,6 +2786,7 @@ func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType,
 			Interval:          chkType.Interval,
 			Logger:            a.logger,
 			Client:            a.dockerClient,
+			StatusHandler:     statusHandler,
 		}
 		if prev := a.checkDockers[check.CheckID]; prev != nil {
 			prev.Stop()
@@ -2811,6 +2813,7 @@ func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType,
 			Timeout:       chkType.Timeout,
 			Logger:        a.logger,
 			OutputMaxSize: maxOutputSize,
+			StatusHandler: statusHandler,
 		}
 		monitor.Start()
 		a.checkMonitors[check.CheckID] = monitor
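
(Every check type now reports through the shared statusHandler; a.State, the agent's local state, necessarily implements the CheckNotifier interface, since it is passed to NewStatusHandler as the inner notifier.)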

View File

@@ -3,7 +3,6 @@ package checks
 import (
 	"crypto/tls"
 	"fmt"
-	"github.com/hashicorp/consul/agent/structs"
 	"io"
 	"io/ioutil"
 	"log"
@@ -15,6 +14,8 @@ import (
 	"syscall"
 	"time"
 
+	"github.com/hashicorp/consul/agent/structs"
+
 	"github.com/armon/circbuf"
 	"github.com/hashicorp/consul/agent/exec"
 	"github.com/hashicorp/consul/api"
@@ -56,6 +57,7 @@ type CheckNotifier interface {
 // CheckMonitor is used to periodically invoke a script to
 // determine the health of a given check. It is compatible with
 // nagios plugins and expects the output in the same format.
+// Supports failures_before_critical and success_before_passing.
 type CheckMonitor struct {
 	Notify        CheckNotifier
 	CheckID       types.CheckID
@@ -66,6 +68,7 @@ type CheckMonitor struct {
 	Timeout       time.Duration
 	Logger        *log.Logger
 	OutputMaxSize int
	StatusHandler *StatusHandler
 
 	stop   bool
 	stopCh chan struct{}
@@ -184,8 +187,7 @@ func (c *CheckMonitor) check() {
 	// Check if the check passed
 	outputStr := truncateAndLogOutput()
 	if err == nil {
-		c.Logger.Printf("[DEBUG] agent: Check %q is passing", c.CheckID)
-		c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, outputStr)
+		c.StatusHandler.updateCheck(c.CheckID, api.HealthPassing, outputStr)
 		return
 	}
@@ -195,16 +197,14 @@ func (c *CheckMonitor) check() {
 		if status, ok := exitErr.Sys().(syscall.WaitStatus); ok {
 			code := status.ExitStatus()
 			if code == 1 {
-				c.Logger.Printf("[WARN] agent: Check %q is now warning", c.CheckID)
-				c.Notify.UpdateCheck(c.CheckID, api.HealthWarning, outputStr)
+				c.StatusHandler.updateCheck(c.CheckID, api.HealthWarning, outputStr)
 				return
 			}
 		}
 	}
 
 	// Set the health as critical
-	c.Logger.Printf("[WARN] agent: Check %q is now critical", c.CheckID)
-	c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, outputStr)
+	c.StatusHandler.updateCheck(c.CheckID, api.HealthCritical, outputStr)
 }
 
 // CheckTTL is used to apply a TTL to check status,
@@ -308,8 +308,8 @@ func (c *CheckTTL) SetStatus(status, output string) string {
 // The check is warning if the response code is 429.
 // The check is critical if the response code is anything else
 // or if the request returns an error
+// Supports failures_before_critical and success_before_passing.
 type CheckHTTP struct {
-	Notify          CheckNotifier
 	CheckID         types.CheckID
 	ServiceID       string
 	HTTP            string
@@ -320,6 +320,7 @@ type CheckHTTP struct {
 	Logger          *log.Logger
 	TLSClientConfig *tls.Config
 	OutputMaxSize   int
+	StatusHandler   *StatusHandler
 
 	httpClient *http.Client
 	stop       bool
@@ -418,8 +419,7 @@ func (c *CheckHTTP) check() {
 	req, err := http.NewRequest(method, target, nil)
 	if err != nil {
-		c.Logger.Printf("[WARN] agent: Check %q HTTP request failed: %s", c.CheckID, err)
-		c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
+		c.StatusHandler.updateCheck(c.CheckID, api.HealthCritical, err.Error())
 		return
 	}
@@ -443,8 +443,7 @@ func (c *CheckHTTP) check() {
 	resp, err := c.httpClient.Do(req)
 	if err != nil {
-		c.Logger.Printf("[WARN] agent: Check %q HTTP request failed: %s", c.CheckID, err)
-		c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
+		c.StatusHandler.updateCheck(c.CheckID, api.HealthCritical, err.Error())
 		return
 	}
 	defer resp.Body.Close()
@@ -460,20 +459,15 @@ func (c *CheckHTTP) check() {
 	if resp.StatusCode >= 200 && resp.StatusCode <= 299 {
 		// PASSING (2xx)
-		c.Logger.Printf("[DEBUG] agent: Check %q is passing", c.CheckID)
-		c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, result)
+		c.StatusHandler.updateCheck(c.CheckID, api.HealthPassing, result)
 	} else if resp.StatusCode == 429 {
 		// WARNING
 		// 429 Too Many Requests (RFC 6585)
 		// The user has sent too many requests in a given amount of time.
-		c.Logger.Printf("[WARN] agent: Check %q is now warning", c.CheckID)
-		c.Notify.UpdateCheck(c.CheckID, api.HealthWarning, result)
+		c.StatusHandler.updateCheck(c.CheckID, api.HealthWarning, result)
 	} else {
 		// CRITICAL
-		c.Logger.Printf("[WARN] agent: Check %q is now critical", c.CheckID)
-		c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, result)
+		c.StatusHandler.updateCheck(c.CheckID, api.HealthCritical, result)
 	}
 }
@@ -481,14 +475,15 @@ func (c *CheckHTTP) check() {
 // determine the health of a given check.
 // The check is passing if the connection succeeds
 // The check is critical if the connection returns an error
+// Supports failures_before_critical and success_before_passing.
 type CheckTCP struct {
-	Notify    CheckNotifier
-	CheckID   types.CheckID
-	ServiceID string
-	TCP       string
-	Interval  time.Duration
-	Timeout   time.Duration
-	Logger    *log.Logger
+	CheckID       types.CheckID
+	ServiceID     string
+	TCP           string
+	Interval      time.Duration
+	Timeout       time.Duration
+	Logger        *log.Logger
+	StatusHandler *StatusHandler
 
 	dialer *net.Dialer
 	stop   bool
@@ -549,20 +544,19 @@ func (c *CheckTCP) check() {
 	conn, err := c.dialer.Dial(`tcp`, c.TCP)
 	if err != nil {
 		c.Logger.Printf("[WARN] agent: Check %q socket connection failed: %s", c.CheckID, err)
-		c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
+		c.StatusHandler.updateCheck(c.CheckID, api.HealthCritical, err.Error())
 		return
 	}
 	conn.Close()
-	c.Logger.Printf("[DEBUG] agent: Check %q is passing", c.CheckID)
-	c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, fmt.Sprintf("TCP connect %s: Success", c.TCP))
+	c.StatusHandler.updateCheck(c.CheckID, api.HealthPassing, fmt.Sprintf("TCP connect %s: Success", c.TCP))
 }
 
 // CheckDocker is used to periodically invoke a script to
 // determine the health of an application running inside a
 // Docker Container. We assume that the script is compatible
 // with nagios plugins and expects the output in the same format.
+// Supports failures_before_critical and success_before_passing.
 type CheckDocker struct {
-	Notify            CheckNotifier
 	CheckID           types.CheckID
 	ServiceID         string
 	Script            string
@@ -572,6 +566,7 @@ type CheckDocker struct {
 	Interval          time.Duration
 	Logger            *log.Logger
 	Client            *DockerClient
+	StatusHandler     *StatusHandler
 
 	stop chan struct{}
 }
@@ -633,12 +628,7 @@ func (c *CheckDocker) check() {
 		}
 		c.Logger.Printf("[TRACE] agent: Check %q output: %s", c.CheckID, out)
 	}
-
-	if status == api.HealthCritical {
-		c.Logger.Printf("[WARN] agent: Check %q is now critical", c.CheckID)
-	}
-
-	c.Notify.UpdateCheck(c.CheckID, status, out)
+	c.StatusHandler.updateCheck(c.CheckID, status, out)
 }
 
 func (c *CheckDocker) doCheck() (string, *circbuf.Buffer, error) {
@@ -681,8 +671,8 @@ func (c *CheckDocker) doCheck() (string, *circbuf.Buffer, error) {
 // The check is passing if returned status is SERVING.
 // The check is critical if connection fails or returned status is
 // not SERVING.
+// Supports failures_before_critical and success_before_passing.
 type CheckGRPC struct {
-	Notify          CheckNotifier
 	CheckID         types.CheckID
 	ServiceID       string
 	GRPC            string
@@ -690,6 +680,7 @@ type CheckGRPC struct {
 	Timeout         time.Duration
 	TLSClientConfig *tls.Config
 	Logger          *log.Logger
+	StatusHandler   *StatusHandler
 
 	probe *GrpcHealthProbe
 	stop  bool
@@ -747,11 +738,9 @@ func (c *CheckGRPC) check() {
 	err := c.probe.Check(target)
 	if err != nil {
-		c.Logger.Printf("[DEBUG] agent: Check %q failed: %s", c.CheckID, err.Error())
-		c.Notify.UpdateCheck(c.CheckID, api.HealthCritical, err.Error())
+		c.StatusHandler.updateCheck(c.CheckID, api.HealthCritical, err.Error())
 	} else {
-		c.Logger.Printf("[DEBUG] agent: Check %q is passing", c.CheckID)
-		c.Notify.UpdateCheck(c.CheckID, api.HealthPassing, fmt.Sprintf("gRPC check %s: success", target))
+		c.StatusHandler.updateCheck(c.CheckID, api.HealthPassing, fmt.Sprintf("gRPC check %s: success", target))
 	}
 }
@@ -763,3 +752,50 @@ func (c *CheckGRPC) Stop() {
 		close(c.stopCh)
 	}
 }
+
+// StatusHandler keeps track of the number of consecutive error/success results
+// and ensures that the status is only set to critical/passing once the
+// consecutive count reaches the configured threshold.
+type StatusHandler struct {
+	inner                  CheckNotifier
+	logger                 *log.Logger
+	successBeforePassing   int
+	successCounter         int
+	failuresBeforeCritical int
+	failuresCounter        int
+}
+
+// NewStatusHandler initializes the counters to their thresholds so that the
+// status is updated immediately after the first check.
+func NewStatusHandler(inner CheckNotifier, logger *log.Logger, successBeforePassing, failuresBeforeCritical int) *StatusHandler {
+	return &StatusHandler{
+		logger:                 logger,
+		inner:                  inner,
+		successBeforePassing:   successBeforePassing,
+		successCounter:         successBeforePassing,
+		failuresBeforeCritical: failuresBeforeCritical,
+		failuresCounter:        failuresBeforeCritical,
+	}
+}
+
+func (s *StatusHandler) updateCheck(checkID types.CheckID, status, output string) {
+	if status == api.HealthPassing || status == api.HealthWarning {
+		s.successCounter++
+		s.failuresCounter = 0
+		if s.successCounter >= s.successBeforePassing {
+			s.logger.Printf("[DEBUG] agent: Check %q is %q", checkID, status)
+			s.inner.UpdateCheck(checkID, status, output)
+			return
+		}
+		s.logger.Printf("[WARN] agent: Check %q was %q but has not reached success threshold %d/%d", checkID, status, s.successCounter, s.successBeforePassing)
+	} else {
+		s.failuresCounter++
+		s.successCounter = 0
+		if s.failuresCounter >= s.failuresBeforeCritical {
+			s.logger.Printf("[WARN] agent: Check %q is now critical", checkID)
+			s.inner.UpdateCheck(checkID, status, output)
+			return
+		}
+		s.logger.Printf("[WARN] agent: Check %q failed but has not reached failure threshold %d/%d", checkID, s.failuresCounter, s.failuresBeforeCritical)
+	}
+}
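
A condensed walkthrough of those semantics, written in the style of the new tests below (it would have to live inside the checks package, since updateCheck is unexported; the test name, check ID, and outputs are illustrative only):

```go
func TestStatusHandlerWalkthrough(t *testing.T) {
	notif := mock.NewNotify()
	logger := log.New(ioutil.Discard, "", log.LstdFlags)

	// Both counters start at their thresholds, so the very first result
	// is reported immediately, whatever it is.
	h := NewStatusHandler(notif, logger, 2, 3) // 2 passes / 3 failures required

	h.updateCheck(types.CheckID("web"), api.HealthPassing, "ok") // reported: passing

	h.updateCheck(types.CheckID("web"), api.HealthCritical, "timeout") // failure 1/3, still passing
	h.updateCheck(types.CheckID("web"), api.HealthCritical, "timeout") // failure 2/3, still passing
	h.updateCheck(types.CheckID("web"), api.HealthCritical, "timeout") // failure 3/3, reported: critical

	h.updateCheck(types.CheckID("web"), api.HealthPassing, "ok") // success 1/2, still critical
	h.updateCheck(types.CheckID("web"), api.HealthPassing, "ok") // success 2/2, reported: passing

	require.Equal(t, api.HealthPassing, notif.State(types.CheckID("web")))
}
```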

View File

@@ -20,6 +20,7 @@ import (
 	"github.com/hashicorp/consul/sdk/testutil/retry"
 	"github.com/hashicorp/consul/types"
 	"github.com/hashicorp/go-uuid"
+	"github.com/stretchr/testify/require"
 )
 
 func uniqueID() string {
@@ -43,13 +44,17 @@ func TestCheckMonitor_Script(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.status, func(t *testing.T) {
 			notif := mock.NewNotify()
+			logger := log.New(ioutil.Discard, uniqueID(), log.LstdFlags)
+			statusHandler := NewStatusHandler(notif, logger, 0, 0)
 			check := &CheckMonitor{
 				Notify:        notif,
 				CheckID:       types.CheckID("foo"),
 				Script:        tt.script,
 				Interval:      25 * time.Millisecond,
-				Logger:        log.New(ioutil.Discard, uniqueID(), log.LstdFlags),
 				OutputMaxSize: DefaultBufSize,
+				Logger:        logger,
+				StatusHandler: statusHandler,
 			}
 			check.Start()
 			defer check.Stop()
@@ -79,13 +84,16 @@ func TestCheckMonitor_Args(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.status, func(t *testing.T) {
 			notif := mock.NewNotify()
+			logger := log.New(ioutil.Discard, uniqueID(), log.LstdFlags)
+			statusHandler := NewStatusHandler(notif, logger, 0, 0)
 			check := &CheckMonitor{
 				Notify:        notif,
 				CheckID:       types.CheckID("foo"),
 				ScriptArgs:    tt.args,
 				Interval:      25 * time.Millisecond,
-				Logger:        log.New(ioutil.Discard, uniqueID(), log.LstdFlags),
 				OutputMaxSize: DefaultBufSize,
+				Logger:        logger,
+				StatusHandler: statusHandler,
 			}
 			check.Start()
 			defer check.Stop()
@@ -104,14 +112,18 @@ func TestCheckMonitor_Args(t *testing.T) {
 func TestCheckMonitor_Timeout(t *testing.T) {
 	// t.Parallel() // timing test. no parallel
 	notif := mock.NewNotify()
+	logger := log.New(ioutil.Discard, uniqueID(), log.LstdFlags)
+	statusHandler := NewStatusHandler(notif, logger, 0, 0)
 	check := &CheckMonitor{
 		Notify:        notif,
 		CheckID:       types.CheckID("foo"),
 		ScriptArgs:    []string{"sh", "-c", "sleep 1 && exit 0"},
 		Interval:      50 * time.Millisecond,
 		Timeout:       25 * time.Millisecond,
-		Logger:        log.New(ioutil.Discard, uniqueID(), log.LstdFlags),
 		OutputMaxSize: DefaultBufSize,
+		Logger:        logger,
+		StatusHandler: statusHandler,
 	}
 	check.Start()
 	defer check.Stop()
@@ -130,13 +142,16 @@ func TestCheckMonitor_Timeout(t *testing.T) {
 func TestCheckMonitor_RandomStagger(t *testing.T) {
 	// t.Parallel() // timing test. no parallel
 	notif := mock.NewNotify()
+	logger := log.New(ioutil.Discard, uniqueID(), log.LstdFlags)
+	statusHandler := NewStatusHandler(notif, logger, 0, 0)
 	check := &CheckMonitor{
 		Notify:        notif,
 		CheckID:       types.CheckID("foo"),
 		ScriptArgs:    []string{"sh", "-c", "exit 0"},
 		Interval:      25 * time.Millisecond,
-		Logger:        log.New(ioutil.Discard, uniqueID(), log.LstdFlags),
 		OutputMaxSize: DefaultBufSize,
+		Logger:        logger,
+		StatusHandler: statusHandler,
 	}
 	check.Start()
 	defer check.Stop()
@@ -156,13 +171,16 @@ func TestCheckMonitor_RandomStagger(t *testing.T) {
 func TestCheckMonitor_LimitOutput(t *testing.T) {
 	t.Parallel()
 	notif := mock.NewNotify()
+	logger := log.New(ioutil.Discard, uniqueID(), log.LstdFlags)
+	statusHandler := NewStatusHandler(notif, logger, 0, 0)
 	check := &CheckMonitor{
 		Notify:        notif,
 		CheckID:       types.CheckID("foo"),
 		ScriptArgs:    []string{"od", "-N", "81920", "/dev/urandom"},
 		Interval:      25 * time.Millisecond,
-		Logger:        log.New(ioutil.Discard, uniqueID(), log.LstdFlags),
 		OutputMaxSize: DefaultBufSize,
+		Logger:        logger,
+		StatusHandler: statusHandler,
 	}
 	check.Start()
 	defer check.Stop()
@@ -299,15 +317,17 @@ func TestCheckHTTP(t *testing.T) {
 			defer server.Close()
 
 			notif := mock.NewNotify()
+			logger := log.New(ioutil.Discard, uniqueID(), log.LstdFlags)
+			statusHandler := NewStatusHandler(notif, logger, 0, 0)
 			check := &CheckHTTP{
-				Notify:        notif,
 				CheckID:       types.CheckID("foo"),
 				HTTP:          server.URL,
 				Method:        tt.method,
+				OutputMaxSize: DefaultBufSize,
 				Header:        tt.header,
 				Interval:      10 * time.Millisecond,
-				Logger:        log.New(ioutil.Discard, uniqueID(), log.LstdFlags),
+				Logger:        logger,
+				StatusHandler: statusHandler,
 			}
 			check.Start()
 			defer check.Stop()
@@ -337,15 +357,18 @@ func TestCheckHTTP_Proxied(t *testing.T) {
 	defer proxy.Close()
 
 	notif := mock.NewNotify()
+	logger := log.New(ioutil.Discard, uniqueID(), log.LstdFlags)
+	statusHandler := NewStatusHandler(notif, logger, 0, 0)
 	check := &CheckHTTP{
-		Notify:        notif,
 		CheckID:       types.CheckID("foo"),
 		HTTP:          "",
 		Method:        "GET",
 		OutputMaxSize: DefaultBufSize,
 		Interval:      10 * time.Millisecond,
-		Logger:        log.New(ioutil.Discard, uniqueID(), log.LstdFlags),
+		Logger:        logger,
 		ProxyHTTP:     proxy.URL,
+		StatusHandler: statusHandler,
 	}
 
 	check.Start()
@@ -369,15 +392,18 @@ func TestCheckHTTP_NotProxied(t *testing.T) {
 	defer server.Close()
 
 	notif := mock.NewNotify()
+	logger := log.New(ioutil.Discard, uniqueID(), log.LstdFlags)
+	statusHandler := NewStatusHandler(notif, logger, 0, 0)
 	check := &CheckHTTP{
-		Notify:        notif,
 		CheckID:       types.CheckID("foo"),
 		HTTP:          server.URL,
 		Method:        "GET",
 		OutputMaxSize: DefaultBufSize,
 		Interval:      10 * time.Millisecond,
-		Logger:        log.New(ioutil.Discard, uniqueID(), log.LstdFlags),
+		Logger:        logger,
 		ProxyHTTP:     "",
+		StatusHandler: statusHandler,
 	}
 	check.Start()
 	defer check.Stop()
@@ -480,15 +506,16 @@ func TestCheckMaxOutputSize(t *testing.T) {
 	defer server.Close()
 
 	notif := mock.NewNotify()
+	logger := log.New(ioutil.Discard, uniqueID(), log.LstdFlags)
 	maxOutputSize := 32
 	check := &CheckHTTP{
-		Notify:        notif,
 		CheckID:       types.CheckID("bar"),
 		HTTP:          server.URL + "/v1/agent/self",
 		Timeout:       timeout,
 		Interval:      2 * time.Millisecond,
-		Logger:        log.New(ioutil.Discard, uniqueID(), log.LstdFlags),
+		Logger:        logger,
 		OutputMaxSize: maxOutputSize,
+		StatusHandler: NewStatusHandler(notif, logger, 0, 0),
 	}
 
 	check.Start()
@@ -515,13 +542,16 @@ func TestCheckHTTPTimeout(t *testing.T) {
 	defer server.Close()
 
 	notif := mock.NewNotify()
+	logger := log.New(ioutil.Discard, uniqueID(), log.LstdFlags)
+	statusHandler := NewStatusHandler(notif, logger, 0, 0)
 	check := &CheckHTTP{
-		Notify:   notif,
-		CheckID:  types.CheckID("bar"),
-		HTTP:     server.URL,
-		Timeout:  timeout,
-		Interval: 10 * time.Millisecond,
-		Logger:   log.New(ioutil.Discard, uniqueID(), log.LstdFlags),
+		CheckID:       types.CheckID("bar"),
+		HTTP:          server.URL,
+		Timeout:       timeout,
+		Interval:      10 * time.Millisecond,
+		Logger:        logger,
+		StatusHandler: statusHandler,
 	}
 
 	check.Start()
@@ -538,11 +568,14 @@
 
 func TestCheckHTTP_disablesKeepAlives(t *testing.T) {
 	t.Parallel()
+	notif := mock.NewNotify()
+	logger := log.New(ioutil.Discard, uniqueID(), log.LstdFlags)
 	check := &CheckHTTP{
 		CheckID:       types.CheckID("foo"),
 		HTTP:          "http://foo.bar/baz",
 		Interval:      10 * time.Second,
-		Logger:        log.New(ioutil.Discard, uniqueID(), log.LstdFlags),
+		Logger:        logger,
+		StatusHandler: NewStatusHandler(notif, logger, 0, 0),
 	}
 
 	check.Start()
@@ -576,13 +609,16 @@ func TestCheckHTTP_TLS_SkipVerify(t *testing.T) {
 	}
 
 	notif := mock.NewNotify()
+	logger := log.New(ioutil.Discard, uniqueID(), log.LstdFlags)
+	statusHandler := NewStatusHandler(notif, logger, 0, 0)
 	check := &CheckHTTP{
-		Notify:          notif,
 		CheckID:         types.CheckID("skipverify_true"),
 		HTTP:            server.URL,
 		Interval:        25 * time.Millisecond,
-		Logger:          log.New(ioutil.Discard, uniqueID(), log.LstdFlags),
+		Logger:          logger,
 		TLSClientConfig: tlsClientConfig,
+		StatusHandler:   statusHandler,
 	}
 
 	check.Start()
@@ -610,13 +646,15 @@ func TestCheckHTTP_TLS_BadVerify(t *testing.T) {
 	}
 
 	notif := mock.NewNotify()
+	logger := log.New(ioutil.Discard, uniqueID(), log.LstdFlags)
+	statusHandler := NewStatusHandler(notif, logger, 0, 0)
 	check := &CheckHTTP{
-		Notify:          notif,
 		CheckID:         types.CheckID("skipverify_false"),
 		HTTP:            server.URL,
 		Interval:        100 * time.Millisecond,
-		Logger:          log.New(ioutil.Discard, uniqueID(), log.LstdFlags),
+		Logger:          logger,
 		TLSClientConfig: tlsClientConfig,
+		StatusHandler:   statusHandler,
 	}
 
 	check.Start()
@@ -658,12 +696,14 @@ func mockTCPServer(network string) net.Listener {
 
 func expectTCPStatus(t *testing.T, tcp string, status string) {
 	notif := mock.NewNotify()
+	logger := log.New(ioutil.Discard, uniqueID(), log.LstdFlags)
+	statusHandler := NewStatusHandler(notif, logger, 0, 0)
 	check := &CheckTCP{
-		Notify:   notif,
-		CheckID:  types.CheckID("foo"),
-		TCP:      tcp,
-		Interval: 10 * time.Millisecond,
-		Logger:   log.New(ioutil.Discard, uniqueID(), log.LstdFlags),
+		CheckID:       types.CheckID("foo"),
+		TCP:           tcp,
+		Interval:      10 * time.Millisecond,
+		Logger:        logger,
+		StatusHandler: statusHandler,
 	}
 	check.Start()
 	defer check.Stop()
@@ -677,6 +717,98 @@ func expectTCPStatus(t *testing.T, tcp string, status string) {
 	})
 }
 
+func TestStatusHandlerUpdateStatusAfterConsecutiveChecksThresholdIsReached(t *testing.T) {
+	t.Parallel()
+	checkID := types.CheckID("foo")
+	notif := mock.NewNotify()
+	logger := log.New(ioutil.Discard, uniqueID(), log.LstdFlags)
+	statusHandler := NewStatusHandler(notif, logger, 2, 3)
+
+	// Set the initial status to passing after a single success
+	statusHandler.updateCheck(checkID, api.HealthPassing, "bar")
+
+	// Status should become critical only after 3 failed checks
+	statusHandler.updateCheck(checkID, api.HealthCritical, "bar")
+	statusHandler.updateCheck(checkID, api.HealthCritical, "bar")
+	retry.Run(t, func(r *retry.R) {
+		require.Equal(r, 1, notif.Updates("foo"))
+		require.Equal(r, api.HealthPassing, notif.State("foo"))
+	})
+
+	statusHandler.updateCheck(checkID, api.HealthCritical, "bar")
+	retry.Run(t, func(r *retry.R) {
+		require.Equal(r, 2, notif.Updates("foo"))
+		require.Equal(r, api.HealthCritical, notif.State("foo"))
+	})
+
+	// Status should become passing only after 2 passing checks
+	statusHandler.updateCheck(checkID, api.HealthPassing, "bar")
+	retry.Run(t, func(r *retry.R) {
+		require.Equal(r, 2, notif.Updates("foo"))
+		require.Equal(r, api.HealthCritical, notif.State("foo"))
+	})
+
+	statusHandler.updateCheck(checkID, api.HealthPassing, "bar")
+	retry.Run(t, func(r *retry.R) {
+		require.Equal(r, 3, notif.Updates("foo"))
+		require.Equal(r, api.HealthPassing, notif.State("foo"))
+	})
+}
+
+func TestStatusHandlerResetCountersOnNonIdenticalsConsecutiveChecks(t *testing.T) {
+	t.Parallel()
+	checkID := types.CheckID("foo")
+	notif := mock.NewNotify()
+	logger := log.New(ioutil.Discard, uniqueID(), log.LstdFlags)
+	statusHandler := NewStatusHandler(notif, logger, 2, 3)
+
+	// Set the initial status to passing after a single success
+	statusHandler.updateCheck(checkID, api.HealthPassing, "bar")
+
+	// Status should remain passing after a FAIL PASS FAIL FAIL sequence:
+	// although there are 3 FAILs, they are not consecutive
+	statusHandler.updateCheck(checkID, api.HealthCritical, "bar")
+	statusHandler.updateCheck(checkID, api.HealthPassing, "bar")
+	statusHandler.updateCheck(checkID, api.HealthCritical, "bar")
+	statusHandler.updateCheck(checkID, api.HealthCritical, "bar")
+	retry.Run(t, func(r *retry.R) {
+		require.Equal(r, 1, notif.Updates("foo"))
+		require.Equal(r, api.HealthPassing, notif.State("foo"))
+	})
+
+	// Critical after a 3rd consecutive FAIL
+	statusHandler.updateCheck(checkID, api.HealthCritical, "bar")
+	retry.Run(t, func(r *retry.R) {
+		require.Equal(r, 2, notif.Updates("foo"))
+		require.Equal(r, api.HealthCritical, notif.State("foo"))
+	})
+
+	// Status should remain critical after a PASS FAIL PASS sequence
+	statusHandler.updateCheck(checkID, api.HealthPassing, "bar")
+	statusHandler.updateCheck(checkID, api.HealthCritical, "bar")
+	statusHandler.updateCheck(checkID, api.HealthPassing, "bar")
+	retry.Run(t, func(r *retry.R) {
+		require.Equal(r, 2, notif.Updates("foo"))
+		require.Equal(r, api.HealthCritical, notif.State("foo"))
+	})
+
+	// Passing after a 2nd consecutive PASS
+	statusHandler.updateCheck(checkID, api.HealthPassing, "bar")
+	retry.Run(t, func(r *retry.R) {
+		require.Equal(r, 3, notif.Updates("foo"))
+		require.Equal(r, api.HealthPassing, notif.State("foo"))
+	})
+}
+
 func TestCheckTCPCritical(t *testing.T) {
 	t.Parallel()
 	var (
@@ -971,14 +1103,15 @@ func TestCheck_Docker(t *testing.T) {
 	}
 
 	notif, upd := mock.NewNotifyChan()
+	statusHandler := NewStatusHandler(notif, log.New(ioutil.Discard, uniqueID(), log.LstdFlags), 0, 0)
 	id := types.CheckID("chk")
 	check := &CheckDocker{
-		Notify:            notif,
 		CheckID:           id,
 		ScriptArgs:        []string{"/health.sh"},
 		DockerContainerID: "123",
 		Interval:          25 * time.Millisecond,
 		Client:            c,
+		StatusHandler:     statusHandler,
 	}
 	check.Start()
 	defer check.Stop()

View File

@@ -4,10 +4,6 @@ import (
 	"crypto/tls"
 	"flag"
 	"fmt"
-	"github.com/hashicorp/consul/agent/mock"
-	"github.com/hashicorp/consul/api"
-	"github.com/hashicorp/consul/sdk/testutil/retry"
-	"github.com/hashicorp/consul/types"
 	"io/ioutil"
 	"log"
 	"net"
@@ -15,6 +11,11 @@ import (
 	"testing"
 	"time"
 
+	"github.com/hashicorp/consul/agent/mock"
+	"github.com/hashicorp/consul/api"
+	"github.com/hashicorp/consul/sdk/testutil/retry"
+	"github.com/hashicorp/consul/types"
+
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/health"
 	hv1 "google.golang.org/grpc/health/grpc_health_v1"
@@ -106,13 +107,15 @@ func TestGRPC_Proxied(t *testing.T) {
 	t.Parallel()
 
 	notif := mock.NewNotify()
+	logger := log.New(ioutil.Discard, uniqueID(), log.LstdFlags)
+	statusHandler := NewStatusHandler(notif, logger, 0, 0)
 	check := &CheckGRPC{
-		Notify:    notif,
-		CheckID:   types.CheckID("foo"),
-		GRPC:      "",
-		Interval:  10 * time.Millisecond,
-		Logger:    log.New(ioutil.Discard, uniqueID(), log.LstdFlags),
-		ProxyGRPC: server,
+		CheckID:       types.CheckID("foo"),
+		GRPC:          "",
+		Interval:      10 * time.Millisecond,
+		Logger:        logger,
+		ProxyGRPC:     server,
+		StatusHandler: statusHandler,
 	}
 	check.Start()
 	defer check.Stop()
@@ -132,13 +135,15 @@ func TestGRPC_NotProxied(t *testing.T) {
 	t.Parallel()
 
 	notif := mock.NewNotify()
+	logger := log.New(ioutil.Discard, uniqueID(), log.LstdFlags)
+	statusHandler := NewStatusHandler(notif, logger, 0, 0)
 	check := &CheckGRPC{
-		Notify:    notif,
-		CheckID:   types.CheckID("foo"),
-		GRPC:      server,
-		Interval:  10 * time.Millisecond,
-		Logger:    log.New(ioutil.Discard, uniqueID(), log.LstdFlags),
-		ProxyGRPC: "",
+		CheckID:       types.CheckID("foo"),
+		GRPC:          server,
+		Interval:      10 * time.Millisecond,
+		Logger:        logger,
+		ProxyGRPC:     "",
+		StatusHandler: statusHandler,
 	}
 	check.Start()
 	defer check.Stop()

View File

@@ -1211,6 +1211,8 @@ func (b *Builder) checkVal(v *CheckDefinition) *structs.CheckDefinition {
 		AliasService:                   b.stringVal(v.AliasService),
 		Timeout:                        b.durationVal(fmt.Sprintf("check[%s].timeout", id), v.Timeout),
 		TTL:                            b.durationVal(fmt.Sprintf("check[%s].ttl", id), v.TTL),
+		SuccessBeforePassing:           b.intVal(v.SuccessBeforePassing),
+		FailuresBeforeCritical:         b.intVal(v.FailuresBeforeCritical),
 		DeregisterCriticalServiceAfter: b.durationVal(fmt.Sprintf("check[%s].deregister_critical_service_after", id), v.DeregisterCriticalServiceAfter),
 		OutputMaxSize:                  b.intValWithDefault(v.OutputMaxSize, checks.DefaultBufSize),
 	}

View File

@@ -419,6 +419,8 @@ type CheckDefinition struct {
 	AliasService                   *string `json:"alias_service,omitempty" hcl:"alias_service" mapstructure:"alias_service"`
 	Timeout                        *string `json:"timeout,omitempty" hcl:"timeout" mapstructure:"timeout"`
 	TTL                            *string `json:"ttl,omitempty" hcl:"ttl" mapstructure:"ttl"`
+	SuccessBeforePassing           *int    `json:"success_before_passing,omitempty" hcl:"success_before_passing" mapstructure:"success_before_passing"`
+	FailuresBeforeCritical         *int    `json:"failures_before_critical,omitempty" hcl:"failures_before_critical" mapstructure:"failures_before_critical"`
 	DeregisterCriticalServiceAfter *string `json:"deregister_critical_service_after,omitempty" hcl:"deregister_critical_service_after" mapstructure:"deregister_critical_service_after"`
 }

View File

@@ -5721,6 +5721,8 @@ func TestSanitize(t *testing.T) {
 			"AliasService": "",
 			"DeregisterCriticalServiceAfter": "0s",
 			"DockerContainerID": "",
+			"SuccessBeforePassing": 0,
+			"FailuresBeforeCritical": 0,
 			"GRPC": "",
 			"GRPCUseTLS": false,
 			"HTTP": "",
@@ -5893,6 +5895,8 @@ func TestSanitize(t *testing.T) {
 			"CheckID": "",
 			"DeregisterCriticalServiceAfter": "0s",
 			"DockerContainerID": "",
+			"SuccessBeforePassing": 0,
+			"FailuresBeforeCritical": 0,
 			"GRPC": "",
 			"GRPCUseTLS": false,
 			"HTTP": "",

View File

@@ -36,6 +36,8 @@ type CheckDefinition struct {
 	AliasService                   string
 	Timeout                        time.Duration
 	TTL                            time.Duration
+	SuccessBeforePassing           int
+	FailuresBeforeCritical         int
 	DeregisterCriticalServiceAfter time.Duration
 	OutputMaxSize                  int
 }
@@ -81,6 +83,8 @@ func (c *CheckDefinition) CheckType() *CheckType {
 		TLSSkipVerify:                  c.TLSSkipVerify,
 		Timeout:                        c.Timeout,
 		TTL:                            c.TTL,
+		SuccessBeforePassing:           c.SuccessBeforePassing,
+		FailuresBeforeCritical:         c.FailuresBeforeCritical,
 		DeregisterCriticalServiceAfter: c.DeregisterCriticalServiceAfter,
 	}
 }

View File

@@ -27,21 +27,23 @@ type CheckType struct {
 	// fields copied to CheckDefinition
 	// Update CheckDefinition when adding fields here
 
 	ScriptArgs             []string
 	HTTP                   string
 	Header                 map[string][]string
 	Method                 string
 	TCP                    string
 	Interval               time.Duration
 	AliasNode              string
 	AliasService           string
 	DockerContainerID      string
 	Shell                  string
 	GRPC                   string
 	GRPCUseTLS             bool
 	TLSSkipVerify          bool
 	Timeout                time.Duration
 	TTL                    time.Duration
+	SuccessBeforePassing   int
+	FailuresBeforeCritical int
 
 	// Definition fields used when exposing checks through a proxy
 	ProxyHTTP string

View File

@@ -367,3 +367,21 @@ key in your configuration file.
   ]
 }
 ```
+
+## Success/Failures before passing/critical
+
+A check may be configured to become passing/critical only after a specified number of
+consecutive checks return passing/critical. The reported status remains unchanged until
+the threshold is reached.
+This feature is available for HTTP, TCP, gRPC, Docker & Monitor checks.
+By default, both thresholds are set to 0, so the check status always reflects the most
+recent check result.
+
+```javascript
+{
+  "checks": {
+    ...
+    "success_before_passing": 3,
+    "failures_before_critical": 3
+  }
+}
+```
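
A worked example of the timing these thresholds imply (values are illustrative): with an interval of 10s and failures_before_critical set to 3, a check that starts failing is reported critical on its third consecutive failing run, about 20 seconds after the first failure is observed; a single passing result in between resets the failure counter, and success_before_passing gates the transition back to passing in the same way.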