Add an API method for determining the best status

Given a list of HealthChecks, this determines the "best" status for the
collective group. This is useful for nodes and services, which may have
multiple checks associated with them.
This commit is contained in:
Seth Vargo 2016-11-29 16:15:20 -05:00
parent 916f3c85b0
commit 4179aacf11
No known key found for this signature in database
GPG Key ID: 905A90C2949E8787
6 changed files with 213 additions and 17 deletions

View File

@ -2,6 +2,8 @@ package api
import (
"fmt"
"log"
"strings"
)
const (
@ -11,6 +13,15 @@ const (
HealthPassing = "passing"
HealthWarning = "warning"
HealthCritical = "critical"
HealthMaint = "maintenance"
)
// Identifiers of the synthetic health checks that represent maintenance mode.
// AggregatedStatus compares each check's CheckID against these values to
// decide whether the check marks maintenance.
const (
// NodeMaint is the special key set by a node in maintenance mode.
// A check whose CheckID equals this value exactly is treated as maintenance.
NodeMaint = "_node_maintenance"
// ServiceMaintPrefix is the prefix for a service in maintenance mode.
// The trailing colon is part of the prefix; any check whose CheckID starts
// with this value is treated as maintenance.
ServiceMaintPrefix = "_service_maintenance:"
)
// HealthCheck is used to represent a single check
@ -25,6 +36,52 @@ type HealthCheck struct {
ServiceName string
}
// HealthChecks is a collection of HealthCheck structs, held as a slice of
// pointers. The AggregatedStatus method is defined on this type.
type HealthChecks []*HealthCheck
// AggregatedStatus returns the "best" status for the list of health checks.
// Because a given entry may have many service and node-level health checks
// attached, this function determines the best representative of the status as
// a single string using the following heuristic:
//
//	maintenance > critical > warning > passing
//
// A check counts as maintenance when its CheckID equals NodeMaint or begins
// with ServiceMaintPrefix; the Status of such a check is not consulted. Nil
// entries in the list are skipped. A list with no usable entries is reported
// as passing.
//
// If a non-maintenance check carries a status other than passing, warning or
// critical, a warning is logged and the empty string is returned immediately,
// without looking at the remaining checks.
func (c HealthChecks) AggregatedStatus() string {
	var warning, critical, maintenance bool
	for _, check := range c {
		// Tolerate nil entries instead of panicking on the field access below.
		if check == nil {
			continue
		}

		id := string(check.CheckID)
		if id == NodeMaint || strings.HasPrefix(id, ServiceMaintPrefix) {
			maintenance = true
			continue
		}

		switch check.Status {
		case HealthPassing:
			// Passing is the fallback result, so there is nothing to record.
		case HealthWarning:
			warning = true
		case HealthCritical:
			critical = true
		default:
			log.Printf("[WARN] unknown status %q", check.Status)
			return ""
		}
	}

	// Report the highest-priority condition that was observed.
	switch {
	case maintenance:
		return HealthMaint
	case critical:
		return HealthCritical
	case warning:
		return HealthWarning
	default:
		return HealthPassing
	}
}
// ServiceEntry is used for the health service endpoint
type ServiceEntry struct {
Node *Node

View File

@ -38,6 +38,139 @@ func TestHealth_Node(t *testing.T) {
})
}
// TestHealthChecks_AggregatedStatus runs AggregatedStatus over a table of
// check lists and compares the result with the expected status string. Each
// table entry runs as its own subtest named "<index>_<label>".
func TestHealthChecks_AggregatedStatus(t *testing.T) {
	t.Parallel()

	type scenario struct {
		label  string
		input  HealthChecks
		expect string
	}

	scenarios := []scenario{
		// Single-check and degenerate inputs.
		{
			label:  "empty",
			input:  nil,
			expect: HealthPassing,
		},
		{
			label:  "passing",
			input:  HealthChecks{{Status: HealthPassing}},
			expect: HealthPassing,
		},
		{
			label:  "warning",
			input:  HealthChecks{{Status: HealthWarning}},
			expect: HealthWarning,
		},
		{
			label:  "critical",
			input:  HealthChecks{{Status: HealthCritical}},
			expect: HealthCritical,
		},
		{
			label:  "node_maintenance",
			input:  HealthChecks{{CheckID: NodeMaint}},
			expect: HealthMaint,
		},
		{
			label:  "service_maintenance",
			input:  HealthChecks{{CheckID: ServiceMaintPrefix + "service"}},
			expect: HealthMaint,
		},
		{
			label:  "unknown",
			input:  HealthChecks{{Status: "nope-nope-noper"}},
			expect: "",
		},

		// Precedence between statuses.
		{
			label: "maintenance_over_critical",
			input: HealthChecks{
				{CheckID: NodeMaint},
				{Status: HealthCritical},
			},
			expect: HealthMaint,
		},
		{
			label: "critical_over_warning",
			input: HealthChecks{
				{Status: HealthCritical},
				{Status: HealthWarning},
			},
			expect: HealthCritical,
		},
		{
			label: "warning_over_passing",
			input: HealthChecks{
				{Status: HealthWarning},
				{Status: HealthPassing},
			},
			expect: HealthWarning,
		},
		{
			label: "lots",
			input: HealthChecks{
				{Status: HealthPassing},
				{Status: HealthPassing},
				{Status: HealthPassing},
				{Status: HealthWarning},
			},
			expect: HealthWarning,
		},
	}

	for idx := range scenarios {
		sc := scenarios[idx]
		subtest := fmt.Sprintf("%d_%s", idx, sc.label)
		t.Run(subtest, func(t *testing.T) {
			if got := sc.input.AggregatedStatus(); got != sc.expect {
				t.Errorf("\nexp: %#v\nact: %#v", sc.expect, got)
			}
		})
	}
}
func TestHealth_Checks(t *testing.T) {
t.Parallel()
c, s := makeClient(t)

View File

@ -34,10 +34,6 @@ const (
checksDir = "checks"
checkStateDir = "checks/state"
// The ID of the faux health checks for maintenance mode
serviceMaintCheckPrefix = "_service_maintenance"
nodeMaintCheckID = "_node_maintenance"
// Default reasons for node/service maintenance mode
defaultNodeMaintReason = "Maintenance mode is enabled for this node, " +
"but no reason was provided. This is a default message."
@ -1532,7 +1528,7 @@ func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) {
// serviceMaintCheckID returns the ID of a given service's maintenance check
func serviceMaintCheckID(serviceID string) types.CheckID {
return types.CheckID(fmt.Sprintf("%s:%s", serviceMaintCheckPrefix, serviceID))
return types.CheckID(structs.ServiceMaintPrefix + serviceID)
}
// EnableServiceMaintenance will register a false health check against the given
@ -1593,7 +1589,7 @@ func (a *Agent) DisableServiceMaintenance(serviceID string) error {
// EnableNodeMaintenance places a node into maintenance mode.
func (a *Agent) EnableNodeMaintenance(reason, token string) {
// Ensure node maintenance is not already enabled
if _, ok := a.state.Checks()[nodeMaintCheckID]; ok {
if _, ok := a.state.Checks()[structs.NodeMaint]; ok {
return
}
@ -1605,7 +1601,7 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) {
// Create and register the node maintenance check
check := &structs.HealthCheck{
Node: a.config.NodeName,
CheckID: nodeMaintCheckID,
CheckID: structs.NodeMaint,
Name: "Node Maintenance Mode",
Notes: reason,
Status: structs.HealthCritical,
@ -1616,10 +1612,10 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) {
// DisableNodeMaintenance removes a node from maintenance mode
func (a *Agent) DisableNodeMaintenance() {
if _, ok := a.state.Checks()[nodeMaintCheckID]; !ok {
if _, ok := a.state.Checks()[structs.NodeMaint]; !ok {
return
}
a.RemoveCheck(nodeMaintCheckID, true)
a.RemoveCheck(structs.NodeMaint, true)
a.logger.Printf("[INFO] agent: Node left maintenance mode")
}

View File

@ -926,13 +926,13 @@ func TestHTTPAgent_EnableNodeMaintenance(t *testing.T) {
}
// Ensure the maintenance check was registered
check, ok := srv.agent.state.Checks()[nodeMaintCheckID]
check, ok := srv.agent.state.Checks()[structs.NodeMaint]
if !ok {
t.Fatalf("should have registered maintenance check")
}
// Check that the token was used
if token := srv.agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" {
if token := srv.agent.state.CheckToken(structs.NodeMaint); token != "mytoken" {
t.Fatalf("expected 'mytoken', got '%s'", token)
}
@ -962,7 +962,7 @@ func TestHTTPAgent_DisableNodeMaintenance(t *testing.T) {
}
// Ensure the maintenance check was removed
if _, ok := srv.agent.state.Checks()[nodeMaintCheckID]; ok {
if _, ok := srv.agent.state.Checks()[structs.NodeMaint]; ok {
t.Fatalf("should have removed maintenance check")
}
}

View File

@ -1577,13 +1577,13 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
agent.EnableNodeMaintenance("broken", "mytoken")
// Make sure the critical health check was added
check, ok := agent.state.Checks()[nodeMaintCheckID]
check, ok := agent.state.Checks()[structs.NodeMaint]
if !ok {
t.Fatalf("should have registered critical node check")
}
// Check that the token was used to register the check
if token := agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" {
if token := agent.state.CheckToken(structs.NodeMaint); token != "mytoken" {
t.Fatalf("expected 'mytoken', got: '%s'", token)
}
@ -1596,7 +1596,7 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
agent.DisableNodeMaintenance()
// Ensure the check was deregistered
if _, ok := agent.state.Checks()[nodeMaintCheckID]; ok {
if _, ok := agent.state.Checks()[structs.NodeMaint]; ok {
t.Fatalf("should have deregistered critical node check")
}
@ -1604,7 +1604,7 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
agent.EnableNodeMaintenance("", "")
// Make sure the check was registered with the default note
check, ok = agent.state.Checks()[nodeMaintCheckID]
check, ok = agent.state.Checks()[structs.NodeMaint]
if !ok {
t.Fatalf("should have registered critical node check")
}

View File

@ -56,6 +56,15 @@ const (
HealthPassing = "passing"
HealthWarning = "warning"
HealthCritical = "critical"
HealthMaint = "maintenance"
)
// Identifiers of the synthetic health checks that represent maintenance mode.
const (
// NodeMaint is the special key set by a node in maintenance mode.
// The agent code in this change uses it as the CheckID of the node
// maintenance check it registers and removes.
NodeMaint = "_node_maintenance"
// ServiceMaintPrefix is the prefix for a service in maintenance mode.
// The trailing colon is part of the prefix; the agent's serviceMaintCheckID
// appends the service ID directly to it.
ServiceMaintPrefix = "_service_maintenance:"
)
func ValidStatus(s string) bool {
@ -412,6 +421,7 @@ func (c *HealthCheck) Clone() *HealthCheck {
return clone
}
// HealthChecks is a collection of HealthCheck structs, held as a slice of
// pointers to HealthCheck.
type HealthChecks []*HealthCheck
// CheckServiceNode is used to provide the node, its service
@ -460,7 +470,7 @@ type NodeInfo struct {
Address string
TaggedAddresses map[string]string
Services []*NodeService
Checks []*HealthCheck
Checks HealthChecks
}
// NodeDump is used to dump all the nodes with all their