From 4179aacf11b73ab0b5700df269923c1eb8ea6fa5 Mon Sep 17 00:00:00 2001
From: Seth Vargo <sethvargo@gmail.com>
Date: Tue, 29 Nov 2016 16:15:20 -0500
Subject: [PATCH] Add an API method for determining the best status

Given a list of HealthChecks, this determines the "best" status for the
collective group. This is useful for nodes and services, which may have
multiple checks associated with them.
---
 api/health.go                        |  57 ++++++++++++
 api/health_test.go                   | 133 +++++++++++++++++++++++++++
 command/agent/agent.go               |  14 +--
 command/agent/agent_endpoint_test.go |   6 +-
 command/agent/agent_test.go          |   8 +-
 consul/structs/structs.go            |  12 ++-
 6 files changed, 213 insertions(+), 17 deletions(-)

diff --git a/api/health.go b/api/health.go
index 74da949c8d..94fbfc46e2 100644
--- a/api/health.go
+++ b/api/health.go
@@ -2,6 +2,8 @@ package api
 
 import (
 	"fmt"
+	"log"
+	"strings"
 )
 
 const (
@@ -11,6 +13,15 @@ const (
 	HealthPassing  = "passing"
 	HealthWarning  = "warning"
 	HealthCritical = "critical"
+	HealthMaint    = "maintenance"
+)
+
+const (
+	// NodeMaint is the special key set by a node in maintenance mode.
+	NodeMaint = "_node_maintenance"
+
+	// ServiceMaintPrefix is the prefix for a service in maintenance mode.
+	ServiceMaintPrefix = "_service_maintenance:"
 )
 
 // HealthCheck is used to represent a single check
@@ -25,6 +36,52 @@ type HealthCheck struct {
 	ServiceName string
 }
 
+// HealthChecks is a collection of HealthCheck structs.
+type HealthChecks []*HealthCheck
+
+// AggregatedStatus returns the "best" status for the list of health checks.
+// Because a given entry may have many service and node-level health checks
+// attached, this function determines the best representative of the status as
+// as single string using the following heuristic:
+//
+//  maintenance > critical > warning > passing
+//
+func (c HealthChecks) AggregatedStatus() string {
+	var passing, warning, critical, maintenance bool
+	for _, check := range c {
+		id := string(check.CheckID)
+		if id == NodeMaint || strings.HasPrefix(id, ServiceMaintPrefix) {
+			maintenance = true
+			continue
+		}
+
+		switch check.Status {
+		case HealthPassing:
+			passing = true
+		case HealthWarning:
+			warning = true
+		case HealthCritical:
+			critical = true
+		default:
+			log.Printf("[WARN] unknown status %q", check.Status)
+			return ""
+		}
+	}
+
+	switch {
+	case maintenance:
+		return HealthMaint
+	case critical:
+		return HealthCritical
+	case warning:
+		return HealthWarning
+	case passing:
+		return HealthPassing
+	default:
+		return HealthPassing
+	}
+}
+
 // ServiceEntry is used for the health service endpoint
 type ServiceEntry struct {
 	Node    *Node
diff --git a/api/health_test.go b/api/health_test.go
index 1d3247b18e..e971d8023b 100644
--- a/api/health_test.go
+++ b/api/health_test.go
@@ -38,6 +38,139 @@ func TestHealth_Node(t *testing.T) {
 	})
 }
 
+func TestHealthChecks_AggregatedStatus(t *testing.T) {
+	t.Parallel()
+
+	cases := []struct {
+		name   string
+		checks HealthChecks
+		exp    string
+	}{
+		{
+			"empty",
+			nil,
+			HealthPassing,
+		},
+		{
+			"passing",
+			HealthChecks{
+				&HealthCheck{
+					Status: HealthPassing,
+				},
+			},
+			HealthPassing,
+		},
+		{
+			"warning",
+			HealthChecks{
+				&HealthCheck{
+					Status: HealthWarning,
+				},
+			},
+			HealthWarning,
+		},
+		{
+			"critical",
+			HealthChecks{
+				&HealthCheck{
+					Status: HealthCritical,
+				},
+			},
+			HealthCritical,
+		},
+		{
+			"node_maintenance",
+			HealthChecks{
+				&HealthCheck{
+					CheckID: NodeMaint,
+				},
+			},
+			HealthMaint,
+		},
+		{
+			"service_maintenance",
+			HealthChecks{
+				&HealthCheck{
+					CheckID: ServiceMaintPrefix + "service",
+				},
+			},
+			HealthMaint,
+		},
+		{
+			"unknown",
+			HealthChecks{
+				&HealthCheck{
+					Status: "nope-nope-noper",
+				},
+			},
+			"",
+		},
+		{
+			"maintenance_over_critical",
+			HealthChecks{
+				&HealthCheck{
+					CheckID: NodeMaint,
+				},
+				&HealthCheck{
+					Status: HealthCritical,
+				},
+			},
+			HealthMaint,
+		},
+		{
+			"critical_over_warning",
+			HealthChecks{
+				&HealthCheck{
+					Status: HealthCritical,
+				},
+				&HealthCheck{
+					Status: HealthWarning,
+				},
+			},
+			HealthCritical,
+		},
+		{
+			"warning_over_passing",
+			HealthChecks{
+				&HealthCheck{
+					Status: HealthWarning,
+				},
+				&HealthCheck{
+					Status: HealthPassing,
+				},
+			},
+			HealthWarning,
+		},
+		{
+			"lots",
+			HealthChecks{
+				&HealthCheck{
+					Status: HealthPassing,
+				},
+				&HealthCheck{
+					Status: HealthPassing,
+				},
+				&HealthCheck{
+					Status: HealthPassing,
+				},
+				&HealthCheck{
+					Status: HealthWarning,
+				},
+			},
+			HealthWarning,
+		},
+	}
+
+	for i, tc := range cases {
+		t.Run(fmt.Sprintf("%d_%s", i, tc.name), func(t *testing.T) {
+			act := tc.checks.AggregatedStatus()
+			if tc.exp != act {
+				t.Errorf("\nexp: %#v\nact: %#v", tc.exp, act)
+			}
+		})
+	}
+}
+
 func TestHealth_Checks(t *testing.T) {
 	t.Parallel()
 	c, s := makeClient(t)
diff --git a/command/agent/agent.go b/command/agent/agent.go
index f3c5461b84..11d1670bae 100644
--- a/command/agent/agent.go
+++ b/command/agent/agent.go
@@ -34,10 +34,6 @@ const (
 	checksDir     = "checks"
 	checkStateDir = "checks/state"
 
-	// The ID of the faux health checks for maintenance mode
-	serviceMaintCheckPrefix = "_service_maintenance"
-	nodeMaintCheckID        = "_node_maintenance"
-
 	// Default reasons for node/service maintenance mode
 	defaultNodeMaintReason = "Maintenance mode is enabled for this node, " +
 		"but no reason was provided. This is a default message."
@@ -1532,7 +1528,7 @@ func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) {
 
 // serviceMaintCheckID returns the ID of a given service's maintenance check
 func serviceMaintCheckID(serviceID string) types.CheckID {
-	return types.CheckID(fmt.Sprintf("%s:%s", serviceMaintCheckPrefix, serviceID))
+	return types.CheckID(structs.ServiceMaintPrefix + serviceID)
 }
 
 // EnableServiceMaintenance will register a false health check against the given
@@ -1593,7 +1589,7 @@ func (a *Agent) DisableServiceMaintenance(serviceID string) error {
 // EnableNodeMaintenance places a node into maintenance mode.
 func (a *Agent) EnableNodeMaintenance(reason, token string) {
 	// Ensure node maintenance is not already enabled
-	if _, ok := a.state.Checks()[nodeMaintCheckID]; ok {
+	if _, ok := a.state.Checks()[structs.NodeMaint]; ok {
 		return
 	}
 
@@ -1605,7 +1601,7 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) {
 	// Create and register the node maintenance check
 	check := &structs.HealthCheck{
 		Node:    a.config.NodeName,
-		CheckID: nodeMaintCheckID,
+		CheckID: structs.NodeMaint,
 		Name:    "Node Maintenance Mode",
 		Notes:   reason,
 		Status:  structs.HealthCritical,
@@ -1616,10 +1612,10 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) {
 
 // DisableNodeMaintenance removes a node from maintenance mode
 func (a *Agent) DisableNodeMaintenance() {
-	if _, ok := a.state.Checks()[nodeMaintCheckID]; !ok {
+	if _, ok := a.state.Checks()[structs.NodeMaint]; !ok {
 		return
 	}
-	a.RemoveCheck(nodeMaintCheckID, true)
+	a.RemoveCheck(structs.NodeMaint, true)
 	a.logger.Printf("[INFO] agent: Node left maintenance mode")
 }
 
diff --git a/command/agent/agent_endpoint_test.go b/command/agent/agent_endpoint_test.go
index 1919855e56..0de3483270 100644
--- a/command/agent/agent_endpoint_test.go
+++ b/command/agent/agent_endpoint_test.go
@@ -926,13 +926,13 @@ func TestHTTPAgent_EnableNodeMaintenance(t *testing.T) {
 	}
 
 	// Ensure the maintenance check was registered
-	check, ok := srv.agent.state.Checks()[nodeMaintCheckID]
+	check, ok := srv.agent.state.Checks()[structs.NodeMaint]
 	if !ok {
 		t.Fatalf("should have registered maintenance check")
 	}
 
 	// Check that the token was used
-	if token := srv.agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" {
+	if token := srv.agent.state.CheckToken(structs.NodeMaint); token != "mytoken" {
 		t.Fatalf("expected 'mytoken', got '%s'", token)
 	}
 
@@ -962,7 +962,7 @@ func TestHTTPAgent_DisableNodeMaintenance(t *testing.T) {
 	}
 
 	// Ensure the maintenance check was removed
-	if _, ok := srv.agent.state.Checks()[nodeMaintCheckID]; ok {
+	if _, ok := srv.agent.state.Checks()[structs.NodeMaint]; ok {
 		t.Fatalf("should have removed maintenance check")
 	}
 }
diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go
index 42aed2f8f5..3481991161 100644
--- a/command/agent/agent_test.go
+++ b/command/agent/agent_test.go
@@ -1577,13 +1577,13 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
 	agent.EnableNodeMaintenance("broken", "mytoken")
 
 	// Make sure the critical health check was added
-	check, ok := agent.state.Checks()[nodeMaintCheckID]
+	check, ok := agent.state.Checks()[structs.NodeMaint]
 	if !ok {
 		t.Fatalf("should have registered critical node check")
 	}
 
 	// Check that the token was used to register the check
-	if token := agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" {
+	if token := agent.state.CheckToken(structs.NodeMaint); token != "mytoken" {
 		t.Fatalf("expected 'mytoken', got: '%s'", token)
 	}
 
@@ -1596,7 +1596,7 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
 	agent.DisableNodeMaintenance()
 
 	// Ensure the check was deregistered
-	if _, ok := agent.state.Checks()[nodeMaintCheckID]; ok {
+	if _, ok := agent.state.Checks()[structs.NodeMaint]; ok {
 		t.Fatalf("should have deregistered critical node check")
 	}
 
@@ -1604,7 +1604,7 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
 	agent.EnableNodeMaintenance("", "")
 
 	// Make sure the check was registered with the default note
-	check, ok = agent.state.Checks()[nodeMaintCheckID]
+	check, ok = agent.state.Checks()[structs.NodeMaint]
 	if !ok {
 		t.Fatalf("should have registered critical node check")
 	}
diff --git a/consul/structs/structs.go b/consul/structs/structs.go
index a51658ddcb..9c28077c38 100644
--- a/consul/structs/structs.go
+++ b/consul/structs/structs.go
@@ -56,6 +56,15 @@ const (
 	HealthPassing  = "passing"
 	HealthWarning  = "warning"
 	HealthCritical = "critical"
+	HealthMaint    = "maintenance"
+)
+
+const (
+	// NodeMaint is the special key set by a node in maintenance mode.
+	NodeMaint = "_node_maintenance"
+
+	// ServiceMaintPrefix is the prefix for a service in maintenance mode.
+	ServiceMaintPrefix = "_service_maintenance:"
 )
 
 func ValidStatus(s string) bool {
@@ -412,6 +421,7 @@ func (c *HealthCheck) Clone() *HealthCheck {
 	return clone
 }
 
+// HealthChecks is a collection of HealthCheck structs.
 type HealthChecks []*HealthCheck
 
 // CheckServiceNode is used to provide the node, its service
@@ -460,7 +470,7 @@ type NodeInfo struct {
 	Address         string
 	TaggedAddresses map[string]string
 	Services        []*NodeService
-	Checks          []*HealthCheck
+	Checks          HealthChecks
 }
 
 // NodeDump is used to dump all the nodes with all their