diff --git a/agent/consul/prepared_query/template_test.go b/agent/consul/prepared_query/template_test.go index a14fb44ad2..05cbc17da6 100644 --- a/agent/consul/prepared_query/template_test.go +++ b/agent/consul/prepared_query/template_test.go @@ -6,6 +6,7 @@ import ( "testing" "github.com/hashicorp/consul/agent/structs" + "github.com/hashicorp/consul/types" "github.com/mitchellh/copystructure" ) @@ -32,6 +33,15 @@ var ( "${agent.segment}", }, }, + IgnoreCheckIDs: []types.CheckID{ + "${name.full}", + "${name.prefix}", + "${name.suffix}", + "${match(0)}", + "${match(1)}", + "${match(2)}", + "${agent.segment}", + }, Tags: []string{ "${name.full}", "${name.prefix}", @@ -124,6 +134,7 @@ func TestTemplate_Compile(t *testing.T) { query.Template.Type = structs.QueryTemplateTypeNamePrefixMatch query.Template.Regexp = "^(hello)there$" query.Service.Service = "${name.full}" + query.Service.IgnoreCheckIDs = []types.CheckID{"${match(1)}", "${agent.segment}"} query.Service.Tags = []string{"${match(1)}", "${agent.segment}"} backup, err := copystructure.Copy(query) if err != nil { @@ -151,6 +162,10 @@ func TestTemplate_Compile(t *testing.T) { }, Service: structs.ServiceQuery{ Service: "hellothere", + IgnoreCheckIDs: []types.CheckID{ + "hello", + "segment-foo", + }, Tags: []string{ "hello", "segment-foo", diff --git a/agent/consul/prepared_query_endpoint.go b/agent/consul/prepared_query_endpoint.go index ff7fa4d386..8d8fd35f66 100644 --- a/agent/consul/prepared_query_endpoint.go +++ b/agent/consul/prepared_query_endpoint.go @@ -496,7 +496,8 @@ func (p *PreparedQuery) execute(query *structs.PreparedQuery, } // Filter out any unhealthy nodes. - nodes = nodes.Filter(query.Service.OnlyPassing) + nodes = nodes.FilterIgnore(query.Service.OnlyPassing, + query.Service.IgnoreCheckIDs) // Apply the node metadata filters, if any. if len(query.Service.NodeMeta) > 0 { diff --git a/agent/consul/prepared_query_endpoint_test.go b/agent/consul/prepared_query_endpoint_test.go index 6543ba9f0f..e4bc49e514 100644 --- a/agent/consul/prepared_query_endpoint_test.go +++ b/agent/consul/prepared_query_endpoint_test.go @@ -17,6 +17,7 @@ import ( "github.com/hashicorp/consul/api" "github.com/hashicorp/consul/testrpc" "github.com/hashicorp/consul/testutil/retry" + "github.com/hashicorp/consul/types" "github.com/hashicorp/net-rpc-msgpackrpc" "github.com/hashicorp/serf/coordinate" ) @@ -2076,6 +2077,41 @@ func TestPreparedQuery_Execute(t *testing.T) { } } + // Make the query ignore all our health checks (which have "failing" ID + // implicitly from their name). + query.Query.Service.IgnoreCheckIDs = []types.CheckID{"failing"} + if err := msgpackrpc.CallWithCodec(codec1, "PreparedQuery.Apply", &query, &query.Query.ID); err != nil { + t.Fatalf("err: %v", err) + } + + // We should end up with 10 nodes again + { + req := structs.PreparedQueryExecuteRequest{ + Datacenter: "dc1", + QueryIDOrName: query.Query.ID, + QueryOptions: structs.QueryOptions{Token: execToken}, + } + + var reply structs.PreparedQueryExecuteResponse + if err := msgpackrpc.CallWithCodec(codec1, "PreparedQuery.Execute", &req, &reply); err != nil { + t.Fatalf("err: %v", err) + } + + if len(reply.Nodes) != 10 || + reply.Datacenter != "dc1" || + reply.Service != query.Query.Service.Service || + !reflect.DeepEqual(reply.DNS, query.Query.DNS) || + !reply.QueryMeta.KnownLeader { + t.Fatalf("bad: %v", reply) + } + } + + // Undo that so all the following tests aren't broken! + query.Query.Service.IgnoreCheckIDs = nil + if err := msgpackrpc.CallWithCodec(codec1, "PreparedQuery.Apply", &query, &query.Query.ID); err != nil { + t.Fatalf("err: %v", err) + } + // Make the query more picky by adding a tag filter. This just proves we // call into the tag filter, it is tested more thoroughly in a separate // test. diff --git a/agent/prepared_query_endpoint_test.go b/agent/prepared_query_endpoint_test.go index 58e27df2a5..65b4e04fc6 100644 --- a/agent/prepared_query_endpoint_test.go +++ b/agent/prepared_query_endpoint_test.go @@ -10,6 +10,7 @@ import ( "testing" "github.com/hashicorp/consul/agent/structs" + "github.com/hashicorp/consul/types" ) // MockPreparedQuery is a fake endpoint that we inject into the Consul server @@ -87,9 +88,10 @@ func TestPreparedQuery_Create(t *testing.T) { NearestN: 4, Datacenters: []string{"dc1", "dc2"}, }, - OnlyPassing: true, - Tags: []string{"foo", "bar"}, - NodeMeta: map[string]string{"somekey": "somevalue"}, + IgnoreCheckIDs: []types.CheckID{"broken_check"}, + OnlyPassing: true, + Tags: []string{"foo", "bar"}, + NodeMeta: map[string]string{"somekey": "somevalue"}, }, DNS: structs.QueryDNSOptions{ TTL: "10s", @@ -122,9 +124,10 @@ func TestPreparedQuery_Create(t *testing.T) { "NearestN": 4, "Datacenters": []string{"dc1", "dc2"}, }, - "OnlyPassing": true, - "Tags": []string{"foo", "bar"}, - "NodeMeta": map[string]string{"somekey": "somevalue"}, + "IgnoreCheckIDs": []string{"broken_check"}, + "OnlyPassing": true, + "Tags": []string{"foo", "bar"}, + "NodeMeta": map[string]string{"somekey": "somevalue"}, }, "DNS": map[string]interface{}{ "TTL": "10s", diff --git a/agent/structs/prepared_query.go b/agent/structs/prepared_query.go index 7967b85397..8171aaefe6 100644 --- a/agent/structs/prepared_query.go +++ b/agent/structs/prepared_query.go @@ -1,5 +1,7 @@ package structs +import "github.com/hashicorp/consul/types" + // QueryDatacenterOptions sets options about how we fail over if there are no // healthy nodes in the local datacenter. type QueryDatacenterOptions struct { @@ -34,6 +36,12 @@ type ServiceQuery struct { // discarded) OnlyPassing bool + // IgnoreCheckIDs is an optional list of health check IDs to ignore when + // considering which nodes are healthy. It is useful as an emergency measure + // to temporarily override some health check that is producing false negatives + // for example. + IgnoreCheckIDs []types.CheckID + // Near allows the query to always prefer the node nearest the given // node. If the node does not exist, results are returned in their // normal randomly-shuffled order. Supplying the magic "_agent" value diff --git a/agent/structs/structs.go b/agent/structs/structs.go index 9661e5ac1d..af4e761ce3 100644 --- a/agent/structs/structs.go +++ b/agent/structs/structs.go @@ -580,16 +580,33 @@ func (nodes CheckServiceNodes) Shuffle() { // check if that option is selected). Note that this returns the filtered // results AND modifies the receiver for performance. func (nodes CheckServiceNodes) Filter(onlyPassing bool) CheckServiceNodes { + return nodes.FilterIgnore(onlyPassing, nil) +} + +// FilterIgnore removes nodes that are failing health checks just like Filter. +// It also ignores the status of any check with an ID present in ignoreCheckIDs +// as if that check didn't exist. Note that this returns the filtered results +// AND modifies the receiver for performance. +func (nodes CheckServiceNodes) FilterIgnore(onlyPassing bool, + ignoreCheckIDs []types.CheckID) CheckServiceNodes { n := len(nodes) OUTER: for i := 0; i < n; i++ { node := nodes[i] + INNER: for _, check := range node.Checks { + for _, ignore := range ignoreCheckIDs { + if check.CheckID == ignore { + // Skip this _check_ but keep looking at other checks for this node. + continue INNER + } + } if check.Status == api.HealthCritical || (onlyPassing && check.Status != api.HealthPassing) { nodes[i], nodes[n-1] = nodes[n-1], CheckServiceNode{} n-- i-- + // Skip this _node_ now we've swapped it off the end of the list. continue OUTER } } diff --git a/agent/structs/structs_test.go b/agent/structs/structs_test.go index e1cb8ed8a9..dcb8e0c4ef 100644 --- a/agent/structs/structs_test.go +++ b/agent/structs/structs_test.go @@ -441,6 +441,20 @@ func TestStructs_CheckServiceNodes_Filter(t *testing.T) { }, }, }, + CheckServiceNode{ + Node: &Node{ + Node: "node4", + Address: "127.0.0.4", + }, + Checks: HealthChecks{ + // This check has a different ID to the others to ensure it is not + // ignored by accident + &HealthCheck{ + CheckID: "failing2", + Status: api.HealthCritical, + }, + }, + }, } // Test the case where warnings are allowed. @@ -473,6 +487,26 @@ func TestStructs_CheckServiceNodes_Filter(t *testing.T) { t.Fatalf("bad: %v", filtered) } } + + // Allow failing checks to be ignored (note that the test checks have empty + // CheckID which is valid). + { + twiddle := make(CheckServiceNodes, len(nodes)) + if n := copy(twiddle, nodes); n != len(nodes) { + t.Fatalf("bad: %d", n) + } + filtered := twiddle.FilterIgnore(true, []types.CheckID{""}) + expected := CheckServiceNodes{ + nodes[0], + nodes[1], + nodes[2], // Node 3's critical check should be ignored. + // Node 4 should still be failing since it's got a critical check with a + // non-ignored ID. + } + if !reflect.DeepEqual(filtered, expected) { + t.Fatalf("bad: %v", filtered) + } + } } func TestStructs_DirEntry_Clone(t *testing.T) { diff --git a/api/prepared_query.go b/api/prepared_query.go index 9020b720e2..d322dd8679 100644 --- a/api/prepared_query.go +++ b/api/prepared_query.go @@ -34,6 +34,12 @@ type ServiceQuery struct { // local datacenter. Failover QueryDatacenterOptions + // IgnoreCheckIDs is an optional list of health check IDs to ignore when + // considering which nodes are healthy. It is useful as an emergency measure + // to temporarily override some health check that is producing false negatives + // for example. + IgnoreCheckIDs []string + // If OnlyPassing is true then we will only include nodes with passing // health checks (critical AND warning checks will cause a node to be // discarded) diff --git a/api/prepared_query_test.go b/api/prepared_query_test.go index f06218001f..1db2ce4bda 100644 --- a/api/prepared_query_test.go +++ b/api/prepared_query_test.go @@ -116,6 +116,53 @@ func TestAPI_PreparedQuery(t *testing.T) { t.Fatalf("bad datacenter: %v", results) } + // Add new node with failing health check. + reg2 := reg + reg2.Node = "failingnode" + reg2.Check = &AgentCheck{ + Node: "failingnode", + ServiceID: "redis1", + ServiceName: "redis", + Name: "failingcheck", + Status: "critical", + } + retry.Run(t, func(r *retry.R) { + if _, err := catalog.Register(reg2, nil); err != nil { + r.Fatal(err) + } + if _, _, err := catalog.Node("failingnode", nil); err != nil { + r.Fatal(err) + } + }) + + // Execute by ID. Should return only healthy node. + results, _, err = query.Execute(def.ID, nil) + if err != nil { + t.Fatalf("err: %s", err) + } + if len(results.Nodes) != 1 || results.Nodes[0].Node.Node != "foobar" { + t.Fatalf("bad: %v", results) + } + if wan, ok := results.Nodes[0].Node.TaggedAddresses["wan"]; !ok || wan != "127.0.0.1" { + t.Fatalf("bad: %v", results) + } + + // Update PQ with ignore rule for the failing check + def.Service.IgnoreCheckIDs = []string{"failingcheck"} + _, err = query.Update(def, nil) + if err != nil { + t.Fatalf("err: %s", err) + } + + // Execute by ID. Should return BOTH nodes ignoring the failing check. + results, _, err = query.Execute(def.ID, nil) + if err != nil { + t.Fatalf("err: %s", err) + } + if len(results.Nodes) != 2 { + t.Fatalf("got %d nodes, want 2", len(results.Nodes)) + } + // Delete it. _, err = query.Delete(def.ID, nil) if err != nil { diff --git a/website/source/api/query.html.md b/website/source/api/query.html.md index 586244fcf7..bc9f3fa099 100644 --- a/website/source/api/query.html.md +++ b/website/source/api/query.html.md @@ -25,7 +25,7 @@ section for more details about how prepared queries work with Consul's ACL syste ### Prepared Query Templates Consul 0.6.4 and later support prepared query templates. These are created -similar to static templates, except with some additional fields and features. +similar to static queries, except with some additional fields and features. Here is an example prepared query template: ```json @@ -206,6 +206,13 @@ The table below shows this endpoint's support for failover, even if it is selected by both `NearestN` and is listed in `Datacenters`. + - `IgnoreCheckIDs` `(array: nil)` - Specifies a list of check IDs that + should be ignored when filtering unhealthy instances. This is mostly useful + in an emergency or as a temporary measure when a health check is found to be + unreliable. Being able to ignore it in centrally-defined queries can be + simpler than de-registering the check as an interim solution until the check + can be fixed. + - `OnlyPassing` `(bool: false)` - Specifies the behavior of the query's health check filtering. If this is set to false, the results will include nodes with checks in the passing as well as the warning states. If this is set to