diff --git a/command/agent/command.go b/command/agent/command.go index 3f21ae2730..73acf307e2 100644 --- a/command/agent/command.go +++ b/command/agent/command.go @@ -189,6 +189,11 @@ func (c *Command) readConfig() *Config { } } + // Verify DNS settings + if config.DNSConfig.UDPAnswerLimit < 1 { + c.Ui.Error(fmt.Sprintf("dns_config.udp_answer_limit %d too low, must always be greater than zero", config.DNSConfig.UDPAnswerLimit)) + } + if config.EncryptKey != "" { if _, err := config.EncryptBytes(); err != nil { c.Ui.Error(fmt.Sprintf("Invalid encryption key: %s", err)) diff --git a/command/agent/config.go b/command/agent/config.go index 6d4ebf2bd8..eb3e659b24 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -77,6 +77,21 @@ type DNSConfig struct { // returned by default for UDP. EnableTruncate bool `mapstructure:"enable_truncate"` + // UDPAnswerLimit is used to limit the maximum number of DNS Resource + // Records returned in the ANSWER section of a DNS response. This is + // not normally useful and will be limited based on the querying + // protocol, however systems that implemented §6 Rule 9 in RFC3484 + // may want to set this to `1` in order to subvert §6 Rule 9 and + // re-obtain the effect of randomized resource records (i.e. each + // answer contains only one IP, but the IP changes every request). + // RFC3484 sorts answers in a deterministic order, which defeats the + // purpose of randomized DNS responses. This RFC has been obsoleted + // by RFC6724 and restores the desired behavior of randomized + // responses, however a large number of Linux hosts using glibc(3) + // implemented §6 Rule 9 and may need this option (e.g. CentOS 5-6, + // Debian Squeeze, etc). + UDPAnswerLimit int `mapstructure:"udp_answer_limit"` + // MaxStale is used to bound how stale of a result is // accepted for a DNS lookup. This can be used with // AllowStale to limit how old of a value is served up. @@ -527,7 +542,8 @@ func DefaultConfig() *Config { Server: 8300, }, DNSConfig: DNSConfig{ - MaxStale: 5 * time.Second, + UDPAnswerLimit: 3, + MaxStale: 5 * time.Second, }, Telemetry: Telemetry{ StatsitePrefix: "consul", @@ -1127,6 +1143,9 @@ func MergeConfig(a, b *Config) *Config { if b.DNSConfig.AllowStale { result.DNSConfig.AllowStale = true } + if b.DNSConfig.UDPAnswerLimit != 0 { + result.DNSConfig.UDPAnswerLimit = b.DNSConfig.UDPAnswerLimit + } if b.DNSConfig.EnableTruncate { result.DNSConfig.EnableTruncate = true } diff --git a/command/agent/config_test.go b/command/agent/config_test.go index 6e0c446b2b..e78736702e 100644 --- a/command/agent/config_test.go +++ b/command/agent/config_test.go @@ -521,19 +521,28 @@ func TestDecodeConfig(t *testing.T) { } // DNS node ttl, max stale - input = `{"dns_config": {"node_ttl": "5s", "max_stale": "15s", "allow_stale": true}}` + input = `{"dns_config": {"allow_stale": true, "enable_truncate": false, "max_stale": "15s", "node_ttl": "5s", "only_passing": true, "udp_answer_limit": 6}}` config, err = DecodeConfig(bytes.NewReader([]byte(input))) if err != nil { t.Fatalf("err: %s", err) } - if config.DNSConfig.NodeTTL != 5*time.Second { + if !config.DNSConfig.AllowStale { + t.Fatalf("bad: %#v", config) + } + if config.DNSConfig.EnableTruncate { t.Fatalf("bad: %#v", config) } if config.DNSConfig.MaxStale != 15*time.Second { t.Fatalf("bad: %#v", config) } - if !config.DNSConfig.AllowStale { + if config.DNSConfig.NodeTTL != 5*time.Second { + t.Fatalf("bad: %#v", config) + } + if !config.DNSConfig.OnlyPassing { + t.Fatalf("bad: %#v", config) + } + if config.DNSConfig.UDPAnswerLimit != 6 { t.Fatalf("bad: %#v", config) } @@ -1252,13 +1261,14 @@ func TestMergeConfig(t *testing.T) { DataDir: "/tmp/bar", DNSRecursors: []string{"127.0.0.2:1001"}, DNSConfig: DNSConfig{ - NodeTTL: 10 * time.Second, + AllowStale: false, + EnableTruncate: true, + MaxStale: 30 * time.Second, + NodeTTL: 10 * time.Second, ServiceTTL: map[string]time.Duration{ "api": 10 * time.Second, }, - AllowStale: true, - MaxStale: 30 * time.Second, - EnableTruncate: true, + UDPAnswerLimit: 4, }, Domain: "other", LogLevel: "info", diff --git a/command/agent/dns.go b/command/agent/dns.go index 409fca710c..2a8d8dd5c3 100644 --- a/command/agent/dns.go +++ b/command/agent/dns.go @@ -12,12 +12,17 @@ import ( "github.com/armon/go-metrics" "github.com/hashicorp/consul/consul" "github.com/hashicorp/consul/consul/structs" + "github.com/hashicorp/consul/lib" "github.com/miekg/dns" ) const ( - maxServiceResponses = 3 // For UDP only - maxRecurseRecords = 5 + // UDP can fit ~25 A records in a 512B response, and ~14 AAAA + // records. Limit further to prevent unintentional configuration + // abuse that would have a negative effect on application response + // times. + maxUDPAnswerLimit = 8 + maxRecurseRecords = 5 ) // DNSServer is used to wrap an Agent and expose various @@ -487,15 +492,16 @@ func (d *DNSServer) formatNodeRecord(node *structs.Node, addr, qName string, qTy return records } -// trimAnswers makes sure a UDP response is not longer than allowed by RFC 1035. -// We first enforce an arbitrary limit, and then make sure the response doesn't -// exceed 512 bytes. -func trimAnswers(resp *dns.Msg) (trimmed bool) { +// trimUDPAnswers makes sure a UDP response is not longer than allowed by RFC +// 1035. Enforce an arbitrary limit that can be further ratcheted down by +// config, and then make sure the response doesn't exceed 512 bytes. +func trimUDPAnswers(config *DNSConfig, resp *dns.Msg) (trimmed bool) { numAnswers := len(resp.Answer) // This cuts UDP responses to a useful but limited number of responses. - if numAnswers > maxServiceResponses { - resp.Answer = resp.Answer[:maxServiceResponses] + maxAnswers := lib.MinInt(maxUDPAnswerLimit, config.UDPAnswerLimit) + if numAnswers > maxAnswers { + resp.Answer = resp.Answer[:maxAnswers] } // This enforces the hard limit of 512 bytes per the RFC. @@ -567,7 +573,7 @@ RPC: // If the network is not TCP, restrict the number of responses if network != "tcp" { - wasTrimmed := trimAnswers(resp) + wasTrimmed := trimUDPAnswers(d.config, resp) // Flag that there are more records to return in the UDP response if wasTrimmed && d.config.EnableTruncate { @@ -599,7 +605,7 @@ func (d *DNSServer) preparedQueryLookup(network, datacenter, query string, req, // match the previous behavior. We can optimize by pushing more filtering // into the query execution, but for now I think we need to get the full // response. We could also choose a large arbitrary number that will - // likely work in practice, like 10*maxServiceResponses which should help + // likely work in practice, like 10*maxUDPAnswerLimit which should help // reduce bandwidth if there are thousands of nodes available. endpoint := d.agent.getEndpoint(preparedQueryEndpoint) @@ -662,7 +668,7 @@ RPC: // If the network is not TCP, restrict the number of responses. if network != "tcp" { - wasTrimmed := trimAnswers(resp) + wasTrimmed := trimUDPAnswers(d.config, resp) // Flag that there are more records to return in the UDP response if wasTrimmed && d.config.EnableTruncate { @@ -682,6 +688,7 @@ func (d *DNSServer) serviceNodeRecords(dc string, nodes structs.CheckServiceNode qName := req.Question[0].Name qType := req.Question[0].Qtype handled := make(map[string]struct{}) + for _, node := range nodes { // Start with the translated address but use the service address, // if specified. diff --git a/command/agent/dns_test.go b/command/agent/dns_test.go index 94cc1e14d9..658b26b5c2 100644 --- a/command/agent/dns_test.go +++ b/command/agent/dns_test.go @@ -2,6 +2,7 @@ package agent import ( "fmt" + "math/rand" "net" "os" "strings" @@ -13,6 +14,18 @@ import ( "github.com/miekg/dns" ) +const ( + configUDPAnswerLimit = 4 + defaultNumUDPResponses = 3 + testUDPTruncateLimit = 8 + + pctNodesWithIPv6 = 0.5 + + // generateNumNodes is the upper bounds for the number of hosts used + // in testing below. Generate an arbitrarily large number of hosts. + generateNumNodes = testUDPTruncateLimit * defaultNumUDPResponses * configUDPAnswerLimit +) + func makeDNSServer(t *testing.T) (string, *DNSServer) { return makeDNSServerConfig(t, nil, nil) } @@ -26,7 +39,7 @@ func makeDNSServerConfig( if agentFn != nil { agentFn(agentConf) } - dnsConf := &DNSConfig{} + dnsConf := &DefaultConfig().DNSConfig if dnsFn != nil { dnsFn(dnsConf) } @@ -1808,8 +1821,8 @@ func TestDNS_ServiceLookup_Randomize(t *testing.T) { testutil.WaitForLeader(t, srv.agent.RPC, "dc1") - // Register a large set of nodes. - for i := 0; i < 3*maxServiceResponses; i++ { + // Register a large number of nodes. + for i := 0; i < generateNumNodes; i++ { args := &structs.RegisterRequest{ Datacenter: "dc1", Node: fmt.Sprintf("foo%d", i), @@ -1856,7 +1869,7 @@ func TestDNS_ServiceLookup_Randomize(t *testing.T) { m := new(dns.Msg) m.SetQuestion(question, dns.TypeANY) - c := new(dns.Client) + c := &dns.Client{Net: "udp"} in, _, err := c.Exchange(m, addr.String()) if err != nil { t.Fatalf("err: %v", err) @@ -1864,7 +1877,7 @@ func TestDNS_ServiceLookup_Randomize(t *testing.T) { // Response length should be truncated and we should get // an A record for each response. - if len(in.Answer) != maxServiceResponses { + if len(in.Answer) != defaultNumUDPResponses { t.Fatalf("Bad: %#v", len(in.Answer)) } @@ -1902,8 +1915,8 @@ func TestDNS_ServiceLookup_Truncate(t *testing.T) { testutil.WaitForLeader(t, srv.agent.RPC, "dc1") - // Register nodes a large number of nodes. - for i := 0; i < 3*maxServiceResponses; i++ { + // Register a large number of nodes. + for i := 0; i < generateNumNodes; i++ { args := &structs.RegisterRequest{ Datacenter: "dc1", Node: fmt.Sprintf("foo%d", i), @@ -2043,17 +2056,19 @@ func TestDNS_ServiceLookup_LargeResponses(t *testing.T) { } } -func TestDNS_ServiceLookup_MaxResponses(t *testing.T) { - dir, srv := makeDNSServer(t) +func testDNS_ServiceLookup_responseLimits(t *testing.T, answerLimit int, qType uint16, + expectedService, expectedQuery, expectedQueryID int) (bool, error) { + dir, srv := makeDNSServerConfig(t, nil, func(c *DNSConfig) { + c.UDPAnswerLimit = answerLimit + }) defer os.RemoveAll(dir) defer srv.agent.Shutdown() testutil.WaitForLeader(t, srv.agent.RPC, "dc1") - // Register a large number of nodes. - for i := 0; i < 6*maxServiceResponses; i++ { + for i := 0; i < generateNumNodes; i++ { nodeAddress := fmt.Sprintf("127.0.0.%d", i+1) - if i > 3 { + if rand.Float64() < pctNodesWithIPv6 { nodeAddress = fmt.Sprintf("fe80::%d", i+1) } args := &structs.RegisterRequest{ @@ -2061,72 +2076,115 @@ func TestDNS_ServiceLookup_MaxResponses(t *testing.T) { Node: fmt.Sprintf("foo%d", i), Address: nodeAddress, Service: &structs.NodeService{ - Service: "web", - Port: 8000, + Service: "api-tier", + Port: 8080, }, } var out struct{} if err := srv.agent.RPC("Catalog.Register", args, &out); err != nil { - t.Fatalf("err: %v", err) + return false, fmt.Errorf("err: %v", err) } } - - // Register an equivalent prepared query. var id string { args := &structs.PreparedQueryRequest{ Datacenter: "dc1", Op: structs.PreparedQueryCreate, Query: &structs.PreparedQuery{ + Name: "api-tier", Service: structs.ServiceQuery{ - Service: "web", + Service: "api-tier", }, }, } + if err := srv.agent.RPC("PreparedQuery.Apply", args, &id); err != nil { - t.Fatalf("err: %v", err) + return false, fmt.Errorf("err: %v", err) } } // Look up the service directly and via prepared query. questions := []string{ - "web.service.consul.", + "api-tier.service.consul.", + "api-tier.query.consul.", id + ".query.consul.", } - for _, question := range questions { + for idx, question := range questions { m := new(dns.Msg) - m.SetQuestion(question, dns.TypeANY) + m.SetQuestion(question, qType) addr, _ := srv.agent.config.ClientListener("", srv.agent.config.Ports.DNS) - c := new(dns.Client) + c := &dns.Client{Net: "udp"} in, _, err := c.Exchange(m, addr.String()) if err != nil { - t.Fatalf("err: %v", err) + return false, fmt.Errorf("err: %v", err) } - if len(in.Answer) != 3 { - t.Fatalf("should receive 3 answers for ANY") + switch idx { + case 0: + if len(in.Answer) != expectedService { + return false, fmt.Errorf("%d/%d answers received for type %v for %s", len(in.Answer), answerLimit, question) + } + case 1: + if len(in.Answer) != expectedQuery { + return false, fmt.Errorf("%d/%d answers received for type %v for %s", len(in.Answer), answerLimit, question) + } + case 2: + if len(in.Answer) != expectedQueryID { + return false, fmt.Errorf("%d/%d answers received for type %v for %s", len(in.Answer), answerLimit, question) + } + default: + panic("abort") + } + } + + return true, nil +} + +func TestDNS_ServiceLookup_AnswerLimits(t *testing.T) { + // Build a matrix of config parameters (udpAnswerLimit), and the + // length of the response per query type and question. + tests := []struct { + name string + udpAnswerLimit int + expectedAService int + expectedAQuery int + expectedAQueryID int + expectedAAAAService int + expectedAAAAQuery int + expectedAAAAQueryID int + expectedANYService int + expectedANYQuery int + expectedANYQueryID int + }{ + {"0", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {"1", 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {"2", 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, + {"3", 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, + {"4", 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}, + {"5", 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, + {"6", 6, 6, 6, 6, 6, 6, 5, 6, 6, 6}, + {"7", 7, 7, 7, 6, 7, 7, 5, 7, 7, 6}, + {"8", 8, 8, 8, 6, 8, 8, 5, 8, 8, 6}, + {"9", 9, 8, 8, 6, 8, 8, 5, 8, 8, 6}, + {"20", 20, 8, 8, 6, 8, 8, 5, 8, 8, 6}, + {"30", 30, 8, 8, 6, 8, 8, 5, 8, 8, 6}, + } + for _, test := range tests { + ok, err := testDNS_ServiceLookup_responseLimits(t, test.udpAnswerLimit, dns.TypeA, test.expectedAService, test.expectedAQuery, test.expectedAQueryID) + if !ok { + t.Errorf("Expected service A lookup %d to pass: %v", test.name, err) } - m.SetQuestion(question, dns.TypeA) - in, _, err = c.Exchange(m, addr.String()) - if err != nil { - t.Fatalf("err: %v", err) + ok, err = testDNS_ServiceLookup_responseLimits(t, test.udpAnswerLimit, dns.TypeAAAA, test.expectedAAAAService, test.expectedAAAAQuery, test.expectedAAAAQueryID) + if !ok { + t.Errorf("Expected service AAAA lookup %d to pass: %v", test.name, err) } - if len(in.Answer) != 3 { - t.Fatalf("should receive 3 answers for A") - } - - m.SetQuestion(question, dns.TypeAAAA) - in, _, err = c.Exchange(m, addr.String()) - if err != nil { - t.Fatalf("err: %v", err) - } - - if len(in.Answer) != 3 { - t.Fatalf("should receive 3 answers for AAAA") + ok, err = testDNS_ServiceLookup_responseLimits(t, test.udpAnswerLimit, dns.TypeANY, test.expectedANYService, test.expectedANYQuery, test.expectedANYQueryID) + if !ok { + t.Errorf("Expected service ANY lookup %d to %v: %v", test.name, err) } } } diff --git a/lib/math.go b/lib/math.go new file mode 100644 index 0000000000..36b483c64e --- /dev/null +++ b/lib/math.go @@ -0,0 +1,15 @@ +package lib + +func MaxInt(a, b int) int { + if a > b { + return a + } + return b +} + +func MinInt(a, b int) int { + if a > b { + return b + } + return a +} diff --git a/lib/math_test.go b/lib/math_test.go new file mode 100644 index 0000000000..fdb963ecde --- /dev/null +++ b/lib/math_test.go @@ -0,0 +1,29 @@ +package lib_test + +import ( + "testing" + + "github.com/hashicorp/consul/lib" +) + +func TestMathMaxInt(t *testing.T) { + tests := [][3]int{{1, 2, 2}, {-1, 1, 1}, {2, 0, 2}} + for i, _ := range tests { + expected := tests[i][2] + actual := lib.MaxInt(tests[i][0], tests[i][1]) + if expected != actual { + t.Fatalf("in iteration %d expected %d, got %d", i, expected, actual) + } + } +} + +func TestMathMinInt(t *testing.T) { + tests := [][3]int{{1, 2, 1}, {-1, 1, -1}, {2, 0, 0}} + for i, _ := range tests { + expected := tests[i][2] + actual := lib.MinInt(tests[i][0], tests[i][1]) + if expected != actual { + t.Fatalf("in iteration %d expected %d, got %d", i, expected, actual) + } + } +} diff --git a/website/source/docs/agent/options.html.markdown b/website/source/docs/agent/options.html.markdown index ddd19b20db..2b8c6b81b5 100644 --- a/website/source/docs/agent/options.html.markdown +++ b/website/source/docs/agent/options.html.markdown @@ -499,12 +499,27 @@ Consul will not enable TLS for the HTTP API unless the `https` port has been ass using TCP to get the full set of records. * `only_passing` - When set to true, the default, the querying agent will only receive node - or service addresses for healthy services. A healthy service is defined - as a service with one or more healthchecks and all defined healthchecks - for the service are in a passing or warning state (i.e. not - critical). Set to false to have the querying agent include all node and - service addresses regardless of the health of the service. + When set to true, the default, the querying agent will only receive node + or service addresses for healthy services. A healthy service is defined + as a service with one or more healthchecks and all defined healthchecks + for the service are in a passing or warning state (i.e. not + critical). Set to false to have the querying agent include all node and + service addresses regardless of the health of the service. + + * `udp_answer_limit` - Limit the number of + resource records contained in the answer section of a UDP-based DNS + response. When answering a question, Consul will use the complete list of + matching hosts, shuffle the list randomly, and then limit the number of + answers to `udp_answer_limit` (default `3`). In environments where + [RFC 3484 Section 6](https://tools.ietf.org/html/rfc3484#section-6) Rule 9 + is implemented and enforced (i.e. DNS answers are always sorted and + therefore never random), clients may need to set this value to `1` to + preserve the expected randomized distribution behavior (note: + [https://tools.ietf.org/html/rfc3484](RFC 3484) has been obsoleted by + [RFC 6724](https://tools.ietf.org/html/rfc6724) and as a result it should + be increasingly uncommon to need to change this value with modern + resolvers). * `domain` Equivalent to the [`-domain` command-line flag](#_domain).