fix(dns): bug with standard lookup tags not working; SRV questions returning duplicate hostnames (#21361)

This commit is contained in:
Dan Stough 2024-06-25 13:42:25 -04:00 committed by GitHub
parent a04cc5aeae
commit a4a3aec567
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 400 additions and 32 deletions

8
.changelog/21361.txt Normal file
View File

@ -0,0 +1,8 @@
```release-note:bug
dns: Fix a regression where DNS tags using the standard lookup syntax, `tag.name.service.consul`, were being disregarded.
```
```release-note:bug
dns: Fix a regression where DNS SRV questions were returning duplicate hostnames instead of encoded IPs.
This affected Nomad integrations with Consul.
```

View File

@ -106,7 +106,10 @@ func buildQueryFromDNSMessage(req *dns.Msg, reqCtx Context, domain, altDomain st
return nil, err
}
name, tag := getQueryNameAndTagFromParts(queryType, queryParts)
name, tag, err := getQueryNameAndTagFromParts(queryType, queryParts)
if err != nil {
return nil, err
}
portName := parsePort(queryParts)
@ -157,14 +160,28 @@ func buildAddressResults(req *dns.Msg) ([]*discovery.Result, error) {
}
// getQueryNameAndTagFromParts returns the query name and tag from the query parts that are taken from the original dns question.
func getQueryNameAndTagFromParts(queryType discovery.QueryType, queryParts []string) (string, string) {
//
// Valid Query Parts:
// [<tag>.]<service>
// [<port>.port.]<service>
// _<service>._<tag> // RFC 2782 style
func getQueryNameAndTagFromParts(queryType discovery.QueryType, queryParts []string) (string, string, error) {
n := len(queryParts)
if n == 0 {
return "", ""
return "", "", errInvalidQuestion
}
switch queryType {
case discovery.QueryTypeService:
if n > 3 {
// Having this many fields is never valid.
return "", "", errInvalidQuestion
}
if n == 3 && queryParts[n-2] != "port" {
// This probably means that someone was trying to use a tag name with a period.
// This was deprecated in Consul 0.3.
return "", "", errInvalidQuestion
}
// Support RFC 2782 style syntax
if n == 2 && strings.HasPrefix(queryParts[1], "_") && strings.HasPrefix(queryParts[0], "_") {
// Grab the tag since we make nuke it if it's tcp
@ -177,9 +194,14 @@ func getQueryNameAndTagFromParts(queryType discovery.QueryType, queryParts []str
name := queryParts[0][1:]
// _name._tag.service.consul
return name, tag
return name, tag, nil
}
return queryParts[n-1], ""
// Standard-style lookup w/ tag
if n == 2 {
return queryParts[1], queryParts[0], nil
}
// This works for the v1 and v2 catalog queries, even if a port name was specified.
return queryParts[n-1], "", nil
case discovery.QueryTypePreparedQuery:
name := ""
@ -197,9 +219,17 @@ func getQueryNameAndTagFromParts(queryType discovery.QueryType, queryParts []str
// Allow a "." in the query name, just join all the parts.
name = strings.Join(queryParts, ".")
}
return name, ""
if name == "" {
return "", "", errInvalidQuestion
}
return name, "", nil
}
return queryParts[n-1], ""
name := queryParts[n-1]
if name == "" {
return "", "", errInvalidQuestion
}
return queryParts[n-1], "", nil
}
// getQueryTenancy returns a discovery.QueryTenancy from a DNS message.

View File

@ -19,6 +19,7 @@ type testCaseBuildQueryFromDNSMessage struct {
request *dns.Msg
requestContext *Context
expectedQuery *discovery.Query
expectedError string
}
// Test_buildQueryFromDNSMessage tests the buildQueryFromDNSMessage function.
@ -123,6 +124,114 @@ func Test_buildQueryFromDNSMessage(t *testing.T) {
},
},
},
// V1 Service Queries
{
name: "test A 'service.' standard query with tag",
request: &dns.Msg{
MsgHdr: dns.MsgHdr{
Opcode: dns.OpcodeQuery,
},
Question: []dns.Question{
{
Name: "primary.db.service.dc1.consul", // "intentionally missing the trailing dot"
Qtype: dns.TypeA,
Qclass: dns.ClassINET,
},
},
},
expectedQuery: &discovery.Query{
QueryType: discovery.QueryTypeService,
QueryPayload: discovery.QueryPayload{
Name: "db",
Tag: "primary",
Tenancy: discovery.QueryTenancy{
Datacenter: "dc1",
},
},
},
},
{
name: "test A 'service.' RFC 2782 query with tag",
request: &dns.Msg{
MsgHdr: dns.MsgHdr{
Opcode: dns.OpcodeQuery,
},
Question: []dns.Question{
{
Name: "_db._primary.service.dc1.consul", // "intentionally missing the trailing dot"
Qtype: dns.TypeA,
Qclass: dns.ClassINET,
},
},
},
expectedQuery: &discovery.Query{
QueryType: discovery.QueryTypeService,
QueryPayload: discovery.QueryPayload{
Name: "db",
Tag: "primary",
Tenancy: discovery.QueryTenancy{
Datacenter: "dc1",
},
},
},
},
{
name: "test A 'service.' RFC 2782 query",
request: &dns.Msg{
MsgHdr: dns.MsgHdr{
Opcode: dns.OpcodeQuery,
},
Question: []dns.Question{
{
Name: "_db._tcp.service.dc1.consul", // the `tcp` tag should be ignored
Qtype: dns.TypeA,
Qclass: dns.ClassINET,
},
},
},
expectedQuery: &discovery.Query{
QueryType: discovery.QueryTypeService,
QueryPayload: discovery.QueryPayload{
Name: "db",
Tenancy: discovery.QueryTenancy{
Datacenter: "dc1",
},
},
},
},
{
name: "test A 'service.' with too many query parts (RFC 2782 style)",
request: &dns.Msg{
MsgHdr: dns.MsgHdr{
Opcode: dns.OpcodeQuery,
},
Question: []dns.Question{
{
Name: "nope._db._tcp.service.dc1.consul", // the `tcp` tag should be ignored
Qtype: dns.TypeA,
Qclass: dns.ClassINET,
},
},
},
expectedError: "invalid question",
},
{
name: "test A 'service.' with too many query parts (standard style)",
request: &dns.Msg{
MsgHdr: dns.MsgHdr{
Opcode: dns.OpcodeQuery,
},
Question: []dns.Question{
{
Name: "too.many.parts.service.dc1.consul.",
Qtype: dns.TypeA,
Qclass: dns.ClassINET,
},
},
},
expectedError: "invalid question",
},
// V2 Catalog Queries
{
name: "test A 'workload.'",
request: &dns.Msg{
@ -213,6 +322,13 @@ func Test_buildQueryFromDNSMessage(t *testing.T) {
context = &Context{}
}
query, err := buildQueryFromDNSMessage(tc.request, *context, "consul.", ".", nil)
if tc.expectedError != "" {
require.Error(t, err)
assert.Contains(t, err.Error(), tc.expectedError)
return
}
require.NoError(t, err)
assert.Equal(t, tc.expectedQuery, query)
})

View File

@ -235,11 +235,7 @@ func (d messageSerializer) getAnswerExtraAndNs(opts *getAnswerExtraAndNsOptions)
ns = append(ns, opts.dnsRecordMaker.makeNS(opts.responseDomain, fqdn, opts.ttl))
extra = append(extra, extraRecord)
case qType == dns.TypeSRV:
// We put A/AAAA/CNAME records in the additional section for SRV requests
a, e := d.getAnswerExtrasForAddressAndTarget(nodeAddress, serviceAddress, opts)
answer = append(answer, a...)
extra = append(extra, e...)
fallthrough
default:
a, e := d.getAnswerExtrasForAddressAndTarget(nodeAddress, serviceAddress, opts)
answer = append(answer, a...)
@ -291,16 +287,14 @@ func (d messageSerializer) getAnswerExtrasForAddressAndTarget(nodeAddress *dnsAd
switch {
case (reqType == requestTypeAddress || opts.result.Type == discovery.ResultTypeVirtual) &&
serviceAddress.IsEmptyString() && nodeAddress.IsIP():
a, e := getAnswerExtrasForIP(qName, nodeAddress, opts.req.Question[0],
reqType, opts.result, opts.ttl, opts.responseDomain, &opts.port, opts.dnsRecordMaker)
a, e := getAnswerExtrasForIP(qName, nodeAddress, opts.req.Question[0], reqType, opts.result, opts.ttl, opts.responseDomain, &opts.port, opts.dnsRecordMaker, false)
answer = append(answer, a...)
extra = append(extra, e...)
case opts.result.Type == discovery.ResultTypeNode && nodeAddress.IsIP():
canonicalNodeName := canonicalNameForResult(opts.result.Type,
opts.result.Node.Name, opts.responseDomain, opts.result.Tenancy, opts.port.Name)
a, e := getAnswerExtrasForIP(canonicalNodeName, nodeAddress, opts.req.Question[0], reqType,
opts.result, opts.ttl, opts.responseDomain, &opts.port, opts.dnsRecordMaker)
a, e := getAnswerExtrasForIP(canonicalNodeName, nodeAddress, opts.req.Question[0], reqType, opts.result, opts.ttl, opts.responseDomain, &opts.port, opts.dnsRecordMaker, false)
answer = append(answer, a...)
extra = append(extra, e...)
@ -320,8 +314,7 @@ func (d messageSerializer) getAnswerExtrasForAddressAndTarget(nodeAddress *dnsAd
}
canonicalNodeName := canonicalNameForResult(resultType, opts.result.Node.Name,
opts.responseDomain, opts.result.Tenancy, opts.port.Name)
a, e := getAnswerExtrasForIP(canonicalNodeName, nodeAddress, opts.req.Question[0],
reqType, opts.result, opts.ttl, opts.responseDomain, &opts.port, opts.dnsRecordMaker)
a, e := getAnswerExtrasForIP(canonicalNodeName, nodeAddress, opts.req.Question[0], reqType, opts.result, opts.ttl, opts.responseDomain, &opts.port, opts.dnsRecordMaker, nodeAddress.String() == opts.result.Node.Address) // We compare the node address to the result to detect changes from the WAN translation
answer = append(answer, a...)
extra = append(extra, e...)
@ -331,12 +324,16 @@ func (d messageSerializer) getAnswerExtrasForAddressAndTarget(nodeAddress *dnsAd
answer = append(answer, a...)
extra = append(extra, e...)
case serviceAddress.IsIP() && opts.req.Question[0].Qtype == dns.TypeSRV:
a, e := getAnswerExtrasForIP(qName, serviceAddress, opts.req.Question[0], requestTypeName, opts.result, opts.ttl, opts.responseDomain, &opts.port, opts.dnsRecordMaker, false)
answer = append(answer, a...)
extra = append(extra, e...)
// The service address is an IP
case serviceAddress.IsIP():
canonicalServiceName := canonicalNameForResult(discovery.ResultTypeService,
opts.result.Service.Name, opts.responseDomain, opts.result.Tenancy, opts.port.Name)
a, e := getAnswerExtrasForIP(canonicalServiceName, serviceAddress,
opts.req.Question[0], reqType, opts.result, opts.ttl, opts.responseDomain, &opts.port, opts.dnsRecordMaker)
a, e := getAnswerExtrasForIP(canonicalServiceName, serviceAddress, opts.req.Question[0], reqType, opts.result, opts.ttl, opts.responseDomain, &opts.port, opts.dnsRecordMaker, false)
answer = append(answer, a...)
extra = append(extra, e...)
@ -345,8 +342,7 @@ func (d messageSerializer) getAnswerExtrasForAddressAndTarget(nodeAddress *dnsAd
case serviceAddress.FQDN() == opts.req.Question[0].Name && nodeAddress.IsIP():
canonicalNodeName := canonicalNameForResult(discovery.ResultTypeNode,
opts.result.Node.Name, opts.responseDomain, opts.result.Tenancy, opts.port.Name)
a, e := getAnswerExtrasForIP(canonicalNodeName, nodeAddress, opts.req.Question[0],
reqType, opts.result, opts.ttl, opts.responseDomain, &opts.port, opts.dnsRecordMaker)
a, e := getAnswerExtrasForIP(canonicalNodeName, nodeAddress, opts.req.Question[0], reqType, opts.result, opts.ttl, opts.responseDomain, &opts.port, opts.dnsRecordMaker, nodeAddress.String() == opts.result.Node.Address) // We compare the node address to the result to detect changes from the WAN translation
answer = append(answer, a...)
extra = append(extra, e...)
@ -463,9 +459,7 @@ func shouldAppendTXTRecord(query *discovery.Query, cfg *RouterDynamicConfig, req
}
// getAnswerExtrasForIP creates the dns answer and extra from IP dnsAddress pairs.
func getAnswerExtrasForIP(name string, addr *dnsAddress, question dns.Question,
reqType requestType, result *discovery.Result, ttl uint32, domain string,
port *discovery.Port, maker dnsRecordMaker) (answer []dns.RR, extra []dns.RR) {
func getAnswerExtrasForIP(name string, addr *dnsAddress, question dns.Question, reqType requestType, result *discovery.Result, ttl uint32, domain string, port *discovery.Port, maker dnsRecordMaker, addressOverridden bool) (answer []dns.RR, extra []dns.RR) {
qType := question.Qtype
canReturnARecord := qType == dns.TypeSRV || qType == dns.TypeA || qType == dns.TypeANY || qType == dns.TypeNS || qType == dns.TypeTXT
canReturnAAAARecord := qType == dns.TypeSRV || qType == dns.TypeAAAA || qType == dns.TypeANY || qType == dns.TypeNS || qType == dns.TypeTXT
@ -493,7 +487,8 @@ func getAnswerExtrasForIP(name string, addr *dnsAddress, question dns.Question,
}
if reqType != requestTypeAddress && qType == dns.TypeSRV {
if result.Type == discovery.ResultTypeService && addr.IsIP() && result.Node.Address != addr.String() {
if addr.IsIP() && question.Name == name && !addressOverridden {
// encode the ip to be used in the header of the A/AAAA record
// as well as the target of the SRV record.
recHdrName = encodeIPAsFqdn(result, addr.IP(), domain)

View File

@ -158,6 +158,79 @@ func Test_HandleRequest_ServiceQuestions(t *testing.T) {
},
},
},
{
name: "req type: service / question type: SRV / CNAME required: no - multiple results without Node address + tags",
request: &dns.Msg{
MsgHdr: dns.MsgHdr{
Opcode: dns.OpcodeQuery,
},
Question: []dns.Question{
{
Name: "tag.foo.service.consul.",
Qtype: dns.TypeSRV,
},
},
},
configureDataFetcher: func(fetcher discovery.CatalogDataFetcher) {
fetcher.(*discovery.MockCatalogDataFetcher).
On("FetchEndpoints", mock.Anything,
&discovery.QueryPayload{
Name: "foo",
Tenancy: discovery.QueryTenancy{},
Tag: "tag",
}, discovery.LookupTypeService).
Return([]*discovery.Result{
{
// This result emulates an allocation registration with Nomad.
// The node name is generated by Consul and shares the service IP
Type: discovery.ResultTypeService,
Service: &discovery.Location{Name: "foo", Address: "127.0.0.1"},
Node: &discovery.Location{Name: "Node-9e46a487-f5be-2f40-ad60-5a10e32237ed", Address: "127.0.0.1"},
Tenancy: discovery.ResultTenancy{
Datacenter: "dc1",
},
},
},
nil).On("ValidateRequest", mock.Anything,
mock.Anything).Return(nil).On("NormalizeRequest", mock.Anything)
},
response: &dns.Msg{
MsgHdr: dns.MsgHdr{
Response: true,
Authoritative: true,
},
Compress: true,
Question: []dns.Question{
{
Name: "tag.foo.service.consul.",
Qtype: dns.TypeSRV,
},
},
Answer: []dns.RR{
&dns.SRV{
Hdr: dns.RR_Header{
Name: "tag.foo.service.consul.",
Rrtype: dns.TypeSRV,
Class: dns.ClassINET,
Ttl: 123,
},
Target: "7f000001.addr.dc1.consul.",
Priority: 1,
},
},
Extra: []dns.RR{
&dns.A{
Hdr: dns.RR_Header{
Name: "7f000001.addr.dc1.consul.",
Rrtype: dns.TypeA,
Class: dns.ClassINET,
Ttl: 123,
},
A: net.ParseIP("127.0.0.1"),
},
},
},
},
}
for _, tc := range testCases {

View File

@ -462,7 +462,7 @@ func Test_HandleRequest_V2Services(t *testing.T) {
},
},
{
name: "SRV Query with a multi-port service that has workloads w/ hostnames (no recursor)",
name: "SRV Query with a multi-port service that has workloads w/ hostnames (with recursor)",
request: &dns.Msg{
MsgHdr: dns.MsgHdr{
Opcode: dns.OpcodeQuery,

View File

@ -381,6 +381,105 @@ func TestDNS_ServiceLookup(t *testing.T) {
}
}
// TestDNS_ServiceAddressWithTagLookup tests some specific cases that Nomad would exercise,
// Like registering a service w/o a Node. https://github.com/hashicorp/nomad/blob/1174019676ff3d65b39323eb0c7234fb1e09b80c/command/agent/consul/service_client.go#L1366-L1381
// Errors with this were reported in https://github.com/hashicorp/consul/issues/21325#issuecomment-2166845574
// Also we test that only one tag is valid in the URL.
func TestDNS_ServiceAddressWithTagLookup(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")
}
for name, experimentsHCL := range getVersionHCL(true) {
t.Run(name, func(t *testing.T) {
a := NewTestAgent(t, experimentsHCL)
defer a.Shutdown()
testrpc.WaitForLeader(t, a.RPC, "dc1")
{
// This emulates a Nomad service registration.
// Using an internal RPC for Catalog.Register will not trigger the same condition.
err := a.Client().Agent().ServiceRegister(&api.AgentServiceRegistration{
Kind: api.ServiceKindTypical,
ID: "db-1",
Name: "db",
Tags: []string{"primary"},
Address: "127.0.0.1",
Port: 12345,
Checks: make([]*api.AgentServiceCheck, 0),
})
require.NoError(t, err)
}
{
err := a.Client().Agent().ServiceRegister(&api.AgentServiceRegistration{
Kind: api.ServiceKindTypical,
ID: "db-2",
Name: "db",
Tags: []string{"secondary"},
Address: "127.0.0.2", // The address here has to be different, or the DNS server will dedupe it.
Port: 12345,
Checks: make([]*api.AgentServiceCheck, 0),
})
require.NoError(t, err)
}
// Query the service using a tag - this also checks that we're filtering correctly
questions := []string{
"_db._primary.service.dc1.consul.", // w/ RFC 2782 style syntax
"primary.db.service.dc1.consul.",
}
for _, question := range questions {
m := new(dns.Msg)
m.SetQuestion(question, dns.TypeSRV)
c := new(dns.Client)
in, _, err := c.Exchange(m, a.DNSAddr())
require.NoError(t, err)
require.Len(t, in.Answer, 1, "Expected only one result in the Answer section")
srvRec, ok := in.Answer[0].(*dns.SRV)
require.True(t, ok, "Expected an SRV record in the Answer section")
require.Equal(t, uint16(12345), srvRec.Port)
require.Equal(t, "7f000001.addr.dc1.consul.", srvRec.Target)
aRec, ok := in.Extra[0].(*dns.A)
require.True(t, ok, "Expected an A record in the Extra section")
require.Equal(t, "7f000001.addr.dc1.consul.", aRec.Hdr.Name)
require.Equal(t, "127.0.0.1", aRec.A.String())
if strings.Contains(question, "query") {
// The query should have the TTL associated with the query registration.
require.Equal(t, uint32(3), srvRec.Hdr.Ttl)
require.Equal(t, uint32(3), aRec.Hdr.Ttl)
} else {
require.Equal(t, uint32(0), srvRec.Hdr.Ttl)
require.Equal(t, uint32(0), aRec.Hdr.Ttl)
}
}
// Multiple tags are not supported in the legacy DNS server
questions = []string{
"banana._db._primary.service.dc1.consul.",
}
for _, question := range questions {
m := new(dns.Msg)
m.SetQuestion(question, dns.TypeSRV)
c := new(dns.Client)
in, _, err := c.Exchange(m, a.DNSAddr())
require.NoError(t, err)
require.Len(t, in.Answer, 0, "Expected no results in the Answer section")
// For v1dns, we combine the tags with a period, which results in NXDOMAIN
// For v2dns, we are also return NXDomain
// The reported issue says that v2dns this is returning valid results.
require.Equal(t, dns.RcodeNameError, in.Rcode)
}
})
}
}
func TestDNS_ServiceLookupWithInternalServiceAddress(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")
@ -1839,6 +1938,49 @@ func TestDNS_ServiceLookup_TagPeriod(t *testing.T) {
}
}
// TestDNS_ServiceLookup_ExtraTags tests tag behavior.
// With v1dns, we still support period tags, but if a tag is not found it's an NXDOMAIN code.
// With v2dns, we do not support period tags, so it's an NXDOMAIN code because the name is not valid.
func TestDNS_ServiceLookup_ExtraTags(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")
}
for name, experimentsHCL := range getVersionHCL(true) {
t.Run(name, func(t *testing.T) {
a := NewTestAgent(t, experimentsHCL)
defer a.Shutdown()
testrpc.WaitForLeader(t, a.RPC, "dc1")
// Register node
args := &structs.RegisterRequest{
Datacenter: "dc1",
Node: "foo",
Address: "127.0.0.1",
Service: &structs.NodeService{
Service: "db",
Tags: []string{"primary"},
Port: 12345,
},
}
var out struct{}
if err := a.RPC(context.Background(), "Catalog.Register", args, &out); err != nil {
t.Fatalf("err: %v", err)
}
m1 := new(dns.Msg)
m1.SetQuestion("dummy.primary.db.service.consul.", dns.TypeSRV)
c1 := new(dns.Client)
in, _, err := c1.Exchange(m1, a.DNSAddr())
require.NoError(t, err)
require.Len(t, in.Answer, 0, "Expected no answer")
require.Equal(t, dns.RcodeNameError, in.Rcode)
})
}
}
func TestDNS_ServiceLookup_PreparedQueryNamePeriod(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")

View File

@ -37,11 +37,15 @@ For more detailed information, please refer to the [upgrade details page](/consu
The following issues are known to exist in the v1.19.x releases:
- v1.19.0 - DNS SRV records for registrations that specify a service address instead of a node address return identical `.service` hostnames instead of unique `.addr` addresses.
As a result, it is impossible to resolve the individual service addresses.
This bug can affect Nomad installations using Consul for service discovery because the service address field is always specified to Consul.
To revert to the old DNS behavior, set `experiments: [ "v1dns" ]` in the agent configuration.
We plan to include a fix in a future release of Consul v1.19.1 [[GH-21325](https://github.com/hashicorp/consul/issues/21325)].
- v1.19.0 - There are known issues with the new DNS server implementation.
To revert to the old DNS behavior, set `experiments: [ "v1dns" ]` in the agent configuration.
Fixes will be included in Consul v1.19.1.
- DNS SRV records for registrations that specify a service address instead of a node address return identical `.service` hostnames instead of unique `.addr` addresses.
As a result, it is impossible to resolve the individual service addresses.
This bug can affect Nomad installations using Consul for service discovery because the service address field is always specified to Consul.
[[GH-21325](https://github.com/hashicorp/consul/issues/21325)].
- DNS Tags are not resolved when using the Standard query format, `tag.name.service.consul`.
[[GH-21326](https://github.com/hashicorp/consul/issues/21336)].
## Changelogs