[CE] feat(v2dns): add v2 style query metrics (#20608)

feat(v2dns): add v2 style query metrics
This commit is contained in:
Dan Stough 2024-02-13 12:08:01 -05:00 committed by GitHub
parent 7575b53737
commit 0f0b080514
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 89 additions and 46 deletions

View File

@ -13,6 +13,7 @@ import (
"time"
"github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/go-hclog"
@ -29,6 +30,15 @@ const (
staleCounterThreshold = 5 * time.Second
)
// DNSCounters pre-registers the staleness metric.
// This value is used by both the V1 and V2 DNS (V1 Catalog-only) servers.
var DNSCounters = []prometheus.CounterDefinition{
{
Name: []string{"dns", "stale_queries"},
Help: "Increments when an agent serves a query within the allowed stale threshold.",
},
}
// v1DataFetcherDynamicConfig is used to store the dynamic configuration of the V1 data fetcher.
type v1DataFetcherDynamicConfig struct {
// Default request tenancy

View File

@ -16,7 +16,6 @@ import (
"time"
"github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/armon/go-radix"
"github.com/hashicorp/go-hclog"
"github.com/miekg/dns"
@ -33,24 +32,6 @@ import (
"github.com/hashicorp/consul/logging"
)
var DNSCounters = []prometheus.CounterDefinition{
{
Name: []string{"dns", "stale_queries"},
Help: "Increments when an agent serves a query within the allowed stale threshold.",
},
}
var DNSSummaries = []prometheus.SummaryDefinition{
{
Name: []string{"dns", "ptr_query"},
Help: "Measures the time spent handling a reverse DNS query for the given node.",
},
{
Name: []string{"dns", "domain_query"},
Help: "Measures the time spent handling a domain query for the given node.",
},
}
const (
// UDP can fit ~25 A records in a 512B response, and ~14 AAAA
// records. Limit further to prevent unintentional configuration
@ -406,8 +387,17 @@ func (d *DNSServer) getResponseDomain(questionName string) string {
func (d *DNSServer) handlePtr(resp dns.ResponseWriter, req *dns.Msg) {
q := req.Question[0]
defer func(s time.Time) {
// V1 DNS-style metrics
metrics.MeasureSinceWithLabels([]string{"dns", "ptr_query"}, s,
[]metrics.Label{{Name: "node", Value: d.agent.config.NodeName}})
// V2 DNS-style metrics for forward compatibility
metrics.MeasureSinceWithLabels([]string{"dns", "query"}, s,
[]metrics.Label{
{Name: "node", Value: d.agent.config.NodeName},
{Name: "type", Value: dns.Type(dns.TypePTR).String()},
})
d.logger.Debug("request served from client",
"question", q,
"latency", time.Since(s).String(),
@ -519,12 +509,21 @@ func (d *DNSServer) handlePtr(resp dns.ResponseWriter, req *dns.Msg) {
func (d *DNSServer) handleQuery(resp dns.ResponseWriter, req *dns.Msg) {
q := req.Question[0]
defer func(s time.Time) {
// V1 DNS-style metrics
metrics.MeasureSinceWithLabels([]string{"dns", "domain_query"}, s,
[]metrics.Label{{Name: "node", Value: d.agent.config.NodeName}})
// V2 DNS-style metrics for forward compatibility
metrics.MeasureSinceWithLabels([]string{"dns", "query"}, s,
[]metrics.Label{
{Name: "node", Value: d.agent.config.NodeName},
{Name: "type", Value: dns.Type(q.Qtype).String()},
})
d.logger.Debug("request served from client",
"name", q.Name,
"type", dns.Type(q.Qtype),
"class", dns.Class(q.Qclass),
"type", dns.Type(q.Qtype).String(),
"class", dns.Class(q.Qclass).String(),
"latency", time.Since(s).String(),
"client", resp.RemoteAddr().String(),
"client_network", resp.RemoteAddr().Network(),

View File

@ -13,6 +13,7 @@ import (
"sync/atomic"
"time"
"github.com/armon/go-metrics"
"github.com/armon/go-radix"
"github.com/miekg/dns"
@ -47,8 +48,6 @@ var (
trailingSpacesRE = regexp.MustCompile(" +$")
)
// TODO (v2-dns): metrics
// Context is used augment a DNS message with Consul-specific metadata.
type Context struct {
Token string
@ -105,6 +104,7 @@ type Router struct {
domain string
altDomain string
datacenter string
nodeName string
logger hclog.Logger
tokenFunc func() string
@ -124,8 +124,6 @@ func NewRouter(cfg Config) (*Router, error) {
domain := dns.CanonicalName(cfg.AgentConfig.DNSDomain)
altDomain := dns.CanonicalName(cfg.AgentConfig.DNSAltDomain)
// TODO (v2-dns): need to figure out tenancy information here in a way that work for V2 and V1
logger := cfg.Logger.Named(logging.DNS)
router := &Router{
@ -135,6 +133,7 @@ func NewRouter(cfg Config) (*Router, error) {
altDomain: altDomain,
datacenter: cfg.AgentConfig.Datacenter,
logger: logger,
nodeName: cfg.AgentConfig.NodeName,
tokenFunc: cfg.TokenFunc,
translateAddressFunc: cfg.TranslateAddressFunc,
translateServiceAddressFunc: cfg.TranslateServiceAddressFunc,
@ -148,21 +147,6 @@ func NewRouter(cfg Config) (*Router, error) {
// HandleRequest is used to process an individual DNS request. It returns a message in success or fail cases.
func (r *Router) HandleRequest(req *dns.Msg, reqCtx Context, remoteAddress net.Addr) *dns.Msg {
return r.handleRequestRecursively(req, reqCtx, remoteAddress, maxRecursionLevelDefault)
}
// getErrorFromECSNotGlobalError returns the underlying error from an ECSNotGlobalError, if it exists.
func getErrorFromECSNotGlobalError(err error) error {
if errors.Is(err, discovery.ErrECSNotGlobal) {
return err.(discovery.ECSNotGlobalError).Unwrap()
}
return err
}
// handleRequestRecursively is used to process an individual DNS request. It will recurse as needed
// a maximum number of times and returns a message in success or fail cases.
func (r *Router) handleRequestRecursively(req *dns.Msg, reqCtx Context,
remoteAddress net.Addr, maxRecursionLevel int) *dns.Msg {
configCtx := r.dynamicConfig.Load().(*RouterDynamicConfig)
r.logger.Trace("received request", "question", req.Question[0].Name, "type", dns.Type(req.Question[0].Qtype).String())
@ -176,6 +160,45 @@ func (r *Router) handleRequestRecursively(req *dns.Msg, reqCtx Context,
return createServerFailureResponse(req, configCtx, false)
}
defer func(s time.Time, q dns.Question) {
metrics.MeasureSinceWithLabels([]string{"dns", "query"}, s,
[]metrics.Label{
{Name: "node", Value: r.nodeName},
{Name: "type", Value: dns.Type(q.Qtype).String()},
})
r.logger.Debug("request served from client",
"name", q.Name,
"type", dns.Type(q.Qtype).String(),
"class", dns.Class(q.Qclass).String(),
"latency", time.Since(s).String(),
"client", remoteAddress.String(),
"client_network", remoteAddress.Network(),
)
}(time.Now(), req.Question[0])
return r.handleRequestRecursively(req, reqCtx, configCtx, remoteAddress, maxRecursionLevelDefault)
}
// getErrorFromECSNotGlobalError returns the underlying error from an ECSNotGlobalError, if it exists.
func getErrorFromECSNotGlobalError(err error) error {
if errors.Is(err, discovery.ErrECSNotGlobal) {
return err.(discovery.ECSNotGlobalError).Unwrap()
}
return err
}
// handleRequestRecursively is used to process an individual DNS request. It will recurse as needed
// a maximum number of times and returns a message in success or fail cases.
func (r *Router) handleRequestRecursively(req *dns.Msg, reqCtx Context, configCtx *RouterDynamicConfig,
remoteAddress net.Addr, maxRecursionLevel int) *dns.Msg {
r.logger.Trace(
"received request",
"question", req.Question[0].Name,
"type", dns.Type(req.Question[0].Qtype).String(),
"recursion_remaining", maxRecursionLevel)
responseDomain, needRecurse := r.parseDomain(req.Question[0].Name)
if needRecurse && !canRecurse(configCtx) {
// This is the same error as an unmatched domain
@ -655,7 +678,7 @@ func (r *Router) defaultAgentDNSRequestContext() Context {
}
// resolveCNAME is used to recursively resolve CNAME records
func (r *Router) resolveCNAME(cfg *RouterDynamicConfig, name string, reqCtx Context,
func (r *Router) resolveCNAME(cfgContext *RouterDynamicConfig, name string, reqCtx Context,
remoteAddress net.Addr, maxRecursionLevel int) []dns.RR {
// If the CNAME record points to a Consul address, resolve it internally
// Convert query to lowercase because DNS is case-insensitive; d.domain and
@ -670,13 +693,13 @@ func (r *Router) resolveCNAME(cfg *RouterDynamicConfig, name string, reqCtx Cont
req.SetQuestion(name, dns.TypeANY)
// TODO: handle error response
resp := r.handleRequestRecursively(req, reqCtx, nil, maxRecursionLevel-1)
resp := r.handleRequestRecursively(req, reqCtx, cfgContext, nil, maxRecursionLevel-1)
return resp.Answer
}
// Do nothing if we don't have a recursor
if !canRecurse(cfg) {
if !canRecurse(cfgContext) {
return nil
}
@ -685,7 +708,7 @@ func (r *Router) resolveCNAME(cfg *RouterDynamicConfig, name string, reqCtx Cont
m.SetQuestion(name, dns.TypeA)
// Make a DNS lookup request
recursorResponse, err := r.recursor.handle(m, cfg, remoteAddress)
recursorResponse, err := r.recursor.handle(m, cfgContext, remoteAddress)
if err == nil {
return recursorResponse.Answer
}

View File

@ -6,12 +6,13 @@ package dns
import (
"errors"
"fmt"
"github.com/hashicorp/consul/internal/dnsutil"
"net"
"reflect"
"testing"
"time"
"github.com/hashicorp/consul/internal/dnsutil"
"github.com/miekg/dns"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
@ -2743,7 +2744,15 @@ func runHandleTestCases(t *testing.T, tc HandleTestCase) {
if ctx == nil {
ctx = &Context{}
}
actual := router.HandleRequest(tc.request, *ctx, tc.remoteAddress)
var remoteAddress net.Addr
if tc.remoteAddress != nil {
remoteAddress = tc.remoteAddress
} else {
remoteAddress = &net.UDPAddr{}
}
actual := router.HandleRequest(tc.request, *ctx, remoteAddress)
require.Equal(t, tc.response, actual)
}

View File

@ -28,6 +28,7 @@ import (
"github.com/hashicorp/consul/agent/consul/stream"
"github.com/hashicorp/consul/agent/consul/usagemetrics"
"github.com/hashicorp/consul/agent/consul/xdscapacity"
"github.com/hashicorp/consul/agent/discovery"
"github.com/hashicorp/consul/agent/grpc-external/limiter"
grpcInt "github.com/hashicorp/consul/agent/grpc-internal"
"github.com/hashicorp/consul/agent/grpc-internal/balancer"
@ -434,6 +435,7 @@ func getPrometheusDefs(cfg *config.RuntimeConfig, isServer bool) ([]prometheus.G
consul.CatalogCounters,
consul.ClientCounters,
consul.RPCCounters,
discovery.DNSCounters,
grpcWare.StatsCounters,
local.StateCounters,
xds.StatsCounters,