2020-08-07 21:08:43 -04:00
package agent
import (
"fmt"
"io"
"net"
2020-10-14 16:47:16 -04:00
"sync"
2020-08-07 21:08:43 -04:00
"time"
2020-11-12 18:12:12 -08:00
"github.com/armon/go-metrics/prometheus"
2022-09-26 14:58:15 -04:00
"github.com/hashicorp/consul/agent/hcp"
2020-10-05 17:31:35 -04:00
"github.com/hashicorp/go-hclog"
"google.golang.org/grpc/grpclog"
2020-08-07 21:08:43 -04:00
autoconf "github.com/hashicorp/consul/agent/auto-config"
"github.com/hashicorp/consul/agent/cache"
"github.com/hashicorp/consul/agent/config"
2020-09-14 18:31:07 -04:00
"github.com/hashicorp/consul/agent/consul"
2021-02-25 16:22:30 -05:00
"github.com/hashicorp/consul/agent/consul/fsm"
proxycfg: server-local config entry data sources
This is the OSS portion of enterprise PR 2056.
This commit provides server-local implementations of the proxycfg.ConfigEntry
and proxycfg.ConfigEntryList interfaces, that source data from streaming events.
It makes use of the LocalMaterializer type introduced for peering replication,
adding the necessary support for authorization.
It also adds support for "wildcard" subscriptions (within a topic) to the event
publisher, as this is needed to fetch service-resolvers for all services when
configuring mesh gateways.
Currently, events will be emitted for just the ingress-gateway, service-resolver,
and mesh config entry types, as these are the only entries required by proxycfg
— the events will be emitted on topics named IngressGateway, ServiceResolver,
and MeshConfig respectively.
Though these events will only be consumed "locally" for now, they can also be
consumed via the gRPC endpoint (confirmed using grpcurl) so using them from
client agents should be a case of swapping the LocalMaterializer for an
RPCMaterializer.
2022-07-01 16:09:47 +01:00
"github.com/hashicorp/consul/agent/consul/stream"
2021-02-25 16:22:30 -05:00
"github.com/hashicorp/consul/agent/consul/usagemetrics"
2022-09-09 15:02:01 +01:00
"github.com/hashicorp/consul/agent/consul/xdscapacity"
"github.com/hashicorp/consul/agent/grpc-external/limiter"
2022-07-13 16:33:48 +01:00
grpc "github.com/hashicorp/consul/agent/grpc-internal"
"github.com/hashicorp/consul/agent/grpc-internal/resolver"
2021-02-25 16:22:30 -05:00
"github.com/hashicorp/consul/agent/local"
2020-08-07 21:08:43 -04:00
"github.com/hashicorp/consul/agent/pool"
2020-08-27 11:23:52 -04:00
"github.com/hashicorp/consul/agent/router"
2022-04-06 14:33:05 -07:00
"github.com/hashicorp/consul/agent/rpc/middleware"
2021-02-25 16:22:30 -05:00
"github.com/hashicorp/consul/agent/submatview"
2020-08-07 21:08:43 -04:00
"github.com/hashicorp/consul/agent/token"
2021-05-14 13:59:13 -05:00
"github.com/hashicorp/consul/agent/xds"
2020-08-07 21:08:43 -04:00
"github.com/hashicorp/consul/ipaddr"
"github.com/hashicorp/consul/lib"
"github.com/hashicorp/consul/logging"
"github.com/hashicorp/consul/tlsutil"
)
// TODO: BaseDeps should be renamed in the future once more of Agent.Start
// has been moved out in front of Agent.New, and we can better see the setup
// dependencies.
type BaseDeps struct {
2020-09-14 18:31:07 -04:00
consul . Deps // TODO: un-embed
2022-05-19 16:03:46 -04:00
RuntimeConfig * config . RuntimeConfig
MetricsConfig * lib . MetricsConfig
AutoConfig * autoconf . AutoConfig // TODO: use an interface
Cache * cache . Cache
ViewStore * submatview . Store
WatchedFiles [ ] string
2020-08-17 14:12:04 -04:00
}
2020-12-21 13:25:32 -05:00
// ConfigLoader is the function type NewBaseDeps uses to obtain a
// config.LoadResult. NewBaseDeps invokes it once with a nil source and also
// hands it to auto-config as its Loader.
type ConfigLoader func ( source config . Source ) ( config . LoadResult , error )
2020-08-07 21:08:43 -04:00
func NewBaseDeps ( configLoader ConfigLoader , logOut io . Writer ) ( BaseDeps , error ) {
d := BaseDeps { }
2020-12-21 13:25:32 -05:00
result , err := configLoader ( nil )
2020-08-07 21:08:43 -04:00
if err != nil {
return d , err
}
2022-03-31 15:11:49 -04:00
d . WatchedFiles = result . WatchedFiles
2020-12-21 13:25:32 -05:00
cfg := result . RuntimeConfig
2020-08-19 13:17:05 -04:00
logConf := cfg . Logging
logConf . Name = logging . Agent
2020-08-19 12:09:35 -04:00
d . Logger , err = logging . Setup ( logConf , logOut )
2020-08-07 21:08:43 -04:00
if err != nil {
return d , err
}
2021-04-26 11:57:07 -04:00
grpcLogInitOnce . Do ( func ( ) {
grpclog . SetLoggerV2 ( logging . NewGRPCLogger ( cfg . Logging . LogLevel , d . Logger ) )
} )
2020-08-07 21:08:43 -04:00
2020-12-21 13:25:32 -05:00
for _ , w := range result . Warnings {
2020-08-07 21:08:43 -04:00
d . Logger . Warn ( w )
}
cfg . NodeID , err = newNodeIDFromConfig ( cfg , d . Logger )
if err != nil {
return d , fmt . Errorf ( "failed to setup node ID: %w" , err )
}
2021-10-13 09:25:30 -07:00
isServer := result . RuntimeConfig . ServerMode
gauges , counters , summaries := getPrometheusDefs ( cfg . Telemetry , isServer )
2020-11-16 12:44:47 -08:00
cfg . Telemetry . PrometheusOpts . GaugeDefinitions = gauges
cfg . Telemetry . PrometheusOpts . CounterDefinitions = counters
cfg . Telemetry . PrometheusOpts . SummaryDefinitions = summaries
2022-05-19 16:03:46 -04:00
d . MetricsConfig , err = lib . InitTelemetry ( cfg . Telemetry , d . Logger )
2020-08-07 21:08:43 -04:00
if err != nil {
return d , fmt . Errorf ( "failed to initialize telemetry: %w" , err )
}
2022-03-18 10:46:58 +00:00
d . TLSConfigurator , err = tlsutil . NewConfigurator ( cfg . TLS , d . Logger )
2020-08-07 21:08:43 -04:00
if err != nil {
return d , err
}
d . RuntimeConfig = cfg
d . Tokens = new ( token . Store )
2020-08-17 19:30:25 -04:00
2021-02-12 12:43:36 -05:00
cfg . Cache . Logger = d . Logger . Named ( "cache" )
2020-08-07 21:08:43 -04:00
// cache-types are not registered yet, but they won't be used until the components are started.
d . Cache = cache . New ( cfg . Cache )
2021-02-25 16:22:30 -05:00
d . ViewStore = submatview . NewStore ( d . Logger . Named ( "viewstore" ) )
2020-08-07 21:08:43 -04:00
d . ConnPool = newConnPool ( cfg , d . Logger , d . TLSConfigurator )
2021-08-24 16:28:44 -05:00
builder := resolver . NewServerResolverBuilder ( resolver . Config {
// Set the authority to something sufficiently unique so any usage in
// tests would be self-isolating in the global resolver map, while also
// not incurring a huge penalty for non-test code.
Authority : cfg . Datacenter + "." + string ( cfg . NodeID ) ,
} )
2021-06-01 18:31:52 -04:00
resolver . Register ( builder )
2021-08-24 16:28:44 -05:00
d . GRPCConnPool = grpc . NewClientConnPool ( grpc . ClientConnPoolConfig {
Servers : builder ,
SrcAddr : d . ConnPool . SrcAddr ,
TLSWrapper : grpc . TLSWrapper ( d . TLSConfigurator . OutgoingRPCWrapper ( ) ) ,
ALPNWrapper : grpc . ALPNWrapper ( d . TLSConfigurator . OutgoingALPNRPCWrapper ( ) ) ,
UseTLSForDC : d . TLSConfigurator . UseTLS ,
DialingFromServer : cfg . ServerMode ,
DialingFromDatacenter : cfg . Datacenter ,
} )
2021-07-22 13:58:08 -05:00
d . LeaderForwarder = builder
2020-09-08 17:31:47 -04:00
2020-09-14 16:16:44 -04:00
d . Router = router . NewRouter ( d . Logger , cfg . Datacenter , fmt . Sprintf ( "%s.%s" , cfg . NodeName , cfg . Datacenter ) , builder )
2020-08-27 11:23:52 -04:00
2021-05-17 16:01:32 -04:00
// this needs to happen prior to creating auto-config as some of the dependencies
// must also be passed to auto-config
d , err = initEnterpriseBaseDeps ( d , cfg )
if err != nil {
return d , err
}
2020-08-07 21:08:43 -04:00
acConf := autoconf . Config {
2021-05-17 16:01:32 -04:00
DirectRPC : d . ConnPool ,
Logger : d . Logger ,
Loader : configLoader ,
ServerProvider : d . Router ,
TLSConfigurator : d . TLSConfigurator ,
Cache : d . Cache ,
Tokens : d . Tokens ,
2021-05-20 10:07:23 -04:00
EnterpriseConfig : initEnterpriseAutoConfig ( d . EnterpriseDeps , cfg ) ,
2020-08-07 21:08:43 -04:00
}
2021-05-17 16:01:32 -04:00
2020-08-07 21:08:43 -04:00
d . AutoConfig , err = autoconf . New ( acConf )
if err != nil {
return d , err
}
2022-04-06 14:33:05 -07:00
d . NewRequestRecorderFunc = middleware . NewRequestRecorder
d . GetNetRPCInterceptorFunc = middleware . GetNetRPCInterceptor
proxycfg: server-local config entry data sources
This is the OSS portion of enterprise PR 2056.
This commit provides server-local implementations of the proxycfg.ConfigEntry
and proxycfg.ConfigEntryList interfaces, that source data from streaming events.
It makes use of the LocalMaterializer type introduced for peering replication,
adding the necessary support for authorization.
It also adds support for "wildcard" subscriptions (within a topic) to the event
publisher, as this is needed to fetch service-resolvers for all services when
configuring mesh gateways.
Currently, events will be emitted for just the ingress-gateway, service-resolver,
and mesh config entry types, as these are the only entries required by proxycfg
— the events will be emitted on topics named IngressGateway, ServiceResolver,
and MeshConfig topics respectively.
Though these events will only be consumed "locally" for now, they can also be
consumed via the gRPC endpoint (confirmed using grpcurl) so using them from
client agents should be a case of swapping the LocalMaterializer for an
RPCMaterializer.
2022-07-01 16:09:47 +01:00
d . EventPublisher = stream . NewEventPublisher ( 10 * time . Second )
2022-09-09 15:02:01 +01:00
d . XDSStreamLimiter = limiter . NewSessionLimiter ( )
2022-09-26 14:58:15 -04:00
if cfg . IsCloudEnabled ( ) {
d . HCP , err = hcp . NewDeps ( cfg . Cloud , d . Logger )
if err != nil {
return d , err
}
}
2022-09-09 15:02:01 +01:00
2021-05-17 16:01:32 -04:00
return d , nil
2020-08-07 21:08:43 -04:00
}
2021-04-26 11:57:07 -04:00
// grpcLogInitOnce guards the grpclog.SetLoggerV2 call in NewBaseDeps so that
// it runs only once: the test suite calls NewBaseDeps in many tests, and
// re-initializing the gRPC logger causes data races.
var grpcLogInitOnce sync . Once
2020-08-07 21:08:43 -04:00
func newConnPool ( config * config . RuntimeConfig , logger hclog . Logger , tls * tlsutil . Configurator ) * pool . ConnPool {
var rpcSrcAddr * net . TCPAddr
if ! ipaddr . IsAny ( config . RPCBindAddr ) {
rpcSrcAddr = & net . TCPAddr { IP : config . RPCBindAddr . IP }
}
pool := & pool . ConnPool {
2022-04-21 13:21:35 -07:00
Server : config . ServerMode ,
SrcAddr : rpcSrcAddr ,
Logger : logger . StandardLogger ( & hclog . StandardLoggerOptions { InferLevels : true } ) ,
TLSConfigurator : tls ,
Datacenter : config . Datacenter ,
Timeout : config . RPCHoldTimeout ,
MaxQueryTime : config . MaxQueryTime ,
DefaultQueryTime : config . DefaultQueryTime ,
2020-08-07 21:08:43 -04:00
}
if config . ServerMode {
pool . MaxTime = 2 * time . Minute
pool . MaxStreams = 64
} else {
2020-09-14 18:31:07 -04:00
// MaxTime controls how long we keep an idle connection open to a server.
// 127s was chosen as the first prime above 120s
// (arbitrarily chose to use a prime) with the intent of reusing
// connections who are used by once-a-minute cron(8) jobs *and* who
// use a 60s jitter window (e.g. in vixie cron job execution can
// drift by up to 59s per job, or 119s for a once-a-minute cron job).
2020-08-07 21:08:43 -04:00
pool . MaxTime = 127 * time . Second
pool . MaxStreams = 32
}
return pool
}
2020-10-14 16:47:16 -04:00
2020-11-12 18:12:12 -08:00
// getPrometheusDefs reaches into every slice of prometheus defs we've defined in each part of the agent, and appends
// all of our slices into one nice slice of definitions per metric type for the Consul agent to pass to go-metrics.
2021-10-13 09:25:30 -07:00
func getPrometheusDefs ( cfg lib . TelemetryConfig , isServer bool ) ( [ ] prometheus . GaugeDefinition , [ ] prometheus . CounterDefinition , [ ] prometheus . SummaryDefinition ) {
2021-05-04 15:36:53 +01:00
// TODO: "raft..." metrics come from the raft lib and we should migrate these to a telemetry
// package within. In the mean time, we're going to define a few here because they're key to monitoring Consul.
raftGauges := [ ] prometheus . GaugeDefinition {
{
Name : [ ] string { "raft" , "fsm" , "lastRestoreDuration" } ,
Help : "This measures how long the last FSM restore (from disk or leader) took." ,
} ,
{
Name : [ ] string { "raft" , "leader" , "oldestLogAge" } ,
Help : "This measures how old the oldest log in the leader's log store is." ,
} ,
}
2022-06-03 13:07:37 -04:00
serverGauges := [ ] prometheus . GaugeDefinition {
{
Name : [ ] string { "server" , "isLeader" } ,
Help : "Tracks if the server is a leader." ,
} ,
}
2020-11-16 14:01:12 -08:00
// Build slice of slices for all gauge definitions
2020-11-12 18:12:12 -08:00
var gauges = [ ] [ ] prometheus . GaugeDefinition {
2020-11-13 16:26:08 -08:00
cache . Gauges ,
2020-11-12 18:12:12 -08:00
consul . RPCGauges ,
consul . SessionGauges ,
grpc . StatsGauges ,
2021-05-14 13:59:13 -05:00
xds . StatsGauges ,
2020-11-12 18:12:12 -08:00
usagemetrics . Gauges ,
2021-04-23 17:05:33 -04:00
consul . ReplicationGauges ,
2021-10-27 15:23:29 -04:00
CertExpirationGauges ,
2020-12-09 09:16:53 -05:00
Gauges ,
2021-05-04 15:36:53 +01:00
raftGauges ,
2022-06-03 13:07:37 -04:00
serverGauges ,
2020-11-12 18:12:12 -08:00
}
2021-05-04 15:36:53 +01:00
2021-10-13 09:25:30 -07:00
// TODO(ffmmm): conditionally add only leader specific metrics to gauges, counters, summaries, etc
if isServer {
2021-10-19 16:49:23 -04:00
gauges = append ( gauges ,
consul . AutopilotGauges ,
2022-07-22 12:05:08 -07:00
consul . LeaderCertExpirationGauges ,
2022-09-09 15:02:01 +01:00
consul . LeaderPeeringMetrics ,
xdscapacity . StatsGauges ,
)
2021-10-13 09:25:30 -07:00
}
2020-11-16 14:01:12 -08:00
// Flatten definitions
// NOTE(kit): Do we actually want to create a set here so we can ensure definition names are unique?
2020-11-12 18:12:12 -08:00
var gaugeDefs [ ] prometheus . GaugeDefinition
for _ , g := range gauges {
2020-11-13 13:18:04 -08:00
// Set Consul to each definition's namespace
2020-11-16 14:01:12 -08:00
// TODO(kit): Prepending the service to each definition should be handled by go-metrics
2020-11-13 13:18:04 -08:00
var withService [ ] prometheus . GaugeDefinition
for _ , gauge := range g {
2020-11-16 14:01:12 -08:00
gauge . Name = append ( [ ] string { cfg . MetricsPrefix } , gauge . Name ... )
2020-11-13 13:18:04 -08:00
withService = append ( withService , gauge )
}
gaugeDefs = append ( gaugeDefs , withService ... )
2020-11-12 18:12:12 -08:00
}
raftCounters := [ ] prometheus . CounterDefinition {
2020-11-13 16:26:08 -08:00
// TODO(kit): "raft..." metrics come from the raft lib and we should migrate these to a telemetry
// package within. In the mean time, we're going to define a few here because they're key to monitoring Consul.
2020-11-12 18:12:12 -08:00
{
2020-11-13 13:18:04 -08:00
Name : [ ] string { "raft" , "apply" } ,
2020-11-12 18:12:12 -08:00
Help : "This counts the number of Raft transactions occurring over the interval." ,
} ,
{
2020-11-13 13:18:04 -08:00
Name : [ ] string { "raft" , "state" , "candidate" } ,
2020-11-12 18:12:12 -08:00
Help : "This increments whenever a Consul server starts an election." ,
} ,
{
2020-11-13 13:18:04 -08:00
Name : [ ] string { "raft" , "state" , "leader" } ,
2020-11-12 18:12:12 -08:00
Help : "This increments whenever a Consul server becomes a leader." ,
} ,
}
var counters = [ ] [ ] prometheus . CounterDefinition {
CatalogCounters ,
2020-11-13 16:26:08 -08:00
cache . Counters ,
2020-11-12 18:12:12 -08:00
consul . ACLCounters ,
consul . CatalogCounters ,
consul . ClientCounters ,
consul . RPCCounters ,
grpc . StatsCounters ,
local . StateCounters ,
2022-09-09 15:02:01 +01:00
xds . StatsCounters ,
2020-11-12 18:12:12 -08:00
raftCounters ,
}
2020-11-16 14:01:12 -08:00
// Flatten definitions
// NOTE(kit): Do we actually want to create a set here so we can ensure definition names are unique?
2020-11-12 18:12:12 -08:00
var counterDefs [ ] prometheus . CounterDefinition
for _ , c := range counters {
2020-11-16 14:01:12 -08:00
// TODO(kit): Prepending the service to each definition should be handled by go-metrics
2020-11-13 13:18:04 -08:00
var withService [ ] prometheus . CounterDefinition
for _ , counter := range c {
2020-11-16 14:01:12 -08:00
counter . Name = append ( [ ] string { cfg . MetricsPrefix } , counter . Name ... )
2020-11-13 13:18:04 -08:00
withService = append ( withService , counter )
}
counterDefs = append ( counterDefs , withService ... )
2020-11-12 18:12:12 -08:00
}
raftSummaries := [ ] prometheus . SummaryDefinition {
2020-11-13 16:26:08 -08:00
// TODO(kit): "raft..." metrics come from the raft lib and we should migrate these to a telemetry
// package within. In the mean time, we're going to define a few here because they're key to monitoring Consul.
2020-11-12 18:12:12 -08:00
{
2020-11-13 13:18:04 -08:00
Name : [ ] string { "raft" , "commitTime" } ,
2020-11-12 18:12:12 -08:00
Help : "This measures the time it takes to commit a new entry to the Raft log on the leader." ,
} ,
{
2020-11-13 13:18:04 -08:00
Name : [ ] string { "raft" , "leader" , "lastContact" } ,
2020-11-12 18:12:12 -08:00
Help : "Measures the time since the leader was last able to contact the follower nodes when checking its leader lease." ,
} ,
2021-05-04 15:36:53 +01:00
{
Name : [ ] string { "raft" , "snapshot" , "persist" } ,
Help : "Measures the time it takes raft to write a new snapshot to disk." ,
} ,
{
Name : [ ] string { "raft" , "rpc" , "installSnapshot" } ,
Help : "Measures the time it takes the raft leader to install a snapshot on a follower that is catching up after being down or has just joined the cluster." ,
} ,
2020-11-12 18:12:12 -08:00
}
var summaries = [ ] [ ] prometheus . SummaryDefinition {
HTTPSummaries ,
consul . ACLSummaries ,
consul . ACLEndpointSummaries ,
consul . CatalogSummaries ,
consul . FederationStateSummaries ,
consul . IntentionSummaries ,
consul . KVSummaries ,
2020-11-13 16:26:08 -08:00
consul . LeaderSummaries ,
2020-11-12 18:12:12 -08:00
consul . PreparedQuerySummaries ,
consul . RPCSummaries ,
2020-11-13 16:26:08 -08:00
consul . SegmentOSSSummaries ,
2020-11-12 18:12:12 -08:00
consul . SessionSummaries ,
2020-11-13 16:26:08 -08:00
consul . SessionEndpointSummaries ,
2020-11-12 18:12:12 -08:00
consul . TxnSummaries ,
2020-11-13 16:26:08 -08:00
fsm . CommandsSummaries ,
fsm . SnapshotSummaries ,
2020-11-12 18:12:12 -08:00
raftSummaries ,
}
2020-11-16 14:01:12 -08:00
// Flatten definitions
// NOTE(kit): Do we actually want to create a set here so we can ensure definition names are unique?
2020-11-12 18:12:12 -08:00
var summaryDefs [ ] prometheus . SummaryDefinition
for _ , s := range summaries {
2020-11-16 14:01:12 -08:00
// TODO(kit): Prepending the service to each definition should be handled by go-metrics
2020-11-13 13:18:04 -08:00
var withService [ ] prometheus . SummaryDefinition
for _ , summary := range s {
2020-11-16 14:01:12 -08:00
summary . Name = append ( [ ] string { cfg . MetricsPrefix } , summary . Name ... )
2020-11-13 13:18:04 -08:00
withService = append ( withService , summary )
}
summaryDefs = append ( summaryDefs , withService ... )
2020-11-12 18:12:12 -08:00
}
2020-11-16 12:44:47 -08:00
return gaugeDefs , counterDefs , summaryDefs
2020-11-12 18:12:12 -08:00
}