consul/agent/proxycfg/mesh_gateway.go

832 lines
28 KiB
Go
Raw Normal View History

// Copyright (c) HashiCorp, Inc.
[COMPLIANCE] License changes (#18443) * Adding explicit MPL license for sub-package This directory and its subdirectories (packages) contain files licensed with the MPLv2 `LICENSE` file in this directory and are intentionally licensed separately from the BSL `LICENSE` file at the root of this repository. * Adding explicit MPL license for sub-package This directory and its subdirectories (packages) contain files licensed with the MPLv2 `LICENSE` file in this directory and are intentionally licensed separately from the BSL `LICENSE` file at the root of this repository. * Updating the license from MPL to Business Source License Going forward, this project will be licensed under the Business Source License v1.1. Please see our blog post for more details at <Blog URL>, FAQ at www.hashicorp.com/licensing-faq, and details of the license at www.hashicorp.com/bsl. * add missing license headers * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 * Update copyright file headers to BUSL-1.1 --------- Co-authored-by: hashicorp-copywrite[bot] <110428419+hashicorp-copywrite[bot]@users.noreply.github.com>
2023-08-11 09:12:13 -04:00
// SPDX-License-Identifier: BUSL-1.1
package proxycfg
import (
"context"
"fmt"
"net"
"sort"
"strconv"
"strings"
"time"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/consul/acl"
cachetype "github.com/hashicorp/consul/agent/cache-types"
agent: remove agent cache dependency from service mesh leaf certificate management (#17075) * agent: remove agent cache dependency from service mesh leaf certificate management This extracts the leaf cert management from within the agent cache. This code was produced by the following process: 1. All tests in agent/cache, agent/cache-types, agent/auto-config, agent/consul/servercert were run at each stage. - The tests in agent matching .*Leaf were run at each stage. - The tests in agent/leafcert were run at each stage after they existed. 2. The former leaf cert Fetch implementation was extracted into a new package behind a "fake RPC" endpoint to make it look almost like all other cache type internals. 3. The old cache type was shimmed to use the fake RPC endpoint and generally cleaned up. 4. I selectively duplicated all of Get/Notify/NotifyCallback/Prepopulate from the agent/cache.Cache implementation over into the new package. This was renamed as leafcert.Manager. - Code that was irrelevant to the leaf cert type was deleted (inlining blocking=true, refresh=false) 5. Everything that used the leaf cert cache type (including proxycfg stuff) was shifted to use the leafcert.Manager instead. 6. agent/cache-types tests were moved and gently replumbed to execute as-is against a leafcert.Manager. 7. Inspired by some of the locking changes from derek's branch I split the fat lock into N+1 locks. 8. The waiter chan struct{} was eventually replaced with a singleflight.Group around cache updates, which was likely the biggest net structural change. 9. The awkward two layers or logic produced as a byproduct of marrying the agent cache management code with the leaf cert type code was slowly coalesced and flattened to remove confusion. 10. The .*Leaf tests from the agent package were copied and made to work directly against a leafcert.Manager to increase direct coverage. I have done a best effort attempt to port the previous leaf-cert cache type's tests over in spirit, as well as to take the e2e-ish tests in the agent package with Leaf in the test name and copy those into the agent/leafcert package to get more direct coverage, rather than coverage tangled up in the agent logic. There is no net-new test coverage, just coverage that was pushed around from elsewhere.
2023-06-13 10:54:45 -05:00
"github.com/hashicorp/consul/agent/leafcert"
"github.com/hashicorp/consul/agent/proxycfg/internal/watch"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/lib/maps"
"github.com/hashicorp/consul/logging"
Protobuf Refactoring for Multi-Module Cleanliness (#16302) Protobuf Refactoring for Multi-Module Cleanliness This commit includes the following: Moves all packages that were within proto/ to proto/private Rewrites imports to account for the packages being moved Adds in buf.work.yaml to enable buf workspaces Names the proto-public buf module so that we can override the Go package imports within proto/buf.yaml Bumps the buf version dependency to 1.14.0 (I was trying out the version to see if it would get around an issue - it didn't but it also doesn't break things and it seemed best to keep up with the toolchain changes) Why: In the future we will need to consume other protobuf dependencies such as the Google HTTP annotations for openapi generation or grpc-gateway usage. There were some recent changes to have our own ratelimiting annotations. The two combined were not working when I was trying to use them together (attempting to rebase another branch) Buf workspaces should be the solution to the problem Buf workspaces means that each module will have generated Go code that embeds proto file names relative to the proto dir and not the top level repo root. This resulted in proto file name conflicts in the Go global protobuf type registry. The solution to that was to add in a private/ directory into the path within the proto/ directory. That then required rewriting all the imports. Is this safe? AFAICT yes The gRPC wire protocol doesn't seem to care about the proto file names (although the Go grpc code does tack on the proto file name as Metadata in the ServiceDesc) Other than imports, there were no changes to any generated code as a result of this.
2023-02-17 16:14:46 -05:00
"github.com/hashicorp/consul/proto/private/pbpeering"
)
type handlerMeshGateway struct {
handlerState
}
type peerAddressType string
const (
undefinedAddressType peerAddressType = ""
ipAddressType peerAddressType = "ip"
hostnameAddressType peerAddressType = "hostname"
)
// initialize sets up the watches needed based on the current mesh gateway registration
func (s *handlerMeshGateway) initialize(ctx context.Context) (ConfigSnapshot, error) {
snap := newConfigSnapshotFromServiceInstance(s.serviceInstance, s.stateConfig)
snap.MeshGateway.WatchedLocalServers = watch.NewMap[string, structs.CheckServiceNodes]()
// Watch for root changes
err := s.dataSources.CARoots.Notify(ctx, &structs.DCSpecificRequest{
Datacenter: s.source.Datacenter,
QueryOptions: structs.QueryOptions{Token: s.token},
Source: *s.source,
}, rootsWatchID, s.ch)
if err != nil {
return snap, err
}
// Watch for all peer trust bundles we may need.
err = s.dataSources.TrustBundleList.Notify(ctx, &cachetype.TrustBundleListRequest{
Request: &pbpeering.TrustBundleListByServiceRequest{
Kind: string(structs.ServiceKindMeshGateway),
ServiceName: s.service,
Namespace: s.proxyID.NamespaceOrDefault(),
Partition: s.proxyID.PartitionOrDefault(),
},
QueryOptions: structs.QueryOptions{Token: s.token},
}, peeringTrustBundlesWatchID, s.ch)
if err != nil {
return snap, err
}
wildcardEntMeta := s.proxyID.WithWildcardNamespace()
// Watch for all services.
2021-10-26 15:58:23 -06:00
// Eventually we will have to watch connect enabled instances for each service as well as the
// destination services themselves but those notifications will be setup later.
// We cannot setup those watches until we know what the services are.
err = s.dataSources.ServiceList.Notify(ctx, &structs.DCSpecificRequest{
Datacenter: s.source.Datacenter,
QueryOptions: structs.QueryOptions{Token: s.token},
Source: *s.source,
EnterpriseMeta: *wildcardEntMeta,
}, serviceListWatchID, s.ch)
if err != nil {
return snap, err
}
// Watch service-resolvers so we can setup service subset clusters
err = s.dataSources.ConfigEntryList.Notify(ctx, &structs.ConfigEntryQuery{
Datacenter: s.source.Datacenter,
QueryOptions: structs.QueryOptions{Token: s.token},
Kind: structs.ServiceResolver,
EnterpriseMeta: *wildcardEntMeta,
}, serviceResolversWatchID, s.ch)
if err != nil {
s.logger.Named(logging.MeshGateway).
Error("failed to register watch for service-resolver config entries", "error", err)
return snap, err
}
2021-10-26 16:10:30 -06:00
if s.proxyID.InDefaultPartition() {
if err := s.initializeCrossDCWatches(ctx, &snap); err != nil {
return snap, err
}
}
if err := s.initializeEntWatches(ctx); err != nil {
return snap, err
}
// Get information about the entire service mesh.
err = s.dataSources.ConfigEntry.Notify(ctx, &structs.ConfigEntryQuery{
Kind: structs.MeshConfig,
Name: structs.MeshConfigMesh,
Datacenter: s.source.Datacenter,
QueryOptions: structs.QueryOptions{Token: s.token},
EnterpriseMeta: *structs.DefaultEnterpriseMetaInPartition(s.proxyID.PartitionOrDefault()),
}, meshConfigEntryID, s.ch)
if err != nil {
return snap, err
}
// Watch for all exported services from this mesh gateway's partition in any peering.
err = s.dataSources.ExportedPeeredServices.Notify(ctx, &structs.DCSpecificRequest{
Datacenter: s.source.Datacenter,
QueryOptions: structs.QueryOptions{Token: s.token},
Source: *s.source,
EnterpriseMeta: s.proxyID.EnterpriseMeta,
}, exportedServiceListWatchID, s.ch)
if err != nil {
return snap, err
}
snap.MeshGateway.WatchedServices = make(map[structs.ServiceName]context.CancelFunc)
snap.MeshGateway.WatchedGateways = make(map[string]context.CancelFunc)
snap.MeshGateway.ServiceGroups = make(map[structs.ServiceName]structs.CheckServiceNodes)
snap.MeshGateway.GatewayGroups = make(map[string]structs.CheckServiceNodes)
snap.MeshGateway.ServiceResolvers = make(map[structs.ServiceName]*structs.ServiceResolverConfigEntry)
snap.MeshGateway.HostnameDatacenters = make(map[string]structs.CheckServiceNodes)
snap.MeshGateway.ExportedServicesWithPeers = make(map[structs.ServiceName][]string)
snap.MeshGateway.DiscoveryChain = make(map[structs.ServiceName]*structs.CompiledDiscoveryChain)
snap.MeshGateway.WatchedDiscoveryChains = make(map[structs.ServiceName]context.CancelFunc)
snap.MeshGateway.WatchedPeeringServices = make(map[string]map[structs.ServiceName]context.CancelFunc)
snap.MeshGateway.WatchedPeers = make(map[string]context.CancelFunc)
snap.MeshGateway.PeeringServices = make(map[string]map[structs.ServiceName]PeeringServiceValue)
// there is no need to initialize the map of service resolvers as we
// fully rebuild it every time we get updates
return snap, err
}
func (s *handlerMeshGateway) initializeCrossDCWatches(ctx context.Context, snap *ConfigSnapshot) error {
if s.meta[structs.MetaWANFederationKey] == "1" {
// Conveniently we can just use this service meta attribute in one
// place here to set the machinery in motion and leave the conditional
// behavior out of the rest of the package.
err := s.dataSources.FederationStateListMeshGateways.Notify(ctx, &structs.DCSpecificRequest{
Datacenter: s.source.Datacenter,
QueryOptions: structs.QueryOptions{Token: s.token},
Source: *s.source,
}, federationStateListGatewaysWatchID, s.ch)
if err != nil {
return err
}
err = s.dataSources.Health.Notify(ctx, &structs.ServiceSpecificRequest{
Datacenter: s.source.Datacenter,
QueryOptions: structs.QueryOptions{Token: s.token},
ServiceName: structs.ConsulServiceName,
}, consulServerListWatchID, s.ch)
if err != nil {
return err
}
snap.MeshGateway.WatchedLocalServers.InitWatch(structs.ConsulServiceName, nil)
}
err := s.dataSources.Datacenters.Notify(ctx, &structs.DatacentersRequest{
QueryOptions: structs.QueryOptions{Token: s.token, MaxAge: 30 * time.Second},
}, datacentersWatchID, s.ch)
if err != nil {
return err
}
// Once we start getting notified about the datacenters we will setup watches on the
// gateways within those other datacenters. We cannot do that here because we don't
// know what they are yet.
return nil
}
func (s *handlerMeshGateway) handleUpdate(ctx context.Context, u UpdateEvent, snap *ConfigSnapshot) error {
if u.Err != nil {
return fmt.Errorf("error filling agent cache: %v", u.Err)
}
meshLogger := s.logger.Named(logging.MeshGateway)
switch u.CorrelationID {
case rootsWatchID:
roots, ok := u.Result.(*structs.IndexedCARoots)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
snap.Roots = roots
case federationStateListGatewaysWatchID:
dcIndexedNodes, ok := u.Result.(*structs.DatacenterIndexedCheckServiceNodes)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
snap.MeshGateway.FedStateGateways = dcIndexedNodes.DatacenterNodes
for dc, nodes := range dcIndexedNodes.DatacenterNodes {
snap.MeshGateway.HostnameDatacenters[dc] = hostnameEndpoints(
2021-10-28 18:41:48 -06:00
s.logger.Named(logging.MeshGateway),
snap.Locality,
2021-10-28 18:41:48 -06:00
nodes,
)
}
for dc := range snap.MeshGateway.HostnameDatacenters {
if _, ok := dcIndexedNodes.DatacenterNodes[dc]; !ok {
delete(snap.MeshGateway.HostnameDatacenters, dc)
}
}
case serviceListWatchID:
services, ok := u.Result.(*structs.IndexedServiceList)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
svcMap := make(map[structs.ServiceName]struct{})
for _, svc := range services.Services {
// Make sure to add every service to this map, we use it to cancel
// watches below.
svcMap[svc] = struct{}{}
if _, ok := snap.MeshGateway.WatchedServices[svc]; !ok {
ctx, cancel := context.WithCancel(ctx)
err := s.dataSources.Health.Notify(ctx, &structs.ServiceSpecificRequest{
Datacenter: s.source.Datacenter,
QueryOptions: structs.QueryOptions{Token: s.token},
ServiceName: svc.Name,
Connect: true,
EnterpriseMeta: svc.EnterpriseMeta,
}, fmt.Sprintf("connect-service:%s", svc.String()), s.ch)
if err != nil {
meshLogger.Error("failed to register watch for connect-service",
"service", svc.String(),
"error", err,
)
cancel()
return err
}
snap.MeshGateway.WatchedServices[svc] = cancel
}
}
for sid, cancelFn := range snap.MeshGateway.WatchedServices {
if _, ok := svcMap[sid]; !ok {
meshLogger.Debug("canceling watch for service", "service", sid.String())
// TODO (gateways) Should the sid also be deleted from snap.MeshGateway.ServiceGroups?
// Do those endpoints get cleaned up some other way?
delete(snap.MeshGateway.WatchedServices, sid)
cancelFn()
// always remove the sid from the ServiceGroups when un-watch the service
delete(snap.MeshGateway.ServiceGroups, sid)
}
}
snap.MeshGateway.WatchedServicesSet = true
case datacentersWatchID:
datacentersRaw, ok := u.Result.(*[]string)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
if datacentersRaw == nil {
return fmt.Errorf("invalid response with a nil datacenter list")
}
datacenters := *datacentersRaw
for _, dc := range datacenters {
if dc == s.source.Datacenter {
continue
}
entMeta := structs.DefaultEnterpriseMetaInDefaultPartition()
gk := GatewayKey{Datacenter: dc, Partition: entMeta.PartitionOrDefault()}
if _, ok := snap.MeshGateway.WatchedGateways[gk.String()]; !ok {
ctx, cancel := context.WithCancel(ctx)
err := s.dataSources.InternalServiceDump.Notify(ctx, &structs.ServiceDumpRequest{
Datacenter: dc,
QueryOptions: structs.QueryOptions{Token: s.token},
ServiceKind: structs.ServiceKindMeshGateway,
UseServiceKind: true,
NodesOnly: true,
Source: *s.source,
EnterpriseMeta: *entMeta,
}, fmt.Sprintf("mesh-gateway:%s", gk.String()), s.ch)
if err != nil {
meshLogger.Error("failed to register watch for mesh-gateway",
"datacenter", dc,
"partition", entMeta.PartitionOrDefault(),
"error", err,
)
cancel()
return err
}
snap.MeshGateway.WatchedGateways[gk.String()] = cancel
}
}
for key, cancelFn := range snap.MeshGateway.WatchedGateways {
gk := gatewayKeyFromString(key)
if gk.Datacenter == s.source.Datacenter {
// Only cross-DC watches are managed by the datacenters watch.
continue
}
found := false
for _, dcCurrent := range datacenters {
if dcCurrent == gk.Datacenter {
found = true
break
}
}
if !found {
delete(snap.MeshGateway.WatchedGateways, key)
cancelFn()
}
}
case serviceResolversWatchID:
configEntries, ok := u.Result.(*structs.IndexedConfigEntries)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
resolvers := make(map[structs.ServiceName]*structs.ServiceResolverConfigEntry)
for _, entry := range configEntries.Entries {
if resolver, ok := entry.(*structs.ServiceResolverConfigEntry); ok {
resolvers[structs.NewServiceName(resolver.Name, &resolver.EnterpriseMeta)] = resolver
}
}
snap.MeshGateway.ServiceResolvers = resolvers
case consulServerListWatchID:
resp, ok := u.Result.(*structs.IndexedCheckServiceNodes)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
for _, csn := range resp.Nodes {
if csn.Service.Service != structs.ConsulServiceName {
return fmt.Errorf("expected service name %q but got %q",
structs.ConsulServiceName, csn.Service.Service)
}
if csn.Node.Datacenter != snap.Datacenter {
return fmt.Errorf("expected datacenter %q but got %q",
snap.Datacenter, csn.Node.Datacenter)
}
}
snap.MeshGateway.WatchedLocalServers.Set(structs.ConsulServiceName, resp.Nodes)
case exportedServiceListWatchID:
exportedServices, ok := u.Result.(*structs.IndexedExportedServiceList)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
seenServices := make(map[structs.ServiceName][]string) // svc -> peername slice
for peerName, services := range exportedServices.Services {
for _, svc := range services {
seenServices[svc] = append(seenServices[svc], peerName)
}
}
// Sort the peer names so ultimately xDS has a stable output.
for svc := range seenServices {
sort.Strings(seenServices[svc])
}
peeredServiceList := maps.SliceOfKeys(seenServices)
structs.ServiceList(peeredServiceList).Sort()
snap.MeshGateway.ExportedServicesSlice = peeredServiceList
snap.MeshGateway.ExportedServicesWithPeers = seenServices
snap.MeshGateway.ExportedServicesSet = true
// Decide if we do or do not need our leaf.
hasExports := len(snap.MeshGateway.ExportedServicesSlice) > 0
if hasExports && snap.MeshGateway.LeafCertWatchCancel == nil {
// no watch and we need one
ctx, cancel := context.WithCancel(ctx)
agent: remove agent cache dependency from service mesh leaf certificate management (#17075) * agent: remove agent cache dependency from service mesh leaf certificate management This extracts the leaf cert management from within the agent cache. This code was produced by the following process: 1. All tests in agent/cache, agent/cache-types, agent/auto-config, agent/consul/servercert were run at each stage. - The tests in agent matching .*Leaf were run at each stage. - The tests in agent/leafcert were run at each stage after they existed. 2. The former leaf cert Fetch implementation was extracted into a new package behind a "fake RPC" endpoint to make it look almost like all other cache type internals. 3. The old cache type was shimmed to use the fake RPC endpoint and generally cleaned up. 4. I selectively duplicated all of Get/Notify/NotifyCallback/Prepopulate from the agent/cache.Cache implementation over into the new package. This was renamed as leafcert.Manager. - Code that was irrelevant to the leaf cert type was deleted (inlining blocking=true, refresh=false) 5. Everything that used the leaf cert cache type (including proxycfg stuff) was shifted to use the leafcert.Manager instead. 6. agent/cache-types tests were moved and gently replumbed to execute as-is against a leafcert.Manager. 7. Inspired by some of the locking changes from derek's branch I split the fat lock into N+1 locks. 8. The waiter chan struct{} was eventually replaced with a singleflight.Group around cache updates, which was likely the biggest net structural change. 9. The awkward two layers or logic produced as a byproduct of marrying the agent cache management code with the leaf cert type code was slowly coalesced and flattened to remove confusion. 10. The .*Leaf tests from the agent package were copied and made to work directly against a leafcert.Manager to increase direct coverage. I have done a best effort attempt to port the previous leaf-cert cache type's tests over in spirit, as well as to take the e2e-ish tests in the agent package with Leaf in the test name and copy those into the agent/leafcert package to get more direct coverage, rather than coverage tangled up in the agent logic. There is no net-new test coverage, just coverage that was pushed around from elsewhere.
2023-06-13 10:54:45 -05:00
err := s.dataSources.LeafCertificate.Notify(ctx, &leafcert.ConnectCALeafRequest{
Datacenter: s.source.Datacenter,
Token: s.token,
Kind: structs.ServiceKindMeshGateway,
EnterpriseMeta: s.proxyID.EnterpriseMeta,
}, leafWatchID, s.ch)
if err != nil {
cancel()
return err
}
snap.MeshGateway.LeafCertWatchCancel = cancel
} else if !hasExports && snap.MeshGateway.LeafCertWatchCancel != nil {
// has watch and shouldn't
snap.MeshGateway.LeafCertWatchCancel()
snap.MeshGateway.LeafCertWatchCancel = nil
snap.MeshGateway.Leaf = nil
}
// For each service that we should be exposing, also watch disco chains
// in the same manner as an ingress gateway would.
for _, svc := range snap.MeshGateway.ExportedServicesSlice {
if _, ok := snap.MeshGateway.WatchedDiscoveryChains[svc]; ok {
continue
}
ctx, cancel := context.WithCancel(ctx)
err := s.dataSources.CompiledDiscoveryChain.Notify(ctx, &structs.DiscoveryChainRequest{
Datacenter: s.source.Datacenter,
QueryOptions: structs.QueryOptions{Token: s.token},
Name: svc.Name,
EvaluateInDatacenter: s.source.Datacenter,
EvaluateInNamespace: svc.NamespaceOrDefault(),
EvaluateInPartition: svc.PartitionOrDefault(),
}, "discovery-chain:"+svc.String(), s.ch)
if err != nil {
meshLogger.Error("failed to register watch for discovery chain",
"service", svc.String(),
"error", err,
)
cancel()
return err
}
snap.MeshGateway.WatchedDiscoveryChains[svc] = cancel
}
// Clean up data from services that were not in the update
for svc, cancelFn := range snap.MeshGateway.WatchedDiscoveryChains {
if _, ok := seenServices[svc]; !ok {
cancelFn()
delete(snap.MeshGateway.WatchedDiscoveryChains, svc)
}
}
// These entries are intentionally handled separately from the
// WatchedDiscoveryChains above. There have been situations where a
// discovery watch was cancelled, then fired. That update event then
// re-populated the DiscoveryChain map entry, which wouldn't get
// cleaned up since there was no known watch for it.
for svc := range snap.MeshGateway.DiscoveryChain {
if _, ok := seenServices[svc]; !ok {
delete(snap.MeshGateway.DiscoveryChain, svc)
}
}
case leafWatchID:
leaf, ok := u.Result.(*structs.IssuedCert)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
if hasExports := len(snap.MeshGateway.ExportedServicesSlice) > 0; !hasExports {
return nil // ignore this update, it's stale
}
snap.MeshGateway.Leaf = leaf
case peeringTrustBundlesWatchID:
resp, ok := u.Result.(*pbpeering.TrustBundleListByServiceResponse)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
if len(resp.Bundles) > 0 {
snap.MeshGateway.PeeringTrustBundles = resp.Bundles
}
snap.MeshGateway.PeeringTrustBundlesSet = true
wildcardEntMeta := s.proxyID.WithWildcardNamespace()
// For each peer, fetch the imported services to support mesh gateway local
// mode.
for _, tb := range resp.Bundles {
entMeta := structs.DefaultEnterpriseMetaInDefaultPartition()
if _, ok := snap.MeshGateway.WatchedPeers[tb.PeerName]; !ok {
ctx, cancel := context.WithCancel(ctx)
err := s.dataSources.ServiceList.Notify(ctx, &structs.DCSpecificRequest{
PeerName: tb.PeerName,
QueryOptions: structs.QueryOptions{Token: s.token},
Source: *s.source,
EnterpriseMeta: *wildcardEntMeta,
}, peeringServiceListWatchID+tb.PeerName, s.ch)
if err != nil {
meshLogger.Error("failed to register watch for mesh-gateway",
"peer", tb.PeerName,
"partition", entMeta.PartitionOrDefault(),
"error", err,
)
cancel()
return err
}
snap.MeshGateway.WatchedPeers[tb.PeerName] = cancel
}
}
for peerName, cancelFn := range snap.MeshGateway.WatchedPeers {
found := false
for _, bundle := range resp.Bundles {
if peerName == bundle.PeerName {
found = true
break
}
}
if !found {
delete(snap.MeshGateway.PeeringServices, peerName)
delete(snap.MeshGateway.WatchedPeers, peerName)
delete(snap.MeshGateway.WatchedPeeringServices, peerName)
cancelFn()
}
}
case meshConfigEntryID:
resp, ok := u.Result.(*structs.ConfigEntryResponse)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
meshConf, ok := resp.Entry.(*structs.MeshConfigEntry)
if resp.Entry != nil && !ok {
return fmt.Errorf("invalid type for config entry: %T", resp.Entry)
}
snap.MeshGateway.MeshConfig = meshConf
snap.MeshGateway.MeshConfigSet = true
2022-10-07 08:54:37 -06:00
// If we're peering through mesh gateways it means the config entry may be deleted
// or the flag was disabled. Here we clean up related watches if they exist.
if !meshConf.PeerThroughMeshGateways() {
// We avoid canceling server watches when WAN federation is enabled since it
2022-10-07 08:54:37 -06:00
// always requires a watch to the local servers.
if s.meta[structs.MetaWANFederationKey] != "1" {
// If the entry was deleted we cancel watches that may have existed because of
// PeerThroughMeshGateways being set in the past.
snap.MeshGateway.WatchedLocalServers.CancelWatch(structs.ConsulServiceName)
}
if snap.MeshGateway.PeerServersWatchCancel != nil {
snap.MeshGateway.PeerServersWatchCancel()
snap.MeshGateway.PeerServersWatchCancel = nil
snap.MeshGateway.PeerServers = nil
}
return nil
}
// If PeerThroughMeshGateways is enabled, and we are in the default partition,
// we need to start watching the list of peering connections in all partitions
// to set up outbound routes for the control plane. Consul servers are in the default partition,
// so only mesh gateways here have his responsibility.
2022-10-07 08:54:37 -06:00
if snap.ProxyID.InDefaultPartition() &&
snap.MeshGateway.PeerServersWatchCancel == nil {
peeringListCtx, cancel := context.WithCancel(ctx)
err := s.dataSources.PeeringList.Notify(peeringListCtx, &cachetype.PeeringListRequest{
Request: &pbpeering.PeeringListRequest{
Partition: acl.WildcardPartitionName,
},
QueryOptions: structs.QueryOptions{Token: s.token},
}, peerServersWatchID, s.ch)
if err != nil {
meshLogger.Error("failed to register watch for peering list", "error", err)
cancel()
return err
}
snap.MeshGateway.PeerServersWatchCancel = cancel
}
2022-10-07 08:54:37 -06:00
// We avoid initializing Consul server watches when WAN federation is enabled since it
// always requires server watches.
if s.meta[structs.MetaWANFederationKey] == "1" {
return nil
}
if snap.MeshGateway.WatchedLocalServers.IsWatched(structs.ConsulServiceName) {
return nil
}
notifyCtx, cancel := context.WithCancel(ctx)
err := s.dataSources.Health.Notify(notifyCtx, &structs.ServiceSpecificRequest{
Datacenter: s.source.Datacenter,
QueryOptions: structs.QueryOptions{Token: s.token},
ServiceName: structs.ConsulServiceName,
}, consulServerListWatchID, s.ch)
if err != nil {
cancel()
return fmt.Errorf("failed to watch local consul servers: %w", err)
}
snap.MeshGateway.WatchedLocalServers.InitWatch(structs.ConsulServiceName, cancel)
case peerServersWatchID:
resp, ok := u.Result.(*pbpeering.PeeringListResponse)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
peerServers := make(map[string]PeerServersValue)
for _, peering := range resp.Peerings {
// We only need to keep track of outbound establish connections for mesh gateway.
// We could also check for the peering status, but this requires a response from the leader
// which holds the peerstream information. We want to allow stale reads so there could be peerings in
// a deleting or terminating state.
if !peering.ShouldDial() {
continue
}
if existing, ok := peerServers[peering.PeerServerName]; ok && existing.Index >= peering.ModifyIndex {
// Multiple peerings can reference the same set of Consul servers, since there can be
// multiple partitions in a datacenter. Rather than randomly overwriting, we attempt to
// use the latest addresses by checking the Raft index associated with the peering.
continue
}
hostnames, ips := peerHostnamesAndIPs(meshLogger, peering.Name, peering.GetAddressesToDial())
if len(hostnames) > 0 {
peerServers[peering.PeerServerName] = PeerServersValue{
Addresses: hostnames,
Index: peering.ModifyIndex,
UseCDS: true,
}
} else if len(ips) > 0 {
peerServers[peering.PeerServerName] = PeerServersValue{
Addresses: ips,
Index: peering.ModifyIndex,
}
}
}
snap.MeshGateway.PeerServers = peerServers
default:
switch {
case strings.HasPrefix(u.CorrelationID, peeringServiceListWatchID):
services, ok := u.Result.(*structs.IndexedServiceList)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
peerName := strings.TrimPrefix(u.CorrelationID, peeringServiceListWatchID)
svcMap := make(map[structs.ServiceName]struct{})
if _, ok := snap.MeshGateway.WatchedPeeringServices[peerName]; !ok {
snap.MeshGateway.WatchedPeeringServices[peerName] = make(map[structs.ServiceName]context.CancelFunc)
}
for _, svc := range services.Services {
// Make sure to add every service to this map, we use it to cancel
// watches below.
svcMap[svc] = struct{}{}
if _, ok := snap.MeshGateway.WatchedPeeringServices[peerName][svc]; !ok {
ctx, cancel := context.WithCancel(ctx)
err := s.dataSources.Health.Notify(ctx, &structs.ServiceSpecificRequest{
PeerName: peerName,
QueryOptions: structs.QueryOptions{Token: s.token},
ServiceName: svc.Name,
Connect: true,
EnterpriseMeta: svc.EnterpriseMeta,
}, fmt.Sprintf("peering-connect-service:%s:%s", peerName, svc.String()), s.ch)
if err != nil {
meshLogger.Error("failed to register watch for connect-service",
"service", svc.String(),
"error", err,
)
cancel()
return err
}
snap.MeshGateway.WatchedPeeringServices[peerName][svc] = cancel
}
}
watchedServices := snap.MeshGateway.WatchedPeeringServices[peerName]
for sn, cancelFn := range watchedServices {
if _, ok := svcMap[sn]; !ok {
meshLogger.Debug("canceling watch for service", "service", sn.String())
delete(snap.MeshGateway.WatchedPeeringServices[peerName], sn)
delete(snap.MeshGateway.PeeringServices[peerName], sn)
cancelFn()
}
}
case strings.HasPrefix(u.CorrelationID, "connect-service:"):
resp, ok := u.Result.(*structs.IndexedCheckServiceNodes)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
sn := structs.ServiceNameFromString(strings.TrimPrefix(u.CorrelationID, "connect-service:"))
if len(resp.Nodes) > 0 {
snap.MeshGateway.ServiceGroups[sn] = resp.Nodes
} else if _, ok := snap.MeshGateway.ServiceGroups[sn]; ok {
delete(snap.MeshGateway.ServiceGroups, sn)
}
case strings.HasPrefix(u.CorrelationID, "peering-connect-service:"):
resp, ok := u.Result.(*structs.IndexedCheckServiceNodes)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
key := strings.TrimPrefix(u.CorrelationID, "peering-connect-service:")
peer, snString, ok := strings.Cut(key, ":")
if ok {
sn := structs.ServiceNameFromString(snString)
if len(resp.Nodes) > 0 {
if _, ok := snap.MeshGateway.PeeringServices[peer]; !ok {
snap.MeshGateway.PeeringServices[peer] = make(map[structs.ServiceName]PeeringServiceValue)
}
if eps := hostnameEndpoints(s.logger, GatewayKey{}, resp.Nodes); len(eps) > 0 {
snap.MeshGateway.PeeringServices[peer][sn] = PeeringServiceValue{
Nodes: eps,
UseCDS: true,
}
} else {
snap.MeshGateway.PeeringServices[peer][sn] = PeeringServiceValue{
Nodes: resp.Nodes,
}
}
} else if _, ok := snap.MeshGateway.PeeringServices[peer]; ok {
delete(snap.MeshGateway.PeeringServices[peer], sn)
}
}
case strings.HasPrefix(u.CorrelationID, "mesh-gateway:"):
resp, ok := u.Result.(*structs.IndexedCheckServiceNodes)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
key := strings.TrimPrefix(u.CorrelationID, "mesh-gateway:")
delete(snap.MeshGateway.GatewayGroups, key)
delete(snap.MeshGateway.HostnameDatacenters, key)
if len(resp.Nodes) > 0 {
snap.MeshGateway.GatewayGroups[key] = resp.Nodes
snap.MeshGateway.HostnameDatacenters[key] = hostnameEndpoints(
2021-10-28 18:41:48 -06:00
s.logger.Named(logging.MeshGateway),
snap.Locality,
2021-10-28 18:41:48 -06:00
resp.Nodes,
)
}
case strings.HasPrefix(u.CorrelationID, "discovery-chain:"):
resp, ok := u.Result.(*structs.DiscoveryChainResponse)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
svcString := strings.TrimPrefix(u.CorrelationID, "discovery-chain:")
svc := structs.ServiceNameFromString(svcString)
if !snap.MeshGateway.IsServiceExported(svc) {
delete(snap.MeshGateway.DiscoveryChain, svc)
s.logger.Trace("discovery-chain watch fired for unknown service", "service", svc)
return nil
}
snap.MeshGateway.DiscoveryChain[svc] = resp.Chain
default:
if err := s.handleEntUpdate(meshLogger, ctx, u, snap); err != nil {
return err
}
}
}
return nil
}
func peerHostnamesAndIPs(logger hclog.Logger, peerName string, addresses []string) ([]structs.ServiceAddress, []structs.ServiceAddress) {
var (
hostnames []structs.ServiceAddress
ips []structs.ServiceAddress
)
// Sort the input so that the output is also sorted.
sort.Strings(addresses)
for _, addr := range addresses {
ip, rawPort, splitErr := net.SplitHostPort(addr)
port, convErr := strconv.Atoi(rawPort)
if splitErr != nil || convErr != nil {
logger.Warn("unable to parse ip and port from peer server address. skipping address.",
"peer", peerName, "address", addr)
}
if net.ParseIP(ip) != nil {
ips = append(ips, structs.ServiceAddress{
Address: ip,
Port: port,
})
} else {
hostnames = append(hostnames, structs.ServiceAddress{
Address: ip,
Port: port,
})
}
}
if len(hostnames) > 0 && len(ips) > 0 {
logger.Warn("peer server address list contains mix of hostnames and IP addresses; only hostnames will be passed to Envoy",
"peer", peerName)
}
return hostnames, ips
}