xds: prefer fed state gateway definitions if they're fresher (#11522)

Fixes an issue described in #10132, where if two DCs are WAN federated
over mesh gateways, and the gateway in the non-primary DC is terminated
and receives a new IP address (as is commonly the case when running them
on ephemeral compute instances) the primary DC is unable to re-establish
its connection until the agent running on its own gateway is restarted.

This was happening because we always preferred gateways discovered by
the `Internal.ServiceDump` RPC (which would fail because there's no way
to dial the remote DC) over those discovered in the federation state,
which is replicated as long as the primary DC's gateway is reachable.
This commit is contained in:
Daniel Upton 2021-11-09 16:45:36 +00:00 committed by GitHub
parent 7d95d90fce
commit 50a1f20ff9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 392 additions and 6 deletions

3
.changelog/11522.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
xds: fixes a bug where replacing a mesh gateway node used for WAN federation (with another that has a different IP) could leave gateways in the other DC unable to re-establish the connection
```

View File

@ -4,6 +4,7 @@ import (
"context"
"fmt"
"io/ioutil"
"math"
"path"
"path/filepath"
"sync"
@ -1506,6 +1507,56 @@ func TestConfigSnapshotMeshGatewayUsingFederationStates(t testing.T) *ConfigSnap
return testConfigSnapshotMeshGateway(t, true, true)
}
func TestConfigSnapshotMeshGatewayNewerInformationInFederationStates(t testing.T) *ConfigSnapshot {
snap := TestConfigSnapshotMeshGateway(t)
// Create a duplicate entry in FedStateGateways, with a high ModifyIndex, to
// verify that fresh data in the federation state is preferred over stale data
// in GatewayGroups.
svc := structs.TestNodeServiceMeshGatewayWithAddrs(t,
"10.0.1.3", 8443,
structs.ServiceAddress{Address: "10.0.1.3", Port: 8443},
structs.ServiceAddress{Address: "198.18.1.3", Port: 443},
)
svc.RaftIndex.ModifyIndex = math.MaxUint64
snap.MeshGateway.FedStateGateways = map[string]structs.CheckServiceNodes{
"dc2": {
{
Node: snap.MeshGateway.GatewayGroups["dc2"][0].Node,
Service: svc,
},
},
}
return snap
}
func TestConfigSnapshotMeshGatewayOlderInformationInFederationStates(t testing.T) *ConfigSnapshot {
snap := TestConfigSnapshotMeshGateway(t)
// Create a duplicate entry in FedStateGateways, with a low ModifyIndex, to
// verify that stale data in the federation state is ignored in favor of the
// fresher data in GatewayGroups.
svc := structs.TestNodeServiceMeshGatewayWithAddrs(t,
"10.0.1.3", 8443,
structs.ServiceAddress{Address: "10.0.1.3", Port: 8443},
structs.ServiceAddress{Address: "198.18.1.3", Port: 443},
)
svc.RaftIndex.ModifyIndex = 0
snap.MeshGateway.FedStateGateways = map[string]structs.CheckServiceNodes{
"dc2": {
{
Node: snap.MeshGateway.GatewayGroups["dc2"][0].Node,
Service: svc,
},
},
}
return snap
}
func TestConfigSnapshotMeshGatewayNoServices(t testing.T) *ConfigSnapshot {
return testConfigSnapshotMeshGateway(t, false, false)
}

View File

@ -130,6 +130,9 @@ func TestNodeServiceMeshGatewayWithAddrs(t testing.T, address string, port int,
TaggedAddressLAN: lanAddr,
TaggedAddressWAN: wanAddr,
},
RaftIndex: RaftIndex{
ModifyIndex: 1,
},
}
}

View File

@ -123,14 +123,57 @@ func (s *ResourceGenerator) endpointsFromSnapshotMeshGateway(cfgSnap *proxycfg.C
continue
}
endpoints, ok := cfgSnap.MeshGateway.GatewayGroups[key.String()]
if !ok {
endpoints, ok = cfgSnap.MeshGateway.FedStateGateways[key.String()]
if !ok { // not possible
// Mesh gateways in remote DCs are discovered in two ways:
//
// 1. Via an Internal.ServiceDump RPC in the remote DC (GatewayGroups).
// 2. In the federation state that is replicated from the primary DC (FedStateGateways).
//
// We determine which set to use based on whichever contains the highest
// raft ModifyIndex (and is therefore most up-to-date).
//
// Previously, GatewayGroups was always given presedence over FedStateGateways
// but this was problematic when using mesh gateways for WAN federation.
//
// Consider the following example:
//
// - Primary and Secondary DCs are WAN Federated via local mesh gateways.
//
// - Secondary DC's mesh gateway is running on an ephemeral compute instance
// and is abruptly terminated and rescheduled with a *new IP address*.
//
// - Primary DC's mesh gateway is no longer able to connect to the Secondary
// DC as its proxy is configured with the old IP address. Therefore any RPC
// from the Primary to the Secondary DC will fail (including the one to
// discover the gateway's new IP address).
//
// - Secondary DC performs its regular anti-entropy of federation state data
// to the Primary DC (this succeeds as there is still connectivity in this
// direction).
//
// - At this point the Primary DC's mesh gateway should observe the new IP
// address and reconfigure its proxy, however as we always prioritised
// GatewayGroups this didn't happen and the connection remained severed.
maxModifyIndex := func(vals structs.CheckServiceNodes) uint64 {
var max uint64
for _, v := range vals {
if i := v.Service.RaftIndex.ModifyIndex; i > max {
max = i
}
}
return max
}
endpoints := cfgSnap.MeshGateway.GatewayGroups[key.String()]
fedStateEndpoints := cfgSnap.MeshGateway.FedStateGateways[key.String()]
if maxModifyIndex(fedStateEndpoints) > maxModifyIndex(endpoints) {
endpoints = fedStateEndpoints
}
if len(endpoints) == 0 {
s.Logger.Error("skipping mesh gateway endpoints because no definition found", "datacenter", key)
continue
}
}
{ // standard connect
clusterName := connect.GatewaySNI(key.Datacenter, key.Partition, cfgSnap.Roots.TrustDomain)

View File

@ -245,6 +245,14 @@ func TestEndpointsFromSnapshot(t *testing.T) {
create: proxycfg.TestConfigSnapshotMeshGatewayUsingFederationStates,
setup: nil,
},
{
name: "mesh-gateway-newer-information-in-federation-states",
create: proxycfg.TestConfigSnapshotMeshGatewayNewerInformationInFederationStates,
},
{
name: "mesh-gateway-older-information-in-federation-states",
create: proxycfg.TestConfigSnapshotMeshGatewayOlderInformationInFederationStates,
},
{
name: "mesh-gateway-no-services",
create: proxycfg.TestConfigSnapshotMeshGatewayNoServices,

View File

@ -0,0 +1,133 @@
{
"versionInfo": "00000001",
"resources": [
{
"@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment",
"clusterName": "bar.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul",
"endpoints": [
{
"lbEndpoints": [
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.6",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
},
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.7",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
},
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.8",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
}
]
}
]
},
{
"@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment",
"clusterName": "dc2.internal.11111111-2222-3333-4444-555555555555.consul",
"endpoints": [
{
"lbEndpoints": [
{
"endpoint": {
"address": {
"socketAddress": {
"address": "198.18.1.3",
"portValue": 443
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
}
]
}
]
},
{
"@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment",
"clusterName": "foo.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul",
"endpoints": [
{
"lbEndpoints": [
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.3",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
},
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.4",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
},
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.5",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
},
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.9",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
}
]
}
]
}
],
"typeUrl": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment",
"nonce": "00000001"
}

View File

@ -0,0 +1,145 @@
{
"versionInfo": "00000001",
"resources": [
{
"@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment",
"clusterName": "bar.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul",
"endpoints": [
{
"lbEndpoints": [
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.6",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
},
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.7",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
},
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.8",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
}
]
}
]
},
{
"@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment",
"clusterName": "dc2.internal.11111111-2222-3333-4444-555555555555.consul",
"endpoints": [
{
"lbEndpoints": [
{
"endpoint": {
"address": {
"socketAddress": {
"address": "198.18.1.1",
"portValue": 443
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
},
{
"endpoint": {
"address": {
"socketAddress": {
"address": "198.18.1.2",
"portValue": 443
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
}
]
}
]
},
{
"@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment",
"clusterName": "foo.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul",
"endpoints": [
{
"lbEndpoints": [
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.3",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
},
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.4",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
},
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.5",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
},
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.9",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
}
]
}
]
}
],
"typeUrl": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment",
"nonce": "00000001"
}