diff --git a/agent/agent_endpoint_test.go b/agent/agent_endpoint_test.go index 07acd6e5d3..d84242b690 100644 --- a/agent/agent_endpoint_test.go +++ b/agent/agent_endpoint_test.go @@ -5303,9 +5303,14 @@ func TestAgentConnectAuthorize_deny(t *testing.T) { assert.Contains(obj.Reason, "Matched") } -// Test when there is an intention allowing service but for a different trust -// domain. -func TestAgentConnectAuthorize_denyTrustDomain(t *testing.T) { +// Test when there is an intention allowing service with a different trust +// domain. We allow this because migration between trust domains shouldn't cause +// an outage even if we have stale info about current trusted domains. It's safe +// because the CA root is either unique to this cluster and not used to sign +// anything external, or path validation can be used to ensure that the CA can +// only issue certs that are valid for the specific cluster trust domain at x509 +// level which is enforced by TLS handshake. +func TestAgentConnectAuthorize_allowTrustDomain(t *testing.T) { t.Parallel() assert := assert.New(t) @@ -5345,8 +5350,8 @@ func TestAgentConnectAuthorize_denyTrustDomain(t *testing.T) { assert.Equal(200, resp.Code) obj := respRaw.(*connectAuthorizeResp) - assert.False(obj.Authorized) - assert.Contains(obj.Reason, "Identity from an external trust domain") + require.True(obj.Authorized) + require.Contains(obj.Reason, "Matched") } } diff --git a/agent/connect_auth.go b/agent/connect_auth.go index d9e9c31fed..398fea160e 100644 --- a/agent/connect_auth.go +++ b/agent/connect_auth.go @@ -2,7 +2,6 @@ package agent import ( "fmt" - "strings" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/cache" @@ -62,26 +61,8 @@ func (a *Agent) ConnectAuthorize(token string, return returnErr(acl.ErrPermissionDenied) } - // Validate the trust domain matches ours. Later we will support explicit - // external federation but not built yet. - rootArgs := &structs.DCSpecificRequest{Datacenter: a.config.Datacenter} - raw, _, err := a.cache.Get(cachetype.ConnectCARootName, rootArgs) - if err != nil { - return returnErr(err) - } - - roots, ok := raw.(*structs.IndexedCARoots) - if !ok { - return returnErr(fmt.Errorf("internal error: roots response type not correct")) - } - if roots.TrustDomain == "" { - return returnErr(fmt.Errorf("Connect CA not bootstrapped yet")) - } - if roots.TrustDomain != strings.ToLower(uriService.Host) { - reason = fmt.Sprintf("Identity from an external trust domain: %s", - uriService.Host) - return false, reason, nil, nil - } + // Note that we DON'T explicitly validate the trust-domain matches ours. See + // the PR for this change for details. // TODO(banks): Implement revocation list checking here. diff --git a/agent/http_oss_test.go b/agent/http_oss_test.go index 5c951b2f81..a9b9014a85 100644 --- a/agent/http_oss_test.go +++ b/agent/http_oss_test.go @@ -66,8 +66,8 @@ func TestHTTPAPI_MethodNotAllowed_OSS(t *testing.T) { all := []string{"GET", "PUT", "POST", "DELETE", "HEAD", "OPTIONS"} const testTimeout = 10 * time.Second - fastClient := newHttpClient(10 * time.Second) - slowClient := newHttpClient(30 * time.Second) + fastClient := newHttpClient(15 * time.Second) + slowClient := newHttpClient(45 * time.Second) testMethodNotAllowed := func(method string, path string, allowedMethods []string) { t.Run(method+" "+path, func(t *testing.T) { diff --git a/agent/structs/connect_ca.go b/agent/structs/connect_ca.go index ba98e4c65e..a5a78aac1a 100644 --- a/agent/structs/connect_ca.go +++ b/agent/structs/connect_ca.go @@ -23,11 +23,20 @@ type IndexedCARoots struct { // implement other protocols in future with equivalent semantics. It should be // compared against the "authority" section of a URI (i.e. host:port). // - // NOTE(banks): Later we may support explicitly trusting external domains - // which may be encoded into the CARoot struct or a separate list but this - // domain identifier should be immutable and cluster-wide so deserves to be at - // the root of this response rather than duplicated through all CARoots that - // are not externally trusted entities. + // We need to support migrating a cluster between trust domains to support + // Multi-DC migration in Enterprise. In this case the current trust domain is + // here but entries in Roots may also have ExternalTrustDomain set to a + // non-empty value implying they were previous roots that are still trusted + // but under a different trust domain. + // + // Note that we DON'T validate trust domain during AuthZ since it causes + // issues of loss of connectivity during migration between trust domains. The + // only time the additional validation adds value is where the cluster shares + // an external root (e.g. organization-wide root) with another distinct Consul + // cluster or PKI system. In this case, x509 Name Constraints can be added to + // enforce that Consul's CA can only validly sign or trust certs within the + // same trust-domain. Name constraints as enforced by TLS handshake also allow + // seamless rotation between trust domains thanks to cross-signing. TrustDomain string // Roots is a list of root CA certs to trust. @@ -54,7 +63,14 @@ type CARoot struct { // private key used to sign the certificate. SigningKeyID string - // ExternalTrustDomain is the trust domain this root was generated under. + // ExternalTrustDomain is the trust domain this root was generated under. It + // is usually empty implying "the current cluster trust-domain". It is set + // only in the case that a cluster changes trust domain and then all old roots + // that are still trusted have the old trust domain set here. + // + // We currently DON'T validate these trust domains explicitly anywhere, see + // IndexedRoots.TrustDomain doc. We retain this information for debugging and + // future flexibility. ExternalTrustDomain string // Time validity bounds. diff --git a/connect/resolver.go b/connect/resolver.go index fc9bce81ed..8d137a32f1 100644 --- a/connect/resolver.go +++ b/connect/resolver.go @@ -5,7 +5,6 @@ import ( "fmt" "math/rand" "strings" - "sync" "github.com/hashicorp/consul/agent/connect" "github.com/hashicorp/consul/api" @@ -75,22 +74,11 @@ type ConsulResolver struct { // Datacenter to resolve in, empty indicates agent's local DC. Datacenter string - - // trustDomain stores the cluster's trust domain it's populated once on first - // Resolve call and blocks all resolutions. - trustDomain string - trustDomainMu sync.Mutex } // Resolve performs service discovery against the local Consul agent and returns // the address and expected identity of a suitable service instance. func (cr *ConsulResolver) Resolve(ctx context.Context) (string, connect.CertURI, error) { - // Fetch trust domain if we've not done that yet - err := cr.ensureTrustDomain() - if err != nil { - return "", nil, err - } - switch cr.Type { case ConsulResolverTypeService: return cr.resolveService(ctx) @@ -101,27 +89,6 @@ func (cr *ConsulResolver) Resolve(ctx context.Context) (string, connect.CertURI, } } -func (cr *ConsulResolver) ensureTrustDomain() error { - cr.trustDomainMu.Lock() - defer cr.trustDomainMu.Unlock() - - if cr.trustDomain != "" { - return nil - } - - roots, _, err := cr.Client.Agent().ConnectCARoots(nil) - if err != nil { - return fmt.Errorf("failed fetching cluster trust domain: %s", err) - } - - if roots.TrustDomain == "" { - return fmt.Errorf("cluster trust domain empty, connect not bootstrapped") - } - - cr.trustDomain = roots.TrustDomain - return nil -} - func (cr *ConsulResolver) resolveService(ctx context.Context) (string, connect.CertURI, error) { health := cr.Client.Health() @@ -182,7 +149,8 @@ func (cr *ConsulResolver) resolveServiceEntry(entry *api.ServiceEntry) (string, // Generate the expected CertURI certURI := &connect.SpiffeIDService{ - Host: cr.trustDomain, + // No host since we don't validate trust domain here (we rely on x509 to + // prove trust). Namespace: "default", Datacenter: entry.Node.Datacenter, Service: service, diff --git a/connect/resolver_test.go b/connect/resolver_test.go index 13e4fa3039..e28b1ec6f8 100644 --- a/connect/resolver_test.go +++ b/connect/resolver_test.go @@ -124,7 +124,9 @@ func TestConsulResolver_Resolve(t *testing.T) { Name: "web", Type: ConsulResolverTypeService, }, - wantCertURI: connect.TestSpiffeIDService(t, "web"), + // Want empty host since we don't enforce trust domain outside of TLS and + // don't need to load the current one this way. + wantCertURI: connect.TestSpiffeIDServiceWithHost(t, "web", ""), wantErr: false, addrs: proxyAddrs, }, @@ -135,7 +137,9 @@ func TestConsulResolver_Resolve(t *testing.T) { Name: "db", Type: ConsulResolverTypeService, }, - wantCertURI: connect.TestSpiffeIDService(t, "db"), + // Want empty host since we don't enforce trust domain outside of TLS and + // don't need to load the current one this way. + wantCertURI: connect.TestSpiffeIDServiceWithHost(t, "db", ""), wantErr: false, }, { @@ -172,7 +176,9 @@ func TestConsulResolver_Resolve(t *testing.T) { Name: queryId, Type: ConsulResolverTypePreparedQuery, }, - wantCertURI: connect.TestSpiffeIDService(t, "web"), + // Want empty host since we don't enforce trust domain outside of TLS and + // don't need to load the current one this way. + wantCertURI: connect.TestSpiffeIDServiceWithHost(t, "web", ""), wantErr: false, addrs: proxyAddrs, }, @@ -182,7 +188,9 @@ func TestConsulResolver_Resolve(t *testing.T) { Name: "test-query", Type: ConsulResolverTypePreparedQuery, }, - wantCertURI: connect.TestSpiffeIDService(t, "web"), + // Want empty host since we don't enforce trust domain outside of TLS and + // don't need to load the current one this way. + wantCertURI: connect.TestSpiffeIDServiceWithHost(t, "web", ""), wantErr: false, addrs: proxyAddrs, }, diff --git a/connect/tls.go b/connect/tls.go index 86daa6645d..8ff041aa14 100644 --- a/connect/tls.go +++ b/connect/tls.go @@ -11,6 +11,7 @@ import ( "log" "net" "net/url" + "strings" "sync" "github.com/hashicorp/consul/agent/connect" @@ -185,12 +186,17 @@ func verifyServerCertMatchesURI(certs []*x509.Certificate, return errors.New("peer certificate mismatch") } - // We may want to do better than string matching later in some special - // cases and/or encapsulate the "match" logic inside the CertURI - // implementation but for now this is all we need. - if gotURI.String() == expectedStr { + // Override the hostname since we rely on x509 constraints to limit ability to + // spoof the trust domain if needed (i.e. because a root is shared with other + // PKI or Consul clusters). This allows for seamless migrations between trust + // domains. + expectURI := expected.URI() + expectURI.Host = gotURI.Host + if strings.ToLower(gotURI.String()) == strings.ToLower(expectURI.String()) { + // OK! return nil } + return fmt.Errorf("peer certificate mismatch got %s, want %s", gotURI.String(), expectedStr) } diff --git a/connect/tls_test.go b/connect/tls_test.go index c637b2f43d..24097b2e55 100644 --- a/connect/tls_test.go +++ b/connect/tls_test.go @@ -29,6 +29,14 @@ func Test_verifyServerCertMatchesURI(t *testing.T) { expected: connect.TestSpiffeIDService(t, "web"), wantErr: false, }, + { + // Could happen during migration of secondary DC to multi-DC. Trust domain + // validity is enforced with x509 name constraints where needed. + name: "different trust-domain allowed", + certs: TestPeerCertificates(t, "web", ca1), + expected: connect.TestSpiffeIDServiceWithHost(t, "web", "other.consul"), + wantErr: false, + }, { name: "mismatch", certs: TestPeerCertificates(t, "web", ca1),