From 39e4ae25acc570eee879fa05faca424b8d8cafdb Mon Sep 17 00:00:00 2001 From: "R.B. Boyer" Date: Mon, 8 Feb 2021 13:18:51 -0600 Subject: [PATCH] connect: connect CA Roots in the primary datacenter should use a SigningKeyID derived from their local intermediate (#9428) This fixes an issue where leaf certificates issued in primary datacenters using Vault as a Connect CA would be reissued very frequently (every ~20 seconds) because the logic meant to detect root rotation was errantly triggering. The hash of the rootCA was being compared against a hash of the intermediateCA and always failing. This doesn't apply to the Consul built-in CA provider because there is no intermediate in use in the primary DC. This is reminiscent of #6513 --- .changelog/9428.txt | 3 + GNUmakefile | 4 + agent/agent_endpoint_test.go | 126 ++++++++++++++++++++++++++++ agent/consul/leader_connect_ca.go | 18 +++- agent/consul/leader_connect_test.go | 105 +++++++++++++++++++++++ 5 files changed, 254 insertions(+), 2 deletions(-) create mode 100644 .changelog/9428.txt diff --git a/.changelog/9428.txt b/.changelog/9428.txt new file mode 100644 index 0000000000..5405590fc2 --- /dev/null +++ b/.changelog/9428.txt @@ -0,0 +1,3 @@ +```release-note:bug +connect: connect CA Roots in the primary datacenter should use a SigningKeyID derived from their local intermediate +``` diff --git a/GNUmakefile b/GNUmakefile index 3253280e2c..5bfb47ee18 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -359,10 +359,14 @@ ifeq ("$(CIRCLECI)","true") gotestsum --format=short-verbose --junitfile "$(TEST_RESULTS_DIR)/gotestsum-report.xml" -- -cover -coverprofile=coverage.txt ./agent/connect/ca # Run leader tests that require Vault gotestsum --format=short-verbose --junitfile "$(TEST_RESULTS_DIR)/gotestsum-report-leader.xml" -- -cover -coverprofile=coverage-leader.txt -run TestLeader_Vault_ ./agent/consul +# Run agent tests that require Vault + gotestsum --format=short-verbose --junitfile "$(TEST_RESULTS_DIR)/gotestsum-report-agent.xml" -- -cover -coverprofile=coverage-agent.txt -run '.*_Vault_' ./agent else # Run locally @echo "Running /agent/connect/ca tests in verbose mode" @go test -v ./agent/connect/ca + @go test -v ./agent/consul -run 'TestLeader_Vault_' + @go test -v ./agent -run '.*_Vault_' endif proto: $(PROTOGOFILES) $(PROTOGOBINFILES) diff --git a/agent/agent_endpoint_test.go b/agent/agent_endpoint_test.go index 1949ed82ad..1b238c068a 100644 --- a/agent/agent_endpoint_test.go +++ b/agent/agent_endpoint_test.go @@ -27,6 +27,7 @@ import ( "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/config" "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/connect/ca" "github.com/hashicorp/consul/agent/consul" "github.com/hashicorp/consul/agent/debug" "github.com/hashicorp/consul/agent/local" @@ -5788,6 +5789,131 @@ func TestAgentConnectCALeafCert_goodNotLocal(t *testing.T) { } } +func TestAgentConnectCALeafCert_Vault_doesNotChurnLeafCertsAtIdle(t *testing.T) { + ca.SkipIfVaultNotPresent(t) + + if testing.Short() { + t.Skip("too slow for testing.Short") + } + + t.Parallel() + + testVault := ca.NewTestVaultServer(t) + defer testVault.Stop() + + assert := assert.New(t) + require := require.New(t) + a := StartTestAgent(t, TestAgent{Overrides: fmt.Sprintf(` + connect { + test_ca_leaf_root_change_spread = "1ns" + ca_provider = "vault" + ca_config { + address = %[1]q + token = %[2]q + root_pki_path = "pki-root/" + intermediate_pki_path = "pki-intermediate/" + } + } + `, testVault.Addr, testVault.RootToken)}) + defer a.Shutdown() + testrpc.WaitForTestAgent(t, a.RPC, "dc1") + testrpc.WaitForActiveCARoot(t, a.RPC, "dc1", nil) + + var ca1 *structs.CARoot + { + args := &structs.DCSpecificRequest{Datacenter: "dc1"} + var reply structs.IndexedCARoots + require.NoError(a.RPC("ConnectCA.Roots", args, &reply)) + for _, r := range reply.Roots { + if r.ID == reply.ActiveRootID { + ca1 = r + break + } + } + require.NotNil(ca1) + } + + { + // Register a local service + args := &structs.ServiceDefinition{ + ID: "foo", + Name: "test", + Address: "127.0.0.1", + Port: 8000, + Check: structs.CheckType{ + TTL: 15 * time.Second, + }, + } + req, _ := http.NewRequest("PUT", "/v1/agent/service/register", jsonReader(args)) + resp := httptest.NewRecorder() + _, err := a.srv.AgentRegisterService(resp, req) + require.NoError(err) + if !assert.Equal(200, resp.Code) { + t.Log("Body: ", resp.Body.String()) + } + } + + // List + req, _ := http.NewRequest("GET", "/v1/agent/connect/ca/leaf/test", nil) + resp := httptest.NewRecorder() + obj, err := a.srv.AgentConnectCALeafCert(resp, req) + require.NoError(err) + require.Equal("MISS", resp.Header().Get("X-Cache")) + + // Get the issued cert + issued, ok := obj.(*structs.IssuedCert) + assert.True(ok) + + // Verify that the cert is signed by the CA + requireLeafValidUnderCA(t, issued, ca1) + + // Verify blocking index + assert.True(issued.ModifyIndex > 0) + assert.Equal(fmt.Sprintf("%d", issued.ModifyIndex), + resp.Header().Get("X-Consul-Index")) + + // Test caching + { + // Fetch it again + resp := httptest.NewRecorder() + obj2, err := a.srv.AgentConnectCALeafCert(resp, req) + require.NoError(err) + require.Equal(obj, obj2) + + // Should cache hit this time and not make request + require.Equal("HIT", resp.Header().Get("X-Cache")) + } + + // Test that we aren't churning leaves for no reason at idle. + { + ch := make(chan error, 1) + go func() { + req, _ := http.NewRequest("GET", "/v1/agent/connect/ca/leaf/test?index="+strconv.Itoa(int(issued.ModifyIndex)), nil) + resp := httptest.NewRecorder() + obj, err := a.srv.AgentConnectCALeafCert(resp, req) + if err != nil { + ch <- err + } else { + issued2 := obj.(*structs.IssuedCert) + if issued.CertPEM == issued2.CertPEM { + ch <- fmt.Errorf("leaf woke up unexpectedly with same cert") + } else { + ch <- fmt.Errorf("leaf woke up unexpectedly with new cert") + } + } + }() + + start := time.Now() + + select { + case <-time.After(5 * time.Second): + case err := <-ch: + dur := time.Since(start) + t.Fatalf("unexpected return from blocking query; leaf churned during idle period, took %s: %v", dur, err) + } + } +} + func TestAgentConnectCALeafCert_secondaryDC_good(t *testing.T) { if testing.Short() { t.Skip("too slow for testing.Short") diff --git a/agent/consul/leader_connect_ca.go b/agent/consul/leader_connect_ca.go index ff30d50df1..d7bcd42221 100644 --- a/agent/consul/leader_connect_ca.go +++ b/agent/consul/leader_connect_ca.go @@ -416,7 +416,7 @@ func (c *CAManager) initializeRootCA(provider ca.Provider, conf *structs.CAConfi if err != nil { return fmt.Errorf("error generating intermediate cert: %v", err) } - _, err = connect.ParseCert(interPEM) + intermediateCert, err := connect.ParseCert(interPEM) if err != nil { return fmt.Errorf("error getting intermediate cert: %v", err) } @@ -439,6 +439,13 @@ func (c *CAManager) initializeRootCA(provider ca.Provider, conf *structs.CAConfi } } + // Versions prior to 1.9.3, 1.8.8, and 1.7.12 incorrectly used the primary + // rootCA's subjectKeyID here instead of the intermediate. For + // provider=consul this didn't matter since there are no intermediates in + // the primaryDC, but for vault it does matter. + expectedSigningKeyID := connect.EncodeSigningKeyID(intermediateCert.SubjectKeyId) + needsSigningKeyUpdate := (rootCA.SigningKeyID != expectedSigningKeyID) + // Check if the CA root is already initialized and exit if it is, // adding on any existing intermediate certs since they aren't directly // tied to the provider. @@ -449,7 +456,10 @@ func (c *CAManager) initializeRootCA(provider ca.Provider, conf *structs.CAConfi if err != nil { return err } - if activeRoot != nil { + if activeRoot != nil && needsSigningKeyUpdate { + c.logger.Info("Correcting stored SigningKeyID value", "previous", rootCA.SigningKeyID, "updated", expectedSigningKeyID) + + } else if activeRoot != nil && !needsSigningKeyUpdate { // This state shouldn't be possible to get into because we update the root and // CA config in the same FSM operation. if activeRoot.ID != rootCA.ID { @@ -462,6 +472,10 @@ func (c *CAManager) initializeRootCA(provider ca.Provider, conf *structs.CAConfi return nil } + if needsSigningKeyUpdate { + rootCA.SigningKeyID = expectedSigningKeyID + } + // Get the highest index idx, _, err := state.CARoots(nil) if err != nil { diff --git a/agent/consul/leader_connect_test.go b/agent/consul/leader_connect_test.go index 12edec0916..bb378a1d67 100644 --- a/agent/consul/leader_connect_test.go +++ b/agent/consul/leader_connect_test.go @@ -579,6 +579,111 @@ func TestLeader_SecondaryCA_IntermediateRefresh(t *testing.T) { require.NoError(err) } +func TestLeader_Vault_PrimaryCA_FixSigningKeyID_OnRestart(t *testing.T) { + ca.SkipIfVaultNotPresent(t) + + if testing.Short() { + t.Skip("too slow for testing.Short") + } + + t.Parallel() + + testVault := ca.NewTestVaultServer(t) + defer testVault.Stop() + + dir1pre, s1pre := testServerWithConfig(t, func(c *Config) { + c.Build = "1.6.0" + c.PrimaryDatacenter = "dc1" + c.CAConfig = &structs.CAConfiguration{ + Provider: "vault", + Config: map[string]interface{}{ + "Address": testVault.Addr, + "Token": testVault.RootToken, + "RootPKIPath": "pki-root/", + "IntermediatePKIPath": "pki-intermediate/", + }, + } + }) + defer os.RemoveAll(dir1pre) + defer s1pre.Shutdown() + + testrpc.WaitForLeader(t, s1pre.RPC, "dc1") + + // Restore the pre-1.9.3/1.8.8/1.7.12 behavior of the SigningKeyID not being derived + // from the intermediates in the primary (which only matters for provider=vault). + var primaryRootSigningKeyID string + { + state := s1pre.fsm.State() + + // Get the highest index + idx, activePrimaryRoot, err := state.CARootActive(nil) + require.NoError(t, err) + require.NotNil(t, activePrimaryRoot) + + rootCert, err := connect.ParseCert(activePrimaryRoot.RootCert) + require.NoError(t, err) + + // Force this to be derived just from the root, not the intermediate. + primaryRootSigningKeyID = connect.EncodeSigningKeyID(rootCert.SubjectKeyId) + activePrimaryRoot.SigningKeyID = primaryRootSigningKeyID + + // Store the root cert in raft + resp, err := s1pre.raftApply(structs.ConnectCARequestType, &structs.CARequest{ + Op: structs.CAOpSetRoots, + Index: idx, + Roots: []*structs.CARoot{activePrimaryRoot}, + }) + require.NoError(t, err) + if respErr, ok := resp.(error); ok { + t.Fatalf("respErr: %v", respErr) + } + } + + // Shutdown s1pre and restart it to trigger the secondary CA init to correct + // the SigningKeyID. + s1pre.Shutdown() + + dir1, s1 := testServerWithConfig(t, func(c *Config) { + c.DataDir = s1pre.config.DataDir + c.Datacenter = "dc1" + c.PrimaryDatacenter = "dc1" + c.NodeName = s1pre.config.NodeName + c.NodeID = s1pre.config.NodeID + }) + defer os.RemoveAll(dir1) + defer s1.Shutdown() + + testrpc.WaitForLeader(t, s1.RPC, "dc1") + + // Retry since it will take some time to init the primary CA fully and there + // isn't a super clean way to watch specifically until it's done than polling + // the CA provider anyway. + retry.Run(t, func(r *retry.R) { + // verify that the root is now corrected + provider, activeRoot := getCAProviderWithLock(s1) + require.NotNil(r, provider) + require.NotNil(r, activeRoot) + + activeIntermediate, err := provider.ActiveIntermediate() + require.NoError(r, err) + + intermediateCert, err := connect.ParseCert(activeIntermediate) + require.NoError(r, err) + + // Force this to be derived just from the root, not the intermediate. + expect := connect.EncodeSigningKeyID(intermediateCert.SubjectKeyId) + + // The in-memory representation was saw the correction via a setCAProvider call. + require.Equal(r, expect, activeRoot.SigningKeyID) + + // The state store saw the correction, too. + _, activePrimaryRoot, err := s1.fsm.State().CARootActive(nil) + require.NoError(r, err) + require.NotNil(r, activePrimaryRoot) + require.Equal(r, expect, activePrimaryRoot.SigningKeyID) + }) +} + func TestLeader_SecondaryCA_FixSigningKeyID_via_IntermediateRefresh(t *testing.T) { if testing.Short() { t.Skip("too slow for testing.Short")