From 7cddf128e94230be7b80135ddc3e137e9c19a2fa Mon Sep 17 00:00:00 2001 From: Matt Keeler Date: Thu, 21 Jan 2021 09:04:30 -0500 Subject: [PATCH] Backport #9570 to release/1.8.x: Ensure that CA initialization does not block leader election. (#9571) Backport of PR: 9570 After fixing that bug I uncovered a couple more: Fix an issue where we might try to cross sign a cert when we never had a valid root. Fix a potential issue where reconfiguring the CA could cause either the Vault or AWS PCA CA providers to delete resources that are still required by the new incarnation of the CA. Ensure that CA initialization does not block leader election. After fixing that bug I uncovered a couple more: Fix an issue where we might try to cross sign a cert when we never had a valid root. Fix a potential issue where reconfiguring the CA could cause either the Vault or AWS PCA CA providers to delete resources that are still required by the new incarnation of the CA. --- .changelog/9498.txt | 15 ++ agent/connect/ca/mock_Provider.go | 10 +- agent/connect/ca/provider.go | 9 +- agent/connect/ca/provider_aws.go | 16 +- agent/connect/ca/provider_aws_test.go | 150 ++++++++++++++- agent/connect/ca/provider_consul.go | 6 +- agent/connect/ca/provider_vault.go | 25 ++- agent/connect/ca/provider_vault_test.go | 80 ++++++++ agent/consul/leader.go | 6 +- agent/consul/leader_connect.go | 6 +- agent/consul/leader_connect_ca.go | 245 +++++++++++++++++------- agent/consul/leader_connect_ca_test.go | 16 +- agent/consul/leader_connect_test.go | 101 ++++++++++ agent/consul/server.go | 3 +- agent/consul/server_connect.go | 4 +- 15 files changed, 585 insertions(+), 107 deletions(-) create mode 100644 .changelog/9498.txt diff --git a/.changelog/9498.txt b/.changelog/9498.txt new file mode 100644 index 0000000000..8672a8f1e6 --- /dev/null +++ b/.changelog/9498.txt @@ -0,0 +1,15 @@ +```release-note:bug +leader: Fixed a bug that could cause Connect CA initialization failures from allowing leader establishment to complete resulting in potentially infinite leader elections. +``` + +```release-note:bug +connect: Fixed a bug in the Vault Connect CA provider that could cause the intermediate PKI path to be deleted after reconfiguring the CA +``` + +```release-note:bug +connect: Fixed a bug in the AWS PCA Connect CA provider that could cause the intermediate PKI path to be deleted after reconfiguring the CA +``` + +```release-note:bug +connect: Fixed an issue that would prevent updating the Connect CA configuration if the CA provider didn't complete initialization previously. +``` \ No newline at end of file diff --git a/agent/connect/ca/mock_Provider.go b/agent/connect/ca/mock_Provider.go index 1cf77c2e34..59f5374fef 100644 --- a/agent/connect/ca/mock_Provider.go +++ b/agent/connect/ca/mock_Provider.go @@ -55,13 +55,13 @@ func (_m *MockProvider) ActiveRoot() (string, error) { return r0, r1 } -// Cleanup provides a mock function with given fields: -func (_m *MockProvider) Cleanup() error { - ret := _m.Called() +// Cleanup provides a mock function with given fields: providerTypeChange, config +func (_m *MockProvider) Cleanup(providerTypeChange bool, config map[string]interface{}) error { + ret := _m.Called(providerTypeChange, config) var r0 error - if rf, ok := ret.Get(0).(func() error); ok { - r0 = rf() + if rf, ok := ret.Get(0).(func(bool, map[string]interface{}) error); ok { + r0 = rf(providerTypeChange, config) } else { r0 = ret.Error(0) } diff --git a/agent/connect/ca/provider.go b/agent/connect/ca/provider.go index 17538ee6f4..1e0d19b36d 100644 --- a/agent/connect/ca/provider.go +++ b/agent/connect/ca/provider.go @@ -161,8 +161,13 @@ type Provider interface { // Cleanup performs any necessary cleanup that should happen when the provider // is shut down permanently, such as removing a temporary PKI backend in Vault - // created for an intermediate CA. - Cleanup() error + // created for an intermediate CA. Whether the CA provider type is changing + // and the other providers raw configuration is passed along so that the provider + // instance can determine which cleanup steps to perform. For example, when the + // Vault provider is in use and there is no type change occuring, the Vault + // provider should check if the intermediate PKI path is changing. If it is not + // changing then the provider should not remove that path from Vault. + Cleanup(providerTypeChange bool, otherConfig map[string]interface{}) error } // NeedsLogger is an optional interface that allows a CA provider to use the diff --git a/agent/connect/ca/provider_aws.go b/agent/connect/ca/provider_aws.go index 1868ac54ec..3bcbc11094 100644 --- a/agent/connect/ca/provider_aws.go +++ b/agent/connect/ca/provider_aws.go @@ -649,12 +649,26 @@ func (a *AWSProvider) deletePCA() error { } // Cleanup implements Provider -func (a *AWSProvider) Cleanup() error { +func (a *AWSProvider) Cleanup(providerTypeChange bool, otherConfig map[string]interface{}) error { old := atomic.SwapUint32(&a.stopped, 1) if old == 0 { close(a.stopCh) } + if !providerTypeChange { + awsConfig, err := ParseAWSCAConfig(otherConfig) + if err != nil { + return err + } + + // if the provider is being replaced and using an existing PCA instance + // then prevent deletion of that instance if the new provider uses + // the same instance. + if a.config.ExistingARN == awsConfig.ExistingARN { + return nil + } + } + if a.config.DeleteOnExit { if err := a.disablePCA(); err != nil { // Log the error but continue trying to delete as some errors may still diff --git a/agent/connect/ca/provider_aws_test.go b/agent/connect/ca/provider_aws_test.go index 12cf1e3ab8..e4c86d4d70 100644 --- a/agent/connect/ca/provider_aws_test.go +++ b/agent/connect/ca/provider_aws_test.go @@ -5,6 +5,8 @@ import ( "strconv" "testing" + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/service/acmpca" "github.com/hashicorp/consul/agent/connect" "github.com/hashicorp/consul/sdk/testutil" "github.com/stretchr/testify/require" @@ -39,7 +41,7 @@ func TestAWSBootstrapAndSignPrimary(t *testing.T) { "PrivateKeyBits": tc.KeyBits, } provider := testAWSProvider(t, testProviderConfigPrimary(t, cfg)) - defer provider.Cleanup() + defer provider.Cleanup(true, nil) // Generate the root require.NoError(provider.GenerateRoot()) @@ -89,12 +91,12 @@ func TestAWSBootstrapAndSignSecondary(t *testing.T) { skipIfAWSNotConfigured(t) p1 := testAWSProvider(t, testProviderConfigPrimary(t, nil)) - defer p1.Cleanup() + defer p1.Cleanup(true, nil) rootPEM, err := p1.ActiveRoot() require.NoError(t, err) p2 := testAWSProvider(t, testProviderConfigSecondary(t, nil)) - defer p2.Cleanup() + defer p2.Cleanup(true, nil) testSignIntermediateCrossDC(t, p1, p2) @@ -191,14 +193,14 @@ func TestAWSBootstrapAndSignSecondaryConsul(t *testing.T) { require.NoError(t, p1.GenerateRoot()) p2 := testAWSProvider(t, testProviderConfigSecondary(t, nil)) - defer p2.Cleanup() + defer p2.Cleanup(true, nil) testSignIntermediateCrossDC(t, p1, p2) }) t.Run("pri=aws,sec=consul", func(t *testing.T) { p1 := testAWSProvider(t, testProviderConfigPrimary(t, nil)) - defer p1.Cleanup() + defer p1.Cleanup(true, nil) require.NoError(t, p1.GenerateRoot()) conf := testConsulCAConfig() @@ -217,7 +219,7 @@ func TestAWSNoCrossSigning(t *testing.T) { skipIfAWSNotConfigured(t) p1 := testAWSProvider(t, testProviderConfigPrimary(t, nil)) - defer p1.Cleanup() + defer p1.Cleanup(true, nil) // Don't bother initializing a PCA as that is slow and unnecessary for this // test @@ -235,6 +237,142 @@ func TestAWSNoCrossSigning(t *testing.T) { require.Contains(t, err.Error(), "not implemented") } +func TestAWSProvider_Cleanup(t *testing.T) { + // Note not parallel since we could easily hit AWS limits of too many CAs if + // all of these tests run at once. + skipIfAWSNotConfigured(t) + + describeCA := func(t *testing.T, provider *AWSProvider) (bool, error) { + t.Helper() + state, err := provider.State() + require.NoError(t, err) + + // Load from the resource. + input := &acmpca.DescribeCertificateAuthorityInput{ + CertificateAuthorityArn: aws.String(state[AWSStateCAARNKey]), + } + output, err := provider.client.DescribeCertificateAuthority(input) + if err != nil { + return false, err + } + require.NotNil(t, output) + require.NotNil(t, output.CertificateAuthority) + require.NotNil(t, output.CertificateAuthority.Status) + return *output.CertificateAuthority.Status == acmpca.CertificateAuthorityStatusDeleted, nil + } + + requirePCADeleted := func(t *testing.T, provider *AWSProvider) { + deleted, err := describeCA(t, provider) + require.True(t, err != nil || deleted, "The AWS PCA instance has not been deleted") + } + + requirePCANotDeleted := func(t *testing.T, provider *AWSProvider) { + deleted, err := describeCA(t, provider) + require.NoError(t, err) + require.False(t, deleted, "The AWS PCA instance should not have been deleted") + } + + t.Run("provider-change", func(t *testing.T) { + // create a provider with the default config which will create the CA + p1Conf := testProviderConfigPrimary(t, nil) + p1 := testAWSProvider(t, p1Conf) + p1.GenerateRoot() + + t.Cleanup(func() { + // This is a fail safe just in case the Cleanup routine of the + // second provider fails to delete the CA. In that case we want + // to request that the main provider delete it during Cleanup. + if deleted, err := describeCA(t, p1); err == nil && deleted { + p1.Cleanup(false, p1Conf.RawConfig) + } else { + p1.Cleanup(true, nil) + } + }) + + // just ensure that it got created + requirePCANotDeleted(t, p1) + + state, err := p1.State() + require.NoError(t, err) + + p2Conf := testProviderConfigPrimary(t, map[string]interface{}{ + "ExistingARN": state[AWSStateCAARNKey], + }) + p2 := testAWSProvider(t, p2Conf) + + // provider change should trigger deletion of the CA + require.NoError(t, p2.Cleanup(true, nil)) + + requirePCADeleted(t, p1) + }) + + t.Run("arn-change", func(t *testing.T) { + // create a provider with the default config which will create the CA + p1Conf := testProviderConfigPrimary(t, nil) + p1 := testAWSProvider(t, p1Conf) + p1.GenerateRoot() + + t.Cleanup(func() { + // This is a fail safe just in case the Cleanup routine of the + // second provider fails to delete the CA. In that case we want + // to request that the main provider delete it during Cleanup. + if deleted, err := describeCA(t, p1); err == nil || deleted { + p1.Cleanup(false, p1Conf.RawConfig) + } else { + p1.Cleanup(true, nil) + } + }) + + // just ensure that it got created + requirePCANotDeleted(t, p1) + + state, err := p1.State() + require.NoError(t, err) + + p2Conf := testProviderConfigPrimary(t, map[string]interface{}{ + "ExistingARN": state[AWSStateCAARNKey], + }) + p2 := testAWSProvider(t, p2Conf) + + // changing the ARN should cause the other CA to be deleted + p2ConfAltARN := testProviderConfigPrimary(t, map[string]interface{}{ + "ExistingARN": "doesnt-need-to-be-real", + }) + require.NoError(t, p2.Cleanup(false, p2ConfAltARN.RawConfig)) + + requirePCADeleted(t, p1) + }) + + t.Run("arn-not-changed", func(t *testing.T) { + // create a provider with the default config which will create the CA + p1Conf := testProviderConfigPrimary(t, nil) + p1 := testAWSProvider(t, p1Conf) + p1.GenerateRoot() + + t.Cleanup(func() { + // the p2 provider should not remove the CA but we need to ensure that + // we do clean it up + p1.Cleanup(true, nil) + }) + + // just ensure that it got created + requirePCANotDeleted(t, p1) + + state, err := p1.State() + require.NoError(t, err) + + p2Conf := testProviderConfigPrimary(t, map[string]interface{}{ + "ExistingARN": state[AWSStateCAARNKey], + }) + p2 := testAWSProvider(t, p2Conf) + + // because the ARN isn't changing we don't want to remove the CA + require.NoError(t, p2.Cleanup(false, p2Conf.RawConfig)) + + requirePCANotDeleted(t, p1) + }) +} + func testAWSProvider(t *testing.T, cfg ProviderConfig) *AWSProvider { p := &AWSProvider{} logger := testutil.Logger(t) diff --git a/agent/connect/ca/provider_consul.go b/agent/connect/ca/provider_consul.go index 8ea4af7b46..5f81eb6f36 100644 --- a/agent/connect/ca/provider_consul.go +++ b/agent/connect/ca/provider_consul.go @@ -310,7 +310,11 @@ func (c *ConsulProvider) GenerateIntermediate() (string, error) { } // Remove the state store entry for this provider instance. -func (c *ConsulProvider) Cleanup() error { +func (c *ConsulProvider) Cleanup(_ bool, _ map[string]interface{}) error { + // This method only gets called for final cleanup. Therefore we don't + // need to worry about the case where a ca config update is made to + // change the cert ttls but leaving the private key and root cert the + // same. Changing those would change the id field on the provider. args := &structs.CARequest{ Op: structs.CAOpDeleteProviderState, ProviderState: &structs.CAConsulProviderState{ID: c.id}, diff --git a/agent/connect/ca/provider_vault.go b/agent/connect/ca/provider_vault.go index 0510fe8665..d6b527f569 100644 --- a/agent/connect/ca/provider_vault.go +++ b/agent/connect/ca/provider_vault.go @@ -523,10 +523,31 @@ func (c *VaultProvider) SupportsCrossSigning() (bool, error) { // Cleanup unmounts the configured intermediate PKI backend. It's fine to tear // this down and recreate it on small config changes because the intermediate // certs get bundled with the leaf certs, so there's no cost to the CA changing. -func (v *VaultProvider) Cleanup() error { +func (v *VaultProvider) Cleanup(providerTypeChange bool, otherConfig map[string]interface{}) error { v.Stop() - return v.client.Sys().Unmount(v.config.IntermediatePKIPath) + if !providerTypeChange { + newConfig, err := ParseVaultCAConfig(otherConfig) + if err != nil { + return err + } + + // if the intermeidate PKI path isn't changing we don't want to delete it as + // Cleanup is called after initializing the new provider + if newConfig.IntermediatePKIPath == v.config.IntermediatePKIPath { + return nil + } + } + + err := v.client.Sys().Unmount(v.config.IntermediatePKIPath) + + switch err { + case ErrBackendNotMounted, ErrBackendNotInitialized: + // suppress these errors if we didn't finish initialization before + return nil + default: + return err + } } // Stop shuts down the token renew goroutine. diff --git a/agent/connect/ca/provider_vault_test.go b/agent/connect/ca/provider_vault_test.go index d14ceeefc4..d3b36da96b 100644 --- a/agent/connect/ca/provider_vault_test.go +++ b/agent/connect/ca/provider_vault_test.go @@ -359,6 +359,86 @@ func TestVaultProvider_SignIntermediateConsul(t *testing.T) { }) } +func TestVaultProvider_Cleanup(t *testing.T) { + t.Parallel() + + SkipIfVaultNotPresent(t) + + testVault, err := runTestVault(t) + require.NoError(t, err) + + testVault.WaitUntilReady(t) + + t.Run("provider-change", func(t *testing.T) { + provider, err := createVaultProvider(t, true, testVault.Addr, testVault.RootToken, nil) + require.NoError(t, err) + + // ensure that the intermediate PKI mount exists + mounts, err := provider.client.Sys().ListMounts() + require.NoError(t, err) + require.Contains(t, mounts, provider.config.IntermediatePKIPath) + + // call cleanup with a provider change - this should cause removal of the mount + require.NoError(t, provider.Cleanup(true, nil)) + + // verify the mount was removed + mounts, err = provider.client.Sys().ListMounts() + require.NoError(t, err) + require.NotContains(t, mounts, provider.config.IntermediatePKIPath) + }) + + t.Run("pki-path-change", func(t *testing.T) { + provider, err := createVaultProvider(t, true, testVault.Addr, testVault.RootToken, nil) + require.NoError(t, err) + + // ensure that the intermediate PKI mount exists + mounts, err := provider.client.Sys().ListMounts() + require.NoError(t, err) + require.Contains(t, mounts, provider.config.IntermediatePKIPath) + + // call cleanup with an intermediate pki path change - this should cause removal of the mount + require.NoError(t, provider.Cleanup(false, map[string]interface{}{ + "Address": testVault.Addr, + "Token": testVault.RootToken, + "RootPKIPath": "pki-root/", + // + "IntermediatePKIPath": "pki-intermediate2/", + // Tests duration parsing after msgpack type mangling during raft apply. + "LeafCertTTL": []uint8("72h"), + })) + + // verify the mount was removed + mounts, err = provider.client.Sys().ListMounts() + require.NoError(t, err) + require.NotContains(t, mounts, provider.config.IntermediatePKIPath) + }) + + t.Run("pki-path-unchanged", func(t *testing.T) { + provider, err := createVaultProvider(t, true, testVault.Addr, testVault.RootToken, nil) + require.NoError(t, err) + + // ensure that the intermediate PKI mount exists + mounts, err := provider.client.Sys().ListMounts() + require.NoError(t, err) + require.Contains(t, mounts, provider.config.IntermediatePKIPath) + + // call cleanup with no config changes - this should not cause removal of the intermediate pki path + require.NoError(t, provider.Cleanup(false, map[string]interface{}{ + "Address": testVault.Addr, + "Token": testVault.RootToken, + "RootPKIPath": "pki-root/", + "IntermediatePKIPath": "pki-intermediate/", + // Tests duration parsing after msgpack type mangling during raft apply. + "LeafCertTTL": []uint8("72h"), + })) + + // verify the mount was NOT removed + mounts, err = provider.client.Sys().ListMounts() + require.NoError(t, err) + require.Contains(t, mounts, provider.config.IntermediatePKIPath) + }) +} + func getIntermediateCertTTL(t *testing.T, caConf *structs.CAConfiguration) time.Duration { t.Helper() diff --git a/agent/consul/leader.go b/agent/consul/leader.go index 928d8dab03..64c8875d74 100644 --- a/agent/consul/leader.go +++ b/agent/consul/leader.go @@ -334,10 +334,6 @@ func (s *Server) establishLeadership() error { s.getOrCreateAutopilotConfig() s.autopilot.Start() - if err := s.caManager.InitializeCA(); err != nil { - return err - } - s.startConfigReplication() s.startFederationStateReplication() @@ -371,7 +367,7 @@ func (s *Server) revokeLeadership() { s.stopConnectLeader() s.caManager.setCAProvider(nil, nil) - s.caManager.setState(CAStateUninitialized, false) + s.caManager.setState(caStateUninitialized, false) s.stopACLTokenReaping() diff --git a/agent/consul/leader_connect.go b/agent/consul/leader_connect.go index 976b5fa2c6..f9d0eb2051 100644 --- a/agent/consul/leader_connect.go +++ b/agent/consul/leader_connect.go @@ -41,19 +41,17 @@ func (s *Server) startConnectLeader() { // Start the Connect secondary DC actions if enabled. if s.config.Datacenter != s.config.PrimaryDatacenter { - s.leaderRoutineManager.Start(secondaryCARootWatchRoutineName, s.caManager.secondaryCARootWatch) s.leaderRoutineManager.Start(intentionReplicationRoutineName, s.replicateIntentions) } - s.leaderRoutineManager.Start(intermediateCertRenewWatchRoutineName, s.caManager.intermediateCertRenewalWatch) + s.caManager.Start() s.leaderRoutineManager.Start(caRootPruningRoutineName, s.runCARootPruning) } // stopConnectLeader stops connect specific leader functions. func (s *Server) stopConnectLeader() { - s.leaderRoutineManager.Stop(secondaryCARootWatchRoutineName) + s.caManager.Stop() s.leaderRoutineManager.Stop(intentionReplicationRoutineName) - s.leaderRoutineManager.Stop(intermediateCertRenewWatchRoutineName) s.leaderRoutineManager.Stop(caRootPruningRoutineName) // If the provider implements NeedsStop, we call Stop to perform any shutdown actions. diff --git a/agent/consul/leader_connect_ca.go b/agent/consul/leader_connect_ca.go index 69a3424c1a..ff30d50df1 100644 --- a/agent/consul/leader_connect_ca.go +++ b/agent/consul/leader_connect_ca.go @@ -17,14 +17,14 @@ import ( uuid "github.com/hashicorp/go-uuid" ) -type CAState string +type caState string const ( - CAStateUninitialized CAState = "UNINITIALIZED" - CAStateInitializing = "INITIALIZING" - CAStateReady = "READY" - CAStateRenewIntermediate = "RENEWING" - CAStateReconfig = "RECONFIGURING" + caStateUninitialized caState = "UNINITIALIZED" + caStateInitializing = "INITIALIZING" + caStateInitialized = "INITIALIZED" + caStateRenewIntermediate = "RENEWING" + caStateReconfig = "RECONFIGURING" ) // caServerDelegate is an interface for server operations for facilitating @@ -61,9 +61,11 @@ type CAManager struct { // stateLock protects the internal state used for administrative CA tasks. stateLock sync.Mutex - state CAState + state caState primaryRoots structs.IndexedCARoots // The most recently seen state of the root CAs from the primary datacenter. actingSecondaryCA bool // True if this datacenter has been initialized as a secondary CA. + + leaderRoutineManager *LeaderRoutineManager } type caDelegateWithState struct { @@ -74,36 +76,47 @@ func (c *caDelegateWithState) State() *state.Store { return c.fsm.State() } -func NewCAManager(delegate caServerDelegate, logger hclog.Logger, config *Config) *CAManager { +func NewCAManager(delegate caServerDelegate, leaderRoutineManager *LeaderRoutineManager, logger hclog.Logger, config *Config) *CAManager { return &CAManager{ - delegate: delegate, - logger: logger, - serverConf: config, - state: CAStateUninitialized, + delegate: delegate, + logger: logger, + serverConf: config, + state: caStateUninitialized, + leaderRoutineManager: leaderRoutineManager, } } func (c *CAManager) reset() { - c.state = CAStateUninitialized + c.state = caStateUninitialized c.primaryRoots = structs.IndexedCARoots{} c.actingSecondaryCA = false c.setCAProvider(nil, nil) } // setState attempts to update the CA state to the given state. -// If the current state is not READY, this will fail. The only exception is when -// the current state is UNINITIALIZED, and the function is called with CAStateInitializing. -func (c *CAManager) setState(newState CAState, validateState bool) error { +// Valid state transitions are: +// +// caStateInitialized -> +// caStateUninitialized -> caStateInitializing +// caStateUninitialized -> caStateReconfig +// +// Other state transitions may be forced if the validateState parameter is set to false. +// This will mainly be used in deferred functions which aim to set the final status based +// a successful/error return. +func (c *CAManager) setState(newState caState, validateState bool) (caState, error) { c.stateLock.Lock() defer c.stateLock.Unlock() state := c.state - if !validateState || state == CAStateReady || (state == CAStateUninitialized && newState == CAStateInitializing) { + if !validateState || + state == caStateInitialized || + (state == caStateUninitialized && newState == caStateInitializing) || + (state == caStateUninitialized && newState == caStateReconfig) { c.state = newState } else { - return fmt.Errorf("CA is already in state %q", state) + return state, fmt.Errorf("CA is already in state %q", state) } - return nil + return state, nil } // setPrimaryRoots updates the most recently seen roots from the primary. @@ -111,7 +124,7 @@ func (c *CAManager) setPrimaryRoots(newRoots structs.IndexedCARoots) error { c.stateLock.Lock() defer c.stateLock.Unlock() - if c.state == CAStateInitializing || c.state == CAStateReconfig { + if c.state == caStateInitializing || c.state == caStateReconfig { c.primaryRoots = newRoots } else { return fmt.Errorf("Cannot update primary roots in state %q", c.state) @@ -234,21 +247,87 @@ func (c *CAManager) setCAProvider(newProvider ca.Provider, root *structs.CARoot) c.providerLock.Unlock() } +func (c *CAManager) Start() { + // Attempt to initialize the Connect CA now. This will + // happen during leader establishment and it would be great + // if the CA was ready to go once that process was finished. + if err := c.InitializeCA(); err != nil { + c.logger.Error("Failed to initialize Connect CA", "error", err) + + // we failed to fully initialize the CA so we need to spawn a + // go routine to retry this process until it succeeds or we lose + // leadership and the go routine gets stopped. + c.leaderRoutineManager.Start(backgroundCAInitializationRoutineName, c.backgroundCAInitialization) + } else { + // We only start these if CA initialization was successful. If not the completion of the + // background CA initialization will start these routines. + c.startPostInitializeRoutines() + } +} + +func (c *CAManager) Stop() { + c.leaderRoutineManager.Stop(secondaryCARootWatchRoutineName) + c.leaderRoutineManager.Stop(intermediateCertRenewWatchRoutineName) + c.leaderRoutineManager.Stop(backgroundCAInitializationRoutineName) +} + +func (c *CAManager) startPostInitializeRoutines() { + // Start the Connect secondary DC actions if enabled. + if c.serverConf.Datacenter != c.serverConf.PrimaryDatacenter { + c.leaderRoutineManager.Start(secondaryCARootWatchRoutineName, c.secondaryCARootWatch) + } + + c.leaderRoutineManager.Start(intermediateCertRenewWatchRoutineName, c.intermediateCertRenewalWatch) +} + +func (c *CAManager) backgroundCAInitialization(ctx context.Context) error { + retryLoopBackoffAbortOnSuccess(ctx, c.InitializeCA, func(err error) { + c.logger.Error("Failed to initialize Connect CA", + "routine", backgroundCAInitializationRoutineName, + "error", err, + ) + }) + + if err := ctx.Err(); err != nil { + return err + } + + c.logger.Info("Successfully initialized the Connect CA") + + c.startPostInitializeRoutines() + return nil +} + // InitializeCA sets up the CA provider when gaining leadership, either bootstrapping // the CA if this is the primary DC or making a remote RPC for intermediate signing // if this is a secondary DC. -func (c *CAManager) InitializeCA() error { +func (c *CAManager) InitializeCA() (reterr error) { // Bail if connect isn't enabled. if !c.serverConf.ConnectEnabled { return nil } // Update the state before doing anything else. - err := c.setState(CAStateInitializing, true) + oldState, err := c.setState(caStateInitializing, true) + // if we were already in the initialized state then there is nothing to be done. + if oldState == caStateInitialized { + return nil + } if err != nil { return err } - defer c.setState(CAStateReady, false) + + defer func() { + // Using named return values in deferred funcs isnt too common in our code + // but it is first class Go functionality. The error erturned from the + // main func will be available by its given name within deferred functions. + // See: https://blog.golang.org/defer-panic-and-recover + if reterr == nil { + c.setState(caStateInitialized, false) + } else { + c.setState(caStateUninitialized, false) + } + }() // Initialize the provider based on the current config. conf, err := c.initializeCAConfig() @@ -613,12 +692,29 @@ func (c *CAManager) persistNewRootAndConfig(provider ca.Provider, newActiveRoot return nil } -func (c *CAManager) UpdateConfiguration(args *structs.CARequest) error { +func (c *CAManager) UpdateConfiguration(args *structs.CARequest) (reterr error) { // Attempt to update the state first. - if err := c.setState(CAStateReconfig, true); err != nil { + oldState, err := c.setState(caStateReconfig, true) + if err != nil { + return err + } + defer func() { + // Using named return values in deferred funcs isnt too common in our code + // but it is first class Go functionality. The error erturned from the + // main func will be available by its given name within deferred functions. + // See: https://blog.golang.org/defer-panic-and-recover + if reterr == nil { + c.setState(caStateInitialized, false) + } else { + c.setState(oldState, false) + } + }() + + // Attempt to initialize the config if we failed to do so in InitializeCA for some reason + _, err = c.initializeCAConfig() + if err != nil { return err } - defer c.setState(CAStateReady, false) // Exit early if it's a no-op change state := c.delegate.State() @@ -674,7 +770,7 @@ func (c *CAManager) UpdateConfiguration(args *structs.CARequest) error { cleanupNewProvider := true defer func() { if cleanupNewProvider { - if err := newProvider.Cleanup(); err != nil { + if err := newProvider.Cleanup(args.Config.Provider != config.Provider, args.Config.Config); err != nil { c.logger.Warn("failed to clean up CA provider while handling startup failure", "provider", newProvider, "error", err) } } @@ -739,51 +835,60 @@ func (c *CAManager) UpdateConfiguration(args *structs.CARequest) error { return nil } - // At this point, we know the config change has trigged a root rotation, - // either by swapping the provider type or changing the provider's config - // to use a different root certificate. - - // First up, sanity check that the current provider actually supports - // cross-signing. + // get the old CA provider to be used for Cross Signing and to clean it up at the end + // of the functi8on. oldProvider, _ := c.getCAProvider() if oldProvider == nil { return fmt.Errorf("internal error: CA provider is nil") } - canXSign, err := oldProvider.SupportsCrossSigning() - if err != nil { - return fmt.Errorf("CA provider error: %s", err) - } - if !canXSign && !args.Config.ForceWithoutCrossSigning { - return errors.New("The current CA Provider does not support cross-signing. " + - "You can try again with ForceWithoutCrossSigningSet but this may cause " + - "disruption - see documentation for more.") - } - if !canXSign && args.Config.ForceWithoutCrossSigning { - c.logger.Warn("current CA doesn't support cross signing but " + - "CA reconfiguration forced anyway with ForceWithoutCrossSigning") - } - // If it's a config change that would trigger a rotation (different provider/root): - // 1. Get the root from the new provider. - // 2. Call CrossSignCA on the old provider to sign the new root with the old one to - // get a cross-signed certificate. - // 3. Take the active root for the new provider and append the intermediate from step 2 - // to its list of intermediates. - newRoot, err := connect.ParseCert(newRootPEM) - if err != nil { - return err - } - - if canXSign { - // Have the old provider cross-sign the new root - xcCert, err := oldProvider.CrossSignCA(newRoot) + // We only even think about cross signing if the current provider has a root cert + // In some cases such as having a bad CA configuration during startup the provider + // may not have been able to generate a cert. We then want to be able to prevent + // an attempt to cross sign the cert which will definitely fail. + if root != nil { + // If it's a config change that would trigger a rotation (different provider/root): + // 1. Get the root from the new provider. + // 2. Call CrossSignCA on the old provider to sign the new root with the old one to + // get a cross-signed certificate. + // 3. Take the active root for the new provider and append the intermediate from step 2 + // to its list of intermediates. + newRoot, err := connect.ParseCert(newRootPEM) if err != nil { return err } - // Add the cross signed cert to the new CA's intermediates (to be attached - // to leaf certs). - newActiveRoot.IntermediateCerts = []string{xcCert} + // At this point, we know the config change has triggered a root rotation, + // either by swapping the provider type or changing the provider's config + // to use a different root certificate. + + // First up, check that the current provider actually supports + // cross-signing. + canXSign, err := oldProvider.SupportsCrossSigning() + if err != nil { + return fmt.Errorf("CA provider error: %s", err) + } + if !canXSign && !args.Config.ForceWithoutCrossSigning { + return errors.New("The current CA Provider does not support cross-signing. " + + "You can try again with ForceWithoutCrossSigningSet but this may cause " + + "disruption - see documentation for more.") + } + if !canXSign && args.Config.ForceWithoutCrossSigning { + c.logger.Warn("current CA doesn't support cross signing but " + + "CA reconfiguration forced anyway with ForceWithoutCrossSigning") + } + + if canXSign { + // Have the old provider cross-sign the new root + xcCert, err := oldProvider.CrossSignCA(newRoot) + if err != nil { + return err + } + + // Add the cross signed cert to the new CA's intermediates (to be attached + // to leaf certs). + newActiveRoot.IntermediateCerts = []string{xcCert} + } } intermediate, err := newProvider.GenerateIntermediate() @@ -831,8 +936,8 @@ func (c *CAManager) UpdateConfiguration(args *structs.CARequest) error { cleanupNewProvider = false c.setCAProvider(newProvider, newActiveRoot) - if err := oldProvider.Cleanup(); err != nil { - c.logger.Warn("failed to clean up old provider", "provider", config.Provider) + if err := oldProvider.Cleanup(args.Config.Provider != config.Provider, args.Config.Config); err != nil { + c.logger.Warn("failed to clean up old provider", "provider", config.Provider, "error", err) } c.logger.Info("CA rotated to new root under provider", "provider", args.Config.Provider) @@ -924,10 +1029,10 @@ func (c *CAManager) intermediateCertRenewalWatch(ctx context.Context) error { func (c *CAManager) RenewIntermediate(ctx context.Context, isPrimary bool) error { // Grab the 'lock' right away so the provider/config can't be changed out while we check // the intermediate. - if err := c.setState(CAStateRenewIntermediate, true); err != nil { + if _, err := c.setState(caStateRenewIntermediate, true); err != nil { return err } - defer c.setState(CAStateReady, false) + defer c.setState(caStateInitialized, false) provider, _ := c.getCAProvider() if provider == nil { @@ -1053,10 +1158,10 @@ func (c *CAManager) secondaryCARootWatch(ctx context.Context) error { // certificate if necessary. func (c *CAManager) UpdateRoots(roots structs.IndexedCARoots) error { // Update the state first to claim the 'lock'. - if err := c.setState(CAStateReconfig, true); err != nil { + if _, err := c.setState(caStateReconfig, true); err != nil { return err } - defer c.setState(CAStateReady, false) + defer c.setState(caStateInitialized, false) // Update the cached primary roots now that the lock is held. if err := c.setPrimaryRoots(roots); err != nil { @@ -1125,7 +1230,7 @@ func (c *CAManager) setSecondaryCA() error { c.stateLock.Lock() defer c.stateLock.Unlock() - if c.state == CAStateInitializing || c.state == CAStateReconfig { + if c.state == caStateInitializing || c.state == caStateReconfig { c.actingSecondaryCA = true } else { return fmt.Errorf("Cannot update secondary CA flag in state %q", c.state) diff --git a/agent/consul/leader_connect_ca_test.go b/agent/consul/leader_connect_ca_test.go index cc1ba16afa..b667b2f865 100644 --- a/agent/consul/leader_connect_ca_test.go +++ b/agent/consul/leader_connect_ca_test.go @@ -140,7 +140,7 @@ func (m *mockCAProvider) Sign(*x509.CertificateRequest) (string, error) func (m *mockCAProvider) SignIntermediate(*x509.CertificateRequest) (string, error) { return "", nil } func (m *mockCAProvider) CrossSignCA(*x509.Certificate) (string, error) { return "", nil } func (m *mockCAProvider) SupportsCrossSigning() (bool, error) { return false, nil } -func (m *mockCAProvider) Cleanup() error { return nil } +func (m *mockCAProvider) Cleanup(_ bool, _ map[string]interface{}) error { return nil } func waitForCh(t *testing.T, ch chan string, expected string) { select { @@ -200,18 +200,18 @@ func TestCAManager_Initialize(t *testing.T) { conf.PrimaryDatacenter = "dc1" conf.Datacenter = "dc2" delegate := NewMockCAServerDelegate(t, conf) - manager := NewCAManager(delegate, testutil.Logger(t), conf) + manager := NewCAManager(delegate, nil, testutil.Logger(t), conf) // Call InitializeCA and then confirm the RPCs and provider calls // happen in the expected order. - require.EqualValues(t, CAStateUninitialized, manager.state) + require.EqualValues(t, caStateUninitialized, manager.state) errCh := make(chan error) go func() { errCh <- manager.InitializeCA() }() waitForCh(t, delegate.callbackCh, "forwardDC/ConnectCA.Roots") - require.EqualValues(t, CAStateInitializing, manager.state) + require.EqualValues(t, caStateInitializing, manager.state) waitForCh(t, delegate.callbackCh, "provider/GenerateIntermediateCSR") waitForCh(t, delegate.callbackCh, "forwardDC/ConnectCA.SignIntermediate") waitForCh(t, delegate.callbackCh, "provider/SetIntermediate") @@ -226,7 +226,7 @@ func TestCAManager_Initialize(t *testing.T) { t.Fatal("never got result from errCh") } - require.EqualValues(t, CAStateReady, manager.state) + require.EqualValues(t, caStateInitialized, manager.state) } func TestCAManager_UpdateConfigWhileRenewIntermediate(t *testing.T) { @@ -246,7 +246,7 @@ func TestCAManager_UpdateConfigWhileRenewIntermediate(t *testing.T) { conf.PrimaryDatacenter = "dc1" conf.Datacenter = "dc2" delegate := NewMockCAServerDelegate(t, conf) - manager := NewCAManager(delegate, testutil.Logger(t), conf) + manager := NewCAManager(delegate, nil, testutil.Logger(t), conf) initTestManager(t, manager, delegate) // Wait half the TTL for the cert to need renewing. @@ -264,7 +264,7 @@ func TestCAManager_UpdateConfigWhileRenewIntermediate(t *testing.T) { // Call UpdateConfiguration while RenewIntermediate is still in-flight to // make sure we get an error about the state being occupied. go func() { - require.EqualValues(t, CAStateRenewIntermediate, manager.state) + require.EqualValues(t, caStateRenewIntermediate, manager.state) require.Error(t, errors.New("already in state"), manager.UpdateConfiguration(&structs.CARequest{})) }() @@ -281,5 +281,5 @@ func TestCAManager_UpdateConfigWhileRenewIntermediate(t *testing.T) { t.Fatal("never got result from errCh") } - require.EqualValues(t, CAStateReady, manager.state) + require.EqualValues(t, caStateInitialized, manager.state) } diff --git a/agent/consul/leader_connect_test.go b/agent/consul/leader_connect_test.go index 4dd59aa27f..2226bcaa50 100644 --- a/agent/consul/leader_connect_test.go +++ b/agent/consul/leader_connect_test.go @@ -1585,3 +1585,104 @@ func TestLeader_retryLoopBackoffHandleSuccess(t *testing.T) { }) } } + +func TestLeader_Vault_BadCAConfigShouldntPreventLeaderEstablishment(t *testing.T) { + ca.SkipIfVaultNotPresent(t) + + testVault := ca.NewTestVaultServer(t) + defer testVault.Stop() + + _, s1 := testServerWithConfig(t, func(c *Config) { + c.Build = "1.9.1" + c.PrimaryDatacenter = "dc1" + c.CAConfig = &structs.CAConfiguration{ + Provider: "vault", + Config: map[string]interface{}{ + "Address": testVault.Addr, + "Token": "not-the-root", + "RootPKIPath": "pki-root/", + "IntermediatePKIPath": "pki-intermediate/", + }, + } + }) + defer s1.Shutdown() + + waitForLeaderEstablishment(t, s1) + + rootsList, activeRoot, err := getTestRoots(s1, "dc1") + require.NoError(t, err) + require.Empty(t, rootsList.Roots) + require.Nil(t, activeRoot) + + // Now that the leader is up and we have verified that there are no roots / CA init failed, + // verify that we can reconfigure away from the bad configuration. + newConfig := &structs.CAConfiguration{ + Provider: "vault", + Config: map[string]interface{}{ + "Address": testVault.Addr, + "Token": testVault.RootToken, + "RootPKIPath": "pki-root/", + "IntermediatePKIPath": "pki-intermediate/", + }, + } + { + args := &structs.CARequest{ + Datacenter: "dc1", + Config: newConfig, + } + var reply interface{} + + retry.Run(t, func(r *retry.R) { + require.NoError(r, s1.RPC("ConnectCA.ConfigurationSet", args, &reply)) + }) + } + + rootsList, activeRoot, err = getTestRoots(s1, "dc1") + require.NoError(t, err) + require.NotEmpty(t, rootsList.Roots) + require.NotNil(t, activeRoot) +} + +func TestLeader_Consul_BadCAConfigShouldntPreventLeaderEstablishment(t *testing.T) { + ca.SkipIfVaultNotPresent(t) + + _, s1 := testServerWithConfig(t, func(c *Config) { + c.Build = "1.9.1" + c.PrimaryDatacenter = "dc1" + c.CAConfig = &structs.CAConfiguration{ + Provider: "consul", + Config: map[string]interface{}{ + "RootCert": "garbage", + }, + } + }) + defer s1.Shutdown() + + waitForLeaderEstablishment(t, s1) + + rootsList, activeRoot, err := getTestRoots(s1, "dc1") + require.NoError(t, err) + require.Empty(t, rootsList.Roots) + require.Nil(t, activeRoot) + + newConfig := &structs.CAConfiguration{ + Provider: "consul", + Config: map[string]interface{}{}, + } + { + args := &structs.CARequest{ + Datacenter: "dc1", + Config: newConfig, + } + var reply interface{} + + retry.Run(t, func(r *retry.R) { + require.NoError(r, s1.RPC("ConnectCA.ConfigurationSet", args, &reply)) + }) + } + + rootsList, activeRoot, err = getTestRoots(s1, "dc1") + require.NoError(t, err) + require.NotEmpty(t, rootsList.Roots) + require.NotNil(t, activeRoot) +} diff --git a/agent/consul/server.go b/agent/consul/server.go index a190dd740b..d39c5e6b52 100644 --- a/agent/consul/server.go +++ b/agent/consul/server.go @@ -108,6 +108,7 @@ const ( intentionReplicationRoutineName = "intention replication" secondaryCARootWatchRoutineName = "secondary CA roots watch" intermediateCertRenewWatchRoutineName = "intermediate cert renew watch" + backgroundCAInitializationRoutineName = "CA initialization" ) var ( @@ -480,7 +481,7 @@ func NewServer(config *Config, options ...ConsulOption) (*Server, error) { return nil, fmt.Errorf("Failed to start Raft: %v", err) } - s.caManager = NewCAManager(&caDelegateWithState{s}, s.loggers.Named(logging.Connect), s.config) + s.caManager = NewCAManager(&caDelegateWithState{s}, s.leaderRoutineManager, s.loggers.Named(logging.Connect), s.config) if s.config.ConnectEnabled && (s.config.AutoEncryptAllowTLS || s.config.AutoConfigAuthzEnabled) { go s.connectCARootsMonitor(&lib.StopChannelContext{StopCh: s.shutdownCh}) } diff --git a/agent/consul/server_connect.go b/agent/consul/server_connect.go index e854d813a6..9401684298 100644 --- a/agent/consul/server_connect.go +++ b/agent/consul/server_connect.go @@ -134,9 +134,9 @@ func (s *Server) getCARoots(ws memdb.WatchSet, state *state.Store) (*structs.Ind func (s *Server) SignCertificate(csr *x509.CertificateRequest, spiffeID connect.CertURI) (*structs.IssuedCert, error) { provider, caRoot := s.caManager.getCAProvider() if provider == nil { - return nil, fmt.Errorf("internal error: CA provider is nil") + return nil, fmt.Errorf("CA is uninitialized and unable to sign certificates yet: provider is nil") } else if caRoot == nil { - return nil, fmt.Errorf("internal error: CA root is nil") + return nil, fmt.Errorf("CA is uninitialized and unable to sign certificates yet: no root certificate") } // Verify that the CSR entity is in the cluster's trust domain