Ensure that CA initialization does not block leader election.

After fixing that bug I uncovered a couple more:

Fix an issue where we might try to cross sign a cert when we never had a valid root.
Fix a potential issue where reconfiguring the CA could cause either the Vault or AWS PCA CA providers to delete resources that are still required by the new incarnation of the CA.
This commit is contained in:
Matt Keeler 2021-01-15 13:20:27 -05:00
parent cf4fa18af4
commit d9d4c492ab
No known key found for this signature in database
GPG Key ID: 04DBAE1857E0081B
15 changed files with 587 additions and 111 deletions

15
.changelog/9498.txt Normal file
View File

@ -0,0 +1,15 @@
```release-note:bug
leader: Fixed a bug that could cause Connect CA initialization failures from allowing leader establishment to complete resulting in potentially infinite leader elections.
```
```release-note:bug
connect: Fixed a bug in the Vault Connect CA provider that could cause the intermediate PKI path to be deleted after reconfiguring the CA
```
```release-note:bug
connect: Fixed a bug in the AWS PCA Connect CA provider that could cause the intermediate PKI path to be deleted after reconfiguring the CA
```
```release-note:bug
connect: Fixed an issue that would prevent updating the Connect CA configuration if the CA provider didn't complete initialization previously.
```

View File

@ -55,13 +55,13 @@ func (_m *MockProvider) ActiveRoot() (string, error) {
return r0, r1
}
// Cleanup provides a mock function with given fields:
func (_m *MockProvider) Cleanup() error {
ret := _m.Called()
// Cleanup provides a mock function with given fields: providerTypeChange, config
func (_m *MockProvider) Cleanup(providerTypeChange bool, config map[string]interface{}) error {
ret := _m.Called(providerTypeChange, config)
var r0 error
if rf, ok := ret.Get(0).(func() error); ok {
r0 = rf()
if rf, ok := ret.Get(0).(func(bool, map[string]interface{}) error); ok {
r0 = rf(providerTypeChange, config)
} else {
r0 = ret.Error(0)
}

View File

@ -161,8 +161,13 @@ type Provider interface {
// Cleanup performs any necessary cleanup that should happen when the provider
// is shut down permanently, such as removing a temporary PKI backend in Vault
// created for an intermediate CA.
Cleanup() error
// created for an intermediate CA. Whether the CA provider type is changing
// and the other providers raw configuration is passed along so that the provider
// instance can determine which cleanup steps to perform. For example, when the
// Vault provider is in use and there is no type change occuring, the Vault
// provider should check if the intermediate PKI path is changing. If it is not
// changing then the provider should not remove that path from Vault.
Cleanup(providerTypeChange bool, otherConfig map[string]interface{}) error
}
// NeedsLogger is an optional interface that allows a CA provider to use the

View File

@ -303,6 +303,7 @@ func (a *AWSProvider) createPCA() error {
a.logger.Debug("creating new PCA", "common_name", commonName)
createOutput, err := a.client.CreateCertificateAuthority(&createInput)
if err != nil {
a.logger.Error("failed to create new PCA", "common_name", commonName, "error", err)
return err
}
@ -649,12 +650,26 @@ func (a *AWSProvider) deletePCA() error {
}
// Cleanup implements Provider
func (a *AWSProvider) Cleanup() error {
func (a *AWSProvider) Cleanup(providerTypeChange bool, otherConfig map[string]interface{}) error {
old := atomic.SwapUint32(&a.stopped, 1)
if old == 0 {
close(a.stopCh)
}
if !providerTypeChange {
awsConfig, err := ParseAWSCAConfig(otherConfig)
if err != nil {
return err
}
// if the provider is being replaced and using an existing PCA instance
// then prevent deletion of that instance if the new provider uses
// the same instance.
if a.config.ExistingARN == awsConfig.ExistingARN {
return nil
}
}
if a.config.DeleteOnExit {
if err := a.disablePCA(); err != nil {
// Log the error but continue trying to delete as some errors may still

View File

@ -5,6 +5,8 @@ import (
"strconv"
"testing"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/acmpca"
"github.com/hashicorp/consul/agent/connect"
"github.com/hashicorp/consul/sdk/testutil"
"github.com/stretchr/testify/require"
@ -39,7 +41,7 @@ func TestAWSBootstrapAndSignPrimary(t *testing.T) {
"PrivateKeyBits": tc.KeyBits,
}
provider := testAWSProvider(t, testProviderConfigPrimary(t, cfg))
defer provider.Cleanup()
defer provider.Cleanup(true, nil)
// Generate the root
require.NoError(provider.GenerateRoot())
@ -89,12 +91,12 @@ func TestAWSBootstrapAndSignSecondary(t *testing.T) {
skipIfAWSNotConfigured(t)
p1 := testAWSProvider(t, testProviderConfigPrimary(t, nil))
defer p1.Cleanup()
defer p1.Cleanup(true, nil)
rootPEM, err := p1.ActiveRoot()
require.NoError(t, err)
p2 := testAWSProvider(t, testProviderConfigSecondary(t, nil))
defer p2.Cleanup()
defer p2.Cleanup(true, nil)
testSignIntermediateCrossDC(t, p1, p2)
@ -191,14 +193,14 @@ func TestAWSBootstrapAndSignSecondaryConsul(t *testing.T) {
require.NoError(t, p1.GenerateRoot())
p2 := testAWSProvider(t, testProviderConfigSecondary(t, nil))
defer p2.Cleanup()
defer p2.Cleanup(true, nil)
testSignIntermediateCrossDC(t, p1, p2)
})
t.Run("pri=aws,sec=consul", func(t *testing.T) {
p1 := testAWSProvider(t, testProviderConfigPrimary(t, nil))
defer p1.Cleanup()
defer p1.Cleanup(true, nil)
require.NoError(t, p1.GenerateRoot())
conf := testConsulCAConfig()
@ -217,7 +219,7 @@ func TestAWSNoCrossSigning(t *testing.T) {
skipIfAWSNotConfigured(t)
p1 := testAWSProvider(t, testProviderConfigPrimary(t, nil))
defer p1.Cleanup()
defer p1.Cleanup(true, nil)
// Don't bother initializing a PCA as that is slow and unnecessary for this
// test
@ -235,6 +237,142 @@ func TestAWSNoCrossSigning(t *testing.T) {
require.Contains(t, err.Error(), "not implemented")
}
func TestAWSProvider_Cleanup(t *testing.T) {
// Note not parallel since we could easily hit AWS limits of too many CAs if
// all of these tests run at once.
skipIfAWSNotConfigured(t)
describeCA := func(t *testing.T, provider *AWSProvider) (bool, error) {
t.Helper()
state, err := provider.State()
require.NoError(t, err)
// Load from the resource.
input := &acmpca.DescribeCertificateAuthorityInput{
CertificateAuthorityArn: aws.String(state[AWSStateCAARNKey]),
}
output, err := provider.client.DescribeCertificateAuthority(input)
if err != nil {
return false, err
}
require.NotNil(t, output)
require.NotNil(t, output.CertificateAuthority)
require.NotNil(t, output.CertificateAuthority.Status)
return *output.CertificateAuthority.Status == acmpca.CertificateAuthorityStatusDeleted, nil
}
requirePCADeleted := func(t *testing.T, provider *AWSProvider) {
deleted, err := describeCA(t, provider)
require.True(t, err != nil || deleted, "The AWS PCA instance has not been deleted")
}
requirePCANotDeleted := func(t *testing.T, provider *AWSProvider) {
deleted, err := describeCA(t, provider)
require.NoError(t, err)
require.False(t, deleted, "The AWS PCA instance should not have been deleted")
}
t.Run("provider-change", func(t *testing.T) {
// create a provider with the default config which will create the CA
p1Conf := testProviderConfigPrimary(t, nil)
p1 := testAWSProvider(t, p1Conf)
p1.GenerateRoot()
t.Cleanup(func() {
// This is a fail safe just in case the Cleanup routine of the
// second provider fails to delete the CA. In that case we want
// to request that the main provider delete it during Cleanup.
if deleted, err := describeCA(t, p1); err == nil && deleted {
p1.Cleanup(false, p1Conf.RawConfig)
} else {
p1.Cleanup(true, nil)
}
})
// just ensure that it got created
requirePCANotDeleted(t, p1)
state, err := p1.State()
require.NoError(t, err)
p2Conf := testProviderConfigPrimary(t, map[string]interface{}{
"ExistingARN": state[AWSStateCAARNKey],
})
p2 := testAWSProvider(t, p2Conf)
// provider change should trigger deletion of the CA
require.NoError(t, p2.Cleanup(true, nil))
requirePCADeleted(t, p1)
})
t.Run("arn-change", func(t *testing.T) {
// create a provider with the default config which will create the CA
p1Conf := testProviderConfigPrimary(t, nil)
p1 := testAWSProvider(t, p1Conf)
p1.GenerateRoot()
t.Cleanup(func() {
// This is a fail safe just in case the Cleanup routine of the
// second provider fails to delete the CA. In that case we want
// to request that the main provider delete it during Cleanup.
if deleted, err := describeCA(t, p1); err == nil || deleted {
p1.Cleanup(false, p1Conf.RawConfig)
} else {
p1.Cleanup(true, nil)
}
})
// just ensure that it got created
requirePCANotDeleted(t, p1)
state, err := p1.State()
require.NoError(t, err)
p2Conf := testProviderConfigPrimary(t, map[string]interface{}{
"ExistingARN": state[AWSStateCAARNKey],
})
p2 := testAWSProvider(t, p2Conf)
// changing the ARN should cause the other CA to be deleted
p2ConfAltARN := testProviderConfigPrimary(t, map[string]interface{}{
"ExistingARN": "doesnt-need-to-be-real",
})
require.NoError(t, p2.Cleanup(false, p2ConfAltARN.RawConfig))
requirePCADeleted(t, p1)
})
t.Run("arn-not-changed", func(t *testing.T) {
// create a provider with the default config which will create the CA
p1Conf := testProviderConfigPrimary(t, nil)
p1 := testAWSProvider(t, p1Conf)
p1.GenerateRoot()
t.Cleanup(func() {
// the p2 provider should not remove the CA but we need to ensure that
// we do clean it up
p1.Cleanup(true, nil)
})
// just ensure that it got created
requirePCANotDeleted(t, p1)
state, err := p1.State()
require.NoError(t, err)
p2Conf := testProviderConfigPrimary(t, map[string]interface{}{
"ExistingARN": state[AWSStateCAARNKey],
})
p2 := testAWSProvider(t, p2Conf)
// because the ARN isn't changing we don't want to remove the CA
require.NoError(t, p2.Cleanup(false, p2Conf.RawConfig))
requirePCANotDeleted(t, p1)
})
}
func testAWSProvider(t *testing.T, cfg ProviderConfig) *AWSProvider {
p := &AWSProvider{}
logger := testutil.Logger(t)

View File

@ -310,7 +310,11 @@ func (c *ConsulProvider) GenerateIntermediate() (string, error) {
}
// Remove the state store entry for this provider instance.
func (c *ConsulProvider) Cleanup() error {
func (c *ConsulProvider) Cleanup(_ bool, _ map[string]interface{}) error {
// This method only gets called for final cleanup. Therefore we don't
// need to worry about the case where a ca config update is made to
// change the cert ttls but leaving the private key and root cert the
// same. Changing those would change the id field on the provider.
args := &structs.CARequest{
Op: structs.CAOpDeleteProviderState,
ProviderState: &structs.CAConsulProviderState{ID: c.id},

View File

@ -523,10 +523,31 @@ func (c *VaultProvider) SupportsCrossSigning() (bool, error) {
// Cleanup unmounts the configured intermediate PKI backend. It's fine to tear
// this down and recreate it on small config changes because the intermediate
// certs get bundled with the leaf certs, so there's no cost to the CA changing.
func (v *VaultProvider) Cleanup() error {
func (v *VaultProvider) Cleanup(providerTypeChange bool, otherConfig map[string]interface{}) error {
v.Stop()
return v.client.Sys().Unmount(v.config.IntermediatePKIPath)
if !providerTypeChange {
newConfig, err := ParseVaultCAConfig(otherConfig)
if err != nil {
return err
}
// if the intermeidate PKI path isn't changing we don't want to delete it as
// Cleanup is called after initializing the new provider
if newConfig.IntermediatePKIPath == v.config.IntermediatePKIPath {
return nil
}
}
err := v.client.Sys().Unmount(v.config.IntermediatePKIPath)
switch err {
case ErrBackendNotMounted, ErrBackendNotInitialized:
// suppress these errors if we didn't finish initialization before
return nil
default:
return err
}
}
// Stop shuts down the token renew goroutine.

View File

@ -359,6 +359,86 @@ func TestVaultProvider_SignIntermediateConsul(t *testing.T) {
})
}
func TestVaultProvider_Cleanup(t *testing.T) {
t.Parallel()
SkipIfVaultNotPresent(t)
testVault, err := runTestVault(t)
require.NoError(t, err)
testVault.WaitUntilReady(t)
t.Run("provider-change", func(t *testing.T) {
provider, err := createVaultProvider(t, true, testVault.Addr, testVault.RootToken, nil)
require.NoError(t, err)
// ensure that the intermediate PKI mount exists
mounts, err := provider.client.Sys().ListMounts()
require.NoError(t, err)
require.Contains(t, mounts, provider.config.IntermediatePKIPath)
// call cleanup with a provider change - this should cause removal of the mount
require.NoError(t, provider.Cleanup(true, nil))
// verify the mount was removed
mounts, err = provider.client.Sys().ListMounts()
require.NoError(t, err)
require.NotContains(t, mounts, provider.config.IntermediatePKIPath)
})
t.Run("pki-path-change", func(t *testing.T) {
provider, err := createVaultProvider(t, true, testVault.Addr, testVault.RootToken, nil)
require.NoError(t, err)
// ensure that the intermediate PKI mount exists
mounts, err := provider.client.Sys().ListMounts()
require.NoError(t, err)
require.Contains(t, mounts, provider.config.IntermediatePKIPath)
// call cleanup with an intermediate pki path change - this should cause removal of the mount
require.NoError(t, provider.Cleanup(false, map[string]interface{}{
"Address": testVault.Addr,
"Token": testVault.RootToken,
"RootPKIPath": "pki-root/",
//
"IntermediatePKIPath": "pki-intermediate2/",
// Tests duration parsing after msgpack type mangling during raft apply.
"LeafCertTTL": []uint8("72h"),
}))
// verify the mount was removed
mounts, err = provider.client.Sys().ListMounts()
require.NoError(t, err)
require.NotContains(t, mounts, provider.config.IntermediatePKIPath)
})
t.Run("pki-path-unchanged", func(t *testing.T) {
provider, err := createVaultProvider(t, true, testVault.Addr, testVault.RootToken, nil)
require.NoError(t, err)
// ensure that the intermediate PKI mount exists
mounts, err := provider.client.Sys().ListMounts()
require.NoError(t, err)
require.Contains(t, mounts, provider.config.IntermediatePKIPath)
// call cleanup with no config changes - this should not cause removal of the intermediate pki path
require.NoError(t, provider.Cleanup(false, map[string]interface{}{
"Address": testVault.Addr,
"Token": testVault.RootToken,
"RootPKIPath": "pki-root/",
"IntermediatePKIPath": "pki-intermediate/",
// Tests duration parsing after msgpack type mangling during raft apply.
"LeafCertTTL": []uint8("72h"),
}))
// verify the mount was NOT removed
mounts, err = provider.client.Sys().ListMounts()
require.NoError(t, err)
require.Contains(t, mounts, provider.config.IntermediatePKIPath)
})
}
func getIntermediateCertTTL(t *testing.T, caConf *structs.CAConfiguration) time.Duration {
t.Helper()

View File

@ -343,10 +343,6 @@ func (s *Server) establishLeadership(ctx context.Context) error {
s.getOrCreateAutopilotConfig()
s.autopilot.Start(ctx)
if err := s.caManager.InitializeCA(); err != nil {
return err
}
s.startConfigReplication()
s.startFederationStateReplication()
@ -391,7 +387,7 @@ func (s *Server) revokeLeadership() {
s.stopConnectLeader()
s.caManager.setCAProvider(nil, nil)
s.caManager.setState(CAStateUninitialized, false)
s.caManager.setState(caStateUninitialized, false)
s.stopACLTokenReaping()

View File

@ -34,21 +34,16 @@ func (s *Server) startConnectLeader() error {
return nil
}
// Start the Connect secondary DC actions if enabled.
if s.config.Datacenter != s.config.PrimaryDatacenter {
s.leaderRoutineManager.Start(secondaryCARootWatchRoutineName, s.caManager.secondaryCARootWatch)
}
s.leaderRoutineManager.Start(intermediateCertRenewWatchRoutineName, s.caManager.intermediateCertRenewalWatch)
s.caManager.Start()
s.leaderRoutineManager.Start(caRootPruningRoutineName, s.runCARootPruning)
return s.startIntentionConfigEntryMigration()
}
// stopConnectLeader stops connect specific leader functions.
func (s *Server) stopConnectLeader() {
s.caManager.Stop()
s.leaderRoutineManager.Stop(intentionMigrationRoutineName)
s.leaderRoutineManager.Stop(secondaryCARootWatchRoutineName)
s.leaderRoutineManager.Stop(intermediateCertRenewWatchRoutineName)
s.leaderRoutineManager.Stop(caRootPruningRoutineName)
// If the provider implements NeedsStop, we call Stop to perform any shutdown actions.

View File

@ -17,14 +17,14 @@ import (
uuid "github.com/hashicorp/go-uuid"
)
type CAState string
type caState string
const (
CAStateUninitialized CAState = "UNINITIALIZED"
CAStateInitializing = "INITIALIZING"
CAStateReady = "READY"
CAStateRenewIntermediate = "RENEWING"
CAStateReconfig = "RECONFIGURING"
caStateUninitialized caState = "UNINITIALIZED"
caStateInitializing = "INITIALIZING"
caStateInitialized = "INITIALIZED"
caStateRenewIntermediate = "RENEWING"
caStateReconfig = "RECONFIGURING"
)
// caServerDelegate is an interface for server operations for facilitating
@ -61,9 +61,11 @@ type CAManager struct {
// stateLock protects the internal state used for administrative CA tasks.
stateLock sync.Mutex
state CAState
state caState
primaryRoots structs.IndexedCARoots // The most recently seen state of the root CAs from the primary datacenter.
actingSecondaryCA bool // True if this datacenter has been initialized as a secondary CA.
leaderRoutineManager *LeaderRoutineManager
}
type caDelegateWithState struct {
@ -74,36 +76,47 @@ func (c *caDelegateWithState) State() *state.Store {
return c.fsm.State()
}
func NewCAManager(delegate caServerDelegate, logger hclog.Logger, config *Config) *CAManager {
func NewCAManager(delegate caServerDelegate, leaderRoutineManager *LeaderRoutineManager, logger hclog.Logger, config *Config) *CAManager {
return &CAManager{
delegate: delegate,
logger: logger,
serverConf: config,
state: CAStateUninitialized,
delegate: delegate,
logger: logger,
serverConf: config,
state: caStateUninitialized,
leaderRoutineManager: leaderRoutineManager,
}
}
func (c *CAManager) reset() {
c.state = CAStateUninitialized
c.state = caStateUninitialized
c.primaryRoots = structs.IndexedCARoots{}
c.actingSecondaryCA = false
c.setCAProvider(nil, nil)
}
// setState attempts to update the CA state to the given state.
// If the current state is not READY, this will fail. The only exception is when
// the current state is UNINITIALIZED, and the function is called with CAStateInitializing.
func (c *CAManager) setState(newState CAState, validateState bool) error {
// Valid state transitions are:
//
// caStateInitialized -> <any state>
// caStateUninitialized -> caStateInitializing
// caStateUninitialized -> caStateReconfig
//
// Other state transitions may be forced if the validateState parameter is set to false.
// This will mainly be used in deferred functions which aim to set the final status based
// a successful/error return.
func (c *CAManager) setState(newState caState, validateState bool) (caState, error) {
c.stateLock.Lock()
defer c.stateLock.Unlock()
state := c.state
if !validateState || state == CAStateReady || (state == CAStateUninitialized && newState == CAStateInitializing) {
if !validateState ||
state == caStateInitialized ||
(state == caStateUninitialized && newState == caStateInitializing) ||
(state == caStateUninitialized && newState == caStateReconfig) {
c.state = newState
} else {
return fmt.Errorf("CA is already in state %q", state)
return state, fmt.Errorf("CA is already in state %q", state)
}
return nil
return state, nil
}
// setPrimaryRoots updates the most recently seen roots from the primary.
@ -111,7 +124,7 @@ func (c *CAManager) setPrimaryRoots(newRoots structs.IndexedCARoots) error {
c.stateLock.Lock()
defer c.stateLock.Unlock()
if c.state == CAStateInitializing || c.state == CAStateReconfig {
if c.state == caStateInitializing || c.state == caStateReconfig {
c.primaryRoots = newRoots
} else {
return fmt.Errorf("Cannot update primary roots in state %q", c.state)
@ -234,21 +247,87 @@ func (c *CAManager) setCAProvider(newProvider ca.Provider, root *structs.CARoot)
c.providerLock.Unlock()
}
func (c *CAManager) Start() {
// Attempt to initialize the Connect CA now. This will
// happen during leader establishment and it would be great
// if the CA was ready to go once that process was finished.
if err := c.InitializeCA(); err != nil {
c.logger.Error("Failed to initialize Connect CA", "error", err)
// we failed to fully initialize the CA so we need to spawn a
// go routine to retry this process until it succeeds or we lose
// leadership and the go routine gets stopped.
c.leaderRoutineManager.Start(backgroundCAInitializationRoutineName, c.backgroundCAInitialization)
} else {
// We only start these if CA initialization was successful. If not the completion of the
// background CA initialization will start these routines.
c.startPostInitializeRoutines()
}
}
func (c *CAManager) Stop() {
c.leaderRoutineManager.Stop(secondaryCARootWatchRoutineName)
c.leaderRoutineManager.Stop(intermediateCertRenewWatchRoutineName)
c.leaderRoutineManager.Stop(backgroundCAInitializationRoutineName)
}
func (c *CAManager) startPostInitializeRoutines() {
// Start the Connect secondary DC actions if enabled.
if c.serverConf.Datacenter != c.serverConf.PrimaryDatacenter {
c.leaderRoutineManager.Start(secondaryCARootWatchRoutineName, c.secondaryCARootWatch)
}
c.leaderRoutineManager.Start(intermediateCertRenewWatchRoutineName, c.intermediateCertRenewalWatch)
}
func (c *CAManager) backgroundCAInitialization(ctx context.Context) error {
retryLoopBackoffAbortOnSuccess(ctx, c.InitializeCA, func(err error) {
c.logger.Error("Failed to initialize Connect CA",
"routine", backgroundCAInitializationRoutineName,
"error", err,
)
})
if err := ctx.Err(); err != nil {
return err
}
c.logger.Info("Successfully initialized the Connect CA")
c.startPostInitializeRoutines()
return nil
}
// InitializeCA sets up the CA provider when gaining leadership, either bootstrapping
// the CA if this is the primary DC or making a remote RPC for intermediate signing
// if this is a secondary DC.
func (c *CAManager) InitializeCA() error {
func (c *CAManager) InitializeCA() (reterr error) {
// Bail if connect isn't enabled.
if !c.serverConf.ConnectEnabled {
return nil
}
// Update the state before doing anything else.
err := c.setState(CAStateInitializing, true)
oldState, err := c.setState(caStateInitializing, true)
// if we were already in the initialized state then there is nothing to be done.
if oldState == caStateInitialized {
return nil
}
if err != nil {
return err
}
defer c.setState(CAStateReady, false)
defer func() {
// Using named return values in deferred funcs isnt too common in our code
// but it is first class Go functionality. The error erturned from the
// main func will be available by its given name within deferred functions.
// See: https://blog.golang.org/defer-panic-and-recover
if reterr == nil {
c.setState(caStateInitialized, false)
} else {
c.setState(caStateUninitialized, false)
}
}()
// Initialize the provider based on the current config.
conf, err := c.initializeCAConfig()
@ -613,12 +692,29 @@ func (c *CAManager) persistNewRootAndConfig(provider ca.Provider, newActiveRoot
return nil
}
func (c *CAManager) UpdateConfiguration(args *structs.CARequest) error {
func (c *CAManager) UpdateConfiguration(args *structs.CARequest) (reterr error) {
// Attempt to update the state first.
if err := c.setState(CAStateReconfig, true); err != nil {
oldState, err := c.setState(caStateReconfig, true)
if err != nil {
return err
}
defer func() {
// Using named return values in deferred funcs isnt too common in our code
// but it is first class Go functionality. The error erturned from the
// main func will be available by its given name within deferred functions.
// See: https://blog.golang.org/defer-panic-and-recover
if reterr == nil {
c.setState(caStateInitialized, false)
} else {
c.setState(oldState, false)
}
}()
// Attempt to initialize the config if we failed to do so in InitializeCA for some reason
_, err = c.initializeCAConfig()
if err != nil {
return err
}
defer c.setState(CAStateReady, false)
// Exit early if it's a no-op change
state := c.delegate.State()
@ -674,7 +770,7 @@ func (c *CAManager) UpdateConfiguration(args *structs.CARequest) error {
cleanupNewProvider := true
defer func() {
if cleanupNewProvider {
if err := newProvider.Cleanup(); err != nil {
if err := newProvider.Cleanup(args.Config.Provider != config.Provider, args.Config.Config); err != nil {
c.logger.Warn("failed to clean up CA provider while handling startup failure", "provider", newProvider, "error", err)
}
}
@ -739,51 +835,60 @@ func (c *CAManager) UpdateConfiguration(args *structs.CARequest) error {
return nil
}
// At this point, we know the config change has trigged a root rotation,
// either by swapping the provider type or changing the provider's config
// to use a different root certificate.
// First up, sanity check that the current provider actually supports
// cross-signing.
// get the old CA provider to be used for Cross Signing and to clean it up at the end
// of the functi8on.
oldProvider, _ := c.getCAProvider()
if oldProvider == nil {
return fmt.Errorf("internal error: CA provider is nil")
}
canXSign, err := oldProvider.SupportsCrossSigning()
if err != nil {
return fmt.Errorf("CA provider error: %s", err)
}
if !canXSign && !args.Config.ForceWithoutCrossSigning {
return errors.New("The current CA Provider does not support cross-signing. " +
"You can try again with ForceWithoutCrossSigningSet but this may cause " +
"disruption - see documentation for more.")
}
if !canXSign && args.Config.ForceWithoutCrossSigning {
c.logger.Warn("current CA doesn't support cross signing but " +
"CA reconfiguration forced anyway with ForceWithoutCrossSigning")
}
// If it's a config change that would trigger a rotation (different provider/root):
// 1. Get the root from the new provider.
// 2. Call CrossSignCA on the old provider to sign the new root with the old one to
// get a cross-signed certificate.
// 3. Take the active root for the new provider and append the intermediate from step 2
// to its list of intermediates.
newRoot, err := connect.ParseCert(newRootPEM)
if err != nil {
return err
}
if canXSign {
// Have the old provider cross-sign the new root
xcCert, err := oldProvider.CrossSignCA(newRoot)
// We only even think about cross signing if the current provider has a root cert
// In some cases such as having a bad CA configuration during startup the provider
// may not have been able to generate a cert. We then want to be able to prevent
// an attempt to cross sign the cert which will definitely fail.
if root != nil {
// If it's a config change that would trigger a rotation (different provider/root):
// 1. Get the root from the new provider.
// 2. Call CrossSignCA on the old provider to sign the new root with the old one to
// get a cross-signed certificate.
// 3. Take the active root for the new provider and append the intermediate from step 2
// to its list of intermediates.
newRoot, err := connect.ParseCert(newRootPEM)
if err != nil {
return err
}
// Add the cross signed cert to the new CA's intermediates (to be attached
// to leaf certs).
newActiveRoot.IntermediateCerts = []string{xcCert}
// At this point, we know the config change has triggered a root rotation,
// either by swapping the provider type or changing the provider's config
// to use a different root certificate.
// First up, check that the current provider actually supports
// cross-signing.
canXSign, err := oldProvider.SupportsCrossSigning()
if err != nil {
return fmt.Errorf("CA provider error: %s", err)
}
if !canXSign && !args.Config.ForceWithoutCrossSigning {
return errors.New("The current CA Provider does not support cross-signing. " +
"You can try again with ForceWithoutCrossSigningSet but this may cause " +
"disruption - see documentation for more.")
}
if !canXSign && args.Config.ForceWithoutCrossSigning {
c.logger.Warn("current CA doesn't support cross signing but " +
"CA reconfiguration forced anyway with ForceWithoutCrossSigning")
}
if canXSign {
// Have the old provider cross-sign the new root
xcCert, err := oldProvider.CrossSignCA(newRoot)
if err != nil {
return err
}
// Add the cross signed cert to the new CA's intermediates (to be attached
// to leaf certs).
newActiveRoot.IntermediateCerts = []string{xcCert}
}
}
intermediate, err := newProvider.GenerateIntermediate()
@ -831,8 +936,8 @@ func (c *CAManager) UpdateConfiguration(args *structs.CARequest) error {
cleanupNewProvider = false
c.setCAProvider(newProvider, newActiveRoot)
if err := oldProvider.Cleanup(); err != nil {
c.logger.Warn("failed to clean up old provider", "provider", config.Provider)
if err := oldProvider.Cleanup(args.Config.Provider != config.Provider, args.Config.Config); err != nil {
c.logger.Warn("failed to clean up old provider", "provider", config.Provider, "error", err)
}
c.logger.Info("CA rotated to new root under provider", "provider", args.Config.Provider)
@ -924,10 +1029,10 @@ func (c *CAManager) intermediateCertRenewalWatch(ctx context.Context) error {
func (c *CAManager) RenewIntermediate(ctx context.Context, isPrimary bool) error {
// Grab the 'lock' right away so the provider/config can't be changed out while we check
// the intermediate.
if err := c.setState(CAStateRenewIntermediate, true); err != nil {
if _, err := c.setState(caStateRenewIntermediate, true); err != nil {
return err
}
defer c.setState(CAStateReady, false)
defer c.setState(caStateInitialized, false)
provider, _ := c.getCAProvider()
if provider == nil {
@ -1053,10 +1158,10 @@ func (c *CAManager) secondaryCARootWatch(ctx context.Context) error {
// certificate if necessary.
func (c *CAManager) UpdateRoots(roots structs.IndexedCARoots) error {
// Update the state first to claim the 'lock'.
if err := c.setState(CAStateReconfig, true); err != nil {
if _, err := c.setState(caStateReconfig, true); err != nil {
return err
}
defer c.setState(CAStateReady, false)
defer c.setState(caStateInitialized, false)
// Update the cached primary roots now that the lock is held.
if err := c.setPrimaryRoots(roots); err != nil {
@ -1125,7 +1230,7 @@ func (c *CAManager) setSecondaryCA() error {
c.stateLock.Lock()
defer c.stateLock.Unlock()
if c.state == CAStateInitializing || c.state == CAStateReconfig {
if c.state == caStateInitializing || c.state == caStateReconfig {
c.actingSecondaryCA = true
} else {
return fmt.Errorf("Cannot update secondary CA flag in state %q", c.state)

View File

@ -138,7 +138,7 @@ func (m *mockCAProvider) Sign(*x509.CertificateRequest) (string, error)
func (m *mockCAProvider) SignIntermediate(*x509.CertificateRequest) (string, error) { return "", nil }
func (m *mockCAProvider) CrossSignCA(*x509.Certificate) (string, error) { return "", nil }
func (m *mockCAProvider) SupportsCrossSigning() (bool, error) { return false, nil }
func (m *mockCAProvider) Cleanup() error { return nil }
func (m *mockCAProvider) Cleanup(_ bool, _ map[string]interface{}) error { return nil }
func waitForCh(t *testing.T, ch chan string, expected string) {
select {
@ -202,18 +202,18 @@ func TestCAManager_Initialize(t *testing.T) {
conf.PrimaryDatacenter = "dc1"
conf.Datacenter = "dc2"
delegate := NewMockCAServerDelegate(t, conf)
manager := NewCAManager(delegate, testutil.Logger(t), conf)
manager := NewCAManager(delegate, nil, testutil.Logger(t), conf)
// Call InitializeCA and then confirm the RPCs and provider calls
// happen in the expected order.
require.EqualValues(t, CAStateUninitialized, manager.state)
require.EqualValues(t, caStateUninitialized, manager.state)
errCh := make(chan error)
go func() {
errCh <- manager.InitializeCA()
}()
waitForCh(t, delegate.callbackCh, "forwardDC/ConnectCA.Roots")
require.EqualValues(t, CAStateInitializing, manager.state)
require.EqualValues(t, caStateInitializing, manager.state)
waitForCh(t, delegate.callbackCh, "provider/GenerateIntermediateCSR")
waitForCh(t, delegate.callbackCh, "forwardDC/ConnectCA.SignIntermediate")
waitForCh(t, delegate.callbackCh, "provider/SetIntermediate")
@ -228,7 +228,7 @@ func TestCAManager_Initialize(t *testing.T) {
t.Fatal("never got result from errCh")
}
require.EqualValues(t, CAStateReady, manager.state)
require.EqualValues(t, caStateInitialized, manager.state)
}
func TestCAManager_UpdateConfigWhileRenewIntermediate(t *testing.T) {
@ -252,7 +252,7 @@ func TestCAManager_UpdateConfigWhileRenewIntermediate(t *testing.T) {
conf.PrimaryDatacenter = "dc1"
conf.Datacenter = "dc2"
delegate := NewMockCAServerDelegate(t, conf)
manager := NewCAManager(delegate, testutil.Logger(t), conf)
manager := NewCAManager(delegate, nil, testutil.Logger(t), conf)
initTestManager(t, manager, delegate)
// Wait half the TTL for the cert to need renewing.
@ -270,7 +270,7 @@ func TestCAManager_UpdateConfigWhileRenewIntermediate(t *testing.T) {
// Call UpdateConfiguration while RenewIntermediate is still in-flight to
// make sure we get an error about the state being occupied.
go func() {
require.EqualValues(t, CAStateRenewIntermediate, manager.state)
require.EqualValues(t, caStateRenewIntermediate, manager.state)
require.Error(t, errors.New("already in state"), manager.UpdateConfiguration(&structs.CARequest{}))
}()
@ -287,5 +287,5 @@ func TestCAManager_UpdateConfigWhileRenewIntermediate(t *testing.T) {
t.Fatal("never got result from errCh")
}
require.EqualValues(t, CAStateReady, manager.state)
require.EqualValues(t, caStateInitialized, manager.state)
}

View File

@ -1204,3 +1204,104 @@ func TestLeader_retryLoopBackoffHandleSuccess(t *testing.T) {
})
}
}
func TestLeader_Vault_BadCAConfigShouldntPreventLeaderEstablishment(t *testing.T) {
ca.SkipIfVaultNotPresent(t)
testVault := ca.NewTestVaultServer(t)
defer testVault.Stop()
_, s1 := testServerWithConfig(t, func(c *Config) {
c.Build = "1.9.1"
c.PrimaryDatacenter = "dc1"
c.CAConfig = &structs.CAConfiguration{
Provider: "vault",
Config: map[string]interface{}{
"Address": testVault.Addr,
"Token": "not-the-root",
"RootPKIPath": "pki-root/",
"IntermediatePKIPath": "pki-intermediate/",
},
}
})
defer s1.Shutdown()
waitForLeaderEstablishment(t, s1)
rootsList, activeRoot, err := getTestRoots(s1, "dc1")
require.NoError(t, err)
require.Empty(t, rootsList.Roots)
require.Nil(t, activeRoot)
// Now that the leader is up and we have verified that there are no roots / CA init failed,
// verify that we can reconfigure away from the bad configuration.
newConfig := &structs.CAConfiguration{
Provider: "vault",
Config: map[string]interface{}{
"Address": testVault.Addr,
"Token": testVault.RootToken,
"RootPKIPath": "pki-root/",
"IntermediatePKIPath": "pki-intermediate/",
},
}
{
args := &structs.CARequest{
Datacenter: "dc1",
Config: newConfig,
}
var reply interface{}
retry.Run(t, func(r *retry.R) {
require.NoError(r, s1.RPC("ConnectCA.ConfigurationSet", args, &reply))
})
}
rootsList, activeRoot, err = getTestRoots(s1, "dc1")
require.NoError(t, err)
require.NotEmpty(t, rootsList.Roots)
require.NotNil(t, activeRoot)
}
func TestLeader_Consul_BadCAConfigShouldntPreventLeaderEstablishment(t *testing.T) {
ca.SkipIfVaultNotPresent(t)
_, s1 := testServerWithConfig(t, func(c *Config) {
c.Build = "1.9.1"
c.PrimaryDatacenter = "dc1"
c.CAConfig = &structs.CAConfiguration{
Provider: "consul",
Config: map[string]interface{}{
"RootCert": "garbage",
},
}
})
defer s1.Shutdown()
waitForLeaderEstablishment(t, s1)
rootsList, activeRoot, err := getTestRoots(s1, "dc1")
require.NoError(t, err)
require.Empty(t, rootsList.Roots)
require.Nil(t, activeRoot)
newConfig := &structs.CAConfiguration{
Provider: "consul",
Config: map[string]interface{}{},
}
{
args := &structs.CARequest{
Datacenter: "dc1",
Config: newConfig,
}
var reply interface{}
retry.Run(t, func(r *retry.R) {
require.NoError(r, s1.RPC("ConnectCA.ConfigurationSet", args, &reply))
})
}
rootsList, activeRoot, err = getTestRoots(s1, "dc1")
require.NoError(t, err)
require.NotEmpty(t, rootsList.Roots)
require.NotNil(t, activeRoot)
}

View File

@ -108,6 +108,7 @@ const (
intentionMigrationRoutineName = "intention config entry migration"
secondaryCARootWatchRoutineName = "secondary CA roots watch"
intermediateCertRenewWatchRoutineName = "intermediate cert renew watch"
backgroundCAInitializationRoutineName = "CA initialization"
)
var (
@ -468,7 +469,7 @@ func NewServer(config *Config, flat Deps) (*Server, error) {
return nil, fmt.Errorf("Failed to start Raft: %v", err)
}
s.caManager = NewCAManager(&caDelegateWithState{s}, s.loggers.Named(logging.Connect), s.config)
s.caManager = NewCAManager(&caDelegateWithState{s}, s.leaderRoutineManager, s.loggers.Named(logging.Connect), s.config)
if s.config.ConnectEnabled && (s.config.AutoEncryptAllowTLS || s.config.AutoConfigAuthzEnabled) {
go s.connectCARootsMonitor(&lib.StopChannelContext{StopCh: s.shutdownCh})
}

View File

@ -134,9 +134,9 @@ func (s *Server) getCARoots(ws memdb.WatchSet, state *state.Store) (*structs.Ind
func (s *Server) SignCertificate(csr *x509.CertificateRequest, spiffeID connect.CertURI) (*structs.IssuedCert, error) {
provider, caRoot := s.caManager.getCAProvider()
if provider == nil {
return nil, fmt.Errorf("internal error: CA provider is nil")
return nil, fmt.Errorf("CA is uninitialized and unable to sign certificates yet: provider is nil")
} else if caRoot == nil {
return nil, fmt.Errorf("internal error: CA root is nil")
return nil, fmt.Errorf("CA is uninitialized and unable to sign certificates yet: no root certificate")
}
// Verify that the CSR entity is in the cluster's trust domain