mirror of https://github.com/status-im/consul.git
200 lines
7.2 KiB
Go
200 lines
7.2 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: BUSL-1.1
|
|
|
|
package autoconf
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"time"
|
|
|
|
"github.com/hashicorp/consul/agent/cache"
|
|
"github.com/hashicorp/consul/agent/structs"
|
|
)
|
|
|
|
// handleCacheEvent is used to handle event notifications from the cache for the roots
|
|
// or leaf cert watches.
|
|
func (ac *AutoConfig) handleCacheEvent(u cache.UpdateEvent) error {
|
|
switch u.CorrelationID {
|
|
case rootsWatchID:
|
|
ac.logger.Debug("roots watch fired - updating CA certificates")
|
|
if u.Err != nil {
|
|
return fmt.Errorf("root watch returned an error: %w", u.Err)
|
|
}
|
|
|
|
roots, ok := u.Result.(*structs.IndexedCARoots)
|
|
if !ok {
|
|
return fmt.Errorf("invalid type for roots watch response: %T", u.Result)
|
|
}
|
|
|
|
return ac.updateCARoots(roots)
|
|
case leafWatchID:
|
|
ac.logger.Debug("leaf certificate watch fired - updating TLS certificate")
|
|
if u.Err != nil {
|
|
return fmt.Errorf("leaf watch returned an error: %w", u.Err)
|
|
}
|
|
|
|
leaf, ok := u.Result.(*structs.IssuedCert)
|
|
if !ok {
|
|
return fmt.Errorf("invalid type for agent leaf cert watch response: %T", u.Result)
|
|
}
|
|
|
|
return ac.updateLeafCert(leaf)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// handleTokenUpdate is used when a notification about the agent token being updated
|
|
// is received and various watches need cancelling/restarting to use the new token.
|
|
func (ac *AutoConfig) handleTokenUpdate(ctx context.Context) error {
|
|
ac.logger.Debug("Agent token updated - resetting watches")
|
|
|
|
// TODO (autoencrypt) Prepopulate the cache with the new token with
|
|
// the existing cache entry with the old token. The certificate doesn't
|
|
// need to change just because the token has. However there isn't a
|
|
// good way to make that happen and this behavior is benign enough
|
|
// that I am going to push off implementing it.
|
|
|
|
// the agent token has been updated so we must update our leaf cert watch.
|
|
// this cancels the current watches before setting up new ones
|
|
ac.cancelWatches()
|
|
|
|
// recreate the chan for cache updates. This is a precautionary measure to ensure
|
|
// that we don't accidentally get notified for the new watches being setup before
|
|
// a blocking query in the cache returns and sends data to the old chan. In theory
|
|
// the code in agent/cache/watch.go should prevent this where we specifically check
|
|
// for context cancellation prior to sending the event. However we could cancel
|
|
// it after that check and finish setting up the new watches before getting the old
|
|
// events. Both the go routine scheduler and the OS thread scheduler would have to
|
|
// be acting up for this to happen. Regardless the way to ensure we don't get events
|
|
// for the old watches is to simply replace the chan we are expecting them from.
|
|
close(ac.cacheUpdates)
|
|
ac.cacheUpdates = make(chan cache.UpdateEvent, 10)
|
|
|
|
// restart watches - this will be done with the correct token
|
|
cancelWatches, err := ac.setupCertificateCacheWatches(ctx)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to restart watches after agent token update: %w", err)
|
|
}
|
|
ac.cancelWatches = cancelWatches
|
|
return nil
|
|
}
|
|
|
|
// handleFallback is used when the current TLS certificate has expired and the normal
|
|
// updating mechanisms have failed to renew it quickly enough. This function will
|
|
// use the configured fallback mechanism to retrieve a new cert and start monitoring
|
|
// that one.
|
|
func (ac *AutoConfig) handleFallback(ctx context.Context) error {
|
|
ac.logger.Warn("agent's client certificate has expired")
|
|
// Background because the context is mainly useful when the agent is first starting up.
|
|
switch {
|
|
case ac.config.AutoConfig.Enabled:
|
|
resp, err := ac.getInitialConfiguration(ctx)
|
|
if err != nil {
|
|
return fmt.Errorf("error while retrieving new agent certificates via auto-config: %w", err)
|
|
}
|
|
|
|
return ac.recordInitialConfiguration(resp)
|
|
case ac.config.AutoEncryptTLS:
|
|
reply, err := ac.autoEncryptInitialCerts(ctx)
|
|
if err != nil {
|
|
return fmt.Errorf("error while retrieving new agent certificate via auto-encrypt: %w", err)
|
|
}
|
|
return ac.setInitialTLSCertificates(reply)
|
|
default:
|
|
return fmt.Errorf("logic error: either auto-encrypt or auto-config must be enabled")
|
|
}
|
|
}
|
|
|
|
// run is the private method to be spawn by the Start method for
|
|
// executing the main monitoring loop.
|
|
func (ac *AutoConfig) run(ctx context.Context, exit chan struct{}) {
|
|
// The fallbackTimer is used to notify AFTER the agents
|
|
// leaf certificate has expired and where we need
|
|
// to fall back to the less secure RPC endpoint just like
|
|
// if the agent was starting up new.
|
|
//
|
|
// Check 10sec (fallback leeway duration) after cert
|
|
// expires. The agent cache should be handling the expiration
|
|
// and renew it before then.
|
|
//
|
|
// If there is no cert, use a value which immediately triggers the
|
|
// renew, but this case shouldn't happen because at
|
|
// this point, auto_encrypt was just being setup
|
|
// successfully.
|
|
calcFallbackInterval := func() time.Duration {
|
|
cert := ac.acConfig.TLSConfigurator.AutoEncryptCert()
|
|
if cert == nil {
|
|
return -1
|
|
}
|
|
expiry := cert.NotAfter.Add(ac.acConfig.FallbackLeeway)
|
|
return expiry.Sub(time.Now())
|
|
}
|
|
fallbackTimer := time.NewTimer(calcFallbackInterval())
|
|
|
|
// cleanup for once we are stopped
|
|
defer func() {
|
|
// cancel the go routines performing the cache watches
|
|
ac.cancelWatches()
|
|
// ensure we don't leak the timers go routine
|
|
fallbackTimer.Stop()
|
|
// stop receiving notifications for token updates
|
|
ac.acConfig.Tokens.StopNotify(ac.tokenUpdates)
|
|
|
|
ac.logger.Debug("auto-config has been stopped")
|
|
|
|
ac.Lock()
|
|
ac.cancel = nil
|
|
ac.running = false
|
|
// this should be the final cleanup task as its what notifies
|
|
// the rest of the world that this go routine has exited.
|
|
close(exit)
|
|
ac.Unlock()
|
|
}()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
ac.logger.Debug("stopping auto-config")
|
|
return
|
|
case <-ac.tokenUpdates.Ch:
|
|
ac.logger.Debug("handling a token update event")
|
|
|
|
if err := ac.handleTokenUpdate(ctx); err != nil {
|
|
ac.logger.Error("error in handling token update event", "error", err)
|
|
}
|
|
case u := <-ac.cacheUpdates:
|
|
ac.logger.Debug("handling a cache update event", "correlation_id", u.CorrelationID)
|
|
|
|
if err := ac.handleCacheEvent(u); err != nil {
|
|
ac.logger.Error("error in handling cache update event", "error", err)
|
|
}
|
|
|
|
// reset the fallback timer as the certificate may have been updated
|
|
fallbackTimer.Stop()
|
|
fallbackTimer = time.NewTimer(calcFallbackInterval())
|
|
case <-fallbackTimer.C:
|
|
// This is a safety net in case the cert doesn't get renewed
|
|
// in time. The agent would be stuck in that case because the watches
|
|
// never use the AutoEncrypt.Sign endpoint.
|
|
|
|
// check auto encrypt client cert expiration
|
|
cert := ac.acConfig.TLSConfigurator.AutoEncryptCert()
|
|
if cert == nil || cert.NotAfter.Before(time.Now()) {
|
|
if err := ac.handleFallback(ctx); err != nil {
|
|
ac.logger.Error("error when handling a certificate expiry event", "error", err)
|
|
fallbackTimer = time.NewTimer(ac.acConfig.FallbackRetry)
|
|
} else {
|
|
fallbackTimer = time.NewTimer(calcFallbackInterval())
|
|
}
|
|
} else {
|
|
// this shouldn't be possible. We calculate the timer duration to be the certificate
|
|
// expiration time + some leeway (10s default). So whenever we get here the certificate
|
|
// should be expired. Regardless its probably worth resetting the timer.
|
|
fallbackTimer = time.NewTimer(calcFallbackInterval())
|
|
}
|
|
}
|
|
}
|
|
}
|