mirror of
https://github.com/status-im/consul.git
synced 2025-01-15 00:04:47 +00:00
78b170ad50
* Refactors the leafcert package to not have a dependency on agent/consul and agent/cache to avoid import cycles. This way the xds controller can just import the leafcert package to use the leafcert manager. The leaf cert logic in the controller: * Sets up watches for leaf certs that are referenced in the ProxyStateTemplate (which generates the leaf certs too). * Gets the leaf cert from the leaf cert cache * Stores the leaf cert in the ProxyState that's pushed to xds * For the cert watches, this PR also uses a bimapper + a thin wrapper to map leaf cert events to related ProxyStateTemplates Since bimapper uses a resource.Reference or resource.ID to map between two resource types, I've created an internal type for a leaf certificate to use for the resource.Reference, since it's not a v2 resource. The wrapper allows mapping events to resources (as opposed to mapping resources to resources) The controller tests: Unit: Ensure that we resolve leaf cert references Lifecycle: Ensure that when the CA is updated, the leaf cert is as well Also adds a new spiffe id type, and adds workload identity and workload identity URI to leaf certs. This is so certs are generated with the new workload identity based SPIFFE id. * Pulls out some leaf cert test helpers into a helpers file so it can be used in the xds controller tests. * Wires up leaf cert manager dependency * Support getting token from proxytracker * Add workload identity spiffe id type to the authorize and sign functions --------- Co-authored-by: John Murret <john.murret@hashicorp.com>
1043 lines
31 KiB
Go
1043 lines
31 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: BUSL-1.1
|
|
|
|
package leafcert
|
|
|
|
import (
|
|
"context"
|
|
"crypto/tls"
|
|
"crypto/x509"
|
|
"encoding/pem"
|
|
"fmt"
|
|
"sync/atomic"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/stretchr/testify/require"
|
|
|
|
"github.com/hashicorp/consul/acl"
|
|
"github.com/hashicorp/consul/agent/cacheshim"
|
|
"github.com/hashicorp/consul/agent/connect"
|
|
"github.com/hashicorp/consul/agent/structs"
|
|
"github.com/hashicorp/consul/sdk/testutil"
|
|
"github.com/hashicorp/consul/sdk/testutil/retry"
|
|
)
|
|
|
|
// Test that after an initial signing, new CA roots (new ID) will
|
|
// trigger a blocking query to execute.
|
|
func TestManager_changingRoots(t *testing.T) {
|
|
if testing.Short() {
|
|
t.Skip("too slow for testing.Short")
|
|
}
|
|
|
|
t.Parallel()
|
|
|
|
m, signer := NewTestManager(t, nil)
|
|
|
|
caRoot := signer.UpdateCA(t, nil)
|
|
|
|
// We'll reuse the fetch options and request
|
|
req := &ConnectCALeafRequest{
|
|
Datacenter: "dc1", Service: "web",
|
|
MinQueryIndex: 0, MaxQueryTime: 10 * time.Second,
|
|
}
|
|
|
|
// First fetch should return immediately
|
|
getCh := testAsyncGet(t, m, req)
|
|
var idx uint64
|
|
select {
|
|
case <-time.After(100 * time.Millisecond):
|
|
t.Fatal("shouldn't block waiting for fetch")
|
|
case result := <-getCh:
|
|
require.NoError(t, result.Err)
|
|
require.NotNil(t, result.Value)
|
|
requireLeafValidUnderCA(t, result.Value, caRoot)
|
|
require.True(t, result.Index > 0)
|
|
|
|
idx = result.Index
|
|
}
|
|
|
|
// Second fetch should block with set index
|
|
req.MinQueryIndex = idx
|
|
getCh = testAsyncGet(t, m, req)
|
|
select {
|
|
case result := <-getCh:
|
|
t.Fatalf("should not return: %#v", result)
|
|
case <-time.After(100 * time.Millisecond):
|
|
}
|
|
|
|
// Let's send in new roots, which should trigger the sign req. We need to take
|
|
// care to set the new root as active
|
|
caRoot2 := signer.UpdateCA(t, nil)
|
|
select {
|
|
case <-time.After(100 * time.Millisecond):
|
|
t.Fatal("shouldn't block waiting for fetch")
|
|
case result := <-getCh:
|
|
require.NoError(t, result.Err)
|
|
require.NotNil(t, result.Value)
|
|
require.True(t, result.Index > idx)
|
|
requireLeafValidUnderCA(t, result.Value, caRoot2)
|
|
}
|
|
|
|
// Third fetch should block
|
|
getCh = testAsyncGet(t, m, req)
|
|
select {
|
|
case result := <-getCh:
|
|
t.Fatalf("should not return: %#v", result)
|
|
case <-time.After(100 * time.Millisecond):
|
|
}
|
|
}
|
|
|
|
// Tests that if the root change jitter is longer than the time left on the
|
|
// timeout, we return normally but then still renew the cert on a subsequent
|
|
// call.
|
|
func TestManager_changingRootsJitterBetweenCalls(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
const TestOverrideCAChangeInitialDelay = 100 * time.Millisecond
|
|
|
|
m, signer := NewTestManager(t, func(cfg *Config) {
|
|
// Override the root-change delay so we will timeout first. We can't set it to
|
|
// a crazy high value otherwise we'll have to wait that long in the test to
|
|
// see if it actually happens on subsequent calls. We instead reduce the
|
|
// timeout in FetchOptions to be much shorter than this.
|
|
cfg.TestOverrideCAChangeInitialDelay = TestOverrideCAChangeInitialDelay
|
|
})
|
|
|
|
caRoot := signer.UpdateCA(t, nil)
|
|
|
|
// We'll reuse the fetch options and request. Timeout must be much shorter
|
|
// than the initial root delay. 20ms means that if we deliver the root change
|
|
// during the first blocking call, we should need to block fully for 5 more
|
|
// calls before the cert is renewed. We pick a timeout that is not an exact
|
|
// multiple of the 100ms delay above to reduce the chance that timing works
|
|
// out in a way that makes it hard to tell a timeout from an early return due
|
|
// to a cert renewal.
|
|
req := &ConnectCALeafRequest{
|
|
Datacenter: "dc1", Service: "web",
|
|
MinQueryIndex: 0, MaxQueryTime: 35 * time.Millisecond,
|
|
}
|
|
|
|
// First fetch should return immediately
|
|
getCh := testAsyncGet(t, m, req)
|
|
var (
|
|
idx uint64
|
|
issued *structs.IssuedCert
|
|
)
|
|
select {
|
|
case <-time.After(100 * time.Millisecond):
|
|
t.Fatal("shouldn't block waiting for fetch")
|
|
case result := <-getCh:
|
|
require.NoError(t, result.Err)
|
|
require.NotNil(t, result.Value)
|
|
require.True(t, result.Index > 0)
|
|
requireLeafValidUnderCA(t, result.Value, caRoot)
|
|
idx = result.Index
|
|
issued = result.Value
|
|
}
|
|
|
|
// Let's send in new roots, which should eventually trigger the sign req. We
|
|
// need to take care to set the new root as active. Note that this is
|
|
// implicitly testing that root updates that happen in between leaf blocking
|
|
// queries are still noticed too. At this point no leaf blocking query is
|
|
// running so the root watch should be stopped. By pushing this update, the
|
|
// next blocking query will _immediately_ see the new root which means it
|
|
// needs to correctly notice that it is not the same one that generated the
|
|
// current cert and start the rotation. This is good, just not obvious that
|
|
// the behavior is actually well tested here when it is.
|
|
caRoot2 := signer.UpdateCA(t, nil)
|
|
earliestRootDelivery := time.Now()
|
|
|
|
// Some number of fetches (2,3,4 likely) should timeout after 20ms and after
|
|
// 100ms has elapsed total we should see the new cert. Since this is all very
|
|
// timing dependent, we don't hard code exact numbers here and instead loop
|
|
// for plenty of time and do as many calls as it takes and just assert on the
|
|
// time taken and that the call either blocks and returns the cached cert, or
|
|
// returns the new one.
|
|
req.MinQueryIndex = idx
|
|
var shouldExpireAfter time.Time
|
|
i := 1
|
|
rootsDelivered := false
|
|
for rootsDelivered {
|
|
start := time.Now()
|
|
getCh = testAsyncGet(t, m, req)
|
|
select {
|
|
case result := <-getCh:
|
|
require.NoError(t, result.Err)
|
|
timeTaken := time.Since(start)
|
|
|
|
// There are two options, either it blocked waiting for the delay after
|
|
// the rotation or it returned the new CA cert before the timeout was
|
|
// done. TO be more robust against timing, we take the value as the
|
|
// decider for which case it is, and assert timing matches our expected
|
|
// bounds rather than vice versa.
|
|
|
|
if result.Index > idx {
|
|
// Got a new cert
|
|
require.NotEqual(t, issued, result.Value)
|
|
require.NotNil(t, result.Value)
|
|
requireLeafValidUnderCA(t, result.Value, caRoot2)
|
|
// Should not have been delivered before the delay
|
|
require.True(t, time.Since(earliestRootDelivery) > TestOverrideCAChangeInitialDelay)
|
|
// All good. We are done!
|
|
rootsDelivered = true
|
|
} else {
|
|
// Should be the cached cert
|
|
require.Equal(t, issued, result.Value)
|
|
require.Equal(t, idx, result.Index)
|
|
requireLeafValidUnderCA(t, result.Value, caRoot)
|
|
// Sanity check we blocked for the whole timeout
|
|
require.Truef(t, timeTaken > req.MaxQueryTime,
|
|
"should block for at least %s, returned after %s",
|
|
req.MaxQueryTime, timeTaken)
|
|
// Sanity check that the forceExpireAfter state was set correctly
|
|
shouldExpireAfter := testObserveLeafCert(m, req, func(cd *certData) time.Time {
|
|
return cd.state.forceExpireAfter
|
|
})
|
|
require.True(t, shouldExpireAfter.After(time.Now()))
|
|
require.True(t, shouldExpireAfter.Before(time.Now().Add(TestOverrideCAChangeInitialDelay)))
|
|
}
|
|
case <-time.After(50 * time.Millisecond):
|
|
t.Fatalf("request %d blocked too long", i)
|
|
}
|
|
i++
|
|
|
|
// Sanity check that we've not gone way beyond the deadline without a
|
|
// new cert. We give some leeway to make it less brittle.
|
|
require.Falsef(t, time.Now().After(shouldExpireAfter.Add(100*time.Millisecond)),
|
|
"waited extra 100ms and delayed CA rotate renew didn't happen")
|
|
}
|
|
}
|
|
|
|
func testObserveLeafCert[T any](m *Manager, req *ConnectCALeafRequest, cb func(*certData) T) T {
|
|
key := req.Key()
|
|
|
|
cd := m.getCertData(key)
|
|
|
|
cd.lock.Lock()
|
|
defer cd.lock.Unlock()
|
|
|
|
return cb(cd)
|
|
}
|
|
|
|
// Tests that if the root changes in between blocking calls we still pick it up.
|
|
func TestManager_changingRootsBetweenBlockingCalls(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
m, signer := NewTestManager(t, nil)
|
|
|
|
caRoot := signer.UpdateCA(t, nil)
|
|
|
|
// We'll reuse the fetch options and request. Short timeout important since we
|
|
// wait the full timeout before chaning roots.
|
|
req := &ConnectCALeafRequest{
|
|
Datacenter: "dc1", Service: "web",
|
|
MinQueryIndex: 0, MaxQueryTime: 35 * time.Millisecond,
|
|
}
|
|
|
|
// First fetch should return immediately
|
|
getCh := testAsyncGet(t, m, req)
|
|
var (
|
|
idx uint64
|
|
issued *structs.IssuedCert
|
|
)
|
|
select {
|
|
case <-time.After(100 * time.Millisecond):
|
|
t.Fatal("shouldn't block waiting for fetch")
|
|
case result := <-getCh:
|
|
require.NoError(t, result.Err)
|
|
require.NotNil(t, result.Value)
|
|
requireLeafValidUnderCA(t, result.Value, caRoot)
|
|
require.True(t, result.Index > 0)
|
|
idx = result.Index
|
|
issued = result.Value
|
|
}
|
|
|
|
// Next fetch should block for the full timeout
|
|
start := time.Now()
|
|
getCh = testAsyncGet(t, m, req)
|
|
select {
|
|
case <-time.After(100 * time.Millisecond):
|
|
t.Fatal("shouldn't block for too long waiting for fetch")
|
|
case result := <-getCh:
|
|
require.NoError(t, result.Err)
|
|
require.Equal(t, issued, result.Value)
|
|
// Still the initial cached result
|
|
require.Equal(t, idx, result.Index)
|
|
// Sanity check that it waited
|
|
require.True(t, time.Since(start) > req.MaxQueryTime)
|
|
}
|
|
|
|
// No active requests, simulate root change now
|
|
caRoot2 := signer.UpdateCA(t, nil)
|
|
earliestRootDelivery := time.Now()
|
|
|
|
// We should get the new cert immediately on next fetch (since test override
|
|
// root change jitter to be 1 nanosecond so no delay expected).
|
|
getCh = testAsyncGet(t, m, req)
|
|
select {
|
|
case <-time.After(100 * time.Millisecond):
|
|
t.Fatal("shouldn't block too long waiting for fetch")
|
|
case result := <-getCh:
|
|
require.NoError(t, result.Err)
|
|
require.NotEqual(t, issued, result.Value)
|
|
requireLeafValidUnderCA(t, result.Value, caRoot2)
|
|
require.True(t, result.Index > idx)
|
|
// Sanity check that we didn't wait too long
|
|
require.True(t, time.Since(earliestRootDelivery) < req.MaxQueryTime)
|
|
}
|
|
}
|
|
|
|
func TestManager_CSRRateLimiting(t *testing.T) {
|
|
if testing.Short() {
|
|
t.Skip("too slow for testing.Short")
|
|
}
|
|
|
|
t.Parallel()
|
|
|
|
m, signer := NewTestManager(t, func(cfg *Config) {
|
|
// Each jitter window will be only 100 ms long to make testing quick but
|
|
// highly likely not to fail based on scheduling issues.
|
|
cfg.TestOverrideCAChangeInitialDelay = 100 * time.Millisecond
|
|
})
|
|
|
|
signer.UpdateCA(t, nil)
|
|
|
|
signer.SetSignCallErrors(
|
|
// First call return rate limit error. This is important as it checks
|
|
// behavior when cache is empty and we have to return a nil Value but need to
|
|
// save state to do the right thing for retry.
|
|
structs.ErrRateLimited, // inc
|
|
// Then succeed on second call
|
|
nil,
|
|
// Then be rate limited again on several further calls
|
|
structs.ErrRateLimited, // inc
|
|
structs.ErrRateLimited, // inc
|
|
// Then fine after that
|
|
)
|
|
|
|
req := &ConnectCALeafRequest{
|
|
Datacenter: "dc1",
|
|
Service: "web",
|
|
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
|
|
}
|
|
|
|
// First fetch should return rate limit error directly - client is expected to
|
|
// backoff itself.
|
|
getCh := testAsyncGet(t, m, req)
|
|
select {
|
|
case <-time.After(200 * time.Millisecond):
|
|
t.Fatal("shouldn't block longer than one jitter window for success")
|
|
case result := <-getCh:
|
|
require.Error(t, result.Err)
|
|
require.Equal(t, structs.ErrRateLimited.Error(), result.Err.Error())
|
|
}
|
|
|
|
// Second call should return correct cert immediately.
|
|
getCh = testAsyncGet(t, m, req)
|
|
var (
|
|
idx uint64
|
|
issued *structs.IssuedCert
|
|
)
|
|
select {
|
|
case <-time.After(100 * time.Millisecond):
|
|
t.Fatal("shouldn't block waiting for fetch")
|
|
case result := <-getCh:
|
|
require.NoError(t, result.Err)
|
|
require.NotNil(t, result.Value)
|
|
require.True(t, result.Index > 0)
|
|
idx = result.Index
|
|
issued = result.Value
|
|
}
|
|
|
|
// Send in new roots, which should trigger the next sign req. We need to take
|
|
// care to set the new root as active
|
|
signer.UpdateCA(t, nil)
|
|
earliestRootDelivery := time.Now()
|
|
|
|
// Sanity check state
|
|
require.Equal(t, uint64(1), signer.GetSignCallErrorCount())
|
|
|
|
// After root rotation jitter has been waited out, a new CSR will
|
|
// be attempted but will fail and return the previous cached result with no
|
|
// error since we will try again soon.
|
|
getCh = testAsyncGet(t, m, req)
|
|
select {
|
|
case <-time.After(200 * time.Millisecond):
|
|
t.Fatal("shouldn't block too long waiting for fetch")
|
|
case result := <-getCh:
|
|
// We should block for _at least_ one jitter period since we set that to
|
|
// 100ms and in test override mode we always pick the max jitter not a
|
|
// random amount.
|
|
require.True(t, time.Since(earliestRootDelivery) > 100*time.Millisecond)
|
|
require.Equal(t, uint64(2), signer.GetSignCallErrorCount())
|
|
|
|
require.NoError(t, result.Err)
|
|
require.Equal(t, issued, result.Value)
|
|
// 1 since this should still be the original cached result as we failed to
|
|
// get a new cert.
|
|
require.Equal(t, idx, result.Index)
|
|
}
|
|
|
|
// Root rotation state is now only captured in the opts.LastResult.State so a
|
|
// subsequent call should also wait for 100ms and then attempt to generate a
|
|
// new cert since we failed last time.
|
|
getCh = testAsyncGet(t, m, req)
|
|
select {
|
|
case <-time.After(200 * time.Millisecond):
|
|
t.Fatal("shouldn't block too long waiting for fetch")
|
|
case result := <-getCh:
|
|
// We should block for _at least_ two jitter periods now.
|
|
require.True(t, time.Since(earliestRootDelivery) > 200*time.Millisecond)
|
|
require.Equal(t, uint64(3), signer.GetSignCallErrorCount())
|
|
|
|
require.NoError(t, result.Err)
|
|
require.Equal(t, issued, result.Value)
|
|
// 1 since this should still be the original cached result as we failed to
|
|
// get a new cert.
|
|
require.Equal(t, idx, result.Index)
|
|
}
|
|
|
|
// Now we've had two rate limit failures and seen root rotation state work
|
|
// across both the blocking request that observed the rotation and the
|
|
// subsequent one. The next request should wait out the rest of the backoff
|
|
// and then actually fetch a new cert at last!
|
|
getCh = testAsyncGet(t, m, req)
|
|
select {
|
|
case <-time.After(200 * time.Millisecond):
|
|
t.Fatal("shouldn't block too long waiting for fetch")
|
|
case result := <-getCh:
|
|
// We should block for _at least_ three jitter periods now.
|
|
require.True(t, time.Since(earliestRootDelivery) > 300*time.Millisecond)
|
|
require.Equal(t, uint64(3), signer.GetSignCallErrorCount())
|
|
|
|
require.NoError(t, result.Err)
|
|
require.NotEqual(t, issued, result.Value)
|
|
// 3 since the rootCA change used 2
|
|
require.True(t, result.Index > idx)
|
|
}
|
|
}
|
|
|
|
// This test runs multiple concurrent callers watching different leaf certs and
|
|
// tries to ensure that the background root watch activity behaves correctly.
|
|
func TestManager_watchRootsDedupingMultipleCallers(t *testing.T) {
|
|
if testing.Short() {
|
|
t.Skip("too slow for testing.Short")
|
|
}
|
|
|
|
t.Parallel()
|
|
|
|
m, signer := NewTestManager(t, nil)
|
|
|
|
caRoot := signer.UpdateCA(t, nil)
|
|
|
|
// n is the number of clients we'll run
|
|
n := 3
|
|
|
|
// setup/testDoneCh are used for coordinating clients such that each has
|
|
// initial cert delivered and is blocking before the root changes. It's not a
|
|
// wait group since we want to be able to timeout the main test goroutine if
|
|
// one of the clients gets stuck. Instead it's a buffered chan.
|
|
setupDoneCh := make(chan error, n)
|
|
testDoneCh := make(chan error, n)
|
|
// rootsUpdate is used to coordinate clients so they know when they should
|
|
// expect to see leaf renewed after root change.
|
|
rootsUpdatedCh := make(chan struct{})
|
|
|
|
// Create a function that models a single client. It should go through the
|
|
// steps of getting an initial cert and then watching for changes until root
|
|
// updates.
|
|
client := func(i int) {
|
|
// We'll reuse the fetch options and request
|
|
req := &ConnectCALeafRequest{
|
|
Datacenter: "dc1", Service: fmt.Sprintf("web-%d", i),
|
|
MinQueryIndex: 0, MaxQueryTime: 10 * time.Second,
|
|
}
|
|
|
|
// First fetch should return immediately
|
|
getCh := testAsyncGet(t, m, req)
|
|
var idx uint64
|
|
select {
|
|
case <-time.After(100 * time.Millisecond):
|
|
setupDoneCh <- fmt.Errorf("shouldn't block waiting for fetch")
|
|
return
|
|
case result := <-getCh:
|
|
require.NoError(t, result.Err)
|
|
idx = result.Index
|
|
}
|
|
|
|
// Second fetch should block with set index
|
|
req.MinQueryIndex = idx
|
|
getCh = testAsyncGet(t, m, req)
|
|
select {
|
|
case result := <-getCh:
|
|
setupDoneCh <- fmt.Errorf("should not return: %#v", result)
|
|
return
|
|
case <-time.After(100 * time.Millisecond):
|
|
}
|
|
|
|
// We're done with setup and the blocking call is still blocking in
|
|
// background.
|
|
setupDoneCh <- nil
|
|
|
|
// Wait until all others are also done and roots change incase there are
|
|
// stragglers delaying the root update.
|
|
select {
|
|
case <-rootsUpdatedCh:
|
|
case <-time.After(200 * time.Millisecond):
|
|
testDoneCh <- fmt.Errorf("waited too long for root update")
|
|
return
|
|
}
|
|
|
|
// Now we should see root update within a short period
|
|
select {
|
|
case <-time.After(100 * time.Millisecond):
|
|
testDoneCh <- fmt.Errorf("shouldn't block waiting for fetch")
|
|
return
|
|
case result := <-getCh:
|
|
require.NoError(t, result.Err)
|
|
if req.MinQueryIndex == result.Value.CreateIndex {
|
|
testDoneCh <- fmt.Errorf("index must be different")
|
|
return
|
|
}
|
|
}
|
|
|
|
testDoneCh <- nil
|
|
}
|
|
|
|
// Sanity check the roots watcher is not running yet
|
|
assertRootsWatchCounts(t, m, 0, 0)
|
|
|
|
for i := 0; i < n; i++ {
|
|
go client(i)
|
|
}
|
|
|
|
timeoutCh := time.After(200 * time.Millisecond)
|
|
|
|
for i := 0; i < n; i++ {
|
|
select {
|
|
case <-timeoutCh:
|
|
t.Fatal("timed out waiting for clients")
|
|
case err := <-setupDoneCh:
|
|
if err != nil {
|
|
t.Fatalf(err.Error())
|
|
}
|
|
}
|
|
}
|
|
|
|
// Should be 3 clients running now, so the roots watcher should have started
|
|
// once and not stopped.
|
|
assertRootsWatchCounts(t, m, 1, 0)
|
|
|
|
caRootCopy := caRoot.Clone()
|
|
caRootCopy.Active = false
|
|
|
|
// Now we deliver the root update
|
|
_ = signer.UpdateCA(t, nil)
|
|
// And notify clients
|
|
close(rootsUpdatedCh)
|
|
|
|
timeoutCh = time.After(200 * time.Millisecond)
|
|
for i := 0; i < n; i++ {
|
|
select {
|
|
case <-timeoutCh:
|
|
t.Fatalf("timed out waiting for %d of %d clients to renew after root change", n-i, n)
|
|
case err := <-testDoneCh:
|
|
if err != nil {
|
|
t.Fatalf(err.Error())
|
|
}
|
|
}
|
|
}
|
|
|
|
// All active requests have returned the new cert so the rootsWatcher should
|
|
// have stopped. This is timing dependent though so retry a few times
|
|
retry.RunWith(retry.ThreeTimes(), t, func(r *retry.R) {
|
|
assertRootsWatchCounts(r, m, 1, 1)
|
|
})
|
|
}
|
|
|
|
func assertRootsWatchCounts(t require.TestingT, m *Manager, wantStarts, wantStops int) {
|
|
if tt, ok := t.(*testing.T); ok {
|
|
tt.Helper()
|
|
}
|
|
starts := atomic.LoadUint32(&m.rootWatcher.testStartCount)
|
|
stops := atomic.LoadUint32(&m.rootWatcher.testStopCount)
|
|
require.Equal(t, wantStarts, int(starts))
|
|
require.Equal(t, wantStops, int(stops))
|
|
}
|
|
|
|
// Test that after an initial signing, an expiringLeaf will trigger a
|
|
// blocking query to resign.
|
|
func TestManager_expiringLeaf(t *testing.T) {
|
|
if testing.Short() {
|
|
t.Skip("too slow for testing.Short")
|
|
}
|
|
|
|
t.Parallel()
|
|
|
|
m, signer := NewTestManager(t, nil)
|
|
|
|
caRoot := signer.UpdateCA(t, nil)
|
|
|
|
signer.SetSignCallErrors(
|
|
// First call returns expired cert to prime cache with an expired one.
|
|
ReplyWithExpiredCert,
|
|
)
|
|
|
|
// We'll reuse the fetch options and request
|
|
req := &ConnectCALeafRequest{
|
|
Datacenter: "dc1", Service: "web",
|
|
MinQueryIndex: 0, MaxQueryTime: 10 * time.Second,
|
|
}
|
|
|
|
// First fetch should return immediately
|
|
getCh := testAsyncGet(t, m, req)
|
|
var (
|
|
idx uint64
|
|
issued *structs.IssuedCert
|
|
)
|
|
select {
|
|
case <-time.After(100 * time.Millisecond):
|
|
t.Fatal("shouldn't block waiting for fetch")
|
|
case result := <-getCh:
|
|
require.NoError(t, result.Err)
|
|
require.NotNil(t, result.Value)
|
|
require.True(t, result.Index > 0)
|
|
idx = result.Index
|
|
issued = result.Value
|
|
}
|
|
|
|
// Second fetch should return immediately despite there being
|
|
// no updated CA roots, because we issued an expired cert.
|
|
getCh = testAsyncGet(t, m, req)
|
|
select {
|
|
case <-time.After(100 * time.Millisecond):
|
|
t.Fatal("shouldn't block waiting for fetch")
|
|
case result := <-getCh:
|
|
require.NoError(t, result.Err)
|
|
require.NotEqual(t, issued, result.Value)
|
|
require.True(t, result.Index > idx)
|
|
requireLeafValidUnderCA(t, result.Value, caRoot)
|
|
idx = result.Index
|
|
}
|
|
|
|
// Third fetch should block since the cert is not expiring and
|
|
// we also didn't update CA certs.
|
|
req.MinQueryIndex = idx
|
|
getCh = testAsyncGet(t, m, req)
|
|
select {
|
|
case result := <-getCh:
|
|
t.Fatalf("should not return: %#v", result)
|
|
case <-time.After(100 * time.Millisecond):
|
|
}
|
|
}
|
|
|
|
func TestManager_DNSSANForService(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
m, signer := NewTestManager(t, nil)
|
|
|
|
_ = signer.UpdateCA(t, nil)
|
|
|
|
req := &ConnectCALeafRequest{
|
|
Datacenter: "dc1",
|
|
Service: "web",
|
|
DNSSAN: []string{"test.example.com"},
|
|
}
|
|
|
|
_, _, err := m.Get(context.Background(), req)
|
|
require.NoError(t, err)
|
|
|
|
caReq := signer.GetCapture(0)
|
|
require.NotNil(t, caReq)
|
|
|
|
pemBlock, _ := pem.Decode([]byte(caReq.CSR))
|
|
csr, err := x509.ParseCertificateRequest(pemBlock.Bytes)
|
|
require.NoError(t, err)
|
|
require.Equal(t, csr.DNSNames, []string{"test.example.com"})
|
|
}
|
|
|
|
func TestManager_workflow_good(t *testing.T) {
|
|
if testing.Short() {
|
|
t.Skip("too slow for testing.Short")
|
|
}
|
|
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
t.Cleanup(cancel)
|
|
|
|
const TestOverrideCAChangeInitialDelay = 1 * time.Nanosecond
|
|
|
|
m, signer := NewTestManager(t, func(cfg *Config) {
|
|
cfg.TestOverrideCAChangeInitialDelay = TestOverrideCAChangeInitialDelay
|
|
})
|
|
|
|
ca1 := signer.UpdateCA(t, nil)
|
|
|
|
req := &ConnectCALeafRequest{
|
|
Datacenter: "dc1",
|
|
Service: "test",
|
|
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
|
|
}
|
|
|
|
// List
|
|
issued, meta, err := m.Get(ctx, req)
|
|
require.NoError(t, err)
|
|
require.False(t, meta.Hit)
|
|
require.NotNil(t, issued)
|
|
|
|
// Verify that the cert is signed by the CA
|
|
requireLeafValidUnderCA(t, issued, ca1)
|
|
|
|
// Verify blocking index
|
|
require.True(t, issued.ModifyIndex > 0)
|
|
require.Equal(t, issued.ModifyIndex, meta.Index)
|
|
|
|
index := meta.Index
|
|
|
|
// Fetch it again
|
|
testutil.RunStep(t, "test you get a cache hit on another read", func(t *testing.T) {
|
|
req := &ConnectCALeafRequest{
|
|
Datacenter: "dc1",
|
|
Service: "test",
|
|
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
|
|
}
|
|
issued2, _, err := m.Get(ctx, req)
|
|
require.NoError(t, err)
|
|
require.NotNil(t, issued2)
|
|
require.Equal(t, issued, issued2)
|
|
})
|
|
|
|
type reply struct {
|
|
cert *structs.IssuedCert
|
|
meta cacheshim.ResultMeta
|
|
err error
|
|
}
|
|
|
|
replyCh := make(chan *reply, 1)
|
|
go func() {
|
|
req := &ConnectCALeafRequest{
|
|
Datacenter: "dc1",
|
|
Service: "test",
|
|
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
|
|
MinQueryIndex: index,
|
|
}
|
|
|
|
issued2, meta2, err := m.Get(ctx, req)
|
|
|
|
replyCh <- &reply{issued2, meta2, err}
|
|
}()
|
|
|
|
// Set a new CA
|
|
ca2 := signer.UpdateCA(t, nil)
|
|
|
|
// Issue a blocking query to ensure that the cert gets updated appropriately
|
|
testutil.RunStep(t, "test blocking queries update leaf cert", func(t *testing.T) {
|
|
var got *reply
|
|
select {
|
|
case got = <-replyCh:
|
|
case <-time.After(500 * time.Millisecond):
|
|
t.Fatal("blocking query did not wake up during rotation")
|
|
}
|
|
|
|
issued2, meta2, err := got.cert, got.meta, got.err
|
|
require.NoError(t, err)
|
|
require.NotNil(t, issued2)
|
|
|
|
require.NotEqual(t, issued.CertPEM, issued2.CertPEM)
|
|
require.NotEqual(t, issued.PrivateKeyPEM, issued2.PrivateKeyPEM)
|
|
|
|
// Verify that the cert is signed by the new CA
|
|
requireLeafValidUnderCA(t, issued2, ca2)
|
|
|
|
// Should not be a cache hit! The data was updated in response to the blocking
|
|
// query being made.
|
|
require.False(t, meta2.Hit)
|
|
})
|
|
|
|
testutil.RunStep(t, "test non-blocking queries update leaf cert", func(t *testing.T) {
|
|
req := &ConnectCALeafRequest{
|
|
Datacenter: "dc1",
|
|
Service: "test",
|
|
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
|
|
}
|
|
|
|
issued, _, err := m.Get(ctx, req)
|
|
require.NoError(t, err)
|
|
require.NotNil(t, issued)
|
|
|
|
// Verify that the cert is signed by the CA
|
|
requireLeafValidUnderCA(t, issued, ca2)
|
|
|
|
// Issue a non blocking query to ensure that the cert gets updated appropriately
|
|
{
|
|
// Set a new CA
|
|
ca3 := signer.UpdateCA(t, nil)
|
|
|
|
retry.Run(t, func(r *retry.R) {
|
|
req := &ConnectCALeafRequest{
|
|
Datacenter: "dc1",
|
|
Service: "test",
|
|
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
|
|
}
|
|
|
|
issued2, meta2, err := m.Get(ctx, req)
|
|
require.NoError(r, err)
|
|
require.NotNil(r, issued2)
|
|
|
|
requireLeafValidUnderCA(r, issued2, ca3)
|
|
|
|
// Should not be a cache hit!
|
|
require.False(r, meta2.Hit)
|
|
|
|
require.NotEqual(r, issued.CertPEM, issued2.CertPEM)
|
|
require.NotEqual(r, issued.PrivateKeyPEM, issued2.PrivateKeyPEM)
|
|
|
|
// Verify that the cert is signed by the new CA
|
|
requireLeafValidUnderCA(r, issued2, ca3)
|
|
})
|
|
}
|
|
})
|
|
}
|
|
|
|
// Test we can request a leaf cert for a service and witness correct caching,
|
|
// blocking, and update semantics.
|
|
//
|
|
// This test originally was a client agent test in
|
|
// agent.TestAgentConnectCALeafCert_goodNotLocal and was cloned here to
|
|
// increase complex coverage, but the specific naming of the parent test is
|
|
// irrelevant here since there's no notion of the catalog at all at this layer.
|
|
func TestManager_workflow_goodNotLocal(t *testing.T) {
|
|
if testing.Short() {
|
|
t.Skip("too slow for testing.Short")
|
|
}
|
|
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
t.Cleanup(cancel)
|
|
|
|
const TestOverrideCAChangeInitialDelay = 1 * time.Nanosecond
|
|
|
|
m, signer := NewTestManager(t, func(cfg *Config) {
|
|
cfg.TestOverrideCAChangeInitialDelay = TestOverrideCAChangeInitialDelay
|
|
})
|
|
|
|
ca1 := signer.UpdateCA(t, nil)
|
|
|
|
req := &ConnectCALeafRequest{
|
|
Datacenter: "dc1",
|
|
Service: "test",
|
|
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
|
|
}
|
|
|
|
// List
|
|
issued, meta, err := m.Get(ctx, req)
|
|
require.NoError(t, err)
|
|
require.False(t, meta.Hit)
|
|
require.NotNil(t, issued)
|
|
|
|
// Verify that the cert is signed by the CA
|
|
requireLeafValidUnderCA(t, issued, ca1)
|
|
|
|
// Verify blocking index
|
|
require.True(t, issued.ModifyIndex > 0)
|
|
require.Equal(t, issued.ModifyIndex, meta.Index)
|
|
|
|
// Fetch it again
|
|
testutil.RunStep(t, "test you get a cache hit on another read", func(t *testing.T) {
|
|
req := &ConnectCALeafRequest{
|
|
Datacenter: "dc1",
|
|
Service: "test",
|
|
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
|
|
}
|
|
issued2, _, err := m.Get(ctx, req)
|
|
require.NoError(t, err)
|
|
require.NotNil(t, issued2)
|
|
require.Equal(t, issued, issued2)
|
|
})
|
|
|
|
// Test Blocking - see https://github.com/hashicorp/consul/issues/4462
|
|
testutil.RunStep(t, "test blocking issue 4462", func(t *testing.T) {
|
|
// Fetch it again
|
|
req := &ConnectCALeafRequest{
|
|
Datacenter: "dc1",
|
|
Service: "test",
|
|
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
|
|
MinQueryIndex: issued.ModifyIndex,
|
|
MaxQueryTime: 125 * time.Millisecond,
|
|
}
|
|
var (
|
|
respCh = make(chan *structs.IssuedCert)
|
|
errCh = make(chan error, 1)
|
|
)
|
|
go func() {
|
|
issued2, _, err := m.Get(ctx, req)
|
|
if err != nil {
|
|
errCh <- err
|
|
} else {
|
|
respCh <- issued2
|
|
}
|
|
}()
|
|
|
|
select {
|
|
case <-time.After(500 * time.Millisecond):
|
|
require.FailNow(t, "Shouldn't block for this long - not respecting wait parameter in the query")
|
|
|
|
case err := <-errCh:
|
|
require.NoError(t, err)
|
|
case <-respCh:
|
|
}
|
|
})
|
|
|
|
testutil.RunStep(t, "test that caching is updated in the background", func(t *testing.T) {
|
|
// Set a new CA
|
|
ca := signer.UpdateCA(t, nil)
|
|
|
|
retry.Run(t, func(r *retry.R) {
|
|
// Try and sign again (note no index/wait arg since cache should update in
|
|
// background even if we aren't actively blocking)
|
|
req := &ConnectCALeafRequest{
|
|
Datacenter: "dc1",
|
|
Service: "test",
|
|
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
|
|
}
|
|
|
|
issued2, _, err := m.Get(ctx, req)
|
|
require.NoError(r, err)
|
|
|
|
if issued.CertPEM == issued2.CertPEM {
|
|
r.Fatalf("leaf has not updated")
|
|
}
|
|
|
|
// Got a new leaf. Sanity check it's a whole new key as well as different
|
|
// cert.
|
|
if issued.PrivateKeyPEM == issued2.PrivateKeyPEM {
|
|
r.Fatalf("new leaf has same private key as before")
|
|
}
|
|
|
|
// Verify that the cert is signed by the new CA
|
|
requireLeafValidUnderCA(r, issued2, ca)
|
|
|
|
require.NotEqual(r, issued, issued2)
|
|
})
|
|
})
|
|
}
|
|
|
|
func TestManager_workflow_nonBlockingQuery_after_blockingQuery_shouldNotBlock(t *testing.T) {
|
|
// see: https://github.com/hashicorp/consul/issues/12048
|
|
|
|
if testing.Short() {
|
|
t.Skip("too slow for testing.Short")
|
|
}
|
|
|
|
t.Parallel()
|
|
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
t.Cleanup(cancel)
|
|
|
|
m, signer := NewTestManager(t, nil)
|
|
|
|
_ = signer.UpdateCA(t, nil)
|
|
|
|
var (
|
|
serialNumber string
|
|
index uint64
|
|
issued *structs.IssuedCert
|
|
)
|
|
testutil.RunStep(t, "do initial non-blocking query", func(t *testing.T) {
|
|
req := &ConnectCALeafRequest{
|
|
Datacenter: "dc1",
|
|
Service: "test",
|
|
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
|
|
}
|
|
issued1, meta, err := m.Get(ctx, req)
|
|
require.NoError(t, err)
|
|
|
|
serialNumber = issued1.SerialNumber
|
|
|
|
require.False(t, meta.Hit, "for the leaf cert cache type these are always MISS")
|
|
index = meta.Index
|
|
issued = issued1
|
|
})
|
|
|
|
go func() {
|
|
// launch goroutine for blocking query
|
|
req := &ConnectCALeafRequest{
|
|
Datacenter: "dc1",
|
|
Service: "test",
|
|
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
|
|
MinQueryIndex: index,
|
|
}
|
|
_, _, _ = m.Get(ctx, req)
|
|
}()
|
|
|
|
// We just need to ensure that the above blocking query is in-flight before
|
|
// the next step, so do a little sleep.
|
|
time.Sleep(50 * time.Millisecond)
|
|
|
|
// The initial non-blocking query populated the leaf cert cache entry
|
|
// implicitly. The agent cache doesn't prune entries very often at all, so
|
|
// in between both of these steps the data should still be there, causing
|
|
// this to be a HIT that completes in less than 10m (the default inner leaf
|
|
// cert blocking query timeout).
|
|
testutil.RunStep(t, "do a non-blocking query that should not block", func(t *testing.T) {
|
|
req := &ConnectCALeafRequest{
|
|
Datacenter: "dc1",
|
|
Service: "test",
|
|
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
|
|
}
|
|
issued2, meta2, err := m.Get(ctx, req)
|
|
require.NoError(t, err)
|
|
|
|
require.True(t, meta2.Hit)
|
|
|
|
// If this is actually returning a cached result, the serial number
|
|
// should be unchanged.
|
|
require.Equal(t, serialNumber, issued2.SerialNumber)
|
|
|
|
require.Equal(t, issued, issued2)
|
|
})
|
|
}
|
|
|
|
func requireLeafValidUnderCA(t require.TestingT, issued *structs.IssuedCert, ca *structs.CARoot) {
|
|
require.NotNil(t, issued)
|
|
require.NotNil(t, ca)
|
|
|
|
leaf, intermediates, err := connect.ParseLeafCerts(issued.CertPEM)
|
|
require.NoError(t, err)
|
|
|
|
roots := x509.NewCertPool()
|
|
require.True(t, roots.AppendCertsFromPEM([]byte(ca.RootCert)))
|
|
|
|
_, err = leaf.Verify(x509.VerifyOptions{
|
|
Roots: roots,
|
|
Intermediates: intermediates,
|
|
})
|
|
require.NoError(t, err)
|
|
|
|
// Verify the private key matches. tls.LoadX509Keypair does this for us!
|
|
_, err = tls.X509KeyPair([]byte(issued.CertPEM), []byte(issued.PrivateKeyPEM))
|
|
require.NoError(t, err)
|
|
}
|
|
|
|
type testGetResult struct {
|
|
Index uint64
|
|
Value *structs.IssuedCert
|
|
Err error
|
|
}
|
|
|
|
// testAsyncGet returns a channel that returns the result of the testGet call.
|
|
//
|
|
// This is useful for testing timing and concurrency with testGet calls.
|
|
func testAsyncGet(t *testing.T, m *Manager, req *ConnectCALeafRequest) <-chan testGetResult {
|
|
ch := make(chan testGetResult)
|
|
go func() {
|
|
index, cert, err := m.testGet(req)
|
|
if err != nil {
|
|
ch <- testGetResult{Err: err}
|
|
return
|
|
}
|
|
|
|
ch <- testGetResult{Index: index, Value: cert}
|
|
}()
|
|
return ch
|
|
}
|