consul/agent/leafcert/leafcert_test.go

1137 lines
33 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package leafcert
import (
"context"
"crypto/tls"
"crypto/x509"
"encoding/pem"
"fmt"
"sync"
"sync/atomic"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/cache"
"github.com/hashicorp/consul/agent/connect"
"github.com/hashicorp/consul/agent/consul"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/consul/sdk/testutil/retry"
)
// Test that after an initial signing, new CA roots (new ID) will
// trigger a blocking query to execute.
func TestManager_changingRoots(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")
}
t.Parallel()
m, signer := testManager(t, nil)
caRoot := signer.UpdateCA(t, nil)
// We'll reuse the fetch options and request
req := &ConnectCALeafRequest{
Datacenter: "dc1", Service: "web",
MinQueryIndex: 0, MaxQueryTime: 10 * time.Second,
}
// First fetch should return immediately
getCh := testAsyncGet(t, m, req)
var idx uint64
select {
case <-time.After(100 * time.Millisecond):
t.Fatal("shouldn't block waiting for fetch")
case result := <-getCh:
require.NoError(t, result.Err)
require.NotNil(t, result.Value)
requireLeafValidUnderCA(t, result.Value, caRoot)
require.True(t, result.Index > 0)
idx = result.Index
}
// Second fetch should block with set index
req.MinQueryIndex = idx
getCh = testAsyncGet(t, m, req)
select {
case result := <-getCh:
t.Fatalf("should not return: %#v", result)
case <-time.After(100 * time.Millisecond):
}
// Let's send in new roots, which should trigger the sign req. We need to take
// care to set the new root as active
caRoot2 := signer.UpdateCA(t, nil)
select {
case <-time.After(100 * time.Millisecond):
t.Fatal("shouldn't block waiting for fetch")
case result := <-getCh:
require.NoError(t, result.Err)
require.NotNil(t, result.Value)
require.True(t, result.Index > idx)
requireLeafValidUnderCA(t, result.Value, caRoot2)
}
// Third fetch should block
getCh = testAsyncGet(t, m, req)
select {
case result := <-getCh:
t.Fatalf("should not return: %#v", result)
case <-time.After(100 * time.Millisecond):
}
}
// Tests that if the root change jitter is longer than the time left on the
// timeout, we return normally but then still renew the cert on a subsequent
// call.
func TestManager_changingRootsJitterBetweenCalls(t *testing.T) {
t.Parallel()
const TestOverrideCAChangeInitialDelay = 100 * time.Millisecond
m, signer := testManager(t, func(cfg *Config) {
// Override the root-change delay so we will timeout first. We can't set it to
// a crazy high value otherwise we'll have to wait that long in the test to
// see if it actually happens on subsequent calls. We instead reduce the
// timeout in FetchOptions to be much shorter than this.
cfg.TestOverrideCAChangeInitialDelay = TestOverrideCAChangeInitialDelay
})
caRoot := signer.UpdateCA(t, nil)
// We'll reuse the fetch options and request. Timeout must be much shorter
// than the initial root delay. 20ms means that if we deliver the root change
// during the first blocking call, we should need to block fully for 5 more
// calls before the cert is renewed. We pick a timeout that is not an exact
// multiple of the 100ms delay above to reduce the chance that timing works
// out in a way that makes it hard to tell a timeout from an early return due
// to a cert renewal.
req := &ConnectCALeafRequest{
Datacenter: "dc1", Service: "web",
MinQueryIndex: 0, MaxQueryTime: 35 * time.Millisecond,
}
// First fetch should return immediately
getCh := testAsyncGet(t, m, req)
var (
idx uint64
issued *structs.IssuedCert
)
select {
case <-time.After(100 * time.Millisecond):
t.Fatal("shouldn't block waiting for fetch")
case result := <-getCh:
require.NoError(t, result.Err)
require.NotNil(t, result.Value)
require.True(t, result.Index > 0)
requireLeafValidUnderCA(t, result.Value, caRoot)
idx = result.Index
issued = result.Value
}
// Let's send in new roots, which should eventually trigger the sign req. We
// need to take care to set the new root as active. Note that this is
// implicitly testing that root updates that happen in between leaf blocking
// queries are still noticed too. At this point no leaf blocking query is
// running so the root watch should be stopped. By pushing this update, the
// next blocking query will _immediately_ see the new root which means it
// needs to correctly notice that it is not the same one that generated the
// current cert and start the rotation. This is good, just not obvious that
// the behavior is actually well tested here when it is.
caRoot2 := signer.UpdateCA(t, nil)
earliestRootDelivery := time.Now()
// Some number of fetches (2,3,4 likely) should timeout after 20ms and after
// 100ms has elapsed total we should see the new cert. Since this is all very
// timing dependent, we don't hard code exact numbers here and instead loop
// for plenty of time and do as many calls as it takes and just assert on the
// time taken and that the call either blocks and returns the cached cert, or
// returns the new one.
req.MinQueryIndex = idx
var shouldExpireAfter time.Time
i := 1
rootsDelivered := false
for rootsDelivered {
start := time.Now()
getCh = testAsyncGet(t, m, req)
select {
case result := <-getCh:
require.NoError(t, result.Err)
timeTaken := time.Since(start)
// There are two options, either it blocked waiting for the delay after
// the rotation or it returned the new CA cert before the timeout was
// done. TO be more robust against timing, we take the value as the
// decider for which case it is, and assert timing matches our expected
// bounds rather than vice versa.
if result.Index > idx {
// Got a new cert
require.NotEqual(t, issued, result.Value)
require.NotNil(t, result.Value)
requireLeafValidUnderCA(t, result.Value, caRoot2)
// Should not have been delivered before the delay
require.True(t, time.Since(earliestRootDelivery) > TestOverrideCAChangeInitialDelay)
// All good. We are done!
rootsDelivered = true
} else {
// Should be the cached cert
require.Equal(t, issued, result.Value)
require.Equal(t, idx, result.Index)
requireLeafValidUnderCA(t, result.Value, caRoot)
// Sanity check we blocked for the whole timeout
require.Truef(t, timeTaken > req.MaxQueryTime,
"should block for at least %s, returned after %s",
req.MaxQueryTime, timeTaken)
// Sanity check that the forceExpireAfter state was set correctly
shouldExpireAfter := testObserveLeafCert(m, req, func(cd *certData) time.Time {
return cd.state.forceExpireAfter
})
require.True(t, shouldExpireAfter.After(time.Now()))
require.True(t, shouldExpireAfter.Before(time.Now().Add(TestOverrideCAChangeInitialDelay)))
}
case <-time.After(50 * time.Millisecond):
t.Fatalf("request %d blocked too long", i)
}
i++
// Sanity check that we've not gone way beyond the deadline without a
// new cert. We give some leeway to make it less brittle.
require.Falsef(t, time.Now().After(shouldExpireAfter.Add(100*time.Millisecond)),
"waited extra 100ms and delayed CA rotate renew didn't happen")
}
}
func testObserveLeafCert[T any](m *Manager, req *ConnectCALeafRequest, cb func(*certData) T) T {
key := req.Key()
cd := m.getCertData(key)
cd.lock.Lock()
defer cd.lock.Unlock()
return cb(cd)
}
// Tests that if the root changes in between blocking calls we still pick it up.
func TestManager_changingRootsBetweenBlockingCalls(t *testing.T) {
t.Parallel()
m, signer := testManager(t, nil)
caRoot := signer.UpdateCA(t, nil)
// We'll reuse the fetch options and request. Short timeout important since we
// wait the full timeout before chaning roots.
req := &ConnectCALeafRequest{
Datacenter: "dc1", Service: "web",
MinQueryIndex: 0, MaxQueryTime: 35 * time.Millisecond,
}
// First fetch should return immediately
getCh := testAsyncGet(t, m, req)
var (
idx uint64
issued *structs.IssuedCert
)
select {
case <-time.After(100 * time.Millisecond):
t.Fatal("shouldn't block waiting for fetch")
case result := <-getCh:
require.NoError(t, result.Err)
require.NotNil(t, result.Value)
requireLeafValidUnderCA(t, result.Value, caRoot)
require.True(t, result.Index > 0)
idx = result.Index
issued = result.Value
}
// Next fetch should block for the full timeout
start := time.Now()
getCh = testAsyncGet(t, m, req)
select {
case <-time.After(100 * time.Millisecond):
t.Fatal("shouldn't block for too long waiting for fetch")
case result := <-getCh:
require.NoError(t, result.Err)
require.Equal(t, issued, result.Value)
// Still the initial cached result
require.Equal(t, idx, result.Index)
// Sanity check that it waited
require.True(t, time.Since(start) > req.MaxQueryTime)
}
// No active requests, simulate root change now
caRoot2 := signer.UpdateCA(t, nil)
earliestRootDelivery := time.Now()
// We should get the new cert immediately on next fetch (since test override
// root change jitter to be 1 nanosecond so no delay expected).
getCh = testAsyncGet(t, m, req)
select {
case <-time.After(100 * time.Millisecond):
t.Fatal("shouldn't block too long waiting for fetch")
case result := <-getCh:
require.NoError(t, result.Err)
require.NotEqual(t, issued, result.Value)
requireLeafValidUnderCA(t, result.Value, caRoot2)
require.True(t, result.Index > idx)
// Sanity check that we didn't wait too long
require.True(t, time.Since(earliestRootDelivery) < req.MaxQueryTime)
}
}
func TestManager_CSRRateLimiting(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")
}
t.Parallel()
m, signer := testManager(t, func(cfg *Config) {
// Each jitter window will be only 100 ms long to make testing quick but
// highly likely not to fail based on scheduling issues.
cfg.TestOverrideCAChangeInitialDelay = 100 * time.Millisecond
})
signer.UpdateCA(t, nil)
signer.SetSignCallErrors(
// First call return rate limit error. This is important as it checks
// behavior when cache is empty and we have to return a nil Value but need to
// save state to do the right thing for retry.
consul.ErrRateLimited, // inc
// Then succeed on second call
nil,
// Then be rate limited again on several further calls
consul.ErrRateLimited, // inc
consul.ErrRateLimited, // inc
// Then fine after that
)
req := &ConnectCALeafRequest{
Datacenter: "dc1",
Service: "web",
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
}
// First fetch should return rate limit error directly - client is expected to
// backoff itself.
getCh := testAsyncGet(t, m, req)
select {
case <-time.After(200 * time.Millisecond):
t.Fatal("shouldn't block longer than one jitter window for success")
case result := <-getCh:
require.Error(t, result.Err)
require.Equal(t, consul.ErrRateLimited.Error(), result.Err.Error())
}
// Second call should return correct cert immediately.
getCh = testAsyncGet(t, m, req)
var (
idx uint64
issued *structs.IssuedCert
)
select {
case <-time.After(100 * time.Millisecond):
t.Fatal("shouldn't block waiting for fetch")
case result := <-getCh:
require.NoError(t, result.Err)
require.NotNil(t, result.Value)
require.True(t, result.Index > 0)
idx = result.Index
issued = result.Value
}
// Send in new roots, which should trigger the next sign req. We need to take
// care to set the new root as active
signer.UpdateCA(t, nil)
earliestRootDelivery := time.Now()
// Sanity check state
require.Equal(t, uint64(1), signer.GetSignCallErrorCount())
// After root rotation jitter has been waited out, a new CSR will
// be attempted but will fail and return the previous cached result with no
// error since we will try again soon.
getCh = testAsyncGet(t, m, req)
select {
case <-time.After(200 * time.Millisecond):
t.Fatal("shouldn't block too long waiting for fetch")
case result := <-getCh:
// We should block for _at least_ one jitter period since we set that to
// 100ms and in test override mode we always pick the max jitter not a
// random amount.
require.True(t, time.Since(earliestRootDelivery) > 100*time.Millisecond)
require.Equal(t, uint64(2), signer.GetSignCallErrorCount())
require.NoError(t, result.Err)
require.Equal(t, issued, result.Value)
// 1 since this should still be the original cached result as we failed to
// get a new cert.
require.Equal(t, idx, result.Index)
}
// Root rotation state is now only captured in the opts.LastResult.State so a
// subsequent call should also wait for 100ms and then attempt to generate a
// new cert since we failed last time.
getCh = testAsyncGet(t, m, req)
select {
case <-time.After(200 * time.Millisecond):
t.Fatal("shouldn't block too long waiting for fetch")
case result := <-getCh:
// We should block for _at least_ two jitter periods now.
require.True(t, time.Since(earliestRootDelivery) > 200*time.Millisecond)
require.Equal(t, uint64(3), signer.GetSignCallErrorCount())
require.NoError(t, result.Err)
require.Equal(t, issued, result.Value)
// 1 since this should still be the original cached result as we failed to
// get a new cert.
require.Equal(t, idx, result.Index)
}
// Now we've had two rate limit failures and seen root rotation state work
// across both the blocking request that observed the rotation and the
// subsequent one. The next request should wait out the rest of the backoff
// and then actually fetch a new cert at last!
getCh = testAsyncGet(t, m, req)
select {
case <-time.After(200 * time.Millisecond):
t.Fatal("shouldn't block too long waiting for fetch")
case result := <-getCh:
// We should block for _at least_ three jitter periods now.
require.True(t, time.Since(earliestRootDelivery) > 300*time.Millisecond)
require.Equal(t, uint64(3), signer.GetSignCallErrorCount())
require.NoError(t, result.Err)
require.NotEqual(t, issued, result.Value)
// 3 since the rootCA change used 2
require.True(t, result.Index > idx)
}
}
// This test runs multiple concurrent callers watching different leaf certs and
// tries to ensure that the background root watch activity behaves correctly.
func TestManager_watchRootsDedupingMultipleCallers(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")
}
t.Parallel()
m, signer := testManager(t, nil)
caRoot := signer.UpdateCA(t, nil)
// n is the number of clients we'll run
n := 3
// setup/testDoneCh are used for coordinating clients such that each has
// initial cert delivered and is blocking before the root changes. It's not a
// wait group since we want to be able to timeout the main test goroutine if
// one of the clients gets stuck. Instead it's a buffered chan.
setupDoneCh := make(chan error, n)
testDoneCh := make(chan error, n)
// rootsUpdate is used to coordinate clients so they know when they should
// expect to see leaf renewed after root change.
rootsUpdatedCh := make(chan struct{})
// Create a function that models a single client. It should go through the
// steps of getting an initial cert and then watching for changes until root
// updates.
client := func(i int) {
// We'll reuse the fetch options and request
req := &ConnectCALeafRequest{
Datacenter: "dc1", Service: fmt.Sprintf("web-%d", i),
MinQueryIndex: 0, MaxQueryTime: 10 * time.Second,
}
// First fetch should return immediately
getCh := testAsyncGet(t, m, req)
var idx uint64
select {
case <-time.After(100 * time.Millisecond):
setupDoneCh <- fmt.Errorf("shouldn't block waiting for fetch")
return
case result := <-getCh:
require.NoError(t, result.Err)
idx = result.Index
}
// Second fetch should block with set index
req.MinQueryIndex = idx
getCh = testAsyncGet(t, m, req)
select {
case result := <-getCh:
setupDoneCh <- fmt.Errorf("should not return: %#v", result)
return
case <-time.After(100 * time.Millisecond):
}
// We're done with setup and the blocking call is still blocking in
// background.
setupDoneCh <- nil
// Wait until all others are also done and roots change incase there are
// stragglers delaying the root update.
select {
case <-rootsUpdatedCh:
case <-time.After(200 * time.Millisecond):
testDoneCh <- fmt.Errorf("waited too long for root update")
return
}
// Now we should see root update within a short period
select {
case <-time.After(100 * time.Millisecond):
testDoneCh <- fmt.Errorf("shouldn't block waiting for fetch")
return
case result := <-getCh:
require.NoError(t, result.Err)
if req.MinQueryIndex == result.Value.CreateIndex {
testDoneCh <- fmt.Errorf("index must be different")
return
}
}
testDoneCh <- nil
}
// Sanity check the roots watcher is not running yet
assertRootsWatchCounts(t, m, 0, 0)
for i := 0; i < n; i++ {
go client(i)
}
timeoutCh := time.After(200 * time.Millisecond)
for i := 0; i < n; i++ {
select {
case <-timeoutCh:
t.Fatal("timed out waiting for clients")
case err := <-setupDoneCh:
if err != nil {
t.Fatalf(err.Error())
}
}
}
// Should be 3 clients running now, so the roots watcher should have started
// once and not stopped.
assertRootsWatchCounts(t, m, 1, 0)
caRootCopy := caRoot.Clone()
caRootCopy.Active = false
// Now we deliver the root update
_ = signer.UpdateCA(t, nil)
// And notify clients
close(rootsUpdatedCh)
timeoutCh = time.After(200 * time.Millisecond)
for i := 0; i < n; i++ {
select {
case <-timeoutCh:
t.Fatalf("timed out waiting for %d of %d clients to renew after root change", n-i, n)
case err := <-testDoneCh:
if err != nil {
t.Fatalf(err.Error())
}
}
}
// All active requests have returned the new cert so the rootsWatcher should
// have stopped. This is timing dependent though so retry a few times
retry.RunWith(retry.ThreeTimes(), t, func(r *retry.R) {
assertRootsWatchCounts(r, m, 1, 1)
})
}
func assertRootsWatchCounts(t require.TestingT, m *Manager, wantStarts, wantStops int) {
if tt, ok := t.(*testing.T); ok {
tt.Helper()
}
starts := atomic.LoadUint32(&m.rootWatcher.testStartCount)
stops := atomic.LoadUint32(&m.rootWatcher.testStopCount)
require.Equal(t, wantStarts, int(starts))
require.Equal(t, wantStops, int(stops))
}
// Test that after an initial signing, an expiringLeaf will trigger a
// blocking query to resign.
func TestManager_expiringLeaf(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")
}
t.Parallel()
m, signer := testManager(t, nil)
caRoot := signer.UpdateCA(t, nil)
signer.SetSignCallErrors(
// First call returns expired cert to prime cache with an expired one.
ReplyWithExpiredCert,
)
// We'll reuse the fetch options and request
req := &ConnectCALeafRequest{
Datacenter: "dc1", Service: "web",
MinQueryIndex: 0, MaxQueryTime: 10 * time.Second,
}
// First fetch should return immediately
getCh := testAsyncGet(t, m, req)
var (
idx uint64
issued *structs.IssuedCert
)
select {
case <-time.After(100 * time.Millisecond):
t.Fatal("shouldn't block waiting for fetch")
case result := <-getCh:
require.NoError(t, result.Err)
require.NotNil(t, result.Value)
require.True(t, result.Index > 0)
idx = result.Index
issued = result.Value
}
// Second fetch should return immediately despite there being
// no updated CA roots, because we issued an expired cert.
getCh = testAsyncGet(t, m, req)
select {
case <-time.After(100 * time.Millisecond):
t.Fatal("shouldn't block waiting for fetch")
case result := <-getCh:
require.NoError(t, result.Err)
require.NotEqual(t, issued, result.Value)
require.True(t, result.Index > idx)
requireLeafValidUnderCA(t, result.Value, caRoot)
idx = result.Index
}
// Third fetch should block since the cert is not expiring and
// we also didn't update CA certs.
req.MinQueryIndex = idx
getCh = testAsyncGet(t, m, req)
select {
case result := <-getCh:
t.Fatalf("should not return: %#v", result)
case <-time.After(100 * time.Millisecond):
}
}
func TestManager_DNSSANForService(t *testing.T) {
t.Parallel()
m, signer := testManager(t, nil)
_ = signer.UpdateCA(t, nil)
req := &ConnectCALeafRequest{
Datacenter: "dc1",
Service: "web",
DNSSAN: []string{"test.example.com"},
}
_, _, err := m.Get(context.Background(), req)
require.NoError(t, err)
caReq := signer.GetCapture(0)
require.NotNil(t, caReq)
pemBlock, _ := pem.Decode([]byte(caReq.CSR))
csr, err := x509.ParseCertificateRequest(pemBlock.Bytes)
require.NoError(t, err)
require.Equal(t, csr.DNSNames, []string{"test.example.com"})
}
func TestManager_workflow_good(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")
}
ctx, cancel := context.WithCancel(context.Background())
t.Cleanup(cancel)
const TestOverrideCAChangeInitialDelay = 1 * time.Nanosecond
m, signer := testManager(t, func(cfg *Config) {
cfg.TestOverrideCAChangeInitialDelay = TestOverrideCAChangeInitialDelay
})
ca1 := signer.UpdateCA(t, nil)
req := &ConnectCALeafRequest{
Datacenter: "dc1",
Service: "test",
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
}
// List
issued, meta, err := m.Get(ctx, req)
require.NoError(t, err)
require.False(t, meta.Hit)
require.NotNil(t, issued)
// Verify that the cert is signed by the CA
requireLeafValidUnderCA(t, issued, ca1)
// Verify blocking index
require.True(t, issued.ModifyIndex > 0)
require.Equal(t, issued.ModifyIndex, meta.Index)
index := meta.Index
// Fetch it again
testutil.RunStep(t, "test you get a cache hit on another read", func(t *testing.T) {
req := &ConnectCALeafRequest{
Datacenter: "dc1",
Service: "test",
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
}
issued2, _, err := m.Get(ctx, req)
require.NoError(t, err)
require.NotNil(t, issued2)
require.Equal(t, issued, issued2)
})
type reply struct {
cert *structs.IssuedCert
meta cache.ResultMeta
err error
}
replyCh := make(chan *reply, 1)
go func() {
req := &ConnectCALeafRequest{
Datacenter: "dc1",
Service: "test",
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
MinQueryIndex: index,
}
issued2, meta2, err := m.Get(ctx, req)
replyCh <- &reply{issued2, meta2, err}
}()
// Set a new CA
ca2 := signer.UpdateCA(t, nil)
// Issue a blocking query to ensure that the cert gets updated appropriately
testutil.RunStep(t, "test blocking queries update leaf cert", func(t *testing.T) {
var got *reply
select {
case got = <-replyCh:
case <-time.After(500 * time.Millisecond):
t.Fatal("blocking query did not wake up during rotation")
}
issued2, meta2, err := got.cert, got.meta, got.err
require.NoError(t, err)
require.NotNil(t, issued2)
require.NotEqual(t, issued.CertPEM, issued2.CertPEM)
require.NotEqual(t, issued.PrivateKeyPEM, issued2.PrivateKeyPEM)
// Verify that the cert is signed by the new CA
requireLeafValidUnderCA(t, issued2, ca2)
// Should not be a cache hit! The data was updated in response to the blocking
// query being made.
require.False(t, meta2.Hit)
})
testutil.RunStep(t, "test non-blocking queries update leaf cert", func(t *testing.T) {
req := &ConnectCALeafRequest{
Datacenter: "dc1",
Service: "test",
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
}
issued, _, err := m.Get(ctx, req)
require.NoError(t, err)
require.NotNil(t, issued)
// Verify that the cert is signed by the CA
requireLeafValidUnderCA(t, issued, ca2)
// Issue a non blocking query to ensure that the cert gets updated appropriately
{
// Set a new CA
ca3 := signer.UpdateCA(t, nil)
retry.Run(t, func(r *retry.R) {
req := &ConnectCALeafRequest{
Datacenter: "dc1",
Service: "test",
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
}
issued2, meta2, err := m.Get(ctx, req)
require.NoError(r, err)
require.NotNil(r, issued2)
requireLeafValidUnderCA(r, issued2, ca3)
// Should not be a cache hit!
require.False(r, meta2.Hit)
require.NotEqual(r, issued.CertPEM, issued2.CertPEM)
require.NotEqual(r, issued.PrivateKeyPEM, issued2.PrivateKeyPEM)
// Verify that the cert is signed by the new CA
requireLeafValidUnderCA(r, issued2, ca3)
})
}
})
}
// Test we can request a leaf cert for a service and witness correct caching,
// blocking, and update semantics.
//
// This test originally was a client agent test in
// agent.TestAgentConnectCALeafCert_goodNotLocal and was cloned here to
// increase complex coverage, but the specific naming of the parent test is
// irrelevant here since there's no notion of the catalog at all at this layer.
func TestManager_workflow_goodNotLocal(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")
}
ctx, cancel := context.WithCancel(context.Background())
t.Cleanup(cancel)
const TestOverrideCAChangeInitialDelay = 1 * time.Nanosecond
m, signer := testManager(t, func(cfg *Config) {
cfg.TestOverrideCAChangeInitialDelay = TestOverrideCAChangeInitialDelay
})
ca1 := signer.UpdateCA(t, nil)
req := &ConnectCALeafRequest{
Datacenter: "dc1",
Service: "test",
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
}
// List
issued, meta, err := m.Get(ctx, req)
require.NoError(t, err)
require.False(t, meta.Hit)
require.NotNil(t, issued)
// Verify that the cert is signed by the CA
requireLeafValidUnderCA(t, issued, ca1)
// Verify blocking index
require.True(t, issued.ModifyIndex > 0)
require.Equal(t, issued.ModifyIndex, meta.Index)
// Fetch it again
testutil.RunStep(t, "test you get a cache hit on another read", func(t *testing.T) {
req := &ConnectCALeafRequest{
Datacenter: "dc1",
Service: "test",
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
}
issued2, _, err := m.Get(ctx, req)
require.NoError(t, err)
require.NotNil(t, issued2)
require.Equal(t, issued, issued2)
})
// Test Blocking - see https://github.com/hashicorp/consul/issues/4462
testutil.RunStep(t, "test blocking issue 4462", func(t *testing.T) {
// Fetch it again
req := &ConnectCALeafRequest{
Datacenter: "dc1",
Service: "test",
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
MinQueryIndex: issued.ModifyIndex,
MaxQueryTime: 125 * time.Millisecond,
}
var (
respCh = make(chan *structs.IssuedCert)
errCh = make(chan error, 1)
)
go func() {
issued2, _, err := m.Get(ctx, req)
if err != nil {
errCh <- err
} else {
respCh <- issued2
}
}()
select {
case <-time.After(500 * time.Millisecond):
require.FailNow(t, "Shouldn't block for this long - not respecting wait parameter in the query")
case err := <-errCh:
require.NoError(t, err)
case <-respCh:
}
})
testutil.RunStep(t, "test that caching is updated in the background", func(t *testing.T) {
// Set a new CA
ca := signer.UpdateCA(t, nil)
retry.Run(t, func(r *retry.R) {
// Try and sign again (note no index/wait arg since cache should update in
// background even if we aren't actively blocking)
req := &ConnectCALeafRequest{
Datacenter: "dc1",
Service: "test",
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
}
issued2, _, err := m.Get(ctx, req)
require.NoError(r, err)
if issued.CertPEM == issued2.CertPEM {
r.Fatalf("leaf has not updated")
}
// Got a new leaf. Sanity check it's a whole new key as well as different
// cert.
if issued.PrivateKeyPEM == issued2.PrivateKeyPEM {
r.Fatalf("new leaf has same private key as before")
}
// Verify that the cert is signed by the new CA
requireLeafValidUnderCA(r, issued2, ca)
require.NotEqual(r, issued, issued2)
})
})
}
func TestManager_workflow_nonBlockingQuery_after_blockingQuery_shouldNotBlock(t *testing.T) {
// see: https://github.com/hashicorp/consul/issues/12048
if testing.Short() {
t.Skip("too slow for testing.Short")
}
t.Parallel()
ctx, cancel := context.WithCancel(context.Background())
t.Cleanup(cancel)
m, signer := testManager(t, nil)
_ = signer.UpdateCA(t, nil)
var (
serialNumber string
index uint64
issued *structs.IssuedCert
)
testutil.RunStep(t, "do initial non-blocking query", func(t *testing.T) {
req := &ConnectCALeafRequest{
Datacenter: "dc1",
Service: "test",
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
}
issued1, meta, err := m.Get(ctx, req)
require.NoError(t, err)
serialNumber = issued1.SerialNumber
require.False(t, meta.Hit, "for the leaf cert cache type these are always MISS")
index = meta.Index
issued = issued1
})
go func() {
// launch goroutine for blocking query
req := &ConnectCALeafRequest{
Datacenter: "dc1",
Service: "test",
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
MinQueryIndex: index,
}
_, _, _ = m.Get(ctx, req)
}()
// We just need to ensure that the above blocking query is in-flight before
// the next step, so do a little sleep.
time.Sleep(50 * time.Millisecond)
// The initial non-blocking query populated the leaf cert cache entry
// implicitly. The agent cache doesn't prune entries very often at all, so
// in between both of these steps the data should still be there, causing
// this to be a HIT that completes in less than 10m (the default inner leaf
// cert blocking query timeout).
testutil.RunStep(t, "do a non-blocking query that should not block", func(t *testing.T) {
req := &ConnectCALeafRequest{
Datacenter: "dc1",
Service: "test",
EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
}
issued2, meta2, err := m.Get(ctx, req)
require.NoError(t, err)
require.True(t, meta2.Hit)
// If this is actually returning a cached result, the serial number
// should be unchanged.
require.Equal(t, serialNumber, issued2.SerialNumber)
require.Equal(t, issued, issued2)
})
}
func requireLeafValidUnderCA(t require.TestingT, issued *structs.IssuedCert, ca *structs.CARoot) {
require.NotNil(t, issued)
require.NotNil(t, ca)
leaf, intermediates, err := connect.ParseLeafCerts(issued.CertPEM)
require.NoError(t, err)
roots := x509.NewCertPool()
require.True(t, roots.AppendCertsFromPEM([]byte(ca.RootCert)))
_, err = leaf.Verify(x509.VerifyOptions{
Roots: roots,
Intermediates: intermediates,
})
require.NoError(t, err)
// Verify the private key matches. tls.LoadX509Keypair does this for us!
_, err = tls.X509KeyPair([]byte(issued.CertPEM), []byte(issued.PrivateKeyPEM))
require.NoError(t, err)
}
// testManager returns a *Manager that is pre-configured to use a mock RPC
// implementation that can sign certs, and an in-memory CA roots reader that
// interacts well with it.
func testManager(t *testing.T, mut func(*Config)) (*Manager, *testSigner) {
signer := newTestSigner(t, nil, nil)
deps := Deps{
Logger: testutil.Logger(t),
RootsReader: signer.RootsReader,
CertSigner: signer,
Config: Config{
// Override the root-change spread so we don't have to wait up to 20 seconds
// to see root changes work. Can be changed back for specific tests that
// need to test this, Note it's not 0 since that used default but is
// effectively the same.
TestOverrideCAChangeInitialDelay: 1 * time.Microsecond,
},
}
if mut != nil {
mut(&deps.Config)
}
m := NewManager(deps)
t.Cleanup(m.Stop)
return m, signer
}
type testRootsReader struct {
mu sync.Mutex
index uint64
roots *structs.IndexedCARoots
watcher chan struct{}
}
func newTestRootsReader(t *testing.T) *testRootsReader {
r := &testRootsReader{
watcher: make(chan struct{}),
}
t.Cleanup(func() {
r.mu.Lock()
watcher := r.watcher
r.mu.Unlock()
close(watcher)
})
return r
}
var _ RootsReader = (*testRootsReader)(nil)
func (r *testRootsReader) Set(roots *structs.IndexedCARoots) {
r.mu.Lock()
oldWatcher := r.watcher
r.watcher = make(chan struct{})
r.roots = roots
if roots == nil {
r.index = 1
} else {
r.index = roots.Index
}
r.mu.Unlock()
close(oldWatcher)
}
func (r *testRootsReader) Get() (*structs.IndexedCARoots, error) {
r.mu.Lock()
defer r.mu.Unlock()
return r.roots, nil
}
func (r *testRootsReader) Notify(ctx context.Context, correlationID string, ch chan<- cache.UpdateEvent) error {
r.mu.Lock()
watcher := r.watcher
r.mu.Unlock()
go func() {
<-watcher
r.mu.Lock()
defer r.mu.Unlock()
ch <- cache.UpdateEvent{
CorrelationID: correlationID,
Result: r.roots,
Meta: cache.ResultMeta{Index: r.index},
Err: nil,
}
}()
return nil
}
type testGetResult struct {
Index uint64
Value *structs.IssuedCert
Err error
}
// testAsyncGet returns a channel that returns the result of the testGet call.
//
// This is useful for testing timing and concurrency with testGet calls.
func testAsyncGet(t *testing.T, m *Manager, req *ConnectCALeafRequest) <-chan testGetResult {
ch := make(chan testGetResult)
go func() {
index, cert, err := m.testGet(req)
if err != nil {
ch <- testGetResult{Err: err}
return
}
ch <- testGetResult{Index: index, Value: cert}
}()
return ch
}