Merge pull request #8211 from hashicorp/bugfix/auto-encrypt-various

This commit is contained in:
Matt Keeler 2020-07-02 09:49:49 -04:00 committed by GitHub
commit f8e8f48125
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 339 additions and 102 deletions

View File

@ -840,7 +840,7 @@ func (a *Agent) setupClientAutoEncrypt(ctx context.Context) (*structs.SignedResp
}
addrs = append(addrs, retryJoinAddrs(disco, retryJoinSerfVariant, "LAN", a.config.RetryJoinLAN, a.logger)...)
reply, priv, err := client.RequestAutoEncryptCerts(ctx, addrs, a.config.ServerPort, a.tokens.AgentToken())
reply, priv, err := client.RequestAutoEncryptCerts(ctx, addrs, a.config.ServerPort, a.tokens.AgentToken(), a.config.AutoEncryptDNSSAN, a.config.AutoEncryptIPSAN)
if err != nil {
return nil, err
}
@ -877,7 +877,17 @@ func (a *Agent) setupClientAutoEncryptCache(reply *structs.SignedResponse) (*str
}
// prepolutate leaf cache
certRes := cache.FetchResult{Value: &reply.IssuedCert, Index: reply.ConnectCARoots.QueryMeta.Index}
certRes := cache.FetchResult{
Value: &reply.IssuedCert,
Index: reply.ConnectCARoots.QueryMeta.Index,
}
for _, ca := range reply.ConnectCARoots.Roots {
if ca.ID == reply.ConnectCARoots.ActiveRootID {
certRes.State = cachetype.ConnectCALeafSuccess(ca.SigningKeyID)
break
}
}
if err := a.cache.Prepopulate(cachetype.ConnectCALeafName, certRes, a.config.Datacenter, a.tokens.AgentToken(), leafReq.Key()); err != nil {
return nil, nil, err
}

View File

@ -4,6 +4,7 @@ import (
"bytes"
"context"
"crypto/tls"
"crypto/x509"
"encoding/base64"
"encoding/json"
"fmt"
@ -4628,7 +4629,7 @@ func TestAutoConfig_Integration(t *testing.T) {
})
require.NoError(t, err)
client := StartTestAgent(t, TestAgent{Name: "test-client", HCL: TestACLConfigWithParams(nil) + `
client := StartTestAgent(t, TestAgent{Name: "test-client", HCL: `
bootstrap = false
server = false
ca_file = "` + caFile + `"
@ -4654,3 +4655,78 @@ func TestAutoConfig_Integration(t *testing.T) {
// spot check that we now have an ACL token
require.NotEmpty(t, client.tokens.AgentToken())
}
func TestAgent_AutoEncrypt(t *testing.T) {
// eventually this test should really live with integration tests
// the goal here is to have one test server and another test client
// spin up both agents and allow the server to authorize the auto encrypt
// request and then see the client get a TLS certificate
cfgDir := testutil.TempDir(t, "auto-encrypt")
// write some test TLS certificates out to the cfg dir
cert, key, cacert, err := testTLSCertificates("server.dc1.consul")
require.NoError(t, err)
certFile := filepath.Join(cfgDir, "cert.pem")
caFile := filepath.Join(cfgDir, "cacert.pem")
keyFile := filepath.Join(cfgDir, "key.pem")
require.NoError(t, ioutil.WriteFile(certFile, []byte(cert), 0600))
require.NoError(t, ioutil.WriteFile(caFile, []byte(cacert), 0600))
require.NoError(t, ioutil.WriteFile(keyFile, []byte(key), 0600))
hclConfig := TestACLConfigWithParams(nil) + `
verify_incoming = true
verify_outgoing = true
verify_server_hostname = true
ca_file = "` + caFile + `"
cert_file = "` + certFile + `"
key_file = "` + keyFile + `"
connect { enabled = true }
auto_encrypt { allow_tls = true }
`
srv := StartTestAgent(t, TestAgent{Name: "test-server", HCL: hclConfig})
defer srv.Shutdown()
testrpc.WaitForTestAgent(t, srv.RPC, "dc1", testrpc.WithToken(TestDefaultMasterToken))
client := StartTestAgent(t, TestAgent{Name: "test-client", HCL: TestACLConfigWithParams(nil) + `
bootstrap = false
server = false
ca_file = "` + caFile + `"
verify_outgoing = true
verify_server_hostname = true
node_name = "test-client"
auto_encrypt {
tls = true
}
ports {
server = ` + strconv.Itoa(srv.Config.RPCBindAddr.Port) + `
}
retry_join = ["` + srv.Config.SerfBindAddrLAN.String() + `"]`,
UseTLS: true,
})
defer client.Shutdown()
// when this is successful we managed to get a TLS certificate and are using it for
// encrypted RPC connections.
testrpc.WaitForTestAgent(t, client.RPC, "dc1", testrpc.WithToken(TestDefaultMasterToken))
// now we need to validate that our certificate has the correct CN
aeCert := client.tlsConfigurator.Cert()
require.NotNil(t, aeCert)
id := connect.SpiffeIDAgent{
Host: connect.TestClusterID + ".consul",
Datacenter: "dc1",
Agent: "test-client",
}
expectedCN := connect.AgentCN("test-client", connect.TestClusterID)
x509Cert, err := x509.ParseCertificate(aeCert.Certificate[0])
require.NoError(t, err)
require.Equal(t, expectedCN, x509Cert.Subject.CommonName)
require.Len(t, x509Cert.URIs, 1)
require.Equal(t, id.URI(), x509Cert.URIs[0])
}

View File

@ -121,6 +121,15 @@ type fetchState struct {
consecutiveRateLimitErrs int
}
func ConnectCALeafSuccess(authorityKeyID string) interface{} {
return fetchState{
authorityKeyID: authorityKeyID,
forceExpireAfter: time.Time{},
consecutiveRateLimitErrs: 0,
activeRootRotationStart: time.Time{},
}
}
// fetchStart is called on each fetch that is about to block and wait for
// changes to the leaf. It subscribes a chan to receive updates from the shared
// root watcher and triggers root watcher if it's not already running.
@ -532,7 +541,7 @@ func (c *ConnectCALeaf) generateNewLeaf(req *ConnectCALeafRequest,
}
commonName = connect.AgentCN(req.Agent, roots.TrustDomain)
dnsNames = append([]string{"localhost"}, req.DNSSAN...)
ipAddresses = append([]net.IP{net.ParseIP("127.0.0.1"), net.ParseIP("::")}, req.IPSAN...)
ipAddresses = append([]net.IP{net.ParseIP("127.0.0.1"), net.ParseIP("::1")}, req.IPSAN...)
} else {
return result, errors.New("URI must be either service or agent")
}

View File

@ -19,7 +19,53 @@ const (
retryJitterWindow = 30 * time.Second
)
func (c *Client) RequestAutoEncryptCerts(ctx context.Context, servers []string, port int, token string) (*structs.SignedResponse, string, error) {
func (c *Client) autoEncryptCSR(extraDNSSANs []string, extraIPSANs []net.IP) (string, string, error) {
// We don't provide the correct host here, because we don't know any
// better at this point. Apart from the domain, we would need the
// ClusterID, which we don't have. This is why we go with
// dummyTrustDomain the first time. Subsequent CSRs will have the
// correct TrustDomain.
id := &connect.SpiffeIDAgent{
Host: dummyTrustDomain,
Datacenter: c.config.Datacenter,
Agent: c.config.NodeName,
}
conf, err := c.config.CAConfig.GetCommonConfig()
if err != nil {
return "", "", err
}
if conf.PrivateKeyType == "" {
conf.PrivateKeyType = connect.DefaultPrivateKeyType
}
if conf.PrivateKeyBits == 0 {
conf.PrivateKeyBits = connect.DefaultPrivateKeyBits
}
// Create a new private key
pk, pkPEM, err := connect.GeneratePrivateKeyWithConfig(conf.PrivateKeyType, conf.PrivateKeyBits)
if err != nil {
return "", "", err
}
dnsNames := append([]string{"localhost"}, extraDNSSANs...)
ipAddresses := append([]net.IP{net.ParseIP("127.0.0.1"), net.ParseIP("::1")}, extraIPSANs...)
// Create a CSR.
//
// The Common Name includes the dummy trust domain for now but Server will
// override this when it is signed anyway so it's OK.
cn := connect.AgentCN(c.config.NodeName, dummyTrustDomain)
csr, err := connect.CreateCSR(id, cn, pk, dnsNames, ipAddresses)
if err != nil {
return "", "", err
}
return pkPEM, csr, nil
}
func (c *Client) RequestAutoEncryptCerts(ctx context.Context, servers []string, port int, token string, extraDNSSANs []string, extraIPSANs []net.IP) (*structs.SignedResponse, string, error) {
errFn := func(err error) (*structs.SignedResponse, string, error) {
return nil, "", err
}
@ -36,44 +82,7 @@ func (c *Client) RequestAutoEncryptCerts(ctx context.Context, servers []string,
return errFn(fmt.Errorf("No servers to request AutoEncrypt.Sign"))
}
// We don't provide the correct host here, because we don't know any
// better at this point. Apart from the domain, we would need the
// ClusterID, which we don't have. This is why we go with
// dummyTrustDomain the first time. Subsequent CSRs will have the
// correct TrustDomain.
id := &connect.SpiffeIDAgent{
Host: dummyTrustDomain,
Datacenter: c.config.Datacenter,
Agent: c.config.NodeName,
}
conf, err := c.config.CAConfig.GetCommonConfig()
if err != nil {
return errFn(err)
}
if conf.PrivateKeyType == "" {
conf.PrivateKeyType = connect.DefaultPrivateKeyType
}
if conf.PrivateKeyBits == 0 {
conf.PrivateKeyBits = connect.DefaultPrivateKeyBits
}
// Create a new private key
pk, pkPEM, err := connect.GeneratePrivateKeyWithConfig(conf.PrivateKeyType, conf.PrivateKeyBits)
if err != nil {
return errFn(err)
}
dnsNames := []string{"localhost"}
ipAddresses := []net.IP{net.ParseIP("127.0.0.1"), net.ParseIP("::")}
// Create a CSR.
//
// The Common Name includes the dummy trust domain for now but Server will
// override this when it is signed anyway so it's OK.
cn := connect.AgentCN(c.config.NodeName, dummyTrustDomain)
csr, err := connect.CreateCSR(id, cn, pk, dnsNames, ipAddresses)
pkPEM, csr, err := c.autoEncryptCSR(extraDNSSANs, extraIPSANs)
if err != nil {
return errFn(err)
}

View File

@ -2,11 +2,17 @@ package consul
import (
"context"
"crypto/x509"
"crypto/x509/pkix"
"encoding/asn1"
"net"
"net/url"
"os"
"testing"
"time"
"github.com/hashicorp/consul/agent/connect"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/go-hclog"
"github.com/stretchr/testify/require"
@ -98,7 +104,7 @@ func TestAutoEncrypt_RequestAutoEncryptCerts(t *testing.T) {
doneCh := make(chan struct{})
var err error
go func() {
_, _, err = c1.RequestAutoEncryptCerts(ctx, servers, port, token)
_, _, err = c1.RequestAutoEncryptCerts(ctx, servers, port, token, nil, nil)
close(doneCh)
}()
select {
@ -113,3 +119,87 @@ func TestAutoEncrypt_RequestAutoEncryptCerts(t *testing.T) {
// try to request certs.
}
}
func TestAutoEncrypt_autoEncryptCSR(t *testing.T) {
type testCase struct {
conf *Config
extraDNSSANs []string
extraIPSANs []net.IP
err string
// to validate the csr
expectedSubject pkix.Name
expectedSigAlg x509.SignatureAlgorithm
expectedPubAlg x509.PublicKeyAlgorithm
expectedDNSNames []string
expectedIPs []net.IP
expectedURIs []*url.URL
}
cases := map[string]testCase{
"sans": {
conf: &Config{
Datacenter: "dc1",
NodeName: "test-node",
CAConfig: &structs.CAConfiguration{},
},
extraDNSSANs: []string{"foo.local", "bar.local"},
extraIPSANs: []net.IP{net.IPv4(198, 18, 0, 1), net.IPv4(198, 18, 0, 2)},
expectedSubject: pkix.Name{
CommonName: connect.AgentCN("test-node", dummyTrustDomain),
Names: []pkix.AttributeTypeAndValue{
{
// 2,5,4,3 is the CommonName type ASN1 identifier
Type: asn1.ObjectIdentifier{2, 5, 4, 3},
Value: "testnode.agnt.dummy.tr.consul",
},
},
},
expectedSigAlg: x509.ECDSAWithSHA256,
expectedPubAlg: x509.ECDSA,
expectedDNSNames: []string{
"localhost",
"foo.local",
"bar.local",
},
expectedIPs: []net.IP{
{127, 0, 0, 1},
net.ParseIP("::1"),
{198, 18, 0, 1},
{198, 18, 0, 2},
},
expectedURIs: []*url.URL{
{
Scheme: "spiffe",
Host: dummyTrustDomain,
Path: "/agent/client/dc/dc1/id/test-node",
},
},
},
}
for name, tcase := range cases {
t.Run(name, func(t *testing.T) {
client := Client{config: tcase.conf}
_, csr, err := client.autoEncryptCSR(tcase.extraDNSSANs, tcase.extraIPSANs)
if tcase.err == "" {
require.NoError(t, err)
request, err := connect.ParseCSR(csr)
require.NoError(t, err)
require.NotNil(t, request)
require.Equal(t, tcase.expectedSubject, request.Subject)
require.Equal(t, tcase.expectedSigAlg, request.SignatureAlgorithm)
require.Equal(t, tcase.expectedPubAlg, request.PublicKeyAlgorithm)
require.Equal(t, tcase.expectedDNSNames, request.DNSNames)
require.Equal(t, tcase.expectedIPs, request.IPAddresses)
require.Equal(t, tcase.expectedURIs, request.URIs)
} else {
require.Error(t, err)
require.Empty(t, csr)
}
})
}
}

View File

@ -4,12 +4,11 @@ import (
"context"
"errors"
"fmt"
"net/url"
"reflect"
"strings"
"sync"
"time"
"github.com/hashicorp/consul/lib/semaphore"
"github.com/hashicorp/go-hclog"
"golang.org/x/time/rate"
@ -53,57 +52,6 @@ type ConnectCA struct {
srv *Server
logger hclog.Logger
// csrRateLimiter limits the rate of signing new certs if configured. Lazily
// initialized from current config to support dynamic changes.
// csrRateLimiterMu must be held while dereferencing the pointer or storing a
// new one, but methods can be called on the limiter object outside of the
// locked section. This is done only in the getCSRRateLimiterWithLimit method.
csrRateLimiter *rate.Limiter
csrRateLimiterMu sync.RWMutex
// csrConcurrencyLimiter is a dynamically resizable semaphore used to limit
// Sign RPC concurrency if configured. The zero value is usable as soon as
// SetSize is called which we do dynamically in the RPC handler to avoid
// having to hook elaborate synchronization mechanisms through the CA config
// endpoint and config reload etc.
csrConcurrencyLimiter semaphore.Dynamic
}
// getCSRRateLimiterWithLimit returns a rate.Limiter with the desired limit set.
// It uses the shared server-wide limiter unless the limit has been changed in
// config or the limiter has not been setup yet in which case it just-in-time
// configures the new limiter. We assume that limit changes are relatively rare
// and that all callers (there is currently only one) use the same config value
// as the limit. There might be some flapping if there are multiple concurrent
// requests in flight at the time the config changes where A sees the new value
// and updates, B sees the old but then gets this lock second and changes back.
// Eventually though and very soon (once all current RPCs are complete) we are
// guaranteed to have the correct limit set by the next RPC that comes in so I
// assume this is fine. If we observe strange behavior because of it, we could
// add hysteresis that prevents changes too soon after a previous change but
// that seems unnecessary for now.
func (s *ConnectCA) getCSRRateLimiterWithLimit(limit rate.Limit) *rate.Limiter {
s.csrRateLimiterMu.RLock()
lim := s.csrRateLimiter
s.csrRateLimiterMu.RUnlock()
// If there is a current limiter with the same limit, return it. This should
// be the common case.
if lim != nil && lim.Limit() == limit {
return lim
}
// Need to change limiter, get write lock
s.csrRateLimiterMu.Lock()
defer s.csrRateLimiterMu.Unlock()
// No limiter yet, or limit changed in CA config, reconfigure a new limiter.
// We use burst of 1 for a hard limit. Note that either bursting or waiting is
// necessary to get expected behavior in fact of random arrival times, but we
// don't need both and we use Wait with a small delay to smooth noise. See
// https://github.com/banks/sim-rate-limit-backoff/blob/master/README.md.
s.csrRateLimiter = rate.NewLimiter(limit, 1)
return s.csrRateLimiter
}
// ConfigurationGet returns the configuration for the CA.
@ -480,6 +428,30 @@ func (s *ConnectCA) Sign(
return fmt.Errorf("SPIFFE ID in CSR from a different trust domain: %s, "+
"we are %s", serviceID.Host, signingID.Host())
}
} else {
// isAgent - if we support more ID types then this would need to be an else if
// here we are just automatically fixing the trust domain. For auto-encrypt and
// auto-config they make certificate requests before learning about the roots
// so they will have a dummy trust domain in the CSR.
trustDomain := signingID.Host()
if agentID.Host != trustDomain {
originalURI := agentID.URI()
agentID.Host = trustDomain
csr.Subject.CommonName = connect.AgentCN(agentID.Agent, trustDomain)
// recreate the URIs list
uris := make([]*url.URL, len(csr.URIs))
for i, uri := range csr.URIs {
if originalURI.String() == uri.String() {
uris[i] = agentID.URI()
} else {
uris[i] = uri
}
}
csr.URIs = uris
}
}
// Verify that the ACL token provided has permission to act as this service
@ -514,7 +486,7 @@ func (s *ConnectCA) Sign(
return err
}
if commonCfg.CSRMaxPerSecond > 0 {
lim := s.getCSRRateLimiterWithLimit(rate.Limit(commonCfg.CSRMaxPerSecond))
lim := s.srv.caLeafLimiter.getCSRRateLimiterWithLimit(rate.Limit(commonCfg.CSRMaxPerSecond))
// Wait up to the small threshold we allow for a token.
ctx, cancel := context.WithTimeout(context.Background(), csrLimitWait)
defer cancel()
@ -522,13 +494,13 @@ func (s *ConnectCA) Sign(
return ErrRateLimited
}
} else if commonCfg.CSRMaxConcurrent > 0 {
s.csrConcurrencyLimiter.SetSize(int64(commonCfg.CSRMaxConcurrent))
s.srv.caLeafLimiter.csrConcurrencyLimiter.SetSize(int64(commonCfg.CSRMaxConcurrent))
ctx, cancel := context.WithTimeout(context.Background(), csrLimitWait)
defer cancel()
if err := s.csrConcurrencyLimiter.Acquire(ctx); err != nil {
if err := s.srv.caLeafLimiter.csrConcurrencyLimiter.Acquire(ctx); err != nil {
return ErrRateLimited
}
defer s.csrConcurrencyLimiter.Release()
defer s.srv.caLeafLimiter.csrConcurrencyLimiter.Release()
}
// All seems to be in order, actually sign it.

View File

@ -153,6 +153,9 @@ type Server struct {
caProviderRoot *structs.CARoot
caProviderLock sync.RWMutex
// rate limiter to use when signing leaf certificates
caLeafLimiter connectSignRateLimiter
// Consul configuration
config *Config

View File

@ -0,0 +1,61 @@
package consul
import (
"sync"
"github.com/hashicorp/consul/lib/semaphore"
"golang.org/x/time/rate"
)
type connectSignRateLimiter struct {
// csrRateLimiter limits the rate of signing new certs if configured. Lazily
// initialized from current config to support dynamic changes.
// csrRateLimiterMu must be held while dereferencing the pointer or storing a
// new one, but methods can be called on the limiter object outside of the
// locked section. This is done only in the getCSRRateLimiterWithLimit method.
csrRateLimiter *rate.Limiter
csrRateLimiterMu sync.RWMutex
// csrConcurrencyLimiter is a dynamically resizable semaphore used to limit
// Sign RPC concurrency if configured. The zero value is usable as soon as
// SetSize is called which we do dynamically in the RPC handler to avoid
// having to hook elaborate synchronization mechanisms through the CA config
// endpoint and config reload etc.
csrConcurrencyLimiter semaphore.Dynamic
}
// getCSRRateLimiterWithLimit returns a rate.Limiter with the desired limit set.
// It uses the shared server-wide limiter unless the limit has been changed in
// config or the limiter has not been setup yet in which case it just-in-time
// configures the new limiter. We assume that limit changes are relatively rare
// and that all callers (there is currently only one) use the same config value
// as the limit. There might be some flapping if there are multiple concurrent
// requests in flight at the time the config changes where A sees the new value
// and updates, B sees the old but then gets this lock second and changes back.
// Eventually though and very soon (once all current RPCs are complete) we are
// guaranteed to have the correct limit set by the next RPC that comes in so I
// assume this is fine. If we observe strange behavior because of it, we could
// add hysteresis that prevents changes too soon after a previous change but
// that seems unnecessary for now.
func (l *connectSignRateLimiter) getCSRRateLimiterWithLimit(limit rate.Limit) *rate.Limiter {
l.csrRateLimiterMu.RLock()
lim := l.csrRateLimiter
l.csrRateLimiterMu.RUnlock()
// If there is a current limiter with the same limit, return it. This should
// be the common case.
if lim != nil && lim.Limit() == limit {
return lim
}
// Need to change limiter, get write lock
l.csrRateLimiterMu.Lock()
defer l.csrRateLimiterMu.Unlock()
// No limiter yet, or limit changed in CA config, reconfigure a new limiter.
// We use burst of 1 for a hard limit. Note that either bursting or waiting is
// necessary to get expected behavior in fact of random arrival times, but we
// don't need both and we use Wait with a small delay to smooth noise. See
// https://github.com/banks/sim-rate-limit-backoff/blob/master/README.md.
l.csrRateLimiter = rate.NewLimiter(limit, 1)
return l.csrRateLimiter
}

View File

@ -505,6 +505,12 @@ func (c *Configurator) commonTLSConfig(verifyIncoming bool) *tls.Config {
cert = c.manual.cert
}
if cert == nil {
// the return value MUST not be nil but an empty certificate will be
// treated the same as having no client certificate
cert = &tls.Certificate{}
}
return cert, nil
}

View File

@ -649,7 +649,8 @@ func TestConfigurator_CommonTLSConfigGetClientCertificate(t *testing.T) {
cert, err := c.commonTLSConfig(false).GetClientCertificate(nil)
require.NoError(t, err)
require.Nil(t, cert)
require.NotNil(t, cert)
require.Empty(t, cert.Certificate)
c1, err := loadKeyPair("../test/key/something_expired.cer", "../test/key/something_expired.key")
require.NoError(t, err)