consul/agent/auto-config/tls.go

286 lines
7.8 KiB
Go
Raw Normal View History

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package autoconf
import (
"context"
"fmt"
"net"
"github.com/hashicorp/consul/agent/cache"
cachetype "github.com/hashicorp/consul/agent/cache-types"
"github.com/hashicorp/consul/agent/connect"
agent: remove agent cache dependency from service mesh leaf certificate management (#17075) * agent: remove agent cache dependency from service mesh leaf certificate management This extracts the leaf cert management from within the agent cache. This code was produced by the following process: 1. All tests in agent/cache, agent/cache-types, agent/auto-config, agent/consul/servercert were run at each stage. - The tests in agent matching .*Leaf were run at each stage. - The tests in agent/leafcert were run at each stage after they existed. 2. The former leaf cert Fetch implementation was extracted into a new package behind a "fake RPC" endpoint to make it look almost like all other cache type internals. 3. The old cache type was shimmed to use the fake RPC endpoint and generally cleaned up. 4. I selectively duplicated all of Get/Notify/NotifyCallback/Prepopulate from the agent/cache.Cache implementation over into the new package. This was renamed as leafcert.Manager. - Code that was irrelevant to the leaf cert type was deleted (inlining blocking=true, refresh=false) 5. Everything that used the leaf cert cache type (including proxycfg stuff) was shifted to use the leafcert.Manager instead. 6. agent/cache-types tests were moved and gently replumbed to execute as-is against a leafcert.Manager. 7. Inspired by some of the locking changes from derek's branch I split the fat lock into N+1 locks. 8. The waiter chan struct{} was eventually replaced with a singleflight.Group around cache updates, which was likely the biggest net structural change. 9. The awkward two layers or logic produced as a byproduct of marrying the agent cache management code with the leaf cert type code was slowly coalesced and flattened to remove confusion. 10. The .*Leaf tests from the agent package were copied and made to work directly against a leafcert.Manager to increase direct coverage. I have done a best effort attempt to port the previous leaf-cert cache type's tests over in spirit, as well as to take the e2e-ish tests in the agent package with Leaf in the test name and copy those into the agent/leafcert package to get more direct coverage, rather than coverage tangled up in the agent logic. There is no net-new test coverage, just coverage that was pushed around from elsewhere.
2023-06-13 15:54:45 +00:00
"github.com/hashicorp/consul/agent/leafcert"
"github.com/hashicorp/consul/agent/structs"
Protobuf Refactoring for Multi-Module Cleanliness (#16302) Protobuf Refactoring for Multi-Module Cleanliness This commit includes the following: Moves all packages that were within proto/ to proto/private Rewrites imports to account for the packages being moved Adds in buf.work.yaml to enable buf workspaces Names the proto-public buf module so that we can override the Go package imports within proto/buf.yaml Bumps the buf version dependency to 1.14.0 (I was trying out the version to see if it would get around an issue - it didn't but it also doesn't break things and it seemed best to keep up with the toolchain changes) Why: In the future we will need to consume other protobuf dependencies such as the Google HTTP annotations for openapi generation or grpc-gateway usage. There were some recent changes to have our own ratelimiting annotations. The two combined were not working when I was trying to use them together (attempting to rebase another branch) Buf workspaces should be the solution to the problem Buf workspaces means that each module will have generated Go code that embeds proto file names relative to the proto dir and not the top level repo root. This resulted in proto file name conflicts in the Go global protobuf type registry. The solution to that was to add in a private/ directory into the path within the proto/ directory. That then required rewriting all the imports. Is this safe? AFAICT yes The gRPC wire protocol doesn't seem to care about the proto file names (although the Go grpc code does tack on the proto file name as Metadata in the ServiceDesc) Other than imports, there were no changes to any generated code as a result of this.
2023-02-17 21:14:46 +00:00
"github.com/hashicorp/consul/proto/private/pbautoconf"
"github.com/hashicorp/consul/proto/private/pbconnect"
)
const (
// ID of the roots watch
rootsWatchID = "roots"
// ID of the leaf watch
leafWatchID = "leaf"
unknownTrustDomain = "unknown"
)
var (
defaultDNSSANs = []string{"localhost"}
defaultIPSANs = []net.IP{{127, 0, 0, 1}, net.ParseIP("::1")}
)
func extractPEMs(roots *structs.IndexedCARoots) []string {
var pems []string
for _, root := range roots.Roots {
pems = append(pems, root.RootCert)
}
return pems
}
// updateTLSFromResponse will update the TLS certificate and roots in the shared
// TLS configurator.
func (ac *AutoConfig) updateTLSFromResponse(resp *pbautoconf.AutoConfigResponse) error {
var pems []string
for _, root := range resp.GetCARoots().GetRoots() {
pems = append(pems, root.RootCert)
}
err := ac.acConfig.TLSConfigurator.UpdateAutoTLS(
resp.ExtraCACertificates,
pems,
resp.Certificate.GetCertPEM(),
resp.Certificate.GetPrivateKeyPEM(),
resp.Config.GetTLS().GetVerifyServerHostname(),
)
if err != nil {
return fmt.Errorf("Failed to update the TLS configurator with new certificates: %w", err)
}
return nil
}
func (ac *AutoConfig) setInitialTLSCertificates(certs *structs.SignedResponse) error {
if certs == nil {
return nil
}
if err := ac.populateCertificateCache(certs); err != nil {
return fmt.Errorf("error populating cache with certificates: %w", err)
}
connectCAPems := extractPEMs(&certs.ConnectCARoots)
err := ac.acConfig.TLSConfigurator.UpdateAutoTLS(
certs.ManualCARoots,
connectCAPems,
certs.IssuedCert.CertPEM,
certs.IssuedCert.PrivateKeyPEM,
certs.VerifyServerHostname,
)
if err != nil {
return fmt.Errorf("error updating TLS configurator with certificates: %w", err)
}
return nil
}
func (ac *AutoConfig) populateCertificateCache(certs *structs.SignedResponse) error {
cert, err := connect.ParseCert(certs.IssuedCert.CertPEM)
if err != nil {
return fmt.Errorf("Failed to parse certificate: %w", err)
}
// prepolutate roots cache
rootRes := cache.FetchResult{Value: &certs.ConnectCARoots, Index: certs.ConnectCARoots.QueryMeta.Index}
rootsReq := ac.caRootsRequest()
// getting the roots doesn't require a token so in order to potentially share the cache with another
if err := ac.acConfig.Cache.Prepopulate(cachetype.ConnectCARootName, rootRes, ac.config.Datacenter, structs.DefaultPeerKeyword, "", rootsReq.CacheInfo().Key); err != nil {
return err
}
leafReq := ac.leafCertRequest()
// prepolutate leaf cache
agent: remove agent cache dependency from service mesh leaf certificate management (#17075) * agent: remove agent cache dependency from service mesh leaf certificate management This extracts the leaf cert management from within the agent cache. This code was produced by the following process: 1. All tests in agent/cache, agent/cache-types, agent/auto-config, agent/consul/servercert were run at each stage. - The tests in agent matching .*Leaf were run at each stage. - The tests in agent/leafcert were run at each stage after they existed. 2. The former leaf cert Fetch implementation was extracted into a new package behind a "fake RPC" endpoint to make it look almost like all other cache type internals. 3. The old cache type was shimmed to use the fake RPC endpoint and generally cleaned up. 4. I selectively duplicated all of Get/Notify/NotifyCallback/Prepopulate from the agent/cache.Cache implementation over into the new package. This was renamed as leafcert.Manager. - Code that was irrelevant to the leaf cert type was deleted (inlining blocking=true, refresh=false) 5. Everything that used the leaf cert cache type (including proxycfg stuff) was shifted to use the leafcert.Manager instead. 6. agent/cache-types tests were moved and gently replumbed to execute as-is against a leafcert.Manager. 7. Inspired by some of the locking changes from derek's branch I split the fat lock into N+1 locks. 8. The waiter chan struct{} was eventually replaced with a singleflight.Group around cache updates, which was likely the biggest net structural change. 9. The awkward two layers or logic produced as a byproduct of marrying the agent cache management code with the leaf cert type code was slowly coalesced and flattened to remove confusion. 10. The .*Leaf tests from the agent package were copied and made to work directly against a leafcert.Manager to increase direct coverage. I have done a best effort attempt to port the previous leaf-cert cache type's tests over in spirit, as well as to take the e2e-ish tests in the agent package with Leaf in the test name and copy those into the agent/leafcert package to get more direct coverage, rather than coverage tangled up in the agent logic. There is no net-new test coverage, just coverage that was pushed around from elsewhere.
2023-06-13 15:54:45 +00:00
err = ac.acConfig.LeafCertManager.Prepopulate(
context.Background(),
leafReq.Key(),
certs.IssuedCert.RaftIndex.ModifyIndex,
&certs.IssuedCert,
connect.EncodeSigningKeyID(cert.AuthorityKeyId),
)
if err != nil {
return err
}
return nil
}
func (ac *AutoConfig) setupCertificateCacheWatches(ctx context.Context) (context.CancelFunc, error) {
notificationCtx, cancel := context.WithCancel(ctx)
rootsReq := ac.caRootsRequest()
err := ac.acConfig.Cache.Notify(notificationCtx, cachetype.ConnectCARootName, &rootsReq, rootsWatchID, ac.cacheUpdates)
if err != nil {
cancel()
return nil, err
}
leafReq := ac.leafCertRequest()
agent: remove agent cache dependency from service mesh leaf certificate management (#17075) * agent: remove agent cache dependency from service mesh leaf certificate management This extracts the leaf cert management from within the agent cache. This code was produced by the following process: 1. All tests in agent/cache, agent/cache-types, agent/auto-config, agent/consul/servercert were run at each stage. - The tests in agent matching .*Leaf were run at each stage. - The tests in agent/leafcert were run at each stage after they existed. 2. The former leaf cert Fetch implementation was extracted into a new package behind a "fake RPC" endpoint to make it look almost like all other cache type internals. 3. The old cache type was shimmed to use the fake RPC endpoint and generally cleaned up. 4. I selectively duplicated all of Get/Notify/NotifyCallback/Prepopulate from the agent/cache.Cache implementation over into the new package. This was renamed as leafcert.Manager. - Code that was irrelevant to the leaf cert type was deleted (inlining blocking=true, refresh=false) 5. Everything that used the leaf cert cache type (including proxycfg stuff) was shifted to use the leafcert.Manager instead. 6. agent/cache-types tests were moved and gently replumbed to execute as-is against a leafcert.Manager. 7. Inspired by some of the locking changes from derek's branch I split the fat lock into N+1 locks. 8. The waiter chan struct{} was eventually replaced with a singleflight.Group around cache updates, which was likely the biggest net structural change. 9. The awkward two layers or logic produced as a byproduct of marrying the agent cache management code with the leaf cert type code was slowly coalesced and flattened to remove confusion. 10. The .*Leaf tests from the agent package were copied and made to work directly against a leafcert.Manager to increase direct coverage. I have done a best effort attempt to port the previous leaf-cert cache type's tests over in spirit, as well as to take the e2e-ish tests in the agent package with Leaf in the test name and copy those into the agent/leafcert package to get more direct coverage, rather than coverage tangled up in the agent logic. There is no net-new test coverage, just coverage that was pushed around from elsewhere.
2023-06-13 15:54:45 +00:00
err = ac.acConfig.LeafCertManager.Notify(notificationCtx, &leafReq, leafWatchID, ac.cacheUpdates)
if err != nil {
cancel()
return nil, err
}
return cancel, nil
}
func (ac *AutoConfig) updateCARoots(roots *structs.IndexedCARoots) error {
switch {
case ac.config.AutoConfig.Enabled:
ac.Lock()
defer ac.Unlock()
var err error
2022-03-16 16:12:29 +00:00
ac.autoConfigResponse.CARoots, err = pbconnect.NewCARootsFromStructs(roots)
if err != nil {
return err
}
if err := ac.updateTLSFromResponse(ac.autoConfigResponse); err != nil {
return err
}
return ac.persistAutoConfig(ac.autoConfigResponse)
case ac.config.AutoEncryptTLS:
pems := extractPEMs(roots)
if err := ac.acConfig.TLSConfigurator.UpdateAutoTLSCA(pems); err != nil {
return fmt.Errorf("failed to update Connect CA certificates: %w", err)
}
return nil
default:
return nil
}
}
func (ac *AutoConfig) updateLeafCert(cert *structs.IssuedCert) error {
switch {
case ac.config.AutoConfig.Enabled:
ac.Lock()
defer ac.Unlock()
var err error
2022-03-16 16:12:29 +00:00
ac.autoConfigResponse.Certificate, err = pbconnect.NewIssuedCertFromStructs(cert)
if err != nil {
return err
}
if err := ac.updateTLSFromResponse(ac.autoConfigResponse); err != nil {
return err
}
return ac.persistAutoConfig(ac.autoConfigResponse)
case ac.config.AutoEncryptTLS:
if err := ac.acConfig.TLSConfigurator.UpdateAutoTLSCert(cert.CertPEM, cert.PrivateKeyPEM); err != nil {
return fmt.Errorf("failed to update the agent leaf cert: %w", err)
}
return nil
default:
return nil
}
}
func (ac *AutoConfig) caRootsRequest() structs.DCSpecificRequest {
return structs.DCSpecificRequest{Datacenter: ac.config.Datacenter}
}
agent: remove agent cache dependency from service mesh leaf certificate management (#17075) * agent: remove agent cache dependency from service mesh leaf certificate management This extracts the leaf cert management from within the agent cache. This code was produced by the following process: 1. All tests in agent/cache, agent/cache-types, agent/auto-config, agent/consul/servercert were run at each stage. - The tests in agent matching .*Leaf were run at each stage. - The tests in agent/leafcert were run at each stage after they existed. 2. The former leaf cert Fetch implementation was extracted into a new package behind a "fake RPC" endpoint to make it look almost like all other cache type internals. 3. The old cache type was shimmed to use the fake RPC endpoint and generally cleaned up. 4. I selectively duplicated all of Get/Notify/NotifyCallback/Prepopulate from the agent/cache.Cache implementation over into the new package. This was renamed as leafcert.Manager. - Code that was irrelevant to the leaf cert type was deleted (inlining blocking=true, refresh=false) 5. Everything that used the leaf cert cache type (including proxycfg stuff) was shifted to use the leafcert.Manager instead. 6. agent/cache-types tests were moved and gently replumbed to execute as-is against a leafcert.Manager. 7. Inspired by some of the locking changes from derek's branch I split the fat lock into N+1 locks. 8. The waiter chan struct{} was eventually replaced with a singleflight.Group around cache updates, which was likely the biggest net structural change. 9. The awkward two layers or logic produced as a byproduct of marrying the agent cache management code with the leaf cert type code was slowly coalesced and flattened to remove confusion. 10. The .*Leaf tests from the agent package were copied and made to work directly against a leafcert.Manager to increase direct coverage. I have done a best effort attempt to port the previous leaf-cert cache type's tests over in spirit, as well as to take the e2e-ish tests in the agent package with Leaf in the test name and copy those into the agent/leafcert package to get more direct coverage, rather than coverage tangled up in the agent logic. There is no net-new test coverage, just coverage that was pushed around from elsewhere.
2023-06-13 15:54:45 +00:00
func (ac *AutoConfig) leafCertRequest() leafcert.ConnectCALeafRequest {
return leafcert.ConnectCALeafRequest{
Datacenter: ac.config.Datacenter,
Agent: ac.config.NodeName,
DNSSAN: ac.getDNSSANs(),
IPSAN: ac.getIPSANs(),
Token: ac.acConfig.Tokens.AgentToken(),
EnterpriseMeta: *structs.NodeEnterpriseMetaInPartition(ac.config.PartitionOrEmpty()),
}
}
// generateCSR will generate a CSR for an Agent certificate. This should
// be sent along with the AutoConfig.InitialConfiguration RPC or the
// AutoEncrypt.Sign RPC. The generated CSR does NOT have a real trust domain
// as when generating this we do not yet have the CA roots. The server will
// update the trust domain for us though.
func (ac *AutoConfig) generateCSR() (csr string, key string, err error) {
// We don't provide the correct host here, because we don't know any
// better at this point. Apart from the domain, we would need the
// ClusterID, which we don't have. This is why we go with
// unknownTrustDomain the first time. Subsequent CSRs will have the
// correct TrustDomain.
id := &connect.SpiffeIDAgent{
// will be replaced
Host: unknownTrustDomain,
Datacenter: ac.config.Datacenter,
Agent: ac.config.NodeName,
Partition: ac.config.PartitionOrDefault(),
}
caConfig, err := ac.config.ConnectCAConfiguration()
if err != nil {
return "", "", fmt.Errorf("Cannot generate CSR: %w", err)
}
conf, err := caConfig.GetCommonConfig()
if err != nil {
return "", "", fmt.Errorf("Failed to load common CA configuration: %w", err)
}
if conf.PrivateKeyType == "" {
conf.PrivateKeyType = connect.DefaultPrivateKeyType
}
if conf.PrivateKeyBits == 0 {
conf.PrivateKeyBits = connect.DefaultPrivateKeyBits
}
// Create a new private key
pk, pkPEM, err := connect.GeneratePrivateKeyWithConfig(conf.PrivateKeyType, conf.PrivateKeyBits)
if err != nil {
return "", "", fmt.Errorf("Failed to generate private key: %w", err)
}
dnsNames := ac.getDNSSANs()
ipAddresses := ac.getIPSANs()
// Create a CSR.
csr, err = connect.CreateCSR(id, pk, dnsNames, ipAddresses)
if err != nil {
return "", "", err
}
return csr, pkPEM, nil
}
func (ac *AutoConfig) getDNSSANs() []string {
sans := defaultDNSSANs
switch {
case ac.config.AutoConfig.Enabled:
sans = append(sans, ac.config.AutoConfig.DNSSANs...)
case ac.config.AutoEncryptTLS:
sans = append(sans, ac.config.AutoEncryptDNSSAN...)
}
return sans
}
func (ac *AutoConfig) getIPSANs() []net.IP {
sans := defaultIPSANs
switch {
case ac.config.AutoConfig.Enabled:
sans = append(sans, ac.config.AutoConfig.IPSANs...)
case ac.config.AutoEncryptTLS:
sans = append(sans, ac.config.AutoEncryptIPSAN...)
}
return sans
}