consul/tlsutil/config.go

996 lines
30 KiB
Go
Raw Normal View History

package tlsutil
import (
"crypto/tls"
"crypto/x509"
"fmt"
"io/ioutil"
"net"
"os"
"path/filepath"
2020-02-19 23:22:31 +01:00
"sort"
2015-05-11 16:05:39 -07:00
"strings"
"sync"
"time"
2020-02-19 23:22:31 +01:00
"github.com/hashicorp/consul/logging"
"github.com/hashicorp/go-hclog"
)
wan federation via mesh gateways (#6884) This is like a Möbius strip of code due to the fact that low-level components (serf/memberlist) are connected to high-level components (the catalog and mesh-gateways) in a twisty maze of references which make it hard to dive into. With that in mind here's a high level summary of what you'll find in the patch: There are several distinct chunks of code that are affected: * new flags and config options for the server * retry join WAN is slightly different * retry join code is shared to discover primary mesh gateways from secondary datacenters * because retry join logic runs in the *agent* and the results of that operation for primary mesh gateways are needed in the *server* there are some methods like `RefreshPrimaryGatewayFallbackAddresses` that must occur at multiple layers of abstraction just to pass the data down to the right layer. * new cache type `FederationStateListMeshGatewaysName` for use in `proxycfg/xds` layers * the function signature for RPC dialing picked up a new required field (the node name of the destination) * several new RPCs for manipulating a FederationState object: `FederationState:{Apply,Get,List,ListMeshGateways}` * 3 read-only internal APIs for debugging use to invoke those RPCs from curl * raft and fsm changes to persist these FederationStates * replication for FederationStates as they are canonically stored in the Primary and replicated to the Secondaries. * a special derivative of anti-entropy that runs in secondaries to snapshot their local mesh gateway `CheckServiceNodes` and sync them into their upstream FederationState in the primary (this works in conjunction with the replication to distribute addresses for all mesh gateways in all DCs to all other DCs) * a "gateway locator" convenience object to make use of this data to choose the addresses of gateways to use for any given RPC or gossip operation to a remote DC. This gets data from the "retry join" logic in the agent and also directly calls into the FSM. * RPC (`:8300`) on the server sniffs the first byte of a new connection to determine if it's actually doing native TLS. If so it checks the ALPN header for protocol determination (just like how the existing system uses the type-byte marker). * 2 new kinds of protocols are exclusively decoded via this native TLS mechanism: one for ferrying "packet" operations (udp-like) from the gossip layer and one for "stream" operations (tcp-like). The packet operations re-use sockets (using length-prefixing) to cut down on TLS re-negotiation overhead. * the server instances specially wrap the `memberlist.NetTransport` when running with gateway federation enabled (in a `wanfed.Transport`). The general gist is that if it tries to dial a node in the SAME datacenter (deduced by looking at the suffix of the node name) there is no change. If dialing a DIFFERENT datacenter it is wrapped up in a TLS+ALPN blob and sent through some mesh gateways to eventually end up in a server's :8300 port. * a new flag when launching a mesh gateway via `consul connect envoy` to indicate that the servers are to be exposed. This sets a special service meta when registering the gateway into the catalog. * `proxycfg/xds` notice this metadata blob to activate additional watches for the FederationState objects as well as the location of all of the consul servers in that datacenter. * `xds:` if the extra metadata is in place additional clusters are defined in a DC to bulk sink all traffic to another DC's gateways. For the current datacenter we listen on a wildcard name (`server.<dc>.consul`) that load balances all servers as well as one mini-cluster per node (`<node>.server.<dc>.consul`) * the `consul tls cert create` command got a new flag (`-node`) to help create an additional SAN in certs that can be used with this flavor of federation.
2020-03-09 15:59:02 -05:00
// ALPNWrapper is a function that is used to wrap a non-TLS connection and
// returns an appropriate TLS connection or error. This taks a datacenter and
// node name as argument to configure the desired SNI value and the desired
// next proto for configuring ALPN.
type ALPNWrapper func(dc, nodeName, alpnProto string, conn net.Conn) (net.Conn, error)
// DCWrapper is a function that is used to wrap a non-TLS connection
// and returns an appropriate TLS connection or error. This takes
// a datacenter as an argument.
type DCWrapper func(dc string, conn net.Conn) (net.Conn, error)
// Wrapper is a variant of DCWrapper, where the DC is provided as
// a constant value. This is usually done by currying DCWrapper.
type Wrapper func(conn net.Conn) (net.Conn, error)
2015-05-08 15:54:42 -07:00
2017-02-01 15:52:04 -05:00
// TLSLookup maps the tls_min_version configuration to the internal value
var TLSLookup = map[string]uint16{
"": tls.VersionTLS10, // default in golang
2017-02-01 15:52:04 -05:00
"tls10": tls.VersionTLS10,
"tls11": tls.VersionTLS11,
"tls12": tls.VersionTLS12,
2020-02-19 23:22:31 +01:00
"tls13": tls.VersionTLS13,
2017-02-01 15:52:04 -05:00
}
2020-02-19 23:22:31 +01:00
// TLSVersions has all the keys from the map above.
var TLSVersions = strings.Join(tlsVersions(), ", ")
// Config used to create tls.Config
type Config struct {
// VerifyIncoming is used to verify the authenticity of incoming
// connections. This means that TCP requests are forbidden, only
// allowing for TLS. TLS connections must match a provided certificate
// authority. This can be used to force client auth.
VerifyIncoming bool
// VerifyIncomingRPC is used to verify the authenticity of incoming RPC
// connections. This means that TCP requests are forbidden, only
// allowing for TLS. TLS connections must match a provided certificate
// authority. This can be used to force client auth.
VerifyIncomingRPC bool
// VerifyIncomingHTTPS is used to verify the authenticity of incoming
// HTTPS connections. This means that TCP requests are forbidden, only
// allowing for TLS. TLS connections must match a provided certificate
// authority. This can be used to force client auth.
VerifyIncomingHTTPS bool
// VerifyOutgoing is used to verify the authenticity of outgoing
// connections. This means that TLS requests are used, and TCP
// requests are not made. TLS connections must match a provided
// certificate authority. This is used to verify authenticity of server
// nodes.
VerifyOutgoing bool
// VerifyServerHostname is used to enable hostname verification of
// servers. This ensures that the certificate presented is valid for
// server.<datacenter>.<domain>. This prevents a compromised client
// from being restarted as a server, and then intercepting request
// traffic as well as being added as a raft peer. This should be
// enabled by default with VerifyOutgoing, but for legacy reasons we
// cannot break existing clients.
VerifyServerHostname bool
// UseTLS is used to enable outgoing TLS connections to Consul servers.
UseTLS bool
// CAFile is a path to a certificate authority file. This is used with
// VerifyIncoming or VerifyOutgoing to verify the TLS connection.
CAFile string
// CAPath is a path to a directory containing certificate authority
// files. This is used with VerifyIncoming or VerifyOutgoing to verify
// the TLS connection.
CAPath string
// CertFile is used to provide a TLS certificate that is used for
// serving TLS connections. Must be provided to serve TLS connections.
CertFile string
// KeyFile is used to provide a TLS key that is used for serving TLS
// connections. Must be provided to serve TLS connections.
KeyFile string
// Node name is the name we use to advertise. Defaults to hostname.
NodeName string
// ServerName is used with the TLS certificate to ensure the name we
// provide matches the certificate
ServerName string
// Domain is the Consul TLD being used. Defaults to "consul."
Domain string
2017-02-01 15:52:04 -05:00
// TLSMinVersion is the minimum accepted TLS version that can be used.
TLSMinVersion string
// CipherSuites is the list of TLS cipher suites to use.
CipherSuites []uint16
// PreferServerCipherSuites specifies whether to prefer the server's
// ciphersuite over the client ciphersuites.
PreferServerCipherSuites bool
// EnableAgentTLSForChecks is used to apply the agent's TLS settings in
// order to configure the HTTP client used for health checks. Enabling
// this allows HTTP checks to present a client certificate and verify
// the server using the same TLS configuration as the agent (CA, cert,
// and key).
EnableAgentTLSForChecks bool
// AutoEncryptTLS opts the agent into provisioning agent
// TLS certificates.
AutoEncryptTLS bool
}
2020-02-19 23:22:31 +01:00
func tlsVersions() []string {
versions := []string{}
for v := range TLSLookup {
if v != "" {
versions = append(versions, v)
}
}
sort.Strings(versions)
return versions
}
// KeyPair is used to open and parse a certificate and key file
func (c *Config) KeyPair() (*tls.Certificate, error) {
return loadKeyPair(c.CertFile, c.KeyFile)
}
// SpecificDC is used to invoke a static datacenter
// and turns a DCWrapper into a Wrapper type.
func SpecificDC(dc string, tlsWrap DCWrapper) Wrapper {
if tlsWrap == nil {
return nil
}
return func(conn net.Conn) (net.Conn, error) {
return tlsWrap(dc, conn)
}
}
type autoEncrypt struct {
manualCAPems []string
connectCAPems []string
cert *tls.Certificate
verifyServerHostname bool
}
func (a *autoEncrypt) caPems() []string {
return append(a.manualCAPems, a.connectCAPems...)
}
type manual struct {
caPems []string
cert *tls.Certificate
}
// Configurator holds a Config and is responsible for generating all the
// *tls.Config necessary for Consul. Except the one in the api package.
type Configurator struct {
sync.RWMutex
base *Config
autoEncrypt *autoEncrypt
manual *manual
peerDatacenterUseTLS map[string]bool
caPool *x509.CertPool
logger hclog.Logger
version int
}
// NewConfigurator creates a new Configurator and sets the provided
// configuration.
func NewConfigurator(config Config, logger hclog.Logger) (*Configurator, error) {
if logger == nil {
logger = hclog.New(&hclog.LoggerOptions{
Level: hclog.Debug,
})
}
c := &Configurator{
logger: logger.Named(logging.TLSUtil),
manual: &manual{},
autoEncrypt: &autoEncrypt{},
peerDatacenterUseTLS: map[string]bool{},
}
err := c.Update(config)
if err != nil {
return nil, err
}
return c, nil
}
// CAPems returns the currently loaded CAs in PEM format.
func (c *Configurator) CAPems() []string {
c.RLock()
defer c.RUnlock()
return append(c.manual.caPems, c.autoEncrypt.caPems()...)
}
// ManualCAPems returns the currently loaded CAs in PEM format.
func (c *Configurator) ManualCAPems() []string {
c.RLock()
defer c.RUnlock()
return c.manual.caPems
}
// Update updates the internal configuration which is used to generate
// *tls.Config.
// This function acquires a write lock because it writes the new config.
func (c *Configurator) Update(config Config) error {
c.Lock()
// order of defers matters because log acquires a RLock()
defer c.log("Update")
defer c.Unlock()
cert, err := loadKeyPair(config.CertFile, config.KeyFile)
if err != nil {
return err
}
pems, err := loadCAs(config.CAFile, config.CAPath)
if err != nil {
return err
}
pool, err := pool(append(pems, c.autoEncrypt.caPems()...))
if err != nil {
return err
}
if err = c.check(config, pool, cert); err != nil {
return err
}
c.base = &config
c.manual.cert = cert
c.manual.caPems = pems
c.caPool = pool
c.version++
return nil
}
// UpdateAutoEncryptCA updates the autoEncrypt.caPems. This is supposed to be called
// from the server in order to be able to accept TLS connections with TLS
// certificates.
// Or it is being called on the client side when CA changes are detected.
func (c *Configurator) UpdateAutoEncryptCA(connectCAPems []string) error {
c.Lock()
// order of defers matters because log acquires a RLock()
defer c.log("UpdateAutoEncryptCA")
defer c.Unlock()
pool, err := pool(append(c.manual.caPems, append(c.autoEncrypt.manualCAPems, connectCAPems...)...))
if err != nil {
c.RUnlock()
return err
}
if err = c.check(*c.base, pool, c.manual.cert); err != nil {
c.RUnlock()
return err
}
c.autoEncrypt.connectCAPems = connectCAPems
c.caPool = pool
c.version++
return nil
}
// UpdateAutoEncryptCert
func (c *Configurator) UpdateAutoEncryptCert(pub, priv string) error {
// order of defers matters because log acquires a RLock()
defer c.log("UpdateAutoEncryptCert")
cert, err := tls.X509KeyPair([]byte(pub), []byte(priv))
if err != nil {
return fmt.Errorf("Failed to load cert/key pair: %v", err)
}
c.Lock()
defer c.Unlock()
c.autoEncrypt.cert = &cert
c.version++
return nil
}
// UpdateAutoEncrypt sets everything under autoEncrypt. This is being called on the
// client when it received its cert from AutoEncrypt endpoint.
func (c *Configurator) UpdateAutoEncrypt(manualCAPems, connectCAPems []string, pub, priv string, verifyServerHostname bool) error {
// order of defers matters because log acquires a RLock()
defer c.log("UpdateAutoEncrypt")
cert, err := tls.X509KeyPair([]byte(pub), []byte(priv))
if err != nil {
return fmt.Errorf("Failed to load cert/key pair: %v", err)
}
c.Lock()
defer c.Unlock()
pool, err := pool(append(c.manual.caPems, append(manualCAPems, connectCAPems...)...))
if err != nil {
return err
}
c.autoEncrypt.manualCAPems = manualCAPems
c.autoEncrypt.connectCAPems = connectCAPems
c.autoEncrypt.cert = &cert
c.caPool = pool
c.autoEncrypt.verifyServerHostname = verifyServerHostname
c.version++
return nil
}
func (c *Configurator) UpdateAreaPeerDatacenterUseTLS(peerDatacenter string, useTLS bool) {
c.Lock()
defer c.Unlock()
c.version++
c.peerDatacenterUseTLS[peerDatacenter] = useTLS
}
func (c *Configurator) getAreaForPeerDatacenterUseTLS(peerDatacenter string) bool {
c.RLock()
defer c.RUnlock()
if v, ok := c.peerDatacenterUseTLS[peerDatacenter]; ok {
return v
}
return true
}
func (c *Configurator) Base() Config {
c.RLock()
defer c.RUnlock()
return *c.base
}
func pool(pems []string) (*x509.CertPool, error) {
pool := x509.NewCertPool()
for _, pem := range pems {
if !pool.AppendCertsFromPEM([]byte(pem)) {
return nil, fmt.Errorf("Couldn't parse PEM %s", pem)
}
}
if len(pool.Subjects()) == 0 {
return nil, nil
}
return pool, nil
}
func (c *Configurator) check(config Config, pool *x509.CertPool, cert *tls.Certificate) error {
// Check if a minimum TLS version was set
if config.TLSMinVersion != "" {
if _, ok := TLSLookup[config.TLSMinVersion]; !ok {
2020-02-19 23:22:31 +01:00
return fmt.Errorf("TLSMinVersion: value %s not supported, please specify one of [%s]", config.TLSMinVersion, TLSVersions)
}
}
// Ensure we have a CA if VerifyOutgoing is set
if config.VerifyOutgoing && pool == nil {
return fmt.Errorf("VerifyOutgoing set, and no CA certificate provided!")
}
// Ensure we have a CA and cert if VerifyIncoming is set
if config.anyVerifyIncoming() {
autoEncryptMsg := " AutoEncrypt only secures the connection between client and server and doesn't affect incoming connections on the client."
if pool == nil {
errMsg := "VerifyIncoming set, and no CA certificate provided!"
if config.AutoEncryptTLS {
errMsg += autoEncryptMsg
}
return fmt.Errorf(errMsg)
}
if cert == nil {
errMsg := "VerifyIncoming set, and no Cert/Key pair provided!"
if config.AutoEncryptTLS {
errMsg += autoEncryptMsg
}
return fmt.Errorf(errMsg)
}
}
return nil
}
func (c Config) anyVerifyIncoming() bool {
return c.baseVerifyIncoming() || c.VerifyIncomingRPC || c.VerifyIncomingHTTPS
}
func (c Config) verifyIncomingRPC() bool {
return c.baseVerifyIncoming() || c.VerifyIncomingRPC
}
func (c Config) verifyIncomingHTTPS() bool {
return c.baseVerifyIncoming() || c.VerifyIncomingHTTPS
}
func (c *Config) baseVerifyIncoming() bool {
return c.VerifyIncoming
}
func loadKeyPair(certFile, keyFile string) (*tls.Certificate, error) {
if certFile == "" || keyFile == "" {
return nil, nil
}
cert, err := tls.LoadX509KeyPair(certFile, keyFile)
if err != nil {
return nil, fmt.Errorf("Failed to load cert/key pair: %v", err)
}
return &cert, nil
}
func loadCAs(caFile, caPath string) ([]string, error) {
if caFile == "" && caPath == "" {
return nil, nil
}
pems := []string{}
readFn := func(path string) error {
pem, err := ioutil.ReadFile(path)
if err != nil {
return fmt.Errorf("Error loading from %s: %s", path, err)
}
pems = append(pems, string(pem))
return nil
}
walkFn := func(path string, info os.FileInfo, err error) error {
if err != nil {
return err
}
if !info.IsDir() {
if err := readFn(path); err != nil {
return err
}
}
return nil
}
if caFile != "" {
err := readFn(caFile)
if err != nil {
return pems, err
}
} else if caPath != "" {
err := filepath.Walk(caPath, walkFn)
if err != nil {
return pems, err
}
if len(pems) == 0 {
return pems, fmt.Errorf("Error loading from CAPath: no CAs found")
}
}
return pems, nil
}
// commonTLSConfig generates a *tls.Config from the base configuration the
// Configurator has. It accepts an additional flag in case a config is needed
// for incoming TLS connections.
// This function acquires a read lock because it reads from the config.
func (c *Configurator) commonTLSConfig(verifyIncoming bool) *tls.Config {
// this needs to be outside of RLock because it acquires an RLock itself
verifyServerHostname := c.VerifyServerHostname()
c.RLock()
defer c.RUnlock()
tlsConfig := &tls.Config{
InsecureSkipVerify: !verifyServerHostname,
}
// Set the cipher suites
if len(c.base.CipherSuites) != 0 {
tlsConfig.CipherSuites = c.base.CipherSuites
}
tlsConfig.PreferServerCipherSuites = c.base.PreferServerCipherSuites
// GetCertificate is used when acting as a server and responding to
// client requests. Default to the manually configured cert, but allow
// autoEncrypt cert too so that a client can encrypt incoming
// connections without having a manual cert configured.
tlsConfig.GetCertificate = func(*tls.ClientHelloInfo) (*tls.Certificate, error) {
return c.Cert(), nil
}
// GetClientCertificate is used when acting as a client and responding
// to a server requesting a certificate. Return the autoEncrypt certificate
// if possible, otherwise default to the manually provisioned one.
tlsConfig.GetClientCertificate = func(*tls.CertificateRequestInfo) (*tls.Certificate, error) {
cert := c.autoEncrypt.cert
if cert == nil {
cert = c.manual.cert
}
if cert == nil {
// the return value MUST not be nil but an empty certificate will be
// treated the same as having no client certificate
cert = &tls.Certificate{}
}
return cert, nil
}
tlsConfig.ClientCAs = c.caPool
tlsConfig.RootCAs = c.caPool
// This is possible because TLSLookup also contains "" with golang's
// default (tls10). And because the initial check makes sure the
// version correctly matches.
tlsConfig.MinVersion = TLSLookup[c.base.TLSMinVersion]
// Set ClientAuth if necessary
if verifyIncoming {
tlsConfig.ClientAuth = tls.RequireAndVerifyClientCert
}
return tlsConfig
}
// This function acquires a read lock because it reads from the config.
func (c *Configurator) Cert() *tls.Certificate {
c.RLock()
defer c.RUnlock()
cert := c.manual.cert
if cert == nil {
cert = c.autoEncrypt.cert
}
return cert
}
// This function acquires a read lock because it reads from the config.
func (c *Configurator) VerifyIncomingRPC() bool {
c.RLock()
defer c.RUnlock()
return c.base.verifyIncomingRPC()
}
// This function acquires a read lock because it reads from the config.
func (c *Configurator) outgoingRPCTLSDisabled() bool {
c.RLock()
defer c.RUnlock()
// if AutoEncrypt enabled, always use TLS
if c.base.AutoEncryptTLS {
return false
}
// if CAs are provided or VerifyOutgoing is set, use TLS
if c.base.VerifyOutgoing {
return false
}
return true
}
wan federation via mesh gateways (#6884) This is like a Möbius strip of code due to the fact that low-level components (serf/memberlist) are connected to high-level components (the catalog and mesh-gateways) in a twisty maze of references which make it hard to dive into. With that in mind here's a high level summary of what you'll find in the patch: There are several distinct chunks of code that are affected: * new flags and config options for the server * retry join WAN is slightly different * retry join code is shared to discover primary mesh gateways from secondary datacenters * because retry join logic runs in the *agent* and the results of that operation for primary mesh gateways are needed in the *server* there are some methods like `RefreshPrimaryGatewayFallbackAddresses` that must occur at multiple layers of abstraction just to pass the data down to the right layer. * new cache type `FederationStateListMeshGatewaysName` for use in `proxycfg/xds` layers * the function signature for RPC dialing picked up a new required field (the node name of the destination) * several new RPCs for manipulating a FederationState object: `FederationState:{Apply,Get,List,ListMeshGateways}` * 3 read-only internal APIs for debugging use to invoke those RPCs from curl * raft and fsm changes to persist these FederationStates * replication for FederationStates as they are canonically stored in the Primary and replicated to the Secondaries. * a special derivative of anti-entropy that runs in secondaries to snapshot their local mesh gateway `CheckServiceNodes` and sync them into their upstream FederationState in the primary (this works in conjunction with the replication to distribute addresses for all mesh gateways in all DCs to all other DCs) * a "gateway locator" convenience object to make use of this data to choose the addresses of gateways to use for any given RPC or gossip operation to a remote DC. This gets data from the "retry join" logic in the agent and also directly calls into the FSM. * RPC (`:8300`) on the server sniffs the first byte of a new connection to determine if it's actually doing native TLS. If so it checks the ALPN header for protocol determination (just like how the existing system uses the type-byte marker). * 2 new kinds of protocols are exclusively decoded via this native TLS mechanism: one for ferrying "packet" operations (udp-like) from the gossip layer and one for "stream" operations (tcp-like). The packet operations re-use sockets (using length-prefixing) to cut down on TLS re-negotiation overhead. * the server instances specially wrap the `memberlist.NetTransport` when running with gateway federation enabled (in a `wanfed.Transport`). The general gist is that if it tries to dial a node in the SAME datacenter (deduced by looking at the suffix of the node name) there is no change. If dialing a DIFFERENT datacenter it is wrapped up in a TLS+ALPN blob and sent through some mesh gateways to eventually end up in a server's :8300 port. * a new flag when launching a mesh gateway via `consul connect envoy` to indicate that the servers are to be exposed. This sets a special service meta when registering the gateway into the catalog. * `proxycfg/xds` notice this metadata blob to activate additional watches for the FederationState objects as well as the location of all of the consul servers in that datacenter. * `xds:` if the extra metadata is in place additional clusters are defined in a DC to bulk sink all traffic to another DC's gateways. For the current datacenter we listen on a wildcard name (`server.<dc>.consul`) that load balances all servers as well as one mini-cluster per node (`<node>.server.<dc>.consul`) * the `consul tls cert create` command got a new flag (`-node`) to help create an additional SAN in certs that can be used with this flavor of federation.
2020-03-09 15:59:02 -05:00
func (c *Configurator) MutualTLSCapable() bool {
return c.mutualTLSCapable()
}
// This function acquires a read lock because it reads from the config.
func (c *Configurator) mutualTLSCapable() bool {
c.RLock()
defer c.RUnlock()
return c.caPool != nil && (c.autoEncrypt.cert != nil || c.manual.cert != nil)
}
// This function acquires a read lock because it reads from the config.
func (c *Configurator) verifyOutgoing() bool {
c.RLock()
defer c.RUnlock()
// If AutoEncryptTLS is enabled and there is a CA, then verify
// outgoing.
if c.base.AutoEncryptTLS && c.caPool != nil {
return true
}
return c.base.VerifyOutgoing
}
wan federation via mesh gateways (#6884) This is like a Möbius strip of code due to the fact that low-level components (serf/memberlist) are connected to high-level components (the catalog and mesh-gateways) in a twisty maze of references which make it hard to dive into. With that in mind here's a high level summary of what you'll find in the patch: There are several distinct chunks of code that are affected: * new flags and config options for the server * retry join WAN is slightly different * retry join code is shared to discover primary mesh gateways from secondary datacenters * because retry join logic runs in the *agent* and the results of that operation for primary mesh gateways are needed in the *server* there are some methods like `RefreshPrimaryGatewayFallbackAddresses` that must occur at multiple layers of abstraction just to pass the data down to the right layer. * new cache type `FederationStateListMeshGatewaysName` for use in `proxycfg/xds` layers * the function signature for RPC dialing picked up a new required field (the node name of the destination) * several new RPCs for manipulating a FederationState object: `FederationState:{Apply,Get,List,ListMeshGateways}` * 3 read-only internal APIs for debugging use to invoke those RPCs from curl * raft and fsm changes to persist these FederationStates * replication for FederationStates as they are canonically stored in the Primary and replicated to the Secondaries. * a special derivative of anti-entropy that runs in secondaries to snapshot their local mesh gateway `CheckServiceNodes` and sync them into their upstream FederationState in the primary (this works in conjunction with the replication to distribute addresses for all mesh gateways in all DCs to all other DCs) * a "gateway locator" convenience object to make use of this data to choose the addresses of gateways to use for any given RPC or gossip operation to a remote DC. This gets data from the "retry join" logic in the agent and also directly calls into the FSM. * RPC (`:8300`) on the server sniffs the first byte of a new connection to determine if it's actually doing native TLS. If so it checks the ALPN header for protocol determination (just like how the existing system uses the type-byte marker). * 2 new kinds of protocols are exclusively decoded via this native TLS mechanism: one for ferrying "packet" operations (udp-like) from the gossip layer and one for "stream" operations (tcp-like). The packet operations re-use sockets (using length-prefixing) to cut down on TLS re-negotiation overhead. * the server instances specially wrap the `memberlist.NetTransport` when running with gateway federation enabled (in a `wanfed.Transport`). The general gist is that if it tries to dial a node in the SAME datacenter (deduced by looking at the suffix of the node name) there is no change. If dialing a DIFFERENT datacenter it is wrapped up in a TLS+ALPN blob and sent through some mesh gateways to eventually end up in a server's :8300 port. * a new flag when launching a mesh gateway via `consul connect envoy` to indicate that the servers are to be exposed. This sets a special service meta when registering the gateway into the catalog. * `proxycfg/xds` notice this metadata blob to activate additional watches for the FederationState objects as well as the location of all of the consul servers in that datacenter. * `xds:` if the extra metadata is in place additional clusters are defined in a DC to bulk sink all traffic to another DC's gateways. For the current datacenter we listen on a wildcard name (`server.<dc>.consul`) that load balances all servers as well as one mini-cluster per node (`<node>.server.<dc>.consul`) * the `consul tls cert create` command got a new flag (`-node`) to help create an additional SAN in certs that can be used with this flavor of federation.
2020-03-09 15:59:02 -05:00
func (c *Configurator) ServerSNI(dc, nodeName string) string {
// Strip the trailing '.' from the domain if any
domain := strings.TrimSuffix(c.domain(), ".")
if nodeName == "" || nodeName == "*" {
return "server." + dc + "." + domain
}
return nodeName + ".server." + dc + "." + domain
}
// This function acquires a read lock because it reads from the config.
func (c *Configurator) domain() string {
c.RLock()
defer c.RUnlock()
return c.base.Domain
}
// This function acquires a read lock because it reads from the config.
func (c *Configurator) verifyIncomingRPC() bool {
c.RLock()
defer c.RUnlock()
return c.base.verifyIncomingRPC()
}
// This function acquires a read lock because it reads from the config.
func (c *Configurator) verifyIncomingHTTPS() bool {
c.RLock()
defer c.RUnlock()
return c.base.verifyIncomingHTTPS()
}
// This function acquires a read lock because it reads from the config.
func (c *Configurator) enableAgentTLSForChecks() bool {
c.RLock()
defer c.RUnlock()
return c.base.EnableAgentTLSForChecks
}
// This function acquires a read lock because it reads from the config.
func (c *Configurator) serverNameOrNodeName() string {
c.RLock()
defer c.RUnlock()
if c.base.ServerName != "" {
return c.base.ServerName
}
return c.base.NodeName
}
2017-02-01 15:52:04 -05:00
// This function acquires a read lock because it reads from the config.
func (c *Configurator) VerifyServerHostname() bool {
c.RLock()
defer c.RUnlock()
return c.base.VerifyServerHostname || c.autoEncrypt.verifyServerHostname
}
// IncomingGRPCConfig generates a *tls.Config for incoming GRPC connections.
func (c *Configurator) IncomingGRPCConfig() *tls.Config {
c.log("IncomingGRPCConfig")
// false has the effect that this config doesn't require a client cert
// verification. This is because there is no verify_incoming_grpc
// configuration option. And using verify_incoming would be backwards
// incompatible, because even if it was set before, it didn't have an
// effect on the grpc server.
config := c.commonTLSConfig(false)
config.GetConfigForClient = func(*tls.ClientHelloInfo) (*tls.Config, error) {
return c.IncomingGRPCConfig(), nil
}
return config
}
// IncomingRPCConfig generates a *tls.Config for incoming RPC connections.
func (c *Configurator) IncomingRPCConfig() *tls.Config {
c.log("IncomingRPCConfig")
config := c.commonTLSConfig(c.verifyIncomingRPC())
config.GetConfigForClient = func(*tls.ClientHelloInfo) (*tls.Config, error) {
return c.IncomingRPCConfig(), nil
}
return config
}
wan federation via mesh gateways (#6884) This is like a Möbius strip of code due to the fact that low-level components (serf/memberlist) are connected to high-level components (the catalog and mesh-gateways) in a twisty maze of references which make it hard to dive into. With that in mind here's a high level summary of what you'll find in the patch: There are several distinct chunks of code that are affected: * new flags and config options for the server * retry join WAN is slightly different * retry join code is shared to discover primary mesh gateways from secondary datacenters * because retry join logic runs in the *agent* and the results of that operation for primary mesh gateways are needed in the *server* there are some methods like `RefreshPrimaryGatewayFallbackAddresses` that must occur at multiple layers of abstraction just to pass the data down to the right layer. * new cache type `FederationStateListMeshGatewaysName` for use in `proxycfg/xds` layers * the function signature for RPC dialing picked up a new required field (the node name of the destination) * several new RPCs for manipulating a FederationState object: `FederationState:{Apply,Get,List,ListMeshGateways}` * 3 read-only internal APIs for debugging use to invoke those RPCs from curl * raft and fsm changes to persist these FederationStates * replication for FederationStates as they are canonically stored in the Primary and replicated to the Secondaries. * a special derivative of anti-entropy that runs in secondaries to snapshot their local mesh gateway `CheckServiceNodes` and sync them into their upstream FederationState in the primary (this works in conjunction with the replication to distribute addresses for all mesh gateways in all DCs to all other DCs) * a "gateway locator" convenience object to make use of this data to choose the addresses of gateways to use for any given RPC or gossip operation to a remote DC. This gets data from the "retry join" logic in the agent and also directly calls into the FSM. * RPC (`:8300`) on the server sniffs the first byte of a new connection to determine if it's actually doing native TLS. If so it checks the ALPN header for protocol determination (just like how the existing system uses the type-byte marker). * 2 new kinds of protocols are exclusively decoded via this native TLS mechanism: one for ferrying "packet" operations (udp-like) from the gossip layer and one for "stream" operations (tcp-like). The packet operations re-use sockets (using length-prefixing) to cut down on TLS re-negotiation overhead. * the server instances specially wrap the `memberlist.NetTransport` when running with gateway federation enabled (in a `wanfed.Transport`). The general gist is that if it tries to dial a node in the SAME datacenter (deduced by looking at the suffix of the node name) there is no change. If dialing a DIFFERENT datacenter it is wrapped up in a TLS+ALPN blob and sent through some mesh gateways to eventually end up in a server's :8300 port. * a new flag when launching a mesh gateway via `consul connect envoy` to indicate that the servers are to be exposed. This sets a special service meta when registering the gateway into the catalog. * `proxycfg/xds` notice this metadata blob to activate additional watches for the FederationState objects as well as the location of all of the consul servers in that datacenter. * `xds:` if the extra metadata is in place additional clusters are defined in a DC to bulk sink all traffic to another DC's gateways. For the current datacenter we listen on a wildcard name (`server.<dc>.consul`) that load balances all servers as well as one mini-cluster per node (`<node>.server.<dc>.consul`) * the `consul tls cert create` command got a new flag (`-node`) to help create an additional SAN in certs that can be used with this flavor of federation.
2020-03-09 15:59:02 -05:00
// IncomingALPNRPCConfig generates a *tls.Config for incoming RPC connections
// directly using TLS with ALPN instead of the older byte-prefixed protocol.
func (c *Configurator) IncomingALPNRPCConfig(alpnProtos []string) *tls.Config {
c.log("IncomingALPNRPCConfig")
// Since the ALPN-RPC variation is indirectly exposed to the internet via
// mesh gateways we force mTLS and full server name verification.
config := c.commonTLSConfig(true)
config.InsecureSkipVerify = false
config.GetConfigForClient = func(*tls.ClientHelloInfo) (*tls.Config, error) {
return c.IncomingALPNRPCConfig(alpnProtos), nil
}
config.NextProtos = alpnProtos
return config
}
// IncomingInsecureRPCConfig means that it doesn't verify incoming even thought
// it might have been configured. This is only supposed to be used by the
// servers for the insecure RPC server. At the time of writing only the
// AutoEncrypt.Sign call is supported on that server. And it might be the only
// usecase ever.
func (c *Configurator) IncomingInsecureRPCConfig() *tls.Config {
c.log("IncomingInsecureRPCConfig")
config := c.commonTLSConfig(false)
config.GetConfigForClient = func(*tls.ClientHelloInfo) (*tls.Config, error) {
return c.IncomingInsecureRPCConfig(), nil
}
return config
}
// IncomingHTTPSConfig generates a *tls.Config for incoming HTTPS connections.
func (c *Configurator) IncomingHTTPSConfig() *tls.Config {
c.log("IncomingHTTPSConfig")
config := c.commonTLSConfig(c.verifyIncomingHTTPS())
config.NextProtos = []string{"h2", "http/1.1"}
config.GetConfigForClient = func(*tls.ClientHelloInfo) (*tls.Config, error) {
return c.IncomingHTTPSConfig(), nil
}
return config
}
// IncomingTLSConfig generates a *tls.Config for outgoing TLS connections for
// checks. This function is separated because there is an extra flag to
// consider for checks. EnableAgentTLSForChecks and InsecureSkipVerify has to
// be checked for checks.
func (c *Configurator) OutgoingTLSConfigForCheck(skipVerify bool) *tls.Config {
c.log("OutgoingTLSConfigForCheck")
if !c.enableAgentTLSForChecks() {
return &tls.Config{
InsecureSkipVerify: skipVerify,
}
}
config := c.commonTLSConfig(false)
config.InsecureSkipVerify = skipVerify
config.ServerName = c.serverNameOrNodeName()
return config
}
// OutgoingRPCConfig generates a *tls.Config for outgoing RPC connections. If
// there is a CA or VerifyOutgoing is set, a *tls.Config will be provided,
// otherwise we assume that no TLS should be used.
func (c *Configurator) OutgoingRPCConfig() *tls.Config {
c.log("OutgoingRPCConfig")
if c.outgoingRPCTLSDisabled() {
return nil
}
return c.commonTLSConfig(false)
}
wan federation via mesh gateways (#6884) This is like a Möbius strip of code due to the fact that low-level components (serf/memberlist) are connected to high-level components (the catalog and mesh-gateways) in a twisty maze of references which make it hard to dive into. With that in mind here's a high level summary of what you'll find in the patch: There are several distinct chunks of code that are affected: * new flags and config options for the server * retry join WAN is slightly different * retry join code is shared to discover primary mesh gateways from secondary datacenters * because retry join logic runs in the *agent* and the results of that operation for primary mesh gateways are needed in the *server* there are some methods like `RefreshPrimaryGatewayFallbackAddresses` that must occur at multiple layers of abstraction just to pass the data down to the right layer. * new cache type `FederationStateListMeshGatewaysName` for use in `proxycfg/xds` layers * the function signature for RPC dialing picked up a new required field (the node name of the destination) * several new RPCs for manipulating a FederationState object: `FederationState:{Apply,Get,List,ListMeshGateways}` * 3 read-only internal APIs for debugging use to invoke those RPCs from curl * raft and fsm changes to persist these FederationStates * replication for FederationStates as they are canonically stored in the Primary and replicated to the Secondaries. * a special derivative of anti-entropy that runs in secondaries to snapshot their local mesh gateway `CheckServiceNodes` and sync them into their upstream FederationState in the primary (this works in conjunction with the replication to distribute addresses for all mesh gateways in all DCs to all other DCs) * a "gateway locator" convenience object to make use of this data to choose the addresses of gateways to use for any given RPC or gossip operation to a remote DC. This gets data from the "retry join" logic in the agent and also directly calls into the FSM. * RPC (`:8300`) on the server sniffs the first byte of a new connection to determine if it's actually doing native TLS. If so it checks the ALPN header for protocol determination (just like how the existing system uses the type-byte marker). * 2 new kinds of protocols are exclusively decoded via this native TLS mechanism: one for ferrying "packet" operations (udp-like) from the gossip layer and one for "stream" operations (tcp-like). The packet operations re-use sockets (using length-prefixing) to cut down on TLS re-negotiation overhead. * the server instances specially wrap the `memberlist.NetTransport` when running with gateway federation enabled (in a `wanfed.Transport`). The general gist is that if it tries to dial a node in the SAME datacenter (deduced by looking at the suffix of the node name) there is no change. If dialing a DIFFERENT datacenter it is wrapped up in a TLS+ALPN blob and sent through some mesh gateways to eventually end up in a server's :8300 port. * a new flag when launching a mesh gateway via `consul connect envoy` to indicate that the servers are to be exposed. This sets a special service meta when registering the gateway into the catalog. * `proxycfg/xds` notice this metadata blob to activate additional watches for the FederationState objects as well as the location of all of the consul servers in that datacenter. * `xds:` if the extra metadata is in place additional clusters are defined in a DC to bulk sink all traffic to another DC's gateways. For the current datacenter we listen on a wildcard name (`server.<dc>.consul`) that load balances all servers as well as one mini-cluster per node (`<node>.server.<dc>.consul`) * the `consul tls cert create` command got a new flag (`-node`) to help create an additional SAN in certs that can be used with this flavor of federation.
2020-03-09 15:59:02 -05:00
// OutgoingALPNRPCConfig generates a *tls.Config for outgoing RPC connections
// directly using TLS with ALPN instead of the older byte-prefixed protocol.
// If there is a CA or VerifyOutgoing is set, a *tls.Config will be provided,
// otherwise we assume that no TLS should be used which completely disables the
// ALPN variation.
func (c *Configurator) OutgoingALPNRPCConfig() *tls.Config {
c.log("OutgoingALPNRPCConfig")
if !c.mutualTLSCapable() {
return nil // ultimately this will hard-fail as TLS is required
}
// Since the ALPN-RPC variation is indirectly exposed to the internet via
// mesh gateways we force mTLS and full server name verification.
config := c.commonTLSConfig(true)
config.InsecureSkipVerify = false
return config
}
// OutgoingRPCWrapper wraps the result of OutgoingRPCConfig in a DCWrapper. It
// decides if verify server hostname should be used.
func (c *Configurator) OutgoingRPCWrapper() DCWrapper {
c.log("OutgoingRPCWrapper")
// Generate the wrapper based on dc
return func(dc string, conn net.Conn) (net.Conn, error) {
if c.UseTLS(dc) {
return c.wrapTLSClient(dc, conn)
}
return conn, nil
}
}
func (c *Configurator) UseTLS(dc string) bool {
return !c.outgoingRPCTLSDisabled() && c.getAreaForPeerDatacenterUseTLS(dc)
}
wan federation via mesh gateways (#6884) This is like a Möbius strip of code due to the fact that low-level components (serf/memberlist) are connected to high-level components (the catalog and mesh-gateways) in a twisty maze of references which make it hard to dive into. With that in mind here's a high level summary of what you'll find in the patch: There are several distinct chunks of code that are affected: * new flags and config options for the server * retry join WAN is slightly different * retry join code is shared to discover primary mesh gateways from secondary datacenters * because retry join logic runs in the *agent* and the results of that operation for primary mesh gateways are needed in the *server* there are some methods like `RefreshPrimaryGatewayFallbackAddresses` that must occur at multiple layers of abstraction just to pass the data down to the right layer. * new cache type `FederationStateListMeshGatewaysName` for use in `proxycfg/xds` layers * the function signature for RPC dialing picked up a new required field (the node name of the destination) * several new RPCs for manipulating a FederationState object: `FederationState:{Apply,Get,List,ListMeshGateways}` * 3 read-only internal APIs for debugging use to invoke those RPCs from curl * raft and fsm changes to persist these FederationStates * replication for FederationStates as they are canonically stored in the Primary and replicated to the Secondaries. * a special derivative of anti-entropy that runs in secondaries to snapshot their local mesh gateway `CheckServiceNodes` and sync them into their upstream FederationState in the primary (this works in conjunction with the replication to distribute addresses for all mesh gateways in all DCs to all other DCs) * a "gateway locator" convenience object to make use of this data to choose the addresses of gateways to use for any given RPC or gossip operation to a remote DC. This gets data from the "retry join" logic in the agent and also directly calls into the FSM. * RPC (`:8300`) on the server sniffs the first byte of a new connection to determine if it's actually doing native TLS. If so it checks the ALPN header for protocol determination (just like how the existing system uses the type-byte marker). * 2 new kinds of protocols are exclusively decoded via this native TLS mechanism: one for ferrying "packet" operations (udp-like) from the gossip layer and one for "stream" operations (tcp-like). The packet operations re-use sockets (using length-prefixing) to cut down on TLS re-negotiation overhead. * the server instances specially wrap the `memberlist.NetTransport` when running with gateway federation enabled (in a `wanfed.Transport`). The general gist is that if it tries to dial a node in the SAME datacenter (deduced by looking at the suffix of the node name) there is no change. If dialing a DIFFERENT datacenter it is wrapped up in a TLS+ALPN blob and sent through some mesh gateways to eventually end up in a server's :8300 port. * a new flag when launching a mesh gateway via `consul connect envoy` to indicate that the servers are to be exposed. This sets a special service meta when registering the gateway into the catalog. * `proxycfg/xds` notice this metadata blob to activate additional watches for the FederationState objects as well as the location of all of the consul servers in that datacenter. * `xds:` if the extra metadata is in place additional clusters are defined in a DC to bulk sink all traffic to another DC's gateways. For the current datacenter we listen on a wildcard name (`server.<dc>.consul`) that load balances all servers as well as one mini-cluster per node (`<node>.server.<dc>.consul`) * the `consul tls cert create` command got a new flag (`-node`) to help create an additional SAN in certs that can be used with this flavor of federation.
2020-03-09 15:59:02 -05:00
// OutgoingALPNRPCWrapper wraps the result of OutgoingALPNRPCConfig in an
// ALPNWrapper. It configures all of the negotiation plumbing.
func (c *Configurator) OutgoingALPNRPCWrapper() ALPNWrapper {
c.log("OutgoingALPNRPCWrapper")
if !c.mutualTLSCapable() {
return nil
}
return func(dc, nodeName, alpnProto string, conn net.Conn) (net.Conn, error) {
return c.wrapALPNTLSClient(dc, nodeName, alpnProto, conn)
}
}
// AutoEncryptCertNotAfter returns NotAfter from the auto_encrypt cert. In case
// there is no cert, it will return a time in the past.
func (c *Configurator) AutoEncryptCertNotAfter() time.Time {
c.RLock()
defer c.RUnlock()
tlsCert := c.autoEncrypt.cert
if tlsCert == nil || tlsCert.Certificate == nil {
return time.Now().AddDate(0, 0, -1)
}
cert, err := x509.ParseCertificate(tlsCert.Certificate[0])
if err != nil {
return time.Now().AddDate(0, 0, -1)
}
return cert.NotAfter
}
// AutoEncryptCertExpired returns if the auto_encrypt cert is expired.
func (c *Configurator) AutoEncryptCertExpired() bool {
return c.AutoEncryptCertNotAfter().Before(time.Now())
}
// This function acquires a read lock because it reads from the config.
func (c *Configurator) log(name string) {
if c.logger != nil {
c.RLock()
defer c.RUnlock()
wan federation via mesh gateways (#6884) This is like a Möbius strip of code due to the fact that low-level components (serf/memberlist) are connected to high-level components (the catalog and mesh-gateways) in a twisty maze of references which make it hard to dive into. With that in mind here's a high level summary of what you'll find in the patch: There are several distinct chunks of code that are affected: * new flags and config options for the server * retry join WAN is slightly different * retry join code is shared to discover primary mesh gateways from secondary datacenters * because retry join logic runs in the *agent* and the results of that operation for primary mesh gateways are needed in the *server* there are some methods like `RefreshPrimaryGatewayFallbackAddresses` that must occur at multiple layers of abstraction just to pass the data down to the right layer. * new cache type `FederationStateListMeshGatewaysName` for use in `proxycfg/xds` layers * the function signature for RPC dialing picked up a new required field (the node name of the destination) * several new RPCs for manipulating a FederationState object: `FederationState:{Apply,Get,List,ListMeshGateways}` * 3 read-only internal APIs for debugging use to invoke those RPCs from curl * raft and fsm changes to persist these FederationStates * replication for FederationStates as they are canonically stored in the Primary and replicated to the Secondaries. * a special derivative of anti-entropy that runs in secondaries to snapshot their local mesh gateway `CheckServiceNodes` and sync them into their upstream FederationState in the primary (this works in conjunction with the replication to distribute addresses for all mesh gateways in all DCs to all other DCs) * a "gateway locator" convenience object to make use of this data to choose the addresses of gateways to use for any given RPC or gossip operation to a remote DC. This gets data from the "retry join" logic in the agent and also directly calls into the FSM. * RPC (`:8300`) on the server sniffs the first byte of a new connection to determine if it's actually doing native TLS. If so it checks the ALPN header for protocol determination (just like how the existing system uses the type-byte marker). * 2 new kinds of protocols are exclusively decoded via this native TLS mechanism: one for ferrying "packet" operations (udp-like) from the gossip layer and one for "stream" operations (tcp-like). The packet operations re-use sockets (using length-prefixing) to cut down on TLS re-negotiation overhead. * the server instances specially wrap the `memberlist.NetTransport` when running with gateway federation enabled (in a `wanfed.Transport`). The general gist is that if it tries to dial a node in the SAME datacenter (deduced by looking at the suffix of the node name) there is no change. If dialing a DIFFERENT datacenter it is wrapped up in a TLS+ALPN blob and sent through some mesh gateways to eventually end up in a server's :8300 port. * a new flag when launching a mesh gateway via `consul connect envoy` to indicate that the servers are to be exposed. This sets a special service meta when registering the gateway into the catalog. * `proxycfg/xds` notice this metadata blob to activate additional watches for the FederationState objects as well as the location of all of the consul servers in that datacenter. * `xds:` if the extra metadata is in place additional clusters are defined in a DC to bulk sink all traffic to another DC's gateways. For the current datacenter we listen on a wildcard name (`server.<dc>.consul`) that load balances all servers as well as one mini-cluster per node (`<node>.server.<dc>.consul`) * the `consul tls cert create` command got a new flag (`-node`) to help create an additional SAN in certs that can be used with this flavor of federation.
2020-03-09 15:59:02 -05:00
c.logger.Trace(name, "version", c.version)
2017-02-01 15:52:04 -05:00
}
}
// Wrap a net.Conn into a client tls connection, performing any
// additional verification as needed.
//
// As of go 1.3, crypto/tls only supports either doing no certificate
// verification, or doing full verification including of the peer's
// DNS name. For consul, we want to validate that the certificate is
// signed by a known CA, but because consul doesn't use DNS names for
// node names, we don't verify the certificate DNS names. Since go 1.3
// no longer supports this mode of operation, we have to do it
// manually.
func (c *Configurator) wrapTLSClient(dc string, conn net.Conn) (net.Conn, error) {
config := c.OutgoingRPCConfig()
verifyServerHostname := c.VerifyServerHostname()
verifyOutgoing := c.verifyOutgoing()
domain := c.domain()
if verifyServerHostname {
// Strip the trailing '.' from the domain if any
domain = strings.TrimSuffix(domain, ".")
config.ServerName = "server." + dc + "." + domain
}
tlsConn := tls.Client(conn, config)
// If crypto/tls is doing verification, there's no need to do
// our own.
if !config.InsecureSkipVerify {
return tlsConn, nil
}
// If verification is not turned on, don't do it.
if !verifyOutgoing {
return tlsConn, nil
}
err := tlsConn.Handshake()
if err != nil {
tlsConn.Close()
return nil, err
}
// The following is lightly-modified from the doFullHandshake
// method in crypto/tls's handshake_client.go.
opts := x509.VerifyOptions{
Roots: config.RootCAs,
CurrentTime: time.Now(),
DNSName: "",
Intermediates: x509.NewCertPool(),
}
certs := tlsConn.ConnectionState().PeerCertificates
for i, cert := range certs {
if i == 0 {
continue
}
opts.Intermediates.AddCert(cert)
}
_, err = certs[0].Verify(opts)
if err != nil {
tlsConn.Close()
return nil, err
}
return tlsConn, err
}
wan federation via mesh gateways (#6884) This is like a Möbius strip of code due to the fact that low-level components (serf/memberlist) are connected to high-level components (the catalog and mesh-gateways) in a twisty maze of references which make it hard to dive into. With that in mind here's a high level summary of what you'll find in the patch: There are several distinct chunks of code that are affected: * new flags and config options for the server * retry join WAN is slightly different * retry join code is shared to discover primary mesh gateways from secondary datacenters * because retry join logic runs in the *agent* and the results of that operation for primary mesh gateways are needed in the *server* there are some methods like `RefreshPrimaryGatewayFallbackAddresses` that must occur at multiple layers of abstraction just to pass the data down to the right layer. * new cache type `FederationStateListMeshGatewaysName` for use in `proxycfg/xds` layers * the function signature for RPC dialing picked up a new required field (the node name of the destination) * several new RPCs for manipulating a FederationState object: `FederationState:{Apply,Get,List,ListMeshGateways}` * 3 read-only internal APIs for debugging use to invoke those RPCs from curl * raft and fsm changes to persist these FederationStates * replication for FederationStates as they are canonically stored in the Primary and replicated to the Secondaries. * a special derivative of anti-entropy that runs in secondaries to snapshot their local mesh gateway `CheckServiceNodes` and sync them into their upstream FederationState in the primary (this works in conjunction with the replication to distribute addresses for all mesh gateways in all DCs to all other DCs) * a "gateway locator" convenience object to make use of this data to choose the addresses of gateways to use for any given RPC or gossip operation to a remote DC. This gets data from the "retry join" logic in the agent and also directly calls into the FSM. * RPC (`:8300`) on the server sniffs the first byte of a new connection to determine if it's actually doing native TLS. If so it checks the ALPN header for protocol determination (just like how the existing system uses the type-byte marker). * 2 new kinds of protocols are exclusively decoded via this native TLS mechanism: one for ferrying "packet" operations (udp-like) from the gossip layer and one for "stream" operations (tcp-like). The packet operations re-use sockets (using length-prefixing) to cut down on TLS re-negotiation overhead. * the server instances specially wrap the `memberlist.NetTransport` when running with gateway federation enabled (in a `wanfed.Transport`). The general gist is that if it tries to dial a node in the SAME datacenter (deduced by looking at the suffix of the node name) there is no change. If dialing a DIFFERENT datacenter it is wrapped up in a TLS+ALPN blob and sent through some mesh gateways to eventually end up in a server's :8300 port. * a new flag when launching a mesh gateway via `consul connect envoy` to indicate that the servers are to be exposed. This sets a special service meta when registering the gateway into the catalog. * `proxycfg/xds` notice this metadata blob to activate additional watches for the FederationState objects as well as the location of all of the consul servers in that datacenter. * `xds:` if the extra metadata is in place additional clusters are defined in a DC to bulk sink all traffic to another DC's gateways. For the current datacenter we listen on a wildcard name (`server.<dc>.consul`) that load balances all servers as well as one mini-cluster per node (`<node>.server.<dc>.consul`) * the `consul tls cert create` command got a new flag (`-node`) to help create an additional SAN in certs that can be used with this flavor of federation.
2020-03-09 15:59:02 -05:00
// Wrap a net.Conn into a client tls connection suitable for secure ALPN-RPC,
// performing any additional verification as needed.
func (c *Configurator) wrapALPNTLSClient(dc, nodeName, alpnProto string, conn net.Conn) (net.Conn, error) {
if dc == "" {
return nil, fmt.Errorf("cannot dial using ALPN-RPC without a target datacenter")
} else if nodeName == "" {
return nil, fmt.Errorf("cannot dial using ALPN-RPC without a target node")
} else if alpnProto == "" {
return nil, fmt.Errorf("cannot dial using ALPN-RPC without a target alpn protocol")
}
config := c.OutgoingALPNRPCConfig()
if config == nil {
return nil, fmt.Errorf("cannot dial via a mesh gateway when outgoing TLS is disabled")
}
// Since the ALPN-RPC variation is indirectly exposed to the internet via
// mesh gateways we force mTLS and full hostname validation (forcing
// verify_server_hostname and verify_outgoing to be effectively true).
config.ServerName = c.ServerSNI(dc, nodeName)
config.NextProtos = []string{alpnProto}
tlsConn := tls.Client(conn, config)
// NOTE: For this handshake to succeed the server must have key material
// for either "<nodename>.server.<datacenter>.<domain>" or
// "*.server.<datacenter>.<domain>" in addition to the
// "server.<datacenter>.<domain>" required for standard TLS'd RPC.
if err := tlsConn.Handshake(); err != nil {
tlsConn.Close()
return nil, err
}
if cs := tlsConn.ConnectionState(); !cs.NegotiatedProtocolIsMutual {
tlsConn.Close()
return nil, fmt.Errorf("could not negotiate ALPN protocol %q with %q", alpnProto, config.ServerName)
}
return tlsConn, nil
}
// ParseCiphers parse ciphersuites from the comma-separated string into
// recognized slice
func ParseCiphers(cipherStr string) ([]uint16, error) {
suites := []uint16{}
cipherStr = strings.TrimSpace(cipherStr)
if cipherStr == "" {
return []uint16{}, nil
}
ciphers := strings.Split(cipherStr, ",")
// Note: this needs to be kept up to date with the cipherMap in CipherString
cipherMap := map[string]uint16{
"TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA": tls.TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA,
"TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256": tls.TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,
"TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256": tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
"TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA": tls.TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA,
"TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384": tls.TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,
"TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA": tls.TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,
"TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256": tls.TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256,
"TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256": tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
"TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA": tls.TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,
"TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384": tls.TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,
}
for _, cipher := range ciphers {
if v, ok := cipherMap[cipher]; ok {
suites = append(suites, v)
} else {
return suites, fmt.Errorf("unsupported cipher %q", cipher)
}
}
return suites, nil
}
// CipherString performs the inverse operation of ParseCiphers
func CipherString(ciphers []uint16) (string, error) {
// Note: this needs to be kept up to date with the cipherMap in ParseCiphers
cipherMap := map[uint16]string{
tls.TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA: "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA",
tls.TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256: "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256",
tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256: "TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256",
tls.TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA: "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
tls.TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384: "TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384",
tls.TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA: "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA",
tls.TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256: "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256",
tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256: "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256",
tls.TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA: "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA",
tls.TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384: "TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384",
}
cipherStrings := make([]string, len(ciphers))
for i, cipher := range ciphers {
if v, ok := cipherMap[cipher]; ok {
cipherStrings[i] = v
} else {
return "", fmt.Errorf("unsupported cipher %d", cipher)
}
}
return strings.Join(cipherStrings, ","), nil
}