Fix nil-pointer panics from proxycfg package. (#16277)

Prior to this PR, servers / agents would panic and crash if an ingress
or api gateway were configured to use a discovery chain that both:

1. Referenced a peered service
2. Had a mesh gateway mode of local

This could occur because the code for handling upstream watches was shared
between connect-proxy and the gateways. As a short-term fix, this PR ensures
that the watch maps are always initialized for these gateway services.

This PR also wraps the proxycfg execution and service
registration calls with recover statements to ensure that future issues
like this do not put the server into an unrecoverable state.
This commit is contained in:
Derek Menteer 2023-02-15 11:54:44 -06:00 committed by GitHub
parent 1d9ee50681
commit 6599a9be1d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 33 additions and 0 deletions

View File

@ -73,6 +73,7 @@ func (h *handlerAPIGateway) initialize(ctx context.Context) (ConfigSnapshot, err
snap.APIGateway.WatchedDiscoveryChains = make(map[UpstreamID]context.CancelFunc)
snap.APIGateway.WatchedGateways = make(map[UpstreamID]map[string]context.CancelFunc)
snap.APIGateway.WatchedGatewayEndpoints = make(map[UpstreamID]map[string]structs.CheckServiceNodes)
snap.APIGateway.WatchedLocalGWEndpoints = watch.NewMap[string, structs.CheckServiceNodes]()
snap.APIGateway.WatchedUpstreams = make(map[UpstreamID]map[string]context.CancelFunc)
snap.APIGateway.WatchedUpstreamEndpoints = make(map[UpstreamID]map[string]structs.CheckServiceNodes)

View File

@ -67,6 +67,7 @@ func (s *handlerIngressGateway) initialize(ctx context.Context) (ConfigSnapshot,
snap.IngressGateway.WatchedUpstreamEndpoints = make(map[UpstreamID]map[string]structs.CheckServiceNodes)
snap.IngressGateway.WatchedGateways = make(map[UpstreamID]map[string]context.CancelFunc)
snap.IngressGateway.WatchedGatewayEndpoints = make(map[UpstreamID]map[string]structs.CheckServiceNodes)
snap.IngressGateway.WatchedLocalGWEndpoints = watch.NewMap[string, structs.CheckServiceNodes]()
snap.IngressGateway.Listeners = make(map[IngressListenerKey]structs.IngressListener)
snap.IngressGateway.UpstreamPeerTrustBundles = watch.NewMap[string, *pbpeering.PeeringTrustBundle]()
snap.IngressGateway.PeerUpstreamEndpoints = watch.NewMap[UpstreamID, structs.CheckServiceNodes]()

View File

@ -2,6 +2,7 @@ package proxycfg
import (
"errors"
"runtime/debug"
"sync"
"github.com/hashicorp/go-hclog"
@ -142,6 +143,20 @@ func (m *Manager) Register(id ProxyID, ns *structs.NodeService, source ProxySour
m.mu.Lock()
defer m.mu.Unlock()
defer func() {
if r := recover(); r != nil {
m.Logger.Error("unexpected panic during service manager registration",
"node", id.NodeName,
"service", id.ServiceID,
"message", r,
"stacktrace", string(debug.Stack()),
)
}
}()
return m.register(id, ns, source, token, overwrite)
}
func (m *Manager) register(id ProxyID, ns *structs.NodeService, source ProxySource, token string, overwrite bool) error {
state, ok := m.proxies[id]
if ok {
if state.source != source && !overwrite {

View File

@ -6,6 +6,7 @@ import (
"fmt"
"net"
"reflect"
"runtime/debug"
"sync/atomic"
"time"
@ -298,6 +299,21 @@ func newConfigSnapshotFromServiceInstance(s serviceInstance, config stateConfig)
}
// run executes unsafeRun behind a panic guard. Any panic raised while
// unsafeRun is executing is recovered here and reported through the state's
// logger, so that it does not make its way up into the server / agent.
func (s *state) run(ctx context.Context, snap *ConfigSnapshot) {
	defer func() {
		cause := recover()
		if cause == nil {
			// unsafeRun returned without panicking; nothing to report.
			return
		}

		// recover must be called directly by this deferred function to take
		// effect. The stack is captured here, while still unwinding, and
		// logged together with the recovered value.
		s.logger.Error(
			"unexpected panic while running proxycfg",
			"node", s.serviceInstance.proxyID.NodeName,
			"service", s.serviceInstance.proxyID.ServiceID,
			"message", cause,
			"stacktrace", string(debug.Stack()),
		)
	}()

	s.unsafeRun(ctx, snap)
}
func (s *state) unsafeRun(ctx context.Context, snap *ConfigSnapshot) {
// Close the channel we return from Watch when we stop so consumers can stop
// watching and clean up their goroutines. It's important we do this here and
// not in Close since this routine sends on this chan and so might panic if it