mirror of
https://github.com/status-im/consul.git
synced 2025-01-31 07:57:17 +00:00
0ac8ae6c3b
* Fix xDS deadlock due to syncLoop termination. This fixes an issue where agentless xDS streams can deadlock permanently until a server is restarted. When this issue occurs, no new proxies are able to successfully connect to the server. Effectively, the trigger for this deadlock stems from the following return statement: https://github.com/hashicorp/consul/blob/v1.18.0/agent/proxycfg-sources/catalog/config_source.go#L199-L202 When this happens, the entire `syncLoop()` terminates and stops consuming from the following channel: https://github.com/hashicorp/consul/blob/v1.18.0/agent/proxycfg-sources/catalog/config_source.go#L182-L192 Which results in the `ConfigSource.cleanup()` function never receiving a response and holding a mutex indefinitely: https://github.com/hashicorp/consul/blob/v1.18.0/agent/proxycfg-sources/catalog/config_source.go#L241-L247 Because this mutex is shared, it effectively deadlocks the server's ability to process new xDS streams. ---- The fix to this issue involves removing the `chan chan struct{}` used like an RPC-over-channels pattern and replacing it with two distinct channels: + `stopSyncLoopCh` - indicates that the `syncLoop()` should terminate soon. + `syncLoopDoneCh` - indicates that the `syncLoop()` has terminated. Splitting these two concepts out and deferring a `close(syncLoopDoneCh)` in the `syncLoop()` function ensures that the deadlock above should no longer occur. We also now evict xDS connections of all proxies for the corresponding `syncLoop()` whenever it encounters an irrecoverable error. This is done by hoisting the new `syncLoopDoneCh` upwards so that it's visible to the xDS delta processing. Prior to this fix, the behavior was to simply orphan them so they would never receive catalog-registration or service-defaults updates. * Add changelog.
46 lines
1.5 KiB
Go
46 lines
1.5 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: BUSL-1.1
|
|
|
|
package local
|
|
|
|
import (
|
|
"github.com/hashicorp/consul/agent/grpc-external/limiter"
|
|
"github.com/hashicorp/consul/agent/proxycfg"
|
|
"github.com/hashicorp/consul/agent/proxycfg-sources/catalog"
|
|
structs "github.com/hashicorp/consul/agent/structs"
|
|
proxysnapshot "github.com/hashicorp/consul/internal/mesh/proxy-snapshot"
|
|
"github.com/hashicorp/consul/proto-public/pbresource"
|
|
)
|
|
|
|
// ConfigSource wraps a proxycfg.Manager to create watches on services
|
|
// local to the agent (pre-registered by Sync).
|
|
type ConfigSource struct {
|
|
manager ConfigManager
|
|
}
|
|
|
|
// NewConfigSource builds a ConfigSource with the given proxycfg.Manager.
|
|
func NewConfigSource(cfgMgr ConfigManager) *ConfigSource {
|
|
return &ConfigSource{cfgMgr}
|
|
}
|
|
|
|
func (m *ConfigSource) Watch(proxyID *pbresource.ID, nodeName string, _ string) (
|
|
<-chan proxysnapshot.ProxySnapshot,
|
|
limiter.SessionTerminatedChan,
|
|
proxycfg.SrcTerminatedChan,
|
|
proxysnapshot.CancelFunc,
|
|
error,
|
|
) {
|
|
serviceID := structs.NewServiceID(proxyID.Name, catalog.GetEnterpriseMetaFromResourceID(proxyID))
|
|
watchCh, cancelWatch := m.manager.Watch(proxycfg.ProxyID{
|
|
ServiceID: serviceID,
|
|
NodeName: nodeName,
|
|
|
|
// Note: we *intentionally* don't set Token here. All watches on local
|
|
// services use the same ACL token, regardless of whatever token is
|
|
// presented in the xDS stream (the token presented to the xDS server
|
|
// is checked before the watch is created).
|
|
Token: "",
|
|
})
|
|
return watchCh, nil, nil, cancelWatch, nil
|
|
}
|