mirror of https://github.com/status-im/consul.git
agent: cache notifications work after error if the underlying RPC returns index=1 (#6547)
Fixes #6521 Ensure that initial failures to fetch an agent cache entry using the notify API where the underlying RPC returns a synthetic index of 1 correctly recovers when those RPCs resume working. The bug in the Cache.notifyBlockingQuery used to incorrectly "fix" the index for the next query from 0 to 1 for all queries, when it should have not done so for queries that errored. Also fixed some things that made debugging difficult: - config entry read/list endpoints send back QueryMeta headers - xds event loops don't swallow the cache notification errors
This commit is contained in:
parent
76cf54068b
commit
9566df524e
|
@ -20,6 +20,8 @@ import (
|
||||||
|
|
||||||
"github.com/hashicorp/consul/testrpc"
|
"github.com/hashicorp/consul/testrpc"
|
||||||
|
|
||||||
|
"github.com/hashicorp/consul/agent/cache"
|
||||||
|
cachetype "github.com/hashicorp/consul/agent/cache-types"
|
||||||
"github.com/hashicorp/consul/agent/checks"
|
"github.com/hashicorp/consul/agent/checks"
|
||||||
"github.com/hashicorp/consul/agent/config"
|
"github.com/hashicorp/consul/agent/config"
|
||||||
"github.com/hashicorp/consul/agent/connect"
|
"github.com/hashicorp/consul/agent/connect"
|
||||||
|
@ -4011,3 +4013,99 @@ func TestAgent_RerouteNewHTTPChecks(t *testing.T) {
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestAgentCache_serviceInConfigFile_initialFetchErrors_Issue6521(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// Ensure that initial failures to fetch the discovery chain via the agent
|
||||||
|
// cache using the notify API for a service with no config entries
|
||||||
|
// correctly recovers when those RPCs resume working. The key here is that
|
||||||
|
// the lack of config entries guarantees that the RPC will come back with a
|
||||||
|
// synthetic index of 1.
|
||||||
|
//
|
||||||
|
// The bug in the Cache.notifyBlockingQuery used to incorrectly "fix" the
|
||||||
|
// index for the next query from 0 to 1 for all queries, when it should
|
||||||
|
// have not done so for queries that errored.
|
||||||
|
|
||||||
|
a1 := NewTestAgent(t, t.Name()+"-a1", "")
|
||||||
|
defer a1.Shutdown()
|
||||||
|
testrpc.WaitForLeader(t, a1.RPC, "dc1")
|
||||||
|
|
||||||
|
a2 := NewTestAgent(t, t.Name()+"-a2", `
|
||||||
|
server = false
|
||||||
|
bootstrap = false
|
||||||
|
services {
|
||||||
|
name = "echo-client"
|
||||||
|
port = 8080
|
||||||
|
connect {
|
||||||
|
sidecar_service {
|
||||||
|
proxy {
|
||||||
|
upstreams {
|
||||||
|
destination_name = "echo"
|
||||||
|
local_bind_port = 9191
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
services {
|
||||||
|
name = "echo"
|
||||||
|
port = 9090
|
||||||
|
connect {
|
||||||
|
sidecar_service {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
`)
|
||||||
|
defer a2.Shutdown()
|
||||||
|
|
||||||
|
// Starting a client agent disconnected from a server with services.
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
ch := make(chan cache.UpdateEvent, 1)
|
||||||
|
require.NoError(t, a2.cache.Notify(ctx, cachetype.CompiledDiscoveryChainName, &structs.DiscoveryChainRequest{
|
||||||
|
Datacenter: "dc1",
|
||||||
|
Name: "echo",
|
||||||
|
EvaluateInDatacenter: "dc1",
|
||||||
|
EvaluateInNamespace: "default",
|
||||||
|
}, "foo", ch))
|
||||||
|
|
||||||
|
{ // The first event is an error because we are not joined yet.
|
||||||
|
evt := <-ch
|
||||||
|
require.Equal(t, "foo", evt.CorrelationID)
|
||||||
|
require.Nil(t, evt.Result)
|
||||||
|
require.Error(t, evt.Err)
|
||||||
|
require.Equal(t, evt.Err, structs.ErrNoServers)
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Logf("joining client to server")
|
||||||
|
|
||||||
|
// Now connect to server
|
||||||
|
_, err := a1.JoinLAN([]string{
|
||||||
|
fmt.Sprintf("127.0.0.1:%d", a2.Config.SerfPortLAN),
|
||||||
|
})
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
t.Logf("joined client to server")
|
||||||
|
|
||||||
|
deadlineCh := time.After(10 * time.Second)
|
||||||
|
start := time.Now()
|
||||||
|
LOOP:
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case evt := <-ch:
|
||||||
|
// We may receive several notifications of an error until we get the
|
||||||
|
// first successful reply.
|
||||||
|
require.Equal(t, "foo", evt.CorrelationID)
|
||||||
|
if evt.Err != nil {
|
||||||
|
break LOOP
|
||||||
|
}
|
||||||
|
require.NoError(t, evt.Err)
|
||||||
|
require.NotNil(t, evt.Result)
|
||||||
|
t.Logf("took %s to get first success", time.Since(start))
|
||||||
|
case <-deadlineCh:
|
||||||
|
t.Fatal("did not get notified successfully")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -126,7 +126,7 @@ func (c *Cache) notifyBlockingQuery(ctx context.Context, t string, r Request, co
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Sanity check we always request blocking on second pass
|
// Sanity check we always request blocking on second pass
|
||||||
if index < 1 {
|
if err == nil && index < 1 {
|
||||||
index = 1
|
index = 1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -33,8 +33,9 @@ func TestCacheNotify(t *testing.T) {
|
||||||
// initially.
|
// initially.
|
||||||
typ.Static(FetchResult{Value: nil, Index: 0}, errors.New("no servers available")).Once()
|
typ.Static(FetchResult{Value: nil, Index: 0}, errors.New("no servers available")).Once()
|
||||||
|
|
||||||
// Configure the type
|
// Configure the type. The first time we use the fake index of "1" to verify we
|
||||||
typ.Static(FetchResult{Value: 1, Index: 4}, nil).Once().Run(func(args mock.Arguments) {
|
// don't regress on https://github.com/hashicorp/consul/issues/6521 .
|
||||||
|
typ.Static(FetchResult{Value: 1, Index: 1}, nil).Once().Run(func(args mock.Arguments) {
|
||||||
// Assert the right request type - all real Fetch implementations do this so
|
// Assert the right request type - all real Fetch implementations do this so
|
||||||
// it keeps us honest that Watch doesn't require type mangling which will
|
// it keeps us honest that Watch doesn't require type mangling which will
|
||||||
// break in real life (hint: it did on the first attempt)
|
// break in real life (hint: it did on the first attempt)
|
||||||
|
@ -79,7 +80,7 @@ func TestCacheNotify(t *testing.T) {
|
||||||
TestCacheNotifyChResult(t, ch, UpdateEvent{
|
TestCacheNotifyChResult(t, ch, UpdateEvent{
|
||||||
CorrelationID: "test",
|
CorrelationID: "test",
|
||||||
Result: 1,
|
Result: 1,
|
||||||
Meta: ResultMeta{Hit: false, Index: 4},
|
Meta: ResultMeta{Hit: false, Index: 1},
|
||||||
Err: nil,
|
Err: nil,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
|
@ -42,6 +42,7 @@ func (s *HTTPServer) configGet(resp http.ResponseWriter, req *http.Request) (int
|
||||||
if err := s.agent.RPC("ConfigEntry.Get", &args, &reply); err != nil {
|
if err := s.agent.RPC("ConfigEntry.Get", &args, &reply); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
setMeta(resp, &reply.QueryMeta)
|
||||||
|
|
||||||
if reply.Entry == nil {
|
if reply.Entry == nil {
|
||||||
return nil, NotFoundError{Reason: fmt.Sprintf("Config entry not found for %q / %q", pathArgs[0], pathArgs[1])}
|
return nil, NotFoundError{Reason: fmt.Sprintf("Config entry not found for %q / %q", pathArgs[0], pathArgs[1])}
|
||||||
|
@ -56,6 +57,7 @@ func (s *HTTPServer) configGet(resp http.ResponseWriter, req *http.Request) (int
|
||||||
if err := s.agent.RPC("ConfigEntry.List", &args, &reply); err != nil {
|
if err := s.agent.RPC("ConfigEntry.List", &args, &reply); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
setMeta(resp, &reply.QueryMeta)
|
||||||
|
|
||||||
return reply.Entries, nil
|
return reply.Entries, nil
|
||||||
default:
|
default:
|
||||||
|
|
|
@ -470,18 +470,22 @@ func (s *state) handleUpdate(u cache.UpdateEvent, snap *ConfigSnapshot) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *state) handleUpdateConnectProxy(u cache.UpdateEvent, snap *ConfigSnapshot) error {
|
func (s *state) handleUpdateConnectProxy(u cache.UpdateEvent, snap *ConfigSnapshot) error {
|
||||||
|
if u.Err != nil {
|
||||||
|
return fmt.Errorf("error filling agent cache: %v", u.Err)
|
||||||
|
}
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
case u.CorrelationID == rootsWatchID:
|
case u.CorrelationID == rootsWatchID:
|
||||||
roots, ok := u.Result.(*structs.IndexedCARoots)
|
roots, ok := u.Result.(*structs.IndexedCARoots)
|
||||||
if !ok {
|
if !ok {
|
||||||
return fmt.Errorf("invalid type for roots response: %T", u.Result)
|
return fmt.Errorf("invalid type for response: %T", u.Result)
|
||||||
}
|
}
|
||||||
snap.Roots = roots
|
snap.Roots = roots
|
||||||
|
|
||||||
case u.CorrelationID == leafWatchID:
|
case u.CorrelationID == leafWatchID:
|
||||||
leaf, ok := u.Result.(*structs.IssuedCert)
|
leaf, ok := u.Result.(*structs.IssuedCert)
|
||||||
if !ok {
|
if !ok {
|
||||||
return fmt.Errorf("invalid type for leaf response: %T", u.Result)
|
return fmt.Errorf("invalid type for response: %T", u.Result)
|
||||||
}
|
}
|
||||||
snap.ConnectProxy.Leaf = leaf
|
snap.ConnectProxy.Leaf = leaf
|
||||||
|
|
||||||
|
@ -491,7 +495,7 @@ func (s *state) handleUpdateConnectProxy(u cache.UpdateEvent, snap *ConfigSnapsh
|
||||||
case strings.HasPrefix(u.CorrelationID, "discovery-chain:"):
|
case strings.HasPrefix(u.CorrelationID, "discovery-chain:"):
|
||||||
resp, ok := u.Result.(*structs.DiscoveryChainResponse)
|
resp, ok := u.Result.(*structs.DiscoveryChainResponse)
|
||||||
if !ok {
|
if !ok {
|
||||||
return fmt.Errorf("invalid type for service response: %T", u.Result)
|
return fmt.Errorf("invalid type for response: %T", u.Result)
|
||||||
}
|
}
|
||||||
svc := strings.TrimPrefix(u.CorrelationID, "discovery-chain:")
|
svc := strings.TrimPrefix(u.CorrelationID, "discovery-chain:")
|
||||||
snap.ConnectProxy.DiscoveryChain[svc] = resp.Chain
|
snap.ConnectProxy.DiscoveryChain[svc] = resp.Chain
|
||||||
|
@ -503,7 +507,7 @@ func (s *state) handleUpdateConnectProxy(u cache.UpdateEvent, snap *ConfigSnapsh
|
||||||
case strings.HasPrefix(u.CorrelationID, "upstream-target:"):
|
case strings.HasPrefix(u.CorrelationID, "upstream-target:"):
|
||||||
resp, ok := u.Result.(*structs.IndexedCheckServiceNodes)
|
resp, ok := u.Result.(*structs.IndexedCheckServiceNodes)
|
||||||
if !ok {
|
if !ok {
|
||||||
return fmt.Errorf("invalid type for service response: %T", u.Result)
|
return fmt.Errorf("invalid type for response: %T", u.Result)
|
||||||
}
|
}
|
||||||
correlationID := strings.TrimPrefix(u.CorrelationID, "upstream-target:")
|
correlationID := strings.TrimPrefix(u.CorrelationID, "upstream-target:")
|
||||||
targetID, svc, ok := removeColonPrefix(correlationID)
|
targetID, svc, ok := removeColonPrefix(correlationID)
|
||||||
|
@ -521,7 +525,7 @@ func (s *state) handleUpdateConnectProxy(u cache.UpdateEvent, snap *ConfigSnapsh
|
||||||
case strings.HasPrefix(u.CorrelationID, "mesh-gateway:"):
|
case strings.HasPrefix(u.CorrelationID, "mesh-gateway:"):
|
||||||
resp, ok := u.Result.(*structs.IndexedCheckServiceNodes)
|
resp, ok := u.Result.(*structs.IndexedCheckServiceNodes)
|
||||||
if !ok {
|
if !ok {
|
||||||
return fmt.Errorf("invalid type for service response: %T", u.Result)
|
return fmt.Errorf("invalid type for response: %T", u.Result)
|
||||||
}
|
}
|
||||||
correlationID := strings.TrimPrefix(u.CorrelationID, "mesh-gateway:")
|
correlationID := strings.TrimPrefix(u.CorrelationID, "mesh-gateway:")
|
||||||
dc, svc, ok := removeColonPrefix(correlationID)
|
dc, svc, ok := removeColonPrefix(correlationID)
|
||||||
|
@ -538,7 +542,7 @@ func (s *state) handleUpdateConnectProxy(u cache.UpdateEvent, snap *ConfigSnapsh
|
||||||
case strings.HasPrefix(u.CorrelationID, "upstream:"+serviceIDPrefix):
|
case strings.HasPrefix(u.CorrelationID, "upstream:"+serviceIDPrefix):
|
||||||
resp, ok := u.Result.(*structs.IndexedCheckServiceNodes)
|
resp, ok := u.Result.(*structs.IndexedCheckServiceNodes)
|
||||||
if !ok {
|
if !ok {
|
||||||
return fmt.Errorf("invalid type for service response: %T", u.Result)
|
return fmt.Errorf("invalid type for response: %T", u.Result)
|
||||||
}
|
}
|
||||||
svc := strings.TrimPrefix(u.CorrelationID, "upstream:"+serviceIDPrefix)
|
svc := strings.TrimPrefix(u.CorrelationID, "upstream:"+serviceIDPrefix)
|
||||||
snap.ConnectProxy.UpstreamEndpoints[svc] = resp.Nodes
|
snap.ConnectProxy.UpstreamEndpoints[svc] = resp.Nodes
|
||||||
|
@ -546,7 +550,7 @@ func (s *state) handleUpdateConnectProxy(u cache.UpdateEvent, snap *ConfigSnapsh
|
||||||
case strings.HasPrefix(u.CorrelationID, "upstream:"+preparedQueryIDPrefix):
|
case strings.HasPrefix(u.CorrelationID, "upstream:"+preparedQueryIDPrefix):
|
||||||
resp, ok := u.Result.(*structs.PreparedQueryExecuteResponse)
|
resp, ok := u.Result.(*structs.PreparedQueryExecuteResponse)
|
||||||
if !ok {
|
if !ok {
|
||||||
return fmt.Errorf("invalid type for prepared query response: %T", u.Result)
|
return fmt.Errorf("invalid type for response: %T", u.Result)
|
||||||
}
|
}
|
||||||
pq := strings.TrimPrefix(u.CorrelationID, "upstream:")
|
pq := strings.TrimPrefix(u.CorrelationID, "upstream:")
|
||||||
snap.ConnectProxy.UpstreamEndpoints[pq] = resp.Nodes
|
snap.ConnectProxy.UpstreamEndpoints[pq] = resp.Nodes
|
||||||
|
@ -560,7 +564,7 @@ func (s *state) handleUpdateConnectProxy(u cache.UpdateEvent, snap *ConfigSnapsh
|
||||||
snap.ConnectProxy.WatchedServiceChecks[svcID] = resp
|
snap.ConnectProxy.WatchedServiceChecks[svcID] = resp
|
||||||
|
|
||||||
default:
|
default:
|
||||||
return errors.New("unknown correlation ID")
|
return fmt.Errorf("unknown correlation ID: %s", u.CorrelationID)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
@ -668,17 +672,21 @@ func (s *state) resetWatchesFromChain(
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *state) handleUpdateMeshGateway(u cache.UpdateEvent, snap *ConfigSnapshot) error {
|
func (s *state) handleUpdateMeshGateway(u cache.UpdateEvent, snap *ConfigSnapshot) error {
|
||||||
|
if u.Err != nil {
|
||||||
|
return fmt.Errorf("error filling agent cache: %v", u.Err)
|
||||||
|
}
|
||||||
|
|
||||||
switch u.CorrelationID {
|
switch u.CorrelationID {
|
||||||
case rootsWatchID:
|
case rootsWatchID:
|
||||||
roots, ok := u.Result.(*structs.IndexedCARoots)
|
roots, ok := u.Result.(*structs.IndexedCARoots)
|
||||||
if !ok {
|
if !ok {
|
||||||
return fmt.Errorf("invalid type for roots response: %T", u.Result)
|
return fmt.Errorf("invalid type for response: %T", u.Result)
|
||||||
}
|
}
|
||||||
snap.Roots = roots
|
snap.Roots = roots
|
||||||
case serviceListWatchID:
|
case serviceListWatchID:
|
||||||
services, ok := u.Result.(*structs.IndexedServices)
|
services, ok := u.Result.(*structs.IndexedServices)
|
||||||
if !ok {
|
if !ok {
|
||||||
return fmt.Errorf("invalid type for services response: %T", u.Result)
|
return fmt.Errorf("invalid type for response: %T", u.Result)
|
||||||
}
|
}
|
||||||
|
|
||||||
for svcName := range services.Services {
|
for svcName := range services.Services {
|
||||||
|
@ -721,7 +729,7 @@ func (s *state) handleUpdateMeshGateway(u cache.UpdateEvent, snap *ConfigSnapsho
|
||||||
case datacentersWatchID:
|
case datacentersWatchID:
|
||||||
datacentersRaw, ok := u.Result.(*[]string)
|
datacentersRaw, ok := u.Result.(*[]string)
|
||||||
if !ok {
|
if !ok {
|
||||||
return fmt.Errorf("invalid type for datacenters response: %T", u.Result)
|
return fmt.Errorf("invalid type for response: %T", u.Result)
|
||||||
}
|
}
|
||||||
if datacentersRaw == nil {
|
if datacentersRaw == nil {
|
||||||
return fmt.Errorf("invalid response with a nil datacenter list")
|
return fmt.Errorf("invalid response with a nil datacenter list")
|
||||||
|
@ -771,7 +779,7 @@ func (s *state) handleUpdateMeshGateway(u cache.UpdateEvent, snap *ConfigSnapsho
|
||||||
case serviceResolversWatchID:
|
case serviceResolversWatchID:
|
||||||
configEntries, ok := u.Result.(*structs.IndexedConfigEntries)
|
configEntries, ok := u.Result.(*structs.IndexedConfigEntries)
|
||||||
if !ok {
|
if !ok {
|
||||||
return fmt.Errorf("invalid type for services response: %T", u.Result)
|
return fmt.Errorf("invalid type for response: %T", u.Result)
|
||||||
}
|
}
|
||||||
|
|
||||||
resolvers := make(map[string]*structs.ServiceResolverConfigEntry)
|
resolvers := make(map[string]*structs.ServiceResolverConfigEntry)
|
||||||
|
@ -786,7 +794,7 @@ func (s *state) handleUpdateMeshGateway(u cache.UpdateEvent, snap *ConfigSnapsho
|
||||||
case strings.HasPrefix(u.CorrelationID, "connect-service:"):
|
case strings.HasPrefix(u.CorrelationID, "connect-service:"):
|
||||||
resp, ok := u.Result.(*structs.IndexedCheckServiceNodes)
|
resp, ok := u.Result.(*structs.IndexedCheckServiceNodes)
|
||||||
if !ok {
|
if !ok {
|
||||||
return fmt.Errorf("invalid type for service response: %T", u.Result)
|
return fmt.Errorf("invalid type for response: %T", u.Result)
|
||||||
}
|
}
|
||||||
|
|
||||||
svc := strings.TrimPrefix(u.CorrelationID, "connect-service:")
|
svc := strings.TrimPrefix(u.CorrelationID, "connect-service:")
|
||||||
|
@ -799,7 +807,7 @@ func (s *state) handleUpdateMeshGateway(u cache.UpdateEvent, snap *ConfigSnapsho
|
||||||
case strings.HasPrefix(u.CorrelationID, "mesh-gateway:"):
|
case strings.HasPrefix(u.CorrelationID, "mesh-gateway:"):
|
||||||
resp, ok := u.Result.(*structs.IndexedCheckServiceNodes)
|
resp, ok := u.Result.(*structs.IndexedCheckServiceNodes)
|
||||||
if !ok {
|
if !ok {
|
||||||
return fmt.Errorf("invalid type for service response: %T", u.Result)
|
return fmt.Errorf("invalid type for response: %T", u.Result)
|
||||||
}
|
}
|
||||||
|
|
||||||
dc := strings.TrimPrefix(u.CorrelationID, "mesh-gateway:")
|
dc := strings.TrimPrefix(u.CorrelationID, "mesh-gateway:")
|
||||||
|
|
Loading…
Reference in New Issue