From 0589525ae953d990717d4f9d81fb3aeb63f01b7a Mon Sep 17 00:00:00 2001 From: Paul Banks Date: Tue, 8 Jan 2019 10:06:38 +0000 Subject: [PATCH] agent: Don't leave old errors around in cache (#5094) * Fixes #4480. Don't leave old errors around in cache that can be hit in specific circumstances. * Move error reset to cover extreme edge case of nil Value, nil err Fetch --- agent/cache/cache.go | 44 ++++++++++++++++++---------- agent/cache/cache_test.go | 61 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 16 deletions(-) diff --git a/agent/cache/cache.go b/agent/cache/cache.go index 9b91916eb5..228c909ffc 100644 --- a/agent/cache/cache.go +++ b/agent/cache/cache.go @@ -306,18 +306,28 @@ RETRY_GET: c.entriesExpiryHeap.Fix(entry.Expiry) c.entriesLock.Unlock() - // We purposely do not return an error here since the cache - // only works with fetching values that either have a value - // or have an error, but not both. The Error may be non-nil - // in the entry because of this to note future fetch errors. + // We purposely do not return an error here since the cache only works with + // fetching values that either have a value or have an error, but not both. + // The Error may be non-nil in the entry in the case that an error has + // occurred _since_ the last good value, but we still want to return the + // good value to clients that are not requesting a specific version. The + // effect of this is that blocking clients will all see an error immediately + // without waiting a whole timeout to see it, but clients that just look up + // cache with an older index than the last valid result will still see the + // result and not the error here. I.e. the error is not "cached" without a + // new fetch attempt occurring, but the last good value can still be fetched + // from cache. return entry.Value, meta, nil } - // If this isn't our first time through and our last value has an error, - // then we return the error.
This has the behavior that we don't sit in - // a retry loop getting the same error for the entire duration of the - // timeout. Instead, we make one effort to fetch a new value, and if - // there was an error, we return. + // If this isn't our first time through and our last value has an error, then + // we return the error. This has the behavior that we don't sit in a retry + // loop getting the same error for the entire duration of the timeout. + // Instead, we make one effort to fetch a new value, and if there was an + // error, we return. Note that the invariant is that if both entry.Value AND + // entry.Error are non-nil, the error _must_ be more recent than the Value. In + // other words valid fetches should reset the error. See + // https://github.com/hashicorp/consul/issues/4480. if !first && entry.Error != nil { return entry.Value, ResultMeta{Index: entry.Index}, entry.Error } @@ -454,6 +464,15 @@ func (c *Cache) fetch(t, key string, r Request, allowNew bool, attempt uint) (<- // Copy the existing entry to start. newEntry := entry newEntry.Fetching = false + + // Importantly, always reset the Error. Having both Error and a Value that + // are non-nil is allowed in the cache entry but it indicates that the Error + // is _newer_ than the last good value. So if the err is nil then we need to + // reset to replace any _older_ errors and avoid them bubbling up. If the + // error is non-nil then we need to set it anyway and used to do it in the + // code below. See https://github.com/hashicorp/consul/issues/4480. + newEntry.Error = err + if result.Value != nil { // A new value was given, so we create a brand new entry. newEntry.Value = result.Value @@ -512,13 +531,6 @@ func (c *Cache) fetch(t, key string, r Request, allowNew bool, attempt uint) (<- // Increment attempt counter attempt++ - // Always set the error. We don't override the value here because - // if Valid is true, then we can reuse the Value in the case a - // specific index isn't requested.
However, for blocking queries, - // we want Error to be set so that we can return early with the - // error. - newEntry.Error = err - // If we are refreshing and just failed, updated the lost contact time as // our cache will be stale until we get successfully reconnected. We only // set this on the first failure (if it's zero) so we can track how long diff --git a/agent/cache/cache_test.go b/agent/cache/cache_test.go index 9f7397e021..e8cb1ddf3e 100644 --- a/agent/cache/cache_test.go +++ b/agent/cache/cache_test.go @@ -81,6 +81,67 @@ func TestCacheGet_initError(t *testing.T) { typ.AssertExpectations(t) } +// Test a cached error is replaced by a successful result. See +// https://github.com/hashicorp/consul/issues/4480 +func TestCacheGet_cachedErrorsDontStick(t *testing.T) { + t.Parallel() + + require := require.New(t) + + typ := TestType(t) + defer typ.AssertExpectations(t) + c := TestCache(t) + c.RegisterType("t", typ, nil) + + // Configure the type + fetcherr := fmt.Errorf("initial error") + // First fetch errors, subsequent fetches are successful and then block + typ.Static(FetchResult{}, fetcherr).Times(1) + typ.Static(FetchResult{Value: 42, Index: 123}, nil).Times(1) + // We trigger this to return same value to simulate a timeout. + triggerCh := make(chan time.Time) + typ.Static(FetchResult{Value: 42, Index: 123}, nil).WaitUntil(triggerCh) + + // Get, should fetch and get error + req := TestRequest(t, RequestInfo{Key: "hello"}) + result, meta, err := c.Get("t", req) + require.Error(err) + require.Nil(result) + require.False(meta.Hit) + + // Get, should fetch again since our last fetch was an error, but get success + result, meta, err = c.Get("t", req) + require.NoError(err) + require.Equal(42, result) + require.False(meta.Hit) + + // Now get should block until timeout and then get the same response NOT the + // cached error.
+ getCh1 := TestCacheGetCh(t, c, "t", TestRequest(t, RequestInfo{ + Key: "hello", + MinIndex: 123, + // We _don't_ set a timeout here since that doesn't trigger the bug - the + // bug occurs when the Fetch call times out and returns the same value when + // an error is set. If it returns a new value the blocking loop works too. + })) + time.AfterFunc(50*time.Millisecond, func() { + // "Timeout" the Fetch after a short time. + close(triggerCh) + }) + select { + case result := <-getCh1: + t.Fatalf("result or error returned before an update happened. "+ + "If this is nil look above for the error log: %v", result) + case <-time.After(100 * time.Millisecond): + // It _should_ keep blocking for a new value here + } + + // Sleep a tiny bit just to let maybe some background calls happen + // then verify the calls. + time.Sleep(20 * time.Millisecond) + typ.AssertExpectations(t) +} + // Test a Get with a request that returns a blank cache key. This should // force a backend request and skip the cache entirely. func TestCacheGet_blankCacheKey(t *testing.T) {