agent: Don't leave old errors around in cache (#5094)

* Fixes #4480. Don't leave old errors around in cache that can be hit in specific circumstances.

* Move error reset to cover extreme edge case of nil Value, nil err Fetch
This commit is contained in:
Paul Banks 2019-01-08 10:06:38 +00:00 committed by GitHub
parent 96b877f79e
commit 0589525ae9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 89 additions and 16 deletions

44
agent/cache/cache.go vendored
View File

@@ -306,18 +306,28 @@ RETRY_GET:
c.entriesExpiryHeap.Fix(entry.Expiry) c.entriesExpiryHeap.Fix(entry.Expiry)
c.entriesLock.Unlock() c.entriesLock.Unlock()
// We purposely do not return an error here since the cache // We purposely do not return an error here since the cache only works with
// only works with fetching values that either have a value // fetching values that either have a value or have an error, but not both.
// or have an error, but not both. The Error may be non-nil // The Error may be non-nil in the entry in the case that an error has
// in the entry because of this to note future fetch errors. // occurred _since_ the last good value, but we still want to return the
// good value to clients that are not requesting a specific version. The
// effect of this is that blocking clients will all see an error immediately
// without waiting a whole timeout to see it, but clients that just look up
// cache with an older index than the last valid result will still see the
// result and not the error here. I.e. the error is not "cached" without a
// new fetch attempt occurring, but the last good value can still be fetched
// from cache.
return entry.Value, meta, nil return entry.Value, meta, nil
} }
// If this isn't our first time through and our last value has an error, // If this isn't our first time through and our last value has an error, then
// then we return the error. This has the behavior that we don't sit in // we return the error. This has the behavior that we don't sit in a retry
// a retry loop getting the same error for the entire duration of the // loop getting the same error for the entire duration of the timeout.
// timeout. Instead, we make one effort to fetch a new value, and if // Instead, we make one effort to fetch a new value, and if there was an
// there was an error, we return. // error, we return. Note that the invariant is that if both entry.Value AND
// entry.Error are non-nil, the error _must_ be more recent than the Value. In
// other words valid fetches should reset the error. See
// https://github.com/hashicorp/consul/issues/4480.
if !first && entry.Error != nil { if !first && entry.Error != nil {
return entry.Value, ResultMeta{Index: entry.Index}, entry.Error return entry.Value, ResultMeta{Index: entry.Index}, entry.Error
} }
@@ -454,6 +464,15 @@ func (c *Cache) fetch(t, key string, r Request, allowNew bool, attempt uint) (<-
// Copy the existing entry to start. // Copy the existing entry to start.
newEntry := entry newEntry := entry
newEntry.Fetching = false newEntry.Fetching = false
// Importantly, always reset the Error. Having both Error and a Value that
// are non-nil is allowed in the cache entry but it indicates that the Error
// is _newer_ than the last good value. So if the err is nil then we need to
// reset to replace any _older_ errors and avoid them bubbling up. If the
// error is non-nil then we need to set it anyway (this was previously done in
// the code below). See https://github.com/hashicorp/consul/issues/4480.
newEntry.Error = err
if result.Value != nil { if result.Value != nil {
// A new value was given, so we create a brand new entry. // A new value was given, so we create a brand new entry.
newEntry.Value = result.Value newEntry.Value = result.Value
@@ -512,13 +531,6 @@ func (c *Cache) fetch(t, key string, r Request, allowNew bool, attempt uint) (<-
// Increment attempt counter // Increment attempt counter
attempt++ attempt++
// Always set the error. We don't override the value here because
// if Valid is true, then we can reuse the Value in the case a
// specific index isn't requested. However, for blocking queries,
// we want Error to be set so that we can return early with the
// error.
newEntry.Error = err
// If we are refreshing and just failed, updated the lost contact time as // If we are refreshing and just failed, updated the lost contact time as
// our cache will be stale until we get successfully reconnected. We only // our cache will be stale until we get successfully reconnected. We only
// set this on the first failure (if it's zero) so we can track how long // set this on the first failure (if it's zero) so we can track how long

View File

@@ -81,6 +81,67 @@ func TestCacheGet_initError(t *testing.T) {
typ.AssertExpectations(t) typ.AssertExpectations(t)
} }
// Test a cached error is replaced by a successful result. See
// https://github.com/hashicorp/consul/issues/4480
func TestCacheGet_cachedErrorsDontStick(t *testing.T) {
t.Parallel()
require := require.New(t)
typ := TestType(t)
defer typ.AssertExpectations(t)
c := TestCache(t)
c.RegisterType("t", typ, nil)
// Configure the type
fetcherr := fmt.Errorf("initial error")
// First fetch errors, subsequent fetches are successful and then block.
// NOTE: registration order matters here — the mock serves these three
// expectations in sequence.
typ.Static(FetchResult{}, fetcherr).Times(1)
typ.Static(FetchResult{Value: 42, Index: 123}, nil).Times(1)
// We trigger this to return same value to simulate a timeout.
// The third Fetch blocks until triggerCh fires, then returns the SAME
// value/index as before — mimicking a blocking query that timed out with
// no change.
triggerCh := make(chan time.Time)
typ.Static(FetchResult{Value: 42, Index: 123}, nil).WaitUntil(triggerCh)
// Get, should fetch and get error
req := TestRequest(t, RequestInfo{Key: "hello"})
result, meta, err := c.Get("t", req)
require.Error(err)
require.Nil(result)
require.False(meta.Hit)
// Get, should fetch again since our last fetch was an error, but get success.
// This success must also clear the cached error (the invariant under test).
result, meta, err = c.Get("t", req)
require.NoError(err)
require.Equal(42, result)
require.False(meta.Hit)
// Now get should block until timeout and then get the same response NOT the
// cached error.
getCh1 := TestCacheGetCh(t, c, "t", TestRequest(t, RequestInfo{
Key: "hello",
MinIndex: 123,
// We _don't_ set a timeout here since that doesn't trigger the bug - the
// bug occurs when the Fetch call times out and returns the same value when
// an error is set. If it returns a new value the blocking loop works too.
}))
time.AfterFunc(50*time.Millisecond, func() {
// "Timeout" the Fetch after a short time.
close(triggerCh)
})
select {
case result := <-getCh1:
// Receiving anything here means the stale cached error (or a value)
// leaked out instead of the Get continuing to block on MinIndex.
t.Fatalf("result or error returned before an update happened. "+
"If this is nil look above for the error log: %v", result)
case <-time.After(100 * time.Millisecond):
// It _should_ keep blocking for a new value here
}
// Sleep a tiny bit just to let maybe some background calls happen
// then verify the calls.
time.Sleep(20 * time.Millisecond)
typ.AssertExpectations(t)
}
// Test a Get with a request that returns a blank cache key. This should // Test a Get with a request that returns a blank cache key. This should
// force a backend request and skip the cache entirely. // force a backend request and skip the cache entirely.
func TestCacheGet_blankCacheKey(t *testing.T) { func TestCacheGet_blankCacheKey(t *testing.T) {