agent: Don't leave old errors around in cache (#5094)

* Fixes #4480. Don't leave old errors around in cache that can be hit in specific circumstances. * Move error reset to cover extreme edge case of nil Value, nil err Fetch
2025-02-24 11:28:40 +00:00 · 2019-01-08 10:06:38 +00:00 · 2019-01-08 10:06:38 +00:00 · 0589525ae9
commit 0589525ae9
parent 96b877f79e
2 changed files with 89 additions and 16 deletions
--- a/agent/cache/cache.go
+++ b/agent/cache/cache.go
@ -306,18 +306,28 @@ RETRY_GET:
 		c.entriesExpiryHeap.Fix(entry.Expiry)
 		c.entriesLock.Unlock()

-		// We purposely do not return an error here since the cache
-		// only works with fetching values that either have a value
-		// or have an error, but not both. The Error may be non-nil
-		// in the entry because of this to note future fetch errors.
+		// We purposely do not return an error here since the cache only works with
+		// fetching values that either have a value or have an error, but not both.
+		// The Error may be non-nil in the entry in the case that an error has
+		// occurred _since_ the last good value, but we still want to return the
+		// good value to clients that are not requesting a specific version. The
+		// effect of this is that blocking clients will all see an error immediately
+		// without waiting a whole timeout to see it, but clients that just look up
+		// cache with an older index than the last valid result will still see the
+		// result and not the error here. I.e. the error is not "cached" without a
+		// new fetch attempt occuring, but the last good value can still be fetched
+		// from cache.
 		return entry.Value, meta, nil
 	}

-	// If this isn't our first time through and our last value has an error,
-	// then we return the error. This has the behavior that we don't sit in
-	// a retry loop getting the same error for the entire duration of the
-	// timeout. Instead, we make one effort to fetch a new value, and if
-	// there was an error, we return.
+	// If this isn't our first time through and our last value has an error, then
+	// we return the error. This has the behavior that we don't sit in a retry
+	// loop getting the same error for the entire duration of the timeout.
+	// Instead, we make one effort to fetch a new value, and if there was an
+	// error, we return. Note that the invariant is that if both entry.Value AND
+	// entry.Error are non-nil, the error _must_ be more recent than the Value. In
+	// other words valid fetches should reset the error. See
+	// https://github.com/hashicorp/consul/issues/4480.
 	if !first && entry.Error != nil {
 		return entry.Value, ResultMeta{Index: entry.Index}, entry.Error
 	}
@ -454,6 +464,15 @@ func (c *Cache) fetch(t, key string, r Request, allowNew bool, attempt uint) (<-
 		// Copy the existing entry to start.
 		newEntry := entry
 		newEntry.Fetching = false
+
+		// Importantly, always reset the Error. Having both Error and a Value that
+		// are non-nil is allowed in the cache entry but it indicates that the Error
+		// is _newer_ than the last good value. So if the err is nil then we need to
+		// reset to replace any _older_ errors and avoid them bubbling up. If the
+		// error is non-nil then we need to set it anyway and used to do it in the
+		// code below. See https://github.com/hashicorp/consul/issues/4480.
+		newEntry.Error = err
+
 		if result.Value != nil {
 			// A new value was given, so we create a brand new entry.
 			newEntry.Value = result.Value
@ -512,13 +531,6 @@ func (c *Cache) fetch(t, key string, r Request, allowNew bool, attempt uint) (<-
 			// Increment attempt counter
 			attempt++

-			// Always set the error. We don't override the value here because
-			// if Valid is true, then we can reuse the Value in the case a
-			// specific index isn't requested. However, for blocking queries,
-			// we want Error to be set so that we can return early with the
-			// error.
-			newEntry.Error = err
-
 			// If we are refreshing and just failed, updated the lost contact time as
 			// our cache will be stale until we get successfully reconnected. We only
 			// set this on the first failure (if it's zero) so we can track how long
--- a/agent/cache/cache_test.go
+++ b/agent/cache/cache_test.go
@ -81,6 +81,67 @@ func TestCacheGet_initError(t *testing.T) {
 	typ.AssertExpectations(t)
 }

+// Test a cached error is replaced by a successful result. See
+// https://github.com/hashicorp/consul/issues/4480
+func TestCacheGet_cachedErrorsDontStick(t *testing.T) {
+	t.Parallel()
+
+	require := require.New(t)
+
+	typ := TestType(t)
+	defer typ.AssertExpectations(t)
+	c := TestCache(t)
+	c.RegisterType("t", typ, nil)
+
+	// Configure the type
+	fetcherr := fmt.Errorf("initial error")
+	// First fetch errors, subsequent fetches are successful and then block
+	typ.Static(FetchResult{}, fetcherr).Times(1)
+	typ.Static(FetchResult{Value: 42, Index: 123}, nil).Times(1)
+	// We trigger this to return same value to simulate a timeout.
+	triggerCh := make(chan time.Time)
+	typ.Static(FetchResult{Value: 42, Index: 123}, nil).WaitUntil(triggerCh)
+
+	// Get, should fetch and get error
+	req := TestRequest(t, RequestInfo{Key: "hello"})
+	result, meta, err := c.Get("t", req)
+	require.Error(err)
+	require.Nil(result)
+	require.False(meta.Hit)
+
+	// Get, should fetch again since our last fetch was an error, but get success
+	result, meta, err = c.Get("t", req)
+	require.NoError(err)
+	require.Equal(42, result)
+	require.False(meta.Hit)
+
+	// Now get should block until timeout and then get the same response NOT the
+	// cached error.
+	getCh1 := TestCacheGetCh(t, c, "t", TestRequest(t, RequestInfo{
+		Key:      "hello",
+		MinIndex: 123,
+		// We _don't_ set a timeout here since that doesn't trigger the bug - the
+		// bug occurs when the Fetch call times out and returns the same value when
+		// an error is set. If it returns a new value the blocking loop works too.
+	}))
+	time.AfterFunc(50*time.Millisecond, func() {
+		// "Timeout" the Fetch after a short time.
+		close(triggerCh)
+	})
+	select {
+	case result := <-getCh1:
+		t.Fatalf("result or error returned before an update happened. "+
+			"If this is nil look above for the error log: %v", result)
+	case <-time.After(100 * time.Millisecond):
+		// It _should_ keep blocking for a new value here
+	}
+
+	// Sleep a tiny bit just to let maybe some background calls happen
+	// then verify the calls.
+	time.Sleep(20 * time.Millisecond)
+	typ.AssertExpectations(t)
+}
+
 // Test a Get with a request that returns a blank cache key. This should
 // force a backend request and skip the cache entirely.
 func TestCacheGet_blankCacheKey(t *testing.T) {