agent: Don't leave old errors around in cache (#5094)

* Fixes #4480. Don't leave old errors around in cache that can be hit in specific circumstances.

* Move error reset to cover extreme edge case of nil Value, nil err Fetch
This commit is contained in:
Paul Banks 2019-01-08 10:06:38 +00:00 committed by GitHub
parent 96b877f79e
commit 0589525ae9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 89 additions and 16 deletions

44
agent/cache/cache.go vendored
View File

@@ -306,18 +306,28 @@ RETRY_GET:
c.entriesExpiryHeap.Fix(entry.Expiry) c.entriesExpiryHeap.Fix(entry.Expiry)
c.entriesLock.Unlock() c.entriesLock.Unlock()
// We purposely do not return an error here since the cache // We purposely do not return an error here since the cache only works with
// only works with fetching values that either have a value // fetching values that either have a value or have an error, but not both.
// or have an error, but not both. The Error may be non-nil // The Error may be non-nil in the entry in the case that an error has
// in the entry because of this to note future fetch errors. // occurred _since_ the last good value, but we still want to return the
// good value to clients that are not requesting a specific version. The
// effect of this is that blocking clients will all see an error immediately
// without waiting a whole timeout to see it, but clients that just look up
// cache with an older index than the last valid result will still see the
// result and not the error here. I.e. the error is not "cached" without a
// new fetch attempt occurring, but the last good value can still be fetched
// from cache.
return entry.Value, meta, nil return entry.Value, meta, nil
} }
// If this isn't our first time through and our last value has an error, // If this isn't our first time through and our last value has an error, then
// then we return the error. This has the behavior that we don't sit in // we return the error. This has the behavior that we don't sit in a retry
// a retry loop getting the same error for the entire duration of the // loop getting the same error for the entire duration of the timeout.
// timeout. Instead, we make one effort to fetch a new value, and if // Instead, we make one effort to fetch a new value, and if there was an
// there was an error, we return. // error, we return. Note that the invariant is that if both entry.Value AND
// entry.Error are non-nil, the error _must_ be more recent than the Value. In
// other words valid fetches should reset the error. See
// https://github.com/hashicorp/consul/issues/4480.
if !first && entry.Error != nil { if !first && entry.Error != nil {
return entry.Value, ResultMeta{Index: entry.Index}, entry.Error return entry.Value, ResultMeta{Index: entry.Index}, entry.Error
} }
@@ -454,6 +464,15 @@ func (c *Cache) fetch(t, key string, r Request, allowNew bool, attempt uint) (<-
// Copy the existing entry to start. // Copy the existing entry to start.
newEntry := entry newEntry := entry
newEntry.Fetching = false newEntry.Fetching = false
// Importantly, always reset the Error. Having both Error and a Value that
// are non-nil is allowed in the cache entry but it indicates that the Error
// is _newer_ than the last good value. So if the err is nil then we need to
// reset to replace any _older_ errors and avoid them bubbling up. If the
// error is non-nil then we need to set it anyway (this was previously done in
// the code below). See https://github.com/hashicorp/consul/issues/4480.
newEntry.Error = err
if result.Value != nil { if result.Value != nil {
// A new value was given, so we create a brand new entry. // A new value was given, so we create a brand new entry.
newEntry.Value = result.Value newEntry.Value = result.Value
@@ -512,13 +531,6 @@ func (c *Cache) fetch(t, key string, r Request, allowNew bool, attempt uint) (<-
// Increment attempt counter // Increment attempt counter
attempt++ attempt++
// Always set the error. We don't override the value here because
// if Valid is true, then we can reuse the Value in the case a
// specific index isn't requested. However, for blocking queries,
// we want Error to be set so that we can return early with the
// error.
newEntry.Error = err
// If we are refreshing and just failed, updated the lost contact time as // If we are refreshing and just failed, updated the lost contact time as
// our cache will be stale until we get successfully reconnected. We only // our cache will be stale until we get successfully reconnected. We only
// set this on the first failure (if it's zero) so we can track how long // set this on the first failure (if it's zero) so we can track how long

View File

@@ -81,6 +81,67 @@ func TestCacheGet_initError(t *testing.T) {
typ.AssertExpectations(t) typ.AssertExpectations(t)
} }
// Test a cached error is replaced by a successful result. See
// https://github.com/hashicorp/consul/issues/4480
func TestCacheGet_cachedErrorsDontStick(t *testing.T) {
t.Parallel()
require := require.New(t)
typ := TestType(t)
defer typ.AssertExpectations(t)
c := TestCache(t)
c.RegisterType("t", typ, nil)
// Configure the type
fetcherr := fmt.Errorf("initial error")
// First fetch errors, subsequent fetches are successful and then block.
// NOTE: registration order matters here — the mock serves these three
// expectations in sequence.
typ.Static(FetchResult{}, fetcherr).Times(1)
typ.Static(FetchResult{Value: 42, Index: 123}, nil).Times(1)
// We trigger this to return same value to simulate a timeout.
// The third Fetch blocks until triggerCh fires, then returns the SAME
// value/index as before — mimicking a blocking query that timed out with
// no change.
triggerCh := make(chan time.Time)
typ.Static(FetchResult{Value: 42, Index: 123}, nil).WaitUntil(triggerCh)
// Get, should fetch and get error
req := TestRequest(t, RequestInfo{Key: "hello"})
result, meta, err := c.Get("t", req)
require.Error(err)
require.Nil(result)
require.False(meta.Hit)
// Get, should fetch again since our last fetch was an error, but get success.
// This success must also clear the cached error (the invariant under test).
result, meta, err = c.Get("t", req)
require.NoError(err)
require.Equal(42, result)
require.False(meta.Hit)
// Now get should block until timeout and then get the same response NOT the
// cached error.
getCh1 := TestCacheGetCh(t, c, "t", TestRequest(t, RequestInfo{
Key: "hello",
MinIndex: 123,
// We _don't_ set a timeout here since that doesn't trigger the bug - the
// bug occurs when the Fetch call times out and returns the same value when
// an error is set. If it returns a new value the blocking loop works too.
}))
time.AfterFunc(50*time.Millisecond, func() {
// "Timeout" the Fetch after a short time.
close(triggerCh)
})
select {
case result := <-getCh1:
// Receiving anything here means the stale cached error (or a value)
// leaked out instead of the Get continuing to block on MinIndex.
t.Fatalf("result or error returned before an update happened. "+
"If this is nil look above for the error log: %v", result)
case <-time.After(100 * time.Millisecond):
// It _should_ keep blocking for a new value here
}
// Sleep a tiny bit just to let maybe some background calls happen
// then verify the calls.
time.Sleep(20 * time.Millisecond)
typ.AssertExpectations(t)
}
// Test a Get with a request that returns a blank cache key. This should // Test a Get with a request that returns a blank cache key. This should
// force a backend request and skip the cache entirely. // force a backend request and skip the cache entirely.
func TestCacheGet_blankCacheKey(t *testing.T) { func TestCacheGet_blankCacheKey(t *testing.T) {