mirror of https://github.com/status-im/consul.git
agent: Don't leave old errors around in cache (#5094)
* Fixes #4480. Don't leave old errors around in cache that can be hit in specific circumstances. * Move error reset to cover extreme edge case of nil Value, nil err Fetch
This commit is contained in:
parent
96b877f79e
commit
0589525ae9
|
@ -306,18 +306,28 @@ RETRY_GET:
|
|||
c.entriesExpiryHeap.Fix(entry.Expiry)
|
||||
c.entriesLock.Unlock()
|
||||
|
||||
// We purposely do not return an error here since the cache
|
||||
// only works with fetching values that either have a value
|
||||
// or have an error, but not both. The Error may be non-nil
|
||||
// in the entry because of this to note future fetch errors.
|
||||
// We purposely do not return an error here since the cache only works with
|
||||
// fetching values that either have a value or have an error, but not both.
|
||||
// The Error may be non-nil in the entry in the case that an error has
|
||||
// occurred _since_ the last good value, but we still want to return the
|
||||
// good value to clients that are not requesting a specific version. The
|
||||
// effect of this is that blocking clients will all see an error immediately
|
||||
// without waiting a whole timeout to see it, but clients that just look up
|
||||
// cache with an older index than the last valid result will still see the
|
||||
// result and not the error here. I.e. the error is not "cached" without a
|
||||
// new fetch attempt occuring, but the last good value can still be fetched
|
||||
// from cache.
|
||||
return entry.Value, meta, nil
|
||||
}
|
||||
|
||||
// If this isn't our first time through and our last value has an error,
|
||||
// then we return the error. This has the behavior that we don't sit in
|
||||
// a retry loop getting the same error for the entire duration of the
|
||||
// timeout. Instead, we make one effort to fetch a new value, and if
|
||||
// there was an error, we return.
|
||||
// If this isn't our first time through and our last value has an error, then
|
||||
// we return the error. This has the behavior that we don't sit in a retry
|
||||
// loop getting the same error for the entire duration of the timeout.
|
||||
// Instead, we make one effort to fetch a new value, and if there was an
|
||||
// error, we return. Note that the invariant is that if both entry.Value AND
|
||||
// entry.Error are non-nil, the error _must_ be more recent than the Value. In
|
||||
// other words valid fetches should reset the error. See
|
||||
// https://github.com/hashicorp/consul/issues/4480.
|
||||
if !first && entry.Error != nil {
|
||||
return entry.Value, ResultMeta{Index: entry.Index}, entry.Error
|
||||
}
|
||||
|
@ -454,6 +464,15 @@ func (c *Cache) fetch(t, key string, r Request, allowNew bool, attempt uint) (<-
|
|||
// Copy the existing entry to start.
|
||||
newEntry := entry
|
||||
newEntry.Fetching = false
|
||||
|
||||
// Importantly, always reset the Error. Having both Error and a Value that
|
||||
// are non-nil is allowed in the cache entry but it indicates that the Error
|
||||
// is _newer_ than the last good value. So if the err is nil then we need to
|
||||
// reset to replace any _older_ errors and avoid them bubbling up. If the
|
||||
// error is non-nil then we need to set it anyway and used to do it in the
|
||||
// code below. See https://github.com/hashicorp/consul/issues/4480.
|
||||
newEntry.Error = err
|
||||
|
||||
if result.Value != nil {
|
||||
// A new value was given, so we create a brand new entry.
|
||||
newEntry.Value = result.Value
|
||||
|
@ -512,13 +531,6 @@ func (c *Cache) fetch(t, key string, r Request, allowNew bool, attempt uint) (<-
|
|||
// Increment attempt counter
|
||||
attempt++
|
||||
|
||||
// Always set the error. We don't override the value here because
|
||||
// if Valid is true, then we can reuse the Value in the case a
|
||||
// specific index isn't requested. However, for blocking queries,
|
||||
// we want Error to be set so that we can return early with the
|
||||
// error.
|
||||
newEntry.Error = err
|
||||
|
||||
// If we are refreshing and just failed, updated the lost contact time as
|
||||
// our cache will be stale until we get successfully reconnected. We only
|
||||
// set this on the first failure (if it's zero) so we can track how long
|
||||
|
|
|
@ -81,6 +81,67 @@ func TestCacheGet_initError(t *testing.T) {
|
|||
typ.AssertExpectations(t)
|
||||
}
|
||||
|
||||
// Test a cached error is replaced by a successful result. See
|
||||
// https://github.com/hashicorp/consul/issues/4480
|
||||
func TestCacheGet_cachedErrorsDontStick(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
require := require.New(t)
|
||||
|
||||
typ := TestType(t)
|
||||
defer typ.AssertExpectations(t)
|
||||
c := TestCache(t)
|
||||
c.RegisterType("t", typ, nil)
|
||||
|
||||
// Configure the type
|
||||
fetcherr := fmt.Errorf("initial error")
|
||||
// First fetch errors, subsequent fetches are successful and then block
|
||||
typ.Static(FetchResult{}, fetcherr).Times(1)
|
||||
typ.Static(FetchResult{Value: 42, Index: 123}, nil).Times(1)
|
||||
// We trigger this to return same value to simulate a timeout.
|
||||
triggerCh := make(chan time.Time)
|
||||
typ.Static(FetchResult{Value: 42, Index: 123}, nil).WaitUntil(triggerCh)
|
||||
|
||||
// Get, should fetch and get error
|
||||
req := TestRequest(t, RequestInfo{Key: "hello"})
|
||||
result, meta, err := c.Get("t", req)
|
||||
require.Error(err)
|
||||
require.Nil(result)
|
||||
require.False(meta.Hit)
|
||||
|
||||
// Get, should fetch again since our last fetch was an error, but get success
|
||||
result, meta, err = c.Get("t", req)
|
||||
require.NoError(err)
|
||||
require.Equal(42, result)
|
||||
require.False(meta.Hit)
|
||||
|
||||
// Now get should block until timeout and then get the same response NOT the
|
||||
// cached error.
|
||||
getCh1 := TestCacheGetCh(t, c, "t", TestRequest(t, RequestInfo{
|
||||
Key: "hello",
|
||||
MinIndex: 123,
|
||||
// We _don't_ set a timeout here since that doesn't trigger the bug - the
|
||||
// bug occurs when the Fetch call times out and returns the same value when
|
||||
// an error is set. If it returns a new value the blocking loop works too.
|
||||
}))
|
||||
time.AfterFunc(50*time.Millisecond, func() {
|
||||
// "Timeout" the Fetch after a short time.
|
||||
close(triggerCh)
|
||||
})
|
||||
select {
|
||||
case result := <-getCh1:
|
||||
t.Fatalf("result or error returned before an update happened. "+
|
||||
"If this is nil look above for the error log: %v", result)
|
||||
case <-time.After(100 * time.Millisecond):
|
||||
// It _should_ keep blocking for a new value here
|
||||
}
|
||||
|
||||
// Sleep a tiny bit just to let maybe some background calls happen
|
||||
// then verify the calls.
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
typ.AssertExpectations(t)
|
||||
}
|
||||
|
||||
// Test a Get with a request that returns a blank cache key. This should
|
||||
// force a backend request and skip the cache entirely.
|
||||
func TestCacheGet_blankCacheKey(t *testing.T) {
|
||||
|
|
Loading…
Reference in New Issue