diff --git a/waku/v2/api/filter/filter.go b/waku/v2/api/filter/filter.go index 3302bdfb..6b6f3f46 100644 --- a/waku/v2/api/filter/filter.go +++ b/waku/v2/api/filter/filter.go @@ -227,23 +227,6 @@ func (apiSub *Sub) resubscribe(failedPeer peer.ID) { apiSub.multiplex(subs) } -// shouldIncrementErrCnt reports whether a Sub.subscribe error should count -// toward the per-5-s-window retry budget filterSubMaxErrCnt. The previous -// implementation gated only on errors.Is(err, utils.ErrNoPeersAvailable) || -// errors.Is(err, swarm.ErrDialBackoff), which matched 0 of 1014 observed -// production failures because the dominant error from -// WakuFilterLightNode.Subscribe is a generic *errors.errorString wrapper -// (or, post-Bug-2-fix, a typed *SubscribeError) — neither of those -// sentinels. With the bound effectively disabled, every subscribe failure -// pushed the closing channel and re-entered subscribe, producing a tight -// retry loop (~1100/sec aggregate observed). -// -// Counting any non-nil error makes the 3-error-per-5-s budget actually bind -// for all failure modes. -func shouldIncrementErrCnt(err error) bool { - return err != nil -} - // shouldHonourRateLimitBackoff reports whether the apiSub is currently within // a rate-limit backoff window and should skip retry triggers. now == rateLimitedUntil // is treated as "window has just elapsed" → false (allow retry), so a zero-value @@ -285,13 +268,12 @@ func (apiSub *Sub) subscribe(contentFilter protocol.ContentFilter, peerCount int ) } - if shouldIncrementErrCnt(err) { - apiSub.errcnt++ - apiSub.log.Debug("errcnt incremented", - zap.Int("new-errcnt", apiSub.errcnt), - zap.Error(err), - ) - } + apiSub.errcnt++ + apiSub.log.Debug("errcnt incremented", + zap.Int("new-errcnt", apiSub.errcnt), + zap.Error(err), + ) + //Inform of error, so that resubscribe can be triggered if required if len(apiSub.closing) < apiSub.Config.MaxPeers { apiSub.closing <- "" diff --git a/waku/v2/api/filter/filter_race_test.go b/waku/v2/api/filter/filter_race_test.go index ea208f5d..5d4ad83d 100644 --- a/waku/v2/api/filter/filter_race_test.go +++ b/waku/v2/api/filter/filter_race_test.go @@ -2,20 +2,15 @@ package filter import ( "context" - "errors" "fmt" - "io" "testing" "time" - "github.com/libp2p/go-libp2p/p2p/net/swarm" "github.com/stretchr/testify/require" "github.com/waku-org/go-waku/waku/v2/onlinechecker" "github.com/waku-org/go-waku/waku/v2/protocol" - pkgfilter "github.com/waku-org/go-waku/waku/v2/protocol/filter" "github.com/waku-org/go-waku/waku/v2/protocol/subscription" - "github.com/waku-org/go-waku/waku/v2/utils" "go.uber.org/zap" ) @@ -194,63 +189,6 @@ func makeNContentTopics(n, seed int) protocol.ContentTopicSet { return protocol.NewContentTopicSet(topics...) } -// TestShouldIncrementErrCnt validates the predicate that gates whether a -// Sub.subscribe error counts toward the per-5-s-window retry budget -// (filterSubMaxErrCnt). The previous implementation used possibleRecursiveError, -// which only matched utils.ErrNoPeersAvailable and swarm.ErrDialBackoff — and -// observed empirically as 0 increments across 1014 production failures because -// the dominant production error is a generic *errors.errorString wrapper -// returned from protocol/filter/client.go (the per-peer aggregator), neither -// of those sentinels. -// -// The fix replaces the predicate with one that counts every non-nil error. -// This test will fail to compile against the unpatched filter.go (the -// shouldIncrementErrCnt symbol does not exist there). -func TestShouldIncrementErrCnt(t *testing.T) { - cases := []struct { - name string - err error - expect bool - }{ - {name: "nil → false", err: nil, expect: false}, - { - name: "plain *errors.errorString (the dominant production error)", - err: errors.New("subscriptions failed for contentTopics: /waku/1/0xabcdef/rfc26"), - expect: true, - }, - { - name: "utils.ErrNoPeersAvailable (was counted before)", - err: utils.ErrNoPeersAvailable, - expect: true, - }, - { - name: "swarm.ErrDialBackoff (was counted before)", - err: swarm.ErrDialBackoff, - expect: true, - }, - { - name: "context.DeadlineExceeded (request timeout)", - err: context.DeadlineExceeded, - expect: true, - }, - { - name: "wrapped via fmt.Errorf %w (io.EOF unwrap chain)", - err: fmt.Errorf("wrapped: %w", io.EOF), - expect: true, - }, - { - name: "*FilterError{Code: 429} (rate limit from server)", - err: &pkgfilter.FilterError{Code: 429, Message: "rate limited"}, - expect: true, - }, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - require.Equal(t, tc.expect, shouldIncrementErrCnt(tc.err)) - }) - } -} - // TestShouldHonourRateLimitBackoff covers the pure predicate that gates retry // triggers in subscriptionLoop once a peer has responded with HTTP 429. While // in the backoff window the subscriptionLoop must stop pushing to apiSub.closing