fix_: missing message verifier start-stop ,go-waku updates, lightpush service rate limits (#5964)

This commit is contained in:
Prem Chaitanya Prathi 2024-12-23 14:02:48 +05:30 committed by GitHub
parent 810468a57f
commit a4e36d49cd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 107 additions and 48 deletions

2
go.mod
View File

@ -97,7 +97,7 @@ require (
github.com/schollz/peerdiscovery v1.7.0
github.com/siphiuel/lc-proxy-wrapper v0.0.0-20230516150924-246507cee8c7
github.com/urfave/cli/v2 v2.27.2
github.com/waku-org/go-waku v0.8.1-0.20241203032230-6550ff35bc71
github.com/waku-org/go-waku v0.8.1-0.20241219102436-278907543b02
github.com/wk8/go-ordered-map/v2 v2.1.7
github.com/yeqown/go-qrcode/v2 v2.2.1
github.com/yeqown/go-qrcode/writer/standard v1.2.1

4
go.sum
View File

@ -2152,8 +2152,8 @@ github.com/waku-org/go-libp2p-pubsub v0.12.0-gowaku.0.20240823143342-b0f2429ca27
github.com/waku-org/go-libp2p-pubsub v0.12.0-gowaku.0.20240823143342-b0f2429ca27f/go.mod h1:Oi0zw9aw8/Y5GC99zt+Ef2gYAl+0nZlwdJonDyOz/sE=
github.com/waku-org/go-libp2p-rendezvous v0.0.0-20240110193335-a67d1cc760a0 h1:R4YYx2QamhBRl/moIxkDCNW+OP7AHbyWLBygDc/xIMo=
github.com/waku-org/go-libp2p-rendezvous v0.0.0-20240110193335-a67d1cc760a0/go.mod h1:EhZP9fee0DYjKH/IOQvoNSy1tSHp2iZadsHGphcAJgY=
github.com/waku-org/go-waku v0.8.1-0.20241203032230-6550ff35bc71 h1:P9sQncEeeBqBRQEtiLdgQe5oWcTlAV5IVA5VGMqGslc=
github.com/waku-org/go-waku v0.8.1-0.20241203032230-6550ff35bc71/go.mod h1:zYhLgqwBE3sGP2vP+aNiM5moOKlf/uSoIv36puAj9WI=
github.com/waku-org/go-waku v0.8.1-0.20241219102436-278907543b02 h1:4XOKp1EwJ8h5HAnuNXBbhz8zbmjnsZLunuwdMNUYlTQ=
github.com/waku-org/go-waku v0.8.1-0.20241219102436-278907543b02/go.mod h1:zYhLgqwBE3sGP2vP+aNiM5moOKlf/uSoIv36puAj9WI=
github.com/waku-org/go-zerokit-rln v0.1.14-0.20240102145250-fa738c0bdf59 h1:jisj+OCI6QydLtFq3Pyhu49wl9ytPN7oAHjMfepHDrA=
github.com/waku-org/go-zerokit-rln v0.1.14-0.20240102145250-fa738c0bdf59/go.mod h1:1PdBdPzyTaKt3VnpAHk3zj+r9dXPFOr3IHZP9nFle6E=
github.com/waku-org/go-zerokit-rln-apple v0.0.0-20230916172309-ee0ee61dde2b h1:KgZVhsLkxsj5gb/FfndSCQu6VYwALrCOgYI3poR95yE=

View File

@ -61,7 +61,8 @@ type EnevelopeProcessor interface {
OnNewEnvelope(env *protocol.Envelope) error
}
func NewFilterManager(ctx context.Context, logger *zap.Logger, minPeersPerFilter int, envProcessor EnevelopeProcessor, node *filter.WakuFilterLightNode, opts ...SubscribeOptions) *FilterManager {
func NewFilterManager(ctx context.Context, logger *zap.Logger, minPeersPerFilter int,
envProcessor EnevelopeProcessor, node *filter.WakuFilterLightNode, opts ...SubscribeOptions) *FilterManager {
// This fn is being mocked in test
mgr := new(FilterManager)
mgr.ctx = ctx
@ -162,6 +163,7 @@ func (mgr *FilterManager) subscribeAndRunLoop(f filterConfig) {
defer utils.LogOnPanic()
ctx, cancel := context.WithCancel(mgr.ctx)
config := FilterConfig{MaxPeers: mgr.minPeersPerFilter}
sub, err := Subscribe(ctx, mgr.node, f.contentFilter, config, mgr.logger, mgr.params)
mgr.Lock()
mgr.filterSubscriptions[f.ID] = SubDetails{cancel, sub}
@ -188,6 +190,7 @@ func (mgr *FilterManager) OnConnectionStatusChange(pubsubTopic string, newStatus
mgr.logger.Debug("inside on connection status change", zap.Bool("new-status", newStatus),
zap.Int("agg filters count", len(mgr.filterSubscriptions)), zap.Int("filter subs count", len(subs)))
if newStatus && !mgr.onlineChecker.IsOnline() { // switched from offline to Online
mgr.onlineChecker.SetOnline(newStatus)
mgr.NetworkChange()
mgr.logger.Debug("switching from offline to online")
mgr.Lock()

View File

@ -35,18 +35,21 @@ type MessageTracker interface {
// MissingMessageVerifier is used to periodically retrieve missing messages from store nodes that have some specific criteria
type MissingMessageVerifier struct {
ctx context.Context
cancel context.CancelFunc
params missingMessageVerifierParams
storenodeRequestor common.StorenodeRequestor
messageTracker MessageTracker
criteriaInterest map[string]criteriaInterest // Track message verification requests and when was the last time a pubsub topic was verified for missing messages
criteriaInterest map[string]*criteriaInterest // Track message verification requests and when was the last time a pubsub topic was verified for missing messages
criteriaInterestMu sync.RWMutex
C <-chan *protocol.Envelope
C chan *protocol.Envelope
timesource timesource.Timesource
logger *zap.Logger
isRunning bool
runningMutex sync.RWMutex
}
// NewMissingMessageVerifier creates an instance of a MissingMessageVerifier
@ -63,6 +66,8 @@ func NewMissingMessageVerifier(storenodeRequester common.StorenodeRequestor, mes
messageTracker: messageTracker,
logger: logger.Named("missing-msg-verifier"),
params: params,
criteriaInterest: make(map[string]*criteriaInterest),
C: make(chan *protocol.Envelope, 1000),
}
}
@ -94,15 +99,36 @@ func (m *MissingMessageVerifier) SetCriteriaInterest(peerID peer.ID, contentFilt
currMessageVerificationRequest.cancel()
}
m.criteriaInterest[contentFilter.PubsubTopic] = criteriaInterest
m.criteriaInterest[contentFilter.PubsubTopic] = &criteriaInterest
}
func (m *MissingMessageVerifier) setRunning(running bool) {
m.runningMutex.Lock()
defer m.runningMutex.Unlock()
m.isRunning = running
}
func (m *MissingMessageVerifier) Start(ctx context.Context) {
m.ctx = ctx
m.criteriaInterest = make(map[string]criteriaInterest)
m.runningMutex.Lock()
if m.isRunning { //make sure verifier only runs once.
m.runningMutex.Unlock()
return
}
m.isRunning = true
m.runningMutex.Unlock()
c := make(chan *protocol.Envelope, 1000)
m.C = c
ctx, cancelFunc := context.WithCancel(ctx)
m.ctx = ctx
m.cancel = cancelFunc
// updating context for existing criteria
m.criteriaInterestMu.Lock()
for _, value := range m.criteriaInterest {
ctx, cancel := context.WithCancel(m.ctx)
value.ctx = ctx
value.cancel = cancel
}
m.criteriaInterestMu.Unlock()
go func() {
defer utils.LogOnPanic()
@ -117,30 +143,39 @@ func (m *MissingMessageVerifier) Start(ctx context.Context) {
m.criteriaInterestMu.RLock()
critIntList := make([]criteriaInterest, 0, len(m.criteriaInterest))
for _, value := range m.criteriaInterest {
critIntList = append(critIntList, value)
critIntList = append(critIntList, *value)
}
m.criteriaInterestMu.RUnlock()
for _, interest := range critIntList {
select {
case <-ctx.Done():
m.setRunning(false)
return
default:
semaphore <- struct{}{}
go func(interest criteriaInterest) {
defer utils.LogOnPanic()
m.fetchHistory(c, interest)
m.fetchHistory(m.C, interest)
<-semaphore
}(interest)
}
}
case <-ctx.Done():
m.setRunning(false)
return
}
}
}()
}
func (m *MissingMessageVerifier) Stop() {
m.cancel()
m.runningMutex.Lock()
defer m.runningMutex.Unlock()
m.isRunning = false
}
func (m *MissingMessageVerifier) fetchHistory(c chan<- *protocol.Envelope, interest criteriaInterest) {
contentTopics := interest.contentFilter.ContentTopics.ToList()
for i := 0; i < len(contentTopics); i += maxContentTopicsPerRequest {

View File

@ -16,8 +16,8 @@ import (
)
const DefaultPeersToPublishForLightpush = 2
const DefaultPublishingLimiterRate = rate.Limit(2)
const DefaultPublishingLimitBurst = 4
const DefaultPublishingLimiterRate = rate.Limit(5)
const DefaultPublishingLimitBurst = 10
type PublishMethod int

View File

@ -102,7 +102,6 @@ const maxFailedAttempts = 5
const prunePeerStoreInterval = 10 * time.Minute
const peerConnectivityLoopSecs = 15
const maxConnsToPeerRatio = 3
const badPeersCleanupInterval = 1 * time.Minute
const maxDialFailures = 2
// 80% relay peers 20% service peers
@ -258,32 +257,27 @@ func (pm *PeerManager) Start(ctx context.Context) {
}
}
func (pm *PeerManager) removeBadPeers() {
if !pm.RelayEnabled {
for _, peerID := range pm.host.Peerstore().Peers() {
if pm.host.Peerstore().(wps.WakuPeerstore).ConnFailures(peerID) > maxDialFailures {
func (pm *PeerManager) CheckAndRemoveBadPeer(peerID peer.ID) {
if pm.host.Peerstore().(wps.WakuPeerstore).ConnFailures(peerID) > maxDialFailures &&
pm.peerConnector.onlineChecker.IsOnline() {
if origin, _ := pm.host.Peerstore().(wps.WakuPeerstore).Origin(peerID); origin != wps.Static { // delete only if a peer is discovered and not configured statically.
//delete peer from peerStore
pm.logger.Debug("removing bad peer due to recurring dial failures", zap.Stringer("peerID", peerID))
pm.RemovePeer(peerID)
}
}
}
}
func (pm *PeerManager) peerStoreLoop(ctx context.Context) {
defer utils.LogOnPanic()
t := time.NewTicker(prunePeerStoreInterval)
t1 := time.NewTicker(badPeersCleanupInterval)
defer t.Stop()
defer t1.Stop()
for {
select {
case <-ctx.Done():
return
case <-t.C:
pm.prunePeerStore()
case <-t1.C:
pm.removeBadPeers()
}
}
}
@ -749,6 +743,7 @@ func (pm *PeerManager) HandleDialError(err error, peerID peer.ID) {
if err == nil || errors.Is(err, context.Canceled) {
return
}
if pm.peerConnector != nil {
pm.peerConnector.addConnectionBackoff(peerID)
}
@ -762,9 +757,4 @@ func (pm *PeerManager) HandleDialError(err error, peerID peer.ID) {
pm.logger.Error("failed to emit DialError", zap.Error(emitterErr))
}
}
if !pm.RelayEnabled && pm.host.Peerstore().(wps.WakuPeerstore).ConnFailures(peerID) >= maxDialFailures {
//delete peer from peerStore
pm.logger.Debug("removing bad peer due to recurring dial failures", zap.Stringer("peerID", peerID))
pm.RemovePeer(peerID)
}
}

View File

@ -130,6 +130,7 @@ func (pm *PeerManager) selectServicePeer(criteria PeerSelectionCriteria) (PeerSe
if len(criteria.PubsubTopics) == 0 || (len(criteria.PubsubTopics) == 1 && criteria.PubsubTopics[0] == "") {
return slot.getRandom(criteria.MaxPeers, criteria.ExcludePeers)
} else { //PubsubTopic based selection
slot.mu.RLock()
keys := make([]peer.ID, 0, len(slot.m))
for i := range slot.m {
if PeerInSet(criteria.ExcludePeers, i) {
@ -137,6 +138,7 @@ func (pm *PeerManager) selectServicePeer(criteria PeerSelectionCriteria) (PeerSe
}
keys = append(keys, i)
}
slot.mu.RUnlock()
selectedPeers := pm.host.Peerstore().(wps.WakuPeerstore).PeersByPubSubTopics(criteria.PubsubTopics, keys...)
tmpPeers, err := selectRandomPeers(selectedPeers, criteria.ExcludePeers, criteria.MaxPeers)
for tmpPeer := range tmpPeers {

View File

@ -15,6 +15,7 @@ import (
"github.com/libp2p/go-libp2p/core/network"
"github.com/libp2p/go-libp2p/core/peer"
libp2pProtocol "github.com/libp2p/go-libp2p/core/protocol"
"github.com/libp2p/go-libp2p/p2p/net/swarm"
"github.com/libp2p/go-msgio/pbio"
"github.com/prometheus/client_golang/prometheus"
"github.com/waku-org/go-waku/logging"
@ -267,6 +268,10 @@ func (wf *WakuFilterLightNode) request(ctx context.Context, requestID []byte,
wf.metrics.RecordError(dialFailure)
if wf.pm != nil {
wf.pm.HandleDialError(err, peerID)
if errors.Is(err, swarm.ErrAllDialsFailed) ||
errors.Is(err, swarm.ErrDialBackoff) || errors.Is(err, swarm.ErrNoAddresses) {
wf.pm.CheckAndRemoveBadPeer(peerID)
}
}
return err
}
@ -355,7 +360,7 @@ func (wf *WakuFilterLightNode) handleFilterSubscribeOptions(ctx context.Context,
if params.pm != nil && reqPeerCount > 0 {
wf.log.Debug("handleFilterSubscribeOptions", zap.Int("peerCount", reqPeerCount), zap.Int("excludePeersLen", len(params.peersToExclude)))
params.selectedPeers, err = wf.pm.SelectPeers(
selectedPeers, err := wf.pm.SelectPeers(
peermanager.PeerSelectionCriteria{
SelectionType: params.peerSelectionType,
Proto: FilterSubscribeID_v20beta1,
@ -368,9 +373,14 @@ func (wf *WakuFilterLightNode) handleFilterSubscribeOptions(ctx context.Context,
)
if err != nil {
wf.log.Error("peer selection returned err", zap.Error(err))
if len(params.selectedPeers) == 0 {
return nil, nil, err
}
}
if len(selectedPeers) > 0 {
params.selectedPeers = append(params.selectedPeers, selectedPeers...)
}
}
wf.log.Debug("handleFilterSubscribeOptions exit", zap.Int("selectedPeerCount", len(params.selectedPeers)))
return params, pubSubTopicMap, nil

View File

@ -24,7 +24,7 @@ func (wf *WakuFilterLightNode) PingPeer(peer peer.ID) {
ctxWithTimeout, cancel := context.WithTimeout(wf.CommonService.Context(), PingTimeout)
defer cancel()
err := wf.Ping(ctxWithTimeout, peer)
if err != nil {
if err != nil && wf.onlineChecker.IsOnline() {
wf.log.Warn("Filter ping failed towards peer", zap.Stringer("peer", peer), zap.Error(err))
//quickly retry ping again before marking subscription as failure
//Note that PingTimeout is a fraction of PingInterval so this shouldn't cause parallel pings being sent.

View File

@ -83,7 +83,7 @@ func WithLightNodeRateLimiter(r rate.Limit, b int) LightNodeOption {
func DefaultLightNodeOptions() []LightNodeOption {
return []LightNodeOption{
WithLightNodeRateLimiter(1, 1),
WithLightNodeRateLimiter(15, 20),
}
}

View File

@ -13,6 +13,7 @@ import (
"github.com/libp2p/go-libp2p/core/network"
"github.com/libp2p/go-libp2p/core/peer"
libp2pProtocol "github.com/libp2p/go-libp2p/core/protocol"
"github.com/libp2p/go-libp2p/p2p/net/swarm"
"github.com/libp2p/go-msgio/pbio"
"github.com/prometheus/client_golang/prometheus"
"github.com/waku-org/go-waku/logging"
@ -198,6 +199,10 @@ func (wakuLP *WakuLightPush) request(ctx context.Context, req *pb.PushRequest, p
wakuLP.metrics.RecordError(dialFailure)
if wakuLP.pm != nil {
wakuLP.pm.HandleDialError(err, peerID)
if errors.Is(err, swarm.ErrAllDialsFailed) ||
errors.Is(err, swarm.ErrDialBackoff) || errors.Is(err, swarm.ErrNoAddresses) {
wakuLP.pm.CheckAndRemoveBadPeer(peerID)
}
}
return nil, err
}

View File

@ -6,6 +6,7 @@ import (
"errors"
"fmt"
"math"
"sync"
"github.com/libp2p/go-libp2p/core/host"
"github.com/libp2p/go-libp2p/core/peer"
@ -73,6 +74,7 @@ type WakuStore struct {
defaultRatelimit rate.Limit
rateLimiters map[peer.ID]*rate.Limiter
rateLimitersMux sync.Mutex
}
// NewWakuStore is used to instantiate a StoreV3 client
@ -297,11 +299,13 @@ func (s *WakuStore) queryFrom(ctx context.Context, storeRequest *pb.StoreQueryRe
logger.Debug("sending store request")
if !params.skipRatelimit {
s.rateLimitersMux.Lock()
rateLimiter, ok := s.rateLimiters[params.selectedPeer]
if !ok {
rateLimiter = rate.NewLimiter(s.defaultRatelimit, 1)
s.rateLimiters[params.selectedPeer] = rateLimiter
}
s.rateLimitersMux.Unlock()
err := rateLimiter.Wait(ctx)
if err != nil {
return nil, err

2
vendor/modules.txt vendored
View File

@ -1044,7 +1044,7 @@ github.com/waku-org/go-discover/discover/v5wire
github.com/waku-org/go-libp2p-rendezvous
github.com/waku-org/go-libp2p-rendezvous/db
github.com/waku-org/go-libp2p-rendezvous/pb
# github.com/waku-org/go-waku v0.8.1-0.20241203032230-6550ff35bc71
# github.com/waku-org/go-waku v0.8.1-0.20241219102436-278907543b02
## explicit; go 1.21
github.com/waku-org/go-waku/logging
github.com/waku-org/go-waku/tests

View File

@ -349,7 +349,7 @@ func New(nodeKey *ecdsa.PrivateKey, fleet string, cfg *Config, logger *zap.Logge
if !cfg.LightClient {
opts = append(opts, node.WithWakuFilterFullNode(filter.WithMaxSubscribers(20)))
opts = append(opts, node.WithLightPush(lightpush.WithRateLimiter(1, 1)))
opts = append(opts, node.WithLightPush(lightpush.WithRateLimiter(5, 10)))
}
if appDB != nil {
@ -1371,7 +1371,6 @@ func (w *Waku) SetTopicsToVerifyForMissingMessages(peerID peer.ID, pubsubTopic s
if !w.cfg.EnableMissingMessageVerification {
return
}
w.missingMsgVerifier.SetCriteriaInterest(peerID, protocol.NewContentFilter(pubsubTopic, contentTopics...))
}
@ -1721,12 +1720,22 @@ func (w *Waku) isGoingOnline(state connection.State) bool {
return !state.Offline && !w.onlineChecker.IsOnline()
}
func (w *Waku) isGoingOffline(state connection.State) bool {
return state.Offline && w.onlineChecker.IsOnline()
}
func (w *Waku) ConnectionChanged(state connection.State) {
if w.isGoingOnline(state) {
//TODO: analyze if we need to discover and connect to peers for relay.
w.discoverAndConnectPeers()
if w.cfg.EnableMissingMessageVerification {
w.missingMsgVerifier.Start(w.ctx)
}
}
if w.isGoingOffline(state) && w.cfg.EnableMissingMessageVerification {
w.missingMsgVerifier.Stop()
}
isOnline := !state.Offline
if w.cfg.LightClient {
//TODO: Update this as per https://github.com/waku-org/go-waku/issues/1114
go func() {
@ -1743,9 +1752,9 @@ func (w *Waku) ConnectionChanged(state connection.State) {
w.logger.Warn("could not write on connection changed channel")
}
}
}
// update state
w.onlineChecker.SetOnline(isOnline)
}
w.state = state
}

View File

@ -674,6 +674,7 @@ func TestOnlineChecker(t *testing.T) {
}
func TestLightpushRateLimit(t *testing.T) {
t.Skip("flaky as it is hard to simulate rate-limits as execution time varies in environments")
logger := tt.MustCreateTestLogger()
config0 := &Config{}
@ -753,7 +754,7 @@ func TestLightpushRateLimit(t *testing.T) {
event := make(chan common.EnvelopeEvent, 10)
w2.SubscribeEnvelopeEvents(event)
for i := range [4]int{} {
for i := range [15]int{} {
msgTimestamp := w2.timestamp()
_, err := w2.Send(config2.DefaultShardPubsubTopic, &pb.WakuMessage{
Payload: []byte{1, 2, 3, 4, 5, 6, byte(i)},
@ -764,12 +765,12 @@ func TestLightpushRateLimit(t *testing.T) {
require.NoError(t, err)
time.Sleep(550 * time.Millisecond)
time.Sleep(20 * time.Millisecond)
}
messages := filter.Retrieve()
require.Len(t, messages, 2)
require.Len(t, messages, 10)
}