2024-05-08 16:37:42 -04:00
package main
import (
"context"
"database/sql"
2024-05-29 21:42:22 -04:00
"encoding/hex"
2024-05-20 16:02:09 -04:00
"errors"
2024-05-21 08:02:54 -04:00
"fmt"
"net"
2024-05-09 15:22:50 -04:00
"sync"
2024-05-08 16:37:42 -04:00
"time"
"github.com/ethereum/go-ethereum/common/hexutil"
2024-05-09 16:03:33 -04:00
"github.com/google/uuid"
2024-05-20 16:02:09 -04:00
"github.com/libp2p/go-libp2p/core/peer"
"github.com/libp2p/go-libp2p/core/peerstore"
"github.com/waku-org/go-waku/waku/v2/dnsdisc"
2024-05-08 16:37:42 -04:00
"github.com/waku-org/go-waku/waku/v2/node"
"github.com/waku-org/go-waku/waku/v2/protocol"
"github.com/waku-org/go-waku/waku/v2/protocol/pb"
"github.com/waku-org/go-waku/waku/v2/protocol/store"
"github.com/waku-org/storenode-messages/internal/logging"
"github.com/waku-org/storenode-messages/internal/persistence"
"go.uber.org/zap"
2024-05-28 16:37:00 -04:00
"go.uber.org/zap/zapcore"
2024-05-08 16:37:42 -04:00
"google.golang.org/protobuf/proto"
)
// MessageExistence describes what is known about the presence of a message
// in a given storenode.
type MessageExistence int
const (
// Unknown is the zero value; it is also what a lookup yields for a storenode
// that has no entry recorded for a message.
Unknown MessageExistence = iota
// Exists marks a message that a storenode returned in a query result.
Exists
// DoesNotExist marks a message that was requested by hash from a storenode
// and was not part of the returned results.
DoesNotExist
)
2024-05-09 15:22:50 -04:00
const (
	// timeInterval is the pause between two verification runs and, when a
	// topic has no recorded sync timestamp, the width of the first window
	// that is queried.
	timeInterval = 2 * time.Minute

	// delay is how far behind "now" the end of the queried window is kept.
	delay = 5 * time.Minute

	// maxAttempts is the number of times a storenode request is tried
	// before the storenode is reported as unavailable.
	maxAttempts = 3
)
2024-05-09 15:22:50 -04:00
// MessageAttr holds the attributes of a message that are kept alongside its
// hash so they can be logged and persisted when the message is reported.
type MessageAttr struct {
// Timestamp is the message timestamp as returned by the storenode
// (converted to uint64; units are not visible here — TODO confirm).
Timestamp uint64
// PubsubTopic is the pubsub topic the storenode reported for the message.
PubsubTopic string
}
2024-05-08 16:37:42 -04:00
func Execute ( ctx context . Context , options Options ) error {
// Set encoding for logs (console, json, ...)
// Note that libp2p reads the encoding from GOLOG_LOG_FMT env var.
logging . InitLogger ( options . LogEncoding , options . LogOutput )
logger := logging . Logger ( )
var db * sql . DB
var migrationFn func ( * sql . DB , * zap . Logger ) error
db , migrationFn , err := persistence . ParseURL ( options . DatabaseURL , logger )
if err != nil {
return err
}
2024-05-20 10:10:28 -04:00
dbStore , err := persistence . NewDBStore ( logger , persistence . WithDB ( db ) , persistence . WithMigrations ( migrationFn ) , persistence . WithRetentionPolicy ( options . RetentionPolicy ) )
2024-05-08 16:37:42 -04:00
if err != nil {
return err
}
defer dbStore . Stop ( )
2024-05-20 16:02:09 -04:00
var discoveredNodes [ ] dnsdisc . DiscoveredNode
if len ( options . DNSDiscoveryURLs . Value ( ) ) != 0 {
discoveredNodes = node . GetNodesFromDNSDiscovery ( logger , ctx , options . DNSDiscoveryNameserver , options . DNSDiscoveryURLs . Value ( ) )
}
var storenodes [ ] peer . AddrInfo
for _ , node := range discoveredNodes {
if len ( node . PeerInfo . Addrs ) == 0 {
continue
}
storenodes = append ( storenodes , node . PeerInfo )
}
for _ , node := range options . StoreNodes {
pInfo , err := peer . AddrInfosFromP2pAddrs ( node )
if err != nil {
return err
}
storenodes = append ( storenodes , pInfo ... )
}
if len ( storenodes ) == 0 {
return errors . New ( "no storenodes specified" )
}
2024-05-21 08:02:54 -04:00
hostAddr , err := net . ResolveTCPAddr ( "tcp" , fmt . Sprintf ( "%s:%d" , options . Address , options . Port ) )
if err != nil {
return err
}
2024-05-28 16:37:00 -04:00
lvl , err := zapcore . ParseLevel ( options . LogLevel )
if err != nil {
return err
}
2024-05-08 16:37:42 -04:00
wakuNode , err := node . New (
2024-05-28 16:37:00 -04:00
node . WithLogLevel ( lvl ) ,
2024-05-08 16:37:42 -04:00
node . WithNTP ( ) ,
node . WithClusterID ( uint16 ( options . ClusterID ) ) ,
2024-05-21 08:02:54 -04:00
node . WithHostAddress ( hostAddr ) ,
2024-05-08 16:37:42 -04:00
)
2024-05-09 16:03:33 -04:00
if err != nil {
return err
}
2024-05-08 16:37:42 -04:00
err = wakuNode . Start ( ctx )
if err != nil {
return err
}
defer wakuNode . Stop ( )
2024-05-20 16:02:09 -04:00
for _ , s := range storenodes {
wakuNode . Host ( ) . Peerstore ( ) . AddAddrs ( s . ID , s . Addrs , peerstore . PermanentAddrTTL )
}
2024-05-08 16:37:42 -04:00
err = dbStore . Start ( ctx , wakuNode . Timesource ( ) )
if err != nil {
return err
}
2024-05-09 16:03:33 -04:00
timer := time . NewTimer ( 0 )
defer timer . Stop ( )
2024-05-08 16:37:42 -04:00
for {
select {
case <- ctx . Done ( ) :
2024-05-09 16:03:33 -04:00
return nil
case <- timer . C :
2024-05-16 19:12:10 -04:00
logger . Info ( "verifying message history..." )
2024-05-20 16:02:09 -04:00
err := verifyHistory ( ctx , storenodes , wakuNode , dbStore , logger )
2024-05-09 16:03:33 -04:00
if err != nil {
return err
}
2024-05-16 19:12:10 -04:00
logger . Info ( "verification complete" )
2024-05-09 16:03:33 -04:00
timer . Reset ( timeInterval )
2024-05-09 15:22:50 -04:00
}
}
}
2024-05-08 16:37:42 -04:00
2024-05-09 16:03:33 -04:00
var msgMapLock sync . Mutex
2024-05-20 16:02:09 -04:00
var msgMap map [ pb . MessageHash ] map [ peer . ID ] MessageExistence
2024-05-09 15:22:50 -04:00
var msgAttr map [ pb . MessageHash ] MessageAttr
2024-05-08 16:37:42 -04:00
2024-05-20 16:02:09 -04:00
func verifyHistory ( ctx context . Context , storenodes [ ] peer . AddrInfo , wakuNode * node . WakuNode , dbStore * persistence . DBStore , logger * zap . Logger ) error {
2024-05-29 21:42:22 -04:00
tmpUUID := uuid . New ( )
runId := hex . EncodeToString ( tmpUUID [ : ] )
2024-05-09 16:03:33 -04:00
logger = logger . With ( zap . String ( "runId" , runId ) )
2024-05-09 15:22:50 -04:00
// [MessageHash][StoreNode] = exists?
msgMapLock . Lock ( )
2024-05-20 16:02:09 -04:00
msgMap = make ( map [ pb . MessageHash ] map [ peer . ID ] MessageExistence )
2024-05-16 19:12:10 -04:00
msgAttr = make ( map [ pb . MessageHash ] MessageAttr )
2024-05-09 15:22:50 -04:00
msgMapLock . Unlock ( )
2024-05-08 16:37:42 -04:00
2024-05-09 15:22:50 -04:00
topicSyncStatus , err := dbStore . GetTopicSyncStatus ( ctx , options . ClusterID , options . PubSubTopics . Value ( ) )
if err != nil {
return err
}
2024-05-08 16:37:42 -04:00
2024-05-09 15:22:50 -04:00
tx , err := dbStore . GetTrx ( ctx )
if err != nil {
return err
}
2024-05-08 16:37:42 -04:00
2024-05-09 15:22:50 -04:00
defer func ( ) {
if err == nil {
err = tx . Commit ( )
return
}
// don't shadow original error
_ = tx . Rollback ( )
} ( )
wg := sync . WaitGroup { }
for topic , lastSyncTimestamp := range topicSyncStatus {
2024-05-09 16:03:33 -04:00
wg . Add ( 1 )
go func ( topic string , lastSyncTimestamp * time . Time ) {
2024-05-09 15:22:50 -04:00
defer wg . Done ( )
2024-05-20 16:02:09 -04:00
retrieveHistory ( ctx , runId , storenodes , topic , lastSyncTimestamp , wakuNode , dbStore , tx , logger )
2024-05-09 16:03:33 -04:00
} ( topic , lastSyncTimestamp )
2024-05-09 15:22:50 -04:00
}
wg . Wait ( )
// Verify for each storenode which messages are not available, and query
// for their existence using message hash
// ========================================================================
2024-05-20 16:02:09 -04:00
msgsToVerify := make ( map [ peer . ID ] [ ] pb . MessageHash ) // storenode -> msgHash
2024-05-09 15:22:50 -04:00
msgMapLock . Lock ( )
for msgHash , nodes := range msgMap {
2024-05-20 16:02:09 -04:00
for _ , node := range storenodes {
if nodes [ node . ID ] != Exists {
msgsToVerify [ node . ID ] = append ( msgsToVerify [ node . ID ] , msgHash )
2024-05-09 15:22:50 -04:00
}
}
}
msgMapLock . Unlock ( )
wg = sync . WaitGroup { }
2024-05-20 16:02:09 -04:00
for peerID , messageHashes := range msgsToVerify {
2024-05-09 15:22:50 -04:00
wg . Add ( 1 )
2024-05-20 16:02:09 -04:00
go func ( peerID peer . ID , messageHashes [ ] pb . MessageHash ) {
2024-05-09 15:22:50 -04:00
defer wg . Done ( )
2024-05-29 21:42:22 -04:00
verifyMessageExistence ( ctx , runId , peerID , messageHashes , wakuNode , dbStore , logger )
2024-05-20 16:02:09 -04:00
} ( peerID , messageHashes )
2024-05-09 15:22:50 -04:00
}
wg . Wait ( )
// If a message is not available, store in DB in which store nodes it wasnt
// available and its timestamp
// ========================================================================
msgMapLock . Lock ( )
defer msgMapLock . Unlock ( )
2024-05-20 16:24:16 -04:00
2024-05-09 15:22:50 -04:00
for msgHash , nodes := range msgMap {
2024-05-20 16:02:09 -04:00
var missingIn [ ] peer . AddrInfo
var unknownIn [ ] peer . AddrInfo
for _ , node := range storenodes {
if nodes [ node . ID ] == DoesNotExist {
missingIn = append ( missingIn , node )
} else if nodes [ node . ID ] == Unknown {
unknownIn = append ( unknownIn , node )
2024-05-09 15:22:50 -04:00
}
}
2024-05-08 16:37:42 -04:00
2024-05-20 16:24:16 -04:00
if len ( missingIn ) != 0 {
logger . Info ( "missing message identified" , zap . Stringer ( "hash" , msgHash ) , zap . String ( "pubsubTopic" , msgAttr [ msgHash ] . PubsubTopic ) , zap . Int ( "num_nodes" , len ( missingIn ) ) )
err := dbStore . RecordMessage ( runId , tx , msgHash , options . ClusterID , msgAttr [ msgHash ] . PubsubTopic , msgAttr [ msgHash ] . Timestamp , missingIn , "does_not_exist" )
if err != nil {
return err
}
2024-05-09 15:22:50 -04:00
}
2024-05-08 16:37:42 -04:00
2024-05-20 16:24:16 -04:00
if len ( unknownIn ) != 0 {
logger . Debug ( "message with unknown state identified" , zap . Stringer ( "hash" , msgHash ) , zap . String ( "pubsubTopic" , msgAttr [ msgHash ] . PubsubTopic ) , zap . Int ( "num_nodes" , len ( missingIn ) ) )
err = dbStore . RecordMessage ( runId , tx , msgHash , options . ClusterID , msgAttr [ msgHash ] . PubsubTopic , msgAttr [ msgHash ] . Timestamp , unknownIn , "unknown" )
if err != nil {
return err
}
2024-05-09 15:22:50 -04:00
}
}
2024-05-08 16:37:42 -04:00
2024-05-09 15:22:50 -04:00
if err != nil {
return err
}
2024-05-08 16:37:42 -04:00
2024-05-09 15:22:50 -04:00
return nil
}
2024-05-08 16:37:42 -04:00
2024-05-20 16:02:09 -04:00
func retrieveHistory ( ctx context . Context , runId string , storenodes [ ] peer . AddrInfo , topic string , lastSyncTimestamp * time . Time , wakuNode * node . WakuNode , dbStore * persistence . DBStore , tx * sql . Tx , logger * zap . Logger ) {
2024-05-16 19:12:10 -04:00
logger = logger . With ( zap . String ( "topic" , topic ) , zap . Timep ( "lastSyncTimestamp" , lastSyncTimestamp ) )
2024-05-09 15:22:50 -04:00
now := wakuNode . Timesource ( ) . Now ( )
2024-05-08 16:37:42 -04:00
2024-05-09 15:22:50 -04:00
// Query is done with a delay
2024-05-17 14:38:52 -04:00
startTime := now . Add ( - ( timeInterval + delay ) )
2024-05-09 15:22:50 -04:00
if lastSyncTimestamp != nil {
startTime = * lastSyncTimestamp
}
endTime := now . Add ( - delay )
2024-05-08 16:37:42 -04:00
2024-05-09 15:22:50 -04:00
if startTime . After ( endTime ) {
2024-05-16 19:12:10 -04:00
logger . Warn ( "too soon to retrieve messages for topic" )
2024-05-09 15:22:50 -04:00
return
}
// Determine if the messages exist across all nodes
2024-05-20 16:02:09 -04:00
for _ , node := range storenodes {
2024-05-09 15:22:50 -04:00
storeNodeFailure := false
var result * store . Result
var err error
2024-05-16 19:12:10 -04:00
logger . Info ( "retrieving message history for topic" , zap . Stringer ( "storenode" , node ) , zap . Int64 ( "from" , startTime . UnixNano ( ) ) , zap . Int64 ( "to" , endTime . UnixNano ( ) ) )
queryLbl :
2024-05-09 15:22:50 -04:00
for i := 0 ; i < maxAttempts ; i ++ {
result , err = wakuNode . Store ( ) . Query ( ctx , store . FilterCriteria {
ContentFilter : protocol . NewContentFilter ( topic ) ,
TimeStart : proto . Int64 ( startTime . UnixNano ( ) ) ,
TimeEnd : proto . Int64 ( endTime . UnixNano ( ) ) ,
2024-05-20 16:02:09 -04:00
} , store . WithPeer ( node . ID ) )
2024-05-09 15:22:50 -04:00
if err != nil {
logger . Error ( "could not query storenode" , zap . Stringer ( "storenode" , node ) , zap . Error ( err ) )
storeNodeFailure = true
time . Sleep ( 2 * time . Second )
} else {
2024-05-16 19:12:10 -04:00
logger . Debug ( "messages available?" , zap . Int ( "len" , len ( result . Messages ( ) ) ) )
2024-05-09 15:22:50 -04:00
storeNodeFailure = false
2024-05-16 19:12:10 -04:00
break queryLbl
2024-05-09 15:22:50 -04:00
}
}
2024-05-08 16:37:42 -04:00
2024-05-09 15:22:50 -04:00
if storeNodeFailure {
logger . Error ( "storenode not available" , zap . Stringer ( "storenode" , node ) , zap . Time ( "startTime" , startTime ) , zap . Time ( "endTime" , endTime ) )
2024-05-20 16:02:09 -04:00
err := dbStore . RecordStorenodeUnavailable ( runId , node )
2024-05-20 10:10:28 -04:00
if err != nil {
2024-05-20 16:02:09 -04:00
logger . Error ( "could not store node unavailable" , zap . Error ( err ) , zap . Stringer ( "storenode" , node ) )
2024-05-20 10:10:28 -04:00
}
2024-05-09 15:22:50 -04:00
} else {
2024-05-16 19:12:10 -04:00
iteratorLbl :
for ! result . IsComplete ( ) {
msgMapLock . Lock ( )
for _ , mkv := range result . Messages ( ) {
hash := mkv . WakuMessageHash ( )
_ , ok := msgMap [ hash ]
if ! ok {
2024-05-20 16:02:09 -04:00
msgMap [ hash ] = make ( map [ peer . ID ] MessageExistence )
2024-05-16 19:12:10 -04:00
}
2024-05-20 16:02:09 -04:00
msgMap [ hash ] [ node . ID ] = Exists
2024-05-16 19:12:10 -04:00
msgAttr [ hash ] = MessageAttr {
Timestamp : uint64 ( mkv . Message . GetTimestamp ( ) ) ,
PubsubTopic : mkv . GetPubsubTopic ( ) ,
}
}
msgMapLock . Unlock ( )
storeNodeFailure := false
nextRetryLbl :
2024-05-08 16:37:42 -04:00
for i := 0 ; i < maxAttempts ; i ++ {
2024-05-16 19:12:10 -04:00
err = result . Next ( ctx )
2024-05-08 16:37:42 -04:00
if err != nil {
2024-05-09 15:22:50 -04:00
logger . Error ( "could not query storenode" , zap . Stringer ( "storenode" , node ) , zap . Error ( err ) )
2024-05-08 16:37:42 -04:00
storeNodeFailure = true
time . Sleep ( 2 * time . Second )
} else {
2024-05-16 19:12:10 -04:00
storeNodeFailure = false
break nextRetryLbl
2024-05-08 16:37:42 -04:00
}
}
if storeNodeFailure {
logger . Error ( "storenode not available" ,
2024-05-09 15:22:50 -04:00
zap . Stringer ( "storenode" , node ) ,
zap . Time ( "startTime" , startTime ) ,
zap . Time ( "endTime" , endTime ) ,
zap . String ( "topic" , topic ) ,
zap . String ( "cursor" , hexutil . Encode ( result . Cursor ( ) ) ) )
2024-05-20 16:02:09 -04:00
err := dbStore . RecordStorenodeUnavailable ( runId , node )
if err != nil {
logger . Error ( "could not store recordnode unavailable" , zap . Error ( err ) , zap . Stringer ( "storenode" , node ) )
}
2024-05-16 19:12:10 -04:00
break iteratorLbl
2024-05-08 16:37:42 -04:00
}
}
2024-05-09 15:22:50 -04:00
}
}
2024-05-08 16:37:42 -04:00
2024-05-09 15:22:50 -04:00
// Update db with last sync time
2024-05-16 19:12:10 -04:00
err := dbStore . UpdateTopicSyncState ( tx , options . ClusterID , topic , endTime )
if err != nil {
logger . Panic ( "could not update topic sync state" , zap . Error ( err ) )
}
2024-05-09 15:22:50 -04:00
}
2024-05-08 16:37:42 -04:00
2024-05-29 21:42:22 -04:00
func verifyMessageExistence ( ctx context . Context , runId string , peerID peer . ID , messageHashes [ ] pb . MessageHash , wakuNode * node . WakuNode , dbStore * persistence . DBStore , logger * zap . Logger ) {
2024-05-09 15:22:50 -04:00
storeNodeFailure := false
var result * store . Result
var err error
2024-05-16 19:12:10 -04:00
2024-05-20 16:02:09 -04:00
peerInfo := wakuNode . Host ( ) . Peerstore ( ) . PeerInfo ( peerID )
2024-05-16 19:12:10 -04:00
queryLbl :
2024-05-09 15:22:50 -04:00
for i := 0 ; i < maxAttempts ; i ++ {
2024-05-20 16:02:09 -04:00
result , err = wakuNode . Store ( ) . QueryByHash ( ctx , messageHashes , store . IncludeData ( false ) , store . WithPeer ( peerInfo . ID ) )
2024-05-09 15:22:50 -04:00
if err != nil {
2024-05-20 16:02:09 -04:00
logger . Error ( "could not query storenode" , zap . Stringer ( "storenode" , peerInfo ) , zap . Error ( err ) )
2024-05-09 15:22:50 -04:00
storeNodeFailure = true
time . Sleep ( 2 * time . Second )
} else {
2024-05-16 19:12:10 -04:00
storeNodeFailure = false
break queryLbl
2024-05-09 15:22:50 -04:00
}
}
if storeNodeFailure {
logger . Error ( "storenode not available" ,
2024-05-20 16:02:09 -04:00
zap . Stringer ( "storenode" , peerInfo ) ,
2024-05-09 15:22:50 -04:00
zap . Stringers ( "hashes" , messageHashes ) )
2024-05-20 10:10:28 -04:00
2024-05-20 16:02:09 -04:00
err := dbStore . RecordStorenodeUnavailable ( runId , peerInfo )
2024-05-20 10:10:28 -04:00
if err != nil {
2024-05-20 16:02:09 -04:00
logger . Error ( "could not store recordnode unavailable" , zap . Error ( err ) , zap . Stringer ( "storenode" , peerInfo ) )
2024-05-20 10:10:28 -04:00
}
2024-05-09 15:22:50 -04:00
} else {
2024-05-16 19:12:10 -04:00
for ! result . IsComplete ( ) {
msgMapLock . Lock ( )
for _ , mkv := range result . Messages ( ) {
hash := mkv . WakuMessageHash ( )
_ , ok := msgMap [ hash ]
if ! ok {
2024-05-20 16:02:09 -04:00
msgMap [ hash ] = make ( map [ peer . ID ] MessageExistence )
2024-05-16 19:12:10 -04:00
}
2024-05-20 16:02:09 -04:00
msgMap [ hash ] [ peerInfo . ID ] = Exists
2024-05-16 19:12:10 -04:00
}
for _ , msgHash := range messageHashes {
2024-05-20 16:02:09 -04:00
if msgMap [ msgHash ] [ peerInfo . ID ] != Exists {
msgMap [ msgHash ] [ peerInfo . ID ] = DoesNotExist
2024-05-16 19:12:10 -04:00
}
}
msgMapLock . Unlock ( )
2024-05-09 15:22:50 -04:00
storeNodeFailure = false
2024-05-16 19:12:10 -04:00
nextRetryLbl :
2024-05-09 15:22:50 -04:00
for i := 0 ; i < maxAttempts ; i ++ {
2024-05-16 19:12:10 -04:00
err = result . Next ( ctx )
2024-05-08 16:37:42 -04:00
if err != nil {
2024-05-20 16:02:09 -04:00
logger . Error ( "could not query storenode" , zap . Stringer ( "storenode" , peerInfo ) , zap . Error ( err ) )
2024-05-09 15:22:50 -04:00
storeNodeFailure = true
time . Sleep ( 2 * time . Second )
} else {
2024-05-16 19:12:10 -04:00
storeNodeFailure = false
break nextRetryLbl
2024-05-08 16:37:42 -04:00
}
}
2024-05-09 15:22:50 -04:00
if storeNodeFailure {
logger . Error ( "storenode not available" ,
2024-05-20 16:02:09 -04:00
zap . Stringer ( "storenode" , peerInfo ) ,
2024-05-09 15:22:50 -04:00
zap . Stringers ( "hashes" , messageHashes ) ,
zap . String ( "cursor" , hexutil . Encode ( result . Cursor ( ) ) ) )
2024-05-20 10:10:28 -04:00
2024-05-20 16:02:09 -04:00
err := dbStore . RecordStorenodeUnavailable ( runId , peerInfo )
2024-05-20 10:10:28 -04:00
if err != nil {
2024-05-20 16:02:09 -04:00
logger . Error ( "could not store recordnode unavailable" , zap . Error ( err ) , zap . Stringer ( "storenode" , peerInfo ) )
2024-05-20 10:10:28 -04:00
}
2024-05-09 15:22:50 -04:00
}
2024-05-16 19:12:10 -04:00
2024-05-08 16:37:42 -04:00
}
}
}