2021-11-03 13:38:37 +01:00
package telemetry
import (
"bytes"
2024-06-13 15:31:09 -07:00
"context"
2021-11-03 13:38:37 +01:00
"encoding/json"
"fmt"
"net/http"
2024-08-01 05:27:43 +02:00
"strings"
2024-06-28 03:24:04 -07:00
"sync"
2021-11-03 13:38:37 +01:00
"time"
"go.uber.org/zap"
"github.com/status-im/status-go/eth-node/types"
"github.com/status-im/status-go/protocol/transport"
2024-06-13 15:31:09 -07:00
"github.com/status-im/status-go/wakuv2"
2023-10-30 15:51:57 +01:00
v2protocol "github.com/waku-org/go-waku/waku/v2/protocol"
2024-06-19 11:40:52 +01:00
v1protocol "github.com/status-im/status-go/protocol/v1"
2021-11-03 13:38:37 +01:00
)
2024-06-13 15:31:09 -07:00
// TelemetryType names the kind of metric carried by a TelemetryRequest.
type TelemetryType string
const (
2024-06-28 03:24:04 -07:00
ProtocolStatsMetric TelemetryType = "ProtocolStats"
ReceivedEnvelopeMetric TelemetryType = "ReceivedEnvelope"
SentEnvelopeMetric TelemetryType = "SentEnvelope"
UpdateEnvelopeMetric TelemetryType = "UpdateEnvelope"
ReceivedMessagesMetric TelemetryType = "ReceivedMessages"
ErrorSendingEnvelopeMetric TelemetryType = "ErrorSendingEnvelope"
2024-07-12 13:37:55 -07:00
PeerCountMetric TelemetryType = "PeerCount"
2024-08-05 11:44:57 -07:00
PeerConnFailuresMetric TelemetryType = "PeerConnFailure"
2024-07-01 20:08:54 +02:00
MaxRetryCache = 5000
2024-06-13 15:31:09 -07:00
)
// TelemetryRequest is one telemetry record as it is serialized to the server.
type TelemetryRequest struct {
	// Id is the sequence number assigned by the client when the record is queued.
	Id int `json:"id"`
	// TelemetryType tells the server how to interpret TelemetryData.
	TelemetryType TelemetryType `json:"telemetry_type"`
	// TelemetryData is the already-encoded JSON payload for this record.
	TelemetryData *json.RawMessage `json:"telemetry_data"`
}
2024-08-12 22:30:13 +01:00
func ( c * Client ) PushReceivedMessages ( ctx context . Context , receivedMessages ReceivedMessages ) {
c . processAndPushTelemetry ( ctx , receivedMessages )
2024-06-13 15:31:09 -07:00
}
2024-08-12 22:30:13 +01:00
func ( c * Client ) PushSentEnvelope ( ctx context . Context , sentEnvelope wakuv2 . SentEnvelope ) {
c . processAndPushTelemetry ( ctx , sentEnvelope )
2024-06-13 15:31:09 -07:00
}
2024-08-12 22:30:13 +01:00
func ( c * Client ) PushReceivedEnvelope ( ctx context . Context , receivedEnvelope * v2protocol . Envelope ) {
c . processAndPushTelemetry ( ctx , receivedEnvelope )
2024-06-28 03:24:04 -07:00
}
2024-08-12 22:30:13 +01:00
func ( c * Client ) PushErrorSendingEnvelope ( ctx context . Context , errorSendingEnvelope wakuv2 . ErrorSendingEnvelope ) {
c . processAndPushTelemetry ( ctx , errorSendingEnvelope )
2024-06-13 15:31:09 -07:00
}
2024-08-12 22:30:13 +01:00
func ( c * Client ) PushPeerCount ( ctx context . Context , peerCount int ) {
2024-08-05 11:44:57 -07:00
if peerCount != c . lastPeerCount {
c . lastPeerCount = peerCount
2024-08-12 22:30:13 +01:00
c . processAndPushTelemetry ( ctx , PeerCount { PeerCount : peerCount } )
2024-08-05 11:44:57 -07:00
}
}
2024-08-12 22:30:13 +01:00
func ( c * Client ) PushPeerConnFailures ( ctx context . Context , peerConnFailures map [ string ] int ) {
2024-08-05 11:44:57 -07:00
for peerID , failures := range peerConnFailures {
if lastFailures , exists := c . lastPeerConnFailures [ peerID ] ; exists {
if failures == lastFailures {
continue
}
}
c . lastPeerConnFailures [ peerID ] = failures
2024-08-12 22:30:13 +01:00
c . processAndPushTelemetry ( ctx , PeerConnFailure { FailedPeerId : peerID , FailureCount : failures } )
2024-08-05 11:44:57 -07:00
}
2024-07-12 13:37:55 -07:00
}
2024-06-13 15:31:09 -07:00
// ReceivedMessages bundles what is needed to report a batch of received
// messages: the filter that matched, the transport-level message, and the
// status messages extracted from it.
type ReceivedMessages struct {
	// Filter supplies the chat ID, pubsub topic and content topic for the report.
	Filter transport.Filter
	// SSHMessage supplies the hash, timestamp and payload size for the report.
	SSHMessage *types.Message
	// Messages are reported one entry each, by ID and type.
	Messages []*v1protocol.StatusMessage
}
2024-07-12 13:37:55 -07:00
// PeerCount is the telemetry input carrying a peer count.
type PeerCount struct {
	// PeerCount is the number of peers being reported.
	PeerCount int
}
2024-08-05 11:44:57 -07:00
// PeerConnFailure is the telemetry input carrying the connection-failure
// count for a single peer.
type PeerConnFailure struct {
	// FailedPeerId identifies the peer the failures relate to.
	FailedPeerId string
	// FailureCount is the failure count reported for that peer.
	FailureCount int
}
2021-11-03 13:38:37 +01:00
type Client struct {
2024-08-05 11:44:57 -07:00
serverURL string
httpClient * http . Client
logger * zap . Logger
keyUID string
nodeName string
peerId string
version string
telemetryCh chan TelemetryRequest
telemetryCacheLock sync . Mutex
telemetryCache [ ] TelemetryRequest
telemetryRetryCache [ ] TelemetryRequest
nextIdLock sync . Mutex
nextId int
sendPeriod time . Duration
lastPeerCount int
lastPeerConnFailures map [ string ] int
2021-11-03 13:38:37 +01:00
}
2024-06-28 03:24:04 -07:00
// TelemetryClientOption customizes a Client while it is being built by NewClient.
type TelemetryClientOption func(*Client)
// WithSendPeriod returns an option that sets the client's base interval
// between send attempts to sendPeriod.
func WithSendPeriod(sendPeriod time.Duration) TelemetryClientOption {
	setPeriod := func(client *Client) {
		client.sendPeriod = sendPeriod
	}
	return setPeriod
}
2024-08-01 05:27:43 +02:00
// WithPeerID returns an option that sets the peer ID included in the
// client's outgoing payloads.
func WithPeerID(peerId string) TelemetryClientOption {
	setPeerID := func(client *Client) {
		client.peerId = peerId
	}
	return setPeerID
}
2024-06-28 03:24:04 -07:00
func NewClient ( logger * zap . Logger , serverURL string , keyUID string , nodeName string , version string , opts ... TelemetryClientOption ) * Client {
2024-08-01 05:27:43 +02:00
serverURL = strings . TrimRight ( serverURL , "/" )
2024-06-28 03:24:04 -07:00
client := & Client {
2024-08-05 11:44:57 -07:00
serverURL : serverURL ,
httpClient : & http . Client { Timeout : time . Minute } ,
logger : logger ,
keyUID : keyUID ,
nodeName : nodeName ,
version : version ,
telemetryCh : make ( chan TelemetryRequest ) ,
telemetryCacheLock : sync . Mutex { } ,
telemetryCache : make ( [ ] TelemetryRequest , 0 ) ,
telemetryRetryCache : make ( [ ] TelemetryRequest , 0 ) ,
nextId : 0 ,
nextIdLock : sync . Mutex { } ,
sendPeriod : 10 * time . Second , // default value
lastPeerCount : 0 ,
lastPeerConnFailures : make ( map [ string ] int ) ,
2024-06-28 03:24:04 -07:00
}
for _ , opt := range opts {
opt ( client )
2024-06-13 15:31:09 -07:00
}
2024-06-28 03:24:04 -07:00
return client
2024-06-13 15:31:09 -07:00
}
2024-06-28 03:24:04 -07:00
func ( c * Client ) Start ( ctx context . Context ) {
2024-06-13 15:31:09 -07:00
go func ( ) {
for {
select {
2024-06-28 03:24:04 -07:00
case telemetryRequest := <- c . telemetryCh :
c . telemetryCacheLock . Lock ( )
c . telemetryCache = append ( c . telemetryCache , telemetryRequest )
c . telemetryCacheLock . Unlock ( )
2024-06-13 15:31:09 -07:00
case <- ctx . Done ( ) :
return
}
}
} ( )
go func ( ) {
2024-07-01 20:08:54 +02:00
sendPeriod := c . sendPeriod
timer := time . NewTimer ( sendPeriod )
defer timer . Stop ( )
2024-06-13 15:31:09 -07:00
for {
select {
2024-07-01 20:08:54 +02:00
case <- timer . C :
2024-06-28 03:24:04 -07:00
c . telemetryCacheLock . Lock ( )
telemetryRequests := make ( [ ] TelemetryRequest , len ( c . telemetryCache ) )
copy ( telemetryRequests , c . telemetryCache )
c . telemetryCache = nil
c . telemetryCacheLock . Unlock ( )
2024-06-13 15:31:09 -07:00
if len ( telemetryRequests ) > 0 {
2024-07-01 20:08:54 +02:00
err := c . pushTelemetryRequest ( telemetryRequests )
if err != nil {
2024-07-12 03:07:23 +02:00
if sendPeriod < 60 * time . Second { //Stop the growing if the timer is > 60s to at least retry every minute
2024-07-01 20:08:54 +02:00
sendPeriod = sendPeriod * 2
}
} else {
sendPeriod = c . sendPeriod
}
2024-06-13 15:31:09 -07:00
}
2024-07-01 20:08:54 +02:00
timer . Reset ( sendPeriod )
2024-06-13 15:31:09 -07:00
case <- ctx . Done ( ) :
return
}
}
2024-06-28 03:24:04 -07:00
2024-06-13 15:31:09 -07:00
} ( )
}
2024-08-12 22:30:13 +01:00
func ( c * Client ) processAndPushTelemetry ( ctx context . Context , data interface { } ) {
2024-06-13 15:31:09 -07:00
var telemetryRequest TelemetryRequest
switch v := data . ( type ) {
case ReceivedMessages :
telemetryRequest = TelemetryRequest {
Id : c . nextId ,
TelemetryType : ReceivedMessagesMetric ,
TelemetryData : c . ProcessReceivedMessages ( v ) ,
}
case * v2protocol . Envelope :
telemetryRequest = TelemetryRequest {
Id : c . nextId ,
TelemetryType : ReceivedEnvelopeMetric ,
TelemetryData : c . ProcessReceivedEnvelope ( v ) ,
}
case wakuv2 . SentEnvelope :
telemetryRequest = TelemetryRequest {
Id : c . nextId ,
TelemetryType : SentEnvelopeMetric ,
TelemetryData : c . ProcessSentEnvelope ( v ) ,
}
2024-06-28 03:24:04 -07:00
case wakuv2 . ErrorSendingEnvelope :
telemetryRequest = TelemetryRequest {
Id : c . nextId ,
TelemetryType : ErrorSendingEnvelopeMetric ,
TelemetryData : c . ProcessErrorSendingEnvelope ( v ) ,
}
2024-07-12 13:37:55 -07:00
case PeerCount :
telemetryRequest = TelemetryRequest {
Id : c . nextId ,
TelemetryType : PeerCountMetric ,
TelemetryData : c . ProcessPeerCount ( v ) ,
}
2024-08-05 11:44:57 -07:00
case PeerConnFailure :
telemetryRequest = TelemetryRequest {
Id : c . nextId ,
TelemetryType : PeerConnFailuresMetric ,
TelemetryData : c . ProcessPeerConnFailure ( v ) ,
}
2024-06-13 15:31:09 -07:00
default :
c . logger . Error ( "Unknown telemetry data type" )
return
}
2024-08-12 22:30:13 +01:00
select {
case <- ctx . Done ( ) :
return
case c . telemetryCh <- telemetryRequest :
}
2024-06-28 03:24:04 -07:00
c . nextIdLock . Lock ( )
c . nextId ++
c . nextIdLock . Unlock ( )
2024-06-13 15:31:09 -07:00
}
2024-07-01 20:08:54 +02:00
// This is assuming to not run concurrently as we are not locking the `telemetryRetryCache`
func ( c * Client ) pushTelemetryRequest ( request [ ] TelemetryRequest ) error {
2024-07-12 03:07:23 +02:00
if len ( c . telemetryRetryCache ) > MaxRetryCache { //Limit the size of the cache to not grow the slice indefinitely in case the Telemetry server is gone for longer time
removeNum := len ( c . telemetryRetryCache ) - MaxRetryCache
2024-07-01 20:08:54 +02:00
c . telemetryRetryCache = c . telemetryRetryCache [ removeNum : ]
}
c . telemetryRetryCache = append ( c . telemetryRetryCache , request ... )
2024-06-13 15:31:09 -07:00
url := fmt . Sprintf ( "%s/record-metrics" , c . serverURL )
2024-07-01 20:08:54 +02:00
body , err := json . Marshal ( c . telemetryRetryCache )
if err != nil {
c . logger . Error ( "Error marshaling telemetry data" , zap . Error ( err ) )
return err
}
res , err := c . httpClient . Post ( url , "application/json" , bytes . NewBuffer ( body ) )
2024-06-13 15:31:09 -07:00
if err != nil {
c . logger . Error ( "Error sending telemetry data" , zap . Error ( err ) )
2024-07-01 20:08:54 +02:00
return err
}
2024-07-12 13:37:55 -07:00
defer res . Body . Close ( )
var responseBody [ ] map [ string ] interface { }
if err := json . NewDecoder ( res . Body ) . Decode ( & responseBody ) ; err != nil {
c . logger . Error ( "Error decoding response body" , zap . Error ( err ) )
return err
}
if res . StatusCode != http . StatusCreated {
c . logger . Error ( "Error sending telemetry data" , zap . Int ( "statusCode" , res . StatusCode ) , zap . Any ( "responseBody" , responseBody ) )
return fmt . Errorf ( "status code %d, response body: %v" , res . StatusCode , responseBody )
2021-11-03 13:38:37 +01:00
}
2024-07-01 20:08:54 +02:00
c . telemetryRetryCache = nil
return nil
2021-11-03 13:38:37 +01:00
}
2024-06-13 15:31:09 -07:00
func ( c * Client ) ProcessReceivedMessages ( receivedMessages ReceivedMessages ) * json . RawMessage {
2021-11-03 13:38:37 +01:00
var postBody [ ] map [ string ] interface { }
2024-06-13 15:31:09 -07:00
for _ , message := range receivedMessages . Messages {
2021-11-03 13:38:37 +01:00
postBody = append ( postBody , map [ string ] interface { } {
2024-06-13 15:31:09 -07:00
"chatId" : receivedMessages . Filter . ChatID ,
"messageHash" : types . EncodeHex ( receivedMessages . SSHMessage . Hash ) ,
2023-11-08 19:05:33 +01:00
"messageId" : message . ApplicationLayer . ID ,
2024-06-13 15:31:09 -07:00
"sentAt" : receivedMessages . SSHMessage . Timestamp ,
"pubsubTopic" : receivedMessages . Filter . PubsubTopic ,
"topic" : receivedMessages . Filter . ContentTopic . String ( ) ,
2023-11-08 19:05:33 +01:00
"messageType" : message . ApplicationLayer . Type . String ( ) ,
2021-11-03 13:38:37 +01:00
"receiverKeyUID" : c . keyUID ,
2024-08-01 05:27:43 +02:00
"peerId" : c . peerId ,
2021-11-03 13:38:37 +01:00
"nodeName" : c . nodeName ,
2024-06-13 15:31:09 -07:00
"messageSize" : len ( receivedMessages . SSHMessage . Payload ) ,
"statusVersion" : c . version ,
2021-11-03 13:38:37 +01:00
} )
}
body , _ := json . Marshal ( postBody )
2024-06-13 15:31:09 -07:00
jsonRawMessage := json . RawMessage ( body )
return & jsonRawMessage
2021-11-03 13:38:37 +01:00
}
2023-10-30 15:51:57 +01:00
2024-06-13 15:31:09 -07:00
func ( c * Client ) ProcessReceivedEnvelope ( envelope * v2protocol . Envelope ) * json . RawMessage {
2023-10-30 15:51:57 +01:00
postBody := map [ string ] interface { } {
2024-05-15 19:15:00 -04:00
"messageHash" : envelope . Hash ( ) . String ( ) ,
2023-12-05 12:29:27 +08:00
"sentAt" : uint32 ( envelope . Message ( ) . GetTimestamp ( ) / int64 ( time . Second ) ) ,
2023-10-30 15:51:57 +01:00
"pubsubTopic" : envelope . PubsubTopic ( ) ,
"topic" : envelope . Message ( ) . ContentTopic ,
"receiverKeyUID" : c . keyUID ,
2024-08-01 05:27:43 +02:00
"peerId" : c . peerId ,
2023-10-30 15:51:57 +01:00
"nodeName" : c . nodeName ,
2024-06-13 15:31:09 -07:00
"statusVersion" : c . version ,
2023-10-30 15:51:57 +01:00
}
body , _ := json . Marshal ( postBody )
2024-06-13 15:31:09 -07:00
jsonRawMessage := json . RawMessage ( body )
return & jsonRawMessage
}
func ( c * Client ) ProcessSentEnvelope ( sentEnvelope wakuv2 . SentEnvelope ) * json . RawMessage {
postBody := map [ string ] interface { } {
"messageHash" : sentEnvelope . Envelope . Hash ( ) . String ( ) ,
"sentAt" : uint32 ( sentEnvelope . Envelope . Message ( ) . GetTimestamp ( ) / int64 ( time . Second ) ) ,
"pubsubTopic" : sentEnvelope . Envelope . PubsubTopic ( ) ,
"topic" : sentEnvelope . Envelope . Message ( ) . ContentTopic ,
"senderKeyUID" : c . keyUID ,
2024-08-01 05:27:43 +02:00
"peerId" : c . peerId ,
2024-06-13 15:31:09 -07:00
"nodeName" : c . nodeName ,
"publishMethod" : sentEnvelope . PublishMethod . String ( ) ,
"statusVersion" : c . version ,
2023-10-30 15:51:57 +01:00
}
2024-06-13 15:31:09 -07:00
body , _ := json . Marshal ( postBody )
jsonRawMessage := json . RawMessage ( body )
return & jsonRawMessage
2023-10-30 15:51:57 +01:00
}
2024-06-28 03:24:04 -07:00
func ( c * Client ) ProcessErrorSendingEnvelope ( errorSendingEnvelope wakuv2 . ErrorSendingEnvelope ) * json . RawMessage {
postBody := map [ string ] interface { } {
"messageHash" : errorSendingEnvelope . SentEnvelope . Envelope . Hash ( ) . String ( ) ,
"sentAt" : uint32 ( errorSendingEnvelope . SentEnvelope . Envelope . Message ( ) . GetTimestamp ( ) / int64 ( time . Second ) ) ,
"pubsubTopic" : errorSendingEnvelope . SentEnvelope . Envelope . PubsubTopic ( ) ,
"topic" : errorSendingEnvelope . SentEnvelope . Envelope . Message ( ) . ContentTopic ,
"senderKeyUID" : c . keyUID ,
2024-08-01 05:27:43 +02:00
"peerId" : c . peerId ,
2024-06-28 03:24:04 -07:00
"nodeName" : c . nodeName ,
"publishMethod" : errorSendingEnvelope . SentEnvelope . PublishMethod . String ( ) ,
"statusVersion" : c . version ,
"error" : errorSendingEnvelope . Error . Error ( ) ,
}
body , _ := json . Marshal ( postBody )
jsonRawMessage := json . RawMessage ( body )
return & jsonRawMessage
}
2024-07-12 13:37:55 -07:00
func ( c * Client ) ProcessPeerCount ( peerCount PeerCount ) * json . RawMessage {
postBody := map [ string ] interface { } {
"peerCount" : peerCount . PeerCount ,
"nodeName" : c . nodeName ,
"nodeKeyUID" : c . keyUID ,
2024-08-01 05:27:43 +02:00
"peerId" : c . peerId ,
2024-07-12 13:37:55 -07:00
"statusVersion" : c . version ,
"timestamp" : time . Now ( ) . Unix ( ) ,
}
body , _ := json . Marshal ( postBody )
2024-08-05 11:44:57 -07:00
jsonRawMessage := json . RawMessage ( body )
return & jsonRawMessage
}
func ( c * Client ) ProcessPeerConnFailure ( peerConnFailure PeerConnFailure ) * json . RawMessage {
postBody := map [ string ] interface { } {
"failedPeerId" : peerConnFailure . FailedPeerId ,
"failureCount" : peerConnFailure . FailureCount ,
"nodeName" : c . nodeName ,
"nodeKeyUID" : c . keyUID ,
"peerId" : c . peerId ,
"statusVersion" : c . version ,
"timestamp" : time . Now ( ) . Unix ( ) ,
}
body , _ := json . Marshal ( postBody )
2024-07-12 13:37:55 -07:00
jsonRawMessage := json . RawMessage ( body )
return & jsonRawMessage
}
2023-10-30 15:51:57 +01:00
func ( c * Client ) UpdateEnvelopeProcessingError ( shhMessage * types . Message , processingError error ) {
c . logger . Debug ( "Pushing envelope update to telemetry server" , zap . String ( "hash" , types . EncodeHex ( shhMessage . Hash ) ) )
url := fmt . Sprintf ( "%s/update-envelope" , c . serverURL )
var errorString = ""
if processingError != nil {
errorString = processingError . Error ( )
}
postBody := map [ string ] interface { } {
"messageHash" : types . EncodeHex ( shhMessage . Hash ) ,
"sentAt" : shhMessage . Timestamp ,
"pubsubTopic" : shhMessage . PubsubTopic ,
"topic" : shhMessage . Topic ,
"receiverKeyUID" : c . keyUID ,
2024-08-01 05:27:43 +02:00
"peerId" : c . peerId ,
2023-10-30 15:51:57 +01:00
"nodeName" : c . nodeName ,
"processingError" : errorString ,
}
body , _ := json . Marshal ( postBody )
_ , err := c . httpClient . Post ( url , "application/json" , bytes . NewBuffer ( body ) )
if err != nil {
c . logger . Error ( "Error sending envelope update to telemetry server" , zap . Error ( err ) )
}
}