nwaku/apps/networkmonitor/networkmonitor.nim

627 lines
21 KiB
Nim
Raw Normal View History

when (NimMajor, NimMinor) < (1, 4):
{.push raises: [Defect].}
else:
{.push raises: [].}
import
std/[tables, strutils, times, sequtils, random],
stew/results,
stew/shims/net,
chronicles,
chronicles/topics_registry,
chronos,
chronos/timer as ctime,
confutils,
eth/keys,
eth/p2p/discoveryv5/enr,
libp2p/crypto/crypto,
libp2p/nameresolving/dnsresolver,
libp2p/protocols/ping,
metrics,
metrics/chronos_httpserver,
presto/[route, server, client]
import
../../waku/waku_core,
../../waku/node/peer_manager,
../../waku/waku_node,
../../waku/waku_enr,
../../waku/discovery/waku_discv5,
../../waku/discovery/waku_dnsdisc,
../../waku/waku_relay,
../../waku/waku_rln_relay,
../../waku/factory/builder,
../wakunode2/networks_config,
./networkmonitor_metrics,
./networkmonitor_config,
./networkmonitor_utils
logScope:
topics = "networkmonitor"
const ReconnectTime = 60
const MaxConnectionRetries = 5
const ResetRetriesAfter = 1200
const AvgPingWindow = 10.0
const MaxConnectedPeers = 150
const git_version* {.strdefine.} = "n/a"
proc setDiscoveredPeersCapabilities(routingTableNodes: seq[Node]) =
for capability in @[Relay, Store, Filter, Lightpush]:
let nOfNodesWithCapability =
routingTableNodes.countIt(it.record.supportsCapability(capability))
info "capabilities as per ENR waku flag",
capability = capability, amount = nOfNodesWithCapability
networkmonitor_peer_type_as_per_enr.set(
int64(nOfNodesWithCapability), labelValues = [$capability]
)
proc analyzePeer(
customPeerInfo: CustomPeerInfoRef,
peerInfo: RemotePeerInfo,
node: WakuNode,
timeout: chronos.Duration,
): Future[Result[string, string]] {.async.} =
var pingDelay: chronos.Duration
proc ping(): Future[Result[void, string]] {.async, gcsafe.} =
try:
let conn = await node.switch.dial(peerInfo.peerId, peerInfo.addrs, PingCodec)
pingDelay = await node.libp2pPing.ping(conn)
return ok()
except CatchableError:
var msg = getCurrentExceptionMsg()
if msg == "Future operation cancelled!":
msg = "timedout"
warn "failed to ping the peer", peer = peerInfo, err = msg
customPeerInfo.connError = msg
return err("could not ping peer: " & msg)
let timedOut = not await ping().withTimeout(timeout)
# need this check for pingDelat == 0 because there may be a conn error before timeout
if timedOut or pingDelay == 0.millis:
customPeerInfo.retries += 1
return err(customPeerInfo.connError)
customPeerInfo.connError = ""
info "successfully pinged peer", peer = peerInfo, duration = pingDelay.millis
networkmonitor_peer_ping.observe(pingDelay.millis)
if customPeerInfo.avgPingDuration == 0.millis:
customPeerInfo.avgPingDuration = pingDelay
# TODO: check why the calculation ends up losing precision
customPeerInfo.avgPingDuration = int64(
(
float64(customPeerInfo.avgPingDuration.millis) * (AvgPingWindow - 1.0) +
float64(pingDelay.millis)
) / AvgPingWindow
).millis
customPeerInfo.lastPingDuration = pingDelay
return ok(customPeerInfo.peerId)
proc shouldReconnect(customPeerInfo: CustomPeerInfoRef): bool =
let reconnetIntervalCheck =
getTime().toUnix() >= customPeerInfo.lastTimeConnected + ReconnectTime
var retriesCheck = customPeerInfo.retries < MaxConnectionRetries
if not retriesCheck and
getTime().toUnix() >= customPeerInfo.lastTimeConnected + ResetRetriesAfter:
customPeerInfo.retries = 0
retriesCheck = true
info "resetting retries counter", peerId = customPeerInfo.peerId
return reconnetIntervalCheck and retriesCheck
# TODO: Split in discover, connect
proc setConnectedPeersMetrics(
discoveredNodes: seq[Node],
node: WakuNode,
timeout: chronos.Duration,
restClient: RestClientRef,
allPeers: CustomPeersTableRef,
) {.async.} =
let currentTime = getTime().toUnix()
var newPeers = 0
var successfulConnections = 0
var analyzeFuts: seq[Future[Result[string, string]]]
var (inConns, outConns) = node.peer_manager.connectedPeers(WakuRelayCodec)
info "connected peers", inConns = inConns.len, outConns = outConns.len
shuffle(outConns)
if outConns.len >= toInt(MaxConnectedPeers / 2):
for p in outConns[0 ..< toInt(outConns.len / 2)]:
trace "Pruning Peer", Peer = $p
asyncSpawn(node.switch.disconnect(p))
# iterate all newly discovered nodes
for discNode in discoveredNodes:
let typedRecord = discNode.record.toTypedRecord()
if not typedRecord.isOk():
warn "could not convert record to typed record", record = discNode.record
continue
let secp256k1 = typedRecord.get().secp256k1
if not secp256k1.isSome():
warn "could not get secp256k1 key", typedRecord = typedRecord.get()
continue
let peerRes = toRemotePeerInfo(discNode.record)
let peerInfo = peerRes.valueOr:
warn "error converting record to remote peer info", record = discNode.record
continue
# create new entry if new peerId found
let peerId = $peerInfo.peerId
if not allPeers.hasKey(peerId):
allPeers[peerId] = CustomPeerInfoRef(peerId: peerId)
newPeers += 1
else:
info "already seen", peerId = peerId
let customPeerInfo = allPeers[peerId]
customPeerInfo.lastTimeDiscovered = currentTime
customPeerInfo.enr = discNode.record.toURI()
customPeerInfo.enrCapabilities = discNode.record.getCapabilities().mapIt($it)
customPeerInfo.discovered += 1
if not typedRecord.get().ip.isSome():
warn "ip field is not set", record = typedRecord.get()
continue
let ip = $typedRecord.get().ip.get().join(".")
customPeerInfo.ip = ip
# try to ping the peer
if shouldReconnect(customPeerInfo):
if customPeerInfo.retries > 0:
warn "trying to dial failed peer again",
peerId = peerId, retry = customPeerInfo.retries
analyzeFuts.add(analyzePeer(customPeerInfo, peerInfo, node, timeout))
# Wait for all connection attempts to finish
let analyzedPeers = await allFinished(analyzeFuts)
for peerIdFut in analyzedPeers:
let peerIdRes = await peerIdFut
let peerIdStr = peerIdRes.valueOr:
continue
successfulConnections += 1
let peerId = PeerId.init(peerIdStr).valueOr:
warn "failed to parse peerId", peerId = peerIdStr
continue
var customPeerInfo = allPeers[peerIdStr]
debug "connected to peer", peer = customPeerInfo[]
# after connection, get supported protocols
let lp2pPeerStore = node.switch.peerStore
let nodeProtocols = lp2pPeerStore[ProtoBook][peerId]
customPeerInfo.supportedProtocols = nodeProtocols
customPeerInfo.lastTimeConnected = currentTime
# after connection, get user-agent
let nodeUserAgent = lp2pPeerStore[AgentBook][peerId]
customPeerInfo.userAgent = nodeUserAgent
info "number of newly discovered peers", amount = newPeers
# inform the total connections that we did in this round
info "number of successful connections", amount = successfulConnections
proc updateMetrics(allPeersRef: CustomPeersTableRef) {.gcsafe.} =
var allProtocols: Table[string, int]
var allAgentStrings: Table[string, int]
var countries: Table[string, int]
var connectedPeers = 0
var failedPeers = 0
for peerInfo in allPeersRef.values:
if peerInfo.connError == "":
for protocol in peerInfo.supportedProtocols:
allProtocols[protocol] = allProtocols.mgetOrPut(protocol, 0) + 1
# store available user-agents in the network
allAgentStrings[peerInfo.userAgent] =
allAgentStrings.mgetOrPut(peerInfo.userAgent, 0) + 1
if peerInfo.country != "":
countries[peerInfo.country] = countries.mgetOrPut(peerInfo.country, 0) + 1
connectedPeers += 1
else:
failedPeers += 1
networkmonitor_peer_count.set(int64(connectedPeers), labelValues = ["true"])
networkmonitor_peer_count.set(int64(failedPeers), labelValues = ["false"])
# update count on each protocol
for protocol in allProtocols.keys():
let countOfProtocols = allProtocols.mgetOrPut(protocol, 0)
networkmonitor_peer_type_as_per_protocol.set(
int64(countOfProtocols), labelValues = [protocol]
)
info "supported protocols in the network",
protocol = protocol, count = countOfProtocols
# update count on each user-agent
for userAgent in allAgentStrings.keys():
let countOfUserAgent = allAgentStrings.mgetOrPut(userAgent, 0)
networkmonitor_peer_user_agents.set(
int64(countOfUserAgent), labelValues = [userAgent]
)
info "user agents participating in the network",
userAgent = userAgent, count = countOfUserAgent
for country in countries.keys():
let peerCount = countries.mgetOrPut(country, 0)
networkmonitor_peer_country_count.set(int64(peerCount), labelValues = [country])
info "number of peers per country", country = country, count = peerCount
proc populateInfoFromIp(
allPeersRef: CustomPeersTableRef, restClient: RestClientRef
) {.async.} =
for peer in allPeersRef.keys():
if allPeersRef[peer].country != "" and allPeersRef[peer].city != "":
continue
# TODO: Update also if last update > x
if allPeersRef[peer].ip == "":
continue
# get more info the peers from its ip address
var location: NodeLocation
try:
# IP-API endpoints are now limited to 45 HTTP requests per minute
await sleepAsync(1400.millis)
let response = await restClient.ipToLocation(allPeersRef[peer].ip)
location = response.data
except CatchableError:
warn "could not get location", ip = allPeersRef[peer].ip
continue
allPeersRef[peer].country = location.country
allPeersRef[peer].city = location.city
# TODO: Split in discovery, connections, and ip2location
# crawls the network discovering peers and trying to connect to them
# metrics are processed and exposed
proc crawlNetwork(
node: WakuNode,
wakuDiscv5: WakuDiscoveryV5,
restClient: RestClientRef,
conf: NetworkMonitorConf,
allPeersRef: CustomPeersTableRef,
) {.async.} =
let crawlInterval = conf.refreshInterval * 1000
while true:
let startTime = Moment.now()
# discover new random nodes
let discoveredNodes = await wakuDiscv5.protocol.queryRandom()
# nodes are nested into bucket, flat it
let flatNodes = wakuDiscv5.protocol.routingTable.buckets.mapIt(it.nodes).flatten()
# populate metrics related to capabilities as advertised by the ENR (see waku field)
setDiscoveredPeersCapabilities(flatNodes)
# tries to connect to all newly discovered nodes
# and populates metrics related to peers we could connect
# note random discovered nodes can be already known
await setConnectedPeersMetrics(
discoveredNodes, node, conf.timeout, restClient, allPeersRef
)
updateMetrics(allPeersRef)
# populate info from ip addresses
await populateInfoFromIp(allPeersRef, restClient)
let totalNodes = flatNodes.len
let seenNodes = flatNodes.countIt(it.seen)
info "discovered nodes: ", total = totalNodes, seen = seenNodes
# Notes:
# we dont run ipMajorityLoop
# we dont run revalidateLoop
let endTime = Moment.now()
let elapsed = (endTime - startTime).nanos
info "crawl duration", time = elapsed.millis
await sleepAsync(crawlInterval.millis - elapsed.millis)
proc retrieveDynamicBootstrapNodes(
dnsDiscovery: bool, dnsDiscoveryUrl: string, dnsDiscoveryNameServers: seq[IpAddress]
): Result[seq[RemotePeerInfo], string] =
if dnsDiscovery and dnsDiscoveryUrl != "":
# DNS discovery
debug "Discovering nodes using Waku DNS discovery", url = dnsDiscoveryUrl
var nameServers: seq[TransportAddress]
for ip in dnsDiscoveryNameServers:
nameServers.add(initTAddress(ip, Port(53))) # Assume all servers use port 53
let dnsResolver = DnsResolver.new(nameServers)
proc resolver(domain: string): Future[string] {.async, gcsafe.} =
trace "resolving", domain = domain
let resolved = await dnsResolver.resolveTxt(domain)
return resolved[0] # Use only first answer
var wakuDnsDiscovery = WakuDnsDiscovery.init(dnsDiscoveryUrl, resolver)
if wakuDnsDiscovery.isOk():
return wakuDnsDiscovery.get().findPeers().mapErr(
proc(e: cstring): string =
$e
)
else:
warn "Failed to init Waku DNS discovery"
debug "No method for retrieving dynamic bootstrap nodes specified."
ok(newSeq[RemotePeerInfo]()) # Return an empty seq by default
proc getBootstrapFromDiscDns(
conf: NetworkMonitorConf
): Result[seq[enr.Record], string] =
try:
let dnsNameServers = @[parseIpAddress("1.1.1.1"), parseIpAddress("1.0.0.1")]
let dynamicBootstrapNodesRes =
retrieveDynamicBootstrapNodes(true, conf.dnsDiscoveryUrl, dnsNameServers)
if not dynamicBootstrapNodesRes.isOk():
error("failed discovering peers from DNS")
let dynamicBootstrapNodes = dynamicBootstrapNodesRes.get()
# select dynamic bootstrap nodes that have an ENR containing a udp port.
# Discv5 only supports UDP https://github.com/ethereum/devp2p/blob/master/discv5/discv5-theory.md)
var discv5BootstrapEnrs: seq[enr.Record]
for n in dynamicBootstrapNodes:
if n.enr.isSome():
let
enr = n.enr.get()
tenrRes = enr.toTypedRecord()
if tenrRes.isOk() and (
tenrRes.get().udp.isSome() or tenrRes.get().udp6.isSome()
):
discv5BootstrapEnrs.add(enr)
return ok(discv5BootstrapEnrs)
except CatchableError:
error("failed discovering peers from DNS")
proc initAndStartApp(
conf: NetworkMonitorConf
): Result[(WakuNode, WakuDiscoveryV5), string] =
let bindIp =
try:
parseIpAddress("0.0.0.0")
except CatchableError:
return err("could not start node: " & getCurrentExceptionMsg())
let extIp =
try:
parseIpAddress("127.0.0.1")
except CatchableError:
return err("could not start node: " & getCurrentExceptionMsg())
let
# some hardcoded parameters
rng = keys.newRng()
key = crypto.PrivateKey.random(Secp256k1, rng[])[]
nodeTcpPort = Port(60000)
nodeUdpPort = Port(9000)
flags = CapabilitiesBitfield.init(
lightpush = false, filter = false, store = false, relay = true
)
var builder = EnrBuilder.init(key)
builder.withIpAddressAndPorts(
ipAddr = some(extIp), tcpPort = some(nodeTcpPort), udpPort = some(nodeUdpPort)
)
builder.withWakuCapabilities(flags)
let addShardedTopics = builder.withShardedTopics(conf.pubsubTopics)
if addShardedTopics.isErr():
error "failed to add sharded topics to ENR", error = addShardedTopics.error
return err($addShardedTopics.error)
let recordRes = builder.build()
let record =
if recordRes.isErr():
return err("cannot build record: " & $recordRes.error)
else:
recordRes.get()
var nodeBuilder = WakuNodeBuilder.init()
nodeBuilder.withNodeKey(key)
nodeBuilder.withRecord(record)
nodeBUilder.withSwitchConfiguration(maxConnections = some(MaxConnectedPeers))
nodeBuilder.withPeerManagerConfig(maxRelayPeers = some(20), shardAware = true)
let res = nodeBuilder.withNetworkConfigurationDetails(bindIp, nodeTcpPort)
if res.isErr():
return err("node building error" & $res.error)
let nodeRes = nodeBuilder.build()
let node =
if nodeRes.isErr():
return err("node building error" & $res.error)
else:
nodeRes.get()
var discv5BootstrapEnrsRes = getBootstrapFromDiscDns(conf)
if discv5BootstrapEnrsRes.isErr():
error("failed discovering peers from DNS")
var discv5BootstrapEnrs = discv5BootstrapEnrsRes.get()
# parse enrURIs from the configuration and add the resulting ENRs to the discv5BootstrapEnrs seq
for enrUri in conf.bootstrapNodes:
addBootstrapNode(enrUri, discv5BootstrapEnrs)
# discv5
let discv5Conf = WakuDiscoveryV5Config(
discv5Config: none(DiscoveryConfig),
address: bindIp,
port: nodeUdpPort,
privateKey: keys.PrivateKey(key.skkey),
bootstrapRecords: discv5BootstrapEnrs,
autoupdateRecord: false,
)
let wakuDiscv5 = WakuDiscoveryV5.new(node.rng, discv5Conf, some(record))
try:
wakuDiscv5.protocol.open()
except CatchableError:
return err("could not start node: " & getCurrentExceptionMsg())
ok((node, wakuDiscv5))
proc startRestApiServer(
conf: NetworkMonitorConf,
allPeersInfo: CustomPeersTableRef,
numMessagesPerContentTopic: ContentTopicMessageTableRef,
): Result[void, string] =
try:
let serverAddress =
initTAddress(conf.metricsRestAddress & ":" & $conf.metricsRestPort)
proc validate(pattern: string, value: string): int =
if pattern.startsWith("{") and pattern.endsWith("}"): 0 else: 1
var router = RestRouter.init(validate)
router.installHandler(allPeersInfo, numMessagesPerContentTopic)
var sres = RestServerRef.new(router, serverAddress)
let restServer = sres.get()
restServer.start()
except CatchableError:
error("could not start rest api server")
ok()
# handles rx of messages over a topic (see subscribe)
# counts the number of messages per content topic
proc subscribeAndHandleMessages(
node: WakuNode,
pubsubTopic: PubsubTopic,
msgPerContentTopic: ContentTopicMessageTableRef,
) =
# handle function
proc handler(
pubsubTopic: PubsubTopic, msg: WakuMessage
): Future[void] {.async, gcsafe.} =
trace "rx message", pubsubTopic = pubsubTopic, contentTopic = msg.contentTopic
# If we reach a table limit size, remove c topics with the least messages.
let tableSize = 100
if msgPerContentTopic.len > (tableSize - 1):
let minIndex = toSeq(msgPerContentTopic.values()).minIndex()
msgPerContentTopic.del(toSeq(msgPerContentTopic.keys())[minIndex])
# TODO: Will overflow at some point
# +1 if content topic existed, init to 1 otherwise
if msgPerContentTopic.hasKey(msg.contentTopic):
msgPerContentTopic[msg.contentTopic] += 1
else:
msgPerContentTopic[msg.contentTopic] = 1
node.subscribe((kind: PubsubSub, topic: pubsubTopic), some(WakuRelayHandler(handler)))
when isMainModule:
# known issue: confutils.nim(775, 17) Error: can raise an unlisted exception: ref IOError
{.pop.}
let confRes = NetworkMonitorConf.loadConfig()
if confRes.isErr():
error "could not load cli variables", err = confRes.error
quit(1)
var conf = confRes.get()
info "cli flags", conf = conf
if conf.clusterId == 1:
let twnClusterConf = ClusterConf.TheWakuNetworkConf()
conf.bootstrapNodes = twnClusterConf.discv5BootstrapNodes
conf.pubsubTopics = twnClusterConf.pubsubTopics
conf.rlnRelayDynamic = twnClusterConf.rlnRelayDynamic
conf.rlnRelayEthContractAddress = twnClusterConf.rlnRelayEthContractAddress
conf.rlnEpochSizeSec = twnClusterConf.rlnEpochSizeSec
conf.rlnRelayUserMessageLimit = twnClusterConf.rlnRelayUserMessageLimit
if conf.logLevel != LogLevel.NONE:
setLogLevel(conf.logLevel)
# list of peers that we have discovered/connected
var allPeersInfo = CustomPeersTableRef()
# content topic and the number of messages that were received
var msgPerContentTopic = ContentTopicMessageTableRef()
# start metrics server
if conf.metricsServer:
let res =
startMetricsServer(conf.metricsServerAddress, Port(conf.metricsServerPort))
if res.isErr():
error "could not start metrics server", err = res.error
quit(1)
# start rest server for custom metrics
let res = startRestApiServer(conf, allPeersInfo, msgPerContentTopic)
if res.isErr():
error "could not start rest api server", err = res.error
quit(1)
# create a rest client
let clientRest =
RestClientRef.new(url = "http://ip-api.com", connectTimeout = ctime.seconds(2))
if clientRest.isErr():
error "could not start rest api client", err = res.error
quit(1)
let restClient = clientRest.get()
# start waku node
let nodeRes = initAndStartApp(conf)
if nodeRes.isErr():
error "could not start node"
quit 1
let (node, discv5) = nodeRes.get()
waitFor node.mountRelay()
waitFor node.mountLibp2pPing()
if conf.rlnRelayEthContractAddress != "":
let rlnConf = WakuRlnConfig(
rlnRelayDynamic: conf.rlnRelayDynamic,
rlnRelayCredIndex: some(uint(0)),
rlnRelayEthContractAddress: conf.rlnRelayEthContractAddress,
rlnRelayEthClientAddress: string(conf.rlnRelayethClientAddress),
rlnRelayCredPath: "",
rlnRelayCredPassword: "",
rlnRelayTreePath: conf.rlnRelayTreePath,
rlnEpochSizeSec: conf.rlnEpochSizeSec,
)
try:
waitFor node.mountRlnRelay(rlnConf)
except CatchableError:
error "failed to setup RLN", err = getCurrentExceptionMsg()
quit 1
node.mountMetadata(conf.clusterId).isOkOr:
error "failed to mount waku metadata protocol: ", err = error
quit 1
for pubsubTopic in conf.pubsubTopics:
# Subscribe the node to the default pubsubtopic, to count messages
subscribeAndHandleMessages(node, pubsubTopic, msgPerContentTopic)
# spawn the routine that crawls the network
# TODO: split into 3 routines (discovery, connections, ip2location)
asyncSpawn crawlNetwork(node, discv5, restClient, conf, allPeersInfo)
runForever()