Elad 34f11e752f cmd/swarm/swarm-snapshot: swarm snapshot generator (#18453)
* cmd/swarm/swarm-snapshot: add binary to create network snapshots

* cmd/swarm/swarm-snapshot: refactor and extend tests

* p2p/simulations: remove unused triggerChecks func and fix linter

* internal/cmdtest: raise the timeout for killing TestCmd

* cmd/swarm/swarm-snapshot: add more comments and other minor adjustments

* cmd/swarm/swarm-snapshot: remove redundant check in createSnapshot

* cmd/swarm/swarm-snapshot: change comment wording

* p2p/simulations: revert Simulation.Run from master

https://github.com/ethersphere/go-ethereum/pull/1077/files#r247078904

* cmd/swarm/swarm-snapshot: address pr comments

* swarm/network/simulations/discovery: removed snapshot write to file

* cmd/swarm/swarm-snapshot, swarm/network/simulations: removed redundant connection event check, fixed lint error
2019-01-16 14:33:02 +01:00

533 lines
17 KiB
Go

// Copyright 2018 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
package discovery
import (
"context"
"flag"
"fmt"
"io/ioutil"
"os"
"path"
"strings"
"testing"
"time"
"github.com/ethereum/go-ethereum/common"
"github.com/ethereum/go-ethereum/log"
"github.com/ethereum/go-ethereum/node"
"github.com/ethereum/go-ethereum/p2p"
"github.com/ethereum/go-ethereum/p2p/enode"
"github.com/ethereum/go-ethereum/p2p/simulations"
"github.com/ethereum/go-ethereum/p2p/simulations/adapters"
"github.com/ethereum/go-ethereum/swarm/network"
"github.com/ethereum/go-ethereum/swarm/state"
colorable "github.com/mattn/go-colorable"
)
// serviceName is used with the exec adapter so the exec'd binary knows which
// service to execute
const serviceName = "discovery"
const testNeighbourhoodSize = 2
const discoveryPersistenceDatadir = "discovery_persistence_test_store"
var discoveryPersistencePath = path.Join(os.TempDir(), discoveryPersistenceDatadir)
var discoveryEnabled = true
var persistenceEnabled = false
var services = adapters.Services{
serviceName: newService,
}
func cleanDbStores() error {
entries, err := ioutil.ReadDir(os.TempDir())
if err != nil {
return err
}
for _, f := range entries {
if strings.HasPrefix(f.Name(), discoveryPersistenceDatadir) {
os.RemoveAll(path.Join(os.TempDir(), f.Name()))
}
}
return nil
}
func getDbStore(nodeID string) (*state.DBStore, error) {
if _, err := os.Stat(discoveryPersistencePath + "_" + nodeID); os.IsNotExist(err) {
log.Info(fmt.Sprintf("directory for nodeID %s does not exist. creating...", nodeID))
ioutil.TempDir("", discoveryPersistencePath+"_"+nodeID)
}
log.Info(fmt.Sprintf("opening storage directory for nodeID %s", nodeID))
store, err := state.NewDBStore(discoveryPersistencePath + "_" + nodeID)
if err != nil {
return nil, err
}
return store, nil
}
var (
nodeCount = flag.Int("nodes", 10, "number of nodes to create (default 10)")
initCount = flag.Int("conns", 1, "number of originally connected peers (default 1)")
loglevel = flag.Int("loglevel", 3, "verbosity of logs")
rawlog = flag.Bool("rawlog", false, "remove terminal formatting from logs")
)
func init() {
flag.Parse()
// register the discovery service which will run as a devp2p
// protocol when using the exec adapter
adapters.RegisterServices(services)
log.PrintOrigins(true)
log.Root().SetHandler(log.LvlFilterHandler(log.Lvl(*loglevel), log.StreamHandler(colorable.NewColorableStderr(), log.TerminalFormat(!*rawlog))))
}
// Benchmarks to test the average time it takes for an N-node ring
// to full a healthy kademlia topology
func BenchmarkDiscovery_8_1(b *testing.B) { benchmarkDiscovery(b, 8, 1) }
func BenchmarkDiscovery_16_1(b *testing.B) { benchmarkDiscovery(b, 16, 1) }
func BenchmarkDiscovery_32_1(b *testing.B) { benchmarkDiscovery(b, 32, 1) }
func BenchmarkDiscovery_64_1(b *testing.B) { benchmarkDiscovery(b, 64, 1) }
func BenchmarkDiscovery_128_1(b *testing.B) { benchmarkDiscovery(b, 128, 1) }
func BenchmarkDiscovery_256_1(b *testing.B) { benchmarkDiscovery(b, 256, 1) }
func BenchmarkDiscovery_8_2(b *testing.B) { benchmarkDiscovery(b, 8, 2) }
func BenchmarkDiscovery_16_2(b *testing.B) { benchmarkDiscovery(b, 16, 2) }
func BenchmarkDiscovery_32_2(b *testing.B) { benchmarkDiscovery(b, 32, 2) }
func BenchmarkDiscovery_64_2(b *testing.B) { benchmarkDiscovery(b, 64, 2) }
func BenchmarkDiscovery_128_2(b *testing.B) { benchmarkDiscovery(b, 128, 2) }
func BenchmarkDiscovery_256_2(b *testing.B) { benchmarkDiscovery(b, 256, 2) }
func BenchmarkDiscovery_8_4(b *testing.B) { benchmarkDiscovery(b, 8, 4) }
func BenchmarkDiscovery_16_4(b *testing.B) { benchmarkDiscovery(b, 16, 4) }
func BenchmarkDiscovery_32_4(b *testing.B) { benchmarkDiscovery(b, 32, 4) }
func BenchmarkDiscovery_64_4(b *testing.B) { benchmarkDiscovery(b, 64, 4) }
func BenchmarkDiscovery_128_4(b *testing.B) { benchmarkDiscovery(b, 128, 4) }
func BenchmarkDiscovery_256_4(b *testing.B) { benchmarkDiscovery(b, 256, 4) }
func TestDiscoverySimulationExecAdapter(t *testing.T) {
testDiscoverySimulationExecAdapter(t, *nodeCount, *initCount)
}
func testDiscoverySimulationExecAdapter(t *testing.T, nodes, conns int) {
baseDir, err := ioutil.TempDir("", "swarm-test")
if err != nil {
t.Fatal(err)
}
defer os.RemoveAll(baseDir)
testDiscoverySimulation(t, nodes, conns, adapters.NewExecAdapter(baseDir))
}
func TestDiscoverySimulationSimAdapter(t *testing.T) {
testDiscoverySimulationSimAdapter(t, *nodeCount, *initCount)
}
func TestDiscoveryPersistenceSimulationSimAdapter(t *testing.T) {
testDiscoveryPersistenceSimulationSimAdapter(t, *nodeCount, *initCount)
}
func testDiscoveryPersistenceSimulationSimAdapter(t *testing.T, nodes, conns int) {
testDiscoveryPersistenceSimulation(t, nodes, conns, adapters.NewSimAdapter(services))
}
func testDiscoverySimulationSimAdapter(t *testing.T, nodes, conns int) {
testDiscoverySimulation(t, nodes, conns, adapters.NewSimAdapter(services))
}
func testDiscoverySimulation(t *testing.T, nodes, conns int, adapter adapters.NodeAdapter) {
t.Skip("discovery tests depend on suggestpeer, which is unreliable after kademlia depth change.")
startedAt := time.Now()
result, err := discoverySimulation(nodes, conns, adapter)
if err != nil {
t.Fatalf("Setting up simulation failed: %v", err)
}
if result.Error != nil {
t.Fatalf("Simulation failed: %s", result.Error)
}
t.Logf("Simulation with %d nodes passed in %s", nodes, result.FinishedAt.Sub(result.StartedAt))
var min, max time.Duration
var sum int
for _, pass := range result.Passes {
duration := pass.Sub(result.StartedAt)
if sum == 0 || duration < min {
min = duration
}
if duration > max {
max = duration
}
sum += int(duration.Nanoseconds())
}
t.Logf("Min: %s, Max: %s, Average: %s", min, max, time.Duration(sum/len(result.Passes))*time.Nanosecond)
finishedAt := time.Now()
t.Logf("Setup: %s, shutdown: %s", result.StartedAt.Sub(startedAt), finishedAt.Sub(result.FinishedAt))
}
func testDiscoveryPersistenceSimulation(t *testing.T, nodes, conns int, adapter adapters.NodeAdapter) map[int][]byte {
t.Skip("discovery tests depend on suggestpeer, which is unreliable after kademlia depth change.")
persistenceEnabled = true
discoveryEnabled = true
result, err := discoveryPersistenceSimulation(nodes, conns, adapter)
if err != nil {
t.Fatalf("Setting up simulation failed: %v", err)
}
if result.Error != nil {
t.Fatalf("Simulation failed: %s", result.Error)
}
t.Logf("Simulation with %d nodes passed in %s", nodes, result.FinishedAt.Sub(result.StartedAt))
// set the discovery and persistence flags again to default so other
// tests will not be affected
discoveryEnabled = true
persistenceEnabled = false
return nil
}
func benchmarkDiscovery(b *testing.B, nodes, conns int) {
for i := 0; i < b.N; i++ {
result, err := discoverySimulation(nodes, conns, adapters.NewSimAdapter(services))
if err != nil {
b.Fatalf("setting up simulation failed: %v", err)
}
if result.Error != nil {
b.Logf("simulation failed: %s", result.Error)
}
}
}
func discoverySimulation(nodes, conns int, adapter adapters.NodeAdapter) (*simulations.StepResult, error) {
// create network
net := simulations.NewNetwork(adapter, &simulations.NetworkConfig{
ID: "0",
DefaultService: serviceName,
})
defer net.Shutdown()
trigger := make(chan enode.ID)
ids := make([]enode.ID, nodes)
for i := 0; i < nodes; i++ {
conf := adapters.RandomNodeConfig()
node, err := net.NewNodeWithConfig(conf)
if err != nil {
return nil, fmt.Errorf("error starting node: %s", err)
}
if err := net.Start(node.ID()); err != nil {
return nil, fmt.Errorf("error starting node %s: %s", node.ID().TerminalString(), err)
}
if err := triggerChecks(trigger, net, node.ID()); err != nil {
return nil, fmt.Errorf("error triggering checks for node %s: %s", node.ID().TerminalString(), err)
}
ids[i] = node.ID()
}
// run a simulation which connects the 10 nodes in a ring and waits
// for full peer discovery
var addrs [][]byte
action := func(ctx context.Context) error {
return nil
}
for i := range ids {
// collect the overlay addresses, to
addrs = append(addrs, ids[i].Bytes())
}
err := net.ConnectNodesChain(nil)
if err != nil {
return nil, err
}
log.Debug(fmt.Sprintf("nodes: %v", len(addrs)))
// construct the peer pot, so that kademlia health can be checked
ppmap := network.NewPeerPotMap(network.NewKadParams().NeighbourhoodSize, addrs)
check := func(ctx context.Context, id enode.ID) (bool, error) {
select {
case <-ctx.Done():
return false, ctx.Err()
default:
}
node := net.GetNode(id)
if node == nil {
return false, fmt.Errorf("unknown node: %s", id)
}
client, err := node.Client()
if err != nil {
return false, fmt.Errorf("error getting node client: %s", err)
}
healthy := &network.Health{}
if err := client.Call(&healthy, "hive_healthy", ppmap); err != nil {
return false, fmt.Errorf("error getting node health: %s", err)
}
log.Info(fmt.Sprintf("node %4s healthy: connected nearest neighbours: %v, know nearest neighbours: %v,\n\n%v", id, healthy.ConnectNN, healthy.KnowNN, healthy.Hive))
return healthy.KnowNN && healthy.ConnectNN, nil
}
// 64 nodes ~ 1min
// 128 nodes ~
timeout := 300 * time.Second
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
result := simulations.NewSimulation(net).Run(ctx, &simulations.Step{
Action: action,
Trigger: trigger,
Expect: &simulations.Expectation{
Nodes: ids,
Check: check,
},
})
if result.Error != nil {
return result, nil
}
return result, nil
}
func discoveryPersistenceSimulation(nodes, conns int, adapter adapters.NodeAdapter) (*simulations.StepResult, error) {
cleanDbStores()
defer cleanDbStores()
// create network
net := simulations.NewNetwork(adapter, &simulations.NetworkConfig{
ID: "0",
DefaultService: serviceName,
})
defer net.Shutdown()
trigger := make(chan enode.ID)
ids := make([]enode.ID, nodes)
var addrs [][]byte
for i := 0; i < nodes; i++ {
conf := adapters.RandomNodeConfig()
node, err := net.NewNodeWithConfig(conf)
if err != nil {
panic(err)
}
if err != nil {
return nil, fmt.Errorf("error starting node: %s", err)
}
if err := net.Start(node.ID()); err != nil {
return nil, fmt.Errorf("error starting node %s: %s", node.ID().TerminalString(), err)
}
if err := triggerChecks(trigger, net, node.ID()); err != nil {
return nil, fmt.Errorf("error triggering checks for node %s: %s", node.ID().TerminalString(), err)
}
// TODO we shouldn't be equating underaddr and overaddr like this, as they are not the same in production
ids[i] = node.ID()
a := ids[i].Bytes()
addrs = append(addrs, a)
}
// run a simulation which connects the 10 nodes in a ring and waits
// for full peer discovery
var restartTime time.Time
action := func(ctx context.Context) error {
ticker := time.NewTicker(500 * time.Millisecond)
for range ticker.C {
isHealthy := true
for _, id := range ids {
//call Healthy RPC
node := net.GetNode(id)
if node == nil {
return fmt.Errorf("unknown node: %s", id)
}
client, err := node.Client()
if err != nil {
return fmt.Errorf("error getting node client: %s", err)
}
healthy := &network.Health{}
addr := id.String()
ppmap := network.NewPeerPotMap(network.NewKadParams().NeighbourhoodSize, addrs)
if err := client.Call(&healthy, "hive_healthy", ppmap); err != nil {
return fmt.Errorf("error getting node health: %s", err)
}
log.Info(fmt.Sprintf("NODE: %s, IS HEALTHY: %t", addr, healthy.ConnectNN && healthy.KnowNN && healthy.CountKnowNN > 0))
var nodeStr string
if err := client.Call(&nodeStr, "hive_string"); err != nil {
return fmt.Errorf("error getting node string %s", err)
}
log.Info(nodeStr)
for _, a := range addrs {
log.Info(common.Bytes2Hex(a))
}
if !healthy.ConnectNN || healthy.CountKnowNN == 0 {
isHealthy = false
break
}
}
if isHealthy {
break
}
}
ticker.Stop()
log.Info("reached healthy kademlia. starting to shutdown nodes.")
shutdownStarted := time.Now()
// stop all ids, then start them again
for _, id := range ids {
node := net.GetNode(id)
if err := net.Stop(node.ID()); err != nil {
return fmt.Errorf("error stopping node %s: %s", node.ID().TerminalString(), err)
}
}
log.Info(fmt.Sprintf("shutting down nodes took: %s", time.Since(shutdownStarted)))
persistenceEnabled = true
discoveryEnabled = false
restartTime = time.Now()
for _, id := range ids {
node := net.GetNode(id)
if err := net.Start(node.ID()); err != nil {
return fmt.Errorf("error starting node %s: %s", node.ID().TerminalString(), err)
}
if err := triggerChecks(trigger, net, node.ID()); err != nil {
return fmt.Errorf("error triggering checks for node %s: %s", node.ID().TerminalString(), err)
}
}
log.Info(fmt.Sprintf("restarting nodes took: %s", time.Since(restartTime)))
return nil
}
net.ConnectNodesChain(nil)
log.Debug(fmt.Sprintf("nodes: %v", len(addrs)))
// construct the peer pot, so that kademlia health can be checked
check := func(ctx context.Context, id enode.ID) (bool, error) {
select {
case <-ctx.Done():
return false, ctx.Err()
default:
}
node := net.GetNode(id)
if node == nil {
return false, fmt.Errorf("unknown node: %s", id)
}
client, err := node.Client()
if err != nil {
return false, fmt.Errorf("error getting node client: %s", err)
}
healthy := &network.Health{}
ppmap := network.NewPeerPotMap(network.NewKadParams().NeighbourhoodSize, addrs)
if err := client.Call(&healthy, "hive_healthy", ppmap); err != nil {
return false, fmt.Errorf("error getting node health: %s", err)
}
log.Info(fmt.Sprintf("node %4s healthy: got nearest neighbours: %v, know nearest neighbours: %v", id, healthy.ConnectNN, healthy.KnowNN))
return healthy.KnowNN && healthy.ConnectNN, nil
}
// 64 nodes ~ 1min
// 128 nodes ~
timeout := 300 * time.Second
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
result := simulations.NewSimulation(net).Run(ctx, &simulations.Step{
Action: action,
Trigger: trigger,
Expect: &simulations.Expectation{
Nodes: ids,
Check: check,
},
})
if result.Error != nil {
return result, nil
}
return result, nil
}
// triggerChecks triggers a simulation step check whenever a peer is added or
// removed from the given node, and also every second to avoid a race between
// peer events and kademlia becoming healthy
func triggerChecks(trigger chan enode.ID, net *simulations.Network, id enode.ID) error {
node := net.GetNode(id)
if node == nil {
return fmt.Errorf("unknown node: %s", id)
}
client, err := node.Client()
if err != nil {
return err
}
events := make(chan *p2p.PeerEvent)
sub, err := client.Subscribe(context.Background(), "admin", events, "peerEvents")
if err != nil {
return fmt.Errorf("error getting peer events for node %v: %s", id, err)
}
go func() {
defer sub.Unsubscribe()
tick := time.NewTicker(time.Second)
defer tick.Stop()
for {
select {
case <-events:
trigger <- id
case <-tick.C:
trigger <- id
case err := <-sub.Err():
if err != nil {
log.Error(fmt.Sprintf("error getting peer events for node %v", id), "err", err)
}
return
}
}
}()
return nil
}
func newService(ctx *adapters.ServiceContext) (node.Service, error) {
addr := network.NewAddr(ctx.Config.Node())
kp := network.NewKadParams()
kp.NeighbourhoodSize = testNeighbourhoodSize
if ctx.Config.Reachable != nil {
kp.Reachable = func(o *network.BzzAddr) bool {
return ctx.Config.Reachable(o.ID())
}
}
kad := network.NewKademlia(addr.Over(), kp)
hp := network.NewHiveParams()
hp.KeepAliveInterval = time.Duration(200) * time.Millisecond
hp.Discovery = discoveryEnabled
log.Info(fmt.Sprintf("discovery for nodeID %s is %t", ctx.Config.ID.String(), hp.Discovery))
config := &network.BzzConfig{
OverlayAddr: addr.Over(),
UnderlayAddr: addr.Under(),
HiveParams: hp,
}
if persistenceEnabled {
log.Info(fmt.Sprintf("persistence enabled for nodeID %s", ctx.Config.ID.String()))
store, err := getDbStore(ctx.Config.ID.String())
if err != nil {
return nil, err
}
return network.NewBzz(config, kad, store, nil, nil), nil
}
return network.NewBzz(config, kad, nil, nil, nil), nil
}