From 62d9d638582fa5fd7bcfaa55e8810fcfb898a4dd Mon Sep 17 00:00:00 2001 From: lash Date: Thu, 28 Feb 2019 08:12:50 +0100 Subject: [PATCH] swarm/network: WIP consider all nodes for healthy iteration (#19155) * swarm/network: WIP consider all nodes for healthy iteration * swarm/network/simulation: extend TestWaitTillHealthy to really check kads are healthy * cmd/swarm/swarm-snapshot: fixed bugs in snapshot creation binary * swarm/network/simulation: addressed PR comments * swarm/network/simulation: defer sim.Close() * swarm/network/simulation: fixed wrong sim.Close() * swarm/network/simulation: addressed PR comments * cmd/swarm/swarm-snapshot: reducing default to 8 nodes, more to 4 * cmd/swarm/swarm-snapshot: extended timeout to 3 mins, or 256 nodes snapshot times out * swarm/network/simulation: More PR comments --- cmd/swarm/swarm-snapshot/create.go | 11 +- cmd/swarm/swarm-snapshot/create_test.go | 4 +- cmd/swarm/swarm-snapshot/main.go | 2 +- swarm/network/simulation/kademlia.go | 5 +- swarm/network/simulation/kademlia_test.go | 120 ++++++++++++++++++---- 5 files changed, 113 insertions(+), 29 deletions(-) diff --git a/cmd/swarm/swarm-snapshot/create.go b/cmd/swarm/swarm-snapshot/create.go index 127fde8ae..434561a49 100644 --- a/cmd/swarm/swarm-snapshot/create.go +++ b/cmd/swarm/swarm-snapshot/create.go @@ -59,13 +59,16 @@ func createSnapshot(filename string, nodes int, services []string) (err error) { log.Debug("create snapshot", "filename", filename, "nodes", nodes, "services", services) sim := simulation.New(map[string]simulation.ServiceFunc{ - "bzz": func(ctx *adapters.ServiceContext, b *sync.Map) (node.Service, func(), error) { + "bzz": func(ctx *adapters.ServiceContext, bucket *sync.Map) (node.Service, func(), error) { addr := network.NewAddr(ctx.Config.Node()) kad := network.NewKademlia(addr.Over(), network.NewKadParams()) hp := network.NewHiveParams() hp.KeepAliveInterval = time.Duration(200) * time.Millisecond hp.Discovery = true // discovery must be enabled 
when creating a snapshot + // store the kademlia in the bucket, needed later in the WaitTillHealthy function + bucket.Store(simulation.BucketKeyKademlia, kad) + config := &network.BzzConfig{ OverlayAddr: addr.Over(), UnderlayAddr: addr.Under(), @@ -76,17 +79,17 @@ func createSnapshot(filename string, nodes int, services []string) (err error) { }) defer sim.Close() - _, err = sim.AddNodes(nodes) + ids, err := sim.AddNodes(nodes) if err != nil { return fmt.Errorf("add nodes: %v", err) } - err = sim.Net.ConnectNodesRing(nil) + err = sim.Net.ConnectNodesRing(ids) if err != nil { return fmt.Errorf("connect nodes: %v", err) } - ctx, cancelSimRun := context.WithTimeout(context.Background(), 2*time.Minute) + ctx, cancelSimRun := context.WithTimeout(context.Background(), 3*time.Minute) defer cancelSimRun() if _, err := sim.WaitTillHealthy(ctx); err != nil { return fmt.Errorf("wait for healthy kademlia: %v", err) diff --git a/cmd/swarm/swarm-snapshot/create_test.go b/cmd/swarm/swarm-snapshot/create_test.go index c9445168d..b2e30c201 100644 --- a/cmd/swarm/swarm-snapshot/create_test.go +++ b/cmd/swarm/swarm-snapshot/create_test.go @@ -48,7 +48,7 @@ func TestSnapshotCreate(t *testing.T) { }, { name: "more nodes", - nodes: defaultNodes + 5, + nodes: defaultNodes + 4, }, { name: "services", @@ -81,7 +81,7 @@ func TestSnapshotCreate(t *testing.T) { } testCmd := runSnapshot(t, append(args, file.Name())...) 
- testCmd.ExpectExit() + testCmd.WaitExit() if code := testCmd.ExitStatus(); code != 0 { t.Fatalf("command exit code %v, expected 0", code) } diff --git a/cmd/swarm/swarm-snapshot/main.go b/cmd/swarm/swarm-snapshot/main.go index 184727e4d..136295e51 100644 --- a/cmd/swarm/swarm-snapshot/main.go +++ b/cmd/swarm/swarm-snapshot/main.go @@ -27,7 +27,7 @@ import ( var gitCommit string // Git SHA1 commit hash of the release (set via linker flags) // default value for "create" command --nodes flag -const defaultNodes = 10 +const defaultNodes = 8 func main() { err := newApp().Run(os.Args) diff --git a/swarm/network/simulation/kademlia.go b/swarm/network/simulation/kademlia.go index c58d402b0..a3419c03f 100644 --- a/swarm/network/simulation/kademlia.go +++ b/swarm/network/simulation/kademlia.go @@ -58,7 +58,7 @@ func (s *Simulation) WaitTillHealthy(ctx context.Context) (ill map[enode.ID]*net for k := range ill { delete(ill, k) } - log.Debug("kademlia health check", "addr count", len(addrs)) + log.Debug("kademlia health check", "addr count", len(addrs), "kad len", len(kademlias)) for id, k := range kademlias { //PeerPot for this node addr := common.Bytes2Hex(k.BaseAddr()) @@ -70,7 +70,7 @@ func (s *Simulation) WaitTillHealthy(ctx context.Context) (ill map[enode.ID]*net log.Debug("kademlia", "connectNN", h.ConnectNN, "knowNN", h.KnowNN) log.Debug("kademlia", "health", h.ConnectNN && h.KnowNN, "addr", hex.EncodeToString(k.BaseAddr()), "node", id) log.Debug("kademlia", "ill condition", !h.ConnectNN, "addr", hex.EncodeToString(k.BaseAddr()), "node", id) - if !h.ConnectNN { + if !h.Healthy() { ill[id] = k } } @@ -85,6 +85,7 @@ func (s *Simulation) WaitTillHealthy(ctx context.Context) (ill map[enode.ID]*net // in simulation bucket. 
func (s *Simulation) kademlias() (ks map[enode.ID]*network.Kademlia) { items := s.UpNodesItems(BucketKeyKademlia) + log.Debug("kademlia len items", "len", len(items)) ks = make(map[enode.ID]*network.Kademlia, len(items)) for id, v := range items { k, ok := v.(*network.Kademlia) diff --git a/swarm/network/simulation/kademlia_test.go b/swarm/network/simulation/kademlia_test.go index bbc93ee8c..4cfcecd8e 100644 --- a/swarm/network/simulation/kademlia_test.go +++ b/swarm/network/simulation/kademlia_test.go @@ -22,16 +22,115 @@ import ( "testing" "time" + "github.com/ethereum/go-ethereum/common" + "github.com/ethereum/go-ethereum/log" "github.com/ethereum/go-ethereum/node" "github.com/ethereum/go-ethereum/p2p/simulations/adapters" "github.com/ethereum/go-ethereum/swarm/network" ) +/* + TestWaitTillHealthy tests that we indeed get a healthy network after we wait for it. + For this to be tested, a bit of a snake tail bite needs to happen: + * First we create a first simulation + * Run it as nodes connected in a ring + * Wait until the network is healthy + * Then we create a snapshot + * With this snapshot we create a new simulation + * This simulation is expected to have a healthy configuration, as it uses the snapshot + * Thus we just iterate all nodes and check that their kademlias are healthy + * If all kademlias are healthy, the test succeeded, otherwise it failed +*/ func TestWaitTillHealthy(t *testing.T) { - sim := New(map[string]ServiceFunc{ + + testNodesNum := 10 + + // create the first simulation + sim := New(createSimServiceMap(true)) + + // connect and... 
+ nodeIDs, err := sim.AddNodesAndConnectRing(testNodesNum) + if err != nil { + t.Fatal(err) + } + + // array of all overlay addresses + var addrs [][]byte + // iterate once to be able to build the peer map + for _, node := range nodeIDs { + //get the kademlia overlay address from this ID + a := node.Bytes() + //append it to the array of all overlay addresses + addrs = append(addrs, a) + } + // build a PeerPot only once + pp := network.NewPeerPotMap(network.NewKadParams().NeighbourhoodSize, addrs) + + ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second) + defer cancel() + + // ...wait until healthy + ill, err := sim.WaitTillHealthy(ctx) + if err != nil { + for id, kad := range ill { + t.Log("Node", id) + t.Log(kad.String()) + } + t.Fatal(err) + } + + // now create a snapshot of this network + snap, err := sim.Net.Snapshot() + if err != nil { + t.Fatal(err) + } + + // close the initial simulation + sim.Close() + // create a control simulation + controlSim := New(createSimServiceMap(false)) + defer controlSim.Close() + + // load the snapshot into this control simulation + err = controlSim.Net.Load(snap) + if err != nil { + t.Fatal(err) + } + _, err = controlSim.WaitTillHealthy(ctx) + if err != nil { + t.Fatal(err) + } + + for _, node := range nodeIDs { + // ...get its kademlia + item, ok := controlSim.NodeItem(node, BucketKeyKademlia) + if !ok { + t.Fatal("No kademlia bucket item") + } + kad := item.(*network.Kademlia) + // get its base address + kid := common.Bytes2Hex(kad.BaseAddr()) + + //get the health info + info := kad.GetHealthInfo(pp[kid]) + log.Trace("Health info", "info", info) + // check that it is healthy + healthy := info.Healthy() + if !healthy { + t.Fatalf("Expected node %v of control simulation to be healthy, but it is not, unhealthy kademlias: %v", node, kad.String()) + } + } +} + +// createSimServiceMap returns the services map +// this function will create the sim services with or without discovery enabled +// based on the flag 
passed +func createSimServiceMap(discovery bool) map[string]ServiceFunc { + return map[string]ServiceFunc{ "bzz": func(ctx *adapters.ServiceContext, b *sync.Map) (node.Service, func(), error) { addr := network.NewAddr(ctx.Config.Node()) hp := network.NewHiveParams() + hp.Discovery = discovery config := &network.BzzConfig{ OverlayAddr: addr.Over(), UnderlayAddr: addr.Under(), @@ -43,24 +142,5 @@ func TestWaitTillHealthy(t *testing.T) { b.Store(BucketKeyKademlia, kad) return network.NewBzz(config, kad, nil, nil, nil), nil, nil }, - }) - defer sim.Close() - - _, err := sim.AddNodesAndConnectRing(10) - if err != nil { - t.Fatal(err) - } - - ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second) - defer cancel() - ill, err := sim.WaitTillHealthy(ctx) - if err != nil { - for id, kad := range ill { - t.Log("Node", id) - t.Log(kad.String()) - } - if err != nil { - t.Fatal(err) - } } }