diff --git a/eth/downloader/downloader.go b/eth/downloader/downloader.go index 5fa18a2e3..c272d05af 100644 --- a/eth/downloader/downloader.go +++ b/eth/downloader/downloader.go @@ -45,16 +45,17 @@ var ( MaxReceiptFetch = 256 // Amount of transaction receipts to allow fetching per request MaxStateFetch = 384 // Amount of node state values to allow fetching per request - hashTTL = 5 * time.Second // [eth/61] Time it takes for a hash request to time out - blockSoftTTL = 3 * time.Second // [eth/61] Request completion threshold for increasing or decreasing a peer's bandwidth - blockHardTTL = 3 * blockSoftTTL // [eth/61] Maximum time allowance before a block request is considered expired - headerTTL = 5 * time.Second // [eth/62] Time it takes for a header request to time out - bodySoftTTL = 3 * time.Second // [eth/62] Request completion threshold for increasing or decreasing a peer's bandwidth - bodyHardTTL = 3 * bodySoftTTL // [eth/62] Maximum time allowance before a block body request is considered expired - receiptSoftTTL = 3 * time.Second // [eth/63] Request completion threshold for increasing or decreasing a peer's bandwidth - receiptHardTTL = 3 * receiptSoftTTL // [eth/63] Maximum time allowance before a receipt request is considered expired - stateSoftTTL = 2 * time.Second // [eth/63] Request completion threshold for increasing or decreasing a peer's bandwidth - stateHardTTL = 3 * stateSoftTTL // [eth/63] Maximum time allowance before a node data request is considered expired + hashTTL = 3 * time.Second // [eth/61] Time it takes for a hash request to time out + blockTargetRTT = 3 * time.Second / 2 // [eth/61] Target time for completing a block retrieval request + blockTTL = 3 * blockTargetRTT // [eth/61] Maximum time allowance before a block request is considered expired + + headerTTL = 3 * time.Second // [eth/62] Time it takes for a header request to time out + bodyTargetRTT = 3 * time.Second / 2 // [eth/62] Target time for completing a block body retrieval request + bodyTTL = 3 * bodyTargetRTT // [eth/62] Maximum time allowance before a block body request is considered expired + receiptTargetRTT = 3 * time.Second / 2 // [eth/63] Target time for completing a receipt retrieval request + receiptTTL = 3 * receiptTargetRTT // [eth/63] Maximum time allowance before a receipt request is considered expired + stateTargetRTT = 2 * time.Second / 2 // [eth/63] Target time for completing a state trie retrieval request + stateTTL = 3 * stateTargetRTT // [eth/63] Maximum time allowance before a node data request is considered expired maxQueuedHashes = 256 * 1024 // [eth/61] Maximum number of hashes to queue for import (DOS protection) maxQueuedHeaders = 256 * 1024 // [eth/62] Maximum number of headers to queue for import (DOS protection) @@ -486,7 +487,7 @@ func (d *Downloader) fetchHeight61(p *peer) (uint64, error) { // Request the advertised remote head block and wait for the response go p.getBlocks([]common.Hash{p.head}) - timeout := time.After(blockSoftTTL) + timeout := time.After(hashTTL) for { select { case <-d.cancelCh: @@ -779,47 +780,27 @@ func (d *Downloader) fetchBlocks61(from uint64) error { // If the peer was previously banned and failed to deliver it's pack // in a reasonable time frame, ignore it's message. if peer := d.peers.Peer(packet.PeerId()); peer != nil { - // Deliver the received chunk of blocks, and demote in case of errors blocks := packet.(*blockPack).blocks - err := d.queue.DeliverBlocks(peer.id, blocks) - switch err { - case nil: - // If no blocks were delivered, demote the peer (need the delivery above) - if len(blocks) == 0 { - peer.Demote() - peer.SetBlocksIdle() - glog.V(logger.Detail).Infof("%s: no blocks delivered", peer) - break - } - // All was successful, promote the peer and potentially start processing - peer.Promote() - peer.SetBlocksIdle() - glog.V(logger.Detail).Infof("%s: delivered %d blocks", peer, len(blocks)) - case errInvalidChain: - // The hash chain is invalid (blocks are not ordered properly), abort + // Deliver the received chunk of blocks and check chain validity + accepted, err := d.queue.DeliverBlocks(peer.id, blocks) + if err == errInvalidChain { return err - - case errNoFetchesPending: - // Peer probably timed out with its delivery but came through - // in the end, demote, but allow to to pull from this peer. - peer.Demote() - peer.SetBlocksIdle() - glog.V(logger.Detail).Infof("%s: out of bound delivery", peer) - - case errStaleDelivery: - // Delivered something completely else than requested, usually - // caused by a timeout and delivery during a new sync cycle. - // Don't set it to idle as the original request should still be - // in flight. - peer.Demote() - glog.V(logger.Detail).Infof("%s: stale delivery", peer) - + } + // Unless a peer delivered something completely else than requested (usually + // caused by a timed out request which came through in the end), set it to + // idle. If the delivery's stale, the peer should have already been idled. + if err != errStaleDelivery { + peer.SetBlocksIdle(accepted) + } + // Issue a log to the user to see what's going on + switch { + case err == nil && len(blocks) == 0: + glog.V(logger.Detail).Infof("%s: no blocks delivered", peer) + case err == nil: + glog.V(logger.Detail).Infof("%s: delivered %d blocks", peer, len(blocks)) default: - // Peer did something semi-useful, demote but keep it around - peer.Demote() - peer.SetBlocksIdle() - glog.V(logger.Detail).Infof("%s: delivery partially failed: %v", peer, err) + glog.V(logger.Detail).Infof("%s: delivery failed: %v", peer, err) } } // Blocks arrived, try to update the progress @@ -852,10 +833,15 @@ func (d *Downloader) fetchBlocks61(from uint64) error { return errNoPeers } // Check for block request timeouts and demote the responsible peers - for _, pid := range d.queue.ExpireBlocks(blockHardTTL) { + for pid, fails := range d.queue.ExpireBlocks(blockTTL) { if peer := d.peers.Peer(pid); peer != nil { - peer.Demote() - glog.V(logger.Detail).Infof("%s: block delivery timeout", peer) + if fails > 1 { + glog.V(logger.Detail).Infof("%s: block delivery timeout", peer) + peer.SetBlocksIdle(0) + } else { + glog.V(logger.Debug).Infof("%s: stalling block delivery, dropping", peer) + d.dropPeer(pid) + } } } // If there's nothing more to fetch, wait or terminate @@ -1281,14 +1267,14 @@ func (d *Downloader) fetchBodies(from uint64) error { glog.V(logger.Debug).Infof("Downloading block bodies from #%d", from) var ( - deliver = func(packet dataPack) error { + deliver = func(packet dataPack) (int, error) { pack := packet.(*bodyPack) return d.queue.DeliverBodies(pack.peerId, pack.transactions, pack.uncles) } - expire = func() []string { return d.queue.ExpireBodies(bodyHardTTL) } + expire = func() map[string]int { return d.queue.ExpireBodies(bodyTTL) } fetch = func(p *peer, req *fetchRequest) error { return p.FetchBodies(req) } capacity = func(p *peer) int { return p.BlockCapacity() } - setIdle = func(p *peer) { p.SetBodiesIdle() } + setIdle = func(p *peer, accepted int) { p.SetBodiesIdle(accepted) } ) err := d.fetchParts(errCancelBodyFetch, d.bodyCh, deliver, d.bodyWakeCh, expire, d.queue.PendingBlocks, d.queue.InFlightBlocks, d.queue.ShouldThrottleBlocks, d.queue.ReserveBodies, @@ -1305,14 +1291,14 @@ func (d *Downloader) fetchReceipts(from uint64) error { glog.V(logger.Debug).Infof("Downloading receipts from #%d", from) var ( - deliver = func(packet dataPack) error { + deliver = func(packet dataPack) (int, error) { pack := packet.(*receiptPack) return d.queue.DeliverReceipts(pack.peerId, pack.receipts) } - expire = func() []string { return d.queue.ExpireReceipts(receiptHardTTL) } + expire = func() map[string]int { return d.queue.ExpireReceipts(receiptTTL) } fetch = func(p *peer, req *fetchRequest) error { return p.FetchReceipts(req) } capacity = func(p *peer) int { return p.ReceiptCapacity() } - setIdle = func(p *peer) { p.SetReceiptsIdle() } + setIdle = func(p *peer, accepted int) { p.SetReceiptsIdle(accepted) } ) err := d.fetchParts(errCancelReceiptFetch, d.receiptCh, deliver, d.receiptWakeCh, expire, d.queue.PendingReceipts, d.queue.InFlightReceipts, d.queue.ShouldThrottleReceipts, d.queue.ReserveReceipts, @@ -1329,7 +1315,7 @@ func (d *Downloader) fetchNodeData() error { glog.V(logger.Debug).Infof("Downloading node state data") var ( - deliver = func(packet dataPack) error { + deliver = func(packet dataPack) (int, error) { start := time.Now() return d.queue.DeliverNodeData(packet.PeerId(), packet.(*statePack).states, func(err error, delivered int) { if err != nil { @@ -1352,14 +1338,14 @@ func (d *Downloader) fetchNodeData() error { glog.V(logger.Info).Infof("imported %d state entries in %v: processed %d in total", delivered, time.Since(start), d.syncStatsStateDone) }) } - expire = func() []string { return d.queue.ExpireNodeData(stateHardTTL) } + expire = func() map[string]int { return d.queue.ExpireNodeData(stateTTL) } throttle = func() bool { return false } reserve = func(p *peer, count int) (*fetchRequest, bool, error) { return d.queue.ReserveNodeData(p, count), false, nil } fetch = func(p *peer, req *fetchRequest) error { return p.FetchNodeData(req) } capacity = func(p *peer) int { return p.NodeDataCapacity() } - setIdle = func(p *peer) { p.SetNodeDataIdle() } + setIdle = func(p *peer, accepted int) { p.SetNodeDataIdle(accepted) } ) err := d.fetchParts(errCancelStateFetch, d.stateCh, deliver, d.stateWakeCh, expire, d.queue.PendingNodeData, d.queue.InFlightNodeData, throttle, reserve, nil, fetch, @@ -1372,10 +1358,10 @@ func (d *Downloader) fetchNodeData() error { // fetchParts iteratively downloads scheduled block parts, taking any available // peers, reserving a chunk of fetch requests for each, waiting for delivery and // also periodically checking for timeouts. -func (d *Downloader) fetchParts(errCancel error, deliveryCh chan dataPack, deliver func(packet dataPack) error, wakeCh chan bool, - expire func() []string, pending func() int, inFlight func() bool, throttle func() bool, reserve func(*peer, int) (*fetchRequest, bool, error), +func (d *Downloader) fetchParts(errCancel error, deliveryCh chan dataPack, deliver func(dataPack) (int, error), wakeCh chan bool, + expire func() map[string]int, pending func() int, inFlight func() bool, throttle func() bool, reserve func(*peer, int) (*fetchRequest, bool, error), fetchHook func([]*types.Header), fetch func(*peer, *fetchRequest) error, cancel func(*fetchRequest), capacity func(*peer) int, - idle func() ([]*peer, int), setIdle func(*peer), kind string) error { + idle func() ([]*peer, int), setIdle func(*peer, int), kind string) error { // Create a ticker to detect expired retrieval tasks ticker := time.NewTicker(100 * time.Millisecond) @@ -1394,45 +1380,25 @@ func (d *Downloader) fetchParts(errCancel error, deliveryCh chan dataPack, deliv // If the peer was previously banned and failed to deliver it's pack // in a reasonable time frame, ignore it's message. if peer := d.peers.Peer(packet.PeerId()); peer != nil { - // Deliver the received chunk of data, and demote in case of errors - switch err := deliver(packet); err { - case nil: - // If no blocks were delivered, demote the peer (need the delivery above to clean internal queue!) - if packet.Items() == 0 { - peer.Demote() - setIdle(peer) - glog.V(logger.Detail).Infof("%s: no %s delivered", peer, strings.ToLower(kind)) - break - } - // All was successful, promote the peer and potentially start processing - peer.Promote() - setIdle(peer) - glog.V(logger.Detail).Infof("%s: delivered %s %s(s)", peer, packet.Stats(), strings.ToLower(kind)) - - case errInvalidChain: - // The hash chain is invalid (blocks are not ordered properly), abort + // Deliver the received chunk of data and check chain validity + accepted, err := deliver(packet) + if err == errInvalidChain { return err - - case errNoFetchesPending: - // Peer probably timed out with its delivery but came through - // in the end, demote, but allow to to pull from this peer. - peer.Demote() - setIdle(peer) - glog.V(logger.Detail).Infof("%s: out of bound %s delivery", peer, strings.ToLower(kind)) - - case errStaleDelivery: - // Delivered something completely else than requested, usually - // caused by a timeout and delivery during a new sync cycle. - // Don't set it to idle as the original request should still be - // in flight. - peer.Demote() - glog.V(logger.Detail).Infof("%s: %s stale delivery", peer, strings.ToLower(kind)) - + } + // Unless a peer delivered something completely else than requested (usually + // caused by a timed out request which came through in the end), set it to + // idle. If the delivery's stale, the peer should have already been idled. + if err != errStaleDelivery { + setIdle(peer, accepted) + } + // Issue a log to the user to see what's going on + switch { + case err == nil && packet.Items() == 0: + glog.V(logger.Detail).Infof("%s: no %s delivered", peer, strings.ToLower(kind)) + case err == nil: + glog.V(logger.Detail).Infof("%s: delivered %s %s(s)", peer, packet.Stats(), strings.ToLower(kind)) default: - // Peer did something semi-useful, demote but keep it around - peer.Demote() - setIdle(peer) - glog.V(logger.Detail).Infof("%s: %s delivery partially failed: %v", peer, strings.ToLower(kind), err) + glog.V(logger.Detail).Infof("%s: %s delivery failed: %v", peer, strings.ToLower(kind), err) } } // Blocks assembled, try to update the progress @@ -1465,11 +1431,15 @@ func (d *Downloader) fetchParts(errCancel error, deliveryCh chan dataPack, deliv return errNoPeers } // Check for fetch request timeouts and demote the responsible peers - for _, pid := range expire() { + for pid, fails := range expire() { if peer := d.peers.Peer(pid); peer != nil { - peer.Demote() - setIdle(peer) - glog.V(logger.Detail).Infof("%s: %s delivery timeout", peer, strings.ToLower(kind)) + if fails > 1 { + glog.V(logger.Detail).Infof("%s: %s delivery timeout", peer, strings.ToLower(kind)) + setIdle(peer, 0) + } else { + glog.V(logger.Debug).Infof("%s: stalling %s delivery, dropping", peer, strings.ToLower(kind)) + d.dropPeer(pid) + } } } // If there's nothing more to fetch, wait or terminate diff --git a/eth/downloader/peer.go b/eth/downloader/peer.go index 9ba6dabbd..80f08b68f 100644 --- a/eth/downloader/peer.go +++ b/eth/downloader/peer.go @@ -30,8 +30,10 @@ import ( "github.com/ethereum/go-ethereum/common" ) -// Maximum number of entries allowed on the list or lacking items. -const maxLackingHashes = 4096 +const ( + maxLackingHashes = 4096 // Maximum number of entries allowed on the list or lacking items + throughputImpact = 0.1 // The impact a single measurement has on a peer's final throughput value. +) // Hash and block fetchers belonging to eth/61 and below type relativeHashFetcherFn func(common.Hash) error @@ -59,18 +61,16 @@ type peer struct { blockIdle int32 // Current block activity state of the peer (idle = 0, active = 1) receiptIdle int32 // Current receipt activity state of the peer (idle = 0, active = 1) stateIdle int32 // Current node data activity state of the peer (idle = 0, active = 1) - rep int32 // Simple peer reputation - blockCapacity int32 // Number of blocks (bodies) allowed to fetch per request - receiptCapacity int32 // Number of receipts allowed to fetch per request - stateCapacity int32 // Number of node data pieces allowed to fetch per request + blockThroughput float64 // Number of blocks (bodies) measured to be retrievable per second + receiptThroughput float64 // Number of receipts measured to be retrievable per second + stateThroughput float64 // Number of node data pieces measured to be retrievable per second blockStarted time.Time // Time instance when the last block (body)fetch was started receiptStarted time.Time // Time instance when the last receipt fetch was started stateStarted time.Time // Time instance when the last node data fetch was started - lacking map[common.Hash]struct{} // Set of hashes not to request (didn't have previously) - lackingLock sync.RWMutex // Lock protecting the lacking hashes list + lacking map[common.Hash]struct{} // Set of hashes not to request (didn't have previously) getRelHashes relativeHashFetcherFn // [eth/61] Method to retrieve a batch of hashes from an origin hash getAbsHashes absoluteHashFetcherFn // [eth/61] Method to retrieve a batch of hashes from an absolute position @@ -84,6 +84,7 @@ type peer struct { getNodeData stateFetcherFn // [eth/63] Method to retrieve a batch of state trie data version int // Eth protocol version number to switch strategies + lock sync.RWMutex } // newPeer create a new downloader peer, with specific hash and block retrieval @@ -93,12 +94,9 @@ func newPeer(id string, version int, head common.Hash, getRelHeaders relativeHeaderFetcherFn, getAbsHeaders absoluteHeaderFetcherFn, getBlockBodies blockBodyFetcherFn, getReceipts receiptFetcherFn, getNodeData stateFetcherFn) *peer { return &peer{ - id: id, - head: head, - blockCapacity: 1, - receiptCapacity: 1, - stateCapacity: 1, - lacking: make(map[common.Hash]struct{}), + id: id, + head: head, + lacking: make(map[common.Hash]struct{}), getRelHashes: getRelHashes, getAbsHashes: getAbsHashes, @@ -117,15 +115,18 @@ func newPeer(id string, version int, head common.Hash, // Reset clears the internal state of a peer entity. func (p *peer) Reset() { + p.lock.Lock() + defer p.lock.Unlock() + atomic.StoreInt32(&p.blockIdle, 0) atomic.StoreInt32(&p.receiptIdle, 0) - atomic.StoreInt32(&p.blockCapacity, 1) - atomic.StoreInt32(&p.receiptCapacity, 1) - atomic.StoreInt32(&p.stateCapacity, 1) + atomic.StoreInt32(&p.stateIdle, 0) + + p.blockThroughput = 0 + p.receiptThroughput = 0 + p.stateThroughput = 0 - p.lackingLock.Lock() p.lacking = make(map[common.Hash]struct{}) - p.lackingLock.Unlock() } // Fetch61 sends a block retrieval request to the remote peer. @@ -216,107 +217,86 @@ func (p *peer) FetchNodeData(request *fetchRequest) error { return nil } -// SetBlocksIdle sets the peer to idle, allowing it to execute new retrieval requests. -// Its block retrieval allowance will also be updated either up- or downwards, -// depending on whether the previous fetch completed in time. -func (p *peer) SetBlocksIdle() { - p.setIdle(p.blockStarted, blockSoftTTL, blockHardTTL, MaxBlockFetch, &p.blockCapacity, &p.blockIdle) +// SetBlocksIdle sets the peer to idle, allowing it to execute new block retrieval +// requests. Its estimated block retrieval throughput is updated with that measured +// just now. +func (p *peer) SetBlocksIdle(delivered int) { + p.setIdle(p.blockStarted, delivered, &p.blockThroughput, &p.blockIdle) } -// SetBodiesIdle sets the peer to idle, allowing it to execute new retrieval requests. -// Its block body retrieval allowance will also be updated either up- or downwards, -// depending on whether the previous fetch completed in time. -func (p *peer) SetBodiesIdle() { - p.setIdle(p.blockStarted, bodySoftTTL, bodyHardTTL, MaxBodyFetch, &p.blockCapacity, &p.blockIdle) +// SetBodiesIdle sets the peer to idle, allowing it to execute block body retrieval +// requests. Its estimated body retrieval throughput is updated with that measured +// just now. +func (p *peer) SetBodiesIdle(delivered int) { + p.setIdle(p.blockStarted, delivered, &p.blockThroughput, &p.blockIdle) } -// SetReceiptsIdle sets the peer to idle, allowing it to execute new retrieval requests. -// Its receipt retrieval allowance will also be updated either up- or downwards, -// depending on whether the previous fetch completed in time. -func (p *peer) SetReceiptsIdle() { - p.setIdle(p.receiptStarted, receiptSoftTTL, receiptHardTTL, MaxReceiptFetch, &p.receiptCapacity, &p.receiptIdle) +// SetReceiptsIdle sets the peer to idle, allowing it to execute new receipt +// retrieval requests. Its estimated receipt retrieval throughput is updated +// with that measured just now. +func (p *peer) SetReceiptsIdle(delivered int) { + p.setIdle(p.receiptStarted, delivered, &p.receiptThroughput, &p.receiptIdle) } -// SetNodeDataIdle sets the peer to idle, allowing it to execute new retrieval -// requests. Its node data retrieval allowance will also be updated either up- or -// downwards, depending on whether the previous fetch completed in time. -func (p *peer) SetNodeDataIdle() { - p.setIdle(p.stateStarted, stateSoftTTL, stateSoftTTL, MaxStateFetch, &p.stateCapacity, &p.stateIdle) +// SetNodeDataIdle sets the peer to idle, allowing it to execute new state trie +// data retrieval requests. Its estimated state retrieval throughput is updated +// with that measured just now. +func (p *peer) SetNodeDataIdle(delivered int) { + p.setIdle(p.stateStarted, delivered, &p.stateThroughput, &p.stateIdle) } // setIdle sets the peer to idle, allowing it to execute new retrieval requests. -// Its data retrieval allowance will also be updated either up- or downwards, -// depending on whether the previous fetch completed in time. -func (p *peer) setIdle(started time.Time, softTTL, hardTTL time.Duration, maxFetch int, capacity, idle *int32) { - // Update the peer's download allowance based on previous performance - scale := 2.0 - if time.Since(started) > softTTL { - scale = 0.5 - if time.Since(started) > hardTTL { - scale = 1 / float64(maxFetch) // reduces capacity to 1 - } - } - for { - // Calculate the new download bandwidth allowance - prev := atomic.LoadInt32(capacity) - next := int32(math.Max(1, math.Min(float64(maxFetch), float64(prev)*scale))) +// Its estimated retrieval throughput is updated with that measured just now. +func (p *peer) setIdle(started time.Time, delivered int, throughput *float64, idle *int32) { + // Irrelevant of the scaling, make sure the peer ends up idle + defer atomic.StoreInt32(idle, 0) - // Try to update the old value - if atomic.CompareAndSwapInt32(capacity, prev, next) { - // If we're having problems at 1 capacity, try to find better peers - if next == 1 { - p.Demote() - } - break - } + p.lock.RLock() + defer p.lock.RUnlock() + + // If nothing was delivered (hard timeout / unavailable data), reduce throughput to minimum + if delivered == 0 { + *throughput = 0 + return } - // Set the peer to idle to allow further fetch requests - atomic.StoreInt32(idle, 0) + // Otherwise update the throughput with a new measurement + measured := float64(delivered) / (float64(time.Since(started)+1) / float64(time.Second)) // +1 (ns) to ensure non-zero divisor + *throughput = (1-throughputImpact)*(*throughput) + throughputImpact*measured } // BlockCapacity retrieves the peers block download allowance based on its -// previously discovered bandwidth capacity. +// previously discovered throughput. func (p *peer) BlockCapacity() int { - return int(atomic.LoadInt32(&p.blockCapacity)) + p.lock.RLock() + defer p.lock.RUnlock() + + return int(math.Max(1, math.Min(p.blockThroughput*float64(blockTargetRTT)/float64(time.Second), float64(MaxBlockFetch)))) } -// ReceiptCapacity retrieves the peers block download allowance based on its -// previously discovered bandwidth capacity. +// ReceiptCapacity retrieves the peers receipt download allowance based on its +// previously discovered throughput. func (p *peer) ReceiptCapacity() int { - return int(atomic.LoadInt32(&p.receiptCapacity)) + p.lock.RLock() + defer p.lock.RUnlock() + + return int(math.Max(1, math.Min(p.receiptThroughput*float64(receiptTargetRTT)/float64(time.Second), float64(MaxReceiptFetch)))) } -// NodeDataCapacity retrieves the peers block download allowance based on its -// previously discovered bandwidth capacity. +// NodeDataCapacity retrieves the peers state download allowance based on its +// previously discovered throughput. func (p *peer) NodeDataCapacity() int { - return int(atomic.LoadInt32(&p.stateCapacity)) -} + p.lock.RLock() + defer p.lock.RUnlock() -// Promote increases the peer's reputation. -func (p *peer) Promote() { - atomic.AddInt32(&p.rep, 1) -} - -// Demote decreases the peer's reputation or leaves it at 0. -func (p *peer) Demote() { - for { - // Calculate the new reputation value - prev := atomic.LoadInt32(&p.rep) - next := prev / 2 - - // Try to update the old value - if atomic.CompareAndSwapInt32(&p.rep, prev, next) { - return - } - } + return int(math.Max(1, math.Min(p.stateThroughput*float64(stateTargetRTT)/float64(time.Second), float64(MaxStateFetch)))) } // MarkLacking appends a new entity to the set of items (blocks, receipts, states) // that a peer is known not to have (i.e. have been requested before). If the // set reaches its maximum allowed capacity, items are randomly dropped off. func (p *peer) MarkLacking(hash common.Hash) { - p.lackingLock.Lock() - defer p.lackingLock.Unlock() + p.lock.Lock() + defer p.lock.Unlock() for len(p.lacking) >= maxLackingHashes { for drop, _ := range p.lacking { @@ -330,8 +310,8 @@ func (p *peer) MarkLacking(hash common.Hash) { // Lacks retrieves whether the hash of a blockchain item is on the peers lacking // list (i.e. whether we know that the peer does not have it). func (p *peer) Lacks(hash common.Hash) bool { - p.lackingLock.RLock() - defer p.lackingLock.RUnlock() + p.lock.RLock() + defer p.lock.RUnlock() _, ok := p.lacking[hash] return ok @@ -339,13 +319,13 @@ func (p *peer) Lacks(hash common.Hash) bool { // String implements fmt.Stringer. func (p *peer) String() string { - p.lackingLock.RLock() - defer p.lackingLock.RUnlock() + p.lock.RLock() + defer p.lock.RUnlock() return fmt.Sprintf("Peer %s [%s]", p.id, - fmt.Sprintf("reputation %3d, ", atomic.LoadInt32(&p.rep))+ - fmt.Sprintf("block cap %3d, ", atomic.LoadInt32(&p.blockCapacity))+ - fmt.Sprintf("receipt cap %3d, ", atomic.LoadInt32(&p.receiptCapacity))+ + fmt.Sprintf("blocks %3.2f/s, ", p.blockThroughput)+ + fmt.Sprintf("receipts %3.2f/s, ", p.receiptThroughput)+ + fmt.Sprintf("states %3.2f/s, ", p.stateThroughput)+ fmt.Sprintf("lacking %4d", len(p.lacking)), ) } @@ -377,6 +357,10 @@ func (ps *peerSet) Reset() { // Register injects a new peer into the working set, or returns an error if the // peer is already known. +// +// The method also sets the starting throughput values of the new peer to the +// average of all existing peers, to give it a realistic change of being used +// for data retrievals. func (ps *peerSet) Register(p *peer) error { ps.lock.Lock() defer ps.lock.Unlock() @@ -384,6 +368,20 @@ func (ps *peerSet) Register(p *peer) error { if _, ok := ps.peers[p.id]; ok { return errAlreadyRegistered } + if len(ps.peers) > 0 { + p.blockThroughput, p.receiptThroughput, p.stateThroughput = 0, 0, 0 + + for _, peer := range ps.peers { + peer.lock.RLock() + p.blockThroughput += peer.blockThroughput + p.receiptThroughput += peer.receiptThroughput + p.stateThroughput += peer.stateThroughput + peer.lock.RUnlock() + } + p.blockThroughput /= float64(len(ps.peers)) + p.receiptThroughput /= float64(len(ps.peers)) + p.stateThroughput /= float64(len(ps.peers)) + } ps.peers[p.id] = p return nil } @@ -435,7 +433,12 @@ func (ps *peerSet) BlockIdlePeers() ([]*peer, int) { idle := func(p *peer) bool { return atomic.LoadInt32(&p.blockIdle) == 0 } - return ps.idlePeers(61, 61, idle) + throughput := func(p *peer) float64 { + p.lock.RLock() + defer p.lock.RUnlock() + return p.blockThroughput + } + return ps.idlePeers(61, 61, idle, throughput) } // BodyIdlePeers retrieves a flat list of all the currently body-idle peers within @@ -444,7 +447,12 @@ func (ps *peerSet) BodyIdlePeers() ([]*peer, int) { idle := func(p *peer) bool { return atomic.LoadInt32(&p.blockIdle) == 0 } - return ps.idlePeers(62, 64, idle) + throughput := func(p *peer) float64 { + p.lock.RLock() + defer p.lock.RUnlock() + return p.blockThroughput + } + return ps.idlePeers(62, 64, idle, throughput) } // ReceiptIdlePeers retrieves a flat list of all the currently receipt-idle peers @@ -453,7 +461,12 @@ func (ps *peerSet) ReceiptIdlePeers() ([]*peer, int) { idle := func(p *peer) bool { return atomic.LoadInt32(&p.receiptIdle) == 0 } - return ps.idlePeers(63, 64, idle) + throughput := func(p *peer) float64 { + p.lock.RLock() + defer p.lock.RUnlock() + return p.receiptThroughput + } + return ps.idlePeers(63, 64, idle, throughput) } // NodeDataIdlePeers retrieves a flat list of all the currently node-data-idle @@ -462,12 +475,18 @@ func (ps *peerSet) NodeDataIdlePeers() ([]*peer, int) { idle := func(p *peer) bool { return atomic.LoadInt32(&p.stateIdle) == 0 } - return ps.idlePeers(63, 64, idle) + throughput := func(p *peer) float64 { + p.lock.RLock() + defer p.lock.RUnlock() + return p.stateThroughput + } + return ps.idlePeers(63, 64, idle, throughput) } // idlePeers retrieves a flat list of all currently idle peers satisfying the // protocol version constraints, using the provided function to check idleness. -func (ps *peerSet) idlePeers(minProtocol, maxProtocol int, idleCheck func(*peer) bool) ([]*peer, int) { +// The resulting set of peers are sorted by their measure throughput. +func (ps *peerSet) idlePeers(minProtocol, maxProtocol int, idleCheck func(*peer) bool, throughput func(*peer) float64) ([]*peer, int) { ps.lock.RLock() defer ps.lock.RUnlock() @@ -482,7 +501,7 @@ func (ps *peerSet) idlePeers(minProtocol, maxProtocol int, idleCheck func(*peer) } for i := 0; i < len(idle); i++ { for j := i + 1; j < len(idle); j++ { - if atomic.LoadInt32(&idle[i].rep) < atomic.LoadInt32(&idle[j].rep) { + if throughput(idle[i]) < throughput(idle[j]) { idle[i], idle[j] = idle[j], idle[i] } } diff --git a/eth/downloader/queue.go b/eth/downloader/queue.go index 584797d7b..1e55560db 100644 --- a/eth/downloader/queue.go +++ b/eth/downloader/queue.go @@ -703,7 +703,7 @@ func (q *queue) Revoke(peerId string) { // ExpireBlocks checks for in flight requests that exceeded a timeout allowance, // canceling them and returning the responsible peers for penalisation. -func (q *queue) ExpireBlocks(timeout time.Duration) []string { +func (q *queue) ExpireBlocks(timeout time.Duration) map[string]int { q.lock.Lock() defer q.lock.Unlock() @@ -712,7 +712,7 @@ func (q *queue) ExpireBlocks(timeout time.Duration) []string { // ExpireBodies checks for in flight block body requests that exceeded a timeout // allowance, canceling them and returning the responsible peers for penalisation. -func (q *queue) ExpireBodies(timeout time.Duration) []string { +func (q *queue) ExpireBodies(timeout time.Duration) map[string]int { q.lock.Lock() defer q.lock.Unlock() @@ -721,7 +721,7 @@ func (q *queue) ExpireBodies(timeout time.Duration) []string { // ExpireReceipts checks for in flight receipt requests that exceeded a timeout // allowance, canceling them and returning the responsible peers for penalisation. -func (q *queue) ExpireReceipts(timeout time.Duration) []string { +func (q *queue) ExpireReceipts(timeout time.Duration) map[string]int { q.lock.Lock() defer q.lock.Unlock() @@ -730,7 +730,7 @@ func (q *queue) ExpireReceipts(timeout time.Duration) []string { // ExpireNodeData checks for in flight node data requests that exceeded a timeout // allowance, canceling them and returning the responsible peers for penalisation. -func (q *queue) ExpireNodeData(timeout time.Duration) []string { +func (q *queue) ExpireNodeData(timeout time.Duration) map[string]int { q.lock.Lock() defer q.lock.Unlock() @@ -743,9 +743,9 @@ func (q *queue) ExpireNodeData(timeout time.Duration) []string { // Note, this method expects the queue lock to be already held. The // reason the lock is not obtained in here is because the parameters already need // to access the queue, so they already need a lock anyway. -func (q *queue) expire(timeout time.Duration, pendPool map[string]*fetchRequest, taskQueue *prque.Prque, timeoutMeter metrics.Meter) []string { +func (q *queue) expire(timeout time.Duration, pendPool map[string]*fetchRequest, taskQueue *prque.Prque, timeoutMeter metrics.Meter) map[string]int { // Iterate over the expired requests and return each to the queue - peers := []string{} + expiries := make(map[string]int) for id, request := range pendPool { if time.Since(request.Time) > timeout { // Update the metrics with the timeout @@ -758,25 +758,32 @@ func (q *queue) expire(timeout time.Duration, pendPool map[string]*fetchRequest, for _, header := range request.Headers { taskQueue.Push(header, -float32(header.Number.Uint64())) } - peers = append(peers, id) + // Add the peer to the expiry report along the the number of failed requests + expirations := len(request.Hashes) + if expirations < len(request.Headers) { + expirations = len(request.Headers) + } + expiries[id] = expirations } } // Remove the expired requests from the pending pool - for _, id := range peers { + for id, _ := range expiries { delete(pendPool, id) } - return peers + return expiries } -// DeliverBlocks injects a block retrieval response into the download queue. -func (q *queue) DeliverBlocks(id string, blocks []*types.Block) error { +// DeliverBlocks injects a block retrieval response into the download queue. The +// method returns the number of blocks accepted from the delivery and also wakes +// any threads waiting for data delivery. +func (q *queue) DeliverBlocks(id string, blocks []*types.Block) (int, error) { q.lock.Lock() defer q.lock.Unlock() // Short circuit if the blocks were never requested request := q.blockPendPool[id] if request == nil { - return errNoFetchesPending + return 0, errNoFetchesPending } blockReqTimer.UpdateSince(request.Time) delete(q.blockPendPool, id) @@ -788,7 +795,7 @@ func (q *queue) DeliverBlocks(id string, blocks []*types.Block) error { } } // Iterate over the downloaded blocks and add each of them - errs := make([]error, 0) + accepted, errs := 0, make([]error, 0) for _, block := range blocks { // Skip any blocks that were not requested hash := block.Hash() @@ -811,28 +818,33 @@ func (q *queue) DeliverBlocks(id string, blocks []*types.Block) error { delete(request.Hashes, hash) delete(q.hashPool, hash) + accepted++ } // Return all failed or missing fetches to the queue for hash, index := range request.Hashes { q.hashQueue.Push(hash, float32(index)) } // Wake up WaitResults - q.active.Signal() + if accepted > 0 { + q.active.Signal() + } // If none of the blocks were good, it's a stale delivery switch { case len(errs) == 0: - return nil + return accepted, nil case len(errs) == 1 && (errs[0] == errInvalidChain || errs[0] == errInvalidBlock): - return errs[0] + return accepted, errs[0] case len(errs) == len(blocks): - return errStaleDelivery + return accepted, errStaleDelivery default: - return fmt.Errorf("multiple failures: %v", errs) + return accepted, fmt.Errorf("multiple failures: %v", errs) } } // DeliverBodies injects a block body retrieval response into the results queue. -func (q *queue) DeliverBodies(id string, txLists [][]*types.Transaction, uncleLists [][]*types.Header) error { +// The method returns the number of blocks bodies accepted from the delivery and +// also wakes any threads waiting for data delivery. +func (q *queue) DeliverBodies(id string, txLists [][]*types.Transaction, uncleLists [][]*types.Header) (int, error) { q.lock.Lock() defer q.lock.Unlock() @@ -848,7 +860,9 @@ func (q *queue) DeliverBodies(id string, txLists [][]*types.Transaction, uncleLi } // DeliverReceipts injects a receipt retrieval response into the results queue. -func (q *queue) DeliverReceipts(id string, receiptList [][]*types.Receipt) error { +// The method returns the number of transaction receipts accepted from the delivery +// and also wakes any threads waiting for data delivery. +func (q *queue) DeliverReceipts(id string, receiptList [][]*types.Receipt) (int, error) { q.lock.Lock() defer q.lock.Unlock() @@ -867,12 +881,14 @@ func (q *queue) DeliverReceipts(id string, receiptList [][]*types.Receipt) error // Note, this method expects the queue lock to be already held for writing. The // reason the lock is not obtained in here is because the parameters already need // to access the queue, so they already need a lock anyway. -func (q *queue) deliver(id string, taskPool map[common.Hash]*types.Header, taskQueue *prque.Prque, pendPool map[string]*fetchRequest, - donePool map[common.Hash]struct{}, reqTimer metrics.Timer, results int, reconstruct func(header *types.Header, index int, result *fetchResult) error) error { +func (q *queue) deliver(id string, taskPool map[common.Hash]*types.Header, taskQueue *prque.Prque, + pendPool map[string]*fetchRequest, donePool map[common.Hash]struct{}, reqTimer metrics.Timer, + results int, reconstruct func(header *types.Header, index int, result *fetchResult) error) (int, error) { + // Short circuit if the data was never requested request := pendPool[id] if request == nil { - return errNoFetchesPending + return 0, errNoFetchesPending } reqTimer.UpdateSince(request.Time) delete(pendPool, id) @@ -885,8 +901,9 @@ func (q *queue) deliver(id string, taskPool map[common.Hash]*types.Header, taskQ } // Assemble each of the results with their headers and retrieved data parts var ( - failure error - useful bool + accepted int + failure error + useful bool ) for i, header := range request.Headers { // Short circuit assembly if no more fetch results are found @@ -906,6 +923,7 @@ func (q *queue) deliver(id string, taskPool map[common.Hash]*types.Header, taskQ donePool[header.Hash()] = struct{}{} q.resultCache[index].Pending-- useful = true + accepted++ // Clean up a successful fetch request.Headers[i] = nil @@ -918,27 +936,31 @@ func (q *queue) deliver(id string, taskPool map[common.Hash]*types.Header, taskQ } } // Wake up WaitResults - q.active.Signal() + if accepted > 0 { + q.active.Signal() + } // If none of the data was good, it's a stale delivery switch { case failure == nil || failure == errInvalidChain: - return failure + return accepted, failure case useful: - return fmt.Errorf("partial failure: %v", failure) + return accepted, fmt.Errorf("partial failure: %v", failure) default: - return errStaleDelivery + return accepted, errStaleDelivery } } // DeliverNodeData injects a node state data retrieval response into the queue. -func (q *queue) DeliverNodeData(id string, data [][]byte, callback func(error, int)) error { +// The method returns the number of node state entries originally requested, and +// the number of them actually accepted from the delivery. +func (q *queue) DeliverNodeData(id string, data [][]byte, callback func(error, int)) (int, error) { q.lock.Lock() defer q.lock.Unlock() // Short circuit if the data was never requested request := q.statePendPool[id] if request == nil { - return errNoFetchesPending + return 0, errNoFetchesPending } stateReqTimer.UpdateSince(request.Time) delete(q.statePendPool, id) @@ -950,10 +972,10 @@ func (q *queue) DeliverNodeData(id string, data [][]byte, callback func(error, i } } // Iterate over the downloaded data and verify each of them - errs := make([]error, 0) + accepted, errs := 0, make([]error, 0) process := []trie.SyncResult{} for _, blob := range data { - // Skip any blocks that were not requested + // Skip any state trie entires that were not requested hash := common.BytesToHash(crypto.Sha3(blob)) if _, ok := request.Hashes[hash]; !ok { errs = append(errs, fmt.Errorf("non-requested state data %x", hash)) @@ -961,6 +983,7 @@ func (q *queue) DeliverNodeData(id string, data [][]byte, callback func(error, i } // Inject the next state trie item into the processing queue process = append(process, trie.SyncResult{hash, blob}) + accepted++ delete(request.Hashes, hash) delete(q.stateTaskPool, hash) @@ -978,11 +1001,11 @@ func (q *queue) DeliverNodeData(id string, data [][]byte, callback func(error, i // If none of the data items were good, it's a stale delivery switch { case len(errs) == 0: - return nil + return accepted, nil case len(errs) == len(request.Hashes): - return errStaleDelivery + return accepted, errStaleDelivery default: - return fmt.Errorf("multiple failures: %v", errs) + return accepted, fmt.Errorf("multiple failures: %v", errs) } }