Merge pull request #22678 from karalabe/snap-ephemeral-channels

eth/protocols/snap: use ephemeral channels to avoid cross-sync delveries
This commit is contained in:
Péter Szilágyi 2021-04-16 09:27:39 +03:00 committed by GitHub
commit f8afb681dd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 112 additions and 101 deletions

View File

@ -106,9 +106,11 @@ type accountRequest struct {
peer string // Peer to which this request is assigned
id uint64 // Request ID of this request
cancel chan struct{} // Channel to track sync cancellation
timeout *time.Timer // Timer to track delivery timeout
stale chan struct{} // Channel to signal the request was dropped
deliver chan *accountResponse // Channel to deliver successful response on
revert chan *accountRequest // Channel to deliver request failure on
cancel chan struct{} // Channel to track sync cancellation
timeout *time.Timer // Timer to track delivery timeout
stale chan struct{} // Channel to signal the request was dropped
origin common.Hash // First account requested to allow continuation checks
limit common.Hash // Last account requested to allow non-overlapping chunking
@ -147,9 +149,11 @@ type bytecodeRequest struct {
peer string // Peer to which this request is assigned
id uint64 // Request ID of this request
cancel chan struct{} // Channel to track sync cancellation
timeout *time.Timer // Timer to track delivery timeout
stale chan struct{} // Channel to signal the request was dropped
deliver chan *bytecodeResponse // Channel to deliver successful response on
revert chan *bytecodeRequest // Channel to deliver request failure on
cancel chan struct{} // Channel to track sync cancellation
timeout *time.Timer // Timer to track delivery timeout
stale chan struct{} // Channel to signal the request was dropped
hashes []common.Hash // Bytecode hashes to validate responses
task *accountTask // Task which this request is filling (only access fields through the runloop!!)
@ -176,9 +180,11 @@ type storageRequest struct {
peer string // Peer to which this request is assigned
id uint64 // Request ID of this request
cancel chan struct{} // Channel to track sync cancellation
timeout *time.Timer // Timer to track delivery timeout
stale chan struct{} // Channel to signal the request was dropped
deliver chan *storageResponse // Channel to deliver successful response on
revert chan *storageRequest // Channel to deliver request failure on
cancel chan struct{} // Channel to track sync cancellation
timeout *time.Timer // Timer to track delivery timeout
stale chan struct{} // Channel to signal the request was dropped
accounts []common.Hash // Account hashes to validate responses
roots []common.Hash // Storage roots to validate responses
@ -224,9 +230,11 @@ type trienodeHealRequest struct {
peer string // Peer to which this request is assigned
id uint64 // Request ID of this request
cancel chan struct{} // Channel to track sync cancellation
timeout *time.Timer // Timer to track delivery timeout
stale chan struct{} // Channel to signal the request was dropped
deliver chan *trienodeHealResponse // Channel to deliver successful response on
revert chan *trienodeHealRequest // Channel to deliver request failure on
cancel chan struct{} // Channel to track sync cancellation
timeout *time.Timer // Timer to track delivery timeout
stale chan struct{} // Channel to signal the request was dropped
hashes []common.Hash // Trie node hashes to validate responses
paths []trie.SyncPath // Trie node paths requested for rescheduling
@ -256,9 +264,11 @@ type bytecodeHealRequest struct {
peer string // Peer to which this request is assigned
id uint64 // Request ID of this request
cancel chan struct{} // Channel to track sync cancellation
timeout *time.Timer // Timer to track delivery timeout
stale chan struct{} // Channel to signal the request was dropped
deliver chan *bytecodeHealResponse // Channel to deliver successful response on
revert chan *bytecodeHealRequest // Channel to deliver request failure on
cancel chan struct{} // Channel to track sync cancellation
timeout *time.Timer // Timer to track delivery timeout
stale chan struct{} // Channel to signal the request was dropped
hashes []common.Hash // Bytecode hashes to validate responses
task *healTask // Task which this request is filling (only access fields through the runloop!!)
@ -399,14 +409,6 @@ type Syncer struct {
bytecodeReqs map[uint64]*bytecodeRequest // Bytecode requests currently running
storageReqs map[uint64]*storageRequest // Storage requests currently running
accountReqFails chan *accountRequest // Failed account range requests to revert
bytecodeReqFails chan *bytecodeRequest // Failed bytecode requests to revert
storageReqFails chan *storageRequest // Failed storage requests to revert
accountResps chan *accountResponse // Account sub-tries to integrate into the database
bytecodeResps chan *bytecodeResponse // Bytecodes to integrate into the database
storageResps chan *storageResponse // Storage sub-tries to integrate into the database
accountSynced uint64 // Number of accounts downloaded
accountBytes common.StorageSize // Number of account trie bytes persisted to disk
bytecodeSynced uint64 // Number of bytecodes downloaded
@ -421,12 +423,6 @@ type Syncer struct {
trienodeHealReqs map[uint64]*trienodeHealRequest // Trie node requests currently running
bytecodeHealReqs map[uint64]*bytecodeHealRequest // Bytecode requests currently running
trienodeHealReqFails chan *trienodeHealRequest // Failed trienode requests to revert
bytecodeHealReqFails chan *bytecodeHealRequest // Failed bytecode requests to revert
trienodeHealResps chan *trienodeHealResponse // Trie nodes to integrate into the database
bytecodeHealResps chan *bytecodeHealResponse // Bytecodes to integrate into the database
trienodeHealSynced uint64 // Number of state trie nodes downloaded
trienodeHealBytes common.StorageSize // Number of state trie bytes persisted to disk
trienodeHealDups uint64 // Number of state trie nodes already processed
@ -464,26 +460,16 @@ func NewSyncer(db ethdb.KeyValueStore) *Syncer {
storageIdlers: make(map[string]struct{}),
bytecodeIdlers: make(map[string]struct{}),
accountReqs: make(map[uint64]*accountRequest),
storageReqs: make(map[uint64]*storageRequest),
bytecodeReqs: make(map[uint64]*bytecodeRequest),
accountReqFails: make(chan *accountRequest),
storageReqFails: make(chan *storageRequest),
bytecodeReqFails: make(chan *bytecodeRequest),
accountResps: make(chan *accountResponse),
storageResps: make(chan *storageResponse),
bytecodeResps: make(chan *bytecodeResponse),
accountReqs: make(map[uint64]*accountRequest),
storageReqs: make(map[uint64]*storageRequest),
bytecodeReqs: make(map[uint64]*bytecodeRequest),
trienodeHealIdlers: make(map[string]struct{}),
bytecodeHealIdlers: make(map[string]struct{}),
trienodeHealReqs: make(map[uint64]*trienodeHealRequest),
bytecodeHealReqs: make(map[uint64]*bytecodeHealRequest),
trienodeHealReqFails: make(chan *trienodeHealRequest),
bytecodeHealReqFails: make(chan *bytecodeHealRequest),
trienodeHealResps: make(chan *trienodeHealResponse),
bytecodeHealResps: make(chan *bytecodeHealResponse),
stateWriter: db.NewBatch(),
trienodeHealReqs: make(map[uint64]*trienodeHealRequest),
bytecodeHealReqs: make(map[uint64]*bytecodeHealRequest),
stateWriter: db.NewBatch(),
}
}
@ -611,6 +597,21 @@ func (s *Syncer) Sync(root common.Hash, cancel chan struct{}) error {
peerDropSub := s.peerDrop.Subscribe(peerDrop)
defer peerDropSub.Unsubscribe()
// Create a set of unique channels for this sync cycle. We need these to be
// ephemeral so a data race doesn't accidentally deliver something stale on
// a persistent channel across syncs (yup, this happened)
var (
accountReqFails = make(chan *accountRequest)
storageReqFails = make(chan *storageRequest)
bytecodeReqFails = make(chan *bytecodeRequest)
accountResps = make(chan *accountResponse)
storageResps = make(chan *storageResponse)
bytecodeResps = make(chan *bytecodeResponse)
trienodeHealReqFails = make(chan *trienodeHealRequest)
bytecodeHealReqFails = make(chan *bytecodeHealRequest)
trienodeHealResps = make(chan *trienodeHealResponse)
bytecodeHealResps = make(chan *bytecodeHealResponse)
)
for {
// Remove all completed tasks and terminate sync if everything's done
s.cleanStorageTasks()
@ -619,14 +620,14 @@ func (s *Syncer) Sync(root common.Hash, cancel chan struct{}) error {
return nil
}
// Assign all the data retrieval tasks to any free peers
s.assignAccountTasks(cancel)
s.assignBytecodeTasks(cancel)
s.assignStorageTasks(cancel)
s.assignAccountTasks(accountResps, accountReqFails, cancel)
s.assignBytecodeTasks(bytecodeResps, bytecodeReqFails, cancel)
s.assignStorageTasks(storageResps, storageReqFails, cancel)
if len(s.tasks) == 0 {
// Sync phase done, run heal phase
s.assignTrienodeHealTasks(cancel)
s.assignBytecodeHealTasks(cancel)
s.assignTrienodeHealTasks(trienodeHealResps, trienodeHealReqFails, cancel)
s.assignBytecodeHealTasks(bytecodeHealResps, bytecodeHealReqFails, cancel)
}
// Wait for something to happen
select {
@ -639,26 +640,26 @@ func (s *Syncer) Sync(root common.Hash, cancel chan struct{}) error {
case <-cancel:
return ErrCancelled
case req := <-s.accountReqFails:
case req := <-accountReqFails:
s.revertAccountRequest(req)
case req := <-s.bytecodeReqFails:
case req := <-bytecodeReqFails:
s.revertBytecodeRequest(req)
case req := <-s.storageReqFails:
case req := <-storageReqFails:
s.revertStorageRequest(req)
case req := <-s.trienodeHealReqFails:
case req := <-trienodeHealReqFails:
s.revertTrienodeHealRequest(req)
case req := <-s.bytecodeHealReqFails:
case req := <-bytecodeHealReqFails:
s.revertBytecodeHealRequest(req)
case res := <-s.accountResps:
case res := <-accountResps:
s.processAccountResponse(res)
case res := <-s.bytecodeResps:
case res := <-bytecodeResps:
s.processBytecodeResponse(res)
case res := <-s.storageResps:
case res := <-storageResps:
s.processStorageResponse(res)
case res := <-s.trienodeHealResps:
case res := <-trienodeHealResps:
s.processTrienodeHealResponse(res)
case res := <-s.bytecodeHealResps:
case res := <-bytecodeHealResps:
s.processBytecodeHealResponse(res)
}
// Report stats if something meaningful happened
@ -801,7 +802,7 @@ func (s *Syncer) cleanStorageTasks() {
// assignAccountTasks attempts to match idle peers to pending account range
// retrievals.
func (s *Syncer) assignAccountTasks(cancel chan struct{}) {
func (s *Syncer) assignAccountTasks(success chan *accountResponse, fail chan *accountRequest, cancel chan struct{}) {
s.lock.Lock()
defer s.lock.Unlock()
@ -847,13 +848,15 @@ func (s *Syncer) assignAccountTasks(cancel chan struct{}) {
}
// Generate the network query and send it to the peer
req := &accountRequest{
peer: idle,
id: reqid,
cancel: cancel,
stale: make(chan struct{}),
origin: task.Next,
limit: task.Last,
task: task,
peer: idle,
id: reqid,
deliver: success,
revert: fail,
cancel: cancel,
stale: make(chan struct{}),
origin: task.Next,
limit: task.Last,
task: task,
}
req.timeout = time.AfterFunc(requestTimeout, func() {
peer.Log().Debug("Account range request timed out", "reqid", reqid)
@ -879,7 +882,7 @@ func (s *Syncer) assignAccountTasks(cancel chan struct{}) {
}
// assignBytecodeTasks attempts to match idle peers to pending code retrievals.
func (s *Syncer) assignBytecodeTasks(cancel chan struct{}) {
func (s *Syncer) assignBytecodeTasks(success chan *bytecodeResponse, fail chan *bytecodeRequest, cancel chan struct{}) {
s.lock.Lock()
defer s.lock.Unlock()
@ -937,12 +940,14 @@ func (s *Syncer) assignBytecodeTasks(cancel chan struct{}) {
}
}
req := &bytecodeRequest{
peer: idle,
id: reqid,
cancel: cancel,
stale: make(chan struct{}),
hashes: hashes,
task: task,
peer: idle,
id: reqid,
deliver: success,
revert: fail,
cancel: cancel,
stale: make(chan struct{}),
hashes: hashes,
task: task,
}
req.timeout = time.AfterFunc(requestTimeout, func() {
peer.Log().Debug("Bytecode request timed out", "reqid", reqid)
@ -966,7 +971,7 @@ func (s *Syncer) assignBytecodeTasks(cancel chan struct{}) {
// assignStorageTasks attempts to match idle peers to pending storage range
// retrievals.
func (s *Syncer) assignStorageTasks(cancel chan struct{}) {
func (s *Syncer) assignStorageTasks(success chan *storageResponse, fail chan *storageRequest, cancel chan struct{}) {
s.lock.Lock()
defer s.lock.Unlock()
@ -1059,6 +1064,8 @@ func (s *Syncer) assignStorageTasks(cancel chan struct{}) {
req := &storageRequest{
peer: idle,
id: reqid,
deliver: success,
revert: fail,
cancel: cancel,
stale: make(chan struct{}),
accounts: accounts,
@ -1101,7 +1108,7 @@ func (s *Syncer) assignStorageTasks(cancel chan struct{}) {
// assignTrienodeHealTasks attempts to match idle peers to trie node requests to
// heal any trie errors caused by the snap sync's chunked retrieval model.
func (s *Syncer) assignTrienodeHealTasks(cancel chan struct{}) {
func (s *Syncer) assignTrienodeHealTasks(success chan *trienodeHealResponse, fail chan *trienodeHealRequest, cancel chan struct{}) {
s.lock.Lock()
defer s.lock.Unlock()
@ -1179,13 +1186,15 @@ func (s *Syncer) assignTrienodeHealTasks(cancel chan struct{}) {
}
}
req := &trienodeHealRequest{
peer: idle,
id: reqid,
cancel: cancel,
stale: make(chan struct{}),
hashes: hashes,
paths: paths,
task: s.healer,
peer: idle,
id: reqid,
deliver: success,
revert: fail,
cancel: cancel,
stale: make(chan struct{}),
hashes: hashes,
paths: paths,
task: s.healer,
}
req.timeout = time.AfterFunc(requestTimeout, func() {
peer.Log().Debug("Trienode heal request timed out", "reqid", reqid)
@ -1209,7 +1218,7 @@ func (s *Syncer) assignTrienodeHealTasks(cancel chan struct{}) {
// assignBytecodeHealTasks attempts to match idle peers to bytecode requests to
// heal any trie errors caused by the snap sync's chunked retrieval model.
func (s *Syncer) assignBytecodeHealTasks(cancel chan struct{}) {
func (s *Syncer) assignBytecodeHealTasks(success chan *bytecodeHealResponse, fail chan *bytecodeHealRequest, cancel chan struct{}) {
s.lock.Lock()
defer s.lock.Unlock()
@ -1280,12 +1289,14 @@ func (s *Syncer) assignBytecodeHealTasks(cancel chan struct{}) {
}
}
req := &bytecodeHealRequest{
peer: idle,
id: reqid,
cancel: cancel,
stale: make(chan struct{}),
hashes: hashes,
task: s.healer,
peer: idle,
id: reqid,
deliver: success,
revert: fail,
cancel: cancel,
stale: make(chan struct{}),
hashes: hashes,
task: s.healer,
}
req.timeout = time.AfterFunc(requestTimeout, func() {
peer.Log().Debug("Bytecode heal request timed out", "reqid", reqid)
@ -1366,7 +1377,7 @@ func (s *Syncer) revertRequests(peer string) {
// request and return all failed retrieval tasks to the scheduler for reassignment.
func (s *Syncer) scheduleRevertAccountRequest(req *accountRequest) {
select {
case s.accountReqFails <- req:
case req.revert <- req:
// Sync event loop notified
case <-req.cancel:
// Sync cycle got cancelled
@ -1407,7 +1418,7 @@ func (s *Syncer) revertAccountRequest(req *accountRequest) {
// and return all failed retrieval tasks to the scheduler for reassignment.
func (s *Syncer) scheduleRevertBytecodeRequest(req *bytecodeRequest) {
select {
case s.bytecodeReqFails <- req:
case req.revert <- req:
// Sync event loop notified
case <-req.cancel:
// Sync cycle got cancelled
@ -1448,7 +1459,7 @@ func (s *Syncer) revertBytecodeRequest(req *bytecodeRequest) {
// request and return all failed retrieval tasks to the scheduler for reassignment.
func (s *Syncer) scheduleRevertStorageRequest(req *storageRequest) {
select {
case s.storageReqFails <- req:
case req.revert <- req:
// Sync event loop notified
case <-req.cancel:
// Sync cycle got cancelled
@ -1493,7 +1504,7 @@ func (s *Syncer) revertStorageRequest(req *storageRequest) {
// request and return all failed retrieval tasks to the scheduler for reassignment.
func (s *Syncer) scheduleRevertTrienodeHealRequest(req *trienodeHealRequest) {
select {
case s.trienodeHealReqFails <- req:
case req.revert <- req:
// Sync event loop notified
case <-req.cancel:
// Sync cycle got cancelled
@ -1534,7 +1545,7 @@ func (s *Syncer) revertTrienodeHealRequest(req *trienodeHealRequest) {
// request and return all failed retrieval tasks to the scheduler for reassignment.
func (s *Syncer) scheduleRevertBytecodeHealRequest(req *bytecodeHealRequest) {
select {
case s.bytecodeHealReqFails <- req:
case req.revert <- req:
// Sync event loop notified
case <-req.cancel:
// Sync cycle got cancelled
@ -2147,7 +2158,7 @@ func (s *Syncer) OnAccounts(peer SyncPeer, id uint64, hashes []common.Hash, acco
cont: cont,
}
select {
case s.accountResps <- response:
case req.deliver <- response:
case <-req.cancel:
case <-req.stale:
}
@ -2253,7 +2264,7 @@ func (s *Syncer) onByteCodes(peer SyncPeer, id uint64, bytecodes [][]byte) error
codes: codes,
}
select {
case s.bytecodeResps <- response:
case req.deliver <- response:
case <-req.cancel:
case <-req.stale:
}
@ -2411,7 +2422,7 @@ func (s *Syncer) OnStorage(peer SyncPeer, id uint64, hashes [][]common.Hash, slo
cont: cont,
}
select {
case s.storageResps <- response:
case req.deliver <- response:
case <-req.cancel:
case <-req.stale:
}
@ -2505,7 +2516,7 @@ func (s *Syncer) OnTrieNodes(peer SyncPeer, id uint64, trienodes [][]byte) error
nodes: nodes,
}
select {
case s.trienodeHealResps <- response:
case req.deliver <- response:
case <-req.cancel:
case <-req.stale:
}
@ -2598,7 +2609,7 @@ func (s *Syncer) onHealByteCodes(peer SyncPeer, id uint64, bytecodes [][]byte) e
codes: codes,
}
select {
case s.bytecodeHealResps <- response:
case req.deliver <- response:
case <-req.cancel:
case <-req.stale:
}