diff options
author | Wladimir J. van der Laan <laanwj@gmail.com> | 2014-10-06 17:55:55 +0200 |
---|---|---|
committer | Wladimir J. van der Laan <laanwj@gmail.com> | 2014-10-06 18:30:12 +0200 |
commit | aedc74dfa688306c5a139a88782da74f69ba6757 (patch) | |
tree | 9e7b2fd87c426010465298c3ab5d72ba54bad66d /contrib/linearize | |
parent | 5505a1b13f75af9f0f6421b42d97c06e079db345 (diff) |
contrib: make linearize-data.py cope with out-of-order blocks
Make it possible to read blocks in any order. This will be required
after headers-first (#4468), so this change should be merged before that.
- Read the block header. If the block is an expected one, continue processing it; otherwise skip it.
- For in-order blocks: copy block contents directly. Write prior
out-of-order blocks if this connects a consecutive span.
- For out-of-order blocks, store extents of block data for later
retrieval. Cache out-of-order blocks in memory up to 100MB
(configurable via the out_of_order_cache_sz setting).
Diffstat (limited to 'contrib/linearize')
-rw-r--r-- | contrib/linearize/example-linearize.cfg | 2 | ||||
-rwxr-xr-x | contrib/linearize/linearize-data.py | 258 |
2 files changed, 162 insertions, 98 deletions
diff --git a/contrib/linearize/example-linearize.cfg b/contrib/linearize/example-linearize.cfg index 071345f23a..e0fef13886 100644 --- a/contrib/linearize/example-linearize.cfg +++ b/contrib/linearize/example-linearize.cfg @@ -15,3 +15,5 @@ output_file=/home/example/Downloads/bootstrap.dat hashlist=hashlist.txt split_year=1 +# Maxmimum size in bytes of out-of-order blocks cache in memory +out_of_order_cache_sz = 100000000 diff --git a/contrib/linearize/linearize-data.py b/contrib/linearize/linearize-data.py index 3b5d198c14..2dac3a614b 100755 --- a/contrib/linearize/linearize-data.py +++ b/contrib/linearize/linearize-data.py @@ -2,11 +2,12 @@ # # linearize-data.py: Construct a linear, no-fork version of the chain. # -# Copyright (c) 2013 The Bitcoin developers +# Copyright (c) 2013-2014 The Bitcoin developers # Distributed under the MIT/X11 software license, see the accompanying # file COPYING or http://www.opensource.org/licenses/mit-license.php. # +from __future__ import print_function, division import json import struct import re @@ -17,10 +18,10 @@ import sys import hashlib import datetime import time +from collections import namedtuple settings = {} - def uint32(x): return x & 0xffffffffL @@ -78,116 +79,174 @@ def get_block_hashes(settings): return blkindex -def mkblockset(blkindex): +def mkblockmap(blkindex): blkmap = {} - for hash in blkindex: - blkmap[hash] = True + for height,hash in enumerate(blkindex): + blkmap[hash] = height return blkmap -def copydata(settings, blkindex, blkset): - inFn = 0 - inF = None - outFn = 0 - outsz = 0 - outF = None - outFname = None - blkCount = 0 - - lastDate = datetime.datetime(2000, 1, 1) - highTS = 1408893517 - 315360000 - timestampSplit = False - fileOutput = True - setFileTime = False - maxOutSz = settings['max_out_sz'] - if 'output' in settings: - fileOutput = False - if settings['file_timestamp'] != 0: - setFileTime = True - if settings['split_timestamp'] != 0: - timestampSplit = True - - while True: - if not inF: - 
fname = "%s/blk%05d.dat" % (settings['input'], inFn) - print("Input file" + fname) - try: - inF = open(fname, "rb") - except IOError: - print "Done" - return - - inhdr = inF.read(8) - if (not inhdr or (inhdr[0] == "\0")): - inF.close() - inF = None - inFn = inFn + 1 - continue - - inMagic = inhdr[:4] - if (inMagic != settings['netmagic']): - print("Invalid magic:" + inMagic) - return - inLenLE = inhdr[4:] - su = struct.unpack("<I", inLenLE) - inLen = su[0] - rawblock = inF.read(inLen) - blk_hdr = rawblock[:80] - - hash_str = calc_hash_str(blk_hdr) - if not hash_str in blkset: - print("Skipping unknown block " + hash_str) - continue - - if blkindex[blkCount] != hash_str: - print("Out of order block.") - print("Expected " + blkindex[blkCount]) - print("Got " + hash_str) - sys.exit(1) - - if not fileOutput and ((outsz + inLen) > maxOutSz): - outF.close() - if setFileTime: +# Block header and extent on disk +BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size']) + +class BlockDataCopier: + def __init__(self, settings, blkindex, blkmap): + self.settings = settings + self.blkindex = blkindex + self.blkmap = blkmap + + self.inFn = 0 + self.inF = None + self.outFn = 0 + self.outsz = 0 + self.outF = None + self.outFname = None + self.blkCountIn = 0 + self.blkCountOut = 0 + + self.lastDate = datetime.datetime(2000, 1, 1) + self.highTS = 1408893517 - 315360000 + self.timestampSplit = False + self.fileOutput = True + self.setFileTime = False + self.maxOutSz = settings['max_out_sz'] + if 'output' in settings: + self.fileOutput = False + if settings['file_timestamp'] != 0: + self.setFileTime = True + if settings['split_timestamp'] != 0: + self.timestampSplit = True + # Extents and cache for out-of-order blocks + self.blockExtents = {} + self.outOfOrderData = {} + self.outOfOrderSize = 0 # running total size for items in outOfOrderData + + def writeBlock(self, inhdr, blk_hdr, rawblock): + if not self.fileOutput and ((self.outsz + self.inLen) > 
self.maxOutSz): + self.outF.close() + if self.setFileTime: os.utime(outFname, (int(time.time()), highTS)) - outF = None - outFname = None - outFn = outFn + 1 - outsz = 0 + self.outF = None + self.outFname = None + self.outFn = outFn + 1 + self.outsz = 0 (blkDate, blkTS) = get_blk_dt(blk_hdr) - if timestampSplit and (blkDate > lastDate): + if self.timestampSplit and (blkDate > self.lastDate): print("New month " + blkDate.strftime("%Y-%m") + " @ " + hash_str) lastDate = blkDate if outF: outF.close() if setFileTime: os.utime(outFname, (int(time.time()), highTS)) - outF = None - outFname = None - outFn = outFn + 1 - outsz = 0 - - if not outF: - if fileOutput: - outFname = settings['output_file'] + self.outF = None + self.outFname = None + self.outFn = self.outFn + 1 + self.outsz = 0 + + if not self.outF: + if self.fileOutput: + outFname = self.settings['output_file'] else: - outFname = "%s/blk%05d.dat" % (settings['output'], outFn) + outFname = "%s/blk%05d.dat" % (self.settings['output'], outFn) print("Output file" + outFname) - outF = open(outFname, "wb") - - outF.write(inhdr) - outF.write(rawblock) - outsz = outsz + inLen + 8 - - blkCount = blkCount + 1 - if blkTS > highTS: - highTS = blkTS - - if (blkCount % 1000) == 0: - print("Wrote " + str(blkCount) + " blocks") + self.outF = open(outFname, "wb") + + self.outF.write(inhdr) + self.outF.write(blk_hdr) + self.outF.write(rawblock) + self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock) + + self.blkCountOut = self.blkCountOut + 1 + if blkTS > self.highTS: + self.highTS = blkTS + + if (self.blkCountOut % 1000) == 0: + print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' % + (self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex))) + + def inFileName(self, fn): + return "%s/blk%05d.dat" % (self.settings['input'], fn) + + def fetchBlock(self, extent): + '''Fetch block contents from disk given extents''' + with open(self.inFileName(extent.fn), 
"rb") as f: + f.seek(extent.offset) + return f.read(extent.size) + + def copyOneBlock(self): + '''Find the next block to be written in the input, and copy it to the output.''' + extent = self.blockExtents.pop(self.blkCountOut) + if self.blkCountOut in self.outOfOrderData: + # If the data is cached, use it from memory and remove from the cache + rawblock = self.outOfOrderData.pop(self.blkCountOut) + self.outOfOrderSize -= len(rawblock) + else: # Otherwise look up data on disk + rawblock = self.fetchBlock(extent) + + self.writeBlock(extent.inhdr, extent.blkhdr, rawblock) + + def run(self): + while self.blkCountOut < len(self.blkindex): + if not self.inF: + fname = self.inFileName(self.inFn) + print("Input file" + fname) + try: + self.inF = open(fname, "rb") + except IOError: + print("Premature end of block data") + return + + inhdr = self.inF.read(8) + if (not inhdr or (inhdr[0] == "\0")): + self.inF.close() + self.inF = None + self.inFn = self.inFn + 1 + continue + + inMagic = inhdr[:4] + if (inMagic != self.settings['netmagic']): + print("Invalid magic:" + inMagic) + return + inLenLE = inhdr[4:] + su = struct.unpack("<I", inLenLE) + inLen = su[0] - 80 # length without header + blk_hdr = self.inF.read(80) + inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen) + + hash_str = calc_hash_str(blk_hdr) + if not hash_str in blkmap: + print("Skipping unknown block " + hash_str) + self.inF.seek(inLen, os.SEEK_CUR) + continue + + blkHeight = self.blkmap[hash_str] + self.blkCountIn += 1 + + if self.blkCountOut == blkHeight: + # If in-order block, just copy + rawblock = self.inF.read(inLen) + self.writeBlock(inhdr, blk_hdr, rawblock) + + # See if we can catch up to prior out-of-order blocks + while self.blkCountOut in self.blockExtents: + self.copyOneBlock() + + else: # If out-of-order, skip over block data for now + self.blockExtents[blkHeight] = inExtent + if self.outOfOrderSize < self.settings['out_of_order_cache_sz']: + # If there is space in the cache, 
read the data + # Reading the data in file sequence instead of seeking and fetching it later is preferred, + # but we don't want to fill up memory + self.outOfOrderData[blkHeight] = self.inF.read(inLen) + self.outOfOrderSize += inLen + else: # If no space in cache, seek forward + self.inF.seek(inLen, os.SEEK_CUR) + + print("Done (%i blocks written)" % (self.blkCountOut)) if __name__ == '__main__': if len(sys.argv) != 2: - print "Usage: linearize-data.py CONFIG-FILE" + print("Usage: linearize-data.py CONFIG-FILE") sys.exit(1) f = open(sys.argv[1]) @@ -216,22 +275,25 @@ if __name__ == '__main__': settings['split_timestamp'] = 0 if 'max_out_sz' not in settings: settings['max_out_sz'] = 1000L * 1000 * 1000 + if 'out_of_order_cache_sz' not in settings: + settings['out_of_order_cache_sz'] = 100 * 1000 * 1000 settings['max_out_sz'] = long(settings['max_out_sz']) settings['split_timestamp'] = int(settings['split_timestamp']) settings['file_timestamp'] = int(settings['file_timestamp']) settings['netmagic'] = settings['netmagic'].decode('hex') + settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz']) if 'output_file' not in settings and 'output' not in settings: print("Missing output file / directory") sys.exit(1) blkindex = get_block_hashes(settings) - blkset = mkblockset(blkindex) + blkmap = mkblockmap(blkindex) - if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkset: + if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkmap: print("not found") else: - copydata(settings, blkindex, blkset) + BlockDataCopier(settings, blkindex, blkmap).run() |