contrib: make linearize-data.py cope with out-of-order blocks

Make it possible to read blocks in any order. This will be required after headers-first (#4468), so should be merged before that.

- Read block header. For expected blocks, continue, else skip.
- For in-order blocks: copy block contents directly. Write prior out-of-order blocks if this connects a consecutive span.
- For out-of-order blocks, store extents of block data for later retrieval. Cache out-of-order blocks in memory up to 100MB (configurable).
author: Wladimir J. van der Laan <laanwj@gmail.com> 2014-10-06 17:55:55 +0200
committer: Wladimir J. van der Laan <laanwj@gmail.com> 2014-10-06 18:30:12 +0200
commit: aedc74dfa688306c5a139a88782da74f69ba6757 (patch)
tree: 9e7b2fd87c426010465298c3ab5d72ba54bad66d /contrib/linearize
parent: 5505a1b13f75af9f0f6421b42d97c06e079db345 (diff)
2 files changed, 162 insertions, 98 deletions
diff --git a/contrib/linearize/example-linearize.cfg b/contrib/linearize/example-linearize.cfg
index 071345f23a..e0fef13886 100644
--- a/contrib/linearize/example-linearize.cfg
+++ b/contrib/linearize/example-linearize.cfg
@@ -15,3 +15,5 @@ output_file=/home/example/Downloads/bootstrap.dat
 hashlist=hashlist.txt
 split_year=1
 
+# Maximum size in bytes of out-of-order blocks cache in memory
+out_of_order_cache_sz = 100000000
diff --git a/contrib/linearize/linearize-data.py b/contrib/linearize/linearize-data.py
index 3b5d198c14..2dac3a614b 100755
--- a/contrib/linearize/linearize-data.py
+++ b/contrib/linearize/linearize-data.py
@@ -2,11 +2,12 @@
 #
 # linearize-data.py: Construct a linear, no-fork version of the chain.
 #
-# Copyright (c) 2013 The Bitcoin developers
+# Copyright (c) 2013-2014 The Bitcoin developers
 # Distributed under the MIT/X11 software license, see the accompanying
 # file COPYING or http://www.opensource.org/licenses/mit-license.php.
 #
 
+from __future__ import print_function, division
 import json
 import struct
 import re
@@ -17,10 +18,10 @@ import sys
 import hashlib
 import datetime
 import time
+from collections import namedtuple
 
 settings = {}
 
-
 def uint32(x):
 	return x & 0xffffffffL
 
@@ -78,116 +79,174 @@ def get_block_hashes(settings):
 
 	return blkindex
 
-def mkblockset(blkindex):
+def mkblockmap(blkindex):
 	blkmap = {}
-	for hash in blkindex:
-		blkmap[hash] = True
+	for height,hash in enumerate(blkindex):
+		blkmap[hash] = height
 	return blkmap
 
-def copydata(settings, blkindex, blkset):
-	inFn = 0
-	inF = None
-	outFn = 0
-	outsz = 0
-	outF = None
-	outFname = None
-	blkCount = 0
-
-	lastDate = datetime.datetime(2000, 1, 1)
-	highTS = 1408893517 - 315360000
-	timestampSplit = False
-	fileOutput = True
-	setFileTime = False
-	maxOutSz = settings['max_out_sz']
-	if 'output' in settings:
-		fileOutput = False
-	if settings['file_timestamp'] != 0:
-		setFileTime = True
-	if settings['split_timestamp'] != 0:
-		timestampSplit = True
-
-	while True:
-		if not inF:
-			fname = "%s/blk%05d.dat" % (settings['input'], inFn)
-			print("Input file" + fname)
-			try:
-				inF = open(fname, "rb")
-			except IOError:
-				print "Done"
-				return
-
-		inhdr = inF.read(8)
-		if (not inhdr or (inhdr[0] == "\0")):
-			inF.close()
-			inF = None
-			inFn = inFn + 1
-			continue
-
-		inMagic = inhdr[:4]
-		if (inMagic != settings['netmagic']):
-			print("Invalid magic:" + inMagic)
-			return
-		inLenLE = inhdr[4:]
-		su = struct.unpack("<I", inLenLE)
-		inLen = su[0]
-		rawblock = inF.read(inLen)
-		blk_hdr = rawblock[:80]
-
-		hash_str = calc_hash_str(blk_hdr)
-		if not hash_str in blkset:
-			print("Skipping unknown block " + hash_str)
-			continue
-
-		if blkindex[blkCount] != hash_str:
-			print("Out of order block.")
-			print("Expected " + blkindex[blkCount])
-			print("Got " + hash_str)
-			sys.exit(1)
-
-		if not fileOutput and ((outsz + inLen) > maxOutSz):
-			outF.close()
-			if setFileTime:
+# Block header and extent on disk
+BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])
+
+class BlockDataCopier:
+	def __init__(self, settings, blkindex, blkmap):
+		self.settings = settings
+		self.blkindex = blkindex
+		self.blkmap = blkmap
+
+		self.inFn = 0
+		self.inF = None
+		self.outFn = 0
+		self.outsz = 0
+		self.outF = None
+		self.outFname = None
+		self.blkCountIn = 0
+		self.blkCountOut = 0
+
+		self.lastDate = datetime.datetime(2000, 1, 1)
+		self.highTS = 1408893517 - 315360000
+		self.timestampSplit = False
+		self.fileOutput = True
+		self.setFileTime = False
+		self.maxOutSz = settings['max_out_sz']
+		if 'output' in settings:
+			self.fileOutput = False
+		if settings['file_timestamp'] != 0:
+			self.setFileTime = True
+		if settings['split_timestamp'] != 0:
+			self.timestampSplit = True
+		# Extents and cache for out-of-order blocks
+		self.blockExtents = {}
+		self.outOfOrderData = {}
+		self.outOfOrderSize = 0 # running total size for items in outOfOrderData
+
+	def writeBlock(self, inhdr, blk_hdr, rawblock):
+		if not self.fileOutput and ((self.outsz + self.inLen) > self.maxOutSz):
+			self.outF.close()
+			if self.setFileTime:
 				os.utime(outFname, (int(time.time()), highTS))
-			outF = None
-			outFname = None
-			outFn = outFn + 1
-			outsz = 0
+			self.outF = None
+			self.outFname = None
+			self.outFn = outFn + 1
+			self.outsz = 0
 
 		(blkDate, blkTS) = get_blk_dt(blk_hdr)
-		if timestampSplit and (blkDate > lastDate):
+		if self.timestampSplit and (blkDate > self.lastDate):
 			print("New month " + blkDate.strftime("%Y-%m") + " @ " + hash_str)
 			lastDate = blkDate
 			if outF:
 				outF.close()
 				if setFileTime:
 					os.utime(outFname, (int(time.time()), highTS))
-				outF = None
-				outFname = None
-				outFn = outFn + 1
-				outsz = 0
-
-		if not outF:
-			if fileOutput:
-				outFname = settings['output_file']
+				self.outF = None
+				self.outFname = None
+				self.outFn = self.outFn + 1
+				self.outsz = 0
+
+		if not self.outF:
+			if self.fileOutput:
+				outFname = self.settings['output_file']
 			else:
-				outFname = "%s/blk%05d.dat" % (settings['output'], outFn)
+				outFname = "%s/blk%05d.dat" % (self.settings['output'], outFn)
 			print("Output file" + outFname)
-			outF = open(outFname, "wb")
-
-		outF.write(inhdr)
-		outF.write(rawblock)
-		outsz = outsz + inLen + 8
-
-		blkCount = blkCount + 1
-		if blkTS > highTS:
-			highTS = blkTS
-
-		if (blkCount % 1000) == 0:
-			print("Wrote " + str(blkCount) + " blocks")
+			self.outF = open(outFname, "wb")
+
+		self.outF.write(inhdr)
+		self.outF.write(blk_hdr)
+		self.outF.write(rawblock)
+		self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)
+
+		self.blkCountOut = self.blkCountOut + 1
+		if blkTS > self.highTS:
+			self.highTS = blkTS
+
+		if (self.blkCountOut % 1000) == 0:
+			print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' % 
+					(self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex)))
+
+	def inFileName(self, fn):
+		return "%s/blk%05d.dat" % (self.settings['input'], fn)
+
+	def fetchBlock(self, extent):
+		'''Fetch block contents from disk given extents'''
+		with open(self.inFileName(extent.fn), "rb") as f:
+			f.seek(extent.offset)
+			return f.read(extent.size)
+
+	def copyOneBlock(self):
+		'''Find the next block to be written in the input, and copy it to the output.'''
+		extent = self.blockExtents.pop(self.blkCountOut)
+		if self.blkCountOut in self.outOfOrderData:
+			# If the data is cached, use it from memory and remove from the cache
+			rawblock = self.outOfOrderData.pop(self.blkCountOut)
+			self.outOfOrderSize -= len(rawblock)
+		else: # Otherwise look up data on disk
+			rawblock = self.fetchBlock(extent)
+
+		self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)
+
+	def run(self):
+		while self.blkCountOut < len(self.blkindex):
+			if not self.inF:
+				fname = self.inFileName(self.inFn)
+				print("Input file" + fname)
+				try:
+					self.inF = open(fname, "rb")
+				except IOError:
+					print("Premature end of block data")
+					return
+
+			inhdr = self.inF.read(8)
+			if (not inhdr or (inhdr[0] == "\0")):
+				self.inF.close()
+				self.inF = None
+				self.inFn = self.inFn + 1
+				continue
+
+			inMagic = inhdr[:4]
+			if (inMagic != self.settings['netmagic']):
+				print("Invalid magic:" + inMagic)
+				return
+			inLenLE = inhdr[4:]
+			su = struct.unpack("<I", inLenLE)
+			inLen = su[0] - 80 # length without header
+			blk_hdr = self.inF.read(80)
+			inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)
+
+			hash_str = calc_hash_str(blk_hdr)
+			if not hash_str in blkmap:
+				print("Skipping unknown block " + hash_str)
+				self.inF.seek(inLen, os.SEEK_CUR)
+				continue
+
+			blkHeight = self.blkmap[hash_str]
+			self.blkCountIn += 1
+
+			if self.blkCountOut == blkHeight:
+				# If in-order block, just copy
+				rawblock = self.inF.read(inLen)
+				self.writeBlock(inhdr, blk_hdr, rawblock)
+
+				# See if we can catch up to prior out-of-order blocks
+				while self.blkCountOut in self.blockExtents:
+					self.copyOneBlock()
+
+			else: # If out-of-order, skip over block data for now
+				self.blockExtents[blkHeight] = inExtent
+				if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
+					# If there is space in the cache, read the data
+					# Reading the data in file sequence instead of seeking and fetching it later is preferred,
+					# but we don't want to fill up memory
+					self.outOfOrderData[blkHeight] = self.inF.read(inLen)
+					self.outOfOrderSize += inLen
+				else: # If no space in cache, seek forward
+					self.inF.seek(inLen, os.SEEK_CUR)
+
+		print("Done (%i blocks written)" % (self.blkCountOut))
 
 if __name__ == '__main__':
 	if len(sys.argv) != 2:
-		print "Usage: linearize-data.py CONFIG-FILE"
+		print("Usage: linearize-data.py CONFIG-FILE")
 		sys.exit(1)
 
 	f = open(sys.argv[1])
@@ -216,22 +275,25 @@ if __name__ == '__main__':
 		settings['split_timestamp'] = 0
 	if 'max_out_sz' not in settings:
 		settings['max_out_sz'] = 1000L * 1000 * 1000
+	if 'out_of_order_cache_sz' not in settings:
+		settings['out_of_order_cache_sz'] = 100 * 1000 * 1000
 
 	settings['max_out_sz'] = long(settings['max_out_sz'])
 	settings['split_timestamp'] = int(settings['split_timestamp'])
 	settings['file_timestamp'] = int(settings['file_timestamp'])
 	settings['netmagic'] = settings['netmagic'].decode('hex')
+	settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
 
 	if 'output_file' not in settings and 'output' not in settings:
 		print("Missing output file / directory")
 		sys.exit(1)
 
 	blkindex = get_block_hashes(settings)
-	blkset = mkblockset(blkindex)
+	blkmap = mkblockmap(blkindex)
 
-	if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkset:
+	if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkmap:
 		print("not found")
 	else:
-		copydata(settings, blkindex, blkset)
+		BlockDataCopier(settings, blkindex, blkmap).run()
author	Wladimir J. van der Laan <laanwj@gmail.com>	2014-10-06 17:55:55 +0200
committer	Wladimir J. van der Laan <laanwj@gmail.com>	2014-10-06 18:30:12 +0200
commit	aedc74dfa688306c5a139a88782da74f69ba6757 (patch)
tree	9e7b2fd87c426010465298c3ab5d72ba54bad66d /contrib/linearize
parent	5505a1b13f75af9f0f6421b42d97c06e079db345 (diff)