1 files changed, 322 insertions, 0 deletions
diff --git a/contrib/linearize/linearize-data.py b/contrib/linearize/linearize-data.py
new file mode 100755
index 0000000000..afcec2b60a
--- /dev/null
+++ b/contrib/linearize/linearize-data.py
@@ -0,0 +1,322 @@
+#!/usr/bin/env python3
+#
+# linearize-data.py: Construct a linear, no-fork version of the chain.
+#
+# Copyright (c) 2013-2016 The Bitcoin Core developers
+# Distributed under the MIT software license, see the accompanying
+# file COPYING or http://www.opensource.org/licenses/mit-license.php.
+#
+
+from __future__ import print_function, division
+import struct
+import re
+import os
+import os.path
+import sys
+import hashlib
+import datetime
+import time
+from collections import namedtuple
+from binascii import hexlify, unhexlify
+
+settings = {}
+
+##### Switch endian-ness #####
+def hex_switchEndian(s):
+	""" Reverse the order of the two-character (one byte) groups of a hex string. """
+	# Visit the pair start offsets from the last pair to the first, so the
+	# pieces are produced already in reversed order.
+	starts = reversed(range(0, len(s), 2))
+	return b''.join(s[pos:pos+2].encode() for pos in starts).decode()
+
+def uint32(x):
+	"""Return x truncated to its low 32 bits."""
+	low32_mask = (1 << 32) - 1
+	return x & low32_mask
+
+def bytereverse(x):
+	"""Return the 32-bit value x with its four bytes in reversed order.
+
+	Bits above the low 32 of the combined value are discarded.
+	"""
+	to_top = x << 24
+	to_upper_mid = (x << 8) & 0x00ff0000
+	to_lower_mid = (x >> 8) & 0x0000ff00
+	to_bottom = x >> 24
+	return (to_top | to_upper_mid | to_lower_mid | to_bottom) & 0xffffffff
+
+def bufreverse(in_buf):
+	"""Return in_buf with the bytes inside every 4-byte word reversed.
+
+	The words themselves stay in their original order.  A trailing chunk
+	shorter than four bytes raises struct.error.
+	"""
+	swapped = []
+	for start in range(0, len(in_buf), 4):
+		# Reading the word in one byte order and writing it back in the
+		# opposite one reverses its bytes.
+		(word,) = struct.unpack('<I', in_buf[start:start+4])
+		swapped.append(struct.pack('>I', word))
+	return b''.join(swapped)
+
+def wordreverse(in_buf):
+	"""Return in_buf with its 4-byte words in reversed order.
+
+	The bytes inside each word are left untouched; a final chunk shorter
+	than four bytes is moved as a unit.
+	"""
+	starts = reversed(range(0, len(in_buf), 4))
+	return b''.join(in_buf[pos:pos+4] for pos in starts)
+
+def calc_hdr_hash(blk_hdr):
+	"""Return the raw 32-byte double SHA-256 digest of blk_hdr."""
+	first_pass = hashlib.sha256(blk_hdr).digest()
+	return hashlib.sha256(first_pass).digest()
+
+def calc_hash_str(blk_hdr):
+	"""Return the block hash of blk_hdr as a hex string.
+
+	The double SHA-256 digest is byte-swapped within each 4-byte word and
+	then has its words reversed before being hex-encoded.
+	"""
+	digest = calc_hdr_hash(blk_hdr)
+	reordered = wordreverse(bufreverse(digest))
+	return hexlify(reordered).decode('utf-8')
+
+def get_blk_dt(blk_hdr):
+	"""Return (first day of the block's month, nTime) for a block header.
+
+	nTime is the little-endian unsigned 32-bit value at byte offset 68 of
+	blk_hdr.  It is converted with datetime.fromtimestamp(), i.e. in the
+	local timezone of the machine running the script.
+	"""
+	(nTime,) = struct.unpack("<I", blk_hdr[68:72])
+	blk_time = datetime.datetime.fromtimestamp(nTime)
+	month_start = datetime.datetime(blk_time.year, blk_time.month, 1)
+	return (month_start, nTime)
+
+# When getting the list of block hashes, undo any byte reversals.
+def get_block_hashes(settings):
+	"""Read the ordered list of block hashes from the hashlist file.
+
+	settings -- dict providing 'hashlist' (path of a text file with one hex
+	            hash per line) and 'rev_hash_bytes' (the string 'true' when
+	            the hashes in the file are byte-reversed)
+
+	Returns a list of hash strings in file order; the list index is used by
+	the rest of the script as the block height.  Blank lines are ignored.
+	"""
+	blkindex = []
+	# Use a context manager so the file is closed even if reading fails.
+	with open(settings['hashlist'], "r") as f:
+		for line in f:
+			line = line.rstrip()
+			if not line:
+				# An empty entry can never match a block hash, but it would
+				# still occupy a height and keep the copy from completing.
+				continue
+			if settings['rev_hash_bytes'] == 'true':
+				line = hex_switchEndian(line)
+			blkindex.append(line)
+
+	print("Read " + str(len(blkindex)) + " hashes")
+
+	return blkindex
+
+# The block map shouldn't give or receive byte-reversed hashes.
+def mkblockmap(blkindex):
+	"""Return a dict mapping each hash in blkindex to its list position (height).
+
+	If a hash occurs more than once, its last position wins.
+	"""
+	return {blk_hash: height for height, blk_hash in enumerate(blkindex)}
+
+# Location of one block in the input files, plus the headers already read:
+#   fn     - number of the blk*.dat input file
+#   offset - file position just after the 80-byte block header
+#   inhdr  - the 8 bytes (magic + length) preceding the block
+#   blkhdr - the 80-byte block header
+#   size   - number of block bytes that follow the block header
+BlockExtent = namedtuple('BlockExtent', 'fn offset inhdr blkhdr size')
+
+class BlockDataCopier:
+	"""Copy blocks from numbered blk*.dat input files to the output in hashlist order.
+
+	The input files are scanned sequentially.  A block whose height equals
+	the number of blocks already written is copied immediately; any other
+	known block is remembered (and cached in memory while the cache has
+	room) until its turn comes.  Blocks whose hash is not in blkmap are
+	skipped.
+	"""
+
+	def __init__(self, settings, blkindex, blkmap):
+		"""Set up the copier state.
+
+		settings -- dict of parsed configuration values; 'max_out_sz',
+		            'file_timestamp' and 'split_timestamp' are read here,
+		            the remaining keys by the methods that need them
+		blkindex -- list of block hash strings, indexed by height
+		blkmap   -- dict mapping block hash string to height
+		"""
+		self.settings = settings
+		self.blkindex = blkindex
+		self.blkmap = blkmap
+
+		self.inFn = 0          # number of the input file being scanned
+		self.inF = None        # open input file, or None
+		self.outFn = 0         # number of the output file being written
+		self.outsz = 0         # bytes written to the current output file
+		self.outF = None       # open output file, or None
+		self.outFname = None
+		self.blkCountIn = 0    # known blocks seen in the input so far
+		self.blkCountOut = 0   # blocks written so far == next height to write
+		self.hash_str = None   # hash of the block most recently scanned by run()
+
+		self.lastDate = datetime.datetime(2000, 1, 1)
+		# Starting value for the newest block time written so far:
+		# 1408893517 minus 315360000 seconds (3650 days).
+		self.highTS = 1408893517 - 315360000
+		self.timestampSplit = False
+		self.fileOutput = True  # True: single 'output_file'; False: 'output' directory
+		self.setFileTime = False
+		self.maxOutSz = settings['max_out_sz']
+		if 'output' in settings:
+			self.fileOutput = False
+		if settings['file_timestamp'] != 0:
+			self.setFileTime = True
+		if settings['split_timestamp'] != 0:
+			self.timestampSplit = True
+		# Extents and cache for out-of-order blocks
+		self.blockExtents = {}
+		self.outOfOrderData = {}
+		self.outOfOrderSize = 0 # running total size for items in outOfOrderData
+
+	def _closeOutputFile(self):
+		'''Close the current output file and reset the per-file output state.'''
+		self.outF.close()
+		if self.setFileTime:
+			# atime = now, mtime = newest block time written so far
+			os.utime(self.outFname, (int(time.time()), self.highTS))
+		self.outF = None
+		self.outFname = None
+		self.outFn = self.outFn + 1
+		self.outsz = 0
+
+	def writeBlock(self, inhdr, blk_hdr, rawblock):
+		'''Append one block to the output, starting a new output file when needed.
+
+		inhdr    -- the 8 bytes (magic + length) preceding the block
+		blk_hdr  -- the 80-byte block header
+		rawblock -- the block bytes following the header
+		'''
+		blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock)
+		# Roll over on size only when a file is actually open; otherwise the
+		# first block of a file that alone exceeds max_out_sz would try to
+		# close a file that does not exist.
+		if not self.fileOutput and self.outF and ((self.outsz + blockSizeOnDisk) > self.maxOutSz):
+			self._closeOutputFile()
+
+		(blkDate, blkTS) = get_blk_dt(blk_hdr)
+		if self.timestampSplit and (blkDate > self.lastDate):
+			# Report the hash of the block being written; self.hash_str is the
+			# last block *scanned*, which differs for out-of-order blocks.
+			print("New month " + blkDate.strftime("%Y-%m") + " @ " + calc_hash_str(blk_hdr))
+			self.lastDate = blkDate
+			if self.outF:
+				self._closeOutputFile()
+
+		if not self.outF:
+			if self.fileOutput:
+				# NOTE(review): in single-file mode a month split reopens this
+				# same path with "wb" -- confirm split_timestamp is only meant
+				# to be combined with directory output.
+				self.outFname = self.settings['output_file']
+			else:
+				self.outFname = os.path.join(self.settings['output'], "blk%05d.dat" % self.outFn)
+			print("Output file " + self.outFname)
+			self.outF = open(self.outFname, "wb")
+
+		self.outF.write(inhdr)
+		self.outF.write(blk_hdr)
+		self.outF.write(rawblock)
+		self.outsz = self.outsz + blockSizeOnDisk
+
+		self.blkCountOut = self.blkCountOut + 1
+		if blkTS > self.highTS:
+			self.highTS = blkTS
+
+		if (self.blkCountOut % 1000) == 0:
+			print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' % 
+					(self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex)))
+
+	def inFileName(self, fn):
+		'''Return the path of input file number fn inside the input directory.'''
+		return os.path.join(self.settings['input'], "blk%05d.dat" % fn)
+
+	def fetchBlock(self, extent):
+		'''Fetch block contents from disk given extents'''
+		with open(self.inFileName(extent.fn), "rb") as f:
+			f.seek(extent.offset)
+			return f.read(extent.size)
+
+	def copyOneBlock(self):
+		'''Find the next block to be written in the input, and copy it to the output.'''
+		extent = self.blockExtents.pop(self.blkCountOut)
+		if self.blkCountOut in self.outOfOrderData:
+			# If the data is cached, use it from memory and remove from the cache
+			rawblock = self.outOfOrderData.pop(self.blkCountOut)
+			self.outOfOrderSize -= len(rawblock)
+		else: # Otherwise look up data on disk
+			rawblock = self.fetchBlock(extent)
+
+		self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)
+
+	def run(self):
+		'''Scan the input files and write every block of blkindex in height order.
+
+		Stops early, after printing a message, when the next input file
+		cannot be opened or a record does not start with the configured
+		network magic.  Open files are closed on every exit path.
+		'''
+		try:
+			while self.blkCountOut < len(self.blkindex):
+				if not self.inF:
+					fname = self.inFileName(self.inFn)
+					print("Input file " + fname)
+					try:
+						self.inF = open(fname, "rb")
+					except IOError:
+						print("Premature end of block data")
+						return
+
+				inhdr = self.inF.read(8)
+				# End of this file: nothing (or only a partial header) left, or
+				# the zero padding that follows the last block.  Slicing keeps
+				# the comparison bytes-vs-bytes; indexing bytes yields an int
+				# on Python 3 and would never equal a string.
+				if len(inhdr) < 8 or inhdr[:1] == b"\0":
+					self.inF.close()
+					self.inF = None
+					self.inFn = self.inFn + 1
+					continue
+
+				inMagic = inhdr[:4]
+				if (inMagic != self.settings['netmagic']):
+					print("Invalid magic: " + hexlify(inMagic).decode('utf-8'))
+					return
+				inLenLE = inhdr[4:]
+				su = struct.unpack("<I", inLenLE)
+				inLen = su[0] - 80 # length without header
+				blk_hdr = self.inF.read(80)
+				inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)
+
+				self.hash_str = calc_hash_str(blk_hdr)
+				if self.hash_str not in self.blkmap:
+					# Because blocks can be written to files out-of-order as of 0.10, the script
+					# may encounter blocks it doesn't know about. Treat as debug output.
+					if self.settings.get('debug_output') == 'true':
+						print("Skipping unknown block " + self.hash_str)
+					self.inF.seek(inLen, os.SEEK_CUR)
+					continue
+
+				blkHeight = self.blkmap[self.hash_str]
+				self.blkCountIn += 1
+
+				if blkHeight < self.blkCountOut or blkHeight in self.blockExtents:
+					# Second copy of a block that was already written or is
+					# already queued: storing it again would never be consumed
+					# and would inflate the cache accounting.
+					self.inF.seek(inLen, os.SEEK_CUR)
+					continue
+
+				if self.blkCountOut == blkHeight:
+					# If in-order block, just copy
+					rawblock = self.inF.read(inLen)
+					self.writeBlock(inhdr, blk_hdr, rawblock)
+
+					# See if we can catch up to prior out-of-order blocks
+					while self.blkCountOut in self.blockExtents:
+						self.copyOneBlock()
+
+				else: # If out-of-order, skip over block data for now
+					self.blockExtents[blkHeight] = inExtent
+					if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
+						# If there is space in the cache, read the data
+						# Reading the data in file sequence instead of seeking and fetching it later is preferred,
+						# but we don't want to fill up memory
+						self.outOfOrderData[blkHeight] = self.inF.read(inLen)
+						self.outOfOrderSize += inLen
+					else: # If no space in cache, seek forward
+						self.inF.seek(inLen, os.SEEK_CUR)
+
+			print("Done (%i blocks written)" % (self.blkCountOut))
+		finally:
+			if self.inF:
+				self.inF.close()
+				self.inF = None
+			if self.outF:
+				# Flush the last output file and give it the same timestamp
+				# treatment as the files closed on rollover.
+				self._closeOutputFile()
+
+if __name__ == '__main__':
+	# Entry point: read the KEY=VALUE configuration file named on the command
+	# line into the module-level `settings` dict, fill in defaults, convert
+	# value types, then linearize the block files.
+	if len(sys.argv) != 2:
+		print("Usage: linearize-data.py CONFIG-FILE")
+		sys.exit(1)
+
+	with open(sys.argv[1]) as f:
+		for line in f:
+			# skip comment lines
+			if re.search(r'^\s*#', line):
+				continue
+
+			# parse key=value lines
+			m = re.search(r'^(\w+)\s*=\s*(\S.*)$', line)
+			if m is None:
+				continue
+			settings[m.group(1)] = m.group(2)
+
+	# Defaults for every setting the configuration file did not provide.
+	# rev_hash_bytes comes first in case any later setting needs to know about it.
+	defaults = (
+		('rev_hash_bytes', 'false'),
+		('netmagic', 'f9beb4d9'),
+		('genesis', '000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f'),
+		('input', 'input'),
+		('hashlist', 'hashlist.txt'),
+		('file_timestamp', 0),
+		('split_timestamp', 0),
+		('max_out_sz', 1000 * 1000 * 1000),
+		('out_of_order_cache_sz', 100 * 1000 * 1000),
+		('debug_output', 'false'),
+	)
+	for key, value in defaults:
+		settings.setdefault(key, value)
+
+	# Force the textual flags to lowercase to make comparisons easier, and
+	# convert numeric / binary settings from their string form.
+	settings['rev_hash_bytes'] = settings['rev_hash_bytes'].lower()
+	settings['debug_output'] = settings['debug_output'].lower()
+	settings['max_out_sz'] = int(settings['max_out_sz'])
+	settings['split_timestamp'] = int(settings['split_timestamp'])
+	settings['file_timestamp'] = int(settings['file_timestamp'])
+	settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
+	settings['netmagic'] = unhexlify(settings['netmagic'].encode('utf-8'))
+
+	if 'output_file' not in settings and 'output' not in settings:
+		print("Missing output file / directory")
+		sys.exit(1)
+
+	blkindex = get_block_hashes(settings)
+	blkmap = mkblockmap(blkindex)
+
+	# Block hash map won't be byte-reversed. Neither should the genesis hash.
+	if settings['genesis'] not in blkmap:
+		print("Genesis block not found in hashlist")
+	else:
+		BlockDataCopier(settings, blkindex, blkmap).run()