aboutsummaryrefslogtreecommitdiff
path: root/contrib/linearize/linearize-data.py
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/linearize/linearize-data.py')
-rwxr-xr-xcontrib/linearize/linearize-data.py532
1 files changed, 266 insertions, 266 deletions
diff --git a/contrib/linearize/linearize-data.py b/contrib/linearize/linearize-data.py
index c609e9b336..f8aea27342 100755
--- a/contrib/linearize/linearize-data.py
+++ b/contrib/linearize/linearize-data.py
@@ -22,300 +22,300 @@ from binascii import hexlify, unhexlify
# Script configuration as a key -> value dict. It starts empty and is filled
# in from the config file when this module is run as a script.
settings = {}
def hex_switchEndian(s):
    """Switch the endianness of a hex string by reversing its 2-char pairs.

    For example "aabbcc" becomes "ccbbaa". If len(s) is odd, the trailing
    single character counts as a pair of its own and ends up at the front.
    """
    # Reorder the str slices directly; no bytes round-trip is needed.
    pairs = [s[i:i + 2] for i in range(0, len(s), 2)]
    return ''.join(reversed(pairs))
def uint32(x):
    """Return the low 32 bits of the integer x."""
    low_32_bits = (1 << 32) - 1
    return x & low_32_bits
def bytereverse(x):
    """Reverse the byte order of a 32-bit unsigned integer.

    Each of the four bytes of x is moved to its mirrored position and the
    combined result is masked to 32 bits.
    """
    lowest_to_top = x << 24
    second_to_third = (x << 8) & 0x00ff0000
    third_to_second = (x >> 8) & 0x0000ff00
    top_to_lowest = x >> 24
    combined = lowest_to_top | second_to_third | third_to_second | top_to_lowest
    return combined & 0xffffffff
def bufreverse(in_buf):
    """Reverse the bytes inside each 4-byte word of in_buf.

    The order of the words themselves is kept. Raises struct.error if
    in_buf does not split into whole 4-byte words.
    """
    # Reading a word as big-endian and writing it back as little-endian
    # swaps its four bytes.
    swapped_words = [
        struct.pack('<I', struct.unpack('>I', in_buf[pos:pos + 4])[0])
        for pos in range(0, len(in_buf), 4)
    ]
    return b''.join(swapped_words)
def wordreverse(in_buf):
    """Reverse the order of the 4-byte words in in_buf.

    Bytes inside each word keep their order. A trailing chunk shorter than
    4 bytes is treated as a word and moves to the front.
    """
    words = [in_buf[pos:pos + 4] for pos in range(0, len(in_buf), 4)]
    return b''.join(reversed(words))
def calc_hdr_hash(blk_hdr):
    """Return the raw double-SHA256 digest (32 bytes) of blk_hdr."""
    first_pass = hashlib.sha256(blk_hdr).digest()
    return hashlib.sha256(first_pass).digest()
def calc_hash_str(blk_hdr):
    """Return the hash of blk_hdr as a hex string in reversed byte order.

    The digest comes from calc_hdr_hash(); its bytes are reversed before
    hex-encoding.
    """
    digest = calc_hdr_hash(blk_hdr)
    # A single slice reversal is equivalent to bufreverse() followed by
    # wordreverse() on the 32-byte digest, and avoids shadowing the builtin
    # hash() with a local name.
    return hexlify(digest[::-1]).decode('utf-8')
def get_blk_dt(blk_hdr):
    """Read the timestamp field of a serialized block header.

    Returns a (month_start, nTime) tuple. nTime is the unsigned 32-bit
    little-endian integer stored in bytes 68..71 of blk_hdr. month_start is
    a naive datetime for the first day of the month that nTime falls in,
    as converted by datetime.fromtimestamp() (local time).
    """
    time_offset = 68
    (nTime,) = struct.unpack("<I", blk_hdr[time_offset:time_offset + 4])
    local_dt = datetime.datetime.fromtimestamp(nTime)
    month_start = datetime.datetime(local_dt.year, local_dt.month, 1)
    return (month_start, nTime)
# When getting the list of block hashes, undo any byte reversals.
def get_block_hashes(settings):
    """Read the block hash list named by settings['hashlist'].

    Each line, with trailing whitespace stripped, becomes one entry of the
    returned list, in file order. When settings['rev_hash_bytes'] is the
    string 'true', every entry is passed through hex_switchEndian() first.
    The number of entries read is printed before returning.
    """
    blkindex = []
    # The context manager closes the hashlist file even if a line fails to
    # convert.
    with open(settings['hashlist'], "r") as f:
        reverse_bytes = settings['rev_hash_bytes'] == 'true'
        for line in f:
            line = line.rstrip()
            if reverse_bytes:
                line = hex_switchEndian(line)
            blkindex.append(line)

    print("Read " + str(len(blkindex)) + " hashes")

    return blkindex
# The block map shouldn't give or receive byte-reversed hashes.
def mkblockmap(blkindex):
    """Return a dict mapping each block hash in blkindex to its height.

    The height is the entry's position in blkindex. If a hash occurs more
    than once, the last position wins.
    """
    # Named block_hash so the builtin hash() is not shadowed.
    return {block_hash: height for height, block_hash in enumerate(blkindex)}
# Block header and extent on disk
#   fn     - number of the input file holding the block (NNNNN in blkNNNNN.dat)
#   offset - file position just past the 80-byte block header, where the
#            remainder of the block starts
#   inhdr  - the 8 bytes preceding the block header (4-byte magic followed by
#            a 4-byte little-endian length)
#   blkhdr - the 80-byte block header
#   size   - number of bytes following the block header (stored length - 80)
BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])
class BlockDataCopier:
    """Copy blocks from numbered blk*.dat input files to the output in height order.

    Input files are scanned sequentially. A block whose height (its index in
    blkindex) is the next one due is written immediately. Any other known
    block is remembered by its on-disk extent, and cached in memory while the
    cache has room, until its turn comes. Blocks whose hash is not in blkmap
    are skipped.
    """

    def __init__(self, settings, blkindex, blkmap):
        """Store the configuration and reset all copy state.

        settings -- dict of parsed settings. Keys read by this class:
                    'input', 'netmagic', 'max_out_sz', 'file_timestamp',
                    'split_timestamp', 'out_of_order_cache_sz',
                    'debug_output', and either 'output' (a directory) or
                    'output_file' (a single file).
        blkindex -- list of block hashes; the list index is the block height.
        blkmap   -- dict mapping block hash to height.
        """
        self.settings = settings
        self.blkindex = blkindex
        self.blkmap = blkmap

        self.inFn = 0            # number of the input file being scanned
        self.inF = None          # open input file, or None
        self.outFn = 0           # number of the current output file
        self.outsz = 0           # bytes written to the current output file
        self.outF = None         # open output file, or None
        self.outFname = None
        self.blkCountIn = 0      # known blocks seen in the input so far
        self.blkCountOut = 0     # blocks written; also the next height due

        self.lastDate = datetime.datetime(2000, 1, 1)
        # Highest block timestamp written so far, starting from a fixed baseline.
        self.highTS = 1408893517 - 315360000
        self.timestampSplit = False
        self.fileOutput = True   # True: single 'output_file'; False: 'output' directory
        self.setFileTime = False
        self.maxOutSz = settings['max_out_sz']
        if 'output' in settings:
            self.fileOutput = False
        if settings['file_timestamp'] != 0:
            self.setFileTime = True
        if settings['split_timestamp'] != 0:
            self.timestampSplit = True
        # Extents and cache for out-of-order blocks
        self.blockExtents = {}
        self.outOfOrderData = {}
        self.outOfOrderSize = 0  # running total size for items in outOfOrderData

    def _closeOutput(self):
        """Close the current output file, if any, stamping its times when enabled."""
        if not self.outF:
            return
        self.outF.close()
        if self.setFileTime:
            # atime = now, mtime = newest block timestamp written so far
            os.utime(self.outFname, (int(time.time()), self.highTS))
        self.outF = None
        self.outFname = None

    def writeBlock(self, inhdr, blk_hdr, rawblock):
        """Append one block to the output, rotating output files as configured.

        inhdr    -- the 8-byte record prefix (magic + length) copied from the input.
        blk_hdr  -- the 80-byte block header.
        rawblock -- the remainder of the block after the header.
        """
        blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock)
        # Size-based rotation applies to directory output only. Requiring an
        # open file avoids calling close() on None when the very first block
        # of a file already exceeds max_out_sz.
        if self.outF and not self.fileOutput and ((self.outsz + blockSizeOnDisk) > self.maxOutSz):
            self._closeOutput()
            self.outFn = self.outFn + 1
            self.outsz = 0

        (blkDate, blkTS) = get_blk_dt(blk_hdr)
        if self.timestampSplit and (blkDate > self.lastDate):
            # Hash the header being written: for blocks replayed from the
            # out-of-order store, self.hash_str belongs to a different block.
            print("New month " + blkDate.strftime("%Y-%m") + " @ " + calc_hash_str(blk_hdr))
            self.lastDate = blkDate
            # NOTE(review): in single-file mode this reopens 'output_file'
            # with "wb" below, which truncates what was written so far --
            # confirm whether split_timestamp is meant to be used with it.
            if self.outF:
                self._closeOutput()
                self.outFn = self.outFn + 1
                self.outsz = 0

        if not self.outF:
            if self.fileOutput:
                self.outFname = self.settings['output_file']
            else:
                self.outFname = os.path.join(self.settings['output'], "blk%05d.dat" % self.outFn)
            print("Output file " + self.outFname)
            self.outF = open(self.outFname, "wb")

        self.outF.write(inhdr)
        self.outF.write(blk_hdr)
        self.outF.write(rawblock)
        self.outsz = self.outsz + blockSizeOnDisk

        self.blkCountOut = self.blkCountOut + 1
        if blkTS > self.highTS:
            self.highTS = blkTS

        if (self.blkCountOut % 1000) == 0:
            print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
                    (self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex)))

    def inFileName(self, fn):
        """Return the path of input file number fn inside settings['input']."""
        return os.path.join(self.settings['input'], "blk%05d.dat" % fn)

    def fetchBlock(self, extent):
        '''Fetch block contents from disk given extents'''
        with open(self.inFileName(extent.fn), "rb") as f:
            f.seek(extent.offset)
            return f.read(extent.size)

    def copyOneBlock(self):
        '''Find the next block to be written in the input, and copy it to the output.'''
        extent = self.blockExtents.pop(self.blkCountOut)
        if self.blkCountOut in self.outOfOrderData:
            # If the data is cached, use it from memory and remove from the cache
            rawblock = self.outOfOrderData.pop(self.blkCountOut)
            self.outOfOrderSize -= len(rawblock)
        else:  # Otherwise look up data on disk
            rawblock = self.fetchBlock(extent)

        self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)

    def run(self):
        """Scan the input files and write every block of blkindex in order.

        Stops early, after printing a message, when the next input file
        cannot be opened or a record does not start with the configured
        magic. Input and output files are always closed on the way out, and
        the last output file gets the same timestamp treatment as files
        closed by rotation.
        """
        try:
            self._scanInput()
        finally:
            if self.inF:
                self.inF.close()
                self.inF = None
            self._closeOutput()

    def _scanInput(self):
        """Main scan loop: read records one by one and write or defer each block."""
        while self.blkCountOut < len(self.blkindex):
            if not self.inF:
                fname = self.inFileName(self.inFn)
                print("Input file " + fname)
                try:
                    self.inF = open(fname, "rb")
                except IOError:
                    print("Premature end of block data")
                    return

            inhdr = self.inF.read(8)
            # End of file or zero padding: move on to the next input file.
            # Slice instead of index so the comparison is bytes-to-bytes;
            # indexing bytes yields an int on Python 3 and never equals "\0".
            if (not inhdr or (inhdr[0:1] == b"\0")):
                self.inF.close()
                self.inF = None
                self.inFn = self.inFn + 1
                continue

            inMagic = inhdr[:4]
            if (inMagic != self.settings['netmagic']):
                print("Invalid magic: " + hexlify(inMagic).decode('utf-8'))
                return
            inLenLE = inhdr[4:]
            su = struct.unpack("<I", inLenLE)
            inLen = su[0] - 80  # length without header
            blk_hdr = self.inF.read(80)
            inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)

            self.hash_str = calc_hash_str(blk_hdr)
            # Use the instance's map and settings rather than module globals,
            # so the class also works when it is not driven by the script
            # section of this file.
            if self.hash_str not in self.blkmap:
                # Because blocks can be written to files out-of-order as of 0.10, the script
                # may encounter blocks it doesn't know about. Treat as debug output.
                if self.settings['debug_output'] == 'true':
                    print("Skipping unknown block " + self.hash_str)
                self.inF.seek(inLen, os.SEEK_CUR)
                continue

            blkHeight = self.blkmap[self.hash_str]
            self.blkCountIn += 1

            if self.blkCountOut == blkHeight:
                # If in-order block, just copy
                rawblock = self.inF.read(inLen)
                self.writeBlock(inhdr, blk_hdr, rawblock)

                # See if we can catch up to prior out-of-order blocks
                while self.blkCountOut in self.blockExtents:
                    self.copyOneBlock()

            else:  # If out-of-order, skip over block data for now
                self.blockExtents[blkHeight] = inExtent
                if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
                    # If there is space in the cache, read the data
                    # Reading the data in file sequence instead of seeking and fetching it later is preferred,
                    # but we don't want to fill up memory
                    self.outOfOrderData[blkHeight] = self.inF.read(inLen)
                    self.outOfOrderSize += inLen
                else:  # If no space in cache, seek forward
                    self.inF.seek(inLen, os.SEEK_CUR)

        print("Done (%i blocks written)" % (self.blkCountOut))
if __name__ == '__main__':
    # Script entry point: parse the config file named on the command line,
    # fill in defaults, load the hash list and run the copier.
    #
    # settings, blkindex and blkmap are assigned at module scope (not inside
    # a function) so that code looking them up as globals keeps working.
    if len(sys.argv) != 2:
        print("Usage: linearize-data.py CONFIG-FILE")
        sys.exit(1)

    # The context manager closes the config file even if parsing raises.
    with open(sys.argv[1]) as f:
        for line in f:
            # skip comment lines
            if re.search(r'^\s*#', line):
                continue

            # parse key=value lines
            m = re.search(r'^(\w+)\s*=\s*(\S.*)$', line)
            if m is None:
                continue
            settings[m.group(1)] = m.group(2)

    # Defaults for every setting the config file did not provide.
    defaults = {
        'rev_hash_bytes': 'false',
        'netmagic': 'f9beb4d9',
        'genesis': '000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f',
        'input': 'input',
        'hashlist': 'hashlist.txt',
        'file_timestamp': 0,
        'split_timestamp': 0,
        'max_out_sz': 1000 * 1000 * 1000,
        'out_of_order_cache_sz': 100 * 1000 * 1000,
        'debug_output': 'false',
    }
    for key, value in defaults.items():
        settings.setdefault(key, value)

    # Force hash byte format setting to be lowercase to make comparisons easier.
    settings['rev_hash_bytes'] = settings['rev_hash_bytes'].lower()

    # Values read from the file are strings; convert them to their working types.
    settings['max_out_sz'] = int(settings['max_out_sz'])
    settings['split_timestamp'] = int(settings['split_timestamp'])
    settings['file_timestamp'] = int(settings['file_timestamp'])
    settings['netmagic'] = unhexlify(settings['netmagic'].encode('utf-8'))
    settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
    settings['debug_output'] = settings['debug_output'].lower()

    if 'output_file' not in settings and 'output' not in settings:
        print("Missing output file / directory")
        sys.exit(1)

    blkindex = get_block_hashes(settings)
    blkmap = mkblockmap(blkindex)

    # Block hash map won't be byte-reversed. Neither should the genesis hash.
    if settings['genesis'] not in blkmap:
        print("Genesis block not found in hashlist")
    else:
        BlockDataCopier(settings, blkindex, blkmap).run()