diff options
Diffstat (limited to 'contrib/linearize/linearize-data.py')
-rwxr-xr-x | contrib/linearize/linearize-data.py | 532 |
1 files changed, 266 insertions, 266 deletions
diff --git a/contrib/linearize/linearize-data.py b/contrib/linearize/linearize-data.py index c609e9b336..f8aea27342 100755 --- a/contrib/linearize/linearize-data.py +++ b/contrib/linearize/linearize-data.py @@ -22,300 +22,300 @@ from binascii import hexlify, unhexlify settings = {} def hex_switchEndian(s): - """ Switches the endianness of a hex string (in pairs of hex chars) """ - pairList = [s[i:i+2].encode() for i in range(0, len(s), 2)] - return b''.join(pairList[::-1]).decode() + """ Switches the endianness of a hex string (in pairs of hex chars) """ + pairList = [s[i:i+2].encode() for i in range(0, len(s), 2)] + return b''.join(pairList[::-1]).decode() def uint32(x): - return x & 0xffffffff + return x & 0xffffffff def bytereverse(x): - return uint32(( ((x) << 24) | (((x) << 8) & 0x00ff0000) | - (((x) >> 8) & 0x0000ff00) | ((x) >> 24) )) + return uint32(( ((x) << 24) | (((x) << 8) & 0x00ff0000) | + (((x) >> 8) & 0x0000ff00) | ((x) >> 24) )) def bufreverse(in_buf): - out_words = [] - for i in range(0, len(in_buf), 4): - word = struct.unpack('@I', in_buf[i:i+4])[0] - out_words.append(struct.pack('@I', bytereverse(word))) - return b''.join(out_words) + out_words = [] + for i in range(0, len(in_buf), 4): + word = struct.unpack('@I', in_buf[i:i+4])[0] + out_words.append(struct.pack('@I', bytereverse(word))) + return b''.join(out_words) def wordreverse(in_buf): - out_words = [] - for i in range(0, len(in_buf), 4): - out_words.append(in_buf[i:i+4]) - out_words.reverse() - return b''.join(out_words) + out_words = [] + for i in range(0, len(in_buf), 4): + out_words.append(in_buf[i:i+4]) + out_words.reverse() + return b''.join(out_words) def calc_hdr_hash(blk_hdr): - hash1 = hashlib.sha256() - hash1.update(blk_hdr) - hash1_o = hash1.digest() + hash1 = hashlib.sha256() + hash1.update(blk_hdr) + hash1_o = hash1.digest() - hash2 = hashlib.sha256() - hash2.update(hash1_o) - hash2_o = hash2.digest() + hash2 = hashlib.sha256() + hash2.update(hash1_o) + hash2_o = hash2.digest() - return hash2_o + return hash2_o def calc_hash_str(blk_hdr): - hash = calc_hdr_hash(blk_hdr) - hash = bufreverse(hash) - hash = wordreverse(hash) - hash_str = hexlify(hash).decode('utf-8') - return hash_str + hash = calc_hdr_hash(blk_hdr) + hash = bufreverse(hash) + hash = wordreverse(hash) + hash_str = hexlify(hash).decode('utf-8') + return hash_str def get_blk_dt(blk_hdr): - members = struct.unpack("<I", blk_hdr[68:68+4]) - nTime = members[0] - dt = datetime.datetime.fromtimestamp(nTime) - dt_ym = datetime.datetime(dt.year, dt.month, 1) - return (dt_ym, nTime) + members = struct.unpack("<I", blk_hdr[68:68+4]) + nTime = members[0] + dt = datetime.datetime.fromtimestamp(nTime) + dt_ym = datetime.datetime(dt.year, dt.month, 1) + return (dt_ym, nTime) # When getting the list of block hashes, undo any byte reversals. def get_block_hashes(settings): - blkindex = [] - f = open(settings['hashlist'], "r") - for line in f: - line = line.rstrip() - if settings['rev_hash_bytes'] == 'true': - line = hex_switchEndian(line) - blkindex.append(line) + blkindex = [] + f = open(settings['hashlist'], "r") + for line in f: + line = line.rstrip() + if settings['rev_hash_bytes'] == 'true': + line = hex_switchEndian(line) + blkindex.append(line) - print("Read " + str(len(blkindex)) + " hashes") + print("Read " + str(len(blkindex)) + " hashes") - return blkindex + return blkindex # The block map shouldn't give or receive byte-reversed hashes. def mkblockmap(blkindex): - blkmap = {} - for height,hash in enumerate(blkindex): - blkmap[hash] = height - return blkmap + blkmap = {} + for height,hash in enumerate(blkindex): + blkmap[hash] = height + return blkmap # Block header and extent on disk BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size']) class BlockDataCopier: - def __init__(self, settings, blkindex, blkmap): - self.settings = settings - self.blkindex = blkindex - self.blkmap = blkmap - - self.inFn = 0 - self.inF = None - self.outFn = 0 - self.outsz = 0 - self.outF = None - self.outFname = None - self.blkCountIn = 0 - self.blkCountOut = 0 - - self.lastDate = datetime.datetime(2000, 1, 1) - self.highTS = 1408893517 - 315360000 - self.timestampSplit = False - self.fileOutput = True - self.setFileTime = False - self.maxOutSz = settings['max_out_sz'] - if 'output' in settings: - self.fileOutput = False - if settings['file_timestamp'] != 0: - self.setFileTime = True - if settings['split_timestamp'] != 0: - self.timestampSplit = True - # Extents and cache for out-of-order blocks - self.blockExtents = {} - self.outOfOrderData = {} - self.outOfOrderSize = 0 # running total size for items in outOfOrderData - - def writeBlock(self, inhdr, blk_hdr, rawblock): - blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock) - if not self.fileOutput and ((self.outsz + blockSizeOnDisk) > self.maxOutSz): - self.outF.close() - if self.setFileTime: - os.utime(self.outFname, (int(time.time()), self.highTS)) - self.outF = None - self.outFname = None - self.outFn = self.outFn + 1 - self.outsz = 0 - - (blkDate, blkTS) = get_blk_dt(blk_hdr) - if self.timestampSplit and (blkDate > self.lastDate): - print("New month " + blkDate.strftime("%Y-%m") + " @ " + self.hash_str) - self.lastDate = blkDate - if self.outF: - self.outF.close() - if self.setFileTime: - os.utime(self.outFname, (int(time.time()), self.highTS)) - self.outF = None - self.outFname = None - self.outFn = self.outFn + 1 - self.outsz = 0 - - if not self.outF: - if self.fileOutput: - self.outFname = self.settings['output_file'] - else: - self.outFname = os.path.join(self.settings['output'], "blk%05d.dat" % self.outFn) - print("Output file " + self.outFname) - self.outF = open(self.outFname, "wb") - - self.outF.write(inhdr) - self.outF.write(blk_hdr) - self.outF.write(rawblock) - self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock) - - self.blkCountOut = self.blkCountOut + 1 - if blkTS > self.highTS: - self.highTS = blkTS - - if (self.blkCountOut % 1000) == 0: - print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' % - (self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex))) - - def inFileName(self, fn): - return os.path.join(self.settings['input'], "blk%05d.dat" % fn) - - def fetchBlock(self, extent): - '''Fetch block contents from disk given extents''' - with open(self.inFileName(extent.fn), "rb") as f: - f.seek(extent.offset) - return f.read(extent.size) - - def copyOneBlock(self): - '''Find the next block to be written in the input, and copy it to the output.''' - extent = self.blockExtents.pop(self.blkCountOut) - if self.blkCountOut in self.outOfOrderData: - # If the data is cached, use it from memory and remove from the cache - rawblock = self.outOfOrderData.pop(self.blkCountOut) - self.outOfOrderSize -= len(rawblock) - else: # Otherwise look up data on disk - rawblock = self.fetchBlock(extent) - - self.writeBlock(extent.inhdr, extent.blkhdr, rawblock) - - def run(self): - while self.blkCountOut < len(self.blkindex): - if not self.inF: - fname = self.inFileName(self.inFn) - print("Input file " + fname) - try: - self.inF = open(fname, "rb") - except IOError: - print("Premature end of block data") - return - - inhdr = self.inF.read(8) - if (not inhdr or (inhdr[0] == "\0")): - self.inF.close() - self.inF = None - self.inFn = self.inFn + 1 - continue - - inMagic = inhdr[:4] - if (inMagic != self.settings['netmagic']): - print("Invalid magic: " + hexlify(inMagic).decode('utf-8')) - return - inLenLE = inhdr[4:] - su = struct.unpack("<I", inLenLE) - inLen = su[0] - 80 # length without header - blk_hdr = self.inF.read(80) - inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen) - - self.hash_str = calc_hash_str(blk_hdr) - if not self.hash_str in blkmap: - # Because blocks can be written to files out-of-order as of 0.10, the script - # may encounter blocks it doesn't know about. Treat as debug output. - if settings['debug_output'] == 'true': - print("Skipping unknown block " + self.hash_str) - self.inF.seek(inLen, os.SEEK_CUR) - continue - - blkHeight = self.blkmap[self.hash_str] - self.blkCountIn += 1 - - if self.blkCountOut == blkHeight: - # If in-order block, just copy - rawblock = self.inF.read(inLen) - self.writeBlock(inhdr, blk_hdr, rawblock) - - # See if we can catch up to prior out-of-order blocks - while self.blkCountOut in self.blockExtents: - self.copyOneBlock() - - else: # If out-of-order, skip over block data for now - self.blockExtents[blkHeight] = inExtent - if self.outOfOrderSize < self.settings['out_of_order_cache_sz']: - # If there is space in the cache, read the data - # Reading the data in file sequence instead of seeking and fetching it later is preferred, - # but we don't want to fill up memory - self.outOfOrderData[blkHeight] = self.inF.read(inLen) - self.outOfOrderSize += inLen - else: # If no space in cache, seek forward - self.inF.seek(inLen, os.SEEK_CUR) - - print("Done (%i blocks written)" % (self.blkCountOut)) + def __init__(self, settings, blkindex, blkmap): + self.settings = settings + self.blkindex = blkindex + self.blkmap = blkmap + + self.inFn = 0 + self.inF = None + self.outFn = 0 + self.outsz = 0 + self.outF = None + self.outFname = None + self.blkCountIn = 0 + self.blkCountOut = 0 + + self.lastDate = datetime.datetime(2000, 1, 1) + self.highTS = 1408893517 - 315360000 + self.timestampSplit = False + self.fileOutput = True + self.setFileTime = False + self.maxOutSz = settings['max_out_sz'] + if 'output' in settings: + self.fileOutput = False + if settings['file_timestamp'] != 0: + self.setFileTime = True + if settings['split_timestamp'] != 0: + self.timestampSplit = True + # Extents and cache for out-of-order blocks + self.blockExtents = {} + self.outOfOrderData = {} + self.outOfOrderSize = 0 # running total size for items in outOfOrderData + + def writeBlock(self, inhdr, blk_hdr, rawblock): + blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock) + if not self.fileOutput and ((self.outsz + blockSizeOnDisk) > self.maxOutSz): + self.outF.close() + if self.setFileTime: + os.utime(self.outFname, (int(time.time()), self.highTS)) + self.outF = None + self.outFname = None + self.outFn = self.outFn + 1 + self.outsz = 0 + + (blkDate, blkTS) = get_blk_dt(blk_hdr) + if self.timestampSplit and (blkDate > self.lastDate): + print("New month " + blkDate.strftime("%Y-%m") + " @ " + self.hash_str) + self.lastDate = blkDate + if self.outF: + self.outF.close() + if self.setFileTime: + os.utime(self.outFname, (int(time.time()), self.highTS)) + self.outF = None + self.outFname = None + self.outFn = self.outFn + 1 + self.outsz = 0 + + if not self.outF: + if self.fileOutput: + self.outFname = self.settings['output_file'] + else: + self.outFname = os.path.join(self.settings['output'], "blk%05d.dat" % self.outFn) + print("Output file " + self.outFname) + self.outF = open(self.outFname, "wb") + + self.outF.write(inhdr) + self.outF.write(blk_hdr) + self.outF.write(rawblock) + self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock) + + self.blkCountOut = self.blkCountOut + 1 + if blkTS > self.highTS: + self.highTS = blkTS + + if (self.blkCountOut % 1000) == 0: + print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' % + (self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex))) + + def inFileName(self, fn): + return os.path.join(self.settings['input'], "blk%05d.dat" % fn) + + def fetchBlock(self, extent): + '''Fetch block contents from disk given extents''' + with open(self.inFileName(extent.fn), "rb") as f: + f.seek(extent.offset) + return f.read(extent.size) + + def copyOneBlock(self): + '''Find the next block to be written in the input, and copy it to the output.''' + extent = self.blockExtents.pop(self.blkCountOut) + if self.blkCountOut in self.outOfOrderData: + # If the data is cached, use it from memory and remove from the cache + rawblock = self.outOfOrderData.pop(self.blkCountOut) + self.outOfOrderSize -= len(rawblock) + else: # Otherwise look up data on disk + rawblock = self.fetchBlock(extent) + + self.writeBlock(extent.inhdr, extent.blkhdr, rawblock) + + def run(self): + while self.blkCountOut < len(self.blkindex): + if not self.inF: + fname = self.inFileName(self.inFn) + print("Input file " + fname) + try: + self.inF = open(fname, "rb") + except IOError: + print("Premature end of block data") + return + + inhdr = self.inF.read(8) + if (not inhdr or (inhdr[0] == "\0")): + self.inF.close() + self.inF = None + self.inFn = self.inFn + 1 + continue + + inMagic = inhdr[:4] + if (inMagic != self.settings['netmagic']): + print("Invalid magic: " + hexlify(inMagic).decode('utf-8')) + return + inLenLE = inhdr[4:] + su = struct.unpack("<I", inLenLE) + inLen = su[0] - 80 # length without header + blk_hdr = self.inF.read(80) + inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen) + + self.hash_str = calc_hash_str(blk_hdr) + if not self.hash_str in blkmap: + # Because blocks can be written to files out-of-order as of 0.10, the script + # may encounter blocks it doesn't know about. Treat as debug output. + if settings['debug_output'] == 'true': + print("Skipping unknown block " + self.hash_str) + self.inF.seek(inLen, os.SEEK_CUR) + continue + + blkHeight = self.blkmap[self.hash_str] + self.blkCountIn += 1 + + if self.blkCountOut == blkHeight: + # If in-order block, just copy + rawblock = self.inF.read(inLen) + self.writeBlock(inhdr, blk_hdr, rawblock) + + # See if we can catch up to prior out-of-order blocks + while self.blkCountOut in self.blockExtents: + self.copyOneBlock() + + else: # If out-of-order, skip over block data for now + self.blockExtents[blkHeight] = inExtent + if self.outOfOrderSize < self.settings['out_of_order_cache_sz']: + # If there is space in the cache, read the data + # Reading the data in file sequence instead of seeking and fetching it later is preferred, + # but we don't want to fill up memory + self.outOfOrderData[blkHeight] = self.inF.read(inLen) + self.outOfOrderSize += inLen + else: # If no space in cache, seek forward + self.inF.seek(inLen, os.SEEK_CUR) + + print("Done (%i blocks written)" % (self.blkCountOut)) if __name__ == '__main__': - if len(sys.argv) != 2: - print("Usage: linearize-data.py CONFIG-FILE") - sys.exit(1) - - f = open(sys.argv[1]) - for line in f: - # skip comment lines - m = re.search('^\s*#', line) - if m: - continue - - # parse key=value lines - m = re.search('^(\w+)\s*=\s*(\S.*)$', line) - if m is None: - continue - settings[m.group(1)] = m.group(2) - f.close() - - # Force hash byte format setting to be lowercase to make comparisons easier. - # Also place upfront in case any settings need to know about it. - if 'rev_hash_bytes' not in settings: - settings['rev_hash_bytes'] = 'false' - settings['rev_hash_bytes'] = settings['rev_hash_bytes'].lower() - - if 'netmagic' not in settings: - settings['netmagic'] = 'f9beb4d9' - if 'genesis' not in settings: - settings['genesis'] = '000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f' - if 'input' not in settings: - settings['input'] = 'input' - if 'hashlist' not in settings: - settings['hashlist'] = 'hashlist.txt' - if 'file_timestamp' not in settings: - settings['file_timestamp'] = 0 - if 'split_timestamp' not in settings: - settings['split_timestamp'] = 0 - if 'max_out_sz' not in settings: - settings['max_out_sz'] = 1000 * 1000 * 1000 - if 'out_of_order_cache_sz' not in settings: - settings['out_of_order_cache_sz'] = 100 * 1000 * 1000 - if 'debug_output' not in settings: - settings['debug_output'] = 'false' - - settings['max_out_sz'] = int(settings['max_out_sz']) - settings['split_timestamp'] = int(settings['split_timestamp']) - settings['file_timestamp'] = int(settings['file_timestamp']) - settings['netmagic'] = unhexlify(settings['netmagic'].encode('utf-8')) - settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz']) - settings['debug_output'] = settings['debug_output'].lower() - - if 'output_file' not in settings and 'output' not in settings: - print("Missing output file / directory") - sys.exit(1) - - blkindex = get_block_hashes(settings) - blkmap = mkblockmap(blkindex) - - # Block hash map won't be byte-reversed. Neither should the genesis hash. - if not settings['genesis'] in blkmap: - print("Genesis block not found in hashlist") - else: - BlockDataCopier(settings, blkindex, blkmap).run() + if len(sys.argv) != 2: + print("Usage: linearize-data.py CONFIG-FILE") + sys.exit(1) + + f = open(sys.argv[1]) + for line in f: + # skip comment lines + m = re.search('^\s*#', line) + if m: + continue + + # parse key=value lines + m = re.search('^(\w+)\s*=\s*(\S.*)$', line) + if m is None: + continue + settings[m.group(1)] = m.group(2) + f.close() + + # Force hash byte format setting to be lowercase to make comparisons easier. + # Also place upfront in case any settings need to know about it. + if 'rev_hash_bytes' not in settings: + settings['rev_hash_bytes'] = 'false' + settings['rev_hash_bytes'] = settings['rev_hash_bytes'].lower() + + if 'netmagic' not in settings: + settings['netmagic'] = 'f9beb4d9' + if 'genesis' not in settings: + settings['genesis'] = '000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f' + if 'input' not in settings: + settings['input'] = 'input' + if 'hashlist' not in settings: + settings['hashlist'] = 'hashlist.txt' + if 'file_timestamp' not in settings: + settings['file_timestamp'] = 0 + if 'split_timestamp' not in settings: + settings['split_timestamp'] = 0 + if 'max_out_sz' not in settings: + settings['max_out_sz'] = 1000 * 1000 * 1000 + if 'out_of_order_cache_sz' not in settings: + settings['out_of_order_cache_sz'] = 100 * 1000 * 1000 + if 'debug_output' not in settings: + settings['debug_output'] = 'false' + + settings['max_out_sz'] = int(settings['max_out_sz']) + settings['split_timestamp'] = int(settings['split_timestamp']) + settings['file_timestamp'] = int(settings['file_timestamp']) + settings['netmagic'] = unhexlify(settings['netmagic'].encode('utf-8')) + settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz']) + settings['debug_output'] = settings['debug_output'].lower() + + if 'output_file' not in settings and 'output' not in settings: + print("Missing output file / directory") + sys.exit(1) + + blkindex = get_block_hashes(settings) + blkmap = mkblockmap(blkindex) + + # Block hash map won't be byte-reversed. Neither should the genesis hash. + if not settings['genesis'] in blkmap: + print("Genesis block not found in hashlist") + else: + BlockDataCopier(settings, blkindex, blkmap).run() |