diff options
author | Jeff Garzik <jgarzik@bitpay.com> | 2014-08-23 22:59:16 -0400 |
---|---|---|
committer | Jeff Garzik <jgarzik@bitpay.com> | 2014-08-23 22:59:16 -0400 |
commit | b4a72a75b4795dffbe973b0ae287343df2dac13c (patch) | |
tree | b08c16770090458743b9d95b04e4314dede7c521 | |
parent | 476eb7eb53f680494952865a823e5cf9459da2b9 (diff) |
contrib/linearize: split output files based on new-timestamp-year or max-file-size
-rw-r--r-- | contrib/linearize/README.md | 3 |
-rw-r--r-- | contrib/linearize/example-linearize.cfg | 1 |
-rwxr-xr-x | contrib/linearize/linearize-data.py | 44 |
3 files changed, 39 insertions, 9 deletions
diff --git a/contrib/linearize/README.md b/contrib/linearize/README.md index 8d06d53b10..b5c6e7824e 100644 --- a/contrib/linearize/README.md +++ b/contrib/linearize/README.md @@ -26,4 +26,7 @@ output. Optional config file setting for linearize-data: * "netmagic": network magic number +* "max_out_sz": maximum output file size (default 1000*1000*1000) +* "split_year": Split files when a new year is first seen, in addition to +reaching a maximum file size. diff --git a/contrib/linearize/example-linearize.cfg b/contrib/linearize/example-linearize.cfg index 9c3270d653..071345f23a 100644 --- a/contrib/linearize/example-linearize.cfg +++ b/contrib/linearize/example-linearize.cfg @@ -13,4 +13,5 @@ netmagic=f9beb4d9 input=/home/example/.bitcoin/blocks output_file=/home/example/Downloads/bootstrap.dat hashlist=hashlist.txt +split_year=1 diff --git a/contrib/linearize/linearize-data.py b/contrib/linearize/linearize-data.py index 77bae6e3c6..ea94f25fae 100755 --- a/contrib/linearize/linearize-data.py +++ b/contrib/linearize/linearize-data.py @@ -14,8 +14,7 @@ import base64 import httplib import sys import hashlib - -MAX_OUT_SZ = 128 * 1024 * 1024 +import datetime settings = {} @@ -41,9 +40,7 @@ def wordreverse(in_buf): out_words.reverse() return ''.join(out_words) -def calc_hdr_hash(rawblock): - blk_hdr = rawblock[:80] - +def calc_hdr_hash(blk_hdr): hash1 = hashlib.sha256() hash1.update(blk_hdr) hash1_o = hash1.digest() @@ -54,13 +51,18 @@ def calc_hdr_hash(rawblock): return hash2_o -def calc_hash_str(rawblock): - hash = calc_hdr_hash(rawblock) +def calc_hash_str(blk_hdr): + hash = calc_hdr_hash(blk_hdr) hash = bufreverse(hash) hash = wordreverse(hash) hash_str = hash.encode('hex') return hash_str +def get_blk_year(blk_hdr): + members = struct.unpack("<I", blk_hdr[68:68+4]) + dt = datetime.datetime.fromtimestamp(members[0]) + return dt.year + def get_block_hashes(settings): blkindex = [] f = open(settings['hashlist'], "r") @@ -86,9 +88,14 @@ def copydata(settings, blkindex, 
blkset): outF = None blkCount = 0 + lastYear = 0 + splitYear = False fileOutput = True + maxOutSz = settings['max_out_sz'] if 'output' in settings: fileOutput = False + if settings['split_year'] != 0: + splitYear = True while True: if not inF: @@ -111,17 +118,30 @@ def copydata(settings, blkindex, blkset): su = struct.unpack("<I", inLenLE) inLen = su[0] rawblock = inF.read(inLen) + blk_hdr = rawblock[:80] - hash_str = calc_hash_str(rawblock) + hash_str = calc_hash_str(blk_hdr) if not hash_str in blkset: print("Skipping unknown block " + hash_str) continue - if not fileOutput and ((outsz + inLen) > MAX_OUT_SZ): + if not fileOutput and ((outsz + inLen) > maxOutSz): outF.close() outF = None outFn = outFn + 1 outsz = 0 + + if splitYear: + blkYear = get_blk_year(blk_hdr) + if blkYear > lastYear: + print("New year " + str(blkYear) + " @ " + hash_str) + lastYear = blkYear + if outF: + outF.close() + outF = None + outFn = outFn + 1 + outsz = 0 + if not outF: if fileOutput: fname = settings['output_file'] @@ -164,7 +184,13 @@ if __name__ == '__main__': settings['input'] = 'input' if 'hashlist' not in settings: settings['hashlist'] = 'hashlist.txt' + if 'split_year' not in settings: + settings['split_year'] = 0 + if 'max_out_sz' not in settings: + settings['max_out_sz'] = 1000L * 1000 * 1000 + settings['max_out_sz'] = long(settings['max_out_sz']) + settings['split_year'] = int(settings['split_year']) settings['netmagic'] = settings['netmagic'].decode('hex') if 'output_file' not in settings and 'output' not in settings: |