about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Jeff Garzik <jgarzik@bitpay.com> 2014-08-23 22:59:16 -0400
committer Jeff Garzik <jgarzik@bitpay.com> 2014-08-23 22:59:16 -0400
commit b4a72a75b4795dffbe973b0ae287343df2dac13c (patch)
tree b08c16770090458743b9d95b04e4314dede7c521
parent 476eb7eb53f680494952865a823e5cf9459da2b9 (diff)
contrib/linearize: split output files based on new-timestamp-year or max-file-size
-rw-r--r-- contrib/linearize/README.md | 3
-rw-r--r-- contrib/linearize/example-linearize.cfg | 1
-rwxr-xr-x contrib/linearize/linearize-data.py | 44
3 files changed, 39 insertions, 9 deletions
diff --git a/contrib/linearize/README.md b/contrib/linearize/README.md
index 8d06d53b10..b5c6e7824e 100644
--- a/contrib/linearize/README.md
+++ b/contrib/linearize/README.md
@@ -26,4 +26,7 @@ output.
Optional config file setting for linearize-data:
* "netmagic": network magic number
+* "max_out_sz": maximum output file size (default 1000*1000*1000)
+* "split_year": Split files when a new year is first seen, in addition to
+reaching a maximum file size.
diff --git a/contrib/linearize/example-linearize.cfg b/contrib/linearize/example-linearize.cfg
index 9c3270d653..071345f23a 100644
--- a/contrib/linearize/example-linearize.cfg
+++ b/contrib/linearize/example-linearize.cfg
@@ -13,4 +13,5 @@ netmagic=f9beb4d9
input=/home/example/.bitcoin/blocks
output_file=/home/example/Downloads/bootstrap.dat
hashlist=hashlist.txt
+split_year=1
diff --git a/contrib/linearize/linearize-data.py b/contrib/linearize/linearize-data.py
index 77bae6e3c6..ea94f25fae 100755
--- a/contrib/linearize/linearize-data.py
+++ b/contrib/linearize/linearize-data.py
@@ -14,8 +14,7 @@ import base64
import httplib
import sys
import hashlib
-
-MAX_OUT_SZ = 128 * 1024 * 1024
+import datetime
settings = {}
@@ -41,9 +40,7 @@ def wordreverse(in_buf):
out_words.reverse()
return ''.join(out_words)
-def calc_hdr_hash(rawblock):
- blk_hdr = rawblock[:80]
-
+def calc_hdr_hash(blk_hdr):
hash1 = hashlib.sha256()
hash1.update(blk_hdr)
hash1_o = hash1.digest()
@@ -54,13 +51,18 @@ def calc_hdr_hash(rawblock):
return hash2_o
-def calc_hash_str(rawblock):
- hash = calc_hdr_hash(rawblock)
+def calc_hash_str(blk_hdr):
+ hash = calc_hdr_hash(blk_hdr)
hash = bufreverse(hash)
hash = wordreverse(hash)
hash_str = hash.encode('hex')
return hash_str
+def get_blk_year(blk_hdr):
+ members = struct.unpack("<I", blk_hdr[68:68+4])
+ dt = datetime.datetime.fromtimestamp(members[0])
+ return dt.year
+
def get_block_hashes(settings):
blkindex = []
f = open(settings['hashlist'], "r")
@@ -86,9 +88,14 @@ def copydata(settings, blkindex, blkset):
outF = None
blkCount = 0
+ lastYear = 0
+ splitYear = False
fileOutput = True
+ maxOutSz = settings['max_out_sz']
if 'output' in settings:
fileOutput = False
+ if settings['split_year'] != 0:
+ splitYear = True
while True:
if not inF:
@@ -111,17 +118,30 @@ def copydata(settings, blkindex, blkset):
su = struct.unpack("<I", inLenLE)
inLen = su[0]
rawblock = inF.read(inLen)
+ blk_hdr = rawblock[:80]
- hash_str = calc_hash_str(rawblock)
+ hash_str = calc_hash_str(blk_hdr)
if not hash_str in blkset:
print("Skipping unknown block " + hash_str)
continue
- if not fileOutput and ((outsz + inLen) > MAX_OUT_SZ):
+ if not fileOutput and ((outsz + inLen) > maxOutSz):
outF.close()
outF = None
outFn = outFn + 1
outsz = 0
+
+ if splitYear:
+ blkYear = get_blk_year(blk_hdr)
+ if blkYear > lastYear:
+ print("New year " + str(blkYear) + " @ " + hash_str)
+ lastYear = blkYear
+ if outF:
+ outF.close()
+ outF = None
+ outFn = outFn + 1
+ outsz = 0
+
if not outF:
if fileOutput:
fname = settings['output_file']
@@ -164,7 +184,13 @@ if __name__ == '__main__':
settings['input'] = 'input'
if 'hashlist' not in settings:
settings['hashlist'] = 'hashlist.txt'
+ if 'split_year' not in settings:
+ settings['split_year'] = 0
+ if 'max_out_sz' not in settings:
+ settings['max_out_sz'] = 1000L * 1000 * 1000
+ settings['max_out_sz'] = long(settings['max_out_sz'])
+ settings['split_year'] = int(settings['split_year'])
settings['netmagic'] = settings['netmagic'].decode('hex')
if 'output_file' not in settings and 'output' not in settings: