Merge #17336: scripts: search for first block file for linearize-data with some block files pruned

317fb96de9c6257972f1213b4ef2c3fe87dde99f Add search for first blk file with pruned node (Rjected)

Pull request description:

  When bitcoind is running in pruned mode, producing a hashlist with `./linearize-hashes.py linearize.cfg > hashlist.txt` and then executing `linearize-data.py linearize.cfg` will produce:

  ```
  Read 313001 hashes
  Input file /home/dan/.bitcoin/blocks/blk00000.dat
  Premature end of block data
  ```

  This happens because `linearize-data` starts by attempting to process `blk00000.dat` regardless of whether or not `blk00000.dat` actually exists - this may not be the case if working with a pruned node.

  This PR adds a function which finds the first block file that does exist, and calls that function when the `BlockDataCopier` is initialized. This is a refactor of #16431.

ACKs for top commit:
  darosior: ACK 317fb96de9c6257972f1213b4ef2c3fe87dde99f
  laanwj: Code review ACK 317fb96de9c6257972f1213b4ef2c3fe87dde99f
  theStack: Code review ACK https://github.com/bitcoin/bitcoin/pull/17336/commits/317fb96de9c6257972f1213b4ef2c3fe87dde99f

Tree-SHA512: fc8014282df6cfe7b267e64db8ce7d82b86b758c302fbfea4a3c39b62d93512f5c2e31a0de4e9c5ec18fc0268c917f011257d37b45afaef6033eec90e4aa585f
author: fanquake <fanquake@gmail.com> 2020-02-05 08:29:57 +0800
committer: fanquake <fanquake@gmail.com> 2020-02-05 08:45:06 +0800
commit: 8625446b4d86880be9e218f3cd136de1e382101d (patch)
tree: 1b164a98a864bbeb85e9a14eb6785c569d7976a2 /contrib/linearize
parent: f32564f0a73c5ad1a107dd112e40516f39d1a51e (diff)
parent: 317fb96de9c6257972f1213b4ef2c3fe87dde99f (diff)
1 files changed, 28 insertions, 1 deletions
diff --git a/contrib/linearize/linearize-data.py b/contrib/linearize/linearize-data.py
index 1b7d77f7b4..bcca3b7cea 100755
--- a/contrib/linearize/linearize-data.py
+++ b/contrib/linearize/linearize-data.py
@@ -15,6 +15,7 @@ import sys
 import hashlib
 import datetime
 import time
+import glob
 from collections import namedtuple
 from binascii import unhexlify
 
@@ -92,6 +93,30 @@ def mkblockmap(blkindex):
         blkmap[hash] = height
     return blkmap
 
+# Return the numeric ID of the lowest-named 'blkNNNNN.dat' file matched under
+# block_dir_path, or 0 (after printing a notice) when nothing matches.
+def getFirstBlockFileId(block_dir_path):
+    # Build a glob pattern for block file names of the form 'blkNNNNN.dat',
+    # where each N is exactly one decimal digit.
+    blkFilePattern = os.path.join(block_dir_path, "blk[0-9][0-9][0-9][0-9][0-9].dat")
+
+    # glob.glob returns a (possibly empty) list of the matching paths.
+    blkFnList = glob.glob(blkFilePattern)
+
+    if len(blkFnList) == 0:
+        print("blocks not pruned - starting at 0")
+        return 0
+    # The pattern fixes the ID at five digits, so among files in one directory
+    # the lexicographic minimum is the file with the smallest ID.
+    firstBlkFilePath = min(blkFnList)
+    firstBlkFn = os.path.basename(firstBlkFilePath)
+
+    # now, the string should be ['b','l','k','N','N','N','N','N','.','d','a','t']
+    # So get the ID by choosing:              3   4   5   6   7
+    # The slice [3:8] covers those indices; the ID need not be 0 on a pruned node.
+    blkId = int(firstBlkFn[3:8])
+    return blkId
+
 # Block header and extent on disk
 BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])
 
@@ -101,7 +126,9 @@ class BlockDataCopier:
         self.blkindex = blkindex
         self.blkmap = blkmap
 
-        self.inFn = 0
+        # Get first occurring block file id - for pruned nodes this
+        # will not necessarily be 0
+        self.inFn = getFirstBlockFileId(self.settings['input'])
         self.inF = None
         self.outFn = 0
         self.outsz = 0
author	fanquake <fanquake@gmail.com>	2020-02-05 08:29:57 +0800
committer	fanquake <fanquake@gmail.com>	2020-02-05 08:45:06 +0800
commit	8625446b4d86880be9e218f3cd136de1e382101d (patch)
tree	1b164a98a864bbeb85e9a14eb6785c569d7976a2 /contrib/linearize
parent	f32564f0a73c5ad1a107dd112e40516f39d1a51e (diff)
parent	317fb96de9c6257972f1213b4ef2c3fe87dde99f (diff)