devtools: Make github-merge compute SHA512 from git, instead of worktree

This changes tree_sha512sum() to request the objects for hashing from git instead of from the working tree. The change should make the process more deterministic (it hashes what will be pushed) and hopefully avoids the frequent miscomputed SHA512s that happen now.
author: Wladimir J. van der Laan <laanwj@gmail.com> 2017-03-13 16:09:38 +0100
committer: Wladimir J. van der Laan <laanwj@gmail.com> 2017-03-13 16:13:38 +0100
commit: a327e8ea30e10abdce910a3e37c78e00d0918548 (patch)
tree: 06c554f84c450923774a1a1dcad5cc04ef15224f /contrib
parent: 8040ae6fc576e9504186f2ae3ff2c8125de1095c (diff)
1 files changed, 37 insertions, 8 deletions
diff --git a/contrib/devtools/github-merge.py b/contrib/devtools/github-merge.py
index f1b6a12fd0..3fee39143d 100755
--- a/contrib/devtools/github-merge.py
+++ b/contrib/devtools/github-merge.py
@@ -78,24 +78,53 @@ def get_symlink_files():
             ret.append(f.decode('utf-8').split("\t")[1])
     return ret
 
-def tree_sha512sum():
-    files = sorted(subprocess.check_output([GIT, 'ls-tree', '--full-tree', '-r', '--name-only', 'HEAD']).splitlines())
+def tree_sha512sum(commit='HEAD'):
+    # request metadata for entire tree, recursively
+    files = []
+    blob_by_name = {}
+    for line in subprocess.check_output([GIT, 'ls-tree', '--full-tree', '-r', commit]).splitlines():
+        name_sep = line.index(b'\t')
+        metadata = line[:name_sep].split() # perms, 'blob', blobid
+        assert(metadata[1] == b'blob')
+        name = line[name_sep+1:]
+        files.append(name)
+        blob_by_name[name] = metadata[2]
+
+    files.sort()
+    # open connection to git-cat-file in batch mode to request data for all blobs
+    # this is much faster than launching it per file
+    p = subprocess.Popen([GIT, 'cat-file', '--batch'], stdout=subprocess.PIPE, stdin=subprocess.PIPE)
     overall = hashlib.sha512()
     for f in files:
+        blob = blob_by_name[f]
+        # request blob
+        p.stdin.write(blob + b'\n')
+        p.stdin.flush()
+        # read header: blob, "blob", size
+        reply = p.stdout.readline().split()
+        assert(reply[0] == blob and reply[1] == b'blob')
+        size = int(reply[2])
+        # hash the blob data
         intern = hashlib.sha512()
-        fi = open(f, 'rb')
-        while True:
-            piece = fi.read(65536)
-            if piece:
+        ptr = 0
+        while ptr < size:
+            bs = min(65536, size - ptr)
+            piece = p.stdout.read(bs)
+            if len(piece) == bs:
                 intern.update(piece)
             else:
-                break
-        fi.close()
+                raise IOError('Premature EOF reading git cat-file output')
+            ptr += bs
         dig = intern.hexdigest()
+        assert(p.stdout.read(1) == b'\n') # ignore LF that follows blob data
+        # update overall hash with file hash
         overall.update(dig.encode("utf-8"))
         overall.update("  ".encode("utf-8"))
         overall.update(f)
         overall.update("\n".encode("utf-8"))
+    p.stdin.close()
+    if p.wait():
+        raise IOError('Non-zero return value executing git cat-file')
     return overall.hexdigest()
author	Wladimir J. van der Laan <laanwj@gmail.com>	2017-03-13 16:09:38 +0100
committer	Wladimir J. van der Laan <laanwj@gmail.com>	2017-03-13 16:13:38 +0100
commit	a327e8ea30e10abdce910a3e37c78e00d0918548 (patch)
tree	06c554f84c450923774a1a1dcad5cc04ef15224f /contrib
parent	8040ae6fc576e9504186f2ae3ff2c8125de1095c (diff)