From a327e8ea30e10abdce910a3e37c78e00d0918548 Mon Sep 17 00:00:00 2001 From: "Wladimir J. van der Laan" Date: Mon, 13 Mar 2017 16:09:38 +0100 Subject: devtools: Make github-merge compute SHA512 from git, instead of worktree This changes tree_sha512sum() to requests the objects for hashing from git instead of from the working tree. The change should make the process more deterministic (it hashes what will be pushed) and hopefully avoids the frequent miscomputed SHA512's that happen now. --- contrib/devtools/github-merge.py | 45 +++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 8 deletions(-) (limited to 'contrib') diff --git a/contrib/devtools/github-merge.py b/contrib/devtools/github-merge.py index f1b6a12fd0..3fee39143d 100755 --- a/contrib/devtools/github-merge.py +++ b/contrib/devtools/github-merge.py @@ -78,24 +78,53 @@ def get_symlink_files(): ret.append(f.decode('utf-8').split("\t")[1]) return ret -def tree_sha512sum(): - files = sorted(subprocess.check_output([GIT, 'ls-tree', '--full-tree', '-r', '--name-only', 'HEAD']).splitlines()) +def tree_sha512sum(commit='HEAD'): + # request metadata for entire tree, recursively + files = [] + blob_by_name = {} + for line in subprocess.check_output([GIT, 'ls-tree', '--full-tree', '-r', commit]).splitlines(): + name_sep = line.index(b'\t') + metadata = line[:name_sep].split() # perms, 'blob', blobid + assert(metadata[1] == b'blob') + name = line[name_sep+1:] + files.append(name) + blob_by_name[name] = metadata[2] + + files.sort() + # open connection to git-cat-file in batch mode to request data for all blobs + # this is much faster than launching it per file + p = subprocess.Popen([GIT, 'cat-file', '--batch'], stdout=subprocess.PIPE, stdin=subprocess.PIPE) overall = hashlib.sha512() for f in files: + blob = blob_by_name[f] + # request blob + p.stdin.write(blob + b'\n') + p.stdin.flush() + # read header: blob, "blob", size + reply = p.stdout.readline().split() + assert(reply[0] == blob and reply[1] == b'blob') + size = int(reply[2]) + # hash the blob data intern = hashlib.sha512() - fi = open(f, 'rb') - while True: - piece = fi.read(65536) - if piece: + ptr = 0 + while ptr < size: + bs = min(65536, size - ptr) + piece = p.stdout.read(bs) + if len(piece) == bs: intern.update(piece) else: - break - fi.close() + raise IOError('Premature EOF reading git cat-file output') + ptr += bs dig = intern.hexdigest() + assert(p.stdout.read(1) == b'\n') # ignore LF that follows blob data + # update overall hash with file hash overall.update(dig.encode("utf-8")) overall.update(" ".encode("utf-8")) overall.update(f) overall.update("\n".encode("utf-8")) + p.stdin.close() + if p.wait(): + raise IOError('Non-zero return value executing git cat-file') return overall.hexdigest() -- cgit v1.2.3