5 files changed, 416 insertions, 53 deletions
diff --git a/contrib/devtools/update-translations.py b/contrib/devtools/update-translations.py
index 1950a42678..0be632069a 100755
--- a/contrib/devtools/update-translations.py
+++ b/contrib/devtools/update-translations.py
@@ -14,13 +14,14 @@ It will do the following automatically:
 
 TODO:
 - auto-add new translations to the build system according to the translation process
-- remove 'unfinished' translation items
 '''
 from __future__ import division, print_function
 import subprocess
 import re
 import sys
 import os
+import io
+import xml.etree.ElementTree as ET
 
 # Name of transifex tool
 TX = 'tx'
@@ -40,24 +41,143 @@ def fetch_all_translations():
         print('Error while fetching translations', file=sys.stderr)
         exit(1)
 
-def postprocess_translations():
-    print('Postprocessing...')
+def find_format_specifiers(s):
+    '''Return the character following each '%' in s, in order of appearance.
+
+    Raises IndexError if s ends with a lone '%'.'''
+    found = []
+    idx = s.find('%')
+    while idx >= 0:
+        # whatever follows the '%' is consumed, so '%%' counts as one specifier
+        found.append(s[idx+1])
+        idx = s.find('%', idx+2)
+    return found
+
+def split_format_specifiers(specifiers):
+    '''Partition specifiers into Qt-style digits and strprintf-style others.
+
+    Returns (set_of_digits, list_of_others): Qt placeholders may appear in any
+    order, so they are compared as a set; strprintf ones keep their order.'''
+    qt_digits = set('123456789')
+    qt_found = set()
+    printf_found = []
+    for spec in specifiers:
+        bucket_add = qt_found.add if spec in qt_digits else printf_found.append
+        bucket_add(spec)
+    return qt_found, printf_found
+
+def sanitize_string(s):
+    '''Flatten s onto one line by turning every newline into a space.'''
+    return ' '.join(s.split('\n'))
+
+def check_format_specifiers(source, translation, errors):
+    '''Return True if specifiers match; else append a message to errors and return False.'''
+    expected = split_format_specifiers(find_format_specifiers(source))
+    qt_part, printf_part = expected
+    # a source must never mix Qt and strprintf specifiers: if this fails, fix the source!
+    assert(not(qt_part and printf_part))
+    try:
+        actual = split_format_specifiers(find_format_specifiers(translation))
+    except IndexError:
+        errors.append("Parse error in translation '%s'" % sanitize_string(translation))
+        return False
+    if expected != actual:
+        errors.append("Mismatch between '%s' and '%s'" % (sanitize_string(source), sanitize_string(translation)))
+        return False
+    return True
+
+def all_ts_files(suffix=''):  # generator of (filename, filepath) for each translation file in LOCALE_DIR
     for filename in os.listdir(LOCALE_DIR):
         # process only language files, and do not process source language
-        if not filename.endswith('.ts') or filename == SOURCE_LANG: 
+        if not filename.endswith('.ts'+suffix) or filename == SOURCE_LANG+suffix:
             continue
+        if suffix: # strip the provided suffix so callers get the plain .ts name
+            filename = filename[0:-len(suffix)]
         filepath = os.path.join(LOCALE_DIR, filename)
-        with open(filepath, 'rb') as f:
+        yield(filename, filepath)  # filepath is built from the suffix-less filename
+
+FIX_RE = re.compile(b'[\x00-\x09\x0b\x0c\x0e-\x1f]')
+def remove_invalid_characters(s):
+    '''Drop every byte below 0x20 from s, except LF (0x0a) and CR (0x0d).'''
+    return re.sub(FIX_RE, b'', s)
+
+# Replacement for ElementTree's cdata escape function that makes our output match Qt's
+# (optional, just for cleaner diffs for comparison, disabled by default)
+_orig_escape_cdata = None
+def escape_cdata(text):
+    '''Escape text with the saved original function, then escape quotes as well.'''
+    escaped = _orig_escape_cdata(text)
+    # both quote characters become entities
+    return escaped.replace("'", '&apos;').replace('"', '&quot;')
+
+def postprocess_translations(reduce_diff_hacks=False):  # returns True if any translation failed the format-specifier check
+    print('Checking and postprocessing...')
+
+    if reduce_diff_hacks:
+        global _orig_escape_cdata
+        _orig_escape_cdata = ET._escape_cdata  # keep the stock escaper so escape_cdata can delegate to it
+        ET._escape_cdata = escape_cdata  # module-level monkey patch, never restored here
+
+    for (filename,filepath) in all_ts_files():
+        os.rename(filepath, filepath+'.orig')  # originals are kept as <name>.orig and re-read below
+
+    have_errors = False
+    for (filename,filepath) in all_ts_files('.orig'):  # filepath comes back without the '.orig' suffix
+        # pre-fixups to cope with transifex output
+        parser = ET.XMLParser(encoding='utf-8') # need to override encoding because 'utf8' is not understood only 'utf-8'
+        with open(filepath + '.orig', 'rb') as f:
             data = f.read()
-        # remove non-allowed control characters
-        data = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', data)
-        data = data.split('\n')
-        # strip locations from non-origin translation
-        # location tags are used to guide translators, they are not necessary for compilation
-        # TODO: actually process XML instead of relying on Transifex's one-tag-per-line output format
-        data = [line for line in data if not '<location' in line]
-        with open(filepath, 'wb') as f:
-            f.write('\n'.join(data))
+        # remove control characters; this must be done over the entire file otherwise the XML parser will fail
+        data = remove_invalid_characters(data)
+        tree = ET.parse(io.BytesIO(data), parser=parser)
+
+        # iterate over all messages in file
+        root = tree.getroot()
+        for context in root.findall('context'):
+            for message in context.findall('message'):
+                numerus = message.get('numerus') == 'yes'
+                source = message.find('source').text
+                translation_node = message.find('translation')
+                # pick all numerusforms
+                if numerus:
+                    translations = [i.text for i in translation_node.findall('numerusform')]
+                else:
+                    translations = [translation_node.text]
+
+                for translation in translations:
+                    if translation is None:  # element without text: nothing to check
+                        continue
+                    errors = []
+                    valid = check_format_specifiers(source, translation, errors)
+
+                    for error in errors:
+                        print('%s: %s' % (filename, error))
+
+                    if not valid: # set type to unfinished and clear string if invalid
+                        translation_node.clear()  # drops text, attributes and any numerusform children
+                        translation_node.set('type', 'unfinished')
+                        have_errors = True
+
+                # Remove location tags
+                for location in message.findall('location'):
+                    message.remove(location)
+
+                # Remove entire message if it is an unfinished translation
+                if translation_node.get('type') == 'unfinished':
+                    context.remove(message)
+
+        # write fixed-up tree
+        # if diff reduction requested, replace some XML to 'sanitize' to qt formatting
+        if reduce_diff_hacks:
+            out = io.BytesIO()
+            tree.write(out, encoding='utf-8')
+            out = out.getvalue()
+            out = out.replace(b' />', b'/>')  # self-closing tags without the space before '/>'
+            with open(filepath, 'wb') as f:
+                f.write(out)
+        else:
+            tree.write(filepath, encoding='utf-8')
+    return have_errors
 
 if __name__ == '__main__':
     check_at_repository_root()
diff --git a/contrib/linearize/README.md b/contrib/linearize/README.md
index 70b9f034cd..157586e4d4 100644
--- a/contrib/linearize/README.md
+++ b/contrib/linearize/README.md
@@ -1,2 +1,33 @@
-### Linearize ###
-Construct a linear, no-fork, best version of the blockchain.
-\ No newline at end of file
+# Linearize
+Construct a linear, no-fork, best version of the blockchain.
+
+## Step 1: Download hash list
+
+    $ ./linearize-hashes.py linearize.cfg > hashlist.txt
+
+Required configuration file settings for linearize-hashes:
+* RPC: rpcuser, rpcpassword
+
+Optional config file settings for linearize-hashes:
+* RPC: host, port
+* Block chain: min_height, max_height
+
+## Step 2: Copy local block data
+
+    $ ./linearize-data.py linearize.cfg
+
+Required configuration file settings for linearize-data:
+* "input": bitcoind blocks/ directory containing blkNNNNN.dat
+* "hashlist": text file containing the list of block hashes produced by
+linearize-hashes.py.
+* "output_file": bootstrap.dat
+      or
+* "output": output directory for linearized blocks/blkNNNNN.dat output
+
+Optional config file settings for linearize-data:
+* "netmagic": network magic number
+* "max_out_sz": maximum output file size (default 1000*1000*1000)
+* "split_timestamp": Split files when a new month is first seen, in addition to
+reaching a maximum file size.
+* "file_timestamp": Set each file's last-modified time to that of the
+most recent block in that file.
diff --git a/contrib/linearize/example-linearize.cfg b/contrib/linearize/example-linearize.cfg
index f5cdab5325..071345f23a 100644
--- a/contrib/linearize/example-linearize.cfg
+++ b/contrib/linearize/example-linearize.cfg
@@ -1,12 +1,17 @@
 
-# bitcoind RPC settings
+# bitcoind RPC settings (linearize-hashes)
 rpcuser=someuser
 rpcpassword=somepassword
 host=127.0.0.1
 port=8332
 
-# bootstrap.dat settings
+# bootstrap.dat hashlist settings (linearize-hashes)
+max_height=313000
+
+# bootstrap.dat input/output settings (linearize-data)
 netmagic=f9beb4d9
-max_height=279000
-output=bootstrap.dat
+input=/home/example/.bitcoin/blocks
+output_file=/home/example/Downloads/bootstrap.dat
+hashlist=hashlist.txt
+split_timestamp=1
 
diff --git a/contrib/linearize/linearize-data.py b/contrib/linearize/linearize-data.py
new file mode 100755
index 0000000000..383bb38198
--- /dev/null
+++ b/contrib/linearize/linearize-data.py
@@ -0,0 +1,233 @@
+#!/usr/bin/python
+#
+# linearize-data.py: Construct a linear, no-fork version of the chain.
+#
+# Copyright (c) 2013 The Bitcoin developers
+# Distributed under the MIT/X11 software license, see the accompanying
+# file COPYING or http://www.opensource.org/licenses/mit-license.php.
+#
+
+import json
+import struct
+import re
+import os
+import base64
+import httplib
+import sys
+import hashlib
+import datetime
+import time
+
+settings = {}
+
+
+def uint32(x):  # truncate x to its low 32 bits (no 'L' suffix: ints promote to long on their own)
+	return x & 0xffffffff
+
+def bytereverse(x):  # reverse the byte order of the 32-bit word x
+	swapped = (x << 24) | ((x << 8) & 0x00ff0000) | ((x >> 8) & 0x0000ff00) | (x >> 24)
+	return uint32(swapped)
+
+def bufreverse(in_buf):
+	'''Byte-swap every 4-byte word of in_buf, keeping the words in place.'''
+	swapped = [
+		struct.pack('@I', bytereverse(struct.unpack('@I', in_buf[off:off+4])[0]))
+		for off in range(0, len(in_buf), 4)]
+	return ''.join(swapped)
+
+def wordreverse(in_buf):
+	'''Reverse the order of the 4-byte words in in_buf.
+
+	The bytes inside each word keep their order.'''
+	chunks = [in_buf[off:off+4] for off in range(0, len(in_buf), 4)]
+	return ''.join(reversed(chunks))
+
+def calc_hdr_hash(blk_hdr):
+	'''Return the double SHA-256 digest of blk_hdr.
+
+	blk_hdr is hashed with SHA-256 and the resulting digest is hashed
+	with SHA-256 once more; the second digest is returned as the raw
+	byte string produced by digest(), not as hex.
+	'''
+	first_pass = hashlib.sha256(blk_hdr).digest()
+	second_pass = hashlib.sha256(first_pass).digest()
+	return second_pass
+
+def calc_hash_str(blk_hdr):
+	'''Return the hash of blk_hdr as a hex string: the double SHA-256
+	digest, byte-swapped within each word and then word-reversed.'''
+	digest = calc_hdr_hash(blk_hdr)
+	reordered = wordreverse(bufreverse(digest))
+	return reordered.encode('hex')
+
+def get_blk_dt(blk_hdr):
+	'''Return (first day of the block's month, nTime) for blk_hdr, where nTime is
+	the little-endian uint32 at bytes 68..71, interpreted in local time.'''
+	(nTime,) = struct.unpack("<I", blk_hdr[68:68+4])
+	stamp = datetime.datetime.fromtimestamp(nTime)
+	return (datetime.datetime(stamp.year, stamp.month, 1), nTime)
+
+def get_block_hashes(settings):
+	'''Read the hash list named by settings['hashlist'], one hash per line.
+
+	Trailing whitespace is stripped from every line; the list is returned
+	in file order after a summary line is printed.'''
+	with open(settings['hashlist'], "r") as f:  # closed even if reading fails
+		blkindex = [line.rstrip() for line in f]
+
+	print("Read " + str(len(blkindex)) + " hashes")
+	return blkindex
+
+def mkblockset(blkindex):
+	'''Return a dict that maps each hash in blkindex to True.
+
+	Duplicate hashes collapse into a single key.'''
+	return dict.fromkeys(blkindex, True)
+
+def copydata(settings, blkindex, blkset):
+	'''Copy the blocks listed in blkindex, in that order, from the blkNNNNN.dat
+	files in settings['input'] to settings['output_file'] (one file) or to
+	size/month-split blkNNNNN.dat files in settings['output'].  Stops when all
+	listed blocks are written or input runs out; an out-of-order block exits
+	with status 1.'''
+	inF = outF = outFname = None
+	inFn = outFn = outsz = blkCount = 0
+
+	lastDate = datetime.datetime(2000, 1, 1)
+	highTS = 1408893517 - 315360000
+	fileOutput = 'output' not in settings
+	setFileTime = settings['file_timestamp'] != 0
+	timestampSplit = settings['split_timestamp'] != 0
+	maxOutSz = settings['max_out_sz']
+
+	def close_output():
+		# close the open output file; optionally stamp it with the highest block time
+		outF.close()
+		if setFileTime:
+			os.utime(outFname, (int(time.time()), highTS))
+
+	# stop once everything listed is copied instead of indexing past blkindex
+	while blkCount < len(blkindex):
+		if not inF:
+			fname = "%s/blk%05d.dat" % (settings['input'], inFn)
+			if not os.path.isfile(fname):
+				print("Input file " + fname + " not found, stopping")
+				break
+			print("Input file " + fname)
+			inF = open(fname, "rb")
+
+		inhdr = inF.read(8)
+		# a short read or zero padding marks the end of this input file
+		if (len(inhdr) < 8 or (inhdr[0] == "\0")):
+			inF.close()
+			inF = None
+			inFn = inFn + 1
+			continue
+
+		inMagic = inhdr[:4]
+		if (inMagic != settings['netmagic']):
+			print("Invalid magic: " + repr(inMagic))
+			break
+		inLen = struct.unpack("<I", inhdr[4:])[0]
+		rawblock = inF.read(inLen)
+		blk_hdr = rawblock[:80]
+
+		hash_str = calc_hash_str(blk_hdr)
+		if not hash_str in blkset:
+			print("Skipping unknown block " + hash_str)
+			continue
+
+		if blkindex[blkCount] != hash_str:
+			print("Out of order block.")
+			print("Expected " + blkindex[blkCount])
+			print("Got " + hash_str)
+			sys.exit(1)
+
+		(blkDate, blkTS) = get_blk_dt(blk_hdr)
+		newMonth = timestampSplit and (blkDate > lastDate)
+		if newMonth:
+			print("New month " + blkDate.strftime("%Y-%m") + " @ " + hash_str)
+			lastDate = blkDate
+		tooBig = (outsz + inLen) > maxOutSz
+		# only directory output is split (reopening output_file would truncate it)
+		if outF and not fileOutput and (tooBig or newMonth):
+			close_output()
+			outF = None
+			outFname = None
+			outFn = outFn + 1
+			outsz = 0
+
+		if not outF:
+			if fileOutput:
+				outFname = settings['output_file']
+			else:
+				outFname = "%s/blk%05d.dat" % (settings['output'], outFn)
+			print("Output file " + outFname)
+			outF = open(outFname, "wb")
+
+		outF.write(inhdr)
+		outF.write(rawblock)
+		outsz = outsz + inLen + 8
+
+		blkCount = blkCount + 1
+		if blkTS > highTS:
+			highTS = blkTS
+
+		if (blkCount % 1000) == 0:
+			print("Wrote " + str(blkCount) + " blocks")
+
+	if inF:
+		inF.close()
+	if outF:
+		close_output()
+
+if __name__ == '__main__':
+	if len(sys.argv) != 2:
+		print("Usage: linearize-data.py CONFIG-FILE")
+		sys.exit(1)
+
+	# load key=value settings from the config file; the with-block
+	# closes the file even if parsing fails
+	with open(sys.argv[1]) as f:
+		for line in f:
+			# skip comment lines
+			if re.search(r'^\s*#', line):
+				continue
+
+			# parse key=value lines, ignoring anything else
+			m = re.search(r'^(\w+)\s*=\s*(\S.*)$', line)
+			if m is None:
+				continue
+			settings[m.group(1)] = m.group(2)
+
+	# defaults for every optional setting (setdefault leaves values
+	# that came from the config file untouched)
+	settings.setdefault('netmagic', 'f9beb4d9')
+	settings.setdefault('input', 'input')
+	settings.setdefault('hashlist', 'hashlist.txt')
+	settings.setdefault('file_timestamp', 0)
+	settings.setdefault('split_timestamp', 0)
+	settings.setdefault('max_out_sz', 1000 * 1000 * 1000)
+
+	# values read from the file are strings: convert them to the types
+	# copydata() works with (int() yields a long by itself when needed)
+	settings['max_out_sz'] = int(settings['max_out_sz'])
+	settings['split_timestamp'] = int(settings['split_timestamp'])
+	settings['file_timestamp'] = int(settings['file_timestamp'])
+	settings['netmagic'] = settings['netmagic'].decode('hex')
+
+	if 'output_file' not in settings and 'output' not in settings:
+		print("Missing output file / directory")
+		sys.exit(1)
+
+	blkindex = get_block_hashes(settings)
+	blkset = mkblockset(blkindex)
+
+	# refuse to run unless the hash list contains the hard-coded hash below
+	# (the Bitcoin mainnet genesis block)
+	if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkset:
+		print("not found")
+	else:
+		copydata(settings, blkindex, blkset)
+
+
diff --git a/contrib/linearize/linearize.py b/contrib/linearize/linearize-hashes.py
index 650f7d3684..791b71bc33 100644..100755
--- a/contrib/linearize/linearize.py
+++ b/contrib/linearize/linearize-hashes.py
@@ -1,7 +1,6 @@
 #!/usr/bin/python
 #
-# linearize.py:  Construct a linear, no-fork, best version of the blockchain.
-#
+# linearize-hashes.py:  List blocks in a linear, no-fork version of the chain.
 #
 # Copyright (c) 2013 The Bitcoin developers
 # Distributed under the MIT/X11 software license, see the accompanying
@@ -15,9 +14,6 @@ import base64
 import httplib
 import sys
 
-ERR_SLEEP = 15
-MAX_NONCE = 1000000L
-
 settings = {}
 
 class BitcoinRPC:
@@ -62,34 +58,18 @@ class BitcoinRPC:
 	def getblockhash(self, index):
 		return self.rpc('getblockhash', [index])
 
-def getblock(rpc, settings, n):
-	hash = rpc.getblockhash(n)
-	hexdata = rpc.getblock(hash, False)
-	data = hexdata.decode('hex')
-
-	return data
-
-def get_blocks(settings):
+def get_block_hashes(settings):  # print the hash of every block from min_height to max_height, one per line
 	rpc = BitcoinRPC(settings['host'], settings['port'],
 			 settings['rpcuser'], settings['rpcpassword'])
 
-	outf = open(settings['output'], 'ab')
-
 	for height in xrange(settings['min_height'], settings['max_height']+1):
-		data = getblock(rpc, settings, height)
-
-		outhdr = settings['netmagic']
-		outhdr += struct.pack("<i", len(data))
+		hash = rpc.getblockhash(height)  # one RPC round trip per height
 
-		outf.write(outhdr)
-		outf.write(data)
-
-		if (height % 1000) == 0:
-			sys.stdout.write("Wrote block " + str(height) + "\n")
+		print(hash)  # stdout is the hash list (the README redirects it to hashlist.txt)
 
 if __name__ == '__main__':
 	if len(sys.argv) != 2:
-		print "Usage: linearize.py CONFIG-FILE"
+		print "Usage: linearize-hashes.py CONFIG-FILE"
 		sys.exit(1)
 
 	f = open(sys.argv[1])
@@ -106,10 +86,6 @@ if __name__ == '__main__':
 		settings[m.group(1)] = m.group(2)
 	f.close()
 
-	if 'netmagic' not in settings:
-		settings['netmagic'] = 'f9beb4d9'
-	if 'output' not in settings:
-		settings['output'] = 'bootstrap.dat'
 	if 'host' not in settings:
 		settings['host'] = '127.0.0.1'
 	if 'port' not in settings:
@@ -117,16 +93,14 @@ if __name__ == '__main__':
 	if 'min_height' not in settings:
 		settings['min_height'] = 0
 	if 'max_height' not in settings:
-		settings['max_height'] = 279000
+		settings['max_height'] = 313000
 	if 'rpcuser' not in settings or 'rpcpassword' not in settings:
 		print "Missing username and/or password in cfg file"
 		sys.exit(1)
 
-	settings['netmagic'] = settings['netmagic'].decode('hex')
 	settings['port'] = int(settings['port'])
 	settings['min_height'] = int(settings['min_height'])
 	settings['max_height'] = int(settings['max_height'])
 
-	get_blocks(settings)
-
+	get_block_hashes(settings)