aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWladimir J. van der Laan <laanwj@gmail.com>2014-08-28 13:09:19 +0200
committerWladimir J. van der Laan <laanwj@gmail.com>2014-09-01 10:06:15 +0200
commitda59f283350343a623820fa9ea48dd1ebb817064 (patch)
tree802007dbd23dfc8151228ae5f6698c5ad03d668c
parent93f97aab629d6d3b7e2c296b24fc37eef9502cd1 (diff)
Add deeper XML checking to update-translation script
- Catch problems such as mismatched format specifiers. Remove messages that can cause problems at runtime. - Also remove unfinished/untranslated messages; they just take up space in the .ts files and waste parsing time. Fixes #4774.
-rwxr-xr-xcontrib/devtools/update-translations.py148
1 files changed, 134 insertions, 14 deletions
diff --git a/contrib/devtools/update-translations.py b/contrib/devtools/update-translations.py
index 1950a42678..0be632069a 100755
--- a/contrib/devtools/update-translations.py
+++ b/contrib/devtools/update-translations.py
@@ -14,13 +14,14 @@ It will do the following automatically:
TODO:
- auto-add new translations to the build system according to the translation process
-- remove 'unfinished' translation items
'''
from __future__ import division, print_function
import subprocess
import re
import sys
import os
+import io
+import xml.etree.ElementTree as ET
# Name of transifex tool
TX = 'tx'
@@ -40,24 +41,143 @@ def fetch_all_translations():
print('Error while fetching translations', file=sys.stderr)
exit(1)
-def postprocess_translations():
- print('Postprocessing...')
+def find_format_specifiers(s):
+    '''Return the character that follows each '%' in s, in order of appearance.
+
+    A '%' that is the very last character of s has nothing after it, so the
+    lookup raises IndexError; callers use that to detect malformed strings.
+    '''
+    found = []
+    percent = s.find('%')
+    while percent >= 0:
+        found.append(s[percent + 1])
+        # resume after the specifier character, so '%%' is consumed as one specifier
+        percent = s.find('%', percent + 2)
+    return found
+
+def split_format_specifiers(specifiers):
+    '''Split format specifiers between numeric (Qt) and others (strprintf)
+
+    Returns a (numeric, other) pair: numeric is a set because Qt placeholders
+    may appear in any order, other is a list because strprintf specifiers
+    must keep the order in which they were specified.
+    '''
+    qt_digits = set('123456789')
+    numeric = set()
+    other = []
+    for spec in specifiers:
+        if spec in qt_digits:
+            numeric.add(spec)
+        else:
+            other.append(spec)
+    return numeric, other
+
+def sanitize_string(s):
+    '''Return s with every newline turned into a space, so it prints on one line'''
+    return ' '.join(s.split('\n'))
+
+def check_format_specifiers(source, translation, errors):
+    '''Check that translation uses the same format specifiers as source.
+
+    Returns True when they match. Otherwise a description of the problem is
+    appended to the errors list and False is returned. Only the translation
+    is parsed inside the try block, so a malformed source string is not
+    reported through errors.
+    '''
+    source_numeric, source_other = split_format_specifiers(find_format_specifiers(source))
+    # assert that no source messages contain both Qt and strprintf format specifiers
+    # if this fails, go change the source as this is hacky and confusing!
+    assert not (source_numeric and source_other)
+
+    try:
+        translation_f = split_format_specifiers(find_format_specifiers(translation))
+    except IndexError:
+        # the translation ends in a '%' with nothing after it
+        errors.append("Parse error in translation '%s'" % sanitize_string(translation))
+        return False
+
+    if (source_numeric, source_other) != translation_f:
+        errors.append("Mismatch between '%s' and '%s'" % (sanitize_string(source), sanitize_string(translation)))
+        return False
+    return True
+
+def all_ts_files(suffix=''):
+ '''Yield (filename, filepath) for every entry of LOCALE_DIR whose name ends in '.ts'+suffix.
+
+ The entry named SOURCE_LANG+suffix is skipped. When a suffix is given it is
+ stripped again, so both yielded values refer to the plain '.ts' name.
+ '''
for filename in os.listdir(LOCALE_DIR):
# process only language files, and do not process source language
- if not filename.endswith('.ts') or filename == SOURCE_LANG:
+ if not filename.endswith('.ts'+suffix) or filename == SOURCE_LANG+suffix:
continue
+ if suffix: # remove provided suffix
+ filename = filename[0:-len(suffix)]
filepath = os.path.join(LOCALE_DIR, filename)
- with open(filepath, 'rb') as f:
+ yield(filename, filepath)
+
+# Matches every byte in 0x00-0x1f except LF (0x0a) and CR (0x0d); note that TAB (0x09) is matched too
+FIX_RE = re.compile(b'[\x00-\x09\x0b\x0c\x0e-\x1f]')
+def remove_invalid_characters(s):
+ '''Remove invalid characters from translation string'''
+ # FIX_RE is a bytes pattern, so s is expected to be a byte string
+ return FIX_RE.sub(b'', s)
+
+# Override cdata escape function to make our output match Qt's (optional, just for cleaner diffs for
+# comparison, disabled by default)
+_orig_escape_cdata = None
+def escape_cdata(text, *args):
+    '''Escape character data as ElementTree does, then also escape quotes like Qt.
+
+    Any extra positional arguments are forwarded untouched to the saved
+    original escape function: Python 2's ElementTree calls its escape hook as
+    (text, encoding) whereas Python 3's passes text only, and accepting both
+    keeps the override usable on either.
+
+    _orig_escape_cdata must have been set to the original function before
+    this is called.
+    '''
+    text = _orig_escape_cdata(text, *args)
+    text = text.replace("'", '&apos;')
+    text = text.replace('"', '&quot;')
+    return text
+
+def postprocess_translations(reduce_diff_hacks=False):
+ '''Check and clean up every translation file yielded by all_ts_files().
+
+ Each file is renamed to <name>.orig, parsed as XML, and written back under
+ its original name with location tags removed, messages whose translation
+ fails check_format_specifiers() removed, and unfinished messages removed.
+
+ If reduce_diff_hacks is true, ElementTree's escape function is replaced by
+ escape_cdata and ' />' is rewritten to '/>' in the output.
+
+ Returns True if any translation failed the format specifier check.
+ '''
+ print('Checking and postprocessing...')
+
+ # swap in our escape function, saving the original so escape_cdata can chain to it
+ if reduce_diff_hacks:
+ global _orig_escape_cdata
+ _orig_escape_cdata = ET._escape_cdata
+ ET._escape_cdata = escape_cdata
+
+ # keep the fetched files as <name>.orig; the cleaned-up result is written to <name> below
+ for (filename,filepath) in all_ts_files():
+ os.rename(filepath, filepath+'.orig')
+
+ have_errors = False
+ # all_ts_files strips the suffix again, so filename/filepath name the output file
+ for (filename,filepath) in all_ts_files('.orig'):
+ # pre-fixups to cope with transifex output
+ parser = ET.XMLParser(encoding='utf-8') # the encoding must be overridden: 'utf8' is not understood, only 'utf-8'
+ with open(filepath + '.orig', 'rb') as f:
data = f.read()
- # remove non-allowed control characters
- data = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', data)
- data = data.split('\n')
- # strip locations from non-origin translation
- # location tags are used to guide translators, they are not necessary for compilation
- # TODO: actually process XML instead of relying on Transifex's one-tag-per-line output format
- data = [line for line in data if not '<location' in line]
- with open(filepath, 'wb') as f:
- f.write('\n'.join(data))
+ # remove control characters; this must be done over the entire file otherwise the XML parser will fail
+ data = remove_invalid_characters(data)
+ tree = ET.parse(io.BytesIO(data), parser=parser)
+
+ # iterate over all messages in file
+ root = tree.getroot()
+ for context in root.findall('context'):
+ for message in context.findall('message'):
+ # numerus messages keep their translated strings in <numerusform> children of <translation>
+ numerus = message.get('numerus') == 'yes'
+ source = message.find('source').text
+ translation_node = message.find('translation')
+ # pick all numerusforms
+ if numerus:
+ translations = [i.text for i in translation_node.findall('numerusform')]
+ else:
+ translations = [translation_node.text]
+
+ for translation in translations:
+ # an element without text has nothing to check
+ if translation is None:
+ continue
+ errors = []
+ valid = check_format_specifiers(source, translation, errors)
+
+ for error in errors:
+ print('%s: %s' % (filename, error))
+
+ if not valid: # set type to unfinished and clear string if invalid
+ translation_node.clear()
+ translation_node.set('type', 'unfinished')
+ have_errors = True
+
+ # Remove location tags
+ for location in message.findall('location'):
+ message.remove(location)
+
+ # Remove entire message if it is an unfinished translation
+ # (this includes messages marked unfinished by the check above)
+ if translation_node.get('type') == 'unfinished':
+ context.remove(message)
+
+ # write fixed-up tree
+ # if diff reduction requested, replace some XML to 'sanitize' to qt formatting
+ if reduce_diff_hacks:
+ out = io.BytesIO()
+ tree.write(out, encoding='utf-8')
+ out = out.getvalue()
+ out = out.replace(b' />', b'/>')
+ with open(filepath, 'wb') as f:
+ f.write(out)
+ else:
+ tree.write(filepath, encoding='utf-8')
+ return have_errors
if __name__ == '__main__':
check_at_repository_root()