diff options
author | Wladimir J. van der Laan <laanwj@gmail.com> | 2014-08-28 13:09:19 +0200 |
---|---|---|
committer | Wladimir J. van der Laan <laanwj@gmail.com> | 2014-09-01 10:06:15 +0200 |
commit | da59f283350343a623820fa9ea48dd1ebb817064 (patch) | |
tree | 802007dbd23dfc8151228ae5f6698c5ad03d668c | |
parent | 93f97aab629d6d3b7e2c296b24fc37eef9502cd1 (diff) |
Add deeper XML checking to update-translation script
- Catch problems such as mismatched format specifiers. Remove
messages that can cause problems at runtime.
- Also remove unfinished/untranslated messages; they just take up space
in the .ts files and waste parsing time.
Fixes #4774.
-rwxr-xr-x | contrib/devtools/update-translations.py | 148 |
1 file changed, 134 insertions, 14 deletions
diff --git a/contrib/devtools/update-translations.py b/contrib/devtools/update-translations.py index 1950a42678..0be632069a 100755 --- a/contrib/devtools/update-translations.py +++ b/contrib/devtools/update-translations.py @@ -14,13 +14,14 @@ It will do the following automatically: TODO: - auto-add new translations to the build system according to the translation process -- remove 'unfinished' translation items ''' from __future__ import division, print_function import subprocess import re import sys import os +import io +import xml.etree.ElementTree as ET # Name of transifex tool TX = 'tx' @@ -40,24 +41,143 @@ def fetch_all_translations(): print('Error while fetching translations', file=sys.stderr) exit(1) -def postprocess_translations(): - print('Postprocessing...') +def find_format_specifiers(s): + '''Find all format specifiers in a string.''' + pos = 0 + specifiers = [] + while True: + percent = s.find('%', pos) + if percent < 0: + break + specifiers.append(s[percent+1]) + pos = percent+2 + return specifiers + +def split_format_specifiers(specifiers): + '''Split format specifiers between numeric (Qt) and others (strprintf)''' + numeric = [] + other = [] + for s in specifiers: + if s in {'1','2','3','4','5','6','7','8','9'}: + numeric.append(s) + else: + other.append(s) + + # numeric (Qt) can be present in any order, others (strprintf) must be in specified order + return set(numeric),other + +def sanitize_string(s): + '''Sanitize string for printing''' + return s.replace('\n',' ') + +def check_format_specifiers(source, translation, errors): + source_f = split_format_specifiers(find_format_specifiers(source)) + # assert that no source messages contain both Qt and strprintf format specifiers + # if this fails, go change the source as this is hacky and confusing! 
+ assert(not(source_f[0] and source_f[1])) + try: + translation_f = split_format_specifiers(find_format_specifiers(translation)) + except IndexError: + errors.append("Parse error in translation '%s'" % sanitize_string(translation)) + return False + else: + if source_f != translation_f: + errors.append("Mismatch between '%s' and '%s'" % (sanitize_string(source), sanitize_string(translation))) + return False + return True + +def all_ts_files(suffix=''): for filename in os.listdir(LOCALE_DIR): # process only language files, and do not process source language - if not filename.endswith('.ts') or filename == SOURCE_LANG: + if not filename.endswith('.ts'+suffix) or filename == SOURCE_LANG+suffix: continue + if suffix: # remove provided suffix + filename = filename[0:-len(suffix)] filepath = os.path.join(LOCALE_DIR, filename) - with open(filepath, 'rb') as f: + yield(filename, filepath) + +FIX_RE = re.compile(b'[\x00-\x09\x0b\x0c\x0e-\x1f]') +def remove_invalid_characters(s): + '''Remove invalid characters from translation string''' + return FIX_RE.sub(b'', s) + +# Override cdata escape function to make our output match Qt's (optional, just for cleaner diffs for +# comparison, disable by default) +_orig_escape_cdata = None +def escape_cdata(text): + text = _orig_escape_cdata(text) + text = text.replace("'", '&apos;') + text = text.replace('"', '&quot;') + return text + +def postprocess_translations(reduce_diff_hacks=False): + print('Checking and postprocessing...') + + if reduce_diff_hacks: + global _orig_escape_cdata + _orig_escape_cdata = ET._escape_cdata + ET._escape_cdata = escape_cdata + + for (filename,filepath) in all_ts_files(): + os.rename(filepath, filepath+'.orig') + + have_errors = False + for (filename,filepath) in all_ts_files('.orig'): + # pre-fixups to cope with transifex output + parser = ET.XMLParser(encoding='utf-8') # need to override encoding because 'utf8' is not understood only 'utf-8' + with open(filepath + '.orig', 'rb') as f: data = f.read() - # remove 
non-allowed control characters - data = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', data) - data = data.split('\n') - # strip locations from non-origin translation - # location tags are used to guide translators, they are not necessary for compilation - # TODO: actually process XML instead of relying on Transifex's one-tag-per-line output format - data = [line for line in data if not '<location' in line] - with open(filepath, 'wb') as f: - f.write('\n'.join(data)) + # remove control characters; this must be done over the entire file otherwise the XML parser will fail + data = remove_invalid_characters(data) + tree = ET.parse(io.BytesIO(data), parser=parser) + + # iterate over all messages in file + root = tree.getroot() + for context in root.findall('context'): + for message in context.findall('message'): + numerus = message.get('numerus') == 'yes' + source = message.find('source').text + translation_node = message.find('translation') + # pick all numerusforms + if numerus: + translations = [i.text for i in translation_node.findall('numerusform')] + else: + translations = [translation_node.text] + + for translation in translations: + if translation is None: + continue + errors = [] + valid = check_format_specifiers(source, translation, errors) + + for error in errors: + print('%s: %s' % (filename, error)) + + if not valid: # set type to unfinished and clear string if invalid + translation_node.clear() + translation_node.set('type', 'unfinished') + have_errors = True + + # Remove location tags + for location in message.findall('location'): + message.remove(location) + + # Remove entire message if it is an unfinished translation + if translation_node.get('type') == 'unfinished': + context.remove(message) + + # write fixed-up tree + # if diff reduction requested, replace some XML to 'sanitize' to qt formatting + if reduce_diff_hacks: + out = io.BytesIO() + tree.write(out, encoding='utf-8') + out = out.getvalue() + out = out.replace(b' />', b'/>') + with open(filepath, 
'wb') as f: + f.write(out) + else: + tree.write(filepath, encoding='utf-8') + return have_errors if __name__ == '__main__': check_at_repository_root() |