diff options
author | Wladimir J. van der Laan <laanwj@gmail.com> | 2016-06-16 12:08:18 +0200 |
---|---|---|
committer | Wladimir J. van der Laan <laanwj@gmail.com> | 2016-06-16 12:08:35 +0200 |
commit | 9c3d0fab3623ad6ca6c150ad9ebb5c5c6d4bcec3 (patch) | |
tree | 018c458a231ed472877151a532f34bf5d9d26350 | |
parent | 3f89a534acfe91896555439ed3a754cfd430d202 (diff) | |
parent | 7982fce64c61c53d1e2e8cc6a8724f878701796d (diff) |
Merge #7892: Add full UTF-8 support to RPC
7982fce doc: Mention full UTF-8 support in release notes (Wladimir J. van der Laan)
6bbb4ef test: test utf-8 for labels in wallet (Wladimir J. van der Laan)
a406fcb test: add ensure_ascii setting to AuthServiceProxy (Wladimir J. van der Laan)
60ab9b2 Squashed 'src/univalue/' changes from 2740c4f..f32df99 (Wladimir J. van der Laan)
-rw-r--r-- | doc/release-notes.md | 5 | ||||
-rw-r--r-- | qa/rpc-tests/test_framework/authproxy.py | 16 | ||||
-rwxr-xr-x | qa/rpc-tests/wallet.py | 14 | ||||
-rw-r--r-- | src/univalue/Makefile.am | 9 | ||||
-rw-r--r-- | src/univalue/configure.ac | 6 | ||||
-rw-r--r-- | src/univalue/lib/univalue_read.cpp | 37 | ||||
-rw-r--r-- | src/univalue/lib/univalue_utffilter.h | 119 | ||||
-rw-r--r-- | src/univalue/lib/univalue_write.cpp | 11 | ||||
-rw-r--r-- | src/univalue/test/fail38.json | 1 | ||||
-rw-r--r-- | src/univalue/test/fail39.json | 1 | ||||
-rw-r--r-- | src/univalue/test/fail40.json | 1 | ||||
-rw-r--r-- | src/univalue/test/fail41.json | 1 | ||||
-rw-r--r-- | src/univalue/test/round2.json | 1 | ||||
-rw-r--r-- | src/univalue/test/unitester.cpp | 31 |
14 files changed, 209 insertions, 44 deletions
diff --git a/doc/release-notes.md b/doc/release-notes.md index 491c82b693..6cc05989db 100644 --- a/doc/release-notes.md +++ b/doc/release-notes.md @@ -43,6 +43,11 @@ RPC low-level changes 32-bit and 64-bit platforms, and the txids were missing in the hashed data. This has been fixed, but this means that the output will be different than from previous versions. +- Full UTF-8 support in the RPC API. Non-ASCII characters in, for example, + wallet labels have always been malformed because they weren't taken into account + properly in JSON RPC processing. This is no longer the case. This also affects + the GUI debug console. + C++11 and Python 3 ------------------- diff --git a/qa/rpc-tests/test_framework/authproxy.py b/qa/rpc-tests/test_framework/authproxy.py index 95b2be658c..d095a56ce7 100644 --- a/qa/rpc-tests/test_framework/authproxy.py +++ b/qa/rpc-tests/test_framework/authproxy.py @@ -67,9 +67,11 @@ def EncodeDecimal(o): class AuthServiceProxy(object): __id_count = 0 - def __init__(self, service_url, service_name=None, timeout=HTTP_TIMEOUT, connection=None): + # ensure_ascii: escape unicode as \uXXXX, passed to json.dumps + def __init__(self, service_url, service_name=None, timeout=HTTP_TIMEOUT, connection=None, ensure_ascii=True): self.__service_url = service_url self._service_name = service_name + self.ensure_ascii = ensure_ascii # can be toggled on the fly by tests self.__url = urlparse.urlparse(service_url) if self.__url.port is None: port = 80 @@ -134,12 +136,12 @@ class AuthServiceProxy(object): AuthServiceProxy.__id_count += 1 log.debug("-%s-> %s %s"%(AuthServiceProxy.__id_count, self._service_name, - json.dumps(args, default=EncodeDecimal))) + json.dumps(args, default=EncodeDecimal, ensure_ascii=self.ensure_ascii))) postdata = json.dumps({'version': '1.1', 'method': self._service_name, 'params': args, - 'id': AuthServiceProxy.__id_count}, default=EncodeDecimal) - response = self._request('POST', self.__url.path, postdata) + 'id': AuthServiceProxy.__id_count}, default=EncodeDecimal, ensure_ascii=self.ensure_ascii) + response = self._request('POST', self.__url.path, postdata.encode('utf-8')) if response['error'] is not None: raise JSONRPCException(response['error']) elif 'result' not in response: @@ -149,9 +151,9 @@ class AuthServiceProxy(object): return response['result'] def _batch(self, rpc_call_list): - postdata = json.dumps(list(rpc_call_list), default=EncodeDecimal) + postdata = json.dumps(list(rpc_call_list), default=EncodeDecimal, ensure_ascii=self.ensure_ascii) log.debug("--> "+postdata) - return self._request('POST', self.__url.path, postdata) + return self._request('POST', self.__url.path, postdata.encode('utf-8')) def _get_response(self): http_response = self.__conn.getresponse() @@ -167,7 +169,7 @@ class AuthServiceProxy(object): responsedata = http_response.read().decode('utf8') response = json.loads(responsedata, parse_float=decimal.Decimal) if "error" in response and response["error"] is None: - log.debug("<-%s- %s"%(response["id"], json.dumps(response["result"], default=EncodeDecimal))) + log.debug("<-%s- %s"%(response["id"], json.dumps(response["result"], default=EncodeDecimal, ensure_ascii=self.ensure_ascii))) else: log.debug("<-- "+responsedata) return response diff --git a/qa/rpc-tests/wallet.py b/qa/rpc-tests/wallet.py index 0b7eb52f07..5d96e7a6e5 100755 --- a/qa/rpc-tests/wallet.py +++ b/qa/rpc-tests/wallet.py @@ -309,6 +309,20 @@ class WalletTest (BitcoinTestFramework): balance_nodes = [self.nodes[i].getbalance() for i in range(3)] block_count = self.nodes[0].getblockcount() + # Check modes: + # - True: unicode escaped as \u.... + # - False: unicode directly as UTF-8 + for mode in [True, False]: + self.nodes[0].ensure_ascii = mode + # unicode check: Basic Multilingual Plane, Supplementary Plane respectively + for s in [u'ััะฑะฐ', u'๐
ก']: + addr = self.nodes[0].getaccountaddress(s) + label = self.nodes[0].getaccount(addr) + assert_equal(label, s) + assert(s in self.nodes[0].listaccounts().keys()) + self.nodes[0].ensure_ascii = True # restore to default + + # maintenance tests maintenance = [ '-rescan', '-reindex', diff --git a/src/univalue/Makefile.am b/src/univalue/Makefile.am index 34fe9e3f13..6c1ec81e63 100644 --- a/src/univalue/Makefile.am +++ b/src/univalue/Makefile.am @@ -3,7 +3,7 @@ ACLOCAL_AMFLAGS = -I build-aux/m4 .INTERMEDIATE: $(GENBIN) include_HEADERS = include/univalue.h -noinst_HEADERS = lib/univalue_escapes.h +noinst_HEADERS = lib/univalue_escapes.h lib/univalue_utffilter.h lib_LTLIBRARIES = libunivalue.la @@ -73,6 +73,10 @@ TEST_FILES = \ $(TEST_DATA_DIR)/fail35.json \ $(TEST_DATA_DIR)/fail36.json \ $(TEST_DATA_DIR)/fail37.json \ + $(TEST_DATA_DIR)/fail38.json \ + $(TEST_DATA_DIR)/fail39.json \ + $(TEST_DATA_DIR)/fail40.json \ + $(TEST_DATA_DIR)/fail41.json \ $(TEST_DATA_DIR)/fail3.json \ $(TEST_DATA_DIR)/fail4.json \ $(TEST_DATA_DIR)/fail5.json \ @@ -83,6 +87,7 @@ TEST_FILES = \ $(TEST_DATA_DIR)/pass1.json \ $(TEST_DATA_DIR)/pass2.json \ $(TEST_DATA_DIR)/pass3.json \ - $(TEST_DATA_DIR)/round1.json + $(TEST_DATA_DIR)/round1.json \ + $(TEST_DATA_DIR)/round2.json EXTRA_DIST=$(TEST_FILES) $(GEN_SRCS) diff --git a/src/univalue/configure.ac b/src/univalue/configure.ac index 0515b632bd..93d3ba945d 100644 --- a/src/univalue/configure.ac +++ b/src/univalue/configure.ac @@ -1,7 +1,7 @@ m4_define([libunivalue_major_version], [1]) m4_define([libunivalue_minor_version], [1]) -m4_define([libunivalue_micro_version], [1]) -m4_define([libunivalue_interface_age], [1]) +m4_define([libunivalue_micro_version], [2]) +m4_define([libunivalue_interface_age], [2]) # If you need a modifier for the version number. # Normally empty, but can be used to make "fixup" releases. m4_define([libunivalue_extraversion], []) @@ -14,7 +14,7 @@ m4_define([libunivalue_age], [m4_eval(libunivalue_binary_age - libunivalue_inter m4_define([libunivalue_version], [libunivalue_major_version().libunivalue_minor_version().libunivalue_micro_version()libunivalue_extraversion()]) -AC_INIT([univalue], [1.0.1], +AC_INIT([univalue], [1.0.2], [http://github.com/jgarzik/univalue/]) dnl make the compilation flags quiet unless V=1 is used diff --git a/src/univalue/lib/univalue_read.cpp b/src/univalue/lib/univalue_read.cpp index c7516b9628..95bac6958d 100644 --- a/src/univalue/lib/univalue_read.cpp +++ b/src/univalue/lib/univalue_read.cpp @@ -6,6 +6,7 @@ #include <vector> #include <stdio.h> #include "univalue.h" +#include "univalue_utffilter.h" using namespace std; @@ -174,41 +175,31 @@ enum jtokentype getJsonToken(string& tokenVal, unsigned int& consumed, raw++; // skip " string valStr; + JSONUTF8StringFilter writer(valStr); while (*raw) { - if (*raw < 0x20) + if ((unsigned char)*raw < 0x20) return JTOK_ERR; else if (*raw == '\\') { raw++; // skip backslash switch (*raw) { - case '"': valStr += "\""; break; - case '\\': valStr += "\\"; break; - case '/': valStr += "/"; break; - case 'b': valStr += "\b"; break; - case 'f': valStr += "\f"; break; - case 'n': valStr += "\n"; break; - case 'r': valStr += "\r"; break; - case 't': valStr += "\t"; break; + case '"': writer.push_back('\"'); break; + case '\\': writer.push_back('\\'); break; + case '/': writer.push_back('/'); break; + case 'b': writer.push_back('\b'); break; + case 'f': writer.push_back('\f'); break; + case 'n': writer.push_back('\n'); break; + case 'r': writer.push_back('\r'); break; + case 't': writer.push_back('\t'); break; case 'u': { unsigned int codepoint; if (hatoui(raw + 1, raw + 1 + 4, codepoint) != raw + 1 + 4) return JTOK_ERR; - - if (codepoint <= 0x7f) - valStr.push_back((char)codepoint); - else if (codepoint <= 0x7FF) { - valStr.push_back((char)(0xC0 | (codepoint >> 6))); - valStr.push_back((char)(0x80 | (codepoint & 0x3F))); - } else if (codepoint <= 0xFFFF) { - valStr.push_back((char)(0xE0 | (codepoint >> 12))); - valStr.push_back((char)(0x80 | ((codepoint >> 6) & 0x3F))); - valStr.push_back((char)(0x80 | (codepoint & 0x3F))); - } - + writer.push_back_u(codepoint); raw += 4; break; } @@ -226,11 +217,13 @@ enum jtokentype getJsonToken(string& tokenVal, unsigned int& consumed, } else { - valStr += *raw; + writer.push_back(*raw); raw++; } } + if (!writer.finalize()) + return JTOK_ERR; tokenVal = valStr; consumed = (raw - rawStart); return JTOK_STRING; diff --git a/src/univalue/lib/univalue_utffilter.h b/src/univalue/lib/univalue_utffilter.h new file mode 100644 index 0000000000..0e330dce9c --- /dev/null +++ b/src/univalue/lib/univalue_utffilter.h @@ -0,0 +1,119 @@ +// Copyright 2016 Wladimir J. van der Laan +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. +#ifndef UNIVALUE_UTFFILTER_H +#define UNIVALUE_UTFFILTER_H + +#include <string> + +/** + * Filter that generates and validates UTF-8, as well as collates UTF-16 + * surrogate pairs as specified in RFC4627. + */ +class JSONUTF8StringFilter +{ +public: + JSONUTF8StringFilter(std::string &s): + str(s), is_valid(true), codepoint(0), state(0), surpair(0) + { + } + // Write single 8-bit char (may be part of UTF-8 sequence) + void push_back(unsigned char ch) + { + if (state == 0) { + if (ch < 0x80) // 7-bit ASCII, fast direct pass-through + str.push_back(ch); + else if (ch < 0xc0) // Mid-sequence character, invalid in this state + is_valid = false; + else if (ch < 0xe0) { // Start of 2-byte sequence + codepoint = (ch & 0x1f) << 6; + state = 6; + } else if (ch < 0xf0) { // Start of 3-byte sequence + codepoint = (ch & 0x0f) << 12; + state = 12; + } else if (ch < 0xf8) { // Start of 4-byte sequence + codepoint = (ch & 0x07) << 18; + state = 18; + } else // Reserved, invalid + is_valid = false; + } else { + if ((ch & 0xc0) != 0x80) // Not a continuation, invalid + is_valid = false; + state -= 6; + codepoint |= (ch & 0x3f) << state; + if (state == 0) + push_back_u(codepoint); + } + } + // Write codepoint directly, possibly collating surrogate pairs + void push_back_u(unsigned int codepoint) + { + if (state) // Only accept full codepoints in open state + is_valid = false; + if (codepoint >= 0xD800 && codepoint < 0xDC00) { // First half of surrogate pair + if (surpair) // Two subsequent surrogate pair openers - fail + is_valid = false; + else + surpair = codepoint; + } else if (codepoint >= 0xDC00 && codepoint < 0xE000) { // Second half of surrogate pair + if (surpair) { // Open surrogate pair, expect second half + // Compute code point from UTF-16 surrogate pair + append_codepoint(0x10000 | ((surpair - 0xD800)<<10) | (codepoint - 0xDC00)); + surpair = 0; + } else // Second half doesn't follow a first half - fail + is_valid = false; + } else { + if (surpair) // First half of surrogate pair not followed by second - fail + is_valid = false; + else + append_codepoint(codepoint); + } + } + // Check that we're in a state where the string can be ended + // No open sequences, no open surrogate pairs, etc + bool finalize() + { + if (state || surpair) + is_valid = false; + return is_valid; + } +private: + std::string &str; + bool is_valid; + // Current UTF-8 decoding state + unsigned int codepoint; + int state; // Top bit to be filled in for next UTF-8 byte, or 0 + + // Keep track of the following state to handle the following section of + // RFC4627: + // + // To escape an extended character that is not in the Basic Multilingual + // Plane, the character is represented as a twelve-character sequence, + // encoding the UTF-16 surrogate pair. So, for example, a string + // containing only the G clef character (U+1D11E) may be represented as + // "\uD834\uDD1E". + // + // Two subsequent \u.... may have to be replaced with one actual codepoint. + unsigned int surpair; // First half of open UTF-16 surrogate pair, or 0 + + void append_codepoint(unsigned int codepoint) + { + if (codepoint <= 0x7f) + str.push_back((char)codepoint); + else if (codepoint <= 0x7FF) { + str.push_back((char)(0xC0 | (codepoint >> 6))); + str.push_back((char)(0x80 | (codepoint & 0x3F))); + } else if (codepoint <= 0xFFFF) { + str.push_back((char)(0xE0 | (codepoint >> 12))); + str.push_back((char)(0x80 | ((codepoint >> 6) & 0x3F))); + str.push_back((char)(0x80 | (codepoint & 0x3F))); + } else if (codepoint <= 0x1FFFFF) { + str.push_back((char)(0xF0 | (codepoint >> 18))); + str.push_back((char)(0x80 | ((codepoint >> 12) & 0x3F))); + str.push_back((char)(0x80 | ((codepoint >> 6) & 0x3F))); + str.push_back((char)(0x80 | (codepoint & 0x3F))); + } + } +}; + +#endif diff --git a/src/univalue/lib/univalue_write.cpp b/src/univalue/lib/univalue_write.cpp index ceb4cc9166..cfbdad3284 100644 --- a/src/univalue/lib/univalue_write.cpp +++ b/src/univalue/lib/univalue_write.cpp @@ -8,8 +8,6 @@ #include "univalue.h" #include "univalue_escapes.h" -// TODO: Using UTF8 - using namespace std; static string json_escape(const string& inS) @@ -23,15 +21,8 @@ static string json_escape(const string& inS) if (escStr) outS += escStr; - - else if (ch < 0x80) + else outS += ch; - - else { // TODO handle UTF-8 properly - char tmpesc[16]; - sprintf(tmpesc, "\\u%04x", ch); - outS += tmpesc; - } } return outS; diff --git a/src/univalue/test/fail38.json b/src/univalue/test/fail38.json new file mode 100644 index 0000000000..b245e2e46c --- /dev/null +++ b/src/univalue/test/fail38.json @@ -0,0 +1 @@ +["\ud834"] diff --git a/src/univalue/test/fail39.json b/src/univalue/test/fail39.json new file mode 100644 index 0000000000..7c9e263f27 --- /dev/null +++ b/src/univalue/test/fail39.json @@ -0,0 +1 @@ +["\udd61"] diff --git a/src/univalue/test/fail40.json b/src/univalue/test/fail40.json new file mode 100644 index 0000000000..664dc9e245 --- /dev/null +++ b/src/univalue/test/fail40.json @@ -0,0 +1 @@ +["
ก"]
\ No newline at end of file diff --git a/src/univalue/test/fail41.json b/src/univalue/test/fail41.json new file mode 100644 index 0000000000..0de342a2b5 --- /dev/null +++ b/src/univalue/test/fail41.json @@ -0,0 +1 @@ +["๐
"]
\ No newline at end of file diff --git a/src/univalue/test/round2.json b/src/univalue/test/round2.json new file mode 100644 index 0000000000..b766cccc68 --- /dev/null +++ b/src/univalue/test/round2.json @@ -0,0 +1 @@ +["aยงโ ๐๐
ก"] diff --git a/src/univalue/test/unitester.cpp b/src/univalue/test/unitester.cpp index 5a052fe92c..05f3842cd1 100644 --- a/src/univalue/test/unitester.cpp +++ b/src/univalue/test/unitester.cpp @@ -22,6 +22,7 @@ string srcdir(JSON_TEST_SRC); static bool test_failed = false; #define d_assert(expr) { if (!(expr)) { test_failed = true; fprintf(stderr, "%s failed\n", filename.c_str()); } } +#define f_assert(expr) { if (!(expr)) { test_failed = true; fprintf(stderr, "%s failed\n", __func__); } } static std::string rtrim(std::string s) { @@ -108,6 +109,10 @@ static const char *filenames[] = { "fail35.json", "fail36.json", "fail37.json", + "fail38.json", // invalid unicode: only first half of surrogate pair + "fail39.json", // invalid unicode: only second half of surrogate pair + "fail40.json", // invalid unicode: broken UTF-8 + "fail41.json", // invalid unicode: unfinished UTF-8 "fail3.json", "fail4.json", // extra comma "fail5.json", @@ -119,14 +124,40 @@ static const char *filenames[] = { "pass2.json", "pass3.json", "round1.json", // round-trip test + "round2.json", // unicode }; +// Test \u handling +void unescape_unicode_test() +{ + UniValue val; + bool testResult; + // Escaped ASCII (quote) + testResult = val.read("[\"\\u0022\"]"); + f_assert(testResult); + f_assert(val[0].get_str() == "\""); + // Escaped Basic Plane character, two-byte UTF-8 + testResult = val.read("[\"\\u0191\"]"); + f_assert(testResult); + f_assert(val[0].get_str() == "\xc6\x91"); + // Escaped Basic Plane character, three-byte UTF-8 + testResult = val.read("[\"\\u2191\"]"); + f_assert(testResult); + f_assert(val[0].get_str() == "\xe2\x86\x91"); + // Escaped Supplementary Plane character U+1d161 + testResult = val.read("[\"\\ud834\\udd61\"]"); + f_assert(testResult); + f_assert(val[0].get_str() == "\xf0\x9d\x85\xa1"); +} + int main (int argc, char *argv[]) { for (unsigned int fidx = 0; fidx < ARRAY_SIZE(filenames); fidx++) { runtest_file(filenames[fidx]); } + unescape_unicode_test(); + return test_failed ? 1 : 0; } |