diff options
author | Markus Armbruster <armbru@redhat.com> | 2013-04-11 18:07:21 +0200 |
---|---|---|
committer | Blue Swirl <blauwirbel@gmail.com> | 2013-04-13 19:40:25 +0000 |
commit | e2ec3f976803b360c70d9ae2ba13852fa5d11665 (patch) | |
tree | 55b35131f8eceadc89e793ccba46c542194742a8 /qobject | |
parent | 1d50c8e947180174acb02bad9ff95e0aee6249ea (diff) |
qjson: to_json() case QTYPE_QSTRING is buggy, rewrite
Known bugs in to_json():
* A start byte for a three-byte sequence followed by less than two
continuation bytes is split into one-byte sequences.
* Start bytes for sequences longer than three bytes get misinterpreted
as start bytes for three-byte sequences. Continuation bytes beyond
byte three become one-byte sequences.
This means all characters outside the BMP are decoded incorrectly.
* One-byte sequences with the MSB are put into the JSON string
verbatim when char is unsigned, producing invalid UTF-8. When char
is signed, they're replaced by "\\uFFFF" instead.
This includes \xFE, \xFF, and stray continuation bytes.
* Overlong sequences are happily accepted, unless screwed up by the
bugs above.
* Likewise, sequences encoding surrogate code points or noncharacters.
* Unlike other control characters, ASCII DEL is not escaped. Except
in overlong encodings.
My rewrite fixes them as follows:
* Malformed UTF-8 sequences are replaced.
Except the overlong encoding \xC0\x80 of U+0000 is still accepted.
Permits embedding NUL characters in C strings. This trick is known
as "Modified UTF-8".
* Sequences encoding code points beyond Unicode range are replaced.
* Sequences encoding code points beyond the BMP produce a surrogate
pair.
* Sequences encoding surrogate code points are replaced.
* Sequences encoding noncharacters are replaced.
* ASCII DEL is now always escaped.
The replacement character is U+FFFD.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Laszlo Ersek <lersek@redhat.com>
Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
Diffstat (limited to 'qobject')
-rw-r--r-- | qobject/qjson.c | 102 |
1 files changed, 45 insertions, 57 deletions
diff --git a/qobject/qjson.c b/qobject/qjson.c index 83a6b4f7c1..19085a1bb7 100644 --- a/qobject/qjson.c +++ b/qobject/qjson.c @@ -136,68 +136,56 @@ static void to_json(const QObject *obj, QString *str, int pretty, int indent) case QTYPE_QSTRING: { QString *val = qobject_to_qstring(obj); const char *ptr; + int cp; + char buf[16]; + char *end; ptr = qstring_get_str(val); qstring_append(str, "\""); - while (*ptr) { - if ((ptr[0] & 0xE0) == 0xE0 && - (ptr[1] & 0x80) && (ptr[2] & 0x80)) { - uint16_t wchar; - char escape[7]; - - wchar = (ptr[0] & 0x0F) << 12; - wchar |= (ptr[1] & 0x3F) << 6; - wchar |= (ptr[2] & 0x3F); - ptr += 2; - - snprintf(escape, sizeof(escape), "\\u%04X", wchar); - qstring_append(str, escape); - } else if ((ptr[0] & 0xE0) == 0xC0 && (ptr[1] & 0x80)) { - uint16_t wchar; - char escape[7]; - - wchar = (ptr[0] & 0x1F) << 6; - wchar |= (ptr[1] & 0x3F); - ptr++; - - snprintf(escape, sizeof(escape), "\\u%04X", wchar); - qstring_append(str, escape); - } else switch (ptr[0]) { - case '\"': - qstring_append(str, "\\\""); - break; - case '\\': - qstring_append(str, "\\\\"); - break; - case '\b': - qstring_append(str, "\\b"); - break; - case '\f': - qstring_append(str, "\\f"); - break; - case '\n': - qstring_append(str, "\\n"); - break; - case '\r': - qstring_append(str, "\\r"); - break; - case '\t': - qstring_append(str, "\\t"); - break; - default: { - if (ptr[0] <= 0x1F) { - char escape[7]; - snprintf(escape, sizeof(escape), "\\u%04X", ptr[0]); - qstring_append(str, escape); - } else { - char buf[2] = { ptr[0], 0 }; - qstring_append(str, buf); - } - break; + + for (; *ptr; ptr = end) { + cp = mod_utf8_codepoint(ptr, 6, &end); + switch (cp) { + case '\"': + qstring_append(str, "\\\""); + break; + case '\\': + qstring_append(str, "\\\\"); + break; + case '\b': + qstring_append(str, "\\b"); + break; + case '\f': + qstring_append(str, "\\f"); + break; + case '\n': + qstring_append(str, "\\n"); + break; + case '\r': + qstring_append(str, "\\r"); + break; + case '\t': + qstring_append(str, "\\t"); + break; + default: + if (cp < 0) { + cp = 0xFFFD; /* replacement character */ } + if (cp > 0xFFFF) { + /* beyond BMP; need a surrogate pair */ + snprintf(buf, sizeof(buf), "\\u%04X\\u%04X", + 0xD800 + ((cp - 0x10000) >> 10), + 0xDC00 + ((cp - 0x10000) & 0x3FF)); + } else if (cp < 0x20 || cp >= 0x7F) { + snprintf(buf, sizeof(buf), "\\u%04X", cp); + } else { + buf[0] = cp; + buf[1] = 0; } - ptr++; - } + qstring_append(str, buf); + } + }; + qstring_append(str, "\""); break; } |