aboutsummaryrefslogtreecommitdiff
path: root/qobject/json-lexer.c
diff options
context:
space:
mode:
authorMarkus Armbruster <armbru@redhat.com>2018-08-31 09:58:39 +0200
committerMarkus Armbruster <armbru@redhat.com>2018-09-24 18:08:07 +0200
commit0f07a5d5f1f484c9c334d52193617e89442da7c9 (patch)
tree82f88bdd8f8454492d5b289fce7fd113caa3e5e5 /qobject/json-lexer.c
parentc0ee3afa7fa2547b5766dd25e52ced292c204d4e (diff)
json: Nicer recovery from lexical errors
When the lexer chokes on an input character, it consumes the character, emits a JSON error token, and enters its start state. This can lead to suboptimal error recovery. For instance, input 0123 , produces the tokens JSON_ERROR 01 JSON_INTEGER 23 JSON_COMMA , Make the lexer skip characters after a lexical error until a structural character ('[', ']', '{', '}', ':', ','), an ASCII control character, or '\xFE', or '\xFF'. Note that we must not skip ASCII control characters, '\xFE', '\xFF', because those are documented to force the JSON parser into known-good state, by docs/interop/qmp-spec.txt. The lexer now produces JSON_ERROR 01 JSON_COMMA , Update qmp-test for the nicer error recovery: QMP now reports just one error for input %p instead of two. Also drop the newline after %p; it was needed to tease out the second error. Signed-off-by: Markus Armbruster <armbru@redhat.com> Reviewed-by: Eric Blake <eblake@redhat.com> Message-Id: <20180831075841.13363-5-armbru@redhat.com> [Conflict with commit ebb4d82d888 resolved]
Diffstat (limited to 'qobject/json-lexer.c')
-rw-r--r--qobject/json-lexer.c43
1 files changed, 29 insertions, 14 deletions
diff --git a/qobject/json-lexer.c b/qobject/json-lexer.c
index 28582e17d9..39c7ce7adc 100644
--- a/qobject/json-lexer.c
+++ b/qobject/json-lexer.c
@@ -101,6 +101,7 @@
enum json_lexer_state {
IN_ERROR = 0, /* must really be 0, see json_lexer[] */
+ IN_RECOVERY,
IN_DQ_STRING_ESCAPE,
IN_DQ_STRING,
IN_SQ_STRING_ESCAPE,
@@ -130,6 +131,28 @@ QEMU_BUILD_BUG_ON(IN_START_INTERP != IN_START + 1);
static const uint8_t json_lexer[][256] = {
/* Relies on default initialization to IN_ERROR! */
+ /* error recovery */
+ [IN_RECOVERY] = {
+ /*
+ * Skip characters until a structural character, an ASCII
+ * control character other than '\t', or impossible UTF-8
+ * bytes '\xFE', '\xFF'. Structural characters and line
+ * endings are promising resynchronization points. Clients
+ * may use the others to force the JSON parser into known-good
+ * state; see docs/interop/qmp-spec.txt.
+ */
+ [0 ... 0x1F] = IN_START | LOOKAHEAD,
+ [0x20 ... 0xFD] = IN_RECOVERY,
+ [0xFE ... 0xFF] = IN_START | LOOKAHEAD,
+ ['\t'] = IN_RECOVERY,
+ ['['] = IN_START | LOOKAHEAD,
+ [']'] = IN_START | LOOKAHEAD,
+ ['{'] = IN_START | LOOKAHEAD,
+ ['}'] = IN_START | LOOKAHEAD,
+ [':'] = IN_START | LOOKAHEAD,
+ [','] = IN_START | LOOKAHEAD,
+ },
+
/* double quote string */
[IN_DQ_STRING_ESCAPE] = {
[0x20 ... 0xFD] = IN_DQ_STRING,
@@ -301,26 +324,18 @@ static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
/* fall through */
case JSON_SKIP:
g_string_truncate(lexer->token, 0);
+ /* fall through */
+ case IN_START:
new_state = lexer->start_state;
break;
case IN_ERROR:
- /* XXX: To avoid having previous bad input leaving the parser in an
- * unresponsive state where we consume unpredictable amounts of
- * subsequent "good" input, percolate this error state up to the
- * parser by emitting a JSON_ERROR token, then reset lexer state.
- *
- * Also note that this handling is required for reliable channel
- * negotiation between QMP and the guest agent, since chr(0xFF)
- * is placed at the beginning of certain events to ensure proper
- * delivery when the channel is in an unknown state. chr(0xFF) is
- * never a valid ASCII/UTF-8 sequence, so this should reliably
- * induce an error/flush state.
- */
json_message_process_token(lexer, lexer->token, JSON_ERROR,
lexer->x, lexer->y);
+ new_state = IN_RECOVERY;
+ /* fall through */
+ case IN_RECOVERY:
g_string_truncate(lexer->token, 0);
- lexer->state = lexer->start_state;
- return;
+ break;
default:
break;
}