/*
 * JSON lexer
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "qstring.h"
#include "qlist.h"
#include "qdict.h"
#include "qint.h"
#include "qemu-common.h"
#include "json-lexer.h"

/* Upper bound on the length of a single token; see json_lexer_feed_char(). */
#define MAX_TOKEN_SIZE (64ULL << 20)

/*
 * \"([^\\\"]|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*\"
 * '([^\\']|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*'
 * 0|([1-9][0-9]*(.[0-9]+)?([eE]([-+])?[0-9]+))
 * [{}\[\],:]
 * [a-z]+
 *
 */

/*
 * Internal states of the lexer's state machine.
 *
 * IN_ERROR is deliberately 0: every entry of the transition table that is
 * not explicitly initialized is zero, and therefore means "lexical error".
 */
enum json_lexer_state {
    IN_ERROR = 0,
    IN_DQ_UCODE3,
    IN_DQ_UCODE2,
    IN_DQ_UCODE1,
    IN_DQ_UCODE0,
    IN_DQ_STRING_ESCAPE,
    IN_DQ_STRING,
    IN_SQ_UCODE3,
    IN_SQ_UCODE2,
    IN_SQ_UCODE1,
    IN_SQ_UCODE0,
    IN_SQ_STRING_ESCAPE,
    IN_SQ_STRING,
    IN_ZERO,
    IN_DIGITS,
    IN_DIGIT,
    IN_EXP_E,
    IN_MANTISSA,
    IN_MANTISSA_DIGITS,
    IN_NONZERO_NUMBER,
    IN_NEG_NONZERO_NUMBER,
    IN_KEYWORD,
    IN_ESCAPE,
    IN_ESCAPE_L,
    IN_ESCAPE_LL,
    IN_ESCAPE_I,
    IN_ESCAPE_I6,
    IN_ESCAPE_I64,
    IN_WHITESPACE,
    IN_START,
};

/*
 * Default transition for a state whose token ends at the first character
 * that does not belong to it.  Must come first in the state's initializer
 * so that the specific entries following it override the default.
 */
#define TERMINAL(state) [0 ... 0x7F] = (state)

/* Return whether TERMINAL is a terminal state and the transition to it
   from OLD_STATE required lookahead.  This happens whenever the table
   below uses the TERMINAL macro.  */
#define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \
            (json_lexer[(old_state)][0] == (terminal))

/*
 * Transition table, indexed by [current state][input byte].  An entry is
 * either the next json_lexer_state or a JSON_* token type to emit.
 */
static const uint8_t json_lexer[][256] =  {
    /* double quote string */
    [IN_DQ_UCODE3] = {
        ['0' ... '9'] = IN_DQ_STRING,
        ['a' ... 'f'] = IN_DQ_STRING,
        ['A' ... 'F'] = IN_DQ_STRING,
    },
    [IN_DQ_UCODE2] = {
        ['0' ... '9'] = IN_DQ_UCODE3,
        ['a' ... 'f'] = IN_DQ_UCODE3,
        ['A' ... 'F'] = IN_DQ_UCODE3,
    },
    [IN_DQ_UCODE1] = {
        ['0' ... '9'] = IN_DQ_UCODE2,
        ['a' ... 'f'] = IN_DQ_UCODE2,
        ['A' ... 'F'] = IN_DQ_UCODE2,
    },
    [IN_DQ_UCODE0] = {
        ['0' ... '9'] = IN_DQ_UCODE1,
        ['a' ... 'f'] = IN_DQ_UCODE1,
        ['A' ... 'F'] = IN_DQ_UCODE1,
    },
    [IN_DQ_STRING_ESCAPE] = {
        ['b'] = IN_DQ_STRING,
        ['f'] = IN_DQ_STRING,
        ['n'] = IN_DQ_STRING,
        ['r'] = IN_DQ_STRING,
        ['t'] = IN_DQ_STRING,
        ['/'] = IN_DQ_STRING,
        ['\\'] = IN_DQ_STRING,
        ['\''] = IN_DQ_STRING,
        ['\"'] = IN_DQ_STRING,
        ['u'] = IN_DQ_UCODE0,
    },
    [IN_DQ_STRING] = {
        [1 ... 0xBF] = IN_DQ_STRING,
        [0xC2 ... 0xF4] = IN_DQ_STRING,
        ['\\'] = IN_DQ_STRING_ESCAPE,
        ['"'] = JSON_STRING,
    },

    /* single quote string */
    [IN_SQ_UCODE3] = {
        ['0' ... '9'] = IN_SQ_STRING,
        ['a' ... 'f'] = IN_SQ_STRING,
        ['A' ... 'F'] = IN_SQ_STRING,
    },
    [IN_SQ_UCODE2] = {
        ['0' ... '9'] = IN_SQ_UCODE3,
        ['a' ... 'f'] = IN_SQ_UCODE3,
        ['A' ... 'F'] = IN_SQ_UCODE3,
    },
    [IN_SQ_UCODE1] = {
        ['0' ... '9'] = IN_SQ_UCODE2,
        ['a' ... 'f'] = IN_SQ_UCODE2,
        ['A' ... 'F'] = IN_SQ_UCODE2,
    },
    [IN_SQ_UCODE0] = {
        ['0' ... '9'] = IN_SQ_UCODE1,
        ['a' ... 'f'] = IN_SQ_UCODE1,
        ['A' ... 'F'] = IN_SQ_UCODE1,
    },
    [IN_SQ_STRING_ESCAPE] = {
        ['b'] = IN_SQ_STRING,
        ['f'] = IN_SQ_STRING,
        ['n'] = IN_SQ_STRING,
        ['r'] = IN_SQ_STRING,
        ['t'] = IN_SQ_STRING,
        /* An escape inside a single-quoted string must return to the
         * single-quoted string state, exactly like the other escapes. */
        ['/'] = IN_SQ_STRING,
        ['\\'] = IN_SQ_STRING,
        ['\''] = IN_SQ_STRING,
        ['\"'] = IN_SQ_STRING,
        ['u'] = IN_SQ_UCODE0,
    },
    [IN_SQ_STRING] = {
        [1 ... 0xBF] = IN_SQ_STRING,
        [0xC2 ... 0xF4] = IN_SQ_STRING,
        ['\\'] = IN_SQ_STRING_ESCAPE,
        ['\''] = JSON_STRING,
    },

    /* Zero */
    [IN_ZERO] = {
        TERMINAL(JSON_INTEGER),
        ['0' ... '9'] = IN_ERROR,
        ['.'] = IN_MANTISSA,
    },

    /* Float */
    [IN_DIGITS] = {
        TERMINAL(JSON_FLOAT),
        ['0' ... '9'] = IN_DIGITS,
    },

    [IN_DIGIT] = {
        ['0' ... '9'] = IN_DIGITS,
    },

    [IN_EXP_E] = {
        ['-'] = IN_DIGIT,
        ['+'] = IN_DIGIT,
        ['0' ... '9'] = IN_DIGITS,
    },

    [IN_MANTISSA_DIGITS] = {
        TERMINAL(JSON_FLOAT),
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
        ['e'] = IN_EXP_E,
        ['E'] = IN_EXP_E,
    },

    [IN_MANTISSA] = {
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
    },

    /* Number */
    [IN_NONZERO_NUMBER] = {
        TERMINAL(JSON_INTEGER),
        ['0' ... '9'] = IN_NONZERO_NUMBER,
        ['e'] = IN_EXP_E,
        ['E'] = IN_EXP_E,
        ['.'] = IN_MANTISSA,
    },

    [IN_NEG_NONZERO_NUMBER] = {
        ['0'] = IN_ZERO,
        ['1' ... '9'] = IN_NONZERO_NUMBER,
    },

    /* keywords */
    [IN_KEYWORD] = {
        TERMINAL(JSON_KEYWORD),
        ['a' ... 'z'] = IN_KEYWORD,
    },

    /* whitespace */
    [IN_WHITESPACE] = {
        TERMINAL(JSON_SKIP),
        [' '] = IN_WHITESPACE,
        ['\t'] = IN_WHITESPACE,
        ['\r'] = IN_WHITESPACE,
        ['\n'] = IN_WHITESPACE,
    },

    /* escape */
    [IN_ESCAPE_LL] = {
        ['d'] = JSON_ESCAPE,
    },

    [IN_ESCAPE_L] = {
        ['d'] = JSON_ESCAPE,
        ['l'] = IN_ESCAPE_LL,
    },

    [IN_ESCAPE_I64] = {
        ['d'] = JSON_ESCAPE,
    },

    [IN_ESCAPE_I6] = {
        ['4'] = IN_ESCAPE_I64,
    },

    [IN_ESCAPE_I] = {
        ['6'] = IN_ESCAPE_I6,
    },

    [IN_ESCAPE] = {
        ['d'] = JSON_ESCAPE,
        ['i'] = JSON_ESCAPE,
        ['p'] = JSON_ESCAPE,
        ['s'] = JSON_ESCAPE,
        ['f'] = JSON_ESCAPE,
        ['l'] = IN_ESCAPE_L,
        ['I'] = IN_ESCAPE_I,
    },

    /* top level rule */
    [IN_START] = {
        ['"'] = IN_DQ_STRING,
        ['\''] = IN_SQ_STRING,
        ['0'] = IN_ZERO,
        ['1' ... '9'] = IN_NONZERO_NUMBER,
        ['-'] = IN_NEG_NONZERO_NUMBER,
        ['{'] = JSON_OPERATOR,
        ['}'] = JSON_OPERATOR,
        ['['] = JSON_OPERATOR,
        [']'] = JSON_OPERATOR,
        [','] = JSON_OPERATOR,
        [':'] = JSON_OPERATOR,
        ['a' ... 'z'] = IN_KEYWORD,
        ['%'] = IN_ESCAPE,
        [' '] = IN_WHITESPACE,
        ['\t'] = IN_WHITESPACE,
        ['\r'] = IN_WHITESPACE,
        ['\n'] = IN_WHITESPACE,
    },
};

/*
 * Prepare @lexer for use: record @func as the token callback, start in
 * IN_START with a fresh empty token buffer, and zero the x/y position.
 */
void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
{
    lexer->emit = func;
    lexer->state = IN_START;
    lexer->token = qstring_new();
    lexer->x = lexer->y = 0;
}

/*
 * Advance the state machine by one input byte @ch.
 *
 * When a transition reaches a token type, the accumulated token is handed
 * to lexer->emit() and the lexer returns to IN_START.  If that transition
 * was a lookahead one (see TERMINAL_NEEDED_LOOKAHEAD), @ch is not part of
 * the finished token and is processed again from IN_START, unless @flush
 * is set, in which case the loop body runs exactly once.
 *
 * Always returns 0.
 */
static int json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
{
    int char_consumed, new_state;

    /* Position tracking: x counts bytes since the last newline, y counts
     * newlines seen so far. */
    lexer->x++;
    if (ch == '\n') {
        lexer->x = 0;
        lexer->y++;
    }

    do {
        /* Index with an unsigned byte so bytes >= 0x80 stay in range. */
        new_state = json_lexer[lexer->state][(uint8_t)ch];
        char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
        if (char_consumed) {
            qstring_append_chr(lexer->token, ch);
        }

        switch (new_state) {
        case JSON_OPERATOR:
        case JSON_ESCAPE:
        case JSON_INTEGER:
        case JSON_FLOAT:
        case JSON_KEYWORD:
        case JSON_STRING:
            lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y);
            /* fall through */
        case JSON_SKIP:
            /* Drop the finished (or skipped) token and start a new one. */
            QDECREF(lexer->token);
            lexer->token = qstring_new();
            new_state = IN_START;
            break;
        case IN_ERROR:
            /* XXX: To avoid having previous bad input leaving the parser in an
             * unresponsive state where we consume unpredictable amounts of
             * subsequent "good" input, percolate this error state up to the
             * tokenizer/parser by forcing a NULL object to be emitted, then
             * reset state.
             *
             * Also note that this handling is required for reliable channel
             * negotiation between QMP and the guest agent, since chr(0xFF)
             * is placed at the beginning of certain events to ensure proper
             * delivery when the channel is in an unknown state. chr(0xFF) is
             * never a valid ASCII/UTF-8 sequence, so this should reliably
             * induce an error/flush state.
             */
            lexer->emit(lexer, lexer->token, JSON_ERROR, lexer->x, lexer->y);
            QDECREF(lexer->token);
            lexer->token = qstring_new();
            new_state = IN_START;
            lexer->state = new_state;
            return 0;
        default:
            break;
        }
        lexer->state = new_state;
    } while (!char_consumed && !flush);

    /* Do not let a single token grow to an arbitrarily large size,
     * this is a security consideration.
     *
     * NOTE(review): the type passed to emit() here is the current lexer
     * state, not a JSON_* token type -- confirm the emitter copes with it.
     */
    if (lexer->token->length > MAX_TOKEN_SIZE) {
        lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
        QDECREF(lexer->token);
        lexer->token = qstring_new();
        lexer->state = IN_START;
    }

    return 0;
}

/*
 * Feed @size bytes from @buffer to the lexer, one byte at a time.
 *
 * Returns 0 on success, or the first negative value reported by
 * json_lexer_feed_char().
 */
int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
{
    size_t i;

    for (i = 0; i < size; i++) {
        int err;

        err = json_lexer_feed_char(lexer, buffer[i], false);
        if (err < 0) {
            return err;
        }
    }

    return 0;
}

/*
 * Terminate any token still in progress by feeding a single NUL byte in
 * flush mode.  Does nothing (returns 0) when the lexer is already at
 * IN_START.
 */
int json_lexer_flush(JSONLexer *lexer)
{
    return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0, true);
}

/* Drop the lexer's reference to its token buffer. */
void json_lexer_destroy(JSONLexer *lexer)
{
    QDECREF(lexer->token);
}