diff options
Diffstat (limited to 'vendor/github.com/BurntSushi/toml/lex.go')
-rw-r--r-- | vendor/github.com/BurntSushi/toml/lex.go | 1281 |
1 files changed, 1281 insertions, 0 deletions
diff --git a/vendor/github.com/BurntSushi/toml/lex.go b/vendor/github.com/BurntSushi/toml/lex.go new file mode 100644 index 0000000..a1016d9 --- /dev/null +++ b/vendor/github.com/BurntSushi/toml/lex.go @@ -0,0 +1,1281 @@ +package toml + +import ( + "fmt" + "reflect" + "runtime" + "strings" + "unicode" + "unicode/utf8" +) + +type itemType int + +const ( + itemError itemType = iota + itemNIL // used in the parser to indicate no type + itemEOF + itemText + itemString + itemStringEsc + itemRawString + itemMultilineString + itemRawMultilineString + itemBool + itemInteger + itemFloat + itemDatetime + itemArray // the start of an array + itemArrayEnd + itemTableStart + itemTableEnd + itemArrayTableStart + itemArrayTableEnd + itemKeyStart + itemKeyEnd + itemCommentStart + itemInlineTableStart + itemInlineTableEnd +) + +const eof = 0 + +type stateFn func(lx *lexer) stateFn + +func (p Position) String() string { + return fmt.Sprintf("at line %d; start %d; length %d", p.Line, p.Start, p.Len) +} + +type lexer struct { + input string + start int + pos int + line int + state stateFn + items chan item + tomlNext bool + esc bool + + // Allow for backing up up to 4 runes. This is necessary because TOML + // contains 3-rune tokens (""" and '''). + prevWidths [4]int + nprev int // how many of prevWidths are in use + atEOF bool // If we emit an eof, we can still back up, but it is not OK to call next again. + + // A stack of state functions used to maintain context. + // + // The idea is to reuse parts of the state machine in various places. For + // example, values can appear at the top level or within arbitrarily nested + // arrays. The last state on the stack is used after a value has been lexed. + // Similarly for comments. + stack []stateFn +} + +type item struct { + typ itemType + val string + err error + pos Position +} + +func (lx *lexer) nextItem() item { + for { + select { + case item := <-lx.items: + return item + default: + lx.state = lx.state(lx) + //fmt.Printf(" STATE %-24s current: %-10s stack: %s\n", lx.state, lx.current(), lx.stack) + } + } +} + +func lex(input string, tomlNext bool) *lexer { + lx := &lexer{ + input: input, + state: lexTop, + items: make(chan item, 10), + stack: make([]stateFn, 0, 10), + line: 1, + tomlNext: tomlNext, + } + return lx +} + +func (lx *lexer) push(state stateFn) { + lx.stack = append(lx.stack, state) +} + +func (lx *lexer) pop() stateFn { + if len(lx.stack) == 0 { + return lx.errorf("BUG in lexer: no states to pop") + } + last := lx.stack[len(lx.stack)-1] + lx.stack = lx.stack[0 : len(lx.stack)-1] + return last +} + +func (lx *lexer) current() string { + return lx.input[lx.start:lx.pos] +} + +func (lx lexer) getPos() Position { + p := Position{ + Line: lx.line, + Start: lx.start, + Len: lx.pos - lx.start, + } + if p.Len <= 0 { + p.Len = 1 + } + return p +} + +func (lx *lexer) emit(typ itemType) { + // Needed for multiline strings ending with an incomplete UTF-8 sequence. + if lx.start > lx.pos { + lx.error(errLexUTF8{lx.input[lx.pos]}) + return + } + lx.items <- item{typ: typ, pos: lx.getPos(), val: lx.current()} + lx.start = lx.pos +} + +func (lx *lexer) emitTrim(typ itemType) { + lx.items <- item{typ: typ, pos: lx.getPos(), val: strings.TrimSpace(lx.current())} + lx.start = lx.pos +} + +func (lx *lexer) next() (r rune) { + if lx.atEOF { + panic("BUG in lexer: next called after EOF") + } + if lx.pos >= len(lx.input) { + lx.atEOF = true + return eof + } + + if lx.input[lx.pos] == '\n' { + lx.line++ + } + lx.prevWidths[3] = lx.prevWidths[2] + lx.prevWidths[2] = lx.prevWidths[1] + lx.prevWidths[1] = lx.prevWidths[0] + if lx.nprev < 4 { + lx.nprev++ + } + + r, w := utf8.DecodeRuneInString(lx.input[lx.pos:]) + if r == utf8.RuneError && w == 1 { + lx.error(errLexUTF8{lx.input[lx.pos]}) + return utf8.RuneError + } + + // Note: don't use peek() here, as this calls next(). + if isControl(r) || (r == '\r' && (len(lx.input)-1 == lx.pos || lx.input[lx.pos+1] != '\n')) { + lx.errorControlChar(r) + return utf8.RuneError + } + + lx.prevWidths[0] = w + lx.pos += w + return r +} + +// ignore skips over the pending input before this point. +func (lx *lexer) ignore() { + lx.start = lx.pos +} + +// backup steps back one rune. Can be called 4 times between calls to next. +func (lx *lexer) backup() { + if lx.atEOF { + lx.atEOF = false + return + } + if lx.nprev < 1 { + panic("BUG in lexer: backed up too far") + } + w := lx.prevWidths[0] + lx.prevWidths[0] = lx.prevWidths[1] + lx.prevWidths[1] = lx.prevWidths[2] + lx.prevWidths[2] = lx.prevWidths[3] + lx.nprev-- + + lx.pos -= w + if lx.pos < len(lx.input) && lx.input[lx.pos] == '\n' { + lx.line-- + } +} + +// accept consumes the next rune if it's equal to `valid`. +func (lx *lexer) accept(valid rune) bool { + if lx.next() == valid { + return true + } + lx.backup() + return false +} + +// peek returns but does not consume the next rune in the input. +func (lx *lexer) peek() rune { + r := lx.next() + lx.backup() + return r +} + +// skip ignores all input that matches the given predicate. +func (lx *lexer) skip(pred func(rune) bool) { + for { + r := lx.next() + if pred(r) { + continue + } + lx.backup() + lx.ignore() + return + } +} + +// error stops all lexing by emitting an error and returning `nil`. +// +// Note that any value that is a character is escaped if it's a special +// character (newlines, tabs, etc.). +func (lx *lexer) error(err error) stateFn { + if lx.atEOF { + return lx.errorPrevLine(err) + } + lx.items <- item{typ: itemError, pos: lx.getPos(), err: err} + return nil +} + +// errorfPrevline is like error(), but sets the position to the last column of +// the previous line. +// +// This is so that unexpected EOF or NL errors don't show on a new blank line. +func (lx *lexer) errorPrevLine(err error) stateFn { + pos := lx.getPos() + pos.Line-- + pos.Len = 1 + pos.Start = lx.pos - 1 + lx.items <- item{typ: itemError, pos: pos, err: err} + return nil +} + +// errorPos is like error(), but allows explicitly setting the position. +func (lx *lexer) errorPos(start, length int, err error) stateFn { + pos := lx.getPos() + pos.Start = start + pos.Len = length + lx.items <- item{typ: itemError, pos: pos, err: err} + return nil +} + +// errorf is like error, and creates a new error. +func (lx *lexer) errorf(format string, values ...any) stateFn { + if lx.atEOF { + pos := lx.getPos() + pos.Line-- + pos.Len = 1 + pos.Start = lx.pos - 1 + lx.items <- item{typ: itemError, pos: pos, err: fmt.Errorf(format, values...)} + return nil + } + lx.items <- item{typ: itemError, pos: lx.getPos(), err: fmt.Errorf(format, values...)} + return nil +} + +func (lx *lexer) errorControlChar(cc rune) stateFn { + return lx.errorPos(lx.pos-1, 1, errLexControl{cc}) +} + +// lexTop consumes elements at the top level of TOML data. +func lexTop(lx *lexer) stateFn { + r := lx.next() + if isWhitespace(r) || isNL(r) { + return lexSkip(lx, lexTop) + } + switch r { + case '#': + lx.push(lexTop) + return lexCommentStart + case '[': + return lexTableStart + case eof: + if lx.pos > lx.start { + return lx.errorf("unexpected EOF") + } + lx.emit(itemEOF) + return nil + } + + // At this point, the only valid item can be a key, so we back up + // and let the key lexer do the rest. + lx.backup() + lx.push(lexTopEnd) + return lexKeyStart +} + +// lexTopEnd is entered whenever a top-level item has been consumed. (A value +// or a table.) It must see only whitespace, and will turn back to lexTop +// upon a newline. If it sees EOF, it will quit the lexer successfully. +func lexTopEnd(lx *lexer) stateFn { + r := lx.next() + switch { + case r == '#': + // a comment will read to a newline for us. + lx.push(lexTop) + return lexCommentStart + case isWhitespace(r): + return lexTopEnd + case isNL(r): + lx.ignore() + return lexTop + case r == eof: + lx.emit(itemEOF) + return nil + } + return lx.errorf("expected a top-level item to end with a newline, comment, or EOF, but got %q instead", r) +} + +// lexTable lexes the beginning of a table. Namely, it makes sure that +// it starts with a character other than '.' and ']'. +// It assumes that '[' has already been consumed. +// It also handles the case that this is an item in an array of tables. +// e.g., '[[name]]'. +func lexTableStart(lx *lexer) stateFn { + if lx.peek() == '[' { + lx.next() + lx.emit(itemArrayTableStart) + lx.push(lexArrayTableEnd) + } else { + lx.emit(itemTableStart) + lx.push(lexTableEnd) + } + return lexTableNameStart +} + +func lexTableEnd(lx *lexer) stateFn { + lx.emit(itemTableEnd) + return lexTopEnd +} + +func lexArrayTableEnd(lx *lexer) stateFn { + if r := lx.next(); r != ']' { + return lx.errorf("expected end of table array name delimiter ']', but got %q instead", r) + } + lx.emit(itemArrayTableEnd) + return lexTopEnd +} + +func lexTableNameStart(lx *lexer) stateFn { + lx.skip(isWhitespace) + switch r := lx.peek(); { + case r == ']' || r == eof: + return lx.errorf("unexpected end of table name (table names cannot be empty)") + case r == '.': + return lx.errorf("unexpected table separator (table names cannot be empty)") + case r == '"' || r == '\'': + lx.ignore() + lx.push(lexTableNameEnd) + return lexQuotedName + default: + lx.push(lexTableNameEnd) + return lexBareName + } +} + +// lexTableNameEnd reads the end of a piece of a table name, optionally +// consuming whitespace. +func lexTableNameEnd(lx *lexer) stateFn { + lx.skip(isWhitespace) + switch r := lx.next(); { + case isWhitespace(r): + return lexTableNameEnd + case r == '.': + lx.ignore() + return lexTableNameStart + case r == ']': + return lx.pop() + default: + return lx.errorf("expected '.' or ']' to end table name, but got %q instead", r) + } +} + +// lexBareName lexes one part of a key or table. +// +// It assumes that at least one valid character for the table has already been +// read. +// +// Lexes only one part, e.g. only 'a' inside 'a.b'. +func lexBareName(lx *lexer) stateFn { + r := lx.next() + if isBareKeyChar(r, lx.tomlNext) { + return lexBareName + } + lx.backup() + lx.emit(itemText) + return lx.pop() +} + +// lexBareName lexes one part of a key or table. +// +// It assumes that at least one valid character for the table has already been +// read. +// +// Lexes only one part, e.g. only '"a"' inside '"a".b'. +func lexQuotedName(lx *lexer) stateFn { + r := lx.next() + switch { + case isWhitespace(r): + return lexSkip(lx, lexValue) + case r == '"': + lx.ignore() // ignore the '"' + return lexString + case r == '\'': + lx.ignore() // ignore the "'" + return lexRawString + case r == eof: + return lx.errorf("unexpected EOF; expected value") + default: + return lx.errorf("expected value but found %q instead", r) + } +} + +// lexKeyStart consumes all key parts until a '='. +func lexKeyStart(lx *lexer) stateFn { + lx.skip(isWhitespace) + switch r := lx.peek(); { + case r == '=' || r == eof: + return lx.errorf("unexpected '=': key name appears blank") + case r == '.': + return lx.errorf("unexpected '.': keys cannot start with a '.'") + case r == '"' || r == '\'': + lx.ignore() + fallthrough + default: // Bare key + lx.emit(itemKeyStart) + return lexKeyNameStart + } +} + +func lexKeyNameStart(lx *lexer) stateFn { + lx.skip(isWhitespace) + switch r := lx.peek(); { + case r == '=' || r == eof: + return lx.errorf("unexpected '='") + case r == '.': + return lx.errorf("unexpected '.'") + case r == '"' || r == '\'': + lx.ignore() + lx.push(lexKeyEnd) + return lexQuotedName + default: + lx.push(lexKeyEnd) + return lexBareName + } +} + +// lexKeyEnd consumes the end of a key and trims whitespace (up to the key +// separator). +func lexKeyEnd(lx *lexer) stateFn { + lx.skip(isWhitespace) + switch r := lx.next(); { + case isWhitespace(r): + return lexSkip(lx, lexKeyEnd) + case r == eof: + return lx.errorf("unexpected EOF; expected key separator '='") + case r == '.': + lx.ignore() + return lexKeyNameStart + case r == '=': + lx.emit(itemKeyEnd) + return lexSkip(lx, lexValue) + default: + return lx.errorf("expected '.' or '=', but got %q instead", r) + } +} + +// lexValue starts the consumption of a value anywhere a value is expected. +// lexValue will ignore whitespace. +// After a value is lexed, the last state on the next is popped and returned. +func lexValue(lx *lexer) stateFn { + // We allow whitespace to precede a value, but NOT newlines. + // In array syntax, the array states are responsible for ignoring newlines. + r := lx.next() + switch { + case isWhitespace(r): + return lexSkip(lx, lexValue) + case isDigit(r): + lx.backup() // avoid an extra state and use the same as above + return lexNumberOrDateStart + } + switch r { + case '[': + lx.ignore() + lx.emit(itemArray) + return lexArrayValue + case '{': + lx.ignore() + lx.emit(itemInlineTableStart) + return lexInlineTableValue + case '"': + if lx.accept('"') { + if lx.accept('"') { + lx.ignore() // Ignore """ + return lexMultilineString + } + lx.backup() + } + lx.ignore() // ignore the '"' + return lexString + case '\'': + if lx.accept('\'') { + if lx.accept('\'') { + lx.ignore() // Ignore """ + return lexMultilineRawString + } + lx.backup() + } + lx.ignore() // ignore the "'" + return lexRawString + case '.': // special error case, be kind to users + return lx.errorf("floats must start with a digit, not '.'") + case 'i', 'n': + if (lx.accept('n') && lx.accept('f')) || (lx.accept('a') && lx.accept('n')) { + lx.emit(itemFloat) + return lx.pop() + } + case '-', '+': + return lexDecimalNumberStart + } + if unicode.IsLetter(r) { + // Be permissive here; lexBool will give a nice error if the + // user wrote something like + // x = foo + // (i.e. not 'true' or 'false' but is something else word-like.) + lx.backup() + return lexBool + } + if r == eof { + return lx.errorf("unexpected EOF; expected value") + } + return lx.errorf("expected value but found %q instead", r) +} + +// lexArrayValue consumes one value in an array. It assumes that '[' or ',' +// have already been consumed. All whitespace and newlines are ignored. +func lexArrayValue(lx *lexer) stateFn { + r := lx.next() + switch { + case isWhitespace(r) || isNL(r): + return lexSkip(lx, lexArrayValue) + case r == '#': + lx.push(lexArrayValue) + return lexCommentStart + case r == ',': + return lx.errorf("unexpected comma") + case r == ']': + return lexArrayEnd + } + + lx.backup() + lx.push(lexArrayValueEnd) + return lexValue +} + +// lexArrayValueEnd consumes everything between the end of an array value and +// the next value (or the end of the array): it ignores whitespace and newlines +// and expects either a ',' or a ']'. +func lexArrayValueEnd(lx *lexer) stateFn { + switch r := lx.next(); { + case isWhitespace(r) || isNL(r): + return lexSkip(lx, lexArrayValueEnd) + case r == '#': + lx.push(lexArrayValueEnd) + return lexCommentStart + case r == ',': + lx.ignore() + return lexArrayValue // move on to the next value + case r == ']': + return lexArrayEnd + default: + return lx.errorf("expected a comma (',') or array terminator (']'), but got %s", runeOrEOF(r)) + } +} + +// lexArrayEnd finishes the lexing of an array. +// It assumes that a ']' has just been consumed. +func lexArrayEnd(lx *lexer) stateFn { + lx.ignore() + lx.emit(itemArrayEnd) + return lx.pop() +} + +// lexInlineTableValue consumes one key/value pair in an inline table. +// It assumes that '{' or ',' have already been consumed. Whitespace is ignored. +func lexInlineTableValue(lx *lexer) stateFn { + r := lx.next() + switch { + case isWhitespace(r): + return lexSkip(lx, lexInlineTableValue) + case isNL(r): + if lx.tomlNext { + return lexSkip(lx, lexInlineTableValue) + } + return lx.errorPrevLine(errLexInlineTableNL{}) + case r == '#': + lx.push(lexInlineTableValue) + return lexCommentStart + case r == ',': + return lx.errorf("unexpected comma") + case r == '}': + return lexInlineTableEnd + } + lx.backup() + lx.push(lexInlineTableValueEnd) + return lexKeyStart +} + +// lexInlineTableValueEnd consumes everything between the end of an inline table +// key/value pair and the next pair (or the end of the table): +// it ignores whitespace and expects either a ',' or a '}'. +func lexInlineTableValueEnd(lx *lexer) stateFn { + switch r := lx.next(); { + case isWhitespace(r): + return lexSkip(lx, lexInlineTableValueEnd) + case isNL(r): + if lx.tomlNext { + return lexSkip(lx, lexInlineTableValueEnd) + } + return lx.errorPrevLine(errLexInlineTableNL{}) + case r == '#': + lx.push(lexInlineTableValueEnd) + return lexCommentStart + case r == ',': + lx.ignore() + lx.skip(isWhitespace) + if lx.peek() == '}' { + if lx.tomlNext { + return lexInlineTableValueEnd + } + return lx.errorf("trailing comma not allowed in inline tables") + } + return lexInlineTableValue + case r == '}': + return lexInlineTableEnd + default: + return lx.errorf("expected a comma or an inline table terminator '}', but got %s instead", runeOrEOF(r)) + } +} + +func runeOrEOF(r rune) string { + if r == eof { + return "end of file" + } + return "'" + string(r) + "'" +} + +// lexInlineTableEnd finishes the lexing of an inline table. +// It assumes that a '}' has just been consumed. +func lexInlineTableEnd(lx *lexer) stateFn { + lx.ignore() + lx.emit(itemInlineTableEnd) + return lx.pop() +} + +// lexString consumes the inner contents of a string. It assumes that the +// beginning '"' has already been consumed and ignored. +func lexString(lx *lexer) stateFn { + r := lx.next() + switch { + case r == eof: + return lx.errorf(`unexpected EOF; expected '"'`) + case isNL(r): + return lx.errorPrevLine(errLexStringNL{}) + case r == '\\': + lx.push(lexString) + return lexStringEscape + case r == '"': + lx.backup() + if lx.esc { + lx.esc = false + lx.emit(itemStringEsc) + } else { + lx.emit(itemString) + } + lx.next() + lx.ignore() + return lx.pop() + } + return lexString +} + +// lexMultilineString consumes the inner contents of a string. It assumes that +// the beginning '"""' has already been consumed and ignored. +func lexMultilineString(lx *lexer) stateFn { + r := lx.next() + switch r { + default: + return lexMultilineString + case eof: + return lx.errorf(`unexpected EOF; expected '"""'`) + case '\\': + return lexMultilineStringEscape + case '"': + /// Found " → try to read two more "". + if lx.accept('"') { + if lx.accept('"') { + /// Peek ahead: the string can contain " and "", including at the + /// end: """str""""" + /// 6 or more at the end, however, is an error. + if lx.peek() == '"' { + /// Check if we already lexed 5 's; if so we have 6 now, and + /// that's just too many man! + /// + /// Second check is for the edge case: + /// + /// two quotes allowed. + /// vv + /// """lol \"""""" + /// ^^ ^^^---- closing three + /// escaped + /// + /// But ugly, but it works + if strings.HasSuffix(lx.current(), `"""""`) && !strings.HasSuffix(lx.current(), `\"""""`) { + return lx.errorf(`unexpected '""""""'`) + } + lx.backup() + lx.backup() + return lexMultilineString + } + + lx.backup() /// backup: don't include the """ in the item. + lx.backup() + lx.backup() + lx.esc = false + lx.emit(itemMultilineString) + lx.next() /// Read over ''' again and discard it. + lx.next() + lx.next() + lx.ignore() + return lx.pop() + } + lx.backup() + } + return lexMultilineString + } +} + +// lexRawString consumes a raw string. Nothing can be escaped in such a string. +// It assumes that the beginning "'" has already been consumed and ignored. +func lexRawString(lx *lexer) stateFn { + r := lx.next() + switch { + default: + return lexRawString + case r == eof: + return lx.errorf(`unexpected EOF; expected "'"`) + case isNL(r): + return lx.errorPrevLine(errLexStringNL{}) + case r == '\'': + lx.backup() + lx.emit(itemRawString) + lx.next() + lx.ignore() + return lx.pop() + } +} + +// lexMultilineRawString consumes a raw string. Nothing can be escaped in such a +// string. It assumes that the beginning triple-' has already been consumed and +// ignored. +func lexMultilineRawString(lx *lexer) stateFn { + r := lx.next() + switch r { + default: + return lexMultilineRawString + case eof: + return lx.errorf(`unexpected EOF; expected "'''"`) + case '\'': + /// Found ' → try to read two more ''. + if lx.accept('\'') { + if lx.accept('\'') { + /// Peek ahead: the string can contain ' and '', including at the + /// end: '''str''''' + /// 6 or more at the end, however, is an error. + if lx.peek() == '\'' { + /// Check if we already lexed 5 's; if so we have 6 now, and + /// that's just too many man! + if strings.HasSuffix(lx.current(), "'''''") { + return lx.errorf(`unexpected "''''''"`) + } + lx.backup() + lx.backup() + return lexMultilineRawString + } + + lx.backup() /// backup: don't include the ''' in the item. + lx.backup() + lx.backup() + lx.emit(itemRawMultilineString) + lx.next() /// Read over ''' again and discard it. + lx.next() + lx.next() + lx.ignore() + return lx.pop() + } + lx.backup() + } + return lexMultilineRawString + } +} + +// lexMultilineStringEscape consumes an escaped character. It assumes that the +// preceding '\\' has already been consumed. +func lexMultilineStringEscape(lx *lexer) stateFn { + if isNL(lx.next()) { /// \ escaping newline. + return lexMultilineString + } + lx.backup() + lx.push(lexMultilineString) + return lexStringEscape(lx) +} + +func lexStringEscape(lx *lexer) stateFn { + lx.esc = true + r := lx.next() + switch r { + case 'e': + if !lx.tomlNext { + return lx.error(errLexEscape{r}) + } + fallthrough + case 'b': + fallthrough + case 't': + fallthrough + case 'n': + fallthrough + case 'f': + fallthrough + case 'r': + fallthrough + case '"': + fallthrough + case ' ', '\t': + // Inside """ .. """ strings you can use \ to escape newlines, and any + // amount of whitespace can be between the \ and \n. + fallthrough + case '\\': + return lx.pop() + case 'x': + if !lx.tomlNext { + return lx.error(errLexEscape{r}) + } + return lexHexEscape + case 'u': + return lexShortUnicodeEscape + case 'U': + return lexLongUnicodeEscape + } + return lx.error(errLexEscape{r}) +} + +func lexHexEscape(lx *lexer) stateFn { + var r rune + for i := 0; i < 2; i++ { + r = lx.next() + if !isHex(r) { + return lx.errorf(`expected two hexadecimal digits after '\x', but got %q instead`, lx.current()) + } + } + return lx.pop() +} + +func lexShortUnicodeEscape(lx *lexer) stateFn { + var r rune + for i := 0; i < 4; i++ { + r = lx.next() + if !isHex(r) { + return lx.errorf(`expected four hexadecimal digits after '\u', but got %q instead`, lx.current()) + } + } + return lx.pop() +} + +func lexLongUnicodeEscape(lx *lexer) stateFn { + var r rune + for i := 0; i < 8; i++ { + r = lx.next() + if !isHex(r) { + return lx.errorf(`expected eight hexadecimal digits after '\U', but got %q instead`, lx.current()) + } + } + return lx.pop() +} + +// lexNumberOrDateStart processes the first character of a value which begins +// with a digit. It exists to catch values starting with '0', so that +// lexBaseNumberOrDate can differentiate base prefixed integers from other +// types. +func lexNumberOrDateStart(lx *lexer) stateFn { + r := lx.next() + switch r { + case '0': + return lexBaseNumberOrDate + } + + if !isDigit(r) { + // The only way to reach this state is if the value starts + // with a digit, so specifically treat anything else as an + // error. + return lx.errorf("expected a digit but got %q", r) + } + + return lexNumberOrDate +} + +// lexNumberOrDate consumes either an integer, float or datetime. +func lexNumberOrDate(lx *lexer) stateFn { + r := lx.next() + if isDigit(r) { + return lexNumberOrDate + } + switch r { + case '-', ':': + return lexDatetime + case '_': + return lexDecimalNumber + case '.', 'e', 'E': + return lexFloat + } + + lx.backup() + lx.emit(itemInteger) + return lx.pop() +} + +// lexDatetime consumes a Datetime, to a first approximation. +// The parser validates that it matches one of the accepted formats. +func lexDatetime(lx *lexer) stateFn { + r := lx.next() + if isDigit(r) { + return lexDatetime + } + switch r { + case '-', ':', 'T', 't', ' ', '.', 'Z', 'z', '+': + return lexDatetime + } + + lx.backup() + lx.emitTrim(itemDatetime) + return lx.pop() +} + +// lexHexInteger consumes a hexadecimal integer after seeing the '0x' prefix. +func lexHexInteger(lx *lexer) stateFn { + r := lx.next() + if isHex(r) { + return lexHexInteger + } + switch r { + case '_': + return lexHexInteger + } + + lx.backup() + lx.emit(itemInteger) + return lx.pop() +} + +// lexOctalInteger consumes an octal integer after seeing the '0o' prefix. +func lexOctalInteger(lx *lexer) stateFn { + r := lx.next() + if isOctal(r) { + return lexOctalInteger + } + switch r { + case '_': + return lexOctalInteger + } + + lx.backup() + lx.emit(itemInteger) + return lx.pop() +} + +// lexBinaryInteger consumes a binary integer after seeing the '0b' prefix. +func lexBinaryInteger(lx *lexer) stateFn { + r := lx.next() + if isBinary(r) { + return lexBinaryInteger + } + switch r { + case '_': + return lexBinaryInteger + } + + lx.backup() + lx.emit(itemInteger) + return lx.pop() +} + +// lexDecimalNumber consumes a decimal float or integer. +func lexDecimalNumber(lx *lexer) stateFn { + r := lx.next() + if isDigit(r) { + return lexDecimalNumber + } + switch r { + case '.', 'e', 'E': + return lexFloat + case '_': + return lexDecimalNumber + } + + lx.backup() + lx.emit(itemInteger) + return lx.pop() +} + +// lexDecimalNumber consumes the first digit of a number beginning with a sign. +// It assumes the sign has already been consumed. Values which start with a sign +// are only allowed to be decimal integers or floats. +// +// The special "nan" and "inf" values are also recognized. +func lexDecimalNumberStart(lx *lexer) stateFn { + r := lx.next() + + // Special error cases to give users better error messages + switch r { + case 'i': + if !lx.accept('n') || !lx.accept('f') { + return lx.errorf("invalid float: '%s'", lx.current()) + } + lx.emit(itemFloat) + return lx.pop() + case 'n': + if !lx.accept('a') || !lx.accept('n') { + return lx.errorf("invalid float: '%s'", lx.current()) + } + lx.emit(itemFloat) + return lx.pop() + case '0': + p := lx.peek() + switch p { + case 'b', 'o', 'x': + return lx.errorf("cannot use sign with non-decimal numbers: '%s%c'", lx.current(), p) + } + case '.': + return lx.errorf("floats must start with a digit, not '.'") + } + + if isDigit(r) { + return lexDecimalNumber + } + + return lx.errorf("expected a digit but got %q", r) +} + +// lexBaseNumberOrDate differentiates between the possible values which +// start with '0'. It assumes that before reaching this state, the initial '0' +// has been consumed. +func lexBaseNumberOrDate(lx *lexer) stateFn { + r := lx.next() + // Note: All datetimes start with at least two digits, so we don't + // handle date characters (':', '-', etc.) here. + if isDigit(r) { + return lexNumberOrDate + } + switch r { + case '_': + // Can only be decimal, because there can't be an underscore + // between the '0' and the base designator, and dates can't + // contain underscores. + return lexDecimalNumber + case '.', 'e', 'E': + return lexFloat + case 'b': + r = lx.peek() + if !isBinary(r) { + lx.errorf("not a binary number: '%s%c'", lx.current(), r) + } + return lexBinaryInteger + case 'o': + r = lx.peek() + if !isOctal(r) { + lx.errorf("not an octal number: '%s%c'", lx.current(), r) + } + return lexOctalInteger + case 'x': + r = lx.peek() + if !isHex(r) { + lx.errorf("not a hexidecimal number: '%s%c'", lx.current(), r) + } + return lexHexInteger + } + + lx.backup() + lx.emit(itemInteger) + return lx.pop() +} + +// lexFloat consumes the elements of a float. It allows any sequence of +// float-like characters, so floats emitted by the lexer are only a first +// approximation and must be validated by the parser. +func lexFloat(lx *lexer) stateFn { + r := lx.next() + if isDigit(r) { + return lexFloat + } + switch r { + case '_', '.', '-', '+', 'e', 'E': + return lexFloat + } + + lx.backup() + lx.emit(itemFloat) + return lx.pop() +} + +// lexBool consumes a bool string: 'true' or 'false. +func lexBool(lx *lexer) stateFn { + var rs []rune + for { + r := lx.next() + if !unicode.IsLetter(r) { + lx.backup() + break + } + rs = append(rs, r) + } + s := string(rs) + switch s { + case "true", "false": + lx.emit(itemBool) + return lx.pop() + } + return lx.errorf("expected value but found %q instead", s) +} + +// lexCommentStart begins the lexing of a comment. It will emit +// itemCommentStart and consume no characters, passing control to lexComment. +func lexCommentStart(lx *lexer) stateFn { + lx.ignore() + lx.emit(itemCommentStart) + return lexComment +} + +// lexComment lexes an entire comment. It assumes that '#' has been consumed. +// It will consume *up to* the first newline character, and pass control +// back to the last state on the stack. +func lexComment(lx *lexer) stateFn { + switch r := lx.next(); { + case isNL(r) || r == eof: + lx.backup() + lx.emit(itemText) + return lx.pop() + default: + return lexComment + } +} + +// lexSkip ignores all slurped input and moves on to the next state. +func lexSkip(lx *lexer, nextState stateFn) stateFn { + lx.ignore() + return nextState +} + +func (s stateFn) String() string { + name := runtime.FuncForPC(reflect.ValueOf(s).Pointer()).Name() + if i := strings.LastIndexByte(name, '.'); i > -1 { + name = name[i+1:] + } + if s == nil { + name = "<nil>" + } + return name + "()" +} + +func (itype itemType) String() string { + switch itype { + case itemError: + return "Error" + case itemNIL: + return "NIL" + case itemEOF: + return "EOF" + case itemText: + return "Text" + case itemString, itemStringEsc, itemRawString, itemMultilineString, itemRawMultilineString: + return "String" + case itemBool: + return "Bool" + case itemInteger: + return "Integer" + case itemFloat: + return "Float" + case itemDatetime: + return "DateTime" + case itemTableStart: + return "TableStart" + case itemTableEnd: + return "TableEnd" + case itemKeyStart: + return "KeyStart" + case itemKeyEnd: + return "KeyEnd" + case itemArray: + return "Array" + case itemArrayEnd: + return "ArrayEnd" + case itemCommentStart: + return "CommentStart" + case itemInlineTableStart: + return "InlineTableStart" + case itemInlineTableEnd: + return "InlineTableEnd" + } + panic(fmt.Sprintf("BUG: Unknown type '%d'.", int(itype))) +} + +func (item item) String() string { + return fmt.Sprintf("(%s, %s)", item.typ, item.val) +} + +func isWhitespace(r rune) bool { return r == '\t' || r == ' ' } +func isNL(r rune) bool { return r == '\n' || r == '\r' } +func isControl(r rune) bool { // Control characters except \t, \r, \n + switch r { + case '\t', '\r', '\n': + return false + default: + return (r >= 0x00 && r <= 0x1f) || r == 0x7f + } +} +func isDigit(r rune) bool { return r >= '0' && r <= '9' } +func isBinary(r rune) bool { return r == '0' || r == '1' } +func isOctal(r rune) bool { return r >= '0' && r <= '7' } +func isHex(r rune) bool { return (r >= '0' && r <= '9') || (r|0x20 >= 'a' && r|0x20 <= 'f') } +func isBareKeyChar(r rune, tomlNext bool) bool { + if tomlNext { + return (r >= 'A' && r <= 'Z') || + (r >= 'a' && r <= 'z') || + (r >= '0' && r <= '9') || + r == '_' || r == '-' || + r == 0xb2 || r == 0xb3 || r == 0xb9 || (r >= 0xbc && r <= 0xbe) || + (r >= 0xc0 && r <= 0xd6) || (r >= 0xd8 && r <= 0xf6) || (r >= 0xf8 && r <= 0x037d) || + (r >= 0x037f && r <= 0x1fff) || + (r >= 0x200c && r <= 0x200d) || (r >= 0x203f && r <= 0x2040) || + (r >= 0x2070 && r <= 0x218f) || (r >= 0x2460 && r <= 0x24ff) || + (r >= 0x2c00 && r <= 0x2fef) || (r >= 0x3001 && r <= 0xd7ff) || + (r >= 0xf900 && r <= 0xfdcf) || (r >= 0xfdf0 && r <= 0xfffd) || + (r >= 0x10000 && r <= 0xeffff) + } + + return (r >= 'A' && r <= 'Z') || + (r >= 'a' && r <= 'z') || + (r >= '0' && r <= '9') || + r == '_' || r == '-' +} |