lex.go - mirror - Mirror free and open-source projects you like with minimal effort

lex.go (30931B)
      1 package toml
      2 
      3 import (
      4 	"fmt"
      5 	"reflect"
      6 	"runtime"
      7 	"strings"
      8 	"unicode"
      9 	"unicode/utf8"
     10 )
     11 
// itemType identifies the kind of an item produced by the lexer.
type itemType int

// The item kinds. Values are assigned with iota, so the order of the
// declarations determines each constant's numeric value.
const (
	itemError itemType = iota
	itemNIL            // used in the parser to indicate no type
	itemEOF
	itemText
	itemString
	itemStringEsc
	itemRawString
	itemMultilineString
	itemRawMultilineString
	itemBool
	itemInteger
	itemFloat
	itemDatetime
	itemArray // the start of an array
	itemArrayEnd
	itemTableStart
	itemTableEnd
	itemArrayTableStart
	itemArrayTableEnd
	itemKeyStart
	itemKeyEnd
	itemCommentStart
	itemInlineTableStart
	itemInlineTableEnd
)
     40 
// eof is the sentinel rune value that next returns once the input is
// exhausted.
const eof = 0
     42 
// stateFn is one state of the lexer: it is called with the lexer and returns
// the state to run next. The error helpers return a nil stateFn after queueing
// an error item.
type stateFn func(lx *lexer) stateFn
     44 
     45 func (p Position) String() string {
     46 	return fmt.Sprintf("at line %d; start %d; length %d", p.Line, p.Start, p.Len)
     47 }
     48 
// lexer holds the state of the TOML scanner.
//
// input is the text being lexed. start and pos are byte offsets into input
// that delimit the pending item (see current), and line is the current line
// number (lex initialises it to 1). state is the state function nextItem runs
// next, and emitted items are queued on the buffered items channel. tomlNext
// switches on the syntax that the state functions guard behind it (for example
// the \e and \x string escapes). esc records that the string currently being
// lexed contained an escape sequence.
type lexer struct {
	input    string
	start    int
	pos      int
	line     int
	state    stateFn
	items    chan item
	tomlNext bool
	esc      bool

	// Allow for backing up up to 4 runes. This is necessary because TOML
	// contains 3-rune tokens (""" and ''').
	prevWidths [4]int
	nprev      int  // how many of prevWidths are in use
	atEOF      bool // If we emit an eof, we can still back up, but it is not OK to call next again.

	// A stack of state functions used to maintain context.
	//
	// The idea is to reuse parts of the state machine in various places. For
	// example, values can appear at the top level or within arbitrarily nested
	// arrays. The last state on the stack is used after a value has been lexed.
	// Similarly for comments.
	stack []stateFn
}
     73 
// item is a single token produced by the lexer.
//
// typ is the kind of token, val is the token's text, err is the error carried
// by itemError items, and pos is the location of the token in the input.
type item struct {
	typ itemType
	val string
	err error
	pos Position
}
     80 
     81 func (lx *lexer) nextItem() item {
     82 	for {
     83 		select {
     84 		case item := <-lx.items:
     85 			return item
     86 		default:
     87 			lx.state = lx.state(lx)
     88 			//fmt.Printf("     STATE %-24s  current: %-10s	stack: %s\n", lx.state, lx.current(), lx.stack)
     89 		}
     90 	}
     91 }
     92 
     93 func lex(input string, tomlNext bool) *lexer {
     94 	lx := &lexer{
     95 		input:    input,
     96 		state:    lexTop,
     97 		items:    make(chan item, 10),
     98 		stack:    make([]stateFn, 0, 10),
     99 		line:     1,
    100 		tomlNext: tomlNext,
    101 	}
    102 	return lx
    103 }
    104 
    105 func (lx *lexer) push(state stateFn) {
    106 	lx.stack = append(lx.stack, state)
    107 }
    108 
    109 func (lx *lexer) pop() stateFn {
    110 	if len(lx.stack) == 0 {
    111 		return lx.errorf("BUG in lexer: no states to pop")
    112 	}
    113 	last := lx.stack[len(lx.stack)-1]
    114 	lx.stack = lx.stack[0 : len(lx.stack)-1]
    115 	return last
    116 }
    117 
    118 func (lx *lexer) current() string {
    119 	return lx.input[lx.start:lx.pos]
    120 }
    121 
    122 func (lx lexer) getPos() Position {
    123 	p := Position{
    124 		Line:  lx.line,
    125 		Start: lx.start,
    126 		Len:   lx.pos - lx.start,
    127 	}
    128 	if p.Len <= 0 {
    129 		p.Len = 1
    130 	}
    131 	return p
    132 }
    133 
    134 func (lx *lexer) emit(typ itemType) {
    135 	// Needed for multiline strings ending with an incomplete UTF-8 sequence.
    136 	if lx.start > lx.pos {
    137 		lx.error(errLexUTF8{lx.input[lx.pos]})
    138 		return
    139 	}
    140 	lx.items <- item{typ: typ, pos: lx.getPos(), val: lx.current()}
    141 	lx.start = lx.pos
    142 }
    143 
    144 func (lx *lexer) emitTrim(typ itemType) {
    145 	lx.items <- item{typ: typ, pos: lx.getPos(), val: strings.TrimSpace(lx.current())}
    146 	lx.start = lx.pos
    147 }
    148 
    149 func (lx *lexer) next() (r rune) {
    150 	if lx.atEOF {
    151 		panic("BUG in lexer: next called after EOF")
    152 	}
    153 	if lx.pos >= len(lx.input) {
    154 		lx.atEOF = true
    155 		return eof
    156 	}
    157 
    158 	if lx.input[lx.pos] == '\n' {
    159 		lx.line++
    160 	}
    161 	lx.prevWidths[3] = lx.prevWidths[2]
    162 	lx.prevWidths[2] = lx.prevWidths[1]
    163 	lx.prevWidths[1] = lx.prevWidths[0]
    164 	if lx.nprev < 4 {
    165 		lx.nprev++
    166 	}
    167 
    168 	r, w := utf8.DecodeRuneInString(lx.input[lx.pos:])
    169 	if r == utf8.RuneError && w == 1 {
    170 		lx.error(errLexUTF8{lx.input[lx.pos]})
    171 		return utf8.RuneError
    172 	}
    173 
    174 	// Note: don't use peek() here, as this calls next().
    175 	if isControl(r) || (r == '\r' && (len(lx.input)-1 == lx.pos || lx.input[lx.pos+1] != '\n')) {
    176 		lx.errorControlChar(r)
    177 		return utf8.RuneError
    178 	}
    179 
    180 	lx.prevWidths[0] = w
    181 	lx.pos += w
    182 	return r
    183 }
    184 
    185 // ignore skips over the pending input before this point.
    186 func (lx *lexer) ignore() {
    187 	lx.start = lx.pos
    188 }
    189 
    190 // backup steps back one rune. Can be called 4 times between calls to next.
    191 func (lx *lexer) backup() {
    192 	if lx.atEOF {
    193 		lx.atEOF = false
    194 		return
    195 	}
    196 	if lx.nprev < 1 {
    197 		panic("BUG in lexer: backed up too far")
    198 	}
    199 	w := lx.prevWidths[0]
    200 	lx.prevWidths[0] = lx.prevWidths[1]
    201 	lx.prevWidths[1] = lx.prevWidths[2]
    202 	lx.prevWidths[2] = lx.prevWidths[3]
    203 	lx.nprev--
    204 
    205 	lx.pos -= w
    206 	if lx.pos < len(lx.input) && lx.input[lx.pos] == '\n' {
    207 		lx.line--
    208 	}
    209 }
    210 
    211 // accept consumes the next rune if it's equal to `valid`.
    212 func (lx *lexer) accept(valid rune) bool {
    213 	if lx.next() == valid {
    214 		return true
    215 	}
    216 	lx.backup()
    217 	return false
    218 }
    219 
    220 // peek returns but does not consume the next rune in the input.
    221 func (lx *lexer) peek() rune {
    222 	r := lx.next()
    223 	lx.backup()
    224 	return r
    225 }
    226 
    227 // skip ignores all input that matches the given predicate.
    228 func (lx *lexer) skip(pred func(rune) bool) {
    229 	for {
    230 		r := lx.next()
    231 		if pred(r) {
    232 			continue
    233 		}
    234 		lx.backup()
    235 		lx.ignore()
    236 		return
    237 	}
    238 }
    239 
    240 // error stops all lexing by emitting an error and returning `nil`.
    241 //
    242 // Note that any value that is a character is escaped if it's a special
    243 // character (newlines, tabs, etc.).
    244 func (lx *lexer) error(err error) stateFn {
    245 	if lx.atEOF {
    246 		return lx.errorPrevLine(err)
    247 	}
    248 	lx.items <- item{typ: itemError, pos: lx.getPos(), err: err}
    249 	return nil
    250 }
    251 
    252 // errorfPrevline is like error(), but sets the position to the last column of
    253 // the previous line.
    254 //
    255 // This is so that unexpected EOF or NL errors don't show on a new blank line.
    256 func (lx *lexer) errorPrevLine(err error) stateFn {
    257 	pos := lx.getPos()
    258 	pos.Line--
    259 	pos.Len = 1
    260 	pos.Start = lx.pos - 1
    261 	lx.items <- item{typ: itemError, pos: pos, err: err}
    262 	return nil
    263 }
    264 
    265 // errorPos is like error(), but allows explicitly setting the position.
    266 func (lx *lexer) errorPos(start, length int, err error) stateFn {
    267 	pos := lx.getPos()
    268 	pos.Start = start
    269 	pos.Len = length
    270 	lx.items <- item{typ: itemError, pos: pos, err: err}
    271 	return nil
    272 }
    273 
    274 // errorf is like error, and creates a new error.
    275 func (lx *lexer) errorf(format string, values ...any) stateFn {
    276 	if lx.atEOF {
    277 		pos := lx.getPos()
    278 		pos.Line--
    279 		pos.Len = 1
    280 		pos.Start = lx.pos - 1
    281 		lx.items <- item{typ: itemError, pos: pos, err: fmt.Errorf(format, values...)}
    282 		return nil
    283 	}
    284 	lx.items <- item{typ: itemError, pos: lx.getPos(), err: fmt.Errorf(format, values...)}
    285 	return nil
    286 }
    287 
    288 func (lx *lexer) errorControlChar(cc rune) stateFn {
    289 	return lx.errorPos(lx.pos-1, 1, errLexControl{cc})
    290 }
    291 
    292 // lexTop consumes elements at the top level of TOML data.
    293 func lexTop(lx *lexer) stateFn {
    294 	r := lx.next()
    295 	if isWhitespace(r) || isNL(r) {
    296 		return lexSkip(lx, lexTop)
    297 	}
    298 	switch r {
    299 	case '#':
    300 		lx.push(lexTop)
    301 		return lexCommentStart
    302 	case '[':
    303 		return lexTableStart
    304 	case eof:
    305 		if lx.pos > lx.start {
    306 			return lx.errorf("unexpected EOF")
    307 		}
    308 		lx.emit(itemEOF)
    309 		return nil
    310 	}
    311 
    312 	// At this point, the only valid item can be a key, so we back up
    313 	// and let the key lexer do the rest.
    314 	lx.backup()
    315 	lx.push(lexTopEnd)
    316 	return lexKeyStart
    317 }
    318 
    319 // lexTopEnd is entered whenever a top-level item has been consumed. (A value
    320 // or a table.) It must see only whitespace, and will turn back to lexTop
    321 // upon a newline. If it sees EOF, it will quit the lexer successfully.
    322 func lexTopEnd(lx *lexer) stateFn {
    323 	r := lx.next()
    324 	switch {
    325 	case r == '#':
    326 		// a comment will read to a newline for us.
    327 		lx.push(lexTop)
    328 		return lexCommentStart
    329 	case isWhitespace(r):
    330 		return lexTopEnd
    331 	case isNL(r):
    332 		lx.ignore()
    333 		return lexTop
    334 	case r == eof:
    335 		lx.emit(itemEOF)
    336 		return nil
    337 	}
    338 	return lx.errorf("expected a top-level item to end with a newline, comment, or EOF, but got %q instead", r)
    339 }
    340 
    341 // lexTable lexes the beginning of a table. Namely, it makes sure that
    342 // it starts with a character other than '.' and ']'.
    343 // It assumes that '[' has already been consumed.
    344 // It also handles the case that this is an item in an array of tables.
    345 // e.g., '[[name]]'.
    346 func lexTableStart(lx *lexer) stateFn {
    347 	if lx.peek() == '[' {
    348 		lx.next()
    349 		lx.emit(itemArrayTableStart)
    350 		lx.push(lexArrayTableEnd)
    351 	} else {
    352 		lx.emit(itemTableStart)
    353 		lx.push(lexTableEnd)
    354 	}
    355 	return lexTableNameStart
    356 }
    357 
    358 func lexTableEnd(lx *lexer) stateFn {
    359 	lx.emit(itemTableEnd)
    360 	return lexTopEnd
    361 }
    362 
    363 func lexArrayTableEnd(lx *lexer) stateFn {
    364 	if r := lx.next(); r != ']' {
    365 		return lx.errorf("expected end of table array name delimiter ']', but got %q instead", r)
    366 	}
    367 	lx.emit(itemArrayTableEnd)
    368 	return lexTopEnd
    369 }
    370 
    371 func lexTableNameStart(lx *lexer) stateFn {
    372 	lx.skip(isWhitespace)
    373 	switch r := lx.peek(); {
    374 	case r == ']' || r == eof:
    375 		return lx.errorf("unexpected end of table name (table names cannot be empty)")
    376 	case r == '.':
    377 		return lx.errorf("unexpected table separator (table names cannot be empty)")
    378 	case r == '"' || r == '\'':
    379 		lx.ignore()
    380 		lx.push(lexTableNameEnd)
    381 		return lexQuotedName
    382 	default:
    383 		lx.push(lexTableNameEnd)
    384 		return lexBareName
    385 	}
    386 }
    387 
    388 // lexTableNameEnd reads the end of a piece of a table name, optionally
    389 // consuming whitespace.
    390 func lexTableNameEnd(lx *lexer) stateFn {
    391 	lx.skip(isWhitespace)
    392 	switch r := lx.next(); {
    393 	case isWhitespace(r):
    394 		return lexTableNameEnd
    395 	case r == '.':
    396 		lx.ignore()
    397 		return lexTableNameStart
    398 	case r == ']':
    399 		return lx.pop()
    400 	default:
    401 		return lx.errorf("expected '.' or ']' to end table name, but got %q instead", r)
    402 	}
    403 }
    404 
    405 // lexBareName lexes one part of a key or table.
    406 //
    407 // It assumes that at least one valid character for the table has already been
    408 // read.
    409 //
    410 // Lexes only one part, e.g. only 'a' inside 'a.b'.
    411 func lexBareName(lx *lexer) stateFn {
    412 	r := lx.next()
    413 	if isBareKeyChar(r, lx.tomlNext) {
    414 		return lexBareName
    415 	}
    416 	lx.backup()
    417 	lx.emit(itemText)
    418 	return lx.pop()
    419 }
    420 
    421 // lexBareName lexes one part of a key or table.
    422 //
    423 // It assumes that at least one valid character for the table has already been
    424 // read.
    425 //
    426 // Lexes only one part, e.g. only '"a"' inside '"a".b'.
    427 func lexQuotedName(lx *lexer) stateFn {
    428 	r := lx.next()
    429 	switch {
    430 	case isWhitespace(r):
    431 		return lexSkip(lx, lexValue)
    432 	case r == '"':
    433 		lx.ignore() // ignore the '"'
    434 		return lexString
    435 	case r == '\'':
    436 		lx.ignore() // ignore the "'"
    437 		return lexRawString
    438 	case r == eof:
    439 		return lx.errorf("unexpected EOF; expected value")
    440 	default:
    441 		return lx.errorf("expected value but found %q instead", r)
    442 	}
    443 }
    444 
    445 // lexKeyStart consumes all key parts until a '='.
    446 func lexKeyStart(lx *lexer) stateFn {
    447 	lx.skip(isWhitespace)
    448 	switch r := lx.peek(); {
    449 	case r == '=' || r == eof:
    450 		return lx.errorf("unexpected '=': key name appears blank")
    451 	case r == '.':
    452 		return lx.errorf("unexpected '.': keys cannot start with a '.'")
    453 	case r == '"' || r == '\'':
    454 		lx.ignore()
    455 		fallthrough
    456 	default: // Bare key
    457 		lx.emit(itemKeyStart)
    458 		return lexKeyNameStart
    459 	}
    460 }
    461 
    462 func lexKeyNameStart(lx *lexer) stateFn {
    463 	lx.skip(isWhitespace)
    464 	switch r := lx.peek(); {
    465 	case r == '=' || r == eof:
    466 		return lx.errorf("unexpected '='")
    467 	case r == '.':
    468 		return lx.errorf("unexpected '.'")
    469 	case r == '"' || r == '\'':
    470 		lx.ignore()
    471 		lx.push(lexKeyEnd)
    472 		return lexQuotedName
    473 	default:
    474 		lx.push(lexKeyEnd)
    475 		return lexBareName
    476 	}
    477 }
    478 
    479 // lexKeyEnd consumes the end of a key and trims whitespace (up to the key
    480 // separator).
    481 func lexKeyEnd(lx *lexer) stateFn {
    482 	lx.skip(isWhitespace)
    483 	switch r := lx.next(); {
    484 	case isWhitespace(r):
    485 		return lexSkip(lx, lexKeyEnd)
    486 	case r == eof:
    487 		return lx.errorf("unexpected EOF; expected key separator '='")
    488 	case r == '.':
    489 		lx.ignore()
    490 		return lexKeyNameStart
    491 	case r == '=':
    492 		lx.emit(itemKeyEnd)
    493 		return lexSkip(lx, lexValue)
    494 	default:
    495 		return lx.errorf("expected '.' or '=', but got %q instead", r)
    496 	}
    497 }
    498 
    499 // lexValue starts the consumption of a value anywhere a value is expected.
    500 // lexValue will ignore whitespace.
    501 // After a value is lexed, the last state on the next is popped and returned.
    502 func lexValue(lx *lexer) stateFn {
    503 	// We allow whitespace to precede a value, but NOT newlines.
    504 	// In array syntax, the array states are responsible for ignoring newlines.
    505 	r := lx.next()
    506 	switch {
    507 	case isWhitespace(r):
    508 		return lexSkip(lx, lexValue)
    509 	case isDigit(r):
    510 		lx.backup() // avoid an extra state and use the same as above
    511 		return lexNumberOrDateStart
    512 	}
    513 	switch r {
    514 	case '[':
    515 		lx.ignore()
    516 		lx.emit(itemArray)
    517 		return lexArrayValue
    518 	case '{':
    519 		lx.ignore()
    520 		lx.emit(itemInlineTableStart)
    521 		return lexInlineTableValue
    522 	case '"':
    523 		if lx.accept('"') {
    524 			if lx.accept('"') {
    525 				lx.ignore() // Ignore """
    526 				return lexMultilineString
    527 			}
    528 			lx.backup()
    529 		}
    530 		lx.ignore() // ignore the '"'
    531 		return lexString
    532 	case '\'':
    533 		if lx.accept('\'') {
    534 			if lx.accept('\'') {
    535 				lx.ignore() // Ignore """
    536 				return lexMultilineRawString
    537 			}
    538 			lx.backup()
    539 		}
    540 		lx.ignore() // ignore the "'"
    541 		return lexRawString
    542 	case '.': // special error case, be kind to users
    543 		return lx.errorf("floats must start with a digit, not '.'")
    544 	case 'i', 'n':
    545 		if (lx.accept('n') && lx.accept('f')) || (lx.accept('a') && lx.accept('n')) {
    546 			lx.emit(itemFloat)
    547 			return lx.pop()
    548 		}
    549 	case '-', '+':
    550 		return lexDecimalNumberStart
    551 	}
    552 	if unicode.IsLetter(r) {
    553 		// Be permissive here; lexBool will give a nice error if the
    554 		// user wrote something like
    555 		//   x = foo
    556 		// (i.e. not 'true' or 'false' but is something else word-like.)
    557 		lx.backup()
    558 		return lexBool
    559 	}
    560 	if r == eof {
    561 		return lx.errorf("unexpected EOF; expected value")
    562 	}
    563 	return lx.errorf("expected value but found %q instead", r)
    564 }
    565 
    566 // lexArrayValue consumes one value in an array. It assumes that '[' or ','
    567 // have already been consumed. All whitespace and newlines are ignored.
    568 func lexArrayValue(lx *lexer) stateFn {
    569 	r := lx.next()
    570 	switch {
    571 	case isWhitespace(r) || isNL(r):
    572 		return lexSkip(lx, lexArrayValue)
    573 	case r == '#':
    574 		lx.push(lexArrayValue)
    575 		return lexCommentStart
    576 	case r == ',':
    577 		return lx.errorf("unexpected comma")
    578 	case r == ']':
    579 		return lexArrayEnd
    580 	}
    581 
    582 	lx.backup()
    583 	lx.push(lexArrayValueEnd)
    584 	return lexValue
    585 }
    586 
    587 // lexArrayValueEnd consumes everything between the end of an array value and
    588 // the next value (or the end of the array): it ignores whitespace and newlines
    589 // and expects either a ',' or a ']'.
    590 func lexArrayValueEnd(lx *lexer) stateFn {
    591 	switch r := lx.next(); {
    592 	case isWhitespace(r) || isNL(r):
    593 		return lexSkip(lx, lexArrayValueEnd)
    594 	case r == '#':
    595 		lx.push(lexArrayValueEnd)
    596 		return lexCommentStart
    597 	case r == ',':
    598 		lx.ignore()
    599 		return lexArrayValue // move on to the next value
    600 	case r == ']':
    601 		return lexArrayEnd
    602 	default:
    603 		return lx.errorf("expected a comma (',') or array terminator (']'), but got %s", runeOrEOF(r))
    604 	}
    605 }
    606 
    607 // lexArrayEnd finishes the lexing of an array.
    608 // It assumes that a ']' has just been consumed.
    609 func lexArrayEnd(lx *lexer) stateFn {
    610 	lx.ignore()
    611 	lx.emit(itemArrayEnd)
    612 	return lx.pop()
    613 }
    614 
    615 // lexInlineTableValue consumes one key/value pair in an inline table.
    616 // It assumes that '{' or ',' have already been consumed. Whitespace is ignored.
    617 func lexInlineTableValue(lx *lexer) stateFn {
    618 	r := lx.next()
    619 	switch {
    620 	case isWhitespace(r):
    621 		return lexSkip(lx, lexInlineTableValue)
    622 	case isNL(r):
    623 		if lx.tomlNext {
    624 			return lexSkip(lx, lexInlineTableValue)
    625 		}
    626 		return lx.errorPrevLine(errLexInlineTableNL{})
    627 	case r == '#':
    628 		lx.push(lexInlineTableValue)
    629 		return lexCommentStart
    630 	case r == ',':
    631 		return lx.errorf("unexpected comma")
    632 	case r == '}':
    633 		return lexInlineTableEnd
    634 	}
    635 	lx.backup()
    636 	lx.push(lexInlineTableValueEnd)
    637 	return lexKeyStart
    638 }
    639 
    640 // lexInlineTableValueEnd consumes everything between the end of an inline table
    641 // key/value pair and the next pair (or the end of the table):
    642 // it ignores whitespace and expects either a ',' or a '}'.
    643 func lexInlineTableValueEnd(lx *lexer) stateFn {
    644 	switch r := lx.next(); {
    645 	case isWhitespace(r):
    646 		return lexSkip(lx, lexInlineTableValueEnd)
    647 	case isNL(r):
    648 		if lx.tomlNext {
    649 			return lexSkip(lx, lexInlineTableValueEnd)
    650 		}
    651 		return lx.errorPrevLine(errLexInlineTableNL{})
    652 	case r == '#':
    653 		lx.push(lexInlineTableValueEnd)
    654 		return lexCommentStart
    655 	case r == ',':
    656 		lx.ignore()
    657 		lx.skip(isWhitespace)
    658 		if lx.peek() == '}' {
    659 			if lx.tomlNext {
    660 				return lexInlineTableValueEnd
    661 			}
    662 			return lx.errorf("trailing comma not allowed in inline tables")
    663 		}
    664 		return lexInlineTableValue
    665 	case r == '}':
    666 		return lexInlineTableEnd
    667 	default:
    668 		return lx.errorf("expected a comma or an inline table terminator '}', but got %s instead", runeOrEOF(r))
    669 	}
    670 }
    671 
    672 func runeOrEOF(r rune) string {
    673 	if r == eof {
    674 		return "end of file"
    675 	}
    676 	return "'" + string(r) + "'"
    677 }
    678 
    679 // lexInlineTableEnd finishes the lexing of an inline table.
    680 // It assumes that a '}' has just been consumed.
    681 func lexInlineTableEnd(lx *lexer) stateFn {
    682 	lx.ignore()
    683 	lx.emit(itemInlineTableEnd)
    684 	return lx.pop()
    685 }
    686 
    687 // lexString consumes the inner contents of a string. It assumes that the
    688 // beginning '"' has already been consumed and ignored.
    689 func lexString(lx *lexer) stateFn {
    690 	r := lx.next()
    691 	switch {
    692 	case r == eof:
    693 		return lx.errorf(`unexpected EOF; expected '"'`)
    694 	case isNL(r):
    695 		return lx.errorPrevLine(errLexStringNL{})
    696 	case r == '\\':
    697 		lx.push(lexString)
    698 		return lexStringEscape
    699 	case r == '"':
    700 		lx.backup()
    701 		if lx.esc {
    702 			lx.esc = false
    703 			lx.emit(itemStringEsc)
    704 		} else {
    705 			lx.emit(itemString)
    706 		}
    707 		lx.next()
    708 		lx.ignore()
    709 		return lx.pop()
    710 	}
    711 	return lexString
    712 }
    713 
// lexMultilineString consumes the inner contents of a multiline basic string.
// It assumes that the beginning '"""' has already been consumed and ignored.
//
// A backslash hands over to lexMultilineStringEscape. On the closing '"""' the
// contents are emitted as itemMultilineString (without the delimiter) and the
// state on top of the stack is resumed.
func lexMultilineString(lx *lexer) stateFn {
	r := lx.next()
	switch r {
	default:
		return lexMultilineString
	case eof:
		return lx.errorf(`unexpected EOF; expected '"""'`)
	case '\\':
		return lexMultilineStringEscape
	case '"':
		/// Found " → try to read two more "".
		if lx.accept('"') {
			if lx.accept('"') {
				/// Peek ahead: the string can contain " and "", including at the
				/// end: """str"""""
				/// 6 or more at the end, however, is an error.
				if lx.peek() == '"' {
					/// Check if we already lexed 5 "s; if so we have 6 now, and
					/// that is too many.
					///
					/// Second check is for the edge case:
					///
					///            two quotes allowed.
					///            vv
					///   """lol \""""""
					///          ^^  ^^^---- closing three
					///     escaped
					///
					/// A bit ugly, but it works.
					if strings.HasSuffix(lx.current(), `"""""`) && !strings.HasSuffix(lx.current(), `\"""""`) {
						return lx.errorf(`unexpected '""""""'`)
					}
					/// Un-read two of the three quotes so the scan resumes
					/// one quote further along.
					lx.backup()
					lx.backup()
					return lexMultilineString
				}

				lx.backup() /// backup: don't include the """ in the item.
				lx.backup()
				lx.backup()
				/// Clear the escape flag that lexStringEscape may have set.
				lx.esc = false
				lx.emit(itemMultilineString)
				lx.next() /// Read over """ again and discard it.
				lx.next()
				lx.next()
				lx.ignore()
				return lx.pop()
			}
			lx.backup()
		}
		return lexMultilineString
	}
}
    769 
    770 // lexRawString consumes a raw string. Nothing can be escaped in such a string.
    771 // It assumes that the beginning "'" has already been consumed and ignored.
    772 func lexRawString(lx *lexer) stateFn {
    773 	r := lx.next()
    774 	switch {
    775 	default:
    776 		return lexRawString
    777 	case r == eof:
    778 		return lx.errorf(`unexpected EOF; expected "'"`)
    779 	case isNL(r):
    780 		return lx.errorPrevLine(errLexStringNL{})
    781 	case r == '\'':
    782 		lx.backup()
    783 		lx.emit(itemRawString)
    784 		lx.next()
    785 		lx.ignore()
    786 		return lx.pop()
    787 	}
    788 }
    789 
// lexMultilineRawString consumes a multiline raw string. Nothing can be
// escaped in such a string. It assumes that the beginning triple-' has already
// been consumed and ignored.
//
// On the closing triple-' the contents are emitted as itemRawMultilineString
// (without the delimiter) and the state on top of the stack is resumed.
func lexMultilineRawString(lx *lexer) stateFn {
	r := lx.next()
	switch r {
	default:
		return lexMultilineRawString
	case eof:
		return lx.errorf(`unexpected EOF; expected "'''"`)
	case '\'':
		/// Found ' → try to read two more ''.
		if lx.accept('\'') {
			if lx.accept('\'') {
				/// Peek ahead: the string can contain ' and '', including at the
				/// end: '''str'''''
				/// 6 or more at the end, however, is an error.
				if lx.peek() == '\'' {
					/// Check if we already lexed 5 's; if so we have 6 now, and
					/// that is too many.
					if strings.HasSuffix(lx.current(), "'''''") {
						return lx.errorf(`unexpected "''''''"`)
					}
					/// Un-read two of the three quotes so the scan resumes
					/// one quote further along.
					lx.backup()
					lx.backup()
					return lexMultilineRawString
				}

				lx.backup() /// backup: don't include the ''' in the item.
				lx.backup()
				lx.backup()
				lx.emit(itemRawMultilineString)
				lx.next() /// Read over ''' again and discard it.
				lx.next()
				lx.next()
				lx.ignore()
				return lx.pop()
			}
			lx.backup()
		}
		return lexMultilineRawString
	}
}
    833 
    834 // lexMultilineStringEscape consumes an escaped character. It assumes that the
    835 // preceding '\\' has already been consumed.
    836 func lexMultilineStringEscape(lx *lexer) stateFn {
    837 	if isNL(lx.next()) { /// \ escaping newline.
    838 		return lexMultilineString
    839 	}
    840 	lx.backup()
    841 	lx.push(lexMultilineString)
    842 	return lexStringEscape(lx)
    843 }
    844 
    845 func lexStringEscape(lx *lexer) stateFn {
    846 	lx.esc = true
    847 	r := lx.next()
    848 	switch r {
    849 	case 'e':
    850 		if !lx.tomlNext {
    851 			return lx.error(errLexEscape{r})
    852 		}
    853 		fallthrough
    854 	case 'b':
    855 		fallthrough
    856 	case 't':
    857 		fallthrough
    858 	case 'n':
    859 		fallthrough
    860 	case 'f':
    861 		fallthrough
    862 	case 'r':
    863 		fallthrough
    864 	case '"':
    865 		fallthrough
    866 	case ' ', '\t':
    867 		// Inside """ .. """ strings you can use \ to escape newlines, and any
    868 		// amount of whitespace can be between the \ and \n.
    869 		fallthrough
    870 	case '\\':
    871 		return lx.pop()
    872 	case 'x':
    873 		if !lx.tomlNext {
    874 			return lx.error(errLexEscape{r})
    875 		}
    876 		return lexHexEscape
    877 	case 'u':
    878 		return lexShortUnicodeEscape
    879 	case 'U':
    880 		return lexLongUnicodeEscape
    881 	}
    882 	return lx.error(errLexEscape{r})
    883 }
    884 
    885 func lexHexEscape(lx *lexer) stateFn {
    886 	var r rune
    887 	for i := 0; i < 2; i++ {
    888 		r = lx.next()
    889 		if !isHex(r) {
    890 			return lx.errorf(`expected two hexadecimal digits after '\x', but got %q instead`, lx.current())
    891 		}
    892 	}
    893 	return lx.pop()
    894 }
    895 
    896 func lexShortUnicodeEscape(lx *lexer) stateFn {
    897 	var r rune
    898 	for i := 0; i < 4; i++ {
    899 		r = lx.next()
    900 		if !isHex(r) {
    901 			return lx.errorf(`expected four hexadecimal digits after '\u', but got %q instead`, lx.current())
    902 		}
    903 	}
    904 	return lx.pop()
    905 }
    906 
    907 func lexLongUnicodeEscape(lx *lexer) stateFn {
    908 	var r rune
    909 	for i := 0; i < 8; i++ {
    910 		r = lx.next()
    911 		if !isHex(r) {
    912 			return lx.errorf(`expected eight hexadecimal digits after '\U', but got %q instead`, lx.current())
    913 		}
    914 	}
    915 	return lx.pop()
    916 }
    917 
    918 // lexNumberOrDateStart processes the first character of a value which begins
    919 // with a digit. It exists to catch values starting with '0', so that
    920 // lexBaseNumberOrDate can differentiate base prefixed integers from other
    921 // types.
    922 func lexNumberOrDateStart(lx *lexer) stateFn {
    923 	r := lx.next()
    924 	switch r {
    925 	case '0':
    926 		return lexBaseNumberOrDate
    927 	}
    928 
    929 	if !isDigit(r) {
    930 		// The only way to reach this state is if the value starts
    931 		// with a digit, so specifically treat anything else as an
    932 		// error.
    933 		return lx.errorf("expected a digit but got %q", r)
    934 	}
    935 
    936 	return lexNumberOrDate
    937 }
    938 
    939 // lexNumberOrDate consumes either an integer, float or datetime.
    940 func lexNumberOrDate(lx *lexer) stateFn {
    941 	r := lx.next()
    942 	if isDigit(r) {
    943 		return lexNumberOrDate
    944 	}
    945 	switch r {
    946 	case '-', ':':
    947 		return lexDatetime
    948 	case '_':
    949 		return lexDecimalNumber
    950 	case '.', 'e', 'E':
    951 		return lexFloat
    952 	}
    953 
    954 	lx.backup()
    955 	lx.emit(itemInteger)
    956 	return lx.pop()
    957 }
    958 
    959 // lexDatetime consumes a Datetime, to a first approximation.
    960 // The parser validates that it matches one of the accepted formats.
    961 func lexDatetime(lx *lexer) stateFn {
    962 	r := lx.next()
    963 	if isDigit(r) {
    964 		return lexDatetime
    965 	}
    966 	switch r {
    967 	case '-', ':', 'T', 't', ' ', '.', 'Z', 'z', '+':
    968 		return lexDatetime
    969 	}
    970 
    971 	lx.backup()
    972 	lx.emitTrim(itemDatetime)
    973 	return lx.pop()
    974 }
    975 
    976 // lexHexInteger consumes a hexadecimal integer after seeing the '0x' prefix.
    977 func lexHexInteger(lx *lexer) stateFn {
    978 	r := lx.next()
    979 	if isHex(r) {
    980 		return lexHexInteger
    981 	}
    982 	switch r {
    983 	case '_':
    984 		return lexHexInteger
    985 	}
    986 
    987 	lx.backup()
    988 	lx.emit(itemInteger)
    989 	return lx.pop()
    990 }
    991 
    992 // lexOctalInteger consumes an octal integer after seeing the '0o' prefix.
    993 func lexOctalInteger(lx *lexer) stateFn {
    994 	r := lx.next()
    995 	if isOctal(r) {
    996 		return lexOctalInteger
    997 	}
    998 	switch r {
    999 	case '_':
   1000 		return lexOctalInteger
   1001 	}
   1002 
   1003 	lx.backup()
   1004 	lx.emit(itemInteger)
   1005 	return lx.pop()
   1006 }
   1007 
   1008 // lexBinaryInteger consumes a binary integer after seeing the '0b' prefix.
   1009 func lexBinaryInteger(lx *lexer) stateFn {
   1010 	r := lx.next()
   1011 	if isBinary(r) {
   1012 		return lexBinaryInteger
   1013 	}
   1014 	switch r {
   1015 	case '_':
   1016 		return lexBinaryInteger
   1017 	}
   1018 
   1019 	lx.backup()
   1020 	lx.emit(itemInteger)
   1021 	return lx.pop()
   1022 }
   1023 
   1024 // lexDecimalNumber consumes a decimal float or integer.
   1025 func lexDecimalNumber(lx *lexer) stateFn {
   1026 	r := lx.next()
   1027 	if isDigit(r) {
   1028 		return lexDecimalNumber
   1029 	}
   1030 	switch r {
   1031 	case '.', 'e', 'E':
   1032 		return lexFloat
   1033 	case '_':
   1034 		return lexDecimalNumber
   1035 	}
   1036 
   1037 	lx.backup()
   1038 	lx.emit(itemInteger)
   1039 	return lx.pop()
   1040 }
   1041 
   1042 // lexDecimalNumber consumes the first digit of a number beginning with a sign.
   1043 // It assumes the sign has already been consumed. Values which start with a sign
   1044 // are only allowed to be decimal integers or floats.
   1045 //
   1046 // The special "nan" and "inf" values are also recognized.
   1047 func lexDecimalNumberStart(lx *lexer) stateFn {
   1048 	r := lx.next()
   1049 
   1050 	// Special error cases to give users better error messages
   1051 	switch r {
   1052 	case 'i':
   1053 		if !lx.accept('n') || !lx.accept('f') {
   1054 			return lx.errorf("invalid float: '%s'", lx.current())
   1055 		}
   1056 		lx.emit(itemFloat)
   1057 		return lx.pop()
   1058 	case 'n':
   1059 		if !lx.accept('a') || !lx.accept('n') {
   1060 			return lx.errorf("invalid float: '%s'", lx.current())
   1061 		}
   1062 		lx.emit(itemFloat)
   1063 		return lx.pop()
   1064 	case '0':
   1065 		p := lx.peek()
   1066 		switch p {
   1067 		case 'b', 'o', 'x':
   1068 			return lx.errorf("cannot use sign with non-decimal numbers: '%s%c'", lx.current(), p)
   1069 		}
   1070 	case '.':
   1071 		return lx.errorf("floats must start with a digit, not '.'")
   1072 	}
   1073 
   1074 	if isDigit(r) {
   1075 		return lexDecimalNumber
   1076 	}
   1077 
   1078 	return lx.errorf("expected a digit but got %q", r)
   1079 }
   1080 
   1081 // lexBaseNumberOrDate differentiates between the possible values which
   1082 // start with '0'. It assumes that before reaching this state, the initial '0'
   1083 // has been consumed.
   1084 func lexBaseNumberOrDate(lx *lexer) stateFn {
   1085 	r := lx.next()
   1086 	// Note: All datetimes start with at least two digits, so we don't
   1087 	// handle date characters (':', '-', etc.) here.
   1088 	if isDigit(r) {
   1089 		return lexNumberOrDate
   1090 	}
   1091 	switch r {
   1092 	case '_':
   1093 		// Can only be decimal, because there can't be an underscore
   1094 		// between the '0' and the base designator, and dates can't
   1095 		// contain underscores.
   1096 		return lexDecimalNumber
   1097 	case '.', 'e', 'E':
   1098 		return lexFloat
   1099 	case 'b':
   1100 		r = lx.peek()
   1101 		if !isBinary(r) {
   1102 			lx.errorf("not a binary number: '%s%c'", lx.current(), r)
   1103 		}
   1104 		return lexBinaryInteger
   1105 	case 'o':
   1106 		r = lx.peek()
   1107 		if !isOctal(r) {
   1108 			lx.errorf("not an octal number: '%s%c'", lx.current(), r)
   1109 		}
   1110 		return lexOctalInteger
   1111 	case 'x':
   1112 		r = lx.peek()
   1113 		if !isHex(r) {
   1114 			lx.errorf("not a hexidecimal number: '%s%c'", lx.current(), r)
   1115 		}
   1116 		return lexHexInteger
   1117 	}
   1118 
   1119 	lx.backup()
   1120 	lx.emit(itemInteger)
   1121 	return lx.pop()
   1122 }
   1123 
   1124 // lexFloat consumes the elements of a float. It allows any sequence of
   1125 // float-like characters, so floats emitted by the lexer are only a first
   1126 // approximation and must be validated by the parser.
   1127 func lexFloat(lx *lexer) stateFn {
   1128 	r := lx.next()
   1129 	if isDigit(r) {
   1130 		return lexFloat
   1131 	}
   1132 	switch r {
   1133 	case '_', '.', '-', '+', 'e', 'E':
   1134 		return lexFloat
   1135 	}
   1136 
   1137 	lx.backup()
   1138 	lx.emit(itemFloat)
   1139 	return lx.pop()
   1140 }
   1141 
   1142 // lexBool consumes a bool string: 'true' or 'false.
   1143 func lexBool(lx *lexer) stateFn {
   1144 	var rs []rune
   1145 	for {
   1146 		r := lx.next()
   1147 		if !unicode.IsLetter(r) {
   1148 			lx.backup()
   1149 			break
   1150 		}
   1151 		rs = append(rs, r)
   1152 	}
   1153 	s := string(rs)
   1154 	switch s {
   1155 	case "true", "false":
   1156 		lx.emit(itemBool)
   1157 		return lx.pop()
   1158 	}
   1159 	return lx.errorf("expected value but found %q instead", s)
   1160 }
   1161 
   1162 // lexCommentStart begins the lexing of a comment. It will emit
   1163 // itemCommentStart and consume no characters, passing control to lexComment.
   1164 func lexCommentStart(lx *lexer) stateFn {
   1165 	lx.ignore()
   1166 	lx.emit(itemCommentStart)
   1167 	return lexComment
   1168 }
   1169 
   1170 // lexComment lexes an entire comment. It assumes that '#' has been consumed.
   1171 // It will consume *up to* the first newline character, and pass control
   1172 // back to the last state on the stack.
   1173 func lexComment(lx *lexer) stateFn {
   1174 	switch r := lx.next(); {
   1175 	case isNL(r) || r == eof:
   1176 		lx.backup()
   1177 		lx.emit(itemText)
   1178 		return lx.pop()
   1179 	default:
   1180 		return lexComment
   1181 	}
   1182 }
   1183 
   1184 // lexSkip ignores all slurped input and moves on to the next state.
   1185 func lexSkip(lx *lexer, nextState stateFn) stateFn {
   1186 	lx.ignore()
   1187 	return nextState
   1188 }
   1189 
   1190 func (s stateFn) String() string {
   1191 	name := runtime.FuncForPC(reflect.ValueOf(s).Pointer()).Name()
   1192 	if i := strings.LastIndexByte(name, '.'); i > -1 {
   1193 		name = name[i+1:]
   1194 	}
   1195 	if s == nil {
   1196 		name = "<nil>"
   1197 	}
   1198 	return name + "()"
   1199 }
   1200 
   1201 func (itype itemType) String() string {
   1202 	switch itype {
   1203 	case itemError:
   1204 		return "Error"
   1205 	case itemNIL:
   1206 		return "NIL"
   1207 	case itemEOF:
   1208 		return "EOF"
   1209 	case itemText:
   1210 		return "Text"
   1211 	case itemString, itemStringEsc, itemRawString, itemMultilineString, itemRawMultilineString:
   1212 		return "String"
   1213 	case itemBool:
   1214 		return "Bool"
   1215 	case itemInteger:
   1216 		return "Integer"
   1217 	case itemFloat:
   1218 		return "Float"
   1219 	case itemDatetime:
   1220 		return "DateTime"
   1221 	case itemTableStart:
   1222 		return "TableStart"
   1223 	case itemTableEnd:
   1224 		return "TableEnd"
   1225 	case itemKeyStart:
   1226 		return "KeyStart"
   1227 	case itemKeyEnd:
   1228 		return "KeyEnd"
   1229 	case itemArray:
   1230 		return "Array"
   1231 	case itemArrayEnd:
   1232 		return "ArrayEnd"
   1233 	case itemCommentStart:
   1234 		return "CommentStart"
   1235 	case itemInlineTableStart:
   1236 		return "InlineTableStart"
   1237 	case itemInlineTableEnd:
   1238 		return "InlineTableEnd"
   1239 	}
   1240 	panic(fmt.Sprintf("BUG: Unknown type '%d'.", int(itype)))
   1241 }
   1242 
   1243 func (item item) String() string {
   1244 	return fmt.Sprintf("(%s, %s)", item.typ, item.val)
   1245 }
   1246 
   1247 func isWhitespace(r rune) bool { return r == '\t' || r == ' ' }
   1248 func isNL(r rune) bool         { return r == '\n' || r == '\r' }
   1249 func isControl(r rune) bool { // Control characters except \t, \r, \n
   1250 	switch r {
   1251 	case '\t', '\r', '\n':
   1252 		return false
   1253 	default:
   1254 		return (r >= 0x00 && r <= 0x1f) || r == 0x7f
   1255 	}
   1256 }
   1257 func isDigit(r rune) bool  { return r >= '0' && r <= '9' }
   1258 func isBinary(r rune) bool { return r == '0' || r == '1' }
   1259 func isOctal(r rune) bool  { return r >= '0' && r <= '7' }
   1260 func isHex(r rune) bool    { return (r >= '0' && r <= '9') || (r|0x20 >= 'a' && r|0x20 <= 'f') }
   1261 func isBareKeyChar(r rune, tomlNext bool) bool {
   1262 	if tomlNext {
   1263 		return (r >= 'A' && r <= 'Z') ||
   1264 			(r >= 'a' && r <= 'z') ||
   1265 			(r >= '0' && r <= '9') ||
   1266 			r == '_' || r == '-' ||
   1267 			r == 0xb2 || r == 0xb3 || r == 0xb9 || (r >= 0xbc && r <= 0xbe) ||
   1268 			(r >= 0xc0 && r <= 0xd6) || (r >= 0xd8 && r <= 0xf6) || (r >= 0xf8 && r <= 0x037d) ||
   1269 			(r >= 0x037f && r <= 0x1fff) ||
   1270 			(r >= 0x200c && r <= 0x200d) || (r >= 0x203f && r <= 0x2040) ||
   1271 			(r >= 0x2070 && r <= 0x218f) || (r >= 0x2460 && r <= 0x24ff) ||
   1272 			(r >= 0x2c00 && r <= 0x2fef) || (r >= 0x3001 && r <= 0xd7ff) ||
   1273 			(r >= 0xf900 && r <= 0xfdcf) || (r >= 0xfdf0 && r <= 0xfffd) ||
   1274 			(r >= 0x10000 && r <= 0xeffff)
   1275 	}
   1276 
   1277 	return (r >= 'A' && r <= 'Z') ||
   1278 		(r >= 'a' && r <= 'z') ||
   1279 		(r >= '0' && r <= '9') ||
   1280 		r == '_' || r == '-'
   1281 }
	mirror Mirror free and open-source projects you like with minimal effort
	git clone git://git.server.ky/slackcoder/mirror
	Log \| Files \| Refs \| README