lex.go (30931B)
1 package toml 2 3 import ( 4 "fmt" 5 "reflect" 6 "runtime" 7 "strings" 8 "unicode" 9 "unicode/utf8" 10 ) 11 12 type itemType int 13 14 const ( 15 itemError itemType = iota 16 itemNIL // used in the parser to indicate no type 17 itemEOF 18 itemText 19 itemString 20 itemStringEsc 21 itemRawString 22 itemMultilineString 23 itemRawMultilineString 24 itemBool 25 itemInteger 26 itemFloat 27 itemDatetime 28 itemArray // the start of an array 29 itemArrayEnd 30 itemTableStart 31 itemTableEnd 32 itemArrayTableStart 33 itemArrayTableEnd 34 itemKeyStart 35 itemKeyEnd 36 itemCommentStart 37 itemInlineTableStart 38 itemInlineTableEnd 39 ) 40 41 const eof = 0 42 43 type stateFn func(lx *lexer) stateFn 44 45 func (p Position) String() string { 46 return fmt.Sprintf("at line %d; start %d; length %d", p.Line, p.Start, p.Len) 47 } 48 49 type lexer struct { 50 input string 51 start int 52 pos int 53 line int 54 state stateFn 55 items chan item 56 tomlNext bool 57 esc bool 58 59 // Allow for backing up up to 4 runes. This is necessary because TOML 60 // contains 3-rune tokens (""" and '''). 61 prevWidths [4]int 62 nprev int // how many of prevWidths are in use 63 atEOF bool // If we emit an eof, we can still back up, but it is not OK to call next again. 64 65 // A stack of state functions used to maintain context. 66 // 67 // The idea is to reuse parts of the state machine in various places. For 68 // example, values can appear at the top level or within arbitrarily nested 69 // arrays. The last state on the stack is used after a value has been lexed. 70 // Similarly for comments. 71 stack []stateFn 72 } 73 74 type item struct { 75 typ itemType 76 val string 77 err error 78 pos Position 79 } 80 81 func (lx *lexer) nextItem() item { 82 for { 83 select { 84 case item := <-lx.items: 85 return item 86 default: 87 lx.state = lx.state(lx) 88 //fmt.Printf(" STATE %-24s current: %-10s stack: %s\n", lx.state, lx.current(), lx.stack) 89 } 90 } 91 } 92 93 func lex(input string, tomlNext bool) *lexer { 94 lx := &lexer{ 95 input: input, 96 state: lexTop, 97 items: make(chan item, 10), 98 stack: make([]stateFn, 0, 10), 99 line: 1, 100 tomlNext: tomlNext, 101 } 102 return lx 103 } 104 105 func (lx *lexer) push(state stateFn) { 106 lx.stack = append(lx.stack, state) 107 } 108 109 func (lx *lexer) pop() stateFn { 110 if len(lx.stack) == 0 { 111 return lx.errorf("BUG in lexer: no states to pop") 112 } 113 last := lx.stack[len(lx.stack)-1] 114 lx.stack = lx.stack[0 : len(lx.stack)-1] 115 return last 116 } 117 118 func (lx *lexer) current() string { 119 return lx.input[lx.start:lx.pos] 120 } 121 122 func (lx lexer) getPos() Position { 123 p := Position{ 124 Line: lx.line, 125 Start: lx.start, 126 Len: lx.pos - lx.start, 127 } 128 if p.Len <= 0 { 129 p.Len = 1 130 } 131 return p 132 } 133 134 func (lx *lexer) emit(typ itemType) { 135 // Needed for multiline strings ending with an incomplete UTF-8 sequence. 136 if lx.start > lx.pos { 137 lx.error(errLexUTF8{lx.input[lx.pos]}) 138 return 139 } 140 lx.items <- item{typ: typ, pos: lx.getPos(), val: lx.current()} 141 lx.start = lx.pos 142 } 143 144 func (lx *lexer) emitTrim(typ itemType) { 145 lx.items <- item{typ: typ, pos: lx.getPos(), val: strings.TrimSpace(lx.current())} 146 lx.start = lx.pos 147 } 148 149 func (lx *lexer) next() (r rune) { 150 if lx.atEOF { 151 panic("BUG in lexer: next called after EOF") 152 } 153 if lx.pos >= len(lx.input) { 154 lx.atEOF = true 155 return eof 156 } 157 158 if lx.input[lx.pos] == '\n' { 159 lx.line++ 160 } 161 lx.prevWidths[3] = lx.prevWidths[2] 162 lx.prevWidths[2] = lx.prevWidths[1] 163 lx.prevWidths[1] = lx.prevWidths[0] 164 if lx.nprev < 4 { 165 lx.nprev++ 166 } 167 168 r, w := utf8.DecodeRuneInString(lx.input[lx.pos:]) 169 if r == utf8.RuneError && w == 1 { 170 lx.error(errLexUTF8{lx.input[lx.pos]}) 171 return utf8.RuneError 172 } 173 174 // Note: don't use peek() here, as this calls next(). 175 if isControl(r) || (r == '\r' && (len(lx.input)-1 == lx.pos || lx.input[lx.pos+1] != '\n')) { 176 lx.errorControlChar(r) 177 return utf8.RuneError 178 } 179 180 lx.prevWidths[0] = w 181 lx.pos += w 182 return r 183 } 184 185 // ignore skips over the pending input before this point. 186 func (lx *lexer) ignore() { 187 lx.start = lx.pos 188 } 189 190 // backup steps back one rune. Can be called 4 times between calls to next. 191 func (lx *lexer) backup() { 192 if lx.atEOF { 193 lx.atEOF = false 194 return 195 } 196 if lx.nprev < 1 { 197 panic("BUG in lexer: backed up too far") 198 } 199 w := lx.prevWidths[0] 200 lx.prevWidths[0] = lx.prevWidths[1] 201 lx.prevWidths[1] = lx.prevWidths[2] 202 lx.prevWidths[2] = lx.prevWidths[3] 203 lx.nprev-- 204 205 lx.pos -= w 206 if lx.pos < len(lx.input) && lx.input[lx.pos] == '\n' { 207 lx.line-- 208 } 209 } 210 211 // accept consumes the next rune if it's equal to `valid`. 212 func (lx *lexer) accept(valid rune) bool { 213 if lx.next() == valid { 214 return true 215 } 216 lx.backup() 217 return false 218 } 219 220 // peek returns but does not consume the next rune in the input. 221 func (lx *lexer) peek() rune { 222 r := lx.next() 223 lx.backup() 224 return r 225 } 226 227 // skip ignores all input that matches the given predicate. 228 func (lx *lexer) skip(pred func(rune) bool) { 229 for { 230 r := lx.next() 231 if pred(r) { 232 continue 233 } 234 lx.backup() 235 lx.ignore() 236 return 237 } 238 } 239 240 // error stops all lexing by emitting an error and returning `nil`. 241 // 242 // Note that any value that is a character is escaped if it's a special 243 // character (newlines, tabs, etc.). 244 func (lx *lexer) error(err error) stateFn { 245 if lx.atEOF { 246 return lx.errorPrevLine(err) 247 } 248 lx.items <- item{typ: itemError, pos: lx.getPos(), err: err} 249 return nil 250 } 251 252 // errorfPrevline is like error(), but sets the position to the last column of 253 // the previous line. 254 // 255 // This is so that unexpected EOF or NL errors don't show on a new blank line. 256 func (lx *lexer) errorPrevLine(err error) stateFn { 257 pos := lx.getPos() 258 pos.Line-- 259 pos.Len = 1 260 pos.Start = lx.pos - 1 261 lx.items <- item{typ: itemError, pos: pos, err: err} 262 return nil 263 } 264 265 // errorPos is like error(), but allows explicitly setting the position. 266 func (lx *lexer) errorPos(start, length int, err error) stateFn { 267 pos := lx.getPos() 268 pos.Start = start 269 pos.Len = length 270 lx.items <- item{typ: itemError, pos: pos, err: err} 271 return nil 272 } 273 274 // errorf is like error, and creates a new error. 275 func (lx *lexer) errorf(format string, values ...any) stateFn { 276 if lx.atEOF { 277 pos := lx.getPos() 278 pos.Line-- 279 pos.Len = 1 280 pos.Start = lx.pos - 1 281 lx.items <- item{typ: itemError, pos: pos, err: fmt.Errorf(format, values...)} 282 return nil 283 } 284 lx.items <- item{typ: itemError, pos: lx.getPos(), err: fmt.Errorf(format, values...)} 285 return nil 286 } 287 288 func (lx *lexer) errorControlChar(cc rune) stateFn { 289 return lx.errorPos(lx.pos-1, 1, errLexControl{cc}) 290 } 291 292 // lexTop consumes elements at the top level of TOML data. 293 func lexTop(lx *lexer) stateFn { 294 r := lx.next() 295 if isWhitespace(r) || isNL(r) { 296 return lexSkip(lx, lexTop) 297 } 298 switch r { 299 case '#': 300 lx.push(lexTop) 301 return lexCommentStart 302 case '[': 303 return lexTableStart 304 case eof: 305 if lx.pos > lx.start { 306 return lx.errorf("unexpected EOF") 307 } 308 lx.emit(itemEOF) 309 return nil 310 } 311 312 // At this point, the only valid item can be a key, so we back up 313 // and let the key lexer do the rest. 314 lx.backup() 315 lx.push(lexTopEnd) 316 return lexKeyStart 317 } 318 319 // lexTopEnd is entered whenever a top-level item has been consumed. (A value 320 // or a table.) It must see only whitespace, and will turn back to lexTop 321 // upon a newline. If it sees EOF, it will quit the lexer successfully. 322 func lexTopEnd(lx *lexer) stateFn { 323 r := lx.next() 324 switch { 325 case r == '#': 326 // a comment will read to a newline for us. 327 lx.push(lexTop) 328 return lexCommentStart 329 case isWhitespace(r): 330 return lexTopEnd 331 case isNL(r): 332 lx.ignore() 333 return lexTop 334 case r == eof: 335 lx.emit(itemEOF) 336 return nil 337 } 338 return lx.errorf("expected a top-level item to end with a newline, comment, or EOF, but got %q instead", r) 339 } 340 341 // lexTable lexes the beginning of a table. Namely, it makes sure that 342 // it starts with a character other than '.' and ']'. 343 // It assumes that '[' has already been consumed. 344 // It also handles the case that this is an item in an array of tables. 345 // e.g., '[[name]]'. 346 func lexTableStart(lx *lexer) stateFn { 347 if lx.peek() == '[' { 348 lx.next() 349 lx.emit(itemArrayTableStart) 350 lx.push(lexArrayTableEnd) 351 } else { 352 lx.emit(itemTableStart) 353 lx.push(lexTableEnd) 354 } 355 return lexTableNameStart 356 } 357 358 func lexTableEnd(lx *lexer) stateFn { 359 lx.emit(itemTableEnd) 360 return lexTopEnd 361 } 362 363 func lexArrayTableEnd(lx *lexer) stateFn { 364 if r := lx.next(); r != ']' { 365 return lx.errorf("expected end of table array name delimiter ']', but got %q instead", r) 366 } 367 lx.emit(itemArrayTableEnd) 368 return lexTopEnd 369 } 370 371 func lexTableNameStart(lx *lexer) stateFn { 372 lx.skip(isWhitespace) 373 switch r := lx.peek(); { 374 case r == ']' || r == eof: 375 return lx.errorf("unexpected end of table name (table names cannot be empty)") 376 case r == '.': 377 return lx.errorf("unexpected table separator (table names cannot be empty)") 378 case r == '"' || r == '\'': 379 lx.ignore() 380 lx.push(lexTableNameEnd) 381 return lexQuotedName 382 default: 383 lx.push(lexTableNameEnd) 384 return lexBareName 385 } 386 } 387 388 // lexTableNameEnd reads the end of a piece of a table name, optionally 389 // consuming whitespace. 390 func lexTableNameEnd(lx *lexer) stateFn { 391 lx.skip(isWhitespace) 392 switch r := lx.next(); { 393 case isWhitespace(r): 394 return lexTableNameEnd 395 case r == '.': 396 lx.ignore() 397 return lexTableNameStart 398 case r == ']': 399 return lx.pop() 400 default: 401 return lx.errorf("expected '.' or ']' to end table name, but got %q instead", r) 402 } 403 } 404 405 // lexBareName lexes one part of a key or table. 406 // 407 // It assumes that at least one valid character for the table has already been 408 // read. 409 // 410 // Lexes only one part, e.g. only 'a' inside 'a.b'. 411 func lexBareName(lx *lexer) stateFn { 412 r := lx.next() 413 if isBareKeyChar(r, lx.tomlNext) { 414 return lexBareName 415 } 416 lx.backup() 417 lx.emit(itemText) 418 return lx.pop() 419 } 420 421 // lexBareName lexes one part of a key or table. 422 // 423 // It assumes that at least one valid character for the table has already been 424 // read. 425 // 426 // Lexes only one part, e.g. only '"a"' inside '"a".b'. 427 func lexQuotedName(lx *lexer) stateFn { 428 r := lx.next() 429 switch { 430 case isWhitespace(r): 431 return lexSkip(lx, lexValue) 432 case r == '"': 433 lx.ignore() // ignore the '"' 434 return lexString 435 case r == '\'': 436 lx.ignore() // ignore the "'" 437 return lexRawString 438 case r == eof: 439 return lx.errorf("unexpected EOF; expected value") 440 default: 441 return lx.errorf("expected value but found %q instead", r) 442 } 443 } 444 445 // lexKeyStart consumes all key parts until a '='. 446 func lexKeyStart(lx *lexer) stateFn { 447 lx.skip(isWhitespace) 448 switch r := lx.peek(); { 449 case r == '=' || r == eof: 450 return lx.errorf("unexpected '=': key name appears blank") 451 case r == '.': 452 return lx.errorf("unexpected '.': keys cannot start with a '.'") 453 case r == '"' || r == '\'': 454 lx.ignore() 455 fallthrough 456 default: // Bare key 457 lx.emit(itemKeyStart) 458 return lexKeyNameStart 459 } 460 } 461 462 func lexKeyNameStart(lx *lexer) stateFn { 463 lx.skip(isWhitespace) 464 switch r := lx.peek(); { 465 case r == '=' || r == eof: 466 return lx.errorf("unexpected '='") 467 case r == '.': 468 return lx.errorf("unexpected '.'") 469 case r == '"' || r == '\'': 470 lx.ignore() 471 lx.push(lexKeyEnd) 472 return lexQuotedName 473 default: 474 lx.push(lexKeyEnd) 475 return lexBareName 476 } 477 } 478 479 // lexKeyEnd consumes the end of a key and trims whitespace (up to the key 480 // separator). 481 func lexKeyEnd(lx *lexer) stateFn { 482 lx.skip(isWhitespace) 483 switch r := lx.next(); { 484 case isWhitespace(r): 485 return lexSkip(lx, lexKeyEnd) 486 case r == eof: 487 return lx.errorf("unexpected EOF; expected key separator '='") 488 case r == '.': 489 lx.ignore() 490 return lexKeyNameStart 491 case r == '=': 492 lx.emit(itemKeyEnd) 493 return lexSkip(lx, lexValue) 494 default: 495 return lx.errorf("expected '.' or '=', but got %q instead", r) 496 } 497 } 498 499 // lexValue starts the consumption of a value anywhere a value is expected. 500 // lexValue will ignore whitespace. 501 // After a value is lexed, the last state on the next is popped and returned. 502 func lexValue(lx *lexer) stateFn { 503 // We allow whitespace to precede a value, but NOT newlines. 504 // In array syntax, the array states are responsible for ignoring newlines. 505 r := lx.next() 506 switch { 507 case isWhitespace(r): 508 return lexSkip(lx, lexValue) 509 case isDigit(r): 510 lx.backup() // avoid an extra state and use the same as above 511 return lexNumberOrDateStart 512 } 513 switch r { 514 case '[': 515 lx.ignore() 516 lx.emit(itemArray) 517 return lexArrayValue 518 case '{': 519 lx.ignore() 520 lx.emit(itemInlineTableStart) 521 return lexInlineTableValue 522 case '"': 523 if lx.accept('"') { 524 if lx.accept('"') { 525 lx.ignore() // Ignore """ 526 return lexMultilineString 527 } 528 lx.backup() 529 } 530 lx.ignore() // ignore the '"' 531 return lexString 532 case '\'': 533 if lx.accept('\'') { 534 if lx.accept('\'') { 535 lx.ignore() // Ignore """ 536 return lexMultilineRawString 537 } 538 lx.backup() 539 } 540 lx.ignore() // ignore the "'" 541 return lexRawString 542 case '.': // special error case, be kind to users 543 return lx.errorf("floats must start with a digit, not '.'") 544 case 'i', 'n': 545 if (lx.accept('n') && lx.accept('f')) || (lx.accept('a') && lx.accept('n')) { 546 lx.emit(itemFloat) 547 return lx.pop() 548 } 549 case '-', '+': 550 return lexDecimalNumberStart 551 } 552 if unicode.IsLetter(r) { 553 // Be permissive here; lexBool will give a nice error if the 554 // user wrote something like 555 // x = foo 556 // (i.e. not 'true' or 'false' but is something else word-like.) 557 lx.backup() 558 return lexBool 559 } 560 if r == eof { 561 return lx.errorf("unexpected EOF; expected value") 562 } 563 return lx.errorf("expected value but found %q instead", r) 564 } 565 566 // lexArrayValue consumes one value in an array. It assumes that '[' or ',' 567 // have already been consumed. All whitespace and newlines are ignored. 568 func lexArrayValue(lx *lexer) stateFn { 569 r := lx.next() 570 switch { 571 case isWhitespace(r) || isNL(r): 572 return lexSkip(lx, lexArrayValue) 573 case r == '#': 574 lx.push(lexArrayValue) 575 return lexCommentStart 576 case r == ',': 577 return lx.errorf("unexpected comma") 578 case r == ']': 579 return lexArrayEnd 580 } 581 582 lx.backup() 583 lx.push(lexArrayValueEnd) 584 return lexValue 585 } 586 587 // lexArrayValueEnd consumes everything between the end of an array value and 588 // the next value (or the end of the array): it ignores whitespace and newlines 589 // and expects either a ',' or a ']'. 590 func lexArrayValueEnd(lx *lexer) stateFn { 591 switch r := lx.next(); { 592 case isWhitespace(r) || isNL(r): 593 return lexSkip(lx, lexArrayValueEnd) 594 case r == '#': 595 lx.push(lexArrayValueEnd) 596 return lexCommentStart 597 case r == ',': 598 lx.ignore() 599 return lexArrayValue // move on to the next value 600 case r == ']': 601 return lexArrayEnd 602 default: 603 return lx.errorf("expected a comma (',') or array terminator (']'), but got %s", runeOrEOF(r)) 604 } 605 } 606 607 // lexArrayEnd finishes the lexing of an array. 608 // It assumes that a ']' has just been consumed. 609 func lexArrayEnd(lx *lexer) stateFn { 610 lx.ignore() 611 lx.emit(itemArrayEnd) 612 return lx.pop() 613 } 614 615 // lexInlineTableValue consumes one key/value pair in an inline table. 616 // It assumes that '{' or ',' have already been consumed. Whitespace is ignored. 617 func lexInlineTableValue(lx *lexer) stateFn { 618 r := lx.next() 619 switch { 620 case isWhitespace(r): 621 return lexSkip(lx, lexInlineTableValue) 622 case isNL(r): 623 if lx.tomlNext { 624 return lexSkip(lx, lexInlineTableValue) 625 } 626 return lx.errorPrevLine(errLexInlineTableNL{}) 627 case r == '#': 628 lx.push(lexInlineTableValue) 629 return lexCommentStart 630 case r == ',': 631 return lx.errorf("unexpected comma") 632 case r == '}': 633 return lexInlineTableEnd 634 } 635 lx.backup() 636 lx.push(lexInlineTableValueEnd) 637 return lexKeyStart 638 } 639 640 // lexInlineTableValueEnd consumes everything between the end of an inline table 641 // key/value pair and the next pair (or the end of the table): 642 // it ignores whitespace and expects either a ',' or a '}'. 643 func lexInlineTableValueEnd(lx *lexer) stateFn { 644 switch r := lx.next(); { 645 case isWhitespace(r): 646 return lexSkip(lx, lexInlineTableValueEnd) 647 case isNL(r): 648 if lx.tomlNext { 649 return lexSkip(lx, lexInlineTableValueEnd) 650 } 651 return lx.errorPrevLine(errLexInlineTableNL{}) 652 case r == '#': 653 lx.push(lexInlineTableValueEnd) 654 return lexCommentStart 655 case r == ',': 656 lx.ignore() 657 lx.skip(isWhitespace) 658 if lx.peek() == '}' { 659 if lx.tomlNext { 660 return lexInlineTableValueEnd 661 } 662 return lx.errorf("trailing comma not allowed in inline tables") 663 } 664 return lexInlineTableValue 665 case r == '}': 666 return lexInlineTableEnd 667 default: 668 return lx.errorf("expected a comma or an inline table terminator '}', but got %s instead", runeOrEOF(r)) 669 } 670 } 671 672 func runeOrEOF(r rune) string { 673 if r == eof { 674 return "end of file" 675 } 676 return "'" + string(r) + "'" 677 } 678 679 // lexInlineTableEnd finishes the lexing of an inline table. 680 // It assumes that a '}' has just been consumed. 681 func lexInlineTableEnd(lx *lexer) stateFn { 682 lx.ignore() 683 lx.emit(itemInlineTableEnd) 684 return lx.pop() 685 } 686 687 // lexString consumes the inner contents of a string. It assumes that the 688 // beginning '"' has already been consumed and ignored. 689 func lexString(lx *lexer) stateFn { 690 r := lx.next() 691 switch { 692 case r == eof: 693 return lx.errorf(`unexpected EOF; expected '"'`) 694 case isNL(r): 695 return lx.errorPrevLine(errLexStringNL{}) 696 case r == '\\': 697 lx.push(lexString) 698 return lexStringEscape 699 case r == '"': 700 lx.backup() 701 if lx.esc { 702 lx.esc = false 703 lx.emit(itemStringEsc) 704 } else { 705 lx.emit(itemString) 706 } 707 lx.next() 708 lx.ignore() 709 return lx.pop() 710 } 711 return lexString 712 } 713 714 // lexMultilineString consumes the inner contents of a string. It assumes that 715 // the beginning '"""' has already been consumed and ignored. 716 func lexMultilineString(lx *lexer) stateFn { 717 r := lx.next() 718 switch r { 719 default: 720 return lexMultilineString 721 case eof: 722 return lx.errorf(`unexpected EOF; expected '"""'`) 723 case '\\': 724 return lexMultilineStringEscape 725 case '"': 726 /// Found " → try to read two more "". 727 if lx.accept('"') { 728 if lx.accept('"') { 729 /// Peek ahead: the string can contain " and "", including at the 730 /// end: """str""""" 731 /// 6 or more at the end, however, is an error. 732 if lx.peek() == '"' { 733 /// Check if we already lexed 5 's; if so we have 6 now, and 734 /// that's just too many man! 735 /// 736 /// Second check is for the edge case: 737 /// 738 /// two quotes allowed. 739 /// vv 740 /// """lol \"""""" 741 /// ^^ ^^^---- closing three 742 /// escaped 743 /// 744 /// But ugly, but it works 745 if strings.HasSuffix(lx.current(), `"""""`) && !strings.HasSuffix(lx.current(), `\"""""`) { 746 return lx.errorf(`unexpected '""""""'`) 747 } 748 lx.backup() 749 lx.backup() 750 return lexMultilineString 751 } 752 753 lx.backup() /// backup: don't include the """ in the item. 754 lx.backup() 755 lx.backup() 756 lx.esc = false 757 lx.emit(itemMultilineString) 758 lx.next() /// Read over ''' again and discard it. 759 lx.next() 760 lx.next() 761 lx.ignore() 762 return lx.pop() 763 } 764 lx.backup() 765 } 766 return lexMultilineString 767 } 768 } 769 770 // lexRawString consumes a raw string. Nothing can be escaped in such a string. 771 // It assumes that the beginning "'" has already been consumed and ignored. 772 func lexRawString(lx *lexer) stateFn { 773 r := lx.next() 774 switch { 775 default: 776 return lexRawString 777 case r == eof: 778 return lx.errorf(`unexpected EOF; expected "'"`) 779 case isNL(r): 780 return lx.errorPrevLine(errLexStringNL{}) 781 case r == '\'': 782 lx.backup() 783 lx.emit(itemRawString) 784 lx.next() 785 lx.ignore() 786 return lx.pop() 787 } 788 } 789 790 // lexMultilineRawString consumes a raw string. Nothing can be escaped in such a 791 // string. It assumes that the beginning triple-' has already been consumed and 792 // ignored. 793 func lexMultilineRawString(lx *lexer) stateFn { 794 r := lx.next() 795 switch r { 796 default: 797 return lexMultilineRawString 798 case eof: 799 return lx.errorf(`unexpected EOF; expected "'''"`) 800 case '\'': 801 /// Found ' → try to read two more ''. 802 if lx.accept('\'') { 803 if lx.accept('\'') { 804 /// Peek ahead: the string can contain ' and '', including at the 805 /// end: '''str''''' 806 /// 6 or more at the end, however, is an error. 807 if lx.peek() == '\'' { 808 /// Check if we already lexed 5 's; if so we have 6 now, and 809 /// that's just too many man! 810 if strings.HasSuffix(lx.current(), "'''''") { 811 return lx.errorf(`unexpected "''''''"`) 812 } 813 lx.backup() 814 lx.backup() 815 return lexMultilineRawString 816 } 817 818 lx.backup() /// backup: don't include the ''' in the item. 819 lx.backup() 820 lx.backup() 821 lx.emit(itemRawMultilineString) 822 lx.next() /// Read over ''' again and discard it. 823 lx.next() 824 lx.next() 825 lx.ignore() 826 return lx.pop() 827 } 828 lx.backup() 829 } 830 return lexMultilineRawString 831 } 832 } 833 834 // lexMultilineStringEscape consumes an escaped character. It assumes that the 835 // preceding '\\' has already been consumed. 836 func lexMultilineStringEscape(lx *lexer) stateFn { 837 if isNL(lx.next()) { /// \ escaping newline. 838 return lexMultilineString 839 } 840 lx.backup() 841 lx.push(lexMultilineString) 842 return lexStringEscape(lx) 843 } 844 845 func lexStringEscape(lx *lexer) stateFn { 846 lx.esc = true 847 r := lx.next() 848 switch r { 849 case 'e': 850 if !lx.tomlNext { 851 return lx.error(errLexEscape{r}) 852 } 853 fallthrough 854 case 'b': 855 fallthrough 856 case 't': 857 fallthrough 858 case 'n': 859 fallthrough 860 case 'f': 861 fallthrough 862 case 'r': 863 fallthrough 864 case '"': 865 fallthrough 866 case ' ', '\t': 867 // Inside """ .. """ strings you can use \ to escape newlines, and any 868 // amount of whitespace can be between the \ and \n. 869 fallthrough 870 case '\\': 871 return lx.pop() 872 case 'x': 873 if !lx.tomlNext { 874 return lx.error(errLexEscape{r}) 875 } 876 return lexHexEscape 877 case 'u': 878 return lexShortUnicodeEscape 879 case 'U': 880 return lexLongUnicodeEscape 881 } 882 return lx.error(errLexEscape{r}) 883 } 884 885 func lexHexEscape(lx *lexer) stateFn { 886 var r rune 887 for i := 0; i < 2; i++ { 888 r = lx.next() 889 if !isHex(r) { 890 return lx.errorf(`expected two hexadecimal digits after '\x', but got %q instead`, lx.current()) 891 } 892 } 893 return lx.pop() 894 } 895 896 func lexShortUnicodeEscape(lx *lexer) stateFn { 897 var r rune 898 for i := 0; i < 4; i++ { 899 r = lx.next() 900 if !isHex(r) { 901 return lx.errorf(`expected four hexadecimal digits after '\u', but got %q instead`, lx.current()) 902 } 903 } 904 return lx.pop() 905 } 906 907 func lexLongUnicodeEscape(lx *lexer) stateFn { 908 var r rune 909 for i := 0; i < 8; i++ { 910 r = lx.next() 911 if !isHex(r) { 912 return lx.errorf(`expected eight hexadecimal digits after '\U', but got %q instead`, lx.current()) 913 } 914 } 915 return lx.pop() 916 } 917 918 // lexNumberOrDateStart processes the first character of a value which begins 919 // with a digit. It exists to catch values starting with '0', so that 920 // lexBaseNumberOrDate can differentiate base prefixed integers from other 921 // types. 922 func lexNumberOrDateStart(lx *lexer) stateFn { 923 r := lx.next() 924 switch r { 925 case '0': 926 return lexBaseNumberOrDate 927 } 928 929 if !isDigit(r) { 930 // The only way to reach this state is if the value starts 931 // with a digit, so specifically treat anything else as an 932 // error. 933 return lx.errorf("expected a digit but got %q", r) 934 } 935 936 return lexNumberOrDate 937 } 938 939 // lexNumberOrDate consumes either an integer, float or datetime. 940 func lexNumberOrDate(lx *lexer) stateFn { 941 r := lx.next() 942 if isDigit(r) { 943 return lexNumberOrDate 944 } 945 switch r { 946 case '-', ':': 947 return lexDatetime 948 case '_': 949 return lexDecimalNumber 950 case '.', 'e', 'E': 951 return lexFloat 952 } 953 954 lx.backup() 955 lx.emit(itemInteger) 956 return lx.pop() 957 } 958 959 // lexDatetime consumes a Datetime, to a first approximation. 960 // The parser validates that it matches one of the accepted formats. 961 func lexDatetime(lx *lexer) stateFn { 962 r := lx.next() 963 if isDigit(r) { 964 return lexDatetime 965 } 966 switch r { 967 case '-', ':', 'T', 't', ' ', '.', 'Z', 'z', '+': 968 return lexDatetime 969 } 970 971 lx.backup() 972 lx.emitTrim(itemDatetime) 973 return lx.pop() 974 } 975 976 // lexHexInteger consumes a hexadecimal integer after seeing the '0x' prefix. 977 func lexHexInteger(lx *lexer) stateFn { 978 r := lx.next() 979 if isHex(r) { 980 return lexHexInteger 981 } 982 switch r { 983 case '_': 984 return lexHexInteger 985 } 986 987 lx.backup() 988 lx.emit(itemInteger) 989 return lx.pop() 990 } 991 992 // lexOctalInteger consumes an octal integer after seeing the '0o' prefix. 993 func lexOctalInteger(lx *lexer) stateFn { 994 r := lx.next() 995 if isOctal(r) { 996 return lexOctalInteger 997 } 998 switch r { 999 case '_': 1000 return lexOctalInteger 1001 } 1002 1003 lx.backup() 1004 lx.emit(itemInteger) 1005 return lx.pop() 1006 } 1007 1008 // lexBinaryInteger consumes a binary integer after seeing the '0b' prefix. 1009 func lexBinaryInteger(lx *lexer) stateFn { 1010 r := lx.next() 1011 if isBinary(r) { 1012 return lexBinaryInteger 1013 } 1014 switch r { 1015 case '_': 1016 return lexBinaryInteger 1017 } 1018 1019 lx.backup() 1020 lx.emit(itemInteger) 1021 return lx.pop() 1022 } 1023 1024 // lexDecimalNumber consumes a decimal float or integer. 1025 func lexDecimalNumber(lx *lexer) stateFn { 1026 r := lx.next() 1027 if isDigit(r) { 1028 return lexDecimalNumber 1029 } 1030 switch r { 1031 case '.', 'e', 'E': 1032 return lexFloat 1033 case '_': 1034 return lexDecimalNumber 1035 } 1036 1037 lx.backup() 1038 lx.emit(itemInteger) 1039 return lx.pop() 1040 } 1041 1042 // lexDecimalNumber consumes the first digit of a number beginning with a sign. 1043 // It assumes the sign has already been consumed. Values which start with a sign 1044 // are only allowed to be decimal integers or floats. 1045 // 1046 // The special "nan" and "inf" values are also recognized. 1047 func lexDecimalNumberStart(lx *lexer) stateFn { 1048 r := lx.next() 1049 1050 // Special error cases to give users better error messages 1051 switch r { 1052 case 'i': 1053 if !lx.accept('n') || !lx.accept('f') { 1054 return lx.errorf("invalid float: '%s'", lx.current()) 1055 } 1056 lx.emit(itemFloat) 1057 return lx.pop() 1058 case 'n': 1059 if !lx.accept('a') || !lx.accept('n') { 1060 return lx.errorf("invalid float: '%s'", lx.current()) 1061 } 1062 lx.emit(itemFloat) 1063 return lx.pop() 1064 case '0': 1065 p := lx.peek() 1066 switch p { 1067 case 'b', 'o', 'x': 1068 return lx.errorf("cannot use sign with non-decimal numbers: '%s%c'", lx.current(), p) 1069 } 1070 case '.': 1071 return lx.errorf("floats must start with a digit, not '.'") 1072 } 1073 1074 if isDigit(r) { 1075 return lexDecimalNumber 1076 } 1077 1078 return lx.errorf("expected a digit but got %q", r) 1079 } 1080 1081 // lexBaseNumberOrDate differentiates between the possible values which 1082 // start with '0'. It assumes that before reaching this state, the initial '0' 1083 // has been consumed. 1084 func lexBaseNumberOrDate(lx *lexer) stateFn { 1085 r := lx.next() 1086 // Note: All datetimes start with at least two digits, so we don't 1087 // handle date characters (':', '-', etc.) here. 1088 if isDigit(r) { 1089 return lexNumberOrDate 1090 } 1091 switch r { 1092 case '_': 1093 // Can only be decimal, because there can't be an underscore 1094 // between the '0' and the base designator, and dates can't 1095 // contain underscores. 1096 return lexDecimalNumber 1097 case '.', 'e', 'E': 1098 return lexFloat 1099 case 'b': 1100 r = lx.peek() 1101 if !isBinary(r) { 1102 lx.errorf("not a binary number: '%s%c'", lx.current(), r) 1103 } 1104 return lexBinaryInteger 1105 case 'o': 1106 r = lx.peek() 1107 if !isOctal(r) { 1108 lx.errorf("not an octal number: '%s%c'", lx.current(), r) 1109 } 1110 return lexOctalInteger 1111 case 'x': 1112 r = lx.peek() 1113 if !isHex(r) { 1114 lx.errorf("not a hexidecimal number: '%s%c'", lx.current(), r) 1115 } 1116 return lexHexInteger 1117 } 1118 1119 lx.backup() 1120 lx.emit(itemInteger) 1121 return lx.pop() 1122 } 1123 1124 // lexFloat consumes the elements of a float. It allows any sequence of 1125 // float-like characters, so floats emitted by the lexer are only a first 1126 // approximation and must be validated by the parser. 1127 func lexFloat(lx *lexer) stateFn { 1128 r := lx.next() 1129 if isDigit(r) { 1130 return lexFloat 1131 } 1132 switch r { 1133 case '_', '.', '-', '+', 'e', 'E': 1134 return lexFloat 1135 } 1136 1137 lx.backup() 1138 lx.emit(itemFloat) 1139 return lx.pop() 1140 } 1141 1142 // lexBool consumes a bool string: 'true' or 'false. 1143 func lexBool(lx *lexer) stateFn { 1144 var rs []rune 1145 for { 1146 r := lx.next() 1147 if !unicode.IsLetter(r) { 1148 lx.backup() 1149 break 1150 } 1151 rs = append(rs, r) 1152 } 1153 s := string(rs) 1154 switch s { 1155 case "true", "false": 1156 lx.emit(itemBool) 1157 return lx.pop() 1158 } 1159 return lx.errorf("expected value but found %q instead", s) 1160 } 1161 1162 // lexCommentStart begins the lexing of a comment. It will emit 1163 // itemCommentStart and consume no characters, passing control to lexComment. 1164 func lexCommentStart(lx *lexer) stateFn { 1165 lx.ignore() 1166 lx.emit(itemCommentStart) 1167 return lexComment 1168 } 1169 1170 // lexComment lexes an entire comment. It assumes that '#' has been consumed. 1171 // It will consume *up to* the first newline character, and pass control 1172 // back to the last state on the stack. 1173 func lexComment(lx *lexer) stateFn { 1174 switch r := lx.next(); { 1175 case isNL(r) || r == eof: 1176 lx.backup() 1177 lx.emit(itemText) 1178 return lx.pop() 1179 default: 1180 return lexComment 1181 } 1182 } 1183 1184 // lexSkip ignores all slurped input and moves on to the next state. 1185 func lexSkip(lx *lexer, nextState stateFn) stateFn { 1186 lx.ignore() 1187 return nextState 1188 } 1189 1190 func (s stateFn) String() string { 1191 name := runtime.FuncForPC(reflect.ValueOf(s).Pointer()).Name() 1192 if i := strings.LastIndexByte(name, '.'); i > -1 { 1193 name = name[i+1:] 1194 } 1195 if s == nil { 1196 name = "<nil>" 1197 } 1198 return name + "()" 1199 } 1200 1201 func (itype itemType) String() string { 1202 switch itype { 1203 case itemError: 1204 return "Error" 1205 case itemNIL: 1206 return "NIL" 1207 case itemEOF: 1208 return "EOF" 1209 case itemText: 1210 return "Text" 1211 case itemString, itemStringEsc, itemRawString, itemMultilineString, itemRawMultilineString: 1212 return "String" 1213 case itemBool: 1214 return "Bool" 1215 case itemInteger: 1216 return "Integer" 1217 case itemFloat: 1218 return "Float" 1219 case itemDatetime: 1220 return "DateTime" 1221 case itemTableStart: 1222 return "TableStart" 1223 case itemTableEnd: 1224 return "TableEnd" 1225 case itemKeyStart: 1226 return "KeyStart" 1227 case itemKeyEnd: 1228 return "KeyEnd" 1229 case itemArray: 1230 return "Array" 1231 case itemArrayEnd: 1232 return "ArrayEnd" 1233 case itemCommentStart: 1234 return "CommentStart" 1235 case itemInlineTableStart: 1236 return "InlineTableStart" 1237 case itemInlineTableEnd: 1238 return "InlineTableEnd" 1239 } 1240 panic(fmt.Sprintf("BUG: Unknown type '%d'.", int(itype))) 1241 } 1242 1243 func (item item) String() string { 1244 return fmt.Sprintf("(%s, %s)", item.typ, item.val) 1245 } 1246 1247 func isWhitespace(r rune) bool { return r == '\t' || r == ' ' } 1248 func isNL(r rune) bool { return r == '\n' || r == '\r' } 1249 func isControl(r rune) bool { // Control characters except \t, \r, \n 1250 switch r { 1251 case '\t', '\r', '\n': 1252 return false 1253 default: 1254 return (r >= 0x00 && r <= 0x1f) || r == 0x7f 1255 } 1256 } 1257 func isDigit(r rune) bool { return r >= '0' && r <= '9' } 1258 func isBinary(r rune) bool { return r == '0' || r == '1' } 1259 func isOctal(r rune) bool { return r >= '0' && r <= '7' } 1260 func isHex(r rune) bool { return (r >= '0' && r <= '9') || (r|0x20 >= 'a' && r|0x20 <= 'f') } 1261 func isBareKeyChar(r rune, tomlNext bool) bool { 1262 if tomlNext { 1263 return (r >= 'A' && r <= 'Z') || 1264 (r >= 'a' && r <= 'z') || 1265 (r >= '0' && r <= '9') || 1266 r == '_' || r == '-' || 1267 r == 0xb2 || r == 0xb3 || r == 0xb9 || (r >= 0xbc && r <= 0xbe) || 1268 (r >= 0xc0 && r <= 0xd6) || (r >= 0xd8 && r <= 0xf6) || (r >= 0xf8 && r <= 0x037d) || 1269 (r >= 0x037f && r <= 0x1fff) || 1270 (r >= 0x200c && r <= 0x200d) || (r >= 0x203f && r <= 0x2040) || 1271 (r >= 0x2070 && r <= 0x218f) || (r >= 0x2460 && r <= 0x24ff) || 1272 (r >= 0x2c00 && r <= 0x2fef) || (r >= 0x3001 && r <= 0xd7ff) || 1273 (r >= 0xf900 && r <= 0xfdcf) || (r >= 0xfdf0 && r <= 0xfffd) || 1274 (r >= 0x10000 && r <= 0xeffff) 1275 } 1276 1277 return (r >= 'A' && r <= 'Z') || 1278 (r >= 'a' && r <= 'z') || 1279 (r >= '0' && r <= '9') || 1280 r == '_' || r == '-' 1281 }