1// Copyright 2017 The Bazel Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package syntax
6
7// A lexical scanner for Starlark.
8
9import (
10	"fmt"
11	"io"
12	"io/ioutil"
13	"log"
14	"math/big"
15	"os"
16	"strconv"
17	"strings"
18	"unicode"
19	"unicode/utf8"
20)
21
22// A Token represents a Starlark lexical token.
23type Token int8
24
25const (
26	ILLEGAL Token = iota
27	EOF
28
29	NEWLINE
30	INDENT
31	OUTDENT
32
33	// Tokens with values
34	IDENT  // x
35	INT    // 123
36	FLOAT  // 1.23e45
37	STRING // "foo" or 'foo' or '''foo''' or r'foo' or r"foo"
38	BYTES  // b"foo", etc
39
40	// Punctuation
41	PLUS          // +
42	MINUS         // -
43	STAR          // *
44	SLASH         // /
45	SLASHSLASH    // //
46	PERCENT       // %
47	AMP           // &
48	PIPE          // |
49	CIRCUMFLEX    // ^
50	LTLT          // <<
51	GTGT          // >>
52	TILDE         // ~
53	DOT           // .
54	COMMA         // ,
55	EQ            // =
56	SEMI          // ;
57	COLON         // :
58	LPAREN        // (
59	RPAREN        // )
60	LBRACK        // [
61	RBRACK        // ]
62	LBRACE        // {
63	RBRACE        // }
64	LT            // <
65	GT            // >
66	GE            // >=
67	LE            // <=
68	EQL           // ==
69	NEQ           // !=
70	PLUS_EQ       // +=    (keep order consistent with PLUS..GTGT)
71	MINUS_EQ      // -=
72	STAR_EQ       // *=
73	SLASH_EQ      // /=
74	SLASHSLASH_EQ // //=
75	PERCENT_EQ    // %=
76	AMP_EQ        // &=
77	PIPE_EQ       // |=
78	CIRCUMFLEX_EQ // ^=
79	LTLT_EQ       // <<=
80	GTGT_EQ       // >>=
81	STARSTAR      // **
82
83	// Keywords
84	AND
85	BREAK
86	CONTINUE
87	DEF
88	ELIF
89	ELSE
90	FOR
91	IF
92	IN
93	LAMBDA
94	LOAD
95	NOT
96	NOT_IN // synthesized by parser from NOT IN
97	OR
98	PASS
99	RETURN
100	WHILE
101
102	maxToken
103)
104
105func (tok Token) String() string { return tokenNames[tok] }
106
107// GoString is like String but quotes punctuation tokens.
108// Use Sprintf("%#v", tok) when constructing error messages.
109func (tok Token) GoString() string {
110	if tok >= PLUS && tok <= STARSTAR {
111		return "'" + tokenNames[tok] + "'"
112	}
113	return tokenNames[tok]
114}
115
116var tokenNames = [...]string{
117	ILLEGAL:       "illegal token",
118	EOF:           "end of file",
119	NEWLINE:       "newline",
120	INDENT:        "indent",
121	OUTDENT:       "outdent",
122	IDENT:         "identifier",
123	INT:           "int literal",
124	FLOAT:         "float literal",
125	STRING:        "string literal",
126	PLUS:          "+",
127	MINUS:         "-",
128	STAR:          "*",
129	SLASH:         "/",
130	SLASHSLASH:    "//",
131	PERCENT:       "%",
132	AMP:           "&",
133	PIPE:          "|",
134	CIRCUMFLEX:    "^",
135	LTLT:          "<<",
136	GTGT:          ">>",
137	TILDE:         "~",
138	DOT:           ".",
139	COMMA:         ",",
140	EQ:            "=",
141	SEMI:          ";",
142	COLON:         ":",
143	LPAREN:        "(",
144	RPAREN:        ")",
145	LBRACK:        "[",
146	RBRACK:        "]",
147	LBRACE:        "{",
148	RBRACE:        "}",
149	LT:            "<",
150	GT:            ">",
151	GE:            ">=",
152	LE:            "<=",
153	EQL:           "==",
154	NEQ:           "!=",
155	PLUS_EQ:       "+=",
156	MINUS_EQ:      "-=",
157	STAR_EQ:       "*=",
158	SLASH_EQ:      "/=",
159	SLASHSLASH_EQ: "//=",
160	PERCENT_EQ:    "%=",
161	AMP_EQ:        "&=",
162	PIPE_EQ:       "|=",
163	CIRCUMFLEX_EQ: "^=",
164	LTLT_EQ:       "<<=",
165	GTGT_EQ:       ">>=",
166	STARSTAR:      "**",
167	AND:           "and",
168	BREAK:         "break",
169	CONTINUE:      "continue",
170	DEF:           "def",
171	ELIF:          "elif",
172	ELSE:          "else",
173	FOR:           "for",
174	IF:            "if",
175	IN:            "in",
176	LAMBDA:        "lambda",
177	LOAD:          "load",
178	NOT:           "not",
179	NOT_IN:        "not in",
180	OR:            "or",
181	PASS:          "pass",
182	RETURN:        "return",
183	WHILE:         "while",
184}
185
186// A FilePortion describes the content of a portion of a file.
187// Callers may provide a FilePortion for the src argument of Parse
188// when the desired initial line and column numbers are not (1, 1),
189// such as when an expression is parsed from within larger file.
190type FilePortion struct {
191	Content             []byte
192	FirstLine, FirstCol int32
193}
194
195// A Position describes the location of a rune of input.
196type Position struct {
197	file *string // filename (indirect for compactness)
198	Line int32   // 1-based line number; 0 if line unknown
199	Col  int32   // 1-based column (rune) number; 0 if column unknown
200}
201
202// IsValid reports whether the position is valid.
203func (p Position) IsValid() bool { return p.file != nil }
204
205// Filename returns the name of the file containing this position.
206func (p Position) Filename() string {
207	if p.file != nil {
208		return *p.file
209	}
210	return "<invalid>"
211}
212
213// MakePosition returns position with the specified components.
214func MakePosition(file *string, line, col int32) Position { return Position{file, line, col} }
215
216// add returns the position at the end of s, assuming it starts at p.
217func (p Position) add(s string) Position {
218	if n := strings.Count(s, "\n"); n > 0 {
219		p.Line += int32(n)
220		s = s[strings.LastIndex(s, "\n")+1:]
221		p.Col = 1
222	}
223	p.Col += int32(utf8.RuneCountInString(s))
224	return p
225}
226
227func (p Position) String() string {
228	file := p.Filename()
229	if p.Line > 0 {
230		if p.Col > 0 {
231			return fmt.Sprintf("%s:%d:%d", file, p.Line, p.Col)
232		}
233		return fmt.Sprintf("%s:%d", file, p.Line)
234	}
235	return file
236}
237
238func (p Position) isBefore(q Position) bool {
239	if p.Line != q.Line {
240		return p.Line < q.Line
241	}
242	return p.Col < q.Col
243}
244
245// An scanner represents a single input file being parsed.
246type scanner struct {
247	rest           []byte    // rest of input (in REPL, a line of input)
248	token          []byte    // token being scanned
249	pos            Position  // current input position
250	depth          int       // nesting of [ ] { } ( )
251	indentstk      []int     // stack of indentation levels
252	dents          int       // number of saved INDENT (>0) or OUTDENT (<0) tokens to return
253	lineStart      bool      // after NEWLINE; convert spaces to indentation tokens
254	keepComments   bool      // accumulate comments in slice
255	lineComments   []Comment // list of full line comments (if keepComments)
256	suffixComments []Comment // list of suffix comments (if keepComments)
257
258	readline func() ([]byte, error) // read next line of input (REPL only)
259}
260
261func newScanner(filename string, src interface{}, keepComments bool) (*scanner, error) {
262	var firstLine, firstCol int32 = 1, 1
263	if portion, ok := src.(FilePortion); ok {
264		firstLine, firstCol = portion.FirstLine, portion.FirstCol
265	}
266	sc := &scanner{
267		pos:          MakePosition(&filename, firstLine, firstCol),
268		indentstk:    make([]int, 1, 10), // []int{0} + spare capacity
269		lineStart:    true,
270		keepComments: keepComments,
271	}
272	sc.readline, _ = src.(func() ([]byte, error)) // ParseCompoundStmt (REPL) only
273	if sc.readline == nil {
274		data, err := readSource(filename, src)
275		if err != nil {
276			return nil, err
277		}
278		sc.rest = data
279	}
280	return sc, nil
281}
282
283func readSource(filename string, src interface{}) ([]byte, error) {
284	switch src := src.(type) {
285	case string:
286		return []byte(src), nil
287	case []byte:
288		return src, nil
289	case io.Reader:
290		data, err := ioutil.ReadAll(src)
291		if err != nil {
292			err = &os.PathError{Op: "read", Path: filename, Err: err}
293			return nil, err
294		}
295		return data, nil
296	case FilePortion:
297		return src.Content, nil
298	case nil:
299		return ioutil.ReadFile(filename)
300	default:
301		return nil, fmt.Errorf("invalid source: %T", src)
302	}
303}
304
305// An Error describes the nature and position of a scanner or parser error.
306type Error struct {
307	Pos Position
308	Msg string
309}
310
311func (e Error) Error() string { return e.Pos.String() + ": " + e.Msg }
312
313// errorf is called to report an error.
314// errorf does not return: it panics.
315func (sc *scanner) error(pos Position, s string) {
316	panic(Error{pos, s})
317}
318
319func (sc *scanner) errorf(pos Position, format string, args ...interface{}) {
320	sc.error(pos, fmt.Sprintf(format, args...))
321}
322
323func (sc *scanner) recover(err *error) {
324	// The scanner and parser panic both for routine errors like
325	// syntax errors and for programmer bugs like array index
326	// errors.  Turn both into error returns.  Catching bug panics
327	// is especially important when processing many files.
328	switch e := recover().(type) {
329	case nil:
330		// no panic
331	case Error:
332		*err = e
333	default:
334		*err = Error{sc.pos, fmt.Sprintf("internal error: %v", e)}
335		if debug {
336			log.Fatal(*err)
337		}
338	}
339}
340
341// eof reports whether the input has reached end of file.
342func (sc *scanner) eof() bool {
343	return len(sc.rest) == 0 && !sc.readLine()
344}
345
346// readLine attempts to read another line of input.
347// Precondition: len(sc.rest)==0.
348func (sc *scanner) readLine() bool {
349	if sc.readline != nil {
350		var err error
351		sc.rest, err = sc.readline()
352		if err != nil {
353			sc.errorf(sc.pos, "%v", err) // EOF or ErrInterrupt
354		}
355		return len(sc.rest) > 0
356	}
357	return false
358}
359
360// peekRune returns the next rune in the input without consuming it.
361// Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
362func (sc *scanner) peekRune() rune {
363	// TODO(adonovan): opt: measure and perhaps inline eof.
364	if sc.eof() {
365		return 0
366	}
367
368	// fast path: ASCII
369	if b := sc.rest[0]; b < utf8.RuneSelf {
370		if b == '\r' {
371			return '\n'
372		}
373		return rune(b)
374	}
375
376	r, _ := utf8.DecodeRune(sc.rest)
377	return r
378}
379
380// readRune consumes and returns the next rune in the input.
381// Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
382func (sc *scanner) readRune() rune {
383	// eof() has been inlined here, both to avoid a call
384	// and to establish len(rest)>0 to avoid a bounds check.
385	if len(sc.rest) == 0 {
386		if !sc.readLine() {
387			sc.error(sc.pos, "internal scanner error: readRune at EOF")
388		}
389		// Redundant, but eliminates the bounds-check below.
390		if len(sc.rest) == 0 {
391			return 0
392		}
393	}
394
395	// fast path: ASCII
396	if b := sc.rest[0]; b < utf8.RuneSelf {
397		r := rune(b)
398		sc.rest = sc.rest[1:]
399		if r == '\r' {
400			if len(sc.rest) > 0 && sc.rest[0] == '\n' {
401				sc.rest = sc.rest[1:]
402			}
403			r = '\n'
404		}
405		if r == '\n' {
406			sc.pos.Line++
407			sc.pos.Col = 1
408		} else {
409			sc.pos.Col++
410		}
411		return r
412	}
413
414	r, size := utf8.DecodeRune(sc.rest)
415	sc.rest = sc.rest[size:]
416	sc.pos.Col++
417	return r
418}
419
420// tokenValue records the position and value associated with each token.
421type tokenValue struct {
422	raw    string   // raw text of token
423	int    int64    // decoded int
424	bigInt *big.Int // decoded integers > int64
425	float  float64  // decoded float
426	string string   // decoded string or bytes
427	pos    Position // start position of token
428}
429
430// startToken marks the beginning of the next input token.
431// It must be followed by a call to endToken once the token has
432// been consumed using readRune.
433func (sc *scanner) startToken(val *tokenValue) {
434	sc.token = sc.rest
435	val.raw = ""
436	val.pos = sc.pos
437}
438
439// endToken marks the end of an input token.
440// It records the actual token string in val.raw if the caller
441// has not done that already.
442func (sc *scanner) endToken(val *tokenValue) {
443	if val.raw == "" {
444		val.raw = string(sc.token[:len(sc.token)-len(sc.rest)])
445	}
446}
447
448// nextToken is called by the parser to obtain the next input token.
449// It returns the token value and sets val to the data associated with
450// the token.
451//
452// For all our input tokens, the associated data is val.pos (the
453// position where the token begins), val.raw (the input string
454// corresponding to the token).  For string and int tokens, the string
455// and int fields additionally contain the token's interpreted value.
456func (sc *scanner) nextToken(val *tokenValue) Token {
457
458	// The following distribution of tokens guides case ordering:
459	//
460	//      COMMA          27   %
461	//      STRING         23   %
462	//      IDENT          15   %
463	//      EQL            11   %
464	//      LBRACK          5.5 %
465	//      RBRACK          5.5 %
466	//      NEWLINE         3   %
467	//      LPAREN          2.9 %
468	//      RPAREN          2.9 %
469	//      INT             2   %
470	//      others        < 1   %
471	//
472	// Although NEWLINE tokens are infrequent, and lineStart is
473	// usually (~97%) false on entry, skipped newlines account for
474	// about 50% of all iterations of the 'start' loop.
475
476start:
477	var c rune
478
479	// Deal with leading spaces and indentation.
480	blank := false
481	savedLineStart := sc.lineStart
482	if sc.lineStart {
483		sc.lineStart = false
484		col := 0
485		for {
486			c = sc.peekRune()
487			if c == ' ' {
488				col++
489				sc.readRune()
490			} else if c == '\t' {
491				const tab = 8
492				col += int(tab - (sc.pos.Col-1)%tab)
493				sc.readRune()
494			} else {
495				break
496			}
497		}
498
499		// The third clause matches EOF.
500		if c == '#' || c == '\n' || c == 0 {
501			blank = true
502		}
503
504		// Compute indentation level for non-blank lines not
505		// inside an expression.  This is not the common case.
506		if !blank && sc.depth == 0 {
507			cur := sc.indentstk[len(sc.indentstk)-1]
508			if col > cur {
509				// indent
510				sc.dents++
511				sc.indentstk = append(sc.indentstk, col)
512			} else if col < cur {
513				// outdent(s)
514				for len(sc.indentstk) > 0 && col < sc.indentstk[len(sc.indentstk)-1] {
515					sc.dents--
516					sc.indentstk = sc.indentstk[:len(sc.indentstk)-1] // pop
517				}
518				if col != sc.indentstk[len(sc.indentstk)-1] {
519					sc.error(sc.pos, "unindent does not match any outer indentation level")
520				}
521			}
522		}
523	}
524
525	// Return saved indentation tokens.
526	if sc.dents != 0 {
527		sc.startToken(val)
528		sc.endToken(val)
529		if sc.dents < 0 {
530			sc.dents++
531			return OUTDENT
532		} else {
533			sc.dents--
534			return INDENT
535		}
536	}
537
538	// start of line proper
539	c = sc.peekRune()
540
541	// Skip spaces.
542	for c == ' ' || c == '\t' {
543		sc.readRune()
544		c = sc.peekRune()
545	}
546
547	// comment
548	if c == '#' {
549		if sc.keepComments {
550			sc.startToken(val)
551		}
552		// Consume up to newline (included).
553		for c != 0 && c != '\n' {
554			sc.readRune()
555			c = sc.peekRune()
556		}
557		if sc.keepComments {
558			sc.endToken(val)
559			if blank {
560				sc.lineComments = append(sc.lineComments, Comment{val.pos, val.raw})
561			} else {
562				sc.suffixComments = append(sc.suffixComments, Comment{val.pos, val.raw})
563			}
564		}
565	}
566
567	// newline
568	if c == '\n' {
569		sc.lineStart = true
570
571		// Ignore newlines within expressions (common case).
572		if sc.depth > 0 {
573			sc.readRune()
574			goto start
575		}
576
577		// Ignore blank lines, except in the REPL,
578		// where they emit OUTDENTs and NEWLINE.
579		if blank {
580			if sc.readline == nil {
581				sc.readRune()
582				goto start
583			} else if len(sc.indentstk) > 1 {
584				sc.dents = 1 - len(sc.indentstk)
585				sc.indentstk = sc.indentstk[:1]
586				goto start
587			}
588		}
589
590		// At top-level (not in an expression).
591		sc.startToken(val)
592		sc.readRune()
593		val.raw = "\n"
594		return NEWLINE
595	}
596
597	// end of file
598	if c == 0 {
599		// Emit OUTDENTs for unfinished indentation,
600		// preceded by a NEWLINE if we haven't just emitted one.
601		if len(sc.indentstk) > 1 {
602			if savedLineStart {
603				sc.dents = 1 - len(sc.indentstk)
604				sc.indentstk = sc.indentstk[:1]
605				goto start
606			} else {
607				sc.lineStart = true
608				sc.startToken(val)
609				val.raw = "\n"
610				return NEWLINE
611			}
612		}
613
614		sc.startToken(val)
615		sc.endToken(val)
616		return EOF
617	}
618
619	// line continuation
620	if c == '\\' {
621		sc.readRune()
622		if sc.peekRune() != '\n' {
623			sc.errorf(sc.pos, "stray backslash in program")
624		}
625		sc.readRune()
626		goto start
627	}
628
629	// start of the next token
630	sc.startToken(val)
631
632	// comma (common case)
633	if c == ',' {
634		sc.readRune()
635		sc.endToken(val)
636		return COMMA
637	}
638
639	// string literal
640	if c == '"' || c == '\'' {
641		return sc.scanString(val, c)
642	}
643
644	// identifier or keyword
645	if isIdentStart(c) {
646		if (c == 'r' || c == 'b') && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') {
647			//  r"..."
648			//  b"..."
649			sc.readRune()
650			c = sc.peekRune()
651			return sc.scanString(val, c)
652		} else if c == 'r' && len(sc.rest) > 2 && sc.rest[1] == 'b' && (sc.rest[2] == '"' || sc.rest[2] == '\'') {
653			// rb"..."
654			sc.readRune()
655			sc.readRune()
656			c = sc.peekRune()
657			return sc.scanString(val, c)
658		}
659
660		for isIdent(c) {
661			sc.readRune()
662			c = sc.peekRune()
663		}
664		sc.endToken(val)
665		if k, ok := keywordToken[val.raw]; ok {
666			return k
667		}
668
669		return IDENT
670	}
671
672	// brackets
673	switch c {
674	case '[', '(', '{':
675		sc.depth++
676		sc.readRune()
677		sc.endToken(val)
678		switch c {
679		case '[':
680			return LBRACK
681		case '(':
682			return LPAREN
683		case '{':
684			return LBRACE
685		}
686		panic("unreachable")
687
688	case ']', ')', '}':
689		if sc.depth == 0 {
690			sc.errorf(sc.pos, "unexpected %q", c)
691		} else {
692			sc.depth--
693		}
694		sc.readRune()
695		sc.endToken(val)
696		switch c {
697		case ']':
698			return RBRACK
699		case ')':
700			return RPAREN
701		case '}':
702			return RBRACE
703		}
704		panic("unreachable")
705	}
706
707	// int or float literal, or period
708	if isdigit(c) || c == '.' {
709		return sc.scanNumber(val, c)
710	}
711
712	// other punctuation
713	defer sc.endToken(val)
714	switch c {
715	case '=', '<', '>', '!', '+', '-', '%', '/', '&', '|', '^': // possibly followed by '='
716		start := sc.pos
717		sc.readRune()
718		if sc.peekRune() == '=' {
719			sc.readRune()
720			switch c {
721			case '<':
722				return LE
723			case '>':
724				return GE
725			case '=':
726				return EQL
727			case '!':
728				return NEQ
729			case '+':
730				return PLUS_EQ
731			case '-':
732				return MINUS_EQ
733			case '/':
734				return SLASH_EQ
735			case '%':
736				return PERCENT_EQ
737			case '&':
738				return AMP_EQ
739			case '|':
740				return PIPE_EQ
741			case '^':
742				return CIRCUMFLEX_EQ
743			}
744		}
745		switch c {
746		case '=':
747			return EQ
748		case '<':
749			if sc.peekRune() == '<' {
750				sc.readRune()
751				if sc.peekRune() == '=' {
752					sc.readRune()
753					return LTLT_EQ
754				} else {
755					return LTLT
756				}
757			}
758			return LT
759		case '>':
760			if sc.peekRune() == '>' {
761				sc.readRune()
762				if sc.peekRune() == '=' {
763					sc.readRune()
764					return GTGT_EQ
765				} else {
766					return GTGT
767				}
768			}
769			return GT
770		case '!':
771			sc.error(start, "unexpected input character '!'")
772		case '+':
773			return PLUS
774		case '-':
775			return MINUS
776		case '/':
777			if sc.peekRune() == '/' {
778				sc.readRune()
779				if sc.peekRune() == '=' {
780					sc.readRune()
781					return SLASHSLASH_EQ
782				} else {
783					return SLASHSLASH
784				}
785			}
786			return SLASH
787		case '%':
788			return PERCENT
789		case '&':
790			return AMP
791		case '|':
792			return PIPE
793		case '^':
794			return CIRCUMFLEX
795		}
796		panic("unreachable")
797
798	case ':', ';', '~': // single-char tokens (except comma)
799		sc.readRune()
800		switch c {
801		case ':':
802			return COLON
803		case ';':
804			return SEMI
805		case '~':
806			return TILDE
807		}
808		panic("unreachable")
809
810	case '*': // possibly followed by '*' or '='
811		sc.readRune()
812		switch sc.peekRune() {
813		case '*':
814			sc.readRune()
815			return STARSTAR
816		case '=':
817			sc.readRune()
818			return STAR_EQ
819		}
820		return STAR
821	}
822
823	sc.errorf(sc.pos, "unexpected input character %#q", c)
824	panic("unreachable")
825}
826
827func (sc *scanner) scanString(val *tokenValue, quote rune) Token {
828	start := sc.pos
829	triple := len(sc.rest) >= 3 && sc.rest[0] == byte(quote) && sc.rest[1] == byte(quote) && sc.rest[2] == byte(quote)
830	sc.readRune()
831
832	// String literals may contain escaped or unescaped newlines,
833	// causing them to span multiple lines (gulps) of REPL input;
834	// they are the only such token. Thus we cannot call endToken,
835	// as it assumes sc.rest is unchanged since startToken.
836	// Instead, buffer the token here.
837	// TODO(adonovan): opt: buffer only if we encounter a newline.
838	raw := new(strings.Builder)
839
840	// Copy the prefix, e.g. r' or " (see startToken).
841	raw.Write(sc.token[:len(sc.token)-len(sc.rest)])
842
843	if !triple {
844		// single-quoted string literal
845		for {
846			if sc.eof() {
847				sc.error(val.pos, "unexpected EOF in string")
848			}
849			c := sc.readRune()
850			raw.WriteRune(c)
851			if c == quote {
852				break
853			}
854			if c == '\n' {
855				sc.error(val.pos, "unexpected newline in string")
856			}
857			if c == '\\' {
858				if sc.eof() {
859					sc.error(val.pos, "unexpected EOF in string")
860				}
861				c = sc.readRune()
862				raw.WriteRune(c)
863			}
864		}
865	} else {
866		// triple-quoted string literal
867		sc.readRune()
868		raw.WriteRune(quote)
869		sc.readRune()
870		raw.WriteRune(quote)
871
872		quoteCount := 0
873		for {
874			if sc.eof() {
875				sc.error(val.pos, "unexpected EOF in string")
876			}
877			c := sc.readRune()
878			raw.WriteRune(c)
879			if c == quote {
880				quoteCount++
881				if quoteCount == 3 {
882					break
883				}
884			} else {
885				quoteCount = 0
886			}
887			if c == '\\' {
888				if sc.eof() {
889					sc.error(val.pos, "unexpected EOF in string")
890				}
891				c = sc.readRune()
892				raw.WriteRune(c)
893			}
894		}
895	}
896	val.raw = raw.String()
897
898	s, _, isByte, err := unquote(val.raw)
899	if err != nil {
900		sc.error(start, err.Error())
901	}
902	val.string = s
903	if isByte {
904		return BYTES
905	} else {
906		return STRING
907	}
908}
909
910func (sc *scanner) scanNumber(val *tokenValue, c rune) Token {
911	// https://github.com/google/starlark-go/blob/master/doc/spec.md#lexical-elements
912	//
913	// Python features not supported:
914	// - integer literals of >64 bits of precision
915	// - 123L or 123l long suffix
916	// - traditional octal: 0755
917	// https://docs.python.org/2/reference/lexical_analysis.html#integer-and-long-integer-literals
918
919	start := sc.pos
920	fraction, exponent := false, false
921
922	if c == '.' {
923		// dot or start of fraction
924		sc.readRune()
925		c = sc.peekRune()
926		if !isdigit(c) {
927			sc.endToken(val)
928			return DOT
929		}
930		fraction = true
931	} else if c == '0' {
932		// hex, octal, binary or float
933		sc.readRune()
934		c = sc.peekRune()
935
936		if c == '.' {
937			fraction = true
938		} else if c == 'x' || c == 'X' {
939			// hex
940			sc.readRune()
941			c = sc.peekRune()
942			if !isxdigit(c) {
943				sc.error(start, "invalid hex literal")
944			}
945			for isxdigit(c) {
946				sc.readRune()
947				c = sc.peekRune()
948			}
949		} else if c == 'o' || c == 'O' {
950			// octal
951			sc.readRune()
952			c = sc.peekRune()
953			if !isodigit(c) {
954				sc.error(sc.pos, "invalid octal literal")
955			}
956			for isodigit(c) {
957				sc.readRune()
958				c = sc.peekRune()
959			}
960		} else if c == 'b' || c == 'B' {
961			// binary
962			sc.readRune()
963			c = sc.peekRune()
964			if !isbdigit(c) {
965				sc.error(sc.pos, "invalid binary literal")
966			}
967			for isbdigit(c) {
968				sc.readRune()
969				c = sc.peekRune()
970			}
971		} else {
972			// float (or obsolete octal "0755")
973			allzeros, octal := true, true
974			for isdigit(c) {
975				if c != '0' {
976					allzeros = false
977				}
978				if c > '7' {
979					octal = false
980				}
981				sc.readRune()
982				c = sc.peekRune()
983			}
984			if c == '.' {
985				fraction = true
986			} else if c == 'e' || c == 'E' {
987				exponent = true
988			} else if octal && !allzeros {
989				sc.endToken(val)
990				sc.errorf(sc.pos, "obsolete form of octal literal; use 0o%s", val.raw[1:])
991			}
992		}
993	} else {
994		// decimal
995		for isdigit(c) {
996			sc.readRune()
997			c = sc.peekRune()
998		}
999
1000		if c == '.' {
1001			fraction = true
1002		} else if c == 'e' || c == 'E' {
1003			exponent = true
1004		}
1005	}
1006
1007	if fraction {
1008		sc.readRune() // consume '.'
1009		c = sc.peekRune()
1010		for isdigit(c) {
1011			sc.readRune()
1012			c = sc.peekRune()
1013		}
1014
1015		if c == 'e' || c == 'E' {
1016			exponent = true
1017		}
1018	}
1019
1020	if exponent {
1021		sc.readRune() // consume [eE]
1022		c = sc.peekRune()
1023		if c == '+' || c == '-' {
1024			sc.readRune()
1025			c = sc.peekRune()
1026			if !isdigit(c) {
1027				sc.error(sc.pos, "invalid float literal")
1028			}
1029		}
1030		for isdigit(c) {
1031			sc.readRune()
1032			c = sc.peekRune()
1033		}
1034	}
1035
1036	sc.endToken(val)
1037	if fraction || exponent {
1038		var err error
1039		val.float, err = strconv.ParseFloat(val.raw, 64)
1040		if err != nil {
1041			sc.error(sc.pos, "invalid float literal")
1042		}
1043		return FLOAT
1044	} else {
1045		var err error
1046		s := val.raw
1047		val.bigInt = nil
1048		if len(s) > 2 && s[0] == '0' && (s[1] == 'o' || s[1] == 'O') {
1049			val.int, err = strconv.ParseInt(s[2:], 8, 64)
1050		} else if len(s) > 2 && s[0] == '0' && (s[1] == 'b' || s[1] == 'B') {
1051			val.int, err = strconv.ParseInt(s[2:], 2, 64)
1052		} else {
1053			val.int, err = strconv.ParseInt(s, 0, 64)
1054			if err != nil {
1055				num := new(big.Int)
1056				var ok bool
1057				val.bigInt, ok = num.SetString(s, 0)
1058				if ok {
1059					err = nil
1060				}
1061			}
1062		}
1063		if err != nil {
1064			sc.error(start, "invalid int literal")
1065		}
1066		return INT
1067	}
1068}
1069
1070// isIdent reports whether c is an identifier rune.
1071func isIdent(c rune) bool {
1072	return isdigit(c) || isIdentStart(c)
1073}
1074
1075func isIdentStart(c rune) bool {
1076	return 'a' <= c && c <= 'z' ||
1077		'A' <= c && c <= 'Z' ||
1078		c == '_' ||
1079		unicode.IsLetter(c)
1080}
1081
1082func isdigit(c rune) bool  { return '0' <= c && c <= '9' }
1083func isodigit(c rune) bool { return '0' <= c && c <= '7' }
1084func isxdigit(c rune) bool { return isdigit(c) || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f' }
1085func isbdigit(c rune) bool { return '0' == c || c == '1' }
1086
1087// keywordToken records the special tokens for
1088// strings that should not be treated as ordinary identifiers.
1089var keywordToken = map[string]Token{
1090	"and":      AND,
1091	"break":    BREAK,
1092	"continue": CONTINUE,
1093	"def":      DEF,
1094	"elif":     ELIF,
1095	"else":     ELSE,
1096	"for":      FOR,
1097	"if":       IF,
1098	"in":       IN,
1099	"lambda":   LAMBDA,
1100	"load":     LOAD,
1101	"not":      NOT,
1102	"or":       OR,
1103	"pass":     PASS,
1104	"return":   RETURN,
1105	"while":    WHILE,
1106
1107	// reserved words:
1108	"as": ILLEGAL,
1109	// "assert":   ILLEGAL, // heavily used by our tests
1110	"class":    ILLEGAL,
1111	"del":      ILLEGAL,
1112	"except":   ILLEGAL,
1113	"finally":  ILLEGAL,
1114	"from":     ILLEGAL,
1115	"global":   ILLEGAL,
1116	"import":   ILLEGAL,
1117	"is":       ILLEGAL,
1118	"nonlocal": ILLEGAL,
1119	"raise":    ILLEGAL,
1120	"try":      ILLEGAL,
1121	"with":     ILLEGAL,
1122	"yield":    ILLEGAL,
1123}
1124