1// Copyright 2017 The Bazel Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package syntax
6
7// Starlark quoted string utilities.
8
9import (
10	"fmt"
11	"strconv"
12	"strings"
13	"unicode"
14	"unicode/utf8"
15)
16
// unesc is the decoding table for single-character escapes.
// It is indexed by the byte that follows a backslash and yields the
// byte that the escape denotes. Every other entry is zero.
var unesc = [256]byte{
	'"':  '"',
	'\'': '\'',
	'\\': '\\',
	'a':  '\a',
	'b':  '\b',
	'f':  '\f',
	'n':  '\n',
	'r':  '\r',
	't':  '\t',
	'v':  '\v',
}
30
// esc is the encoding table for single-character escapes.
// It is indexed by a byte that needs escaping and yields the character
// to write after the backslash. Every other entry is zero.
var esc = [256]byte{
	'"':  '"',
	'\'': '\'',
	'\\': '\\',
	'\a': 'a',
	'\b': 'b',
	'\f': 'f',
	'\n': 'n',
	'\r': 'r',
	'\t': 't',
	'\v': 'v',
}
44
45// unquote unquotes the quoted string, returning the actual
46// string value, whether the original was triple-quoted,
47// whether it was a byte string, and an error describing invalid input.
48func unquote(quoted string) (s string, triple, isByte bool, err error) {
49	// Check for raw prefix: means don't interpret the inner \.
50	raw := false
51	if strings.HasPrefix(quoted, "r") {
52		raw = true
53		quoted = quoted[1:]
54	}
55	// Check for bytes prefix.
56	if strings.HasPrefix(quoted, "b") {
57		isByte = true
58		quoted = quoted[1:]
59	}
60
61	if len(quoted) < 2 {
62		err = fmt.Errorf("string literal too short")
63		return
64	}
65
66	if quoted[0] != '"' && quoted[0] != '\'' || quoted[0] != quoted[len(quoted)-1] {
67		err = fmt.Errorf("string literal has invalid quotes")
68		return
69	}
70
71	// Check for triple quoted string.
72	quote := quoted[0]
73	if len(quoted) >= 6 && quoted[1] == quote && quoted[2] == quote && quoted[:3] == quoted[len(quoted)-3:] {
74		triple = true
75		quoted = quoted[3 : len(quoted)-3]
76	} else {
77		quoted = quoted[1 : len(quoted)-1]
78	}
79
80	// Now quoted is the quoted data, but no quotes.
81	// If we're in raw mode or there are no escapes or
82	// carriage returns, we're done.
83	var unquoteChars string
84	if raw {
85		unquoteChars = "\r"
86	} else {
87		unquoteChars = "\\\r"
88	}
89	if !strings.ContainsAny(quoted, unquoteChars) {
90		s = quoted
91		return
92	}
93
94	// Otherwise process quoted string.
95	// Each iteration processes one escape sequence along with the
96	// plain text leading up to it.
97	buf := new(strings.Builder)
98	for {
99		// Remove prefix before escape sequence.
100		i := strings.IndexAny(quoted, unquoteChars)
101		if i < 0 {
102			i = len(quoted)
103		}
104		buf.WriteString(quoted[:i])
105		quoted = quoted[i:]
106
107		if len(quoted) == 0 {
108			break
109		}
110
111		// Process carriage return.
112		if quoted[0] == '\r' {
113			buf.WriteByte('\n')
114			if len(quoted) > 1 && quoted[1] == '\n' {
115				quoted = quoted[2:]
116			} else {
117				quoted = quoted[1:]
118			}
119			continue
120		}
121
122		// Process escape sequence.
123		if len(quoted) == 1 {
124			err = fmt.Errorf(`truncated escape sequence \`)
125			return
126		}
127
128		switch quoted[1] {
129		default:
130			// In Starlark, like Go, a backslash must escape something.
131			// (Python still treats unnecessary backslashes literally,
132			// but since 3.6 has emitted a deprecation warning.)
133			err = fmt.Errorf("invalid escape sequence \\%c", quoted[1])
134			return
135
136		case '\n':
137			// Ignore the escape and the line break.
138			quoted = quoted[2:]
139
140		case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
141			// One-char escape.
142			// Escapes are allowed for both kinds of quotation
143			// mark, not just the kind in use.
144			buf.WriteByte(unesc[quoted[1]])
145			quoted = quoted[2:]
146
147		case '0', '1', '2', '3', '4', '5', '6', '7':
148			// Octal escape, up to 3 digits, \OOO.
149			n := int(quoted[1] - '0')
150			quoted = quoted[2:]
151			for i := 1; i < 3; i++ {
152				if len(quoted) == 0 || quoted[0] < '0' || '7' < quoted[0] {
153					break
154				}
155				n = n*8 + int(quoted[0]-'0')
156				quoted = quoted[1:]
157			}
158			if !isByte && n > 127 {
159				err = fmt.Errorf(`non-ASCII octal escape \%o (use \u%04X for the UTF-8 encoding of U+%04X)`, n, n, n)
160				return
161			}
162			if n >= 256 {
163				// NOTE: Python silently discards the high bit,
164				// so that '\541' == '\141' == 'a'.
165				// Let's see if we can avoid doing that in BUILD files.
166				err = fmt.Errorf(`invalid escape sequence \%03o`, n)
167				return
168			}
169			buf.WriteByte(byte(n))
170
171		case 'x':
172			// Hexadecimal escape, exactly 2 digits, \xXX. [0-127]
173			if len(quoted) < 4 {
174				err = fmt.Errorf(`truncated escape sequence %s`, quoted)
175				return
176			}
177			n, err1 := strconv.ParseUint(quoted[2:4], 16, 0)
178			if err1 != nil {
179				err = fmt.Errorf(`invalid escape sequence %s`, quoted[:4])
180				return
181			}
182			if !isByte && n > 127 {
183				err = fmt.Errorf(`non-ASCII hex escape %s (use \u%04X for the UTF-8 encoding of U+%04X)`,
184					quoted[:4], n, n)
185				return
186			}
187			buf.WriteByte(byte(n))
188			quoted = quoted[4:]
189
190		case 'u', 'U':
191			// Unicode code point, 4 (\uXXXX) or 8 (\UXXXXXXXX) hex digits.
192			sz := 6
193			if quoted[1] == 'U' {
194				sz = 10
195			}
196			if len(quoted) < sz {
197				err = fmt.Errorf(`truncated escape sequence %s`, quoted)
198				return
199			}
200			n, err1 := strconv.ParseUint(quoted[2:sz], 16, 0)
201			if err1 != nil {
202				err = fmt.Errorf(`invalid escape sequence %s`, quoted[:sz])
203				return
204			}
205			if n > unicode.MaxRune {
206				err = fmt.Errorf(`code point out of range: %s (max \U%08x)`,
207					quoted[:sz], n)
208				return
209			}
210			// As in Go, surrogates are disallowed.
211			if 0xD800 <= n && n < 0xE000 {
212				err = fmt.Errorf(`invalid Unicode code point U+%04X`, n)
213				return
214			}
215			buf.WriteRune(rune(n))
216			quoted = quoted[sz:]
217		}
218	}
219
220	s = buf.String()
221	return
222}
223
// indexByte returns the index of the first instance of b in s,
// or -1 if b does not occur in s.
//
// It is a thin wrapper around strings.IndexByte, kept so that
// existing callers of indexByte continue to work.
func indexByte(s string, b byte) int {
	return strings.IndexByte(s, b)
}
233
// Quote returns a double-quoted Starlark literal that denotes s.
// If b is true, the result is a bytes literal (it carries a 'b' prefix).
//
// Printable runes are copied through unchanged, '"' and '\' are
// backslash-escaped, and every other rune is written as a named,
// \x, \u or \U escape. A byte that is not part of a valid UTF-8
// encoding is written as \xXX.
func Quote(s string, b bool) string {
	const digits = "0123456789abcdef"

	out := make([]byte, 0, 3*len(s)/2)
	if b {
		out = append(out, 'b')
	}
	out = append(out, '"')

	for len(s) > 0 {
		// Decode the next rune; ASCII bytes need no decoding.
		first := s[0]
		r, size := rune(first), 1
		if first >= utf8.RuneSelf {
			r, size = utf8.DecodeRuneInString(s)
		}
		encoded := s[:size]
		s = s[size:]

		switch {
		case size == 1 && r == utf8.RuneError:
			// String (!b) literals accept \xXX escapes only for ASCII,
			// but there is no other way to represent an invalid byte.
			// The result is therefore not a legal literal.
			out = append(out, '\\', 'x', digits[first>>4], digits[first&0xF])

		case r == '"' || r == '\\':
			// These two are always backslashed.
			out = append(out, '\\', byte(r))

		case strconv.IsPrint(r):
			// The rune decoded successfully, so its source bytes
			// are exactly its UTF-8 encoding.
			out = append(out, encoded...)

		case r == '\a':
			out = append(out, '\\', 'a')
		case r == '\b':
			out = append(out, '\\', 'b')
		case r == '\f':
			out = append(out, '\\', 'f')
		case r == '\n':
			out = append(out, '\\', 'n')
		case r == '\r':
			out = append(out, '\\', 'r')
		case r == '\t':
			out = append(out, '\\', 't')
		case r == '\v':
			out = append(out, '\\', 'v')

		case r < ' ' || r == 0x7f:
			// Remaining ASCII control characters.
			c := byte(r)
			out = append(out, '\\', 'x', digits[c>>4], digits[c&0xF])

		case r > utf8.MaxRune || r < 0x10000:
			// Four hex digits suffice; an out-of-range rune is
			// written as the replacement character.
			if r > utf8.MaxRune {
				r = 0xFFFD
			}
			out = append(out, '\\', 'u')
			for shift := 12; shift >= 0; shift -= 4 {
				out = append(out, digits[r>>uint(shift)&0xF])
			}

		default:
			// Supplementary-plane rune: eight hex digits.
			out = append(out, '\\', 'U')
			for shift := 28; shift >= 0; shift -= 4 {
				out = append(out, digits[r>>uint(shift)&0xF])
			}
		}
	}

	out = append(out, '"')
	return string(out)
}
310