1 /*
2 ** $Id: llex.c $
3 ** Lexical Analyzer
4 ** See Copyright Notice in lua.h
5 */
6 
7 #define llex_c
8 #define LUA_CORE
9 
10 #include "lprefix.h"
11 
12 
13 #include <locale.h>
14 #include <string.h>
15 
16 #include "lua.h"
17 
18 #include "lctype.h"
19 #include "ldebug.h"
20 #include "ldo.h"
21 #include "lgc.h"
22 #include "llex.h"
23 #include "lobject.h"
24 #include "lparser.h"
25 #include "lstate.h"
26 #include "lstring.h"
27 #include "ltable.h"
28 #include "lzio.h"
29 
30 
31 
/*
** Advance the scanner by one character: read the next character from
** the input stream 'z' and store it in 'current'. The value of the
** expression is the character just read. The argument is evaluated
** twice, so it must not have side effects.
*/
#define next(ls)	((ls)->current = zgetc((ls)->z))



/* true when the current character is '\n' or '\r' */
#define currIsNewline(ls)	((ls)->current == '\n' || (ls)->current == '\r')
37 
38 
/*
** ORDER RESERVED
** Textual form of every multi-character token, indexed by
** 'token - FIRST_RESERVED' (see 'luaX_token2str'). The first
** NUM_RESERVED entries are registered as reserved words by 'luaX_init',
** which stores 'index + 1' in each string's 'extra' field; 'llex' maps
** that field back to a token code. The order of the entries is therefore
** significant (presumably it must match the token enumeration declared
** in llex.h -- confirm there before reordering).
*/
static const char *const luaX_tokens [] = {
    "and", "break", "do", "else", "elseif",
    "end", "false", "for", "function", "goto", "if",
    "in", "local", "nil", "not", "or", "repeat",
    "return", "then", "true", "until", "while",
    "//", "..", "...", "==", ">=", "<=", "~=",
    "<<", ">>", "::", "<eof>",
    "<number>", "<integer>", "<name>", "<string>"
};
49 
50 
51 #define save_and_next(ls) (save(ls, ls->current), next(ls))
52 
53 
54 static l_noret lexerror (LexState *ls, const char *msg, int token);
55 
56 
/*
** Append character 'c' to the scanner's token buffer. When the buffer
** is full its capacity is doubled first; if the current capacity is
** already at least MAX_SIZE/2, a lexical error ("lexical element too
** long") is raised instead.
*/
static void save (LexState *ls, int c) {
  Mbuffer *buf = ls->buff;
  size_t capacity = luaZ_sizebuffer(buf);
  if (luaZ_bufflen(buf) + 1 > capacity) {  /* no room for one more char? */
    size_t doubled;
    if (capacity >= MAX_SIZE/2)  /* doubling would exceed the limit */
      lexerror(ls, "lexical element too long", 0);
    doubled = capacity * 2;
    luaZ_resizebuffer(ls->L, buf, doubled);
  }
  buf->buffer[luaZ_bufflen(buf)] = cast_char(c);
  luaZ_bufflen(buf)++;
}
68 
69 
/*
** Create the strings the lexer depends on and fix them so that they are
** never collected: the environment name (LUA_ENV) and the first
** NUM_RESERVED entries of 'luaX_tokens'. Each reserved word gets its
** 1-based table position stored in its 'extra' field, which 'llex'
** later uses to turn an identifier into a reserved-word token.
*/
void luaX_init (lua_State *L) {
  int idx = 0;
  TString *envname = luaS_newliteral(L, LUA_ENV);  /* create env name */
  luaC_fix(L, obj2gco(envname));  /* never collect this name */
  while (idx < NUM_RESERVED) {
    TString *word = luaS_new(L, luaX_tokens[idx]);
    luaC_fix(L, obj2gco(word));  /* reserved words are never collected */
    idx++;
    word->extra = cast_byte(idx);  /* mark as reserved: position + 1 */
  }
}
80 
81 
/*
** Return a printable representation of 'token' for error messages.
** Tokens below FIRST_RESERVED are single characters: printable ones are
** shown quoted, others as '<\N>' with N their numeric code. Tokens
** below TK_EOS are shown as their quoted entry in 'luaX_tokens'; the
** remaining ones return the table entry itself, unquoted. Except in
** that last case the result is built with 'luaO_pushfstring'.
*/
const char *luaX_token2str (LexState *ls, int token) {
  const char *name;
  if (token < FIRST_RESERVED) {  /* single-byte symbol */
    if (!lisprint(token))  /* control character */
      return luaO_pushfstring(ls->L, "'<\\%d>'", token);
    return luaO_pushfstring(ls->L, "'%c'", token);
  }
  name = luaX_tokens[token - FIRST_RESERVED];
  if (token >= TK_EOS)  /* names, strings, and numerals */
    return name;
  /* fixed format (symbols and reserved words) */
  return luaO_pushfstring(ls->L, "'%s'", name);
}
97 
98 
/*
** Return the text to show for 'token' in an error message. For names,
** strings, and numerals this is the current content of the token buffer
** (terminated here with '\0') between quotes; any other token is
** described by 'luaX_token2str'.
*/
static const char *txtToken (LexState *ls, int token) {
  if (token == TK_NAME || token == TK_STRING ||
      token == TK_FLT || token == TK_INT) {
    save(ls, '\0');  /* make the buffer a proper C string */
    return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff));
  }
  return luaX_token2str(ls, token);
}
109 
110 
lexerror(LexState * ls,const char * msg,int token)111 static l_noret lexerror (LexState *ls, const char *msg, int token) {
112   msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber);
113   if (token)
114     luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
115   luaD_throw(ls->L, LUA_ERRSYNTAX);
116 }
117 
118 
luaX_syntaxerror(LexState * ls,const char * msg)119 l_noret luaX_syntaxerror (LexState *ls, const char *msg) {
120   lexerror(ls, msg, ls->t.token);
121 }
122 
123 
124 /*
125 ** creates a new string and anchors it in scanner's table so that
126 ** it will not be collected until the end of the compilation
127 ** (by that time it should be anchored somewhere)
128 */
luaX_newstring(LexState * ls,const char * str,size_t l)129 TString *luaX_newstring (LexState *ls, const char *str, size_t l) {
130   lua_State *L = ls->L;
131   TValue *o;  /* entry for 'str' */
132   TString *ts = luaS_newlstr(L, str, l);  /* create new string */
133   setsvalue2s(L, L->top++, ts);  /* temporarily anchor it in stack */
134   o = luaH_set(L, ls->h, s2v(L->top - 1));
135   if (isempty(o)) {  /* not in use yet? */
136     /* boolean value does not need GC barrier;
137        table is not a metatable, so it does not need to invalidate cache */
138     setbtvalue(o);  /* t[string] = true */
139     luaC_checkGC(L);
140   }
141   else {  /* string already present */
142     ts = keystrval(nodefromval(o));  /* re-use value previously stored */
143   }
144   L->top--;  /* remove string from stack */
145   return ts;
146 }
147 
148 
149 /*
150 ** increment line number and skips newline sequence (any of
151 ** \n, \r, \n\r, or \r\n)
152 */
inclinenumber(LexState * ls)153 static void inclinenumber (LexState *ls) {
154   int old = ls->current;
155   lua_assert(currIsNewline(ls));
156   next(ls);  /* skip '\n' or '\r' */
157   if (currIsNewline(ls) && ls->current != old)
158     next(ls);  /* skip '\n\r' or '\r\n' */
159   if (++ls->linenumber >= MAX_INT)
160     lexerror(ls, "chunk has too many lines", 0);
161 }
162 
163 
/*
** Prepare 'ls' to scan the stream 'z', whose first character
** ('firstchar') has already been read. Line counters start at 1, there
** is no current function state and no look-ahead token. The token
** buffer 'ls->buff' is not assigned here; it is only resized to
** LUA_MINBUFFER.
*/
void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source,
                    int firstchar) {
  /* where the input comes from */
  ls->L = L;
  ls->z = z;
  ls->source = source;
  ls->current = firstchar;
  /* scanning state */
  ls->t.token = 0;
  ls->lookahead.token = TK_EOS;  /* no look-ahead token */
  ls->fs = NULL;
  ls->linenumber = 1;
  ls->lastline = 1;
  ls->envn = luaS_newliteral(L, LUA_ENV);  /* get env name */
  luaZ_resizebuffer(L, ls->buff, LUA_MINBUFFER);  /* initialize buffer */
}
178 
179 
180 
181 /*
182 ** =======================================================
183 ** LEXICAL ANALYZER
184 ** =======================================================
185 */
186 
187 
/*
** If the current character is 'c', skip it (without saving it in the
** buffer) and return 1; otherwise leave the input untouched and
** return 0.
*/
static int check_next1 (LexState *ls, int c) {
  if (ls->current != c)
    return 0;
  next(ls);
  return 1;
}
195 
196 
197 /*
198 ** Check whether current char is in set 'set' (with two chars) and
199 ** saves it
200 */
check_next2(LexState * ls,const char * set)201 static int check_next2 (LexState *ls, const char *set) {
202   lua_assert(set[2] == '\0');
203   if (ls->current == set[0] || ls->current == set[1]) {
204     save_and_next(ls);
205     return 1;
206   }
207   else return 0;
208 }
209 
210 
211 /* LUA_NUMBER */
212 /*
213 ** This function is quite liberal in what it accepts, as 'luaO_str2num'
214 ** will reject ill-formed numerals. Roughly, it accepts the following
215 ** pattern:
216 **
217 **   %d(%x|%.|([Ee][+-]?))* | 0[Xx](%x|%.|([Pp][+-]?))*
218 **
219 ** The only tricky part is to accept [+-] only after a valid exponent
220 ** mark, to avoid reading '3-4' or '0xe+1' as a single number.
221 **
222 ** The caller might have already read an initial dot.
223 */
read_numeral(LexState * ls,SemInfo * seminfo)224 static int read_numeral (LexState *ls, SemInfo *seminfo) {
225   TValue obj;
226   const char *expo = "Ee";
227   int first = ls->current;
228   lua_assert(lisdigit(ls->current));
229   save_and_next(ls);
230   if (first == '0' && check_next2(ls, "xX"))  /* hexadecimal? */
231     expo = "Pp";
232   for (;;) {
233     if (check_next2(ls, expo))  /* exponent mark? */
234       check_next2(ls, "-+");  /* optional exponent sign */
235     else if (lisxdigit(ls->current) || ls->current == '.')  /* '%x|%.' */
236       save_and_next(ls);
237     else break;
238   }
239   if (lislalpha(ls->current))  /* is numeral touching a letter? */
240     save_and_next(ls);  /* force an error */
241   save(ls, '\0');
242   if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0)  /* format error? */
243     lexerror(ls, "malformed number", TK_FLT);
244   if (ttisinteger(&obj)) {
245     seminfo->i = ivalue(&obj);
246     return TK_INT;
247   }
248   else {
249     lua_assert(ttisfloat(&obj));
250     seminfo->r = fltvalue(&obj);
251     return TK_FLT;
252   }
253 }
254 
255 
256 /*
257 ** reads a sequence '[=*[' or ']=*]', leaving the last bracket.
258 ** If sequence is well formed, return its number of '='s + 2; otherwise,
259 ** return 1 if there is no '='s or 0 otherwise (an unfinished '[==...').
260 */
skip_sep(LexState * ls)261 static size_t skip_sep (LexState *ls) {
262   size_t count = 0;
263   int s = ls->current;
264   lua_assert(s == '[' || s == ']');
265   save_and_next(ls);
266   while (ls->current == '=') {
267     save_and_next(ls);
268     count++;
269   }
270   return (ls->current == s) ? count + 2
271          : (count == 0) ? 1
272          : 0;
273 }
274 
275 
/*
** Read the body of a long string or long comment whose opening
** delimiter has just been scanned by 'skip_sep' ('sep' is the value it
** returned; the second '[' is still the current character). A NULL
** 'seminfo' means a comment: ordinary characters are then skipped
** instead of saved and the buffer is reset at each line break. For a
** string, the result (buffer content without the 'sep'-character
** delimiters at both ends) is stored in 'seminfo->ts'. A newline right
** after the opening delimiter is skipped. Reaching the end of input
** raises an "unfinished long string/comment" error.
*/
static void read_long_string (LexState *ls, SemInfo *seminfo, size_t sep) {
  int startline = ls->linenumber;  /* initial line (for error message) */
  int closed = 0;  /* becomes true once the closing delimiter is read */
  save_and_next(ls);  /* skip 2nd '[' */
  if (currIsNewline(ls))  /* string starts with a newline? */
    inclinenumber(ls);  /* skip it */
  while (!closed) {
    int c = ls->current;
    if (c == EOZ) {  /* error */
      const char *what = (seminfo ? "string" : "comment");
      const char *msg = luaO_pushfstring(ls->L,
                   "unfinished long %s (starting at line %d)", what, startline);
      lexerror(ls, msg, TK_EOS);
    }
    else if (c == ']') {
      if (skip_sep(ls) == sep) {  /* same level as the opening one? */
        save_and_next(ls);  /* skip 2nd ']' */
        closed = 1;
      }
    }
    else if (c == '\n' || c == '\r') {
      save(ls, '\n');  /* any newline sequence is stored as '\n' */
      inclinenumber(ls);
      if (!seminfo) luaZ_resetbuffer(ls->buff);  /* avoid wasting space */
    }
    else if (seminfo)
      save_and_next(ls);
    else
      next(ls);
  }
  if (seminfo)
    seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
                                     luaZ_bufflen(ls->buff) - 2 * sep);
}
313 
314 
/*
** Escape-sequence assertion: when condition 'c' is false, raise a
** lexical error with message 'msg' near the string read so far. The
** offending character (unless it is the end of input) is first added
** to the buffer so that it shows up in the error message.
*/
static void esccheck (LexState *ls, int c, const char *msg) {
  if (c)
    return;  /* condition holds: nothing to report */
  if (ls->current != EOZ)
    save_and_next(ls);  /* add current to buffer for error message */
  lexerror(ls, msg, TK_STRING);
}
322 
323 
/*
** Save the current character, advance, and require the new current
** character to be a hexadecimal digit (error otherwise). Returns that
** digit's value; the digit itself is left as the current character.
*/
static int gethexa (LexState *ls) {
  int ishex;
  save_and_next(ls);
  ishex = lisxdigit(ls->current);
  esccheck (ls, ishex, "hexadecimal digit expected");
  return luaO_hexavalue(ls->current);
}
329 
330 
/*
** Read the two hexadecimal digits of a '\xXX' escape (the current
** character is the 'x') and return the byte value they denote. The two
** characters saved along the way are removed from the buffer; the
** second digit is left as the current character.
*/
static int readhexaesc (LexState *ls) {
  int high = gethexa(ls);
  int low = gethexa(ls);
  luaZ_buffremove(ls->buff, 2);  /* remove saved chars from buffer */
  return (high << 4) + low;
}
337 
338 
/*
** Read a '\u{XXX}' escape (the current character is the 'u') and
** return the value of its hexadecimal digits. At least one digit is
** required and the value may not exceed 0x7FFFFFFF. All characters
** saved while reading (including the initial backslash saved by the
** caller) are removed from the buffer; the closing '}' is consumed.
*/
static unsigned long readutf8esc (LexState *ls) {
  unsigned long code;
  int nsaved = 4;  /* chars to be removed: '\', 'u', '{', and first digit */
  save_and_next(ls);  /* skip 'u' */
  esccheck(ls, ls->current == '{', "missing '{'");
  code = gethexa(ls);  /* must have at least one digit */
  for (;;) {
    save_and_next(ls);  /* save the digit just handled */
    if (!lisxdigit(ls->current))
      break;
    nsaved++;
    esccheck(ls, code <= (0x7FFFFFFFu >> 4), "UTF-8 value too large");
    code = (code << 4) + luaO_hexavalue(ls->current);
  }
  esccheck(ls, ls->current == '}', "missing '}'");
  next(ls);  /* skip '}' */
  luaZ_buffremove(ls->buff, nsaved);  /* remove saved chars from buffer */
  return code;
}
355 
356 
/*
** Handle a '\u{XXX}' escape: read its value, have 'luaO_utf8esc' encode
** it into a local buffer, and append the last 'n' bytes of that buffer
** (where 'n' is the count returned by 'luaO_utf8esc') to the token
** buffer.
*/
static void utf8esc (LexState *ls) {
  char encoded[UTF8BUFFSZ];
  int remaining = luaO_utf8esc(encoded, readutf8esc(ls));
  while (remaining > 0) {  /* add encoded bytes to string */
    save(ls, encoded[UTF8BUFFSZ - remaining]);
    remaining--;
  }
}
363 
364 
/*
** Read a decimal escape '\ddd' (the current character is its first
** digit): up to three decimal digits whose value must not exceed
** UCHAR_MAX. The digits are removed from the buffer again and the
** value is returned; the current character is the one after the last
** digit read.
*/
static int readdecesc (LexState *ls) {
  int ndigits = 0;
  int value = 0;  /* result accumulator */
  while (ndigits < 3 && lisdigit(ls->current)) {  /* read up to 3 digits */
    value = 10*value + ls->current - '0';
    save_and_next(ls);
    ndigits++;
  }
  esccheck(ls, value <= UCHAR_MAX, "decimal escape too large");
  luaZ_buffremove(ls->buff, ndigits);  /* remove read digits from buffer */
  return value;
}
376 
377 
/*
** Read a short literal string delimited by 'del' (the current
** character on entry) and store the resulting string in 'seminfo->ts'.
** Both delimiters are kept in the buffer while scanning (so that error
** messages can show them) and are stripped when the final string is
** created. An unescaped line break or the end of input raises an
** "unfinished string" error.
**
** Escape sequences: the backslash is saved first (again for error
** messages) and each case then leaves through one of three labels:
**   read_save: skip the current character, then continue at 'only_save';
**   only_save: remove the saved backslash from the buffer and save 'c';
**   no_save:   nothing else to do (the case handled the buffer itself).
*/
static void read_string (LexState *ls, int del, SemInfo *seminfo) {
  save_and_next(ls);  /* keep delimiter (for error messages) */
  while (ls->current != del) {
    switch (ls->current) {
      case EOZ:
        lexerror(ls, "unfinished string", TK_EOS);
        break;  /* to avoid warnings */
      case '\n':
      case '\r':
        lexerror(ls, "unfinished string", TK_STRING);
        break;  /* to avoid warnings */
      case '\\': {  /* escape sequences */
        int c;  /* final character to be saved */
        save_and_next(ls);  /* keep '\\' for error messages */
        switch (ls->current) {
          case 'a': c = '\a'; goto read_save;
          case 'b': c = '\b'; goto read_save;
          case 'f': c = '\f'; goto read_save;
          case 'n': c = '\n'; goto read_save;
          case 'r': c = '\r'; goto read_save;
          case 't': c = '\t'; goto read_save;
          case 'v': c = '\v'; goto read_save;
          /* 'readhexaesc' stops on the second hex digit; 'read_save'
             skips it */
          case 'x': c = readhexaesc(ls); goto read_save;
          /* 'utf8esc' consumes the whole escape, removes the saved
             backslash itself, and saves the encoded bytes */
          case 'u': utf8esc(ls);  goto no_save;
          case '\n': case '\r':  /* escaped line break: stored as '\n' */
            inclinenumber(ls); c = '\n'; goto only_save;
          case '\\': case '\"': case '\'':
            c = ls->current; goto read_save;
          case EOZ: goto no_save;  /* will raise an error next loop */
          case 'z': {  /* zap following span of spaces */
            luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
            next(ls);  /* skip the 'z' */
            while (lisspace(ls->current)) {
              if (currIsNewline(ls)) inclinenumber(ls);
              else next(ls);
            }
            goto no_save;
          }
          default: {
            esccheck(ls, lisdigit(ls->current), "invalid escape sequence");
            /* 'readdecesc' already consumed the digits, so there is
               nothing left to skip */
            c = readdecesc(ls);  /* digital escape '\ddd' */
            goto only_save;
          }
        }
       read_save:
         next(ls);
         /* go through */
       only_save:
         luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
         save(ls, c);
         /* go through */
       no_save: break;
      }
      default:
        save_and_next(ls);
    }
  }
  save_and_next(ls);  /* skip delimiter */
  /* create the string without the two delimiters kept in the buffer */
  seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1,
                                   luaZ_bufflen(ls->buff) - 2);
}
439 
440 
/*
** Main scanner function: read and return the next token, starting at
** the current character. The token buffer is reset on entry. Line
** breaks, spaces, and comments are skipped by looping back to the
** switch; every other case returns a token. Single-character tokens are
** returned as their own character code. For names, strings, and
** numerals the semantic value is stored in '*seminfo'. (For reserved
** words 'seminfo->ts' is set too.) Returns TK_EOS at the end of input.
*/
static int llex (LexState *ls, SemInfo *seminfo) {
  luaZ_resetbuffer(ls->buff);
  for (;;) {
    switch (ls->current) {
      case '\n': case '\r': {  /* line breaks */
        inclinenumber(ls);
        break;
      }
      case ' ': case '\f': case '\t': case '\v': {  /* spaces */
        next(ls);
        break;
      }
      case '-': {  /* '-' or '--' (comment) */
        next(ls);
        if (ls->current != '-') return '-';
        /* else is a comment */
        next(ls);
        if (ls->current == '[') {  /* long comment? */
          size_t sep = skip_sep(ls);
          luaZ_resetbuffer(ls->buff);  /* 'skip_sep' may dirty the buffer */
          if (sep >= 2) {
            read_long_string(ls, NULL, sep);  /* skip long comment */
            luaZ_resetbuffer(ls->buff);  /* previous call may dirty the buff. */
            break;
          }
        }
        /* else short comment */
        while (!currIsNewline(ls) && ls->current != EOZ)
          next(ls);  /* skip until end of line (or end of file) */
        break;
      }
      case '[': {  /* long string or simply '[' */
        size_t sep = skip_sep(ls);
        if (sep >= 2) {  /* well-formed opening '[=*['? */
          read_long_string(ls, seminfo, sep);
          return TK_STRING;
        }
        else if (sep == 0)  /* '[=...' missing second bracket? */
          lexerror(ls, "invalid long string delimiter", TK_STRING);
        return '[';  /* sep == 1: a plain '[' not followed by '=' or '[' */
      }
      case '=': {
        next(ls);
        if (check_next1(ls, '=')) return TK_EQ;
        else return '=';
      }
      case '<': {
        next(ls);
        if (check_next1(ls, '=')) return TK_LE;
        else if (check_next1(ls, '<')) return TK_SHL;
        else return '<';
      }
      case '>': {
        next(ls);
        if (check_next1(ls, '=')) return TK_GE;
        else if (check_next1(ls, '>')) return TK_SHR;
        else return '>';
      }
      case '/': {
        next(ls);
        if (check_next1(ls, '/')) return TK_IDIV;
        else return '/';
      }
      case '~': {
        next(ls);
        if (check_next1(ls, '=')) return TK_NE;
        else return '~';
      }
      case ':': {
        next(ls);
        if (check_next1(ls, ':')) return TK_DBCOLON;
        else return ':';
      }
      case '"': case '\'': {  /* short literal strings */
        read_string(ls, ls->current, seminfo);
        return TK_STRING;
      }
      case '.': {  /* '.', '..', '...', or number */
        /* the dot is saved so that a numeral such as '.5' is complete
           in the buffer when 'read_numeral' converts it */
        save_and_next(ls);
        if (check_next1(ls, '.')) {
          if (check_next1(ls, '.'))
            return TK_DOTS;   /* '...' */
          else return TK_CONCAT;   /* '..' */
        }
        else if (!lisdigit(ls->current)) return '.';
        else return read_numeral(ls, seminfo);
      }
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9': {
        return read_numeral(ls, seminfo);
      }
      case EOZ: {
        return TK_EOS;
      }
      default: {
        if (lislalpha(ls->current)) {  /* identifier or reserved word? */
          TString *ts;
          do {
            save_and_next(ls);
          } while (lislalnum(ls->current));
          ts = luaX_newstring(ls, luaZ_buffer(ls->buff),
                                  luaZ_bufflen(ls->buff));
          seminfo->ts = ts;
          if (isreserved(ts))  /* reserved word? */
            /* 'luaX_init' stored (table position + 1) in 'extra' */
            return ts->extra - 1 + FIRST_RESERVED;
          else {
            return TK_NAME;
          }
        }
        else {  /* single-char tokens (+ - / ...) */
          int c = ls->current;
          next(ls);
          return c;
        }
      }
    }
  }
}
559 
560 
/*
** Advance to the next token, storing it in 'ls->t'. The line number of
** the token being left is remembered in 'ls->lastline'. A pending
** look-ahead token, if any, is used (and discharged) instead of
** scanning new input.
*/
void luaX_next (LexState *ls) {
  ls->lastline = ls->linenumber;
  if (ls->lookahead.token == TK_EOS) {  /* no look-ahead token? */
    ls->t.token = llex(ls, &ls->t.seminfo);  /* read next token */
    return;
  }
  ls->t = ls->lookahead;  /* use the look-ahead token */
  ls->lookahead.token = TK_EOS;  /* and discharge it */
}
570 
571 
/*
** Scan one token ahead of the current one, keep it in 'ls->lookahead',
** and return its code. There must be no pending look-ahead token
** (checked with 'lua_assert').
*/
int luaX_lookahead (LexState *ls) {
  int tk;
  lua_assert(ls->lookahead.token == TK_EOS);
  tk = llex(ls, &ls->lookahead.seminfo);
  ls->lookahead.token = tk;
  return tk;
}
577 
578