1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "ytab.h"
31 
32 extern YYSTYPE	yylval;
33 extern int	infunc;
34 
/* Lexer-wide counters. */
int	lineno = 1;	/* source line number; yylex() bumps it as newlines are consumed */
int	bracecnt = 0;	/* '{' seen so far minus '}' seen so far */
int	brackcnt = 0;	/* '[' seen so far minus ']' seen so far */
int	parencnt = 0;	/* '(' seen so far minus ')' seen so far */
39 
/* One row of the reserved-word table that binsearch() looks words up in. */
typedef struct Keyword {
	const char	*word;	/* spelling of the reserved word */
	int		sub;	/* value word() stores in yylval.i */
	int		type;	/* token type word() returns for this word */
} Keyword;
45 
46 Keyword keywords[] ={	/* keep sorted: binary searched */
47 	{ "BEGIN",	XBEGIN,		XBEGIN },
48 	{ "END",	XEND,		XEND },
49 	{ "NF",		VARNF,		VARNF },
50 	{ "atan2",	FATAN,		BLTIN },
51 	{ "break",	BREAK,		BREAK },
52 	{ "close",	CLOSE,		CLOSE },
53 	{ "continue",	CONTINUE,	CONTINUE },
54 	{ "cos",	FCOS,		BLTIN },
55 	{ "delete",	DELETE,		DELETE },
56 	{ "do",		DO,		DO },
57 	{ "else",	ELSE,		ELSE },
58 	{ "exit",	EXIT,		EXIT },
59 	{ "exp",	FEXP,		BLTIN },
60 	{ "fflush",	FFLUSH,		BLTIN },
61 	{ "for",	FOR,		FOR },
62 	{ "func",	FUNC,		FUNC },
63 	{ "function",	FUNC,		FUNC },
64 	{ "getline",	GETLINE,	GETLINE },
65 	{ "gsub",	GSUB,		GSUB },
66 	{ "if",		IF,		IF },
67 	{ "in",		IN,		IN },
68 	{ "index",	INDEX,		INDEX },
69 	{ "int",	FINT,		BLTIN },
70 	{ "length",	FLENGTH,	BLTIN },
71 	{ "log",	FLOG,		BLTIN },
72 	{ "match",	MATCHFCN,	MATCHFCN },
73 	{ "next",	NEXT,		NEXT },
74 	{ "nextfile",	NEXTFILE,	NEXTFILE },
75 	{ "print",	PRINT,		PRINT },
76 	{ "printf",	PRINTF,		PRINTF },
77 	{ "rand",	FRAND,		BLTIN },
78 	{ "return",	RETURN,		RETURN },
79 	{ "sin",	FSIN,		BLTIN },
80 	{ "split",	SPLIT,		SPLIT },
81 	{ "sprintf",	SPRINTF,	SPRINTF },
82 	{ "sqrt",	FSQRT,		BLTIN },
83 	{ "srand",	FSRAND,		BLTIN },
84 	{ "sub",	SUB,		SUB },
85 	{ "substr",	SUBSTR,		SUBSTR },
86 	{ "system",	FSYSTEM,	BLTIN },
87 	{ "tolower",	FTOLOWER,	BLTIN },
88 	{ "toupper",	FTOUPPER,	BLTIN },
89 	{ "while",	WHILE,		WHILE },
90 };
91 
/* Return token x from the enclosing function; when dbg is set, first print its name via tokname(). */
#define	RET(x)	{ if (dbg) printf("lex %s\n", tokname(x)); return (x); }
93 
/* peek: return the next input character without consuming it (reads it, then pushes it back). */
int peek(void)
{
	const int next = input();

	unput(next);
	return next;
}
100 
/*
 * gettok: read the next raw token from input() into the caller's buffer.
 *
 * pbuf, psz: address of the caller's heap buffer and of its size in bytes.
 *	The buffer may be reallocated through adjbuf(); when a name or
 *	number has been scanned both are updated before returning.
 *	The buffer must hold at least 2 bytes on entry.
 *
 * Returns:
 *	0	at end of input;
 *	'a'	when a name ([A-Za-z_][A-Za-z0-9_]*) was stored in the buffer;
 *	'0'	when a number (as accepted by strtod) was stored in the buffer;
 *	else	the character itself, which is also left in the buffer as a
 *		one-character string.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Keep room for this character AND the terminating NUL.
			 * Growing only once bp-buf >= sz let the final "*bp = 0"
			 * land one byte past the buffer when input ended with
			 * the name exactly filling it.
			 */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same rule as above: character plus terminator must fit */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			buf[1] = 0;	/* return one character as token */
			retc = buf[0];	/* character is its own type */
			unputstr(rem+1); /* put rest back for later */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
163 
/* Helpers used by yylex() and defined later in this file. */
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot flags examined at the top of yylex(). */
int	sc	= 0;	/* 1 => return a } right now */
int	reg	= 0;	/* 1 => return a REGEXPR now */
169 
yylex(void)170 int yylex(void)
171 {
172 	int c;
173 	static char *buf = 0;
174 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
175 
176 	if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
177 		FATAL( "out of space in yylex" );
178 	if (sc) {
179 		sc = 0;
180 		RET('}');
181 	}
182 	if (reg) {
183 		reg = 0;
184 		return regexpr();
185 	}
186 	for (;;) {
187 		c = gettok(&buf, &bufsize);
188 		if (c == 0)
189 			return 0;
190 		if (isalpha(c) || c == '_')
191 			return word(buf);
192 		if (isdigit(c)) {
193 			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
194 			/* should this also have STR set? */
195 			RET(NUMBER);
196 		}
197 
198 		yylval.i = c;
199 		switch (c) {
200 		case '\n':	/* {EOL} */
201 			lineno++;
202 			RET(NL);
203 		case '\r':	/* assume \n is coming */
204 		case ' ':	/* {WS}+ */
205 		case '\t':
206 			break;
207 		case '#':	/* #.* strip comments */
208 			while ((c = input()) != '\n' && c != 0)
209 				;
210 			unput(c);
211 			break;
212 		case ';':
213 			RET(';');
214 		case '\\':
215 			if (peek() == '\n') {
216 				input();
217 				lineno++;
218 			} else if (peek() == '\r') {
219 				input(); input();	/* \n */
220 				lineno++;
221 			} else {
222 				RET(c);
223 			}
224 			break;
225 		case '&':
226 			if (peek() == '&') {
227 				input(); RET(AND);
228 			} else
229 				RET('&');
230 		case '|':
231 			if (peek() == '|') {
232 				input(); RET(BOR);
233 			} else
234 				RET('|');
235 		case '!':
236 			if (peek() == '=') {
237 				input(); yylval.i = NE; RET(NE);
238 			} else if (peek() == '~') {
239 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
240 			} else
241 				RET(NOT);
242 		case '~':
243 			yylval.i = MATCH;
244 			RET(MATCHOP);
245 		case '<':
246 			if (peek() == '=') {
247 				input(); yylval.i = LE; RET(LE);
248 			} else {
249 				yylval.i = LT; RET(LT);
250 			}
251 		case '=':
252 			if (peek() == '=') {
253 				input(); yylval.i = EQ; RET(EQ);
254 			} else {
255 				yylval.i = ASSIGN; RET(ASGNOP);
256 			}
257 		case '>':
258 			if (peek() == '=') {
259 				input(); yylval.i = GE; RET(GE);
260 			} else if (peek() == '>') {
261 				input(); yylval.i = APPEND; RET(APPEND);
262 			} else {
263 				yylval.i = GT; RET(GT);
264 			}
265 		case '+':
266 			if (peek() == '+') {
267 				input(); yylval.i = INCR; RET(INCR);
268 			} else if (peek() == '=') {
269 				input(); yylval.i = ADDEQ; RET(ASGNOP);
270 			} else
271 				RET('+');
272 		case '-':
273 			if (peek() == '-') {
274 				input(); yylval.i = DECR; RET(DECR);
275 			} else if (peek() == '=') {
276 				input(); yylval.i = SUBEQ; RET(ASGNOP);
277 			} else
278 				RET('-');
279 		case '*':
280 			if (peek() == '=') {	/* *= */
281 				input(); yylval.i = MULTEQ; RET(ASGNOP);
282 			} else if (peek() == '*') {	/* ** or **= */
283 				input();	/* eat 2nd * */
284 				if (peek() == '=') {
285 					input(); yylval.i = POWEQ; RET(ASGNOP);
286 				} else {
287 					RET(POWER);
288 				}
289 			} else
290 				RET('*');
291 		case '/':
292 			RET('/');
293 		case '%':
294 			if (peek() == '=') {
295 				input(); yylval.i = MODEQ; RET(ASGNOP);
296 			} else
297 				RET('%');
298 		case '^':
299 			if (peek() == '=') {
300 				input(); yylval.i = POWEQ; RET(ASGNOP);
301 			} else
302 				RET(POWER);
303 
304 		case '$':
305 			/* BUG: awkward, if not wrong */
306 			c = gettok(&buf, &bufsize);
307 			if (isalpha(c)) {
308 				if (strcmp(buf, "NF") == 0) {	/* very special */
309 					unputstr("(NF)");
310 					RET(INDIRECT);
311 				}
312 				c = peek();
313 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
314 					unputstr(buf);
315 					RET(INDIRECT);
316 				}
317 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
318 				RET(IVAR);
319 			} else if (c == 0) {	/*  */
320 				SYNTAX( "unexpected end of input after $" );
321 				RET(';');
322 			} else {
323 				unputstr(buf);
324 				RET(INDIRECT);
325 			}
326 
327 		case '}':
328 			if (--bracecnt < 0)
329 				SYNTAX( "extra }" );
330 			sc = 1;
331 			RET(';');
332 		case ']':
333 			if (--brackcnt < 0)
334 				SYNTAX( "extra ]" );
335 			RET(']');
336 		case ')':
337 			if (--parencnt < 0)
338 				SYNTAX( "extra )" );
339 			RET(')');
340 		case '{':
341 			bracecnt++;
342 			RET('{');
343 		case '[':
344 			brackcnt++;
345 			RET('[');
346 		case '(':
347 			parencnt++;
348 			RET('(');
349 
350 		case '"':
351 			return string();	/* BUG: should be like tran.c ? */
352 
353 		default:
354 			RET(c);
355 		}
356 	}
357 }
358 
string(void)359 int string(void)
360 {
361 	int c, n;
362 	char *s, *bp;
363 	static char *buf = 0;
364 	static int bufsz = 500;
365 
366 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
367 		FATAL("out of space for strings");
368 	for (bp = buf; (c = input()) != '"'; ) {
369 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
370 			FATAL("out of space for string %.10s...", buf);
371 		switch (c) {
372 		case '\n':
373 		case '\r':
374 		case 0:
375 			*bp = '\0';
376 			SYNTAX( "non-terminated string %.10s...", buf );
377 			if (c == 0)	/* hopeless */
378 				FATAL( "giving up" );
379 			lineno++;
380 			break;
381 		case '\\':
382 			c = input();
383 			switch (c) {
384 			case '"': *bp++ = '"'; break;
385 			case 'n': *bp++ = '\n'; break;
386 			case 't': *bp++ = '\t'; break;
387 			case 'f': *bp++ = '\f'; break;
388 			case 'r': *bp++ = '\r'; break;
389 			case 'b': *bp++ = '\b'; break;
390 			case 'v': *bp++ = '\v'; break;
391 			case 'a': *bp++ = '\007'; break;
392 			case '\\': *bp++ = '\\'; break;
393 
394 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
395 			case '3': case '4': case '5': case '6': case '7':
396 				n = c - '0';
397 				if ((c = peek()) >= '0' && c < '8') {
398 					n = 8 * n + input() - '0';
399 					if ((c = peek()) >= '0' && c < '8')
400 						n = 8 * n + input() - '0';
401 				}
402 				*bp++ = n;
403 				break;
404 
405 			case 'x':	/* hex  \x0-9a-fA-F + */
406 			    {	char xbuf[100], *px;
407 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
408 					if (isdigit(c)
409 					 || (c >= 'a' && c <= 'f')
410 					 || (c >= 'A' && c <= 'F'))
411 						*px++ = c;
412 					else
413 						break;
414 				}
415 				*px = 0;
416 				unput(c);
417 	  			sscanf(xbuf, "%x", (unsigned int *) &n);
418 				*bp++ = n;
419 				break;
420 			    }
421 
422 			default:
423 				*bp++ = c;
424 				break;
425 			}
426 			break;
427 		default:
428 			*bp++ = c;
429 			break;
430 		}
431 	}
432 	*bp = 0;
433 	s = tostring(buf);
434 	*bp++ = ' '; *bp++ = 0;
435 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
436 	RET(STRING);
437 }
438 
439 
/*
 * binsearch: look up w among the first n entries of kp, which must be
 * sorted by strcmp() order of their word field.
 * Returns the index of the matching entry, or -1 when w is not present.
 */
int binsearch(char *w, Keyword *kp, int n)
{
	int lo = 0;
	int hi = n - 1;

	while (lo <= hi) {
		int probe = (lo + hi) / 2;
		int cmp = strcmp(w, kp[probe].word);

		if (cmp == 0)
			return probe;
		if (cmp < 0)
			hi = probe - 1;
		else
			lo = probe + 1;
	}
	return -1;
}
457 
word(char * w)458 int word(char *w)
459 {
460 	Keyword *kp;
461 	int c, n;
462 
463 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
464 /* BUG: this ought to be inside the if; in theory could fault (daniel barrett) */
465 	kp = keywords + n;
466 	if (n != -1) {	/* found in table */
467 		yylval.i = kp->sub;
468 		switch (kp->type) {	/* special handling */
469 		case BLTIN:
470 			if (kp->sub == FSYSTEM && safe)
471 				SYNTAX( "system is unsafe" );
472 			RET(kp->type);
473 		case FUNC:
474 			if (infunc)
475 				SYNTAX( "illegal nested function" );
476 			RET(kp->type);
477 		case RETURN:
478 			if (!infunc)
479 				SYNTAX( "return not in function" );
480 			RET(kp->type);
481 		case VARNF:
482 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
483 			RET(VARNF);
484 		default:
485 			RET(kp->type);
486 		}
487 	}
488 	c = peek();	/* look for '(' */
489 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
490 		yylval.i = n;
491 		RET(ARG);
492 	} else {
493 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
494 		if (c == '(') {
495 			RET(CALL);
496 		} else {
497 			RET(VAR);
498 		}
499 	}
500 }
501 
startreg(void)502 void startreg(void)	/* next call to yylex will return a regular expression */
503 {
504 	reg = 1;
505 }
506 
regexpr(void)507 int regexpr(void)
508 {
509 	int c;
510 	static char *buf = 0;
511 	static int bufsz = 500;
512 	char *bp;
513 
514 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
515 		FATAL("out of space for rex expr");
516 	bp = buf;
517 	for ( ; (c = input()) != '/' && c != 0; ) {
518 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
519 			FATAL("out of space for reg expr %.10s...", buf);
520 		if (c == '\n') {
521 			*bp = '\0';
522 			SYNTAX( "newline in regular expression %.10s...", buf );
523 			unput('\n');
524 			break;
525 		} else if (c == '\\') {
526 			*bp++ = '\\';
527 			*bp++ = input();
528 		} else {
529 			*bp++ = c;
530 		}
531 	}
532 	*bp = 0;
533 	if (c == 0)
534 		SYNTAX("non-terminated regular expression %.10s...", buf);
535 	yylval.s = tostring(buf);
536 	unput('/');
537 	RET(REGEXPR);
538 }
539 
/* low-level lexical stuff, sort of inherited from lex */

char	ebuf[300];		/* circular record of characters read by input() */
char	*ep = ebuf;		/* where input() stores the next character in ebuf */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed-back character */
FILE	*yyin = 0;		/* not referenced by the routines in this part of the file */
547 
input(void)548 int input(void)	/* get next lexical input character */
549 {
550 	int c;
551 	extern char *lexprog;
552 
553 	if (yysptr > yysbuf)
554 		c = (uschar)*--yysptr;
555 	else if (lexprog != NULL) {	/* awk '...' */
556 		if ((c = (uschar)*lexprog) != 0)
557 			lexprog++;
558 	} else				/* awk -f ... */
559 		c = pgetc();
560 	if (c == EOF)
561 		c = 0;
562 	if (ep >= ebuf + sizeof ebuf)
563 		ep = ebuf;
564 	*ep = c;
565 	if (c != 0) {
566 		ep++;
567 	}
568 	return (c);
569 }
570 
unput(int c)571 void unput(int c)	/* put lexical character back on input */
572 {
573 	if (yysptr >= yysbuf + sizeof(yysbuf))
574 		FATAL("pushed back too much: %.20s...", yysbuf);
575 	*yysptr++ = c;
576 	if (--ep < ebuf)
577 		ep = ebuf + sizeof(ebuf) - 1;
578 }
579 
/*
 * unputstr: put a string back on input.  Characters are pushed last to
 * first so that subsequent input() calls return them in their original order.
 */
void unputstr(const char *s)
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
587