1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #define	DEBUG
28 
29 #include <ctype.h>
30 #include <limits.h>
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include "awk.h"
35 #include "awkgram.tab.h"
36 
37 #define MAXLIN 22
38 
39 #define type(v)		(v)->nobj	/* badly overloaded here */
40 #define info(v)		(v)->ntype	/* badly overloaded here */
41 #define left(v)		(v)->narg[0]
42 #define right(v)	(v)->narg[1]
43 #define parent(v)	(v)->nnext
44 
45 #define LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46 #define ELEAF	case EMPTYRE:		/* empty string in regexp */
47 #define UNARY	case STAR: case PLUS: case QUEST:
48 
49 /* encoding in tree Nodes:
50 	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
51 		left is index, right contains value or pointer to value
52 	unary (STAR, PLUS, QUEST): left is child, right is null
53 	binary (CAT, OR): left and right are children
54 	parent contains pointer to parent
55 */
56 
57 
58 int	*setvec;
59 int	*tmpset;
60 int	maxsetvec = 0;
61 
62 int	rtok;		/* next token in current re */
63 int	rlxval;
64 static const uschar	*rlxstr;
65 static const uschar	*prestr;	/* current position in current re */
66 static const uschar	*lastre;	/* origin of last re */
67 static const uschar	*lastatom;	/* origin of last Atom */
68 static const uschar	*starttok;
69 static const uschar 	*basestr;	/* starts with original, replaced during
70 				   repetition processing */
71 static const uschar 	*firstbasestr;
72 
73 static	int setcnt;
74 static	int poscnt;
75 
76 const char	*patbeg;
77 int	patlen;
78 
79 #define	NFA	128	/* cache this many dynamic fa's */
80 fa	*fatab[NFA];
81 int	nfatab	= 0;	/* entries in fatab */
82 
83 static int *
intalloc(size_t n,const char * f)84 intalloc(size_t n, const char *f)
85 {
86 	int *p = (int *) calloc(n, sizeof(int));
87 	if (p == NULL)
88 		overflo(f);
89 	return p;
90 }
91 
92 static void
resizesetvec(const char * f)93 resizesetvec(const char *f)
94 {
95 	if (maxsetvec == 0)
96 		maxsetvec = MAXLIN;
97 	else
98 		maxsetvec *= 4;
99 	setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
100 	tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
101 	if (setvec == NULL || tmpset == NULL)
102 		overflo(f);
103 }
104 
105 static void
resize_state(fa * f,int state)106 resize_state(fa *f, int state)
107 {
108 	unsigned int **p;
109 	uschar *p2;
110 	int **p3;
111 	int i, new_count;
112 
113 	if (++state < f->state_count)
114 		return;
115 
116 	new_count = state + 10; /* needs to be tuned */
117 
118 	p = (unsigned int **) realloc(f->gototab, new_count * sizeof(f->gototab[0]));
119 	if (p == NULL)
120 		goto out;
121 	f->gototab = p;
122 
123 	p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
124 	if (p2 == NULL)
125 		goto out;
126 	f->out = p2;
127 
128 	p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
129 	if (p3 == NULL)
130 		goto out;
131 	f->posns = p3;
132 
133 	for (i = f->state_count; i < new_count; ++i) {
134 		f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab));
135 		if (f->gototab[i] == NULL)
136 			goto out;
137 		f->out[i]  = 0;
138 		f->posns[i] = NULL;
139 	}
140 	f->state_count = new_count;
141 	return;
142 out:
143 	overflo(__func__);
144 }
145 
makedfa(const char * s,bool anchor)146 fa *makedfa(const char *s, bool anchor)	/* returns dfa for reg expr s */
147 {
148 	int i, use, nuse;
149 	fa *pfa;
150 	static int now = 1;
151 
152 	if (setvec == NULL) {	/* first time through any RE */
153 		resizesetvec(__func__);
154 	}
155 
156 	if (compile_time != RUNNING)	/* a constant for sure */
157 		return mkdfa(s, anchor);
158 	for (i = 0; i < nfatab; i++)	/* is it there already? */
159 		if (fatab[i]->anchor == anchor
160 		  && strcmp((const char *) fatab[i]->restr, s) == 0) {
161 			fatab[i]->use = now++;
162 			return fatab[i];
163 		}
164 	pfa = mkdfa(s, anchor);
165 	if (nfatab < NFA) {	/* room for another */
166 		fatab[nfatab] = pfa;
167 		fatab[nfatab]->use = now++;
168 		nfatab++;
169 		return pfa;
170 	}
171 	use = fatab[0]->use;	/* replace least-recently used */
172 	nuse = 0;
173 	for (i = 1; i < nfatab; i++)
174 		if (fatab[i]->use < use) {
175 			use = fatab[i]->use;
176 			nuse = i;
177 		}
178 	freefa(fatab[nuse]);
179 	fatab[nuse] = pfa;
180 	pfa->use = now++;
181 	return pfa;
182 }
183 
mkdfa(const char * s,bool anchor)184 fa *mkdfa(const char *s, bool anchor)	/* does the real work of making a dfa */
185 				/* anchor = true for anchored matches, else false */
186 {
187 	Node *p, *p1;
188 	fa *f;
189 
190 	firstbasestr = (const uschar *) s;
191 	basestr = firstbasestr;
192 	p = reparse(s);
193 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
194 		/* put ALL STAR in front of reg.  exp. */
195 	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
196 		/* put FINAL after reg.  exp. */
197 
198 	poscnt = 0;
199 	penter(p1);	/* enter parent pointers and leaf indices */
200 	if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
201 		overflo(__func__);
202 	f->accept = poscnt-1;	/* penter has computed number of positions in re */
203 	cfoll(f, p1);	/* set up follow sets */
204 	freetr(p1);
205 	resize_state(f, 1);
206 	f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
207 	f->posns[1] = intalloc(1, __func__);
208 	*f->posns[1] = 0;
209 	f->initstat = makeinit(f, anchor);
210 	f->anchor = anchor;
211 	f->restr = (uschar *) tostring(s);
212 	if (firstbasestr != basestr) {
213 		if (basestr)
214 			xfree(basestr);
215 	}
216 	return f;
217 }
218 
makeinit(fa * f,bool anchor)219 int makeinit(fa *f, bool anchor)
220 {
221 	int i, k;
222 
223 	f->curstat = 2;
224 	f->out[2] = 0;
225 	k = *(f->re[0].lfollow);
226 	xfree(f->posns[2]);
227 	f->posns[2] = intalloc(k + 1,  __func__);
228 	for (i = 0; i <= k; i++) {
229 		(f->posns[2])[i] = (f->re[0].lfollow)[i];
230 	}
231 	if ((f->posns[2])[1] == f->accept)
232 		f->out[2] = 1;
233 	for (i = 0; i < NCHARS; i++)
234 		f->gototab[2][i] = 0;
235 	f->curstat = cgoto(f, 2, HAT);
236 	if (anchor) {
237 		*f->posns[2] = k-1;	/* leave out position 0 */
238 		for (i = 0; i < k; i++) {
239 			(f->posns[0])[i] = (f->posns[2])[i];
240 		}
241 
242 		f->out[0] = f->out[2];
243 		if (f->curstat != 2)
244 			--(*f->posns[f->curstat]);
245 	}
246 	return f->curstat;
247 }
248 
penter(Node * p)249 void penter(Node *p)	/* set up parent pointers and leaf indices */
250 {
251 	switch (type(p)) {
252 	ELEAF
253 	LEAF
254 		info(p) = poscnt;
255 		poscnt++;
256 		break;
257 	UNARY
258 		penter(left(p));
259 		parent(left(p)) = p;
260 		break;
261 	case CAT:
262 	case OR:
263 		penter(left(p));
264 		penter(right(p));
265 		parent(left(p)) = p;
266 		parent(right(p)) = p;
267 		break;
268 	case ZERO:
269 		break;
270 	default:	/* can't happen */
271 		FATAL("can't happen: unknown type %d in penter", type(p));
272 		break;
273 	}
274 }
275 
freetr(Node * p)276 void freetr(Node *p)	/* free parse tree */
277 {
278 	switch (type(p)) {
279 	ELEAF
280 	LEAF
281 		xfree(p);
282 		break;
283 	UNARY
284 	case ZERO:
285 		freetr(left(p));
286 		xfree(p);
287 		break;
288 	case CAT:
289 	case OR:
290 		freetr(left(p));
291 		freetr(right(p));
292 		xfree(p);
293 		break;
294 	default:	/* can't happen */
295 		FATAL("can't happen: unknown type %d in freetr", type(p));
296 		break;
297 	}
298 }
299 
300 /* in the parsing of regular expressions, metacharacters like . have */
301 /* to be seen literally;  \056 is not a metacharacter. */
302 
hexstr(const uschar ** pp)303 int hexstr(const uschar **pp)	/* find and eval hex string at pp, return new p */
304 {			/* only pick up one 8-bit byte (2 chars) */
305 	const uschar *p;
306 	int n = 0;
307 	int i;
308 
309 	for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
310 		if (isdigit(*p))
311 			n = 16 * n + *p - '0';
312 		else if (*p >= 'a' && *p <= 'f')
313 			n = 16 * n + *p - 'a' + 10;
314 		else if (*p >= 'A' && *p <= 'F')
315 			n = 16 * n + *p - 'A' + 10;
316 	}
317 	*pp = p;
318 	return n;
319 }
320 
321 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */
322 
quoted(const uschar ** pp)323 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
324 			/* and increment *pp */
325 {
326 	const uschar *p = *pp;
327 	int c;
328 
329 	if ((c = *p++) == 't')
330 		c = '\t';
331 	else if (c == 'n')
332 		c = '\n';
333 	else if (c == 'f')
334 		c = '\f';
335 	else if (c == 'r')
336 		c = '\r';
337 	else if (c == 'b')
338 		c = '\b';
339 	else if (c == 'v')
340 		c = '\v';
341 	else if (c == 'a')
342 		c = '\a';
343 	else if (c == '\\')
344 		c = '\\';
345 	else if (c == 'x') {	/* hexadecimal goo follows */
346 		c = hexstr(&p);	/* this adds a null if number is invalid */
347 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
348 		int n = c - '0';
349 		if (isoctdigit(*p)) {
350 			n = 8 * n + *p++ - '0';
351 			if (isoctdigit(*p))
352 				n = 8 * n + *p++ - '0';
353 		}
354 		c = n;
355 	} /* else */
356 		/* c = c; */
357 	*pp = p;
358 	return c;
359 }
360 
cclenter(const char * argp)361 char *cclenter(const char *argp)	/* add a character class */
362 {
363 	int i, c, c2;
364 	const uschar *op, *p = (const uschar *) argp;
365 	uschar *bp;
366 	static uschar *buf = NULL;
367 	static int bufsz = 100;
368 
369 	op = p;
370 	if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
371 		FATAL("out of space for character class [%.10s...] 1", p);
372 	bp = buf;
373 	for (i = 0; (c = *p++) != 0; ) {
374 		if (c == '\\') {
375 			c = quoted(&p);
376 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
377 			if (*p != 0) {
378 				c = bp[-1];
379 				c2 = *p++;
380 				if (c2 == '\\')
381 					c2 = quoted(&p);
382 				if (c > c2) {	/* empty; ignore */
383 					bp--;
384 					i--;
385 					continue;
386 				}
387 				while (c < c2) {
388 					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
389 						FATAL("out of space for character class [%.10s...] 2", p);
390 					*bp++ = ++c;
391 					i++;
392 				}
393 				continue;
394 			}
395 		}
396 		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
397 			FATAL("out of space for character class [%.10s...] 3", p);
398 		*bp++ = c;
399 		i++;
400 	}
401 	*bp = 0;
402 	DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf);
403 	xfree(op);
404 	return (char *) tostring((char *) buf);
405 }
406 
overflo(const char * s)407 void overflo(const char *s)
408 {
409 	FATAL("regular expression too big: out of space in %.30s...", s);
410 }
411 
cfoll(fa * f,Node * v)412 void cfoll(fa *f, Node *v)	/* enter follow set of each leaf of vertex v into lfollow[leaf] */
413 {
414 	int i;
415 	int *p;
416 
417 	switch (type(v)) {
418 	ELEAF
419 	LEAF
420 		f->re[info(v)].ltype = type(v);
421 		f->re[info(v)].lval.np = right(v);
422 		while (f->accept >= maxsetvec) {	/* guessing here! */
423 			resizesetvec(__func__);
424 		}
425 		for (i = 0; i <= f->accept; i++)
426 			setvec[i] = 0;
427 		setcnt = 0;
428 		follow(v);	/* computes setvec and setcnt */
429 		p = intalloc(setcnt + 1, __func__);
430 		f->re[info(v)].lfollow = p;
431 		*p = setcnt;
432 		for (i = f->accept; i >= 0; i--)
433 			if (setvec[i] == 1)
434 				*++p = i;
435 		break;
436 	UNARY
437 		cfoll(f,left(v));
438 		break;
439 	case CAT:
440 	case OR:
441 		cfoll(f,left(v));
442 		cfoll(f,right(v));
443 		break;
444 	case ZERO:
445 		break;
446 	default:	/* can't happen */
447 		FATAL("can't happen: unknown type %d in cfoll", type(v));
448 	}
449 }
450 
first(Node * p)451 int first(Node *p)	/* collects initially active leaves of p into setvec */
452 			/* returns 0 if p matches empty string */
453 {
454 	int b, lp;
455 
456 	switch (type(p)) {
457 	ELEAF
458 	LEAF
459 		lp = info(p);	/* look for high-water mark of subscripts */
460 		while (setcnt >= maxsetvec || lp >= maxsetvec) {	/* guessing here! */
461 			resizesetvec(__func__);
462 		}
463 		if (type(p) == EMPTYRE) {
464 			setvec[lp] = 0;
465 			return(0);
466 		}
467 		if (setvec[lp] != 1) {
468 			setvec[lp] = 1;
469 			setcnt++;
470 		}
471 		if (type(p) == CCL && (*(char *) right(p)) == '\0')
472 			return(0);		/* empty CCL */
473 		return(1);
474 	case PLUS:
475 		if (first(left(p)) == 0)
476 			return(0);
477 		return(1);
478 	case STAR:
479 	case QUEST:
480 		first(left(p));
481 		return(0);
482 	case CAT:
483 		if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
484 		return(1);
485 	case OR:
486 		b = first(right(p));
487 		if (first(left(p)) == 0 || b == 0) return(0);
488 		return(1);
489 	case ZERO:
490 		return 0;
491 	}
492 	FATAL("can't happen: unknown type %d in first", type(p));	/* can't happen */
493 	return(-1);
494 }
495 
follow(Node * v)496 void follow(Node *v)	/* collects leaves that can follow v into setvec */
497 {
498 	Node *p;
499 
500 	if (type(v) == FINAL)
501 		return;
502 	p = parent(v);
503 	switch (type(p)) {
504 	case STAR:
505 	case PLUS:
506 		first(v);
507 		follow(p);
508 		return;
509 
510 	case OR:
511 	case QUEST:
512 		follow(p);
513 		return;
514 
515 	case CAT:
516 		if (v == left(p)) {	/* v is left child of p */
517 			if (first(right(p)) == 0) {
518 				follow(p);
519 				return;
520 			}
521 		} else		/* v is right child */
522 			follow(p);
523 		return;
524 	}
525 }
526 
member(int c,const char * sarg)527 int member(int c, const char *sarg)	/* is c in s? */
528 {
529 	const uschar *s = (const uschar *) sarg;
530 
531 	while (*s)
532 		if (c == *s++)
533 			return(1);
534 	return(0);
535 }
536 
match(fa * f,const char * p0)537 int match(fa *f, const char *p0)	/* shortest match ? */
538 {
539 	int s, ns;
540 	const uschar *p = (const uschar *) p0;
541 
542 	s = f->initstat;
543 	assert (s < f->state_count);
544 
545 	if (f->out[s])
546 		return(1);
547 	do {
548 		/* assert(*p < NCHARS); */
549 		if ((ns = f->gototab[s][*p]) != 0)
550 			s = ns;
551 		else
552 			s = cgoto(f, s, *p);
553 		if (f->out[s])
554 			return(1);
555 	} while (*p++ != 0);
556 	return(0);
557 }
558 
pmatch(fa * f,const char * p0)559 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
560 {
561 	int s, ns;
562 	const uschar *p = (const uschar *) p0;
563 	const uschar *q;
564 
565 	s = f->initstat;
566 	assert(s < f->state_count);
567 
568 	patbeg = (const char *)p;
569 	patlen = -1;
570 	do {
571 		q = p;
572 		do {
573 			if (f->out[s])		/* final state */
574 				patlen = q-p;
575 			/* assert(*q < NCHARS); */
576 			if ((ns = f->gototab[s][*q]) != 0)
577 				s = ns;
578 			else
579 				s = cgoto(f, s, *q);
580 
581 			assert(s < f->state_count);
582 
583 			if (s == 1) {	/* no transition */
584 				if (patlen >= 0) {
585 					patbeg = (const char *) p;
586 					return(1);
587 				}
588 				else
589 					goto nextin;	/* no match */
590 			}
591 		} while (*q++ != 0);
592 		if (f->out[s])
593 			patlen = q-p-1;	/* don't count $ */
594 		if (patlen >= 0) {
595 			patbeg = (const char *) p;
596 			return(1);
597 		}
598 	nextin:
599 		s = 2;
600 	} while (*p++);
601 	return (0);
602 }
603 
nematch(fa * f,const char * p0)604 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
605 {
606 	int s, ns;
607 	const uschar *p = (const uschar *) p0;
608 	const uschar *q;
609 
610 	s = f->initstat;
611 	assert(s < f->state_count);
612 
613 	patbeg = (const char *)p;
614 	patlen = -1;
615 	while (*p) {
616 		q = p;
617 		do {
618 			if (f->out[s])		/* final state */
619 				patlen = q-p;
620 			/* assert(*q < NCHARS); */
621 			if ((ns = f->gototab[s][*q]) != 0)
622 				s = ns;
623 			else
624 				s = cgoto(f, s, *q);
625 			if (s == 1) {	/* no transition */
626 				if (patlen > 0) {
627 					patbeg = (const char *) p;
628 					return(1);
629 				} else
630 					goto nnextin;	/* no nonempty match */
631 			}
632 		} while (*q++ != 0);
633 		if (f->out[s])
634 			patlen = q-p-1;	/* don't count $ */
635 		if (patlen > 0 ) {
636 			patbeg = (const char *) p;
637 			return(1);
638 		}
639 	nnextin:
640 		s = 2;
641 		p++;
642 	}
643 	return (0);
644 }
645 
646 
647 /*
648  * NAME
649  *     fnematch
650  *
651  * DESCRIPTION
652  *     A stream-fed version of nematch which transfers characters to a
653  *     null-terminated buffer. All characters up to and including the last
654  *     character of the matching text or EOF are placed in the buffer. If
655  *     a match is found, patbeg and patlen are set appropriately.
656  *
657  * RETURN VALUES
658  *     false    No match found.
659  *     true     Match found.
660  */
661 
fnematch(fa * pfa,FILE * f,char ** pbuf,int * pbufsize,int quantum)662 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
663 {
664 	char *buf = *pbuf;
665 	int bufsize = *pbufsize;
666 	int c, i, j, k, ns, s;
667 
668 	s = pfa->initstat;
669 	patlen = 0;
670 
671 	/*
672 	 * All indices relative to buf.
673 	 * i <= j <= k <= bufsize
674 	 *
675 	 * i: origin of active substring
676 	 * j: current character
677 	 * k: destination of next getc()
678 	 */
679 	i = -1, k = 0;
680         do {
681 		j = i++;
682 		do {
683 			if (++j == k) {
684 				if (k == bufsize)
685 					if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
686 						FATAL("stream '%.30s...' too long", buf);
687 				buf[k++] = (c = getc(f)) != EOF ? c : 0;
688 			}
689 			c = (uschar)buf[j];
690 			/* assert(c < NCHARS); */
691 
692 			if ((ns = pfa->gototab[s][c]) != 0)
693 				s = ns;
694 			else
695 				s = cgoto(pfa, s, c);
696 
697 			if (pfa->out[s]) {	/* final state */
698 				patlen = j - i + 1;
699 				if (c == 0)	/* don't count $ */
700 					patlen--;
701 			}
702 		} while (buf[j] && s != 1);
703 		s = 2;
704 	} while (buf[i] && !patlen);
705 
706 	/* adjbuf() may have relocated a resized buffer. Inform the world. */
707 	*pbuf = buf;
708 	*pbufsize = bufsize;
709 
710 	if (patlen) {
711 		patbeg = (char *) buf + i;
712 		/*
713 		 * Under no circumstances is the last character fed to
714 		 * the automaton part of the match. It is EOF's nullbyte,
715 		 * or it sent the automaton into a state with no further
716 		 * transitions available (s==1), or both. Room for a
717 		 * terminating nullbyte is guaranteed.
718 		 *
719 		 * ungetc any chars after the end of matching text
720 		 * (except for EOF's nullbyte, if present) and null
721 		 * terminate the buffer.
722 		 */
723 		do
724 			if (buf[--k] && ungetc(buf[k], f) == EOF)
725 				FATAL("unable to ungetc '%c'", buf[k]);
726 		while (k > i + patlen);
727 		buf[k] = '\0';
728 		return true;
729 	}
730 	else
731 		return false;
732 }
733 
reparse(const char * p)734 Node *reparse(const char *p)	/* parses regular expression pointed to by p */
735 {			/* uses relex() to scan regular expression */
736 	Node *np;
737 
738 	DPRINTF("reparse <%s>\n", p);
739 	lastre = prestr = (const uschar *) p;	/* prestr points to string to be parsed */
740 	rtok = relex();
741 	/* GNU compatibility: an empty regexp matches anything */
742 	if (rtok == '\0') {
743 		/* FATAL("empty regular expression"); previous */
744 		return(op2(EMPTYRE, NIL, NIL));
745 	}
746 	np = regexp();
747 	if (rtok != '\0')
748 		FATAL("syntax error in regular expression %s at %s", lastre, prestr);
749 	return(np);
750 }
751 
regexp(void)752 Node *regexp(void)	/* top-level parse of reg expr */
753 {
754 	return (alt(concat(primary())));
755 }
756 
primary(void)757 Node *primary(void)
758 {
759 	Node *np;
760 	int savelastatom;
761 
762 	switch (rtok) {
763 	case CHAR:
764 		lastatom = starttok;
765 		np = op2(CHAR, NIL, itonp(rlxval));
766 		rtok = relex();
767 		return (unary(np));
768 	case ALL:
769 		rtok = relex();
770 		return (unary(op2(ALL, NIL, NIL)));
771 	case EMPTYRE:
772 		rtok = relex();
773 		return (unary(op2(EMPTYRE, NIL, NIL)));
774 	case DOT:
775 		lastatom = starttok;
776 		rtok = relex();
777 		return (unary(op2(DOT, NIL, NIL)));
778 	case CCL:
779 		np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
780 		lastatom = starttok;
781 		rtok = relex();
782 		return (unary(np));
783 	case NCCL:
784 		np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
785 		lastatom = starttok;
786 		rtok = relex();
787 		return (unary(np));
788 	case '^':
789 		rtok = relex();
790 		return (unary(op2(CHAR, NIL, itonp(HAT))));
791 	case '$':
792 		rtok = relex();
793 		return (unary(op2(CHAR, NIL, NIL)));
794 	case '(':
795 		lastatom = starttok;
796 		savelastatom = starttok - basestr; /* Retain over recursion */
797 		rtok = relex();
798 		if (rtok == ')') {	/* special pleading for () */
799 			rtok = relex();
800 			return unary(op2(CCL, NIL, (Node *) tostring("")));
801 		}
802 		np = regexp();
803 		if (rtok == ')') {
804 			lastatom = basestr + savelastatom; /* Restore */
805 			rtok = relex();
806 			return (unary(np));
807 		}
808 		else
809 			FATAL("syntax error in regular expression %s at %s", lastre, prestr);
810 	default:
811 		FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
812 	}
813 	return 0;	/*NOTREACHED*/
814 }
815 
concat(Node * np)816 Node *concat(Node *np)
817 {
818 	switch (rtok) {
819 	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
820 		return (concat(op2(CAT, np, primary())));
821 	case EMPTYRE:
822 		rtok = relex();
823 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
824 				primary())));
825 	}
826 	return (np);
827 }
828 
alt(Node * np)829 Node *alt(Node *np)
830 {
831 	if (rtok == OR) {
832 		rtok = relex();
833 		return (alt(op2(OR, np, concat(primary()))));
834 	}
835 	return (np);
836 }
837 
unary(Node * np)838 Node *unary(Node *np)
839 {
840 	switch (rtok) {
841 	case STAR:
842 		rtok = relex();
843 		return (unary(op2(STAR, np, NIL)));
844 	case PLUS:
845 		rtok = relex();
846 		return (unary(op2(PLUS, np, NIL)));
847 	case QUEST:
848 		rtok = relex();
849 		return (unary(op2(QUEST, np, NIL)));
850 	case ZERO:
851 		rtok = relex();
852 		return (unary(op2(ZERO, np, NIL)));
853 	default:
854 		return (np);
855 	}
856 }
857 
858 /*
859  * Character class definitions conformant to the POSIX locale as
860  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
861  * and operating character sets are both ASCII (ISO646) or supersets
862  * thereof.
863  *
864  * Note that to avoid overflowing the temporary buffer used in
865  * relex(), the expanded character class (prior to range expansion)
866  * must be less than twice the size of their full name.
867  */
868 
869 /* Because isblank doesn't show up in any of the header files on any
870  * system i use, it's defined here.  if some other locale has a richer
871  * definition of "blank", define HAS_ISBLANK and provide your own
872  * version.
873  * the parentheses here are an attempt to find a path through the maze
874  * of macro definition and/or function and/or version provided.  thanks
875  * to nelson beebe for the suggestion; let's see if it works everywhere.
876  */
877 
878 /* #define HAS_ISBLANK */
879 #ifndef HAS_ISBLANK
880 
881 int (xisblank)(int c)
882 {
883 	return c==' ' || c=='\t';
884 }
885 
886 #endif
887 
888 static const struct charclass {
889 	const char *cc_name;
890 	int cc_namelen;
891 	int (*cc_func)(int);
892 } charclasses[] = {
893 	{ "alnum",	5,	isalnum },
894 	{ "alpha",	5,	isalpha },
895 #ifndef HAS_ISBLANK
896 	{ "blank",	5,	xisblank },
897 #else
898 	{ "blank",	5,	isblank },
899 #endif
900 	{ "cntrl",	5,	iscntrl },
901 	{ "digit",	5,	isdigit },
902 	{ "graph",	5,	isgraph },
903 	{ "lower",	5,	islower },
904 	{ "print",	5,	isprint },
905 	{ "punct",	5,	ispunct },
906 	{ "space",	5,	isspace },
907 	{ "upper",	5,	isupper },
908 	{ "xdigit",	6,	isxdigit },
909 	{ NULL,		0,	NULL },
910 };
911 
912 #define REPEAT_SIMPLE		0
913 #define REPEAT_PLUS_APPENDED	1
914 #define REPEAT_WITH_Q		2
915 #define REPEAT_ZERO		3
916 
917 static int
replace_repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum,int special_case)918 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
919 	       int atomlen, int firstnum, int secondnum, int special_case)
920 {
921 	int i, j;
922 	uschar *buf = 0;
923 	int ret = 1;
924 	int init_q = (firstnum == 0);		/* first added char will be ? */
925 	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
926 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
927 	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier	*/
928 	int size = prefix_length +  suffix_length;
929 
930 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
931 		size += atomlen*(firstnum-1);
932 	}
933 
934 	/* Adjust size of buffer for special cases */
935 	if (special_case == REPEAT_PLUS_APPENDED) {
936 		size++;		/* for the final + */
937 	} else if (special_case == REPEAT_WITH_Q) {
938 		size += init_q + (atomlen+1)* n_q_reps;
939 	} else if (special_case == REPEAT_ZERO) {
940 		size += 2;	/* just a null ERE: () */
941 	}
942 	if ((buf = (uschar *) malloc(size + 1)) == NULL)
943 		FATAL("out of space in reg expr %.10s..", lastre);
944 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
945 	j = prefix_length;
946 	if (special_case == REPEAT_ZERO) {
947 		j -= atomlen;
948 		buf[j++] = '(';
949 		buf[j++] = ')';
950 	}
951 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
952 		memcpy(&buf[j], atom, atomlen);
953 		j += atomlen;
954 	}
955 	if (special_case == REPEAT_PLUS_APPENDED) {
956 		buf[j++] = '+';
957 	} else if (special_case == REPEAT_WITH_Q) {
958 		if (init_q)
959 			buf[j++] = '?';
960 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
961 			memcpy(&buf[j], atom, atomlen);
962 			j += atomlen;
963 			buf[j++] = '?';
964 		}
965 	}
966 	memcpy(&buf[j], reptok+reptoklen, suffix_length);
967 	if (special_case == REPEAT_ZERO) {
968 		buf[j+suffix_length] = '\0';
969 	} else {
970 		buf[size] = '\0';
971 	}
972 	/* free old basestr */
973 	if (firstbasestr != basestr) {
974 		if (basestr)
975 			xfree(basestr);
976 	}
977 	basestr = buf;
978 	prestr  = buf + prefix_length;
979 	if (special_case == REPEAT_ZERO) {
980 		prestr  -= atomlen;
981 		ret++;
982 	}
983 	return ret;
984 }
985 
repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum)986 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
987 		  int atomlen, int firstnum, int secondnum)
988 {
989 	/*
990 	   In general, the repetition specifier or "bound" is replaced here
991 	   by an equivalent ERE string, repeating the immediately previous atom
992 	   and appending ? and + as needed. Note that the first copy of the
993 	   atom is left in place, except in the special_case of a zero-repeat
994 	   (i.e., {0}).
995 	 */
996 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
997 		if (firstnum < 2) {
998 			/* 0 or 1: should be handled before you get here */
999 			FATAL("internal error");
1000 		} else {
1001 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1002 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
1003 		}
1004 	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
1005 		if (firstnum == 0) {	/* {0} or {0,0} */
1006 			/* This case is unusual because the resulting
1007 			   replacement string might actually be SMALLER than
1008 			   the original ERE */
1009 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1010 					firstnum, secondnum, REPEAT_ZERO);
1011 		} else {		/* (firstnum >= 1) */
1012 			return replace_repeat(reptok, reptoklen, atom, atomlen,
1013 					firstnum, secondnum, REPEAT_SIMPLE);
1014 		}
1015 	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
1016 		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
1017 		return replace_repeat(reptok, reptoklen, atom, atomlen,
1018 					firstnum, secondnum, REPEAT_WITH_Q);
1019 	} else {	/* Error - shouldn't be here (n>m) */
1020 		FATAL("internal error");
1021 	}
1022 	return 0;
1023 }
1024 
relex(void)1025 int relex(void)		/* lexical analyzer for reparse */
1026 {
1027 	int c, n;
1028 	int cflag;
1029 	static uschar *buf = NULL;
1030 	static int bufsz = 100;
1031 	uschar *bp;
1032 	const struct charclass *cc;
1033 	int i;
1034 	int num, m;
1035 	bool commafound, digitfound;
1036 	const uschar *startreptok;
1037 	static int parens = 0;
1038 
1039 rescan:
1040 	starttok = prestr;
1041 
1042 	switch (c = *prestr++) {
1043 	case '|': return OR;
1044 	case '*': return STAR;
1045 	case '+': return PLUS;
1046 	case '?': return QUEST;
1047 	case '.': return DOT;
1048 	case '\0': prestr--; return '\0';
1049 	case '^':
1050 	case '$':
1051 		return c;
1052 	case '(':
1053 		parens++;
1054 		return c;
1055 	case ')':
1056 		if (parens) {
1057 			parens--;
1058 			return c;
1059 		}
1060 		/* unmatched close parenthesis; per POSIX, treat as literal */
1061 		rlxval = c;
1062 		return CHAR;
1063 	case '\\':
1064 		rlxval = quoted(&prestr);
1065 		return CHAR;
1066 	default:
1067 		rlxval = c;
1068 		return CHAR;
1069 	case '[':
1070 		if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1071 			FATAL("out of space in reg expr %.10s..", lastre);
1072 		bp = buf;
1073 		if (*prestr == '^') {
1074 			cflag = 1;
1075 			prestr++;
1076 		}
1077 		else
1078 			cflag = 0;
1079 		n = 2 * strlen((const char *) prestr)+1;
1080 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1081 			FATAL("out of space for reg expr %.10s...", lastre);
1082 		for (; ; ) {
1083 			if ((c = *prestr++) == '\\') {
1084 				*bp++ = '\\';
1085 				if ((c = *prestr++) == '\0')
1086 					FATAL("nonterminated character class %.20s...", lastre);
1087 				*bp++ = c;
1088 			/* } else if (c == '\n') { */
1089 			/* 	FATAL("newline in character class %.20s...", lastre); */
1090 			} else if (c == '[' && *prestr == ':') {
1091 				/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1092 				for (cc = charclasses; cc->cc_name; cc++)
1093 					if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1094 						break;
1095 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1096 				    prestr[2 + cc->cc_namelen] == ']') {
1097 					prestr += cc->cc_namelen + 3;
1098 					/*
1099 					 * BUG: We begin at 1, instead of 0, since we
1100 					 * would otherwise prematurely terminate the
1101 					 * string for classes like [[:cntrl:]]. This
1102 					 * means that we can't match the NUL character,
1103 					 * not without first adapting the entire
1104 					 * program to track each string's length.
1105 					 */
1106 					for (i = 1; i <= UCHAR_MAX; i++) {
1107 						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1108 						    FATAL("out of space for reg expr %.10s...", lastre);
1109 						if (cc->cc_func(i)) {
1110 							/* escape backslash */
1111 							if (i == '\\') {
1112 								*bp++ = '\\';
1113 								n++;
1114 							}
1115 
1116 							*bp++ = i;
1117 							n++;
1118 						}
1119 					}
1120 				} else
1121 					*bp++ = c;
1122 			} else if (c == '[' && *prestr == '.') {
1123 				char collate_char;
1124 				prestr++;
1125 				collate_char = *prestr++;
1126 				if (*prestr == '.' && prestr[1] == ']') {
1127 					prestr += 2;
1128 					/* Found it: map via locale TBD: for
1129 					   now, simply return this char.  This
1130 					   is sufficient to pass conformance
1131 					   test awk.ex 156
1132 					 */
1133 					if (*prestr == ']') {
1134 						prestr++;
1135 						rlxval = collate_char;
1136 						return CHAR;
1137 					}
1138 				}
1139 			} else if (c == '[' && *prestr == '=') {
1140 				char equiv_char;
1141 				prestr++;
1142 				equiv_char = *prestr++;
1143 				if (*prestr == '=' && prestr[1] == ']') {
1144 					prestr += 2;
1145 					/* Found it: map via locale TBD: for now
1146 					   simply return this char. This is
1147 					   sufficient to pass conformance test
1148 					   awk.ex 156
1149 					 */
1150 					if (*prestr == ']') {
1151 						prestr++;
1152 						rlxval = equiv_char;
1153 						return CHAR;
1154 					}
1155 				}
1156 			} else if (c == '\0') {
1157 				FATAL("nonterminated character class %.20s", lastre);
1158 			} else if (bp == buf) {	/* 1st char is special */
1159 				*bp++ = c;
1160 			} else if (c == ']') {
1161 				*bp++ = 0;
1162 				rlxstr = (uschar *) tostring((char *) buf);
1163 				if (cflag == 0)
1164 					return CCL;
1165 				else
1166 					return NCCL;
1167 			} else
1168 				*bp++ = c;
1169 		}
1170 		break;
1171 	case '{':
1172 		if (isdigit(*(prestr))) {
1173 			num = 0;	/* Process as a repetition */
1174 			n = -1; m = -1;
1175 			commafound = false;
1176 			digitfound = false;
1177 			startreptok = prestr-1;
1178 			/* Remember start of previous atom here ? */
1179 		} else {        	/* just a { char, not a repetition */
1180 			rlxval = c;
1181 			return CHAR;
1182                 }
1183 		for (; ; ) {
1184 			if ((c = *prestr++) == '}') {
1185 				if (commafound) {
1186 					if (digitfound) { /* {n,m} */
1187 						m = num;
1188 						if (m < n)
1189 							FATAL("illegal repetition expression: class %.20s",
1190 								lastre);
1191 						if (n == 0 && m == 1) {
1192 							return QUEST;
1193 						}
1194 					} else {	/* {n,} */
1195 						if (n == 0)
1196 							return STAR;
1197 						else if (n == 1)
1198 							return PLUS;
1199 					}
1200 				} else {
1201 					if (digitfound) { /* {n} same as {n,n} */
1202 						n = num;
1203 						m = num;
1204 					} else {	/* {} */
1205 						FATAL("illegal repetition expression: class %.20s",
1206 							lastre);
1207 					}
1208 				}
1209 				if (repeat(starttok, prestr-starttok, lastatom,
1210 					   startreptok - lastatom, n, m) > 0) {
1211 					if (n == 0 && m == 0) {
1212 						return ZERO;
1213 					}
1214 					/* must rescan input for next token */
1215 					goto rescan;
1216 				}
1217 				/* Failed to replace: eat up {...} characters
1218 				   and treat like just PLUS */
1219 				return PLUS;
1220 			} else if (c == '\0') {
1221 				FATAL("nonterminated character class %.20s",
1222 					lastre);
1223 			} else if (isdigit(c)) {
1224 				num = 10 * num + c - '0';
1225 				digitfound = true;
1226 			} else if (c == ',') {
1227 				if (commafound)
1228 					FATAL("illegal repetition expression: class %.20s",
1229 						lastre);
1230 				/* looking for {n,} or {n,m} */
1231 				commafound = true;
1232 				n = num;
1233 				digitfound = false; /* reset */
1234 				num = 0;
1235 			} else {
1236 				FATAL("illegal repetition expression: class %.20s",
1237 					lastre);
1238 			}
1239 		}
1240 		break;
1241 	}
1242 }
1243 
cgoto(fa * f,int s,int c)1244 int cgoto(fa *f, int s, int c)
1245 {
1246 	int *p, *q;
1247 	int i, j, k;
1248 
1249 	assert(c == HAT || c < NCHARS);
1250 	while (f->accept >= maxsetvec) {	/* guessing here! */
1251 		resizesetvec(__func__);
1252 	}
1253 	for (i = 0; i <= f->accept; i++)
1254 		setvec[i] = 0;
1255 	setcnt = 0;
1256 	resize_state(f, s);
1257 	/* compute positions of gototab[s,c] into setvec */
1258 	p = f->posns[s];
1259 	for (i = 1; i <= *p; i++) {
1260 		if ((k = f->re[p[i]].ltype) != FINAL) {
1261 			if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1262 			 || (k == DOT && c != 0 && c != HAT)
1263 			 || (k == ALL && c != 0)
1264 			 || (k == EMPTYRE && c != 0)
1265 			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1266 			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1267 				q = f->re[p[i]].lfollow;
1268 				for (j = 1; j <= *q; j++) {
1269 					if (q[j] >= maxsetvec) {
1270 						resizesetvec(__func__);
1271 					}
1272 					if (setvec[q[j]] == 0) {
1273 						setcnt++;
1274 						setvec[q[j]] = 1;
1275 					}
1276 				}
1277 			}
1278 		}
1279 	}
1280 	/* determine if setvec is a previous state */
1281 	tmpset[0] = setcnt;
1282 	j = 1;
1283 	for (i = f->accept; i >= 0; i--)
1284 		if (setvec[i]) {
1285 			tmpset[j++] = i;
1286 		}
1287 	resize_state(f, f->curstat > s ? f->curstat : s);
1288 	/* tmpset == previous state? */
1289 	for (i = 1; i <= f->curstat; i++) {
1290 		p = f->posns[i];
1291 		if ((k = tmpset[0]) != p[0])
1292 			goto different;
1293 		for (j = 1; j <= k; j++)
1294 			if (tmpset[j] != p[j])
1295 				goto different;
1296 		/* setvec is state i */
1297 		if (c != HAT)
1298 			f->gototab[s][c] = i;
1299 		return i;
1300 	  different:;
1301 	}
1302 
1303 	/* add tmpset to current set of states */
1304 	++(f->curstat);
1305 	resize_state(f, f->curstat);
1306 	for (i = 0; i < NCHARS; i++)
1307 		f->gototab[f->curstat][i] = 0;
1308 	xfree(f->posns[f->curstat]);
1309 	p = intalloc(setcnt + 1, __func__);
1310 
1311 	f->posns[f->curstat] = p;
1312 	if (c != HAT)
1313 		f->gototab[s][c] = f->curstat;
1314 	for (i = 0; i <= setcnt; i++)
1315 		p[i] = tmpset[i];
1316 	if (setvec[f->accept])
1317 		f->out[f->curstat] = 1;
1318 	else
1319 		f->out[f->curstat] = 0;
1320 	return f->curstat;
1321 }
1322 
1323 
freefa(fa * f)1324 void freefa(fa *f)	/* free a finite automaton */
1325 {
1326 	int i;
1327 
1328 	if (f == NULL)
1329 		return;
1330 	for (i = 0; i < f->state_count; i++)
1331 		xfree(f->gototab[i])
1332 	for (i = 0; i <= f->curstat; i++)
1333 		xfree(f->posns[i]);
1334 	for (i = 0; i <= f->accept; i++) {
1335 		xfree(f->re[i].lfollow);
1336 		if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1337 			xfree(f->re[i].lval.np);
1338 	}
1339 	xfree(f->restr);
1340 	xfree(f->out);
1341 	xfree(f->posns);
1342 	xfree(f->gototab);
1343 	xfree(f);
1344 }
1345