1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24
25 /* lasciate ogne speranza, voi ch'intrate. */
26
27 #define DEBUG
28
29 #include <ctype.h>
30 #include <limits.h>
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include "awk.h"
35 #include "awkgram.tab.h"
36
37 #define MAXLIN 22
38
39 #define type(v) (v)->nobj /* badly overloaded here */
40 #define info(v) (v)->ntype /* badly overloaded here */
41 #define left(v) (v)->narg[0]
42 #define right(v) (v)->narg[1]
43 #define parent(v) (v)->nnext
44
45 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46 #define ELEAF case EMPTYRE: /* empty string in regexp */
47 #define UNARY case STAR: case PLUS: case QUEST:
48
49 /* encoding in tree Nodes:
50 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
51 left is index, right contains value or pointer to value
52 unary (STAR, PLUS, QUEST): left is child, right is null
53 binary (CAT, OR): left and right are children
54 parent contains pointer to parent
55 */
56
57
58 int *setvec;
59 int *tmpset;
60 int maxsetvec = 0;
61
62 int rtok; /* next token in current re */
63 int rlxval;
64 static const uschar *rlxstr;
65 static const uschar *prestr; /* current position in current re */
66 static const uschar *lastre; /* origin of last re */
67 static const uschar *lastatom; /* origin of last Atom */
68 static const uschar *starttok;
69 static const uschar *basestr; /* starts with original, replaced during
70 repetition processing */
71 static const uschar *firstbasestr;
72
73 static int setcnt;
74 static int poscnt;
75
76 const char *patbeg;
77 int patlen;
78
79 #define NFA 128 /* cache this many dynamic fa's */
80 fa *fatab[NFA];
81 int nfatab = 0; /* entries in fatab */
82
83 static int *
intalloc(size_t n,const char * f)84 intalloc(size_t n, const char *f)
85 {
86 int *p = (int *) calloc(n, sizeof(int));
87 if (p == NULL)
88 overflo(f);
89 return p;
90 }
91
92 static void
resizesetvec(const char * f)93 resizesetvec(const char *f)
94 {
95 if (maxsetvec == 0)
96 maxsetvec = MAXLIN;
97 else
98 maxsetvec *= 4;
99 setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
100 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
101 if (setvec == NULL || tmpset == NULL)
102 overflo(f);
103 }
104
105 static void
resize_state(fa * f,int state)106 resize_state(fa *f, int state)
107 {
108 unsigned int **p;
109 uschar *p2;
110 int **p3;
111 int i, new_count;
112
113 if (++state < f->state_count)
114 return;
115
116 new_count = state + 10; /* needs to be tuned */
117
118 p = (unsigned int **) realloc(f->gototab, new_count * sizeof(f->gototab[0]));
119 if (p == NULL)
120 goto out;
121 f->gototab = p;
122
123 p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
124 if (p2 == NULL)
125 goto out;
126 f->out = p2;
127
128 p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
129 if (p3 == NULL)
130 goto out;
131 f->posns = p3;
132
133 for (i = f->state_count; i < new_count; ++i) {
134 f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab));
135 if (f->gototab[i] == NULL)
136 goto out;
137 f->out[i] = 0;
138 f->posns[i] = NULL;
139 }
140 f->state_count = new_count;
141 return;
142 out:
143 overflo(__func__);
144 }
145
makedfa(const char * s,bool anchor)146 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
147 {
148 int i, use, nuse;
149 fa *pfa;
150 static int now = 1;
151
152 if (setvec == NULL) { /* first time through any RE */
153 resizesetvec(__func__);
154 }
155
156 if (compile_time != RUNNING) /* a constant for sure */
157 return mkdfa(s, anchor);
158 for (i = 0; i < nfatab; i++) /* is it there already? */
159 if (fatab[i]->anchor == anchor
160 && strcmp((const char *) fatab[i]->restr, s) == 0) {
161 fatab[i]->use = now++;
162 return fatab[i];
163 }
164 pfa = mkdfa(s, anchor);
165 if (nfatab < NFA) { /* room for another */
166 fatab[nfatab] = pfa;
167 fatab[nfatab]->use = now++;
168 nfatab++;
169 return pfa;
170 }
171 use = fatab[0]->use; /* replace least-recently used */
172 nuse = 0;
173 for (i = 1; i < nfatab; i++)
174 if (fatab[i]->use < use) {
175 use = fatab[i]->use;
176 nuse = i;
177 }
178 freefa(fatab[nuse]);
179 fatab[nuse] = pfa;
180 pfa->use = now++;
181 return pfa;
182 }
183
mkdfa(const char * s,bool anchor)184 fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
185 /* anchor = true for anchored matches, else false */
186 {
187 Node *p, *p1;
188 fa *f;
189
190 firstbasestr = (const uschar *) s;
191 basestr = firstbasestr;
192 p = reparse(s);
193 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
194 /* put ALL STAR in front of reg. exp. */
195 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
196 /* put FINAL after reg. exp. */
197
198 poscnt = 0;
199 penter(p1); /* enter parent pointers and leaf indices */
200 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
201 overflo(__func__);
202 f->accept = poscnt-1; /* penter has computed number of positions in re */
203 cfoll(f, p1); /* set up follow sets */
204 freetr(p1);
205 resize_state(f, 1);
206 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
207 f->posns[1] = intalloc(1, __func__);
208 *f->posns[1] = 0;
209 f->initstat = makeinit(f, anchor);
210 f->anchor = anchor;
211 f->restr = (uschar *) tostring(s);
212 if (firstbasestr != basestr) {
213 if (basestr)
214 xfree(basestr);
215 }
216 return f;
217 }
218
makeinit(fa * f,bool anchor)219 int makeinit(fa *f, bool anchor)
220 {
221 int i, k;
222
223 f->curstat = 2;
224 f->out[2] = 0;
225 k = *(f->re[0].lfollow);
226 xfree(f->posns[2]);
227 f->posns[2] = intalloc(k + 1, __func__);
228 for (i = 0; i <= k; i++) {
229 (f->posns[2])[i] = (f->re[0].lfollow)[i];
230 }
231 if ((f->posns[2])[1] == f->accept)
232 f->out[2] = 1;
233 for (i = 0; i < NCHARS; i++)
234 f->gototab[2][i] = 0;
235 f->curstat = cgoto(f, 2, HAT);
236 if (anchor) {
237 *f->posns[2] = k-1; /* leave out position 0 */
238 for (i = 0; i < k; i++) {
239 (f->posns[0])[i] = (f->posns[2])[i];
240 }
241
242 f->out[0] = f->out[2];
243 if (f->curstat != 2)
244 --(*f->posns[f->curstat]);
245 }
246 return f->curstat;
247 }
248
penter(Node * p)249 void penter(Node *p) /* set up parent pointers and leaf indices */
250 {
251 switch (type(p)) {
252 ELEAF
253 LEAF
254 info(p) = poscnt;
255 poscnt++;
256 break;
257 UNARY
258 penter(left(p));
259 parent(left(p)) = p;
260 break;
261 case CAT:
262 case OR:
263 penter(left(p));
264 penter(right(p));
265 parent(left(p)) = p;
266 parent(right(p)) = p;
267 break;
268 case ZERO:
269 break;
270 default: /* can't happen */
271 FATAL("can't happen: unknown type %d in penter", type(p));
272 break;
273 }
274 }
275
freetr(Node * p)276 void freetr(Node *p) /* free parse tree */
277 {
278 switch (type(p)) {
279 ELEAF
280 LEAF
281 xfree(p);
282 break;
283 UNARY
284 case ZERO:
285 freetr(left(p));
286 xfree(p);
287 break;
288 case CAT:
289 case OR:
290 freetr(left(p));
291 freetr(right(p));
292 xfree(p);
293 break;
294 default: /* can't happen */
295 FATAL("can't happen: unknown type %d in freetr", type(p));
296 break;
297 }
298 }
299
300 /* in the parsing of regular expressions, metacharacters like . have */
301 /* to be seen literally; \056 is not a metacharacter. */
302
hexstr(const uschar ** pp)303 int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */
304 { /* only pick up one 8-bit byte (2 chars) */
305 const uschar *p;
306 int n = 0;
307 int i;
308
309 for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
310 if (isdigit(*p))
311 n = 16 * n + *p - '0';
312 else if (*p >= 'a' && *p <= 'f')
313 n = 16 * n + *p - 'a' + 10;
314 else if (*p >= 'A' && *p <= 'F')
315 n = 16 * n + *p - 'A' + 10;
316 }
317 *pp = p;
318 return n;
319 }
320
321 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
322
quoted(const uschar ** pp)323 int quoted(const uschar **pp) /* pick up next thing after a \\ */
324 /* and increment *pp */
325 {
326 const uschar *p = *pp;
327 int c;
328
329 if ((c = *p++) == 't')
330 c = '\t';
331 else if (c == 'n')
332 c = '\n';
333 else if (c == 'f')
334 c = '\f';
335 else if (c == 'r')
336 c = '\r';
337 else if (c == 'b')
338 c = '\b';
339 else if (c == 'v')
340 c = '\v';
341 else if (c == 'a')
342 c = '\a';
343 else if (c == '\\')
344 c = '\\';
345 else if (c == 'x') { /* hexadecimal goo follows */
346 c = hexstr(&p); /* this adds a null if number is invalid */
347 } else if (isoctdigit(c)) { /* \d \dd \ddd */
348 int n = c - '0';
349 if (isoctdigit(*p)) {
350 n = 8 * n + *p++ - '0';
351 if (isoctdigit(*p))
352 n = 8 * n + *p++ - '0';
353 }
354 c = n;
355 } /* else */
356 /* c = c; */
357 *pp = p;
358 return c;
359 }
360
cclenter(const char * argp)361 char *cclenter(const char *argp) /* add a character class */
362 {
363 int i, c, c2;
364 const uschar *op, *p = (const uschar *) argp;
365 uschar *bp;
366 static uschar *buf = NULL;
367 static int bufsz = 100;
368
369 op = p;
370 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
371 FATAL("out of space for character class [%.10s...] 1", p);
372 bp = buf;
373 for (i = 0; (c = *p++) != 0; ) {
374 if (c == '\\') {
375 c = quoted(&p);
376 } else if (c == '-' && i > 0 && bp[-1] != 0) {
377 if (*p != 0) {
378 c = bp[-1];
379 c2 = *p++;
380 if (c2 == '\\')
381 c2 = quoted(&p);
382 if (c > c2) { /* empty; ignore */
383 bp--;
384 i--;
385 continue;
386 }
387 while (c < c2) {
388 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
389 FATAL("out of space for character class [%.10s...] 2", p);
390 *bp++ = ++c;
391 i++;
392 }
393 continue;
394 }
395 }
396 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
397 FATAL("out of space for character class [%.10s...] 3", p);
398 *bp++ = c;
399 i++;
400 }
401 *bp = 0;
402 DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf);
403 xfree(op);
404 return (char *) tostring((char *) buf);
405 }
406
overflo(const char * s)407 void overflo(const char *s)
408 {
409 FATAL("regular expression too big: out of space in %.30s...", s);
410 }
411
cfoll(fa * f,Node * v)412 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
413 {
414 int i;
415 int *p;
416
417 switch (type(v)) {
418 ELEAF
419 LEAF
420 f->re[info(v)].ltype = type(v);
421 f->re[info(v)].lval.np = right(v);
422 while (f->accept >= maxsetvec) { /* guessing here! */
423 resizesetvec(__func__);
424 }
425 for (i = 0; i <= f->accept; i++)
426 setvec[i] = 0;
427 setcnt = 0;
428 follow(v); /* computes setvec and setcnt */
429 p = intalloc(setcnt + 1, __func__);
430 f->re[info(v)].lfollow = p;
431 *p = setcnt;
432 for (i = f->accept; i >= 0; i--)
433 if (setvec[i] == 1)
434 *++p = i;
435 break;
436 UNARY
437 cfoll(f,left(v));
438 break;
439 case CAT:
440 case OR:
441 cfoll(f,left(v));
442 cfoll(f,right(v));
443 break;
444 case ZERO:
445 break;
446 default: /* can't happen */
447 FATAL("can't happen: unknown type %d in cfoll", type(v));
448 }
449 }
450
first(Node * p)451 int first(Node *p) /* collects initially active leaves of p into setvec */
452 /* returns 0 if p matches empty string */
453 {
454 int b, lp;
455
456 switch (type(p)) {
457 ELEAF
458 LEAF
459 lp = info(p); /* look for high-water mark of subscripts */
460 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
461 resizesetvec(__func__);
462 }
463 if (type(p) == EMPTYRE) {
464 setvec[lp] = 0;
465 return(0);
466 }
467 if (setvec[lp] != 1) {
468 setvec[lp] = 1;
469 setcnt++;
470 }
471 if (type(p) == CCL && (*(char *) right(p)) == '\0')
472 return(0); /* empty CCL */
473 return(1);
474 case PLUS:
475 if (first(left(p)) == 0)
476 return(0);
477 return(1);
478 case STAR:
479 case QUEST:
480 first(left(p));
481 return(0);
482 case CAT:
483 if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
484 return(1);
485 case OR:
486 b = first(right(p));
487 if (first(left(p)) == 0 || b == 0) return(0);
488 return(1);
489 case ZERO:
490 return 0;
491 }
492 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
493 return(-1);
494 }
495
follow(Node * v)496 void follow(Node *v) /* collects leaves that can follow v into setvec */
497 {
498 Node *p;
499
500 if (type(v) == FINAL)
501 return;
502 p = parent(v);
503 switch (type(p)) {
504 case STAR:
505 case PLUS:
506 first(v);
507 follow(p);
508 return;
509
510 case OR:
511 case QUEST:
512 follow(p);
513 return;
514
515 case CAT:
516 if (v == left(p)) { /* v is left child of p */
517 if (first(right(p)) == 0) {
518 follow(p);
519 return;
520 }
521 } else /* v is right child */
522 follow(p);
523 return;
524 }
525 }
526
member(int c,const char * sarg)527 int member(int c, const char *sarg) /* is c in s? */
528 {
529 const uschar *s = (const uschar *) sarg;
530
531 while (*s)
532 if (c == *s++)
533 return(1);
534 return(0);
535 }
536
match(fa * f,const char * p0)537 int match(fa *f, const char *p0) /* shortest match ? */
538 {
539 int s, ns;
540 const uschar *p = (const uschar *) p0;
541
542 s = f->initstat;
543 assert (s < f->state_count);
544
545 if (f->out[s])
546 return(1);
547 do {
548 /* assert(*p < NCHARS); */
549 if ((ns = f->gototab[s][*p]) != 0)
550 s = ns;
551 else
552 s = cgoto(f, s, *p);
553 if (f->out[s])
554 return(1);
555 } while (*p++ != 0);
556 return(0);
557 }
558
pmatch(fa * f,const char * p0)559 int pmatch(fa *f, const char *p0) /* longest match, for sub */
560 {
561 int s, ns;
562 const uschar *p = (const uschar *) p0;
563 const uschar *q;
564
565 s = f->initstat;
566 assert(s < f->state_count);
567
568 patbeg = (const char *)p;
569 patlen = -1;
570 do {
571 q = p;
572 do {
573 if (f->out[s]) /* final state */
574 patlen = q-p;
575 /* assert(*q < NCHARS); */
576 if ((ns = f->gototab[s][*q]) != 0)
577 s = ns;
578 else
579 s = cgoto(f, s, *q);
580
581 assert(s < f->state_count);
582
583 if (s == 1) { /* no transition */
584 if (patlen >= 0) {
585 patbeg = (const char *) p;
586 return(1);
587 }
588 else
589 goto nextin; /* no match */
590 }
591 } while (*q++ != 0);
592 if (f->out[s])
593 patlen = q-p-1; /* don't count $ */
594 if (patlen >= 0) {
595 patbeg = (const char *) p;
596 return(1);
597 }
598 nextin:
599 s = 2;
600 } while (*p++);
601 return (0);
602 }
603
nematch(fa * f,const char * p0)604 int nematch(fa *f, const char *p0) /* non-empty match, for sub */
605 {
606 int s, ns;
607 const uschar *p = (const uschar *) p0;
608 const uschar *q;
609
610 s = f->initstat;
611 assert(s < f->state_count);
612
613 patbeg = (const char *)p;
614 patlen = -1;
615 while (*p) {
616 q = p;
617 do {
618 if (f->out[s]) /* final state */
619 patlen = q-p;
620 /* assert(*q < NCHARS); */
621 if ((ns = f->gototab[s][*q]) != 0)
622 s = ns;
623 else
624 s = cgoto(f, s, *q);
625 if (s == 1) { /* no transition */
626 if (patlen > 0) {
627 patbeg = (const char *) p;
628 return(1);
629 } else
630 goto nnextin; /* no nonempty match */
631 }
632 } while (*q++ != 0);
633 if (f->out[s])
634 patlen = q-p-1; /* don't count $ */
635 if (patlen > 0 ) {
636 patbeg = (const char *) p;
637 return(1);
638 }
639 nnextin:
640 s = 2;
641 p++;
642 }
643 return (0);
644 }
645
646
647 /*
648 * NAME
649 * fnematch
650 *
651 * DESCRIPTION
652 * A stream-fed version of nematch which transfers characters to a
653 * null-terminated buffer. All characters up to and including the last
654 * character of the matching text or EOF are placed in the buffer. If
655 * a match is found, patbeg and patlen are set appropriately.
656 *
657 * RETURN VALUES
658 * false No match found.
659 * true Match found.
660 */
661
fnematch(fa * pfa,FILE * f,char ** pbuf,int * pbufsize,int quantum)662 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
663 {
664 char *buf = *pbuf;
665 int bufsize = *pbufsize;
666 int c, i, j, k, ns, s;
667
668 s = pfa->initstat;
669 patlen = 0;
670
671 /*
672 * All indices relative to buf.
673 * i <= j <= k <= bufsize
674 *
675 * i: origin of active substring
676 * j: current character
677 * k: destination of next getc()
678 */
679 i = -1, k = 0;
680 do {
681 j = i++;
682 do {
683 if (++j == k) {
684 if (k == bufsize)
685 if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
686 FATAL("stream '%.30s...' too long", buf);
687 buf[k++] = (c = getc(f)) != EOF ? c : 0;
688 }
689 c = (uschar)buf[j];
690 /* assert(c < NCHARS); */
691
692 if ((ns = pfa->gototab[s][c]) != 0)
693 s = ns;
694 else
695 s = cgoto(pfa, s, c);
696
697 if (pfa->out[s]) { /* final state */
698 patlen = j - i + 1;
699 if (c == 0) /* don't count $ */
700 patlen--;
701 }
702 } while (buf[j] && s != 1);
703 s = 2;
704 } while (buf[i] && !patlen);
705
706 /* adjbuf() may have relocated a resized buffer. Inform the world. */
707 *pbuf = buf;
708 *pbufsize = bufsize;
709
710 if (patlen) {
711 patbeg = (char *) buf + i;
712 /*
713 * Under no circumstances is the last character fed to
714 * the automaton part of the match. It is EOF's nullbyte,
715 * or it sent the automaton into a state with no further
716 * transitions available (s==1), or both. Room for a
717 * terminating nullbyte is guaranteed.
718 *
719 * ungetc any chars after the end of matching text
720 * (except for EOF's nullbyte, if present) and null
721 * terminate the buffer.
722 */
723 do
724 if (buf[--k] && ungetc(buf[k], f) == EOF)
725 FATAL("unable to ungetc '%c'", buf[k]);
726 while (k > i + patlen);
727 buf[k] = '\0';
728 return true;
729 }
730 else
731 return false;
732 }
733
reparse(const char * p)734 Node *reparse(const char *p) /* parses regular expression pointed to by p */
735 { /* uses relex() to scan regular expression */
736 Node *np;
737
738 DPRINTF("reparse <%s>\n", p);
739 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
740 rtok = relex();
741 /* GNU compatibility: an empty regexp matches anything */
742 if (rtok == '\0') {
743 /* FATAL("empty regular expression"); previous */
744 return(op2(EMPTYRE, NIL, NIL));
745 }
746 np = regexp();
747 if (rtok != '\0')
748 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
749 return(np);
750 }
751
regexp(void)752 Node *regexp(void) /* top-level parse of reg expr */
753 {
754 return (alt(concat(primary())));
755 }
756
primary(void)757 Node *primary(void)
758 {
759 Node *np;
760 int savelastatom;
761
762 switch (rtok) {
763 case CHAR:
764 lastatom = starttok;
765 np = op2(CHAR, NIL, itonp(rlxval));
766 rtok = relex();
767 return (unary(np));
768 case ALL:
769 rtok = relex();
770 return (unary(op2(ALL, NIL, NIL)));
771 case EMPTYRE:
772 rtok = relex();
773 return (unary(op2(EMPTYRE, NIL, NIL)));
774 case DOT:
775 lastatom = starttok;
776 rtok = relex();
777 return (unary(op2(DOT, NIL, NIL)));
778 case CCL:
779 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
780 lastatom = starttok;
781 rtok = relex();
782 return (unary(np));
783 case NCCL:
784 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
785 lastatom = starttok;
786 rtok = relex();
787 return (unary(np));
788 case '^':
789 rtok = relex();
790 return (unary(op2(CHAR, NIL, itonp(HAT))));
791 case '$':
792 rtok = relex();
793 return (unary(op2(CHAR, NIL, NIL)));
794 case '(':
795 lastatom = starttok;
796 savelastatom = starttok - basestr; /* Retain over recursion */
797 rtok = relex();
798 if (rtok == ')') { /* special pleading for () */
799 rtok = relex();
800 return unary(op2(CCL, NIL, (Node *) tostring("")));
801 }
802 np = regexp();
803 if (rtok == ')') {
804 lastatom = basestr + savelastatom; /* Restore */
805 rtok = relex();
806 return (unary(np));
807 }
808 else
809 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
810 default:
811 FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
812 }
813 return 0; /*NOTREACHED*/
814 }
815
concat(Node * np)816 Node *concat(Node *np)
817 {
818 switch (rtok) {
819 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
820 return (concat(op2(CAT, np, primary())));
821 case EMPTYRE:
822 rtok = relex();
823 return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
824 primary())));
825 }
826 return (np);
827 }
828
alt(Node * np)829 Node *alt(Node *np)
830 {
831 if (rtok == OR) {
832 rtok = relex();
833 return (alt(op2(OR, np, concat(primary()))));
834 }
835 return (np);
836 }
837
unary(Node * np)838 Node *unary(Node *np)
839 {
840 switch (rtok) {
841 case STAR:
842 rtok = relex();
843 return (unary(op2(STAR, np, NIL)));
844 case PLUS:
845 rtok = relex();
846 return (unary(op2(PLUS, np, NIL)));
847 case QUEST:
848 rtok = relex();
849 return (unary(op2(QUEST, np, NIL)));
850 case ZERO:
851 rtok = relex();
852 return (unary(op2(ZERO, np, NIL)));
853 default:
854 return (np);
855 }
856 }
857
858 /*
859 * Character class definitions conformant to the POSIX locale as
860 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
861 * and operating character sets are both ASCII (ISO646) or supersets
862 * thereof.
863 *
864 * Note that to avoid overflowing the temporary buffer used in
865 * relex(), the expanded character class (prior to range expansion)
866 * must be less than twice the size of their full name.
867 */
868
869 /* Because isblank doesn't show up in any of the header files on any
870 * system i use, it's defined here. if some other locale has a richer
871 * definition of "blank", define HAS_ISBLANK and provide your own
872 * version.
873 * the parentheses here are an attempt to find a path through the maze
874 * of macro definition and/or function and/or version provided. thanks
875 * to nelson beebe for the suggestion; let's see if it works everywhere.
876 */
877
878 /* #define HAS_ISBLANK */
879 #ifndef HAS_ISBLANK
880
881 int (xisblank)(int c)
882 {
883 return c==' ' || c=='\t';
884 }
885
886 #endif
887
888 static const struct charclass {
889 const char *cc_name;
890 int cc_namelen;
891 int (*cc_func)(int);
892 } charclasses[] = {
893 { "alnum", 5, isalnum },
894 { "alpha", 5, isalpha },
895 #ifndef HAS_ISBLANK
896 { "blank", 5, xisblank },
897 #else
898 { "blank", 5, isblank },
899 #endif
900 { "cntrl", 5, iscntrl },
901 { "digit", 5, isdigit },
902 { "graph", 5, isgraph },
903 { "lower", 5, islower },
904 { "print", 5, isprint },
905 { "punct", 5, ispunct },
906 { "space", 5, isspace },
907 { "upper", 5, isupper },
908 { "xdigit", 6, isxdigit },
909 { NULL, 0, NULL },
910 };
911
912 #define REPEAT_SIMPLE 0
913 #define REPEAT_PLUS_APPENDED 1
914 #define REPEAT_WITH_Q 2
915 #define REPEAT_ZERO 3
916
917 static int
replace_repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum,int special_case)918 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
919 int atomlen, int firstnum, int secondnum, int special_case)
920 {
921 int i, j;
922 uschar *buf = 0;
923 int ret = 1;
924 int init_q = (firstnum == 0); /* first added char will be ? */
925 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
926 int prefix_length = reptok - basestr; /* prefix includes first rep */
927 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
928 int size = prefix_length + suffix_length;
929
930 if (firstnum > 1) { /* add room for reps 2 through firstnum */
931 size += atomlen*(firstnum-1);
932 }
933
934 /* Adjust size of buffer for special cases */
935 if (special_case == REPEAT_PLUS_APPENDED) {
936 size++; /* for the final + */
937 } else if (special_case == REPEAT_WITH_Q) {
938 size += init_q + (atomlen+1)* n_q_reps;
939 } else if (special_case == REPEAT_ZERO) {
940 size += 2; /* just a null ERE: () */
941 }
942 if ((buf = (uschar *) malloc(size + 1)) == NULL)
943 FATAL("out of space in reg expr %.10s..", lastre);
944 memcpy(buf, basestr, prefix_length); /* copy prefix */
945 j = prefix_length;
946 if (special_case == REPEAT_ZERO) {
947 j -= atomlen;
948 buf[j++] = '(';
949 buf[j++] = ')';
950 }
951 for (i = 1; i < firstnum; i++) { /* copy x reps */
952 memcpy(&buf[j], atom, atomlen);
953 j += atomlen;
954 }
955 if (special_case == REPEAT_PLUS_APPENDED) {
956 buf[j++] = '+';
957 } else if (special_case == REPEAT_WITH_Q) {
958 if (init_q)
959 buf[j++] = '?';
960 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
961 memcpy(&buf[j], atom, atomlen);
962 j += atomlen;
963 buf[j++] = '?';
964 }
965 }
966 memcpy(&buf[j], reptok+reptoklen, suffix_length);
967 if (special_case == REPEAT_ZERO) {
968 buf[j+suffix_length] = '\0';
969 } else {
970 buf[size] = '\0';
971 }
972 /* free old basestr */
973 if (firstbasestr != basestr) {
974 if (basestr)
975 xfree(basestr);
976 }
977 basestr = buf;
978 prestr = buf + prefix_length;
979 if (special_case == REPEAT_ZERO) {
980 prestr -= atomlen;
981 ret++;
982 }
983 return ret;
984 }
985
repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum)986 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
987 int atomlen, int firstnum, int secondnum)
988 {
989 /*
990 In general, the repetition specifier or "bound" is replaced here
991 by an equivalent ERE string, repeating the immediately previous atom
992 and appending ? and + as needed. Note that the first copy of the
993 atom is left in place, except in the special_case of a zero-repeat
994 (i.e., {0}).
995 */
996 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
997 if (firstnum < 2) {
998 /* 0 or 1: should be handled before you get here */
999 FATAL("internal error");
1000 } else {
1001 return replace_repeat(reptok, reptoklen, atom, atomlen,
1002 firstnum, secondnum, REPEAT_PLUS_APPENDED);
1003 }
1004 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1005 if (firstnum == 0) { /* {0} or {0,0} */
1006 /* This case is unusual because the resulting
1007 replacement string might actually be SMALLER than
1008 the original ERE */
1009 return replace_repeat(reptok, reptoklen, atom, atomlen,
1010 firstnum, secondnum, REPEAT_ZERO);
1011 } else { /* (firstnum >= 1) */
1012 return replace_repeat(reptok, reptoklen, atom, atomlen,
1013 firstnum, secondnum, REPEAT_SIMPLE);
1014 }
1015 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1016 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1017 return replace_repeat(reptok, reptoklen, atom, atomlen,
1018 firstnum, secondnum, REPEAT_WITH_Q);
1019 } else { /* Error - shouldn't be here (n>m) */
1020 FATAL("internal error");
1021 }
1022 return 0;
1023 }
1024
relex(void)1025 int relex(void) /* lexical analyzer for reparse */
1026 {
1027 int c, n;
1028 int cflag;
1029 static uschar *buf = NULL;
1030 static int bufsz = 100;
1031 uschar *bp;
1032 const struct charclass *cc;
1033 int i;
1034 int num, m;
1035 bool commafound, digitfound;
1036 const uschar *startreptok;
1037 static int parens = 0;
1038
1039 rescan:
1040 starttok = prestr;
1041
1042 switch (c = *prestr++) {
1043 case '|': return OR;
1044 case '*': return STAR;
1045 case '+': return PLUS;
1046 case '?': return QUEST;
1047 case '.': return DOT;
1048 case '\0': prestr--; return '\0';
1049 case '^':
1050 case '$':
1051 return c;
1052 case '(':
1053 parens++;
1054 return c;
1055 case ')':
1056 if (parens) {
1057 parens--;
1058 return c;
1059 }
1060 /* unmatched close parenthesis; per POSIX, treat as literal */
1061 rlxval = c;
1062 return CHAR;
1063 case '\\':
1064 rlxval = quoted(&prestr);
1065 return CHAR;
1066 default:
1067 rlxval = c;
1068 return CHAR;
1069 case '[':
1070 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1071 FATAL("out of space in reg expr %.10s..", lastre);
1072 bp = buf;
1073 if (*prestr == '^') {
1074 cflag = 1;
1075 prestr++;
1076 }
1077 else
1078 cflag = 0;
1079 n = 2 * strlen((const char *) prestr)+1;
1080 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1081 FATAL("out of space for reg expr %.10s...", lastre);
1082 for (; ; ) {
1083 if ((c = *prestr++) == '\\') {
1084 *bp++ = '\\';
1085 if ((c = *prestr++) == '\0')
1086 FATAL("nonterminated character class %.20s...", lastre);
1087 *bp++ = c;
1088 /* } else if (c == '\n') { */
1089 /* FATAL("newline in character class %.20s...", lastre); */
1090 } else if (c == '[' && *prestr == ':') {
1091 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1092 for (cc = charclasses; cc->cc_name; cc++)
1093 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1094 break;
1095 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1096 prestr[2 + cc->cc_namelen] == ']') {
1097 prestr += cc->cc_namelen + 3;
1098 /*
1099 * BUG: We begin at 1, instead of 0, since we
1100 * would otherwise prematurely terminate the
1101 * string for classes like [[:cntrl:]]. This
1102 * means that we can't match the NUL character,
1103 * not without first adapting the entire
1104 * program to track each string's length.
1105 */
1106 for (i = 1; i <= UCHAR_MAX; i++) {
1107 if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1108 FATAL("out of space for reg expr %.10s...", lastre);
1109 if (cc->cc_func(i)) {
1110 /* escape backslash */
1111 if (i == '\\') {
1112 *bp++ = '\\';
1113 n++;
1114 }
1115
1116 *bp++ = i;
1117 n++;
1118 }
1119 }
1120 } else
1121 *bp++ = c;
1122 } else if (c == '[' && *prestr == '.') {
1123 char collate_char;
1124 prestr++;
1125 collate_char = *prestr++;
1126 if (*prestr == '.' && prestr[1] == ']') {
1127 prestr += 2;
1128 /* Found it: map via locale TBD: for
1129 now, simply return this char. This
1130 is sufficient to pass conformance
1131 test awk.ex 156
1132 */
1133 if (*prestr == ']') {
1134 prestr++;
1135 rlxval = collate_char;
1136 return CHAR;
1137 }
1138 }
1139 } else if (c == '[' && *prestr == '=') {
1140 char equiv_char;
1141 prestr++;
1142 equiv_char = *prestr++;
1143 if (*prestr == '=' && prestr[1] == ']') {
1144 prestr += 2;
1145 /* Found it: map via locale TBD: for now
1146 simply return this char. This is
1147 sufficient to pass conformance test
1148 awk.ex 156
1149 */
1150 if (*prestr == ']') {
1151 prestr++;
1152 rlxval = equiv_char;
1153 return CHAR;
1154 }
1155 }
1156 } else if (c == '\0') {
1157 FATAL("nonterminated character class %.20s", lastre);
1158 } else if (bp == buf) { /* 1st char is special */
1159 *bp++ = c;
1160 } else if (c == ']') {
1161 *bp++ = 0;
1162 rlxstr = (uschar *) tostring((char *) buf);
1163 if (cflag == 0)
1164 return CCL;
1165 else
1166 return NCCL;
1167 } else
1168 *bp++ = c;
1169 }
1170 break;
1171 case '{':
1172 if (isdigit(*(prestr))) {
1173 num = 0; /* Process as a repetition */
1174 n = -1; m = -1;
1175 commafound = false;
1176 digitfound = false;
1177 startreptok = prestr-1;
1178 /* Remember start of previous atom here ? */
1179 } else { /* just a { char, not a repetition */
1180 rlxval = c;
1181 return CHAR;
1182 }
1183 for (; ; ) {
1184 if ((c = *prestr++) == '}') {
1185 if (commafound) {
1186 if (digitfound) { /* {n,m} */
1187 m = num;
1188 if (m < n)
1189 FATAL("illegal repetition expression: class %.20s",
1190 lastre);
1191 if (n == 0 && m == 1) {
1192 return QUEST;
1193 }
1194 } else { /* {n,} */
1195 if (n == 0)
1196 return STAR;
1197 else if (n == 1)
1198 return PLUS;
1199 }
1200 } else {
1201 if (digitfound) { /* {n} same as {n,n} */
1202 n = num;
1203 m = num;
1204 } else { /* {} */
1205 FATAL("illegal repetition expression: class %.20s",
1206 lastre);
1207 }
1208 }
1209 if (repeat(starttok, prestr-starttok, lastatom,
1210 startreptok - lastatom, n, m) > 0) {
1211 if (n == 0 && m == 0) {
1212 return ZERO;
1213 }
1214 /* must rescan input for next token */
1215 goto rescan;
1216 }
1217 /* Failed to replace: eat up {...} characters
1218 and treat like just PLUS */
1219 return PLUS;
1220 } else if (c == '\0') {
1221 FATAL("nonterminated character class %.20s",
1222 lastre);
1223 } else if (isdigit(c)) {
1224 num = 10 * num + c - '0';
1225 digitfound = true;
1226 } else if (c == ',') {
1227 if (commafound)
1228 FATAL("illegal repetition expression: class %.20s",
1229 lastre);
1230 /* looking for {n,} or {n,m} */
1231 commafound = true;
1232 n = num;
1233 digitfound = false; /* reset */
1234 num = 0;
1235 } else {
1236 FATAL("illegal repetition expression: class %.20s",
1237 lastre);
1238 }
1239 }
1240 break;
1241 }
1242 }
1243
cgoto(fa * f,int s,int c)1244 int cgoto(fa *f, int s, int c)
1245 {
1246 int *p, *q;
1247 int i, j, k;
1248
1249 assert(c == HAT || c < NCHARS);
1250 while (f->accept >= maxsetvec) { /* guessing here! */
1251 resizesetvec(__func__);
1252 }
1253 for (i = 0; i <= f->accept; i++)
1254 setvec[i] = 0;
1255 setcnt = 0;
1256 resize_state(f, s);
1257 /* compute positions of gototab[s,c] into setvec */
1258 p = f->posns[s];
1259 for (i = 1; i <= *p; i++) {
1260 if ((k = f->re[p[i]].ltype) != FINAL) {
1261 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1262 || (k == DOT && c != 0 && c != HAT)
1263 || (k == ALL && c != 0)
1264 || (k == EMPTYRE && c != 0)
1265 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1266 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1267 q = f->re[p[i]].lfollow;
1268 for (j = 1; j <= *q; j++) {
1269 if (q[j] >= maxsetvec) {
1270 resizesetvec(__func__);
1271 }
1272 if (setvec[q[j]] == 0) {
1273 setcnt++;
1274 setvec[q[j]] = 1;
1275 }
1276 }
1277 }
1278 }
1279 }
1280 /* determine if setvec is a previous state */
1281 tmpset[0] = setcnt;
1282 j = 1;
1283 for (i = f->accept; i >= 0; i--)
1284 if (setvec[i]) {
1285 tmpset[j++] = i;
1286 }
1287 resize_state(f, f->curstat > s ? f->curstat : s);
1288 /* tmpset == previous state? */
1289 for (i = 1; i <= f->curstat; i++) {
1290 p = f->posns[i];
1291 if ((k = tmpset[0]) != p[0])
1292 goto different;
1293 for (j = 1; j <= k; j++)
1294 if (tmpset[j] != p[j])
1295 goto different;
1296 /* setvec is state i */
1297 if (c != HAT)
1298 f->gototab[s][c] = i;
1299 return i;
1300 different:;
1301 }
1302
1303 /* add tmpset to current set of states */
1304 ++(f->curstat);
1305 resize_state(f, f->curstat);
1306 for (i = 0; i < NCHARS; i++)
1307 f->gototab[f->curstat][i] = 0;
1308 xfree(f->posns[f->curstat]);
1309 p = intalloc(setcnt + 1, __func__);
1310
1311 f->posns[f->curstat] = p;
1312 if (c != HAT)
1313 f->gototab[s][c] = f->curstat;
1314 for (i = 0; i <= setcnt; i++)
1315 p[i] = tmpset[i];
1316 if (setvec[f->accept])
1317 f->out[f->curstat] = 1;
1318 else
1319 f->out[f->curstat] = 0;
1320 return f->curstat;
1321 }
1322
1323
freefa(fa * f)1324 void freefa(fa *f) /* free a finite automaton */
1325 {
1326 int i;
1327
1328 if (f == NULL)
1329 return;
1330 for (i = 0; i < f->state_count; i++)
1331 xfree(f->gototab[i])
1332 for (i = 0; i <= f->curstat; i++)
1333 xfree(f->posns[i]);
1334 for (i = 0; i <= f->accept; i++) {
1335 xfree(f->re[i].lfollow);
1336 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1337 xfree(f->re[i].lval.np);
1338 }
1339 xfree(f->restr);
1340 xfree(f->out);
1341 xfree(f->posns);
1342 xfree(f->gototab);
1343 xfree(f);
1344 }
1345