1 /*
2  * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
3  * All rights reserved
4  *
5  * "THE BEER-WARE LICENSE" (Revision 42):
6  * Sergey Lyubka wrote this file.  As long as you retain this notice you
7  * can do whatever you want with this stuff. If we meet some day, and you think
8  * this stuff is worth it, you can buy me a beer in return.
9  */
10 
11 /*
12  * Downloaded Sat Nov  5 17:43:06 CET 2011 at
13  * http://slre.sourceforge.net/1.0/slre.c
14  */
15 
16 #ifdef SLRE_TEST
17 #include <stdio.h>
18 #include <assert.h>
19 #include <ctype.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #else
23 #include <common.h>
24 #include <linux/ctype.h>
25 #endif /* SLRE_TEST */
26 
27 #include <errno.h>
28 
29 #include <slre.h>
30 
/*
 * Opcodes of the compiled regular expression. The numeric values are used as
 * indices into the opcodes[] debug table (see slre_dump()), so the order of
 * the enumerators and of the table entries must stay in step.
 */
enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
	STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
33 
34 #ifdef SLRE_TEST
/*
 * Debug description of every opcode, indexed by opcode value.
 *
 * name:  text printed by slre_dump()
 * narg:  number of operand bytes (not read by the code in this file)
 * flags: one letter per operand group, interpreted by slre_dump():
 *	'i' - one byte printed as a plain integer
 *	'o' - one byte holding a code offset
 *	'D' - two bytes (data offset, length) describing a character set
 *	'd' - two bytes (data offset, length) describing a literal string
 */
static struct {
	const char	*name;
	int		narg;
	const char	*flags;
} opcodes[] = {
	{"END",		0, ""},		/* End of code block or program	*/
	{"BRANCH",	2, "oo"},	/* Alternative operator, "|"	*/
	{"ANY",		0, ""},		/* Match any character, "."	*/
	{"EXACT",	2, "d"},	/* Match exact string		*/
	{"ANYOF",	2, "D"},	/* Match any from set, "[]"	*/
	{"ANYBUT",	2, "D"},	/* Match any but from set, "[^]"*/
	{"OPEN ",	1, "i"},	/* Capture start, "("		*/
	{"CLOSE",	1, "i"},	/* Capture end, ")"		*/
	{"BOL",		0, ""},		/* Beginning of string, "^"	*/
	{"EOL",		0, ""},		/* End of string, "$"		*/
	{"STAR",	1, "o"},	/* Match zero or more times "*"	*/
	{"PLUS",	1, "o"},	/* Match one or more times, "+"	*/
	{"STARQ",	1, "o"},	/* Non-greedy STAR,  "*?"	*/
	{"PLUSQ",	1, "o"},	/* Non-greedy PLUS, "+?"	*/
	{"QUEST",	1, "o"},	/* Match zero or one time, "?"	*/
	{"SPACE",	0, ""},		/* Match whitespace, "\s"	*/
	{"NONSPACE",	0, ""},		/* Match non-space, "\S"	*/
	{"DIGIT",	0, ""}		/* Match digit, "\d"		*/
};
59 #endif /* SLRE_TEST */
60 
61 /*
62  * Commands and operands are all unsigned char (1 byte long). All code offsets
63  * are relative to current address, and positive (always point forward). Data
64  * offsets are absolute. Commands with operands:
65  *
66  * BRANCH offset1 offset2
67  *	Try to match the code block that follows the BRANCH instruction
68  *	(code block ends with END). If no match, try to match code block that
69  *	starts at offset1. If either of these match, jump to offset2.
70  *
71  * EXACT data_offset data_length
72  *	Try to match exact string. String is recorded in data section from
73  *	data_offset, and has length data_length.
74  *
75  * OPEN capture_number
76  * CLOSE capture_number
 *	If the user has passed a 'struct cap' array for captures, OPEN
 *	records the beginning of the matched substring (cap->ptr), CLOSE
 *	sets the length (cap->len) for the respective capture_number.
80  *
81  * STAR code_offset
82  * PLUS code_offset
83  * QUEST code_offset
84  *	*, +, ?, respectively. Try to gobble as much as possible from the
85  *	matched buffer, until code block that follows these instructions
86  *	matches. When the longest possible string is matched,
87  *	jump to code_offset
88  *
89  * STARQ, PLUSQ are non-greedy versions of STAR and PLUS.
90  */
91 
/* Characters that end a literal run: exact() stops copying at any of them. */
static const char *meta_chars = "|.^$*+?()[\\";
93 
94 #ifdef SLRE_TEST
95 
96 static void
print_character_set(FILE * fp,const unsigned char * p,int len)97 print_character_set(FILE *fp, const unsigned char *p, int len)
98 {
99 	int	i;
100 
101 	for (i = 0; i < len; i++) {
102 		if (i > 0)
103 			(void) fputc(',', fp);
104 		if (p[i] == 0) {
105 			i++;
106 			if (p[i] == 0)
107 				(void) fprintf(fp, "\\x%02x", p[i]);
108 			else
109 				(void) fprintf(fp, "%s", opcodes[p[i]].name);
110 		} else if (isprint(p[i])) {
111 			(void) fputc(p[i], fp);
112 		} else {
113 			(void) fprintf(fp, "\\x%02x", p[i]);
114 		}
115 	}
116 }
117 
118 void
slre_dump(const struct slre * r,FILE * fp)119 slre_dump(const struct slre *r, FILE *fp)
120 {
121 	int	i, j, ch, op, pc;
122 
123 	for (pc = 0; pc < r->code_size; pc++) {
124 
125 		op = r->code[pc];
126 		(void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
127 
128 		for (i = 0; opcodes[op].flags[i] != '\0'; i++)
129 			switch (opcodes[op].flags[i]) {
130 			case 'i':
131 				(void) fprintf(fp, "%d ", r->code[pc + 1]);
132 				pc++;
133 				break;
134 			case 'o':
135 				(void) fprintf(fp, "%d ",
136 				    pc + r->code[pc + 1] - i);
137 				pc++;
138 				break;
139 			case 'D':
140 				print_character_set(fp, r->data +
141 				    r->code[pc + 1], r->code[pc + 2]);
142 				pc += 2;
143 				break;
144 			case 'd':
145 				(void) fputc('"', fp);
146 				for (j = 0; j < r->code[pc + 2]; j++) {
147 					ch = r->data[r->code[pc + 1] + j];
148 					if (isprint(ch)) {
149 						(void) fputc(ch, fp);
150 					} else {
151 						(void) fprintf(fp,
152 							"\\x%02x", ch);
153 					}
154 				}
155 				(void) fputc('"', fp);
156 				pc += 2;
157 				break;
158 			}
159 
160 		(void) fputc('\n', fp);
161 	}
162 }
163 #endif /* SLRE_TEST */
164 
165 static void
set_jump_offset(struct slre * r,int pc,int offset)166 set_jump_offset(struct slre *r, int pc, int offset)
167 {
168 	assert(offset < r->code_size);
169 
170 	if (r->code_size - offset > 0xff)
171 		r->err_str = "Jump offset is too big";
172 	else
173 		r->code[pc] = (unsigned char) (r->code_size - offset);
174 }
175 
176 static void
emit(struct slre * r,int code)177 emit(struct slre *r, int code)
178 {
179 	if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0])))
180 		r->err_str = "RE is too long (code overflow)";
181 	else
182 		r->code[r->code_size++] = (unsigned char) code;
183 }
184 
185 static void
store_char_in_data(struct slre * r,int ch)186 store_char_in_data(struct slre *r, int ch)
187 {
188 	if (r->data_size >= (int) sizeof(r->data))
189 		r->err_str = "RE is too long (data overflow)";
190 	else
191 		r->data[r->data_size++] = ch;
192 }
193 
194 static void
exact(struct slre * r,const char ** re)195 exact(struct slre *r, const char **re)
196 {
197 	int	old_data_size = r->data_size;
198 
199 	while (**re != '\0' && (strchr(meta_chars, **re)) == NULL)
200 		store_char_in_data(r, *(*re)++);
201 
202 	emit(r, EXACT);
203 	emit(r, old_data_size);
204 	emit(r, r->data_size - old_data_size);
205 }
206 
207 static int
get_escape_char(const char ** re)208 get_escape_char(const char **re)
209 {
210 	int	res;
211 
212 	switch (*(*re)++) {
213 	case 'n':
214 		res = '\n';
215 		break;
216 	case 'r':
217 		res = '\r';
218 		break;
219 	case 't':
220 		res = '\t';
221 		break;
222 	case '0':
223 		res = 0;
224 		break;
225 	case 'S':
226 		res = NONSPACE << 8;
227 		break;
228 	case 's':
229 		res = SPACE << 8;
230 		break;
231 	case 'd':
232 		res = DIGIT << 8;
233 		break;
234 	default:
235 		res = (*re)[-1];
236 		break;
237 	}
238 
239 	return res;
240 }
241 
242 static void
anyof(struct slre * r,const char ** re)243 anyof(struct slre *r, const char **re)
244 {
245 	int	esc, old_data_size = r->data_size, op = ANYOF;
246 
247 	if (**re == '^') {
248 		op = ANYBUT;
249 		(*re)++;
250 	}
251 
252 	while (**re != '\0')
253 
254 		switch (*(*re)++) {
255 		case ']':
256 			emit(r, op);
257 			emit(r, old_data_size);
258 			emit(r, r->data_size - old_data_size);
259 			return;
260 			/* NOTREACHED */
261 			break;
262 		case '\\':
263 			esc = get_escape_char(re);
264 			if ((esc & 0xff) == 0) {
265 				store_char_in_data(r, 0);
266 				store_char_in_data(r, esc >> 8);
267 			} else {
268 				store_char_in_data(r, esc);
269 			}
270 			break;
271 		default:
272 			store_char_in_data(r, (*re)[-1]);
273 			break;
274 		}
275 
276 	r->err_str = "No closing ']' bracket";
277 }
278 
279 static void
relocate(struct slre * r,int begin,int shift)280 relocate(struct slre *r, int begin, int shift)
281 {
282 	emit(r, END);
283 	memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
284 	r->code_size += shift;
285 }
286 
287 static void
quantifier(struct slre * r,int prev,int op)288 quantifier(struct slre *r, int prev, int op)
289 {
290 	if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
291 		r->code[prev + 2]--;
292 		emit(r, EXACT);
293 		emit(r, r->code[prev + 1] + r->code[prev + 2]);
294 		emit(r, 1);
295 		prev = r->code_size - 3;
296 	}
297 	relocate(r, prev, 2);
298 	r->code[prev] = op;
299 	set_jump_offset(r, prev + 1, prev);
300 }
301 
302 static void
exact_one_char(struct slre * r,int ch)303 exact_one_char(struct slre *r, int ch)
304 {
305 	emit(r, EXACT);
306 	emit(r, r->data_size);
307 	emit(r, 1);
308 	store_char_in_data(r, ch);
309 }
310 
311 static void
fixup_branch(struct slre * r,int fixup)312 fixup_branch(struct slre *r, int fixup)
313 {
314 	if (fixup > 0) {
315 		emit(r, END);
316 		set_jump_offset(r, fixup, fixup - 2);
317 	}
318 }
319 
320 static void
compile(struct slre * r,const char ** re)321 compile(struct slre *r, const char **re)
322 {
323 	int	op, esc, branch_start, last_op, fixup, cap_no, level;
324 
325 	fixup = 0;
326 	level = r->num_caps;
327 	branch_start = last_op = r->code_size;
328 
329 	for (;;)
330 		switch (*(*re)++) {
331 		case '\0':
332 			(*re)--;
333 			return;
334 			/* NOTREACHED */
335 			break;
336 		case '^':
337 			emit(r, BOL);
338 			break;
339 		case '$':
340 			emit(r, EOL);
341 			break;
342 		case '.':
343 			last_op = r->code_size;
344 			emit(r, ANY);
345 			break;
346 		case '[':
347 			last_op = r->code_size;
348 			anyof(r, re);
349 			break;
350 		case '\\':
351 			last_op = r->code_size;
352 			esc = get_escape_char(re);
353 			if (esc & 0xff00)
354 				emit(r, esc >> 8);
355 			else
356 				exact_one_char(r, esc);
357 			break;
358 		case '(':
359 			last_op = r->code_size;
360 			cap_no = ++r->num_caps;
361 			emit(r, OPEN);
362 			emit(r, cap_no);
363 
364 			compile(r, re);
365 			if (*(*re)++ != ')') {
366 				r->err_str = "No closing bracket";
367 				return;
368 			}
369 
370 			emit(r, CLOSE);
371 			emit(r, cap_no);
372 			break;
373 		case ')':
374 			(*re)--;
375 			fixup_branch(r, fixup);
376 			if (level == 0) {
377 				r->err_str = "Unbalanced brackets";
378 				return;
379 			}
380 			return;
381 			/* NOTREACHED */
382 			break;
383 		case '+':
384 		case '*':
385 			op = (*re)[-1] == '*' ? STAR : PLUS;
386 			if (**re == '?') {
387 				(*re)++;
388 				op = op == STAR ? STARQ : PLUSQ;
389 			}
390 			quantifier(r, last_op, op);
391 			break;
392 		case '?':
393 			quantifier(r, last_op, QUEST);
394 			break;
395 		case '|':
396 			fixup_branch(r, fixup);
397 			relocate(r, branch_start, 3);
398 			r->code[branch_start] = BRANCH;
399 			set_jump_offset(r, branch_start + 1, branch_start);
400 			fixup = branch_start + 2;
401 			r->code[fixup] = 0xff;
402 			break;
403 		default:
404 			(*re)--;
405 			last_op = r->code_size;
406 			exact(r, re);
407 			break;
408 		}
409 }
410 
411 int
slre_compile(struct slre * r,const char * re)412 slre_compile(struct slre *r, const char *re)
413 {
414 	r->err_str = NULL;
415 	r->code_size = r->data_size = r->num_caps = r->anchored = 0;
416 
417 	if (*re == '^')
418 		r->anchored++;
419 
420 	emit(r, OPEN);	/* This will capture what matches full RE */
421 	emit(r, 0);
422 
423 	while (*re != '\0')
424 		compile(r, &re);
425 
426 	if (r->code[2] == BRANCH)
427 		fixup_branch(r, 4);
428 
429 	emit(r, CLOSE);
430 	emit(r, 0);
431 	emit(r, END);
432 
433 	return (r->err_str == NULL ? 1 : 0);
434 }
435 
/*
 * Forward declaration: the loop_greedy()/loop_non_greedy() helpers below call
 * match(), and match() in turn calls them.
 */
static int match(const struct slre *, int,
		const char *, int, int *, struct cap *);
438 
439 static void
loop_greedy(const struct slre * r,int pc,const char * s,int len,int * ofs)440 loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
441 {
442 	int	saved_offset, matched_offset;
443 
444 	matched_offset = *ofs;
445 
446 	while (match(r, pc + 2, s, len, ofs, NULL)) {
447 		saved_offset = *ofs;
448 		if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
449 			matched_offset = saved_offset;
450 		*ofs = saved_offset;
451 	}
452 
453 	*ofs = matched_offset;
454 }
455 
456 static void
loop_non_greedy(const struct slre * r,int pc,const char * s,int len,int * ofs)457 loop_non_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
458 {
459 	int	saved_offset = *ofs;
460 
461 	while (match(r, pc + 2, s, len, ofs, NULL)) {
462 		saved_offset = *ofs;
463 		if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
464 			break;
465 	}
466 
467 	*ofs = saved_offset;
468 }
469 
/*
 * Test whether the character at s[*ofs] is a member of the set p[0..len-1].
 *
 * Returns 1 and advances *ofs by one when it is, 0 (with *ofs untouched)
 * otherwise. The caller must make sure that *ofs is inside the buffer.
 *
 * NOTE(review): set bytes are compared literally; the two-byte "0, opcode"
 * class entries that anyof() stores for \s, \S and \d are not interpreted
 * here - confirm whether that is intended.
 */
static int
is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
{
	int	i, ch;

	/*
	 * Compare as unsigned char: with a signed plain char, bytes >= 0x80
	 * would become negative and never equal an entry of the set.
	 */
	ch = (unsigned char) s[*ofs];

	for (i = 0; i < len; i++)
		if (p[i] == ch) {
			(*ofs)++;
			return 1;
		}

	return 0;
}
485 
/*
 * Test whether the character at s[*ofs] is NOT a member of the set
 * p[0..len-1].
 *
 * Returns 1 and advances *ofs by one when the character is outside the set,
 * 0 (with *ofs untouched) when it is a member. The caller must make sure
 * that *ofs is inside the buffer.
 */
static int
is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
{
	int	i, ch;

	/*
	 * Compare as unsigned char: with a signed plain char, bytes >= 0x80
	 * would become negative and could never be excluded by the set.
	 */
	ch = (unsigned char) s[*ofs];

	for (i = 0; i < len; i++) {
		if (p[i] == ch)
			return 0;
	}

	(*ofs)++;
	return 1;
}
501 
502 static int
match(const struct slre * r,int pc,const char * s,int len,int * ofs,struct cap * caps)503 match(const struct slre *r, int pc, const char *s, int len,
504 		int *ofs, struct cap *caps)
505 {
506 	int	n, saved_offset, res = 1;
507 
508 	while (res && r->code[pc] != END) {
509 
510 		assert(pc < r->code_size);
511 		assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
512 
513 		switch (r->code[pc]) {
514 		case BRANCH:
515 			saved_offset = *ofs;
516 			res = match(r, pc + 3, s, len, ofs, caps);
517 			if (res == 0) {
518 				*ofs = saved_offset;
519 				res = match(r, pc + r->code[pc + 1],
520 				    s, len, ofs, caps);
521 			}
522 			pc += r->code[pc + 2];
523 			break;
524 		case EXACT:
525 			res = 0;
526 			n = r->code[pc + 2];	/* String length */
527 			if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
528 			    r->code[pc + 1], n)) {
529 				(*ofs) += n;
530 				res = 1;
531 			}
532 			pc += 3;
533 			break;
534 		case QUEST:
535 			res = 1;
536 			saved_offset = *ofs;
537 			if (!match(r, pc + 2, s, len, ofs, caps))
538 				*ofs = saved_offset;
539 			pc += r->code[pc + 1];
540 			break;
541 		case STAR:
542 			res = 1;
543 			loop_greedy(r, pc, s, len, ofs);
544 			pc += r->code[pc + 1];
545 			break;
546 		case STARQ:
547 			res = 1;
548 			loop_non_greedy(r, pc, s, len, ofs);
549 			pc += r->code[pc + 1];
550 			break;
551 		case PLUS:
552 			res = match(r, pc + 2, s, len, ofs, caps);
553 			if (res == 0)
554 				break;
555 
556 			loop_greedy(r, pc, s, len, ofs);
557 			pc += r->code[pc + 1];
558 			break;
559 		case PLUSQ:
560 			res = match(r, pc + 2, s, len, ofs, caps);
561 			if (res == 0)
562 				break;
563 
564 			loop_non_greedy(r, pc, s, len, ofs);
565 			pc += r->code[pc + 1];
566 			break;
567 		case SPACE:
568 			res = 0;
569 			if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
570 				(*ofs)++;
571 				res = 1;
572 			}
573 			pc++;
574 			break;
575 		case NONSPACE:
576 			res = 0;
577 			if (*ofs < len &&
578 					!isspace(((unsigned char *)s)[*ofs])) {
579 				(*ofs)++;
580 				res = 1;
581 			}
582 			pc++;
583 			break;
584 		case DIGIT:
585 			res = 0;
586 			if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
587 				(*ofs)++;
588 				res = 1;
589 			}
590 			pc++;
591 			break;
592 		case ANY:
593 			res = 0;
594 			if (*ofs < len) {
595 				(*ofs)++;
596 				res = 1;
597 			}
598 			pc++;
599 			break;
600 		case ANYOF:
601 			res = 0;
602 			if (*ofs < len)
603 				res = is_any_of(r->data + r->code[pc + 1],
604 					r->code[pc + 2], s, ofs);
605 			pc += 3;
606 			break;
607 		case ANYBUT:
608 			res = 0;
609 			if (*ofs < len)
610 				res = is_any_but(r->data + r->code[pc + 1],
611 					r->code[pc + 2], s, ofs);
612 			pc += 3;
613 			break;
614 		case BOL:
615 			res = *ofs == 0 ? 1 : 0;
616 			pc++;
617 			break;
618 		case EOL:
619 			res = *ofs == len ? 1 : 0;
620 			pc++;
621 			break;
622 		case OPEN:
623 			if (caps != NULL)
624 				caps[r->code[pc + 1]].ptr = s + *ofs;
625 			pc += 2;
626 			break;
627 		case CLOSE:
628 			if (caps != NULL)
629 				caps[r->code[pc + 1]].len = (s + *ofs) -
630 				    caps[r->code[pc + 1]].ptr;
631 			pc += 2;
632 			break;
633 		case END:
634 			pc++;
635 			break;
636 		default:
637 			printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
638 			assert(0);
639 			break;
640 		}
641 	}
642 
643 	return res;
644 }
645 
646 int
slre_match(const struct slre * r,const char * buf,int len,struct cap * caps)647 slre_match(const struct slre *r, const char *buf, int len,
648 		struct cap *caps)
649 {
650 	int	i, ofs = 0, res = 0;
651 
652 	if (r->anchored) {
653 		res = match(r, 0, buf, len, &ofs, caps);
654 	} else {
655 		for (i = 0; i < len && res == 0; i++) {
656 			ofs = i;
657 			res = match(r, 0, buf, len, &ofs, caps);
658 		}
659 	}
660 
661 	return res;
662 }
663 
664 #ifdef SLRE_TEST
665 #define N_CAPS	5
666 
main(int argc,char * argv[])667 int main(int argc, char *argv[])
668 {
669 	struct slre	slre;
670 	struct cap	caps[N_CAPS];
671 	unsigned char	data[1 * 1024 * 1024];
672 	FILE		*fp;
673 	int		i, res, len;
674 
675 	if (argc < 2) {
676 		fprintf(stderr, "Usage: %s 'slre' <file>\n", argv[0]);
677 		return 1;
678 	}
679 
680 	fp = fopen(argv[2], "rb");
681 	if (fp == NULL) {
682 		fprintf(stderr, "Error: cannot open %s:%s\n",
683 			argv[2], strerror(errno));
684 		return 1;
685 	}
686 
687 	if (!slre_compile(&slre, argv[1])) {
688 		fprintf(stderr, "Error compiling slre: %s\n", slre.err_str);
689 		return 1;
690 	}
691 
692 	slre_dump(&slre, stderr);
693 
694 	while (fgets(data, sizeof(data), fp) != NULL) {
695 		len = strlen(data);
696 
697 		if ((len > 0) && (data[len-1] == '\n')) {
698 			data[len-1] = '\0';
699 			--len;
700 		}
701 
702 		printf("Data = \"%s\"\n", data);
703 
704 		(void) memset(caps, 0, sizeof(caps));
705 
706 		res = 0;
707 
708 		res = slre_match(&slre, data, len, caps);
709 		printf("Result [%d]: %d\n", i, res);
710 
711 		for (i = 0; i < N_CAPS; i++) {
712 			if (caps[i].len > 0) {
713 				printf("Substring %d: len=%d  [%.*s]\n", i,
714 					caps[i].len,
715 					caps[i].len, caps[i].ptr);
716 			}
717 		}
718 		printf("----------------------------------------------------\n");
719 	}
720 	(void) fclose(fp);
721 
722 	return 0;
723 }
724 #endif /* SLRE_TEST */
725