1 /**********************************************************************
2 regcomp.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2013 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * (C) Copyright 2015 Hewlett Packard Enterprise Development LP<BR>
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 #include "regparse.h"
33
34 OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
35
36 extern OnigCaseFoldType
onig_get_default_case_fold_flag(void)37 onig_get_default_case_fold_flag(void)
38 {
39 return OnigDefaultCaseFoldFlag;
40 }
41
42 extern int
onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)43 onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
44 {
45 OnigDefaultCaseFoldFlag = case_fold_flag;
46 return 0;
47 }
48
49
50 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
51 static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
52 #endif
53
54 static UChar*
str_dup(UChar * s,UChar * end)55 str_dup(UChar* s, UChar* end)
56 {
57 int len = (int)(end - s);
58
59 if (len > 0) {
60 UChar* r = (UChar* )xmalloc(len + 1);
61 CHECK_NULL_RETURN(r);
62 xmemcpy(r, s, len);
63 r[len] = (UChar )0;
64 return r;
65 }
66 else return NULL;
67 }
68
69 static void
swap_node(Node * a,Node * b)70 swap_node(Node* a, Node* b)
71 {
72 Node c;
73 CopyMem (&c, a, sizeof (Node));
74 CopyMem (a, b, sizeof (Node));
75 CopyMem (b, &c, sizeof (Node));
76
77 if (NTYPE(a) == NT_STR) {
78 StrNode* sn = NSTR(a);
79 if (sn->capa == 0) {
80 int len = (int)(sn->end - sn->s);
81 sn->s = sn->buf;
82 sn->end = sn->s + len;
83 }
84 }
85
86 if (NTYPE(b) == NT_STR) {
87 StrNode* sn = NSTR(b);
88 if (sn->capa == 0) {
89 int len = (int)(sn->end - sn->s);
90 sn->s = sn->buf;
91 sn->end = sn->s + len;
92 }
93 }
94 }
95
96 static OnigDistance
distance_add(OnigDistance d1,OnigDistance d2)97 distance_add(OnigDistance d1, OnigDistance d2)
98 {
99 if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE)
100 return ONIG_INFINITE_DISTANCE;
101 else {
102 if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2;
103 else return ONIG_INFINITE_DISTANCE;
104 }
105 }
106
107 static OnigDistance
distance_multiply(OnigDistance d,int m)108 distance_multiply(OnigDistance d, int m)
109 {
110 if (m == 0) return 0;
111
112 if (d < ONIG_INFINITE_DISTANCE / m)
113 return d * m;
114 else
115 return ONIG_INFINITE_DISTANCE;
116 }
117
118 static int
bitset_is_empty(BitSetRef bs)119 bitset_is_empty(BitSetRef bs)
120 {
121 int i;
122 for (i = 0; i < (int )BITSET_SIZE; i++) {
123 if (bs[i] != 0) return 0;
124 }
125 return 1;
126 }
127
128 #ifdef ONIG_DEBUG
129 static int
bitset_on_num(BitSetRef bs)130 bitset_on_num(BitSetRef bs)
131 {
132 int i, n;
133
134 n = 0;
135 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
136 if (BITSET_AT(bs, i)) n++;
137 }
138 return n;
139 }
140 #endif
141
142 extern int
onig_bbuf_init(BBuf * buf,int size)143 onig_bbuf_init(BBuf* buf, int size)
144 {
145 if (size <= 0) {
146 size = 0;
147 buf->p = NULL;
148 }
149 else {
150 buf->p = (UChar* )xmalloc(size);
151 if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
152 }
153
154 buf->alloc = size;
155 buf->used = 0;
156 return 0;
157 }
158
159
160 #ifdef USE_SUBEXP_CALL
161
162 static int
unset_addr_list_init(UnsetAddrList * uslist,int size)163 unset_addr_list_init(UnsetAddrList* uslist, int size)
164 {
165 UnsetAddr* p;
166
167 p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
168 CHECK_NULL_RETURN_MEMERR(p);
169 uslist->num = 0;
170 uslist->alloc = size;
171 uslist->us = p;
172 return 0;
173 }
174
175 static void
unset_addr_list_end(UnsetAddrList * uslist)176 unset_addr_list_end(UnsetAddrList* uslist)
177 {
178 if (IS_NOT_NULL(uslist->us))
179 xfree(uslist->us);
180 }
181
182 static int
unset_addr_list_add(UnsetAddrList * uslist,int offset,struct _Node * node)183 unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
184 {
185 UnsetAddr* p;
186 int size;
187
188 if (uslist->num >= uslist->alloc) {
189 size = uslist->alloc * 2;
190 p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size, sizeof(UnsetAddr) * uslist->alloc);
191 CHECK_NULL_RETURN_MEMERR(p);
192 uslist->alloc = size;
193 uslist->us = p;
194 }
195
196 uslist->us[uslist->num].offset = offset;
197 uslist->us[uslist->num].target = node;
198 uslist->num++;
199 return 0;
200 }
201 #endif /* USE_SUBEXP_CALL */
202
203
204 static int
add_opcode(regex_t * reg,int opcode)205 add_opcode(regex_t* reg, int opcode)
206 {
207 BBUF_ADD1(reg, ((unsigned char)opcode));
208 return 0;
209 }
210
211 #ifdef USE_COMBINATION_EXPLOSION_CHECK
212 static int
add_state_check_num(regex_t * reg,int num)213 add_state_check_num(regex_t* reg, int num)
214 {
215 StateCheckNumType n = (StateCheckNumType )num;
216
217 BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM);
218 return 0;
219 }
220 #endif
221
222 static int
add_rel_addr(regex_t * reg,int addr)223 add_rel_addr(regex_t* reg, int addr)
224 {
225 RelAddrType ra = (RelAddrType )addr;
226
227 BBUF_ADD(reg, &ra, SIZE_RELADDR);
228 return 0;
229 }
230
231 static int
add_abs_addr(regex_t * reg,int addr)232 add_abs_addr(regex_t* reg, int addr)
233 {
234 AbsAddrType ra = (AbsAddrType )addr;
235
236 BBUF_ADD(reg, &ra, SIZE_ABSADDR);
237 return 0;
238 }
239
240 static int
add_length(regex_t * reg,int len)241 add_length(regex_t* reg, int len)
242 {
243 LengthType l = (LengthType )len;
244
245 BBUF_ADD(reg, &l, SIZE_LENGTH);
246 return 0;
247 }
248
249 static int
add_mem_num(regex_t * reg,int num)250 add_mem_num(regex_t* reg, int num)
251 {
252 MemNumType n = (MemNumType )num;
253
254 BBUF_ADD(reg, &n, SIZE_MEMNUM);
255 return 0;
256 }
257
258 static int
add_pointer(regex_t * reg,void * addr)259 add_pointer(regex_t* reg, void* addr)
260 {
261 PointerType ptr = (PointerType )addr;
262
263 BBUF_ADD(reg, &ptr, SIZE_POINTER);
264 return 0;
265 }
266
267 static int
add_option(regex_t * reg,OnigOptionType option)268 add_option(regex_t* reg, OnigOptionType option)
269 {
270 BBUF_ADD(reg, &option, SIZE_OPTION);
271 return 0;
272 }
273
274 static int
add_opcode_rel_addr(regex_t * reg,int opcode,int addr)275 add_opcode_rel_addr(regex_t* reg, int opcode, int addr)
276 {
277 int r;
278
279 r = add_opcode(reg, opcode);
280 if (r) return r;
281 r = add_rel_addr(reg, addr);
282 return r;
283 }
284
285 static int
add_bytes(regex_t * reg,UChar * bytes,int len)286 add_bytes(regex_t* reg, UChar* bytes, int len)
287 {
288 BBUF_ADD(reg, bytes, len);
289 return 0;
290 }
291
292 static int
add_bitset(regex_t * reg,BitSetRef bs)293 add_bitset(regex_t* reg, BitSetRef bs)
294 {
295 BBUF_ADD(reg, bs, SIZE_BITSET);
296 return 0;
297 }
298
299 static int
add_opcode_option(regex_t * reg,int opcode,OnigOptionType option)300 add_opcode_option(regex_t* reg, int opcode, OnigOptionType option)
301 {
302 int r;
303
304 r = add_opcode(reg, opcode);
305 if (r) return r;
306 r = add_option(reg, option);
307 return r;
308 }
309
310 static int compile_length_tree(Node* node, regex_t* reg);
311 static int compile_tree(Node* node, regex_t* reg);
312
313
314 #define IS_NEED_STR_LEN_OP_EXACT(op) \
315 ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\
316 (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC)
317
318 static int
select_str_opcode(int mb_len,int str_len,int ignore_case)319 select_str_opcode(int mb_len, int str_len, int ignore_case)
320 {
321 int op;
322
323 if (ignore_case) {
324 switch (str_len) {
325 case 1: op = OP_EXACT1_IC; break;
326 default: op = OP_EXACTN_IC; break;
327 }
328 }
329 else {
330 switch (mb_len) {
331 case 1:
332 switch (str_len) {
333 case 1: op = OP_EXACT1; break;
334 case 2: op = OP_EXACT2; break;
335 case 3: op = OP_EXACT3; break;
336 case 4: op = OP_EXACT4; break;
337 case 5: op = OP_EXACT5; break;
338 default: op = OP_EXACTN; break;
339 }
340 break;
341
342 case 2:
343 switch (str_len) {
344 case 1: op = OP_EXACTMB2N1; break;
345 case 2: op = OP_EXACTMB2N2; break;
346 case 3: op = OP_EXACTMB2N3; break;
347 default: op = OP_EXACTMB2N; break;
348 }
349 break;
350
351 case 3:
352 op = OP_EXACTMB3N;
353 break;
354
355 default:
356 op = OP_EXACTMBN;
357 break;
358 }
359 }
360 return op;
361 }
362
363 static int
compile_tree_empty_check(Node * node,regex_t * reg,int empty_info)364 compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
365 {
366 int r;
367 int saved_num_null_check = reg->num_null_check;
368
369 if (empty_info != 0) {
370 r = add_opcode(reg, OP_NULL_CHECK_START);
371 if (r) return r;
372 r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
373 if (r) return r;
374 reg->num_null_check++;
375 }
376
377 r = compile_tree(node, reg);
378 if (r) return r;
379
380 if (empty_info != 0) {
381 if (empty_info == NQ_TARGET_IS_EMPTY)
382 r = add_opcode(reg, OP_NULL_CHECK_END);
383 else if (empty_info == NQ_TARGET_IS_EMPTY_MEM)
384 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST);
385 else if (empty_info == NQ_TARGET_IS_EMPTY_REC)
386 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH);
387
388 if (r) return r;
389 r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */
390 }
391 return r;
392 }
393
394 #ifdef USE_SUBEXP_CALL
395 static int
compile_call(CallNode * node,regex_t * reg)396 compile_call(CallNode* node, regex_t* reg)
397 {
398 int r;
399
400 r = add_opcode(reg, OP_CALL);
401 if (r) return r;
402 r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
403 node->target);
404 if (r) return r;
405 r = add_abs_addr(reg, 0 /*dummy addr.*/);
406 return r;
407 }
408 #endif
409
410 static int
compile_tree_n_times(Node * node,int n,regex_t * reg)411 compile_tree_n_times(Node* node, int n, regex_t* reg)
412 {
413 int i, r;
414
415 for (i = 0; i < n; i++) {
416 r = compile_tree(node, reg);
417 if (r) return r;
418 }
419 return 0;
420 }
421
422 static int
add_compile_string_length(UChar * s ARG_UNUSED,int mb_len,int str_len,regex_t * reg ARG_UNUSED,int ignore_case)423 add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, int str_len,
424 regex_t* reg ARG_UNUSED, int ignore_case)
425 {
426 int len;
427 int op = select_str_opcode(mb_len, str_len, ignore_case);
428
429 len = SIZE_OPCODE;
430
431 if (op == OP_EXACTMBN) len += SIZE_LENGTH;
432 if (IS_NEED_STR_LEN_OP_EXACT(op))
433 len += SIZE_LENGTH;
434
435 len += mb_len * str_len;
436 return len;
437 }
438
439 static int
add_compile_string(UChar * s,int mb_len,int str_len,regex_t * reg,int ignore_case)440 add_compile_string(UChar* s, int mb_len, int str_len,
441 regex_t* reg, int ignore_case)
442 {
443 int op = select_str_opcode(mb_len, str_len, ignore_case);
444 add_opcode(reg, op);
445
446 if (op == OP_EXACTMBN)
447 add_length(reg, mb_len);
448
449 if (IS_NEED_STR_LEN_OP_EXACT(op)) {
450 if (op == OP_EXACTN_IC)
451 add_length(reg, mb_len * str_len);
452 else
453 add_length(reg, str_len);
454 }
455
456 add_bytes(reg, s, mb_len * str_len);
457 return 0;
458 }
459
460
461 static int
compile_length_string_node(Node * node,regex_t * reg)462 compile_length_string_node(Node* node, regex_t* reg)
463 {
464 int rlen, r, len, prev_len, slen, ambig;
465 OnigEncoding enc = reg->enc;
466 UChar *p, *prev;
467 StrNode* sn;
468
469 sn = NSTR(node);
470 if (sn->end <= sn->s)
471 return 0;
472
473 ambig = NSTRING_IS_AMBIG(node);
474
475 p = prev = sn->s;
476 prev_len = enclen(enc, p);
477 p += prev_len;
478 slen = 1;
479 rlen = 0;
480
481 for (; p < sn->end; ) {
482 len = enclen(enc, p);
483 if (len == prev_len) {
484 slen++;
485 }
486 else {
487 r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
488 rlen += r;
489 prev = p;
490 slen = 1;
491 prev_len = len;
492 }
493 p += len;
494 }
495 r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
496 rlen += r;
497 return rlen;
498 }
499
500 static int
compile_length_string_raw_node(StrNode * sn,regex_t * reg)501 compile_length_string_raw_node(StrNode* sn, regex_t* reg)
502 {
503 if (sn->end <= sn->s)
504 return 0;
505
506 return add_compile_string_length(sn->s, 1 /* sb */, (int)(sn->end - sn->s), reg, 0);
507 }
508
509 static int
compile_string_node(Node * node,regex_t * reg)510 compile_string_node(Node* node, regex_t* reg)
511 {
512 int r, len, prev_len, slen, ambig;
513 OnigEncoding enc = reg->enc;
514 UChar *p, *prev, *end;
515 StrNode* sn;
516
517 sn = NSTR(node);
518 if (sn->end <= sn->s)
519 return 0;
520
521 end = sn->end;
522 ambig = NSTRING_IS_AMBIG(node);
523
524 p = prev = sn->s;
525 prev_len = enclen(enc, p);
526 p += prev_len;
527 slen = 1;
528
529 for (; p < end; ) {
530 len = enclen(enc, p);
531 if (len == prev_len) {
532 slen++;
533 }
534 else {
535 r = add_compile_string(prev, prev_len, slen, reg, ambig);
536 if (r) return r;
537
538 prev = p;
539 slen = 1;
540 prev_len = len;
541 }
542
543 p += len;
544 }
545 return add_compile_string(prev, prev_len, slen, reg, ambig);
546 }
547
548 static int
compile_string_raw_node(StrNode * sn,regex_t * reg)549 compile_string_raw_node(StrNode* sn, regex_t* reg)
550 {
551 if (sn->end <= sn->s)
552 return 0;
553
554 return add_compile_string(sn->s, 1 /* sb */, (int)(sn->end - sn->s), reg, 0);
555 }
556
557 static int
add_multi_byte_cclass(BBuf * mbuf,regex_t * reg)558 add_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
559 {
560 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
561 add_length(reg, mbuf->used);
562 return add_bytes(reg, mbuf->p, mbuf->used);
563 #else
564 int r, pad_size;
565 UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH;
566
567 GET_ALIGNMENT_PAD_SIZE(p, pad_size);
568 add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1));
569 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
570
571 r = add_bytes(reg, mbuf->p, mbuf->used);
572
573 /* padding for return value from compile_length_cclass_node() to be fix. */
574 pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size;
575 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
576 return r;
577 #endif
578 }
579
580 static int
compile_length_cclass_node(CClassNode * cc,regex_t * reg)581 compile_length_cclass_node(CClassNode* cc, regex_t* reg)
582 {
583 int len;
584
585 if (IS_NCCLASS_SHARE(cc)) {
586 len = SIZE_OPCODE + SIZE_POINTER;
587 return len;
588 }
589
590 if (IS_NULL(cc->mbuf)) {
591 len = SIZE_OPCODE + SIZE_BITSET;
592 }
593 else {
594 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
595 len = SIZE_OPCODE;
596 }
597 else {
598 len = SIZE_OPCODE + SIZE_BITSET;
599 }
600 #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
601 len += SIZE_LENGTH + cc->mbuf->used;
602 #else
603 len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
604 #endif
605 }
606
607 return len;
608 }
609
610 static int
compile_cclass_node(CClassNode * cc,regex_t * reg)611 compile_cclass_node(CClassNode* cc, regex_t* reg)
612 {
613 int r;
614
615 if (IS_NCCLASS_SHARE(cc)) {
616 add_opcode(reg, OP_CCLASS_NODE);
617 r = add_pointer(reg, cc);
618 return r;
619 }
620
621 if (IS_NULL(cc->mbuf)) {
622 if (IS_NCCLASS_NOT(cc))
623 add_opcode(reg, OP_CCLASS_NOT);
624 else
625 add_opcode(reg, OP_CCLASS);
626
627 r = add_bitset(reg, cc->bs);
628 }
629 else {
630 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
631 if (IS_NCCLASS_NOT(cc))
632 add_opcode(reg, OP_CCLASS_MB_NOT);
633 else
634 add_opcode(reg, OP_CCLASS_MB);
635
636 r = add_multi_byte_cclass(cc->mbuf, reg);
637 }
638 else {
639 if (IS_NCCLASS_NOT(cc))
640 add_opcode(reg, OP_CCLASS_MIX_NOT);
641 else
642 add_opcode(reg, OP_CCLASS_MIX);
643
644 r = add_bitset(reg, cc->bs);
645 if (r) return r;
646 r = add_multi_byte_cclass(cc->mbuf, reg);
647 }
648 }
649
650 return r;
651 }
652
653 static int
entry_repeat_range(regex_t * reg,int id,int lower,int upper)654 entry_repeat_range(regex_t* reg, int id, int lower, int upper)
655 {
656 #define REPEAT_RANGE_ALLOC 4
657
658 OnigRepeatRange* p;
659
660 if (reg->repeat_range_alloc == 0) {
661 p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
662 CHECK_NULL_RETURN_MEMERR(p);
663 reg->repeat_range = p;
664 reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
665 }
666 else if (reg->repeat_range_alloc <= id) {
667 int n;
668 n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
669 p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
670 sizeof(OnigRepeatRange) * n,
671 sizeof(OnigRepeatRange) * reg->repeat_range_alloc);
672 CHECK_NULL_RETURN_MEMERR(p);
673 reg->repeat_range = p;
674 reg->repeat_range_alloc = n;
675 }
676 else {
677 p = reg->repeat_range;
678 }
679
680 p[id].lower = lower;
681 p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper);
682 return 0;
683 }
684
685 static int
compile_range_repeat_node(QtfrNode * qn,int target_len,int empty_info,regex_t * reg)686 compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info,
687 regex_t* reg)
688 {
689 int r;
690 int num_repeat = reg->num_repeat;
691
692 r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
693 if (r) return r;
694 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
695 reg->num_repeat++;
696 if (r) return r;
697 r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
698 if (r) return r;
699
700 r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
701 if (r) return r;
702
703 r = compile_tree_empty_check(qn->target, reg, empty_info);
704 if (r) return r;
705
706 if (
707 #ifdef USE_SUBEXP_CALL
708 reg->num_call > 0 ||
709 #endif
710 IS_QUANTIFIER_IN_REPEAT(qn)) {
711 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
712 }
713 else {
714 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
715 }
716 if (r) return r;
717 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
718 return r;
719 }
720
721 static int
is_anychar_star_quantifier(QtfrNode * qn)722 is_anychar_star_quantifier(QtfrNode* qn)
723 {
724 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) &&
725 NTYPE(qn->target) == NT_CANY)
726 return 1;
727 else
728 return 0;
729 }
730
731 #define QUANTIFIER_EXPAND_LIMIT_SIZE 50
732 #define CKN_ON (ckn > 0)
733
734 #ifdef USE_COMBINATION_EXPLOSION_CHECK
735
736 static int
compile_length_quantifier_node(QtfrNode * qn,regex_t * reg)737 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
738 {
739 int len, mod_tlen, cklen;
740 int ckn;
741 int infinite = IS_REPEAT_INFINITE(qn->upper);
742 int empty_info = qn->target_empty_info;
743 int tlen = compile_length_tree(qn->target, reg);
744
745 if (tlen < 0) return tlen;
746
747 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
748
749 cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0);
750
751 /* anychar repeat */
752 if (NTYPE(qn->target) == NT_CANY) {
753 if (qn->greedy && infinite) {
754 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON)
755 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen;
756 else
757 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen;
758 }
759 }
760
761 if (empty_info != 0)
762 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
763 else
764 mod_tlen = tlen;
765
766 if (infinite && qn->lower <= 1) {
767 if (qn->greedy) {
768 if (qn->lower == 1)
769 len = SIZE_OP_JUMP;
770 else
771 len = 0;
772
773 len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP;
774 }
775 else {
776 if (qn->lower == 0)
777 len = SIZE_OP_JUMP;
778 else
779 len = 0;
780
781 len += mod_tlen + SIZE_OP_PUSH + cklen;
782 }
783 }
784 else if (qn->upper == 0) {
785 if (qn->is_refered != 0) /* /(?<n>..){0}/ */
786 len = SIZE_OP_JUMP + tlen;
787 else
788 len = 0;
789 }
790 else if (qn->upper == 1 && qn->greedy) {
791 if (qn->lower == 0) {
792 if (CKN_ON) {
793 len = SIZE_OP_STATE_CHECK_PUSH + tlen;
794 }
795 else {
796 len = SIZE_OP_PUSH + tlen;
797 }
798 }
799 else {
800 len = tlen;
801 }
802 }
803 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
804 len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen;
805 }
806 else {
807 len = SIZE_OP_REPEAT_INC
808 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
809 if (CKN_ON)
810 len += SIZE_OP_STATE_CHECK;
811 }
812
813 return len;
814 }
815
816 static int
compile_quantifier_node(QtfrNode * qn,regex_t * reg)817 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
818 {
819 int r, mod_tlen;
820 int ckn;
821 int infinite = IS_REPEAT_INFINITE(qn->upper);
822 int empty_info = qn->target_empty_info;
823 int tlen = compile_length_tree(qn->target, reg);
824
825 if (tlen < 0) return tlen;
826
827 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
828
829 if (is_anychar_star_quantifier(qn)) {
830 r = compile_tree_n_times(qn->target, qn->lower, reg);
831 if (r) return r;
832 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) {
833 if (IS_MULTILINE(reg->options))
834 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
835 else
836 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
837 if (r) return r;
838 if (CKN_ON) {
839 r = add_state_check_num(reg, ckn);
840 if (r) return r;
841 }
842
843 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
844 }
845 else {
846 if (IS_MULTILINE(reg->options)) {
847 r = add_opcode(reg, (CKN_ON ?
848 OP_STATE_CHECK_ANYCHAR_ML_STAR
849 : OP_ANYCHAR_ML_STAR));
850 }
851 else {
852 r = add_opcode(reg, (CKN_ON ?
853 OP_STATE_CHECK_ANYCHAR_STAR
854 : OP_ANYCHAR_STAR));
855 }
856 if (r) return r;
857 if (CKN_ON)
858 r = add_state_check_num(reg, ckn);
859
860 return r;
861 }
862 }
863
864 if (empty_info != 0)
865 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
866 else
867 mod_tlen = tlen;
868
869 if (infinite && qn->lower <= 1) {
870 if (qn->greedy) {
871 if (qn->lower == 1) {
872 r = add_opcode_rel_addr(reg, OP_JUMP,
873 (CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH));
874 if (r) return r;
875 }
876
877 if (CKN_ON) {
878 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
879 if (r) return r;
880 r = add_state_check_num(reg, ckn);
881 if (r) return r;
882 r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP);
883 }
884 else {
885 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
886 }
887 if (r) return r;
888 r = compile_tree_empty_check(qn->target, reg, empty_info);
889 if (r) return r;
890 r = add_opcode_rel_addr(reg, OP_JUMP,
891 -(mod_tlen + (int )SIZE_OP_JUMP
892 + (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)));
893 }
894 else {
895 if (qn->lower == 0) {
896 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
897 if (r) return r;
898 }
899 r = compile_tree_empty_check(qn->target, reg, empty_info);
900 if (r) return r;
901 if (CKN_ON) {
902 r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP);
903 if (r) return r;
904 r = add_state_check_num(reg, ckn);
905 if (r) return r;
906 r = add_rel_addr(reg,
907 -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP));
908 }
909 else
910 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
911 }
912 }
913 else if (qn->upper == 0) {
914 if (qn->is_refered != 0) { /* /(?<n>..){0}/ */
915 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
916 if (r) return r;
917 r = compile_tree(qn->target, reg);
918 }
919 else
920 r = 0;
921 }
922 else if (qn->upper == 1 && qn->greedy) {
923 if (qn->lower == 0) {
924 if (CKN_ON) {
925 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
926 if (r) return r;
927 r = add_state_check_num(reg, ckn);
928 if (r) return r;
929 r = add_rel_addr(reg, tlen);
930 }
931 else {
932 r = add_opcode_rel_addr(reg, OP_PUSH, tlen);
933 }
934 if (r) return r;
935 }
936
937 r = compile_tree(qn->target, reg);
938 }
939 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
940 if (CKN_ON) {
941 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
942 if (r) return r;
943 r = add_state_check_num(reg, ckn);
944 if (r) return r;
945 r = add_rel_addr(reg, SIZE_OP_JUMP);
946 }
947 else {
948 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
949 }
950
951 if (r) return r;
952 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
953 if (r) return r;
954 r = compile_tree(qn->target, reg);
955 }
956 else {
957 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
958 if (CKN_ON) {
959 if (r) return r;
960 r = add_opcode(reg, OP_STATE_CHECK);
961 if (r) return r;
962 r = add_state_check_num(reg, ckn);
963 }
964 }
965 return r;
966 }
967
968 #else /* USE_COMBINATION_EXPLOSION_CHECK */
969
970 static int
compile_length_quantifier_node(QtfrNode * qn,regex_t * reg)971 compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
972 {
973 int len, mod_tlen;
974 int infinite = IS_REPEAT_INFINITE(qn->upper);
975 int empty_info = qn->target_empty_info;
976 int tlen = compile_length_tree(qn->target, reg);
977
978 if (tlen < 0) return tlen;
979
980 /* anychar repeat */
981 if (NTYPE(qn->target) == NT_CANY) {
982 if (qn->greedy && infinite) {
983 if (IS_NOT_NULL(qn->next_head_exact))
984 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
985 else
986 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
987 }
988 }
989
990 if (empty_info != 0)
991 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
992 else
993 mod_tlen = tlen;
994
995 if (infinite &&
996 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
997 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
998 len = SIZE_OP_JUMP;
999 }
1000 else {
1001 len = tlen * qn->lower;
1002 }
1003
1004 if (qn->greedy) {
1005 if (IS_NOT_NULL(qn->head_exact))
1006 len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
1007 else if (IS_NOT_NULL(qn->next_head_exact))
1008 len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
1009 else
1010 len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
1011 }
1012 else
1013 len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
1014 }
1015 else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1016 len = SIZE_OP_JUMP + tlen;
1017 }
1018 else if (!infinite && qn->greedy &&
1019 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1020 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1021 len = tlen * qn->lower;
1022 len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
1023 }
1024 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1025 len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
1026 }
1027 else {
1028 len = SIZE_OP_REPEAT_INC
1029 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
1030 }
1031
1032 return len;
1033 }
1034
1035 static int
compile_quantifier_node(QtfrNode * qn,regex_t * reg)1036 compile_quantifier_node(QtfrNode* qn, regex_t* reg)
1037 {
1038 int i, r, mod_tlen;
1039 int infinite = IS_REPEAT_INFINITE(qn->upper);
1040 int empty_info = qn->target_empty_info;
1041 int tlen = compile_length_tree(qn->target, reg);
1042
1043 if (tlen < 0) return tlen;
1044
1045 if (is_anychar_star_quantifier(qn)) {
1046 r = compile_tree_n_times(qn->target, qn->lower, reg);
1047 if (r) return r;
1048 if (IS_NOT_NULL(qn->next_head_exact)) {
1049 if (IS_MULTILINE(reg->options))
1050 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
1051 else
1052 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
1053 if (r) return r;
1054 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1055 }
1056 else {
1057 if (IS_MULTILINE(reg->options))
1058 return add_opcode(reg, OP_ANYCHAR_ML_STAR);
1059 else
1060 return add_opcode(reg, OP_ANYCHAR_STAR);
1061 }
1062 }
1063
1064 if (empty_info != 0)
1065 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
1066 else
1067 mod_tlen = tlen;
1068
1069 if (infinite &&
1070 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1071 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1072 if (qn->greedy) {
1073 if (IS_NOT_NULL(qn->head_exact))
1074 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1);
1075 else if (IS_NOT_NULL(qn->next_head_exact))
1076 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT);
1077 else
1078 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH);
1079 }
1080 else {
1081 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP);
1082 }
1083 if (r) return r;
1084 }
1085 else {
1086 r = compile_tree_n_times(qn->target, qn->lower, reg);
1087 if (r) return r;
1088 }
1089
1090 if (qn->greedy) {
1091 if (IS_NOT_NULL(qn->head_exact)) {
1092 r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
1093 mod_tlen + SIZE_OP_JUMP);
1094 if (r) return r;
1095 add_bytes(reg, NSTR(qn->head_exact)->s, 1);
1096 r = compile_tree_empty_check(qn->target, reg, empty_info);
1097 if (r) return r;
1098 r = add_opcode_rel_addr(reg, OP_JUMP,
1099 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
1100 }
1101 else if (IS_NOT_NULL(qn->next_head_exact)) {
1102 r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
1103 mod_tlen + SIZE_OP_JUMP);
1104 if (r) return r;
1105 add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1106 r = compile_tree_empty_check(qn->target, reg, empty_info);
1107 if (r) return r;
1108 r = add_opcode_rel_addr(reg, OP_JUMP,
1109 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
1110 }
1111 else {
1112 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
1113 if (r) return r;
1114 r = compile_tree_empty_check(qn->target, reg, empty_info);
1115 if (r) return r;
1116 r = add_opcode_rel_addr(reg, OP_JUMP,
1117 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
1118 }
1119 }
1120 else {
1121 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
1122 if (r) return r;
1123 r = compile_tree_empty_check(qn->target, reg, empty_info);
1124 if (r) return r;
1125 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
1126 }
1127 }
1128 else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
1129 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1130 if (r) return r;
1131 r = compile_tree(qn->target, reg);
1132 }
1133 else if (!infinite && qn->greedy &&
1134 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1135 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1136 int n = qn->upper - qn->lower;
1137
1138 r = compile_tree_n_times(qn->target, qn->lower, reg);
1139 if (r) return r;
1140
1141 for (i = 0; i < n; i++) {
1142 r = add_opcode_rel_addr(reg, OP_PUSH,
1143 (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
1144 if (r) return r;
1145 r = compile_tree(qn->target, reg);
1146 if (r) return r;
1147 }
1148 }
1149 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1150 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
1151 if (r) return r;
1152 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1153 if (r) return r;
1154 r = compile_tree(qn->target, reg);
1155 }
1156 else {
1157 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
1158 }
1159 return r;
1160 }
1161 #endif /* USE_COMBINATION_EXPLOSION_CHECK */
1162
1163 static int
compile_length_option_node(EncloseNode * node,regex_t * reg)1164 compile_length_option_node(EncloseNode* node, regex_t* reg)
1165 {
1166 int tlen;
1167 OnigOptionType prev = reg->options;
1168
1169 reg->options = node->option;
1170 tlen = compile_length_tree(node->target, reg);
1171 reg->options = prev;
1172
1173 if (tlen < 0) return tlen;
1174
1175 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1176 return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL
1177 + tlen + SIZE_OP_SET_OPTION;
1178 }
1179 else
1180 return tlen;
1181 }
1182
1183 static int
compile_option_node(EncloseNode * node,regex_t * reg)1184 compile_option_node(EncloseNode* node, regex_t* reg)
1185 {
1186 int r;
1187 OnigOptionType prev = reg->options;
1188
1189 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1190 r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option);
1191 if (r) return r;
1192 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1193 if (r) return r;
1194 r = add_opcode(reg, OP_FAIL);
1195 if (r) return r;
1196 }
1197
1198 reg->options = node->option;
1199 r = compile_tree(node->target, reg);
1200 reg->options = prev;
1201
1202 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1203 if (r) return r;
1204 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1205 }
1206 return r;
1207 }
1208
1209 static int
compile_length_enclose_node(EncloseNode * node,regex_t * reg)1210 compile_length_enclose_node(EncloseNode* node, regex_t* reg)
1211 {
1212 int len;
1213 int tlen;
1214 QtfrNode* qn;
1215
1216 qn = NULL;
1217
1218 if (node->type == ENCLOSE_OPTION)
1219 return compile_length_option_node(node, reg);
1220
1221 if (node->target) {
1222 tlen = compile_length_tree(node->target, reg);
1223 if (tlen < 0) return tlen;
1224 }
1225 else
1226 tlen = 0;
1227
1228 switch (node->type) {
1229 case ENCLOSE_MEMORY:
1230 #ifdef USE_SUBEXP_CALL
1231 if (IS_ENCLOSE_CALLED(node)) {
1232 len = SIZE_OP_MEMORY_START_PUSH + tlen
1233 + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
1234 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1235 len += (IS_ENCLOSE_RECURSION(node)
1236 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1237 else
1238 len += (IS_ENCLOSE_RECURSION(node)
1239 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1240 }
1241 else
1242 #endif
1243 {
1244 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1245 len = SIZE_OP_MEMORY_START_PUSH;
1246 else
1247 len = SIZE_OP_MEMORY_START;
1248
1249 len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1250 ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
1251 }
1252 break;
1253
1254 case ENCLOSE_STOP_BACKTRACK:
1255 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1256 if (node->target == NULL) {
1257 CHECK_NULL_RETURN_MEMERR(node->target);
1258 }
1259 qn = NQTFR(node->target);
1260 tlen = compile_length_tree(qn->target, reg);
1261 if (tlen < 0) return tlen;
1262
1263 len = tlen * qn->lower
1264 + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
1265 }
1266 else {
1267 len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
1268 }
1269 break;
1270
1271 default:
1272 return ONIGERR_TYPE_BUG;
1273 break;
1274 }
1275
1276 return len;
1277 }
1278
1279 static int get_char_length_tree(Node* node, regex_t* reg, int* len);
1280
1281 static int
compile_enclose_node(EncloseNode * node,regex_t * reg)1282 compile_enclose_node(EncloseNode* node, regex_t* reg)
1283 {
1284 int r, len;
1285
1286 if (node->type == ENCLOSE_OPTION)
1287 return compile_option_node(node, reg);
1288
1289 switch (node->type) {
1290 case ENCLOSE_MEMORY:
1291 #ifdef USE_SUBEXP_CALL
1292 if (IS_ENCLOSE_CALLED(node)) {
1293 r = add_opcode(reg, OP_CALL);
1294 if (r) return r;
1295 node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP;
1296 node->state |= NST_ADDR_FIXED;
1297 r = add_abs_addr(reg, (int )node->call_addr);
1298 if (r) return r;
1299 len = compile_length_tree(node->target, reg);
1300 len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
1301 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1302 len += (IS_ENCLOSE_RECURSION(node)
1303 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1304 else
1305 len += (IS_ENCLOSE_RECURSION(node)
1306 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1307
1308 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1309 if (r) return r;
1310 }
1311 #endif
1312 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1313 r = add_opcode(reg, OP_MEMORY_START_PUSH);
1314 else
1315 r = add_opcode(reg, OP_MEMORY_START);
1316 if (r) return r;
1317 r = add_mem_num(reg, node->regnum);
1318 if (r) return r;
1319 r = compile_tree(node->target, reg);
1320 if (r) return r;
1321 #ifdef USE_SUBEXP_CALL
1322 if (IS_ENCLOSE_CALLED(node)) {
1323 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1324 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1325 ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
1326 else
1327 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1328 ? OP_MEMORY_END_REC : OP_MEMORY_END));
1329
1330 if (r) return r;
1331 r = add_mem_num(reg, node->regnum);
1332 if (r) return r;
1333 r = add_opcode(reg, OP_RETURN);
1334 }
1335 else
1336 #endif
1337 {
1338 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1339 r = add_opcode(reg, OP_MEMORY_END_PUSH);
1340 else
1341 r = add_opcode(reg, OP_MEMORY_END);
1342 if (r) return r;
1343 r = add_mem_num(reg, node->regnum);
1344 }
1345 break;
1346
1347 case ENCLOSE_STOP_BACKTRACK:
1348 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1349 QtfrNode* qn = NQTFR(node->target);
1350 r = compile_tree_n_times(qn->target, qn->lower, reg);
1351 if (r) return r;
1352
1353 len = compile_length_tree(qn->target, reg);
1354 if (len < 0) return len;
1355
1356 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP);
1357 if (r) return r;
1358 r = compile_tree(qn->target, reg);
1359 if (r) return r;
1360 r = add_opcode(reg, OP_POP);
1361 if (r) return r;
1362 r = add_opcode_rel_addr(reg, OP_JUMP,
1363 -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
1364 }
1365 else {
1366 r = add_opcode(reg, OP_PUSH_STOP_BT);
1367 if (r) return r;
1368 r = compile_tree(node->target, reg);
1369 if (r) return r;
1370 r = add_opcode(reg, OP_POP_STOP_BT);
1371 }
1372 break;
1373
1374 default:
1375 return ONIGERR_TYPE_BUG;
1376 break;
1377 }
1378
1379 return r;
1380 }
1381
1382 static int
compile_length_anchor_node(AnchorNode * node,regex_t * reg)1383 compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1384 {
1385 int len;
1386 int tlen = 0;
1387
1388 if (node->target) {
1389 tlen = compile_length_tree(node->target, reg);
1390 if (tlen < 0) return tlen;
1391 }
1392
1393 switch (node->type) {
1394 case ANCHOR_PREC_READ:
1395 len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS;
1396 break;
1397 case ANCHOR_PREC_READ_NOT:
1398 len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS;
1399 break;
1400 case ANCHOR_LOOK_BEHIND:
1401 len = SIZE_OP_LOOK_BEHIND + tlen;
1402 break;
1403 case ANCHOR_LOOK_BEHIND_NOT:
1404 len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT;
1405 break;
1406
1407 default:
1408 len = SIZE_OPCODE;
1409 break;
1410 }
1411
1412 return len;
1413 }
1414
1415 static int
compile_anchor_node(AnchorNode * node,regex_t * reg)1416 compile_anchor_node(AnchorNode* node, regex_t* reg)
1417 {
1418 int r, len;
1419
1420 switch (node->type) {
1421 case ANCHOR_BEGIN_BUF: r = add_opcode(reg, OP_BEGIN_BUF); break;
1422 case ANCHOR_END_BUF: r = add_opcode(reg, OP_END_BUF); break;
1423 case ANCHOR_BEGIN_LINE: r = add_opcode(reg, OP_BEGIN_LINE); break;
1424 case ANCHOR_END_LINE: r = add_opcode(reg, OP_END_LINE); break;
1425 case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break;
1426 case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;
1427
1428 case ANCHOR_WORD_BOUND: r = add_opcode(reg, OP_WORD_BOUND); break;
1429 case ANCHOR_NOT_WORD_BOUND: r = add_opcode(reg, OP_NOT_WORD_BOUND); break;
1430 #ifdef USE_WORD_BEGIN_END
1431 case ANCHOR_WORD_BEGIN: r = add_opcode(reg, OP_WORD_BEGIN); break;
1432 case ANCHOR_WORD_END: r = add_opcode(reg, OP_WORD_END); break;
1433 #endif
1434
1435 case ANCHOR_PREC_READ:
1436 r = add_opcode(reg, OP_PUSH_POS);
1437 if (r) return r;
1438 r = compile_tree(node->target, reg);
1439 if (r) return r;
1440 r = add_opcode(reg, OP_POP_POS);
1441 break;
1442
1443 case ANCHOR_PREC_READ_NOT:
1444 len = compile_length_tree(node->target, reg);
1445 if (len < 0) return len;
1446 r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS);
1447 if (r) return r;
1448 r = compile_tree(node->target, reg);
1449 if (r) return r;
1450 r = add_opcode(reg, OP_FAIL_POS);
1451 break;
1452
1453 case ANCHOR_LOOK_BEHIND:
1454 {
1455 int n;
1456 r = add_opcode(reg, OP_LOOK_BEHIND);
1457 if (r) return r;
1458 if (node->char_len < 0) {
1459 r = get_char_length_tree(node->target, reg, &n);
1460 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1461 }
1462 else
1463 n = node->char_len;
1464 r = add_length(reg, n);
1465 if (r) return r;
1466 r = compile_tree(node->target, reg);
1467 }
1468 break;
1469
1470 case ANCHOR_LOOK_BEHIND_NOT:
1471 {
1472 int n;
1473 len = compile_length_tree(node->target, reg);
1474 r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT,
1475 len + SIZE_OP_FAIL_LOOK_BEHIND_NOT);
1476 if (r) return r;
1477 if (node->char_len < 0) {
1478 r = get_char_length_tree(node->target, reg, &n);
1479 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1480 }
1481 else
1482 n = node->char_len;
1483 r = add_length(reg, n);
1484 if (r) return r;
1485 r = compile_tree(node->target, reg);
1486 if (r) return r;
1487 r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT);
1488 }
1489 break;
1490
1491 default:
1492 return ONIGERR_TYPE_BUG;
1493 break;
1494 }
1495
1496 return r;
1497 }
1498
1499 static int
compile_length_tree(Node * node,regex_t * reg)1500 compile_length_tree(Node* node, regex_t* reg)
1501 {
1502 int len, type, r;
1503
1504 type = NTYPE(node);
1505 switch (type) {
1506 case NT_LIST:
1507 len = 0;
1508 do {
1509 r = compile_length_tree(NCAR(node), reg);
1510 if (r < 0) return r;
1511 len += r;
1512 } while (IS_NOT_NULL(node = NCDR(node)));
1513 r = len;
1514 break;
1515
1516 case NT_ALT:
1517 {
1518 int n;
1519
1520 n = r = 0;
1521 do {
1522 r += compile_length_tree(NCAR(node), reg);
1523 n++;
1524 } while (IS_NOT_NULL(node = NCDR(node)));
1525 r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1);
1526 }
1527 break;
1528
1529 case NT_STR:
1530 if (NSTRING_IS_RAW(node))
1531 r = compile_length_string_raw_node(NSTR(node), reg);
1532 else
1533 r = compile_length_string_node(node, reg);
1534 break;
1535
1536 case NT_CCLASS:
1537 r = compile_length_cclass_node(NCCLASS(node), reg);
1538 break;
1539
1540 case NT_CTYPE:
1541 case NT_CANY:
1542 r = SIZE_OPCODE;
1543 break;
1544
1545 case NT_BREF:
1546 {
1547 BRefNode* br = NBREF(node);
1548
1549 #ifdef USE_BACKREF_WITH_LEVEL
1550 if (IS_BACKREF_NEST_LEVEL(br)) {
1551 r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH +
1552 SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1553 }
1554 else
1555 #endif
1556 if (br->back_num == 1) {
1557 r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2)
1558 ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM));
1559 }
1560 else {
1561 r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1562 }
1563 }
1564 break;
1565
1566 #ifdef USE_SUBEXP_CALL
1567 case NT_CALL:
1568 r = SIZE_OP_CALL;
1569 break;
1570 #endif
1571
1572 case NT_QTFR:
1573 r = compile_length_quantifier_node(NQTFR(node), reg);
1574 break;
1575
1576 case NT_ENCLOSE:
1577 r = compile_length_enclose_node(NENCLOSE(node), reg);
1578 break;
1579
1580 case NT_ANCHOR:
1581 r = compile_length_anchor_node(NANCHOR(node), reg);
1582 break;
1583
1584 default:
1585 return ONIGERR_TYPE_BUG;
1586 break;
1587 }
1588
1589 return r;
1590 }
1591
1592 static int
compile_tree(Node * node,regex_t * reg)1593 compile_tree(Node* node, regex_t* reg)
1594 {
1595 int n, type, len, pos, r = 0;
1596
1597 type = NTYPE(node);
1598 switch (type) {
1599 case NT_LIST:
1600 do {
1601 r = compile_tree(NCAR(node), reg);
1602 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1603 break;
1604
1605 case NT_ALT:
1606 {
1607 Node* x = node;
1608 len = 0;
1609 do {
1610 len += compile_length_tree(NCAR(x), reg);
1611 if (NCDR(x) != NULL) {
1612 len += SIZE_OP_PUSH + SIZE_OP_JUMP;
1613 }
1614 } while (IS_NOT_NULL(x = NCDR(x)));
1615 pos = reg->used + len; /* goal position */
1616
1617 do {
1618 len = compile_length_tree(NCAR(node), reg);
1619 if (IS_NOT_NULL(NCDR(node))) {
1620 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP);
1621 if (r) break;
1622 }
1623 r = compile_tree(NCAR(node), reg);
1624 if (r) break;
1625 if (IS_NOT_NULL(NCDR(node))) {
1626 len = pos - (reg->used + SIZE_OP_JUMP);
1627 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1628 if (r) break;
1629 }
1630 } while (IS_NOT_NULL(node = NCDR(node)));
1631 }
1632 break;
1633
1634 case NT_STR:
1635 if (NSTRING_IS_RAW(node))
1636 r = compile_string_raw_node(NSTR(node), reg);
1637 else
1638 r = compile_string_node(node, reg);
1639 break;
1640
1641 case NT_CCLASS:
1642 r = compile_cclass_node(NCCLASS(node), reg);
1643 break;
1644
1645 case NT_CTYPE:
1646 {
1647 int op;
1648
1649 switch (NCTYPE(node)->ctype) {
1650 case ONIGENC_CTYPE_WORD:
1651 if (NCTYPE(node)->not != 0) op = OP_NOT_WORD;
1652 else op = OP_WORD;
1653 break;
1654 default:
1655 return ONIGERR_TYPE_BUG;
1656 break;
1657 }
1658 r = add_opcode(reg, op);
1659 }
1660 break;
1661
1662 case NT_CANY:
1663 if (IS_MULTILINE(reg->options))
1664 r = add_opcode(reg, OP_ANYCHAR_ML);
1665 else
1666 r = add_opcode(reg, OP_ANYCHAR);
1667 break;
1668
1669 case NT_BREF:
1670 {
1671 BRefNode* br = NBREF(node);
1672
1673 #ifdef USE_BACKREF_WITH_LEVEL
1674 if (IS_BACKREF_NEST_LEVEL(br)) {
1675 r = add_opcode(reg, OP_BACKREF_WITH_LEVEL);
1676 if (r) return r;
1677 r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE));
1678 if (r) return r;
1679 r = add_length(reg, br->nest_level);
1680 if (r) return r;
1681
1682 goto add_bacref_mems;
1683 }
1684 else
1685 #endif
1686 if (br->back_num == 1) {
1687 n = br->back_static[0];
1688 if (IS_IGNORECASE(reg->options)) {
1689 r = add_opcode(reg, OP_BACKREFN_IC);
1690 if (r) return r;
1691 r = add_mem_num(reg, n);
1692 }
1693 else {
1694 switch (n) {
1695 case 1: r = add_opcode(reg, OP_BACKREF1); break;
1696 case 2: r = add_opcode(reg, OP_BACKREF2); break;
1697 default:
1698 r = add_opcode(reg, OP_BACKREFN);
1699 if (r) return r;
1700 r = add_mem_num(reg, n);
1701 break;
1702 }
1703 }
1704 }
1705 else {
1706 int i;
1707 int* p;
1708
1709 if (IS_IGNORECASE(reg->options)) {
1710 r = add_opcode(reg, OP_BACKREF_MULTI_IC);
1711 }
1712 else {
1713 r = add_opcode(reg, OP_BACKREF_MULTI);
1714 }
1715 if (r) return r;
1716
1717 #ifdef USE_BACKREF_WITH_LEVEL
1718 add_bacref_mems:
1719 #endif
1720 r = add_length(reg, br->back_num);
1721 if (r) return r;
1722 p = BACKREFS_P(br);
1723 for (i = br->back_num - 1; i >= 0; i--) {
1724 r = add_mem_num(reg, p[i]);
1725 if (r) return r;
1726 }
1727 }
1728 }
1729 break;
1730
1731 #ifdef USE_SUBEXP_CALL
1732 case NT_CALL:
1733 r = compile_call(NCALL(node), reg);
1734 break;
1735 #endif
1736
1737 case NT_QTFR:
1738 r = compile_quantifier_node(NQTFR(node), reg);
1739 break;
1740
1741 case NT_ENCLOSE:
1742 r = compile_enclose_node(NENCLOSE(node), reg);
1743 break;
1744
1745 case NT_ANCHOR:
1746 r = compile_anchor_node(NANCHOR(node), reg);
1747 break;
1748
1749 default:
1750 #ifdef ONIG_DEBUG
1751 fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node));
1752 #endif
1753 break;
1754 }
1755
1756 return r;
1757 }
1758
1759 #ifdef USE_NAMED_GROUP
1760
1761 static int
noname_disable_map(Node ** plink,GroupNumRemap * map,int * counter)1762 noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
1763 {
1764 int r = 0;
1765 Node* node = *plink;
1766
1767 switch (NTYPE(node)) {
1768 case NT_LIST:
1769 case NT_ALT:
1770 do {
1771 r = noname_disable_map(&(NCAR(node)), map, counter);
1772 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1773 break;
1774
1775 case NT_QTFR:
1776 {
1777 Node** ptarget = &(NQTFR(node)->target);
1778 Node* old = *ptarget;
1779 r = noname_disable_map(ptarget, map, counter);
1780 if (*ptarget != old && NTYPE(*ptarget) == NT_QTFR) {
1781 onig_reduce_nested_quantifier(node, *ptarget);
1782 }
1783 }
1784 break;
1785
1786 case NT_ENCLOSE:
1787 {
1788 EncloseNode* en = NENCLOSE(node);
1789 if (en->type == ENCLOSE_MEMORY) {
1790 if (IS_ENCLOSE_NAMED_GROUP(en)) {
1791 (*counter)++;
1792 map[en->regnum].new_val = *counter;
1793 en->regnum = *counter;
1794 r = noname_disable_map(&(en->target), map, counter);
1795 }
1796 else {
1797 *plink = en->target;
1798 en->target = NULL_NODE;
1799 onig_node_free(node);
1800 r = noname_disable_map(plink, map, counter);
1801 }
1802 }
1803 else
1804 r = noname_disable_map(&(en->target), map, counter);
1805 }
1806 break;
1807
1808 default:
1809 break;
1810 }
1811
1812 return r;
1813 }
1814
1815 static int
renumber_node_backref(Node * node,GroupNumRemap * map)1816 renumber_node_backref(Node* node, GroupNumRemap* map)
1817 {
1818 int i, pos, n, old_num;
1819 int *backs;
1820 BRefNode* bn = NBREF(node);
1821
1822 if (! IS_BACKREF_NAME_REF(bn))
1823 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1824
1825 old_num = bn->back_num;
1826 if (IS_NULL(bn->back_dynamic))
1827 backs = bn->back_static;
1828 else
1829 backs = bn->back_dynamic;
1830
1831 for (i = 0, pos = 0; i < old_num; i++) {
1832 n = map[backs[i]].new_val;
1833 if (n > 0) {
1834 backs[pos] = n;
1835 pos++;
1836 }
1837 }
1838
1839 bn->back_num = pos;
1840 return 0;
1841 }
1842
1843 static int
renumber_by_map(Node * node,GroupNumRemap * map)1844 renumber_by_map(Node* node, GroupNumRemap* map)
1845 {
1846 int r = 0;
1847
1848 switch (NTYPE(node)) {
1849 case NT_LIST:
1850 case NT_ALT:
1851 do {
1852 r = renumber_by_map(NCAR(node), map);
1853 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1854 break;
1855 case NT_QTFR:
1856 r = renumber_by_map(NQTFR(node)->target, map);
1857 break;
1858 case NT_ENCLOSE:
1859 r = renumber_by_map(NENCLOSE(node)->target, map);
1860 break;
1861
1862 case NT_BREF:
1863 r = renumber_node_backref(node, map);
1864 break;
1865
1866 default:
1867 break;
1868 }
1869
1870 return r;
1871 }
1872
1873 static int
numbered_ref_check(Node * node)1874 numbered_ref_check(Node* node)
1875 {
1876 int r = 0;
1877
1878 switch (NTYPE(node)) {
1879 case NT_LIST:
1880 case NT_ALT:
1881 do {
1882 r = numbered_ref_check(NCAR(node));
1883 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1884 break;
1885 case NT_QTFR:
1886 r = numbered_ref_check(NQTFR(node)->target);
1887 break;
1888 case NT_ENCLOSE:
1889 r = numbered_ref_check(NENCLOSE(node)->target);
1890 break;
1891
1892 case NT_BREF:
1893 if (! IS_BACKREF_NAME_REF(NBREF(node)))
1894 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1895 break;
1896
1897 default:
1898 break;
1899 }
1900
1901 return r;
1902 }
1903
1904 static int
disable_noname_group_capture(Node ** root,regex_t * reg,ScanEnv * env)1905 disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
1906 {
1907 int r, i, pos, counter;
1908 int Result;
1909 BitStatusType loc;
1910 GroupNumRemap* map;
1911
1912 map = (GroupNumRemap* )xmalloc(sizeof(GroupNumRemap) * (env->num_mem + 1));
1913 CHECK_NULL_RETURN_MEMERR(map);
1914 for (i = 1; i <= env->num_mem; i++) {
1915 map[i].new_val = 0;
1916 }
1917 counter = 0;
1918 r = noname_disable_map(root, map, &counter);
1919 if (r != 0) return r;
1920
1921 r = renumber_by_map(*root, map);
1922 if (r != 0) return r;
1923
1924 for (i = 1, pos = 1; i <= env->num_mem; i++) {
1925 if (map[i].new_val > 0) {
1926 SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i];
1927 pos++;
1928 }
1929 }
1930
1931 loc = env->capture_history;
1932 BIT_STATUS_CLEAR(env->capture_history);
1933 for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
1934 if (BIT_STATUS_AT(loc, i)) {
1935 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val);
1936 }
1937 }
1938
1939 env->num_mem = env->num_named;
1940 reg->num_mem = env->num_named;
1941
1942 Result = onig_renumber_name_table(reg, map);
1943 xfree(map);
1944 return Result;
1945 }
1946 #endif /* USE_NAMED_GROUP */
1947
1948 #ifdef USE_SUBEXP_CALL
1949 static int
unset_addr_list_fix(UnsetAddrList * uslist,regex_t * reg)1950 unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg)
1951 {
1952 int i, offset;
1953 EncloseNode* en;
1954 AbsAddrType addr;
1955
1956 for (i = 0; i < uslist->num; i++) {
1957 en = NENCLOSE(uslist->us[i].target);
1958 if (! IS_ENCLOSE_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG;
1959 addr = en->call_addr;
1960 offset = uslist->us[i].offset;
1961
1962 BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR);
1963 }
1964 return 0;
1965 }
1966 #endif
1967
1968 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
1969 static int
quantifiers_memory_node_info(Node * node)1970 quantifiers_memory_node_info(Node* node)
1971 {
1972 int r = 0;
1973
1974 switch (NTYPE(node)) {
1975 case NT_LIST:
1976 case NT_ALT:
1977 {
1978 int v;
1979 do {
1980 v = quantifiers_memory_node_info(NCAR(node));
1981 if (v > r) r = v;
1982 } while (v >= 0 && IS_NOT_NULL(node = NCDR(node)));
1983 }
1984 break;
1985
1986 #ifdef USE_SUBEXP_CALL
1987 case NT_CALL:
1988 if (IS_CALL_RECURSION(NCALL(node))) {
1989 return NQ_TARGET_IS_EMPTY_REC; /* tiny version */
1990 }
1991 else
1992 r = quantifiers_memory_node_info(NCALL(node)->target);
1993 break;
1994 #endif
1995
1996 case NT_QTFR:
1997 {
1998 QtfrNode* qn = NQTFR(node);
1999 if (qn->upper != 0) {
2000 r = quantifiers_memory_node_info(qn->target);
2001 }
2002 }
2003 break;
2004
2005 case NT_ENCLOSE:
2006 {
2007 EncloseNode* en = NENCLOSE(node);
2008 switch (en->type) {
2009 case ENCLOSE_MEMORY:
2010 return NQ_TARGET_IS_EMPTY_MEM;
2011 break;
2012
2013 case ENCLOSE_OPTION:
2014 case ENCLOSE_STOP_BACKTRACK:
2015 r = quantifiers_memory_node_info(en->target);
2016 break;
2017 default:
2018 break;
2019 }
2020 }
2021 break;
2022
2023 case NT_BREF:
2024 case NT_STR:
2025 case NT_CTYPE:
2026 case NT_CCLASS:
2027 case NT_CANY:
2028 case NT_ANCHOR:
2029 default:
2030 break;
2031 }
2032
2033 return r;
2034 }
2035 #endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */
2036
2037 static int
get_min_match_length(Node * node,OnigDistance * min,ScanEnv * env)2038 get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env)
2039 {
2040 OnigDistance tmin;
2041 int r = 0;
2042
2043 *min = 0;
2044 switch (NTYPE(node)) {
2045 case NT_BREF:
2046 {
2047 int i;
2048 int* backs;
2049 Node** nodes = SCANENV_MEM_NODES(env);
2050 BRefNode* br = NBREF(node);
2051 if (br->state & NST_RECURSION) break;
2052
2053 backs = BACKREFS_P(br);
2054 if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2055 r = get_min_match_length(nodes[backs[0]], min, env);
2056 if (r != 0) break;
2057 for (i = 1; i < br->back_num; i++) {
2058 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2059 r = get_min_match_length(nodes[backs[i]], &tmin, env);
2060 if (r != 0) break;
2061 if (*min > tmin) *min = tmin;
2062 }
2063 }
2064 break;
2065
2066 #ifdef USE_SUBEXP_CALL
2067 case NT_CALL:
2068 if (IS_CALL_RECURSION(NCALL(node))) {
2069 EncloseNode* en = NENCLOSE(NCALL(node)->target);
2070 if (IS_ENCLOSE_MIN_FIXED(en))
2071 *min = en->min_len;
2072 }
2073 else
2074 r = get_min_match_length(NCALL(node)->target, min, env);
2075 break;
2076 #endif
2077
2078 case NT_LIST:
2079 do {
2080 r = get_min_match_length(NCAR(node), &tmin, env);
2081 if (r == 0) *min += tmin;
2082 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2083 break;
2084
2085 case NT_ALT:
2086 {
2087 Node *x, *y;
2088 y = node;
2089 do {
2090 x = NCAR(y);
2091 r = get_min_match_length(x, &tmin, env);
2092 if (r != 0) break;
2093 if (y == node) *min = tmin;
2094 else if (*min > tmin) *min = tmin;
2095 } while (r == 0 && IS_NOT_NULL(y = NCDR(y)));
2096 }
2097 break;
2098
2099 case NT_STR:
2100 {
2101 StrNode* sn = NSTR(node);
2102 *min = (OnigDistance)(sn->end - sn->s);
2103 }
2104 break;
2105
2106 case NT_CTYPE:
2107 *min = 1;
2108 break;
2109
2110 case NT_CCLASS:
2111 case NT_CANY:
2112 *min = 1;
2113 break;
2114
2115 case NT_QTFR:
2116 {
2117 QtfrNode* qn = NQTFR(node);
2118
2119 if (qn->lower > 0) {
2120 r = get_min_match_length(qn->target, min, env);
2121 if (r == 0)
2122 *min = distance_multiply(*min, qn->lower);
2123 }
2124 }
2125 break;
2126
2127 case NT_ENCLOSE:
2128 {
2129 EncloseNode* en = NENCLOSE(node);
2130 switch (en->type) {
2131 case ENCLOSE_MEMORY:
2132 #ifdef USE_SUBEXP_CALL
2133 if (IS_ENCLOSE_MIN_FIXED(en))
2134 *min = en->min_len;
2135 else {
2136 r = get_min_match_length(en->target, min, env);
2137 if (r == 0) {
2138 en->min_len = *min;
2139 SET_ENCLOSE_STATUS(node, NST_MIN_FIXED);
2140 }
2141 }
2142 break;
2143 #endif
2144 case ENCLOSE_OPTION:
2145 case ENCLOSE_STOP_BACKTRACK:
2146 r = get_min_match_length(en->target, min, env);
2147 break;
2148 }
2149 }
2150 break;
2151
2152 case NT_ANCHOR:
2153 default:
2154 break;
2155 }
2156
2157 return r;
2158 }
2159
2160 static int
get_max_match_length(Node * node,OnigDistance * max,ScanEnv * env)2161 get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env)
2162 {
2163 OnigDistance tmax;
2164 int r = 0;
2165
2166 *max = 0;
2167 switch (NTYPE(node)) {
2168 case NT_LIST:
2169 do {
2170 r = get_max_match_length(NCAR(node), &tmax, env);
2171 if (r == 0)
2172 *max = distance_add(*max, tmax);
2173 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2174 break;
2175
2176 case NT_ALT:
2177 do {
2178 r = get_max_match_length(NCAR(node), &tmax, env);
2179 if (r == 0 && *max < tmax) *max = tmax;
2180 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2181 break;
2182
2183 case NT_STR:
2184 {
2185 StrNode* sn = NSTR(node);
2186 *max = (OnigDistance)(sn->end - sn->s);
2187 }
2188 break;
2189
2190 case NT_CTYPE:
2191 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2192 break;
2193
2194 case NT_CCLASS:
2195 case NT_CANY:
2196 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2197 break;
2198
2199 case NT_BREF:
2200 {
2201 int i;
2202 int* backs;
2203 Node** nodes = SCANENV_MEM_NODES(env);
2204 BRefNode* br = NBREF(node);
2205 if (br->state & NST_RECURSION) {
2206 *max = ONIG_INFINITE_DISTANCE;
2207 break;
2208 }
2209 backs = BACKREFS_P(br);
2210 for (i = 0; i < br->back_num; i++) {
2211 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2212 r = get_max_match_length(nodes[backs[i]], &tmax, env);
2213 if (r != 0) break;
2214 if (*max < tmax) *max = tmax;
2215 }
2216 }
2217 break;
2218
2219 #ifdef USE_SUBEXP_CALL
2220 case NT_CALL:
2221 if (! IS_CALL_RECURSION(NCALL(node)))
2222 r = get_max_match_length(NCALL(node)->target, max, env);
2223 else
2224 *max = ONIG_INFINITE_DISTANCE;
2225 break;
2226 #endif
2227
2228 case NT_QTFR:
2229 {
2230 QtfrNode* qn = NQTFR(node);
2231
2232 if (qn->upper != 0) {
2233 r = get_max_match_length(qn->target, max, env);
2234 if (r == 0 && *max != 0) {
2235 if (! IS_REPEAT_INFINITE(qn->upper))
2236 *max = distance_multiply(*max, qn->upper);
2237 else
2238 *max = ONIG_INFINITE_DISTANCE;
2239 }
2240 }
2241 }
2242 break;
2243
2244 case NT_ENCLOSE:
2245 {
2246 EncloseNode* en = NENCLOSE(node);
2247 switch (en->type) {
2248 case ENCLOSE_MEMORY:
2249 #ifdef USE_SUBEXP_CALL
2250 if (IS_ENCLOSE_MAX_FIXED(en))
2251 *max = en->max_len;
2252 else {
2253 r = get_max_match_length(en->target, max, env);
2254 if (r == 0) {
2255 en->max_len = *max;
2256 SET_ENCLOSE_STATUS(node, NST_MAX_FIXED);
2257 }
2258 }
2259 break;
2260 #endif
2261 case ENCLOSE_OPTION:
2262 case ENCLOSE_STOP_BACKTRACK:
2263 r = get_max_match_length(en->target, max, env);
2264 break;
2265 }
2266 }
2267 break;
2268
2269 case NT_ANCHOR:
2270 default:
2271 break;
2272 }
2273
2274 return r;
2275 }
2276
2277 #define GET_CHAR_LEN_VARLEN -1
2278 #define GET_CHAR_LEN_TOP_ALT_VARLEN -2
2279
2280 /* fixed size pattern node only */
2281 static int
get_char_length_tree1(Node * node,regex_t * reg,int * len,int level)2282 get_char_length_tree1(Node* node, regex_t* reg, int* len, int level)
2283 {
2284 int tlen;
2285 int r = 0;
2286
2287 level++;
2288 *len = 0;
2289 switch (NTYPE(node)) {
2290 case NT_LIST:
2291 do {
2292 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2293 if (r == 0)
2294 *len = distance_add(*len, tlen);
2295 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2296 break;
2297
2298 case NT_ALT:
2299 {
2300 int tlen2;
2301 int varlen = 0;
2302
2303 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2304 while (r == 0 && IS_NOT_NULL(node = NCDR(node))) {
2305 r = get_char_length_tree1(NCAR(node), reg, &tlen2, level);
2306 if (r == 0) {
2307 if (tlen != tlen2)
2308 varlen = 1;
2309 }
2310 }
2311 if (r == 0) {
2312 if (varlen != 0) {
2313 if (level == 1)
2314 r = GET_CHAR_LEN_TOP_ALT_VARLEN;
2315 else
2316 r = GET_CHAR_LEN_VARLEN;
2317 }
2318 else
2319 *len = tlen;
2320 }
2321 }
2322 break;
2323
2324 case NT_STR:
2325 {
2326 StrNode* sn = NSTR(node);
2327 UChar *s = sn->s;
2328 while (s < sn->end) {
2329 s += enclen(reg->enc, s);
2330 (*len)++;
2331 }
2332 }
2333 break;
2334
2335 case NT_QTFR:
2336 {
2337 QtfrNode* qn = NQTFR(node);
2338 if (qn->lower == qn->upper) {
2339 r = get_char_length_tree1(qn->target, reg, &tlen, level);
2340 if (r == 0)
2341 *len = distance_multiply(tlen, qn->lower);
2342 }
2343 else
2344 r = GET_CHAR_LEN_VARLEN;
2345 }
2346 break;
2347
2348 #ifdef USE_SUBEXP_CALL
2349 case NT_CALL:
2350 if (! IS_CALL_RECURSION(NCALL(node)))
2351 r = get_char_length_tree1(NCALL(node)->target, reg, len, level);
2352 else
2353 r = GET_CHAR_LEN_VARLEN;
2354 break;
2355 #endif
2356
2357 case NT_CTYPE:
2358 *len = 1;
2359 break;
2360
2361 case NT_CCLASS:
2362 case NT_CANY:
2363 *len = 1;
2364 break;
2365
2366 case NT_ENCLOSE:
2367 {
2368 EncloseNode* en = NENCLOSE(node);
2369 switch (en->type) {
2370 case ENCLOSE_MEMORY:
2371 #ifdef USE_SUBEXP_CALL
2372 if (IS_ENCLOSE_CLEN_FIXED(en))
2373 *len = en->char_len;
2374 else {
2375 r = get_char_length_tree1(en->target, reg, len, level);
2376 if (r == 0) {
2377 en->char_len = *len;
2378 SET_ENCLOSE_STATUS(node, NST_CLEN_FIXED);
2379 }
2380 }
2381 break;
2382 #endif
2383 case ENCLOSE_OPTION:
2384 case ENCLOSE_STOP_BACKTRACK:
2385 r = get_char_length_tree1(en->target, reg, len, level);
2386 break;
2387 default:
2388 break;
2389 }
2390 }
2391 break;
2392
2393 case NT_ANCHOR:
2394 break;
2395
2396 default:
2397 r = GET_CHAR_LEN_VARLEN;
2398 break;
2399 }
2400
2401 return r;
2402 }
2403
2404 static int
get_char_length_tree(Node * node,regex_t * reg,int * len)2405 get_char_length_tree(Node* node, regex_t* reg, int* len)
2406 {
2407 return get_char_length_tree1(node, reg, len, 0);
2408 }
2409
2410 /* x is not included y ==> 1 : 0 */
2411 static int
is_not_included(Node * x,Node * y,regex_t * reg)2412 is_not_included(Node* x, Node* y, regex_t* reg)
2413 {
2414 int i, len;
2415 OnigCodePoint code;
2416 UChar *p;
2417 int ytype;
2418
2419 retry:
2420 ytype = NTYPE(y);
2421 switch (NTYPE(x)) {
2422 case NT_CTYPE:
2423 {
2424 switch (ytype) {
2425 case NT_CTYPE:
2426 if (NCTYPE(y)->ctype == NCTYPE(x)->ctype &&
2427 NCTYPE(y)->not != NCTYPE(x)->not)
2428 return 1;
2429 else
2430 return 0;
2431 break;
2432
2433 case NT_CCLASS:
2434 swap:
2435 {
2436 Node* tmp;
2437 tmp = x; x = y; y = tmp;
2438 goto retry;
2439 }
2440 break;
2441
2442 case NT_STR:
2443 goto swap;
2444 break;
2445
2446 default:
2447 break;
2448 }
2449 }
2450 break;
2451
2452 case NT_CCLASS:
2453 {
2454 CClassNode* xc = NCCLASS(x);
2455 switch (ytype) {
2456 case NT_CTYPE:
2457 switch (NCTYPE(y)->ctype) {
2458 case ONIGENC_CTYPE_WORD:
2459 if (NCTYPE(y)->not == 0) {
2460 if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
2461 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2462 if (BITSET_AT(xc->bs, i)) {
2463 if (IS_CODE_SB_WORD(reg->enc, i)) return 0;
2464 }
2465 }
2466 return 1;
2467 }
2468 return 0;
2469 }
2470 else {
2471 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2472 if (! IS_CODE_SB_WORD(reg->enc, i)) {
2473 if (!IS_NCCLASS_NOT(xc)) {
2474 if (BITSET_AT(xc->bs, i))
2475 return 0;
2476 }
2477 else {
2478 if (! BITSET_AT(xc->bs, i))
2479 return 0;
2480 }
2481 }
2482 }
2483 return 1;
2484 }
2485 break;
2486
2487 default:
2488 break;
2489 }
2490 break;
2491
2492 case NT_CCLASS:
2493 {
2494 int v;
2495 CClassNode* yc = NCCLASS(y);
2496
2497 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2498 v = BITSET_AT(xc->bs, i);
2499 if ((v != 0 && !IS_NCCLASS_NOT(xc)) ||
2500 (v == 0 && IS_NCCLASS_NOT(xc))) {
2501 v = BITSET_AT(yc->bs, i);
2502 if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
2503 (v == 0 && IS_NCCLASS_NOT(yc)))
2504 return 0;
2505 }
2506 }
2507 if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
2508 (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
2509 return 1;
2510 return 0;
2511 }
2512 break;
2513
2514 case NT_STR:
2515 goto swap;
2516 break;
2517
2518 default:
2519 break;
2520 }
2521 }
2522 break;
2523
2524 case NT_STR:
2525 {
2526 StrNode* xs = NSTR(x);
2527 if (NSTRING_LEN(x) == 0)
2528 break;
2529
2530 //c = *(xs->s);
2531 switch (ytype) {
2532 case NT_CTYPE:
2533 switch (NCTYPE(y)->ctype) {
2534 case ONIGENC_CTYPE_WORD:
2535 if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
2536 return NCTYPE(y)->not;
2537 else
2538 return !(NCTYPE(y)->not);
2539 break;
2540 default:
2541 break;
2542 }
2543 break;
2544
2545 case NT_CCLASS:
2546 {
2547 CClassNode* cc = NCCLASS(y);
2548
2549 code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
2550 xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
2551 return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1);
2552 }
2553 break;
2554
2555 case NT_STR:
2556 {
2557 UChar *q;
2558 StrNode* ys = NSTR(y);
2559 len = NSTRING_LEN(x);
2560 if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y);
2561 if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) {
2562 /* tiny version */
2563 return 0;
2564 }
2565 else {
2566 for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
2567 if (*p != *q) return 1;
2568 }
2569 }
2570 }
2571 break;
2572
2573 default:
2574 break;
2575 }
2576 }
2577 break;
2578
2579 default:
2580 break;
2581 }
2582
2583 return 0;
2584 }
2585
2586 static Node*
get_head_value_node(Node * node,int exact,regex_t * reg)2587 get_head_value_node(Node* node, int exact, regex_t* reg)
2588 {
2589 Node* n = NULL_NODE;
2590
2591 switch (NTYPE(node)) {
2592 case NT_BREF:
2593 case NT_ALT:
2594 case NT_CANY:
2595 #ifdef USE_SUBEXP_CALL
2596 case NT_CALL:
2597 #endif
2598 break;
2599
2600 case NT_CTYPE:
2601 case NT_CCLASS:
2602 if (exact == 0) {
2603 n = node;
2604 }
2605 break;
2606
2607 case NT_LIST:
2608 n = get_head_value_node(NCAR(node), exact, reg);
2609 break;
2610
2611 case NT_STR:
2612 {
2613 StrNode* sn = NSTR(node);
2614
2615 if (sn->end <= sn->s)
2616 break;
2617
2618 if (exact != 0 &&
2619 !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) {
2620 }
2621 else {
2622 n = node;
2623 }
2624 }
2625 break;
2626
2627 case NT_QTFR:
2628 {
2629 QtfrNode* qn = NQTFR(node);
2630 if (qn->lower > 0) {
2631 if (IS_NOT_NULL(qn->head_exact))
2632 n = qn->head_exact;
2633 else
2634 n = get_head_value_node(qn->target, exact, reg);
2635 }
2636 }
2637 break;
2638
2639 case NT_ENCLOSE:
2640 {
2641 EncloseNode* en = NENCLOSE(node);
2642 switch (en->type) {
2643 case ENCLOSE_OPTION:
2644 {
2645 OnigOptionType options = reg->options;
2646
2647 reg->options = NENCLOSE(node)->option;
2648 n = get_head_value_node(NENCLOSE(node)->target, exact, reg);
2649 reg->options = options;
2650 }
2651 break;
2652
2653 case ENCLOSE_MEMORY:
2654 case ENCLOSE_STOP_BACKTRACK:
2655 n = get_head_value_node(en->target, exact, reg);
2656 break;
2657 }
2658 }
2659 break;
2660
2661 case NT_ANCHOR:
2662 if (NANCHOR(node)->type == ANCHOR_PREC_READ)
2663 n = get_head_value_node(NANCHOR(node)->target, exact, reg);
2664 break;
2665
2666 default:
2667 break;
2668 }
2669
2670 return n;
2671 }
2672
2673 static int
check_type_tree(Node * node,int type_mask,int enclose_mask,int anchor_mask)2674 check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask)
2675 {
2676 int type, r = 0;
2677
2678 type = NTYPE(node);
2679 if ((NTYPE2BIT(type) & type_mask) == 0)
2680 return 1;
2681
2682 switch (type) {
2683 case NT_LIST:
2684 case NT_ALT:
2685 do {
2686 r = check_type_tree(NCAR(node), type_mask, enclose_mask,
2687 anchor_mask);
2688 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2689 break;
2690
2691 case NT_QTFR:
2692 r = check_type_tree(NQTFR(node)->target, type_mask, enclose_mask,
2693 anchor_mask);
2694 break;
2695
2696 case NT_ENCLOSE:
2697 {
2698 EncloseNode* en = NENCLOSE(node);
2699 if ((en->type & enclose_mask) == 0)
2700 return 1;
2701
2702 r = check_type_tree(en->target, type_mask, enclose_mask, anchor_mask);
2703 }
2704 break;
2705
2706 case NT_ANCHOR:
2707 type = NANCHOR(node)->type;
2708 if ((type & anchor_mask) == 0)
2709 return 1;
2710
2711 if (NANCHOR(node)->target)
2712 r = check_type_tree(NANCHOR(node)->target,
2713 type_mask, enclose_mask, anchor_mask);
2714 break;
2715
2716 default:
2717 break;
2718 }
2719 return r;
2720 }
2721
2722 #ifdef USE_SUBEXP_CALL
2723
2724 #define RECURSION_EXIST 1
2725 #define RECURSION_INFINITE 2
2726
2727 static int
subexp_inf_recursive_check(Node * node,ScanEnv * env,int head)2728 subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
2729 {
2730 int type;
2731 int r = 0;
2732
2733 type = NTYPE(node);
2734 switch (type) {
2735 case NT_LIST:
2736 {
2737 Node *x;
2738 OnigDistance min;
2739 int ret;
2740
2741 x = node;
2742 do {
2743 ret = subexp_inf_recursive_check(NCAR(x), env, head);
2744 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2745 r |= ret;
2746 if (head) {
2747 ret = get_min_match_length(NCAR(x), &min, env);
2748 if (ret != 0) return ret;
2749 if (min != 0) head = 0;
2750 }
2751 } while (IS_NOT_NULL(x = NCDR(x)));
2752 }
2753 break;
2754
2755 case NT_ALT:
2756 {
2757 int ret;
2758 r = RECURSION_EXIST;
2759 do {
2760 ret = subexp_inf_recursive_check(NCAR(node), env, head);
2761 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2762 r &= ret;
2763 } while (IS_NOT_NULL(node = NCDR(node)));
2764 }
2765 break;
2766
2767 case NT_QTFR:
2768 r = subexp_inf_recursive_check(NQTFR(node)->target, env, head);
2769 if (r == RECURSION_EXIST) {
2770 if (NQTFR(node)->lower == 0) r = 0;
2771 }
2772 break;
2773
2774 case NT_ANCHOR:
2775 {
2776 AnchorNode* an = NANCHOR(node);
2777 switch (an->type) {
2778 case ANCHOR_PREC_READ:
2779 case ANCHOR_PREC_READ_NOT:
2780 case ANCHOR_LOOK_BEHIND:
2781 case ANCHOR_LOOK_BEHIND_NOT:
2782 r = subexp_inf_recursive_check(an->target, env, head);
2783 break;
2784 }
2785 }
2786 break;
2787
2788 case NT_CALL:
2789 r = subexp_inf_recursive_check(NCALL(node)->target, env, head);
2790 break;
2791
2792 case NT_ENCLOSE:
2793 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2794 return 0;
2795 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2796 return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE);
2797 else {
2798 SET_ENCLOSE_STATUS(node, NST_MARK2);
2799 r = subexp_inf_recursive_check(NENCLOSE(node)->target, env, head);
2800 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2801 }
2802 break;
2803
2804 default:
2805 break;
2806 }
2807
2808 return r;
2809 }
2810
2811 static int
subexp_inf_recursive_check_trav(Node * node,ScanEnv * env)2812 subexp_inf_recursive_check_trav(Node* node, ScanEnv* env)
2813 {
2814 int type;
2815 int r = 0;
2816
2817 type = NTYPE(node);
2818 switch (type) {
2819 case NT_LIST:
2820 case NT_ALT:
2821 do {
2822 r = subexp_inf_recursive_check_trav(NCAR(node), env);
2823 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2824 break;
2825
2826 case NT_QTFR:
2827 r = subexp_inf_recursive_check_trav(NQTFR(node)->target, env);
2828 break;
2829
2830 case NT_ANCHOR:
2831 {
2832 AnchorNode* an = NANCHOR(node);
2833 switch (an->type) {
2834 case ANCHOR_PREC_READ:
2835 case ANCHOR_PREC_READ_NOT:
2836 case ANCHOR_LOOK_BEHIND:
2837 case ANCHOR_LOOK_BEHIND_NOT:
2838 r = subexp_inf_recursive_check_trav(an->target, env);
2839 break;
2840 }
2841 }
2842 break;
2843
2844 case NT_ENCLOSE:
2845 {
2846 EncloseNode* en = NENCLOSE(node);
2847
2848 if (IS_ENCLOSE_RECURSION(en)) {
2849 SET_ENCLOSE_STATUS(node, NST_MARK1);
2850 r = subexp_inf_recursive_check(en->target, env, 1);
2851 if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION;
2852 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2853 }
2854 r = subexp_inf_recursive_check_trav(en->target, env);
2855 }
2856
2857 break;
2858
2859 default:
2860 break;
2861 }
2862
2863 return r;
2864 }
2865
2866 static int
subexp_recursive_check(Node * node)2867 subexp_recursive_check(Node* node)
2868 {
2869 int r = 0;
2870
2871 switch (NTYPE(node)) {
2872 case NT_LIST:
2873 case NT_ALT:
2874 do {
2875 r |= subexp_recursive_check(NCAR(node));
2876 } while (IS_NOT_NULL(node = NCDR(node)));
2877 break;
2878
2879 case NT_QTFR:
2880 r = subexp_recursive_check(NQTFR(node)->target);
2881 break;
2882
2883 case NT_ANCHOR:
2884 {
2885 AnchorNode* an = NANCHOR(node);
2886 switch (an->type) {
2887 case ANCHOR_PREC_READ:
2888 case ANCHOR_PREC_READ_NOT:
2889 case ANCHOR_LOOK_BEHIND:
2890 case ANCHOR_LOOK_BEHIND_NOT:
2891 r = subexp_recursive_check(an->target);
2892 break;
2893 }
2894 }
2895 break;
2896
2897 case NT_CALL:
2898 r = subexp_recursive_check(NCALL(node)->target);
2899 if (r != 0) SET_CALL_RECURSION(node);
2900 break;
2901
2902 case NT_ENCLOSE:
2903 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2904 return 0;
2905 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2906 return 1; /* recursion */
2907 else {
2908 SET_ENCLOSE_STATUS(node, NST_MARK2);
2909 r = subexp_recursive_check(NENCLOSE(node)->target);
2910 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2911 }
2912 break;
2913
2914 default:
2915 break;
2916 }
2917
2918 return r;
2919 }
2920
2921
2922 static int
subexp_recursive_check_trav(Node * node,ScanEnv * env)2923 subexp_recursive_check_trav(Node* node, ScanEnv* env)
2924 {
2925 #define FOUND_CALLED_NODE 1
2926
2927 int type;
2928 int r = 0;
2929
2930 type = NTYPE(node);
2931 switch (type) {
2932 case NT_LIST:
2933 case NT_ALT:
2934 {
2935 int ret;
2936 do {
2937 ret = subexp_recursive_check_trav(NCAR(node), env);
2938 if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
2939 else if (ret < 0) return ret;
2940 } while (IS_NOT_NULL(node = NCDR(node)));
2941 }
2942 break;
2943
2944 case NT_QTFR:
2945 r = subexp_recursive_check_trav(NQTFR(node)->target, env);
2946 if (NQTFR(node)->upper == 0) {
2947 if (r == FOUND_CALLED_NODE)
2948 NQTFR(node)->is_refered = 1;
2949 }
2950 break;
2951
2952 case NT_ANCHOR:
2953 {
2954 AnchorNode* an = NANCHOR(node);
2955 switch (an->type) {
2956 case ANCHOR_PREC_READ:
2957 case ANCHOR_PREC_READ_NOT:
2958 case ANCHOR_LOOK_BEHIND:
2959 case ANCHOR_LOOK_BEHIND_NOT:
2960 r = subexp_recursive_check_trav(an->target, env);
2961 break;
2962 }
2963 }
2964 break;
2965
2966 case NT_ENCLOSE:
2967 {
2968 EncloseNode* en = NENCLOSE(node);
2969
2970 if (! IS_ENCLOSE_RECURSION(en)) {
2971 if (IS_ENCLOSE_CALLED(en)) {
2972 SET_ENCLOSE_STATUS(node, NST_MARK1);
2973 r = subexp_recursive_check(en->target);
2974 if (r != 0) SET_ENCLOSE_STATUS(node, NST_RECURSION);
2975 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2976 }
2977 }
2978 r = subexp_recursive_check_trav(en->target, env);
2979 if (IS_ENCLOSE_CALLED(en))
2980 r |= FOUND_CALLED_NODE;
2981 }
2982 break;
2983
2984 default:
2985 break;
2986 }
2987
2988 return r;
2989 }
2990
2991 static int
setup_subexp_call(Node * node,ScanEnv * env)2992 setup_subexp_call(Node* node, ScanEnv* env)
2993 {
2994 int type;
2995 int r = 0;
2996
2997 type = NTYPE(node);
2998 switch (type) {
2999 case NT_LIST:
3000 do {
3001 r = setup_subexp_call(NCAR(node), env);
3002 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3003 break;
3004
3005 case NT_ALT:
3006 do {
3007 r = setup_subexp_call(NCAR(node), env);
3008 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3009 break;
3010
3011 case NT_QTFR:
3012 r = setup_subexp_call(NQTFR(node)->target, env);
3013 break;
3014 case NT_ENCLOSE:
3015 r = setup_subexp_call(NENCLOSE(node)->target, env);
3016 break;
3017
3018 case NT_CALL:
3019 {
3020 CallNode* cn = NCALL(node);
3021 Node** nodes = SCANENV_MEM_NODES(env);
3022
3023 if (cn->group_num != 0) {
3024 int gnum = cn->group_num;
3025
3026 #ifdef USE_NAMED_GROUP
3027 if (env->num_named > 0 &&
3028 IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
3029 !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
3030 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
3031 }
3032 #endif
3033 if (gnum > env->num_mem) {
3034 onig_scan_env_set_error_string(env,
3035 ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end);
3036 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
3037 }
3038
3039 #ifdef USE_NAMED_GROUP
3040 set_call_attr:
3041 #endif
3042 cn->target = nodes[cn->group_num];
3043 if (IS_NULL(cn->target)) {
3044 onig_scan_env_set_error_string(env,
3045 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3046 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3047 }
3048 SET_ENCLOSE_STATUS(cn->target, NST_CALLED);
3049 BIT_STATUS_ON_AT(env->bt_mem_start, cn->group_num);
3050 cn->unset_addr_list = env->unset_addr_list;
3051 }
3052 #ifdef USE_NAMED_GROUP
3053 else {
3054 int *refs;
3055
3056 int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end,
3057 &refs);
3058 if (n <= 0) {
3059 onig_scan_env_set_error_string(env,
3060 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3061 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3062 }
3063 else if (n > 1) {
3064 onig_scan_env_set_error_string(env,
3065 ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end);
3066 return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
3067 }
3068 else {
3069 cn->group_num = refs[0];
3070 goto set_call_attr;
3071 }
3072 }
3073 #endif
3074 }
3075 break;
3076
3077 case NT_ANCHOR:
3078 {
3079 AnchorNode* an = NANCHOR(node);
3080
3081 switch (an->type) {
3082 case ANCHOR_PREC_READ:
3083 case ANCHOR_PREC_READ_NOT:
3084 case ANCHOR_LOOK_BEHIND:
3085 case ANCHOR_LOOK_BEHIND_NOT:
3086 r = setup_subexp_call(an->target, env);
3087 break;
3088 }
3089 }
3090 break;
3091
3092 default:
3093 break;
3094 }
3095
3096 return r;
3097 }
3098 #endif
3099
3100 /* divide different length alternatives in look-behind.
3101 (?<=A|B) ==> (?<=A)|(?<=B)
3102 (?<!A|B) ==> (?<!A)(?<!B)
3103 */
3104 static int
divide_look_behind_alternatives(Node * node)3105 divide_look_behind_alternatives(Node* node)
3106 {
3107 Node *head, *np, *insert_node;
3108 AnchorNode* an = NANCHOR(node);
3109 int anc_type = an->type;
3110
3111 head = an->target;
3112 np = NCAR(head);
3113 swap_node(node, head);
3114 NCAR(node) = head;
3115 NANCHOR(head)->target = np;
3116
3117 np = node;
3118 while ((np = NCDR(np)) != NULL_NODE) {
3119 insert_node = onig_node_new_anchor(anc_type);
3120 CHECK_NULL_RETURN_MEMERR(insert_node);
3121 NANCHOR(insert_node)->target = NCAR(np);
3122 NCAR(np) = insert_node;
3123 }
3124
3125 if (anc_type == ANCHOR_LOOK_BEHIND_NOT) {
3126 np = node;
3127 do {
3128 SET_NTYPE(np, NT_LIST); /* alt -> list */
3129 } while ((np = NCDR(np)) != NULL_NODE);
3130 }
3131 return 0;
3132 }
3133
3134 static int
setup_look_behind(Node * node,regex_t * reg,ScanEnv * env)3135 setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
3136 {
3137 int r, len;
3138 AnchorNode* an = NANCHOR(node);
3139
3140 r = get_char_length_tree(an->target, reg, &len);
3141 if (r == 0)
3142 an->char_len = len;
3143 else if (r == GET_CHAR_LEN_VARLEN)
3144 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3145 else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
3146 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
3147 r = divide_look_behind_alternatives(node);
3148 else
3149 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3150 }
3151
3152 return r;
3153 }
3154
3155 static int
next_setup(Node * node,Node * next_node,regex_t * reg)3156 next_setup(Node* node, Node* next_node, regex_t* reg)
3157 {
3158 int type;
3159
3160 retry:
3161 type = NTYPE(node);
3162 if (type == NT_QTFR) {
3163 QtfrNode* qn = NQTFR(node);
3164 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) {
3165 #ifdef USE_QTFR_PEEK_NEXT
3166 Node* n = get_head_value_node(next_node, 1, reg);
3167 /* '\0': for UTF-16BE etc... */
3168 if (IS_NOT_NULL(n) && NSTR(n)->s[0] != '\0') {
3169 qn->next_head_exact = n;
3170 }
3171 #endif
3172 /* automatic posseivation a*b ==> (?>a*)b */
3173 if (qn->lower <= 1) {
3174 int ttype = NTYPE(qn->target);
3175 if (IS_NODE_TYPE_SIMPLE(ttype)) {
3176 Node *x, *y;
3177 x = get_head_value_node(qn->target, 0, reg);
3178 if (IS_NOT_NULL(x)) {
3179 y = get_head_value_node(next_node, 0, reg);
3180 if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) {
3181 Node* en = onig_node_new_enclose(ENCLOSE_STOP_BACKTRACK);
3182 CHECK_NULL_RETURN_MEMERR(en);
3183 SET_ENCLOSE_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
3184 swap_node(node, en);
3185 NENCLOSE(node)->target = en;
3186 }
3187 }
3188 }
3189 }
3190 }
3191 }
3192 else if (type == NT_ENCLOSE) {
3193 EncloseNode* en = NENCLOSE(node);
3194 if (en->type == ENCLOSE_MEMORY) {
3195 node = en->target;
3196 goto retry;
3197 }
3198 }
3199 return 0;
3200 }
3201
3202
3203 static int
update_string_node_case_fold(regex_t * reg,Node * node)3204 update_string_node_case_fold(regex_t* reg, Node *node)
3205 {
3206 UChar *p, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
3207 UChar *sbuf, *ebuf, *sp;
3208 int r, i, len, sbuf_size;
3209 StrNode* sn = NSTR(node);
3210
3211 end = sn->end;
3212 sbuf_size = (int)(end - sn->s) * 2;
3213 sbuf = (UChar* )xmalloc(sbuf_size);
3214 CHECK_NULL_RETURN_MEMERR(sbuf);
3215 ebuf = sbuf + sbuf_size;
3216
3217 sp = sbuf;
3218 p = sn->s;
3219 while (p < end) {
3220 len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf);
3221 for (i = 0; i < len; i++) {
3222 if (sp >= ebuf) {
3223 sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2, sbuf_size);
3224 CHECK_NULL_RETURN_MEMERR(sbuf);
3225 sp = sbuf + sbuf_size;
3226 sbuf_size *= 2;
3227 ebuf = sbuf + sbuf_size;
3228 }
3229
3230 *sp++ = buf[i];
3231 }
3232 }
3233
3234 r = onig_node_str_set(node, sbuf, sp);
3235 if (r != 0) {
3236 xfree(sbuf);
3237 return r;
3238 }
3239
3240 xfree(sbuf);
3241 return 0;
3242 }
3243
3244 static int
expand_case_fold_make_rem_string(Node ** rnode,UChar * s,UChar * end,regex_t * reg)3245 expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end,
3246 regex_t* reg)
3247 {
3248 int r;
3249 Node *node;
3250
3251 node = onig_node_new_str(s, end);
3252 if (IS_NULL(node)) return ONIGERR_MEMORY;
3253
3254 r = update_string_node_case_fold(reg, node);
3255 if (r != 0) {
3256 onig_node_free(node);
3257 return r;
3258 }
3259
3260 NSTRING_SET_AMBIG(node);
3261 NSTRING_SET_DONT_GET_OPT_INFO(node);
3262 *rnode = node;
3263 return 0;
3264 }
3265
3266 static int
expand_case_fold_string_alt(int item_num,OnigCaseFoldCodeItem items[],UChar * p,int slen,UChar * end,regex_t * reg,Node ** rnode)3267 expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
3268 UChar *p, int slen, UChar *end,
3269 regex_t* reg, Node **rnode)
3270 {
3271 int r, i, j, len, varlen;
3272 Node *anode, *var_anode, *snode, *xnode, *an;
3273 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
3274 xnode = NULL_NODE;
3275
3276 *rnode = var_anode = NULL_NODE;
3277
3278 varlen = 0;
3279 for (i = 0; i < item_num; i++) {
3280 if (items[i].byte_len != slen) {
3281 varlen = 1;
3282 break;
3283 }
3284 }
3285
3286 if (varlen != 0) {
3287 *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3288 if (IS_NULL(var_anode)) return ONIGERR_MEMORY;
3289
3290 xnode = onig_node_new_list(NULL, NULL);
3291 if (IS_NULL(xnode)) goto mem_err;
3292 NCAR(var_anode) = xnode;
3293
3294 anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3295 if (IS_NULL(anode)) goto mem_err;
3296 NCAR(xnode) = anode;
3297 }
3298 else {
3299 *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3300 if (IS_NULL(anode)) return ONIGERR_MEMORY;
3301 }
3302
3303 snode = onig_node_new_str(p, p + slen);
3304 if (IS_NULL(snode)) goto mem_err;
3305
3306 NCAR(anode) = snode;
3307
3308 for (i = 0; i < item_num; i++) {
3309 snode = onig_node_new_str(NULL, NULL);
3310 if (IS_NULL(snode)) goto mem_err;
3311
3312 for (j = 0; j < items[i].code_len; j++) {
3313 len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf);
3314 if (len < 0) {
3315 r = len;
3316 goto mem_err2;
3317 }
3318
3319 r = onig_node_str_cat(snode, buf, buf + len);
3320 if (r != 0) goto mem_err2;
3321 }
3322
3323 an = onig_node_new_alt(NULL_NODE, NULL_NODE);
3324 if (IS_NULL(an)) {
3325 goto mem_err2;
3326 }
3327
3328 if (items[i].byte_len != slen) {
3329 Node *rem = NULL_NODE;
3330 UChar *q = p + items[i].byte_len;
3331
3332 if (q < end) {
3333 r = expand_case_fold_make_rem_string(&rem, q, end, reg);
3334 if (r != 0) {
3335 onig_node_free(an);
3336 goto mem_err2;
3337 }
3338
3339 xnode = onig_node_list_add(NULL_NODE, snode);
3340 if (IS_NULL(xnode)) {
3341 onig_node_free(an);
3342 onig_node_free(rem);
3343 goto mem_err2;
3344 }
3345 if (IS_NULL(onig_node_list_add(xnode, rem))) {
3346 onig_node_free(an);
3347 onig_node_free(xnode);
3348 onig_node_free(rem);
3349 goto mem_err;
3350 }
3351
3352 NCAR(an) = xnode;
3353 }
3354 else {
3355 NCAR(an) = snode;
3356 }
3357
3358 if (var_anode == NULL) {
3359 onig_node_free(an);
3360 onig_node_free(xnode);
3361 onig_node_free(rem);
3362 goto mem_err2;
3363 }
3364 NCDR(var_anode) = an;
3365 var_anode = an;
3366 }
3367 else {
3368 NCAR(an) = snode;
3369 NCDR(anode) = an;
3370 anode = an;
3371 }
3372 }
3373
3374 return varlen;
3375
3376 mem_err2:
3377 onig_node_free(snode);
3378
3379 mem_err:
3380 onig_node_free(*rnode);
3381
3382 return ONIGERR_MEMORY;
3383 }
3384
3385 static int
expand_case_fold_string(Node * node,regex_t * reg)3386 expand_case_fold_string(Node* node, regex_t* reg)
3387 {
3388 #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8
3389
3390 int r, n, len, alt_num;
3391 UChar *start, *end, *p;
3392 Node *top_root, *root, *snode, *prev_node;
3393 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
3394 StrNode* sn = NSTR(node);
3395
3396 if (NSTRING_IS_AMBIG(node)) return 0;
3397
3398 start = sn->s;
3399 end = sn->end;
3400 if (start >= end) return 0;
3401
3402 r = 0;
3403 top_root = root = prev_node = snode = NULL_NODE;
3404 alt_num = 1;
3405 p = start;
3406 while (p < end) {
3407 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag,
3408 p, end, items);
3409 if (n < 0) {
3410 r = n;
3411 goto err;
3412 }
3413
3414 len = enclen(reg->enc, p);
3415
3416 if (n == 0) {
3417 if (IS_NULL(snode)) {
3418 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3419 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3420 if (IS_NULL(root)) {
3421 onig_node_free(prev_node);
3422 goto mem_err;
3423 }
3424 }
3425
3426 prev_node = snode = onig_node_new_str(NULL, NULL);
3427 if (IS_NULL(snode)) goto mem_err;
3428 if (IS_NOT_NULL(root)) {
3429 if (IS_NULL(onig_node_list_add(root, snode))) {
3430 onig_node_free(snode);
3431 goto mem_err;
3432 }
3433 }
3434 }
3435
3436 r = onig_node_str_cat(snode, p, p + len);
3437 if (r != 0) goto err;
3438 }
3439 else {
3440 alt_num *= (n + 1);
3441 if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break;
3442
3443 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3444 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3445 if (IS_NULL(root)) {
3446 onig_node_free(prev_node);
3447 goto mem_err;
3448 }
3449 }
3450
3451 r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node);
3452 if (r < 0) goto mem_err;
3453 if (r == 1) {
3454 if (IS_NULL(root)) {
3455 top_root = prev_node;
3456 }
3457 else {
3458 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3459 onig_node_free(prev_node);
3460 goto mem_err;
3461 }
3462 }
3463
3464 root = NCAR(prev_node);
3465 }
3466 else { /* r == 0 */
3467 if (IS_NOT_NULL(root)) {
3468 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3469 onig_node_free(prev_node);
3470 goto mem_err;
3471 }
3472 }
3473 }
3474
3475 snode = NULL_NODE;
3476 }
3477
3478 p += len;
3479 }
3480
3481 if (p < end) {
3482 Node *srem;
3483
3484 r = expand_case_fold_make_rem_string(&srem, p, end, reg);
3485 if (r != 0) goto mem_err;
3486
3487 if (IS_NOT_NULL(prev_node) && IS_NULL(root)) {
3488 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3489 if (IS_NULL(root)) {
3490 onig_node_free(srem);
3491 onig_node_free(prev_node);
3492 goto mem_err;
3493 }
3494 }
3495
3496 if (IS_NULL(root)) {
3497 prev_node = srem;
3498 }
3499 else {
3500 if (IS_NULL(onig_node_list_add(root, srem))) {
3501 onig_node_free(srem);
3502 goto mem_err;
3503 }
3504 }
3505 }
3506
3507 /* ending */
3508 top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node);
3509 swap_node(node, top_root);
3510 onig_node_free(top_root);
3511 return 0;
3512
3513 mem_err:
3514 r = ONIGERR_MEMORY;
3515
3516 err:
3517 onig_node_free(top_root);
3518 return r;
3519 }
3520
3521
3522 #ifdef USE_COMBINATION_EXPLOSION_CHECK
3523
3524 #define CEC_THRES_NUM_BIG_REPEAT 512
3525 #define CEC_INFINITE_NUM 0x7fffffff
3526
3527 #define CEC_IN_INFINITE_REPEAT (1<<0)
3528 #define CEC_IN_FINITE_REPEAT (1<<1)
3529 #define CEC_CONT_BIG_REPEAT (1<<2)
3530
3531 static int
setup_comb_exp_check(Node * node,int state,ScanEnv * env)3532 setup_comb_exp_check(Node* node, int state, ScanEnv* env)
3533 {
3534 int type;
3535 int r = state;
3536
3537 type = NTYPE(node);
3538 switch (type) {
3539 case NT_LIST:
3540 {
3541 Node* prev = NULL_NODE;
3542 do {
3543 r = setup_comb_exp_check(NCAR(node), r, env);
3544 prev = NCAR(node);
3545 } while (r >= 0 && IS_NOT_NULL(node = NCDR(node)));
3546 }
3547 break;
3548
3549 case NT_ALT:
3550 {
3551 int ret;
3552 do {
3553 ret = setup_comb_exp_check(NCAR(node), state, env);
3554 r |= ret;
3555 } while (ret >= 0 && IS_NOT_NULL(node = NCDR(node)));
3556 }
3557 break;
3558
3559 case NT_QTFR:
3560 {
3561 int child_state = state;
3562 int add_state = 0;
3563 QtfrNode* qn = NQTFR(node);
3564 Node* target = qn->target;
3565 int var_num;
3566
3567 if (! IS_REPEAT_INFINITE(qn->upper)) {
3568 if (qn->upper > 1) {
3569 /* {0,1}, {1,1} are allowed */
3570 child_state |= CEC_IN_FINITE_REPEAT;
3571
3572 /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */
3573 if (env->backrefed_mem == 0) {
3574 if (NTYPE(qn->target) == NT_ENCLOSE) {
3575 EncloseNode* en = NENCLOSE(qn->target);
3576 if (en->type == ENCLOSE_MEMORY) {
3577 if (NTYPE(en->target) == NT_QTFR) {
3578 QtfrNode* q = NQTFR(en->target);
3579 if (IS_REPEAT_INFINITE(q->upper)
3580 && q->greedy == qn->greedy) {
3581 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
3582 if (qn->upper == 1)
3583 child_state = state;
3584 }
3585 }
3586 }
3587 }
3588 }
3589 }
3590 }
3591
3592 if (state & CEC_IN_FINITE_REPEAT) {
3593 qn->comb_exp_check_num = -1;
3594 }
3595 else {
3596 if (IS_REPEAT_INFINITE(qn->upper)) {
3597 var_num = CEC_INFINITE_NUM;
3598 child_state |= CEC_IN_INFINITE_REPEAT;
3599 }
3600 else {
3601 var_num = qn->upper - qn->lower;
3602 }
3603
3604 if (var_num >= CEC_THRES_NUM_BIG_REPEAT)
3605 add_state |= CEC_CONT_BIG_REPEAT;
3606
3607 if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) ||
3608 ((state & CEC_CONT_BIG_REPEAT) != 0 &&
3609 var_num >= CEC_THRES_NUM_BIG_REPEAT)) {
3610 if (qn->comb_exp_check_num == 0) {
3611 env->num_comb_exp_check++;
3612 qn->comb_exp_check_num = env->num_comb_exp_check;
3613 if (env->curr_max_regnum > env->comb_exp_max_regnum)
3614 env->comb_exp_max_regnum = env->curr_max_regnum;
3615 }
3616 }
3617 }
3618
3619 r = setup_comb_exp_check(target, child_state, env);
3620 r |= add_state;
3621 }
3622 break;
3623
3624 case NT_ENCLOSE:
3625 {
3626 EncloseNode* en = NENCLOSE(node);
3627
3628 switch (en->type) {
3629 case ENCLOSE_MEMORY:
3630 {
3631 if (env->curr_max_regnum < en->regnum)
3632 env->curr_max_regnum = en->regnum;
3633
3634 r = setup_comb_exp_check(en->target, state, env);
3635 }
3636 break;
3637
3638 default:
3639 r = setup_comb_exp_check(en->target, state, env);
3640 break;
3641 }
3642 }
3643 break;
3644
3645 #ifdef USE_SUBEXP_CALL
3646 case NT_CALL:
3647 if (IS_CALL_RECURSION(NCALL(node)))
3648 env->has_recursion = 1;
3649 else
3650 r = setup_comb_exp_check(NCALL(node)->target, state, env);
3651 break;
3652 #endif
3653
3654 default:
3655 break;
3656 }
3657
3658 return r;
3659 }
3660 #endif
3661
3662 #define IN_ALT (1<<0)
3663 #define IN_NOT (1<<1)
3664 #define IN_REPEAT (1<<2)
3665 #define IN_VAR_REPEAT (1<<3)
3666
3667 /* setup_tree does the following work.
3668 1. check empty loop. (set qn->target_empty_info)
3669 2. expand ignore-case in char class.
3670 3. set memory status bit flags. (reg->mem_stats)
3671 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
3672 5. find invalid patterns in look-behind.
3673 6. expand repeated string.
3674 */
3675 static int
setup_tree(Node * node,regex_t * reg,int state,ScanEnv * env)3676 setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
3677 {
3678 int type;
3679 int r = 0;
3680
3681 type = NTYPE(node);
3682 switch (type) {
3683 case NT_LIST:
3684 {
3685 Node* prev = NULL_NODE;
3686 do {
3687 r = setup_tree(NCAR(node), reg, state, env);
3688 if (IS_NOT_NULL(prev) && r == 0) {
3689 r = next_setup(prev, NCAR(node), reg);
3690 }
3691 prev = NCAR(node);
3692 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3693 }
3694 break;
3695
3696 case NT_ALT:
3697 do {
3698 r = setup_tree(NCAR(node), reg, (state | IN_ALT), env);
3699 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3700 break;
3701
3702 case NT_CCLASS:
3703 break;
3704
3705 case NT_STR:
3706 if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
3707 r = expand_case_fold_string(node, reg);
3708 }
3709 break;
3710
3711 case NT_CTYPE:
3712 case NT_CANY:
3713 break;
3714
3715 #ifdef USE_SUBEXP_CALL
3716 case NT_CALL:
3717 break;
3718 #endif
3719
3720 case NT_BREF:
3721 {
3722 int i;
3723 int* p;
3724 Node** nodes = SCANENV_MEM_NODES(env);
3725 BRefNode* br = NBREF(node);
3726 p = BACKREFS_P(br);
3727 for (i = 0; i < br->back_num; i++) {
3728 if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
3729 BIT_STATUS_ON_AT(env->backrefed_mem, p[i]);
3730 BIT_STATUS_ON_AT(env->bt_mem_start, p[i]);
3731 #ifdef USE_BACKREF_WITH_LEVEL
3732 if (IS_BACKREF_NEST_LEVEL(br)) {
3733 BIT_STATUS_ON_AT(env->bt_mem_end, p[i]);
3734 }
3735 #endif
3736 SET_ENCLOSE_STATUS(nodes[p[i]], NST_MEM_BACKREFED);
3737 }
3738 }
3739 break;
3740
3741 case NT_QTFR:
3742 {
3743 OnigDistance d;
3744 QtfrNode* qn = NQTFR(node);
3745 Node* target = qn->target;
3746
3747 if ((state & IN_REPEAT) != 0) {
3748 qn->state |= NST_IN_REPEAT;
3749 }
3750
3751 if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
3752 r = get_min_match_length(target, &d, env);
3753 if (r) break;
3754 if (d == 0) {
3755 qn->target_empty_info = NQ_TARGET_IS_EMPTY;
3756 #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
3757 r = quantifiers_memory_node_info(target);
3758 if (r < 0) break;
3759 if (r > 0) {
3760 qn->target_empty_info = r;
3761 }
3762 #endif
3763 #if 0
3764 r = get_max_match_length(target, &d, env);
3765 if (r == 0 && d == 0) {
3766 /* ()* ==> ()?, ()+ ==> () */
3767 qn->upper = 1;
3768 if (qn->lower > 1) qn->lower = 1;
3769 if (NTYPE(target) == NT_STR) {
3770 qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */
3771 }
3772 }
3773 #endif
3774 }
3775 }
3776
3777 state |= IN_REPEAT;
3778 if (qn->lower != qn->upper)
3779 state |= IN_VAR_REPEAT;
3780 r = setup_tree(target, reg, state, env);
3781 if (r) break;
3782
3783 /* expand string */
3784 #define EXPAND_STRING_MAX_LENGTH 100
3785 if (NTYPE(target) == NT_STR) {
3786 if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper &&
3787 qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3788 int len = NSTRING_LEN(target);
3789 StrNode* sn = NSTR(target);
3790
3791 if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) {
3792 int i, n = qn->lower;
3793 onig_node_conv_to_str_node(node, NSTR(target)->flag);
3794 for (i = 0; i < n; i++) {
3795 r = onig_node_str_cat(node, sn->s, sn->end);
3796 if (r) break;
3797 }
3798 onig_node_free(target);
3799 break; /* break case NT_QTFR: */
3800 }
3801 }
3802 }
3803
3804 #ifdef USE_OP_PUSH_OR_JUMP_EXACT
3805 if (qn->greedy && (qn->target_empty_info != 0)) {
3806 if (NTYPE(target) == NT_QTFR) {
3807 QtfrNode* tqn = NQTFR(target);
3808 if (IS_NOT_NULL(tqn->head_exact)) {
3809 qn->head_exact = tqn->head_exact;
3810 tqn->head_exact = NULL;
3811 }
3812 }
3813 else {
3814 qn->head_exact = get_head_value_node(qn->target, 1, reg);
3815 }
3816 }
3817 #endif
3818 }
3819 break;
3820
3821 case NT_ENCLOSE:
3822 {
3823 EncloseNode* en = NENCLOSE(node);
3824
3825 switch (en->type) {
3826 case ENCLOSE_OPTION:
3827 {
3828 OnigOptionType options = reg->options;
3829 reg->options = NENCLOSE(node)->option;
3830 r = setup_tree(NENCLOSE(node)->target, reg, state, env);
3831 reg->options = options;
3832 }
3833 break;
3834
3835 case ENCLOSE_MEMORY:
3836 if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT)) != 0) {
3837 BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum);
3838 /* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */
3839 }
3840 r = setup_tree(en->target, reg, state, env);
3841 break;
3842
3843 case ENCLOSE_STOP_BACKTRACK:
3844 {
3845 Node* target = en->target;
3846 r = setup_tree(target, reg, state, env);
3847 if (NTYPE(target) == NT_QTFR) {
3848 QtfrNode* tqn = NQTFR(target);
3849 if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 &&
3850 tqn->greedy != 0) { /* (?>a*), a*+ etc... */
3851 int qtype = NTYPE(tqn->target);
3852 if (IS_NODE_TYPE_SIMPLE(qtype))
3853 SET_ENCLOSE_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT);
3854 }
3855 }
3856 }
3857 break;
3858 }
3859 }
3860 break;
3861
3862 case NT_ANCHOR:
3863 {
3864 AnchorNode* an = NANCHOR(node);
3865
3866 switch (an->type) {
3867 case ANCHOR_PREC_READ:
3868 r = setup_tree(an->target, reg, state, env);
3869 break;
3870 case ANCHOR_PREC_READ_NOT:
3871 r = setup_tree(an->target, reg, (state | IN_NOT), env);
3872 break;
3873
3874 /* allowed node types in look-behind */
3875 #define ALLOWED_TYPE_IN_LB \
3876 ( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \
3877 BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL )
3878
3879 #define ALLOWED_ENCLOSE_IN_LB ( ENCLOSE_MEMORY )
3880 #define ALLOWED_ENCLOSE_IN_LB_NOT 0
3881
3882 #define ALLOWED_ANCHOR_IN_LB \
3883 ( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
3884 #define ALLOWED_ANCHOR_IN_LB_NOT \
3885 ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
3886
3887 case ANCHOR_LOOK_BEHIND:
3888 {
3889 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3890 ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB);
3891 if (r < 0) return r;
3892 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3893 r = setup_look_behind(node, reg, env);
3894 if (r != 0) return r;
3895 r = setup_tree(an->target, reg, state, env);
3896 }
3897 break;
3898
3899 case ANCHOR_LOOK_BEHIND_NOT:
3900 {
3901 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
3902 ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
3903 if (r < 0) return r;
3904 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3905 r = setup_look_behind(node, reg, env);
3906 if (r != 0) return r;
3907 r = setup_tree(an->target, reg, (state | IN_NOT), env);
3908 }
3909 break;
3910 }
3911 }
3912 break;
3913
3914 default:
3915 break;
3916 }
3917
3918 return r;
3919 }
3920
3921 /* set skip map for Boyer-Moor search */
3922 static int
set_bm_skip(UChar * s,UChar * end,OnigEncoding enc ARG_UNUSED,UChar skip[],int ** int_skip)3923 set_bm_skip(UChar* s, UChar* end, OnigEncoding enc ARG_UNUSED,
3924 UChar skip[], int** int_skip)
3925 {
3926 int i, len;
3927
3928 len = (int)(end - s);
3929 if (len < ONIG_CHAR_TABLE_SIZE) {
3930 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar)len;
3931
3932 for (i = 0; i < len - 1; i++)
3933 skip[s[i]] = (UChar)(len - 1 - i);
3934 }
3935 else {
3936 if (IS_NULL(*int_skip)) {
3937 *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
3938 if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
3939 }
3940 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = len;
3941
3942 for (i = 0; i < len - 1; i++)
3943 (*int_skip)[s[i]] = len - 1 - i;
3944 }
3945 return 0;
3946 }
3947
3948 #define OPT_EXACT_MAXLEN 24
3949
3950 typedef struct {
3951 OnigDistance min; /* min byte length */
3952 OnigDistance max; /* max byte length */
3953 } MinMaxLen;
3954
3955 typedef struct {
3956 MinMaxLen mmd;
3957 OnigEncoding enc;
3958 OnigOptionType options;
3959 OnigCaseFoldType case_fold_flag;
3960 ScanEnv* scan_env;
3961 } OptEnv;
3962
3963 typedef struct {
3964 int left_anchor;
3965 int right_anchor;
3966 } OptAncInfo;
3967
3968 typedef struct {
3969 MinMaxLen mmd; /* info position */
3970 OptAncInfo anc;
3971
3972 int reach_end;
3973 int ignore_case;
3974 int len;
3975 UChar s[OPT_EXACT_MAXLEN];
3976 } OptExactInfo;
3977
3978 typedef struct {
3979 MinMaxLen mmd; /* info position */
3980 OptAncInfo anc;
3981
3982 int value; /* weighted value */
3983 UChar map[ONIG_CHAR_TABLE_SIZE];
3984 } OptMapInfo;
3985
3986 typedef struct {
3987 MinMaxLen len;
3988
3989 OptAncInfo anc;
3990 OptExactInfo exb; /* boundary */
3991 OptExactInfo exm; /* middle */
3992 OptExactInfo expr; /* prec read (?=...) */
3993
3994 OptMapInfo map; /* boundary */
3995 } NodeOptInfo;
3996
3997
3998 static int
map_position_value(OnigEncoding enc,int i)3999 map_position_value(OnigEncoding enc, int i)
4000 {
4001 static const short int ByteValTable[] = {
4002 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
4003 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4004 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
4005 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
4006 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4007 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
4008 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4009 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
4010 };
4011
4012 if (i < (int )(sizeof(ByteValTable)/sizeof(ByteValTable[0]))) {
4013 if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
4014 return 20;
4015 else
4016 return (int )ByteValTable[i];
4017 }
4018 else
4019 return 4; /* Take it easy. */
4020 }
4021
4022 static int
distance_value(MinMaxLen * mm)4023 distance_value(MinMaxLen* mm)
4024 {
4025 /* 1000 / (min-max-dist + 1) */
4026 static const short int dist_vals[] = {
4027 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
4028 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
4029 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
4030 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
4031 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
4032 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
4033 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
4034 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
4035 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
4036 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
4037 };
4038
4039 int d;
4040
4041 if (mm->max == ONIG_INFINITE_DISTANCE) return 0;
4042
4043 d = mm->max - mm->min;
4044 if (d < (int )(sizeof(dist_vals)/sizeof(dist_vals[0])))
4045 /* return dist_vals[d] * 16 / (mm->min + 12); */
4046 return (int )dist_vals[d];
4047 else
4048 return 1;
4049 }
4050
4051 static int
comp_distance_value(MinMaxLen * d1,MinMaxLen * d2,int v1,int v2)4052 comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
4053 {
4054 if (v2 <= 0) return -1;
4055 if (v1 <= 0) return 1;
4056
4057 v1 *= distance_value(d1);
4058 v2 *= distance_value(d2);
4059
4060 if (v2 > v1) return 1;
4061 if (v2 < v1) return -1;
4062
4063 if (d2->min < d1->min) return 1;
4064 if (d2->min > d1->min) return -1;
4065 return 0;
4066 }
4067
4068 static int
is_equal_mml(MinMaxLen * a,MinMaxLen * b)4069 is_equal_mml(MinMaxLen* a, MinMaxLen* b)
4070 {
4071 return (a->min == b->min && a->max == b->max) ? 1 : 0;
4072 }
4073
4074
4075 static void
set_mml(MinMaxLen * mml,OnigDistance min,OnigDistance max)4076 set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max)
4077 {
4078 mml->min = min;
4079 mml->max = max;
4080 }
4081
4082 static void
clear_mml(MinMaxLen * mml)4083 clear_mml(MinMaxLen* mml)
4084 {
4085 mml->min = mml->max = 0;
4086 }
4087
4088 static void
copy_mml(MinMaxLen * to,MinMaxLen * from)4089 copy_mml(MinMaxLen* to, MinMaxLen* from)
4090 {
4091 to->min = from->min;
4092 to->max = from->max;
4093 }
4094
4095 static void
add_mml(MinMaxLen * to,MinMaxLen * from)4096 add_mml(MinMaxLen* to, MinMaxLen* from)
4097 {
4098 to->min = distance_add(to->min, from->min);
4099 to->max = distance_add(to->max, from->max);
4100 }
4101
4102 #if 0
4103 static void
4104 add_len_mml(MinMaxLen* to, OnigDistance len)
4105 {
4106 to->min = distance_add(to->min, len);
4107 to->max = distance_add(to->max, len);
4108 }
4109 #endif
4110
4111 static void
alt_merge_mml(MinMaxLen * to,MinMaxLen * from)4112 alt_merge_mml(MinMaxLen* to, MinMaxLen* from)
4113 {
4114 if (to->min > from->min) to->min = from->min;
4115 if (to->max < from->max) to->max = from->max;
4116 }
4117
4118 static void
copy_opt_env(OptEnv * to,OptEnv * from)4119 copy_opt_env(OptEnv* to, OptEnv* from)
4120 {
4121 CopyMem (to, from, sizeof (OptEnv));
4122 }
4123
4124 static void
clear_opt_anc_info(OptAncInfo * anc)4125 clear_opt_anc_info(OptAncInfo* anc)
4126 {
4127 anc->left_anchor = 0;
4128 anc->right_anchor = 0;
4129 }
4130
4131 static void
copy_opt_anc_info(OptAncInfo * to,OptAncInfo * from)4132 copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from)
4133 {
4134 CopyMem (to, from, sizeof (OptAncInfo));
4135 }
4136
4137 static void
concat_opt_anc_info(OptAncInfo * to,OptAncInfo * left,OptAncInfo * right,OnigDistance left_len,OnigDistance right_len)4138 concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right,
4139 OnigDistance left_len, OnigDistance right_len)
4140 {
4141 clear_opt_anc_info(to);
4142
4143 to->left_anchor = left->left_anchor;
4144 if (left_len == 0) {
4145 to->left_anchor |= right->left_anchor;
4146 }
4147
4148 to->right_anchor = right->right_anchor;
4149 if (right_len == 0) {
4150 to->right_anchor |= left->right_anchor;
4151 }
4152 }
4153
4154 static int
is_left_anchor(int anc)4155 is_left_anchor(int anc)
4156 {
4157 if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF ||
4158 anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ ||
4159 anc == ANCHOR_PREC_READ_NOT)
4160 return 0;
4161
4162 return 1;
4163 }
4164
4165 static int
is_set_opt_anc_info(OptAncInfo * to,int anc)4166 is_set_opt_anc_info(OptAncInfo* to, int anc)
4167 {
4168 if ((to->left_anchor & anc) != 0) return 1;
4169
4170 return ((to->right_anchor & anc) != 0 ? 1 : 0);
4171 }
4172
4173 static void
add_opt_anc_info(OptAncInfo * to,int anc)4174 add_opt_anc_info(OptAncInfo* to, int anc)
4175 {
4176 if (is_left_anchor(anc))
4177 to->left_anchor |= anc;
4178 else
4179 to->right_anchor |= anc;
4180 }
4181
4182 static void
remove_opt_anc_info(OptAncInfo * to,int anc)4183 remove_opt_anc_info(OptAncInfo* to, int anc)
4184 {
4185 if (is_left_anchor(anc))
4186 to->left_anchor &= ~anc;
4187 else
4188 to->right_anchor &= ~anc;
4189 }
4190
4191 static void
alt_merge_opt_anc_info(OptAncInfo * to,OptAncInfo * add)4192 alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add)
4193 {
4194 to->left_anchor &= add->left_anchor;
4195 to->right_anchor &= add->right_anchor;
4196 }
4197
4198 static int
is_full_opt_exact_info(OptExactInfo * ex)4199 is_full_opt_exact_info(OptExactInfo* ex)
4200 {
4201 return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0);
4202 }
4203
4204 static void
clear_opt_exact_info(OptExactInfo * ex)4205 clear_opt_exact_info(OptExactInfo* ex)
4206 {
4207 clear_mml(&ex->mmd);
4208 clear_opt_anc_info(&ex->anc);
4209 ex->reach_end = 0;
4210 ex->ignore_case = 0;
4211 ex->len = 0;
4212 ex->s[0] = '\0';
4213 }
4214
4215 static void
copy_opt_exact_info(OptExactInfo * to,OptExactInfo * from)4216 copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from)
4217 {
4218 CopyMem (to, from, sizeof (OptExactInfo));
4219 }
4220
4221 static void
concat_opt_exact_info(OptExactInfo * to,OptExactInfo * add,OnigEncoding enc)4222 concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc)
4223 {
4224 int i, j, len;
4225 UChar *p, *end;
4226 OptAncInfo tanc;
4227
4228 if (! to->ignore_case && add->ignore_case) {
4229 if (to->len >= add->len) return ; /* avoid */
4230
4231 to->ignore_case = 1;
4232 }
4233
4234 p = add->s;
4235 end = p + add->len;
4236 for (i = to->len; p < end; ) {
4237 len = enclen(enc, p);
4238 if (i + len > OPT_EXACT_MAXLEN) break;
4239 for (j = 0; j < len && p < end; j++)
4240 to->s[i++] = *p++;
4241 }
4242
4243 to->len = i;
4244 to->reach_end = (p == end ? add->reach_end : 0);
4245
4246 concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
4247 if (! to->reach_end) tanc.right_anchor = 0;
4248 copy_opt_anc_info(&to->anc, &tanc);
4249 }
4250
4251 static void
concat_opt_exact_info_str(OptExactInfo * to,UChar * s,UChar * end,int raw ARG_UNUSED,OnigEncoding enc)4252 concat_opt_exact_info_str(OptExactInfo* to, UChar* s, UChar* end,
4253 int raw ARG_UNUSED, OnigEncoding enc)
4254 {
4255 int i, j, len;
4256 UChar *p;
4257
4258 for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
4259 len = enclen(enc, p);
4260 if (i + len > OPT_EXACT_MAXLEN) break;
4261 for (j = 0; j < len && p < end; j++)
4262 to->s[i++] = *p++;
4263 }
4264
4265 to->len = i;
4266 }
4267
4268 static void
alt_merge_opt_exact_info(OptExactInfo * to,OptExactInfo * add,OptEnv * env)4269 alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
4270 {
4271 int i, j, len;
4272
4273 if (add->len == 0 || to->len == 0) {
4274 clear_opt_exact_info(to);
4275 return ;
4276 }
4277
4278 if (! is_equal_mml(&to->mmd, &add->mmd)) {
4279 clear_opt_exact_info(to);
4280 return ;
4281 }
4282
4283 for (i = 0; i < to->len && i < add->len; ) {
4284 if (to->s[i] != add->s[i]) break;
4285 len = enclen(env->enc, to->s + i);
4286
4287 for (j = 1; j < len; j++) {
4288 if (to->s[i+j] != add->s[i+j]) break;
4289 }
4290 if (j < len) break;
4291 i += len;
4292 }
4293
4294 if (! add->reach_end || i < add->len || i < to->len) {
4295 to->reach_end = 0;
4296 }
4297 to->len = i;
4298 to->ignore_case |= add->ignore_case;
4299
4300 alt_merge_opt_anc_info(&to->anc, &add->anc);
4301 if (! to->reach_end) to->anc.right_anchor = 0;
4302 }
4303
4304 static void
select_opt_exact_info(OnigEncoding enc,OptExactInfo * now,OptExactInfo * alt)4305 select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt)
4306 {
4307 int v1, v2;
4308
4309 v1 = now->len;
4310 v2 = alt->len;
4311
4312 if (v2 == 0) {
4313 return ;
4314 }
4315 else if (v1 == 0) {
4316 copy_opt_exact_info(now, alt);
4317 return ;
4318 }
4319 else if (v1 <= 2 && v2 <= 2) {
4320 /* ByteValTable[x] is big value --> low price */
4321 v2 = map_position_value(enc, now->s[0]);
4322 v1 = map_position_value(enc, alt->s[0]);
4323
4324 if (now->len > 1) v1 += 5;
4325 if (alt->len > 1) v2 += 5;
4326 }
4327
4328 if (now->ignore_case == 0) v1 *= 2;
4329 if (alt->ignore_case == 0) v2 *= 2;
4330
4331 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4332 copy_opt_exact_info(now, alt);
4333 }
4334
4335 static void
clear_opt_map_info(OptMapInfo * map)4336 clear_opt_map_info(OptMapInfo* map)
4337 {
4338 static const OptMapInfo clean_info = {
4339 {0, 0}, {0, 0}, 0,
4340 {
4341 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4342 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4343 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4344 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4345 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4346 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4347 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4348 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4349 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4350 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4351 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4352 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4353 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4354 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4355 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4356 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4357 }
4358 };
4359
4360 xmemcpy(map, &clean_info, sizeof(OptMapInfo));
4361 }
4362
4363 static void
copy_opt_map_info(OptMapInfo * to,OptMapInfo * from)4364 copy_opt_map_info(OptMapInfo* to, OptMapInfo* from)
4365 {
4366 CopyMem (to, from, sizeof (OptMapInfo));
4367 }
4368
4369 static void
add_char_opt_map_info(OptMapInfo * map,UChar c,OnigEncoding enc)4370 add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc)
4371 {
4372 if (map->map[c] == 0) {
4373 map->map[c] = 1;
4374 map->value += map_position_value(enc, c);
4375 }
4376 }
4377
4378 static int
add_char_amb_opt_map_info(OptMapInfo * map,UChar * p,UChar * end,OnigEncoding enc,OnigCaseFoldType case_fold_flag)4379 add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end,
4380 OnigEncoding enc, OnigCaseFoldType case_fold_flag)
4381 {
4382 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4383 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4384 int i, n;
4385
4386 add_char_opt_map_info(map, p[0], enc);
4387
4388 case_fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag);
4389 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, case_fold_flag, p, end, items);
4390 if (n < 0) return n;
4391
4392 for (i = 0; i < n; i++) {
4393 ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf);
4394 add_char_opt_map_info(map, buf[0], enc);
4395 }
4396
4397 return 0;
4398 }
4399
4400 static void
select_opt_map_info(OptMapInfo * now,OptMapInfo * alt)4401 select_opt_map_info(OptMapInfo* now, OptMapInfo* alt)
4402 {
4403 static int z = 1<<15; /* 32768: something big value */
4404
4405 int v1, v2;
4406
4407 if (alt->value == 0) return ;
4408 if (now->value == 0) {
4409 copy_opt_map_info(now, alt);
4410 return ;
4411 }
4412
4413 v1 = z / now->value;
4414 v2 = z / alt->value;
4415 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4416 copy_opt_map_info(now, alt);
4417 }
4418
4419 static int
comp_opt_exact_or_map_info(OptExactInfo * e,OptMapInfo * m)4420 comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m)
4421 {
4422 #define COMP_EM_BASE 20
4423 int ve, vm;
4424
4425 if (m->value <= 0) return -1;
4426
4427 ve = COMP_EM_BASE * e->len * (e->ignore_case ? 1 : 2);
4428 vm = COMP_EM_BASE * 5 * 2 / m->value;
4429 return comp_distance_value(&e->mmd, &m->mmd, ve, vm);
4430 }
4431
4432 static void
alt_merge_opt_map_info(OnigEncoding enc,OptMapInfo * to,OptMapInfo * add)4433 alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add)
4434 {
4435 int i, val;
4436
4437 /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
4438 if (to->value == 0) return ;
4439 if (add->value == 0 || to->mmd.max < add->mmd.min) {
4440 clear_opt_map_info(to);
4441 return ;
4442 }
4443
4444 alt_merge_mml(&to->mmd, &add->mmd);
4445
4446 val = 0;
4447 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
4448 if (add->map[i])
4449 to->map[i] = 1;
4450
4451 if (to->map[i])
4452 val += map_position_value(enc, i);
4453 }
4454 to->value = val;
4455
4456 alt_merge_opt_anc_info(&to->anc, &add->anc);
4457 }
4458
4459 static void
set_bound_node_opt_info(NodeOptInfo * opt,MinMaxLen * mmd)4460 set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd)
4461 {
4462 copy_mml(&(opt->exb.mmd), mmd);
4463 copy_mml(&(opt->expr.mmd), mmd);
4464 copy_mml(&(opt->map.mmd), mmd);
4465 }
4466
4467 static void
clear_node_opt_info(NodeOptInfo * opt)4468 clear_node_opt_info(NodeOptInfo* opt)
4469 {
4470 clear_mml(&opt->len);
4471 clear_opt_anc_info(&opt->anc);
4472 clear_opt_exact_info(&opt->exb);
4473 clear_opt_exact_info(&opt->exm);
4474 clear_opt_exact_info(&opt->expr);
4475 clear_opt_map_info(&opt->map);
4476 }
4477
4478 static void
copy_node_opt_info(NodeOptInfo * to,NodeOptInfo * from)4479 copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from)
4480 {
4481 CopyMem (to, from, sizeof (NodeOptInfo));
4482 }
4483
4484 static void
concat_left_node_opt_info(OnigEncoding enc,NodeOptInfo * to,NodeOptInfo * add)4485 concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add)
4486 {
4487 int exb_reach, exm_reach;
4488 OptAncInfo tanc;
4489
4490 concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
4491 copy_opt_anc_info(&to->anc, &tanc);
4492
4493 if (add->exb.len > 0 && to->len.max == 0) {
4494 concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc,
4495 to->len.max, add->len.max);
4496 copy_opt_anc_info(&add->exb.anc, &tanc);
4497 }
4498
4499 if (add->map.value > 0 && to->len.max == 0) {
4500 if (add->map.mmd.max == 0)
4501 add->map.anc.left_anchor |= to->anc.left_anchor;
4502 }
4503
4504 exb_reach = to->exb.reach_end;
4505 exm_reach = to->exm.reach_end;
4506
4507 if (add->len.max != 0)
4508 to->exb.reach_end = to->exm.reach_end = 0;
4509
4510 if (add->exb.len > 0) {
4511 if (exb_reach) {
4512 concat_opt_exact_info(&to->exb, &add->exb, enc);
4513 clear_opt_exact_info(&add->exb);
4514 }
4515 else if (exm_reach) {
4516 concat_opt_exact_info(&to->exm, &add->exb, enc);
4517 clear_opt_exact_info(&add->exb);
4518 }
4519 }
4520 select_opt_exact_info(enc, &to->exm, &add->exb);
4521 select_opt_exact_info(enc, &to->exm, &add->exm);
4522
4523 if (to->expr.len > 0) {
4524 if (add->len.max > 0) {
4525 if (to->expr.len > (int )add->len.max)
4526 to->expr.len = add->len.max;
4527
4528 if (to->expr.mmd.max == 0)
4529 select_opt_exact_info(enc, &to->exb, &to->expr);
4530 else
4531 select_opt_exact_info(enc, &to->exm, &to->expr);
4532 }
4533 }
4534 else if (add->expr.len > 0) {
4535 copy_opt_exact_info(&to->expr, &add->expr);
4536 }
4537
4538 select_opt_map_info(&to->map, &add->map);
4539
4540 add_mml(&to->len, &add->len);
4541 }
4542
4543 static void
alt_merge_node_opt_info(NodeOptInfo * to,NodeOptInfo * add,OptEnv * env)4544 alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env)
4545 {
4546 alt_merge_opt_anc_info (&to->anc, &add->anc);
4547 alt_merge_opt_exact_info(&to->exb, &add->exb, env);
4548 alt_merge_opt_exact_info(&to->exm, &add->exm, env);
4549 alt_merge_opt_exact_info(&to->expr, &add->expr, env);
4550 alt_merge_opt_map_info(env->enc, &to->map, &add->map);
4551
4552 alt_merge_mml(&to->len, &add->len);
4553 }
4554
4555
4556 #define MAX_NODE_OPT_INFO_REF_COUNT 5
4557
4558 static int
optimize_node_left(Node * node,NodeOptInfo * opt,OptEnv * env)4559 optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
4560 {
4561 int type;
4562 int r = 0;
4563
4564 clear_node_opt_info(opt);
4565 set_bound_node_opt_info(opt, &env->mmd);
4566
4567 type = NTYPE(node);
4568 switch (type) {
4569 case NT_LIST:
4570 {
4571 OptEnv nenv;
4572 NodeOptInfo nopt;
4573 Node* nd = node;
4574
4575 copy_opt_env(&nenv, env);
4576 do {
4577 r = optimize_node_left(NCAR(nd), &nopt, &nenv);
4578 if (r == 0) {
4579 add_mml(&nenv.mmd, &nopt.len);
4580 concat_left_node_opt_info(env->enc, opt, &nopt);
4581 }
4582 } while (r == 0 && IS_NOT_NULL(nd = NCDR(nd)));
4583 }
4584 break;
4585
4586 case NT_ALT:
4587 {
4588 NodeOptInfo nopt;
4589 Node* nd = node;
4590
4591 do {
4592 r = optimize_node_left(NCAR(nd), &nopt, env);
4593 if (r == 0) {
4594 if (nd == node) copy_node_opt_info(opt, &nopt);
4595 else alt_merge_node_opt_info(opt, &nopt, env);
4596 }
4597 } while ((r == 0) && IS_NOT_NULL(nd = NCDR(nd)));
4598 }
4599 break;
4600
4601 case NT_STR:
4602 {
4603 StrNode* sn = NSTR(node);
4604 int slen = (int)(sn->end - sn->s);
4605 int is_raw = NSTRING_IS_RAW(node);
4606
4607 if (! NSTRING_IS_AMBIG(node)) {
4608 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4609 NSTRING_IS_RAW(node), env->enc);
4610 if (slen > 0) {
4611 add_char_opt_map_info(&opt->map, *(sn->s), env->enc);
4612 }
4613 set_mml(&opt->len, slen, slen);
4614 }
4615 else {
4616 int max;
4617
4618 if (NSTRING_IS_DONT_GET_OPT_INFO(node)) {
4619 int n = onigenc_strlen(env->enc, sn->s, sn->end);
4620 max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n;
4621 }
4622 else {
4623 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4624 is_raw, env->enc);
4625 opt->exb.ignore_case = 1;
4626
4627 if (slen > 0) {
4628 r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end,
4629 env->enc, env->case_fold_flag);
4630 if (r != 0) break;
4631 }
4632
4633 max = slen;
4634 }
4635
4636 set_mml(&opt->len, slen, max);
4637 }
4638
4639 if (opt->exb.len == slen)
4640 opt->exb.reach_end = 1;
4641 }
4642 break;
4643
4644 case NT_CCLASS:
4645 {
4646 int i, z;
4647 CClassNode* cc = NCCLASS(node);
4648
4649 /* no need to check ignore case. (setted in setup_tree()) */
4650
4651 if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
4652 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4653 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4654
4655 set_mml(&opt->len, min, max);
4656 }
4657 else {
4658 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4659 z = BITSET_AT(cc->bs, i);
4660 if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) {
4661 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4662 }
4663 }
4664 set_mml(&opt->len, 1, 1);
4665 }
4666 }
4667 break;
4668
4669 case NT_CTYPE:
4670 {
4671 int i, min, max;
4672
4673 max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4674
4675 if (max == 1) {
4676 min = 1;
4677
4678 switch (NCTYPE(node)->ctype) {
4679 case ONIGENC_CTYPE_WORD:
4680 if (NCTYPE(node)->not != 0) {
4681 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4682 if (! ONIGENC_IS_CODE_WORD(env->enc, i)) {
4683 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4684 }
4685 }
4686 }
4687 else {
4688 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4689 if (ONIGENC_IS_CODE_WORD(env->enc, i)) {
4690 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
4691 }
4692 }
4693 }
4694 break;
4695 }
4696 }
4697 else {
4698 min = ONIGENC_MBC_MINLEN(env->enc);
4699 }
4700 set_mml(&opt->len, min, max);
4701 }
4702 break;
4703
4704 case NT_CANY:
4705 {
4706 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4707 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4708 set_mml(&opt->len, min, max);
4709 }
4710 break;
4711
4712 case NT_ANCHOR:
4713 switch (NANCHOR(node)->type) {
4714 case ANCHOR_BEGIN_BUF:
4715 case ANCHOR_BEGIN_POSITION:
4716 case ANCHOR_BEGIN_LINE:
4717 case ANCHOR_END_BUF:
4718 case ANCHOR_SEMI_END_BUF:
4719 case ANCHOR_END_LINE:
4720 add_opt_anc_info(&opt->anc, NANCHOR(node)->type);
4721 break;
4722
4723 case ANCHOR_PREC_READ:
4724 {
4725 NodeOptInfo nopt;
4726
4727 r = optimize_node_left(NANCHOR(node)->target, &nopt, env);
4728 if (r == 0) {
4729 if (nopt.exb.len > 0)
4730 copy_opt_exact_info(&opt->expr, &nopt.exb);
4731 else if (nopt.exm.len > 0)
4732 copy_opt_exact_info(&opt->expr, &nopt.exm);
4733
4734 opt->expr.reach_end = 0;
4735
4736 if (nopt.map.value > 0)
4737 copy_opt_map_info(&opt->map, &nopt.map);
4738 }
4739 }
4740 break;
4741
4742 case ANCHOR_PREC_READ_NOT:
4743 case ANCHOR_LOOK_BEHIND: /* Sorry, I can't make use of it. */
4744 case ANCHOR_LOOK_BEHIND_NOT:
4745 break;
4746 }
4747 break;
4748
4749 case NT_BREF:
4750 {
4751 int i;
4752 int* backs;
4753 OnigDistance min, max, tmin, tmax;
4754 Node** nodes = SCANENV_MEM_NODES(env->scan_env);
4755 BRefNode* br = NBREF(node);
4756
4757 if (br->state & NST_RECURSION) {
4758 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4759 break;
4760 }
4761 backs = BACKREFS_P(br);
4762 r = get_min_match_length(nodes[backs[0]], &min, env->scan_env);
4763 if (r != 0) break;
4764 r = get_max_match_length(nodes[backs[0]], &max, env->scan_env);
4765 if (r != 0) break;
4766 for (i = 1; i < br->back_num; i++) {
4767 r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env);
4768 if (r != 0) break;
4769 r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env);
4770 if (r != 0) break;
4771 if (min > tmin) min = tmin;
4772 if (max < tmax) max = tmax;
4773 }
4774 if (r == 0) set_mml(&opt->len, min, max);
4775 }
4776 break;
4777
4778 #ifdef USE_SUBEXP_CALL
4779 case NT_CALL:
4780 if (IS_CALL_RECURSION(NCALL(node)))
4781 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
4782 else {
4783 OnigOptionType save = env->options;
4784 env->options = NENCLOSE(NCALL(node)->target)->option;
4785 r = optimize_node_left(NCALL(node)->target, opt, env);
4786 env->options = save;
4787 }
4788 break;
4789 #endif
4790
4791 case NT_QTFR:
4792 {
4793 int i;
4794 OnigDistance min, max;
4795 NodeOptInfo nopt;
4796 QtfrNode* qn = NQTFR(node);
4797
4798 r = optimize_node_left(qn->target, &nopt, env);
4799 if (r) break;
4800
4801 if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) {
4802 if (env->mmd.max == 0 &&
4803 NTYPE(qn->target) == NT_CANY && qn->greedy) {
4804 if (IS_MULTILINE(env->options))
4805 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML);
4806 else
4807 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR);
4808 }
4809 }
4810 else {
4811 if (qn->lower > 0) {
4812 copy_node_opt_info(opt, &nopt);
4813 if (nopt.exb.len > 0) {
4814 if (nopt.exb.reach_end) {
4815 for (i = 2; i <= qn->lower &&
4816 ! is_full_opt_exact_info(&opt->exb); i++) {
4817 concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc);
4818 }
4819 if (i < qn->lower) {
4820 opt->exb.reach_end = 0;
4821 }
4822 }
4823 }
4824
4825 if (qn->lower != qn->upper) {
4826 opt->exb.reach_end = 0;
4827 opt->exm.reach_end = 0;
4828 }
4829 if (qn->lower > 1)
4830 opt->exm.reach_end = 0;
4831 }
4832 }
4833
4834 min = distance_multiply(nopt.len.min, qn->lower);
4835 if (IS_REPEAT_INFINITE(qn->upper))
4836 max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0);
4837 else
4838 max = distance_multiply(nopt.len.max, qn->upper);
4839
4840 set_mml(&opt->len, min, max);
4841 }
4842 break;
4843
4844 case NT_ENCLOSE:
4845 {
4846 EncloseNode* en = NENCLOSE(node);
4847
4848 switch (en->type) {
4849 case ENCLOSE_OPTION:
4850 {
4851 OnigOptionType save = env->options;
4852
4853 env->options = en->option;
4854 r = optimize_node_left(en->target, opt, env);
4855 env->options = save;
4856 }
4857 break;
4858
4859 case ENCLOSE_MEMORY:
4860 #ifdef USE_SUBEXP_CALL
4861 en->opt_count++;
4862 if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
4863 OnigDistance min, max;
4864
4865 min = 0;
4866 max = ONIG_INFINITE_DISTANCE;
4867 if (IS_ENCLOSE_MIN_FIXED(en)) min = en->min_len;
4868 if (IS_ENCLOSE_MAX_FIXED(en)) max = en->max_len;
4869 set_mml(&opt->len, min, max);
4870 }
4871 else
4872 #endif
4873 {
4874 r = optimize_node_left(en->target, opt, env);
4875
4876 if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) {
4877 if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum))
4878 remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK);
4879 }
4880 }
4881 break;
4882
4883 case ENCLOSE_STOP_BACKTRACK:
4884 r = optimize_node_left(en->target, opt, env);
4885 break;
4886 }
4887 }
4888 break;
4889
4890 default:
4891 #ifdef ONIG_DEBUG
4892 fprintf(stderr, "optimize_node_left: undefined node type %d\n",
4893 NTYPE(node));
4894 #endif
4895 r = ONIGERR_TYPE_BUG;
4896 break;
4897 }
4898
4899 return r;
4900 }
4901
4902 static int
set_optimize_exact_info(regex_t * reg,OptExactInfo * e)4903 set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
4904 {
4905 int r;
4906
4907 if (e->len == 0) return 0;
4908
4909 if (e->ignore_case) {
4910 reg->exact = (UChar* )xmalloc(e->len);
4911 CHECK_NULL_RETURN_MEMERR(reg->exact);
4912 xmemcpy(reg->exact, e->s, e->len);
4913 reg->exact_end = reg->exact + e->len;
4914 reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
4915 }
4916 else {
4917 int allow_reverse;
4918
4919 reg->exact = str_dup(e->s, e->s + e->len);
4920 CHECK_NULL_RETURN_MEMERR(reg->exact);
4921 reg->exact_end = reg->exact + e->len;
4922
4923 allow_reverse =
4924 ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
4925
4926 if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
4927 r = set_bm_skip(reg->exact, reg->exact_end, reg->enc,
4928 reg->map, &(reg->int_map));
4929 if (r) return r;
4930
4931 reg->optimize = (allow_reverse != 0
4932 ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
4933 }
4934 else {
4935 reg->optimize = ONIG_OPTIMIZE_EXACT;
4936 }
4937 }
4938
4939 reg->dmin = e->mmd.min;
4940 reg->dmax = e->mmd.max;
4941
4942 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4943 reg->threshold_len = reg->dmin + (int)(reg->exact_end - reg->exact);
4944 }
4945
4946 return 0;
4947 }
4948
4949 static void
set_optimize_map_info(regex_t * reg,OptMapInfo * m)4950 set_optimize_map_info(regex_t* reg, OptMapInfo* m)
4951 {
4952 int i;
4953
4954 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
4955 reg->map[i] = m->map[i];
4956
4957 reg->optimize = ONIG_OPTIMIZE_MAP;
4958 reg->dmin = m->mmd.min;
4959 reg->dmax = m->mmd.max;
4960
4961 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
4962 reg->threshold_len = reg->dmin + 1;
4963 }
4964 }
4965
4966 static void
set_sub_anchor(regex_t * reg,OptAncInfo * anc)4967 set_sub_anchor(regex_t* reg, OptAncInfo* anc)
4968 {
4969 reg->sub_anchor |= anc->left_anchor & ANCHOR_BEGIN_LINE;
4970 reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE;
4971 }
4972
4973 #ifdef ONIG_DEBUG
4974 static void print_optimize_info(FILE* f, regex_t* reg);
4975 #endif
4976
4977 static int
set_optimize_info_from_tree(Node * node,regex_t * reg,ScanEnv * scan_env)4978 set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
4979 {
4980
4981 int r;
4982 NodeOptInfo opt;
4983 OptEnv env;
4984
4985 env.enc = reg->enc;
4986 env.options = reg->options;
4987 env.case_fold_flag = reg->case_fold_flag;
4988 env.scan_env = scan_env;
4989 clear_mml(&env.mmd);
4990
4991 r = optimize_node_left(node, &opt, &env);
4992 if (r) return r;
4993
4994 reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF |
4995 ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML);
4996
4997 reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF);
4998
4999 if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) {
5000 reg->anchor_dmin = opt.len.min;
5001 reg->anchor_dmax = opt.len.max;
5002 }
5003
5004 if (opt.exb.len > 0 || opt.exm.len > 0) {
5005 select_opt_exact_info(reg->enc, &opt.exb, &opt.exm);
5006 if (opt.map.value > 0 &&
5007 comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) {
5008 goto set_map;
5009 }
5010 else {
5011 r = set_optimize_exact_info(reg, &opt.exb);
5012 set_sub_anchor(reg, &opt.exb.anc);
5013 }
5014 }
5015 else if (opt.map.value > 0) {
5016 set_map:
5017 set_optimize_map_info(reg, &opt.map);
5018 set_sub_anchor(reg, &opt.map.anc);
5019 }
5020 else {
5021 reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE;
5022 if (opt.len.max == 0)
5023 reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE;
5024 }
5025
5026 #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5027 print_optimize_info(stderr, reg);
5028 #endif
5029 return r;
5030 }
5031
5032 static void
clear_optimize_info(regex_t * reg)5033 clear_optimize_info(regex_t* reg)
5034 {
5035 reg->optimize = ONIG_OPTIMIZE_NONE;
5036 reg->anchor = 0;
5037 reg->anchor_dmin = 0;
5038 reg->anchor_dmax = 0;
5039 reg->sub_anchor = 0;
5040 reg->exact_end = (UChar* )NULL;
5041 reg->threshold_len = 0;
5042 if (IS_NOT_NULL(reg->exact)) {
5043 xfree(reg->exact);
5044 reg->exact = (UChar* )NULL;
5045 }
5046 }
5047
5048 #ifdef ONIG_DEBUG
5049
print_enc_string(FILE * fp,OnigEncoding enc,const UChar * s,const UChar * end)5050 static void print_enc_string(FILE* fp, OnigEncoding enc,
5051 const UChar *s, const UChar *end)
5052 {
5053 fprintf(fp, "\nPATTERN: /");
5054
5055 if (ONIGENC_MBC_MINLEN(enc) > 1) {
5056 const UChar *p;
5057 OnigCodePoint code;
5058
5059 p = s;
5060 while (p < end) {
5061 code = ONIGENC_MBC_TO_CODE(enc, p, end);
5062 if (code >= 0x80) {
5063 fprintf(fp, " 0x%04x ", (int )code);
5064 }
5065 else {
5066 fputc((int )code, fp);
5067 }
5068
5069 p += enclen(enc, p);
5070 }
5071 }
5072 else {
5073 while (s < end) {
5074 fputc((int )*s, fp);
5075 s++;
5076 }
5077 }
5078
5079 fprintf(fp, "/\n");
5080 }
5081
5082 static void
print_distance_range(FILE * f,OnigDistance a,OnigDistance b)5083 print_distance_range(FILE* f, OnigDistance a, OnigDistance b)
5084 {
5085 if (a == ONIG_INFINITE_DISTANCE)
5086 fputs("inf", f);
5087 else
5088 fprintf(f, "(%u)", a);
5089
5090 fputs("-", f);
5091
5092 if (b == ONIG_INFINITE_DISTANCE)
5093 fputs("inf", f);
5094 else
5095 fprintf(f, "(%u)", b);
5096 }
5097
5098 static void
print_anchor(FILE * f,int anchor)5099 print_anchor(FILE* f, int anchor)
5100 {
5101 int q = 0;
5102
5103 fprintf(f, "[");
5104
5105 if (anchor & ANCHOR_BEGIN_BUF) {
5106 fprintf(f, "begin-buf");
5107 q = 1;
5108 }
5109 if (anchor & ANCHOR_BEGIN_LINE) {
5110 if (q) fprintf(f, ", ");
5111 q = 1;
5112 fprintf(f, "begin-line");
5113 }
5114 if (anchor & ANCHOR_BEGIN_POSITION) {
5115 if (q) fprintf(f, ", ");
5116 q = 1;
5117 fprintf(f, "begin-pos");
5118 }
5119 if (anchor & ANCHOR_END_BUF) {
5120 if (q) fprintf(f, ", ");
5121 q = 1;
5122 fprintf(f, "end-buf");
5123 }
5124 if (anchor & ANCHOR_SEMI_END_BUF) {
5125 if (q) fprintf(f, ", ");
5126 q = 1;
5127 fprintf(f, "semi-end-buf");
5128 }
5129 if (anchor & ANCHOR_END_LINE) {
5130 if (q) fprintf(f, ", ");
5131 q = 1;
5132 fprintf(f, "end-line");
5133 }
5134 if (anchor & ANCHOR_ANYCHAR_STAR) {
5135 if (q) fprintf(f, ", ");
5136 q = 1;
5137 fprintf(f, "anychar-star");
5138 }
5139 if (anchor & ANCHOR_ANYCHAR_STAR_ML) {
5140 if (q) fprintf(f, ", ");
5141 fprintf(f, "anychar-star-pl");
5142 }
5143
5144 fprintf(f, "]");
5145 }
5146
5147 static void
print_optimize_info(FILE * f,regex_t * reg)5148 print_optimize_info(FILE* f, regex_t* reg)
5149 {
5150 static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV",
5151 "EXACT_IC", "MAP" };
5152
5153 fprintf(f, "optimize: %s\n", on[reg->optimize]);
5154 fprintf(f, " anchor: "); print_anchor(f, reg->anchor);
5155 if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0)
5156 print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax);
5157 fprintf(f, "\n");
5158
5159 if (reg->optimize) {
5160 fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor);
5161 fprintf(f, "\n");
5162 }
5163 fprintf(f, "\n");
5164
5165 if (reg->exact) {
5166 UChar *p;
5167 fprintf(f, "exact: [");
5168 for (p = reg->exact; p < reg->exact_end; p++) {
5169 fputc(*p, f);
5170 }
5171 fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact));
5172 }
5173 else if (reg->optimize & ONIG_OPTIMIZE_MAP) {
5174 int c, i, n = 0;
5175
5176 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
5177 if (reg->map[i]) n++;
5178
5179 fprintf(f, "map: n=%d\n", n);
5180 if (n > 0) {
5181 c = 0;
5182 fputc('[', f);
5183 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
5184 if (reg->map[i] != 0) {
5185 if (c > 0) fputs(", ", f);
5186 c++;
5187 if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
5188 ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
5189 fputc(i, f);
5190 else
5191 fprintf(f, "%d", i);
5192 }
5193 }
5194 fprintf(f, "]\n");
5195 }
5196 }
5197 }
5198 #endif /* ONIG_DEBUG */
5199
5200
5201 extern void
onig_free_body(regex_t * reg)5202 onig_free_body(regex_t* reg)
5203 {
5204 if (IS_NOT_NULL(reg)) {
5205 if (IS_NOT_NULL(reg->p)) xfree(reg->p);
5206 if (IS_NOT_NULL(reg->exact)) xfree(reg->exact);
5207 if (IS_NOT_NULL(reg->int_map)) xfree(reg->int_map);
5208 if (IS_NOT_NULL(reg->int_map_backward)) xfree(reg->int_map_backward);
5209 if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range);
5210 if (IS_NOT_NULL(reg->chain)) onig_free(reg->chain);
5211
5212 #ifdef USE_NAMED_GROUP
5213 onig_names_free(reg);
5214 #endif
5215 }
5216 }
5217
5218 extern void
onig_free(regex_t * reg)5219 onig_free(regex_t* reg)
5220 {
5221 if (IS_NOT_NULL(reg)) {
5222 onig_free_body(reg);
5223 xfree(reg);
5224 }
5225 }
5226
5227 #define REGEX_TRANSFER(to,from) do {\
5228 (to)->state = ONIG_STATE_MODIFY;\
5229 onig_free_body(to);\
5230 xmemcpy(to, from, sizeof(regex_t));\
5231 xfree(from);\
5232 } while (0)
5233
5234 extern void
onig_transfer(regex_t * to,regex_t * from)5235 onig_transfer(regex_t* to, regex_t* from)
5236 {
5237 THREAD_ATOMIC_START;
5238 REGEX_TRANSFER(to, from);
5239 THREAD_ATOMIC_END;
5240 }
5241
5242 #define REGEX_CHAIN_HEAD(reg) do {\
5243 while (IS_NOT_NULL((reg)->chain)) {\
5244 (reg) = (reg)->chain;\
5245 }\
5246 } while (0)
5247
5248 extern void
onig_chain_link_add(regex_t * to,regex_t * add)5249 onig_chain_link_add(regex_t* to, regex_t* add)
5250 {
5251 THREAD_ATOMIC_START;
5252 REGEX_CHAIN_HEAD(to);
5253 to->chain = add;
5254 THREAD_ATOMIC_END;
5255 }
5256
5257 extern void
onig_chain_reduce(regex_t * reg)5258 onig_chain_reduce(regex_t* reg)
5259 {
5260 regex_t *head, *prev;
5261
5262 prev = reg;
5263 head = prev->chain;
5264 if (IS_NOT_NULL(head)) {
5265 reg->state = ONIG_STATE_MODIFY;
5266 while (IS_NOT_NULL(head->chain)) {
5267 prev = head;
5268 head = head->chain;
5269 }
5270 prev->chain = (regex_t* )NULL;
5271 REGEX_TRANSFER(reg, head);
5272 }
5273 }
5274
5275 #ifdef ONIG_DEBUG
5276 static void print_compiled_byte_code_list P_((FILE* f, regex_t* reg));
5277 #endif
5278 #ifdef ONIG_DEBUG_PARSE_TREE
5279 static void print_tree P_((FILE* f, Node* node));
5280 #endif
5281
5282 extern int
onig_compile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigErrorInfo * einfo)5283 onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5284 OnigErrorInfo* einfo)
5285 {
5286 #define COMPILE_INIT_SIZE 20
5287
5288 int r, init_size;
5289 Node* root;
5290 ScanEnv scan_env;
5291 #ifdef USE_SUBEXP_CALL
5292 UnsetAddrList uslist;
5293 #endif
5294
5295 if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;
5296
5297 reg->state = ONIG_STATE_COMPILING;
5298
5299 #ifdef ONIG_DEBUG
5300 print_enc_string(stderr, reg->enc, pattern, pattern_end);
5301 #endif
5302
5303 if (reg->alloc == 0) {
5304 init_size = ((int)(pattern_end - pattern)) * 2;
5305 if (init_size <= 0) init_size = COMPILE_INIT_SIZE;
5306 r = BBUF_INIT(reg, init_size);
5307 if (r != 0) goto end;
5308 }
5309 else
5310 reg->used = 0;
5311
5312 reg->num_mem = 0;
5313 reg->num_repeat = 0;
5314 reg->num_null_check = 0;
5315 reg->repeat_range_alloc = 0;
5316 reg->repeat_range = (OnigRepeatRange* )NULL;
5317 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5318 reg->num_comb_exp_check = 0;
5319 #endif
5320
5321 r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env);
5322 if (r != 0 || root == NULL) goto err;
5323
5324 #ifdef USE_NAMED_GROUP
5325 /* mixed use named group and no-named group */
5326 if (scan_env.num_named > 0 &&
5327 IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
5328 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
5329 if (scan_env.num_named != scan_env.num_mem)
5330 r = disable_noname_group_capture(&root, reg, &scan_env);
5331 else
5332 r = numbered_ref_check(root);
5333
5334 if (r != 0) goto err;
5335 }
5336 #endif
5337
5338 #ifdef USE_SUBEXP_CALL
5339 if (scan_env.num_call > 0) {
5340 r = unset_addr_list_init(&uslist, scan_env.num_call);
5341 if (r != 0) goto err;
5342 scan_env.unset_addr_list = &uslist;
5343 r = setup_subexp_call(root, &scan_env);
5344 if (r != 0) goto err_unset;
5345 r = subexp_recursive_check_trav(root, &scan_env);
5346 if (r < 0) goto err_unset;
5347 r = subexp_inf_recursive_check_trav(root, &scan_env);
5348 if (r != 0) goto err_unset;
5349
5350 reg->num_call = scan_env.num_call;
5351 }
5352 else
5353 reg->num_call = 0;
5354 #endif
5355
5356 r = setup_tree(root, reg, 0, &scan_env);
5357 if (r != 0) goto err_unset;
5358
5359 #ifdef ONIG_DEBUG_PARSE_TREE
5360 print_tree(stderr, root);
5361 #endif
5362
5363 reg->capture_history = scan_env.capture_history;
5364 reg->bt_mem_start = scan_env.bt_mem_start;
5365 reg->bt_mem_start |= reg->capture_history;
5366 if (IS_FIND_CONDITION(reg->options))
5367 BIT_STATUS_ON_ALL(reg->bt_mem_end);
5368 else {
5369 reg->bt_mem_end = scan_env.bt_mem_end;
5370 reg->bt_mem_end |= reg->capture_history;
5371 }
5372
5373 #ifdef USE_COMBINATION_EXPLOSION_CHECK
5374 if (scan_env.backrefed_mem == 0
5375 #ifdef USE_SUBEXP_CALL
5376 || scan_env.num_call == 0
5377 #endif
5378 ) {
5379 setup_comb_exp_check(root, 0, &scan_env);
5380 #ifdef USE_SUBEXP_CALL
5381 if (scan_env.has_recursion != 0) {
5382 scan_env.num_comb_exp_check = 0;
5383 }
5384 else
5385 #endif
5386 if (scan_env.comb_exp_max_regnum > 0) {
5387 int i;
5388 for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) {
5389 if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) {
5390 scan_env.num_comb_exp_check = 0;
5391 break;
5392 }
5393 }
5394 }
5395 }
5396
5397 reg->num_comb_exp_check = scan_env.num_comb_exp_check;
5398 #endif
5399
5400 clear_optimize_info(reg);
5401 #ifndef ONIG_DONT_OPTIMIZE
5402 r = set_optimize_info_from_tree(root, reg, &scan_env);
5403 if (r != 0) goto err_unset;
5404 #endif
5405
5406 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) {
5407 xfree(scan_env.mem_nodes_dynamic);
5408 scan_env.mem_nodes_dynamic = (Node** )NULL;
5409 }
5410
5411 r = compile_tree(root, reg);
5412 if (r == 0) {
5413 r = add_opcode(reg, OP_END);
5414 #ifdef USE_SUBEXP_CALL
5415 if (scan_env.num_call > 0) {
5416 r = unset_addr_list_fix(&uslist, reg);
5417 unset_addr_list_end(&uslist);
5418 if (r) goto err;
5419 }
5420 #endif
5421
5422 if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0))
5423 reg->stack_pop_level = STACK_POP_LEVEL_ALL;
5424 else {
5425 if (reg->bt_mem_start != 0)
5426 reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
5427 else
5428 reg->stack_pop_level = STACK_POP_LEVEL_FREE;
5429 }
5430 }
5431 #ifdef USE_SUBEXP_CALL
5432 else if (scan_env.num_call > 0) {
5433 unset_addr_list_end(&uslist);
5434 }
5435 #endif
5436 onig_node_free(root);
5437
5438 #ifdef ONIG_DEBUG_COMPILE
5439 #ifdef USE_NAMED_GROUP
5440 onig_print_names(stderr, reg);
5441 #endif
5442 print_compiled_byte_code_list(stderr, reg);
5443 #endif
5444
5445 end:
5446 reg->state = ONIG_STATE_NORMAL;
5447 return r;
5448
5449 err_unset:
5450 #ifdef USE_SUBEXP_CALL
5451 if (scan_env.num_call > 0) {
5452 unset_addr_list_end(&uslist);
5453 }
5454 #endif
5455 err:
5456 if (IS_NOT_NULL(scan_env.error)) {
5457 if (IS_NOT_NULL(einfo)) {
5458 einfo->enc = scan_env.enc;
5459 einfo->par = scan_env.error;
5460 einfo->par_end = scan_env.error_end;
5461 }
5462 }
5463
5464 onig_node_free(root);
5465 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic))
5466 xfree(scan_env.mem_nodes_dynamic);
5467 return r;
5468 }
5469
5470 #ifdef USE_RECOMPILE_API
5471 extern int
onig_recompile(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5472 onig_recompile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5473 OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5474 OnigErrorInfo* einfo)
5475 {
5476 int r;
5477 regex_t *new_reg;
5478
5479 r = onig_new(&new_reg, pattern, pattern_end, option, enc, syntax, einfo);
5480 if (r) return r;
5481 if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) {
5482 onig_transfer(reg, new_reg);
5483 }
5484 else {
5485 onig_chain_link_add(reg, new_reg);
5486 }
5487 return 0;
5488 }
5489 #endif
5490
5491 static int onig_inited = 0;
5492
5493 extern int
onig_reg_init(regex_t * reg,OnigOptionType option,OnigCaseFoldType case_fold_flag,OnigEncoding enc,OnigSyntaxType * syntax)5494 onig_reg_init(regex_t* reg, OnigOptionType option,
5495 OnigCaseFoldType case_fold_flag,
5496 OnigEncoding enc, OnigSyntaxType* syntax)
5497 {
5498 if (! onig_inited)
5499 onig_init();
5500
5501 if (IS_NULL(reg))
5502 return ONIGERR_INVALID_ARGUMENT;
5503
5504 if (ONIGENC_IS_UNDEF(enc))
5505 return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED;
5506
5507 if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
5508 == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
5509 return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
5510 }
5511
5512 (reg)->state = ONIG_STATE_MODIFY;
5513
5514 if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
5515 option |= syntax->options;
5516 option &= ~ONIG_OPTION_SINGLELINE;
5517 }
5518 else
5519 option |= syntax->options;
5520
5521 (reg)->enc = enc;
5522 (reg)->options = option;
5523 (reg)->syntax = syntax;
5524 (reg)->optimize = 0;
5525 (reg)->exact = (UChar* )NULL;
5526 (reg)->int_map = (int* )NULL;
5527 (reg)->int_map_backward = (int* )NULL;
5528 (reg)->chain = (regex_t* )NULL;
5529
5530 (reg)->p = (UChar* )NULL;
5531 (reg)->alloc = 0;
5532 (reg)->used = 0;
5533 (reg)->name_table = (void* )NULL;
5534
5535 (reg)->case_fold_flag = case_fold_flag;
5536 return 0;
5537 }
5538
5539 extern int
onig_new_without_alloc(regex_t * reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5540 onig_new_without_alloc(regex_t* reg, const UChar* pattern,
5541 const UChar* pattern_end, OnigOptionType option, OnigEncoding enc,
5542 OnigSyntaxType* syntax, OnigErrorInfo* einfo)
5543 {
5544 int r;
5545
5546 r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5547 if (r) return r;
5548
5549 r = onig_compile(reg, pattern, pattern_end, einfo);
5550 return r;
5551 }
5552
5553 extern int
onig_new(regex_t ** reg,const UChar * pattern,const UChar * pattern_end,OnigOptionType option,OnigEncoding enc,OnigSyntaxType * syntax,OnigErrorInfo * einfo)5554 onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
5555 OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
5556 OnigErrorInfo* einfo)
5557 {
5558 int r;
5559
5560 *reg = (regex_t* )xmalloc(sizeof(regex_t));
5561 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
5562
5563 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5564 if (r) goto err;
5565
5566 r = onig_compile(*reg, pattern, pattern_end, einfo);
5567 if (r) {
5568 err:
5569 onig_free(*reg);
5570 *reg = NULL;
5571 }
5572 return r;
5573 }
5574
5575
5576 extern int
onig_init(void)5577 onig_init(void)
5578 {
5579 if (onig_inited != 0)
5580 return 0;
5581
5582 THREAD_SYSTEM_INIT;
5583 THREAD_ATOMIC_START;
5584
5585 onig_inited = 1;
5586
5587 onigenc_init();
5588 /* onigenc_set_default_caseconv_table((UChar* )0); */
5589
5590 #ifdef ONIG_DEBUG_STATISTICS
5591 onig_statistics_init();
5592 #endif
5593
5594 THREAD_ATOMIC_END;
5595 return 0;
5596 }
5597
5598
5599 static OnigEndCallListItemType* EndCallTop;
5600
onig_add_end_call(void (* func)(void))5601 extern void onig_add_end_call(void (*func)(void))
5602 {
5603 OnigEndCallListItemType* item;
5604
5605 item = (OnigEndCallListItemType* )xmalloc(sizeof(*item));
5606 if (item == 0) return ;
5607
5608 item->next = EndCallTop;
5609 item->func = func;
5610
5611 EndCallTop = item;
5612 }
5613
5614 static void
exec_end_call_list(void)5615 exec_end_call_list(void)
5616 {
5617 OnigEndCallListItemType* prev;
5618 void (*func)(void);
5619
5620 while (EndCallTop != 0) {
5621 func = EndCallTop->func;
5622 (*func)();
5623
5624 prev = EndCallTop;
5625 EndCallTop = EndCallTop->next;
5626 xfree(prev);
5627 }
5628 }
5629
5630 extern int
onig_end(void)5631 onig_end(void)
5632 {
5633 THREAD_ATOMIC_START;
5634
5635 exec_end_call_list();
5636
5637 #ifdef ONIG_DEBUG_STATISTICS
5638 onig_print_statistics(stderr);
5639 #endif
5640
5641 #ifdef USE_SHARED_CCLASS_TABLE
5642 onig_free_shared_cclass_table();
5643 #endif
5644
5645 #ifdef USE_PARSE_TREE_NODE_RECYCLE
5646 onig_free_node_list();
5647 #endif
5648
5649 onig_inited = 0;
5650
5651 THREAD_ATOMIC_END;
5652 THREAD_SYSTEM_END;
5653 return 0;
5654 }
5655
5656 extern int
onig_is_in_code_range(const UChar * p,OnigCodePoint code)5657 onig_is_in_code_range(const UChar* p, OnigCodePoint code)
5658 {
5659 OnigCodePoint n, *data;
5660 OnigCodePoint low, high, x;
5661
5662 GET_CODE_POINT(n, p);
5663 data = (OnigCodePoint* )p;
5664 data++;
5665
5666 for (low = 0, high = n; low < high; ) {
5667 x = (low + high) >> 1;
5668 if (code > data[x * 2 + 1])
5669 low = x + 1;
5670 else
5671 high = x;
5672 }
5673
5674 return ((low < n && code >= data[low * 2]) ? 1 : 0);
5675 }
5676
5677 extern int
onig_is_code_in_cc_len(int elen,OnigCodePoint code,CClassNode * cc)5678 onig_is_code_in_cc_len(int elen, OnigCodePoint code, CClassNode* cc)
5679 {
5680 int found;
5681
5682 if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
5683 if (IS_NULL(cc->mbuf)) {
5684 found = 0;
5685 }
5686 else {
5687 found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0);
5688 }
5689 }
5690 else {
5691 found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1);
5692 }
5693
5694 if (IS_NCCLASS_NOT(cc))
5695 return !found;
5696 else
5697 return found;
5698 }
5699
5700 extern int
onig_is_code_in_cc(OnigEncoding enc,OnigCodePoint code,CClassNode * cc)5701 onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
5702 {
5703 int len;
5704
5705 if (ONIGENC_MBC_MINLEN(enc) > 1) {
5706 len = 2;
5707 }
5708 else {
5709 len = ONIGENC_CODE_TO_MBCLEN(enc, code);
5710 }
5711 return onig_is_code_in_cc_len(len, code, cc);
5712 }
5713
5714
5715 #ifdef ONIG_DEBUG
5716
5717 /* arguments type */
5718 #define ARG_SPECIAL -1
5719 #define ARG_NON 0
5720 #define ARG_RELADDR 1
5721 #define ARG_ABSADDR 2
5722 #define ARG_LENGTH 3
5723 #define ARG_MEMNUM 4
5724 #define ARG_OPTION 5
5725 #define ARG_STATE_CHECK 6
5726
5727 OnigOpInfoType OnigOpInfo[] = {
5728 { OP_FINISH, "finish", ARG_NON },
5729 { OP_END, "end", ARG_NON },
5730 { OP_EXACT1, "exact1", ARG_SPECIAL },
5731 { OP_EXACT2, "exact2", ARG_SPECIAL },
5732 { OP_EXACT3, "exact3", ARG_SPECIAL },
5733 { OP_EXACT4, "exact4", ARG_SPECIAL },
5734 { OP_EXACT5, "exact5", ARG_SPECIAL },
5735 { OP_EXACTN, "exactn", ARG_SPECIAL },
5736 { OP_EXACTMB2N1, "exactmb2-n1", ARG_SPECIAL },
5737 { OP_EXACTMB2N2, "exactmb2-n2", ARG_SPECIAL },
5738 { OP_EXACTMB2N3, "exactmb2-n3", ARG_SPECIAL },
5739 { OP_EXACTMB2N, "exactmb2-n", ARG_SPECIAL },
5740 { OP_EXACTMB3N, "exactmb3n" , ARG_SPECIAL },
5741 { OP_EXACTMBN, "exactmbn", ARG_SPECIAL },
5742 { OP_EXACT1_IC, "exact1-ic", ARG_SPECIAL },
5743 { OP_EXACTN_IC, "exactn-ic", ARG_SPECIAL },
5744 { OP_CCLASS, "cclass", ARG_SPECIAL },
5745 { OP_CCLASS_MB, "cclass-mb", ARG_SPECIAL },
5746 { OP_CCLASS_MIX, "cclass-mix", ARG_SPECIAL },
5747 { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL },
5748 { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL },
5749 { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL },
5750 { OP_CCLASS_NODE, "cclass-node", ARG_SPECIAL },
5751 { OP_ANYCHAR, "anychar", ARG_NON },
5752 { OP_ANYCHAR_ML, "anychar-ml", ARG_NON },
5753 { OP_ANYCHAR_STAR, "anychar*", ARG_NON },
5754 { OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON },
5755 { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL },
5756 { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL },
5757 { OP_WORD, "word", ARG_NON },
5758 { OP_NOT_WORD, "not-word", ARG_NON },
5759 { OP_WORD_BOUND, "word-bound", ARG_NON },
5760 { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON },
5761 { OP_WORD_BEGIN, "word-begin", ARG_NON },
5762 { OP_WORD_END, "word-end", ARG_NON },
5763 { OP_BEGIN_BUF, "begin-buf", ARG_NON },
5764 { OP_END_BUF, "end-buf", ARG_NON },
5765 { OP_BEGIN_LINE, "begin-line", ARG_NON },
5766 { OP_END_LINE, "end-line", ARG_NON },
5767 { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON },
5768 { OP_BEGIN_POSITION, "begin-position", ARG_NON },
5769 { OP_BACKREF1, "backref1", ARG_NON },
5770 { OP_BACKREF2, "backref2", ARG_NON },
5771 { OP_BACKREFN, "backrefn", ARG_MEMNUM },
5772 { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL },
5773 { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL },
5774 { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL },
5775 { OP_BACKREF_WITH_LEVEL, "backref_at_level", ARG_SPECIAL },
5776 { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM },
5777 { OP_MEMORY_START, "mem-start", ARG_MEMNUM },
5778 { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM },
5779 { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM },
5780 { OP_MEMORY_END, "mem-end", ARG_MEMNUM },
5781 { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM },
5782 { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION },
5783 { OP_SET_OPTION, "set-option", ARG_OPTION },
5784 { OP_FAIL, "fail", ARG_NON },
5785 { OP_JUMP, "jump", ARG_RELADDR },
5786 { OP_PUSH, "push", ARG_RELADDR },
5787 { OP_POP, "pop", ARG_NON },
5788 { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL },
5789 { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL },
5790 { OP_REPEAT, "repeat", ARG_SPECIAL },
5791 { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL },
5792 { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM },
5793 { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM },
5794 { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM },
5795 { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM },
5796 { OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM },
5797 { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM },
5798 { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM },
5799 { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM },
5800 { OP_PUSH_POS, "push-pos", ARG_NON },
5801 { OP_POP_POS, "pop-pos", ARG_NON },
5802 { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR },
5803 { OP_FAIL_POS, "fail-pos", ARG_NON },
5804 { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON },
5805 { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON },
5806 { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL },
5807 { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL },
5808 { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON },
5809 { OP_CALL, "call", ARG_ABSADDR },
5810 { OP_RETURN, "return", ARG_NON },
5811 { OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL },
5812 { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL },
5813 { OP_STATE_CHECK, "state-check", ARG_STATE_CHECK },
5814 { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*", ARG_STATE_CHECK },
5815 { OP_STATE_CHECK_ANYCHAR_ML_STAR,
5816 "state-check-anychar-ml*", ARG_STATE_CHECK },
5817 { -1, "", ARG_NON }
5818 };
5819
5820 static char*
op2name(int opcode)5821 op2name(int opcode)
5822 {
5823 int i;
5824
5825 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5826 if (opcode == OnigOpInfo[i].opcode)
5827 return OnigOpInfo[i].name;
5828 }
5829 return "";
5830 }
5831
5832 static int
op2arg_type(int opcode)5833 op2arg_type(int opcode)
5834 {
5835 int i;
5836
5837 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
5838 if (opcode == OnigOpInfo[i].opcode)
5839 return OnigOpInfo[i].arg_type;
5840 }
5841 return ARG_SPECIAL;
5842 }
5843
5844 static void
Indent(FILE * f,int indent)5845 Indent(FILE* f, int indent)
5846 {
5847 int i;
5848 for (i = 0; i < indent; i++) putc(' ', f);
5849 }
5850
5851 static void
p_string(FILE * f,int len,UChar * s)5852 p_string(FILE* f, int len, UChar* s)
5853 {
5854 fputs(":", f);
5855 while (len-- > 0) { fputc(*s++, f); }
5856 }
5857
5858 static void
p_len_string(FILE * f,LengthType len,int mb_len,UChar * s)5859 p_len_string(FILE* f, LengthType len, int mb_len, UChar* s)
5860 {
5861 int x = len * mb_len;
5862
5863 fprintf(f, ":%d:", len);
5864 while (x-- > 0) { fputc(*s++, f); }
5865 }
5866
5867 extern void
onig_print_compiled_byte_code(FILE * f,UChar * bp,UChar ** nextp,OnigEncoding enc)5868 onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp,
5869 OnigEncoding enc)
5870 {
5871 int i, n, arg_type;
5872 RelAddrType addr;
5873 LengthType len;
5874 MemNumType mem;
5875 StateCheckNumType scn;
5876 OnigCodePoint code;
5877 UChar *q;
5878
5879 fprintf(f, "[%s", op2name(*bp));
5880 arg_type = op2arg_type(*bp);
5881 if (arg_type != ARG_SPECIAL) {
5882 bp++;
5883 switch (arg_type) {
5884 case ARG_NON:
5885 break;
5886 case ARG_RELADDR:
5887 GET_RELADDR_INC(addr, bp);
5888 fprintf(f, ":(%d)", addr);
5889 break;
5890 case ARG_ABSADDR:
5891 GET_ABSADDR_INC(addr, bp);
5892 fprintf(f, ":(%d)", addr);
5893 break;
5894 case ARG_LENGTH:
5895 GET_LENGTH_INC(len, bp);
5896 fprintf(f, ":%d", len);
5897 break;
5898 case ARG_MEMNUM:
5899 mem = *((MemNumType* )bp);
5900 bp += SIZE_MEMNUM;
5901 fprintf(f, ":%d", mem);
5902 break;
5903 case ARG_OPTION:
5904 {
5905 OnigOptionType option = *((OnigOptionType* )bp);
5906 bp += SIZE_OPTION;
5907 fprintf(f, ":%d", option);
5908 }
5909 break;
5910
5911 case ARG_STATE_CHECK:
5912 scn = *((StateCheckNumType* )bp);
5913 bp += SIZE_STATE_CHECK_NUM;
5914 fprintf(f, ":%d", scn);
5915 break;
5916 }
5917 }
5918 else {
5919 switch (*bp++) {
5920 case OP_EXACT1:
5921 case OP_ANYCHAR_STAR_PEEK_NEXT:
5922 case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
5923 p_string(f, 1, bp++); break;
5924 case OP_EXACT2:
5925 p_string(f, 2, bp); bp += 2; break;
5926 case OP_EXACT3:
5927 p_string(f, 3, bp); bp += 3; break;
5928 case OP_EXACT4:
5929 p_string(f, 4, bp); bp += 4; break;
5930 case OP_EXACT5:
5931 p_string(f, 5, bp); bp += 5; break;
5932 case OP_EXACTN:
5933 GET_LENGTH_INC(len, bp);
5934 p_len_string(f, len, 1, bp);
5935 bp += len;
5936 break;
5937
5938 case OP_EXACTMB2N1:
5939 p_string(f, 2, bp); bp += 2; break;
5940 case OP_EXACTMB2N2:
5941 p_string(f, 4, bp); bp += 4; break;
5942 case OP_EXACTMB2N3:
5943 p_string(f, 6, bp); bp += 6; break;
5944 case OP_EXACTMB2N:
5945 GET_LENGTH_INC(len, bp);
5946 p_len_string(f, len, 2, bp);
5947 bp += len * 2;
5948 break;
5949 case OP_EXACTMB3N:
5950 GET_LENGTH_INC(len, bp);
5951 p_len_string(f, len, 3, bp);
5952 bp += len * 3;
5953 break;
5954 case OP_EXACTMBN:
5955 {
5956 int mb_len;
5957
5958 GET_LENGTH_INC(mb_len, bp);
5959 GET_LENGTH_INC(len, bp);
5960 fprintf(f, ":%d:%d:", mb_len, len);
5961 n = len * mb_len;
5962 while (n-- > 0) { fputc(*bp++, f); }
5963 }
5964 break;
5965
5966 case OP_EXACT1_IC:
5967 len = enclen(enc, bp);
5968 p_string(f, len, bp);
5969 bp += len;
5970 break;
5971 case OP_EXACTN_IC:
5972 GET_LENGTH_INC(len, bp);
5973 p_len_string(f, len, 1, bp);
5974 bp += len;
5975 break;
5976
5977 case OP_CCLASS:
5978 n = bitset_on_num((BitSetRef )bp);
5979 bp += SIZE_BITSET;
5980 fprintf(f, ":%d", n);
5981 break;
5982
5983 case OP_CCLASS_NOT:
5984 n = bitset_on_num((BitSetRef )bp);
5985 bp += SIZE_BITSET;
5986 fprintf(f, ":%d", n);
5987 break;
5988
5989 case OP_CCLASS_MB:
5990 case OP_CCLASS_MB_NOT:
5991 GET_LENGTH_INC(len, bp);
5992 q = bp;
5993 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
5994 ALIGNMENT_RIGHT(q);
5995 #endif
5996 GET_CODE_POINT(code, q);
5997 bp += len;
5998 fprintf(f, ":%d:%d", (int )code, len);
5999 break;
6000
6001 case OP_CCLASS_MIX:
6002 case OP_CCLASS_MIX_NOT:
6003 n = bitset_on_num((BitSetRef )bp);
6004 bp += SIZE_BITSET;
6005 GET_LENGTH_INC(len, bp);
6006 q = bp;
6007 #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
6008 ALIGNMENT_RIGHT(q);
6009 #endif
6010 GET_CODE_POINT(code, q);
6011 bp += len;
6012 fprintf(f, ":%d:%d:%d", n, (int )code, len);
6013 break;
6014
6015 case OP_CCLASS_NODE:
6016 {
6017 CClassNode *cc;
6018
6019 GET_POINTER_INC(cc, bp);
6020 n = bitset_on_num(cc->bs);
6021 fprintf(f, ":%u:%d", (unsigned int )cc, n);
6022 }
6023 break;
6024
6025 case OP_BACKREFN_IC:
6026 mem = *((MemNumType* )bp);
6027 bp += SIZE_MEMNUM;
6028 fprintf(f, ":%d", mem);
6029 break;
6030
6031 case OP_BACKREF_MULTI_IC:
6032 case OP_BACKREF_MULTI:
6033 fputs(" ", f);
6034 GET_LENGTH_INC(len, bp);
6035 for (i = 0; i < len; i++) {
6036 GET_MEMNUM_INC(mem, bp);
6037 if (i > 0) fputs(", ", f);
6038 fprintf(f, "%d", mem);
6039 }
6040 break;
6041
6042 case OP_BACKREF_WITH_LEVEL:
6043 {
6044 OnigOptionType option;
6045 LengthType level;
6046
6047 GET_OPTION_INC(option, bp);
6048 fprintf(f, ":%d", option);
6049 GET_LENGTH_INC(level, bp);
6050 fprintf(f, ":%d", level);
6051
6052 fputs(" ", f);
6053 GET_LENGTH_INC(len, bp);
6054 for (i = 0; i < len; i++) {
6055 GET_MEMNUM_INC(mem, bp);
6056 if (i > 0) fputs(", ", f);
6057 fprintf(f, "%d", mem);
6058 }
6059 }
6060 break;
6061
6062 case OP_REPEAT:
6063 case OP_REPEAT_NG:
6064 {
6065 mem = *((MemNumType* )bp);
6066 bp += SIZE_MEMNUM;
6067 addr = *((RelAddrType* )bp);
6068 bp += SIZE_RELADDR;
6069 fprintf(f, ":%d:%d", mem, addr);
6070 }
6071 break;
6072
6073 case OP_PUSH_OR_JUMP_EXACT1:
6074 case OP_PUSH_IF_PEEK_NEXT:
6075 addr = *((RelAddrType* )bp);
6076 bp += SIZE_RELADDR;
6077 fprintf(f, ":(%d)", addr);
6078 p_string(f, 1, bp);
6079 bp += 1;
6080 break;
6081
6082 case OP_LOOK_BEHIND:
6083 GET_LENGTH_INC(len, bp);
6084 fprintf(f, ":%d", len);
6085 break;
6086
6087 case OP_PUSH_LOOK_BEHIND_NOT:
6088 GET_RELADDR_INC(addr, bp);
6089 GET_LENGTH_INC(len, bp);
6090 fprintf(f, ":%d:(%d)", len, addr);
6091 break;
6092
6093 case OP_STATE_CHECK_PUSH:
6094 case OP_STATE_CHECK_PUSH_OR_JUMP:
6095 scn = *((StateCheckNumType* )bp);
6096 bp += SIZE_STATE_CHECK_NUM;
6097 addr = *((RelAddrType* )bp);
6098 bp += SIZE_RELADDR;
6099 fprintf(f, ":%d:(%d)", scn, addr);
6100 break;
6101
6102 default:
6103 fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n",
6104 *--bp);
6105 }
6106 }
6107 fputs("]", f);
6108 if (nextp) *nextp = bp;
6109 }
6110
6111 static void
print_compiled_byte_code_list(FILE * f,regex_t * reg)6112 print_compiled_byte_code_list(FILE* f, regex_t* reg)
6113 {
6114 int ncode;
6115 UChar* bp = reg->p;
6116 UChar* end = reg->p + reg->used;
6117
6118 fprintf(f, "code length: %d\n", reg->used);
6119
6120 ncode = 0;
6121 while (bp < end) {
6122 ncode++;
6123 if (bp > reg->p) {
6124 if (ncode % 5 == 0)
6125 fprintf(f, "\n");
6126 else
6127 fputs(" ", f);
6128 }
6129 onig_print_compiled_byte_code(f, bp, &bp, reg->enc);
6130 }
6131
6132 fprintf(f, "\n");
6133 }
6134
6135 static void
print_indent_tree(FILE * f,Node * node,int indent)6136 print_indent_tree(FILE* f, Node* node, int indent)
6137 {
6138 int i, type;
6139 int add = 3;
6140 UChar* p;
6141
6142 Indent(f, indent);
6143 if (IS_NULL(node)) {
6144 fprintf(f, "ERROR: null node!!!\n");
6145 exit (0);
6146 }
6147
6148 type = NTYPE(node);
6149 switch (type) {
6150 case NT_LIST:
6151 case NT_ALT:
6152 if (NTYPE(node) == NT_LIST)
6153 fprintf(f, "<list:%x>\n", (int )node);
6154 else
6155 fprintf(f, "<alt:%x>\n", (int )node);
6156
6157 print_indent_tree(f, NCAR(node), indent + add);
6158 while (IS_NOT_NULL(node = NCDR(node))) {
6159 if (NTYPE(node) != type) {
6160 fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node));
6161 exit(0);
6162 }
6163 print_indent_tree(f, NCAR(node), indent + add);
6164 }
6165 break;
6166
6167 case NT_STR:
6168 fprintf(f, "<string%s:%x>",
6169 (NSTRING_IS_RAW(node) ? "-raw" : ""), (int )node);
6170 for (p = NSTR(node)->s; p < NSTR(node)->end; p++) {
6171 if (*p >= 0x20 && *p < 0x7f)
6172 fputc(*p, f);
6173 else {
6174 fprintf(f, " 0x%02x", *p);
6175 }
6176 }
6177 break;
6178
6179 case NT_CCLASS:
6180 fprintf(f, "<cclass:%x>", (int )node);
6181 if (IS_NCCLASS_NOT(NCCLASS(node))) fputs(" not", f);
6182 if (NCCLASS(node)->mbuf) {
6183 BBuf* bbuf = NCCLASS(node)->mbuf;
6184 for (i = 0; i < bbuf->used; i++) {
6185 if (i > 0) fprintf(f, ",");
6186 fprintf(f, "%0x", bbuf->p[i]);
6187 }
6188 }
6189 break;
6190
6191 case NT_CTYPE:
6192 fprintf(f, "<ctype:%x> ", (int )node);
6193 switch (NCTYPE(node)->ctype) {
6194 case ONIGENC_CTYPE_WORD:
6195 if (NCTYPE(node)->not != 0)
6196 fputs("not word", f);
6197 else
6198 fputs("word", f);
6199 break;
6200
6201 default:
6202 fprintf(f, "ERROR: undefined ctype.\n");
6203 exit(0);
6204 }
6205 break;
6206
6207 case NT_CANY:
6208 fprintf(f, "<anychar:%x>", (int )node);
6209 break;
6210
6211 case NT_ANCHOR:
6212 fprintf(f, "<anchor:%x> ", (int )node);
6213 switch (NANCHOR(node)->type) {
6214 case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break;
6215 case ANCHOR_END_BUF: fputs("end buf", f); break;
6216 case ANCHOR_BEGIN_LINE: fputs("begin line", f); break;
6217 case ANCHOR_END_LINE: fputs("end line", f); break;
6218 case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break;
6219 case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break;
6220
6221 case ANCHOR_WORD_BOUND: fputs("word bound", f); break;
6222 case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break;
6223 #ifdef USE_WORD_BEGIN_END
6224 case ANCHOR_WORD_BEGIN: fputs("word begin", f); break;
6225 case ANCHOR_WORD_END: fputs("word end", f); break;
6226 #endif
6227 case ANCHOR_PREC_READ: fputs("prec read", f); break;
6228 case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); break;
6229 case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); break;
6230 case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); break;
6231
6232 default:
6233 fprintf(f, "ERROR: undefined anchor type.\n");
6234 break;
6235 }
6236 break;
6237
6238 case NT_BREF:
6239 {
6240 int* p;
6241 BRefNode* br = NBREF(node);
6242 p = BACKREFS_P(br);
6243 fprintf(f, "<backref:%x>", (int )node);
6244 for (i = 0; i < br->back_num; i++) {
6245 if (i > 0) fputs(", ", f);
6246 fprintf(f, "%d", p[i]);
6247 }
6248 }
6249 break;
6250
6251 #ifdef USE_SUBEXP_CALL
6252 case NT_CALL:
6253 {
6254 CallNode* cn = NCALL(node);
6255 fprintf(f, "<call:%x>", (int )node);
6256 p_string(f, cn->name_end - cn->name, cn->name);
6257 }
6258 break;
6259 #endif
6260
6261 case NT_QTFR:
6262 fprintf(f, "<quantifier:%x>{%d,%d}%s\n", (int )node,
6263 NQTFR(node)->lower, NQTFR(node)->upper,
6264 (NQTFR(node)->greedy ? "" : "?"));
6265 print_indent_tree(f, NQTFR(node)->target, indent + add);
6266 break;
6267
6268 case NT_ENCLOSE:
6269 fprintf(f, "<enclose:%x> ", (int )node);
6270 switch (NENCLOSE(node)->type) {
6271 case ENCLOSE_OPTION:
6272 fprintf(f, "option:%d", NENCLOSE(node)->option);
6273 break;
6274 case ENCLOSE_MEMORY:
6275 fprintf(f, "memory:%d", NENCLOSE(node)->regnum);
6276 break;
6277 case ENCLOSE_STOP_BACKTRACK:
6278 fprintf(f, "stop-bt");
6279 break;
6280
6281 default:
6282 break;
6283 }
6284 fprintf(f, "\n");
6285 print_indent_tree(f, NENCLOSE(node)->target, indent + add);
6286 break;
6287
6288 default:
6289 fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node));
6290 break;
6291 }
6292
6293 if (type != NT_LIST && type != NT_ALT && type != NT_QTFR &&
6294 type != NT_ENCLOSE)
6295 fprintf(f, "\n");
6296 fflush(f);
6297 }
6298 #endif /* ONIG_DEBUG */
6299
6300 #ifdef ONIG_DEBUG_PARSE_TREE
6301 static void
print_tree(FILE * f,Node * node)6302 print_tree(FILE* f, Node* node)
6303 {
6304 print_indent_tree(f, node, 0);
6305 }
6306 #endif
6307