1 /**********************************************************************
2 regparse.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
7 *
8 * (C) Copyright 2015 Hewlett Packard Enterprise Development LP<BR>
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 #include "regparse.h"
33 #include "st.h"
34
35 #define WARN_BUFSIZE 256
36
37 #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
38
39
/*
 * Ruby syntax description.
 *
 * The initializer lists, in order:
 *   1. a mask built from SYN_GNU_REGEX_OP plus ONIG_SYN_OP_* flags, with
 *      ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END explicitly removed;
 *   2. a mask of ONIG_SYN_OP2_* flags;
 *   3. a mask built from SYN_GNU_REGEX_BV plus ONIG_SYN_* behaviour flags;
 *   4. ONIG_OPTION_NONE;
 *   5. a table of six code points in which only the escape character
 *      ('\\') is assigned; every other slot is ONIG_INEFFECTIVE_META_CHAR.
 */
OnigSyntaxType OnigSyntaxRuby = {
  (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
     ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
     ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
     ONIG_SYN_OP_ESC_C_CONTROL )
   & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
  , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
      ONIG_SYN_OP2_OPTION_RUBY |
      ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
      ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
      ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY  |
      ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
      ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
      ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
      ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
      ONIG_SYN_OP2_ESC_H_XDIGIT )
  , ( SYN_GNU_REGEX_BV |
      ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
      ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
      ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
      ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
      ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
      ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
      ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
  , ONIG_OPTION_NONE
  ,
  {
      (OnigCodePoint )'\\'                       /* esc */
    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */
    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */
    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
  }
};

/* Default syntax pointer; starts out as ONIG_SYNTAX_RUBY. */
OnigSyntaxType*  OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
77
/* Warning sink that deliberately discards its message. */
extern void
onig_null_warn(const char* s ARG_UNUSED)
{
}
79
/*
 * Current warning callback.  Starts as DEFAULT_WARN_FUNCTION when that
 * macro is defined, otherwise as the no-op onig_null_warn.  Replaced by
 * onig_set_warn_func().
 */
#ifdef DEFAULT_WARN_FUNCTION
static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
#else
static OnigWarnFunc onig_warn = onig_null_warn;
#endif

/*
 * Current verbose-warning callback.  Starts as DEFAULT_VERB_WARN_FUNCTION
 * when that macro is defined, otherwise as onig_null_warn.  Replaced by
 * onig_set_verb_warn_func().
 */
#ifdef DEFAULT_VERB_WARN_FUNCTION
static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
#else
static OnigWarnFunc onig_verb_warn = onig_null_warn;
#endif
91
/*
 * Install the callback stored in onig_warn.
 *
 * f: new callback.  A null pointer is replaced by onig_null_warn so the
 *    stored pointer can always be called (the call sites are outside this
 *    part of the file -- NOTE(review): confirm none of them tests for NULL).
 */
extern void onig_set_warn_func(OnigWarnFunc f)
{
  if (f == (OnigWarnFunc )0)
    f = onig_null_warn;

  onig_warn = f;
}
96
/*
 * Install the callback stored in onig_verb_warn.
 *
 * f: new callback.  A null pointer is replaced by onig_null_warn so the
 *    stored pointer can always be called (the call sites are outside this
 *    part of the file -- NOTE(review): confirm none of them tests for NULL).
 */
extern void onig_set_verb_warn_func(OnigWarnFunc f)
{
  if (f == (OnigWarnFunc )0)
    f = onig_null_warn;

  onig_verb_warn = f;
}
101
102 static void
bbuf_free(BBuf * bbuf)103 bbuf_free(BBuf* bbuf)
104 {
105 if (IS_NOT_NULL(bbuf)) {
106 if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
107 xfree(bbuf);
108 }
109 }
110
111 static int
bbuf_clone(BBuf ** rto,BBuf * from)112 bbuf_clone(BBuf** rto, BBuf* from)
113 {
114 int r;
115 BBuf *to;
116
117 *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
118 CHECK_NULL_RETURN_MEMERR(to);
119 r = BBUF_INIT(to, from->alloc);
120 if (r != 0) return r;
121 to->used = from->used;
122 xmemcpy(to->p, from->p, from->used);
123 return 0;
124 }
125
/* Convert a relative back-reference number into an absolute group number. */
#define BACKREF_REL_TO_ABS(rel_no, env) \
  ((env)->num_mem + 1 + (rel_no))

/* Clear flag bits f in v when `negative` is non-zero, otherwise set them.
   The whole expansion is parenthesized so it stays a single expression
   wherever it is used. */
#define ONOFF(v,f,negative)    ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))

/* First code point of the multi-byte range: 0 when the encoding's minimum
   character length exceeds one byte, otherwise 0x80. */
#define MBCODE_START_POS(enc) \
  (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)

/* Add the range [MBCODE_START_POS(enc), ~0] to the code range buffer. */
#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
  add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))

/* For non-single-byte encodings, add the whole multi-byte range to mbuf.
   Uses the caller's local `r` and returns from the caller on error. */
#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
  if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
    r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
    if (r) return r;\
  }\
} while (0)


/* Set `empty` to 1 when every word of bitset bs is zero, otherwise to 0. */
#define BITSET_IS_EMPTY(bs,empty) do {\
  int i;\
  empty = 1;\
  for (i = 0; i < (int )BITSET_SIZE; i++) {\
    if ((bs)[i] != 0) {\
      empty = 0; break;\
    }\
  }\
} while (0)
154
155 static void
bitset_set_range(BitSetRef bs,int from,int to)156 bitset_set_range(BitSetRef bs, int from, int to)
157 {
158 int i;
159 for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
160 BITSET_SET_BIT(bs, i);
161 }
162 }
163
#if 0
/* Disabled code: sets every word of the bitset to all ones. */
static void
bitset_set_all(BitSetRef bs)
{
  int i;
  for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); }
}
#endif
172
173 static void
bitset_invert(BitSetRef bs)174 bitset_invert(BitSetRef bs)
175 {
176 int i;
177 for (i = 0; i < (int )BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
178 }
179
180 static void
bitset_invert_to(BitSetRef from,BitSetRef to)181 bitset_invert_to(BitSetRef from, BitSetRef to)
182 {
183 int i;
184 for (i = 0; i < (int )BITSET_SIZE; i++) { to[i] = ~(from[i]); }
185 }
186
187 static void
bitset_and(BitSetRef dest,BitSetRef bs)188 bitset_and(BitSetRef dest, BitSetRef bs)
189 {
190 int i;
191 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] &= bs[i]; }
192 }
193
194 static void
bitset_or(BitSetRef dest,BitSetRef bs)195 bitset_or(BitSetRef dest, BitSetRef bs)
196 {
197 int i;
198 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] |= bs[i]; }
199 }
200
201 static void
bitset_copy(BitSetRef dest,BitSetRef bs)202 bitset_copy(BitSetRef dest, BitSetRef bs)
203 {
204 int i;
205 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] = bs[i]; }
206 }
207
208 extern int
onig_strncmp(const UChar * s1,const UChar * s2,int n)209 onig_strncmp(const UChar* s1, const UChar* s2, int n)
210 {
211 int x;
212
213 while (n-- > 0) {
214 x = *s2++ - *s1++;
215 if (x) return x;
216 }
217 return 0;
218 }
219
220 extern void
onig_strcpy(UChar * dest,const UChar * src,const UChar * end)221 onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
222 {
223 int len = (int)(end - src);
224 if (len > 0) {
225 xmemcpy(dest, src, len);
226 dest[len] = (UChar )0;
227 }
228 }
229
230 #ifdef USE_NAMED_GROUP
231 static UChar*
strdup_with_null(OnigEncoding enc,UChar * s,UChar * end)232 strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
233 {
234 int slen, term_len, i;
235 UChar *r;
236
237 slen = (int)(end - s);
238 term_len = ONIGENC_MBC_MINLEN(enc);
239
240 r = (UChar* )xmalloc(slen + term_len);
241 CHECK_NULL_RETURN(r);
242 xmemcpy(r, s, slen);
243
244 for (i = 0; i < term_len; i++)
245 r[slen + i] = (UChar )0;
246
247 return r;
248 }
249 #endif
250
/* scan pattern methods
 *
 * These macros operate on the locals `p`, `end`, `enc` (and `pfetch_prev`,
 * declared by PFETCH_READY) of the function that uses them.
 */
#define PEND_VALUE   0

#define PFETCH_READY  UChar* pfetch_prev
#define PEND         (p < end ?  0 : 1)
#define PUNFETCH     p = pfetch_prev
#define PINC       do { \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)
#define PFETCH(c)  do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  pfetch_prev = p; \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)

/* _S variants advance without recording pfetch_prev. */
#define PINC_S     do { \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)
#define PFETCH_S(c) do { \
  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
  p += ONIGENC_MBC_ENC_LEN(enc, p); \
} while (0)

#define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
/* The argument is parenthesized so the cast applies to the whole expression. */
#define PPEEK_IS(c)  (PPEEK == (OnigCodePoint )(c))
277
278 static UChar*
strcat_capa(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa,int oldCapa)279 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
280 int capa, int oldCapa)
281 {
282 UChar* r;
283
284 if (dest)
285 r = (UChar* )xrealloc(dest, capa + 1, oldCapa);
286 else
287 r = (UChar* )xmalloc(capa + 1);
288
289 CHECK_NULL_RETURN(r);
290 onig_strcpy(r + (dest_end - dest), src, src_end);
291 return r;
292 }
293
294 /* dest on static area */
295 static UChar*
strcat_capa_from_static(UChar * dest,UChar * dest_end,const UChar * src,const UChar * src_end,int capa)296 strcat_capa_from_static(UChar* dest, UChar* dest_end,
297 const UChar* src, const UChar* src_end, int capa)
298 {
299 UChar* r;
300
301 r = (UChar* )xmalloc(capa + 1);
302 CHECK_NULL_RETURN(r);
303 onig_strcpy(r, dest, dest_end);
304 onig_strcpy(r + (dest_end - dest), src, src_end);
305 return r;
306 }
307
308
309 #ifdef USE_ST_LIBRARY
310
/* Hash-table key describing the byte range [s, end) of a string. */
typedef struct {
  UChar* s;    /* first byte of the key */
  UChar* end;  /* one past the last byte of the key */
} st_str_end_key;
315
316 static int
str_end_cmp(st_str_end_key * x,st_str_end_key * y)317 str_end_cmp(st_str_end_key* x, st_str_end_key* y)
318 {
319 UChar *p, *q;
320 int c;
321
322 if ((x->end - x->s) != (y->end - y->s))
323 return 1;
324
325 p = x->s;
326 q = y->s;
327 while (p < x->end) {
328 c = (int )*p - (int )*q;
329 if (c != 0) return c;
330
331 p++; q++;
332 }
333
334 return 0;
335 }
336
337 static int
str_end_hash(st_str_end_key * x)338 str_end_hash(st_str_end_key* x)
339 {
340 UChar *p;
341 int val = 0;
342
343 p = x->s;
344 while (p < x->end) {
345 val = val * 997 + (int )*p++;
346 }
347
348 return val + (val >> 5);
349 }
350
351 extern hash_table_type*
onig_st_init_strend_table_with_size(int size)352 onig_st_init_strend_table_with_size(int size)
353 {
354 static struct st_hash_type hashType = {
355 str_end_cmp,
356 str_end_hash,
357 };
358
359 return (hash_table_type* )
360 onig_st_init_table_with_size(&hashType, size);
361 }
362
363 extern int
onig_st_lookup_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type * value)364 onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
365 const UChar* end_key, hash_data_type *value)
366 {
367 st_str_end_key key;
368
369 key.s = (UChar* )str_key;
370 key.end = (UChar* )end_key;
371
372 return onig_st_lookup(table, (st_data_t )(UINTN)(&key), value);
373 }
374
375 extern int
onig_st_insert_strend(hash_table_type * table,const UChar * str_key,const UChar * end_key,hash_data_type value)376 onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
377 const UChar* end_key, hash_data_type value)
378 {
379 st_str_end_key* key;
380 int result;
381
382 key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
383 CHECK_NULL_RETURN_MEMERR(key);
384 key->s = (UChar* )str_key;
385 key->end = (UChar* )end_key;
386 result = onig_st_insert(table, (st_data_t )(UINTN)key, value);
387 if (result) {
388 xfree(key);
389 }
390 return result;
391 }
392
393 #endif /* USE_ST_LIBRARY */
394
395
396 #ifdef USE_NAMED_GROUP
397
398 #define INIT_NAME_BACKREFS_ALLOC_NUM 8
399
/* One named group: its name and the group numbers defined under it. */
typedef struct {
  UChar* name;       /* heap copy of the name (made by strdup_with_null) */
  int    name_len;   /* byte length */
  int    back_num;   /* number of backrefs */
  int    back_alloc; /* capacity of back_refs, in ints */
  int    back_ref1;  /* the group number when back_num == 1 */
  int*   back_refs;  /* group numbers when back_num >= 2 (NULL before that) */
} NameEntry;
408
409 #ifdef USE_ST_LIBRARY
410
411 typedef st_table NameTable;
412 typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */
413
414 #define NAMEBUF_SIZE 24
415 #define NAMEBUF_SIZE_1 25
416
417 #ifdef ONIG_DEBUG
418 static int
i_print_name_entry(UChar * key,NameEntry * e,void * arg)419 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
420 {
421 int i;
422 FILE* fp = (FILE* )arg;
423
424 fprintf(fp, "%s: ", e->name);
425 if (e->back_num == 0)
426 fputs("-", fp);
427 else if (e->back_num == 1)
428 fprintf(fp, "%d", e->back_ref1);
429 else {
430 for (i = 0; i < e->back_num; i++) {
431 if (i > 0) fprintf(fp, ", ");
432 fprintf(fp, "%d", e->back_refs[i]);
433 }
434 }
435 fputs("\n", fp);
436 return ST_CONTINUE;
437 }
438
439 extern int
onig_print_names(FILE * fp,regex_t * reg)440 onig_print_names(FILE* fp, regex_t* reg)
441 {
442 NameTable* t = (NameTable* )reg->name_table;
443
444 if (IS_NOT_NULL(t)) {
445 fprintf(fp, "name table\n");
446 onig_st_foreach(t, i_print_name_entry, (HashDataType )fp);
447 fputs("\n", fp);
448 }
449 return 0;
450 }
451 #endif /* ONIG_DEBUG */
452
453 static int
i_free_name_entry(UChar * key,NameEntry * e,void * arg ARG_UNUSED)454 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
455 {
456 xfree(e->name);
457 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
458 xfree(key);
459 xfree(e);
460 return ST_DELETE;
461 }
462
463 static int
names_clear(regex_t * reg)464 names_clear(regex_t* reg)
465 {
466 NameTable* t = (NameTable* )reg->name_table;
467
468 if (IS_NOT_NULL(t)) {
469 onig_st_foreach(t, i_free_name_entry, 0);
470 }
471 return 0;
472 }
473
474 extern int
onig_names_free(regex_t * reg)475 onig_names_free(regex_t* reg)
476 {
477 int r;
478 NameTable* t;
479
480 r = names_clear(reg);
481 if (r) return r;
482
483 t = (NameTable* )reg->name_table;
484 if (IS_NOT_NULL(t)) onig_st_free_table(t);
485 reg->name_table = (void* )NULL;
486 return 0;
487 }
488
489 static NameEntry*
name_find(regex_t * reg,const UChar * name,const UChar * name_end)490 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
491 {
492 NameEntry* e;
493 NameTable* t = (NameTable* )reg->name_table;
494
495 e = (NameEntry* )NULL;
496 if (IS_NOT_NULL(t)) {
497 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
498 }
499 return e;
500 }
501
/* Context handed to i_names while onig_foreach_name walks the table. */
typedef struct {
  int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*); /* user callback */
  regex_t* reg;     /* forwarded to func */
  void* arg;        /* forwarded to func */
  int ret;          /* first non-zero result of func, 0 otherwise */
  OnigEncoding enc; /* set by onig_foreach_name from reg->enc; not read by i_names */
} INamesArg;
509
510 static int
i_names(UChar * key ARG_UNUSED,NameEntry * e,INamesArg * arg)511 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
512 {
513 int r = (*(arg->func))(e->name,
514 e->name + e->name_len,
515 e->back_num,
516 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
517 arg->reg, arg->arg);
518 if (r != 0) {
519 arg->ret = r;
520 return ST_STOP;
521 }
522 return ST_CONTINUE;
523 }
524
525 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)526 onig_foreach_name(regex_t* reg,
527 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
528 {
529 INamesArg narg;
530 NameTable* t = (NameTable* )reg->name_table;
531
532 narg.ret = 0;
533 if (IS_NOT_NULL(t)) {
534 narg.func = func;
535 narg.reg = reg;
536 narg.arg = arg;
537 narg.enc = reg->enc; /* should be pattern encoding. */
538 onig_st_foreach(t, i_names, (HashDataType )(UINTN)&narg);
539 }
540 return narg.ret;
541 }
542
543 static int
i_renumber_name(UChar * key ARG_UNUSED,NameEntry * e,GroupNumRemap * map)544 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
545 {
546 int i;
547
548 if (e->back_num > 1) {
549 for (i = 0; i < e->back_num; i++) {
550 e->back_refs[i] = map[e->back_refs[i]].new_val;
551 }
552 }
553 else if (e->back_num == 1) {
554 e->back_ref1 = map[e->back_ref1].new_val;
555 }
556
557 return ST_CONTINUE;
558 }
559
560 extern int
onig_renumber_name_table(regex_t * reg,GroupNumRemap * map)561 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
562 {
563 NameTable* t = (NameTable* )reg->name_table;
564
565 if (IS_NOT_NULL(t)) {
566 onig_st_foreach(t, i_renumber_name, (HashDataType )(UINTN)map);
567 }
568 return 0;
569 }
570
571
572 extern int
onig_number_of_names(regex_t * reg)573 onig_number_of_names(regex_t* reg)
574 {
575 NameTable* t = (NameTable* )reg->name_table;
576
577 if (IS_NOT_NULL(t))
578 return t->num_entries;
579 else
580 return 0;
581 }
582
583 #else /* USE_ST_LIBRARY */
584
585 #define INIT_NAMES_ALLOC_NUM 8
586
/* Array-based name table used when the st hash library is not available. */
typedef struct {
  NameEntry* e;     /* entry array */
  int        num;   /* entries in use */
  int        alloc; /* entries allocated */
} NameTable;
592
593 #ifdef ONIG_DEBUG
594 extern int
onig_print_names(FILE * fp,regex_t * reg)595 onig_print_names(FILE* fp, regex_t* reg)
596 {
597 int i, j;
598 NameEntry* e;
599 NameTable* t = (NameTable* )reg->name_table;
600
601 if (IS_NOT_NULL(t) && t->num > 0) {
602 fprintf(fp, "name table\n");
603 for (i = 0; i < t->num; i++) {
604 e = &(t->e[i]);
605 fprintf(fp, "%s: ", e->name);
606 if (e->back_num == 0) {
607 fputs("-", fp);
608 }
609 else if (e->back_num == 1) {
610 fprintf(fp, "%d", e->back_ref1);
611 }
612 else {
613 for (j = 0; j < e->back_num; j++) {
614 if (j > 0) fprintf(fp, ", ");
615 fprintf(fp, "%d", e->back_refs[j]);
616 }
617 }
618 fputs("\n", fp);
619 }
620 fputs("\n", fp);
621 }
622 return 0;
623 }
624 #endif
625
626 static int
names_clear(regex_t * reg)627 names_clear(regex_t* reg)
628 {
629 int i;
630 NameEntry* e;
631 NameTable* t = (NameTable* )reg->name_table;
632
633 if (IS_NOT_NULL(t)) {
634 for (i = 0; i < t->num; i++) {
635 e = &(t->e[i]);
636 if (IS_NOT_NULL(e->name)) {
637 xfree(e->name);
638 e->name = NULL;
639 e->name_len = 0;
640 e->back_num = 0;
641 e->back_alloc = 0;
642 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
643 e->back_refs = (int* )NULL;
644 }
645 }
646 if (IS_NOT_NULL(t->e)) {
647 xfree(t->e);
648 t->e = NULL;
649 }
650 t->num = 0;
651 }
652 return 0;
653 }
654
655 extern int
onig_names_free(regex_t * reg)656 onig_names_free(regex_t* reg)
657 {
658 int r;
659 NameTable* t;
660
661 r = names_clear(reg);
662 if (r) return r;
663
664 t = (NameTable* )reg->name_table;
665 if (IS_NOT_NULL(t)) xfree(t);
666 reg->name_table = NULL;
667 return 0;
668 }
669
670 static NameEntry*
name_find(regex_t * reg,UChar * name,UChar * name_end)671 name_find(regex_t* reg, UChar* name, UChar* name_end)
672 {
673 int i, len;
674 NameEntry* e;
675 NameTable* t = (NameTable* )reg->name_table;
676
677 if (IS_NOT_NULL(t)) {
678 len = name_end - name;
679 for (i = 0; i < t->num; i++) {
680 e = &(t->e[i]);
681 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
682 return e;
683 }
684 }
685 return (NameEntry* )NULL;
686 }
687
688 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)689 onig_foreach_name(regex_t* reg,
690 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
691 {
692 int i, r;
693 NameEntry* e;
694 NameTable* t = (NameTable* )reg->name_table;
695
696 if (IS_NOT_NULL(t)) {
697 for (i = 0; i < t->num; i++) {
698 e = &(t->e[i]);
699 r = (*func)(e->name, e->name + e->name_len, e->back_num,
700 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
701 reg, arg);
702 if (r != 0) return r;
703 }
704 }
705 return 0;
706 }
707
708 extern int
onig_number_of_names(regex_t * reg)709 onig_number_of_names(regex_t* reg)
710 {
711 NameTable* t = (NameTable* )reg->name_table;
712
713 if (IS_NOT_NULL(t))
714 return t->num;
715 else
716 return 0;
717 }
718
719 #endif /* else USE_ST_LIBRARY */
720
721 static int
name_add(regex_t * reg,UChar * name,UChar * name_end,int backref,ScanEnv * env)722 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
723 {
724 int alloc;
725 NameEntry* e;
726 NameTable* t = (NameTable* )reg->name_table;
727
728 if (name_end - name <= 0)
729 return ONIGERR_EMPTY_GROUP_NAME;
730
731 e = name_find(reg, name, name_end);
732 if (IS_NULL(e)) {
733 #ifdef USE_ST_LIBRARY
734 if (IS_NULL(t)) {
735 t = onig_st_init_strend_table_with_size(5);
736 CHECK_NULL_RETURN_MEMERR(t);
737 reg->name_table = (void* )t;
738 }
739 e = (NameEntry* )xmalloc(sizeof(NameEntry));
740 CHECK_NULL_RETURN_MEMERR(e);
741
742 e->name = strdup_with_null(reg->enc, name, name_end);
743 if (IS_NULL(e->name)) {
744 xfree(e); return ONIGERR_MEMORY;
745 }
746 onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
747 (HashDataType )(UINTN)e);
748
749 e->name_len = (int)(name_end - name);
750 e->back_num = 0;
751 e->back_alloc = 0;
752 e->back_refs = (int* )NULL;
753
754 #else
755
756 if (IS_NULL(t)) {
757 alloc = INIT_NAMES_ALLOC_NUM;
758 t = (NameTable* )xmalloc(sizeof(NameTable));
759 CHECK_NULL_RETURN_MEMERR(t);
760 t->e = NULL;
761 t->alloc = 0;
762 t->num = 0;
763
764 t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
765 if (IS_NULL(t->e)) {
766 xfree(t);
767 return ONIGERR_MEMORY;
768 }
769 t->alloc = alloc;
770 reg->name_table = t;
771 goto clear;
772 }
773 else if (t->num == t->alloc) {
774 int i;
775
776 alloc = t->alloc * 2;
777 t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
778 CHECK_NULL_RETURN_MEMERR(t->e);
779 t->alloc = alloc;
780
781 clear:
782 for (i = t->num; i < t->alloc; i++) {
783 t->e[i].name = NULL;
784 t->e[i].name_len = 0;
785 t->e[i].back_num = 0;
786 t->e[i].back_alloc = 0;
787 t->e[i].back_refs = (int* )NULL;
788 }
789 }
790 e = &(t->e[t->num]);
791 t->num++;
792 e->name = strdup_with_null(reg->enc, name, name_end);
793 if (IS_NULL(e->name)) return ONIGERR_MEMORY;
794 e->name_len = name_end - name;
795 #endif
796 }
797
798 if (e->back_num >= 1 &&
799 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
800 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
801 name, name_end);
802 return ONIGERR_MULTIPLEX_DEFINED_NAME;
803 }
804
805 e->back_num++;
806 if (e->back_num == 1) {
807 e->back_ref1 = backref;
808 }
809 else {
810 if (e->back_num == 2) {
811 alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
812 e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
813 CHECK_NULL_RETURN_MEMERR(e->back_refs);
814 e->back_alloc = alloc;
815 e->back_refs[0] = e->back_ref1;
816 e->back_refs[1] = backref;
817 }
818 else {
819 if (e->back_num > e->back_alloc) {
820 alloc = e->back_alloc * 2;
821 e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc, sizeof(int) * e->back_alloc);
822 CHECK_NULL_RETURN_MEMERR(e->back_refs);
823 e->back_alloc = alloc;
824 }
825 e->back_refs[e->back_num - 1] = backref;
826 }
827 }
828
829 return 0;
830 }
831
832 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)833 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
834 const UChar* name_end, int** nums)
835 {
836 NameEntry* e = name_find(reg, name, name_end);
837
838 if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
839
840 switch (e->back_num) {
841 case 0:
842 break;
843 case 1:
844 *nums = &(e->back_ref1);
845 break;
846 default:
847 *nums = e->back_refs;
848 break;
849 }
850 return e->back_num;
851 }
852
853 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)854 onig_name_to_backref_number(regex_t* reg, const UChar* name,
855 const UChar* name_end, OnigRegion *region)
856 {
857 int i, n, *nums;
858
859 n = onig_name_to_group_numbers(reg, name, name_end, &nums);
860 if (n < 0)
861 return n;
862 else if (n == 0)
863 return ONIGERR_PARSER_BUG;
864 else if (n == 1)
865 return nums[0];
866 else {
867 if (IS_NOT_NULL(region)) {
868 for (i = n - 1; i >= 0; i--) {
869 if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
870 return nums[i];
871 }
872 }
873 return nums[n - 1];
874 }
875 }
876
877 #else /* USE_NAMED_GROUP */
878
879 extern int
onig_name_to_group_numbers(regex_t * reg,const UChar * name,const UChar * name_end,int ** nums)880 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
881 const UChar* name_end, int** nums)
882 {
883 return ONIG_NO_SUPPORT_CONFIG;
884 }
885
886 extern int
onig_name_to_backref_number(regex_t * reg,const UChar * name,const UChar * name_end,OnigRegion * region)887 onig_name_to_backref_number(regex_t* reg, const UChar* name,
888 const UChar* name_end, OnigRegion* region)
889 {
890 return ONIG_NO_SUPPORT_CONFIG;
891 }
892
893 extern int
onig_foreach_name(regex_t * reg,int (* func)(const UChar *,const UChar *,int,int *,regex_t *,void *),void * arg)894 onig_foreach_name(regex_t* reg,
895 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
896 {
897 return ONIG_NO_SUPPORT_CONFIG;
898 }
899
900 extern int
onig_number_of_names(regex_t * reg)901 onig_number_of_names(regex_t* reg)
902 {
903 return 0;
904 }
905 #endif /* else USE_NAMED_GROUP */
906
907 extern int
onig_noname_group_capture_is_active(regex_t * reg)908 onig_noname_group_capture_is_active(regex_t* reg)
909 {
910 if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
911 return 0;
912
913 #ifdef USE_NAMED_GROUP
914 if (onig_number_of_names(reg) > 0 &&
915 IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
916 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
917 return 0;
918 }
919 #endif
920
921 return 1;
922 }
923
924
925 #define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16
926
927 static void
scan_env_clear(ScanEnv * env)928 scan_env_clear(ScanEnv* env)
929 {
930 int i;
931
932 BIT_STATUS_CLEAR(env->capture_history);
933 BIT_STATUS_CLEAR(env->bt_mem_start);
934 BIT_STATUS_CLEAR(env->bt_mem_end);
935 BIT_STATUS_CLEAR(env->backrefed_mem);
936 env->error = (UChar* )NULL;
937 env->error_end = (UChar* )NULL;
938 env->num_call = 0;
939 env->num_mem = 0;
940 #ifdef USE_NAMED_GROUP
941 env->num_named = 0;
942 #endif
943 env->mem_alloc = 0;
944 env->mem_nodes_dynamic = (Node** )NULL;
945
946 for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
947 env->mem_nodes_static[i] = NULL_NODE;
948
949 #ifdef USE_COMBINATION_EXPLOSION_CHECK
950 env->num_comb_exp_check = 0;
951 env->comb_exp_max_regnum = 0;
952 env->curr_max_regnum = 0;
953 env->has_recursion = 0;
954 #endif
955 }
956
957 static int
scan_env_add_mem_entry(ScanEnv * env)958 scan_env_add_mem_entry(ScanEnv* env)
959 {
960 int i, need, alloc;
961 Node** p;
962
963 need = env->num_mem + 1;
964 if (need >= SCANENV_MEMNODES_SIZE) {
965 if (env->mem_alloc <= need) {
966 if (IS_NULL(env->mem_nodes_dynamic)) {
967 alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
968 p = (Node** )xmalloc(sizeof(Node*) * alloc);
969 CHECK_NULL_RETURN_MEMERR(p);
970
971 xmemcpy(p, env->mem_nodes_static,
972 sizeof(Node*) * SCANENV_MEMNODES_SIZE);
973 }
974 else {
975 alloc = env->mem_alloc * 2;
976 p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc, sizeof(Node*) * env->mem_alloc);
977 }
978 CHECK_NULL_RETURN_MEMERR(p);
979
980 for (i = env->num_mem + 1; i < alloc; i++)
981 p[i] = NULL_NODE;
982
983 env->mem_nodes_dynamic = p;
984 env->mem_alloc = alloc;
985 }
986 }
987
988 env->num_mem++;
989 return env->num_mem;
990 }
991
992 static int
scan_env_set_mem_node(ScanEnv * env,int num,Node * node)993 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
994 {
995 if (env->num_mem >= num)
996 SCANENV_MEM_NODES(env)[num] = node;
997 else
998 return ONIGERR_PARSER_BUG;
999 return 0;
1000 }
1001
1002
#ifdef USE_PARSE_TREE_NODE_RECYCLE
/* Link cell overlaid on a released Node while it sits on the free list. */
typedef struct _FreeNode {
  struct _FreeNode* next;
} FreeNode;

/* Head of the list of released nodes: onig_node_free pushes onto it,
   node_new pops from it, onig_free_node_list empties it. */
static FreeNode* FreeNodeList = (FreeNode* )NULL;
#endif
1010
1011 extern void
onig_node_free(Node * node)1012 onig_node_free(Node* node)
1013 {
1014 start:
1015 if (IS_NULL(node)) return ;
1016
1017 switch (NTYPE(node)) {
1018 case NT_STR:
1019 if (NSTR(node)->capa != 0 &&
1020 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1021 xfree(NSTR(node)->s);
1022 }
1023 break;
1024
1025 case NT_LIST:
1026 case NT_ALT:
1027 onig_node_free(NCAR(node));
1028 {
1029 Node* next_node = NCDR(node);
1030
1031 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1032 {
1033 FreeNode* n = (FreeNode* )node;
1034
1035 THREAD_ATOMIC_START;
1036 n->next = FreeNodeList;
1037 FreeNodeList = n;
1038 THREAD_ATOMIC_END;
1039 }
1040 #else
1041 xfree(node);
1042 #endif
1043 node = next_node;
1044 goto start;
1045 }
1046 break;
1047
1048 case NT_CCLASS:
1049 {
1050 CClassNode* cc = NCCLASS(node);
1051
1052 if (IS_NCCLASS_SHARE(cc)) return ;
1053 if (cc->mbuf)
1054 bbuf_free(cc->mbuf);
1055 }
1056 break;
1057
1058 case NT_QTFR:
1059 if (NQTFR(node)->target)
1060 onig_node_free(NQTFR(node)->target);
1061 break;
1062
1063 case NT_ENCLOSE:
1064 if (NENCLOSE(node)->target)
1065 onig_node_free(NENCLOSE(node)->target);
1066 break;
1067
1068 case NT_BREF:
1069 if (IS_NOT_NULL(NBREF(node)->back_dynamic))
1070 xfree(NBREF(node)->back_dynamic);
1071 break;
1072
1073 case NT_ANCHOR:
1074 if (NANCHOR(node)->target)
1075 onig_node_free(NANCHOR(node)->target);
1076 break;
1077 }
1078
1079 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1080 {
1081 FreeNode* n = (FreeNode* )node;
1082
1083 THREAD_ATOMIC_START;
1084 n->next = FreeNodeList;
1085 FreeNodeList = n;
1086 THREAD_ATOMIC_END;
1087 }
1088 #else
1089 xfree(node);
1090 #endif
1091 }
1092
1093 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1094 extern int
onig_free_node_list(void)1095 onig_free_node_list(void)
1096 {
1097 FreeNode* n;
1098
1099 /* THREAD_ATOMIC_START; */
1100 while (IS_NOT_NULL(FreeNodeList)) {
1101 n = FreeNodeList;
1102 FreeNodeList = FreeNodeList->next;
1103 xfree(n);
1104 }
1105 /* THREAD_ATOMIC_END; */
1106 return 0;
1107 }
1108 #endif
1109
1110 static Node*
node_new(void)1111 node_new(void)
1112 {
1113 Node* node;
1114
1115 #ifdef USE_PARSE_TREE_NODE_RECYCLE
1116 THREAD_ATOMIC_START;
1117 if (IS_NOT_NULL(FreeNodeList)) {
1118 node = (Node* )FreeNodeList;
1119 FreeNodeList = FreeNodeList->next;
1120 THREAD_ATOMIC_END;
1121 return node;
1122 }
1123 THREAD_ATOMIC_END;
1124 #endif
1125
1126 node = (Node* )xmalloc(sizeof(Node));
1127 /* xmemset(node, 0, sizeof(Node)); */
1128 return node;
1129 }
1130
1131
1132 static void
initialize_cclass(CClassNode * cc)1133 initialize_cclass(CClassNode* cc)
1134 {
1135 BITSET_CLEAR(cc->bs);
1136 /* cc->base.flags = 0; */
1137 cc->flags = 0;
1138 cc->mbuf = NULL;
1139 }
1140
1141 static Node*
node_new_cclass(void)1142 node_new_cclass(void)
1143 {
1144 Node* node = node_new();
1145 CHECK_NULL_RETURN(node);
1146
1147 SET_NTYPE(node, NT_CCLASS);
1148 initialize_cclass(NCCLASS(node));
1149 return node;
1150 }
1151
1152 static Node*
node_new_cclass_by_codepoint_range(int not,OnigCodePoint sb_out,const OnigCodePoint ranges[])1153 node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out,
1154 const OnigCodePoint ranges[])
1155 {
1156 int n, i;
1157 CClassNode* cc;
1158 OnigCodePoint j;
1159
1160 Node* node = node_new_cclass();
1161 CHECK_NULL_RETURN(node);
1162
1163 cc = NCCLASS(node);
1164 if (not != 0) NCCLASS_SET_NOT(cc);
1165
1166 BITSET_CLEAR(cc->bs);
1167 if (sb_out > 0 && IS_NOT_NULL(ranges)) {
1168 n = ONIGENC_CODE_RANGE_NUM(ranges);
1169 for (i = 0; i < n; i++) {
1170 for (j = ONIGENC_CODE_RANGE_FROM(ranges, i);
1171 j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) {
1172 if (j >= sb_out) goto sb_end;
1173
1174 BITSET_SET_BIT(cc->bs, j);
1175 }
1176 }
1177 }
1178
1179 sb_end:
1180 if (IS_NULL(ranges)) {
1181 is_null:
1182 cc->mbuf = NULL;
1183 }
1184 else {
1185 BBuf* bbuf;
1186
1187 n = ONIGENC_CODE_RANGE_NUM(ranges);
1188 if (n == 0) goto is_null;
1189
1190 bbuf = (BBuf* )xmalloc(sizeof(BBuf));
1191 CHECK_NULL_RETURN(bbuf);
1192 bbuf->alloc = n + 1;
1193 bbuf->used = n + 1;
1194 bbuf->p = (UChar* )((void* )ranges);
1195
1196 cc->mbuf = bbuf;
1197 }
1198
1199 return node;
1200 }
1201
1202 static Node*
node_new_ctype(int type,int not)1203 node_new_ctype(int type, int not)
1204 {
1205 Node* node = node_new();
1206 CHECK_NULL_RETURN(node);
1207
1208 SET_NTYPE(node, NT_CTYPE);
1209 NCTYPE(node)->ctype = type;
1210 NCTYPE(node)->not = not;
1211 return node;
1212 }
1213
1214 static Node*
node_new_anychar(void)1215 node_new_anychar(void)
1216 {
1217 Node* node = node_new();
1218 CHECK_NULL_RETURN(node);
1219
1220 SET_NTYPE(node, NT_CANY);
1221 return node;
1222 }
1223
1224 static Node*
node_new_list(Node * left,Node * right)1225 node_new_list(Node* left, Node* right)
1226 {
1227 Node* node = node_new();
1228 CHECK_NULL_RETURN(node);
1229
1230 SET_NTYPE(node, NT_LIST);
1231 NCAR(node) = left;
1232 NCDR(node) = right;
1233 return node;
1234 }
1235
1236 extern Node*
onig_node_new_list(Node * left,Node * right)1237 onig_node_new_list(Node* left, Node* right)
1238 {
1239 return node_new_list(left, right);
1240 }
1241
1242 extern Node*
onig_node_list_add(Node * list,Node * x)1243 onig_node_list_add(Node* list, Node* x)
1244 {
1245 Node *n;
1246
1247 n = onig_node_new_list(x, NULL);
1248 if (IS_NULL(n)) return NULL_NODE;
1249
1250 if (IS_NOT_NULL(list)) {
1251 while (IS_NOT_NULL(NCDR(list)))
1252 list = NCDR(list);
1253
1254 NCDR(list) = n;
1255 }
1256
1257 return n;
1258 }
1259
1260 extern Node*
onig_node_new_alt(Node * left,Node * right)1261 onig_node_new_alt(Node* left, Node* right)
1262 {
1263 Node* node = node_new();
1264 CHECK_NULL_RETURN(node);
1265
1266 SET_NTYPE(node, NT_ALT);
1267 NCAR(node) = left;
1268 NCDR(node) = right;
1269 return node;
1270 }
1271
1272 extern Node*
onig_node_new_anchor(int type)1273 onig_node_new_anchor(int type)
1274 {
1275 Node* node = node_new();
1276 CHECK_NULL_RETURN(node);
1277
1278 SET_NTYPE(node, NT_ANCHOR);
1279 NANCHOR(node)->type = type;
1280 NANCHOR(node)->target = NULL;
1281 NANCHOR(node)->char_len = -1;
1282 return node;
1283 }
1284
1285 static Node*
node_new_backref(int back_num,int * backrefs,int by_name,int exist_level,int nest_level,ScanEnv * env)1286 node_new_backref(int back_num, int* backrefs, int by_name,
1287 #ifdef USE_BACKREF_WITH_LEVEL
1288 int exist_level, int nest_level,
1289 #endif
1290 ScanEnv* env)
1291 {
1292 int i;
1293 Node* node = node_new();
1294
1295 CHECK_NULL_RETURN(node);
1296
1297 SET_NTYPE(node, NT_BREF);
1298 NBREF(node)->state = 0;
1299 NBREF(node)->back_num = back_num;
1300 NBREF(node)->back_dynamic = (int* )NULL;
1301 if (by_name != 0)
1302 NBREF(node)->state |= NST_NAME_REF;
1303
1304 #ifdef USE_BACKREF_WITH_LEVEL
1305 if (exist_level != 0) {
1306 NBREF(node)->state |= NST_NEST_LEVEL;
1307 NBREF(node)->nest_level = nest_level;
1308 }
1309 #endif
1310
1311 for (i = 0; i < back_num; i++) {
1312 if (backrefs[i] <= env->num_mem &&
1313 IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
1314 NBREF(node)->state |= NST_RECURSION; /* /...(\1).../ */
1315 break;
1316 }
1317 }
1318
1319 if (back_num <= NODE_BACKREFS_SIZE) {
1320 for (i = 0; i < back_num; i++)
1321 NBREF(node)->back_static[i] = backrefs[i];
1322 }
1323 else {
1324 int* p = (int* )xmalloc(sizeof(int) * back_num);
1325 if (IS_NULL(p)) {
1326 onig_node_free(node);
1327 return NULL;
1328 }
1329 NBREF(node)->back_dynamic = p;
1330 for (i = 0; i < back_num; i++)
1331 p[i] = backrefs[i];
1332 }
1333 return node;
1334 }
1335
#ifdef USE_SUBEXP_CALL
/* Create an NT_CALL node with no target yet.  The name is stored as the
   pointer pair [name, name_end) without copying; `gnum` != 0 means the
   call is by group number. */
static Node*
node_new_call(UChar* name, UChar* name_end, int gnum)
{
  Node* call;

  call = node_new();
  CHECK_NULL_RETURN(call);

  SET_NTYPE(call, NT_CALL);
  NCALL(call)->group_num = gnum;  /* call by number if gnum != 0 */
  NCALL(call)->name_end  = name_end;
  NCALL(call)->name      = name;
  NCALL(call)->target    = NULL_NODE;
  NCALL(call)->state     = 0;
  return call;
}
#endif
1352
1353 static Node*
node_new_quantifier(int lower,int upper,int by_number)1354 node_new_quantifier(int lower, int upper, int by_number)
1355 {
1356 Node* node = node_new();
1357 CHECK_NULL_RETURN(node);
1358
1359 SET_NTYPE(node, NT_QTFR);
1360 NQTFR(node)->state = 0;
1361 NQTFR(node)->target = NULL;
1362 NQTFR(node)->lower = lower;
1363 NQTFR(node)->upper = upper;
1364 NQTFR(node)->greedy = 1;
1365 NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY;
1366 NQTFR(node)->head_exact = NULL_NODE;
1367 NQTFR(node)->next_head_exact = NULL_NODE;
1368 NQTFR(node)->is_refered = 0;
1369 if (by_number != 0)
1370 NQTFR(node)->state |= NST_BY_NUMBER;
1371
1372 #ifdef USE_COMBINATION_EXPLOSION_CHECK
1373 NQTFR(node)->comb_exp_check_num = 0;
1374 #endif
1375
1376 return node;
1377 }
1378
1379 static Node*
node_new_enclose(int type)1380 node_new_enclose(int type)
1381 {
1382 Node* node = node_new();
1383 CHECK_NULL_RETURN(node);
1384
1385 SET_NTYPE(node, NT_ENCLOSE);
1386 NENCLOSE(node)->type = type;
1387 NENCLOSE(node)->state = 0;
1388 NENCLOSE(node)->regnum = 0;
1389 NENCLOSE(node)->option = 0;
1390 NENCLOSE(node)->target = NULL;
1391 NENCLOSE(node)->call_addr = -1;
1392 NENCLOSE(node)->opt_count = 0;
1393 return node;
1394 }
1395
1396 extern Node*
onig_node_new_enclose(int type)1397 onig_node_new_enclose(int type)
1398 {
1399 return node_new_enclose(type);
1400 }
1401
1402 static Node*
node_new_enclose_memory(OnigOptionType option,int is_named)1403 node_new_enclose_memory(OnigOptionType option, int is_named)
1404 {
1405 Node* node = node_new_enclose(ENCLOSE_MEMORY);
1406 CHECK_NULL_RETURN(node);
1407 if (is_named != 0)
1408 SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP);
1409
1410 #ifdef USE_SUBEXP_CALL
1411 NENCLOSE(node)->option = option;
1412 #endif
1413 return node;
1414 }
1415
1416 static Node*
node_new_option(OnigOptionType option)1417 node_new_option(OnigOptionType option)
1418 {
1419 Node* node = node_new_enclose(ENCLOSE_OPTION);
1420 CHECK_NULL_RETURN(node);
1421 NENCLOSE(node)->option = option;
1422 return node;
1423 }
1424
1425 extern int
onig_node_str_cat(Node * node,const UChar * s,const UChar * end)1426 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
1427 {
1428 int addlen = (int)(end - s);
1429
1430 if (addlen > 0) {
1431 int len = (int)(NSTR(node)->end - NSTR(node)->s);
1432
1433 if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
1434 UChar* p;
1435 int capa = len + addlen + NODE_STR_MARGIN;
1436
1437 if (capa <= NSTR(node)->capa) {
1438 onig_strcpy(NSTR(node)->s + len, s, end);
1439 }
1440 else {
1441 if (NSTR(node)->s == NSTR(node)->buf)
1442 p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end,
1443 s, end, capa);
1444 else
1445 p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa, NSTR(node)->capa);
1446
1447 CHECK_NULL_RETURN_MEMERR(p);
1448 NSTR(node)->s = p;
1449 NSTR(node)->capa = capa;
1450 }
1451 }
1452 else {
1453 onig_strcpy(NSTR(node)->s + len, s, end);
1454 }
1455 NSTR(node)->end = NSTR(node)->s + len + addlen;
1456 }
1457
1458 return 0;
1459 }
1460
1461 extern int
onig_node_str_set(Node * node,const UChar * s,const UChar * end)1462 onig_node_str_set(Node* node, const UChar* s, const UChar* end)
1463 {
1464 onig_node_str_clear(node);
1465 return onig_node_str_cat(node, s, end);
1466 }
1467
1468 static int
node_str_cat_char(Node * node,UChar c)1469 node_str_cat_char(Node* node, UChar c)
1470 {
1471 UChar s[1];
1472
1473 s[0] = c;
1474 return onig_node_str_cat(node, s, s + 1);
1475 }
1476
1477 extern void
onig_node_conv_to_str_node(Node * node,int flag)1478 onig_node_conv_to_str_node(Node* node, int flag)
1479 {
1480 SET_NTYPE(node, NT_STR);
1481 NSTR(node)->flag = flag;
1482 NSTR(node)->capa = 0;
1483 NSTR(node)->s = NSTR(node)->buf;
1484 NSTR(node)->end = NSTR(node)->buf;
1485 }
1486
1487 extern void
onig_node_str_clear(Node * node)1488 onig_node_str_clear(Node* node)
1489 {
1490 if (NSTR(node)->capa != 0 &&
1491 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1492 xfree(NSTR(node)->s);
1493 }
1494
1495 NSTR(node)->capa = 0;
1496 NSTR(node)->flag = 0;
1497 NSTR(node)->s = NSTR(node)->buf;
1498 NSTR(node)->end = NSTR(node)->buf;
1499 }
1500
1501 static Node*
node_new_str(const UChar * s,const UChar * end)1502 node_new_str(const UChar* s, const UChar* end)
1503 {
1504 Node* node = node_new();
1505 CHECK_NULL_RETURN(node);
1506
1507 SET_NTYPE(node, NT_STR);
1508 NSTR(node)->capa = 0;
1509 NSTR(node)->flag = 0;
1510 NSTR(node)->s = NSTR(node)->buf;
1511 NSTR(node)->end = NSTR(node)->buf;
1512 if (onig_node_str_cat(node, s, end)) {
1513 onig_node_free(node);
1514 return NULL;
1515 }
1516 return node;
1517 }
1518
1519 extern Node*
onig_node_new_str(const UChar * s,const UChar * end)1520 onig_node_new_str(const UChar* s, const UChar* end)
1521 {
1522 return node_new_str(s, end);
1523 }
1524
1525 static Node*
node_new_str_raw(UChar * s,UChar * end)1526 node_new_str_raw(UChar* s, UChar* end)
1527 {
1528 Node* node = node_new_str(s, end);
1529 CHECK_NULL_RETURN(node);
1530 NSTRING_SET_RAW(node);
1531 return node;
1532 }
1533
1534 static Node*
node_new_empty(void)1535 node_new_empty(void)
1536 {
1537 return node_new_str(NULL, NULL);
1538 }
1539
1540 static Node*
node_new_str_raw_char(UChar c)1541 node_new_str_raw_char(UChar c)
1542 {
1543 UChar p[1];
1544
1545 p[0] = c;
1546 return node_new_str_raw(p, p + 1);
1547 }
1548
1549 static Node*
str_node_split_last_char(StrNode * sn,OnigEncoding enc)1550 str_node_split_last_char(StrNode* sn, OnigEncoding enc)
1551 {
1552 const UChar *p;
1553 Node* n = NULL_NODE;
1554
1555 if (sn->end > sn->s) {
1556 p = onigenc_get_prev_char_head(enc, sn->s, sn->end);
1557 if (p && p > sn->s) { /* can be splitted. */
1558 n = node_new_str(p, sn->end);
1559 CHECK_NULL_RETURN(n);
1560 if ((sn->flag & NSTR_RAW) != 0)
1561 NSTRING_SET_RAW(n);
1562 sn->end = (UChar* )p;
1563 }
1564 }
1565 return n;
1566 }
1567
1568 static int
str_node_can_be_split(StrNode * sn,OnigEncoding enc)1569 str_node_can_be_split(StrNode* sn, OnigEncoding enc)
1570 {
1571 if (sn->end > sn->s) {
1572 return ((enclen(enc, sn->s) < sn->end - sn->s) ? 1 : 0);
1573 }
1574 return 0;
1575 }
1576
#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
/*
 * Insert `num` copies of `val` in front of the string held by `sn`.
 * The current content is staged in a local buffer of NODE_STR_BUF_SIZE
 * bytes, copied back `num` bytes further on, and the gap is filled.
 *
 * NOTE(review): no bounds check is made here; presumably callers only pad
 * short strings that still fit in the node buffer -- confirm.
 *
 * Returns 0.  (The function is declared int but previously fell off its
 * end without returning a value.)
 */
static int
node_str_head_pad(StrNode* sn, int num, UChar val)
{
  UChar buf[NODE_STR_BUF_SIZE];
  int i, len;

  len = (int )(sn->end - sn->s);
  onig_strcpy(buf, sn->s, sn->end);
  onig_strcpy(&(sn->s[num]), buf, buf + len);
  sn->end += num;

  for (i = 0; i < num; i++) {
    sn->s[i] = val;
  }

  return 0;
}
#endif
1594
1595 extern int
onig_scan_unsigned_number(UChar ** src,const UChar * end,OnigEncoding enc)1596 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
1597 {
1598 unsigned int num, val;
1599 OnigCodePoint c;
1600 UChar* p = *src;
1601 PFETCH_READY;
1602
1603 num = 0;
1604 while (!PEND) {
1605 PFETCH(c);
1606 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
1607 val = (unsigned int )DIGITVAL(c);
1608 if ((INT_MAX_LIMIT - val) / 10UL < num)
1609 return -1; /* overflow */
1610
1611 num = num * 10 + val;
1612 }
1613 else {
1614 PUNFETCH;
1615 break;
1616 }
1617 }
1618 *src = p;
1619 return num;
1620 }
1621
1622 static int
scan_unsigned_hexadecimal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1623 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
1624 OnigEncoding enc)
1625 {
1626 OnigCodePoint c;
1627 unsigned int num, val;
1628 UChar* p = *src;
1629 PFETCH_READY;
1630
1631 num = 0;
1632 while (!PEND && maxlen-- != 0) {
1633 PFETCH(c);
1634 if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
1635 val = (unsigned int )XDIGITVAL(enc,c);
1636 if ((INT_MAX_LIMIT - val) / 16UL < num)
1637 return -1; /* overflow */
1638
1639 num = (num << 4) + XDIGITVAL(enc,c);
1640 }
1641 else {
1642 PUNFETCH;
1643 break;
1644 }
1645 }
1646 *src = p;
1647 return num;
1648 }
1649
1650 static int
scan_unsigned_octal_number(UChar ** src,UChar * end,int maxlen,OnigEncoding enc)1651 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
1652 OnigEncoding enc)
1653 {
1654 OnigCodePoint c;
1655 unsigned int num, val;
1656 UChar* p = *src;
1657 PFETCH_READY;
1658
1659 num = 0;
1660 while (!PEND && maxlen-- != 0) {
1661 PFETCH(c);
1662 if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
1663 val = ODIGITVAL(c);
1664 if ((INT_MAX_LIMIT - val) / 8UL < num)
1665 return -1; /* overflow */
1666
1667 num = (num << 3) + val;
1668 }
1669 else {
1670 PUNFETCH;
1671 break;
1672 }
1673 }
1674 *src = p;
1675 return num;
1676 }
1677
1678
/* Write SIZE_CODE_POINT bytes taken from `code` at byte offset `pos` of
   `bbuf`.  `code` must be an lvalue, because its address is taken. */
#define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
    BBUF_WRITE((bbuf), (pos), &(code), SIZE_CODE_POINT)
1681
1682 /* data format:
1683 [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
1684 (all data size is OnigCodePoint)
1685 */
1686 static int
new_code_range(BBuf ** pbuf)1687 new_code_range(BBuf** pbuf)
1688 {
1689 #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5)
1690 int r;
1691 OnigCodePoint n;
1692 BBuf* bbuf;
1693
1694 bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
1695 CHECK_NULL_RETURN_MEMERR(*pbuf);
1696 r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
1697 if (r) return r;
1698
1699 n = 0;
1700 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1701 return 0;
1702 }
1703
1704 static int
add_code_range_to_buf(BBuf ** pbuf,OnigCodePoint from,OnigCodePoint to)1705 add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
1706 {
1707 int r, inc_n, pos;
1708 int low, high, bound, x;
1709 OnigCodePoint n, *data;
1710 BBuf* bbuf;
1711
1712 if (from > to) {
1713 n = from; from = to; to = n;
1714 }
1715
1716 if (IS_NULL(*pbuf)) {
1717 r = new_code_range(pbuf);
1718 if (r) return r;
1719 bbuf = *pbuf;
1720 n = 0;
1721 }
1722 else {
1723 bbuf = *pbuf;
1724 GET_CODE_POINT(n, bbuf->p);
1725 }
1726 data = (OnigCodePoint* )(bbuf->p);
1727 data++;
1728
1729 for (low = 0, bound = n; low < bound; ) {
1730 x = (low + bound) >> 1;
1731 if (from > data[x*2 + 1])
1732 low = x + 1;
1733 else
1734 bound = x;
1735 }
1736
1737 for (high = low, bound = n; high < bound; ) {
1738 x = (high + bound) >> 1;
1739 if (to >= data[x*2] - 1)
1740 high = x + 1;
1741 else
1742 bound = x;
1743 }
1744
1745 inc_n = low + 1 - high;
1746 if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
1747 return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
1748
1749 if (inc_n != 1) {
1750 if (from > data[low*2])
1751 from = data[low*2];
1752 if (to < data[(high - 1)*2 + 1])
1753 to = data[(high - 1)*2 + 1];
1754 }
1755
1756 if (inc_n != 0 && (OnigCodePoint )high < n) {
1757 int from_pos = SIZE_CODE_POINT * (1 + high * 2);
1758 int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2);
1759 int size = (n - high) * 2 * SIZE_CODE_POINT;
1760
1761 if (inc_n > 0) {
1762 BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
1763 }
1764 else {
1765 BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
1766 }
1767 }
1768
1769 pos = SIZE_CODE_POINT * (1 + low * 2);
1770 BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
1771 BBUF_WRITE_CODE_POINT(bbuf, pos, from);
1772 BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
1773 n += inc_n;
1774 BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1775
1776 return 0;
1777 }
1778
1779 static int
add_code_range(BBuf ** pbuf,ScanEnv * env,OnigCodePoint from,OnigCodePoint to)1780 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1781 {
1782 if (from > to) {
1783 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
1784 return 0;
1785 else
1786 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
1787 }
1788
1789 return add_code_range_to_buf(pbuf, from, to);
1790 }
1791
1792 static int
not_code_range_buf(OnigEncoding enc,BBuf * bbuf,BBuf ** pbuf)1793 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
1794 {
1795 int r, i, n;
1796 OnigCodePoint pre, from, *data, to = 0;
1797
1798 *pbuf = (BBuf* )NULL;
1799 if (IS_NULL(bbuf)) {
1800 set_all:
1801 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1802 }
1803
1804 data = (OnigCodePoint* )(bbuf->p);
1805 GET_CODE_POINT(n, data);
1806 data++;
1807 if (n <= 0) goto set_all;
1808
1809 r = 0;
1810 pre = MBCODE_START_POS(enc);
1811 for (i = 0; i < n; i++) {
1812 from = data[i*2];
1813 to = data[i*2+1];
1814 if (pre <= from - 1) {
1815 r = add_code_range_to_buf(pbuf, pre, from - 1);
1816 if (r != 0) return r;
1817 }
1818 if (to == ~((OnigCodePoint )0)) break;
1819 pre = to + 1;
1820 }
1821 if (to < ~((OnigCodePoint )0)) {
1822 r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0));
1823 }
1824 return r;
1825 }
1826
/* Exchange two (range buffer, negation flag) pairs.  All four arguments
   must be assignable lvalues. */
#define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
  BBuf *swap_buf_ = (bbuf1);\
  int   swap_not_ = (not1);\
  (not1)  = (not2);  (not2)  = swap_not_;\
  (bbuf1) = (bbuf2); (bbuf2) = swap_buf_;\
} while (0)
1833
1834 static int
or_code_range_buf(OnigEncoding enc,BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1835 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
1836 BBuf* bbuf2, int not2, BBuf** pbuf)
1837 {
1838 int r;
1839 OnigCodePoint i, n1, *data1;
1840 OnigCodePoint from, to;
1841
1842 *pbuf = (BBuf* )NULL;
1843 if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
1844 if (not1 != 0 || not2 != 0)
1845 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1846 return 0;
1847 }
1848
1849 r = 0;
1850 if (IS_NULL(bbuf2))
1851 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1852
1853 if (IS_NULL(bbuf1)) {
1854 if (not1 != 0) {
1855 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1856 }
1857 else {
1858 if (not2 == 0) {
1859 return bbuf_clone(pbuf, bbuf2);
1860 }
1861 else {
1862 return not_code_range_buf(enc, bbuf2, pbuf);
1863 }
1864 }
1865 }
1866
1867 if (not1 != 0)
1868 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1869
1870 data1 = (OnigCodePoint* )(bbuf1->p);
1871 GET_CODE_POINT(n1, data1);
1872 data1++;
1873
1874 if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
1875 r = bbuf_clone(pbuf, bbuf2);
1876 }
1877 else if (not1 == 0) { /* 1 OR (not 2) */
1878 r = not_code_range_buf(enc, bbuf2, pbuf);
1879 }
1880 if (r != 0) return r;
1881
1882 for (i = 0; i < n1; i++) {
1883 from = data1[i*2];
1884 to = data1[i*2+1];
1885 r = add_code_range_to_buf(pbuf, from, to);
1886 if (r != 0) return r;
1887 }
1888 return 0;
1889 }
1890
1891 static int
and_code_range1(BBuf ** pbuf,OnigCodePoint from1,OnigCodePoint to1,OnigCodePoint * data,int n)1892 and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1,
1893 OnigCodePoint* data, int n)
1894 {
1895 int i, r;
1896 OnigCodePoint from2, to2;
1897
1898 for (i = 0; i < n; i++) {
1899 from2 = data[i*2];
1900 to2 = data[i*2+1];
1901 if (from2 < from1) {
1902 if (to2 < from1) continue;
1903 else {
1904 from1 = to2 + 1;
1905 }
1906 }
1907 else if (from2 <= to1) {
1908 if (to2 < to1) {
1909 if (from1 <= from2 - 1) {
1910 r = add_code_range_to_buf(pbuf, from1, from2-1);
1911 if (r != 0) return r;
1912 }
1913 from1 = to2 + 1;
1914 }
1915 else {
1916 to1 = from2 - 1;
1917 }
1918 }
1919 else {
1920 from1 = from2;
1921 }
1922 if (from1 > to1) break;
1923 }
1924 if (from1 <= to1) {
1925 r = add_code_range_to_buf(pbuf, from1, to1);
1926 if (r != 0) return r;
1927 }
1928 return 0;
1929 }
1930
1931 static int
and_code_range_buf(BBuf * bbuf1,int not1,BBuf * bbuf2,int not2,BBuf ** pbuf)1932 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
1933 {
1934 int r;
1935 OnigCodePoint i, j, n1, n2, *data1, *data2;
1936 OnigCodePoint from, to, from1, to1, from2, to2;
1937
1938 *pbuf = (BBuf* )NULL;
1939 if (IS_NULL(bbuf1)) {
1940 if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
1941 return bbuf_clone(pbuf, bbuf2);
1942 return 0;
1943 }
1944 else if (IS_NULL(bbuf2)) {
1945 if (not2 != 0)
1946 return bbuf_clone(pbuf, bbuf1);
1947 return 0;
1948 }
1949
1950 if (not1 != 0)
1951 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1952
1953 data1 = (OnigCodePoint* )(bbuf1->p);
1954 data2 = (OnigCodePoint* )(bbuf2->p);
1955 GET_CODE_POINT(n1, data1);
1956 GET_CODE_POINT(n2, data2);
1957 data1++;
1958 data2++;
1959
1960 if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
1961 for (i = 0; i < n1; i++) {
1962 from1 = data1[i*2];
1963 to1 = data1[i*2+1];
1964 for (j = 0; j < n2; j++) {
1965 from2 = data2[j*2];
1966 to2 = data2[j*2+1];
1967 if (from2 > to1) break;
1968 if (to2 < from1) continue;
1969 from = MAX(from1, from2);
1970 to = MIN(to1, to2);
1971 r = add_code_range_to_buf(pbuf, from, to);
1972 if (r != 0) return r;
1973 }
1974 }
1975 }
1976 else if (not1 == 0) { /* 1 AND (not 2) */
1977 for (i = 0; i < n1; i++) {
1978 from1 = data1[i*2];
1979 to1 = data1[i*2+1];
1980 r = and_code_range1(pbuf, from1, to1, data2, n2);
1981 if (r != 0) return r;
1982 }
1983 }
1984
1985 return 0;
1986 }
1987
1988 static int
and_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)1989 and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
1990 {
1991 int r, not1, not2;
1992 BBuf *buf1, *buf2, *pbuf;
1993 BitSetRef bsr1, bsr2;
1994 BitSet bs1, bs2;
1995
1996 not1 = IS_NCCLASS_NOT(dest);
1997 bsr1 = dest->bs;
1998 buf1 = dest->mbuf;
1999 not2 = IS_NCCLASS_NOT(cc);
2000 bsr2 = cc->bs;
2001 buf2 = cc->mbuf;
2002
2003 if (not1 != 0) {
2004 bitset_invert_to(bsr1, bs1);
2005 bsr1 = bs1;
2006 }
2007 if (not2 != 0) {
2008 bitset_invert_to(bsr2, bs2);
2009 bsr2 = bs2;
2010 }
2011 bitset_and(bsr1, bsr2);
2012 if (bsr1 != dest->bs) {
2013 bitset_copy(dest->bs, bsr1);
2014 bsr1 = dest->bs;
2015 }
2016 if (not1 != 0) {
2017 bitset_invert(dest->bs);
2018 }
2019
2020 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2021 if (not1 != 0 && not2 != 0) {
2022 r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
2023 }
2024 else {
2025 r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
2026 if (r == 0 && not1 != 0) {
2027 BBuf *tbuf;
2028 r = not_code_range_buf(enc, pbuf, &tbuf);
2029 if (r != 0) {
2030 bbuf_free(pbuf);
2031 return r;
2032 }
2033 bbuf_free(pbuf);
2034 pbuf = tbuf;
2035 }
2036 }
2037 if (r != 0) return r;
2038
2039 dest->mbuf = pbuf;
2040 bbuf_free(buf1);
2041 return r;
2042 }
2043 return 0;
2044 }
2045
2046 static int
or_cclass(CClassNode * dest,CClassNode * cc,OnigEncoding enc)2047 or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
2048 {
2049 int r, not1, not2;
2050 BBuf *buf1, *buf2, *pbuf;
2051 BitSetRef bsr1, bsr2;
2052 BitSet bs1, bs2;
2053
2054 not1 = IS_NCCLASS_NOT(dest);
2055 bsr1 = dest->bs;
2056 buf1 = dest->mbuf;
2057 not2 = IS_NCCLASS_NOT(cc);
2058 bsr2 = cc->bs;
2059 buf2 = cc->mbuf;
2060
2061 if (not1 != 0) {
2062 bitset_invert_to(bsr1, bs1);
2063 bsr1 = bs1;
2064 }
2065 if (not2 != 0) {
2066 bitset_invert_to(bsr2, bs2);
2067 bsr2 = bs2;
2068 }
2069 bitset_or(bsr1, bsr2);
2070 if (bsr1 != dest->bs) {
2071 bitset_copy(dest->bs, bsr1);
2072 bsr1 = dest->bs;
2073 }
2074 if (not1 != 0) {
2075 bitset_invert(dest->bs);
2076 }
2077
2078 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2079 if (not1 != 0 && not2 != 0) {
2080 r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
2081 }
2082 else {
2083 r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
2084 if (r == 0 && not1 != 0) {
2085 BBuf *tbuf;
2086 r = not_code_range_buf(enc, pbuf, &tbuf);
2087 if (r != 0) {
2088 bbuf_free(pbuf);
2089 return r;
2090 }
2091 bbuf_free(pbuf);
2092 pbuf = tbuf;
2093 }
2094 }
2095 if (r != 0) return r;
2096
2097 dest->mbuf = pbuf;
2098 bbuf_free(buf1);
2099 return r;
2100 }
2101 else
2102 return 0;
2103 }
2104
2105 static int
conv_backslash_value(int c,ScanEnv * env)2106 conv_backslash_value(int c, ScanEnv* env)
2107 {
2108 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
2109 switch (c) {
2110 case 'n': return '\n';
2111 case 't': return '\t';
2112 case 'r': return '\r';
2113 case 'f': return '\f';
2114 case 'a': return '\007';
2115 case 'b': return '\010';
2116 case 'e': return '\033';
2117 case 'v':
2118 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
2119 return '\v';
2120 break;
2121
2122 default:
2123 break;
2124 }
2125 }
2126 return c;
2127 }
2128
2129 static int
is_invalid_quantifier_target(Node * node)2130 is_invalid_quantifier_target(Node* node)
2131 {
2132 switch (NTYPE(node)) {
2133 case NT_ANCHOR:
2134 return 1;
2135 break;
2136
2137 case NT_ENCLOSE:
2138 /* allow enclosed elements */
2139 /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */
2140 break;
2141
2142 case NT_LIST:
2143 do {
2144 if (! is_invalid_quantifier_target(NCAR(node))) return 0;
2145 } while (IS_NOT_NULL(node = NCDR(node)));
2146 return 0;
2147 break;
2148
2149 case NT_ALT:
2150 do {
2151 if (is_invalid_quantifier_target(NCAR(node))) return 1;
2152 } while (IS_NOT_NULL(node = NCDR(node)));
2153 break;
2154
2155 default:
2156 break;
2157 }
2158 return 0;
2159 }
2160
2161 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
2162 static int
popular_quantifier_num(QtfrNode * q)2163 popular_quantifier_num(QtfrNode* q)
2164 {
2165 if (q->greedy) {
2166 if (q->lower == 0) {
2167 if (q->upper == 1) return 0;
2168 else if (IS_REPEAT_INFINITE(q->upper)) return 1;
2169 }
2170 else if (q->lower == 1) {
2171 if (IS_REPEAT_INFINITE(q->upper)) return 2;
2172 }
2173 }
2174 else {
2175 if (q->lower == 0) {
2176 if (q->upper == 1) return 3;
2177 else if (IS_REPEAT_INFINITE(q->upper)) return 4;
2178 }
2179 else if (q->lower == 1) {
2180 if (IS_REPEAT_INFINITE(q->upper)) return 5;
2181 }
2182 }
2183 return -1;
2184 }
2185
2186
/* How a quantifier nested directly inside another quantifier is reduced. */
enum ReduceType {
  RQ_ASIS  = 0,  /* as is */
  RQ_DEL   = 1,  /* delete parent */
  RQ_A     = 2,  /* to '*'    */
  RQ_AQ    = 3,  /* to '*?'   */
  RQ_QQ    = 4,  /* to '??'   */
  RQ_P_QQ  = 5,  /* to '+)??' */
  RQ_PQ_Q  = 6   /* to '+?)?' */
};
2196
2197 static enum ReduceType ReduceTypeTable[6][6] = {
2198 {RQ_DEL, RQ_A, RQ_A, RQ_QQ, RQ_AQ, RQ_ASIS}, /* '?' */
2199 {RQ_DEL, RQ_DEL, RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL}, /* '*' */
2200 {RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL}, /* '+' */
2201 {RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL, RQ_AQ, RQ_AQ}, /* '??' */
2202 {RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL}, /* '*?' */
2203 {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */
2204 };
2205
2206 extern void
onig_reduce_nested_quantifier(Node * pnode,Node * cnode)2207 onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
2208 {
2209 int pnum, cnum;
2210 QtfrNode *p, *c;
2211
2212 p = NQTFR(pnode);
2213 c = NQTFR(cnode);
2214 pnum = popular_quantifier_num(p);
2215 cnum = popular_quantifier_num(c);
2216 if (pnum < 0 || cnum < 0) return ;
2217
2218 switch(ReduceTypeTable[cnum][pnum]) {
2219 case RQ_DEL:
2220 CopyMem (pnode, cnode, sizeof (Node));
2221 break;
2222 case RQ_A:
2223 p->target = c->target;
2224 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1;
2225 break;
2226 case RQ_AQ:
2227 p->target = c->target;
2228 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0;
2229 break;
2230 case RQ_QQ:
2231 p->target = c->target;
2232 p->lower = 0; p->upper = 1; p->greedy = 0;
2233 break;
2234 case RQ_P_QQ:
2235 p->target = cnode;
2236 p->lower = 0; p->upper = 1; p->greedy = 0;
2237 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1;
2238 return ;
2239 break;
2240 case RQ_PQ_Q:
2241 p->target = cnode;
2242 p->lower = 0; p->upper = 1; p->greedy = 1;
2243 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0;
2244 return ;
2245 break;
2246 case RQ_ASIS:
2247 p->target = cnode;
2248 return ;
2249 break;
2250 }
2251
2252 c->target = NULL_NODE;
2253 onig_node_free(cnode);
2254 }
2255
2256
/* Token kinds produced by the pattern tokenizer. */
enum TokenSyms {
  TK_EOT               = 0,   /* end of token */
  TK_RAW_BYTE          = 1,
  TK_CHAR              = 2,
  TK_STRING            = 3,
  TK_CODE_POINT        = 4,
  TK_ANYCHAR           = 5,
  TK_CHAR_TYPE         = 6,
  TK_BACKREF           = 7,
  TK_CALL              = 8,
  TK_ANCHOR            = 9,
  TK_OP_REPEAT         = 10,
  TK_INTERVAL          = 11,
  TK_ANYCHAR_ANYTIME   = 12,  /* SQL '%' == .* */
  TK_ALT               = 13,
  TK_SUBEXP_OPEN       = 14,
  TK_SUBEXP_CLOSE      = 15,
  TK_CC_OPEN           = 16,
  TK_QUOTE_OPEN        = 17,
  TK_CHAR_PROPERTY     = 18,  /* \p{...}, \P{...} */
  /* in cc */
  TK_CC_CLOSE          = 19,
  TK_CC_RANGE          = 20,
  TK_POSIX_BRACKET_OPEN = 21,
  TK_CC_AND            = 22,  /* && */
  TK_CC_CC_OPEN        = 23   /* [ */
};
2284
2285 typedef struct {
2286 enum TokenSyms type;
2287 int escaped;
2288 int base; /* is number: 8, 16 (used in [....]) */
2289 UChar* backp;
2290 union {
2291 UChar* s;
2292 int c;
2293 OnigCodePoint code;
2294 int anchor;
2295 int subtype;
2296 struct {
2297 int lower;
2298 int upper;
2299 int greedy;
2300 int possessive;
2301 } repeat;
2302 struct {
2303 int num;
2304 int ref1;
2305 int* refs;
2306 int by_name;
2307 #ifdef USE_BACKREF_WITH_LEVEL
2308 int exist_level;
2309 int level; /* \k<name+n> */
2310 #endif
2311 } backref;
2312 struct {
2313 UChar* name;
2314 UChar* name_end;
2315 int gnum;
2316 } call;
2317 struct {
2318 int ctype;
2319 int not;
2320 } prop;
2321 } u;
2322 } OnigToken;
2323
2324
2325 static int
fetch_range_quantifier(UChar ** src,UChar * end,OnigToken * tok,ScanEnv * env)2326 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
2327 {
2328 int low, up, syn_allow, non_low = 0;
2329 int r = 0;
2330 OnigCodePoint c;
2331 OnigEncoding enc = env->enc;
2332 UChar* p = *src;
2333 PFETCH_READY;
2334
2335 syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
2336
2337 if (PEND) {
2338 if (syn_allow)
2339 return 1; /* "....{" : OK! */
2340 else
2341 return ONIGERR_END_PATTERN_AT_LEFT_BRACE; /* "....{" syntax error */
2342 }
2343
2344 if (! syn_allow) {
2345 c = PPEEK;
2346 if (c == ')' || c == '(' || c == '|') {
2347 return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
2348 }
2349 }
2350
2351 low = onig_scan_unsigned_number(&p, end, env->enc);
2352 if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2353 if (low > ONIG_MAX_REPEAT_NUM)
2354 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2355
2356 if (p == *src) { /* can't read low */
2357 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
2358 /* allow {,n} as {0,n} */
2359 low = 0;
2360 non_low = 1;
2361 }
2362 else
2363 goto invalid;
2364 }
2365
2366 if (PEND) goto invalid;
2367 PFETCH(c);
2368 if (c == ',') {
2369 UChar* prev = p;
2370 up = onig_scan_unsigned_number(&p, end, env->enc);
2371 if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2372 if (up > ONIG_MAX_REPEAT_NUM)
2373 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2374
2375 if (p == prev) {
2376 if (non_low != 0)
2377 goto invalid;
2378 up = REPEAT_INFINITE; /* {n,} : {n,infinite} */
2379 }
2380 }
2381 else {
2382 if (non_low != 0)
2383 goto invalid;
2384
2385 PUNFETCH;
2386 up = low; /* {n} : exact n times */
2387 r = 2; /* fixed */
2388 }
2389
2390 if (PEND) goto invalid;
2391 PFETCH(c);
2392 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
2393 if (c != MC_ESC(env->syntax)) goto invalid;
2394 PFETCH(c);
2395 }
2396 if (c != '}') goto invalid;
2397
2398 if (!IS_REPEAT_INFINITE(up) && low > up) {
2399 return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
2400 }
2401
2402 tok->type = TK_INTERVAL;
2403 tok->u.repeat.lower = low;
2404 tok->u.repeat.upper = up;
2405 *src = p;
2406 return r; /* 0: normal {n,m}, 2: fixed {n} */
2407
2408 invalid:
2409 if (syn_allow)
2410 return 1; /* OK */
2411 else
2412 return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
2413 }
2414
2415 /* \M-, \C-, \c, or \... */
2416 static int
fetch_escaped_value(UChar ** src,UChar * end,ScanEnv * env)2417 fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
2418 {
2419 int v;
2420 OnigCodePoint c;
2421 OnigEncoding enc = env->enc;
2422 UChar* p = *src;
2423
2424 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2425
2426 PFETCH_S(c);
2427 switch (c) {
2428 case 'M':
2429 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
2430 if (PEND) return ONIGERR_END_PATTERN_AT_META;
2431 PFETCH_S(c);
2432 if (c != '-') return ONIGERR_META_CODE_SYNTAX;
2433 if (PEND) return ONIGERR_END_PATTERN_AT_META;
2434 PFETCH_S(c);
2435 if (c == MC_ESC(env->syntax)) {
2436 v = fetch_escaped_value(&p, end, env);
2437 if (v < 0) return v;
2438 c = (OnigCodePoint )v;
2439 }
2440 c = ((c & 0xff) | 0x80);
2441 }
2442 else
2443 goto backslash;
2444 break;
2445
2446 case 'C':
2447 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
2448 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2449 PFETCH_S(c);
2450 if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
2451 goto control;
2452 }
2453 else
2454 goto backslash;
2455
2456 case 'c':
2457 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
2458 control:
2459 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2460 PFETCH_S(c);
2461 if (c == '?') {
2462 c = 0177;
2463 }
2464 else {
2465 if (c == MC_ESC(env->syntax)) {
2466 v = fetch_escaped_value(&p, end, env);
2467 if (v < 0) return v;
2468 c = (OnigCodePoint )v;
2469 }
2470 c &= 0x9f;
2471 }
2472 break;
2473 }
2474 /* fall through */
2475
2476 default:
2477 {
2478 backslash:
2479 c = conv_backslash_value(c, env);
2480 }
2481 break;
2482 }
2483
2484 *src = p;
2485 return c;
2486 }
2487
2488 static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
2489
2490 static OnigCodePoint
get_name_end_code_point(OnigCodePoint start)2491 get_name_end_code_point(OnigCodePoint start)
2492 {
2493 switch (start) {
2494 case '<': return (OnigCodePoint )'>'; break;
2495 case '\'': return (OnigCodePoint )'\''; break;
2496 default:
2497 break;
2498 }
2499
2500 return (OnigCodePoint )0;
2501 }
2502
2503 #ifdef USE_NAMED_GROUP
2504 #ifdef USE_BACKREF_WITH_LEVEL
2505 /*
2506 \k<name+n>, \k<name-n>
2507 \k<num+n>, \k<num-n>
2508 \k<-num+n>, \k<-num-n>
2509 */
2510 static int
fetch_name_with_level(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int * rlevel)2511 fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
2512 UChar** rname_end, ScanEnv* env,
2513 int* rback_num, int* rlevel)
2514 {
2515 int r, sign, is_num, exist_level;
2516 OnigCodePoint end_code;
2517 OnigCodePoint c = 0;
2518 OnigEncoding enc = env->enc;
2519 UChar *name_end;
2520 UChar *pnum_head;
2521 UChar *p = *src;
2522 PFETCH_READY;
2523
2524 *rback_num = 0;
2525 is_num = exist_level = 0;
2526 sign = 1;
2527 pnum_head = *src;
2528
2529 end_code = get_name_end_code_point(start_code);
2530
2531 name_end = end;
2532 r = 0;
2533 if (PEND) {
2534 return ONIGERR_EMPTY_GROUP_NAME;
2535 }
2536 else {
2537 PFETCH(c);
2538 if (c == end_code)
2539 return ONIGERR_EMPTY_GROUP_NAME;
2540
2541 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2542 is_num = 1;
2543 }
2544 else if (c == '-') {
2545 is_num = 2;
2546 sign = -1;
2547 pnum_head = p;
2548 }
2549 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2550 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2551 }
2552 }
2553
2554 while (!PEND) {
2555 name_end = p;
2556 PFETCH(c);
2557 if (c == end_code || c == ')' || c == '+' || c == '-') {
2558 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
2559 break;
2560 }
2561
2562 if (is_num != 0) {
2563 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2564 is_num = 1;
2565 }
2566 else {
2567 r = ONIGERR_INVALID_GROUP_NAME;
2568 is_num = 0;
2569 }
2570 }
2571 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2572 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2573 }
2574 }
2575
2576 if (r == 0 && c != end_code) {
2577 if (c == '+' || c == '-') {
2578 int level;
2579 int flag = (c == '-' ? -1 : 1);
2580
2581 PFETCH(c);
2582 if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
2583 PUNFETCH;
2584 level = onig_scan_unsigned_number(&p, end, enc);
2585 if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
2586 *rlevel = (level * flag);
2587 exist_level = 1;
2588
2589 PFETCH(c);
2590 if (c == end_code)
2591 goto end;
2592 }
2593
2594 err:
2595 r = ONIGERR_INVALID_GROUP_NAME;
2596 name_end = end;
2597 }
2598
2599 end:
2600 if (r == 0) {
2601 if (is_num != 0) {
2602 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2603 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2604 else if (*rback_num == 0) goto err;
2605
2606 *rback_num *= sign;
2607 }
2608
2609 *rname_end = name_end;
2610 *src = p;
2611 return (exist_level ? 1 : 0);
2612 }
2613 else {
2614 onig_scan_env_set_error_string(env, r, *src, name_end);
2615 return r;
2616 }
2617 }
2618 #endif /* USE_BACKREF_WITH_LEVEL */
2619
2620 /*
2621 def: 0 -> define name (don't allow number name)
2622 1 -> reference name (allow number name)
2623 */
2624 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2625 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2626 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2627 {
2628 int r, is_num, sign;
2629 OnigCodePoint end_code;
2630 OnigCodePoint c = 0;
2631 OnigEncoding enc = env->enc;
2632 UChar *name_end;
2633 UChar *pnum_head;
2634 UChar *p = *src;
2635
2636 *rback_num = 0;
2637
2638 end_code = get_name_end_code_point(start_code);
2639
2640 name_end = end;
2641 pnum_head = *src;
2642 r = 0;
2643 is_num = 0;
2644 sign = 1;
2645 if (PEND) {
2646 return ONIGERR_EMPTY_GROUP_NAME;
2647 }
2648 else {
2649 PFETCH_S(c);
2650 if (c == end_code)
2651 return ONIGERR_EMPTY_GROUP_NAME;
2652
2653 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2654 if (ref == 1)
2655 is_num = 1;
2656 else {
2657 r = ONIGERR_INVALID_GROUP_NAME;
2658 is_num = 0;
2659 }
2660 }
2661 else if (c == '-') {
2662 if (ref == 1) {
2663 is_num = 2;
2664 sign = -1;
2665 pnum_head = p;
2666 }
2667 else {
2668 r = ONIGERR_INVALID_GROUP_NAME;
2669 is_num = 0;
2670 }
2671 }
2672 else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2673 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2674 }
2675 }
2676
2677 if (r == 0) {
2678 while (!PEND) {
2679 name_end = p;
2680 PFETCH_S(c);
2681 if (c == end_code || c == ')') {
2682 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
2683 break;
2684 }
2685
2686 if (is_num != 0) {
2687 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2688 is_num = 1;
2689 }
2690 else {
2691 if (!ONIGENC_IS_CODE_WORD(enc, c))
2692 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2693 else
2694 r = ONIGERR_INVALID_GROUP_NAME;
2695 is_num = 0;
2696 }
2697 }
2698 else {
2699 if (!ONIGENC_IS_CODE_WORD(enc, c)) {
2700 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2701 }
2702 }
2703 }
2704
2705 if (c != end_code) {
2706 r = ONIGERR_INVALID_GROUP_NAME;
2707 name_end = end;
2708 }
2709
2710 if (is_num != 0) {
2711 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2712 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2713 else if (*rback_num == 0) {
2714 r = ONIGERR_INVALID_GROUP_NAME;
2715 goto err;
2716 }
2717
2718 *rback_num *= sign;
2719 }
2720
2721 *rname_end = name_end;
2722 *src = p;
2723 return 0;
2724 }
2725 else {
2726 while (!PEND) {
2727 name_end = p;
2728 PFETCH_S(c);
2729 if (c == end_code || c == ')')
2730 break;
2731 }
2732 if (PEND)
2733 name_end = end;
2734
2735 err:
2736 onig_scan_env_set_error_string(env, r, *src, name_end);
2737 return r;
2738 }
2739 }
2740 #else
2741 static int
fetch_name(OnigCodePoint start_code,UChar ** src,UChar * end,UChar ** rname_end,ScanEnv * env,int * rback_num,int ref)2742 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2743 UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2744 {
2745 int r, is_num, sign;
2746 OnigCodePoint end_code;
2747 OnigCodePoint c = 0;
2748 UChar *name_end;
2749 OnigEncoding enc = env->enc;
2750 UChar *pnum_head;
2751 UChar *p = *src;
2752 PFETCH_READY;
2753
2754 *rback_num = 0;
2755
2756 end_code = get_name_end_code_point(start_code);
2757
2758 *rname_end = name_end = end;
2759 r = 0;
2760 pnum_head = *src;
2761 is_num = 0;
2762 sign = 1;
2763
2764 if (PEND) {
2765 return ONIGERR_EMPTY_GROUP_NAME;
2766 }
2767 else {
2768 PFETCH(c);
2769 if (c == end_code)
2770 return ONIGERR_EMPTY_GROUP_NAME;
2771
2772 if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2773 is_num = 1;
2774 }
2775 else if (c == '-') {
2776 is_num = 2;
2777 sign = -1;
2778 pnum_head = p;
2779 }
2780 else {
2781 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2782 }
2783 }
2784
2785 while (!PEND) {
2786 name_end = p;
2787
2788 PFETCH(c);
2789 if (c == end_code || c == ')') break;
2790 if (! ONIGENC_IS_CODE_DIGIT(enc, c))
2791 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2792 }
2793 if (r == 0 && c != end_code) {
2794 r = ONIGERR_INVALID_GROUP_NAME;
2795 name_end = end;
2796 }
2797
2798 if (r == 0) {
2799 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2800 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2801 else if (*rback_num == 0) {
2802 r = ONIGERR_INVALID_GROUP_NAME;
2803 goto err;
2804 }
2805 *rback_num *= sign;
2806
2807 *rname_end = name_end;
2808 *src = p;
2809 return 0;
2810 }
2811 else {
2812 err:
2813 onig_scan_env_set_error_string(env, r, *src, name_end);
2814 return r;
2815 }
2816 }
2817 #endif /* USE_NAMED_GROUP */
2818
2819 static void
CC_ESC_WARN(ScanEnv * env,UChar * c)2820 CC_ESC_WARN(ScanEnv* env, UChar *c)
2821 {
2822 if (onig_warn == onig_null_warn) return ;
2823
2824 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
2825 IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
2826 UChar buf[WARN_BUFSIZE];
2827 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
2828 env->pattern, env->pattern_end,
2829 (UChar* )"character class has '%s' without escape", c);
2830 (*onig_warn)((char* )buf);
2831 }
2832 }
2833
2834 static void
CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv * env,UChar * c)2835 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
2836 {
2837 if (onig_warn == onig_null_warn) return ;
2838
2839 if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
2840 UChar buf[WARN_BUFSIZE];
2841 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc,
2842 (env)->pattern, (env)->pattern_end,
2843 (UChar* )"regular expression has '%s' without escape", c);
2844 (*onig_warn)((char* )buf);
2845 }
2846 }
2847
2848 static UChar*
find_str_position(OnigCodePoint s[],int n,UChar * from,UChar * to,UChar ** next,OnigEncoding enc)2849 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
2850 UChar **next, OnigEncoding enc)
2851 {
2852 int i;
2853 OnigCodePoint x;
2854 UChar *q;
2855 UChar *p = from;
2856
2857 while (p < to) {
2858 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2859 q = p + enclen(enc, p);
2860 if (x == s[0]) {
2861 for (i = 1; i < n && q < to; i++) {
2862 x = ONIGENC_MBC_TO_CODE(enc, q, to);
2863 if (x != s[i]) break;
2864 q += enclen(enc, q);
2865 }
2866 if (i >= n) {
2867 if (IS_NOT_NULL(next))
2868 *next = q;
2869 return p;
2870 }
2871 }
2872 p = q;
2873 }
2874 return NULL_UCHARP;
2875 }
2876
2877 static int
str_exist_check_with_esc(OnigCodePoint s[],int n,UChar * from,UChar * to,OnigCodePoint bad,OnigEncoding enc,OnigSyntaxType * syn)2878 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
2879 OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn)
2880 {
2881 int i, in_esc;
2882 OnigCodePoint x;
2883 UChar *q;
2884 UChar *p = from;
2885
2886 in_esc = 0;
2887 while (p < to) {
2888 if (in_esc) {
2889 in_esc = 0;
2890 p += enclen(enc, p);
2891 }
2892 else {
2893 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2894 q = p + enclen(enc, p);
2895 if (x == s[0]) {
2896 for (i = 1; i < n && q < to; i++) {
2897 x = ONIGENC_MBC_TO_CODE(enc, q, to);
2898 if (x != s[i]) break;
2899 q += enclen(enc, q);
2900 }
2901 if (i >= n) return 1;
2902 p += enclen(enc, p);
2903 }
2904 else {
2905 x = ONIGENC_MBC_TO_CODE(enc, p, to);
2906 if (x == bad) return 0;
2907 else if (x == MC_ESC(syn)) in_esc = 1;
2908 p = q;
2909 }
2910 }
2911 }
2912 return 0;
2913 }
2914
2915 static int
fetch_token_in_cc(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)2916 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2917 {
2918 int num;
2919 OnigCodePoint c, c2;
2920 OnigSyntaxType* syn = env->syntax;
2921 OnigEncoding enc = env->enc;
2922 UChar* prev;
2923 UChar* p = *src;
2924 PFETCH_READY;
2925
2926 if (PEND) {
2927 tok->type = TK_EOT;
2928 return tok->type;
2929 }
2930
2931 PFETCH(c);
2932 tok->type = TK_CHAR;
2933 tok->base = 0;
2934 tok->u.c = c;
2935 tok->escaped = 0;
2936
2937 if (c == ']') {
2938 tok->type = TK_CC_CLOSE;
2939 }
2940 else if (c == '-') {
2941 tok->type = TK_CC_RANGE;
2942 }
2943 else if (c == MC_ESC(syn)) {
2944 if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
2945 goto end;
2946
2947 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2948
2949 PFETCH(c);
2950 tok->escaped = 1;
2951 tok->u.c = c;
2952 switch (c) {
2953 case 'w':
2954 tok->type = TK_CHAR_TYPE;
2955 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2956 tok->u.prop.not = 0;
2957 break;
2958 case 'W':
2959 tok->type = TK_CHAR_TYPE;
2960 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
2961 tok->u.prop.not = 1;
2962 break;
2963 case 'd':
2964 tok->type = TK_CHAR_TYPE;
2965 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2966 tok->u.prop.not = 0;
2967 break;
2968 case 'D':
2969 tok->type = TK_CHAR_TYPE;
2970 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
2971 tok->u.prop.not = 1;
2972 break;
2973 case 's':
2974 tok->type = TK_CHAR_TYPE;
2975 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2976 tok->u.prop.not = 0;
2977 break;
2978 case 'S':
2979 tok->type = TK_CHAR_TYPE;
2980 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
2981 tok->u.prop.not = 1;
2982 break;
2983 case 'h':
2984 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2985 tok->type = TK_CHAR_TYPE;
2986 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2987 tok->u.prop.not = 0;
2988 break;
2989 case 'H':
2990 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
2991 tok->type = TK_CHAR_TYPE;
2992 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
2993 tok->u.prop.not = 1;
2994 break;
2995
2996 case 'p':
2997 case 'P':
2998 c2 = PPEEK;
2999 if (c2 == '{' &&
3000 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3001 PINC;
3002 tok->type = TK_CHAR_PROPERTY;
3003 tok->u.prop.not = (c == 'P' ? 1 : 0);
3004
3005 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3006 PFETCH(c2);
3007 if (c2 == '^') {
3008 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3009 }
3010 else
3011 PUNFETCH;
3012 }
3013 }
3014 break;
3015
3016 case 'x':
3017 if (PEND) break;
3018
3019 prev = p;
3020 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3021 PINC;
3022 num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3023 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3024 if (!PEND) {
3025 c2 = PPEEK;
3026 if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
3027 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3028 }
3029
3030 if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
3031 PINC;
3032 tok->type = TK_CODE_POINT;
3033 tok->base = 16;
3034 tok->u.code = (OnigCodePoint )num;
3035 }
3036 else {
3037 /* can't read nothing or invalid format */
3038 p = prev;
3039 }
3040 }
3041 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3042 num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3043 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3044 if (p == prev) { /* can't read nothing. */
3045 num = 0; /* but, it's not error */
3046 }
3047 tok->type = TK_RAW_BYTE;
3048 tok->base = 16;
3049 tok->u.c = num;
3050 }
3051 break;
3052
3053 case 'u':
3054 if (PEND) break;
3055
3056 prev = p;
3057 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3058 num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3059 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3060 if (p == prev) { /* can't read nothing. */
3061 num = 0; /* but, it's not error */
3062 }
3063 tok->type = TK_CODE_POINT;
3064 tok->base = 16;
3065 tok->u.code = (OnigCodePoint )num;
3066 }
3067 break;
3068
3069 case '0':
3070 case '1': case '2': case '3': case '4': case '5': case '6': case '7':
3071 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3072 PUNFETCH;
3073 prev = p;
3074 num = scan_unsigned_octal_number(&p, end, 3, enc);
3075 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3076 if (p == prev) { /* can't read nothing. */
3077 num = 0; /* but, it's not error */
3078 }
3079 tok->type = TK_RAW_BYTE;
3080 tok->base = 8;
3081 tok->u.c = num;
3082 }
3083 break;
3084
3085 default:
3086 PUNFETCH;
3087 num = fetch_escaped_value(&p, end, env);
3088 if (num < 0) return num;
3089 if (tok->u.c != num) {
3090 tok->u.code = (OnigCodePoint )num;
3091 tok->type = TK_CODE_POINT;
3092 }
3093 break;
3094 }
3095 }
3096 else if (c == '[') {
3097 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
3098 OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
3099 tok->backp = p; /* point at '[' is readed */
3100 PINC;
3101 if (str_exist_check_with_esc(send, 2, p, end,
3102 (OnigCodePoint )']', enc, syn)) {
3103 tok->type = TK_POSIX_BRACKET_OPEN;
3104 }
3105 else {
3106 PUNFETCH;
3107 goto cc_in_cc;
3108 }
3109 }
3110 else {
3111 cc_in_cc:
3112 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
3113 tok->type = TK_CC_CC_OPEN;
3114 }
3115 else {
3116 CC_ESC_WARN(env, (UChar* )"[");
3117 }
3118 }
3119 }
3120 else if (c == '&') {
3121 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
3122 !PEND && (PPEEK_IS('&'))) {
3123 PINC;
3124 tok->type = TK_CC_AND;
3125 }
3126 }
3127
3128 end:
3129 *src = p;
3130 return tok->type;
3131 }
3132
3133 static int
fetch_token(OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)3134 fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
3135 {
3136 int r, num;
3137 OnigCodePoint c;
3138 OnigEncoding enc = env->enc;
3139 OnigSyntaxType* syn = env->syntax;
3140 UChar* prev;
3141 UChar* p = *src;
3142 PFETCH_READY;
3143
3144 start:
3145 if (PEND) {
3146 tok->type = TK_EOT;
3147 return tok->type;
3148 }
3149
3150 tok->type = TK_STRING;
3151 tok->base = 0;
3152 tok->backp = p;
3153
3154 PFETCH(c);
3155 if (IS_MC_ESC_CODE(c, syn)) {
3156 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
3157
3158 tok->backp = p;
3159 PFETCH(c);
3160
3161 tok->u.c = c;
3162 tok->escaped = 1;
3163 switch (c) {
3164 case '*':
3165 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
3166 tok->type = TK_OP_REPEAT;
3167 tok->u.repeat.lower = 0;
3168 tok->u.repeat.upper = REPEAT_INFINITE;
3169 goto greedy_check;
3170 break;
3171
3172 case '+':
3173 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
3174 tok->type = TK_OP_REPEAT;
3175 tok->u.repeat.lower = 1;
3176 tok->u.repeat.upper = REPEAT_INFINITE;
3177 goto greedy_check;
3178 break;
3179
3180 case '?':
3181 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
3182 tok->type = TK_OP_REPEAT;
3183 tok->u.repeat.lower = 0;
3184 tok->u.repeat.upper = 1;
3185 greedy_check:
3186 if (!PEND && PPEEK_IS('?') &&
3187 IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
3188 PFETCH(c);
3189 tok->u.repeat.greedy = 0;
3190 tok->u.repeat.possessive = 0;
3191 }
3192 else {
3193 possessive_check:
3194 if (!PEND && PPEEK_IS('+') &&
3195 ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
3196 tok->type != TK_INTERVAL) ||
3197 (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
3198 tok->type == TK_INTERVAL))) {
3199 PFETCH(c);
3200 tok->u.repeat.greedy = 1;
3201 tok->u.repeat.possessive = 1;
3202 }
3203 else {
3204 tok->u.repeat.greedy = 1;
3205 tok->u.repeat.possessive = 0;
3206 }
3207 }
3208 break;
3209
3210 case '{':
3211 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
3212 r = fetch_range_quantifier(&p, end, tok, env);
3213 if (r < 0) return r; /* error */
3214 if (r == 0) goto greedy_check;
3215 else if (r == 2) { /* {n} */
3216 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3217 goto possessive_check;
3218
3219 goto greedy_check;
3220 }
3221 /* r == 1 : normal char */
3222 break;
3223
3224 case '|':
3225 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
3226 tok->type = TK_ALT;
3227 break;
3228
3229 case '(':
3230 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3231 tok->type = TK_SUBEXP_OPEN;
3232 break;
3233
3234 case ')':
3235 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3236 tok->type = TK_SUBEXP_CLOSE;
3237 break;
3238
3239 case 'w':
3240 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3241 tok->type = TK_CHAR_TYPE;
3242 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3243 tok->u.prop.not = 0;
3244 break;
3245
3246 case 'W':
3247 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3248 tok->type = TK_CHAR_TYPE;
3249 tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3250 tok->u.prop.not = 1;
3251 break;
3252
3253 case 'b':
3254 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3255 tok->type = TK_ANCHOR;
3256 tok->u.anchor = ANCHOR_WORD_BOUND;
3257 break;
3258
3259 case 'B':
3260 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3261 tok->type = TK_ANCHOR;
3262 tok->u.anchor = ANCHOR_NOT_WORD_BOUND;
3263 break;
3264
3265 #ifdef USE_WORD_BEGIN_END
3266 case '<':
3267 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3268 tok->type = TK_ANCHOR;
3269 tok->u.anchor = ANCHOR_WORD_BEGIN;
3270 break;
3271
3272 case '>':
3273 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3274 tok->type = TK_ANCHOR;
3275 tok->u.anchor = ANCHOR_WORD_END;
3276 break;
3277 #endif
3278
3279 case 's':
3280 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3281 tok->type = TK_CHAR_TYPE;
3282 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3283 tok->u.prop.not = 0;
3284 break;
3285
3286 case 'S':
3287 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3288 tok->type = TK_CHAR_TYPE;
3289 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3290 tok->u.prop.not = 1;
3291 break;
3292
3293 case 'd':
3294 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3295 tok->type = TK_CHAR_TYPE;
3296 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3297 tok->u.prop.not = 0;
3298 break;
3299
3300 case 'D':
3301 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3302 tok->type = TK_CHAR_TYPE;
3303 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3304 tok->u.prop.not = 1;
3305 break;
3306
3307 case 'h':
3308 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3309 tok->type = TK_CHAR_TYPE;
3310 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3311 tok->u.prop.not = 0;
3312 break;
3313
3314 case 'H':
3315 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3316 tok->type = TK_CHAR_TYPE;
3317 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3318 tok->u.prop.not = 1;
3319 break;
3320
3321 case 'A':
3322 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3323 begin_buf:
3324 tok->type = TK_ANCHOR;
3325 tok->u.subtype = ANCHOR_BEGIN_BUF;
3326 break;
3327
3328 case 'Z':
3329 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3330 tok->type = TK_ANCHOR;
3331 tok->u.subtype = ANCHOR_SEMI_END_BUF;
3332 break;
3333
3334 case 'z':
3335 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3336 end_buf:
3337 tok->type = TK_ANCHOR;
3338 tok->u.subtype = ANCHOR_END_BUF;
3339 break;
3340
3341 case 'G':
3342 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
3343 tok->type = TK_ANCHOR;
3344 tok->u.subtype = ANCHOR_BEGIN_POSITION;
3345 break;
3346
3347 case '`':
3348 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3349 goto begin_buf;
3350 break;
3351
3352 case '\'':
3353 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3354 goto end_buf;
3355 break;
3356
3357 case 'x':
3358 if (PEND) break;
3359
3360 prev = p;
3361 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3362 PINC;
3363 num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
3364 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3365 if (!PEND) {
3366 if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
3367 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3368 }
3369
3370 if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) {
3371 PINC;
3372 tok->type = TK_CODE_POINT;
3373 tok->u.code = (OnigCodePoint )num;
3374 }
3375 else {
3376 /* can't read nothing or invalid format */
3377 p = prev;
3378 }
3379 }
3380 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3381 num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
3382 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3383 if (p == prev) { /* can't read nothing. */
3384 num = 0; /* but, it's not error */
3385 }
3386 tok->type = TK_RAW_BYTE;
3387 tok->base = 16;
3388 tok->u.c = num;
3389 }
3390 break;
3391
3392 case 'u':
3393 if (PEND) break;
3394
3395 prev = p;
3396 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3397 num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
3398 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3399 if (p == prev) { /* can't read nothing. */
3400 num = 0; /* but, it's not error */
3401 }
3402 tok->type = TK_CODE_POINT;
3403 tok->base = 16;
3404 tok->u.code = (OnigCodePoint )num;
3405 }
3406 break;
3407
3408 case '1': case '2': case '3': case '4':
3409 case '5': case '6': case '7': case '8': case '9':
3410 PUNFETCH;
3411 prev = p;
3412 num = onig_scan_unsigned_number(&p, end, enc);
3413 if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
3414 goto skip_backref;
3415 }
3416
3417 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
3418 (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */
3419 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3420 if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
3421 return ONIGERR_INVALID_BACKREF;
3422 }
3423
3424 tok->type = TK_BACKREF;
3425 tok->u.backref.num = 1;
3426 tok->u.backref.ref1 = num;
3427 tok->u.backref.by_name = 0;
3428 #ifdef USE_BACKREF_WITH_LEVEL
3429 tok->u.backref.exist_level = 0;
3430 #endif
3431 break;
3432 }
3433
3434 skip_backref:
3435 if (c == '8' || c == '9') {
3436 /* normal char */
3437 p = prev; PINC;
3438 break;
3439 }
3440
3441 p = prev;
3442 /* fall through */
3443 case '0':
3444 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3445 prev = p;
3446 num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
3447 if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3448 if (p == prev) { /* can't read nothing. */
3449 num = 0; /* but, it's not error */
3450 }
3451 tok->type = TK_RAW_BYTE;
3452 tok->base = 8;
3453 tok->u.c = num;
3454 }
3455 else if (c != '0') {
3456 PINC;
3457 }
3458 break;
3459
3460 #ifdef USE_NAMED_GROUP
3461 case 'k':
3462 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
3463 PFETCH(c);
3464 if (c == '<' || c == '\'') {
3465 UChar* name_end;
3466 int* backs;
3467 int back_num;
3468
3469 prev = p;
3470
3471 #ifdef USE_BACKREF_WITH_LEVEL
3472 name_end = NULL_UCHARP; /* no need. escape gcc warning. */
3473 r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end,
3474 env, &back_num, &tok->u.backref.level);
3475 if (r == 1) tok->u.backref.exist_level = 1;
3476 else tok->u.backref.exist_level = 0;
3477 #else
3478 r = fetch_name(&p, end, &name_end, env, &back_num, 1);
3479 #endif
3480 if (r < 0) return r;
3481
3482 if (back_num != 0) {
3483 if (back_num < 0) {
3484 back_num = BACKREF_REL_TO_ABS(back_num, env);
3485 if (back_num <= 0)
3486 return ONIGERR_INVALID_BACKREF;
3487 }
3488
3489 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3490 if (back_num > env->num_mem ||
3491 IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
3492 return ONIGERR_INVALID_BACKREF;
3493 }
3494 tok->type = TK_BACKREF;
3495 tok->u.backref.by_name = 0;
3496 tok->u.backref.num = 1;
3497 tok->u.backref.ref1 = back_num;
3498 }
3499 else {
3500 num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
3501 if (num <= 0) {
3502 onig_scan_env_set_error_string(env,
3503 ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
3504 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3505 }
3506 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3507 int i;
3508 for (i = 0; i < num; i++) {
3509 if (backs[i] > env->num_mem ||
3510 IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
3511 return ONIGERR_INVALID_BACKREF;
3512 }
3513 }
3514
3515 tok->type = TK_BACKREF;
3516 tok->u.backref.by_name = 1;
3517 if (num == 1) {
3518 tok->u.backref.num = 1;
3519 tok->u.backref.ref1 = backs[0];
3520 }
3521 else {
3522 tok->u.backref.num = num;
3523 tok->u.backref.refs = backs;
3524 }
3525 }
3526 }
3527 else
3528 PUNFETCH;
3529 }
3530 break;
3531 #endif
3532
3533 #ifdef USE_SUBEXP_CALL
3534 case 'g':
3535 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
3536 PFETCH(c);
3537 if (c == '<' || c == '\'') {
3538 int gnum;
3539 UChar* name_end;
3540
3541 prev = p;
3542 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
3543 if (r < 0) return r;
3544
3545 tok->type = TK_CALL;
3546 tok->u.call.name = prev;
3547 tok->u.call.name_end = name_end;
3548 tok->u.call.gnum = gnum;
3549 }
3550 else
3551 PUNFETCH;
3552 }
3553 break;
3554 #endif
3555
3556 case 'Q':
3557 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
3558 tok->type = TK_QUOTE_OPEN;
3559 }
3560 break;
3561
3562 case 'p':
3563 case 'P':
3564 if (PPEEK_IS('{') &&
3565 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3566 PINC;
3567 tok->type = TK_CHAR_PROPERTY;
3568 tok->u.prop.not = (c == 'P' ? 1 : 0);
3569
3570 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3571 PFETCH(c);
3572 if (c == '^') {
3573 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3574 }
3575 else
3576 PUNFETCH;
3577 }
3578 }
3579 break;
3580
3581 default:
3582 PUNFETCH;
3583 num = fetch_escaped_value(&p, end, env);
3584 if (num < 0) return num;
3585 /* set_raw: */
3586 if (tok->u.c != num) {
3587 tok->type = TK_CODE_POINT;
3588 tok->u.code = (OnigCodePoint )num;
3589 }
3590 else { /* string */
3591 p = tok->backp + enclen(enc, tok->backp);
3592 }
3593 break;
3594 }
3595 }
3596 else {
3597 tok->u.c = c;
3598 tok->escaped = 0;
3599
3600 #ifdef USE_VARIABLE_META_CHARS
3601 if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
3602 IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
3603 if (c == MC_ANYCHAR(syn))
3604 goto any_char;
3605 else if (c == MC_ANYTIME(syn))
3606 goto anytime;
3607 else if (c == MC_ZERO_OR_ONE_TIME(syn))
3608 goto zero_or_one_time;
3609 else if (c == MC_ONE_OR_MORE_TIME(syn))
3610 goto one_or_more_time;
3611 else if (c == MC_ANYCHAR_ANYTIME(syn)) {
3612 tok->type = TK_ANYCHAR_ANYTIME;
3613 goto out;
3614 }
3615 }
3616 #endif
3617
3618 switch (c) {
3619 case '.':
3620 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
3621 #ifdef USE_VARIABLE_META_CHARS
3622 any_char:
3623 #endif
3624 tok->type = TK_ANYCHAR;
3625 break;
3626
3627 case '*':
3628 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
3629 #ifdef USE_VARIABLE_META_CHARS
3630 anytime:
3631 #endif
3632 tok->type = TK_OP_REPEAT;
3633 tok->u.repeat.lower = 0;
3634 tok->u.repeat.upper = REPEAT_INFINITE;
3635 goto greedy_check;
3636 break;
3637
3638 case '+':
3639 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
3640 #ifdef USE_VARIABLE_META_CHARS
3641 one_or_more_time:
3642 #endif
3643 tok->type = TK_OP_REPEAT;
3644 tok->u.repeat.lower = 1;
3645 tok->u.repeat.upper = REPEAT_INFINITE;
3646 goto greedy_check;
3647 break;
3648
3649 case '?':
3650 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
3651 #ifdef USE_VARIABLE_META_CHARS
3652 zero_or_one_time:
3653 #endif
3654 tok->type = TK_OP_REPEAT;
3655 tok->u.repeat.lower = 0;
3656 tok->u.repeat.upper = 1;
3657 goto greedy_check;
3658 break;
3659
3660 case '{':
3661 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
3662 r = fetch_range_quantifier(&p, end, tok, env);
3663 if (r < 0) return r; /* error */
3664 if (r == 0) goto greedy_check;
3665 else if (r == 2) { /* {n} */
3666 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3667 goto possessive_check;
3668
3669 goto greedy_check;
3670 }
3671 /* r == 1 : normal char */
3672 break;
3673
3674 case '|':
3675 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
3676 tok->type = TK_ALT;
3677 break;
3678
3679 case '(':
3680 if (PPEEK_IS('?') &&
3681 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
3682 PINC;
3683 if (PPEEK_IS('#')) {
3684 PFETCH(c);
3685 while (1) {
3686 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
3687 PFETCH(c);
3688 if (c == MC_ESC(syn)) {
3689 if (!PEND) PFETCH(c);
3690 }
3691 else {
3692 if (c == ')') break;
3693 }
3694 }
3695 goto start;
3696 }
3697 PUNFETCH;
3698 }
3699
3700 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3701 tok->type = TK_SUBEXP_OPEN;
3702 break;
3703
3704 case ')':
3705 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3706 tok->type = TK_SUBEXP_CLOSE;
3707 break;
3708
3709 case '^':
3710 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3711 tok->type = TK_ANCHOR;
3712 tok->u.subtype = (IS_SINGLELINE(env->option)
3713 ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
3714 break;
3715
3716 case '$':
3717 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
3718 tok->type = TK_ANCHOR;
3719 tok->u.subtype = (IS_SINGLELINE(env->option)
3720 ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
3721 break;
3722
3723 case '[':
3724 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
3725 tok->type = TK_CC_OPEN;
3726 break;
3727
3728 case ']':
3729 if (*src > env->pattern) /* /].../ is allowed. */
3730 CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
3731 break;
3732
3733 case '#':
3734 if (IS_EXTEND(env->option)) {
3735 while (!PEND) {
3736 PFETCH(c);
3737 if (ONIGENC_IS_CODE_NEWLINE(enc, c))
3738 break;
3739 }
3740 goto start;
3741 break;
3742 }
3743 break;
3744
3745 case ' ': case '\t': case '\n': case '\r': case '\f':
3746 if (IS_EXTEND(env->option))
3747 goto start;
3748 break;
3749
3750 default:
3751 /* string */
3752 break;
3753 }
3754 }
3755
3756 #ifdef USE_VARIABLE_META_CHARS
3757 out:
3758 #endif
3759 *src = p;
3760 return tok->type;
3761 }
3762
3763 static int
add_ctype_to_cc_by_range(CClassNode * cc,int ctype ARG_UNUSED,int not,OnigEncoding enc ARG_UNUSED,OnigCodePoint sb_out,const OnigCodePoint mbr[])3764 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
3765 OnigEncoding enc ARG_UNUSED,
3766 OnigCodePoint sb_out, const OnigCodePoint mbr[])
3767 {
3768 int i, r;
3769 OnigCodePoint j;
3770
3771 int n = ONIGENC_CODE_RANGE_NUM(mbr);
3772
3773 if (not == 0) {
3774 for (i = 0; i < n; i++) {
3775 for (j = ONIGENC_CODE_RANGE_FROM(mbr, i);
3776 j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
3777 if (j >= sb_out) {
3778 if (j == ONIGENC_CODE_RANGE_TO(mbr, i)) i++;
3779 else if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3780 r = add_code_range_to_buf(&(cc->mbuf), j,
3781 ONIGENC_CODE_RANGE_TO(mbr, i));
3782 if (r != 0) return r;
3783 i++;
3784 }
3785
3786 goto sb_end;
3787 }
3788 BITSET_SET_BIT(cc->bs, j);
3789 }
3790 }
3791
3792 sb_end:
3793 for ( ; i < n; i++) {
3794 r = add_code_range_to_buf(&(cc->mbuf),
3795 ONIGENC_CODE_RANGE_FROM(mbr, i),
3796 ONIGENC_CODE_RANGE_TO(mbr, i));
3797 if (r != 0) return r;
3798 }
3799 }
3800 else {
3801 OnigCodePoint prev = 0;
3802
3803 for (i = 0; i < n; i++) {
3804 for (j = prev;
3805 j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
3806 if (j >= sb_out) {
3807 goto sb_end2;
3808 }
3809 BITSET_SET_BIT(cc->bs, j);
3810 }
3811 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3812 }
3813 for (j = prev; j < sb_out; j++) {
3814 BITSET_SET_BIT(cc->bs, j);
3815 }
3816
3817 sb_end2:
3818 prev = sb_out;
3819
3820 for (i = 0; i < n; i++) {
3821 if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
3822 r = add_code_range_to_buf(&(cc->mbuf), prev,
3823 ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
3824 if (r != 0) return r;
3825 }
3826 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
3827 }
3828 if (prev < 0x7fffffff) {
3829 r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff);
3830 if (r != 0) return r;
3831 }
3832 }
3833
3834 return 0;
3835 }
3836
3837 static int
add_ctype_to_cc(CClassNode * cc,int ctype,int not,ScanEnv * env)3838 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
3839 {
3840 int c, r;
3841 const OnigCodePoint *ranges;
3842 OnigCodePoint sb_out;
3843 OnigEncoding enc = env->enc;
3844
3845 r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
3846 if (r == 0) {
3847 return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges);
3848 }
3849 else if (r != ONIG_NO_SUPPORT_CONFIG) {
3850 return r;
3851 }
3852
3853 r = 0;
3854 switch (ctype) {
3855 case ONIGENC_CTYPE_ALPHA:
3856 case ONIGENC_CTYPE_BLANK:
3857 case ONIGENC_CTYPE_CNTRL:
3858 case ONIGENC_CTYPE_DIGIT:
3859 case ONIGENC_CTYPE_LOWER:
3860 case ONIGENC_CTYPE_PUNCT:
3861 case ONIGENC_CTYPE_SPACE:
3862 case ONIGENC_CTYPE_UPPER:
3863 case ONIGENC_CTYPE_XDIGIT:
3864 case ONIGENC_CTYPE_ASCII:
3865 case ONIGENC_CTYPE_ALNUM:
3866 if (not != 0) {
3867 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3868 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3869 BITSET_SET_BIT(cc->bs, c);
3870 }
3871 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3872 }
3873 else {
3874 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3875 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3876 BITSET_SET_BIT(cc->bs, c);
3877 }
3878 }
3879 break;
3880
3881 case ONIGENC_CTYPE_GRAPH:
3882 case ONIGENC_CTYPE_PRINT:
3883 if (not != 0) {
3884 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3885 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3886 BITSET_SET_BIT(cc->bs, c);
3887 }
3888 }
3889 else {
3890 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3891 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
3892 BITSET_SET_BIT(cc->bs, c);
3893 }
3894 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3895 }
3896 break;
3897
3898 case ONIGENC_CTYPE_WORD:
3899 if (not == 0) {
3900 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3901 if (IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c);
3902 }
3903 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
3904 }
3905 else {
3906 for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
3907 if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */
3908 && ! ONIGENC_IS_CODE_WORD(enc, c))
3909 BITSET_SET_BIT(cc->bs, c);
3910 }
3911 }
3912 break;
3913
3914 default:
3915 return ONIGERR_PARSER_BUG;
3916 break;
3917 }
3918
3919 return r;
3920 }
3921
3922 static int
parse_posix_bracket(CClassNode * cc,UChar ** src,UChar * end,ScanEnv * env)3923 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
3924 {
3925 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20
3926 #define POSIX_BRACKET_NAME_MIN_LEN 4
3927
3928 static PosixBracketEntryType PBS[] = {
3929 { (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 },
3930 { (UChar* )"alpha", ONIGENC_CTYPE_ALPHA, 5 },
3931 { (UChar* )"blank", ONIGENC_CTYPE_BLANK, 5 },
3932 { (UChar* )"cntrl", ONIGENC_CTYPE_CNTRL, 5 },
3933 { (UChar* )"digit", ONIGENC_CTYPE_DIGIT, 5 },
3934 { (UChar* )"graph", ONIGENC_CTYPE_GRAPH, 5 },
3935 { (UChar* )"lower", ONIGENC_CTYPE_LOWER, 5 },
3936 { (UChar* )"print", ONIGENC_CTYPE_PRINT, 5 },
3937 { (UChar* )"punct", ONIGENC_CTYPE_PUNCT, 5 },
3938 { (UChar* )"space", ONIGENC_CTYPE_SPACE, 5 },
3939 { (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 },
3940 { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 },
3941 { (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 },
3942 { (UChar* )"word", ONIGENC_CTYPE_WORD, 4 },
3943 { (UChar* )NULL, -1, 0 }
3944 };
3945
3946 PosixBracketEntryType *pb;
3947 int not, i, r;
3948 OnigCodePoint c;
3949 OnigEncoding enc = env->enc;
3950 UChar *p = *src;
3951
3952 if (PPEEK_IS('^')) {
3953 PINC_S;
3954 not = 1;
3955 }
3956 else
3957 not = 0;
3958
3959 if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
3960 goto not_posix_bracket;
3961
3962 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
3963 if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
3964 p = (UChar* )onigenc_step(enc, p, end, pb->len);
3965 if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
3966 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3967
3968 r = add_ctype_to_cc(cc, pb->ctype, not, env);
3969 if (r != 0) return r;
3970
3971 PINC_S; PINC_S;
3972 *src = p;
3973 return 0;
3974 }
3975 }
3976
3977 not_posix_bracket:
3978 c = 0;
3979 i = 0;
3980 while (!PEND && ((c = PPEEK) != ':') && c != ']') {
3981 PINC_S;
3982 if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
3983 }
3984 if (c == ':' && ! PEND) {
3985 PINC_S;
3986 if (! PEND) {
3987 PFETCH_S(c);
3988 if (c == ']')
3989 return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
3990 }
3991 }
3992
3993 return 1; /* 1: is not POSIX bracket, but no error. */
3994 }
3995
3996 static int
fetch_char_property_to_ctype(UChar ** src,UChar * end,ScanEnv * env)3997 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
3998 {
3999 int r;
4000 OnigCodePoint c;
4001 OnigEncoding enc = env->enc;
4002 UChar *prev, *start, *p = *src;
4003
4004 r = 0;
4005 start = prev = p;
4006
4007 while (!PEND) {
4008 prev = p;
4009 PFETCH_S(c);
4010 if (c == '}') {
4011 r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
4012 if (r < 0) break;
4013
4014 *src = p;
4015 return r;
4016 }
4017 else if (c == '(' || c == ')' || c == '{' || c == '|') {
4018 r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
4019 break;
4020 }
4021 }
4022
4023 onig_scan_env_set_error_string(env, r, *src, prev);
4024 return r;
4025 }
4026
4027 static int
parse_char_property(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4028 parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
4029 ScanEnv* env)
4030 {
4031 int r, ctype;
4032 CClassNode* cc;
4033
4034 ctype = fetch_char_property_to_ctype(src, end, env);
4035 if (ctype < 0) return ctype;
4036
4037 *np = node_new_cclass();
4038 CHECK_NULL_RETURN_MEMERR(*np);
4039 cc = NCCLASS(*np);
4040 r = add_ctype_to_cc(cc, ctype, 0, env);
4041 if (r != 0) return r;
4042 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
4043
4044 return 0;
4045 }
4046
4047
/* Parser state while scanning the inside of a character class. */
enum CCSTATE {
  CCS_VALUE    = 0,  /* a value or class has been read (a value is pending) */
  CCS_RANGE    = 1,  /* a range operator was read after a value */
  CCS_COMPLETE = 2,  /* a range has just been completed */
  CCS_START    = 3   /* nothing read yet (class start, or just after &&) */
};
4054
/* Kind of the most recently read element of a character class. */
enum CCVALTYPE {
  CCV_SB         = 0,  /* single-byte value, recorded in the bitset */
  CCV_CODE_POINT = 1,  /* code point, recorded in the multi-byte range buffer */
  CCV_CLASS      = 2   /* a character type / class, not a single value */
};
4060
4061 static int
next_state_class(CClassNode * cc,OnigCodePoint * vs,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)4062 next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type,
4063 enum CCSTATE* state, ScanEnv* env)
4064 {
4065 int r;
4066
4067 if (*state == CCS_RANGE)
4068 return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
4069
4070 if (*state == CCS_VALUE && *type != CCV_CLASS) {
4071 if (*type == CCV_SB)
4072 BITSET_SET_BIT(cc->bs, (int )(*vs));
4073 else if (*type == CCV_CODE_POINT) {
4074 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4075 if (r < 0) return r;
4076 }
4077 }
4078
4079 *state = CCS_VALUE;
4080 *type = CCV_CLASS;
4081 return 0;
4082 }
4083
4084 static int
next_state_val(CClassNode * cc,OnigCodePoint * vs,OnigCodePoint v,int * vs_israw,int v_israw,enum CCVALTYPE intype,enum CCVALTYPE * type,enum CCSTATE * state,ScanEnv * env)4085 next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
4086 int* vs_israw, int v_israw,
4087 enum CCVALTYPE intype, enum CCVALTYPE* type,
4088 enum CCSTATE* state, ScanEnv* env)
4089 {
4090 int r;
4091
4092 switch (*state) {
4093 case CCS_VALUE:
4094 if (*type == CCV_SB)
4095 BITSET_SET_BIT(cc->bs, (int )(*vs));
4096 else if (*type == CCV_CODE_POINT) {
4097 r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4098 if (r < 0) return r;
4099 }
4100 break;
4101
4102 case CCS_RANGE:
4103 if (intype == *type) {
4104 if (intype == CCV_SB) {
4105 if (*vs > 0xff || v > 0xff)
4106 return ONIGERR_INVALID_CODE_POINT_VALUE;
4107
4108 if (*vs > v) {
4109 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4110 goto ccs_range_end;
4111 else
4112 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4113 }
4114 bitset_set_range(cc->bs, (int )*vs, (int )v);
4115 }
4116 else {
4117 r = add_code_range(&(cc->mbuf), env, *vs, v);
4118 if (r < 0) return r;
4119 }
4120 }
4121 else {
4122 #if 0
4123 if (intype == CCV_CODE_POINT && *type == CCV_SB) {
4124 #endif
4125 if (*vs > v) {
4126 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4127 goto ccs_range_end;
4128 else
4129 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4130 }
4131 bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
4132 r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
4133 if (r < 0) return r;
4134 #if 0
4135 }
4136 else
4137 return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
4138 #endif
4139 }
4140 ccs_range_end:
4141 *state = CCS_COMPLETE;
4142 break;
4143
4144 case CCS_COMPLETE:
4145 case CCS_START:
4146 *state = CCS_VALUE;
4147 break;
4148
4149 default:
4150 break;
4151 }
4152
4153 *vs_israw = v_israw;
4154 *vs = v;
4155 *type = intype;
4156 return 0;
4157 }
4158
4159 static int
code_exist_check(OnigCodePoint c,UChar * from,UChar * end,int ignore_escaped,ScanEnv * env)4160 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
4161 ScanEnv* env)
4162 {
4163 int in_esc;
4164 OnigCodePoint code;
4165 OnigEncoding enc = env->enc;
4166 UChar* p = from;
4167
4168 in_esc = 0;
4169 while (! PEND) {
4170 if (ignore_escaped && in_esc) {
4171 in_esc = 0;
4172 }
4173 else {
4174 PFETCH_S(code);
4175 if (code == c) return 1;
4176 if (code == MC_ESC(env->syntax)) in_esc = 1;
4177 }
4178 }
4179 return 0;
4180 }
4181
4182 static int
parse_char_class(Node ** np,OnigToken * tok,UChar ** src,UChar * end,ScanEnv * env)4183 parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
4184 ScanEnv* env)
4185 {
4186 int r, neg, len, fetched, and_start;
4187 OnigCodePoint v, vs;
4188 UChar *p;
4189 Node* node;
4190 CClassNode *cc, *prev_cc;
4191 CClassNode work_cc;
4192
4193 enum CCSTATE state;
4194 enum CCVALTYPE val_type, in_type;
4195 int val_israw, in_israw;
4196
4197 prev_cc = (CClassNode* )NULL;
4198 *np = NULL_NODE;
4199 r = fetch_token_in_cc(tok, src, end, env);
4200 if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
4201 neg = 1;
4202 r = fetch_token_in_cc(tok, src, end, env);
4203 }
4204 else {
4205 neg = 0;
4206 }
4207
4208 if (r < 0) return r;
4209 if (r == TK_CC_CLOSE) {
4210 if (! code_exist_check((OnigCodePoint )']',
4211 *src, env->pattern_end, 1, env))
4212 return ONIGERR_EMPTY_CHAR_CLASS;
4213
4214 CC_ESC_WARN(env, (UChar* )"]");
4215 r = tok->type = TK_CHAR; /* allow []...] */
4216 }
4217
4218 *np = node = node_new_cclass();
4219 CHECK_NULL_RETURN_MEMERR(node);
4220 cc = NCCLASS(node);
4221
4222 and_start = 0;
4223 state = CCS_START;
4224 p = *src;
4225 while (r != TK_CC_CLOSE) {
4226 fetched = 0;
4227 switch (r) {
4228 case TK_CHAR:
4229 len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c);
4230 if (len > 1) {
4231 in_type = CCV_CODE_POINT;
4232 }
4233 else if (len < 0) {
4234 r = len;
4235 goto err;
4236 }
4237 else {
4238 sb_char:
4239 in_type = CCV_SB;
4240 }
4241 v = (OnigCodePoint )tok->u.c;
4242 in_israw = 0;
4243 goto val_entry2;
4244 break;
4245
4246 case TK_RAW_BYTE:
4247 /* tok->base != 0 : octal or hexadec. */
4248 if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
4249 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4250 UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
4251 UChar* psave = p;
4252 int i, base = tok->base;
4253
4254 buf[0] = (UChar)tok->u.c;
4255 for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
4256 r = fetch_token_in_cc(tok, &p, end, env);
4257 if (r < 0) goto err;
4258 if (r != TK_RAW_BYTE || tok->base != base) {
4259 fetched = 1;
4260 break;
4261 }
4262 buf[i] = (UChar)tok->u.c;
4263 }
4264
4265 if (i < ONIGENC_MBC_MINLEN(env->enc)) {
4266 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4267 goto err;
4268 }
4269
4270 len = enclen(env->enc, buf);
4271 if (i < len) {
4272 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4273 goto err;
4274 }
4275 else if (i > len) { /* fetch back */
4276 p = psave;
4277 for (i = 1; i < len; i++) {
4278 r = fetch_token_in_cc(tok, &p, end, env);
4279 }
4280 fetched = 0;
4281 }
4282
4283 if (i == 1) {
4284 v = (OnigCodePoint )buf[0];
4285 goto raw_single;
4286 }
4287 else {
4288 v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
4289 in_type = CCV_CODE_POINT;
4290 }
4291 }
4292 else {
4293 v = (OnigCodePoint )tok->u.c;
4294 raw_single:
4295 in_type = CCV_SB;
4296 }
4297 in_israw = 1;
4298 goto val_entry2;
4299 break;
4300
4301 case TK_CODE_POINT:
4302 v = tok->u.code;
4303 in_israw = 1;
4304 val_entry:
4305 len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
4306 if (len < 0) {
4307 r = len;
4308 goto err;
4309 }
4310 in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
4311 val_entry2:
4312 r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
4313 &state, env);
4314 if (r != 0) goto err;
4315 break;
4316
4317 case TK_POSIX_BRACKET_OPEN:
4318 r = parse_posix_bracket(cc, &p, end, env);
4319 if (r < 0) goto err;
4320 if (r == 1) { /* is not POSIX bracket */
4321 CC_ESC_WARN(env, (UChar* )"[");
4322 p = tok->backp;
4323 v = (OnigCodePoint )tok->u.c;
4324 in_israw = 0;
4325 goto val_entry;
4326 }
4327 goto next_class;
4328 break;
4329
4330 case TK_CHAR_TYPE:
4331 r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env);
4332 if (r != 0) return r;
4333
4334 next_class:
4335 r = next_state_class(cc, &vs, &val_type, &state, env);
4336 if (r != 0) goto err;
4337 break;
4338
4339 case TK_CHAR_PROPERTY:
4340 {
4341 int ctype;
4342
4343 ctype = fetch_char_property_to_ctype(&p, end, env);
4344 if (ctype < 0) return ctype;
4345 r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env);
4346 if (r != 0) return r;
4347 goto next_class;
4348 }
4349 break;
4350
4351 case TK_CC_RANGE:
4352 if (state == CCS_VALUE) {
4353 r = fetch_token_in_cc(tok, &p, end, env);
4354 if (r < 0) goto err;
4355 fetched = 1;
4356 if (r == TK_CC_CLOSE) { /* allow [x-] */
4357 range_end_val:
4358 v = (OnigCodePoint )'-';
4359 in_israw = 0;
4360 goto val_entry;
4361 }
4362 else if (r == TK_CC_AND) {
4363 CC_ESC_WARN(env, (UChar* )"-");
4364 goto range_end_val;
4365 }
4366 state = CCS_RANGE;
4367 }
4368 else if (state == CCS_START) {
4369 /* [-xa] is allowed */
4370 v = (OnigCodePoint )tok->u.c;
4371 in_israw = 0;
4372
4373 r = fetch_token_in_cc(tok, &p, end, env);
4374 if (r < 0) goto err;
4375 fetched = 1;
4376 /* [--x] or [a&&-x] is warned. */
4377 if (r == TK_CC_RANGE || and_start != 0)
4378 CC_ESC_WARN(env, (UChar* )"-");
4379
4380 goto val_entry;
4381 }
4382 else if (state == CCS_RANGE) {
4383 CC_ESC_WARN(env, (UChar* )"-");
4384 goto sb_char; /* [!--x] is allowed */
4385 }
4386 else { /* CCS_COMPLETE */
4387 r = fetch_token_in_cc(tok, &p, end, env);
4388 if (r < 0) goto err;
4389 fetched = 1;
4390 if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */
4391 else if (r == TK_CC_AND) {
4392 CC_ESC_WARN(env, (UChar* )"-");
4393 goto range_end_val;
4394 }
4395
4396 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
4397 CC_ESC_WARN(env, (UChar* )"-");
4398 goto sb_char; /* [0-9-a] is allowed as [0-9\-a] */
4399 }
4400 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
4401 goto err;
4402 }
4403 break;
4404
4405 case TK_CC_CC_OPEN: /* [ */
4406 {
4407 Node *anode;
4408 CClassNode* acc;
4409
4410 r = parse_char_class(&anode, tok, &p, end, env);
4411 if (r != 0) goto cc_open_err;
4412 acc = NCCLASS(anode);
4413 r = or_cclass(cc, acc, env->enc);
4414
4415 onig_node_free(anode);
4416 cc_open_err:
4417 if (r != 0) goto err;
4418 }
4419 break;
4420
4421 case TK_CC_AND: /* && */
4422 {
4423 if (state == CCS_VALUE) {
4424 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4425 &val_type, &state, env);
4426 if (r != 0) goto err;
4427 }
4428 /* initialize local variables */
4429 and_start = 1;
4430 state = CCS_START;
4431
4432 if (IS_NOT_NULL(prev_cc)) {
4433 r = and_cclass(prev_cc, cc, env->enc);
4434 if (r != 0) goto err;
4435 bbuf_free(cc->mbuf);
4436 }
4437 else {
4438 prev_cc = cc;
4439 cc = &work_cc;
4440 }
4441 initialize_cclass(cc);
4442 }
4443 break;
4444
4445 case TK_EOT:
4446 r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
4447 goto err;
4448 break;
4449 default:
4450 r = ONIGERR_PARSER_BUG;
4451 goto err;
4452 break;
4453 }
4454
4455 if (fetched)
4456 r = tok->type;
4457 else {
4458 r = fetch_token_in_cc(tok, &p, end, env);
4459 if (r < 0) goto err;
4460 }
4461 }
4462
4463 if (state == CCS_VALUE) {
4464 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type,
4465 &val_type, &state, env);
4466 if (r != 0) goto err;
4467 }
4468
4469 if (IS_NOT_NULL(prev_cc)) {
4470 r = and_cclass(prev_cc, cc, env->enc);
4471 if (r != 0) goto err;
4472 bbuf_free(cc->mbuf);
4473 cc = prev_cc;
4474 }
4475
4476 if (neg != 0)
4477 NCCLASS_SET_NOT(cc);
4478 else
4479 NCCLASS_CLEAR_NOT(cc);
4480 if (IS_NCCLASS_NOT(cc) &&
4481 IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
4482 int is_empty;
4483
4484 is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
4485 if (is_empty != 0)
4486 BITSET_IS_EMPTY(cc->bs, is_empty);
4487
4488 if (is_empty == 0) {
4489 #define NEWLINE_CODE 0x0a
4490
4491 if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
4492 if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
4493 BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
4494 else
4495 add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
4496 }
4497 }
4498 }
4499 *src = p;
4500 return 0;
4501
4502 err:
4503 if (cc != NCCLASS(*np))
4504 bbuf_free(cc->mbuf);
4505 onig_node_free(*np);
4506 return r;
4507 }
4508
4509 static int parse_subexp(Node** top, OnigToken* tok, int term,
4510 UChar** src, UChar* end, ScanEnv* env);
4511
4512 static int
parse_enclose(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)4513 parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
4514 ScanEnv* env)
4515 {
4516 int r, num;
4517 Node *target;
4518 OnigOptionType option;
4519 OnigCodePoint c;
4520 OnigEncoding enc = env->enc;
4521
4522 #ifdef USE_NAMED_GROUP
4523 int list_capture;
4524 #endif
4525
4526 UChar* p = *src;
4527 PFETCH_READY;
4528
4529 *np = NULL;
4530 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
4531
4532 option = env->option;
4533 if (PPEEK_IS('?') &&
4534 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
4535 PINC;
4536 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4537
4538 PFETCH(c);
4539 switch (c) {
4540 case ':': /* (?:...) grouping only */
4541 group:
4542 r = fetch_token(tok, &p, end, env);
4543 if (r < 0) return r;
4544 r = parse_subexp(np, tok, term, &p, end, env);
4545 if (r < 0) return r;
4546 *src = p;
4547 return 1; /* group */
4548 break;
4549
4550 case '=':
4551 *np = onig_node_new_anchor(ANCHOR_PREC_READ);
4552 break;
4553 case '!': /* preceding read */
4554 *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
4555 break;
4556 case '>': /* (?>...) stop backtrack */
4557 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
4558 break;
4559
4560 #ifdef USE_NAMED_GROUP
4561 case '\'':
4562 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4563 goto named_group1;
4564 }
4565 else
4566 return ONIGERR_UNDEFINED_GROUP_OPTION;
4567 break;
4568 #endif
4569
4570 case '<': /* look behind (?<=...), (?<!...) */
4571 PFETCH(c);
4572 if (c == '=')
4573 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
4574 else if (c == '!')
4575 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
4576 #ifdef USE_NAMED_GROUP
4577 else {
4578 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4579 UChar *name;
4580 UChar *name_end;
4581
4582 PUNFETCH;
4583 c = '<';
4584
4585 named_group1:
4586 list_capture = 0;
4587
4588 named_group2:
4589 name = p;
4590 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
4591 if (r < 0) return r;
4592
4593 num = scan_env_add_mem_entry(env);
4594 if (num < 0) return num;
4595 if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM)
4596 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4597
4598 r = name_add(env->reg, name, name_end, num, env);
4599 if (r != 0) return r;
4600 *np = node_new_enclose_memory(env->option, 1);
4601 CHECK_NULL_RETURN_MEMERR(*np);
4602 NENCLOSE(*np)->regnum = num;
4603 if (list_capture != 0)
4604 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4605 env->num_named++;
4606 }
4607 else {
4608 return ONIGERR_UNDEFINED_GROUP_OPTION;
4609 }
4610 }
4611 #else
4612 else {
4613 return ONIGERR_UNDEFINED_GROUP_OPTION;
4614 }
4615 #endif
4616 break;
4617
4618 case '@':
4619 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
4620 #ifdef USE_NAMED_GROUP
4621 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
4622 PFETCH(c);
4623 if (c == '<' || c == '\'') {
4624 list_capture = 1;
4625 goto named_group2; /* (?@<name>...) */
4626 }
4627 PUNFETCH;
4628 }
4629 #endif
4630 *np = node_new_enclose_memory(env->option, 0);
4631 CHECK_NULL_RETURN_MEMERR(*np);
4632 num = scan_env_add_mem_entry(env);
4633 if (num < 0) {
4634 onig_node_free(*np);
4635 return num;
4636 }
4637 else if (num >= (int )BIT_STATUS_BITS_NUM) {
4638 onig_node_free(*np);
4639 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
4640 }
4641 NENCLOSE(*np)->regnum = num;
4642 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
4643 }
4644 else {
4645 return ONIGERR_UNDEFINED_GROUP_OPTION;
4646 }
4647 break;
4648
4649 #ifdef USE_POSIXLINE_OPTION
4650 case 'p':
4651 #endif
4652 case '-': case 'i': case 'm': case 's': case 'x':
4653 {
4654 int neg = 0;
4655
4656 while (1) {
4657 switch (c) {
4658 case ':':
4659 case ')':
4660 break;
4661
4662 case '-': neg = 1; break;
4663 case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break;
4664 case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
4665 case 's':
4666 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4667 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
4668 }
4669 else
4670 return ONIGERR_UNDEFINED_GROUP_OPTION;
4671 break;
4672
4673 case 'm':
4674 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
4675 ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
4676 }
4677 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
4678 ONOFF(option, ONIG_OPTION_MULTILINE, neg);
4679 }
4680 else
4681 return ONIGERR_UNDEFINED_GROUP_OPTION;
4682 break;
4683 #ifdef USE_POSIXLINE_OPTION
4684 case 'p':
4685 ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
4686 break;
4687 #endif
4688 default:
4689 return ONIGERR_UNDEFINED_GROUP_OPTION;
4690 }
4691
4692 if (c == ')') {
4693 *np = node_new_option(option);
4694 CHECK_NULL_RETURN_MEMERR(*np);
4695 *src = p;
4696 return 2; /* option only */
4697 }
4698 else if (c == ':') {
4699 OnigOptionType prev = env->option;
4700
4701 env->option = option;
4702 r = fetch_token(tok, &p, end, env);
4703 if (r < 0) return r;
4704 r = parse_subexp(&target, tok, term, &p, end, env);
4705 env->option = prev;
4706 if (r < 0) return r;
4707 *np = node_new_option(option);
4708 CHECK_NULL_RETURN_MEMERR(*np);
4709 NENCLOSE(*np)->target = target;
4710 *src = p;
4711 return 0;
4712 }
4713
4714 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4715 PFETCH(c);
4716 }
4717 }
4718 break;
4719
4720 default:
4721 return ONIGERR_UNDEFINED_GROUP_OPTION;
4722 }
4723 }
4724 else {
4725 if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
4726 goto group;
4727
4728 *np = node_new_enclose_memory(env->option, 0);
4729 CHECK_NULL_RETURN_MEMERR(*np);
4730 num = scan_env_add_mem_entry(env);
4731 if (num < 0) return num;
4732 NENCLOSE(*np)->regnum = num;
4733 }
4734
4735 CHECK_NULL_RETURN_MEMERR(*np);
4736 r = fetch_token(tok, &p, end, env);
4737 if (r < 0) return r;
4738 r = parse_subexp(&target, tok, term, &p, end, env);
4739 if (r < 0) return r;
4740
4741 if (NTYPE(*np) == NT_ANCHOR)
4742 NANCHOR(*np)->target = target;
4743 else {
4744 NENCLOSE(*np)->target = target;
4745 if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) {
4746 /* Don't move this to previous of parse_subexp() */
4747 r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np);
4748 if (r != 0) return r;
4749 }
4750 }
4751
4752 *src = p;
4753 return 0;
4754 }
4755
/* Spellings of the quantifiers used in the nested-repeat warning text of
   set_quantifier(). */
static const char* PopularQStr[] = {
  "?",
  "*",
  "+",
  "??",
  "*?",
  "+?"
};
4759
/* Replacement texts for the nested-repeat warning of set_quantifier(),
   indexed by the value read from ReduceTypeTable. */
static const char* ReduceQStr[] = {
  "",
  "",
  "*",
  "*?",
  "??",
  "+ and ??",
  "+? and ?"
};
4763
4764 static int
set_quantifier(Node * qnode,Node * target,int group,ScanEnv * env)4765 set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
4766 {
4767 QtfrNode* qn;
4768
4769 qn = NQTFR(qnode);
4770 if (qn->lower == 1 && qn->upper == 1) {
4771 return 1;
4772 }
4773
4774 switch (NTYPE(target)) {
4775 case NT_STR:
4776 if (! group) {
4777 StrNode* sn = NSTR(target);
4778 if (str_node_can_be_split(sn, env->enc)) {
4779 Node* n = str_node_split_last_char(sn, env->enc);
4780 if (IS_NOT_NULL(n)) {
4781 qn->target = n;
4782 return 2;
4783 }
4784 }
4785 }
4786 break;
4787
4788 case NT_QTFR:
4789 { /* check redundant double repeat. */
4790 /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
4791 QtfrNode* qnt = NQTFR(target);
4792 int nestq_num = popular_quantifier_num(qn);
4793 int targetq_num = popular_quantifier_num(qnt);
4794 if (nestq_num < 0 || targetq_num < 0) {
4795 return ONIGERR_TYPE_BUG;
4796 }
4797
4798 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
4799 if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) &&
4800 IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
4801 UChar buf[WARN_BUFSIZE];
4802
4803 switch(ReduceTypeTable[targetq_num][nestq_num]) {
4804 case RQ_ASIS:
4805 break;
4806
4807 case RQ_DEL:
4808 if (onig_verb_warn != onig_null_warn) {
4809 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4810 env->pattern, env->pattern_end,
4811 (UChar* )"redundant nested repeat operator");
4812 (*onig_verb_warn)((char* )buf);
4813 }
4814 goto warn_exit;
4815 break;
4816
4817 default:
4818 if (onig_verb_warn != onig_null_warn) {
4819 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
4820 env->pattern, env->pattern_end,
4821 (UChar* )"nested repeat operator %s and %s was replaced with '%s'",
4822 PopularQStr[targetq_num], PopularQStr[nestq_num],
4823 ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
4824 (*onig_verb_warn)((char* )buf);
4825 }
4826 goto warn_exit;
4827 break;
4828 }
4829 }
4830
4831 warn_exit:
4832 #endif
4833 if (targetq_num >= 0) {
4834 if (nestq_num >= 0) {
4835 onig_reduce_nested_quantifier(qnode, target);
4836 goto q_exit;
4837 }
4838 else if (targetq_num == 1 || targetq_num == 2) { /* * or + */
4839 /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
4840 if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
4841 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
4842 }
4843 }
4844 }
4845 }
4846 break;
4847
4848 default:
4849 break;
4850 }
4851
4852 qn->target = target;
4853 q_exit:
4854 return 0;
4855 }
4856
4857
4858 #ifdef USE_SHARED_CCLASS_TABLE
4859
4860 #define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS 8
4861
4862 /* for ctype node hash table */
4863
/* Key of the shared character-class table; two keys are equal when all
   three fields are equal (see type_cclass_cmp). */
typedef struct {
  OnigEncoding enc;   /* encoding */
  int not;            /* negation flag */
  int type;           /* character type */
} type_cclass_key;
4869
/* Key comparison for the shared character-class table:
   returns 0 when both keys have the same type, encoding and negation flag,
   1 otherwise. */
static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y)
{
  if (x->type == y->type && x->enc == y->enc && x->not == y->not)
    return 0;

  return 1;
}
4877
/*
 * Hash function for the shared character-class table.
 *
 * Mixes the bytes of key->enc and key->type with a multiplicative hash and
 * then adds key->not.  All arithmetic is done in unsigned int so that the
 * repeated multiplication wraps around with defined behaviour instead of
 * overflowing a signed int.
 */
static int type_cclass_hash(type_cclass_key* key)
{
  unsigned int val;
  const UChar *p;
  int i;

  val = 0;

  p = (const UChar* )&(key->enc);
  for (i = 0; i < (int )sizeof(key->enc); i++) {
    val = val * 997u + (unsigned int )*p++;
  }

  p = (const UChar* )(&key->type);
  for (i = 0; i < (int )sizeof(key->type); i++) {
    val = val * 997u + (unsigned int )*p++;
  }

  val += (unsigned int )key->not;
  return (int )(val + (val >> 5));
}
4898
/* Comparison / hash callbacks for tables keyed by type_cclass_key. */
static struct st_hash_type type_type_cclass_hash = {
    type_cclass_cmp,
    type_cclass_hash,
};

/* Table of shared character-class nodes; NULL when not allocated
   (onig_free_shared_cclass_table() resets it to NULL). */
static st_table* OnigTypeCClassTable;
4905
4906
4907 static int
i_free_shared_class(type_cclass_key * key,Node * node,void * arg ARG_UNUSED)4908 i_free_shared_class(type_cclass_key* key, Node* node, void* arg ARG_UNUSED)
4909 {
4910 if (IS_NOT_NULL(node)) {
4911 CClassNode* cc = NCCLASS(node);
4912 if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf);
4913 xfree(node);
4914 }
4915
4916 if (IS_NOT_NULL(key)) xfree(key);
4917 return ST_DELETE;
4918 }
4919
4920 extern int
onig_free_shared_cclass_table(void)4921 onig_free_shared_cclass_table(void)
4922 {
4923 if (IS_NOT_NULL(OnigTypeCClassTable)) {
4924 onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0);
4925 onig_st_free_table(OnigTypeCClassTable);
4926 OnigTypeCClassTable = NULL;
4927 }
4928
4929 return 0;
4930 }
4931
4932 #endif /* USE_SHARED_CCLASS_TABLE */
4933
4934
4935 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4936 static int
clear_not_flag_cclass(CClassNode * cc,OnigEncoding enc)4937 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
4938 {
4939 BBuf *tbuf;
4940 int r;
4941
4942 if (IS_NCCLASS_NOT(cc)) {
4943 bitset_invert(cc->bs);
4944
4945 if (! ONIGENC_IS_SINGLEBYTE(enc)) {
4946 r = not_code_range_buf(enc, cc->mbuf, &tbuf);
4947 if (r != 0) return r;
4948
4949 bbuf_free(cc->mbuf);
4950 cc->mbuf = tbuf;
4951 }
4952
4953 NCCLASS_CLEAR_NOT(cc);
4954 }
4955
4956 return 0;
4957 }
4958 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
4959
/* Argument block handed to i_apply_case_fold() through its void* parameter. */
typedef struct {
  ScanEnv*    env;       /* scan environment (supplies the encoding) */
  CClassNode* cc;        /* character class being extended */
  Node*       alt_root;  /* presumably the head of the alternative list built
                            for multi-character folds -- TODO confirm at the
                            call site */
  Node**      ptail;     /* where the next alternative node is stored */
} IApplyCaseFoldArg;
4966
4967 static int
i_apply_case_fold(OnigCodePoint from,OnigCodePoint to[],int to_len,void * arg)4968 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
4969 int to_len, void* arg)
4970 {
4971 IApplyCaseFoldArg* iarg;
4972 ScanEnv* env;
4973 CClassNode* cc;
4974 BitSetRef bs;
4975
4976 iarg = (IApplyCaseFoldArg* )arg;
4977 env = iarg->env;
4978 cc = iarg->cc;
4979 bs = cc->bs;
4980
4981 if (to_len == 1) {
4982 int is_in = onig_is_code_in_cc(env->enc, from, cc);
4983 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
4984 if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
4985 (is_in == 0 && IS_NCCLASS_NOT(cc))) {
4986 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4987 add_code_range(&(cc->mbuf), env, *to, *to);
4988 }
4989 else {
4990 BITSET_SET_BIT(bs, *to);
4991 }
4992 }
4993 #else
4994 if (is_in != 0) {
4995 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
4996 if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
4997 add_code_range(&(cc->mbuf), env, *to, *to);
4998 }
4999 else {
5000 if (IS_NCCLASS_NOT(cc)) {
5001 BITSET_CLEAR_BIT(bs, *to);
5002 }
5003 else
5004 BITSET_SET_BIT(bs, *to);
5005 }
5006 }
5007 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
5008 }
5009 else {
5010 int r, i, len;
5011 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5012 Node *snode = NULL_NODE;
5013
5014 if (onig_is_code_in_cc(env->enc, from, cc)
5015 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
5016 && !IS_NCCLASS_NOT(cc)
5017 #endif
5018 ) {
5019 for (i = 0; i < to_len; i++) {
5020 len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
5021 if (i == 0) {
5022 snode = onig_node_new_str(buf, buf + len);
5023 CHECK_NULL_RETURN_MEMERR(snode);
5024
5025 /* char-class expanded multi-char only
5026 compare with string folded at match time. */
5027 NSTRING_SET_AMBIG(snode);
5028 }
5029 else {
5030 r = onig_node_str_cat(snode, buf, buf + len);
5031 if (r < 0) {
5032 onig_node_free(snode);
5033 return r;
5034 }
5035 }
5036 }
5037
5038 *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE);
5039 CHECK_NULL_RETURN_MEMERR(*(iarg->ptail));
5040 iarg->ptail = &(NCDR((*(iarg->ptail))));
5041 }
5042 }
5043
5044 return 0;
5045 }
5046
5047 static int
parse_exp(Node ** np,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5048 parse_exp(Node** np, OnigToken* tok, int term,
5049 UChar** src, UChar* end, ScanEnv* env)
5050 {
5051 int r, len, group = 0;
5052 Node* qn;
5053 Node** targetp;
5054
5055 *np = NULL;
5056 if (tok->type == (enum TokenSyms )term)
5057 goto end_of_token;
5058
5059 switch (tok->type) {
5060 case TK_ALT:
5061 case TK_EOT:
5062 end_of_token:
5063 *np = node_new_empty();
5064 return tok->type;
5065 break;
5066
5067 case TK_SUBEXP_OPEN:
5068 r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env);
5069 if (r < 0) return r;
5070 if (r == 1) group = 1;
5071 else if (r == 2) { /* option only */
5072 Node* target;
5073 OnigOptionType prev = env->option;
5074
5075 env->option = NENCLOSE(*np)->option;
5076 r = fetch_token(tok, src, end, env);
5077 if (r < 0) return r;
5078 r = parse_subexp(&target, tok, term, src, end, env);
5079 env->option = prev;
5080 if (r < 0) return r;
5081 NENCLOSE(*np)->target = target;
5082 return tok->type;
5083 }
5084 break;
5085
5086 case TK_SUBEXP_CLOSE:
5087 if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
5088 return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
5089
5090 if (tok->escaped) goto tk_raw_byte;
5091 else goto tk_byte;
5092 break;
5093
5094 case TK_STRING:
5095 tk_byte:
5096 {
5097 *np = node_new_str(tok->backp, *src);
5098 CHECK_NULL_RETURN_MEMERR(*np);
5099
5100 while (1) {
5101 r = fetch_token(tok, src, end, env);
5102 if (r < 0) return r;
5103 if (r != TK_STRING) break;
5104
5105 r = onig_node_str_cat(*np, tok->backp, *src);
5106 if (r < 0) return r;
5107 }
5108
5109 string_end:
5110 targetp = np;
5111 goto repeat;
5112 }
5113 break;
5114
5115 case TK_RAW_BYTE:
5116 tk_raw_byte:
5117 {
5118 *np = node_new_str_raw_char((UChar )tok->u.c);
5119 CHECK_NULL_RETURN_MEMERR(*np);
5120 len = 1;
5121 while (1) {
5122 if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
5123 if (len == enclen(env->enc, NSTR(*np)->s)) {
5124 r = fetch_token(tok, src, end, env);
5125 NSTRING_CLEAR_RAW(*np);
5126 goto string_end;
5127 }
5128 }
5129
5130 r = fetch_token(tok, src, end, env);
5131 if (r < 0) return r;
5132 if (r != TK_RAW_BYTE) {
5133 /* Don't use this, it is wrong for little endian encodings. */
5134 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
5135 int rem;
5136 if (len < ONIGENC_MBC_MINLEN(env->enc)) {
5137 rem = ONIGENC_MBC_MINLEN(env->enc) - len;
5138 (void )node_str_head_pad(NSTR(*np), rem, (UChar )0);
5139 if (len + rem == enclen(env->enc, NSTR(*np)->s)) {
5140 NSTRING_CLEAR_RAW(*np);
5141 goto string_end;
5142 }
5143 }
5144 #endif
5145 return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
5146 }
5147
5148 r = node_str_cat_char(*np, (UChar )tok->u.c);
5149 if (r < 0) return r;
5150
5151 len++;
5152 }
5153 }
5154 break;
5155
5156 case TK_CODE_POINT:
5157 {
5158 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5159 int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf);
5160 if (num < 0) return num;
5161 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
5162 *np = node_new_str_raw(buf, buf + num);
5163 #else
5164 *np = node_new_str(buf, buf + num);
5165 #endif
5166 CHECK_NULL_RETURN_MEMERR(*np);
5167 }
5168 break;
5169
5170 case TK_QUOTE_OPEN:
5171 {
5172 OnigCodePoint end_op[2];
5173 UChar *qstart, *qend, *nextp;
5174
5175 end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
5176 end_op[1] = (OnigCodePoint )'E';
5177 qstart = *src;
5178 qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
5179 if (IS_NULL(qend)) {
5180 nextp = qend = end;
5181 }
5182 *np = node_new_str(qstart, qend);
5183 CHECK_NULL_RETURN_MEMERR(*np);
5184 *src = nextp;
5185 }
5186 break;
5187
5188 case TK_CHAR_TYPE:
5189 {
5190 switch (tok->u.prop.ctype) {
5191 case ONIGENC_CTYPE_WORD:
5192 *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not);
5193 CHECK_NULL_RETURN_MEMERR(*np);
5194 break;
5195
5196 case ONIGENC_CTYPE_SPACE:
5197 case ONIGENC_CTYPE_DIGIT:
5198 case ONIGENC_CTYPE_XDIGIT:
5199 {
5200 CClassNode* cc;
5201
5202 #ifdef USE_SHARED_CCLASS_TABLE
5203 const OnigCodePoint *mbr;
5204 OnigCodePoint sb_out;
5205
5206 r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype,
5207 &sb_out, &mbr);
5208 if (r == 0 &&
5209 ONIGENC_CODE_RANGE_NUM(mbr)
5210 >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) {
5211 type_cclass_key key;
5212 type_cclass_key* new_key;
5213
5214 key.enc = env->enc;
5215 key.not = tok->u.prop.not;
5216 key.type = tok->u.prop.ctype;
5217
5218 THREAD_ATOMIC_START;
5219
5220 if (IS_NULL(OnigTypeCClassTable)) {
5221 OnigTypeCClassTable
5222 = onig_st_init_table_with_size(&type_type_cclass_hash, 10);
5223 if (IS_NULL(OnigTypeCClassTable)) {
5224 THREAD_ATOMIC_END;
5225 return ONIGERR_MEMORY;
5226 }
5227 }
5228 else {
5229 if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )(UINTN)&key,
5230 (st_data_t* )np)) {
5231 THREAD_ATOMIC_END;
5232 break;
5233 }
5234 }
5235
5236 *np = node_new_cclass_by_codepoint_range(tok->u.prop.not,
5237 sb_out, mbr);
5238 if (IS_NULL(*np)) {
5239 THREAD_ATOMIC_END;
5240 return ONIGERR_MEMORY;
5241 }
5242
5243 cc = NCCLASS(*np);
5244 NCCLASS_SET_SHARE(cc);
5245 new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key));
5246 CHECK_NULL_RETURN_MEMERR(new_key);
5247 xmemcpy(new_key, &key, sizeof(type_cclass_key));
5248 onig_st_add_direct(OnigTypeCClassTable, (st_data_t )(UINTN)new_key,
5249 (st_data_t )(UINTN)*np);
5250
5251 THREAD_ATOMIC_END;
5252 }
5253 else {
5254 #endif
5255 *np = node_new_cclass();
5256 CHECK_NULL_RETURN_MEMERR(*np);
5257 cc = NCCLASS(*np);
5258 add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env);
5259 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
5260 #ifdef USE_SHARED_CCLASS_TABLE
5261 }
5262 #endif
5263 }
5264 break;
5265
5266 default:
5267 return ONIGERR_PARSER_BUG;
5268 break;
5269 }
5270 }
5271 break;
5272
5273 case TK_CHAR_PROPERTY:
5274 r = parse_char_property(np, tok, src, end, env);
5275 if (r != 0) return r;
5276 break;
5277
5278 case TK_CC_OPEN:
5279 {
5280 CClassNode* cc;
5281
5282 r = parse_char_class(np, tok, src, end, env);
5283 if (r != 0) return r;
5284
5285 cc = NCCLASS(*np);
5286 if (IS_IGNORECASE(env->option)) {
5287 IApplyCaseFoldArg iarg;
5288
5289 iarg.env = env;
5290 iarg.cc = cc;
5291 iarg.alt_root = NULL_NODE;
5292 iarg.ptail = &(iarg.alt_root);
5293
5294 r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
5295 i_apply_case_fold, &iarg);
5296 if (r != 0) {
5297 onig_node_free(iarg.alt_root);
5298 return r;
5299 }
5300 if (IS_NOT_NULL(iarg.alt_root)) {
5301 Node* work = onig_node_new_alt(*np, iarg.alt_root);
5302 if (IS_NULL(work)) {
5303 onig_node_free(iarg.alt_root);
5304 return ONIGERR_MEMORY;
5305 }
5306 *np = work;
5307 }
5308 }
5309 }
5310 break;
5311
5312 case TK_ANYCHAR:
5313 *np = node_new_anychar();
5314 CHECK_NULL_RETURN_MEMERR(*np);
5315 break;
5316
5317 case TK_ANYCHAR_ANYTIME:
5318 *np = node_new_anychar();
5319 CHECK_NULL_RETURN_MEMERR(*np);
5320 qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
5321 CHECK_NULL_RETURN_MEMERR(qn);
5322 NQTFR(qn)->target = *np;
5323 *np = qn;
5324 break;
5325
5326 case TK_BACKREF:
5327 len = tok->u.backref.num;
5328 *np = node_new_backref(len,
5329 (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
5330 tok->u.backref.by_name,
5331 #ifdef USE_BACKREF_WITH_LEVEL
5332 tok->u.backref.exist_level,
5333 tok->u.backref.level,
5334 #endif
5335 env);
5336 CHECK_NULL_RETURN_MEMERR(*np);
5337 break;
5338
5339 #ifdef USE_SUBEXP_CALL
5340 case TK_CALL:
5341 {
5342 int gnum = tok->u.call.gnum;
5343
5344 if (gnum < 0) {
5345 gnum = BACKREF_REL_TO_ABS(gnum, env);
5346 if (gnum <= 0)
5347 return ONIGERR_INVALID_BACKREF;
5348 }
5349 *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum);
5350 CHECK_NULL_RETURN_MEMERR(*np);
5351 env->num_call++;
5352 }
5353 break;
5354 #endif
5355
5356 case TK_ANCHOR:
5357 *np = onig_node_new_anchor(tok->u.anchor);
5358 CHECK_NULL_RETURN_MEMERR(*np);
5359 break;
5360
5361 case TK_OP_REPEAT:
5362 case TK_INTERVAL:
5363 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
5364 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
5365 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
5366 else
5367 *np = node_new_empty();
5368 CHECK_NULL_RETURN_MEMERR(*np);
5369 }
5370 else {
5371 goto tk_byte;
5372 }
5373 break;
5374
5375 default:
5376 return ONIGERR_PARSER_BUG;
5377 break;
5378 }
5379
5380 {
5381 targetp = np;
5382
5383 re_entry:
5384 r = fetch_token(tok, src, end, env);
5385 if (r < 0) return r;
5386
5387 repeat:
5388 if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
5389 if (is_invalid_quantifier_target(*targetp))
5390 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
5391
5392 qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
5393 (r == TK_INTERVAL ? 1 : 0));
5394 CHECK_NULL_RETURN_MEMERR(qn);
5395 NQTFR(qn)->greedy = tok->u.repeat.greedy;
5396 r = set_quantifier(qn, *targetp, group, env);
5397 if (r < 0) {
5398 onig_node_free(qn);
5399 return r;
5400 }
5401
5402 if (tok->u.repeat.possessive != 0) {
5403 Node* en;
5404 en = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
5405 if (IS_NULL(en)) {
5406 onig_node_free(qn);
5407 return ONIGERR_MEMORY;
5408 }
5409 NENCLOSE(en)->target = qn;
5410 qn = en;
5411 }
5412
5413 if (r == 0) {
5414 *targetp = qn;
5415 }
5416 else if (r == 1) {
5417 onig_node_free(qn);
5418 }
5419 else if (r == 2) { /* split case: /abc+/ */
5420 Node *tmp;
5421
5422 *targetp = node_new_list(*targetp, NULL);
5423 if (IS_NULL(*targetp)) {
5424 onig_node_free(qn);
5425 return ONIGERR_MEMORY;
5426 }
5427 tmp = NCDR(*targetp) = node_new_list(qn, NULL);
5428 if (IS_NULL(tmp)) {
5429 onig_node_free(qn);
5430 return ONIGERR_MEMORY;
5431 }
5432 targetp = &(NCAR(tmp));
5433 }
5434 goto re_entry;
5435 }
5436 }
5437
5438 return r;
5439 }
5440
5441 static int
parse_branch(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5442 parse_branch(Node** top, OnigToken* tok, int term,
5443 UChar** src, UChar* end, ScanEnv* env)
5444 {
5445 int r;
5446 Node *node, **headp;
5447
5448 *top = NULL;
5449 r = parse_exp(&node, tok, term, src, end, env);
5450 if (r < 0) return r;
5451
5452 if (r == TK_EOT || r == term || r == TK_ALT) {
5453 *top = node;
5454 }
5455 else {
5456 *top = node_new_list(node, NULL);
5457 CHECK_NULL_RETURN_MEMERR(*top);
5458 headp = &(NCDR(*top));
5459 while (r != TK_EOT && r != term && r != TK_ALT) {
5460 r = parse_exp(&node, tok, term, src, end, env);
5461 CHECK_NULL_RETURN_MEMERR(node);
5462 if (r < 0) return r;
5463
5464 if (NTYPE(node) == NT_LIST) {
5465 *headp = node;
5466 while (IS_NOT_NULL(NCDR(node))) node = NCDR(node);
5467 headp = &(NCDR(node));
5468 }
5469 else {
5470 *headp = node_new_list(node, NULL);
5471 headp = &(NCDR(*headp));
5472 }
5473 }
5474 }
5475
5476 return r;
5477 }
5478
5479 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
5480 static int
parse_subexp(Node ** top,OnigToken * tok,int term,UChar ** src,UChar * end,ScanEnv * env)5481 parse_subexp(Node** top, OnigToken* tok, int term,
5482 UChar** src, UChar* end, ScanEnv* env)
5483 {
5484 int r;
5485 Node *node, **headp;
5486
5487 *top = NULL;
5488 r = parse_branch(&node, tok, term, src, end, env);
5489 if (r < 0) {
5490 onig_node_free(node);
5491 return r;
5492 }
5493
5494 if (r == term) {
5495 *top = node;
5496 }
5497 else if (r == TK_ALT) {
5498 *top = onig_node_new_alt(node, NULL);
5499 CHECK_NULL_RETURN_MEMERR(*top);
5500 headp = &(NCDR(*top));
5501 while (r == TK_ALT) {
5502 r = fetch_token(tok, src, end, env);
5503 if (r < 0) return r;
5504 r = parse_branch(&node, tok, term, src, end, env);
5505 if (r < 0) return r;
5506
5507 *headp = onig_node_new_alt(node, NULL);
5508 headp = &(NCDR(*headp));
5509 }
5510
5511 if (tok->type != (enum TokenSyms )term)
5512 goto err;
5513 }
5514 else {
5515 err:
5516 if (term == TK_SUBEXP_CLOSE)
5517 return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
5518 else
5519 return ONIGERR_PARSER_BUG;
5520 }
5521
5522 return r;
5523 }
5524
5525 static int
parse_regexp(Node ** top,UChar ** src,UChar * end,ScanEnv * env)5526 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
5527 {
5528 int r;
5529 OnigToken tok;
5530
5531 r = fetch_token(&tok, src, end, env);
5532 if (r < 0) return r;
5533 r = parse_subexp(top, &tok, TK_EOT, src, end, env);
5534 if (r < 0) return r;
5535 return 0;
5536 }
5537
5538 extern int
onig_parse_make_tree(Node ** root,const UChar * pattern,const UChar * end,regex_t * reg,ScanEnv * env)5539 onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
5540 regex_t* reg, ScanEnv* env)
5541 {
5542 int r;
5543 UChar* p;
5544
5545 #ifdef USE_NAMED_GROUP
5546 names_clear(reg);
5547 #endif
5548
5549 scan_env_clear(env);
5550 env->option = reg->options;
5551 env->case_fold_flag = reg->case_fold_flag;
5552 env->enc = reg->enc;
5553 env->syntax = reg->syntax;
5554 env->pattern = (UChar* )pattern;
5555 env->pattern_end = (UChar* )end;
5556 env->reg = reg;
5557
5558 *root = NULL;
5559 p = (UChar* )pattern;
5560 r = parse_regexp(root, &p, (UChar* )end, env);
5561 reg->num_mem = env->num_mem;
5562 return r;
5563 }
5564
5565 extern void
onig_scan_env_set_error_string(ScanEnv * env,int ecode ARG_UNUSED,UChar * arg,UChar * arg_end)5566 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
5567 UChar* arg, UChar* arg_end)
5568 {
5569 env->error = arg;
5570 env->error_end = arg_end;
5571 }
5572