1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Extended regular expression matching and search library.
4 Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009
5 Free Software Foundation, Inc.
6 This file is part of the GNU C Library.
7 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3, or (at your option)
12 any later version.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License along
20 with this program; if not, write to the Free Software Foundation,
21 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22
23 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
24 size_t length, reg_syntax_t syntax);
25 static void re_compile_fastmap_iter (regex_t *bufp,
26 const re_dfastate_t *init_state,
27 char *fastmap);
28 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
29 #ifdef RE_ENABLE_I18N
30 static void free_charset (re_charset_t *cset);
31 #endif /* RE_ENABLE_I18N */
32 static void free_workarea_compile (regex_t *preg);
33 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
34 #ifdef RE_ENABLE_I18N
35 static void optimize_utf8 (re_dfa_t *dfa);
36 #endif
37 static reg_errcode_t analyze (regex_t *preg);
38 static reg_errcode_t preorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41 static reg_errcode_t postorder (bin_tree_t *root,
42 reg_errcode_t (fn (void *, bin_tree_t *)),
43 void *extra);
44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
47 bin_tree_t *node);
48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
51 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
52 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
53 unsigned int constraint);
54 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
55 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
56 Idx node, bool root);
57 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
58 static Idx fetch_number (re_string_t *input, re_token_t *token,
59 reg_syntax_t syntax);
60 static int peek_token (re_token_t *token, re_string_t *input,
61 reg_syntax_t syntax) internal_function;
62 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
63 reg_syntax_t syntax, reg_errcode_t *err);
64 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
74 re_token_t *token, reg_syntax_t syntax,
75 Idx nest, reg_errcode_t *err);
76 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
77 re_dfa_t *dfa, re_token_t *token,
78 reg_syntax_t syntax, reg_errcode_t *err);
79 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
80 re_token_t *token, reg_syntax_t syntax,
81 reg_errcode_t *err);
82 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
83 re_string_t *regexp,
84 re_token_t *token, int token_len,
85 re_dfa_t *dfa,
86 reg_syntax_t syntax,
87 bool accept_hyphen);
88 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 re_string_t *regexp,
90 re_token_t *token);
91 #ifdef RE_ENABLE_I18N
92 static reg_errcode_t build_equiv_class (bitset_t sbcset,
93 re_charset_t *mbcset,
94 Idx *equiv_class_alloc,
95 const unsigned char *name);
96 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
97 bitset_t sbcset,
98 re_charset_t *mbcset,
99 Idx *char_class_alloc,
100 const unsigned char *class_name,
101 reg_syntax_t syntax);
102 #else /* not RE_ENABLE_I18N */
103 static reg_errcode_t build_equiv_class (bitset_t sbcset,
104 const unsigned char *name);
105 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
106 bitset_t sbcset,
107 const unsigned char *class_name,
108 reg_syntax_t syntax);
109 #endif /* not RE_ENABLE_I18N */
110 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
111 RE_TRANSLATE_TYPE trans,
112 const unsigned char *class_name,
113 const unsigned char *extra,
114 bool non_match, reg_errcode_t *err);
115 static bin_tree_t *create_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 re_token_type_t type);
118 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
119 bin_tree_t *left, bin_tree_t *right,
120 const re_token_t *token);
121 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
122 static void free_token (re_token_t *node);
123 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
124 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
125
126 /* This table gives an error message for each of the error codes listed
127 in regex.h. Obviously the order here has to be same as there.
128 POSIX doesn't require that we do anything for REG_NOERROR,
129 but why not be nice? */
130
131 static const char __re_error_msgid[] =
132 {
133 #define REG_NOERROR_IDX 0
134 gettext_noop ("Success") /* REG_NOERROR */
135 "\0"
136 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
137 gettext_noop ("No match") /* REG_NOMATCH */
138 "\0"
139 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
140 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
141 "\0"
142 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
143 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
144 "\0"
145 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
146 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
147 "\0"
148 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
149 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
150 "\0"
151 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
152 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
153 "\0"
154 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
155 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
156 "\0"
157 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
158 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
159 "\0"
160 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
161 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
162 "\0"
163 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
164 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
165 "\0"
166 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
167 gettext_noop ("Invalid range end") /* REG_ERANGE */
168 "\0"
169 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
170 gettext_noop ("Memory exhausted") /* REG_ESPACE */
171 "\0"
172 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
173 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
174 "\0"
175 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
176 gettext_noop ("Premature end of regular expression") /* REG_EEND */
177 "\0"
178 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
179 gettext_noop ("Regular expression too big") /* REG_ESIZE */
180 "\0"
181 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
182 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 };
184
185 static const size_t __re_error_msgid_idx[] =
186 {
187 REG_NOERROR_IDX,
188 REG_NOMATCH_IDX,
189 REG_BADPAT_IDX,
190 REG_ECOLLATE_IDX,
191 REG_ECTYPE_IDX,
192 REG_EESCAPE_IDX,
193 REG_ESUBREG_IDX,
194 REG_EBRACK_IDX,
195 REG_EPAREN_IDX,
196 REG_EBRACE_IDX,
197 REG_BADBR_IDX,
198 REG_ERANGE_IDX,
199 REG_ESPACE_IDX,
200 REG_BADRPT_IDX,
201 REG_EEND_IDX,
202 REG_ESIZE_IDX,
203 REG_ERPAREN_IDX
204 };
205
206 /* Entry points for GNU code. */
207
208 /* re_compile_pattern is the GNU regular expression compiler: it
209 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
210 Returns 0 if the pattern was valid, otherwise an error string.
211
212 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
213 are set in BUFP on entry. */
214
215 #ifdef _LIBC
216 const char *
re_compile_pattern(pattern,length,bufp)217 re_compile_pattern (pattern, length, bufp)
218 const char *pattern;
219 size_t length;
220 struct re_pattern_buffer *bufp;
221 #else /* size_t might promote */
222 const char *
223 re_compile_pattern (const char *pattern, size_t length,
224 struct re_pattern_buffer *bufp)
225 #endif
226 {
227 reg_errcode_t ret;
228
229 /* And GNU code determines whether or not to get register information
230 by passing null for the REGS argument to re_match, etc., not by
231 setting no_sub, unless RE_NO_SUB is set. */
232 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
233
234 /* Match anchors at newline. */
235 bufp->newline_anchor = 1;
236
237 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
238
239 if (!ret)
240 return NULL;
241 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
242 }
243 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)244 weak_alias (__re_compile_pattern, re_compile_pattern)
245 #endif
246
247 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
248 also be assigned to arbitrarily: each pattern buffer stores its own
249 syntax, so it can be changed between regex compilations. */
250 /* This has no initializer because initialized variables in Emacs
251 become read-only after dumping. */
252 reg_syntax_t re_syntax_options;
253
254
255 /* Specify the precise syntax of regexps for compilation. This provides
256 for compatibility for various utilities which historically have
257 different, incompatible syntaxes.
258
259 The argument SYNTAX is a bit mask comprised of the various bits
260 defined in regex.h. We return the old syntax. */
261
262 reg_syntax_t
263 re_set_syntax (syntax)
264 reg_syntax_t syntax;
265 {
266 reg_syntax_t ret = re_syntax_options;
267
268 re_syntax_options = syntax;
269 return ret;
270 }
271 #ifdef _LIBC
272 weak_alias (__re_set_syntax, re_set_syntax)
273 #endif
274
275 int
276 re_compile_fastmap (bufp)
277 struct re_pattern_buffer *bufp;
278 {
279 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
280 char *fastmap = bufp->fastmap;
281
282 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
283 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
284 if (dfa->init_state != dfa->init_state_word)
285 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
286 if (dfa->init_state != dfa->init_state_nl)
287 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
288 if (dfa->init_state != dfa->init_state_begbuf)
289 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
290 bufp->fastmap_accurate = 1;
291 return 0;
292 }
293 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)294 weak_alias (__re_compile_fastmap, re_compile_fastmap)
295 #endif
296
297 static inline void
298 __attribute ((always_inline))
299 re_set_fastmap (char *fastmap, bool icase, int ch)
300 {
301 fastmap[ch] = 1;
302 if (icase)
303 fastmap[tolower (ch)] = 1;
304 }
305
306 /* Helper function for re_compile_fastmap.
307 Compile fastmap for the initial_state INIT_STATE. */
308
309 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)310 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
311 char *fastmap)
312 {
313 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
314 Idx node_cnt;
315 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
316 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
317 {
318 Idx node = init_state->nodes.elems[node_cnt];
319 re_token_type_t type = dfa->nodes[node].type;
320
321 if (type == CHARACTER)
322 {
323 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
324 #ifdef RE_ENABLE_I18N
325 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
326 {
327 unsigned char buf[MB_LEN_MAX];
328 unsigned char *p;
329 wchar_t wc;
330 mbstate_t state;
331
332 p = buf;
333 *p++ = dfa->nodes[node].opr.c;
334 while (++node < dfa->nodes_len
335 && dfa->nodes[node].type == CHARACTER
336 && dfa->nodes[node].mb_partial)
337 *p++ = dfa->nodes[node].opr.c;
338 memset (&state, '\0', sizeof (state));
339 if (__mbrtowc (&wc, (const char *) buf, p - buf,
340 &state) == p - buf
341 && (__wcrtomb ((char *) buf, towlower (wc), &state)
342 != (size_t) -1))
343 re_set_fastmap (fastmap, false, buf[0]);
344 }
345 #endif
346 }
347 else if (type == SIMPLE_BRACKET)
348 {
349 int i, ch;
350 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
351 {
352 int j;
353 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
354 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
355 if (w & ((bitset_word_t) 1 << j))
356 re_set_fastmap (fastmap, icase, ch);
357 }
358 }
359 #ifdef RE_ENABLE_I18N
360 else if (type == COMPLEX_BRACKET)
361 {
362 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
363 Idx i;
364
365 # ifdef _LIBC
366 /* See if we have to try all bytes which start multiple collation
367 elements.
368 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
369 collation element, and don't catch 'b' since 'b' is
370 the only collation element which starts from 'b' (and
371 it is caught by SIMPLE_BRACKET). */
372 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
373 && (cset->ncoll_syms || cset->nranges))
374 {
375 const int32_t *table = (const int32_t *)
376 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
377 for (i = 0; i < SBC_MAX; ++i)
378 if (table[i] < 0)
379 re_set_fastmap (fastmap, icase, i);
380 }
381 # endif /* _LIBC */
382
383 /* See if we have to start the match at all multibyte characters,
384 i.e. where we would not find an invalid sequence. This only
385 applies to multibyte character sets; for single byte character
386 sets, the SIMPLE_BRACKET again suffices. */
387 if (dfa->mb_cur_max > 1
388 && (cset->nchar_classes || cset->non_match
389 # ifdef _LIBC
390 || cset->nequiv_classes
391 # endif /* _LIBC */
392 ))
393 {
394 unsigned char c = 0;
395 do
396 {
397 mbstate_t mbs;
398 memset (&mbs, 0, sizeof (mbs));
399 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
400 re_set_fastmap (fastmap, false, (int) c);
401 }
402 while (++c != 0);
403 }
404
405 else
406 {
407 /* ... Else catch all bytes which can start the mbchars. */
408 for (i = 0; i < cset->nmbchars; ++i)
409 {
410 char buf[256];
411 mbstate_t state;
412 memset (&state, '\0', sizeof (state));
413 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
414 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
415 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
416 {
417 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
418 != (size_t) -1)
419 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
420 }
421 }
422 }
423 }
424 #endif /* RE_ENABLE_I18N */
425 else if (type == OP_PERIOD
426 #ifdef RE_ENABLE_I18N
427 || type == OP_UTF8_PERIOD
428 #endif /* RE_ENABLE_I18N */
429 || type == END_OF_RE)
430 {
431 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
432 if (type == END_OF_RE)
433 bufp->can_be_null = 1;
434 return;
435 }
436 }
437 }
438
439 /* Entry point for POSIX code. */
440 /* regcomp takes a regular expression as a string and compiles it.
441
442 PREG is a regex_t *. We do not expect any fields to be initialized,
443 since POSIX says we shouldn't. Thus, we set
444
445 `buffer' to the compiled pattern;
446 `used' to the length of the compiled pattern;
447 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
448 REG_EXTENDED bit in CFLAGS is set; otherwise, to
449 RE_SYNTAX_POSIX_BASIC;
450 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
451 `fastmap' to an allocated space for the fastmap;
452 `fastmap_accurate' to zero;
453 `re_nsub' to the number of subexpressions in PATTERN.
454
455 PATTERN is the address of the pattern string.
456
457 CFLAGS is a series of bits which affect compilation.
458
459 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
460 use POSIX basic syntax.
461
462 If REG_NEWLINE is set, then . and [^...] don't match newline.
463 Also, regexec will try a match beginning after every newline.
464
465 If REG_ICASE is set, then we considers upper- and lowercase
466 versions of letters to be equivalent when matching.
467
468 If REG_NOSUB is set, then when PREG is passed to regexec, that
469 routine will report only success or failure, and nothing about the
470 registers.
471
472 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
473 the return codes and their meanings.) */
474
475 int
regcomp(preg,pattern,cflags)476 regcomp (preg, pattern, cflags)
477 regex_t *_Restrict_ preg;
478 const char *_Restrict_ pattern;
479 int cflags;
480 {
481 reg_errcode_t ret;
482 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
483 : RE_SYNTAX_POSIX_BASIC);
484
485 preg->buffer = NULL;
486 preg->allocated = 0;
487 preg->used = 0;
488
489 /* Try to allocate space for the fastmap. */
490 preg->fastmap = re_malloc (char, SBC_MAX);
491 if (BE (preg->fastmap == NULL, 0))
492 return REG_ESPACE;
493
494 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
495
496 /* If REG_NEWLINE is set, newlines are treated differently. */
497 if (cflags & REG_NEWLINE)
498 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
499 syntax &= ~RE_DOT_NEWLINE;
500 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
501 /* It also changes the matching behavior. */
502 preg->newline_anchor = 1;
503 }
504 else
505 preg->newline_anchor = 0;
506 preg->no_sub = !!(cflags & REG_NOSUB);
507 preg->translate = NULL;
508
509 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
510
511 /* POSIX doesn't distinguish between an unmatched open-group and an
512 unmatched close-group: both are REG_EPAREN. */
513 if (ret == REG_ERPAREN)
514 ret = REG_EPAREN;
515
516 /* We have already checked preg->fastmap != NULL. */
517 if (BE (ret == REG_NOERROR, 1))
518 /* Compute the fastmap now, since regexec cannot modify the pattern
519 buffer. This function never fails in this implementation. */
520 (void) re_compile_fastmap (preg);
521 else
522 {
523 /* Some error occurred while compiling the expression. */
524 re_free (preg->fastmap);
525 preg->fastmap = NULL;
526 }
527
528 return (int) ret;
529 }
530 #ifdef _LIBC
531 weak_alias (__regcomp, regcomp)
532 #endif
533
534 /* Returns a message corresponding to an error code, ERRCODE, returned
535 from either regcomp or regexec. We don't use PREG here. */
536
537 #ifdef _LIBC
538 size_t
539 regerror (errcode, preg, errbuf, errbuf_size)
540 int errcode;
541 const regex_t *_Restrict_ preg;
542 char *_Restrict_ errbuf;
543 size_t errbuf_size;
544 #else /* size_t might promote */
545 size_t
546 regerror (int errcode, const regex_t *_Restrict_ preg,
547 char *_Restrict_ errbuf, size_t errbuf_size)
548 #endif
549 {
550 const char *msg;
551 size_t msg_size;
552
553 if (BE (errcode < 0
554 || errcode >= (int) (sizeof (__re_error_msgid_idx)
555 / sizeof (__re_error_msgid_idx[0])), 0))
556 /* Only error codes returned by the rest of the code should be passed
557 to this routine. If we are given anything else, or if other regex
558 code generates an invalid error code, then the program has a bug.
559 Dump core so we can fix it. */
560 abort ();
561
562 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
563
564 msg_size = strlen (msg) + 1; /* Includes the null. */
565
566 if (BE (errbuf_size != 0, 1))
567 {
568 size_t cpy_size = msg_size;
569 if (BE (msg_size > errbuf_size, 0))
570 {
571 cpy_size = errbuf_size - 1;
572 errbuf[cpy_size] = '\0';
573 }
574 memcpy (errbuf, msg, cpy_size);
575 }
576
577 return msg_size;
578 }
579 #ifdef _LIBC
580 weak_alias (__regerror, regerror)
581 #endif
582
583
584 #ifdef RE_ENABLE_I18N
585 /* This static array is used for the map to single-byte characters when
586 UTF-8 is used. Otherwise we would allocate memory just to initialize
587 it the same all the time. UTF-8 is the preferred encoding so this is
588 a worthwhile optimization. */
589 static const bitset_t utf8_sb_map =
590 {
591 /* Set the first 128 bits. */
592 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
593 # error "bitset_word_t is narrower than 32 bits"
594 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
595 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
596 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
597 BITSET_WORD_MAX, BITSET_WORD_MAX,
598 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
599 BITSET_WORD_MAX,
600 # endif
601 (BITSET_WORD_MAX
602 >> (SBC_MAX % BITSET_WORD_BITS == 0
603 ? 0
604 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
605 };
606 #endif
607
608
609 static void
free_dfa_content(re_dfa_t * dfa)610 free_dfa_content (re_dfa_t *dfa)
611 {
612 Idx i, j;
613
614 if (dfa->nodes)
615 for (i = 0; i < dfa->nodes_len; ++i)
616 free_token (dfa->nodes + i);
617 re_free (dfa->nexts);
618 for (i = 0; i < dfa->nodes_len; ++i)
619 {
620 if (dfa->eclosures != NULL)
621 re_node_set_free (dfa->eclosures + i);
622 if (dfa->inveclosures != NULL)
623 re_node_set_free (dfa->inveclosures + i);
624 if (dfa->edests != NULL)
625 re_node_set_free (dfa->edests + i);
626 }
627 re_free (dfa->edests);
628 re_free (dfa->eclosures);
629 re_free (dfa->inveclosures);
630 re_free (dfa->nodes);
631
632 if (dfa->state_table)
633 for (i = 0; i <= dfa->state_hash_mask; ++i)
634 {
635 struct re_state_table_entry *entry = dfa->state_table + i;
636 for (j = 0; j < entry->num; ++j)
637 {
638 re_dfastate_t *state = entry->array[j];
639 free_state (state);
640 }
641 re_free (entry->array);
642 }
643 re_free (dfa->state_table);
644 #ifdef RE_ENABLE_I18N
645 if (dfa->sb_char != utf8_sb_map)
646 re_free (dfa->sb_char);
647 #endif
648 re_free (dfa->subexp_map);
649 #ifdef DEBUG
650 re_free (dfa->re_str);
651 #endif
652
653 re_free (dfa);
654 }
655
656
657 /* Free dynamically allocated space used by PREG. */
658
659 void
regfree(preg)660 regfree (preg)
661 regex_t *preg;
662 {
663 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
664 if (BE (dfa != NULL, 1))
665 free_dfa_content (dfa);
666 preg->buffer = NULL;
667 preg->allocated = 0;
668
669 re_free (preg->fastmap);
670 preg->fastmap = NULL;
671
672 re_free (preg->translate);
673 preg->translate = NULL;
674 }
675 #ifdef _LIBC
676 weak_alias (__regfree, regfree)
677 #endif
678
679 /* Entry points compatible with 4.2 BSD regex library. We don't define
680 them unless specifically requested. */
681
682 #if defined _REGEX_RE_COMP || defined _LIBC
683
684 /* BSD has one and only one pattern buffer. */
685 static struct re_pattern_buffer re_comp_buf;
686
687 char *
688 # ifdef _LIBC
689 /* Make these definitions weak in libc, so POSIX programs can redefine
690 these names if they don't use our functions, and still use
691 regcomp/regexec above without link errors. */
692 weak_function
693 # endif
re_comp(s)694 re_comp (s)
695 const char *s;
696 {
697 reg_errcode_t ret;
698 char *fastmap;
699
700 if (!s)
701 {
702 if (!re_comp_buf.buffer)
703 return gettext ("No previous regular expression");
704 return 0;
705 }
706
707 if (re_comp_buf.buffer)
708 {
709 fastmap = re_comp_buf.fastmap;
710 re_comp_buf.fastmap = NULL;
711 __regfree (&re_comp_buf);
712 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
713 re_comp_buf.fastmap = fastmap;
714 }
715
716 if (re_comp_buf.fastmap == NULL)
717 {
718 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
719 if (re_comp_buf.fastmap == NULL)
720 return (char *) gettext (__re_error_msgid
721 + __re_error_msgid_idx[(int) REG_ESPACE]);
722 }
723
724 /* Since `re_exec' always passes NULL for the `regs' argument, we
725 don't need to initialize the pattern buffer fields which affect it. */
726
727 /* Match anchors at newlines. */
728 re_comp_buf.newline_anchor = 1;
729
730 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
731
732 if (!ret)
733 return NULL;
734
735 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
736 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
737 }
738
739 #ifdef _LIBC
libc_freeres_fn(free_mem)740 libc_freeres_fn (free_mem)
741 {
742 __regfree (&re_comp_buf);
743 }
744 #endif
745
746 #endif /* _REGEX_RE_COMP */
747
748 /* Internal entry point.
749 Compile the regular expression PATTERN, whose length is LENGTH.
750 SYNTAX indicate regular expression's syntax. */
751
752 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)753 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
754 reg_syntax_t syntax)
755 {
756 reg_errcode_t err = REG_NOERROR;
757 re_dfa_t *dfa;
758 re_string_t regexp;
759
760 /* Initialize the pattern buffer. */
761 preg->fastmap_accurate = 0;
762 preg->syntax = syntax;
763 preg->not_bol = preg->not_eol = 0;
764 preg->used = 0;
765 preg->re_nsub = 0;
766 preg->can_be_null = 0;
767 preg->regs_allocated = REGS_UNALLOCATED;
768
769 /* Initialize the dfa. */
770 dfa = (re_dfa_t *) preg->buffer;
771 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
772 {
773 /* If zero allocated, but buffer is non-null, try to realloc
774 enough space. This loses if buffer's address is bogus, but
775 that is the user's responsibility. If ->buffer is NULL this
776 is a simple allocation. */
777 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
778 if (dfa == NULL)
779 return REG_ESPACE;
780 preg->allocated = sizeof (re_dfa_t);
781 preg->buffer = (unsigned char *) dfa;
782 }
783 preg->used = sizeof (re_dfa_t);
784
785 err = init_dfa (dfa, length);
786 if (BE (err != REG_NOERROR, 0))
787 {
788 free_dfa_content (dfa);
789 preg->buffer = NULL;
790 preg->allocated = 0;
791 return err;
792 }
793 #ifdef DEBUG
794 /* Note: length+1 will not overflow since it is checked in init_dfa. */
795 dfa->re_str = re_malloc (char, length + 1);
796 strncpy (dfa->re_str, pattern, length + 1);
797 #endif
798
799 __libc_lock_init (dfa->lock);
800
801 err = re_string_construct (®exp, pattern, length, preg->translate,
802 (syntax & RE_ICASE) != 0, dfa);
803 if (BE (err != REG_NOERROR, 0))
804 {
805 re_compile_internal_free_return:
806 free_workarea_compile (preg);
807 re_string_destruct (®exp);
808 free_dfa_content (dfa);
809 preg->buffer = NULL;
810 preg->allocated = 0;
811 return err;
812 }
813
814 /* Parse the regular expression, and build a structure tree. */
815 preg->re_nsub = 0;
816 dfa->str_tree = parse (®exp, preg, syntax, &err);
817 if (BE (dfa->str_tree == NULL, 0))
818 goto re_compile_internal_free_return;
819
820 /* Analyze the tree and create the nfa. */
821 err = analyze (preg);
822 if (BE (err != REG_NOERROR, 0))
823 goto re_compile_internal_free_return;
824
825 #ifdef RE_ENABLE_I18N
826 /* If possible, do searching in single byte encoding to speed things up. */
827 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
828 optimize_utf8 (dfa);
829 #endif
830
831 /* Then create the initial state of the dfa. */
832 err = create_initial_state (dfa);
833
834 /* Release work areas. */
835 free_workarea_compile (preg);
836 re_string_destruct (®exp);
837
838 if (BE (err != REG_NOERROR, 0))
839 {
840 free_dfa_content (dfa);
841 preg->buffer = NULL;
842 preg->allocated = 0;
843 }
844
845 return err;
846 }
847
848 /* Initialize DFA. We use the length of the regular expression PAT_LEN
849 as the initial length of some arrays. */
850
851 static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)852 init_dfa (re_dfa_t *dfa, size_t pat_len)
853 {
854 __re_size_t table_size;
855 #ifdef RE_ENABLE_I18N
856 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
857 #else
858 size_t max_i18n_object_size = 0;
859 #endif
860 size_t max_object_size =
861 MAX (sizeof (struct re_state_table_entry),
862 MAX (sizeof (re_token_t),
863 MAX (sizeof (re_node_set),
864 MAX (sizeof (regmatch_t),
865 max_i18n_object_size))));
866
867 memset (dfa, '\0', sizeof (re_dfa_t));
868
869 /* Force allocation of str_tree_storage the first time. */
870 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
871
872 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
873 calculation below, and for similar doubling calculations
874 elsewhere. And it's <= rather than <, because some of the
875 doubling calculations add 1 afterwards. */
876 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
877 return REG_ESPACE;
878
879 dfa->nodes_alloc = pat_len + 1;
880 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
881
882 /* table_size = 2 ^ ceil(log pat_len) */
883 for (table_size = 1; ; table_size <<= 1)
884 if (table_size > pat_len)
885 break;
886
887 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
888 dfa->state_hash_mask = table_size - 1;
889
890 dfa->mb_cur_max = MB_CUR_MAX;
891 #ifdef _LIBC
892 if (dfa->mb_cur_max == 6
893 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
894 dfa->is_utf8 = 1;
895 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
896 != 0);
897 #else
898 if (strcmp (locale_charset (), "UTF-8") == 0)
899 dfa->is_utf8 = 1;
900
901 /* We check exhaustively in the loop below if this charset is a
902 superset of ASCII. */
903 dfa->map_notascii = 0;
904 #endif
905
906 #ifdef RE_ENABLE_I18N
907 if (dfa->mb_cur_max > 1)
908 {
909 if (dfa->is_utf8)
910 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
911 else
912 {
913 int i, j, ch;
914
915 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
916 if (BE (dfa->sb_char == NULL, 0))
917 return REG_ESPACE;
918
919 /* Set the bits corresponding to single byte chars. */
920 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
921 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
922 {
923 wint_t wch = __btowc (ch);
924 if (wch != WEOF)
925 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
926 # ifndef _LIBC
927 if (isascii (ch) && wch != ch)
928 dfa->map_notascii = 1;
929 # endif
930 }
931 }
932 }
933 #endif
934
935 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
936 return REG_ESPACE;
937 return REG_NOERROR;
938 }
939
940 /* Initialize WORD_CHAR table, which indicate which character is
941 "word". In this case "word" means that it is the word construction
942 character used by some operators like "\<", "\>", etc. */
943
944 static void
945 internal_function
init_word_char(re_dfa_t * dfa)946 init_word_char (re_dfa_t *dfa)
947 {
948 int i, j, ch;
949 dfa->word_ops_used = 1;
950 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
951 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
952 if (isalnum (ch) || ch == '_')
953 dfa->word_char[i] |= (bitset_word_t) 1 << j;
954 }
955
956 /* Free the work area which are only used while compiling. */
957
958 static void
free_workarea_compile(regex_t * preg)959 free_workarea_compile (regex_t *preg)
960 {
961 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
962 bin_tree_storage_t *storage, *next;
963 for (storage = dfa->str_tree_storage; storage; storage = next)
964 {
965 next = storage->next;
966 re_free (storage);
967 }
968 dfa->str_tree_storage = NULL;
969 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
970 dfa->str_tree = NULL;
971 re_free (dfa->org_indices);
972 dfa->org_indices = NULL;
973 }
974
975 /* Create initial states for all contexts. */
976
977 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)978 create_initial_state (re_dfa_t *dfa)
979 {
980 Idx first, i;
981 reg_errcode_t err;
982 re_node_set init_nodes;
983
984 /* Initial states have the epsilon closure of the node which is
985 the first node of the regular expression. */
986 first = dfa->str_tree->first->node_idx;
987 dfa->init_node = first;
988 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
989 if (BE (err != REG_NOERROR, 0))
990 return err;
991
992 /* The back-references which are in initial states can epsilon transit,
993 since in this case all of the subexpressions can be null.
994 Then we add epsilon closures of the nodes which are the next nodes of
995 the back-references. */
996 if (dfa->nbackref > 0)
997 for (i = 0; i < init_nodes.nelem; ++i)
998 {
999 Idx node_idx = init_nodes.elems[i];
1000 re_token_type_t type = dfa->nodes[node_idx].type;
1001
1002 Idx clexp_idx;
1003 if (type != OP_BACK_REF)
1004 continue;
1005 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1006 {
1007 re_token_t *clexp_node;
1008 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1009 if (clexp_node->type == OP_CLOSE_SUBEXP
1010 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1011 break;
1012 }
1013 if (clexp_idx == init_nodes.nelem)
1014 continue;
1015
1016 if (type == OP_BACK_REF)
1017 {
1018 Idx dest_idx = dfa->edests[node_idx].elems[0];
1019 if (!re_node_set_contains (&init_nodes, dest_idx))
1020 {
1021 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1022 i = 0;
1023 }
1024 }
1025 }
1026
1027 /* It must be the first time to invoke acquire_state. */
1028 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1029 /* We don't check ERR here, since the initial state must not be NULL. */
1030 if (BE (dfa->init_state == NULL, 0))
1031 return err;
1032 if (dfa->init_state->has_constraint)
1033 {
1034 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1035 CONTEXT_WORD);
1036 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1037 CONTEXT_NEWLINE);
1038 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1039 &init_nodes,
1040 CONTEXT_NEWLINE
1041 | CONTEXT_BEGBUF);
1042 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1043 || dfa->init_state_begbuf == NULL, 0))
1044 return err;
1045 }
1046 else
1047 dfa->init_state_word = dfa->init_state_nl
1048 = dfa->init_state_begbuf = dfa->init_state;
1049
1050 re_node_set_free (&init_nodes);
1051 return REG_NOERROR;
1052 }
1053
1054 #ifdef RE_ENABLE_I18N
1055 /* If it is possible to do searching in single byte encoding instead of UTF-8
1056 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1057 DFA nodes where needed. */
1058
1059 static void
optimize_utf8(re_dfa_t * dfa)1060 optimize_utf8 (re_dfa_t *dfa)
1061 {
1062 Idx node;
1063 int i;
1064 bool mb_chars = false;
1065 bool has_period = false;
1066
1067 for (node = 0; node < dfa->nodes_len; ++node)
1068 switch (dfa->nodes[node].type)
1069 {
1070 case CHARACTER:
1071 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1072 mb_chars = true;
1073 break;
1074 case ANCHOR:
1075 switch (dfa->nodes[node].opr.ctx_type)
1076 {
1077 case LINE_FIRST:
1078 case LINE_LAST:
1079 case BUF_FIRST:
1080 case BUF_LAST:
1081 break;
1082 default:
1083 /* Word anchors etc. cannot be handled. It's okay to test
1084 opr.ctx_type since constraints (for all DFA nodes) are
1085 created by ORing one or more opr.ctx_type values. */
1086 return;
1087 }
1088 break;
1089 case OP_PERIOD:
1090 has_period = true;
1091 break;
1092 case OP_BACK_REF:
1093 case OP_ALT:
1094 case END_OF_RE:
1095 case OP_DUP_ASTERISK:
1096 case OP_OPEN_SUBEXP:
1097 case OP_CLOSE_SUBEXP:
1098 break;
1099 case COMPLEX_BRACKET:
1100 return;
1101 case SIMPLE_BRACKET:
1102 /* Just double check. */
1103 {
1104 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1105 ? 0
1106 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1107 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1108 {
1109 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1110 return;
1111 rshift = 0;
1112 }
1113 }
1114 break;
1115 default:
1116 abort ();
1117 }
1118
1119 if (mb_chars || has_period)
1120 for (node = 0; node < dfa->nodes_len; ++node)
1121 {
1122 if (dfa->nodes[node].type == CHARACTER
1123 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1124 dfa->nodes[node].mb_partial = 0;
1125 else if (dfa->nodes[node].type == OP_PERIOD)
1126 dfa->nodes[node].type = OP_UTF8_PERIOD;
1127 }
1128
1129 /* The search can be in single byte locale. */
1130 dfa->mb_cur_max = 1;
1131 dfa->is_utf8 = 0;
1132 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1133 }
1134 #endif
1135
1136 /* Analyze the structure tree, and calculate "first", "next", "edest",
1137 "eclosure", and "inveclosure". */
1138
1139 static reg_errcode_t
analyze(regex_t * preg)1140 analyze (regex_t *preg)
1141 {
1142 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1143 reg_errcode_t ret;
1144
1145 /* Allocate arrays. */
1146 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1147 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1148 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1149 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1150 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1151 || dfa->eclosures == NULL, 0))
1152 return REG_ESPACE;
1153
1154 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1155 if (dfa->subexp_map != NULL)
1156 {
1157 Idx i;
1158 for (i = 0; i < preg->re_nsub; i++)
1159 dfa->subexp_map[i] = i;
1160 preorder (dfa->str_tree, optimize_subexps, dfa);
1161 for (i = 0; i < preg->re_nsub; i++)
1162 if (dfa->subexp_map[i] != i)
1163 break;
1164 if (i == preg->re_nsub)
1165 {
1166 free (dfa->subexp_map);
1167 dfa->subexp_map = NULL;
1168 }
1169 }
1170
1171 ret = postorder (dfa->str_tree, lower_subexps, preg);
1172 if (BE (ret != REG_NOERROR, 0))
1173 return ret;
1174 ret = postorder (dfa->str_tree, calc_first, dfa);
1175 if (BE (ret != REG_NOERROR, 0))
1176 return ret;
1177 preorder (dfa->str_tree, calc_next, dfa);
1178 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1179 if (BE (ret != REG_NOERROR, 0))
1180 return ret;
1181 ret = calc_eclosure (dfa);
1182 if (BE (ret != REG_NOERROR, 0))
1183 return ret;
1184
1185 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1186 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1187 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1188 || dfa->nbackref)
1189 {
1190 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1191 if (BE (dfa->inveclosures == NULL, 0))
1192 return REG_ESPACE;
1193 ret = calc_inveclosure (dfa);
1194 }
1195
1196 return ret;
1197 }
1198
1199 /* Our parse trees are very unbalanced, so we cannot use a stack to
1200 implement parse tree visits. Instead, we use parent pointers and
1201 some hairy code in these two functions. */
1202 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1203 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1204 void *extra)
1205 {
1206 bin_tree_t *node, *prev;
1207
1208 for (node = root; ; )
1209 {
1210 /* Descend down the tree, preferably to the left (or to the right
1211 if that's the only child). */
1212 while (node->left || node->right)
1213 if (node->left)
1214 node = node->left;
1215 else
1216 node = node->right;
1217
1218 do
1219 {
1220 reg_errcode_t err = fn (extra, node);
1221 if (BE (err != REG_NOERROR, 0))
1222 return err;
1223 if (node->parent == NULL)
1224 return REG_NOERROR;
1225 prev = node;
1226 node = node->parent;
1227 }
1228 /* Go up while we have a node that is reached from the right. */
1229 while (node->right == prev || node->right == NULL);
1230 node = node->right;
1231 }
1232 }
1233
1234 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1235 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1236 void *extra)
1237 {
1238 bin_tree_t *node;
1239
1240 for (node = root; ; )
1241 {
1242 reg_errcode_t err = fn (extra, node);
1243 if (BE (err != REG_NOERROR, 0))
1244 return err;
1245
1246 /* Go to the left node, or up and to the right. */
1247 if (node->left)
1248 node = node->left;
1249 else
1250 {
1251 bin_tree_t *prev = NULL;
1252 while (node->right == prev || node->right == NULL)
1253 {
1254 prev = node;
1255 node = node->parent;
1256 if (!node)
1257 return REG_NOERROR;
1258 }
1259 node = node->right;
1260 }
1261 }
1262 }
1263
1264 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1265 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1266 backreferences as well. Requires a preorder visit. */
1267 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1268 optimize_subexps (void *extra, bin_tree_t *node)
1269 {
1270 re_dfa_t *dfa = (re_dfa_t *) extra;
1271
1272 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1273 {
1274 int idx = node->token.opr.idx;
1275 node->token.opr.idx = dfa->subexp_map[idx];
1276 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1277 }
1278
1279 else if (node->token.type == SUBEXP
1280 && node->left && node->left->token.type == SUBEXP)
1281 {
1282 Idx other_idx = node->left->token.opr.idx;
1283
1284 node->left = node->left->left;
1285 if (node->left)
1286 node->left->parent = node;
1287
1288 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1289 if (other_idx < BITSET_WORD_BITS)
1290 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1291 }
1292
1293 return REG_NOERROR;
1294 }
1295
1296 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1297 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1298 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1299 lower_subexps (void *extra, bin_tree_t *node)
1300 {
1301 regex_t *preg = (regex_t *) extra;
1302 reg_errcode_t err = REG_NOERROR;
1303
1304 if (node->left && node->left->token.type == SUBEXP)
1305 {
1306 node->left = lower_subexp (&err, preg, node->left);
1307 if (node->left)
1308 node->left->parent = node;
1309 }
1310 if (node->right && node->right->token.type == SUBEXP)
1311 {
1312 node->right = lower_subexp (&err, preg, node->right);
1313 if (node->right)
1314 node->right->parent = node;
1315 }
1316
1317 return err;
1318 }
1319
1320 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1321 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1322 {
1323 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1324 bin_tree_t *body = node->left;
1325 bin_tree_t *op, *cls, *tree1, *tree;
1326
1327 if (preg->no_sub
1328 /* We do not optimize empty subexpressions, because otherwise we may
1329 have bad CONCAT nodes with NULL children. This is obviously not
1330 very common, so we do not lose much. An example that triggers
1331 this case is the sed "script" /\(\)/x. */
1332 && node->left != NULL
1333 && (node->token.opr.idx >= BITSET_WORD_BITS
1334 || !(dfa->used_bkref_map
1335 & ((bitset_word_t) 1 << node->token.opr.idx))))
1336 return node->left;
1337
1338 /* Convert the SUBEXP node to the concatenation of an
1339 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1340 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1341 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1342 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1343 tree = create_tree (dfa, op, tree1, CONCAT);
1344 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1345 {
1346 *err = REG_ESPACE;
1347 return NULL;
1348 }
1349
1350 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1351 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1352 return tree;
1353 }
1354
1355 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1356 nodes. Requires a postorder visit. */
1357 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1358 calc_first (void *extra, bin_tree_t *node)
1359 {
1360 re_dfa_t *dfa = (re_dfa_t *) extra;
1361 if (node->token.type == CONCAT)
1362 {
1363 node->first = node->left->first;
1364 node->node_idx = node->left->node_idx;
1365 }
1366 else
1367 {
1368 node->first = node;
1369 node->node_idx = re_dfa_add_node (dfa, node->token);
1370 if (BE (node->node_idx == REG_MISSING, 0))
1371 return REG_ESPACE;
1372 if (node->token.type == ANCHOR)
1373 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1374 }
1375 return REG_NOERROR;
1376 }
1377
1378 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1379 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1380 calc_next (void *extra, bin_tree_t *node)
1381 {
1382 switch (node->token.type)
1383 {
1384 case OP_DUP_ASTERISK:
1385 node->left->next = node;
1386 break;
1387 case CONCAT:
1388 node->left->next = node->right->first;
1389 node->right->next = node->next;
1390 break;
1391 default:
1392 if (node->left)
1393 node->left->next = node->next;
1394 if (node->right)
1395 node->right->next = node->next;
1396 break;
1397 }
1398 return REG_NOERROR;
1399 }
1400
1401 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1402 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1403 link_nfa_nodes (void *extra, bin_tree_t *node)
1404 {
1405 re_dfa_t *dfa = (re_dfa_t *) extra;
1406 Idx idx = node->node_idx;
1407 reg_errcode_t err = REG_NOERROR;
1408
1409 switch (node->token.type)
1410 {
1411 case CONCAT:
1412 break;
1413
1414 case END_OF_RE:
1415 assert (node->next == NULL);
1416 break;
1417
1418 case OP_DUP_ASTERISK:
1419 case OP_ALT:
1420 {
1421 Idx left, right;
1422 dfa->has_plural_match = 1;
1423 if (node->left != NULL)
1424 left = node->left->first->node_idx;
1425 else
1426 left = node->next->node_idx;
1427 if (node->right != NULL)
1428 right = node->right->first->node_idx;
1429 else
1430 right = node->next->node_idx;
1431 assert (REG_VALID_INDEX (left));
1432 assert (REG_VALID_INDEX (right));
1433 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1434 }
1435 break;
1436
1437 case ANCHOR:
1438 case OP_OPEN_SUBEXP:
1439 case OP_CLOSE_SUBEXP:
1440 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1441 break;
1442
1443 case OP_BACK_REF:
1444 dfa->nexts[idx] = node->next->node_idx;
1445 if (node->token.type == OP_BACK_REF)
1446 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1447 break;
1448
1449 default:
1450 assert (!IS_EPSILON_NODE (node->token.type));
1451 dfa->nexts[idx] = node->next->node_idx;
1452 break;
1453 }
1454
1455 return err;
1456 }
1457
1458 /* Duplicate the epsilon closure of the node ROOT_NODE.
1459 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1460 to their own constraint. */
1461
1462 static reg_errcode_t
1463 internal_function
duplicate_node_closure(re_dfa_t * dfa,Idx top_org_node,Idx top_clone_node,Idx root_node,unsigned int init_constraint)1464 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1465 Idx root_node, unsigned int init_constraint)
1466 {
1467 Idx org_node, clone_node;
1468 bool ok;
1469 unsigned int constraint = init_constraint;
1470 for (org_node = top_org_node, clone_node = top_clone_node;;)
1471 {
1472 Idx org_dest, clone_dest;
1473 if (dfa->nodes[org_node].type == OP_BACK_REF)
1474 {
1475 /* If the back reference epsilon-transit, its destination must
1476 also have the constraint. Then duplicate the epsilon closure
1477 of the destination of the back reference, and store it in
1478 edests of the back reference. */
1479 org_dest = dfa->nexts[org_node];
1480 re_node_set_empty (dfa->edests + clone_node);
1481 clone_dest = duplicate_node (dfa, org_dest, constraint);
1482 if (BE (clone_dest == REG_MISSING, 0))
1483 return REG_ESPACE;
1484 dfa->nexts[clone_node] = dfa->nexts[org_node];
1485 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1486 if (BE (! ok, 0))
1487 return REG_ESPACE;
1488 }
1489 else if (dfa->edests[org_node].nelem == 0)
1490 {
1491 /* In case of the node can't epsilon-transit, don't duplicate the
1492 destination and store the original destination as the
1493 destination of the node. */
1494 dfa->nexts[clone_node] = dfa->nexts[org_node];
1495 break;
1496 }
1497 else if (dfa->edests[org_node].nelem == 1)
1498 {
1499 /* In case of the node can epsilon-transit, and it has only one
1500 destination. */
1501 org_dest = dfa->edests[org_node].elems[0];
1502 re_node_set_empty (dfa->edests + clone_node);
1503 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1504 /* If the node is root_node itself, it means the epsilon closure
1505 has a loop. Then tie it to the destination of the root_node. */
1506 if (org_node == root_node && clone_node != org_node)
1507 {
1508 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1509 if (BE (! ok, 0))
1510 return REG_ESPACE;
1511 break;
1512 }
1513 /* In case the node has another constraint, append it. */
1514 constraint |= dfa->nodes[org_node].constraint;
1515 clone_dest = duplicate_node (dfa, org_dest, constraint);
1516 if (BE (clone_dest == REG_MISSING, 0))
1517 return REG_ESPACE;
1518 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1519 if (BE (! ok, 0))
1520 return REG_ESPACE;
1521 }
1522 else /* dfa->edests[org_node].nelem == 2 */
1523 {
1524 /* In case of the node can epsilon-transit, and it has two
1525 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1526 org_dest = dfa->edests[org_node].elems[0];
1527 re_node_set_empty (dfa->edests + clone_node);
1528 /* Search for a duplicated node which satisfies the constraint. */
1529 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1530 if (clone_dest == REG_MISSING)
1531 {
1532 /* There is no such duplicated node, create a new one. */
1533 reg_errcode_t err;
1534 clone_dest = duplicate_node (dfa, org_dest, constraint);
1535 if (BE (clone_dest == REG_MISSING, 0))
1536 return REG_ESPACE;
1537 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1538 if (BE (! ok, 0))
1539 return REG_ESPACE;
1540 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1541 root_node, constraint);
1542 if (BE (err != REG_NOERROR, 0))
1543 return err;
1544 }
1545 else
1546 {
1547 /* There is a duplicated node which satisfy the constraint,
1548 use it to avoid infinite loop. */
1549 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1550 if (BE (! ok, 0))
1551 return REG_ESPACE;
1552 }
1553
1554 org_dest = dfa->edests[org_node].elems[1];
1555 clone_dest = duplicate_node (dfa, org_dest, constraint);
1556 if (BE (clone_dest == REG_MISSING, 0))
1557 return REG_ESPACE;
1558 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1559 if (BE (! ok, 0))
1560 return REG_ESPACE;
1561 }
1562 org_node = org_dest;
1563 clone_node = clone_dest;
1564 }
1565 return REG_NOERROR;
1566 }
1567
1568 /* Search for a node which is duplicated from the node ORG_NODE, and
1569 satisfies the constraint CONSTRAINT. */
1570
1571 static Idx
search_duplicated_node(const re_dfa_t * dfa,Idx org_node,unsigned int constraint)1572 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1573 unsigned int constraint)
1574 {
1575 Idx idx;
1576 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1577 {
1578 if (org_node == dfa->org_indices[idx]
1579 && constraint == dfa->nodes[idx].constraint)
1580 return idx; /* Found. */
1581 }
1582 return REG_MISSING; /* Not found. */
1583 }
1584
1585 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1586 Return the index of the new node, or REG_MISSING if insufficient storage is
1587 available. */
1588
1589 static Idx
duplicate_node(re_dfa_t * dfa,Idx org_idx,unsigned int constraint)1590 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1591 {
1592 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1593 if (BE (dup_idx != REG_MISSING, 1))
1594 {
1595 dfa->nodes[dup_idx].constraint = constraint;
1596 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1597 dfa->nodes[dup_idx].duplicated = 1;
1598
1599 /* Store the index of the original node. */
1600 dfa->org_indices[dup_idx] = org_idx;
1601 }
1602 return dup_idx;
1603 }
1604
1605 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1606 calc_inveclosure (re_dfa_t *dfa)
1607 {
1608 Idx src, idx;
1609 bool ok;
1610 for (idx = 0; idx < dfa->nodes_len; ++idx)
1611 re_node_set_init_empty (dfa->inveclosures + idx);
1612
1613 for (src = 0; src < dfa->nodes_len; ++src)
1614 {
1615 Idx *elems = dfa->eclosures[src].elems;
1616 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1617 {
1618 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1619 if (BE (! ok, 0))
1620 return REG_ESPACE;
1621 }
1622 }
1623
1624 return REG_NOERROR;
1625 }
1626
1627 /* Calculate "eclosure" for all the node in DFA. */
1628
1629 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1630 calc_eclosure (re_dfa_t *dfa)
1631 {
1632 Idx node_idx;
1633 bool incomplete;
1634 #ifdef DEBUG
1635 assert (dfa->nodes_len > 0);
1636 #endif
1637 incomplete = false;
1638 /* For each nodes, calculate epsilon closure. */
1639 for (node_idx = 0; ; ++node_idx)
1640 {
1641 reg_errcode_t err;
1642 re_node_set eclosure_elem;
1643 if (node_idx == dfa->nodes_len)
1644 {
1645 if (!incomplete)
1646 break;
1647 incomplete = false;
1648 node_idx = 0;
1649 }
1650
1651 #ifdef DEBUG
1652 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1653 #endif
1654
1655 /* If we have already calculated, skip it. */
1656 if (dfa->eclosures[node_idx].nelem != 0)
1657 continue;
1658 /* Calculate epsilon closure of `node_idx'. */
1659 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1660 if (BE (err != REG_NOERROR, 0))
1661 return err;
1662
1663 if (dfa->eclosures[node_idx].nelem == 0)
1664 {
1665 incomplete = true;
1666 re_node_set_free (&eclosure_elem);
1667 }
1668 }
1669 return REG_NOERROR;
1670 }
1671
1672 /* Calculate epsilon closure of NODE. */
1673
1674 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,Idx node,bool root)1675 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1676 {
1677 reg_errcode_t err;
1678 Idx i;
1679 bool incomplete;
1680 bool ok;
1681 re_node_set eclosure;
1682 incomplete = false;
1683 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1684 if (BE (err != REG_NOERROR, 0))
1685 return err;
1686
1687 /* This indicates that we are calculating this node now.
1688 We reference this value to avoid infinite loop. */
1689 dfa->eclosures[node].nelem = REG_MISSING;
1690
1691 /* If the current node has constraints, duplicate all nodes
1692 since they must inherit the constraints. */
1693 if (dfa->nodes[node].constraint
1694 && dfa->edests[node].nelem
1695 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1696 {
1697 err = duplicate_node_closure (dfa, node, node, node,
1698 dfa->nodes[node].constraint);
1699 if (BE (err != REG_NOERROR, 0))
1700 return err;
1701 }
1702
1703 /* Expand each epsilon destination nodes. */
1704 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1705 for (i = 0; i < dfa->edests[node].nelem; ++i)
1706 {
1707 re_node_set eclosure_elem;
1708 Idx edest = dfa->edests[node].elems[i];
1709 /* If calculating the epsilon closure of `edest' is in progress,
1710 return intermediate result. */
1711 if (dfa->eclosures[edest].nelem == REG_MISSING)
1712 {
1713 incomplete = true;
1714 continue;
1715 }
1716 /* If we haven't calculated the epsilon closure of `edest' yet,
1717 calculate now. Otherwise use calculated epsilon closure. */
1718 if (dfa->eclosures[edest].nelem == 0)
1719 {
1720 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1721 if (BE (err != REG_NOERROR, 0))
1722 return err;
1723 }
1724 else
1725 eclosure_elem = dfa->eclosures[edest];
1726 /* Merge the epsilon closure of `edest'. */
1727 re_node_set_merge (&eclosure, &eclosure_elem);
1728 /* If the epsilon closure of `edest' is incomplete,
1729 the epsilon closure of this node is also incomplete. */
1730 if (dfa->eclosures[edest].nelem == 0)
1731 {
1732 incomplete = true;
1733 re_node_set_free (&eclosure_elem);
1734 }
1735 }
1736
1737 /* Epsilon closures include itself. */
1738 ok = re_node_set_insert (&eclosure, node);
1739 if (BE (! ok, 0))
1740 return REG_ESPACE;
1741 if (incomplete && !root)
1742 dfa->eclosures[node].nelem = 0;
1743 else
1744 dfa->eclosures[node] = eclosure;
1745 *new_set = eclosure;
1746 return REG_NOERROR;
1747 }
1748
1749 /* Functions for token which are used in the parser. */
1750
1751 /* Fetch a token from INPUT.
1752 We must not use this function inside bracket expressions. */
1753
1754 static void
1755 internal_function
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1756 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1757 {
1758 re_string_skip_bytes (input, peek_token (result, input, syntax));
1759 }
1760
1761 /* Peek a token from INPUT, and return the length of the token.
1762 We must not use this function inside bracket expressions. */
1763
1764 static int
1765 internal_function
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1766 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1767 {
1768 unsigned char c;
1769
1770 if (re_string_eoi (input))
1771 {
1772 token->type = END_OF_RE;
1773 return 0;
1774 }
1775
1776 c = re_string_peek_byte (input, 0);
1777 token->opr.c = c;
1778
1779 token->word_char = 0;
1780 #ifdef RE_ENABLE_I18N
1781 token->mb_partial = 0;
1782 if (input->mb_cur_max > 1 &&
1783 !re_string_first_byte (input, re_string_cur_idx (input)))
1784 {
1785 token->type = CHARACTER;
1786 token->mb_partial = 1;
1787 return 1;
1788 }
1789 #endif
1790 if (c == '\\')
1791 {
1792 unsigned char c2;
1793 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1794 {
1795 token->type = BACK_SLASH;
1796 return 1;
1797 }
1798
1799 c2 = re_string_peek_byte_case (input, 1);
1800 token->opr.c = c2;
1801 token->type = CHARACTER;
1802 #ifdef RE_ENABLE_I18N
1803 if (input->mb_cur_max > 1)
1804 {
1805 wint_t wc = re_string_wchar_at (input,
1806 re_string_cur_idx (input) + 1);
1807 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1808 }
1809 else
1810 #endif
1811 token->word_char = IS_WORD_CHAR (c2) != 0;
1812
1813 switch (c2)
1814 {
1815 case '|':
1816 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1817 token->type = OP_ALT;
1818 break;
1819 case '1': case '2': case '3': case '4': case '5':
1820 case '6': case '7': case '8': case '9':
1821 if (!(syntax & RE_NO_BK_REFS))
1822 {
1823 token->type = OP_BACK_REF;
1824 token->opr.idx = c2 - '1';
1825 }
1826 break;
1827 case '<':
1828 if (!(syntax & RE_NO_GNU_OPS))
1829 {
1830 token->type = ANCHOR;
1831 token->opr.ctx_type = WORD_FIRST;
1832 }
1833 break;
1834 case '>':
1835 if (!(syntax & RE_NO_GNU_OPS))
1836 {
1837 token->type = ANCHOR;
1838 token->opr.ctx_type = WORD_LAST;
1839 }
1840 break;
1841 case 'b':
1842 if (!(syntax & RE_NO_GNU_OPS))
1843 {
1844 token->type = ANCHOR;
1845 token->opr.ctx_type = WORD_DELIM;
1846 }
1847 break;
1848 case 'B':
1849 if (!(syntax & RE_NO_GNU_OPS))
1850 {
1851 token->type = ANCHOR;
1852 token->opr.ctx_type = NOT_WORD_DELIM;
1853 }
1854 break;
1855 case 'w':
1856 if (!(syntax & RE_NO_GNU_OPS))
1857 token->type = OP_WORD;
1858 break;
1859 case 'W':
1860 if (!(syntax & RE_NO_GNU_OPS))
1861 token->type = OP_NOTWORD;
1862 break;
1863 case 's':
1864 if (!(syntax & RE_NO_GNU_OPS))
1865 token->type = OP_SPACE;
1866 break;
1867 case 'S':
1868 if (!(syntax & RE_NO_GNU_OPS))
1869 token->type = OP_NOTSPACE;
1870 break;
1871 case '`':
1872 if (!(syntax & RE_NO_GNU_OPS))
1873 {
1874 token->type = ANCHOR;
1875 token->opr.ctx_type = BUF_FIRST;
1876 }
1877 break;
1878 case '\'':
1879 if (!(syntax & RE_NO_GNU_OPS))
1880 {
1881 token->type = ANCHOR;
1882 token->opr.ctx_type = BUF_LAST;
1883 }
1884 break;
1885 case '(':
1886 if (!(syntax & RE_NO_BK_PARENS))
1887 token->type = OP_OPEN_SUBEXP;
1888 break;
1889 case ')':
1890 if (!(syntax & RE_NO_BK_PARENS))
1891 token->type = OP_CLOSE_SUBEXP;
1892 break;
1893 case '+':
1894 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1895 token->type = OP_DUP_PLUS;
1896 break;
1897 case '?':
1898 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1899 token->type = OP_DUP_QUESTION;
1900 break;
1901 case '{':
1902 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1903 token->type = OP_OPEN_DUP_NUM;
1904 break;
1905 case '}':
1906 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1907 token->type = OP_CLOSE_DUP_NUM;
1908 break;
1909 default:
1910 break;
1911 }
1912 return 2;
1913 }
1914
1915 token->type = CHARACTER;
1916 #ifdef RE_ENABLE_I18N
1917 if (input->mb_cur_max > 1)
1918 {
1919 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1920 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1921 }
1922 else
1923 #endif
1924 token->word_char = IS_WORD_CHAR (token->opr.c);
1925
1926 switch (c)
1927 {
1928 case '\n':
1929 if (syntax & RE_NEWLINE_ALT)
1930 token->type = OP_ALT;
1931 break;
1932 case '|':
1933 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1934 token->type = OP_ALT;
1935 break;
1936 case '*':
1937 token->type = OP_DUP_ASTERISK;
1938 break;
1939 case '+':
1940 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1941 token->type = OP_DUP_PLUS;
1942 break;
1943 case '?':
1944 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1945 token->type = OP_DUP_QUESTION;
1946 break;
1947 case '{':
1948 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1949 token->type = OP_OPEN_DUP_NUM;
1950 break;
1951 case '}':
1952 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1953 token->type = OP_CLOSE_DUP_NUM;
1954 break;
1955 case '(':
1956 if (syntax & RE_NO_BK_PARENS)
1957 token->type = OP_OPEN_SUBEXP;
1958 break;
1959 case ')':
1960 if (syntax & RE_NO_BK_PARENS)
1961 token->type = OP_CLOSE_SUBEXP;
1962 break;
1963 case '[':
1964 token->type = OP_OPEN_BRACKET;
1965 break;
1966 case '.':
1967 token->type = OP_PERIOD;
1968 break;
1969 case '^':
1970 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1971 re_string_cur_idx (input) != 0)
1972 {
1973 char prev = re_string_peek_byte (input, -1);
1974 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1975 break;
1976 }
1977 token->type = ANCHOR;
1978 token->opr.ctx_type = LINE_FIRST;
1979 break;
1980 case '$':
1981 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1982 re_string_cur_idx (input) + 1 != re_string_length (input))
1983 {
1984 re_token_t next;
1985 re_string_skip_bytes (input, 1);
1986 peek_token (&next, input, syntax);
1987 re_string_skip_bytes (input, -1);
1988 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1989 break;
1990 }
1991 token->type = ANCHOR;
1992 token->opr.ctx_type = LINE_LAST;
1993 break;
1994 default:
1995 break;
1996 }
1997 return 1;
1998 }
1999
2000 /* Peek a token from INPUT, and return the length of the token.
2001 We must not use this function out of bracket expressions. */
2002
2003 static int
2004 internal_function
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)2005 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2006 {
2007 unsigned char c;
2008 if (re_string_eoi (input))
2009 {
2010 token->type = END_OF_RE;
2011 return 0;
2012 }
2013 c = re_string_peek_byte (input, 0);
2014 token->opr.c = c;
2015
2016 #ifdef RE_ENABLE_I18N
2017 if (input->mb_cur_max > 1 &&
2018 !re_string_first_byte (input, re_string_cur_idx (input)))
2019 {
2020 token->type = CHARACTER;
2021 return 1;
2022 }
2023 #endif /* RE_ENABLE_I18N */
2024
2025 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2026 && re_string_cur_idx (input) + 1 < re_string_length (input))
2027 {
2028 /* In this case, '\' escape a character. */
2029 unsigned char c2;
2030 re_string_skip_bytes (input, 1);
2031 c2 = re_string_peek_byte (input, 0);
2032 token->opr.c = c2;
2033 token->type = CHARACTER;
2034 return 1;
2035 }
2036 if (c == '[') /* '[' is a special char in a bracket exps. */
2037 {
2038 unsigned char c2;
2039 int token_len;
2040 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2041 c2 = re_string_peek_byte (input, 1);
2042 else
2043 c2 = 0;
2044 token->opr.c = c2;
2045 token_len = 2;
2046 switch (c2)
2047 {
2048 case '.':
2049 token->type = OP_OPEN_COLL_ELEM;
2050 break;
2051 case '=':
2052 token->type = OP_OPEN_EQUIV_CLASS;
2053 break;
2054 case ':':
2055 if (syntax & RE_CHAR_CLASSES)
2056 {
2057 token->type = OP_OPEN_CHAR_CLASS;
2058 break;
2059 }
2060 /* else fall through. */
2061 default:
2062 token->type = CHARACTER;
2063 token->opr.c = c;
2064 token_len = 1;
2065 break;
2066 }
2067 return token_len;
2068 }
2069 switch (c)
2070 {
2071 case '-':
2072 token->type = OP_CHARSET_RANGE;
2073 break;
2074 case ']':
2075 token->type = OP_CLOSE_BRACKET;
2076 break;
2077 case '^':
2078 token->type = OP_NON_MATCH_LIST;
2079 break;
2080 default:
2081 token->type = CHARACTER;
2082 }
2083 return 1;
2084 }
2085
2086 /* Functions for parser. */
2087
2088 /* Entry point of the parser.
2089 Parse the regular expression REGEXP and return the structure tree.
2090 If an error is occured, ERR is set by error code, and return NULL.
2091 This function build the following tree, from regular expression <reg_exp>:
2092 CAT
2093 / \
2094 / \
2095 <reg_exp> EOR
2096
2097 CAT means concatenation.
2098 EOR means end of regular expression. */
2099
2100 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2101 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2102 reg_errcode_t *err)
2103 {
2104 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2105 bin_tree_t *tree, *eor, *root;
2106 re_token_t current_token;
2107 dfa->syntax = syntax;
2108 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2109 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2110 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2111 return NULL;
2112 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2113 if (tree != NULL)
2114 root = create_tree (dfa, tree, eor, CONCAT);
2115 else
2116 root = eor;
2117 if (BE (eor == NULL || root == NULL, 0))
2118 {
2119 *err = REG_ESPACE;
2120 return NULL;
2121 }
2122 return root;
2123 }
2124
2125 /* This function build the following tree, from regular expression
2126 <branch1>|<branch2>:
2127 ALT
2128 / \
2129 / \
2130 <branch1> <branch2>
2131
2132 ALT means alternative, which represents the operator `|'. */
2133
2134 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2135 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2136 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2137 {
2138 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2139 bin_tree_t *tree, *branch = NULL;
2140 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2141 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2142 return NULL;
2143
2144 while (token->type == OP_ALT)
2145 {
2146 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2147 if (token->type != OP_ALT && token->type != END_OF_RE
2148 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2149 {
2150 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2151 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2152 return NULL;
2153 }
2154 else
2155 branch = NULL;
2156 tree = create_tree (dfa, tree, branch, OP_ALT);
2157 if (BE (tree == NULL, 0))
2158 {
2159 *err = REG_ESPACE;
2160 return NULL;
2161 }
2162 }
2163 return tree;
2164 }
2165
2166 /* This function build the following tree, from regular expression
2167 <exp1><exp2>:
2168 CAT
2169 / \
2170 / \
2171 <exp1> <exp2>
2172
2173 CAT means concatenation. */
2174
2175 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2176 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2177 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2178 {
2179 bin_tree_t *tree, *expr;
2180 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2181 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2182 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2183 return NULL;
2184
2185 while (token->type != OP_ALT && token->type != END_OF_RE
2186 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2187 {
2188 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2189 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2190 {
2191 return NULL;
2192 }
2193 if (tree != NULL && expr != NULL)
2194 {
2195 tree = create_tree (dfa, tree, expr, CONCAT);
2196 if (tree == NULL)
2197 {
2198 *err = REG_ESPACE;
2199 return NULL;
2200 }
2201 }
2202 else if (tree == NULL)
2203 tree = expr;
2204 /* Otherwise expr == NULL, we don't need to create new tree. */
2205 }
2206 return tree;
2207 }
2208
2209 /* This function build the following tree, from regular expression a*:
2210 *
2211 |
2212 a
2213 */
2214
2215 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2216 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2217 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2218 {
2219 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2220 bin_tree_t *tree;
2221 switch (token->type)
2222 {
2223 case CHARACTER:
2224 tree = create_token_tree (dfa, NULL, NULL, token);
2225 if (BE (tree == NULL, 0))
2226 {
2227 *err = REG_ESPACE;
2228 return NULL;
2229 }
2230 #ifdef RE_ENABLE_I18N
2231 if (dfa->mb_cur_max > 1)
2232 {
2233 while (!re_string_eoi (regexp)
2234 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2235 {
2236 bin_tree_t *mbc_remain;
2237 fetch_token (token, regexp, syntax);
2238 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2239 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2240 if (BE (mbc_remain == NULL || tree == NULL, 0))
2241 {
2242 *err = REG_ESPACE;
2243 return NULL;
2244 }
2245 }
2246 }
2247 #endif
2248 break;
2249 case OP_OPEN_SUBEXP:
2250 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2251 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2252 return NULL;
2253 break;
2254 case OP_OPEN_BRACKET:
2255 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2256 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2257 return NULL;
2258 break;
2259 case OP_BACK_REF:
2260 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2261 {
2262 *err = REG_ESUBREG;
2263 return NULL;
2264 }
2265 dfa->used_bkref_map |= 1 << token->opr.idx;
2266 tree = create_token_tree (dfa, NULL, NULL, token);
2267 if (BE (tree == NULL, 0))
2268 {
2269 *err = REG_ESPACE;
2270 return NULL;
2271 }
2272 ++dfa->nbackref;
2273 dfa->has_mb_node = 1;
2274 break;
2275 case OP_OPEN_DUP_NUM:
2276 if (syntax & RE_CONTEXT_INVALID_DUP)
2277 {
2278 *err = REG_BADRPT;
2279 return NULL;
2280 }
2281 /* FALLTHROUGH */
2282 case OP_DUP_ASTERISK:
2283 case OP_DUP_PLUS:
2284 case OP_DUP_QUESTION:
2285 if (syntax & RE_CONTEXT_INVALID_OPS)
2286 {
2287 *err = REG_BADRPT;
2288 return NULL;
2289 }
2290 else if (syntax & RE_CONTEXT_INDEP_OPS)
2291 {
2292 fetch_token (token, regexp, syntax);
2293 return parse_expression (regexp, preg, token, syntax, nest, err);
2294 }
2295 /* else fall through */
2296 case OP_CLOSE_SUBEXP:
2297 if ((token->type == OP_CLOSE_SUBEXP) &&
2298 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2299 {
2300 *err = REG_ERPAREN;
2301 return NULL;
2302 }
2303 /* else fall through */
2304 case OP_CLOSE_DUP_NUM:
2305 /* We treat it as a normal character. */
2306
2307 /* Then we can these characters as normal characters. */
2308 token->type = CHARACTER;
2309 /* mb_partial and word_char bits should be initialized already
2310 by peek_token. */
2311 tree = create_token_tree (dfa, NULL, NULL, token);
2312 if (BE (tree == NULL, 0))
2313 {
2314 *err = REG_ESPACE;
2315 return NULL;
2316 }
2317 break;
2318 case ANCHOR:
2319 if ((token->opr.ctx_type
2320 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2321 && dfa->word_ops_used == 0)
2322 init_word_char (dfa);
2323 if (token->opr.ctx_type == WORD_DELIM
2324 || token->opr.ctx_type == NOT_WORD_DELIM)
2325 {
2326 bin_tree_t *tree_first, *tree_last;
2327 if (token->opr.ctx_type == WORD_DELIM)
2328 {
2329 token->opr.ctx_type = WORD_FIRST;
2330 tree_first = create_token_tree (dfa, NULL, NULL, token);
2331 token->opr.ctx_type = WORD_LAST;
2332 }
2333 else
2334 {
2335 token->opr.ctx_type = INSIDE_WORD;
2336 tree_first = create_token_tree (dfa, NULL, NULL, token);
2337 token->opr.ctx_type = INSIDE_NOTWORD;
2338 }
2339 tree_last = create_token_tree (dfa, NULL, NULL, token);
2340 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2341 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2342 {
2343 *err = REG_ESPACE;
2344 return NULL;
2345 }
2346 }
2347 else
2348 {
2349 tree = create_token_tree (dfa, NULL, NULL, token);
2350 if (BE (tree == NULL, 0))
2351 {
2352 *err = REG_ESPACE;
2353 return NULL;
2354 }
2355 }
2356 /* We must return here, since ANCHORs can't be followed
2357 by repetition operators.
2358 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2359 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2360 fetch_token (token, regexp, syntax);
2361 return tree;
2362 case OP_PERIOD:
2363 tree = create_token_tree (dfa, NULL, NULL, token);
2364 if (BE (tree == NULL, 0))
2365 {
2366 *err = REG_ESPACE;
2367 return NULL;
2368 }
2369 if (dfa->mb_cur_max > 1)
2370 dfa->has_mb_node = 1;
2371 break;
2372 case OP_WORD:
2373 case OP_NOTWORD:
2374 tree = build_charclass_op (dfa, regexp->trans,
2375 (const unsigned char *) "alnum",
2376 (const unsigned char *) "_",
2377 token->type == OP_NOTWORD, err);
2378 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2379 return NULL;
2380 break;
2381 case OP_SPACE:
2382 case OP_NOTSPACE:
2383 tree = build_charclass_op (dfa, regexp->trans,
2384 (const unsigned char *) "space",
2385 (const unsigned char *) "",
2386 token->type == OP_NOTSPACE, err);
2387 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2388 return NULL;
2389 break;
2390 case OP_ALT:
2391 case END_OF_RE:
2392 return NULL;
2393 case BACK_SLASH:
2394 *err = REG_EESCAPE;
2395 return NULL;
2396 default:
2397 /* Must not happen? */
2398 #ifdef DEBUG
2399 assert (0);
2400 #endif
2401 return NULL;
2402 }
2403 fetch_token (token, regexp, syntax);
2404
2405 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2406 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2407 {
2408 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2409 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2410 return NULL;
2411 /* In BRE consecutive duplications are not allowed. */
2412 if ((syntax & RE_CONTEXT_INVALID_DUP)
2413 && (token->type == OP_DUP_ASTERISK
2414 || token->type == OP_OPEN_DUP_NUM))
2415 {
2416 *err = REG_BADRPT;
2417 return NULL;
2418 }
2419 }
2420
2421 return tree;
2422 }
2423
2424 /* This function build the following tree, from regular expression
2425 (<reg_exp>):
2426 SUBEXP
2427 |
2428 <reg_exp>
2429 */
2430
2431 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2432 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2433 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2434 {
2435 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2436 bin_tree_t *tree;
2437 size_t cur_nsub;
2438 cur_nsub = preg->re_nsub++;
2439
2440 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2441
2442 /* The subexpression may be a null string. */
2443 if (token->type == OP_CLOSE_SUBEXP)
2444 tree = NULL;
2445 else
2446 {
2447 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2448 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2449 *err = REG_EPAREN;
2450 if (BE (*err != REG_NOERROR, 0))
2451 return NULL;
2452 }
2453
2454 if (cur_nsub <= '9' - '1')
2455 dfa->completed_bkref_map |= 1 << cur_nsub;
2456
2457 tree = create_tree (dfa, tree, NULL, SUBEXP);
2458 if (BE (tree == NULL, 0))
2459 {
2460 *err = REG_ESPACE;
2461 return NULL;
2462 }
2463 tree->token.opr.idx = cur_nsub;
2464 return tree;
2465 }
2466
2467 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2468
2469 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2470 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2471 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2472 {
2473 bin_tree_t *tree = NULL, *old_tree = NULL;
2474 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2475 re_token_t start_token = *token;
2476
2477 if (token->type == OP_OPEN_DUP_NUM)
2478 {
2479 end = 0;
2480 start = fetch_number (regexp, token, syntax);
2481 if (start == REG_MISSING)
2482 {
2483 if (token->type == CHARACTER && token->opr.c == ',')
2484 start = 0; /* We treat "{,m}" as "{0,m}". */
2485 else
2486 {
2487 *err = REG_BADBR; /* <re>{} is invalid. */
2488 return NULL;
2489 }
2490 }
2491 if (BE (start != REG_ERROR, 1))
2492 {
2493 /* We treat "{n}" as "{n,n}". */
2494 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2495 : ((token->type == CHARACTER && token->opr.c == ',')
2496 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2497 }
2498 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2499 {
2500 /* Invalid sequence. */
2501 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2502 {
2503 if (token->type == END_OF_RE)
2504 *err = REG_EBRACE;
2505 else
2506 *err = REG_BADBR;
2507
2508 return NULL;
2509 }
2510
2511 /* If the syntax bit is set, rollback. */
2512 re_string_set_index (regexp, start_idx);
2513 *token = start_token;
2514 token->type = CHARACTER;
2515 /* mb_partial and word_char bits should be already initialized by
2516 peek_token. */
2517 return elem;
2518 }
2519
2520 if (BE (end != REG_MISSING && start > end, 0))
2521 {
2522 /* First number greater than second. */
2523 *err = REG_BADBR;
2524 return NULL;
2525 }
2526 }
2527 else
2528 {
2529 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2530 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2531 }
2532
2533 fetch_token (token, regexp, syntax);
2534
2535 if (BE (elem == NULL, 0))
2536 return NULL;
2537 if (BE (start == 0 && end == 0, 0))
2538 {
2539 postorder (elem, free_tree, NULL);
2540 return NULL;
2541 }
2542
2543 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2544 if (BE (start > 0, 0))
2545 {
2546 tree = elem;
2547 for (i = 2; i <= start; ++i)
2548 {
2549 elem = duplicate_tree (elem, dfa);
2550 tree = create_tree (dfa, tree, elem, CONCAT);
2551 if (BE (elem == NULL || tree == NULL, 0))
2552 goto parse_dup_op_espace;
2553 }
2554
2555 if (start == end)
2556 return tree;
2557
2558 /* Duplicate ELEM before it is marked optional. */
2559 elem = duplicate_tree (elem, dfa);
2560 old_tree = tree;
2561 }
2562 else
2563 old_tree = NULL;
2564
2565 if (elem->token.type == SUBEXP)
2566 postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx);
2567
2568 tree = create_tree (dfa, elem, NULL,
2569 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2570 if (BE (tree == NULL, 0))
2571 goto parse_dup_op_espace;
2572
2573 /* This loop is actually executed only when end != REG_MISSING,
2574 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2575 already created the start+1-th copy. */
2576 if ((Idx) -1 < 0 || end != REG_MISSING)
2577 for (i = start + 2; i <= end; ++i)
2578 {
2579 elem = duplicate_tree (elem, dfa);
2580 tree = create_tree (dfa, tree, elem, CONCAT);
2581 if (BE (elem == NULL || tree == NULL, 0))
2582 goto parse_dup_op_espace;
2583
2584 tree = create_tree (dfa, tree, NULL, OP_ALT);
2585 if (BE (tree == NULL, 0))
2586 goto parse_dup_op_espace;
2587 }
2588
2589 if (old_tree)
2590 tree = create_tree (dfa, old_tree, tree, CONCAT);
2591
2592 return tree;
2593
2594 parse_dup_op_espace:
2595 *err = REG_ESPACE;
2596 return NULL;
2597 }
2598
2599 /* Size of the names for collating symbol/equivalence_class/character_class.
2600 I'm not sure, but maybe enough. */
2601 #define BRACKET_NAME_BUF_SIZE 32
2602
2603 #ifndef _LIBC
2604 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2605 Build the range expression which starts from START_ELEM, and ends
2606 at END_ELEM. The result are written to MBCSET and SBCSET.
2607 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2608 mbcset->range_ends, is a pointer argument sinse we may
2609 update it. */
2610
2611 static reg_errcode_t
2612 internal_function
2613 # ifdef RE_ENABLE_I18N
build_range_exp(bitset_t sbcset,re_charset_t * mbcset,Idx * range_alloc,bracket_elem_t * start_elem,bracket_elem_t * end_elem)2614 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2615 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2616 # else /* not RE_ENABLE_I18N */
2617 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2618 bracket_elem_t *end_elem)
2619 # endif /* not RE_ENABLE_I18N */
2620 {
2621 unsigned int start_ch, end_ch;
2622 /* Equivalence Classes and Character Classes can't be a range start/end. */
2623 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2624 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2625 0))
2626 return REG_ERANGE;
2627
2628 /* We can handle no multi character collating elements without libc
2629 support. */
2630 if (BE ((start_elem->type == COLL_SYM
2631 && strlen ((char *) start_elem->opr.name) > 1)
2632 || (end_elem->type == COLL_SYM
2633 && strlen ((char *) end_elem->opr.name) > 1), 0))
2634 return REG_ECOLLATE;
2635
2636 # ifdef RE_ENABLE_I18N
2637 {
2638 wchar_t wc;
2639 wint_t start_wc;
2640 wint_t end_wc;
2641 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2642
2643 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2644 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2645 : 0));
2646 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2647 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2648 : 0));
2649 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2650 ? __btowc (start_ch) : start_elem->opr.wch);
2651 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2652 ? __btowc (end_ch) : end_elem->opr.wch);
2653 if (start_wc == WEOF || end_wc == WEOF)
2654 return REG_ECOLLATE;
2655 cmp_buf[0] = start_wc;
2656 cmp_buf[4] = end_wc;
2657 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2658 return REG_ERANGE;
2659
2660 /* Got valid collation sequence values, add them as a new entry.
2661 However, for !_LIBC we have no collation elements: if the
2662 character set is single byte, the single byte character set
2663 that we build below suffices. parse_bracket_exp passes
2664 no MBCSET if dfa->mb_cur_max == 1. */
2665 if (mbcset)
2666 {
2667 /* Check the space of the arrays. */
2668 if (BE (*range_alloc == mbcset->nranges, 0))
2669 {
2670 /* There is not enough space, need realloc. */
2671 wchar_t *new_array_start, *new_array_end;
2672 Idx new_nranges;
2673
2674 /* +1 in case of mbcset->nranges is 0. */
2675 new_nranges = 2 * mbcset->nranges + 1;
2676 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2677 are NULL if *range_alloc == 0. */
2678 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2679 new_nranges);
2680 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2681 new_nranges);
2682
2683 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2684 return REG_ESPACE;
2685
2686 mbcset->range_starts = new_array_start;
2687 mbcset->range_ends = new_array_end;
2688 *range_alloc = new_nranges;
2689 }
2690
2691 mbcset->range_starts[mbcset->nranges] = start_wc;
2692 mbcset->range_ends[mbcset->nranges++] = end_wc;
2693 }
2694
2695 /* Build the table for single byte characters. */
2696 for (wc = 0; wc < SBC_MAX; ++wc)
2697 {
2698 cmp_buf[2] = wc;
2699 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2700 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2701 bitset_set (sbcset, wc);
2702 }
2703 }
2704 # else /* not RE_ENABLE_I18N */
2705 {
2706 unsigned int ch;
2707 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2708 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2709 : 0));
2710 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2711 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2712 : 0));
2713 if (start_ch > end_ch)
2714 return REG_ERANGE;
2715 /* Build the table for single byte characters. */
2716 for (ch = 0; ch < SBC_MAX; ++ch)
2717 if (start_ch <= ch && ch <= end_ch)
2718 bitset_set (sbcset, ch);
2719 }
2720 # endif /* not RE_ENABLE_I18N */
2721 return REG_NOERROR;
2722 }
2723 #endif /* not _LIBC */
2724
2725 #ifndef _LIBC
2726 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2727 Build the collating element which is represented by NAME.
2728 The result are written to MBCSET and SBCSET.
2729 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2730 pointer argument since we may update it. */
2731
2732 static reg_errcode_t
2733 internal_function
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,Idx * coll_sym_alloc,const unsigned char * name)2734 build_collating_symbol (bitset_t sbcset,
2735 # ifdef RE_ENABLE_I18N
2736 re_charset_t *mbcset, Idx *coll_sym_alloc,
2737 # endif
2738 const unsigned char *name)
2739 {
2740 size_t name_len = strlen ((const char *) name);
2741 if (BE (name_len != 1, 0))
2742 return REG_ECOLLATE;
2743 else
2744 {
2745 bitset_set (sbcset, name[0]);
2746 return REG_NOERROR;
2747 }
2748 }
2749 #endif /* not _LIBC */
2750
2751 /* This function parse bracket expression like "[abc]", "[a-c]",
2752 "[[.a-a.]]" etc. */
2753
2754 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2755 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2756 reg_syntax_t syntax, reg_errcode_t *err)
2757 {
2758 #ifdef _LIBC
2759 const unsigned char *collseqmb;
2760 const char *collseqwc;
2761 uint32_t nrules;
2762 int32_t table_size;
2763 const int32_t *symb_table;
2764 const unsigned char *extra;
2765
2766 /* Local function for parse_bracket_exp used in _LIBC environement.
2767 Seek the collating symbol entry correspondings to NAME.
2768 Return the index of the symbol in the SYMB_TABLE. */
2769
2770 auto inline int32_t
2771 __attribute ((always_inline))
2772 seek_collating_symbol_entry (name, name_len)
2773 const unsigned char *name;
2774 size_t name_len;
2775 {
2776 int32_t hash = elem_hash ((const char *) name, name_len);
2777 int32_t elem = hash % table_size;
2778 if (symb_table[2 * elem] != 0)
2779 {
2780 int32_t second = hash % (table_size - 2) + 1;
2781
2782 do
2783 {
2784 /* First compare the hashing value. */
2785 if (symb_table[2 * elem] == hash
2786 /* Compare the length of the name. */
2787 && name_len == extra[symb_table[2 * elem + 1]]
2788 /* Compare the name. */
2789 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2790 name_len) == 0)
2791 {
2792 /* Yep, this is the entry. */
2793 break;
2794 }
2795
2796 /* Next entry. */
2797 elem += second;
2798 }
2799 while (symb_table[2 * elem] != 0);
2800 }
2801 return elem;
2802 }
2803
2804 /* Local function for parse_bracket_exp used in _LIBC environement.
2805 Look up the collation sequence value of BR_ELEM.
2806 Return the value if succeeded, UINT_MAX otherwise. */
2807
2808 auto inline unsigned int
2809 __attribute ((always_inline))
2810 lookup_collation_sequence_value (br_elem)
2811 bracket_elem_t *br_elem;
2812 {
2813 if (br_elem->type == SB_CHAR)
2814 {
2815 /*
2816 if (MB_CUR_MAX == 1)
2817 */
2818 if (nrules == 0)
2819 return collseqmb[br_elem->opr.ch];
2820 else
2821 {
2822 wint_t wc = __btowc (br_elem->opr.ch);
2823 return __collseq_table_lookup (collseqwc, wc);
2824 }
2825 }
2826 else if (br_elem->type == MB_CHAR)
2827 {
2828 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2829 }
2830 else if (br_elem->type == COLL_SYM)
2831 {
2832 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2833 if (nrules != 0)
2834 {
2835 int32_t elem, idx;
2836 elem = seek_collating_symbol_entry (br_elem->opr.name,
2837 sym_name_len);
2838 if (symb_table[2 * elem] != 0)
2839 {
2840 /* We found the entry. */
2841 idx = symb_table[2 * elem + 1];
2842 /* Skip the name of collating element name. */
2843 idx += 1 + extra[idx];
2844 /* Skip the byte sequence of the collating element. */
2845 idx += 1 + extra[idx];
2846 /* Adjust for the alignment. */
2847 idx = (idx + 3) & ~3;
2848 /* Skip the multibyte collation sequence value. */
2849 idx += sizeof (unsigned int);
2850 /* Skip the wide char sequence of the collating element. */
2851 idx += sizeof (unsigned int) *
2852 (1 + *(unsigned int *) (extra + idx));
2853 /* Return the collation sequence value. */
2854 return *(unsigned int *) (extra + idx);
2855 }
2856 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2857 {
2858 /* No valid character. Match it as a single byte
2859 character. */
2860 return collseqmb[br_elem->opr.name[0]];
2861 }
2862 }
2863 else if (sym_name_len == 1)
2864 return collseqmb[br_elem->opr.name[0]];
2865 }
2866 return UINT_MAX;
2867 }
2868
2869 /* Local function for parse_bracket_exp used in _LIBC environement.
2870 Build the range expression which starts from START_ELEM, and ends
2871 at END_ELEM. The result are written to MBCSET and SBCSET.
2872 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2873 mbcset->range_ends, is a pointer argument sinse we may
2874 update it. */
2875
2876 auto inline reg_errcode_t
2877 __attribute ((always_inline))
2878 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2879 re_charset_t *mbcset;
2880 Idx *range_alloc;
2881 bitset_t sbcset;
2882 bracket_elem_t *start_elem, *end_elem;
2883 {
2884 unsigned int ch;
2885 uint32_t start_collseq;
2886 uint32_t end_collseq;
2887
2888 /* Equivalence Classes and Character Classes can't be a range
2889 start/end. */
2890 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2891 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2892 0))
2893 return REG_ERANGE;
2894
2895 start_collseq = lookup_collation_sequence_value (start_elem);
2896 end_collseq = lookup_collation_sequence_value (end_elem);
2897 /* Check start/end collation sequence values. */
2898 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2899 return REG_ECOLLATE;
2900 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2901 return REG_ERANGE;
2902
2903 /* Got valid collation sequence values, add them as a new entry.
2904 However, if we have no collation elements, and the character set
2905 is single byte, the single byte character set that we
2906 build below suffices. */
2907 if (nrules > 0 || dfa->mb_cur_max > 1)
2908 {
2909 /* Check the space of the arrays. */
2910 if (BE (*range_alloc == mbcset->nranges, 0))
2911 {
2912 /* There is not enough space, need realloc. */
2913 uint32_t *new_array_start;
2914 uint32_t *new_array_end;
2915 Idx new_nranges;
2916
2917 /* +1 in case of mbcset->nranges is 0. */
2918 new_nranges = 2 * mbcset->nranges + 1;
2919 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2920 new_nranges);
2921 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2922 new_nranges);
2923
2924 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2925 return REG_ESPACE;
2926
2927 mbcset->range_starts = new_array_start;
2928 mbcset->range_ends = new_array_end;
2929 *range_alloc = new_nranges;
2930 }
2931
2932 mbcset->range_starts[mbcset->nranges] = start_collseq;
2933 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2934 }
2935
2936 /* Build the table for single byte characters. */
2937 for (ch = 0; ch < SBC_MAX; ch++)
2938 {
2939 uint32_t ch_collseq;
2940 /*
2941 if (MB_CUR_MAX == 1)
2942 */
2943 if (nrules == 0)
2944 ch_collseq = collseqmb[ch];
2945 else
2946 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2947 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2948 bitset_set (sbcset, ch);
2949 }
2950 return REG_NOERROR;
2951 }
2952
2953 /* Local function for parse_bracket_exp used in _LIBC environement.
2954 Build the collating element which is represented by NAME.
2955 The result are written to MBCSET and SBCSET.
2956 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2957 pointer argument sinse we may update it. */
2958
2959 auto inline reg_errcode_t
2960 __attribute ((always_inline))
2961 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2962 re_charset_t *mbcset;
2963 Idx *coll_sym_alloc;
2964 bitset_t sbcset;
2965 const unsigned char *name;
2966 {
2967 int32_t elem, idx;
2968 size_t name_len = strlen ((const char *) name);
2969 if (nrules != 0)
2970 {
2971 elem = seek_collating_symbol_entry (name, name_len);
2972 if (symb_table[2 * elem] != 0)
2973 {
2974 /* We found the entry. */
2975 idx = symb_table[2 * elem + 1];
2976 /* Skip the name of collating element name. */
2977 idx += 1 + extra[idx];
2978 }
2979 else if (symb_table[2 * elem] == 0 && name_len == 1)
2980 {
2981 /* No valid character, treat it as a normal
2982 character. */
2983 bitset_set (sbcset, name[0]);
2984 return REG_NOERROR;
2985 }
2986 else
2987 return REG_ECOLLATE;
2988
2989 /* Got valid collation sequence, add it as a new entry. */
2990 /* Check the space of the arrays. */
2991 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2992 {
2993 /* Not enough, realloc it. */
2994 /* +1 in case of mbcset->ncoll_syms is 0. */
2995 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2996 /* Use realloc since mbcset->coll_syms is NULL
2997 if *alloc == 0. */
2998 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2999 new_coll_sym_alloc);
3000 if (BE (new_coll_syms == NULL, 0))
3001 return REG_ESPACE;
3002 mbcset->coll_syms = new_coll_syms;
3003 *coll_sym_alloc = new_coll_sym_alloc;
3004 }
3005 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3006 return REG_NOERROR;
3007 }
3008 else
3009 {
3010 if (BE (name_len != 1, 0))
3011 return REG_ECOLLATE;
3012 else
3013 {
3014 bitset_set (sbcset, name[0]);
3015 return REG_NOERROR;
3016 }
3017 }
3018 }
3019 #endif
3020
3021 re_token_t br_token;
3022 re_bitset_ptr_t sbcset;
3023 #ifdef RE_ENABLE_I18N
3024 re_charset_t *mbcset;
3025 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3026 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3027 #endif /* not RE_ENABLE_I18N */
3028 bool non_match = false;
3029 bin_tree_t *work_tree;
3030 int token_len;
3031 bool first_round = true;
3032 #ifdef _LIBC
3033 collseqmb = (const unsigned char *)
3034 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3035 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3036 if (nrules)
3037 {
3038 /*
3039 if (MB_CUR_MAX > 1)
3040 */
3041 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3042 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3043 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3044 _NL_COLLATE_SYMB_TABLEMB);
3045 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3046 _NL_COLLATE_SYMB_EXTRAMB);
3047 }
3048 #endif
3049 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3050 #ifdef RE_ENABLE_I18N
3051 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3052 #endif /* RE_ENABLE_I18N */
3053 #ifdef RE_ENABLE_I18N
3054 if (BE (sbcset == NULL || mbcset == NULL, 0))
3055 #else
3056 if (BE (sbcset == NULL, 0))
3057 #endif /* RE_ENABLE_I18N */
3058 {
3059 *err = REG_ESPACE;
3060 return NULL;
3061 }
3062
3063 token_len = peek_token_bracket (token, regexp, syntax);
3064 if (BE (token->type == END_OF_RE, 0))
3065 {
3066 *err = REG_BADPAT;
3067 goto parse_bracket_exp_free_return;
3068 }
3069 if (token->type == OP_NON_MATCH_LIST)
3070 {
3071 #ifdef RE_ENABLE_I18N
3072 mbcset->non_match = 1;
3073 #endif /* not RE_ENABLE_I18N */
3074 non_match = true;
3075 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3076 bitset_set (sbcset, '\n');
3077 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3078 token_len = peek_token_bracket (token, regexp, syntax);
3079 if (BE (token->type == END_OF_RE, 0))
3080 {
3081 *err = REG_BADPAT;
3082 goto parse_bracket_exp_free_return;
3083 }
3084 }
3085
3086 /* We treat the first ']' as a normal character. */
3087 if (token->type == OP_CLOSE_BRACKET)
3088 token->type = CHARACTER;
3089
3090 while (1)
3091 {
3092 bracket_elem_t start_elem, end_elem;
3093 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3094 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3095 reg_errcode_t ret;
3096 int token_len2 = 0;
3097 bool is_range_exp = false;
3098 re_token_t token2;
3099
3100 start_elem.opr.name = start_name_buf;
3101 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3102 syntax, first_round);
3103 if (BE (ret != REG_NOERROR, 0))
3104 {
3105 *err = ret;
3106 goto parse_bracket_exp_free_return;
3107 }
3108 first_round = false;
3109
3110 /* Get information about the next token. We need it in any case. */
3111 token_len = peek_token_bracket (token, regexp, syntax);
3112
3113 /* Do not check for ranges if we know they are not allowed. */
3114 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3115 {
3116 if (BE (token->type == END_OF_RE, 0))
3117 {
3118 *err = REG_EBRACK;
3119 goto parse_bracket_exp_free_return;
3120 }
3121 if (token->type == OP_CHARSET_RANGE)
3122 {
3123 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3124 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3125 if (BE (token2.type == END_OF_RE, 0))
3126 {
3127 *err = REG_EBRACK;
3128 goto parse_bracket_exp_free_return;
3129 }
3130 if (token2.type == OP_CLOSE_BRACKET)
3131 {
3132 /* We treat the last '-' as a normal character. */
3133 re_string_skip_bytes (regexp, -token_len);
3134 token->type = CHARACTER;
3135 }
3136 else
3137 is_range_exp = true;
3138 }
3139 }
3140
3141 if (is_range_exp == true)
3142 {
3143 end_elem.opr.name = end_name_buf;
3144 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3145 dfa, syntax, true);
3146 if (BE (ret != REG_NOERROR, 0))
3147 {
3148 *err = ret;
3149 goto parse_bracket_exp_free_return;
3150 }
3151
3152 token_len = peek_token_bracket (token, regexp, syntax);
3153
3154 #ifdef _LIBC
3155 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3156 &start_elem, &end_elem);
3157 #else
3158 # ifdef RE_ENABLE_I18N
3159 *err = build_range_exp (sbcset,
3160 dfa->mb_cur_max > 1 ? mbcset : NULL,
3161 &range_alloc, &start_elem, &end_elem);
3162 # else
3163 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3164 # endif
3165 #endif /* RE_ENABLE_I18N */
3166 if (BE (*err != REG_NOERROR, 0))
3167 goto parse_bracket_exp_free_return;
3168 }
3169 else
3170 {
3171 switch (start_elem.type)
3172 {
3173 case SB_CHAR:
3174 bitset_set (sbcset, start_elem.opr.ch);
3175 break;
3176 #ifdef RE_ENABLE_I18N
3177 case MB_CHAR:
3178 /* Check whether the array has enough space. */
3179 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3180 {
3181 wchar_t *new_mbchars;
3182 /* Not enough, realloc it. */
3183 /* +1 in case of mbcset->nmbchars is 0. */
3184 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3185 /* Use realloc since array is NULL if *alloc == 0. */
3186 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3187 mbchar_alloc);
3188 if (BE (new_mbchars == NULL, 0))
3189 goto parse_bracket_exp_espace;
3190 mbcset->mbchars = new_mbchars;
3191 }
3192 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3193 break;
3194 #endif /* RE_ENABLE_I18N */
3195 case EQUIV_CLASS:
3196 *err = build_equiv_class (sbcset,
3197 #ifdef RE_ENABLE_I18N
3198 mbcset, &equiv_class_alloc,
3199 #endif /* RE_ENABLE_I18N */
3200 start_elem.opr.name);
3201 if (BE (*err != REG_NOERROR, 0))
3202 goto parse_bracket_exp_free_return;
3203 break;
3204 case COLL_SYM:
3205 *err = build_collating_symbol (sbcset,
3206 #ifdef RE_ENABLE_I18N
3207 mbcset, &coll_sym_alloc,
3208 #endif /* RE_ENABLE_I18N */
3209 start_elem.opr.name);
3210 if (BE (*err != REG_NOERROR, 0))
3211 goto parse_bracket_exp_free_return;
3212 break;
3213 case CHAR_CLASS:
3214 *err = build_charclass (regexp->trans, sbcset,
3215 #ifdef RE_ENABLE_I18N
3216 mbcset, &char_class_alloc,
3217 #endif /* RE_ENABLE_I18N */
3218 start_elem.opr.name, syntax);
3219 if (BE (*err != REG_NOERROR, 0))
3220 goto parse_bracket_exp_free_return;
3221 break;
3222 default:
3223 assert (0);
3224 break;
3225 }
3226 }
3227 if (BE (token->type == END_OF_RE, 0))
3228 {
3229 *err = REG_EBRACK;
3230 goto parse_bracket_exp_free_return;
3231 }
3232 if (token->type == OP_CLOSE_BRACKET)
3233 break;
3234 }
3235
3236 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3237
3238 /* If it is non-matching list. */
3239 if (non_match)
3240 bitset_not (sbcset);
3241
3242 #ifdef RE_ENABLE_I18N
3243 /* Ensure only single byte characters are set. */
3244 if (dfa->mb_cur_max > 1)
3245 bitset_mask (sbcset, dfa->sb_char);
3246
3247 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3248 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3249 || mbcset->non_match)))
3250 {
3251 bin_tree_t *mbc_tree;
3252 int sbc_idx;
3253 /* Build a tree for complex bracket. */
3254 dfa->has_mb_node = 1;
3255 br_token.type = COMPLEX_BRACKET;
3256 br_token.opr.mbcset = mbcset;
3257 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3258 if (BE (mbc_tree == NULL, 0))
3259 goto parse_bracket_exp_espace;
3260 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3261 if (sbcset[sbc_idx])
3262 break;
3263 /* If there are no bits set in sbcset, there is no point
3264 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3265 if (sbc_idx < BITSET_WORDS)
3266 {
3267 /* Build a tree for simple bracket. */
3268 br_token.type = SIMPLE_BRACKET;
3269 br_token.opr.sbcset = sbcset;
3270 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3271 if (BE (work_tree == NULL, 0))
3272 goto parse_bracket_exp_espace;
3273
3274 /* Then join them by ALT node. */
3275 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3276 if (BE (work_tree == NULL, 0))
3277 goto parse_bracket_exp_espace;
3278 }
3279 else
3280 {
3281 re_free (sbcset);
3282 work_tree = mbc_tree;
3283 }
3284 }
3285 else
3286 #endif /* not RE_ENABLE_I18N */
3287 {
3288 #ifdef RE_ENABLE_I18N
3289 free_charset (mbcset);
3290 #endif
3291 /* Build a tree for simple bracket. */
3292 br_token.type = SIMPLE_BRACKET;
3293 br_token.opr.sbcset = sbcset;
3294 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3295 if (BE (work_tree == NULL, 0))
3296 goto parse_bracket_exp_espace;
3297 }
3298 return work_tree;
3299
3300 parse_bracket_exp_espace:
3301 *err = REG_ESPACE;
3302 parse_bracket_exp_free_return:
3303 re_free (sbcset);
3304 #ifdef RE_ENABLE_I18N
3305 free_charset (mbcset);
3306 #endif /* RE_ENABLE_I18N */
3307 return NULL;
3308 }
3309
3310 /* Parse an element in the bracket expression. */
3311
3312 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,bool accept_hyphen)3313 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3314 re_token_t *token, int token_len, re_dfa_t *dfa,
3315 reg_syntax_t syntax, bool accept_hyphen)
3316 {
3317 #ifdef RE_ENABLE_I18N
3318 int cur_char_size;
3319 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3320 if (cur_char_size > 1)
3321 {
3322 elem->type = MB_CHAR;
3323 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3324 re_string_skip_bytes (regexp, cur_char_size);
3325 return REG_NOERROR;
3326 }
3327 #endif /* RE_ENABLE_I18N */
3328 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3329 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3330 || token->type == OP_OPEN_EQUIV_CLASS)
3331 return parse_bracket_symbol (elem, regexp, token);
3332 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3333 {
3334 /* A '-' must only appear as anything but a range indicator before
3335 the closing bracket. Everything else is an error. */
3336 re_token_t token2;
3337 (void) peek_token_bracket (&token2, regexp, syntax);
3338 if (token2.type != OP_CLOSE_BRACKET)
3339 /* The actual error value is not standardized since this whole
3340 case is undefined. But ERANGE makes good sense. */
3341 return REG_ERANGE;
3342 }
3343 elem->type = SB_CHAR;
3344 elem->opr.ch = token->opr.c;
3345 return REG_NOERROR;
3346 }
3347
3348 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3349 such as [:<character_class>:], [.<collating_element>.], and
3350 [=<equivalent_class>=]. */
3351
3352 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3353 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3354 re_token_t *token)
3355 {
3356 unsigned char ch, delim = token->opr.c;
3357 int i = 0;
3358 if (re_string_eoi(regexp))
3359 return REG_EBRACK;
3360 for (;; ++i)
3361 {
3362 if (i >= BRACKET_NAME_BUF_SIZE)
3363 return REG_EBRACK;
3364 if (token->type == OP_OPEN_CHAR_CLASS)
3365 ch = re_string_fetch_byte_case (regexp);
3366 else
3367 ch = re_string_fetch_byte (regexp);
3368 if (re_string_eoi(regexp))
3369 return REG_EBRACK;
3370 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3371 break;
3372 elem->opr.name[i] = ch;
3373 }
3374 re_string_skip_bytes (regexp, 1);
3375 elem->opr.name[i] = '\0';
3376 switch (token->type)
3377 {
3378 case OP_OPEN_COLL_ELEM:
3379 elem->type = COLL_SYM;
3380 break;
3381 case OP_OPEN_EQUIV_CLASS:
3382 elem->type = EQUIV_CLASS;
3383 break;
3384 case OP_OPEN_CHAR_CLASS:
3385 elem->type = CHAR_CLASS;
3386 break;
3387 default:
3388 break;
3389 }
3390 return REG_NOERROR;
3391 }
3392
3393 /* Helper function for parse_bracket_exp.
3394 Build the equivalence class which is represented by NAME.
3395 The result are written to MBCSET and SBCSET.
3396 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3397 is a pointer argument sinse we may update it. */
3398
3399 static reg_errcode_t
3400 #ifdef RE_ENABLE_I18N
build_equiv_class(bitset_t sbcset,re_charset_t * mbcset,Idx * equiv_class_alloc,const unsigned char * name)3401 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3402 Idx *equiv_class_alloc, const unsigned char *name)
3403 #else /* not RE_ENABLE_I18N */
3404 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3405 #endif /* not RE_ENABLE_I18N */
3406 {
3407 #ifdef _LIBC
3408 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3409 if (nrules != 0)
3410 {
3411 const int32_t *table, *indirect;
3412 const unsigned char *weights, *extra, *cp;
3413 unsigned char char_buf[2];
3414 int32_t idx1, idx2;
3415 unsigned int ch;
3416 size_t len;
3417 /* This #include defines a local function! */
3418 # include <locale/weight.h>
3419 /* Calculate the index for equivalence class. */
3420 cp = name;
3421 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3422 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3423 _NL_COLLATE_WEIGHTMB);
3424 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3425 _NL_COLLATE_EXTRAMB);
3426 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3427 _NL_COLLATE_INDIRECTMB);
3428 idx1 = findidx (&cp);
3429 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3430 /* This isn't a valid character. */
3431 return REG_ECOLLATE;
3432
3433 /* Build single byte matcing table for this equivalence class. */
3434 char_buf[1] = (unsigned char) '\0';
3435 len = weights[idx1];
3436 for (ch = 0; ch < SBC_MAX; ++ch)
3437 {
3438 char_buf[0] = ch;
3439 cp = char_buf;
3440 idx2 = findidx (&cp);
3441 /*
3442 idx2 = table[ch];
3443 */
3444 if (idx2 == 0)
3445 /* This isn't a valid character. */
3446 continue;
3447 if (len == weights[idx2])
3448 {
3449 int cnt = 0;
3450 while (cnt <= len &&
3451 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3452 ++cnt;
3453
3454 if (cnt > len)
3455 bitset_set (sbcset, ch);
3456 }
3457 }
3458 /* Check whether the array has enough space. */
3459 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3460 {
3461 /* Not enough, realloc it. */
3462 /* +1 in case of mbcset->nequiv_classes is 0. */
3463 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3464 /* Use realloc since the array is NULL if *alloc == 0. */
3465 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3466 int32_t,
3467 new_equiv_class_alloc);
3468 if (BE (new_equiv_classes == NULL, 0))
3469 return REG_ESPACE;
3470 mbcset->equiv_classes = new_equiv_classes;
3471 *equiv_class_alloc = new_equiv_class_alloc;
3472 }
3473 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3474 }
3475 else
3476 #endif /* _LIBC */
3477 {
3478 if (BE (strlen ((const char *) name) != 1, 0))
3479 return REG_ECOLLATE;
3480 bitset_set (sbcset, *name);
3481 }
3482 return REG_NOERROR;
3483 }
3484
3485 /* Helper function for parse_bracket_exp.
3486 Build the character class which is represented by NAME.
3487 The result are written to MBCSET and SBCSET.
3488 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3489 is a pointer argument sinse we may update it. */
3490
3491 static reg_errcode_t
3492 #ifdef RE_ENABLE_I18N
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,Idx * char_class_alloc,const unsigned char * class_name,reg_syntax_t syntax)3493 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3494 re_charset_t *mbcset, Idx *char_class_alloc,
3495 const unsigned char *class_name, reg_syntax_t syntax)
3496 #else /* not RE_ENABLE_I18N */
3497 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3498 const unsigned char *class_name, reg_syntax_t syntax)
3499 #endif /* not RE_ENABLE_I18N */
3500 {
3501 int i;
3502 const char *name = (const char *) class_name;
3503
3504 /* In case of REG_ICASE "upper" and "lower" match the both of
3505 upper and lower cases. */
3506 if ((syntax & RE_ICASE)
3507 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3508 name = "alpha";
3509
3510 #ifdef RE_ENABLE_I18N
3511 /* Check the space of the arrays. */
3512 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3513 {
3514 /* Not enough, realloc it. */
3515 /* +1 in case of mbcset->nchar_classes is 0. */
3516 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3517 /* Use realloc since array is NULL if *alloc == 0. */
3518 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3519 new_char_class_alloc);
3520 if (BE (new_char_classes == NULL, 0))
3521 return REG_ESPACE;
3522 mbcset->char_classes = new_char_classes;
3523 *char_class_alloc = new_char_class_alloc;
3524 }
3525 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3526 #endif /* RE_ENABLE_I18N */
3527
3528 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3529 do { \
3530 if (BE (trans != NULL, 0)) \
3531 { \
3532 for (i = 0; i < SBC_MAX; ++i) \
3533 if (ctype_func (i)) \
3534 bitset_set (sbcset, trans[i]); \
3535 } \
3536 else \
3537 { \
3538 for (i = 0; i < SBC_MAX; ++i) \
3539 if (ctype_func (i)) \
3540 bitset_set (sbcset, i); \
3541 } \
3542 } while (0)
3543
3544 if (strcmp (name, "alnum") == 0)
3545 BUILD_CHARCLASS_LOOP (isalnum);
3546 else if (strcmp (name, "cntrl") == 0)
3547 BUILD_CHARCLASS_LOOP (iscntrl);
3548 else if (strcmp (name, "lower") == 0)
3549 BUILD_CHARCLASS_LOOP (islower);
3550 else if (strcmp (name, "space") == 0)
3551 BUILD_CHARCLASS_LOOP (isspace);
3552 else if (strcmp (name, "alpha") == 0)
3553 BUILD_CHARCLASS_LOOP (isalpha);
3554 else if (strcmp (name, "digit") == 0)
3555 BUILD_CHARCLASS_LOOP (isdigit);
3556 else if (strcmp (name, "print") == 0)
3557 BUILD_CHARCLASS_LOOP (isprint);
3558 else if (strcmp (name, "upper") == 0)
3559 BUILD_CHARCLASS_LOOP (isupper);
3560 else if (strcmp (name, "blank") == 0)
3561 BUILD_CHARCLASS_LOOP (isblank);
3562 else if (strcmp (name, "graph") == 0)
3563 BUILD_CHARCLASS_LOOP (isgraph);
3564 else if (strcmp (name, "punct") == 0)
3565 BUILD_CHARCLASS_LOOP (ispunct);
3566 else if (strcmp (name, "xdigit") == 0)
3567 BUILD_CHARCLASS_LOOP (isxdigit);
3568 else
3569 return REG_ECTYPE;
3570
3571 return REG_NOERROR;
3572 }
3573
3574 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const unsigned char * class_name,const unsigned char * extra,bool non_match,reg_errcode_t * err)3575 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3576 const unsigned char *class_name,
3577 const unsigned char *extra, bool non_match,
3578 reg_errcode_t *err)
3579 {
3580 re_bitset_ptr_t sbcset;
3581 #ifdef RE_ENABLE_I18N
3582 re_charset_t *mbcset;
3583 Idx alloc = 0;
3584 #endif /* not RE_ENABLE_I18N */
3585 reg_errcode_t ret;
3586 re_token_t br_token;
3587 bin_tree_t *tree;
3588
3589 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3590 #ifdef RE_ENABLE_I18N
3591 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3592 #endif /* RE_ENABLE_I18N */
3593
3594 #ifdef RE_ENABLE_I18N
3595 if (BE (sbcset == NULL || mbcset == NULL, 0))
3596 #else /* not RE_ENABLE_I18N */
3597 if (BE (sbcset == NULL, 0))
3598 #endif /* not RE_ENABLE_I18N */
3599 {
3600 *err = REG_ESPACE;
3601 return NULL;
3602 }
3603
3604 if (non_match)
3605 {
3606 #ifdef RE_ENABLE_I18N
3607 mbcset->non_match = 1;
3608 #endif /* not RE_ENABLE_I18N */
3609 }
3610
3611 /* We don't care the syntax in this case. */
3612 ret = build_charclass (trans, sbcset,
3613 #ifdef RE_ENABLE_I18N
3614 mbcset, &alloc,
3615 #endif /* RE_ENABLE_I18N */
3616 class_name, 0);
3617
3618 if (BE (ret != REG_NOERROR, 0))
3619 {
3620 re_free (sbcset);
3621 #ifdef RE_ENABLE_I18N
3622 free_charset (mbcset);
3623 #endif /* RE_ENABLE_I18N */
3624 *err = ret;
3625 return NULL;
3626 }
3627 /* \w match '_' also. */
3628 for (; *extra; extra++)
3629 bitset_set (sbcset, *extra);
3630
3631 /* If it is non-matching list. */
3632 if (non_match)
3633 bitset_not (sbcset);
3634
3635 #ifdef RE_ENABLE_I18N
3636 /* Ensure only single byte characters are set. */
3637 if (dfa->mb_cur_max > 1)
3638 bitset_mask (sbcset, dfa->sb_char);
3639 #endif
3640
3641 /* Build a tree for simple bracket. */
3642 br_token.type = SIMPLE_BRACKET;
3643 br_token.opr.sbcset = sbcset;
3644 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3645 if (BE (tree == NULL, 0))
3646 goto build_word_op_espace;
3647
3648 #ifdef RE_ENABLE_I18N
3649 if (dfa->mb_cur_max > 1)
3650 {
3651 bin_tree_t *mbc_tree;
3652 /* Build a tree for complex bracket. */
3653 br_token.type = COMPLEX_BRACKET;
3654 br_token.opr.mbcset = mbcset;
3655 dfa->has_mb_node = 1;
3656 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3657 if (BE (mbc_tree == NULL, 0))
3658 goto build_word_op_espace;
3659 /* Then join them by ALT node. */
3660 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3661 if (BE (mbc_tree != NULL, 1))
3662 return tree;
3663 }
3664 else
3665 {
3666 free_charset (mbcset);
3667 return tree;
3668 }
3669 #else /* not RE_ENABLE_I18N */
3670 return tree;
3671 #endif /* not RE_ENABLE_I18N */
3672
3673 build_word_op_espace:
3674 re_free (sbcset);
3675 #ifdef RE_ENABLE_I18N
3676 free_charset (mbcset);
3677 #endif /* RE_ENABLE_I18N */
3678 *err = REG_ESPACE;
3679 return NULL;
3680 }
3681
3682 /* This is intended for the expressions like "a{1,3}".
3683 Fetch a number from `input', and return the number.
3684 Return REG_MISSING if the number field is empty like "{,1}".
3685 Return REG_ERROR if an error occurred. */
3686
3687 static Idx
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3688 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3689 {
3690 Idx num = REG_MISSING;
3691 unsigned char c;
3692 while (1)
3693 {
3694 fetch_token (token, input, syntax);
3695 c = token->opr.c;
3696 if (BE (token->type == END_OF_RE, 0))
3697 return REG_ERROR;
3698 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3699 break;
3700 num = ((token->type != CHARACTER || c < '0' || '9' < c
3701 || num == REG_ERROR)
3702 ? REG_ERROR
3703 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3704 num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3705 }
3706 return num;
3707 }
3708
3709 #ifdef RE_ENABLE_I18N
3710 static void
free_charset(re_charset_t * cset)3711 free_charset (re_charset_t *cset)
3712 {
3713 re_free (cset->mbchars);
3714 # ifdef _LIBC
3715 re_free (cset->coll_syms);
3716 re_free (cset->equiv_classes);
3717 re_free (cset->range_starts);
3718 re_free (cset->range_ends);
3719 # endif
3720 re_free (cset->char_classes);
3721 re_free (cset);
3722 }
3723 #endif /* RE_ENABLE_I18N */
3724
3725 /* Functions for binary tree operation. */
3726
3727 /* Create a tree node. */
3728
3729 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3730 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3731 re_token_type_t type)
3732 {
3733 re_token_t t;
3734 t.type = type;
3735 return create_token_tree (dfa, left, right, &t);
3736 }
3737
3738 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3739 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3740 const re_token_t *token)
3741 {
3742 bin_tree_t *tree;
3743 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3744 {
3745 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3746
3747 if (storage == NULL)
3748 return NULL;
3749 storage->next = dfa->str_tree_storage;
3750 dfa->str_tree_storage = storage;
3751 dfa->str_tree_storage_idx = 0;
3752 }
3753 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3754
3755 tree->parent = NULL;
3756 tree->left = left;
3757 tree->right = right;
3758 tree->token = *token;
3759 tree->token.duplicated = 0;
3760 tree->token.opt_subexp = 0;
3761 tree->first = NULL;
3762 tree->next = NULL;
3763 tree->node_idx = REG_MISSING;
3764
3765 if (left != NULL)
3766 left->parent = tree;
3767 if (right != NULL)
3768 right->parent = tree;
3769 return tree;
3770 }
3771
3772 /* Mark the tree SRC as an optional subexpression.
3773 To be called from preorder or postorder. */
3774
3775 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3776 mark_opt_subexp (void *extra, bin_tree_t *node)
3777 {
3778 Idx idx = (Idx) (intptr_t) extra;
3779 assert(sizeof(void*) >= sizeof(Idx));
3780 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3781 node->token.opt_subexp = 1;
3782
3783 return REG_NOERROR;
3784 }
3785
3786 /* Free the allocated memory inside NODE. */
3787
3788 static void
free_token(re_token_t * node)3789 free_token (re_token_t *node)
3790 {
3791 #ifdef RE_ENABLE_I18N
3792 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3793 free_charset (node->opr.mbcset);
3794 else
3795 #endif /* RE_ENABLE_I18N */
3796 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3797 re_free (node->opr.sbcset);
3798 }
3799
3800 /* Worker function for tree walking. Free the allocated memory inside NODE
3801 and its children. */
3802
3803 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3804 free_tree (void *extra, bin_tree_t *node)
3805 {
3806 free_token (&node->token);
3807 return REG_NOERROR;
3808 }
3809
3810
3811 /* Duplicate the node SRC, and return new node. This is a preorder
3812 visit similar to the one implemented by the generic visitor, but
3813 we need more infrastructure to maintain two parallel trees --- so,
3814 it's easier to duplicate. */
3815
3816 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3817 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3818 {
3819 const bin_tree_t *node;
3820 bin_tree_t *dup_root;
3821 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3822
3823 for (node = root; ; )
3824 {
3825 /* Create a new tree and link it back to the current parent. */
3826 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3827 if (*p_new == NULL)
3828 return NULL;
3829 (*p_new)->parent = dup_node;
3830 (*p_new)->token.duplicated = 1;
3831 dup_node = *p_new;
3832
3833 /* Go to the left node, or up and to the right. */
3834 if (node->left)
3835 {
3836 node = node->left;
3837 p_new = &dup_node->left;
3838 }
3839 else
3840 {
3841 const bin_tree_t *prev = NULL;
3842 while (node->right == prev || node->right == NULL)
3843 {
3844 prev = node;
3845 node = node->parent;
3846 dup_node = dup_node->parent;
3847 if (!node)
3848 return dup_root;
3849 }
3850 node = node->right;
3851 p_new = &dup_node->right;
3852 }
3853 }
3854 }
3855