1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Extended regular expression matching and search library.
4    Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009
5    Free Software Foundation, Inc.
6    This file is part of the GNU C Library.
7    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
8 
9    This program is free software; you can redistribute it and/or modify
10    it under the terms of the GNU General Public License as published by
11    the Free Software Foundation; either version 3, or (at your option)
12    any later version.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License for more details.
18 
19    You should have received a copy of the GNU General Public License along
20    with this program; if not, write to the Free Software Foundation,
21    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22 
23 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
24 					  size_t length, reg_syntax_t syntax);
25 static void re_compile_fastmap_iter (regex_t *bufp,
26 				     const re_dfastate_t *init_state,
27 				     char *fastmap);
28 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
29 #ifdef RE_ENABLE_I18N
30 static void free_charset (re_charset_t *cset);
31 #endif /* RE_ENABLE_I18N */
32 static void free_workarea_compile (regex_t *preg);
33 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
34 #ifdef RE_ENABLE_I18N
35 static void optimize_utf8 (re_dfa_t *dfa);
36 #endif
37 static reg_errcode_t analyze (regex_t *preg);
38 static reg_errcode_t preorder (bin_tree_t *root,
39 			       reg_errcode_t (fn (void *, bin_tree_t *)),
40 			       void *extra);
41 static reg_errcode_t postorder (bin_tree_t *root,
42 				reg_errcode_t (fn (void *, bin_tree_t *)),
43 				void *extra);
44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
47 				 bin_tree_t *node);
48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
51 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
52 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
53 				   unsigned int constraint);
54 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
55 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
56 					 Idx node, bool root);
57 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
58 static Idx fetch_number (re_string_t *input, re_token_t *token,
59 			 reg_syntax_t syntax);
60 static int peek_token (re_token_t *token, re_string_t *input,
61 			reg_syntax_t syntax) internal_function;
62 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
63 			  reg_syntax_t syntax, reg_errcode_t *err);
64 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
65 				  re_token_t *token, reg_syntax_t syntax,
66 				  Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
68 				 re_token_t *token, reg_syntax_t syntax,
69 				 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
71 				     re_token_t *token, reg_syntax_t syntax,
72 				     Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
74 				  re_token_t *token, reg_syntax_t syntax,
75 				  Idx nest, reg_errcode_t *err);
76 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
77 				 re_dfa_t *dfa, re_token_t *token,
78 				 reg_syntax_t syntax, reg_errcode_t *err);
79 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
80 				      re_token_t *token, reg_syntax_t syntax,
81 				      reg_errcode_t *err);
82 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
83 					    re_string_t *regexp,
84 					    re_token_t *token, int token_len,
85 					    re_dfa_t *dfa,
86 					    reg_syntax_t syntax,
87 					    bool accept_hyphen);
88 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 					  re_string_t *regexp,
90 					  re_token_t *token);
91 #ifdef RE_ENABLE_I18N
92 static reg_errcode_t build_equiv_class (bitset_t sbcset,
93 					re_charset_t *mbcset,
94 					Idx *equiv_class_alloc,
95 					const unsigned char *name);
96 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
97 				      bitset_t sbcset,
98 				      re_charset_t *mbcset,
99 				      Idx *char_class_alloc,
100 				      const unsigned char *class_name,
101 				      reg_syntax_t syntax);
102 #else  /* not RE_ENABLE_I18N */
103 static reg_errcode_t build_equiv_class (bitset_t sbcset,
104 					const unsigned char *name);
105 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
106 				      bitset_t sbcset,
107 				      const unsigned char *class_name,
108 				      reg_syntax_t syntax);
109 #endif /* not RE_ENABLE_I18N */
110 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
111 				       RE_TRANSLATE_TYPE trans,
112 				       const unsigned char *class_name,
113 				       const unsigned char *extra,
114 				       bool non_match, reg_errcode_t *err);
115 static bin_tree_t *create_tree (re_dfa_t *dfa,
116 				bin_tree_t *left, bin_tree_t *right,
117 				re_token_type_t type);
118 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
119 				      bin_tree_t *left, bin_tree_t *right,
120 				      const re_token_t *token);
121 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
122 static void free_token (re_token_t *node);
123 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
124 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
125 
126 /* This table gives an error message for each of the error codes listed
127    in regex.h.  Obviously the order here has to be same as there.
128    POSIX doesn't require that we do anything for REG_NOERROR,
129    but why not be nice?  */
130 
131 static const char __re_error_msgid[] =
132   {
133 #define REG_NOERROR_IDX	0
134     gettext_noop ("Success")	/* REG_NOERROR */
135     "\0"
136 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
137     gettext_noop ("No match")	/* REG_NOMATCH */
138     "\0"
139 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
140     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
141     "\0"
142 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
143     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
144     "\0"
145 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
146     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
147     "\0"
148 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
149     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
150     "\0"
151 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
152     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
153     "\0"
154 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
155     gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
156     "\0"
157 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
158     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
159     "\0"
160 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
161     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
162     "\0"
163 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
164     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
165     "\0"
166 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
167     gettext_noop ("Invalid range end")	/* REG_ERANGE */
168     "\0"
169 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
170     gettext_noop ("Memory exhausted") /* REG_ESPACE */
171     "\0"
172 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
173     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
174     "\0"
175 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
176     gettext_noop ("Premature end of regular expression") /* REG_EEND */
177     "\0"
178 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
179     gettext_noop ("Regular expression too big") /* REG_ESIZE */
180     "\0"
181 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
182     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183   };
184 
185 static const size_t __re_error_msgid_idx[] =
186   {
187     REG_NOERROR_IDX,
188     REG_NOMATCH_IDX,
189     REG_BADPAT_IDX,
190     REG_ECOLLATE_IDX,
191     REG_ECTYPE_IDX,
192     REG_EESCAPE_IDX,
193     REG_ESUBREG_IDX,
194     REG_EBRACK_IDX,
195     REG_EPAREN_IDX,
196     REG_EBRACE_IDX,
197     REG_BADBR_IDX,
198     REG_ERANGE_IDX,
199     REG_ESPACE_IDX,
200     REG_BADRPT_IDX,
201     REG_EEND_IDX,
202     REG_ESIZE_IDX,
203     REG_ERPAREN_IDX
204   };
205 
206 /* Entry points for GNU code.  */
207 
208 /* re_compile_pattern is the GNU regular expression compiler: it
209    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
210    Returns 0 if the pattern was valid, otherwise an error string.
211 
212    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
213    are set in BUFP on entry.  */
214 
215 #ifdef _LIBC
216 const char *
re_compile_pattern(pattern,length,bufp)217 re_compile_pattern (pattern, length, bufp)
218     const char *pattern;
219     size_t length;
220     struct re_pattern_buffer *bufp;
221 #else /* size_t might promote */
222 const char *
223 re_compile_pattern (const char *pattern, size_t length,
224 		    struct re_pattern_buffer *bufp)
225 #endif
226 {
227   reg_errcode_t ret;
228 
229   /* And GNU code determines whether or not to get register information
230      by passing null for the REGS argument to re_match, etc., not by
231      setting no_sub, unless RE_NO_SUB is set.  */
232   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
233 
234   /* Match anchors at newline.  */
235   bufp->newline_anchor = 1;
236 
237   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
238 
239   if (!ret)
240     return NULL;
241   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
242 }
243 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)244 weak_alias (__re_compile_pattern, re_compile_pattern)
245 #endif
246 
247 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
248    also be assigned to arbitrarily: each pattern buffer stores its own
249    syntax, so it can be changed between regex compilations.  */
250 /* This has no initializer because initialized variables in Emacs
251    become read-only after dumping.  */
252 reg_syntax_t re_syntax_options;
253 
254 
255 /* Specify the precise syntax of regexps for compilation.  This provides
256    for compatibility for various utilities which historically have
257    different, incompatible syntaxes.
258 
259    The argument SYNTAX is a bit mask comprised of the various bits
260    defined in regex.h.  We return the old syntax.  */
261 
262 reg_syntax_t
263 re_set_syntax (syntax)
264     reg_syntax_t syntax;
265 {
266   reg_syntax_t ret = re_syntax_options;
267 
268   re_syntax_options = syntax;
269   return ret;
270 }
271 #ifdef _LIBC
272 weak_alias (__re_set_syntax, re_set_syntax)
273 #endif
274 
275 int
276 re_compile_fastmap (bufp)
277     struct re_pattern_buffer *bufp;
278 {
279   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
280   char *fastmap = bufp->fastmap;
281 
282   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
283   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
284   if (dfa->init_state != dfa->init_state_word)
285     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
286   if (dfa->init_state != dfa->init_state_nl)
287     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
288   if (dfa->init_state != dfa->init_state_begbuf)
289     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
290   bufp->fastmap_accurate = 1;
291   return 0;
292 }
293 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)294 weak_alias (__re_compile_fastmap, re_compile_fastmap)
295 #endif
296 
297 static inline void
298 __attribute ((always_inline))
299 re_set_fastmap (char *fastmap, bool icase, int ch)
300 {
301   fastmap[ch] = 1;
302   if (icase)
303     fastmap[tolower (ch)] = 1;
304 }
305 
306 /* Helper function for re_compile_fastmap.
307    Compile fastmap for the initial_state INIT_STATE.  */
308 
309 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)310 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
311 			 char *fastmap)
312 {
313   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
314   Idx node_cnt;
315   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
316   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
317     {
318       Idx node = init_state->nodes.elems[node_cnt];
319       re_token_type_t type = dfa->nodes[node].type;
320 
321       if (type == CHARACTER)
322 	{
323 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
324 #ifdef RE_ENABLE_I18N
325 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
326 	    {
327 	      unsigned char buf[MB_LEN_MAX];
328 	      unsigned char *p;
329 	      wchar_t wc;
330 	      mbstate_t state;
331 
332 	      p = buf;
333 	      *p++ = dfa->nodes[node].opr.c;
334 	      while (++node < dfa->nodes_len
335 		     &&	dfa->nodes[node].type == CHARACTER
336 		     && dfa->nodes[node].mb_partial)
337 		*p++ = dfa->nodes[node].opr.c;
338 	      memset (&state, '\0', sizeof (state));
339 	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
340 			     &state) == p - buf
341 		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
342 		      != (size_t) -1))
343 		re_set_fastmap (fastmap, false, buf[0]);
344 	    }
345 #endif
346 	}
347       else if (type == SIMPLE_BRACKET)
348 	{
349 	  int i, ch;
350 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
351 	    {
352 	      int j;
353 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
354 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
355 		if (w & ((bitset_word_t) 1 << j))
356 		  re_set_fastmap (fastmap, icase, ch);
357 	    }
358 	}
359 #ifdef RE_ENABLE_I18N
360       else if (type == COMPLEX_BRACKET)
361 	{
362 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
363 	  Idx i;
364 
365 # ifdef _LIBC
366 	  /* See if we have to try all bytes which start multiple collation
367 	     elements.
368 	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
369 		  collation element, and don't catch 'b' since 'b' is
370 		  the only collation element which starts from 'b' (and
371 		  it is caught by SIMPLE_BRACKET).  */
372 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
373 		  && (cset->ncoll_syms || cset->nranges))
374 		{
375 		  const int32_t *table = (const int32_t *)
376 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
377 		  for (i = 0; i < SBC_MAX; ++i)
378 		    if (table[i] < 0)
379 		      re_set_fastmap (fastmap, icase, i);
380 		}
381 # endif /* _LIBC */
382 
383 	  /* See if we have to start the match at all multibyte characters,
384 	     i.e. where we would not find an invalid sequence.  This only
385 	     applies to multibyte character sets; for single byte character
386 	     sets, the SIMPLE_BRACKET again suffices.  */
387 	  if (dfa->mb_cur_max > 1
388 	      && (cset->nchar_classes || cset->non_match
389 # ifdef _LIBC
390 		  || cset->nequiv_classes
391 # endif /* _LIBC */
392 		 ))
393 	    {
394 	      unsigned char c = 0;
395 	      do
396 		{
397 		  mbstate_t mbs;
398 		  memset (&mbs, 0, sizeof (mbs));
399 		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
400 		    re_set_fastmap (fastmap, false, (int) c);
401 		}
402 	      while (++c != 0);
403 	    }
404 
405 	  else
406 	    {
407 	      /* ... Else catch all bytes which can start the mbchars.  */
408 	      for (i = 0; i < cset->nmbchars; ++i)
409 		{
410 		  char buf[256];
411 		  mbstate_t state;
412 		  memset (&state, '\0', sizeof (state));
413 		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
414 		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
415 		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
416 		    {
417 		      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
418 			  != (size_t) -1)
419 			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
420 		    }
421 		}
422 	    }
423 	}
424 #endif /* RE_ENABLE_I18N */
425       else if (type == OP_PERIOD
426 #ifdef RE_ENABLE_I18N
427 	       || type == OP_UTF8_PERIOD
428 #endif /* RE_ENABLE_I18N */
429 	       || type == END_OF_RE)
430 	{
431 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
432 	  if (type == END_OF_RE)
433 	    bufp->can_be_null = 1;
434 	  return;
435 	}
436     }
437 }
438 
439 /* Entry point for POSIX code.  */
440 /* regcomp takes a regular expression as a string and compiles it.
441 
442    PREG is a regex_t *.  We do not expect any fields to be initialized,
443    since POSIX says we shouldn't.  Thus, we set
444 
445      `buffer' to the compiled pattern;
446      `used' to the length of the compiled pattern;
447      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
448        REG_EXTENDED bit in CFLAGS is set; otherwise, to
449        RE_SYNTAX_POSIX_BASIC;
450      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
451      `fastmap' to an allocated space for the fastmap;
452      `fastmap_accurate' to zero;
453      `re_nsub' to the number of subexpressions in PATTERN.
454 
455    PATTERN is the address of the pattern string.
456 
457    CFLAGS is a series of bits which affect compilation.
458 
459      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
460      use POSIX basic syntax.
461 
462      If REG_NEWLINE is set, then . and [^...] don't match newline.
463      Also, regexec will try a match beginning after every newline.
464 
465      If REG_ICASE is set, then we considers upper- and lowercase
466      versions of letters to be equivalent when matching.
467 
468      If REG_NOSUB is set, then when PREG is passed to regexec, that
469      routine will report only success or failure, and nothing about the
470      registers.
471 
472    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
473    the return codes and their meanings.)  */
474 
475 int
regcomp(preg,pattern,cflags)476 regcomp (preg, pattern, cflags)
477     regex_t *_Restrict_ preg;
478     const char *_Restrict_ pattern;
479     int cflags;
480 {
481   reg_errcode_t ret;
482   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
483 			 : RE_SYNTAX_POSIX_BASIC);
484 
485   preg->buffer = NULL;
486   preg->allocated = 0;
487   preg->used = 0;
488 
489   /* Try to allocate space for the fastmap.  */
490   preg->fastmap = re_malloc (char, SBC_MAX);
491   if (BE (preg->fastmap == NULL, 0))
492     return REG_ESPACE;
493 
494   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
495 
496   /* If REG_NEWLINE is set, newlines are treated differently.  */
497   if (cflags & REG_NEWLINE)
498     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
499       syntax &= ~RE_DOT_NEWLINE;
500       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
501       /* It also changes the matching behavior.  */
502       preg->newline_anchor = 1;
503     }
504   else
505     preg->newline_anchor = 0;
506   preg->no_sub = !!(cflags & REG_NOSUB);
507   preg->translate = NULL;
508 
509   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
510 
511   /* POSIX doesn't distinguish between an unmatched open-group and an
512      unmatched close-group: both are REG_EPAREN.  */
513   if (ret == REG_ERPAREN)
514     ret = REG_EPAREN;
515 
516   /* We have already checked preg->fastmap != NULL.  */
517   if (BE (ret == REG_NOERROR, 1))
518     /* Compute the fastmap now, since regexec cannot modify the pattern
519        buffer.  This function never fails in this implementation.  */
520     (void) re_compile_fastmap (preg);
521   else
522     {
523       /* Some error occurred while compiling the expression.  */
524       re_free (preg->fastmap);
525       preg->fastmap = NULL;
526     }
527 
528   return (int) ret;
529 }
530 #ifdef _LIBC
531 weak_alias (__regcomp, regcomp)
532 #endif
533 
534 /* Returns a message corresponding to an error code, ERRCODE, returned
535    from either regcomp or regexec.   We don't use PREG here.  */
536 
537 #ifdef _LIBC
538 size_t
539 regerror (errcode, preg, errbuf, errbuf_size)
540     int errcode;
541     const regex_t *_Restrict_ preg;
542     char *_Restrict_ errbuf;
543     size_t errbuf_size;
544 #else /* size_t might promote */
545 size_t
546 regerror (int errcode, const regex_t *_Restrict_ preg,
547 	  char *_Restrict_ errbuf, size_t errbuf_size)
548 #endif
549 {
550   const char *msg;
551   size_t msg_size;
552 
553   if (BE (errcode < 0
554 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
555 			       / sizeof (__re_error_msgid_idx[0])), 0))
556     /* Only error codes returned by the rest of the code should be passed
557        to this routine.  If we are given anything else, or if other regex
558        code generates an invalid error code, then the program has a bug.
559        Dump core so we can fix it.  */
560     abort ();
561 
562   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
563 
564   msg_size = strlen (msg) + 1; /* Includes the null.  */
565 
566   if (BE (errbuf_size != 0, 1))
567     {
568       size_t cpy_size = msg_size;
569       if (BE (msg_size > errbuf_size, 0))
570 	{
571 	  cpy_size = errbuf_size - 1;
572 	  errbuf[cpy_size] = '\0';
573 	}
574       memcpy (errbuf, msg, cpy_size);
575     }
576 
577   return msg_size;
578 }
579 #ifdef _LIBC
580 weak_alias (__regerror, regerror)
581 #endif
582 
583 
584 #ifdef RE_ENABLE_I18N
585 /* This static array is used for the map to single-byte characters when
586    UTF-8 is used.  Otherwise we would allocate memory just to initialize
587    it the same all the time.  UTF-8 is the preferred encoding so this is
588    a worthwhile optimization.  */
589 static const bitset_t utf8_sb_map =
590 {
591   /* Set the first 128 bits.  */
592 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
593 #  error "bitset_word_t is narrower than 32 bits"
594 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
595   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
596 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
597   BITSET_WORD_MAX, BITSET_WORD_MAX,
598 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
599   BITSET_WORD_MAX,
600 # endif
601   (BITSET_WORD_MAX
602    >> (SBC_MAX % BITSET_WORD_BITS == 0
603        ? 0
604        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
605 };
606 #endif
607 
608 
609 static void
free_dfa_content(re_dfa_t * dfa)610 free_dfa_content (re_dfa_t *dfa)
611 {
612   Idx i, j;
613 
614   if (dfa->nodes)
615     for (i = 0; i < dfa->nodes_len; ++i)
616       free_token (dfa->nodes + i);
617   re_free (dfa->nexts);
618   for (i = 0; i < dfa->nodes_len; ++i)
619     {
620       if (dfa->eclosures != NULL)
621 	re_node_set_free (dfa->eclosures + i);
622       if (dfa->inveclosures != NULL)
623 	re_node_set_free (dfa->inveclosures + i);
624       if (dfa->edests != NULL)
625 	re_node_set_free (dfa->edests + i);
626     }
627   re_free (dfa->edests);
628   re_free (dfa->eclosures);
629   re_free (dfa->inveclosures);
630   re_free (dfa->nodes);
631 
632   if (dfa->state_table)
633     for (i = 0; i <= dfa->state_hash_mask; ++i)
634       {
635 	struct re_state_table_entry *entry = dfa->state_table + i;
636 	for (j = 0; j < entry->num; ++j)
637 	  {
638 	    re_dfastate_t *state = entry->array[j];
639 	    free_state (state);
640 	  }
641         re_free (entry->array);
642       }
643   re_free (dfa->state_table);
644 #ifdef RE_ENABLE_I18N
645   if (dfa->sb_char != utf8_sb_map)
646     re_free (dfa->sb_char);
647 #endif
648   re_free (dfa->subexp_map);
649 #ifdef DEBUG
650   re_free (dfa->re_str);
651 #endif
652 
653   re_free (dfa);
654 }
655 
656 
657 /* Free dynamically allocated space used by PREG.  */
658 
659 void
regfree(preg)660 regfree (preg)
661     regex_t *preg;
662 {
663   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
664   if (BE (dfa != NULL, 1))
665     free_dfa_content (dfa);
666   preg->buffer = NULL;
667   preg->allocated = 0;
668 
669   re_free (preg->fastmap);
670   preg->fastmap = NULL;
671 
672   re_free (preg->translate);
673   preg->translate = NULL;
674 }
675 #ifdef _LIBC
676 weak_alias (__regfree, regfree)
677 #endif
678 
679 /* Entry points compatible with 4.2 BSD regex library.  We don't define
680    them unless specifically requested.  */
681 
682 #if defined _REGEX_RE_COMP || defined _LIBC
683 
684 /* BSD has one and only one pattern buffer.  */
685 static struct re_pattern_buffer re_comp_buf;
686 
687 char *
688 # ifdef _LIBC
689 /* Make these definitions weak in libc, so POSIX programs can redefine
690    these names if they don't use our functions, and still use
691    regcomp/regexec above without link errors.  */
692 weak_function
693 # endif
re_comp(s)694 re_comp (s)
695      const char *s;
696 {
697   reg_errcode_t ret;
698   char *fastmap;
699 
700   if (!s)
701     {
702       if (!re_comp_buf.buffer)
703 	return gettext ("No previous regular expression");
704       return 0;
705     }
706 
707   if (re_comp_buf.buffer)
708     {
709       fastmap = re_comp_buf.fastmap;
710       re_comp_buf.fastmap = NULL;
711       __regfree (&re_comp_buf);
712       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
713       re_comp_buf.fastmap = fastmap;
714     }
715 
716   if (re_comp_buf.fastmap == NULL)
717     {
718       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
719       if (re_comp_buf.fastmap == NULL)
720 	return (char *) gettext (__re_error_msgid
721 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
722     }
723 
724   /* Since `re_exec' always passes NULL for the `regs' argument, we
725      don't need to initialize the pattern buffer fields which affect it.  */
726 
727   /* Match anchors at newlines.  */
728   re_comp_buf.newline_anchor = 1;
729 
730   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
731 
732   if (!ret)
733     return NULL;
734 
735   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
736   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
737 }
738 
739 #ifdef _LIBC
libc_freeres_fn(free_mem)740 libc_freeres_fn (free_mem)
741 {
742   __regfree (&re_comp_buf);
743 }
744 #endif
745 
746 #endif /* _REGEX_RE_COMP */
747 
748 /* Internal entry point.
749    Compile the regular expression PATTERN, whose length is LENGTH.
750    SYNTAX indicate regular expression's syntax.  */
751 
752 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)753 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
754 		     reg_syntax_t syntax)
755 {
756   reg_errcode_t err = REG_NOERROR;
757   re_dfa_t *dfa;
758   re_string_t regexp;
759 
760   /* Initialize the pattern buffer.  */
761   preg->fastmap_accurate = 0;
762   preg->syntax = syntax;
763   preg->not_bol = preg->not_eol = 0;
764   preg->used = 0;
765   preg->re_nsub = 0;
766   preg->can_be_null = 0;
767   preg->regs_allocated = REGS_UNALLOCATED;
768 
769   /* Initialize the dfa.  */
770   dfa = (re_dfa_t *) preg->buffer;
771   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
772     {
773       /* If zero allocated, but buffer is non-null, try to realloc
774 	 enough space.  This loses if buffer's address is bogus, but
775 	 that is the user's responsibility.  If ->buffer is NULL this
776 	 is a simple allocation.  */
777       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
778       if (dfa == NULL)
779 	return REG_ESPACE;
780       preg->allocated = sizeof (re_dfa_t);
781       preg->buffer = (unsigned char *) dfa;
782     }
783   preg->used = sizeof (re_dfa_t);
784 
785   err = init_dfa (dfa, length);
786   if (BE (err != REG_NOERROR, 0))
787     {
788       free_dfa_content (dfa);
789       preg->buffer = NULL;
790       preg->allocated = 0;
791       return err;
792     }
793 #ifdef DEBUG
794   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
795   dfa->re_str = re_malloc (char, length + 1);
796   strncpy (dfa->re_str, pattern, length + 1);
797 #endif
798 
799   __libc_lock_init (dfa->lock);
800 
801   err = re_string_construct (&regexp, pattern, length, preg->translate,
802 			     (syntax & RE_ICASE) != 0, dfa);
803   if (BE (err != REG_NOERROR, 0))
804     {
805     re_compile_internal_free_return:
806       free_workarea_compile (preg);
807       re_string_destruct (&regexp);
808       free_dfa_content (dfa);
809       preg->buffer = NULL;
810       preg->allocated = 0;
811       return err;
812     }
813 
814   /* Parse the regular expression, and build a structure tree.  */
815   preg->re_nsub = 0;
816   dfa->str_tree = parse (&regexp, preg, syntax, &err);
817   if (BE (dfa->str_tree == NULL, 0))
818     goto re_compile_internal_free_return;
819 
820   /* Analyze the tree and create the nfa.  */
821   err = analyze (preg);
822   if (BE (err != REG_NOERROR, 0))
823     goto re_compile_internal_free_return;
824 
825 #ifdef RE_ENABLE_I18N
826   /* If possible, do searching in single byte encoding to speed things up.  */
827   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
828     optimize_utf8 (dfa);
829 #endif
830 
831   /* Then create the initial state of the dfa.  */
832   err = create_initial_state (dfa);
833 
834   /* Release work areas.  */
835   free_workarea_compile (preg);
836   re_string_destruct (&regexp);
837 
838   if (BE (err != REG_NOERROR, 0))
839     {
840       free_dfa_content (dfa);
841       preg->buffer = NULL;
842       preg->allocated = 0;
843     }
844 
845   return err;
846 }
847 
848 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
849    as the initial length of some arrays.  */
850 
851 static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)852 init_dfa (re_dfa_t *dfa, size_t pat_len)
853 {
854   __re_size_t table_size;
855 #ifdef RE_ENABLE_I18N
856   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
857 #else
858   size_t max_i18n_object_size = 0;
859 #endif
860   size_t max_object_size =
861     MAX (sizeof (struct re_state_table_entry),
862 	 MAX (sizeof (re_token_t),
863 	      MAX (sizeof (re_node_set),
864 		   MAX (sizeof (regmatch_t),
865 			max_i18n_object_size))));
866 
867   memset (dfa, '\0', sizeof (re_dfa_t));
868 
869   /* Force allocation of str_tree_storage the first time.  */
870   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
871 
872   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
873      calculation below, and for similar doubling calculations
874      elsewhere.  And it's <= rather than <, because some of the
875      doubling calculations add 1 afterwards.  */
876   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
877     return REG_ESPACE;
878 
879   dfa->nodes_alloc = pat_len + 1;
880   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
881 
882   /*  table_size = 2 ^ ceil(log pat_len) */
883   for (table_size = 1; ; table_size <<= 1)
884     if (table_size > pat_len)
885       break;
886 
887   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
888   dfa->state_hash_mask = table_size - 1;
889 
890   dfa->mb_cur_max = MB_CUR_MAX;
891 #ifdef _LIBC
892   if (dfa->mb_cur_max == 6
893       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
894     dfa->is_utf8 = 1;
895   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
896 		       != 0);
897 #else
898   if (strcmp (locale_charset (), "UTF-8") == 0)
899     dfa->is_utf8 = 1;
900 
901   /* We check exhaustively in the loop below if this charset is a
902      superset of ASCII.  */
903   dfa->map_notascii = 0;
904 #endif
905 
906 #ifdef RE_ENABLE_I18N
907   if (dfa->mb_cur_max > 1)
908     {
909       if (dfa->is_utf8)
910 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
911       else
912 	{
913 	  int i, j, ch;
914 
915 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
916 	  if (BE (dfa->sb_char == NULL, 0))
917 	    return REG_ESPACE;
918 
919 	  /* Set the bits corresponding to single byte chars.  */
920 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
921 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
922 	      {
923 		wint_t wch = __btowc (ch);
924 		if (wch != WEOF)
925 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
926 # ifndef _LIBC
927 		if (isascii (ch) && wch != ch)
928 		  dfa->map_notascii = 1;
929 # endif
930 	      }
931 	}
932     }
933 #endif
934 
935   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
936     return REG_ESPACE;
937   return REG_NOERROR;
938 }
939 
940 /* Initialize WORD_CHAR table, which indicate which character is
941    "word".  In this case "word" means that it is the word construction
942    character used by some operators like "\<", "\>", etc.  */
943 
944 static void
945 internal_function
init_word_char(re_dfa_t * dfa)946 init_word_char (re_dfa_t *dfa)
947 {
948   int i, j, ch;
949   dfa->word_ops_used = 1;
950   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
951     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
952       if (isalnum (ch) || ch == '_')
953 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
954 }
955 
956 /* Free the work area which are only used while compiling.  */
957 
958 static void
free_workarea_compile(regex_t * preg)959 free_workarea_compile (regex_t *preg)
960 {
961   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
962   bin_tree_storage_t *storage, *next;
963   for (storage = dfa->str_tree_storage; storage; storage = next)
964     {
965       next = storage->next;
966       re_free (storage);
967     }
968   dfa->str_tree_storage = NULL;
969   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
970   dfa->str_tree = NULL;
971   re_free (dfa->org_indices);
972   dfa->org_indices = NULL;
973 }
974 
975 /* Create initial states for all contexts.  */
976 
977 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)978 create_initial_state (re_dfa_t *dfa)
979 {
980   Idx first, i;
981   reg_errcode_t err;
982   re_node_set init_nodes;
983 
984   /* Initial states have the epsilon closure of the node which is
985      the first node of the regular expression.  */
986   first = dfa->str_tree->first->node_idx;
987   dfa->init_node = first;
988   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
989   if (BE (err != REG_NOERROR, 0))
990     return err;
991 
992   /* The back-references which are in initial states can epsilon transit,
993      since in this case all of the subexpressions can be null.
994      Then we add epsilon closures of the nodes which are the next nodes of
995      the back-references.  */
996   if (dfa->nbackref > 0)
997     for (i = 0; i < init_nodes.nelem; ++i)
998       {
999 	Idx node_idx = init_nodes.elems[i];
1000 	re_token_type_t type = dfa->nodes[node_idx].type;
1001 
1002 	Idx clexp_idx;
1003 	if (type != OP_BACK_REF)
1004 	  continue;
1005 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1006 	  {
1007 	    re_token_t *clexp_node;
1008 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1009 	    if (clexp_node->type == OP_CLOSE_SUBEXP
1010 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1011 	      break;
1012 	  }
1013 	if (clexp_idx == init_nodes.nelem)
1014 	  continue;
1015 
1016 	if (type == OP_BACK_REF)
1017 	  {
1018 	    Idx dest_idx = dfa->edests[node_idx].elems[0];
1019 	    if (!re_node_set_contains (&init_nodes, dest_idx))
1020 	      {
1021 		re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1022 		i = 0;
1023 	      }
1024 	  }
1025       }
1026 
1027   /* It must be the first time to invoke acquire_state.  */
1028   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1029   /* We don't check ERR here, since the initial state must not be NULL.  */
1030   if (BE (dfa->init_state == NULL, 0))
1031     return err;
1032   if (dfa->init_state->has_constraint)
1033     {
1034       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1035 						       CONTEXT_WORD);
1036       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1037 						     CONTEXT_NEWLINE);
1038       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1039 							 &init_nodes,
1040 							 CONTEXT_NEWLINE
1041 							 | CONTEXT_BEGBUF);
1042       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1043 	      || dfa->init_state_begbuf == NULL, 0))
1044 	return err;
1045     }
1046   else
1047     dfa->init_state_word = dfa->init_state_nl
1048       = dfa->init_state_begbuf = dfa->init_state;
1049 
1050   re_node_set_free (&init_nodes);
1051   return REG_NOERROR;
1052 }
1053 
1054 #ifdef RE_ENABLE_I18N
1055 /* If it is possible to do searching in single byte encoding instead of UTF-8
1056    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1057    DFA nodes where needed.  */
1058 
1059 static void
optimize_utf8(re_dfa_t * dfa)1060 optimize_utf8 (re_dfa_t *dfa)
1061 {
1062   Idx node;
1063   int i;
1064   bool mb_chars = false;
1065   bool has_period = false;
1066 
1067   for (node = 0; node < dfa->nodes_len; ++node)
1068     switch (dfa->nodes[node].type)
1069       {
1070       case CHARACTER:
1071 	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1072 	  mb_chars = true;
1073 	break;
1074       case ANCHOR:
1075 	switch (dfa->nodes[node].opr.ctx_type)
1076 	  {
1077 	  case LINE_FIRST:
1078 	  case LINE_LAST:
1079 	  case BUF_FIRST:
1080 	  case BUF_LAST:
1081 	    break;
1082 	  default:
1083 	    /* Word anchors etc. cannot be handled.  It's okay to test
1084 	       opr.ctx_type since constraints (for all DFA nodes) are
1085 	       created by ORing one or more opr.ctx_type values.  */
1086 	    return;
1087 	  }
1088 	break;
1089       case OP_PERIOD:
1090         has_period = true;
1091         break;
1092       case OP_BACK_REF:
1093       case OP_ALT:
1094       case END_OF_RE:
1095       case OP_DUP_ASTERISK:
1096       case OP_OPEN_SUBEXP:
1097       case OP_CLOSE_SUBEXP:
1098 	break;
1099       case COMPLEX_BRACKET:
1100 	return;
1101       case SIMPLE_BRACKET:
1102 	/* Just double check.  */
1103 	{
1104 	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1105 			? 0
1106 			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1107 	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1108 	    {
1109 	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1110 		return;
1111 	      rshift = 0;
1112 	    }
1113 	}
1114 	break;
1115       default:
1116 	abort ();
1117       }
1118 
1119   if (mb_chars || has_period)
1120     for (node = 0; node < dfa->nodes_len; ++node)
1121       {
1122 	if (dfa->nodes[node].type == CHARACTER
1123 	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
1124 	  dfa->nodes[node].mb_partial = 0;
1125 	else if (dfa->nodes[node].type == OP_PERIOD)
1126 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1127       }
1128 
1129   /* The search can be in single byte locale.  */
1130   dfa->mb_cur_max = 1;
1131   dfa->is_utf8 = 0;
1132   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1133 }
1134 #endif
1135 
1136 /* Analyze the structure tree, and calculate "first", "next", "edest",
1137    "eclosure", and "inveclosure".  */
1138 
1139 static reg_errcode_t
analyze(regex_t * preg)1140 analyze (regex_t *preg)
1141 {
1142   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1143   reg_errcode_t ret;
1144 
1145   /* Allocate arrays.  */
1146   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1147   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1148   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1149   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1150   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1151 	  || dfa->eclosures == NULL, 0))
1152     return REG_ESPACE;
1153 
1154   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1155   if (dfa->subexp_map != NULL)
1156     {
1157       Idx i;
1158       for (i = 0; i < preg->re_nsub; i++)
1159 	dfa->subexp_map[i] = i;
1160       preorder (dfa->str_tree, optimize_subexps, dfa);
1161       for (i = 0; i < preg->re_nsub; i++)
1162 	if (dfa->subexp_map[i] != i)
1163 	  break;
1164       if (i == preg->re_nsub)
1165 	{
1166 	  free (dfa->subexp_map);
1167 	  dfa->subexp_map = NULL;
1168 	}
1169     }
1170 
1171   ret = postorder (dfa->str_tree, lower_subexps, preg);
1172   if (BE (ret != REG_NOERROR, 0))
1173     return ret;
1174   ret = postorder (dfa->str_tree, calc_first, dfa);
1175   if (BE (ret != REG_NOERROR, 0))
1176     return ret;
1177   preorder (dfa->str_tree, calc_next, dfa);
1178   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1179   if (BE (ret != REG_NOERROR, 0))
1180     return ret;
1181   ret = calc_eclosure (dfa);
1182   if (BE (ret != REG_NOERROR, 0))
1183     return ret;
1184 
1185   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1186      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1187   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1188       || dfa->nbackref)
1189     {
1190       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1191       if (BE (dfa->inveclosures == NULL, 0))
1192         return REG_ESPACE;
1193       ret = calc_inveclosure (dfa);
1194     }
1195 
1196   return ret;
1197 }
1198 
1199 /* Our parse trees are very unbalanced, so we cannot use a stack to
1200    implement parse tree visits.  Instead, we use parent pointers and
1201    some hairy code in these two functions.  */
1202 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1203 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1204 	   void *extra)
1205 {
1206   bin_tree_t *node, *prev;
1207 
1208   for (node = root; ; )
1209     {
1210       /* Descend down the tree, preferably to the left (or to the right
1211 	 if that's the only child).  */
1212       while (node->left || node->right)
1213 	if (node->left)
1214           node = node->left;
1215         else
1216           node = node->right;
1217 
1218       do
1219 	{
1220 	  reg_errcode_t err = fn (extra, node);
1221 	  if (BE (err != REG_NOERROR, 0))
1222 	    return err;
1223           if (node->parent == NULL)
1224 	    return REG_NOERROR;
1225 	  prev = node;
1226 	  node = node->parent;
1227 	}
1228       /* Go up while we have a node that is reached from the right.  */
1229       while (node->right == prev || node->right == NULL);
1230       node = node->right;
1231     }
1232 }
1233 
1234 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1235 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1236 	  void *extra)
1237 {
1238   bin_tree_t *node;
1239 
1240   for (node = root; ; )
1241     {
1242       reg_errcode_t err = fn (extra, node);
1243       if (BE (err != REG_NOERROR, 0))
1244 	return err;
1245 
1246       /* Go to the left node, or up and to the right.  */
1247       if (node->left)
1248 	node = node->left;
1249       else
1250 	{
1251 	  bin_tree_t *prev = NULL;
1252 	  while (node->right == prev || node->right == NULL)
1253 	    {
1254 	      prev = node;
1255 	      node = node->parent;
1256 	      if (!node)
1257 	        return REG_NOERROR;
1258 	    }
1259 	  node = node->right;
1260 	}
1261     }
1262 }
1263 
1264 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1265    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1266    backreferences as well.  Requires a preorder visit.  */
1267 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1268 optimize_subexps (void *extra, bin_tree_t *node)
1269 {
1270   re_dfa_t *dfa = (re_dfa_t *) extra;
1271 
1272   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1273     {
1274       int idx = node->token.opr.idx;
1275       node->token.opr.idx = dfa->subexp_map[idx];
1276       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1277     }
1278 
1279   else if (node->token.type == SUBEXP
1280            && node->left && node->left->token.type == SUBEXP)
1281     {
1282       Idx other_idx = node->left->token.opr.idx;
1283 
1284       node->left = node->left->left;
1285       if (node->left)
1286         node->left->parent = node;
1287 
1288       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1289       if (other_idx < BITSET_WORD_BITS)
1290 	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1291     }
1292 
1293   return REG_NOERROR;
1294 }
1295 
1296 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1297    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1298 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1299 lower_subexps (void *extra, bin_tree_t *node)
1300 {
1301   regex_t *preg = (regex_t *) extra;
1302   reg_errcode_t err = REG_NOERROR;
1303 
1304   if (node->left && node->left->token.type == SUBEXP)
1305     {
1306       node->left = lower_subexp (&err, preg, node->left);
1307       if (node->left)
1308 	node->left->parent = node;
1309     }
1310   if (node->right && node->right->token.type == SUBEXP)
1311     {
1312       node->right = lower_subexp (&err, preg, node->right);
1313       if (node->right)
1314 	node->right->parent = node;
1315     }
1316 
1317   return err;
1318 }
1319 
1320 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1321 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1322 {
1323   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1324   bin_tree_t *body = node->left;
1325   bin_tree_t *op, *cls, *tree1, *tree;
1326 
1327   if (preg->no_sub
1328       /* We do not optimize empty subexpressions, because otherwise we may
1329 	 have bad CONCAT nodes with NULL children.  This is obviously not
1330 	 very common, so we do not lose much.  An example that triggers
1331 	 this case is the sed "script" /\(\)/x.  */
1332       && node->left != NULL
1333       && (node->token.opr.idx >= BITSET_WORD_BITS
1334 	  || !(dfa->used_bkref_map
1335 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1336     return node->left;
1337 
1338   /* Convert the SUBEXP node to the concatenation of an
1339      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1340   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1341   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1342   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1343   tree = create_tree (dfa, op, tree1, CONCAT);
1344   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1345     {
1346       *err = REG_ESPACE;
1347       return NULL;
1348     }
1349 
1350   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1351   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1352   return tree;
1353 }
1354 
1355 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1356    nodes.  Requires a postorder visit.  */
1357 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1358 calc_first (void *extra, bin_tree_t *node)
1359 {
1360   re_dfa_t *dfa = (re_dfa_t *) extra;
1361   if (node->token.type == CONCAT)
1362     {
1363       node->first = node->left->first;
1364       node->node_idx = node->left->node_idx;
1365     }
1366   else
1367     {
1368       node->first = node;
1369       node->node_idx = re_dfa_add_node (dfa, node->token);
1370       if (BE (node->node_idx == REG_MISSING, 0))
1371         return REG_ESPACE;
1372       if (node->token.type == ANCHOR)
1373         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1374     }
1375   return REG_NOERROR;
1376 }
1377 
1378 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1379 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1380 calc_next (void *extra, bin_tree_t *node)
1381 {
1382   switch (node->token.type)
1383     {
1384     case OP_DUP_ASTERISK:
1385       node->left->next = node;
1386       break;
1387     case CONCAT:
1388       node->left->next = node->right->first;
1389       node->right->next = node->next;
1390       break;
1391     default:
1392       if (node->left)
1393 	node->left->next = node->next;
1394       if (node->right)
1395         node->right->next = node->next;
1396       break;
1397     }
1398   return REG_NOERROR;
1399 }
1400 
1401 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1402 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1403 link_nfa_nodes (void *extra, bin_tree_t *node)
1404 {
1405   re_dfa_t *dfa = (re_dfa_t *) extra;
1406   Idx idx = node->node_idx;
1407   reg_errcode_t err = REG_NOERROR;
1408 
1409   switch (node->token.type)
1410     {
1411     case CONCAT:
1412       break;
1413 
1414     case END_OF_RE:
1415       assert (node->next == NULL);
1416       break;
1417 
1418     case OP_DUP_ASTERISK:
1419     case OP_ALT:
1420       {
1421 	Idx left, right;
1422 	dfa->has_plural_match = 1;
1423 	if (node->left != NULL)
1424 	  left = node->left->first->node_idx;
1425 	else
1426 	  left = node->next->node_idx;
1427 	if (node->right != NULL)
1428 	  right = node->right->first->node_idx;
1429 	else
1430 	  right = node->next->node_idx;
1431 	assert (REG_VALID_INDEX (left));
1432 	assert (REG_VALID_INDEX (right));
1433 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1434       }
1435       break;
1436 
1437     case ANCHOR:
1438     case OP_OPEN_SUBEXP:
1439     case OP_CLOSE_SUBEXP:
1440       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1441       break;
1442 
1443     case OP_BACK_REF:
1444       dfa->nexts[idx] = node->next->node_idx;
1445       if (node->token.type == OP_BACK_REF)
1446 	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1447       break;
1448 
1449     default:
1450       assert (!IS_EPSILON_NODE (node->token.type));
1451       dfa->nexts[idx] = node->next->node_idx;
1452       break;
1453     }
1454 
1455   return err;
1456 }
1457 
1458 /* Duplicate the epsilon closure of the node ROOT_NODE.
1459    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1460    to their own constraint.  */
1461 
1462 static reg_errcode_t
1463 internal_function
duplicate_node_closure(re_dfa_t * dfa,Idx top_org_node,Idx top_clone_node,Idx root_node,unsigned int init_constraint)1464 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1465 			Idx root_node, unsigned int init_constraint)
1466 {
1467   Idx org_node, clone_node;
1468   bool ok;
1469   unsigned int constraint = init_constraint;
1470   for (org_node = top_org_node, clone_node = top_clone_node;;)
1471     {
1472       Idx org_dest, clone_dest;
1473       if (dfa->nodes[org_node].type == OP_BACK_REF)
1474 	{
1475 	  /* If the back reference epsilon-transit, its destination must
1476 	     also have the constraint.  Then duplicate the epsilon closure
1477 	     of the destination of the back reference, and store it in
1478 	     edests of the back reference.  */
1479 	  org_dest = dfa->nexts[org_node];
1480 	  re_node_set_empty (dfa->edests + clone_node);
1481 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1482 	  if (BE (clone_dest == REG_MISSING, 0))
1483 	    return REG_ESPACE;
1484 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1485 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1486 	  if (BE (! ok, 0))
1487 	    return REG_ESPACE;
1488 	}
1489       else if (dfa->edests[org_node].nelem == 0)
1490 	{
1491 	  /* In case of the node can't epsilon-transit, don't duplicate the
1492 	     destination and store the original destination as the
1493 	     destination of the node.  */
1494 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1495 	  break;
1496 	}
1497       else if (dfa->edests[org_node].nelem == 1)
1498 	{
1499 	  /* In case of the node can epsilon-transit, and it has only one
1500 	     destination.  */
1501 	  org_dest = dfa->edests[org_node].elems[0];
1502 	  re_node_set_empty (dfa->edests + clone_node);
1503 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1504 	  /* If the node is root_node itself, it means the epsilon closure
1505 	     has a loop.  Then tie it to the destination of the root_node.  */
1506 	  if (org_node == root_node && clone_node != org_node)
1507 	    {
1508 	      ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1509 	      if (BE (! ok, 0))
1510 	        return REG_ESPACE;
1511 	      break;
1512 	    }
1513 	  /* In case the node has another constraint, append it.  */
1514 	  constraint |= dfa->nodes[org_node].constraint;
1515 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1516 	  if (BE (clone_dest == REG_MISSING, 0))
1517 	    return REG_ESPACE;
1518 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1519 	  if (BE (! ok, 0))
1520 	    return REG_ESPACE;
1521 	}
1522       else /* dfa->edests[org_node].nelem == 2 */
1523 	{
1524 	  /* In case of the node can epsilon-transit, and it has two
1525 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1526 	  org_dest = dfa->edests[org_node].elems[0];
1527 	  re_node_set_empty (dfa->edests + clone_node);
1528 	  /* Search for a duplicated node which satisfies the constraint.  */
1529 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1530 	  if (clone_dest == REG_MISSING)
1531 	    {
1532 	      /* There is no such duplicated node, create a new one.  */
1533 	      reg_errcode_t err;
1534 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1535 	      if (BE (clone_dest == REG_MISSING, 0))
1536 		return REG_ESPACE;
1537 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1538 	      if (BE (! ok, 0))
1539 		return REG_ESPACE;
1540 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1541 					    root_node, constraint);
1542 	      if (BE (err != REG_NOERROR, 0))
1543 		return err;
1544 	    }
1545 	  else
1546 	    {
1547 	      /* There is a duplicated node which satisfy the constraint,
1548 		 use it to avoid infinite loop.  */
1549 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1550 	      if (BE (! ok, 0))
1551 		return REG_ESPACE;
1552 	    }
1553 
1554 	  org_dest = dfa->edests[org_node].elems[1];
1555 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1556 	  if (BE (clone_dest == REG_MISSING, 0))
1557 	    return REG_ESPACE;
1558 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1559 	  if (BE (! ok, 0))
1560 	    return REG_ESPACE;
1561 	}
1562       org_node = org_dest;
1563       clone_node = clone_dest;
1564     }
1565   return REG_NOERROR;
1566 }
1567 
1568 /* Search for a node which is duplicated from the node ORG_NODE, and
1569    satisfies the constraint CONSTRAINT.  */
1570 
1571 static Idx
search_duplicated_node(const re_dfa_t * dfa,Idx org_node,unsigned int constraint)1572 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1573 			unsigned int constraint)
1574 {
1575   Idx idx;
1576   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1577     {
1578       if (org_node == dfa->org_indices[idx]
1579 	  && constraint == dfa->nodes[idx].constraint)
1580 	return idx; /* Found.  */
1581     }
1582   return REG_MISSING; /* Not found.  */
1583 }
1584 
1585 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1586    Return the index of the new node, or REG_MISSING if insufficient storage is
1587    available.  */
1588 
1589 static Idx
duplicate_node(re_dfa_t * dfa,Idx org_idx,unsigned int constraint)1590 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1591 {
1592   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1593   if (BE (dup_idx != REG_MISSING, 1))
1594     {
1595       dfa->nodes[dup_idx].constraint = constraint;
1596       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1597       dfa->nodes[dup_idx].duplicated = 1;
1598 
1599       /* Store the index of the original node.  */
1600       dfa->org_indices[dup_idx] = org_idx;
1601     }
1602   return dup_idx;
1603 }
1604 
1605 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1606 calc_inveclosure (re_dfa_t *dfa)
1607 {
1608   Idx src, idx;
1609   bool ok;
1610   for (idx = 0; idx < dfa->nodes_len; ++idx)
1611     re_node_set_init_empty (dfa->inveclosures + idx);
1612 
1613   for (src = 0; src < dfa->nodes_len; ++src)
1614     {
1615       Idx *elems = dfa->eclosures[src].elems;
1616       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1617 	{
1618 	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1619 	  if (BE (! ok, 0))
1620 	    return REG_ESPACE;
1621 	}
1622     }
1623 
1624   return REG_NOERROR;
1625 }
1626 
1627 /* Calculate "eclosure" for all the node in DFA.  */
1628 
1629 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1630 calc_eclosure (re_dfa_t *dfa)
1631 {
1632   Idx node_idx;
1633   bool incomplete;
1634 #ifdef DEBUG
1635   assert (dfa->nodes_len > 0);
1636 #endif
1637   incomplete = false;
1638   /* For each nodes, calculate epsilon closure.  */
1639   for (node_idx = 0; ; ++node_idx)
1640     {
1641       reg_errcode_t err;
1642       re_node_set eclosure_elem;
1643       if (node_idx == dfa->nodes_len)
1644 	{
1645 	  if (!incomplete)
1646 	    break;
1647 	  incomplete = false;
1648 	  node_idx = 0;
1649 	}
1650 
1651 #ifdef DEBUG
1652       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1653 #endif
1654 
1655       /* If we have already calculated, skip it.  */
1656       if (dfa->eclosures[node_idx].nelem != 0)
1657 	continue;
1658       /* Calculate epsilon closure of `node_idx'.  */
1659       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1660       if (BE (err != REG_NOERROR, 0))
1661 	return err;
1662 
1663       if (dfa->eclosures[node_idx].nelem == 0)
1664 	{
1665 	  incomplete = true;
1666 	  re_node_set_free (&eclosure_elem);
1667 	}
1668     }
1669   return REG_NOERROR;
1670 }
1671 
1672 /* Calculate epsilon closure of NODE.  */
1673 
1674 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,Idx node,bool root)1675 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1676 {
1677   reg_errcode_t err;
1678   Idx i;
1679   bool incomplete;
1680   bool ok;
1681   re_node_set eclosure;
1682   incomplete = false;
1683   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1684   if (BE (err != REG_NOERROR, 0))
1685     return err;
1686 
1687   /* This indicates that we are calculating this node now.
1688      We reference this value to avoid infinite loop.  */
1689   dfa->eclosures[node].nelem = REG_MISSING;
1690 
1691   /* If the current node has constraints, duplicate all nodes
1692      since they must inherit the constraints.  */
1693   if (dfa->nodes[node].constraint
1694       && dfa->edests[node].nelem
1695       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1696     {
1697       err = duplicate_node_closure (dfa, node, node, node,
1698 				    dfa->nodes[node].constraint);
1699       if (BE (err != REG_NOERROR, 0))
1700 	return err;
1701     }
1702 
1703   /* Expand each epsilon destination nodes.  */
1704   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1705     for (i = 0; i < dfa->edests[node].nelem; ++i)
1706       {
1707 	re_node_set eclosure_elem;
1708 	Idx edest = dfa->edests[node].elems[i];
1709 	/* If calculating the epsilon closure of `edest' is in progress,
1710 	   return intermediate result.  */
1711 	if (dfa->eclosures[edest].nelem == REG_MISSING)
1712 	  {
1713 	    incomplete = true;
1714 	    continue;
1715 	  }
1716 	/* If we haven't calculated the epsilon closure of `edest' yet,
1717 	   calculate now. Otherwise use calculated epsilon closure.  */
1718 	if (dfa->eclosures[edest].nelem == 0)
1719 	  {
1720 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1721 	    if (BE (err != REG_NOERROR, 0))
1722 	      return err;
1723 	  }
1724 	else
1725 	  eclosure_elem = dfa->eclosures[edest];
1726 	/* Merge the epsilon closure of `edest'.  */
1727 	re_node_set_merge (&eclosure, &eclosure_elem);
1728 	/* If the epsilon closure of `edest' is incomplete,
1729 	   the epsilon closure of this node is also incomplete.  */
1730 	if (dfa->eclosures[edest].nelem == 0)
1731 	  {
1732 	    incomplete = true;
1733 	    re_node_set_free (&eclosure_elem);
1734 	  }
1735       }
1736 
1737   /* Epsilon closures include itself.  */
1738   ok = re_node_set_insert (&eclosure, node);
1739   if (BE (! ok, 0))
1740     return REG_ESPACE;
1741   if (incomplete && !root)
1742     dfa->eclosures[node].nelem = 0;
1743   else
1744     dfa->eclosures[node] = eclosure;
1745   *new_set = eclosure;
1746   return REG_NOERROR;
1747 }
1748 
1749 /* Functions for token which are used in the parser.  */
1750 
1751 /* Fetch a token from INPUT.
1752    We must not use this function inside bracket expressions.  */
1753 
1754 static void
1755 internal_function
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1756 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1757 {
1758   re_string_skip_bytes (input, peek_token (result, input, syntax));
1759 }
1760 
1761 /* Peek a token from INPUT, and return the length of the token.
1762    We must not use this function inside bracket expressions.  */
1763 
1764 static int
1765 internal_function
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1766 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1767 {
1768   unsigned char c;
1769 
1770   if (re_string_eoi (input))
1771     {
1772       token->type = END_OF_RE;
1773       return 0;
1774     }
1775 
1776   c = re_string_peek_byte (input, 0);
1777   token->opr.c = c;
1778 
1779   token->word_char = 0;
1780 #ifdef RE_ENABLE_I18N
1781   token->mb_partial = 0;
1782   if (input->mb_cur_max > 1 &&
1783       !re_string_first_byte (input, re_string_cur_idx (input)))
1784     {
1785       token->type = CHARACTER;
1786       token->mb_partial = 1;
1787       return 1;
1788     }
1789 #endif
1790   if (c == '\\')
1791     {
1792       unsigned char c2;
1793       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1794 	{
1795 	  token->type = BACK_SLASH;
1796 	  return 1;
1797 	}
1798 
1799       c2 = re_string_peek_byte_case (input, 1);
1800       token->opr.c = c2;
1801       token->type = CHARACTER;
1802 #ifdef RE_ENABLE_I18N
1803       if (input->mb_cur_max > 1)
1804 	{
1805 	  wint_t wc = re_string_wchar_at (input,
1806 					  re_string_cur_idx (input) + 1);
1807 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1808 	}
1809       else
1810 #endif
1811 	token->word_char = IS_WORD_CHAR (c2) != 0;
1812 
1813       switch (c2)
1814 	{
1815 	case '|':
1816 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1817 	    token->type = OP_ALT;
1818 	  break;
1819 	case '1': case '2': case '3': case '4': case '5':
1820 	case '6': case '7': case '8': case '9':
1821 	  if (!(syntax & RE_NO_BK_REFS))
1822 	    {
1823 	      token->type = OP_BACK_REF;
1824 	      token->opr.idx = c2 - '1';
1825 	    }
1826 	  break;
1827 	case '<':
1828 	  if (!(syntax & RE_NO_GNU_OPS))
1829 	    {
1830 	      token->type = ANCHOR;
1831 	      token->opr.ctx_type = WORD_FIRST;
1832 	    }
1833 	  break;
1834 	case '>':
1835 	  if (!(syntax & RE_NO_GNU_OPS))
1836 	    {
1837 	      token->type = ANCHOR;
1838 	      token->opr.ctx_type = WORD_LAST;
1839 	    }
1840 	  break;
1841 	case 'b':
1842 	  if (!(syntax & RE_NO_GNU_OPS))
1843 	    {
1844 	      token->type = ANCHOR;
1845 	      token->opr.ctx_type = WORD_DELIM;
1846 	    }
1847 	  break;
1848 	case 'B':
1849 	  if (!(syntax & RE_NO_GNU_OPS))
1850 	    {
1851 	      token->type = ANCHOR;
1852 	      token->opr.ctx_type = NOT_WORD_DELIM;
1853 	    }
1854 	  break;
1855 	case 'w':
1856 	  if (!(syntax & RE_NO_GNU_OPS))
1857 	    token->type = OP_WORD;
1858 	  break;
1859 	case 'W':
1860 	  if (!(syntax & RE_NO_GNU_OPS))
1861 	    token->type = OP_NOTWORD;
1862 	  break;
1863 	case 's':
1864 	  if (!(syntax & RE_NO_GNU_OPS))
1865 	    token->type = OP_SPACE;
1866 	  break;
1867 	case 'S':
1868 	  if (!(syntax & RE_NO_GNU_OPS))
1869 	    token->type = OP_NOTSPACE;
1870 	  break;
1871 	case '`':
1872 	  if (!(syntax & RE_NO_GNU_OPS))
1873 	    {
1874 	      token->type = ANCHOR;
1875 	      token->opr.ctx_type = BUF_FIRST;
1876 	    }
1877 	  break;
1878 	case '\'':
1879 	  if (!(syntax & RE_NO_GNU_OPS))
1880 	    {
1881 	      token->type = ANCHOR;
1882 	      token->opr.ctx_type = BUF_LAST;
1883 	    }
1884 	  break;
1885 	case '(':
1886 	  if (!(syntax & RE_NO_BK_PARENS))
1887 	    token->type = OP_OPEN_SUBEXP;
1888 	  break;
1889 	case ')':
1890 	  if (!(syntax & RE_NO_BK_PARENS))
1891 	    token->type = OP_CLOSE_SUBEXP;
1892 	  break;
1893 	case '+':
1894 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1895 	    token->type = OP_DUP_PLUS;
1896 	  break;
1897 	case '?':
1898 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1899 	    token->type = OP_DUP_QUESTION;
1900 	  break;
1901 	case '{':
1902 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1903 	    token->type = OP_OPEN_DUP_NUM;
1904 	  break;
1905 	case '}':
1906 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1907 	    token->type = OP_CLOSE_DUP_NUM;
1908 	  break;
1909 	default:
1910 	  break;
1911 	}
1912       return 2;
1913     }
1914 
1915   token->type = CHARACTER;
1916 #ifdef RE_ENABLE_I18N
1917   if (input->mb_cur_max > 1)
1918     {
1919       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1920       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1921     }
1922   else
1923 #endif
1924     token->word_char = IS_WORD_CHAR (token->opr.c);
1925 
1926   switch (c)
1927     {
1928     case '\n':
1929       if (syntax & RE_NEWLINE_ALT)
1930 	token->type = OP_ALT;
1931       break;
1932     case '|':
1933       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1934 	token->type = OP_ALT;
1935       break;
1936     case '*':
1937       token->type = OP_DUP_ASTERISK;
1938       break;
1939     case '+':
1940       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1941 	token->type = OP_DUP_PLUS;
1942       break;
1943     case '?':
1944       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1945 	token->type = OP_DUP_QUESTION;
1946       break;
1947     case '{':
1948       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1949 	token->type = OP_OPEN_DUP_NUM;
1950       break;
1951     case '}':
1952       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1953 	token->type = OP_CLOSE_DUP_NUM;
1954       break;
1955     case '(':
1956       if (syntax & RE_NO_BK_PARENS)
1957 	token->type = OP_OPEN_SUBEXP;
1958       break;
1959     case ')':
1960       if (syntax & RE_NO_BK_PARENS)
1961 	token->type = OP_CLOSE_SUBEXP;
1962       break;
1963     case '[':
1964       token->type = OP_OPEN_BRACKET;
1965       break;
1966     case '.':
1967       token->type = OP_PERIOD;
1968       break;
1969     case '^':
1970       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1971 	  re_string_cur_idx (input) != 0)
1972 	{
1973 	  char prev = re_string_peek_byte (input, -1);
1974 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1975 	    break;
1976 	}
1977       token->type = ANCHOR;
1978       token->opr.ctx_type = LINE_FIRST;
1979       break;
1980     case '$':
1981       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1982 	  re_string_cur_idx (input) + 1 != re_string_length (input))
1983 	{
1984 	  re_token_t next;
1985 	  re_string_skip_bytes (input, 1);
1986 	  peek_token (&next, input, syntax);
1987 	  re_string_skip_bytes (input, -1);
1988 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1989 	    break;
1990 	}
1991       token->type = ANCHOR;
1992       token->opr.ctx_type = LINE_LAST;
1993       break;
1994     default:
1995       break;
1996     }
1997   return 1;
1998 }
1999 
2000 /* Peek a token from INPUT, and return the length of the token.
2001    We must not use this function out of bracket expressions.  */
2002 
2003 static int
2004 internal_function
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)2005 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2006 {
2007   unsigned char c;
2008   if (re_string_eoi (input))
2009     {
2010       token->type = END_OF_RE;
2011       return 0;
2012     }
2013   c = re_string_peek_byte (input, 0);
2014   token->opr.c = c;
2015 
2016 #ifdef RE_ENABLE_I18N
2017   if (input->mb_cur_max > 1 &&
2018       !re_string_first_byte (input, re_string_cur_idx (input)))
2019     {
2020       token->type = CHARACTER;
2021       return 1;
2022     }
2023 #endif /* RE_ENABLE_I18N */
2024 
2025   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2026       && re_string_cur_idx (input) + 1 < re_string_length (input))
2027     {
2028       /* In this case, '\' escape a character.  */
2029       unsigned char c2;
2030       re_string_skip_bytes (input, 1);
2031       c2 = re_string_peek_byte (input, 0);
2032       token->opr.c = c2;
2033       token->type = CHARACTER;
2034       return 1;
2035     }
2036   if (c == '[') /* '[' is a special char in a bracket exps.  */
2037     {
2038       unsigned char c2;
2039       int token_len;
2040       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2041 	c2 = re_string_peek_byte (input, 1);
2042       else
2043 	c2 = 0;
2044       token->opr.c = c2;
2045       token_len = 2;
2046       switch (c2)
2047 	{
2048 	case '.':
2049 	  token->type = OP_OPEN_COLL_ELEM;
2050 	  break;
2051 	case '=':
2052 	  token->type = OP_OPEN_EQUIV_CLASS;
2053 	  break;
2054 	case ':':
2055 	  if (syntax & RE_CHAR_CLASSES)
2056 	    {
2057 	      token->type = OP_OPEN_CHAR_CLASS;
2058 	      break;
2059 	    }
2060 	  /* else fall through.  */
2061 	default:
2062 	  token->type = CHARACTER;
2063 	  token->opr.c = c;
2064 	  token_len = 1;
2065 	  break;
2066 	}
2067       return token_len;
2068     }
2069   switch (c)
2070     {
2071     case '-':
2072       token->type = OP_CHARSET_RANGE;
2073       break;
2074     case ']':
2075       token->type = OP_CLOSE_BRACKET;
2076       break;
2077     case '^':
2078       token->type = OP_NON_MATCH_LIST;
2079       break;
2080     default:
2081       token->type = CHARACTER;
2082     }
2083   return 1;
2084 }
2085 
2086 /* Functions for parser.  */
2087 
2088 /* Entry point of the parser.
2089    Parse the regular expression REGEXP and return the structure tree.
2090    If an error is occured, ERR is set by error code, and return NULL.
2091    This function build the following tree, from regular expression <reg_exp>:
2092 	   CAT
2093 	   / \
2094 	  /   \
2095    <reg_exp>  EOR
2096 
2097    CAT means concatenation.
2098    EOR means end of regular expression.  */
2099 
2100 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2101 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2102        reg_errcode_t *err)
2103 {
2104   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2105   bin_tree_t *tree, *eor, *root;
2106   re_token_t current_token;
2107   dfa->syntax = syntax;
2108   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2109   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2110   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2111     return NULL;
2112   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2113   if (tree != NULL)
2114     root = create_tree (dfa, tree, eor, CONCAT);
2115   else
2116     root = eor;
2117   if (BE (eor == NULL || root == NULL, 0))
2118     {
2119       *err = REG_ESPACE;
2120       return NULL;
2121     }
2122   return root;
2123 }
2124 
2125 /* This function build the following tree, from regular expression
2126    <branch1>|<branch2>:
2127 	   ALT
2128 	   / \
2129 	  /   \
2130    <branch1> <branch2>
2131 
2132    ALT means alternative, which represents the operator `|'.  */
2133 
2134 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2135 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2136 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2137 {
2138   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2139   bin_tree_t *tree, *branch = NULL;
2140   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2141   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2142     return NULL;
2143 
2144   while (token->type == OP_ALT)
2145     {
2146       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2147       if (token->type != OP_ALT && token->type != END_OF_RE
2148 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2149 	{
2150 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2151 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2152 	    return NULL;
2153 	}
2154       else
2155 	branch = NULL;
2156       tree = create_tree (dfa, tree, branch, OP_ALT);
2157       if (BE (tree == NULL, 0))
2158 	{
2159 	  *err = REG_ESPACE;
2160 	  return NULL;
2161 	}
2162     }
2163   return tree;
2164 }
2165 
2166 /* This function build the following tree, from regular expression
2167    <exp1><exp2>:
2168 	CAT
2169 	/ \
2170        /   \
2171    <exp1> <exp2>
2172 
2173    CAT means concatenation.  */
2174 
2175 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2176 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2177 	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2178 {
2179   bin_tree_t *tree, *expr;
2180   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2181   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2182   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2183     return NULL;
2184 
2185   while (token->type != OP_ALT && token->type != END_OF_RE
2186 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2187     {
2188       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2189       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2190 	{
2191 	  return NULL;
2192 	}
2193       if (tree != NULL && expr != NULL)
2194 	{
2195 	  tree = create_tree (dfa, tree, expr, CONCAT);
2196 	  if (tree == NULL)
2197 	    {
2198 	      *err = REG_ESPACE;
2199 	      return NULL;
2200 	    }
2201 	}
2202       else if (tree == NULL)
2203 	tree = expr;
2204       /* Otherwise expr == NULL, we don't need to create new tree.  */
2205     }
2206   return tree;
2207 }
2208 
2209 /* This function build the following tree, from regular expression a*:
2210 	 *
2211 	 |
2212 	 a
2213 */
2214 
2215 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2216 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2217 		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2218 {
2219   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2220   bin_tree_t *tree;
2221   switch (token->type)
2222     {
2223     case CHARACTER:
2224       tree = create_token_tree (dfa, NULL, NULL, token);
2225       if (BE (tree == NULL, 0))
2226 	{
2227 	  *err = REG_ESPACE;
2228 	  return NULL;
2229 	}
2230 #ifdef RE_ENABLE_I18N
2231       if (dfa->mb_cur_max > 1)
2232 	{
2233 	  while (!re_string_eoi (regexp)
2234 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2235 	    {
2236 	      bin_tree_t *mbc_remain;
2237 	      fetch_token (token, regexp, syntax);
2238 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2239 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2240 	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2241 		{
2242 		  *err = REG_ESPACE;
2243 		  return NULL;
2244 		}
2245 	    }
2246 	}
2247 #endif
2248       break;
2249     case OP_OPEN_SUBEXP:
2250       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2251       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2252 	return NULL;
2253       break;
2254     case OP_OPEN_BRACKET:
2255       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2256       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2257 	return NULL;
2258       break;
2259     case OP_BACK_REF:
2260       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2261 	{
2262 	  *err = REG_ESUBREG;
2263 	  return NULL;
2264 	}
2265       dfa->used_bkref_map |= 1 << token->opr.idx;
2266       tree = create_token_tree (dfa, NULL, NULL, token);
2267       if (BE (tree == NULL, 0))
2268 	{
2269 	  *err = REG_ESPACE;
2270 	  return NULL;
2271 	}
2272       ++dfa->nbackref;
2273       dfa->has_mb_node = 1;
2274       break;
2275     case OP_OPEN_DUP_NUM:
2276       if (syntax & RE_CONTEXT_INVALID_DUP)
2277 	{
2278 	  *err = REG_BADRPT;
2279 	  return NULL;
2280 	}
2281       /* FALLTHROUGH */
2282     case OP_DUP_ASTERISK:
2283     case OP_DUP_PLUS:
2284     case OP_DUP_QUESTION:
2285       if (syntax & RE_CONTEXT_INVALID_OPS)
2286 	{
2287 	  *err = REG_BADRPT;
2288 	  return NULL;
2289 	}
2290       else if (syntax & RE_CONTEXT_INDEP_OPS)
2291 	{
2292 	  fetch_token (token, regexp, syntax);
2293 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2294 	}
2295       /* else fall through  */
2296     case OP_CLOSE_SUBEXP:
2297       if ((token->type == OP_CLOSE_SUBEXP) &&
2298 	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2299 	{
2300 	  *err = REG_ERPAREN;
2301 	  return NULL;
2302 	}
2303       /* else fall through  */
2304     case OP_CLOSE_DUP_NUM:
2305       /* We treat it as a normal character.  */
2306 
2307       /* Then we can these characters as normal characters.  */
2308       token->type = CHARACTER;
2309       /* mb_partial and word_char bits should be initialized already
2310 	 by peek_token.  */
2311       tree = create_token_tree (dfa, NULL, NULL, token);
2312       if (BE (tree == NULL, 0))
2313 	{
2314 	  *err = REG_ESPACE;
2315 	  return NULL;
2316 	}
2317       break;
2318     case ANCHOR:
2319       if ((token->opr.ctx_type
2320 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2321 	  && dfa->word_ops_used == 0)
2322 	init_word_char (dfa);
2323       if (token->opr.ctx_type == WORD_DELIM
2324           || token->opr.ctx_type == NOT_WORD_DELIM)
2325 	{
2326 	  bin_tree_t *tree_first, *tree_last;
2327 	  if (token->opr.ctx_type == WORD_DELIM)
2328 	    {
2329 	      token->opr.ctx_type = WORD_FIRST;
2330 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2331 	      token->opr.ctx_type = WORD_LAST;
2332             }
2333           else
2334             {
2335 	      token->opr.ctx_type = INSIDE_WORD;
2336 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2337 	      token->opr.ctx_type = INSIDE_NOTWORD;
2338             }
2339 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2340 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2341 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2342 	    {
2343 	      *err = REG_ESPACE;
2344 	      return NULL;
2345 	    }
2346 	}
2347       else
2348 	{
2349 	  tree = create_token_tree (dfa, NULL, NULL, token);
2350 	  if (BE (tree == NULL, 0))
2351 	    {
2352 	      *err = REG_ESPACE;
2353 	      return NULL;
2354 	    }
2355 	}
2356       /* We must return here, since ANCHORs can't be followed
2357 	 by repetition operators.
2358 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2359 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2360       fetch_token (token, regexp, syntax);
2361       return tree;
2362     case OP_PERIOD:
2363       tree = create_token_tree (dfa, NULL, NULL, token);
2364       if (BE (tree == NULL, 0))
2365 	{
2366 	  *err = REG_ESPACE;
2367 	  return NULL;
2368 	}
2369       if (dfa->mb_cur_max > 1)
2370 	dfa->has_mb_node = 1;
2371       break;
2372     case OP_WORD:
2373     case OP_NOTWORD:
2374       tree = build_charclass_op (dfa, regexp->trans,
2375 				 (const unsigned char *) "alnum",
2376 				 (const unsigned char *) "_",
2377 				 token->type == OP_NOTWORD, err);
2378       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2379 	return NULL;
2380       break;
2381     case OP_SPACE:
2382     case OP_NOTSPACE:
2383       tree = build_charclass_op (dfa, regexp->trans,
2384 				 (const unsigned char *) "space",
2385 				 (const unsigned char *) "",
2386 				 token->type == OP_NOTSPACE, err);
2387       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2388 	return NULL;
2389       break;
2390     case OP_ALT:
2391     case END_OF_RE:
2392       return NULL;
2393     case BACK_SLASH:
2394       *err = REG_EESCAPE;
2395       return NULL;
2396     default:
2397       /* Must not happen?  */
2398 #ifdef DEBUG
2399       assert (0);
2400 #endif
2401       return NULL;
2402     }
2403   fetch_token (token, regexp, syntax);
2404 
2405   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2406 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2407     {
2408       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2409       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2410 	return NULL;
2411       /* In BRE consecutive duplications are not allowed.  */
2412       if ((syntax & RE_CONTEXT_INVALID_DUP)
2413 	  && (token->type == OP_DUP_ASTERISK
2414 	      || token->type == OP_OPEN_DUP_NUM))
2415 	{
2416 	  *err = REG_BADRPT;
2417 	  return NULL;
2418 	}
2419     }
2420 
2421   return tree;
2422 }
2423 
2424 /* This function build the following tree, from regular expression
2425    (<reg_exp>):
2426 	 SUBEXP
2427 	    |
2428 	<reg_exp>
2429 */
2430 
2431 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2432 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2433 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2434 {
2435   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2436   bin_tree_t *tree;
2437   size_t cur_nsub;
2438   cur_nsub = preg->re_nsub++;
2439 
2440   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2441 
2442   /* The subexpression may be a null string.  */
2443   if (token->type == OP_CLOSE_SUBEXP)
2444     tree = NULL;
2445   else
2446     {
2447       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2448       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2449         *err = REG_EPAREN;
2450       if (BE (*err != REG_NOERROR, 0))
2451 	return NULL;
2452     }
2453 
2454   if (cur_nsub <= '9' - '1')
2455     dfa->completed_bkref_map |= 1 << cur_nsub;
2456 
2457   tree = create_tree (dfa, tree, NULL, SUBEXP);
2458   if (BE (tree == NULL, 0))
2459     {
2460       *err = REG_ESPACE;
2461       return NULL;
2462     }
2463   tree->token.opr.idx = cur_nsub;
2464   return tree;
2465 }
2466 
2467 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2468 
2469 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2470 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2471 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2472 {
2473   bin_tree_t *tree = NULL, *old_tree = NULL;
2474   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2475   re_token_t start_token = *token;
2476 
2477   if (token->type == OP_OPEN_DUP_NUM)
2478     {
2479       end = 0;
2480       start = fetch_number (regexp, token, syntax);
2481       if (start == REG_MISSING)
2482 	{
2483 	  if (token->type == CHARACTER && token->opr.c == ',')
2484 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2485 	  else
2486 	    {
2487 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2488 	      return NULL;
2489 	    }
2490 	}
2491       if (BE (start != REG_ERROR, 1))
2492 	{
2493 	  /* We treat "{n}" as "{n,n}".  */
2494 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2495 		 : ((token->type == CHARACTER && token->opr.c == ',')
2496 		    ? fetch_number (regexp, token, syntax) : REG_ERROR));
2497 	}
2498       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2499 	{
2500 	  /* Invalid sequence.  */
2501 	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2502 	    {
2503 	      if (token->type == END_OF_RE)
2504 		*err = REG_EBRACE;
2505 	      else
2506 		*err = REG_BADBR;
2507 
2508 	      return NULL;
2509 	    }
2510 
2511 	  /* If the syntax bit is set, rollback.  */
2512 	  re_string_set_index (regexp, start_idx);
2513 	  *token = start_token;
2514 	  token->type = CHARACTER;
2515 	  /* mb_partial and word_char bits should be already initialized by
2516 	     peek_token.  */
2517 	  return elem;
2518 	}
2519 
2520       if (BE (end != REG_MISSING && start > end, 0))
2521 	{
2522 	  /* First number greater than second.  */
2523 	  *err = REG_BADBR;
2524 	  return NULL;
2525 	}
2526     }
2527   else
2528     {
2529       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2530       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2531     }
2532 
2533   fetch_token (token, regexp, syntax);
2534 
2535   if (BE (elem == NULL, 0))
2536     return NULL;
2537   if (BE (start == 0 && end == 0, 0))
2538     {
2539       postorder (elem, free_tree, NULL);
2540       return NULL;
2541     }
2542 
2543   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2544   if (BE (start > 0, 0))
2545     {
2546       tree = elem;
2547       for (i = 2; i <= start; ++i)
2548 	{
2549 	  elem = duplicate_tree (elem, dfa);
2550 	  tree = create_tree (dfa, tree, elem, CONCAT);
2551 	  if (BE (elem == NULL || tree == NULL, 0))
2552 	    goto parse_dup_op_espace;
2553 	}
2554 
2555       if (start == end)
2556 	return tree;
2557 
2558       /* Duplicate ELEM before it is marked optional.  */
2559       elem = duplicate_tree (elem, dfa);
2560       old_tree = tree;
2561     }
2562   else
2563     old_tree = NULL;
2564 
2565   if (elem->token.type == SUBEXP)
2566     postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx);
2567 
2568   tree = create_tree (dfa, elem, NULL,
2569 		      (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2570   if (BE (tree == NULL, 0))
2571     goto parse_dup_op_espace;
2572 
2573   /* This loop is actually executed only when end != REG_MISSING,
2574      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2575      already created the start+1-th copy.  */
2576   if ((Idx) -1 < 0 || end != REG_MISSING)
2577     for (i = start + 2; i <= end; ++i)
2578       {
2579 	elem = duplicate_tree (elem, dfa);
2580 	tree = create_tree (dfa, tree, elem, CONCAT);
2581 	if (BE (elem == NULL || tree == NULL, 0))
2582 	  goto parse_dup_op_espace;
2583 
2584 	tree = create_tree (dfa, tree, NULL, OP_ALT);
2585 	if (BE (tree == NULL, 0))
2586 	  goto parse_dup_op_espace;
2587       }
2588 
2589   if (old_tree)
2590     tree = create_tree (dfa, old_tree, tree, CONCAT);
2591 
2592   return tree;
2593 
2594  parse_dup_op_espace:
2595   *err = REG_ESPACE;
2596   return NULL;
2597 }
2598 
2599 /* Size of the names for collating symbol/equivalence_class/character_class.
2600    I'm not sure, but maybe enough.  */
2601 #define BRACKET_NAME_BUF_SIZE 32
2602 
2603 #ifndef _LIBC
2604   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2605      Build the range expression which starts from START_ELEM, and ends
2606      at END_ELEM.  The result are written to MBCSET and SBCSET.
2607      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2608      mbcset->range_ends, is a pointer argument sinse we may
2609      update it.  */
2610 
2611 static reg_errcode_t
2612 internal_function
2613 # ifdef RE_ENABLE_I18N
build_range_exp(bitset_t sbcset,re_charset_t * mbcset,Idx * range_alloc,bracket_elem_t * start_elem,bracket_elem_t * end_elem)2614 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2615 		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2616 # else /* not RE_ENABLE_I18N */
2617 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2618 		 bracket_elem_t *end_elem)
2619 # endif /* not RE_ENABLE_I18N */
2620 {
2621   unsigned int start_ch, end_ch;
2622   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2623   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2624 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2625 	  0))
2626     return REG_ERANGE;
2627 
2628   /* We can handle no multi character collating elements without libc
2629      support.  */
2630   if (BE ((start_elem->type == COLL_SYM
2631 	   && strlen ((char *) start_elem->opr.name) > 1)
2632 	  || (end_elem->type == COLL_SYM
2633 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2634     return REG_ECOLLATE;
2635 
2636 # ifdef RE_ENABLE_I18N
2637   {
2638     wchar_t wc;
2639     wint_t start_wc;
2640     wint_t end_wc;
2641     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2642 
2643     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2644 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2645 		   : 0));
2646     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2647 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2648 		 : 0));
2649     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2650 		? __btowc (start_ch) : start_elem->opr.wch);
2651     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2652 	      ? __btowc (end_ch) : end_elem->opr.wch);
2653     if (start_wc == WEOF || end_wc == WEOF)
2654       return REG_ECOLLATE;
2655     cmp_buf[0] = start_wc;
2656     cmp_buf[4] = end_wc;
2657     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2658       return REG_ERANGE;
2659 
2660     /* Got valid collation sequence values, add them as a new entry.
2661        However, for !_LIBC we have no collation elements: if the
2662        character set is single byte, the single byte character set
2663        that we build below suffices.  parse_bracket_exp passes
2664        no MBCSET if dfa->mb_cur_max == 1.  */
2665     if (mbcset)
2666       {
2667         /* Check the space of the arrays.  */
2668         if (BE (*range_alloc == mbcset->nranges, 0))
2669           {
2670 	    /* There is not enough space, need realloc.  */
2671 	    wchar_t *new_array_start, *new_array_end;
2672 	    Idx new_nranges;
2673 
2674 	    /* +1 in case of mbcset->nranges is 0.  */
2675 	    new_nranges = 2 * mbcset->nranges + 1;
2676 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2677 	       are NULL if *range_alloc == 0.  */
2678 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2679 				          new_nranges);
2680 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2681 				        new_nranges);
2682 
2683 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2684 	      return REG_ESPACE;
2685 
2686 	    mbcset->range_starts = new_array_start;
2687 	    mbcset->range_ends = new_array_end;
2688 	    *range_alloc = new_nranges;
2689           }
2690 
2691         mbcset->range_starts[mbcset->nranges] = start_wc;
2692         mbcset->range_ends[mbcset->nranges++] = end_wc;
2693       }
2694 
2695     /* Build the table for single byte characters.  */
2696     for (wc = 0; wc < SBC_MAX; ++wc)
2697       {
2698 	cmp_buf[2] = wc;
2699 	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2700 	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2701 	  bitset_set (sbcset, wc);
2702       }
2703   }
2704 # else /* not RE_ENABLE_I18N */
2705   {
2706     unsigned int ch;
2707     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2708 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2709 		   : 0));
2710     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2711 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2712 		 : 0));
2713     if (start_ch > end_ch)
2714       return REG_ERANGE;
2715     /* Build the table for single byte characters.  */
2716     for (ch = 0; ch < SBC_MAX; ++ch)
2717       if (start_ch <= ch  && ch <= end_ch)
2718 	bitset_set (sbcset, ch);
2719   }
2720 # endif /* not RE_ENABLE_I18N */
2721   return REG_NOERROR;
2722 }
2723 #endif /* not _LIBC */
2724 
2725 #ifndef _LIBC
2726 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2727    Build the collating element which is represented by NAME.
2728    The result are written to MBCSET and SBCSET.
2729    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2730    pointer argument since we may update it.  */
2731 
2732 static reg_errcode_t
2733 internal_function
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,Idx * coll_sym_alloc,const unsigned char * name)2734 build_collating_symbol (bitset_t sbcset,
2735 # ifdef RE_ENABLE_I18N
2736 			re_charset_t *mbcset, Idx *coll_sym_alloc,
2737 # endif
2738 			const unsigned char *name)
2739 {
2740   size_t name_len = strlen ((const char *) name);
2741   if (BE (name_len != 1, 0))
2742     return REG_ECOLLATE;
2743   else
2744     {
2745       bitset_set (sbcset, name[0]);
2746       return REG_NOERROR;
2747     }
2748 }
2749 #endif /* not _LIBC */
2750 
2751 /* This function parse bracket expression like "[abc]", "[a-c]",
2752    "[[.a-a.]]" etc.  */
2753 
2754 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2755 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2756 		   reg_syntax_t syntax, reg_errcode_t *err)
2757 {
2758 #ifdef _LIBC
2759   const unsigned char *collseqmb;
2760   const char *collseqwc;
2761   uint32_t nrules;
2762   int32_t table_size;
2763   const int32_t *symb_table;
2764   const unsigned char *extra;
2765 
2766   /* Local function for parse_bracket_exp used in _LIBC environement.
2767      Seek the collating symbol entry correspondings to NAME.
2768      Return the index of the symbol in the SYMB_TABLE.  */
2769 
2770   auto inline int32_t
2771   __attribute ((always_inline))
2772   seek_collating_symbol_entry (name, name_len)
2773 	 const unsigned char *name;
2774 	 size_t name_len;
2775     {
2776       int32_t hash = elem_hash ((const char *) name, name_len);
2777       int32_t elem = hash % table_size;
2778       if (symb_table[2 * elem] != 0)
2779 	{
2780 	  int32_t second = hash % (table_size - 2) + 1;
2781 
2782 	  do
2783 	    {
2784 	      /* First compare the hashing value.  */
2785 	      if (symb_table[2 * elem] == hash
2786 		  /* Compare the length of the name.  */
2787 		  && name_len == extra[symb_table[2 * elem + 1]]
2788 		  /* Compare the name.  */
2789 		  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2790 			     name_len) == 0)
2791 		{
2792 		  /* Yep, this is the entry.  */
2793 		  break;
2794 		}
2795 
2796 	      /* Next entry.  */
2797 	      elem += second;
2798 	    }
2799 	  while (symb_table[2 * elem] != 0);
2800 	}
2801       return elem;
2802     }
2803 
2804   /* Local function for parse_bracket_exp used in _LIBC environement.
2805      Look up the collation sequence value of BR_ELEM.
2806      Return the value if succeeded, UINT_MAX otherwise.  */
2807 
2808   auto inline unsigned int
2809   __attribute ((always_inline))
2810   lookup_collation_sequence_value (br_elem)
2811 	 bracket_elem_t *br_elem;
2812     {
2813       if (br_elem->type == SB_CHAR)
2814 	{
2815 	  /*
2816 	  if (MB_CUR_MAX == 1)
2817 	  */
2818 	  if (nrules == 0)
2819 	    return collseqmb[br_elem->opr.ch];
2820 	  else
2821 	    {
2822 	      wint_t wc = __btowc (br_elem->opr.ch);
2823 	      return __collseq_table_lookup (collseqwc, wc);
2824 	    }
2825 	}
2826       else if (br_elem->type == MB_CHAR)
2827 	{
2828 	  return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2829 	}
2830       else if (br_elem->type == COLL_SYM)
2831 	{
2832 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2833 	  if (nrules != 0)
2834 	    {
2835 	      int32_t elem, idx;
2836 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2837 						  sym_name_len);
2838 	      if (symb_table[2 * elem] != 0)
2839 		{
2840 		  /* We found the entry.  */
2841 		  idx = symb_table[2 * elem + 1];
2842 		  /* Skip the name of collating element name.  */
2843 		  idx += 1 + extra[idx];
2844 		  /* Skip the byte sequence of the collating element.  */
2845 		  idx += 1 + extra[idx];
2846 		  /* Adjust for the alignment.  */
2847 		  idx = (idx + 3) & ~3;
2848 		  /* Skip the multibyte collation sequence value.  */
2849 		  idx += sizeof (unsigned int);
2850 		  /* Skip the wide char sequence of the collating element.  */
2851 		  idx += sizeof (unsigned int) *
2852 		    (1 + *(unsigned int *) (extra + idx));
2853 		  /* Return the collation sequence value.  */
2854 		  return *(unsigned int *) (extra + idx);
2855 		}
2856 	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2857 		{
2858 		  /* No valid character.  Match it as a single byte
2859 		     character.  */
2860 		  return collseqmb[br_elem->opr.name[0]];
2861 		}
2862 	    }
2863 	  else if (sym_name_len == 1)
2864 	    return collseqmb[br_elem->opr.name[0]];
2865 	}
2866       return UINT_MAX;
2867     }
2868 
2869   /* Local function for parse_bracket_exp used in _LIBC environement.
2870      Build the range expression which starts from START_ELEM, and ends
2871      at END_ELEM.  The result are written to MBCSET and SBCSET.
2872      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2873      mbcset->range_ends, is a pointer argument sinse we may
2874      update it.  */
2875 
2876   auto inline reg_errcode_t
2877   __attribute ((always_inline))
2878   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2879 	 re_charset_t *mbcset;
2880 	 Idx *range_alloc;
2881 	 bitset_t sbcset;
2882 	 bracket_elem_t *start_elem, *end_elem;
2883     {
2884       unsigned int ch;
2885       uint32_t start_collseq;
2886       uint32_t end_collseq;
2887 
2888       /* Equivalence Classes and Character Classes can't be a range
2889 	 start/end.  */
2890       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2891 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2892 	      0))
2893 	return REG_ERANGE;
2894 
2895       start_collseq = lookup_collation_sequence_value (start_elem);
2896       end_collseq = lookup_collation_sequence_value (end_elem);
2897       /* Check start/end collation sequence values.  */
2898       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2899 	return REG_ECOLLATE;
2900       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2901 	return REG_ERANGE;
2902 
2903       /* Got valid collation sequence values, add them as a new entry.
2904 	 However, if we have no collation elements, and the character set
2905 	 is single byte, the single byte character set that we
2906 	 build below suffices. */
2907       if (nrules > 0 || dfa->mb_cur_max > 1)
2908 	{
2909           /* Check the space of the arrays.  */
2910           if (BE (*range_alloc == mbcset->nranges, 0))
2911 	    {
2912 	      /* There is not enough space, need realloc.  */
2913 	      uint32_t *new_array_start;
2914 	      uint32_t *new_array_end;
2915 	      Idx new_nranges;
2916 
2917 	      /* +1 in case of mbcset->nranges is 0.  */
2918 	      new_nranges = 2 * mbcset->nranges + 1;
2919 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2920 					    new_nranges);
2921 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2922 				          new_nranges);
2923 
2924 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2925 	        return REG_ESPACE;
2926 
2927 	      mbcset->range_starts = new_array_start;
2928 	      mbcset->range_ends = new_array_end;
2929 	      *range_alloc = new_nranges;
2930 	    }
2931 
2932           mbcset->range_starts[mbcset->nranges] = start_collseq;
2933           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2934 	}
2935 
2936       /* Build the table for single byte characters.  */
2937       for (ch = 0; ch < SBC_MAX; ch++)
2938 	{
2939 	  uint32_t ch_collseq;
2940 	  /*
2941 	  if (MB_CUR_MAX == 1)
2942 	  */
2943 	  if (nrules == 0)
2944 	    ch_collseq = collseqmb[ch];
2945 	  else
2946 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2947 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2948 	    bitset_set (sbcset, ch);
2949 	}
2950       return REG_NOERROR;
2951     }
2952 
2953   /* Local function for parse_bracket_exp used in _LIBC environement.
2954      Build the collating element which is represented by NAME.
2955      The result are written to MBCSET and SBCSET.
2956      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2957      pointer argument sinse we may update it.  */
2958 
2959   auto inline reg_errcode_t
2960   __attribute ((always_inline))
2961   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2962 	 re_charset_t *mbcset;
2963 	 Idx *coll_sym_alloc;
2964 	 bitset_t sbcset;
2965 	 const unsigned char *name;
2966     {
2967       int32_t elem, idx;
2968       size_t name_len = strlen ((const char *) name);
2969       if (nrules != 0)
2970 	{
2971 	  elem = seek_collating_symbol_entry (name, name_len);
2972 	  if (symb_table[2 * elem] != 0)
2973 	    {
2974 	      /* We found the entry.  */
2975 	      idx = symb_table[2 * elem + 1];
2976 	      /* Skip the name of collating element name.  */
2977 	      idx += 1 + extra[idx];
2978 	    }
2979 	  else if (symb_table[2 * elem] == 0 && name_len == 1)
2980 	    {
2981 	      /* No valid character, treat it as a normal
2982 		 character.  */
2983 	      bitset_set (sbcset, name[0]);
2984 	      return REG_NOERROR;
2985 	    }
2986 	  else
2987 	    return REG_ECOLLATE;
2988 
2989 	  /* Got valid collation sequence, add it as a new entry.  */
2990 	  /* Check the space of the arrays.  */
2991 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2992 	    {
2993 	      /* Not enough, realloc it.  */
2994 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
2995 	      Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2996 	      /* Use realloc since mbcset->coll_syms is NULL
2997 		 if *alloc == 0.  */
2998 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2999 						   new_coll_sym_alloc);
3000 	      if (BE (new_coll_syms == NULL, 0))
3001 		return REG_ESPACE;
3002 	      mbcset->coll_syms = new_coll_syms;
3003 	      *coll_sym_alloc = new_coll_sym_alloc;
3004 	    }
3005 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3006 	  return REG_NOERROR;
3007 	}
3008       else
3009 	{
3010 	  if (BE (name_len != 1, 0))
3011 	    return REG_ECOLLATE;
3012 	  else
3013 	    {
3014 	      bitset_set (sbcset, name[0]);
3015 	      return REG_NOERROR;
3016 	    }
3017 	}
3018     }
3019 #endif
3020 
3021   re_token_t br_token;
3022   re_bitset_ptr_t sbcset;
3023 #ifdef RE_ENABLE_I18N
3024   re_charset_t *mbcset;
3025   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3026   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3027 #endif /* not RE_ENABLE_I18N */
3028   bool non_match = false;
3029   bin_tree_t *work_tree;
3030   int token_len;
3031   bool first_round = true;
3032 #ifdef _LIBC
3033   collseqmb = (const unsigned char *)
3034     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3035   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3036   if (nrules)
3037     {
3038       /*
3039       if (MB_CUR_MAX > 1)
3040       */
3041       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3042       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3043       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3044 						  _NL_COLLATE_SYMB_TABLEMB);
3045       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3046 						   _NL_COLLATE_SYMB_EXTRAMB);
3047     }
3048 #endif
3049   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3050 #ifdef RE_ENABLE_I18N
3051   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3052 #endif /* RE_ENABLE_I18N */
3053 #ifdef RE_ENABLE_I18N
3054   if (BE (sbcset == NULL || mbcset == NULL, 0))
3055 #else
3056   if (BE (sbcset == NULL, 0))
3057 #endif /* RE_ENABLE_I18N */
3058     {
3059       *err = REG_ESPACE;
3060       return NULL;
3061     }
3062 
3063   token_len = peek_token_bracket (token, regexp, syntax);
3064   if (BE (token->type == END_OF_RE, 0))
3065     {
3066       *err = REG_BADPAT;
3067       goto parse_bracket_exp_free_return;
3068     }
3069   if (token->type == OP_NON_MATCH_LIST)
3070     {
3071 #ifdef RE_ENABLE_I18N
3072       mbcset->non_match = 1;
3073 #endif /* not RE_ENABLE_I18N */
3074       non_match = true;
3075       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3076 	bitset_set (sbcset, '\n');
3077       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3078       token_len = peek_token_bracket (token, regexp, syntax);
3079       if (BE (token->type == END_OF_RE, 0))
3080 	{
3081 	  *err = REG_BADPAT;
3082 	  goto parse_bracket_exp_free_return;
3083 	}
3084     }
3085 
3086   /* We treat the first ']' as a normal character.  */
3087   if (token->type == OP_CLOSE_BRACKET)
3088     token->type = CHARACTER;
3089 
3090   while (1)
3091     {
3092       bracket_elem_t start_elem, end_elem;
3093       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3094       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3095       reg_errcode_t ret;
3096       int token_len2 = 0;
3097       bool is_range_exp = false;
3098       re_token_t token2;
3099 
3100       start_elem.opr.name = start_name_buf;
3101       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3102 				   syntax, first_round);
3103       if (BE (ret != REG_NOERROR, 0))
3104 	{
3105 	  *err = ret;
3106 	  goto parse_bracket_exp_free_return;
3107 	}
3108       first_round = false;
3109 
3110       /* Get information about the next token.  We need it in any case.  */
3111       token_len = peek_token_bracket (token, regexp, syntax);
3112 
3113       /* Do not check for ranges if we know they are not allowed.  */
3114       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3115 	{
3116 	  if (BE (token->type == END_OF_RE, 0))
3117 	    {
3118 	      *err = REG_EBRACK;
3119 	      goto parse_bracket_exp_free_return;
3120 	    }
3121 	  if (token->type == OP_CHARSET_RANGE)
3122 	    {
3123 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3124 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3125 	      if (BE (token2.type == END_OF_RE, 0))
3126 		{
3127 		  *err = REG_EBRACK;
3128 		  goto parse_bracket_exp_free_return;
3129 		}
3130 	      if (token2.type == OP_CLOSE_BRACKET)
3131 		{
3132 		  /* We treat the last '-' as a normal character.  */
3133 		  re_string_skip_bytes (regexp, -token_len);
3134 		  token->type = CHARACTER;
3135 		}
3136 	      else
3137 		is_range_exp = true;
3138 	    }
3139 	}
3140 
3141       if (is_range_exp == true)
3142 	{
3143 	  end_elem.opr.name = end_name_buf;
3144 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3145 				       dfa, syntax, true);
3146 	  if (BE (ret != REG_NOERROR, 0))
3147 	    {
3148 	      *err = ret;
3149 	      goto parse_bracket_exp_free_return;
3150 	    }
3151 
3152 	  token_len = peek_token_bracket (token, regexp, syntax);
3153 
3154 #ifdef _LIBC
3155 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3156 				  &start_elem, &end_elem);
3157 #else
3158 # ifdef RE_ENABLE_I18N
3159 	  *err = build_range_exp (sbcset,
3160 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3161 				  &range_alloc, &start_elem, &end_elem);
3162 # else
3163 	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
3164 # endif
3165 #endif /* RE_ENABLE_I18N */
3166 	  if (BE (*err != REG_NOERROR, 0))
3167 	    goto parse_bracket_exp_free_return;
3168 	}
3169       else
3170 	{
3171 	  switch (start_elem.type)
3172 	    {
3173 	    case SB_CHAR:
3174 	      bitset_set (sbcset, start_elem.opr.ch);
3175 	      break;
3176 #ifdef RE_ENABLE_I18N
3177 	    case MB_CHAR:
3178 	      /* Check whether the array has enough space.  */
3179 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3180 		{
3181 		  wchar_t *new_mbchars;
3182 		  /* Not enough, realloc it.  */
3183 		  /* +1 in case of mbcset->nmbchars is 0.  */
3184 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3185 		  /* Use realloc since array is NULL if *alloc == 0.  */
3186 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3187 					    mbchar_alloc);
3188 		  if (BE (new_mbchars == NULL, 0))
3189 		    goto parse_bracket_exp_espace;
3190 		  mbcset->mbchars = new_mbchars;
3191 		}
3192 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3193 	      break;
3194 #endif /* RE_ENABLE_I18N */
3195 	    case EQUIV_CLASS:
3196 	      *err = build_equiv_class (sbcset,
3197 #ifdef RE_ENABLE_I18N
3198 					mbcset, &equiv_class_alloc,
3199 #endif /* RE_ENABLE_I18N */
3200 					start_elem.opr.name);
3201 	      if (BE (*err != REG_NOERROR, 0))
3202 		goto parse_bracket_exp_free_return;
3203 	      break;
3204 	    case COLL_SYM:
3205 	      *err = build_collating_symbol (sbcset,
3206 #ifdef RE_ENABLE_I18N
3207 					     mbcset, &coll_sym_alloc,
3208 #endif /* RE_ENABLE_I18N */
3209 					     start_elem.opr.name);
3210 	      if (BE (*err != REG_NOERROR, 0))
3211 		goto parse_bracket_exp_free_return;
3212 	      break;
3213 	    case CHAR_CLASS:
3214 	      *err = build_charclass (regexp->trans, sbcset,
3215 #ifdef RE_ENABLE_I18N
3216 				      mbcset, &char_class_alloc,
3217 #endif /* RE_ENABLE_I18N */
3218 				      start_elem.opr.name, syntax);
3219 	      if (BE (*err != REG_NOERROR, 0))
3220 	       goto parse_bracket_exp_free_return;
3221 	      break;
3222 	    default:
3223 	      assert (0);
3224 	      break;
3225 	    }
3226 	}
3227       if (BE (token->type == END_OF_RE, 0))
3228 	{
3229 	  *err = REG_EBRACK;
3230 	  goto parse_bracket_exp_free_return;
3231 	}
3232       if (token->type == OP_CLOSE_BRACKET)
3233 	break;
3234     }
3235 
3236   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3237 
3238   /* If it is non-matching list.  */
3239   if (non_match)
3240     bitset_not (sbcset);
3241 
3242 #ifdef RE_ENABLE_I18N
3243   /* Ensure only single byte characters are set.  */
3244   if (dfa->mb_cur_max > 1)
3245     bitset_mask (sbcset, dfa->sb_char);
3246 
3247   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3248       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3249 						     || mbcset->non_match)))
3250     {
3251       bin_tree_t *mbc_tree;
3252       int sbc_idx;
3253       /* Build a tree for complex bracket.  */
3254       dfa->has_mb_node = 1;
3255       br_token.type = COMPLEX_BRACKET;
3256       br_token.opr.mbcset = mbcset;
3257       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3258       if (BE (mbc_tree == NULL, 0))
3259 	goto parse_bracket_exp_espace;
3260       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3261 	if (sbcset[sbc_idx])
3262 	  break;
3263       /* If there are no bits set in sbcset, there is no point
3264 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3265       if (sbc_idx < BITSET_WORDS)
3266 	{
3267           /* Build a tree for simple bracket.  */
3268           br_token.type = SIMPLE_BRACKET;
3269           br_token.opr.sbcset = sbcset;
3270           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3271           if (BE (work_tree == NULL, 0))
3272             goto parse_bracket_exp_espace;
3273 
3274           /* Then join them by ALT node.  */
3275           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3276           if (BE (work_tree == NULL, 0))
3277             goto parse_bracket_exp_espace;
3278 	}
3279       else
3280 	{
3281 	  re_free (sbcset);
3282 	  work_tree = mbc_tree;
3283 	}
3284     }
3285   else
3286 #endif /* not RE_ENABLE_I18N */
3287     {
3288 #ifdef RE_ENABLE_I18N
3289       free_charset (mbcset);
3290 #endif
3291       /* Build a tree for simple bracket.  */
3292       br_token.type = SIMPLE_BRACKET;
3293       br_token.opr.sbcset = sbcset;
3294       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3295       if (BE (work_tree == NULL, 0))
3296         goto parse_bracket_exp_espace;
3297     }
3298   return work_tree;
3299 
3300  parse_bracket_exp_espace:
3301   *err = REG_ESPACE;
3302  parse_bracket_exp_free_return:
3303   re_free (sbcset);
3304 #ifdef RE_ENABLE_I18N
3305   free_charset (mbcset);
3306 #endif /* RE_ENABLE_I18N */
3307   return NULL;
3308 }
3309 
3310 /* Parse an element in the bracket expression.  */
3311 
3312 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,bool accept_hyphen)3313 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3314 		       re_token_t *token, int token_len, re_dfa_t *dfa,
3315 		       reg_syntax_t syntax, bool accept_hyphen)
3316 {
3317 #ifdef RE_ENABLE_I18N
3318   int cur_char_size;
3319   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3320   if (cur_char_size > 1)
3321     {
3322       elem->type = MB_CHAR;
3323       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3324       re_string_skip_bytes (regexp, cur_char_size);
3325       return REG_NOERROR;
3326     }
3327 #endif /* RE_ENABLE_I18N */
3328   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3329   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3330       || token->type == OP_OPEN_EQUIV_CLASS)
3331     return parse_bracket_symbol (elem, regexp, token);
3332   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3333     {
3334       /* A '-' must only appear as anything but a range indicator before
3335 	 the closing bracket.  Everything else is an error.  */
3336       re_token_t token2;
3337       (void) peek_token_bracket (&token2, regexp, syntax);
3338       if (token2.type != OP_CLOSE_BRACKET)
3339 	/* The actual error value is not standardized since this whole
3340 	   case is undefined.  But ERANGE makes good sense.  */
3341 	return REG_ERANGE;
3342     }
3343   elem->type = SB_CHAR;
3344   elem->opr.ch = token->opr.c;
3345   return REG_NOERROR;
3346 }
3347 
3348 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3349    such as [:<character_class>:], [.<collating_element>.], and
3350    [=<equivalent_class>=].  */
3351 
3352 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3353 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3354 		      re_token_t *token)
3355 {
3356   unsigned char ch, delim = token->opr.c;
3357   int i = 0;
3358   if (re_string_eoi(regexp))
3359     return REG_EBRACK;
3360   for (;; ++i)
3361     {
3362       if (i >= BRACKET_NAME_BUF_SIZE)
3363 	return REG_EBRACK;
3364       if (token->type == OP_OPEN_CHAR_CLASS)
3365 	ch = re_string_fetch_byte_case (regexp);
3366       else
3367 	ch = re_string_fetch_byte (regexp);
3368       if (re_string_eoi(regexp))
3369 	return REG_EBRACK;
3370       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3371 	break;
3372       elem->opr.name[i] = ch;
3373     }
3374   re_string_skip_bytes (regexp, 1);
3375   elem->opr.name[i] = '\0';
3376   switch (token->type)
3377     {
3378     case OP_OPEN_COLL_ELEM:
3379       elem->type = COLL_SYM;
3380       break;
3381     case OP_OPEN_EQUIV_CLASS:
3382       elem->type = EQUIV_CLASS;
3383       break;
3384     case OP_OPEN_CHAR_CLASS:
3385       elem->type = CHAR_CLASS;
3386       break;
3387     default:
3388       break;
3389     }
3390   return REG_NOERROR;
3391 }
3392 
3393   /* Helper function for parse_bracket_exp.
3394      Build the equivalence class which is represented by NAME.
3395      The result are written to MBCSET and SBCSET.
3396      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3397      is a pointer argument sinse we may update it.  */
3398 
3399 static reg_errcode_t
3400 #ifdef RE_ENABLE_I18N
build_equiv_class(bitset_t sbcset,re_charset_t * mbcset,Idx * equiv_class_alloc,const unsigned char * name)3401 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3402 		   Idx *equiv_class_alloc, const unsigned char *name)
3403 #else /* not RE_ENABLE_I18N */
3404 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3405 #endif /* not RE_ENABLE_I18N */
3406 {
3407 #ifdef _LIBC
3408   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3409   if (nrules != 0)
3410     {
3411       const int32_t *table, *indirect;
3412       const unsigned char *weights, *extra, *cp;
3413       unsigned char char_buf[2];
3414       int32_t idx1, idx2;
3415       unsigned int ch;
3416       size_t len;
3417       /* This #include defines a local function!  */
3418 # include <locale/weight.h>
3419       /* Calculate the index for equivalence class.  */
3420       cp = name;
3421       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3422       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3423 					       _NL_COLLATE_WEIGHTMB);
3424       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3425 						   _NL_COLLATE_EXTRAMB);
3426       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3427 						_NL_COLLATE_INDIRECTMB);
3428       idx1 = findidx (&cp);
3429       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3430 	/* This isn't a valid character.  */
3431 	return REG_ECOLLATE;
3432 
3433       /* Build single byte matcing table for this equivalence class.  */
3434       char_buf[1] = (unsigned char) '\0';
3435       len = weights[idx1];
3436       for (ch = 0; ch < SBC_MAX; ++ch)
3437 	{
3438 	  char_buf[0] = ch;
3439 	  cp = char_buf;
3440 	  idx2 = findidx (&cp);
3441 /*
3442 	  idx2 = table[ch];
3443 */
3444 	  if (idx2 == 0)
3445 	    /* This isn't a valid character.  */
3446 	    continue;
3447 	  if (len == weights[idx2])
3448 	    {
3449 	      int cnt = 0;
3450 	      while (cnt <= len &&
3451 		     weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3452 		++cnt;
3453 
3454 	      if (cnt > len)
3455 		bitset_set (sbcset, ch);
3456 	    }
3457 	}
3458       /* Check whether the array has enough space.  */
3459       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3460 	{
3461 	  /* Not enough, realloc it.  */
3462 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3463 	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3464 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3465 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3466 						   int32_t,
3467 						   new_equiv_class_alloc);
3468 	  if (BE (new_equiv_classes == NULL, 0))
3469 	    return REG_ESPACE;
3470 	  mbcset->equiv_classes = new_equiv_classes;
3471 	  *equiv_class_alloc = new_equiv_class_alloc;
3472 	}
3473       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3474     }
3475   else
3476 #endif /* _LIBC */
3477     {
3478       if (BE (strlen ((const char *) name) != 1, 0))
3479 	return REG_ECOLLATE;
3480       bitset_set (sbcset, *name);
3481     }
3482   return REG_NOERROR;
3483 }
3484 
3485   /* Helper function for parse_bracket_exp.
3486      Build the character class which is represented by NAME.
3487      The result are written to MBCSET and SBCSET.
3488      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3489      is a pointer argument sinse we may update it.  */
3490 
3491 static reg_errcode_t
3492 #ifdef RE_ENABLE_I18N
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,Idx * char_class_alloc,const unsigned char * class_name,reg_syntax_t syntax)3493 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3494 		 re_charset_t *mbcset, Idx *char_class_alloc,
3495 		 const unsigned char *class_name, reg_syntax_t syntax)
3496 #else /* not RE_ENABLE_I18N */
3497 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3498 		 const unsigned char *class_name, reg_syntax_t syntax)
3499 #endif /* not RE_ENABLE_I18N */
3500 {
3501   int i;
3502   const char *name = (const char *) class_name;
3503 
3504   /* In case of REG_ICASE "upper" and "lower" match the both of
3505      upper and lower cases.  */
3506   if ((syntax & RE_ICASE)
3507       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3508     name = "alpha";
3509 
3510 #ifdef RE_ENABLE_I18N
3511   /* Check the space of the arrays.  */
3512   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3513     {
3514       /* Not enough, realloc it.  */
3515       /* +1 in case of mbcset->nchar_classes is 0.  */
3516       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3517       /* Use realloc since array is NULL if *alloc == 0.  */
3518       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3519 					       new_char_class_alloc);
3520       if (BE (new_char_classes == NULL, 0))
3521 	return REG_ESPACE;
3522       mbcset->char_classes = new_char_classes;
3523       *char_class_alloc = new_char_class_alloc;
3524     }
3525   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3526 #endif /* RE_ENABLE_I18N */
3527 
3528 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3529   do {						\
3530     if (BE (trans != NULL, 0))			\
3531       {						\
3532 	for (i = 0; i < SBC_MAX; ++i)		\
3533 	  if (ctype_func (i))			\
3534 	    bitset_set (sbcset, trans[i]);	\
3535       }						\
3536     else					\
3537       {						\
3538 	for (i = 0; i < SBC_MAX; ++i)		\
3539 	  if (ctype_func (i))			\
3540 	    bitset_set (sbcset, i);		\
3541       }						\
3542   } while (0)
3543 
3544   if (strcmp (name, "alnum") == 0)
3545     BUILD_CHARCLASS_LOOP (isalnum);
3546   else if (strcmp (name, "cntrl") == 0)
3547     BUILD_CHARCLASS_LOOP (iscntrl);
3548   else if (strcmp (name, "lower") == 0)
3549     BUILD_CHARCLASS_LOOP (islower);
3550   else if (strcmp (name, "space") == 0)
3551     BUILD_CHARCLASS_LOOP (isspace);
3552   else if (strcmp (name, "alpha") == 0)
3553     BUILD_CHARCLASS_LOOP (isalpha);
3554   else if (strcmp (name, "digit") == 0)
3555     BUILD_CHARCLASS_LOOP (isdigit);
3556   else if (strcmp (name, "print") == 0)
3557     BUILD_CHARCLASS_LOOP (isprint);
3558   else if (strcmp (name, "upper") == 0)
3559     BUILD_CHARCLASS_LOOP (isupper);
3560   else if (strcmp (name, "blank") == 0)
3561     BUILD_CHARCLASS_LOOP (isblank);
3562   else if (strcmp (name, "graph") == 0)
3563     BUILD_CHARCLASS_LOOP (isgraph);
3564   else if (strcmp (name, "punct") == 0)
3565     BUILD_CHARCLASS_LOOP (ispunct);
3566   else if (strcmp (name, "xdigit") == 0)
3567     BUILD_CHARCLASS_LOOP (isxdigit);
3568   else
3569     return REG_ECTYPE;
3570 
3571   return REG_NOERROR;
3572 }
3573 
3574 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const unsigned char * class_name,const unsigned char * extra,bool non_match,reg_errcode_t * err)3575 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3576 		    const unsigned char *class_name,
3577 		    const unsigned char *extra, bool non_match,
3578 		    reg_errcode_t *err)
3579 {
3580   re_bitset_ptr_t sbcset;
3581 #ifdef RE_ENABLE_I18N
3582   re_charset_t *mbcset;
3583   Idx alloc = 0;
3584 #endif /* not RE_ENABLE_I18N */
3585   reg_errcode_t ret;
3586   re_token_t br_token;
3587   bin_tree_t *tree;
3588 
3589   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3590 #ifdef RE_ENABLE_I18N
3591   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3592 #endif /* RE_ENABLE_I18N */
3593 
3594 #ifdef RE_ENABLE_I18N
3595   if (BE (sbcset == NULL || mbcset == NULL, 0))
3596 #else /* not RE_ENABLE_I18N */
3597   if (BE (sbcset == NULL, 0))
3598 #endif /* not RE_ENABLE_I18N */
3599     {
3600       *err = REG_ESPACE;
3601       return NULL;
3602     }
3603 
3604   if (non_match)
3605     {
3606 #ifdef RE_ENABLE_I18N
3607       mbcset->non_match = 1;
3608 #endif /* not RE_ENABLE_I18N */
3609     }
3610 
3611   /* We don't care the syntax in this case.  */
3612   ret = build_charclass (trans, sbcset,
3613 #ifdef RE_ENABLE_I18N
3614 			 mbcset, &alloc,
3615 #endif /* RE_ENABLE_I18N */
3616 			 class_name, 0);
3617 
3618   if (BE (ret != REG_NOERROR, 0))
3619     {
3620       re_free (sbcset);
3621 #ifdef RE_ENABLE_I18N
3622       free_charset (mbcset);
3623 #endif /* RE_ENABLE_I18N */
3624       *err = ret;
3625       return NULL;
3626     }
3627   /* \w match '_' also.  */
3628   for (; *extra; extra++)
3629     bitset_set (sbcset, *extra);
3630 
3631   /* If it is non-matching list.  */
3632   if (non_match)
3633     bitset_not (sbcset);
3634 
3635 #ifdef RE_ENABLE_I18N
3636   /* Ensure only single byte characters are set.  */
3637   if (dfa->mb_cur_max > 1)
3638     bitset_mask (sbcset, dfa->sb_char);
3639 #endif
3640 
3641   /* Build a tree for simple bracket.  */
3642   br_token.type = SIMPLE_BRACKET;
3643   br_token.opr.sbcset = sbcset;
3644   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3645   if (BE (tree == NULL, 0))
3646     goto build_word_op_espace;
3647 
3648 #ifdef RE_ENABLE_I18N
3649   if (dfa->mb_cur_max > 1)
3650     {
3651       bin_tree_t *mbc_tree;
3652       /* Build a tree for complex bracket.  */
3653       br_token.type = COMPLEX_BRACKET;
3654       br_token.opr.mbcset = mbcset;
3655       dfa->has_mb_node = 1;
3656       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3657       if (BE (mbc_tree == NULL, 0))
3658 	goto build_word_op_espace;
3659       /* Then join them by ALT node.  */
3660       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3661       if (BE (mbc_tree != NULL, 1))
3662 	return tree;
3663     }
3664   else
3665     {
3666       free_charset (mbcset);
3667       return tree;
3668     }
3669 #else /* not RE_ENABLE_I18N */
3670   return tree;
3671 #endif /* not RE_ENABLE_I18N */
3672 
3673  build_word_op_espace:
3674   re_free (sbcset);
3675 #ifdef RE_ENABLE_I18N
3676   free_charset (mbcset);
3677 #endif /* RE_ENABLE_I18N */
3678   *err = REG_ESPACE;
3679   return NULL;
3680 }
3681 
3682 /* This is intended for the expressions like "a{1,3}".
3683    Fetch a number from `input', and return the number.
3684    Return REG_MISSING if the number field is empty like "{,1}".
3685    Return REG_ERROR if an error occurred.  */
3686 
3687 static Idx
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3688 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3689 {
3690   Idx num = REG_MISSING;
3691   unsigned char c;
3692   while (1)
3693     {
3694       fetch_token (token, input, syntax);
3695       c = token->opr.c;
3696       if (BE (token->type == END_OF_RE, 0))
3697 	return REG_ERROR;
3698       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3699 	break;
3700       num = ((token->type != CHARACTER || c < '0' || '9' < c
3701 	      || num == REG_ERROR)
3702 	     ? REG_ERROR
3703 	     : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3704       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3705     }
3706   return num;
3707 }
3708 
3709 #ifdef RE_ENABLE_I18N
3710 static void
free_charset(re_charset_t * cset)3711 free_charset (re_charset_t *cset)
3712 {
3713   re_free (cset->mbchars);
3714 # ifdef _LIBC
3715   re_free (cset->coll_syms);
3716   re_free (cset->equiv_classes);
3717   re_free (cset->range_starts);
3718   re_free (cset->range_ends);
3719 # endif
3720   re_free (cset->char_classes);
3721   re_free (cset);
3722 }
3723 #endif /* RE_ENABLE_I18N */
3724 
3725 /* Functions for binary tree operation.  */
3726 
3727 /* Create a tree node.  */
3728 
3729 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3730 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3731 	     re_token_type_t type)
3732 {
3733   re_token_t t;
3734   t.type = type;
3735   return create_token_tree (dfa, left, right, &t);
3736 }
3737 
3738 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3739 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3740 		   const re_token_t *token)
3741 {
3742   bin_tree_t *tree;
3743   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3744     {
3745       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3746 
3747       if (storage == NULL)
3748 	return NULL;
3749       storage->next = dfa->str_tree_storage;
3750       dfa->str_tree_storage = storage;
3751       dfa->str_tree_storage_idx = 0;
3752     }
3753   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3754 
3755   tree->parent = NULL;
3756   tree->left = left;
3757   tree->right = right;
3758   tree->token = *token;
3759   tree->token.duplicated = 0;
3760   tree->token.opt_subexp = 0;
3761   tree->first = NULL;
3762   tree->next = NULL;
3763   tree->node_idx = REG_MISSING;
3764 
3765   if (left != NULL)
3766     left->parent = tree;
3767   if (right != NULL)
3768     right->parent = tree;
3769   return tree;
3770 }
3771 
3772 /* Mark the tree SRC as an optional subexpression.
3773    To be called from preorder or postorder.  */
3774 
3775 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3776 mark_opt_subexp (void *extra, bin_tree_t *node)
3777 {
3778   Idx idx = (Idx) (intptr_t) extra;
3779   assert(sizeof(void*) >= sizeof(Idx));
3780   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3781     node->token.opt_subexp = 1;
3782 
3783   return REG_NOERROR;
3784 }
3785 
3786 /* Free the allocated memory inside NODE. */
3787 
3788 static void
free_token(re_token_t * node)3789 free_token (re_token_t *node)
3790 {
3791 #ifdef RE_ENABLE_I18N
3792   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3793     free_charset (node->opr.mbcset);
3794   else
3795 #endif /* RE_ENABLE_I18N */
3796     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3797       re_free (node->opr.sbcset);
3798 }
3799 
3800 /* Worker function for tree walking.  Free the allocated memory inside NODE
3801    and its children. */
3802 
3803 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3804 free_tree (void *extra, bin_tree_t *node)
3805 {
3806   free_token (&node->token);
3807   return REG_NOERROR;
3808 }
3809 
3810 
3811 /* Duplicate the node SRC, and return new node.  This is a preorder
3812    visit similar to the one implemented by the generic visitor, but
3813    we need more infrastructure to maintain two parallel trees --- so,
3814    it's easier to duplicate.  */
3815 
3816 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3817 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3818 {
3819   const bin_tree_t *node;
3820   bin_tree_t *dup_root;
3821   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3822 
3823   for (node = root; ; )
3824     {
3825       /* Create a new tree and link it back to the current parent.  */
3826       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3827       if (*p_new == NULL)
3828 	return NULL;
3829       (*p_new)->parent = dup_node;
3830       (*p_new)->token.duplicated = 1;
3831       dup_node = *p_new;
3832 
3833       /* Go to the left node, or up and to the right.  */
3834       if (node->left)
3835 	{
3836 	  node = node->left;
3837 	  p_new = &dup_node->left;
3838 	}
3839       else
3840 	{
3841 	  const bin_tree_t *prev = NULL;
3842 	  while (node->right == prev || node->right == NULL)
3843 	    {
3844 	      prev = node;
3845 	      node = node->parent;
3846 	      dup_node = dup_node->parent;
3847 	      if (!node)
3848 	        return dup_root;
3849 	    }
3850 	  node = node->right;
3851 	  p_new = &dup_node->right;
3852 	}
3853     }
3854 }
3855