1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Extended regular expression matching and search library.
4    Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
5    Free Software Foundation, Inc.
6    This file is part of the GNU C Library.
7    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
8 
9    This program is free software; you can redistribute it and/or modify
10    it under the terms of the GNU General Public License as published by
11    the Free Software Foundation; either version 3, or (at your option)
12    any later version.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License for more details.
18 
19    You should have received a copy of the GNU General Public License along
20    with this program; if not, write to the Free Software Foundation,
21    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22 
23 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
24 				     Idx n) internal_function;
25 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
26 static void match_ctx_free (re_match_context_t *cache) internal_function;
27 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
28 					  Idx str_idx, Idx from, Idx to)
29      internal_function;
30 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
31      internal_function;
32 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
33 					   Idx str_idx) internal_function;
34 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
35 						    Idx node, Idx str_idx)
36      internal_function;
37 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
38 			   re_dfastate_t **limited_sts, Idx last_node,
39 			   Idx last_str_idx)
40      internal_function;
41 static reg_errcode_t re_search_internal (const regex_t *preg,
42 					 const char *string, Idx length,
43 					 Idx start, Idx last_start, Idx stop,
44 					 size_t nmatch, regmatch_t pmatch[],
45 					 int eflags) internal_function;
46 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
47 				  const char *string1, Idx length1,
48 				  const char *string2, Idx length2,
49 				  Idx start, regoff_t range,
50 				  struct re_registers *regs,
51 				  Idx stop, bool ret_len) internal_function;
52 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
53 				const char *string, Idx length, Idx start,
54 				regoff_t range, Idx stop,
55 				struct re_registers *regs,
56 				bool ret_len) internal_function;
57 static unsigned int re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
58 				  Idx nregs, int regs_allocated)
59      internal_function;
60 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
61      internal_function;
62 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
63 			   Idx *p_match_first) internal_function;
64 static Idx check_halt_state_context (const re_match_context_t *mctx,
65 				     const re_dfastate_t *state, Idx idx)
66      internal_function;
67 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
68 			 regmatch_t *prev_idx_match, Idx cur_node,
69 			 Idx cur_idx, Idx nmatch) internal_function;
70 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
71 				      Idx str_idx, Idx dest_node, Idx nregs,
72 				      regmatch_t *regs,
73 				      re_node_set *eps_via_nodes)
74      internal_function;
75 static reg_errcode_t set_regs (const regex_t *preg,
76 			       const re_match_context_t *mctx,
77 			       size_t nmatch, regmatch_t *pmatch,
78 			       bool fl_backtrack) internal_function;
79 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
80      internal_function;
81 
82 #ifdef RE_ENABLE_I18N
83 static int sift_states_iter_mb (const re_match_context_t *mctx,
84 				re_sift_context_t *sctx,
85 				Idx node_idx, Idx str_idx, Idx max_str_idx)
86      internal_function;
87 #endif /* RE_ENABLE_I18N */
88 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
89 					   re_sift_context_t *sctx)
90      internal_function;
91 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
92 					  re_sift_context_t *sctx, Idx str_idx,
93 					  re_node_set *cur_dest)
94      internal_function;
95 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
96 					      re_sift_context_t *sctx,
97 					      Idx str_idx,
98 					      re_node_set *dest_nodes)
99      internal_function;
100 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
101 					    re_node_set *dest_nodes,
102 					    const re_node_set *candidates)
103      internal_function;
104 static bool check_dst_limits (const re_match_context_t *mctx,
105 			      const re_node_set *limits,
106 			      Idx dst_node, Idx dst_idx, Idx src_node,
107 			      Idx src_idx) internal_function;
108 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
109 					int boundaries, Idx subexp_idx,
110 					Idx from_node, Idx bkref_idx)
111      internal_function;
112 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
113 				      Idx limit, Idx subexp_idx,
114 				      Idx node, Idx str_idx,
115 				      Idx bkref_idx) internal_function;
116 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
117 					  re_node_set *dest_nodes,
118 					  const re_node_set *candidates,
119 					  re_node_set *limits,
120 					  struct re_backref_cache_entry *bkref_ents,
121 					  Idx str_idx) internal_function;
122 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
123 					re_sift_context_t *sctx,
124 					Idx str_idx, const re_node_set *candidates)
125      internal_function;
126 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
127 					re_dfastate_t **dst,
128 					re_dfastate_t **src, Idx num)
129      internal_function;
130 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
131 					 re_match_context_t *mctx) internal_function;
132 static re_dfastate_t *transit_state (reg_errcode_t *err,
133 				     re_match_context_t *mctx,
134 				     re_dfastate_t *state) internal_function;
135 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
136 					    re_match_context_t *mctx,
137 					    re_dfastate_t *next_state)
138      internal_function;
139 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
140 						re_node_set *cur_nodes,
141 						Idx str_idx) internal_function;
142 #if 0
143 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
144 					re_match_context_t *mctx,
145 					re_dfastate_t *pstate)
146      internal_function;
147 #endif
148 #ifdef RE_ENABLE_I18N
149 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
150 				       re_dfastate_t *pstate)
151      internal_function;
152 #endif /* RE_ENABLE_I18N */
153 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
154 					  const re_node_set *nodes)
155      internal_function;
156 static reg_errcode_t get_subexp (re_match_context_t *mctx,
157 				 Idx bkref_node, Idx bkref_str_idx)
158      internal_function;
159 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
160 				     const re_sub_match_top_t *sub_top,
161 				     re_sub_match_last_t *sub_last,
162 				     Idx bkref_node, Idx bkref_str)
163      internal_function;
164 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
165 			     Idx subexp_idx, int type) internal_function;
166 static reg_errcode_t check_arrival (re_match_context_t *mctx,
167 				    state_array_t *path, Idx top_node,
168 				    Idx top_str, Idx last_node, Idx last_str,
169 				    int type) internal_function;
170 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
171 						   Idx str_idx,
172 						   re_node_set *cur_nodes,
173 						   re_node_set *next_nodes)
174      internal_function;
175 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
176 					       re_node_set *cur_nodes,
177 					       Idx ex_subexp, int type)
178      internal_function;
179 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
180 						   re_node_set *dst_nodes,
181 						   Idx target, Idx ex_subexp,
182 						   int type) internal_function;
183 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
184 					 re_node_set *cur_nodes, Idx cur_str,
185 					 Idx subexp_num, int type)
186      internal_function;
187 static bool build_trtable (const re_dfa_t *dfa,
188 			   re_dfastate_t *state) internal_function;
189 #ifdef RE_ENABLE_I18N
190 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
191 				    const re_string_t *input, Idx idx)
192      internal_function;
193 # ifdef _LIBC
194 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
195 						   size_t name_len)
196      internal_function;
197 # endif /* _LIBC */
198 #endif /* RE_ENABLE_I18N */
199 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
200 				       const re_dfastate_t *state,
201 				       re_node_set *states_node,
202 				       bitset_t *states_ch) internal_function;
203 static bool check_node_accept (const re_match_context_t *mctx,
204 			       const re_token_t *node, Idx idx)
205      internal_function;
206 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
207      internal_function;
208 
209 /* Entry point for POSIX code.  */
210 
211 /* regexec searches for a given pattern, specified by PREG, in the
212    string STRING.
213 
214    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
215    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
216    least NMATCH elements, and we set them to the offsets of the
217    corresponding matched substrings.
218 
219    EFLAGS specifies `execution flags' which affect matching: if
220    REG_NOTBOL is set, then ^ does not match at the beginning of the
221    string; if REG_NOTEOL is set, then $ does not match at the end.
222 
223    We return 0 if we find a match and REG_NOMATCH if not.  */
224 
225 int
regexec(preg,string,nmatch,pmatch,eflags)226 regexec (preg, string, nmatch, pmatch, eflags)
227     const regex_t *_Restrict_ preg;
228     const char *_Restrict_ string;
229     size_t nmatch;
230     regmatch_t pmatch[_Restrict_arr_];
231     int eflags;
232 {
233   reg_errcode_t err;
234   Idx start, length;
235 #ifdef _LIBC
236   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
237 #endif
238 
239   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
240     return REG_BADPAT;
241 
242   if (eflags & REG_STARTEND)
243     {
244       start = pmatch[0].rm_so;
245       length = pmatch[0].rm_eo;
246     }
247   else
248     {
249       start = 0;
250       length = strlen (string);
251     }
252 
253   __libc_lock_lock (dfa->lock);
254   if (preg->no_sub)
255     err = re_search_internal (preg, string, length, start, length,
256 			      length, 0, NULL, eflags);
257   else
258     err = re_search_internal (preg, string, length, start, length,
259 			      length, nmatch, pmatch, eflags);
260   __libc_lock_unlock (dfa->lock);
261   return err != REG_NOERROR;
262 }
263 
264 #ifdef _LIBC
265 # include <shlib-compat.h>
266 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
267 
268 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
269 __typeof__ (__regexec) __compat_regexec;
270 
271 int
272 attribute_compat_text_section
__compat_regexec(const regex_t * _Restrict_ preg,const char * _Restrict_ string,size_t nmatch,regmatch_t pmatch[],int eflags)273 __compat_regexec (const regex_t *_Restrict_ preg,
274 		  const char *_Restrict_ string, size_t nmatch,
275 		  regmatch_t pmatch[], int eflags)
276 {
277   return regexec (preg, string, nmatch, pmatch,
278 		  eflags & (REG_NOTBOL | REG_NOTEOL));
279 }
280 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
281 # endif
282 #endif
283 
284 /* Entry points for GNU code.  */
285 
286 /* re_match, re_search, re_match_2, re_search_2
287 
288    The former two functions operate on STRING with length LENGTH,
289    while the later two operate on concatenation of STRING1 and STRING2
290    with lengths LENGTH1 and LENGTH2, respectively.
291 
292    re_match() matches the compiled pattern in BUFP against the string,
293    starting at index START.
294 
295    re_search() first tries matching at index START, then it tries to match
296    starting from index START + 1, and so on.  The last start position tried
297    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
298    way as re_match().)
299 
300    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
301    the first STOP characters of the concatenation of the strings should be
302    concerned.
303 
304    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
305    and all groups is stored in REGS.  (For the "_2" variants, the offsets are
306    computed relative to the concatenation, not relative to the individual
307    strings.)
308 
309    On success, re_match* functions return the length of the match, re_search*
310    return the position of the start of the match.  Return value -1 means no
311    match was found and -2 indicates an internal error.  */
312 
313 regoff_t
re_match(bufp,string,length,start,regs)314 re_match (bufp, string, length, start, regs)
315     struct re_pattern_buffer *bufp;
316     const char *string;
317     Idx length, start;
318     struct re_registers *regs;
319 {
320   return re_search_stub (bufp, string, length, start, 0, length, regs, true);
321 }
322 #ifdef _LIBC
323 weak_alias (__re_match, re_match)
324 #endif
325 
326 regoff_t
327 re_search (bufp, string, length, start, range, regs)
328     struct re_pattern_buffer *bufp;
329     const char *string;
330     Idx length, start;
331     regoff_t range;
332     struct re_registers *regs;
333 {
334   return re_search_stub (bufp, string, length, start, range, length, regs,
335 			 false);
336 }
337 #ifdef _LIBC
338 weak_alias (__re_search, re_search)
339 #endif
340 
341 regoff_t
342 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
343     struct re_pattern_buffer *bufp;
344     const char *string1, *string2;
345     Idx length1, length2, start, stop;
346     struct re_registers *regs;
347 {
348   return re_search_2_stub (bufp, string1, length1, string2, length2,
349 			   start, 0, regs, stop, true);
350 }
351 #ifdef _LIBC
352 weak_alias (__re_match_2, re_match_2)
353 #endif
354 
355 regoff_t
356 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
357     struct re_pattern_buffer *bufp;
358     const char *string1, *string2;
359     Idx length1, length2, start, stop;
360     regoff_t range;
361     struct re_registers *regs;
362 {
363   return re_search_2_stub (bufp, string1, length1, string2, length2,
364 			   start, range, regs, stop, false);
365 }
366 #ifdef _LIBC
weak_alias(__re_search_2,re_search_2)367 weak_alias (__re_search_2, re_search_2)
368 #endif
369 
370 static regoff_t
371 internal_function
372 re_search_2_stub (struct re_pattern_buffer *bufp,
373 		  const char *string1, Idx length1,
374 		  const char *string2, Idx length2,
375 		  Idx start, regoff_t range, struct re_registers *regs,
376 		  Idx stop, bool ret_len)
377 {
378   const char *str;
379   regoff_t rval;
380   Idx len = length1 + length2;
381   char *s = NULL;
382 
383   if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
384     return -2;
385 
386   /* Concatenate the strings.  */
387   if (length2 > 0)
388     if (length1 > 0)
389       {
390 	s = re_malloc (char, len);
391 
392 	if (BE (s == NULL, 0))
393 	  return -2;
394 #ifdef _LIBC
395 	memcpy (__mempcpy (s, string1, length1), string2, length2);
396 #else
397 	memcpy (s, string1, length1);
398 	memcpy (s + length1, string2, length2);
399 #endif
400 	str = s;
401       }
402     else
403       str = string2;
404   else
405     str = string1;
406 
407   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
408 			 ret_len);
409   re_free (s);
410   return rval;
411 }
412 
413 /* The parameters have the same meaning as those of re_search.
414    Additional parameters:
415    If RET_LEN is true the length of the match is returned (re_match style);
416    otherwise the position of the match is returned.  */
417 
418 static regoff_t
419 internal_function
re_search_stub(struct re_pattern_buffer * bufp,const char * string,Idx length,Idx start,regoff_t range,Idx stop,struct re_registers * regs,bool ret_len)420 re_search_stub (struct re_pattern_buffer *bufp,
421 		const char *string, Idx length,
422 		Idx start, regoff_t range, Idx stop, struct re_registers *regs,
423 		bool ret_len)
424 {
425   reg_errcode_t result;
426   regmatch_t *pmatch;
427   Idx nregs;
428   regoff_t rval;
429   int eflags = 0;
430 #ifdef _LIBC
431   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
432 #endif
433   Idx last_start = start + range;
434 
435   /* Check for out-of-range.  */
436   if (BE (start < 0 || start > length, 0))
437     return -1;
438   if (BE (length < last_start || (0 <= range && last_start < start), 0))
439     last_start = length;
440   else if (BE (last_start < 0 || (range < 0 && start <= last_start), 0))
441     last_start = 0;
442 
443   __libc_lock_lock (dfa->lock);
444 
445   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
446   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
447 
448   /* Compile fastmap if we haven't yet.  */
449   if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
450     re_compile_fastmap (bufp);
451 
452   if (BE (bufp->no_sub, 0))
453     regs = NULL;
454 
455   /* We need at least 1 register.  */
456   if (regs == NULL)
457     nregs = 1;
458   else if (BE (bufp->regs_allocated == REGS_FIXED
459 	       && regs->num_regs <= bufp->re_nsub, 0))
460     {
461       nregs = regs->num_regs;
462       if (BE (nregs < 1, 0))
463 	{
464 	  /* Nothing can be copied to regs.  */
465 	  regs = NULL;
466 	  nregs = 1;
467 	}
468     }
469   else
470     nregs = bufp->re_nsub + 1;
471   pmatch = re_malloc (regmatch_t, nregs);
472   if (BE (pmatch == NULL, 0))
473     {
474       rval = -2;
475       goto out;
476     }
477 
478   result = re_search_internal (bufp, string, length, start, last_start, stop,
479 			       nregs, pmatch, eflags);
480 
481   rval = 0;
482 
483   /* I hope we needn't fill ther regs with -1's when no match was found.  */
484   if (result != REG_NOERROR)
485     rval = -1;
486   else if (regs != NULL)
487     {
488       /* If caller wants register contents data back, copy them.  */
489       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
490 					   bufp->regs_allocated);
491       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
492 	rval = -2;
493     }
494 
495   if (BE (rval == 0, 1))
496     {
497       if (ret_len)
498 	{
499 	  assert (pmatch[0].rm_so == start);
500 	  rval = pmatch[0].rm_eo - start;
501 	}
502       else
503 	rval = pmatch[0].rm_so;
504     }
505   re_free (pmatch);
506  out:
507   __libc_lock_unlock (dfa->lock);
508   return rval;
509 }
510 
511 static unsigned int
512 internal_function
re_copy_regs(struct re_registers * regs,regmatch_t * pmatch,Idx nregs,int regs_allocated)513 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
514 	      int regs_allocated)
515 {
516   int rval = REGS_REALLOCATE;
517   Idx i;
518   Idx need_regs = nregs + 1;
519   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
520      uses.  */
521 
522   /* Have the register data arrays been allocated?  */
523   if (regs_allocated == REGS_UNALLOCATED)
524     { /* No.  So allocate them with malloc.  */
525       regs->start = re_malloc (regoff_t, need_regs);
526       if (BE (regs->start == NULL, 0))
527 	return REGS_UNALLOCATED;
528       regs->end = re_malloc (regoff_t, need_regs);
529       if (BE (regs->end == NULL, 0))
530 	{
531 	  re_free (regs->start);
532 	  return REGS_UNALLOCATED;
533 	}
534       regs->num_regs = need_regs;
535     }
536   else if (regs_allocated == REGS_REALLOCATE)
537     { /* Yes.  If we need more elements than were already
538 	 allocated, reallocate them.  If we need fewer, just
539 	 leave it alone.  */
540       if (BE (need_regs > regs->num_regs, 0))
541 	{
542 	  regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
543 	  regoff_t *new_end;
544 	  if (BE (new_start == NULL, 0))
545 	    return REGS_UNALLOCATED;
546 	  new_end = re_realloc (regs->end, regoff_t, need_regs);
547 	  if (BE (new_end == NULL, 0))
548 	    {
549 	      re_free (new_start);
550 	      return REGS_UNALLOCATED;
551 	    }
552 	  regs->start = new_start;
553 	  regs->end = new_end;
554 	  regs->num_regs = need_regs;
555 	}
556     }
557   else
558     {
559       assert (regs_allocated == REGS_FIXED);
560       /* This function may not be called with REGS_FIXED and nregs too big.  */
561       assert (regs->num_regs >= nregs);
562       rval = REGS_FIXED;
563     }
564 
565   /* Copy the regs.  */
566   for (i = 0; i < nregs; ++i)
567     {
568       regs->start[i] = pmatch[i].rm_so;
569       regs->end[i] = pmatch[i].rm_eo;
570     }
571   for ( ; i < regs->num_regs; ++i)
572     regs->start[i] = regs->end[i] = -1;
573 
574   return rval;
575 }
576 
577 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
578    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
579    this memory for recording register information.  STARTS and ENDS
580    must be allocated using the malloc library routine, and must each
581    be at least NUM_REGS * sizeof (regoff_t) bytes long.
582 
583    If NUM_REGS == 0, then subsequent matches should allocate their own
584    register data.
585 
586    Unless this function is called, the first search or match using
587    PATTERN_BUFFER will allocate its own register data, without
588    freeing the old data.  */
589 
590 void
re_set_registers(bufp,regs,num_regs,starts,ends)591 re_set_registers (bufp, regs, num_regs, starts, ends)
592     struct re_pattern_buffer *bufp;
593     struct re_registers *regs;
594     __re_size_t num_regs;
595     regoff_t *starts, *ends;
596 {
597   if (num_regs)
598     {
599       bufp->regs_allocated = REGS_REALLOCATE;
600       regs->num_regs = num_regs;
601       regs->start = starts;
602       regs->end = ends;
603     }
604   else
605     {
606       bufp->regs_allocated = REGS_UNALLOCATED;
607       regs->num_regs = 0;
608       regs->start = regs->end = NULL;
609     }
610 }
611 #ifdef _LIBC
612 weak_alias (__re_set_registers, re_set_registers)
613 #endif
614 
615 /* Entry points compatible with 4.2 BSD regex library.  We don't define
616    them unless specifically requested.  */
617 
618 #if defined _REGEX_RE_COMP || defined _LIBC
619 int
620 # ifdef _LIBC
621 weak_function
622 # endif
623 re_exec (s)
624      const char *s;
625 {
626   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
627 }
628 #endif /* _REGEX_RE_COMP */
629 
630 /* Internal entry point.  */
631 
632 /* Searches for a compiled pattern PREG in the string STRING, whose
633    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
634    meaning as with regexec.  LAST_START is START + RANGE, where
635    START and RANGE have the same meaning as with re_search.
636    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
637    otherwise return the error code.
638    Note: We assume front end functions already check ranges.
639    (0 <= LAST_START && LAST_START <= LENGTH)  */
640 
641 static reg_errcode_t
642 internal_function
re_search_internal(const regex_t * preg,const char * string,Idx length,Idx start,Idx last_start,Idx stop,size_t nmatch,regmatch_t pmatch[],int eflags)643 re_search_internal (const regex_t *preg,
644 		    const char *string, Idx length,
645 		    Idx start, Idx last_start, Idx stop,
646 		    size_t nmatch, regmatch_t pmatch[],
647 		    int eflags)
648 {
649   reg_errcode_t err;
650   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
651   Idx left_lim, right_lim;
652   int incr;
653   bool fl_longest_match;
654   int match_kind;
655   Idx match_first;
656   Idx match_last = REG_MISSING;
657   Idx extra_nmatch;
658   bool sb;
659   int ch;
660 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
661   re_match_context_t mctx = { .dfa = dfa };
662 #else
663   re_match_context_t mctx;
664 #endif
665   char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
666 		    && start != last_start && !preg->can_be_null)
667 		   ? preg->fastmap : NULL);
668   RE_TRANSLATE_TYPE t = preg->translate;
669 
670 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
671   memset (&mctx, '\0', sizeof (re_match_context_t));
672   mctx.dfa = dfa;
673 #endif
674 
675   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
676   nmatch -= extra_nmatch;
677 
678   /* Check if the DFA haven't been compiled.  */
679   if (BE (preg->used == 0 || dfa->init_state == NULL
680 	  || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
681 	  || dfa->init_state_begbuf == NULL, 0))
682     return REG_NOMATCH;
683 
684 #ifdef DEBUG
685   /* We assume front-end functions already check them.  */
686   assert (0 <= last_start && last_start <= length);
687 #endif
688 
689   /* If initial states with non-begbuf contexts have no elements,
690      the regex must be anchored.  If preg->newline_anchor is set,
691      we'll never use init_state_nl, so do not check it.  */
692   if (dfa->init_state->nodes.nelem == 0
693       && dfa->init_state_word->nodes.nelem == 0
694       && (dfa->init_state_nl->nodes.nelem == 0
695 	  || !preg->newline_anchor))
696     {
697       if (start != 0 && last_start != 0)
698         return REG_NOMATCH;
699       start = last_start = 0;
700     }
701 
702   /* We must check the longest matching, if nmatch > 0.  */
703   fl_longest_match = (nmatch != 0 || dfa->nbackref);
704 
705   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
706 			    preg->translate, (preg->syntax & RE_ICASE) != 0,
707 			    dfa);
708   if (BE (err != REG_NOERROR, 0))
709     goto free_return;
710   mctx.input.stop = stop;
711   mctx.input.raw_stop = stop;
712   mctx.input.newline_anchor = preg->newline_anchor;
713 
714   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
715   if (BE (err != REG_NOERROR, 0))
716     goto free_return;
717 
718   /* We will log all the DFA states through which the dfa pass,
719      if nmatch > 1, or this dfa has "multibyte node", which is a
720      back-reference or a node which can accept multibyte character or
721      multi character collating element.  */
722   if (nmatch > 1 || dfa->has_mb_node)
723     {
724       /* Avoid overflow.  */
725       if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
726 	{
727 	  err = REG_ESPACE;
728 	  goto free_return;
729 	}
730 
731       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
732       if (BE (mctx.state_log == NULL, 0))
733 	{
734 	  err = REG_ESPACE;
735 	  goto free_return;
736 	}
737     }
738   else
739     mctx.state_log = NULL;
740 
741   match_first = start;
742   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
743 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
744 
745   /* Check incrementally whether of not the input string match.  */
746   incr = (last_start < start) ? -1 : 1;
747   left_lim = (last_start < start) ? last_start : start;
748   right_lim = (last_start < start) ? start : last_start;
749   sb = dfa->mb_cur_max == 1;
750   match_kind =
751     (fastmap
752      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
753 	| (start <= last_start ? 2 : 0)
754 	| (t != NULL ? 1 : 0))
755      : 8);
756 
757   for (;; match_first += incr)
758     {
759       err = REG_NOMATCH;
760       if (match_first < left_lim || right_lim < match_first)
761 	goto free_return;
762 
763       /* Advance as rapidly as possible through the string, until we
764 	 find a plausible place to start matching.  This may be done
765 	 with varying efficiency, so there are various possibilities:
766 	 only the most common of them are specialized, in order to
767 	 save on code size.  We use a switch statement for speed.  */
768       switch (match_kind)
769 	{
770 	case 8:
771 	  /* No fastmap.  */
772 	  break;
773 
774 	case 7:
775 	  /* Fastmap with single-byte translation, match forward.  */
776 	  while (BE (match_first < right_lim, 1)
777 		 && !fastmap[t[(unsigned char) string[match_first]]])
778 	    ++match_first;
779 	  goto forward_match_found_start_or_reached_end;
780 
781 	case 6:
782 	  /* Fastmap without translation, match forward.  */
783 	  while (BE (match_first < right_lim, 1)
784 		 && !fastmap[(unsigned char) string[match_first]])
785 	    ++match_first;
786 
787 	forward_match_found_start_or_reached_end:
788 	  if (BE (match_first == right_lim, 0))
789 	    {
790 	      ch = match_first >= length
791 		       ? 0 : (unsigned char) string[match_first];
792 	      if (!fastmap[t ? t[ch] : ch])
793 		goto free_return;
794 	    }
795 	  break;
796 
797 	case 4:
798 	case 5:
799 	  /* Fastmap without multi-byte translation, match backwards.  */
800 	  while (match_first >= left_lim)
801 	    {
802 	      ch = match_first >= length
803 		       ? 0 : (unsigned char) string[match_first];
804 	      if (fastmap[t ? t[ch] : ch])
805 		break;
806 	      --match_first;
807 	    }
808 	  if (match_first < left_lim)
809 	    goto free_return;
810 	  break;
811 
812 	default:
813 	  /* In this case, we can't determine easily the current byte,
814 	     since it might be a component byte of a multibyte
815 	     character.  Then we use the constructed buffer instead.  */
816 	  for (;;)
817 	    {
818 	      /* If MATCH_FIRST is out of the valid range, reconstruct the
819 		 buffers.  */
820 	      __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
821 	      if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
822 		{
823 		  err = re_string_reconstruct (&mctx.input, match_first,
824 					       eflags);
825 		  if (BE (err != REG_NOERROR, 0))
826 		    goto free_return;
827 
828 		  offset = match_first - mctx.input.raw_mbs_idx;
829 		}
830 	      /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
831 		 Note that MATCH_FIRST must not be smaller than 0.  */
832 	      ch = (match_first >= length
833 		    ? 0 : re_string_byte_at (&mctx.input, offset));
834 	      if (fastmap[ch])
835 		break;
836 	      match_first += incr;
837 	      if (match_first < left_lim || match_first > right_lim)
838 	        {
839 	          err = REG_NOMATCH;
840 	          goto free_return;
841 	        }
842 	    }
843 	  break;
844 	}
845 
846       /* Reconstruct the buffers so that the matcher can assume that
847 	 the matching starts from the beginning of the buffer.  */
848       err = re_string_reconstruct (&mctx.input, match_first, eflags);
849       if (BE (err != REG_NOERROR, 0))
850 	goto free_return;
851 
852 #ifdef RE_ENABLE_I18N
853      /* Don't consider this char as a possible match start if it part,
854 	yet isn't the head, of a multibyte character.  */
855       if (!sb && !re_string_first_byte (&mctx.input, 0))
856 	continue;
857 #endif
858 
859       /* It seems to be appropriate one, then use the matcher.  */
860       /* We assume that the matching starts from 0.  */
861       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
862       match_last = check_matching (&mctx, fl_longest_match,
863 				   start <= last_start ? &match_first : NULL);
864       if (match_last != REG_MISSING)
865 	{
866 	  if (BE (match_last == REG_ERROR, 0))
867 	    {
868 	      err = REG_ESPACE;
869 	      goto free_return;
870 	    }
871 	  else
872 	    {
873 	      mctx.match_last = match_last;
874 	      if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
875 		{
876 		  re_dfastate_t *pstate = mctx.state_log[match_last];
877 		  mctx.last_node = check_halt_state_context (&mctx, pstate,
878 							     match_last);
879 		}
880 	      if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
881 		  || dfa->nbackref)
882 		{
883 		  err = prune_impossible_nodes (&mctx);
884 		  if (err == REG_NOERROR)
885 		    break;
886 		  if (BE (err != REG_NOMATCH, 0))
887 		    goto free_return;
888 		  match_last = REG_MISSING;
889 		}
890 	      else
891 		break; /* We found a match.  */
892 	    }
893 	}
894 
895       match_ctx_clean (&mctx);
896     }
897 
898 #ifdef DEBUG
899   assert (match_last != REG_MISSING);
900   assert (err == REG_NOERROR);
901 #endif
902 
903   /* Set pmatch[] if we need.  */
904   if (nmatch > 0)
905     {
906       Idx reg_idx;
907 
908       /* Initialize registers.  */
909       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
910 	pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
911 
912       /* Set the points where matching start/end.  */
913       pmatch[0].rm_so = 0;
914       pmatch[0].rm_eo = mctx.match_last;
915       /* FIXME: This function should fail if mctx.match_last exceeds
916 	 the maximum possible regoff_t value.  We need a new error
917 	 code REG_OVERFLOW.  */
918 
919       if (!preg->no_sub && nmatch > 1)
920 	{
921 	  err = set_regs (preg, &mctx, nmatch, pmatch,
922 			  dfa->has_plural_match && dfa->nbackref > 0);
923 	  if (BE (err != REG_NOERROR, 0))
924 	    goto free_return;
925 	}
926 
927       /* At last, add the offset to the each registers, since we slided
928 	 the buffers so that we could assume that the matching starts
929 	 from 0.  */
930       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
931 	if (pmatch[reg_idx].rm_so != -1)
932 	  {
933 #ifdef RE_ENABLE_I18N
934 	    if (BE (mctx.input.offsets_needed != 0, 0))
935 	      {
936 		pmatch[reg_idx].rm_so =
937 		  (pmatch[reg_idx].rm_so == mctx.input.valid_len
938 		   ? mctx.input.valid_raw_len
939 		   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
940 		pmatch[reg_idx].rm_eo =
941 		  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
942 		   ? mctx.input.valid_raw_len
943 		   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
944 	      }
945 #else
946 	    assert (mctx.input.offsets_needed == 0);
947 #endif
948 	    pmatch[reg_idx].rm_so += match_first;
949 	    pmatch[reg_idx].rm_eo += match_first;
950 	  }
951       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
952 	{
953 	  pmatch[nmatch + reg_idx].rm_so = -1;
954 	  pmatch[nmatch + reg_idx].rm_eo = -1;
955 	}
956 
957       if (dfa->subexp_map)
958         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
959           if (dfa->subexp_map[reg_idx] != reg_idx)
960             {
961               pmatch[reg_idx + 1].rm_so
962                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
963               pmatch[reg_idx + 1].rm_eo
964                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
965             }
966     }
967 
968  free_return:
969   re_free (mctx.state_log);
970   if (dfa->nbackref)
971     match_ctx_free (&mctx);
972   re_string_destruct (&mctx.input);
973   return err;
974 }
975 
976 static reg_errcode_t
977 internal_function
prune_impossible_nodes(re_match_context_t * mctx)978 prune_impossible_nodes (re_match_context_t *mctx)
979 {
980   const re_dfa_t *const dfa = mctx->dfa;
981   Idx halt_node, match_last;
982   reg_errcode_t ret;
983   re_dfastate_t **sifted_states;
984   re_dfastate_t **lim_states = NULL;
985   re_sift_context_t sctx;
986 #ifdef DEBUG
987   assert (mctx->state_log != NULL);
988 #endif
989   match_last = mctx->match_last;
990   halt_node = mctx->last_node;
991 
992   /* Avoid overflow.  */
993   if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
994     return REG_ESPACE;
995 
996   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
997   if (BE (sifted_states == NULL, 0))
998     {
999       ret = REG_ESPACE;
1000       goto free_return;
1001     }
1002   if (dfa->nbackref)
1003     {
1004       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
1005       if (BE (lim_states == NULL, 0))
1006 	{
1007 	  ret = REG_ESPACE;
1008 	  goto free_return;
1009 	}
1010       while (1)
1011 	{
1012 	  memset (lim_states, '\0',
1013 		  sizeof (re_dfastate_t *) * (match_last + 1));
1014 	  sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
1015 			 match_last);
1016 	  ret = sift_states_backward (mctx, &sctx);
1017 	  re_node_set_free (&sctx.limits);
1018 	  if (BE (ret != REG_NOERROR, 0))
1019 	      goto free_return;
1020 	  if (sifted_states[0] != NULL || lim_states[0] != NULL)
1021 	    break;
1022 	  do
1023 	    {
1024 	      --match_last;
1025 	      if (! REG_VALID_INDEX (match_last))
1026 		{
1027 		  ret = REG_NOMATCH;
1028 		  goto free_return;
1029 		}
1030 	    } while (mctx->state_log[match_last] == NULL
1031 		     || !mctx->state_log[match_last]->halt);
1032 	  halt_node = check_halt_state_context (mctx,
1033 						mctx->state_log[match_last],
1034 						match_last);
1035 	}
1036       ret = merge_state_array (dfa, sifted_states, lim_states,
1037 			       match_last + 1);
1038       re_free (lim_states);
1039       lim_states = NULL;
1040       if (BE (ret != REG_NOERROR, 0))
1041 	goto free_return;
1042     }
1043   else
1044     {
1045       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1046       ret = sift_states_backward (mctx, &sctx);
1047       re_node_set_free (&sctx.limits);
1048       if (BE (ret != REG_NOERROR, 0))
1049 	goto free_return;
1050       if (sifted_states[0] == NULL)
1051 	{
1052 	  ret = REG_NOMATCH;
1053 	  goto free_return;
1054 	}
1055     }
1056   re_free (mctx->state_log);
1057   mctx->state_log = sifted_states;
1058   sifted_states = NULL;
1059   mctx->last_node = halt_node;
1060   mctx->match_last = match_last;
1061   ret = REG_NOERROR;
1062  free_return:
1063   re_free (sifted_states);
1064   re_free (lim_states);
1065   return ret;
1066 }
1067 
1068 /* Acquire an initial state and return it.
1069    We must select appropriate initial state depending on the context,
1070    since initial states may have constraints like "\<", "^", etc..  */
1071 
1072 static inline re_dfastate_t *
1073 __attribute ((always_inline)) internal_function
acquire_init_state_context(reg_errcode_t * err,const re_match_context_t * mctx,Idx idx)1074 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1075 			    Idx idx)
1076 {
1077   const re_dfa_t *const dfa = mctx->dfa;
1078   if (dfa->init_state->has_constraint)
1079     {
1080       unsigned int context;
1081       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1082       if (IS_WORD_CONTEXT (context))
1083 	return dfa->init_state_word;
1084       else if (IS_ORDINARY_CONTEXT (context))
1085 	return dfa->init_state;
1086       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1087 	return dfa->init_state_begbuf;
1088       else if (IS_NEWLINE_CONTEXT (context))
1089 	return dfa->init_state_nl;
1090       else if (IS_BEGBUF_CONTEXT (context))
1091 	{
1092 	  /* It is relatively rare case, then calculate on demand.  */
1093 	  return re_acquire_state_context (err, dfa,
1094 					   dfa->init_state->entrance_nodes,
1095 					   context);
1096 	}
1097       else
1098 	/* Must not happen?  */
1099 	return dfa->init_state;
1100     }
1101   else
1102     return dfa->init_state;
1103 }
1104 
1105 /* Check whether the regular expression match input string INPUT or not,
1106    and return the index where the matching end.  Return REG_MISSING if
1107    there is no match, and return REG_ERROR in case of an error.
1108    FL_LONGEST_MATCH means we want the POSIX longest matching.
1109    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1110    next place where we may want to try matching.
1111    Note that the matcher assume that the maching starts from the current
1112    index of the buffer.  */
1113 
1114 static Idx
1115 internal_function
check_matching(re_match_context_t * mctx,bool fl_longest_match,Idx * p_match_first)1116 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1117 		Idx *p_match_first)
1118 {
1119   const re_dfa_t *const dfa = mctx->dfa;
1120   reg_errcode_t err;
1121   Idx match = 0;
1122   Idx match_last = REG_MISSING;
1123   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1124   re_dfastate_t *cur_state;
1125   bool at_init_state = p_match_first != NULL;
1126   Idx next_start_idx = cur_str_idx;
1127 
1128   err = REG_NOERROR;
1129   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1130   /* An initial state must not be NULL (invalid).  */
1131   if (BE (cur_state == NULL, 0))
1132     {
1133       assert (err == REG_ESPACE);
1134       return REG_ERROR;
1135     }
1136 
1137   if (mctx->state_log != NULL)
1138     {
1139       mctx->state_log[cur_str_idx] = cur_state;
1140 
1141       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1142 	 later.  E.g. Processing back references.  */
1143       if (BE (dfa->nbackref, 0))
1144 	{
1145 	  at_init_state = false;
1146 	  err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1147 	  if (BE (err != REG_NOERROR, 0))
1148 	    return err;
1149 
1150 	  if (cur_state->has_backref)
1151 	    {
1152 	      err = transit_state_bkref (mctx, &cur_state->nodes);
1153 	      if (BE (err != REG_NOERROR, 0))
1154 	        return err;
1155 	    }
1156 	}
1157     }
1158 
1159   /* If the RE accepts NULL string.  */
1160   if (BE (cur_state->halt, 0))
1161     {
1162       if (!cur_state->has_constraint
1163 	  || check_halt_state_context (mctx, cur_state, cur_str_idx))
1164 	{
1165 	  if (!fl_longest_match)
1166 	    return cur_str_idx;
1167 	  else
1168 	    {
1169 	      match_last = cur_str_idx;
1170 	      match = 1;
1171 	    }
1172 	}
1173     }
1174 
1175   while (!re_string_eoi (&mctx->input))
1176     {
1177       re_dfastate_t *old_state = cur_state;
1178       Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1179 
1180       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1181           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1182               && mctx->input.valid_len < mctx->input.len))
1183         {
1184           err = extend_buffers (mctx);
1185           if (BE (err != REG_NOERROR, 0))
1186 	    {
1187 	      assert (err == REG_ESPACE);
1188 	      return REG_ERROR;
1189 	    }
1190         }
1191 
1192       cur_state = transit_state (&err, mctx, cur_state);
1193       if (mctx->state_log != NULL)
1194 	cur_state = merge_state_with_log (&err, mctx, cur_state);
1195 
1196       if (cur_state == NULL)
1197 	{
1198 	  /* Reached the invalid state or an error.  Try to recover a valid
1199 	     state using the state log, if available and if we have not
1200 	     already found a valid (even if not the longest) match.  */
1201 	  if (BE (err != REG_NOERROR, 0))
1202 	    return REG_ERROR;
1203 
1204 	  if (mctx->state_log == NULL
1205 	      || (match && !fl_longest_match)
1206 	      || (cur_state = find_recover_state (&err, mctx)) == NULL)
1207 	    break;
1208 	}
1209 
1210       if (BE (at_init_state, 0))
1211 	{
1212 	  if (old_state == cur_state)
1213 	    next_start_idx = next_char_idx;
1214 	  else
1215 	    at_init_state = false;
1216 	}
1217 
1218       if (cur_state->halt)
1219 	{
1220 	  /* Reached a halt state.
1221 	     Check the halt state can satisfy the current context.  */
1222 	  if (!cur_state->has_constraint
1223 	      || check_halt_state_context (mctx, cur_state,
1224 					   re_string_cur_idx (&mctx->input)))
1225 	    {
1226 	      /* We found an appropriate halt state.  */
1227 	      match_last = re_string_cur_idx (&mctx->input);
1228 	      match = 1;
1229 
1230 	      /* We found a match, do not modify match_first below.  */
1231 	      p_match_first = NULL;
1232 	      if (!fl_longest_match)
1233 		break;
1234 	    }
1235 	}
1236     }
1237 
1238   if (p_match_first)
1239     *p_match_first += next_start_idx;
1240 
1241   return match_last;
1242 }
1243 
1244 /* Check NODE match the current context.  */
1245 
1246 static bool
1247 internal_function
check_halt_node_context(const re_dfa_t * dfa,Idx node,unsigned int context)1248 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1249 {
1250   re_token_type_t type = dfa->nodes[node].type;
1251   unsigned int constraint = dfa->nodes[node].constraint;
1252   if (type != END_OF_RE)
1253     return false;
1254   if (!constraint)
1255     return true;
1256   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1257     return false;
1258   return true;
1259 }
1260 
1261 /* Check the halt state STATE match the current context.
1262    Return 0 if not match, if the node, STATE has, is a halt node and
1263    match the context, return the node.  */
1264 
1265 static Idx
1266 internal_function
check_halt_state_context(const re_match_context_t * mctx,const re_dfastate_t * state,Idx idx)1267 check_halt_state_context (const re_match_context_t *mctx,
1268 			  const re_dfastate_t *state, Idx idx)
1269 {
1270   Idx i;
1271   unsigned int context;
1272 #ifdef DEBUG
1273   assert (state->halt);
1274 #endif
1275   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1276   for (i = 0; i < state->nodes.nelem; ++i)
1277     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1278       return state->nodes.elems[i];
1279   return 0;
1280 }
1281 
1282 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1283    corresponding to the DFA).
1284    Return the destination node, and update EPS_VIA_NODES;
1285    return REG_MISSING in case of errors.  */
1286 
1287 static Idx
1288 internal_function
proceed_next_node(const re_match_context_t * mctx,Idx nregs,regmatch_t * regs,Idx * pidx,Idx node,re_node_set * eps_via_nodes,struct re_fail_stack_t * fs)1289 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1290 		   Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1291 		   struct re_fail_stack_t *fs)
1292 {
1293   const re_dfa_t *const dfa = mctx->dfa;
1294   Idx i;
1295   bool ok;
1296   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1297     {
1298       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1299       re_node_set *edests = &dfa->edests[node];
1300       Idx dest_node;
1301       ok = re_node_set_insert (eps_via_nodes, node);
1302       if (BE (! ok, 0))
1303 	return REG_ERROR;
1304       /* Pick up a valid destination, or return REG_MISSING if none
1305 	 is found.  */
1306       for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i)
1307 	{
1308 	  Idx candidate = edests->elems[i];
1309 	  if (!re_node_set_contains (cur_nodes, candidate))
1310 	    continue;
1311           if (dest_node == REG_MISSING)
1312 	    dest_node = candidate;
1313 
1314           else
1315 	    {
1316 	      /* In order to avoid infinite loop like "(a*)*", return the second
1317 	         epsilon-transition if the first was already considered.  */
1318 	      if (re_node_set_contains (eps_via_nodes, dest_node))
1319 	        return candidate;
1320 
1321 	      /* Otherwise, push the second epsilon-transition on the fail stack.  */
1322 	      else if (fs != NULL
1323 		       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1324 				           eps_via_nodes))
1325 		return REG_ERROR;
1326 
1327 	      /* We know we are going to exit.  */
1328 	      break;
1329 	    }
1330 	}
1331       return dest_node;
1332     }
1333   else
1334     {
1335       Idx naccepted = 0;
1336       re_token_type_t type = dfa->nodes[node].type;
1337 
1338 #ifdef RE_ENABLE_I18N
1339       if (dfa->nodes[node].accept_mb)
1340 	naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1341       else
1342 #endif /* RE_ENABLE_I18N */
1343       if (type == OP_BACK_REF)
1344 	{
1345 	  Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1346 	  naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1347 	  if (fs != NULL)
1348 	    {
1349 	      if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1350 		return REG_MISSING;
1351 	      else if (naccepted)
1352 		{
1353 		  char *buf = (char *) re_string_get_buffer (&mctx->input);
1354 		  if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1355 			      naccepted) != 0)
1356 		    return REG_MISSING;
1357 		}
1358 	    }
1359 
1360 	  if (naccepted == 0)
1361 	    {
1362 	      Idx dest_node;
1363 	      ok = re_node_set_insert (eps_via_nodes, node);
1364 	      if (BE (! ok, 0))
1365 		return REG_ERROR;
1366 	      dest_node = dfa->edests[node].elems[0];
1367 	      if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1368 					dest_node))
1369 		return dest_node;
1370 	    }
1371 	}
1372 
1373       if (naccepted != 0
1374 	  || check_node_accept (mctx, dfa->nodes + node, *pidx))
1375 	{
1376 	  Idx dest_node = dfa->nexts[node];
1377 	  *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1378 	  if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1379 		     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1380 					       dest_node)))
1381 	    return REG_MISSING;
1382 	  re_node_set_empty (eps_via_nodes);
1383 	  return dest_node;
1384 	}
1385     }
1386   return REG_MISSING;
1387 }
1388 
1389 static reg_errcode_t
1390 internal_function
push_fail_stack(struct re_fail_stack_t * fs,Idx str_idx,Idx dest_node,Idx nregs,regmatch_t * regs,re_node_set * eps_via_nodes)1391 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1392 		 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1393 {
1394   reg_errcode_t err;
1395   Idx num = fs->num++;
1396   if (fs->num == fs->alloc)
1397     {
1398       struct re_fail_stack_ent_t *new_array;
1399       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1400 				       * fs->alloc * 2));
1401       if (new_array == NULL)
1402 	return REG_ESPACE;
1403       fs->alloc *= 2;
1404       fs->stack = new_array;
1405     }
1406   fs->stack[num].idx = str_idx;
1407   fs->stack[num].node = dest_node;
1408   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1409   if (fs->stack[num].regs == NULL)
1410     return REG_ESPACE;
1411   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1412   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1413   return err;
1414 }
1415 
1416 static Idx
1417 internal_function
pop_fail_stack(struct re_fail_stack_t * fs,Idx * pidx,Idx nregs,regmatch_t * regs,re_node_set * eps_via_nodes)1418 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1419 		regmatch_t *regs, re_node_set *eps_via_nodes)
1420 {
1421   Idx num = --fs->num;
1422   assert (REG_VALID_INDEX (num));
1423   *pidx = fs->stack[num].idx;
1424   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1425   re_node_set_free (eps_via_nodes);
1426   re_free (fs->stack[num].regs);
1427   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1428   return fs->stack[num].node;
1429 }
1430 
1431 /* Set the positions where the subexpressions are starts/ends to registers
1432    PMATCH.
1433    Note: We assume that pmatch[0] is already set, and
1434    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1435 
1436 static reg_errcode_t
1437 internal_function
set_regs(const regex_t * preg,const re_match_context_t * mctx,size_t nmatch,regmatch_t * pmatch,bool fl_backtrack)1438 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1439 	  regmatch_t *pmatch, bool fl_backtrack)
1440 {
1441   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1442   Idx idx, cur_node;
1443   re_node_set eps_via_nodes;
1444   struct re_fail_stack_t *fs;
1445   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1446   regmatch_t *prev_idx_match;
1447   bool prev_idx_match_malloced = false;
1448 
1449 #ifdef DEBUG
1450   assert (nmatch > 1);
1451   assert (mctx->state_log != NULL);
1452 #endif
1453   if (fl_backtrack)
1454     {
1455       fs = &fs_body;
1456       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1457       if (fs->stack == NULL)
1458 	return REG_ESPACE;
1459     }
1460   else
1461     fs = NULL;
1462 
1463   cur_node = dfa->init_node;
1464   re_node_set_init_empty (&eps_via_nodes);
1465 
1466   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1467     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1468   else
1469     {
1470       prev_idx_match = re_malloc (regmatch_t, nmatch);
1471       if (prev_idx_match == NULL)
1472 	{
1473 	  free_fail_stack_return (fs);
1474 	  return REG_ESPACE;
1475 	}
1476       prev_idx_match_malloced = true;
1477     }
1478   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1479 
1480   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1481     {
1482       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1483 
1484       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1485 	{
1486 	  Idx reg_idx;
1487 	  if (fs)
1488 	    {
1489 	      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1490 		if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1491 		  break;
1492 	      if (reg_idx == nmatch)
1493 		{
1494 		  re_node_set_free (&eps_via_nodes);
1495 		  if (prev_idx_match_malloced)
1496 		    re_free (prev_idx_match);
1497 		  return free_fail_stack_return (fs);
1498 		}
1499 	      cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1500 					 &eps_via_nodes);
1501 	    }
1502 	  else
1503 	    {
1504 	      re_node_set_free (&eps_via_nodes);
1505 	      if (prev_idx_match_malloced)
1506 		re_free (prev_idx_match);
1507 	      return REG_NOERROR;
1508 	    }
1509 	}
1510 
1511       /* Proceed to next node.  */
1512       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1513 				    &eps_via_nodes, fs);
1514 
1515       if (BE (! REG_VALID_INDEX (cur_node), 0))
1516 	{
1517 	  if (BE (cur_node == REG_ERROR, 0))
1518 	    {
1519 	      re_node_set_free (&eps_via_nodes);
1520 	      if (prev_idx_match_malloced)
1521 		re_free (prev_idx_match);
1522 	      free_fail_stack_return (fs);
1523 	      return REG_ESPACE;
1524 	    }
1525 	  if (fs)
1526 	    cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1527 				       &eps_via_nodes);
1528 	  else
1529 	    {
1530 	      re_node_set_free (&eps_via_nodes);
1531 	      if (prev_idx_match_malloced)
1532 		re_free (prev_idx_match);
1533 	      return REG_NOMATCH;
1534 	    }
1535 	}
1536     }
1537   re_node_set_free (&eps_via_nodes);
1538   if (prev_idx_match_malloced)
1539     re_free (prev_idx_match);
1540   return free_fail_stack_return (fs);
1541 }
1542 
1543 static reg_errcode_t
1544 internal_function
free_fail_stack_return(struct re_fail_stack_t * fs)1545 free_fail_stack_return (struct re_fail_stack_t *fs)
1546 {
1547   if (fs)
1548     {
1549       Idx fs_idx;
1550       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1551 	{
1552 	  re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1553 	  re_free (fs->stack[fs_idx].regs);
1554 	}
1555       re_free (fs->stack);
1556     }
1557   return REG_NOERROR;
1558 }
1559 
1560 static void
1561 internal_function
update_regs(const re_dfa_t * dfa,regmatch_t * pmatch,regmatch_t * prev_idx_match,Idx cur_node,Idx cur_idx,Idx nmatch)1562 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1563 	     regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1564 {
1565   int type = dfa->nodes[cur_node].type;
1566   if (type == OP_OPEN_SUBEXP)
1567     {
1568       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1569 
1570       /* We are at the first node of this sub expression.  */
1571       if (reg_num < nmatch)
1572 	{
1573 	  pmatch[reg_num].rm_so = cur_idx;
1574 	  pmatch[reg_num].rm_eo = -1;
1575 	}
1576     }
1577   else if (type == OP_CLOSE_SUBEXP)
1578     {
1579       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1580       if (reg_num < nmatch)
1581 	{
1582 	  /* We are at the last node of this sub expression.  */
1583 	  if (pmatch[reg_num].rm_so < cur_idx)
1584 	    {
1585 	      pmatch[reg_num].rm_eo = cur_idx;
1586 	      /* This is a non-empty match or we are not inside an optional
1587 		 subexpression.  Accept this right away.  */
1588 	      memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1589 	    }
1590 	  else
1591 	    {
1592 	      if (dfa->nodes[cur_node].opt_subexp
1593 		  && prev_idx_match[reg_num].rm_so != -1)
1594 		/* We transited through an empty match for an optional
1595 		   subexpression, like (a?)*, and this is not the subexp's
1596 		   first match.  Copy back the old content of the registers
1597 		   so that matches of an inner subexpression are undone as
1598 		   well, like in ((a?))*.  */
1599 		memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1600 	      else
1601 		/* We completed a subexpression, but it may be part of
1602 		   an optional one, so do not update PREV_IDX_MATCH.  */
1603 		pmatch[reg_num].rm_eo = cur_idx;
1604 	    }
1605 	}
1606     }
1607 }
1608 
1609 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1610    and sift the nodes in each states according to the following rules.
1611    Updated state_log will be wrote to STATE_LOG.
1612 
1613    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1614      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1615 	If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1616 	the LAST_NODE, we throw away the node `a'.
1617      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1618 	string `s' and transit to `b':
1619 	i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1620 	   away the node `a'.
1621 	ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1622 	    thrown away, we throw away the node `a'.
1623      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1624 	i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1625 	   node `a'.
1626 	ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1627 	    we throw away the node `a'.  */
1628 
1629 #define STATE_NODE_CONTAINS(state,node) \
1630   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1631 
1632 static reg_errcode_t
1633 internal_function
sift_states_backward(const re_match_context_t * mctx,re_sift_context_t * sctx)1634 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1635 {
1636   reg_errcode_t err;
1637   int null_cnt = 0;
1638   Idx str_idx = sctx->last_str_idx;
1639   re_node_set cur_dest;
1640 
1641 #ifdef DEBUG
1642   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1643 #endif
1644 
1645   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1646      transit to the last_node and the last_node itself.  */
1647   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1648   if (BE (err != REG_NOERROR, 0))
1649     return err;
1650   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1651   if (BE (err != REG_NOERROR, 0))
1652     goto free_return;
1653 
1654   /* Then check each states in the state_log.  */
1655   while (str_idx > 0)
1656     {
1657       /* Update counters.  */
1658       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1659       if (null_cnt > mctx->max_mb_elem_len)
1660 	{
1661 	  memset (sctx->sifted_states, '\0',
1662 		  sizeof (re_dfastate_t *) * str_idx);
1663 	  re_node_set_free (&cur_dest);
1664 	  return REG_NOERROR;
1665 	}
1666       re_node_set_empty (&cur_dest);
1667       --str_idx;
1668 
1669       if (mctx->state_log[str_idx])
1670 	{
1671 	  err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1672           if (BE (err != REG_NOERROR, 0))
1673 	    goto free_return;
1674 	}
1675 
1676       /* Add all the nodes which satisfy the following conditions:
1677 	 - It can epsilon transit to a node in CUR_DEST.
1678 	 - It is in CUR_SRC.
1679 	 And update state_log.  */
1680       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1681       if (BE (err != REG_NOERROR, 0))
1682 	goto free_return;
1683     }
1684   err = REG_NOERROR;
1685  free_return:
1686   re_node_set_free (&cur_dest);
1687   return err;
1688 }
1689 
1690 static reg_errcode_t
1691 internal_function
build_sifted_states(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * cur_dest)1692 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1693 		     Idx str_idx, re_node_set *cur_dest)
1694 {
1695   const re_dfa_t *const dfa = mctx->dfa;
1696   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1697   Idx i;
1698 
1699   /* Then build the next sifted state.
1700      We build the next sifted state on `cur_dest', and update
1701      `sifted_states[str_idx]' with `cur_dest'.
1702      Note:
1703      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1704      `cur_src' points the node_set of the old `state_log[str_idx]'
1705      (with the epsilon nodes pre-filtered out).  */
1706   for (i = 0; i < cur_src->nelem; i++)
1707     {
1708       Idx prev_node = cur_src->elems[i];
1709       int naccepted = 0;
1710       bool ok;
1711 
1712 #ifdef DEBUG
1713       re_token_type_t type = dfa->nodes[prev_node].type;
1714       assert (!IS_EPSILON_NODE (type));
1715 #endif
1716 #ifdef RE_ENABLE_I18N
1717       /* If the node may accept `multi byte'.  */
1718       if (dfa->nodes[prev_node].accept_mb)
1719 	naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1720 					 str_idx, sctx->last_str_idx);
1721 #endif /* RE_ENABLE_I18N */
1722 
1723       /* We don't check backreferences here.
1724 	 See update_cur_sifted_state().  */
1725       if (!naccepted
1726 	  && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1727 	  && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1728 				  dfa->nexts[prev_node]))
1729 	naccepted = 1;
1730 
1731       if (naccepted == 0)
1732 	continue;
1733 
1734       if (sctx->limits.nelem)
1735 	{
1736 	  Idx to_idx = str_idx + naccepted;
1737 	  if (check_dst_limits (mctx, &sctx->limits,
1738 				dfa->nexts[prev_node], to_idx,
1739 				prev_node, str_idx))
1740 	    continue;
1741 	}
1742       ok = re_node_set_insert (cur_dest, prev_node);
1743       if (BE (! ok, 0))
1744 	return REG_ESPACE;
1745     }
1746 
1747   return REG_NOERROR;
1748 }
1749 
1750 /* Helper functions.  */
1751 
1752 static reg_errcode_t
1753 internal_function
clean_state_log_if_needed(re_match_context_t * mctx,Idx next_state_log_idx)1754 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1755 {
1756   Idx top = mctx->state_log_top;
1757 
1758   if (next_state_log_idx >= mctx->input.bufs_len
1759       || (next_state_log_idx >= mctx->input.valid_len
1760 	  && mctx->input.valid_len < mctx->input.len))
1761     {
1762       reg_errcode_t err;
1763       err = extend_buffers (mctx);
1764       if (BE (err != REG_NOERROR, 0))
1765 	return err;
1766     }
1767 
1768   if (top < next_state_log_idx)
1769     {
1770       memset (mctx->state_log + top + 1, '\0',
1771 	      sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1772       mctx->state_log_top = next_state_log_idx;
1773     }
1774   return REG_NOERROR;
1775 }
1776 
1777 static reg_errcode_t
1778 internal_function
merge_state_array(const re_dfa_t * dfa,re_dfastate_t ** dst,re_dfastate_t ** src,Idx num)1779 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1780 		   re_dfastate_t **src, Idx num)
1781 {
1782   Idx st_idx;
1783   reg_errcode_t err;
1784   for (st_idx = 0; st_idx < num; ++st_idx)
1785     {
1786       if (dst[st_idx] == NULL)
1787 	dst[st_idx] = src[st_idx];
1788       else if (src[st_idx] != NULL)
1789 	{
1790 	  re_node_set merged_set;
1791 	  err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1792 					&src[st_idx]->nodes);
1793 	  if (BE (err != REG_NOERROR, 0))
1794 	    return err;
1795 	  dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1796 	  re_node_set_free (&merged_set);
1797 	  if (BE (err != REG_NOERROR, 0))
1798 	    return err;
1799 	}
1800     }
1801   return REG_NOERROR;
1802 }
1803 
1804 static reg_errcode_t
1805 internal_function
update_cur_sifted_state(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * dest_nodes)1806 update_cur_sifted_state (const re_match_context_t *mctx,
1807 			 re_sift_context_t *sctx, Idx str_idx,
1808 			 re_node_set *dest_nodes)
1809 {
1810   const re_dfa_t *const dfa = mctx->dfa;
1811   reg_errcode_t err = REG_NOERROR;
1812   const re_node_set *candidates;
1813   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1814 		: &mctx->state_log[str_idx]->nodes);
1815 
1816   if (dest_nodes->nelem == 0)
1817     sctx->sifted_states[str_idx] = NULL;
1818   else
1819     {
1820       if (candidates)
1821 	{
1822 	  /* At first, add the nodes which can epsilon transit to a node in
1823 	     DEST_NODE.  */
1824 	  err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1825 	  if (BE (err != REG_NOERROR, 0))
1826 	    return err;
1827 
1828 	  /* Then, check the limitations in the current sift_context.  */
1829 	  if (sctx->limits.nelem)
1830 	    {
1831 	      err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1832 					 mctx->bkref_ents, str_idx);
1833 	      if (BE (err != REG_NOERROR, 0))
1834 		return err;
1835 	    }
1836 	}
1837 
1838       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1839       if (BE (err != REG_NOERROR, 0))
1840 	return err;
1841     }
1842 
1843   if (candidates && mctx->state_log[str_idx]->has_backref)
1844     {
1845       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1846       if (BE (err != REG_NOERROR, 0))
1847 	return err;
1848     }
1849   return REG_NOERROR;
1850 }
1851 
1852 static reg_errcode_t
1853 internal_function
add_epsilon_src_nodes(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates)1854 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1855 		       const re_node_set *candidates)
1856 {
1857   reg_errcode_t err = REG_NOERROR;
1858   Idx i;
1859 
1860   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1861   if (BE (err != REG_NOERROR, 0))
1862     return err;
1863 
1864   if (!state->inveclosure.alloc)
1865     {
1866       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1867       if (BE (err != REG_NOERROR, 0))
1868         return REG_ESPACE;
1869       for (i = 0; i < dest_nodes->nelem; i++)
1870         re_node_set_merge (&state->inveclosure,
1871 			   dfa->inveclosures + dest_nodes->elems[i]);
1872     }
1873   return re_node_set_add_intersect (dest_nodes, candidates,
1874 				    &state->inveclosure);
1875 }
1876 
1877 static reg_errcode_t
1878 internal_function
sub_epsilon_src_nodes(const re_dfa_t * dfa,Idx node,re_node_set * dest_nodes,const re_node_set * candidates)1879 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1880 		       const re_node_set *candidates)
1881 {
1882     Idx ecl_idx;
1883     reg_errcode_t err;
1884     re_node_set *inv_eclosure = dfa->inveclosures + node;
1885     re_node_set except_nodes;
1886     re_node_set_init_empty (&except_nodes);
1887     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1888       {
1889 	Idx cur_node = inv_eclosure->elems[ecl_idx];
1890 	if (cur_node == node)
1891 	  continue;
1892 	if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1893 	  {
1894 	    Idx edst1 = dfa->edests[cur_node].elems[0];
1895 	    Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1896 			 ? dfa->edests[cur_node].elems[1] : REG_MISSING);
1897 	    if ((!re_node_set_contains (inv_eclosure, edst1)
1898 		 && re_node_set_contains (dest_nodes, edst1))
1899 		|| (REG_VALID_NONZERO_INDEX (edst2)
1900 		    && !re_node_set_contains (inv_eclosure, edst2)
1901 		    && re_node_set_contains (dest_nodes, edst2)))
1902 	      {
1903 		err = re_node_set_add_intersect (&except_nodes, candidates,
1904 						 dfa->inveclosures + cur_node);
1905 		if (BE (err != REG_NOERROR, 0))
1906 		  {
1907 		    re_node_set_free (&except_nodes);
1908 		    return err;
1909 		  }
1910 	      }
1911 	  }
1912       }
1913     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1914       {
1915 	Idx cur_node = inv_eclosure->elems[ecl_idx];
1916 	if (!re_node_set_contains (&except_nodes, cur_node))
1917 	  {
1918 	    Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1919 	    re_node_set_remove_at (dest_nodes, idx);
1920 	  }
1921       }
1922     re_node_set_free (&except_nodes);
1923     return REG_NOERROR;
1924 }
1925 
1926 static bool
1927 internal_function
check_dst_limits(const re_match_context_t * mctx,const re_node_set * limits,Idx dst_node,Idx dst_idx,Idx src_node,Idx src_idx)1928 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1929 		  Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1930 {
1931   const re_dfa_t *const dfa = mctx->dfa;
1932   Idx lim_idx, src_pos, dst_pos;
1933 
1934   Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1935   Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1936   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1937     {
1938       Idx subexp_idx;
1939       struct re_backref_cache_entry *ent;
1940       ent = mctx->bkref_ents + limits->elems[lim_idx];
1941       subexp_idx = dfa->nodes[ent->node].opr.idx;
1942 
1943       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1944 					   subexp_idx, dst_node, dst_idx,
1945 					   dst_bkref_idx);
1946       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1947 					   subexp_idx, src_node, src_idx,
1948 					   src_bkref_idx);
1949 
1950       /* In case of:
1951 	 <src> <dst> ( <subexp> )
1952 	 ( <subexp> ) <src> <dst>
1953 	 ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1954       if (src_pos == dst_pos)
1955 	continue; /* This is unrelated limitation.  */
1956       else
1957 	return true;
1958     }
1959   return false;
1960 }
1961 
1962 static int
1963 internal_function
check_dst_limits_calc_pos_1(const re_match_context_t * mctx,int boundaries,Idx subexp_idx,Idx from_node,Idx bkref_idx)1964 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1965 			     Idx subexp_idx, Idx from_node, Idx bkref_idx)
1966 {
1967   const re_dfa_t *const dfa = mctx->dfa;
1968   const re_node_set *eclosures = dfa->eclosures + from_node;
1969   Idx node_idx;
1970 
1971   /* Else, we are on the boundary: examine the nodes on the epsilon
1972      closure.  */
1973   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1974     {
1975       Idx node = eclosures->elems[node_idx];
1976       switch (dfa->nodes[node].type)
1977 	{
1978 	case OP_BACK_REF:
1979 	  if (bkref_idx != REG_MISSING)
1980 	    {
1981 	      struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1982 	      do
1983 	        {
1984 		  Idx dst;
1985 		  int cpos;
1986 
1987 		  if (ent->node != node)
1988 		    continue;
1989 
1990 		  if (subexp_idx < BITSET_WORD_BITS
1991 		      && !(ent->eps_reachable_subexps_map
1992 			   & ((bitset_word_t) 1 << subexp_idx)))
1993 		    continue;
1994 
1995 		  /* Recurse trying to reach the OP_OPEN_SUBEXP and
1996 		     OP_CLOSE_SUBEXP cases below.  But, if the
1997 		     destination node is the same node as the source
1998 		     node, don't recurse because it would cause an
1999 		     infinite loop: a regex that exhibits this behavior
2000 		     is ()\1*\1*  */
2001 		  dst = dfa->edests[node].elems[0];
2002 		  if (dst == from_node)
2003 		    {
2004 		      if (boundaries & 1)
2005 		        return -1;
2006 		      else /* if (boundaries & 2) */
2007 		        return 0;
2008 		    }
2009 
2010 		  cpos =
2011 		    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2012 						 dst, bkref_idx);
2013 		  if (cpos == -1 /* && (boundaries & 1) */)
2014 		    return -1;
2015 		  if (cpos == 0 && (boundaries & 2))
2016 		    return 0;
2017 
2018 		  if (subexp_idx < BITSET_WORD_BITS)
2019 		    ent->eps_reachable_subexps_map
2020 		      &= ~((bitset_word_t) 1 << subexp_idx);
2021 	        }
2022 	      while (ent++->more);
2023 	    }
2024 	  break;
2025 
2026 	case OP_OPEN_SUBEXP:
2027 	  if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2028 	    return -1;
2029 	  break;
2030 
2031 	case OP_CLOSE_SUBEXP:
2032 	  if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2033 	    return 0;
2034 	  break;
2035 
2036 	default:
2037 	    break;
2038 	}
2039     }
2040 
2041   return (boundaries & 2) ? 1 : 0;
2042 }
2043 
2044 static int
2045 internal_function
check_dst_limits_calc_pos(const re_match_context_t * mctx,Idx limit,Idx subexp_idx,Idx from_node,Idx str_idx,Idx bkref_idx)2046 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
2047 			   Idx subexp_idx, Idx from_node, Idx str_idx,
2048 			   Idx bkref_idx)
2049 {
2050   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2051   int boundaries;
2052 
2053   /* If we are outside the range of the subexpression, return -1 or 1.  */
2054   if (str_idx < lim->subexp_from)
2055     return -1;
2056 
2057   if (lim->subexp_to < str_idx)
2058     return 1;
2059 
2060   /* If we are within the subexpression, return 0.  */
2061   boundaries = (str_idx == lim->subexp_from);
2062   boundaries |= (str_idx == lim->subexp_to) << 1;
2063   if (boundaries == 0)
2064     return 0;
2065 
2066   /* Else, examine epsilon closure.  */
2067   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2068 				      from_node, bkref_idx);
2069 }
2070 
2071 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2072    which are against limitations from DEST_NODES. */
2073 
2074 static reg_errcode_t
2075 internal_function
check_subexp_limits(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates,re_node_set * limits,struct re_backref_cache_entry * bkref_ents,Idx str_idx)2076 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2077 		     const re_node_set *candidates, re_node_set *limits,
2078 		     struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2079 {
2080   reg_errcode_t err;
2081   Idx node_idx, lim_idx;
2082 
2083   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2084     {
2085       Idx subexp_idx;
2086       struct re_backref_cache_entry *ent;
2087       ent = bkref_ents + limits->elems[lim_idx];
2088 
2089       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2090 	continue; /* This is unrelated limitation.  */
2091 
2092       subexp_idx = dfa->nodes[ent->node].opr.idx;
2093       if (ent->subexp_to == str_idx)
2094 	{
2095 	  Idx ops_node = REG_MISSING;
2096 	  Idx cls_node = REG_MISSING;
2097 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2098 	    {
2099 	      Idx node = dest_nodes->elems[node_idx];
2100 	      re_token_type_t type = dfa->nodes[node].type;
2101 	      if (type == OP_OPEN_SUBEXP
2102 		  && subexp_idx == dfa->nodes[node].opr.idx)
2103 		ops_node = node;
2104 	      else if (type == OP_CLOSE_SUBEXP
2105 		       && subexp_idx == dfa->nodes[node].opr.idx)
2106 		cls_node = node;
2107 	    }
2108 
2109 	  /* Check the limitation of the open subexpression.  */
2110 	  /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2111 	  if (REG_VALID_INDEX (ops_node))
2112 	    {
2113 	      err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2114 					   candidates);
2115 	      if (BE (err != REG_NOERROR, 0))
2116 		return err;
2117 	    }
2118 
2119 	  /* Check the limitation of the close subexpression.  */
2120 	  if (REG_VALID_INDEX (cls_node))
2121 	    for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2122 	      {
2123 		Idx node = dest_nodes->elems[node_idx];
2124 		if (!re_node_set_contains (dfa->inveclosures + node,
2125 					   cls_node)
2126 		    && !re_node_set_contains (dfa->eclosures + node,
2127 					      cls_node))
2128 		  {
2129 		    /* It is against this limitation.
2130 		       Remove it form the current sifted state.  */
2131 		    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2132 						 candidates);
2133 		    if (BE (err != REG_NOERROR, 0))
2134 		      return err;
2135 		    --node_idx;
2136 		  }
2137 	      }
2138 	}
2139       else /* (ent->subexp_to != str_idx)  */
2140 	{
2141 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2142 	    {
2143 	      Idx node = dest_nodes->elems[node_idx];
2144 	      re_token_type_t type = dfa->nodes[node].type;
2145 	      if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2146 		{
2147 		  if (subexp_idx != dfa->nodes[node].opr.idx)
2148 		    continue;
2149 		  /* It is against this limitation.
2150 		     Remove it form the current sifted state.  */
2151 		  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2152 					       candidates);
2153 		  if (BE (err != REG_NOERROR, 0))
2154 		    return err;
2155 		}
2156 	    }
2157 	}
2158     }
2159   return REG_NOERROR;
2160 }
2161 
2162 static reg_errcode_t
2163 internal_function
sift_states_bkref(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,const re_node_set * candidates)2164 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2165 		   Idx str_idx, const re_node_set *candidates)
2166 {
2167   const re_dfa_t *const dfa = mctx->dfa;
2168   reg_errcode_t err;
2169   Idx node_idx, node;
2170   re_sift_context_t local_sctx;
2171   Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2172 
2173   if (first_idx == REG_MISSING)
2174     return REG_NOERROR;
2175 
2176   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2177 
2178   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2179     {
2180       Idx enabled_idx;
2181       re_token_type_t type;
2182       struct re_backref_cache_entry *entry;
2183       node = candidates->elems[node_idx];
2184       type = dfa->nodes[node].type;
2185       /* Avoid infinite loop for the REs like "()\1+".  */
2186       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2187 	continue;
2188       if (type != OP_BACK_REF)
2189 	continue;
2190 
2191       entry = mctx->bkref_ents + first_idx;
2192       enabled_idx = first_idx;
2193       do
2194 	{
2195 	  Idx subexp_len;
2196 	  Idx to_idx;
2197 	  Idx dst_node;
2198 	  bool ok;
2199 	  re_dfastate_t *cur_state;
2200 
2201 	  if (entry->node != node)
2202 	    continue;
2203 	  subexp_len = entry->subexp_to - entry->subexp_from;
2204 	  to_idx = str_idx + subexp_len;
2205 	  dst_node = (subexp_len ? dfa->nexts[node]
2206 		      : dfa->edests[node].elems[0]);
2207 
2208 	  if (to_idx > sctx->last_str_idx
2209 	      || sctx->sifted_states[to_idx] == NULL
2210 	      || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2211 	      || check_dst_limits (mctx, &sctx->limits, node,
2212 				   str_idx, dst_node, to_idx))
2213 	    continue;
2214 
2215 	  if (local_sctx.sifted_states == NULL)
2216 	    {
2217 	      local_sctx = *sctx;
2218 	      err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2219 	      if (BE (err != REG_NOERROR, 0))
2220 		goto free_return;
2221 	    }
2222 	  local_sctx.last_node = node;
2223 	  local_sctx.last_str_idx = str_idx;
2224 	  ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2225 	  if (BE (! ok, 0))
2226 	    {
2227 	      err = REG_ESPACE;
2228 	      goto free_return;
2229 	    }
2230 	  cur_state = local_sctx.sifted_states[str_idx];
2231 	  err = sift_states_backward (mctx, &local_sctx);
2232 	  if (BE (err != REG_NOERROR, 0))
2233 	    goto free_return;
2234 	  if (sctx->limited_states != NULL)
2235 	    {
2236 	      err = merge_state_array (dfa, sctx->limited_states,
2237 				       local_sctx.sifted_states,
2238 				       str_idx + 1);
2239 	      if (BE (err != REG_NOERROR, 0))
2240 		goto free_return;
2241 	    }
2242 	  local_sctx.sifted_states[str_idx] = cur_state;
2243 	  re_node_set_remove (&local_sctx.limits, enabled_idx);
2244 
2245 	  /* mctx->bkref_ents may have changed, reload the pointer.  */
2246           entry = mctx->bkref_ents + enabled_idx;
2247 	}
2248       while (enabled_idx++, entry++->more);
2249     }
2250   err = REG_NOERROR;
2251  free_return:
2252   if (local_sctx.sifted_states != NULL)
2253     {
2254       re_node_set_free (&local_sctx.limits);
2255     }
2256 
2257   return err;
2258 }
2259 
2260 
2261 #ifdef RE_ENABLE_I18N
2262 static int
2263 internal_function
sift_states_iter_mb(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx node_idx,Idx str_idx,Idx max_str_idx)2264 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2265 		     Idx node_idx, Idx str_idx, Idx max_str_idx)
2266 {
2267   const re_dfa_t *const dfa = mctx->dfa;
2268   int naccepted;
2269   /* Check the node can accept `multi byte'.  */
2270   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2271   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2272       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2273 			    dfa->nexts[node_idx]))
2274     /* The node can't accept the `multi byte', or the
2275        destination was already thrown away, then the node
2276        could't accept the current input `multi byte'.   */
2277     naccepted = 0;
2278   /* Otherwise, it is sure that the node could accept
2279      `naccepted' bytes input.  */
2280   return naccepted;
2281 }
2282 #endif /* RE_ENABLE_I18N */
2283 
2284 
2285 /* Functions for state transition.  */
2286 
2287 /* Return the next state to which the current state STATE will transit by
2288    accepting the current input byte, and update STATE_LOG if necessary.
2289    If STATE can accept a multibyte char/collating element/back reference
2290    update the destination of STATE_LOG.  */
2291 
2292 static re_dfastate_t *
2293 internal_function
transit_state(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * state)2294 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2295 	       re_dfastate_t *state)
2296 {
2297   re_dfastate_t **trtable;
2298   unsigned char ch;
2299 
2300 #ifdef RE_ENABLE_I18N
2301   /* If the current state can accept multibyte.  */
2302   if (BE (state->accept_mb, 0))
2303     {
2304       *err = transit_state_mb (mctx, state);
2305       if (BE (*err != REG_NOERROR, 0))
2306 	return NULL;
2307     }
2308 #endif /* RE_ENABLE_I18N */
2309 
2310   /* Then decide the next state with the single byte.  */
2311 #if 0
2312   if (0)
2313     /* don't use transition table  */
2314     return transit_state_sb (err, mctx, state);
2315 #endif
2316 
2317   /* Use transition table  */
2318   ch = re_string_fetch_byte (&mctx->input);
2319   for (;;)
2320     {
2321       trtable = state->trtable;
2322       if (BE (trtable != NULL, 1))
2323 	return trtable[ch];
2324 
2325       trtable = state->word_trtable;
2326       if (BE (trtable != NULL, 1))
2327         {
2328 	  unsigned int context;
2329 	  context
2330 	    = re_string_context_at (&mctx->input,
2331 				    re_string_cur_idx (&mctx->input) - 1,
2332 				    mctx->eflags);
2333 	  if (IS_WORD_CONTEXT (context))
2334 	    return trtable[ch + SBC_MAX];
2335 	  else
2336 	    return trtable[ch];
2337 	}
2338 
2339       if (!build_trtable (mctx->dfa, state))
2340 	{
2341 	  *err = REG_ESPACE;
2342 	  return NULL;
2343 	}
2344 
2345       /* Retry, we now have a transition table.  */
2346     }
2347 }
2348 
2349 /* Update the state_log if we need */
2350 static re_dfastate_t *
2351 internal_function
merge_state_with_log(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * next_state)2352 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2353 		      re_dfastate_t *next_state)
2354 {
2355   const re_dfa_t *const dfa = mctx->dfa;
2356   Idx cur_idx = re_string_cur_idx (&mctx->input);
2357 
2358   if (cur_idx > mctx->state_log_top)
2359     {
2360       mctx->state_log[cur_idx] = next_state;
2361       mctx->state_log_top = cur_idx;
2362     }
2363   else if (mctx->state_log[cur_idx] == 0)
2364     {
2365       mctx->state_log[cur_idx] = next_state;
2366     }
2367   else
2368     {
2369       re_dfastate_t *pstate;
2370       unsigned int context;
2371       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2372       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2373          the destination of a multibyte char/collating element/
2374          back reference.  Then the next state is the union set of
2375          these destinations and the results of the transition table.  */
2376       pstate = mctx->state_log[cur_idx];
2377       log_nodes = pstate->entrance_nodes;
2378       if (next_state != NULL)
2379         {
2380           table_nodes = next_state->entrance_nodes;
2381           *err = re_node_set_init_union (&next_nodes, table_nodes,
2382 					     log_nodes);
2383           if (BE (*err != REG_NOERROR, 0))
2384 	    return NULL;
2385         }
2386       else
2387         next_nodes = *log_nodes;
2388       /* Note: We already add the nodes of the initial state,
2389 	 then we don't need to add them here.  */
2390 
2391       context = re_string_context_at (&mctx->input,
2392 				      re_string_cur_idx (&mctx->input) - 1,
2393 				      mctx->eflags);
2394       next_state = mctx->state_log[cur_idx]
2395         = re_acquire_state_context (err, dfa, &next_nodes, context);
2396       /* We don't need to check errors here, since the return value of
2397          this function is next_state and ERR is already set.  */
2398 
2399       if (table_nodes != NULL)
2400         re_node_set_free (&next_nodes);
2401     }
2402 
2403   if (BE (dfa->nbackref, 0) && next_state != NULL)
2404     {
2405       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2406 	 later.  We must check them here, since the back references in the
2407 	 next state might use them.  */
2408       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2409 					cur_idx);
2410       if (BE (*err != REG_NOERROR, 0))
2411 	return NULL;
2412 
2413       /* If the next state has back references.  */
2414       if (next_state->has_backref)
2415 	{
2416 	  *err = transit_state_bkref (mctx, &next_state->nodes);
2417 	  if (BE (*err != REG_NOERROR, 0))
2418 	    return NULL;
2419 	  next_state = mctx->state_log[cur_idx];
2420 	}
2421     }
2422 
2423   return next_state;
2424 }
2425 
2426 /* Skip bytes in the input that correspond to part of a
2427    multi-byte match, then look in the log for a state
2428    from which to restart matching.  */
2429 static re_dfastate_t *
2430 internal_function
find_recover_state(reg_errcode_t * err,re_match_context_t * mctx)2431 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2432 {
2433   re_dfastate_t *cur_state;
2434   do
2435     {
2436       Idx max = mctx->state_log_top;
2437       Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2438 
2439       do
2440 	{
2441           if (++cur_str_idx > max)
2442             return NULL;
2443           re_string_skip_bytes (&mctx->input, 1);
2444 	}
2445       while (mctx->state_log[cur_str_idx] == NULL);
2446 
2447       cur_state = merge_state_with_log (err, mctx, NULL);
2448     }
2449   while (*err == REG_NOERROR && cur_state == NULL);
2450   return cur_state;
2451 }
2452 
2453 /* Helper functions for transit_state.  */
2454 
2455 /* From the node set CUR_NODES, pick up the nodes whose types are
2456    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2457    expression. And register them to use them later for evaluating the
2458    correspoding back references.  */
2459 
2460 static reg_errcode_t
2461 internal_function
check_subexp_matching_top(re_match_context_t * mctx,re_node_set * cur_nodes,Idx str_idx)2462 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2463 			   Idx str_idx)
2464 {
2465   const re_dfa_t *const dfa = mctx->dfa;
2466   Idx node_idx;
2467   reg_errcode_t err;
2468 
2469   /* TODO: This isn't efficient.
2470 	   Because there might be more than one nodes whose types are
2471 	   OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2472 	   nodes.
2473 	   E.g. RE: (a){2}  */
2474   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2475     {
2476       Idx node = cur_nodes->elems[node_idx];
2477       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2478 	  && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2479 	  && (dfa->used_bkref_map
2480 	      & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2481 	{
2482 	  err = match_ctx_add_subtop (mctx, node, str_idx);
2483 	  if (BE (err != REG_NOERROR, 0))
2484 	    return err;
2485 	}
2486     }
2487   return REG_NOERROR;
2488 }
2489 
2490 #if 0
2491 /* Return the next state to which the current state STATE will transit by
2492    accepting the current input byte.  */
2493 
2494 static re_dfastate_t *
2495 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2496 		  re_dfastate_t *state)
2497 {
2498   const re_dfa_t *const dfa = mctx->dfa;
2499   re_node_set next_nodes;
2500   re_dfastate_t *next_state;
2501   Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2502   unsigned int context;
2503 
2504   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2505   if (BE (*err != REG_NOERROR, 0))
2506     return NULL;
2507   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2508     {
2509       Idx cur_node = state->nodes.elems[node_cnt];
2510       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2511 	{
2512 	  *err = re_node_set_merge (&next_nodes,
2513 				    dfa->eclosures + dfa->nexts[cur_node]);
2514 	  if (BE (*err != REG_NOERROR, 0))
2515 	    {
2516 	      re_node_set_free (&next_nodes);
2517 	      return NULL;
2518 	    }
2519 	}
2520     }
2521   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2522   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2523   /* We don't need to check errors here, since the return value of
2524      this function is next_state and ERR is already set.  */
2525 
2526   re_node_set_free (&next_nodes);
2527   re_string_skip_bytes (&mctx->input, 1);
2528   return next_state;
2529 }
2530 #endif
2531 
2532 #ifdef RE_ENABLE_I18N
2533 static reg_errcode_t
2534 internal_function
transit_state_mb(re_match_context_t * mctx,re_dfastate_t * pstate)2535 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2536 {
2537   const re_dfa_t *const dfa = mctx->dfa;
2538   reg_errcode_t err;
2539   Idx i;
2540 
2541   for (i = 0; i < pstate->nodes.nelem; ++i)
2542     {
2543       re_node_set dest_nodes, *new_nodes;
2544       Idx cur_node_idx = pstate->nodes.elems[i];
2545       int naccepted;
2546       Idx dest_idx;
2547       unsigned int context;
2548       re_dfastate_t *dest_state;
2549 
2550       if (!dfa->nodes[cur_node_idx].accept_mb)
2551         continue;
2552 
2553       if (dfa->nodes[cur_node_idx].constraint)
2554 	{
2555 	  context = re_string_context_at (&mctx->input,
2556 					  re_string_cur_idx (&mctx->input),
2557 					  mctx->eflags);
2558 	  if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2559 					   context))
2560 	    continue;
2561 	}
2562 
2563       /* How many bytes the node can accept?  */
2564       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2565 					   re_string_cur_idx (&mctx->input));
2566       if (naccepted == 0)
2567 	continue;
2568 
2569       /* The node can accepts `naccepted' bytes.  */
2570       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2571       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2572 			       : mctx->max_mb_elem_len);
2573       err = clean_state_log_if_needed (mctx, dest_idx);
2574       if (BE (err != REG_NOERROR, 0))
2575 	return err;
2576 #ifdef DEBUG
2577       assert (dfa->nexts[cur_node_idx] != REG_MISSING);
2578 #endif
2579       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2580 
2581       dest_state = mctx->state_log[dest_idx];
2582       if (dest_state == NULL)
2583 	dest_nodes = *new_nodes;
2584       else
2585 	{
2586 	  err = re_node_set_init_union (&dest_nodes,
2587 					dest_state->entrance_nodes, new_nodes);
2588 	  if (BE (err != REG_NOERROR, 0))
2589 	    return err;
2590 	}
2591       context = re_string_context_at (&mctx->input, dest_idx - 1,
2592 				      mctx->eflags);
2593       mctx->state_log[dest_idx]
2594 	= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2595       if (dest_state != NULL)
2596 	re_node_set_free (&dest_nodes);
2597       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2598 	return err;
2599     }
2600   return REG_NOERROR;
2601 }
2602 #endif /* RE_ENABLE_I18N */
2603 
2604 static reg_errcode_t
2605 internal_function
transit_state_bkref(re_match_context_t * mctx,const re_node_set * nodes)2606 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2607 {
2608   const re_dfa_t *const dfa = mctx->dfa;
2609   reg_errcode_t err;
2610   Idx i;
2611   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2612 
2613   for (i = 0; i < nodes->nelem; ++i)
2614     {
2615       Idx dest_str_idx, prev_nelem, bkc_idx;
2616       Idx node_idx = nodes->elems[i];
2617       unsigned int context;
2618       const re_token_t *node = dfa->nodes + node_idx;
2619       re_node_set *new_dest_nodes;
2620 
2621       /* Check whether `node' is a backreference or not.  */
2622       if (node->type != OP_BACK_REF)
2623 	continue;
2624 
2625       if (node->constraint)
2626 	{
2627 	  context = re_string_context_at (&mctx->input, cur_str_idx,
2628 					  mctx->eflags);
2629 	  if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2630 	    continue;
2631 	}
2632 
2633       /* `node' is a backreference.
2634 	 Check the substring which the substring matched.  */
2635       bkc_idx = mctx->nbkref_ents;
2636       err = get_subexp (mctx, node_idx, cur_str_idx);
2637       if (BE (err != REG_NOERROR, 0))
2638 	goto free_return;
2639 
2640       /* And add the epsilon closures (which is `new_dest_nodes') of
2641 	 the backreference to appropriate state_log.  */
2642 #ifdef DEBUG
2643       assert (dfa->nexts[node_idx] != REG_MISSING);
2644 #endif
2645       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2646 	{
2647 	  Idx subexp_len;
2648 	  re_dfastate_t *dest_state;
2649 	  struct re_backref_cache_entry *bkref_ent;
2650 	  bkref_ent = mctx->bkref_ents + bkc_idx;
2651 	  if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2652 	    continue;
2653 	  subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2654 	  new_dest_nodes = (subexp_len == 0
2655 			    ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2656 			    : dfa->eclosures + dfa->nexts[node_idx]);
2657 	  dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2658 			  - bkref_ent->subexp_from);
2659 	  context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2660 					  mctx->eflags);
2661 	  dest_state = mctx->state_log[dest_str_idx];
2662 	  prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2663 			: mctx->state_log[cur_str_idx]->nodes.nelem);
2664 	  /* Add `new_dest_node' to state_log.  */
2665 	  if (dest_state == NULL)
2666 	    {
2667 	      mctx->state_log[dest_str_idx]
2668 		= re_acquire_state_context (&err, dfa, new_dest_nodes,
2669 					    context);
2670 	      if (BE (mctx->state_log[dest_str_idx] == NULL
2671 		      && err != REG_NOERROR, 0))
2672 		goto free_return;
2673 	    }
2674 	  else
2675 	    {
2676 	      re_node_set dest_nodes;
2677 	      err = re_node_set_init_union (&dest_nodes,
2678 					    dest_state->entrance_nodes,
2679 					    new_dest_nodes);
2680 	      if (BE (err != REG_NOERROR, 0))
2681 		{
2682 		  re_node_set_free (&dest_nodes);
2683 		  goto free_return;
2684 		}
2685 	      mctx->state_log[dest_str_idx]
2686 		= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2687 	      re_node_set_free (&dest_nodes);
2688 	      if (BE (mctx->state_log[dest_str_idx] == NULL
2689 		      && err != REG_NOERROR, 0))
2690 		goto free_return;
2691 	    }
2692 	  /* We need to check recursively if the backreference can epsilon
2693 	     transit.  */
2694 	  if (subexp_len == 0
2695 	      && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2696 	    {
2697 	      err = check_subexp_matching_top (mctx, new_dest_nodes,
2698 					       cur_str_idx);
2699 	      if (BE (err != REG_NOERROR, 0))
2700 		goto free_return;
2701 	      err = transit_state_bkref (mctx, new_dest_nodes);
2702 	      if (BE (err != REG_NOERROR, 0))
2703 		goto free_return;
2704 	    }
2705 	}
2706     }
2707   err = REG_NOERROR;
2708  free_return:
2709   return err;
2710 }
2711 
2712 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2713    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2714    Note that we might collect inappropriate candidates here.
2715    However, the cost of checking them strictly here is too high, then we
2716    delay these checking for prune_impossible_nodes().  */
2717 
2718 static reg_errcode_t
2719 internal_function
get_subexp(re_match_context_t * mctx,Idx bkref_node,Idx bkref_str_idx)2720 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2721 {
2722   const re_dfa_t *const dfa = mctx->dfa;
2723   Idx subexp_num, sub_top_idx;
2724   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2725   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2726   Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2727   if (cache_idx != REG_MISSING)
2728     {
2729       const struct re_backref_cache_entry *entry
2730 	= mctx->bkref_ents + cache_idx;
2731       do
2732         if (entry->node == bkref_node)
2733 	  return REG_NOERROR; /* We already checked it.  */
2734       while (entry++->more);
2735     }
2736 
2737   subexp_num = dfa->nodes[bkref_node].opr.idx;
2738 
2739   /* For each sub expression  */
2740   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2741     {
2742       reg_errcode_t err;
2743       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2744       re_sub_match_last_t *sub_last;
2745       Idx sub_last_idx, sl_str, bkref_str_off;
2746 
2747       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2748 	continue; /* It isn't related.  */
2749 
2750       sl_str = sub_top->str_idx;
2751       bkref_str_off = bkref_str_idx;
2752       /* At first, check the last node of sub expressions we already
2753 	 evaluated.  */
2754       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2755 	{
2756 	  regoff_t sl_str_diff;
2757 	  sub_last = sub_top->lasts[sub_last_idx];
2758 	  sl_str_diff = sub_last->str_idx - sl_str;
2759 	  /* The matched string by the sub expression match with the substring
2760 	     at the back reference?  */
2761 	  if (sl_str_diff > 0)
2762 	    {
2763 	      if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2764 		{
2765 		  /* Not enough chars for a successful match.  */
2766 		  if (bkref_str_off + sl_str_diff > mctx->input.len)
2767 		    break;
2768 
2769 		  err = clean_state_log_if_needed (mctx,
2770 						   bkref_str_off
2771 						   + sl_str_diff);
2772 		  if (BE (err != REG_NOERROR, 0))
2773 		    return err;
2774 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2775 		}
2776 	      if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2777 		/* We don't need to search this sub expression any more.  */
2778 		break;
2779 	    }
2780 	  bkref_str_off += sl_str_diff;
2781 	  sl_str += sl_str_diff;
2782 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2783 				bkref_str_idx);
2784 
2785 	  /* Reload buf, since the preceding call might have reallocated
2786 	     the buffer.  */
2787 	  buf = (const char *) re_string_get_buffer (&mctx->input);
2788 
2789 	  if (err == REG_NOMATCH)
2790 	    continue;
2791 	  if (BE (err != REG_NOERROR, 0))
2792 	    return err;
2793 	}
2794 
2795       if (sub_last_idx < sub_top->nlasts)
2796 	continue;
2797       if (sub_last_idx > 0)
2798 	++sl_str;
2799       /* Then, search for the other last nodes of the sub expression.  */
2800       for (; sl_str <= bkref_str_idx; ++sl_str)
2801 	{
2802 	  Idx cls_node;
2803 	  regoff_t sl_str_off;
2804 	  const re_node_set *nodes;
2805 	  sl_str_off = sl_str - sub_top->str_idx;
2806 	  /* The matched string by the sub expression match with the substring
2807 	     at the back reference?  */
2808 	  if (sl_str_off > 0)
2809 	    {
2810 	      if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2811 		{
2812 		  /* If we are at the end of the input, we cannot match.  */
2813 		  if (bkref_str_off >= mctx->input.len)
2814 		    break;
2815 
2816 		  err = extend_buffers (mctx);
2817 		  if (BE (err != REG_NOERROR, 0))
2818 		    return err;
2819 
2820 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2821 		}
2822 	      if (buf [bkref_str_off++] != buf[sl_str - 1])
2823 		break; /* We don't need to search this sub expression
2824 			  any more.  */
2825 	    }
2826 	  if (mctx->state_log[sl_str] == NULL)
2827 	    continue;
2828 	  /* Does this state have a ')' of the sub expression?  */
2829 	  nodes = &mctx->state_log[sl_str]->nodes;
2830 	  cls_node = find_subexp_node (dfa, nodes, subexp_num,
2831 				       OP_CLOSE_SUBEXP);
2832 	  if (cls_node == REG_MISSING)
2833 	    continue; /* No.  */
2834 	  if (sub_top->path == NULL)
2835 	    {
2836 	      sub_top->path = calloc (sizeof (state_array_t),
2837 				      sl_str - sub_top->str_idx + 1);
2838 	      if (sub_top->path == NULL)
2839 		return REG_ESPACE;
2840 	    }
2841 	  /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2842 	     in the current context?  */
2843 	  err = check_arrival (mctx, sub_top->path, sub_top->node,
2844 			       sub_top->str_idx, cls_node, sl_str,
2845 			       OP_CLOSE_SUBEXP);
2846 	  if (err == REG_NOMATCH)
2847 	      continue;
2848 	  if (BE (err != REG_NOERROR, 0))
2849 	      return err;
2850 	  sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2851 	  if (BE (sub_last == NULL, 0))
2852 	    return REG_ESPACE;
2853 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2854 				bkref_str_idx);
2855 	  if (err == REG_NOMATCH)
2856 	    continue;
2857 	}
2858     }
2859   return REG_NOERROR;
2860 }
2861 
2862 /* Helper functions for get_subexp().  */
2863 
2864 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2865    If it can arrive, register the sub expression expressed with SUB_TOP
2866    and SUB_LAST.  */
2867 
2868 static reg_errcode_t
2869 internal_function
get_subexp_sub(re_match_context_t * mctx,const re_sub_match_top_t * sub_top,re_sub_match_last_t * sub_last,Idx bkref_node,Idx bkref_str)2870 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2871 		re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2872 {
2873   reg_errcode_t err;
2874   Idx to_idx;
2875   /* Can the subexpression arrive the back reference?  */
2876   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2877 		       sub_last->str_idx, bkref_node, bkref_str,
2878 		       OP_OPEN_SUBEXP);
2879   if (err != REG_NOERROR)
2880     return err;
2881   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2882 			     sub_last->str_idx);
2883   if (BE (err != REG_NOERROR, 0))
2884     return err;
2885   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2886   return clean_state_log_if_needed (mctx, to_idx);
2887 }
2888 
2889 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2890    Search '(' if FL_OPEN, or search ')' otherwise.
2891    TODO: This function isn't efficient...
2892 	 Because there might be more than one nodes whose types are
2893 	 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2894 	 nodes.
2895 	 E.g. RE: (a){2}  */
2896 
2897 static Idx
2898 internal_function
find_subexp_node(const re_dfa_t * dfa,const re_node_set * nodes,Idx subexp_idx,int type)2899 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2900 		  Idx subexp_idx, int type)
2901 {
2902   Idx cls_idx;
2903   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2904     {
2905       Idx cls_node = nodes->elems[cls_idx];
2906       const re_token_t *node = dfa->nodes + cls_node;
2907       if (node->type == type
2908 	  && node->opr.idx == subexp_idx)
2909 	return cls_node;
2910     }
2911   return REG_MISSING;
2912 }
2913 
2914 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2915    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2916    heavily reused.
2917    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2918 
2919 static reg_errcode_t
2920 internal_function
check_arrival(re_match_context_t * mctx,state_array_t * path,Idx top_node,Idx top_str,Idx last_node,Idx last_str,int type)2921 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2922 	       Idx top_str, Idx last_node, Idx last_str, int type)
2923 {
2924   const re_dfa_t *const dfa = mctx->dfa;
2925   reg_errcode_t err = REG_NOERROR;
2926   Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2927   re_dfastate_t *cur_state = NULL;
2928   re_node_set *cur_nodes, next_nodes;
2929   re_dfastate_t **backup_state_log;
2930   unsigned int context;
2931 
2932   subexp_num = dfa->nodes[top_node].opr.idx;
2933   /* Extend the buffer if we need.  */
2934   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2935     {
2936       re_dfastate_t **new_array;
2937       Idx old_alloc = path->alloc;
2938       Idx new_alloc = old_alloc + last_str + mctx->max_mb_elem_len + 1;
2939       if (BE (new_alloc < old_alloc, 0)
2940 	  || BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
2941 	return REG_ESPACE;
2942       new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2943       if (BE (new_array == NULL, 0))
2944 	return REG_ESPACE;
2945       path->array = new_array;
2946       path->alloc = new_alloc;
2947       memset (new_array + old_alloc, '\0',
2948 	      sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2949     }
2950 
2951   str_idx = path->next_idx ? path->next_idx : top_str;
2952 
2953   /* Temporary modify MCTX.  */
2954   backup_state_log = mctx->state_log;
2955   backup_cur_idx = mctx->input.cur_idx;
2956   mctx->state_log = path->array;
2957   mctx->input.cur_idx = str_idx;
2958 
2959   /* Setup initial node set.  */
2960   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2961   if (str_idx == top_str)
2962     {
2963       err = re_node_set_init_1 (&next_nodes, top_node);
2964       if (BE (err != REG_NOERROR, 0))
2965 	return err;
2966       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2967       if (BE (err != REG_NOERROR, 0))
2968 	{
2969 	  re_node_set_free (&next_nodes);
2970 	  return err;
2971 	}
2972     }
2973   else
2974     {
2975       cur_state = mctx->state_log[str_idx];
2976       if (cur_state && cur_state->has_backref)
2977 	{
2978 	  err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2979 	  if (BE (err != REG_NOERROR, 0))
2980 	    return err;
2981 	}
2982       else
2983 	re_node_set_init_empty (&next_nodes);
2984     }
2985   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2986     {
2987       if (next_nodes.nelem)
2988 	{
2989 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2990 				    subexp_num, type);
2991 	  if (BE (err != REG_NOERROR, 0))
2992 	    {
2993 	      re_node_set_free (&next_nodes);
2994 	      return err;
2995 	    }
2996 	}
2997       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2998       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2999 	{
3000 	  re_node_set_free (&next_nodes);
3001 	  return err;
3002 	}
3003       mctx->state_log[str_idx] = cur_state;
3004     }
3005 
3006   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
3007     {
3008       re_node_set_empty (&next_nodes);
3009       if (mctx->state_log[str_idx + 1])
3010 	{
3011 	  err = re_node_set_merge (&next_nodes,
3012 				   &mctx->state_log[str_idx + 1]->nodes);
3013 	  if (BE (err != REG_NOERROR, 0))
3014 	    {
3015 	      re_node_set_free (&next_nodes);
3016 	      return err;
3017 	    }
3018 	}
3019       if (cur_state)
3020 	{
3021 	  err = check_arrival_add_next_nodes (mctx, str_idx,
3022 					      &cur_state->non_eps_nodes,
3023 					      &next_nodes);
3024 	  if (BE (err != REG_NOERROR, 0))
3025 	    {
3026 	      re_node_set_free (&next_nodes);
3027 	      return err;
3028 	    }
3029 	}
3030       ++str_idx;
3031       if (next_nodes.nelem)
3032 	{
3033 	  err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3034 	  if (BE (err != REG_NOERROR, 0))
3035 	    {
3036 	      re_node_set_free (&next_nodes);
3037 	      return err;
3038 	    }
3039 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3040 				    subexp_num, type);
3041 	  if (BE (err != REG_NOERROR, 0))
3042 	    {
3043 	      re_node_set_free (&next_nodes);
3044 	      return err;
3045 	    }
3046 	}
3047       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3048       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3049       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3050 	{
3051 	  re_node_set_free (&next_nodes);
3052 	  return err;
3053 	}
3054       mctx->state_log[str_idx] = cur_state;
3055       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3056     }
3057   re_node_set_free (&next_nodes);
3058   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3059 	       : &mctx->state_log[last_str]->nodes);
3060   path->next_idx = str_idx;
3061 
3062   /* Fix MCTX.  */
3063   mctx->state_log = backup_state_log;
3064   mctx->input.cur_idx = backup_cur_idx;
3065 
3066   /* Then check the current node set has the node LAST_NODE.  */
3067   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3068     return REG_NOERROR;
3069 
3070   return REG_NOMATCH;
3071 }
3072 
3073 /* Helper functions for check_arrival.  */
3074 
3075 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3076    to NEXT_NODES.
3077    TODO: This function is similar to the functions transit_state*(),
3078 	 however this function has many additional works.
3079 	 Can't we unify them?  */
3080 
3081 static reg_errcode_t
3082 internal_function
check_arrival_add_next_nodes(re_match_context_t * mctx,Idx str_idx,re_node_set * cur_nodes,re_node_set * next_nodes)3083 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3084 			      re_node_set *cur_nodes, re_node_set *next_nodes)
3085 {
3086   const re_dfa_t *const dfa = mctx->dfa;
3087   bool ok;
3088   Idx cur_idx;
3089 #ifdef RE_ENABLE_I18N
3090   reg_errcode_t err = REG_NOERROR;
3091 #endif
3092   re_node_set union_set;
3093   re_node_set_init_empty (&union_set);
3094   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3095     {
3096       int naccepted = 0;
3097       Idx cur_node = cur_nodes->elems[cur_idx];
3098 #ifdef DEBUG
3099       re_token_type_t type = dfa->nodes[cur_node].type;
3100       assert (!IS_EPSILON_NODE (type));
3101 #endif
3102 #ifdef RE_ENABLE_I18N
3103       /* If the node may accept `multi byte'.  */
3104       if (dfa->nodes[cur_node].accept_mb)
3105 	{
3106 	  naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3107 					       str_idx);
3108 	  if (naccepted > 1)
3109 	    {
3110 	      re_dfastate_t *dest_state;
3111 	      Idx next_node = dfa->nexts[cur_node];
3112 	      Idx next_idx = str_idx + naccepted;
3113 	      dest_state = mctx->state_log[next_idx];
3114 	      re_node_set_empty (&union_set);
3115 	      if (dest_state)
3116 		{
3117 		  err = re_node_set_merge (&union_set, &dest_state->nodes);
3118 		  if (BE (err != REG_NOERROR, 0))
3119 		    {
3120 		      re_node_set_free (&union_set);
3121 		      return err;
3122 		    }
3123 		}
3124 	      ok = re_node_set_insert (&union_set, next_node);
3125 	      if (BE (! ok, 0))
3126 		{
3127 		  re_node_set_free (&union_set);
3128 		  return REG_ESPACE;
3129 		}
3130 	      mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3131 							    &union_set);
3132 	      if (BE (mctx->state_log[next_idx] == NULL
3133 		      && err != REG_NOERROR, 0))
3134 		{
3135 		  re_node_set_free (&union_set);
3136 		  return err;
3137 		}
3138 	    }
3139 	}
3140 #endif /* RE_ENABLE_I18N */
3141       if (naccepted
3142 	  || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3143 	{
3144 	  ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3145 	  if (BE (! ok, 0))
3146 	    {
3147 	      re_node_set_free (&union_set);
3148 	      return REG_ESPACE;
3149 	    }
3150 	}
3151     }
3152   re_node_set_free (&union_set);
3153   return REG_NOERROR;
3154 }
3155 
3156 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3157    CUR_NODES, however exclude the nodes which are:
3158     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3159     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3160 */
3161 
3162 static reg_errcode_t
3163 internal_function
check_arrival_expand_ecl(const re_dfa_t * dfa,re_node_set * cur_nodes,Idx ex_subexp,int type)3164 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3165 			  Idx ex_subexp, int type)
3166 {
3167   reg_errcode_t err;
3168   Idx idx, outside_node;
3169   re_node_set new_nodes;
3170 #ifdef DEBUG
3171   assert (cur_nodes->nelem);
3172 #endif
3173   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3174   if (BE (err != REG_NOERROR, 0))
3175     return err;
3176   /* Create a new node set NEW_NODES with the nodes which are epsilon
3177      closures of the node in CUR_NODES.  */
3178 
3179   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3180     {
3181       Idx cur_node = cur_nodes->elems[idx];
3182       const re_node_set *eclosure = dfa->eclosures + cur_node;
3183       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3184       if (outside_node == REG_MISSING)
3185 	{
3186 	  /* There are no problematic nodes, just merge them.  */
3187 	  err = re_node_set_merge (&new_nodes, eclosure);
3188 	  if (BE (err != REG_NOERROR, 0))
3189 	    {
3190 	      re_node_set_free (&new_nodes);
3191 	      return err;
3192 	    }
3193 	}
3194       else
3195 	{
3196 	  /* There are problematic nodes, re-calculate incrementally.  */
3197 	  err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3198 					      ex_subexp, type);
3199 	  if (BE (err != REG_NOERROR, 0))
3200 	    {
3201 	      re_node_set_free (&new_nodes);
3202 	      return err;
3203 	    }
3204 	}
3205     }
3206   re_node_set_free (cur_nodes);
3207   *cur_nodes = new_nodes;
3208   return REG_NOERROR;
3209 }
3210 
3211 /* Helper function for check_arrival_expand_ecl.
3212    Check incrementally the epsilon closure of TARGET, and if it isn't
3213    problematic append it to DST_NODES.  */
3214 
3215 static reg_errcode_t
3216 internal_function
check_arrival_expand_ecl_sub(const re_dfa_t * dfa,re_node_set * dst_nodes,Idx target,Idx ex_subexp,int type)3217 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3218 			      Idx target, Idx ex_subexp, int type)
3219 {
3220   Idx cur_node;
3221   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3222     {
3223       bool ok;
3224 
3225       if (dfa->nodes[cur_node].type == type
3226 	  && dfa->nodes[cur_node].opr.idx == ex_subexp)
3227 	{
3228 	  if (type == OP_CLOSE_SUBEXP)
3229 	    {
3230 	      ok = re_node_set_insert (dst_nodes, cur_node);
3231 	      if (BE (! ok, 0))
3232 		return REG_ESPACE;
3233 	    }
3234 	  break;
3235 	}
3236       ok = re_node_set_insert (dst_nodes, cur_node);
3237       if (BE (! ok, 0))
3238 	return REG_ESPACE;
3239       if (dfa->edests[cur_node].nelem == 0)
3240 	break;
3241       if (dfa->edests[cur_node].nelem == 2)
3242 	{
3243 	  reg_errcode_t err;
3244 	  err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3245 					      dfa->edests[cur_node].elems[1],
3246 					      ex_subexp, type);
3247 	  if (BE (err != REG_NOERROR, 0))
3248 	    return err;
3249 	}
3250       cur_node = dfa->edests[cur_node].elems[0];
3251     }
3252   return REG_NOERROR;
3253 }
3254 
3255 
3256 /* For all the back references in the current state, calculate the
3257    destination of the back references by the appropriate entry
3258    in MCTX->BKREF_ENTS.  */
3259 
3260 static reg_errcode_t
3261 internal_function
expand_bkref_cache(re_match_context_t * mctx,re_node_set * cur_nodes,Idx cur_str,Idx subexp_num,int type)3262 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3263 		    Idx cur_str, Idx subexp_num, int type)
3264 {
3265   const re_dfa_t *const dfa = mctx->dfa;
3266   reg_errcode_t err;
3267   Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3268   struct re_backref_cache_entry *ent;
3269 
3270   if (cache_idx_start == REG_MISSING)
3271     return REG_NOERROR;
3272 
3273  restart:
3274   ent = mctx->bkref_ents + cache_idx_start;
3275   do
3276     {
3277       Idx to_idx, next_node;
3278 
3279       /* Is this entry ENT is appropriate?  */
3280       if (!re_node_set_contains (cur_nodes, ent->node))
3281 	continue; /* No.  */
3282 
3283       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3284       /* Calculate the destination of the back reference, and append it
3285 	 to MCTX->STATE_LOG.  */
3286       if (to_idx == cur_str)
3287 	{
3288 	  /* The backreference did epsilon transit, we must re-check all the
3289 	     node in the current state.  */
3290 	  re_node_set new_dests;
3291 	  reg_errcode_t err2, err3;
3292 	  next_node = dfa->edests[ent->node].elems[0];
3293 	  if (re_node_set_contains (cur_nodes, next_node))
3294 	    continue;
3295 	  err = re_node_set_init_1 (&new_dests, next_node);
3296 	  err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3297 	  err3 = re_node_set_merge (cur_nodes, &new_dests);
3298 	  re_node_set_free (&new_dests);
3299 	  if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3300 		  || err3 != REG_NOERROR, 0))
3301 	    {
3302 	      err = (err != REG_NOERROR ? err
3303 		     : (err2 != REG_NOERROR ? err2 : err3));
3304 	      return err;
3305 	    }
3306 	  /* TODO: It is still inefficient...  */
3307 	  goto restart;
3308 	}
3309       else
3310 	{
3311 	  re_node_set union_set;
3312 	  next_node = dfa->nexts[ent->node];
3313 	  if (mctx->state_log[to_idx])
3314 	    {
3315 	      bool ok;
3316 	      if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3317 					next_node))
3318 		continue;
3319 	      err = re_node_set_init_copy (&union_set,
3320 					   &mctx->state_log[to_idx]->nodes);
3321 	      ok = re_node_set_insert (&union_set, next_node);
3322 	      if (BE (err != REG_NOERROR || ! ok, 0))
3323 		{
3324 		  re_node_set_free (&union_set);
3325 		  err = err != REG_NOERROR ? err : REG_ESPACE;
3326 		  return err;
3327 		}
3328 	    }
3329 	  else
3330 	    {
3331 	      err = re_node_set_init_1 (&union_set, next_node);
3332 	      if (BE (err != REG_NOERROR, 0))
3333 		return err;
3334 	    }
3335 	  mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3336 	  re_node_set_free (&union_set);
3337 	  if (BE (mctx->state_log[to_idx] == NULL
3338 		  && err != REG_NOERROR, 0))
3339 	    return err;
3340 	}
3341     }
3342   while (ent++->more);
3343   return REG_NOERROR;
3344 }
3345 
3346 /* Build transition table for the state.
3347    Return true if successful.  */
3348 
3349 static bool
3350 internal_function
build_trtable(const re_dfa_t * dfa,re_dfastate_t * state)3351 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3352 {
3353   reg_errcode_t err;
3354   Idx i, j;
3355   int ch;
3356   bool need_word_trtable = false;
3357   bitset_word_t elem, mask;
3358   bool dests_node_malloced = false;
3359   bool dest_states_malloced = false;
3360   Idx ndests; /* Number of the destination states from `state'.  */
3361   re_dfastate_t **trtable;
3362   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3363   re_node_set follows, *dests_node;
3364   bitset_t *dests_ch;
3365   bitset_t acceptable;
3366 
3367   struct dests_alloc
3368   {
3369     re_node_set dests_node[SBC_MAX];
3370     bitset_t dests_ch[SBC_MAX];
3371   } *dests_alloc;
3372 
3373   /* We build DFA states which corresponds to the destination nodes
3374      from `state'.  `dests_node[i]' represents the nodes which i-th
3375      destination state contains, and `dests_ch[i]' represents the
3376      characters which i-th destination state accepts.  */
3377   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3378     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3379   else
3380     {
3381       dests_alloc = re_malloc (struct dests_alloc, 1);
3382       if (BE (dests_alloc == NULL, 0))
3383 	return false;
3384       dests_node_malloced = true;
3385     }
3386   dests_node = dests_alloc->dests_node;
3387   dests_ch = dests_alloc->dests_ch;
3388 
3389   /* Initialize transiton table.  */
3390   state->word_trtable = state->trtable = NULL;
3391 
3392   /* At first, group all nodes belonging to `state' into several
3393      destinations.  */
3394   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3395   if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0))
3396     {
3397       if (dests_node_malloced)
3398 	free (dests_alloc);
3399       if (ndests == 0)
3400 	{
3401 	  state->trtable = (re_dfastate_t **)
3402 	    calloc (sizeof (re_dfastate_t *), SBC_MAX);
3403 	  return true;
3404 	}
3405       return false;
3406     }
3407 
3408   err = re_node_set_alloc (&follows, ndests + 1);
3409   if (BE (err != REG_NOERROR, 0))
3410     goto out_free;
3411 
3412   /* Avoid arithmetic overflow in size calculation.  */
3413   if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3414 	    / (3 * sizeof (re_dfastate_t *)))
3415 	   < ndests),
3416 	  0))
3417     goto out_free;
3418 
3419   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3420 			 + ndests * 3 * sizeof (re_dfastate_t *)))
3421     dest_states = (re_dfastate_t **)
3422       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3423   else
3424     {
3425       dest_states = (re_dfastate_t **)
3426 	malloc (ndests * 3 * sizeof (re_dfastate_t *));
3427       if (BE (dest_states == NULL, 0))
3428 	{
3429 out_free:
3430 	  if (dest_states_malloced)
3431 	    free (dest_states);
3432 	  re_node_set_free (&follows);
3433 	  for (i = 0; i < ndests; ++i)
3434 	    re_node_set_free (dests_node + i);
3435 	  if (dests_node_malloced)
3436 	    free (dests_alloc);
3437 	  return false;
3438 	}
3439       dest_states_malloced = true;
3440     }
3441   dest_states_word = dest_states + ndests;
3442   dest_states_nl = dest_states_word + ndests;
3443   bitset_empty (acceptable);
3444 
3445   /* Then build the states for all destinations.  */
3446   for (i = 0; i < ndests; ++i)
3447     {
3448       Idx next_node;
3449       re_node_set_empty (&follows);
3450       /* Merge the follows of this destination states.  */
3451       for (j = 0; j < dests_node[i].nelem; ++j)
3452 	{
3453 	  next_node = dfa->nexts[dests_node[i].elems[j]];
3454 	  if (next_node != REG_MISSING)
3455 	    {
3456 	      err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3457 	      if (BE (err != REG_NOERROR, 0))
3458 		goto out_free;
3459 	    }
3460 	}
3461       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3462       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3463 	goto out_free;
3464       /* If the new state has context constraint,
3465 	 build appropriate states for these contexts.  */
3466       if (dest_states[i]->has_constraint)
3467 	{
3468 	  dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3469 							  CONTEXT_WORD);
3470 	  if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3471 	    goto out_free;
3472 
3473 	  if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3474 	    need_word_trtable = true;
3475 
3476 	  dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3477 							CONTEXT_NEWLINE);
3478 	  if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3479 	    goto out_free;
3480 	}
3481       else
3482 	{
3483 	  dest_states_word[i] = dest_states[i];
3484 	  dest_states_nl[i] = dest_states[i];
3485 	}
3486       bitset_merge (acceptable, dests_ch[i]);
3487     }
3488 
3489   if (!BE (need_word_trtable, 0))
3490     {
3491       /* We don't care about whether the following character is a word
3492 	 character, or we are in a single-byte character set so we can
3493 	 discern by looking at the character code: allocate a
3494 	 256-entry transition table.  */
3495       trtable = state->trtable =
3496 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3497       if (BE (trtable == NULL, 0))
3498 	goto out_free;
3499 
3500       /* For all characters ch...:  */
3501       for (i = 0; i < BITSET_WORDS; ++i)
3502 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3503 	     elem;
3504 	     mask <<= 1, elem >>= 1, ++ch)
3505 	  if (BE (elem & 1, 0))
3506 	    {
3507 	      /* There must be exactly one destination which accepts
3508 		 character ch.  See group_nodes_into_DFAstates.  */
3509 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3510 		;
3511 
3512 	      /* j-th destination accepts the word character ch.  */
3513 	      if (dfa->word_char[i] & mask)
3514 		trtable[ch] = dest_states_word[j];
3515 	      else
3516 		trtable[ch] = dest_states[j];
3517 	    }
3518     }
3519   else
3520     {
3521       /* We care about whether the following character is a word
3522 	 character, and we are in a multi-byte character set: discern
3523 	 by looking at the character code: build two 256-entry
3524 	 transition tables, one starting at trtable[0] and one
3525 	 starting at trtable[SBC_MAX].  */
3526       trtable = state->word_trtable =
3527 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3528       if (BE (trtable == NULL, 0))
3529 	goto out_free;
3530 
3531       /* For all characters ch...:  */
3532       for (i = 0; i < BITSET_WORDS; ++i)
3533 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3534 	     elem;
3535 	     mask <<= 1, elem >>= 1, ++ch)
3536 	  if (BE (elem & 1, 0))
3537 	    {
3538 	      /* There must be exactly one destination which accepts
3539 		 character ch.  See group_nodes_into_DFAstates.  */
3540 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3541 		;
3542 
3543 	      /* j-th destination accepts the word character ch.  */
3544 	      trtable[ch] = dest_states[j];
3545 	      trtable[ch + SBC_MAX] = dest_states_word[j];
3546 	    }
3547     }
3548 
3549   /* new line */
3550   if (bitset_contain (acceptable, NEWLINE_CHAR))
3551     {
3552       /* The current state accepts newline character.  */
3553       for (j = 0; j < ndests; ++j)
3554 	if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3555 	  {
3556 	    /* k-th destination accepts newline character.  */
3557 	    trtable[NEWLINE_CHAR] = dest_states_nl[j];
3558 	    if (need_word_trtable)
3559 	      trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3560 	    /* There must be only one destination which accepts
3561 	       newline.  See group_nodes_into_DFAstates.  */
3562 	    break;
3563 	  }
3564     }
3565 
3566   if (dest_states_malloced)
3567     free (dest_states);
3568 
3569   re_node_set_free (&follows);
3570   for (i = 0; i < ndests; ++i)
3571     re_node_set_free (dests_node + i);
3572 
3573   if (dests_node_malloced)
3574     free (dests_alloc);
3575 
3576   return true;
3577 }
3578 
3579 /* Group all nodes belonging to STATE into several destinations.
3580    Then for all destinations, set the nodes belonging to the destination
3581    to DESTS_NODE[i] and set the characters accepted by the destination
3582    to DEST_CH[i].  This function return the number of destinations.  */
3583 
3584 static Idx
3585 internal_function
group_nodes_into_DFAstates(const re_dfa_t * dfa,const re_dfastate_t * state,re_node_set * dests_node,bitset_t * dests_ch)3586 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3587 			    re_node_set *dests_node, bitset_t *dests_ch)
3588 {
3589   reg_errcode_t err;
3590   bool ok;
3591   Idx i, j, k;
3592   Idx ndests; /* Number of the destinations from `state'.  */
3593   bitset_t accepts; /* Characters a node can accept.  */
3594   const re_node_set *cur_nodes = &state->nodes;
3595   bitset_empty (accepts);
3596   ndests = 0;
3597 
3598   /* For all the nodes belonging to `state',  */
3599   for (i = 0; i < cur_nodes->nelem; ++i)
3600     {
3601       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3602       re_token_type_t type = node->type;
3603       unsigned int constraint = node->constraint;
3604 
3605       /* Enumerate all single byte character this node can accept.  */
3606       if (type == CHARACTER)
3607 	bitset_set (accepts, node->opr.c);
3608       else if (type == SIMPLE_BRACKET)
3609 	{
3610 	  bitset_merge (accepts, node->opr.sbcset);
3611 	}
3612       else if (type == OP_PERIOD)
3613 	{
3614 #ifdef RE_ENABLE_I18N
3615 	  if (dfa->mb_cur_max > 1)
3616 	    bitset_merge (accepts, dfa->sb_char);
3617 	  else
3618 #endif
3619 	    bitset_set_all (accepts);
3620 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3621 	    bitset_clear (accepts, '\n');
3622 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3623 	    bitset_clear (accepts, '\0');
3624 	}
3625 #ifdef RE_ENABLE_I18N
3626       else if (type == OP_UTF8_PERIOD)
3627         {
3628 	  if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3629 	    memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3630 	  else
3631 	    bitset_merge (accepts, utf8_sb_map);
3632 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3633 	    bitset_clear (accepts, '\n');
3634 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3635 	    bitset_clear (accepts, '\0');
3636         }
3637 #endif
3638       else
3639 	continue;
3640 
3641       /* Check the `accepts' and sift the characters which are not
3642 	 match it the context.  */
3643       if (constraint)
3644 	{
3645 	  if (constraint & NEXT_NEWLINE_CONSTRAINT)
3646 	    {
3647 	      bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3648 	      bitset_empty (accepts);
3649 	      if (accepts_newline)
3650 		bitset_set (accepts, NEWLINE_CHAR);
3651 	      else
3652 		continue;
3653 	    }
3654 	  if (constraint & NEXT_ENDBUF_CONSTRAINT)
3655 	    {
3656 	      bitset_empty (accepts);
3657 	      continue;
3658 	    }
3659 
3660 	  if (constraint & NEXT_WORD_CONSTRAINT)
3661 	    {
3662 	      bitset_word_t any_set = 0;
3663 	      if (type == CHARACTER && !node->word_char)
3664 		{
3665 		  bitset_empty (accepts);
3666 		  continue;
3667 		}
3668 #ifdef RE_ENABLE_I18N
3669 	      if (dfa->mb_cur_max > 1)
3670 		for (j = 0; j < BITSET_WORDS; ++j)
3671 		  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3672 	      else
3673 #endif
3674 		for (j = 0; j < BITSET_WORDS; ++j)
3675 		  any_set |= (accepts[j] &= dfa->word_char[j]);
3676 	      if (!any_set)
3677 		continue;
3678 	    }
3679 	  if (constraint & NEXT_NOTWORD_CONSTRAINT)
3680 	    {
3681 	      bitset_word_t any_set = 0;
3682 	      if (type == CHARACTER && node->word_char)
3683 		{
3684 		  bitset_empty (accepts);
3685 		  continue;
3686 		}
3687 #ifdef RE_ENABLE_I18N
3688 	      if (dfa->mb_cur_max > 1)
3689 		for (j = 0; j < BITSET_WORDS; ++j)
3690 		  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3691 	      else
3692 #endif
3693 		for (j = 0; j < BITSET_WORDS; ++j)
3694 		  any_set |= (accepts[j] &= ~dfa->word_char[j]);
3695 	      if (!any_set)
3696 		continue;
3697 	    }
3698 	}
3699 
3700       /* Then divide `accepts' into DFA states, or create a new
3701 	 state.  Above, we make sure that accepts is not empty.  */
3702       for (j = 0; j < ndests; ++j)
3703 	{
3704 	  bitset_t intersec; /* Intersection sets, see below.  */
3705 	  bitset_t remains;
3706 	  /* Flags, see below.  */
3707 	  bitset_word_t has_intersec, not_subset, not_consumed;
3708 
3709 	  /* Optimization, skip if this state doesn't accept the character.  */
3710 	  if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3711 	    continue;
3712 
3713 	  /* Enumerate the intersection set of this state and `accepts'.  */
3714 	  has_intersec = 0;
3715 	  for (k = 0; k < BITSET_WORDS; ++k)
3716 	    has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3717 	  /* And skip if the intersection set is empty.  */
3718 	  if (!has_intersec)
3719 	    continue;
3720 
3721 	  /* Then check if this state is a subset of `accepts'.  */
3722 	  not_subset = not_consumed = 0;
3723 	  for (k = 0; k < BITSET_WORDS; ++k)
3724 	    {
3725 	      not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3726 	      not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3727 	    }
3728 
3729 	  /* If this state isn't a subset of `accepts', create a
3730 	     new group state, which has the `remains'. */
3731 	  if (not_subset)
3732 	    {
3733 	      bitset_copy (dests_ch[ndests], remains);
3734 	      bitset_copy (dests_ch[j], intersec);
3735 	      err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3736 	      if (BE (err != REG_NOERROR, 0))
3737 		goto error_return;
3738 	      ++ndests;
3739 	    }
3740 
3741 	  /* Put the position in the current group. */
3742 	  ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3743 	  if (BE (! ok, 0))
3744 	    goto error_return;
3745 
3746 	  /* If all characters are consumed, go to next node. */
3747 	  if (!not_consumed)
3748 	    break;
3749 	}
3750       /* Some characters remain, create a new group. */
3751       if (j == ndests)
3752 	{
3753 	  bitset_copy (dests_ch[ndests], accepts);
3754 	  err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3755 	  if (BE (err != REG_NOERROR, 0))
3756 	    goto error_return;
3757 	  ++ndests;
3758 	  bitset_empty (accepts);
3759 	}
3760     }
3761   return ndests;
3762  error_return:
3763   for (j = 0; j < ndests; ++j)
3764     re_node_set_free (dests_node + j);
3765   return REG_MISSING;
3766 }
3767 
3768 #ifdef RE_ENABLE_I18N
3769 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3770    Return the number of the bytes the node accepts.
3771    STR_IDX is the current index of the input string.
3772 
3773    This function handles the nodes which can accept one character, or
3774    one collating element like '.', '[a-z]', opposite to the other nodes
3775    can only accept one byte.  */
3776 
3777 static int
3778 internal_function
check_node_accept_bytes(const re_dfa_t * dfa,Idx node_idx,const re_string_t * input,Idx str_idx)3779 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3780 			 const re_string_t *input, Idx str_idx)
3781 {
3782   const re_token_t *node = dfa->nodes + node_idx;
3783   int char_len, elem_len;
3784   Idx i;
3785 
3786   if (BE (node->type == OP_UTF8_PERIOD, 0))
3787     {
3788       unsigned char c = re_string_byte_at (input, str_idx), d;
3789       if (BE (c < 0xc2, 1))
3790 	return 0;
3791 
3792       if (str_idx + 2 > input->len)
3793 	return 0;
3794 
3795       d = re_string_byte_at (input, str_idx + 1);
3796       if (c < 0xe0)
3797 	return (d < 0x80 || d > 0xbf) ? 0 : 2;
3798       else if (c < 0xf0)
3799 	{
3800 	  char_len = 3;
3801 	  if (c == 0xe0 && d < 0xa0)
3802 	    return 0;
3803 	}
3804       else if (c < 0xf8)
3805 	{
3806 	  char_len = 4;
3807 	  if (c == 0xf0 && d < 0x90)
3808 	    return 0;
3809 	}
3810       else if (c < 0xfc)
3811 	{
3812 	  char_len = 5;
3813 	  if (c == 0xf8 && d < 0x88)
3814 	    return 0;
3815 	}
3816       else if (c < 0xfe)
3817 	{
3818 	  char_len = 6;
3819 	  if (c == 0xfc && d < 0x84)
3820 	    return 0;
3821 	}
3822       else
3823 	return 0;
3824 
3825       if (str_idx + char_len > input->len)
3826 	return 0;
3827 
3828       for (i = 1; i < char_len; ++i)
3829 	{
3830 	  d = re_string_byte_at (input, str_idx + i);
3831 	  if (d < 0x80 || d > 0xbf)
3832 	    return 0;
3833 	}
3834       return char_len;
3835     }
3836 
3837   char_len = re_string_char_size_at (input, str_idx);
3838   if (node->type == OP_PERIOD)
3839     {
3840       if (char_len <= 1)
3841         return 0;
3842       /* FIXME: I don't think this if is needed, as both '\n'
3843 	 and '\0' are char_len == 1.  */
3844       /* '.' accepts any one character except the following two cases.  */
3845       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3846 	   re_string_byte_at (input, str_idx) == '\n') ||
3847 	  ((dfa->syntax & RE_DOT_NOT_NULL) &&
3848 	   re_string_byte_at (input, str_idx) == '\0'))
3849 	return 0;
3850       return char_len;
3851     }
3852 
3853   elem_len = re_string_elem_size_at (input, str_idx);
3854   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3855     return 0;
3856 
3857   if (node->type == COMPLEX_BRACKET)
3858     {
3859       const re_charset_t *cset = node->opr.mbcset;
3860 # ifdef _LIBC
3861       const unsigned char *pin
3862 	= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3863       Idx j;
3864       uint32_t nrules;
3865 # endif /* _LIBC */
3866       int match_len = 0;
3867       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3868 		    ? re_string_wchar_at (input, str_idx) : 0);
3869 
3870       /* match with multibyte character?  */
3871       for (i = 0; i < cset->nmbchars; ++i)
3872 	if (wc == cset->mbchars[i])
3873 	  {
3874 	    match_len = char_len;
3875 	    goto check_node_accept_bytes_match;
3876 	  }
3877       /* match with character_class?  */
3878       for (i = 0; i < cset->nchar_classes; ++i)
3879 	{
3880 	  wctype_t wt = cset->char_classes[i];
3881 	  if (__iswctype (wc, wt))
3882 	    {
3883 	      match_len = char_len;
3884 	      goto check_node_accept_bytes_match;
3885 	    }
3886 	}
3887 
3888 # ifdef _LIBC
3889       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3890       if (nrules != 0)
3891 	{
3892 	  unsigned int in_collseq = 0;
3893 	  const int32_t *table, *indirect;
3894 	  const unsigned char *weights, *extra;
3895 	  const char *collseqwc;
3896 	  int32_t idx;
3897 	  /* This #include defines a local function!  */
3898 #  include <locale/weight.h>
3899 
3900 	  /* match with collating_symbol?  */
3901 	  if (cset->ncoll_syms)
3902 	    extra = (const unsigned char *)
3903 	      _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3904 	  for (i = 0; i < cset->ncoll_syms; ++i)
3905 	    {
3906 	      const unsigned char *coll_sym = extra + cset->coll_syms[i];
3907 	      /* Compare the length of input collating element and
3908 		 the length of current collating element.  */
3909 	      if (*coll_sym != elem_len)
3910 		continue;
3911 	      /* Compare each bytes.  */
3912 	      for (j = 0; j < *coll_sym; j++)
3913 		if (pin[j] != coll_sym[1 + j])
3914 		  break;
3915 	      if (j == *coll_sym)
3916 		{
3917 		  /* Match if every bytes is equal.  */
3918 		  match_len = j;
3919 		  goto check_node_accept_bytes_match;
3920 		}
3921 	    }
3922 
3923 	  if (cset->nranges)
3924 	    {
3925 	      if (elem_len <= char_len)
3926 		{
3927 		  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3928 		  in_collseq = __collseq_table_lookup (collseqwc, wc);
3929 		}
3930 	      else
3931 		in_collseq = find_collation_sequence_value (pin, elem_len);
3932 	    }
3933 	  /* match with range expression?  */
3934 	  for (i = 0; i < cset->nranges; ++i)
3935 	    if (cset->range_starts[i] <= in_collseq
3936 		&& in_collseq <= cset->range_ends[i])
3937 	      {
3938 		match_len = elem_len;
3939 		goto check_node_accept_bytes_match;
3940 	      }
3941 
3942 	  /* match with equivalence_class?  */
3943 	  if (cset->nequiv_classes)
3944 	    {
3945 	      const unsigned char *cp = pin;
3946 	      table = (const int32_t *)
3947 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3948 	      weights = (const unsigned char *)
3949 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3950 	      extra = (const unsigned char *)
3951 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3952 	      indirect = (const int32_t *)
3953 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3954 	      idx = findidx (&cp);
3955 	      if (idx > 0)
3956 		for (i = 0; i < cset->nequiv_classes; ++i)
3957 		  {
3958 		    int32_t equiv_class_idx = cset->equiv_classes[i];
3959 		    size_t weight_len = weights[idx];
3960 		    if (weight_len == weights[equiv_class_idx])
3961 		      {
3962 			Idx cnt = 0;
3963 			while (cnt <= weight_len
3964 			       && (weights[equiv_class_idx + 1 + cnt]
3965 				   == weights[idx + 1 + cnt]))
3966 			  ++cnt;
3967 			if (cnt > weight_len)
3968 			  {
3969 			    match_len = elem_len;
3970 			    goto check_node_accept_bytes_match;
3971 			  }
3972 		      }
3973 		  }
3974 	    }
3975 	}
3976       else
3977 # endif /* _LIBC */
3978 	{
3979 	  /* match with range expression?  */
3980 #if __GNUC__ >= 2 && ! (__STDC_VERSION__ < 199901L && __STRICT_ANSI__)
3981 	  wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3982 #else
3983 	  wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3984 	  cmp_buf[2] = wc;
3985 #endif
3986 	  for (i = 0; i < cset->nranges; ++i)
3987 	    {
3988 	      cmp_buf[0] = cset->range_starts[i];
3989 	      cmp_buf[4] = cset->range_ends[i];
3990 	      if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3991 		  && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3992 		{
3993 		  match_len = char_len;
3994 		  goto check_node_accept_bytes_match;
3995 		}
3996 	    }
3997 	}
3998     check_node_accept_bytes_match:
3999       if (!cset->non_match)
4000 	return match_len;
4001       else
4002 	{
4003 	  if (match_len > 0)
4004 	    return 0;
4005 	  else
4006 	    return (elem_len > char_len) ? elem_len : char_len;
4007 	}
4008     }
4009   return 0;
4010 }
4011 
4012 # ifdef _LIBC
4013 static unsigned int
4014 internal_function
find_collation_sequence_value(const unsigned char * mbs,size_t mbs_len)4015 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
4016 {
4017   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4018   if (nrules == 0)
4019     {
4020       if (mbs_len == 1)
4021 	{
4022 	  /* No valid character.  Match it as a single byte character.  */
4023 	  const unsigned char *collseq = (const unsigned char *)
4024 	    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4025 	  return collseq[mbs[0]];
4026 	}
4027       return UINT_MAX;
4028     }
4029   else
4030     {
4031       int32_t idx;
4032       const unsigned char *extra = (const unsigned char *)
4033 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4034       int32_t extrasize = (const unsigned char *)
4035 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4036 
4037       for (idx = 0; idx < extrasize;)
4038 	{
4039 	  int mbs_cnt;
4040 	  bool found = false;
4041 	  int32_t elem_mbs_len;
4042 	  /* Skip the name of collating element name.  */
4043 	  idx = idx + extra[idx] + 1;
4044 	  elem_mbs_len = extra[idx++];
4045 	  if (mbs_len == elem_mbs_len)
4046 	    {
4047 	      for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4048 		if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4049 		  break;
4050 	      if (mbs_cnt == elem_mbs_len)
4051 		/* Found the entry.  */
4052 		found = true;
4053 	    }
4054 	  /* Skip the byte sequence of the collating element.  */
4055 	  idx += elem_mbs_len;
4056 	  /* Adjust for the alignment.  */
4057 	  idx = (idx + 3) & ~3;
4058 	  /* Skip the collation sequence value.  */
4059 	  idx += sizeof (uint32_t);
4060 	  /* Skip the wide char sequence of the collating element.  */
4061 	  idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4062 	  /* If we found the entry, return the sequence value.  */
4063 	  if (found)
4064 	    return *(uint32_t *) (extra + idx);
4065 	  /* Skip the collation sequence value.  */
4066 	  idx += sizeof (uint32_t);
4067 	}
4068       return UINT_MAX;
4069     }
4070 }
4071 # endif /* _LIBC */
4072 #endif /* RE_ENABLE_I18N */
4073 
4074 /* Check whether the node accepts the byte which is IDX-th
4075    byte of the INPUT.  */
4076 
4077 static bool
4078 internal_function
check_node_accept(const re_match_context_t * mctx,const re_token_t * node,Idx idx)4079 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4080 		   Idx idx)
4081 {
4082   unsigned char ch;
4083   ch = re_string_byte_at (&mctx->input, idx);
4084   switch (node->type)
4085     {
4086     case CHARACTER:
4087       if (node->opr.c != ch)
4088         return false;
4089       break;
4090 
4091     case SIMPLE_BRACKET:
4092       if (!bitset_contain (node->opr.sbcset, ch))
4093         return false;
4094       break;
4095 
4096 #ifdef RE_ENABLE_I18N
4097     case OP_UTF8_PERIOD:
4098       if (ch >= ASCII_CHARS)
4099         return false;
4100       /* FALLTHROUGH */
4101 #endif
4102     case OP_PERIOD:
4103       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4104 	  || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4105 	return false;
4106       break;
4107 
4108     default:
4109       return false;
4110     }
4111 
4112   if (node->constraint)
4113     {
4114       /* The node has constraints.  Check whether the current context
4115 	 satisfies the constraints.  */
4116       unsigned int context = re_string_context_at (&mctx->input, idx,
4117 						   mctx->eflags);
4118       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4119 	return false;
4120     }
4121 
4122   return true;
4123 }
4124 
4125 /* Extend the buffers, if the buffers have run out.  */
4126 
4127 static reg_errcode_t
4128 internal_function
extend_buffers(re_match_context_t * mctx)4129 extend_buffers (re_match_context_t *mctx)
4130 {
4131   reg_errcode_t ret;
4132   re_string_t *pstr = &mctx->input;
4133 
4134   /* Avoid overflow.  */
4135   if (BE (SIZE_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4136     return REG_ESPACE;
4137 
4138   /* Double the lengthes of the buffers.  */
4139   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4140   if (BE (ret != REG_NOERROR, 0))
4141     return ret;
4142 
4143   if (mctx->state_log != NULL)
4144     {
4145       /* And double the length of state_log.  */
4146       /* XXX We have no indication of the size of this buffer.  If this
4147 	 allocation fail we have no indication that the state_log array
4148 	 does not have the right size.  */
4149       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4150 					      pstr->bufs_len + 1);
4151       if (BE (new_array == NULL, 0))
4152 	return REG_ESPACE;
4153       mctx->state_log = new_array;
4154     }
4155 
4156   /* Then reconstruct the buffers.  */
4157   if (pstr->icase)
4158     {
4159 #ifdef RE_ENABLE_I18N
4160       if (pstr->mb_cur_max > 1)
4161 	{
4162 	  ret = build_wcs_upper_buffer (pstr);
4163 	  if (BE (ret != REG_NOERROR, 0))
4164 	    return ret;
4165 	}
4166       else
4167 #endif /* RE_ENABLE_I18N  */
4168 	build_upper_buffer (pstr);
4169     }
4170   else
4171     {
4172 #ifdef RE_ENABLE_I18N
4173       if (pstr->mb_cur_max > 1)
4174 	build_wcs_buffer (pstr);
4175       else
4176 #endif /* RE_ENABLE_I18N  */
4177 	{
4178 	  if (pstr->trans != NULL)
4179 	    re_string_translate_buffer (pstr);
4180 	}
4181     }
4182   return REG_NOERROR;
4183 }
4184 
4185 
4186 /* Functions for matching context.  */
4187 
4188 /* Initialize MCTX.  */
4189 
4190 static reg_errcode_t
4191 internal_function
match_ctx_init(re_match_context_t * mctx,int eflags,Idx n)4192 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4193 {
4194   mctx->eflags = eflags;
4195   mctx->match_last = REG_MISSING;
4196   if (n > 0)
4197     {
4198       /* Avoid overflow.  */
4199       size_t max_object_size =
4200 	MAX (sizeof (struct re_backref_cache_entry),
4201 	     sizeof (re_sub_match_top_t *));
4202       if (BE (SIZE_MAX / max_object_size < n, 0))
4203 	return REG_ESPACE;
4204 
4205       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4206       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4207       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4208 	return REG_ESPACE;
4209     }
4210   /* Already zero-ed by the caller.
4211      else
4212        mctx->bkref_ents = NULL;
4213      mctx->nbkref_ents = 0;
4214      mctx->nsub_tops = 0;  */
4215   mctx->abkref_ents = n;
4216   mctx->max_mb_elem_len = 1;
4217   mctx->asub_tops = n;
4218   return REG_NOERROR;
4219 }
4220 
4221 /* Clean the entries which depend on the current input in MCTX.
4222    This function must be invoked when the matcher changes the start index
4223    of the input, or changes the input string.  */
4224 
4225 static void
4226 internal_function
match_ctx_clean(re_match_context_t * mctx)4227 match_ctx_clean (re_match_context_t *mctx)
4228 {
4229   Idx st_idx;
4230   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4231     {
4232       Idx sl_idx;
4233       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4234       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4235 	{
4236 	  re_sub_match_last_t *last = top->lasts[sl_idx];
4237 	  re_free (last->path.array);
4238 	  re_free (last);
4239 	}
4240       re_free (top->lasts);
4241       if (top->path)
4242 	{
4243 	  re_free (top->path->array);
4244 	  re_free (top->path);
4245 	}
4246       free (top);
4247     }
4248 
4249   mctx->nsub_tops = 0;
4250   mctx->nbkref_ents = 0;
4251 }
4252 
4253 /* Free all the memory associated with MCTX.  */
4254 
4255 static void
4256 internal_function
match_ctx_free(re_match_context_t * mctx)4257 match_ctx_free (re_match_context_t *mctx)
4258 {
4259   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4260   match_ctx_clean (mctx);
4261   re_free (mctx->sub_tops);
4262   re_free (mctx->bkref_ents);
4263 }
4264 
4265 /* Add a new backreference entry to MCTX.
4266    Note that we assume that caller never call this function with duplicate
4267    entry, and call with STR_IDX which isn't smaller than any existing entry.
4268 */
4269 
4270 static reg_errcode_t
4271 internal_function
match_ctx_add_entry(re_match_context_t * mctx,Idx node,Idx str_idx,Idx from,Idx to)4272 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4273 		     Idx to)
4274 {
4275   if (mctx->nbkref_ents >= mctx->abkref_ents)
4276     {
4277       struct re_backref_cache_entry* new_entry;
4278       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4279 			      mctx->abkref_ents * 2);
4280       if (BE (new_entry == NULL, 0))
4281 	{
4282 	  re_free (mctx->bkref_ents);
4283 	  return REG_ESPACE;
4284 	}
4285       mctx->bkref_ents = new_entry;
4286       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4287 	      sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4288       mctx->abkref_ents *= 2;
4289     }
4290   if (mctx->nbkref_ents > 0
4291       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4292     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4293 
4294   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4295   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4296   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4297   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4298 
4299   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4300      If bit N is clear, means that this entry won't epsilon-transition to
4301      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4302      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4303      such node.
4304 
4305      A backreference does not epsilon-transition unless it is empty, so set
4306      to all zeros if FROM != TO.  */
4307   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4308     = (from == to ? -1 : 0);
4309 
4310   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4311   if (mctx->max_mb_elem_len < to - from)
4312     mctx->max_mb_elem_len = to - from;
4313   return REG_NOERROR;
4314 }
4315 
4316 /* Return the first entry with the same str_idx, or REG_MISSING if none is
4317    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4318 
4319 static Idx
4320 internal_function
search_cur_bkref_entry(const re_match_context_t * mctx,Idx str_idx)4321 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4322 {
4323   Idx left, right, mid, last;
4324   last = right = mctx->nbkref_ents;
4325   for (left = 0; left < right;)
4326     {
4327       mid = (left + right) / 2;
4328       if (mctx->bkref_ents[mid].str_idx < str_idx)
4329 	left = mid + 1;
4330       else
4331 	right = mid;
4332     }
4333   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4334     return left;
4335   else
4336     return REG_MISSING;
4337 }
4338 
4339 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4340    at STR_IDX.  */
4341 
4342 static reg_errcode_t
4343 internal_function
match_ctx_add_subtop(re_match_context_t * mctx,Idx node,Idx str_idx)4344 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4345 {
4346 #ifdef DEBUG
4347   assert (mctx->sub_tops != NULL);
4348   assert (mctx->asub_tops > 0);
4349 #endif
4350   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4351     {
4352       Idx new_asub_tops = mctx->asub_tops * 2;
4353       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4354 						   re_sub_match_top_t *,
4355 						   new_asub_tops);
4356       if (BE (new_array == NULL, 0))
4357 	return REG_ESPACE;
4358       mctx->sub_tops = new_array;
4359       mctx->asub_tops = new_asub_tops;
4360     }
4361   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4362   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4363     return REG_ESPACE;
4364   mctx->sub_tops[mctx->nsub_tops]->node = node;
4365   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4366   return REG_NOERROR;
4367 }
4368 
4369 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4370    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4371 
4372 static re_sub_match_last_t *
4373 internal_function
match_ctx_add_sublast(re_sub_match_top_t * subtop,Idx node,Idx str_idx)4374 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4375 {
4376   re_sub_match_last_t *new_entry;
4377   if (BE (subtop->nlasts == subtop->alasts, 0))
4378     {
4379       Idx new_alasts = 2 * subtop->alasts + 1;
4380       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4381 						    re_sub_match_last_t *,
4382 						    new_alasts);
4383       if (BE (new_array == NULL, 0))
4384 	return NULL;
4385       subtop->lasts = new_array;
4386       subtop->alasts = new_alasts;
4387     }
4388   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4389   if (BE (new_entry != NULL, 1))
4390     {
4391       subtop->lasts[subtop->nlasts] = new_entry;
4392       new_entry->node = node;
4393       new_entry->str_idx = str_idx;
4394       ++subtop->nlasts;
4395     }
4396   return new_entry;
4397 }
4398 
4399 static void
4400 internal_function
sift_ctx_init(re_sift_context_t * sctx,re_dfastate_t ** sifted_sts,re_dfastate_t ** limited_sts,Idx last_node,Idx last_str_idx)4401 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4402 	       re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4403 {
4404   sctx->sifted_states = sifted_sts;
4405   sctx->limited_states = limited_sts;
4406   sctx->last_node = last_node;
4407   sctx->last_str_idx = last_str_idx;
4408   re_node_set_init_empty (&sctx->limits);
4409 }
4410