1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Extended regular expression matching and search library.
4    Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
5    Free Software Foundation, Inc.
6    This file is part of the GNU C Library.
7    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
8 
9    This program is free software; you can redistribute it and/or modify
10    it under the terms of the GNU General Public License as published by
11    the Free Software Foundation; either version 3, or (at your option)
12    any later version.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License for more details.
18 
19    You should have received a copy of the GNU General Public License along
20    with this program; if not, write to the Free Software Foundation,
21    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22 
23 static void re_string_construct_common (const char *str, Idx len,
24 					re_string_t *pstr,
25 					RE_TRANSLATE_TYPE trans, bool icase,
26 					const re_dfa_t *dfa) internal_function;
27 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
28 					  const re_node_set *nodes,
29 					  re_hashval_t hash) internal_function;
30 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
31 					  const re_node_set *nodes,
32 					  unsigned int context,
33 					  re_hashval_t hash) internal_function;
34 
35 /* Functions for string operation.  */
36 
37 /* This function allocate the buffers.  It is necessary to call
38    re_string_reconstruct before using the object.  */
39 
40 static reg_errcode_t
41 internal_function
re_string_allocate(re_string_t * pstr,const char * str,Idx len,Idx init_len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)42 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
43 		    RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
44 {
45   reg_errcode_t ret;
46   Idx init_buf_len;
47 
48   /* Ensure at least one character fits into the buffers.  */
49   if (init_len < dfa->mb_cur_max)
50     init_len = dfa->mb_cur_max;
51   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
52   re_string_construct_common (str, len, pstr, trans, icase, dfa);
53 
54   ret = re_string_realloc_buffers (pstr, init_buf_len);
55   if (BE (ret != REG_NOERROR, 0))
56     return ret;
57 
58   pstr->word_char = dfa->word_char;
59   pstr->word_ops_used = dfa->word_ops_used;
60   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
61   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
62   pstr->valid_raw_len = pstr->valid_len;
63   return REG_NOERROR;
64 }
65 
66 /* This function allocate the buffers, and initialize them.  */
67 
68 static reg_errcode_t
69 internal_function
re_string_construct(re_string_t * pstr,const char * str,Idx len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)70 re_string_construct (re_string_t *pstr, const char *str, Idx len,
71 		     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
72 {
73   reg_errcode_t ret;
74   memset (pstr, '\0', sizeof (re_string_t));
75   re_string_construct_common (str, len, pstr, trans, icase, dfa);
76 
77   if (len > 0)
78     {
79       ret = re_string_realloc_buffers (pstr, len + 1);
80       if (BE (ret != REG_NOERROR, 0))
81 	return ret;
82     }
83   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
84 
85   if (icase)
86     {
87 #ifdef RE_ENABLE_I18N
88       if (dfa->mb_cur_max > 1)
89 	{
90 	  while (1)
91 	    {
92 	      ret = build_wcs_upper_buffer (pstr);
93 	      if (BE (ret != REG_NOERROR, 0))
94 		return ret;
95 	      if (pstr->valid_raw_len >= len)
96 		break;
97 	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
98 		break;
99 	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
100 	      if (BE (ret != REG_NOERROR, 0))
101 		return ret;
102 	    }
103 	}
104       else
105 #endif /* RE_ENABLE_I18N  */
106 	build_upper_buffer (pstr);
107     }
108   else
109     {
110 #ifdef RE_ENABLE_I18N
111       if (dfa->mb_cur_max > 1)
112 	build_wcs_buffer (pstr);
113       else
114 #endif /* RE_ENABLE_I18N  */
115 	{
116 	  if (trans != NULL)
117 	    re_string_translate_buffer (pstr);
118 	  else
119 	    {
120 	      pstr->valid_len = pstr->bufs_len;
121 	      pstr->valid_raw_len = pstr->bufs_len;
122 	    }
123 	}
124     }
125 
126   return REG_NOERROR;
127 }
128 
129 /* Helper functions for re_string_allocate, and re_string_construct.  */
130 
131 static reg_errcode_t
132 internal_function
re_string_realloc_buffers(re_string_t * pstr,Idx new_buf_len)133 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
134 {
135 #ifdef RE_ENABLE_I18N
136   if (pstr->mb_cur_max > 1)
137     {
138       wint_t *new_wcs;
139 
140       /* Avoid overflow.  */
141       size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
142       if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
143 	return REG_ESPACE;
144 
145       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
146       if (BE (new_wcs == NULL, 0))
147 	return REG_ESPACE;
148       pstr->wcs = new_wcs;
149       if (pstr->offsets != NULL)
150 	{
151 	  Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
152 	  if (BE (new_offsets == NULL, 0))
153 	    return REG_ESPACE;
154 	  pstr->offsets = new_offsets;
155 	}
156     }
157 #endif /* RE_ENABLE_I18N  */
158   if (pstr->mbs_allocated)
159     {
160       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
161 					   new_buf_len);
162       if (BE (new_mbs == NULL, 0))
163 	return REG_ESPACE;
164       pstr->mbs = new_mbs;
165     }
166   pstr->bufs_len = new_buf_len;
167   return REG_NOERROR;
168 }
169 
170 
171 static void
172 internal_function
re_string_construct_common(const char * str,Idx len,re_string_t * pstr,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)173 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
174 			    RE_TRANSLATE_TYPE trans, bool icase,
175 			    const re_dfa_t *dfa)
176 {
177   pstr->raw_mbs = (const unsigned char *) str;
178   pstr->len = len;
179   pstr->raw_len = len;
180   pstr->trans = trans;
181   pstr->icase = icase;
182   pstr->mbs_allocated = (trans != NULL || icase);
183   pstr->mb_cur_max = dfa->mb_cur_max;
184   pstr->is_utf8 = dfa->is_utf8;
185   pstr->map_notascii = dfa->map_notascii;
186   pstr->stop = pstr->len;
187   pstr->raw_stop = pstr->stop;
188 }
189 
190 #ifdef RE_ENABLE_I18N
191 
192 /* Build wide character buffer PSTR->WCS.
193    If the byte sequence of the string are:
194      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
195    Then wide character buffer will be:
196      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
197    We use WEOF for padding, they indicate that the position isn't
198    a first byte of a multibyte character.
199 
200    Note that this function assumes PSTR->VALID_LEN elements are already
201    built and starts from PSTR->VALID_LEN.  */
202 
203 static void
204 internal_function
build_wcs_buffer(re_string_t * pstr)205 build_wcs_buffer (re_string_t *pstr)
206 {
207 #ifdef _LIBC
208   unsigned char buf[MB_LEN_MAX];
209   assert (MB_LEN_MAX >= pstr->mb_cur_max);
210 #else
211   unsigned char buf[64];
212 #endif
213   mbstate_t prev_st;
214   Idx byte_idx, end_idx, remain_len;
215   size_t mbclen;
216 
217   /* Build the buffers from pstr->valid_len to either pstr->len or
218      pstr->bufs_len.  */
219   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
220   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
221     {
222       wchar_t wc;
223       const char *p;
224 
225       remain_len = end_idx - byte_idx;
226       prev_st = pstr->cur_state;
227       /* Apply the translation if we need.  */
228       if (BE (pstr->trans != NULL, 0))
229 	{
230 	  int i, ch;
231 
232 	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
233 	    {
234 	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
235 	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
236 	    }
237 	  p = (const char *) buf;
238 	}
239       else
240 	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
241       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
242       if (BE (mbclen == (size_t) -2, 0))
243 	{
244 	  /* The buffer doesn't have enough space, finish to build.  */
245 	  pstr->cur_state = prev_st;
246 	  break;
247 	}
248       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
249 	{
250 	  /* We treat these cases as a singlebyte character.  */
251 	  mbclen = 1;
252 	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
253 	  if (BE (pstr->trans != NULL, 0))
254 	    wc = pstr->trans[wc];
255 	  pstr->cur_state = prev_st;
256 	}
257 
258       /* Write wide character and padding.  */
259       pstr->wcs[byte_idx++] = wc;
260       /* Write paddings.  */
261       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
262 	pstr->wcs[byte_idx++] = WEOF;
263     }
264   pstr->valid_len = byte_idx;
265   pstr->valid_raw_len = byte_idx;
266 }
267 
268 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
269    but for REG_ICASE.  */
270 
271 static reg_errcode_t
272 internal_function
build_wcs_upper_buffer(re_string_t * pstr)273 build_wcs_upper_buffer (re_string_t *pstr)
274 {
275   mbstate_t prev_st;
276   Idx src_idx, byte_idx, end_idx, remain_len;
277   size_t mbclen;
278 #ifdef _LIBC
279   char buf[MB_LEN_MAX];
280   assert (MB_LEN_MAX >= pstr->mb_cur_max);
281 #else
282   char buf[64];
283 #endif
284 
285   byte_idx = pstr->valid_len;
286   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
287 
288   /* The following optimization assumes that ASCII characters can be
289      mapped to wide characters with a simple cast.  */
290   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
291     {
292       while (byte_idx < end_idx)
293 	{
294 	  wchar_t wc;
295 
296 	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
297 	      && mbsinit (&pstr->cur_state))
298 	    {
299 	      /* In case of a singlebyte character.  */
300 	      pstr->mbs[byte_idx]
301 		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
302 	      /* The next step uses the assumption that wchar_t is encoded
303 		 ASCII-safe: all ASCII values can be converted like this.  */
304 	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
305 	      ++byte_idx;
306 	      continue;
307 	    }
308 
309 	  remain_len = end_idx - byte_idx;
310 	  prev_st = pstr->cur_state;
311 	  mbclen = __mbrtowc (&wc,
312 			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
313 			       + byte_idx), remain_len, &pstr->cur_state);
314 	  if (BE (mbclen < (size_t) -2, 1))
315 	    {
316 	      wchar_t wcu = wc;
317 	      if (iswlower (wc))
318 		{
319 		  size_t mbcdlen;
320 
321 		  wcu = towupper (wc);
322 		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
323 		  if (BE (mbclen == mbcdlen, 1))
324 		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
325 		  else
326 		    {
327 		      src_idx = byte_idx;
328 		      goto offsets_needed;
329 		    }
330 		}
331 	      else
332 		memcpy (pstr->mbs + byte_idx,
333 			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
334 	      pstr->wcs[byte_idx++] = wcu;
335 	      /* Write paddings.  */
336 	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
337 		pstr->wcs[byte_idx++] = WEOF;
338 	    }
339 	  else if (mbclen == (size_t) -1 || mbclen == 0)
340 	    {
341 	      /* It is an invalid character or '\0'.  Just use the byte.  */
342 	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
343 	      pstr->mbs[byte_idx] = ch;
344 	      /* And also cast it to wide char.  */
345 	      pstr->wcs[byte_idx++] = (wchar_t) ch;
346 	      if (BE (mbclen == (size_t) -1, 0))
347 		pstr->cur_state = prev_st;
348 	    }
349 	  else
350 	    {
351 	      /* The buffer doesn't have enough space, finish to build.  */
352 	      pstr->cur_state = prev_st;
353 	      break;
354 	    }
355 	}
356       pstr->valid_len = byte_idx;
357       pstr->valid_raw_len = byte_idx;
358       return REG_NOERROR;
359     }
360   else
361     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
362       {
363 	wchar_t wc;
364 	const char *p;
365       offsets_needed:
366 	remain_len = end_idx - byte_idx;
367 	prev_st = pstr->cur_state;
368 	if (BE (pstr->trans != NULL, 0))
369 	  {
370 	    int i, ch;
371 
372 	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
373 	      {
374 		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
375 		buf[i] = pstr->trans[ch];
376 	      }
377 	    p = (const char *) buf;
378 	  }
379 	else
380 	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
381 	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
382 	if (BE (mbclen < (size_t) -2, 1))
383 	  {
384 	    wchar_t wcu = wc;
385 	    if (iswlower (wc))
386 	      {
387 		size_t mbcdlen;
388 
389 		wcu = towupper (wc);
390 		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
391 		if (BE (mbclen == mbcdlen, 1))
392 		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
393 		else if (mbcdlen != (size_t) -1)
394 		  {
395 		    size_t i;
396 
397 		    if (byte_idx + mbcdlen > pstr->bufs_len)
398 		      {
399 			pstr->cur_state = prev_st;
400 			break;
401 		      }
402 
403 		    if (pstr->offsets == NULL)
404 		      {
405 			pstr->offsets = re_malloc (Idx, pstr->bufs_len);
406 
407 			if (pstr->offsets == NULL)
408 			  return REG_ESPACE;
409 		      }
410 		    if (!pstr->offsets_needed)
411 		      {
412 			for (i = 0; i < (size_t) byte_idx; ++i)
413 			  pstr->offsets[i] = i;
414 			pstr->offsets_needed = 1;
415 		      }
416 
417 		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
418 		    pstr->wcs[byte_idx] = wcu;
419 		    pstr->offsets[byte_idx] = src_idx;
420 		    for (i = 1; i < mbcdlen; ++i)
421 		      {
422 			pstr->offsets[byte_idx + i]
423 			  = src_idx + (i < mbclen ? i : mbclen - 1);
424 			pstr->wcs[byte_idx + i] = WEOF;
425 		      }
426 		    pstr->len += mbcdlen - mbclen;
427 		    if (pstr->raw_stop > src_idx)
428 		      pstr->stop += mbcdlen - mbclen;
429 		    end_idx = (pstr->bufs_len > pstr->len)
430 			      ? pstr->len : pstr->bufs_len;
431 		    byte_idx += mbcdlen;
432 		    src_idx += mbclen;
433 		    continue;
434 		  }
435                 else
436                   memcpy (pstr->mbs + byte_idx, p, mbclen);
437 	      }
438 	    else
439 	      memcpy (pstr->mbs + byte_idx, p, mbclen);
440 
441 	    if (BE (pstr->offsets_needed != 0, 0))
442 	      {
443 		size_t i;
444 		for (i = 0; i < mbclen; ++i)
445 		  pstr->offsets[byte_idx + i] = src_idx + i;
446 	      }
447 	    src_idx += mbclen;
448 
449 	    pstr->wcs[byte_idx++] = wcu;
450 	    /* Write paddings.  */
451 	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
452 	      pstr->wcs[byte_idx++] = WEOF;
453 	  }
454 	else if (mbclen == (size_t) -1 || mbclen == 0)
455 	  {
456 	    /* It is an invalid character or '\0'.  Just use the byte.  */
457 	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
458 
459 	    if (BE (pstr->trans != NULL, 0))
460 	      ch = pstr->trans [ch];
461 	    pstr->mbs[byte_idx] = ch;
462 
463 	    if (BE (pstr->offsets_needed != 0, 0))
464 	      pstr->offsets[byte_idx] = src_idx;
465 	    ++src_idx;
466 
467 	    /* And also cast it to wide char.  */
468 	    pstr->wcs[byte_idx++] = (wchar_t) ch;
469 	    if (BE (mbclen == (size_t) -1, 0))
470 	      pstr->cur_state = prev_st;
471 	  }
472 	else
473 	  {
474 	    /* The buffer doesn't have enough space, finish to build.  */
475 	    pstr->cur_state = prev_st;
476 	    break;
477 	  }
478       }
479   pstr->valid_len = byte_idx;
480   pstr->valid_raw_len = src_idx;
481   return REG_NOERROR;
482 }
483 
484 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
485    Return the index.  */
486 
487 static Idx
488 internal_function
re_string_skip_chars(re_string_t * pstr,Idx new_raw_idx,wint_t * last_wc)489 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
490 {
491   mbstate_t prev_st;
492   Idx rawbuf_idx;
493   size_t mbclen;
494   wint_t wc = WEOF;
495 
496   /* Skip the characters which are not necessary to check.  */
497   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
498        rawbuf_idx < new_raw_idx;)
499     {
500       wchar_t wc2;
501       Idx remain_len;
502       remain_len = pstr->len - rawbuf_idx;
503       prev_st = pstr->cur_state;
504       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
505 			  remain_len, &pstr->cur_state);
506       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
507 	{
508 	  /* We treat these cases as a single byte character.  */
509 	  if (mbclen == 0 || remain_len == 0)
510 	    wc = L'\0';
511 	  else
512 	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
513 	  mbclen = 1;
514 	  pstr->cur_state = prev_st;
515 	}
516       else
517 	wc = wc2;
518       /* Then proceed the next character.  */
519       rawbuf_idx += mbclen;
520     }
521   *last_wc = wc;
522   return rawbuf_idx;
523 }
524 #endif /* RE_ENABLE_I18N  */
525 
526 /* Build the buffer PSTR->MBS, and apply the translation if we need.
527    This function is used in case of REG_ICASE.  */
528 
529 static void
530 internal_function
build_upper_buffer(re_string_t * pstr)531 build_upper_buffer (re_string_t *pstr)
532 {
533   Idx char_idx, end_idx;
534   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
535 
536   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
537     {
538       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
539       if (BE (pstr->trans != NULL, 0))
540 	ch = pstr->trans[ch];
541       if (islower (ch))
542 	pstr->mbs[char_idx] = toupper (ch);
543       else
544 	pstr->mbs[char_idx] = ch;
545     }
546   pstr->valid_len = char_idx;
547   pstr->valid_raw_len = char_idx;
548 }
549 
550 /* Apply TRANS to the buffer in PSTR.  */
551 
552 static void
553 internal_function
re_string_translate_buffer(re_string_t * pstr)554 re_string_translate_buffer (re_string_t *pstr)
555 {
556   Idx buf_idx, end_idx;
557   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
558 
559   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
560     {
561       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
562       pstr->mbs[buf_idx] = pstr->trans[ch];
563     }
564 
565   pstr->valid_len = buf_idx;
566   pstr->valid_raw_len = buf_idx;
567 }
568 
569 /* This function re-construct the buffers.
570    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
571    convert to upper case in case of REG_ICASE, apply translation.  */
572 
573 static reg_errcode_t
574 internal_function
re_string_reconstruct(re_string_t * pstr,Idx idx,int eflags)575 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
576 {
577   Idx offset;
578 
579   if (BE (pstr->raw_mbs_idx <= idx, 0))
580     offset = idx - pstr->raw_mbs_idx;
581   else
582     {
583       /* Reset buffer.  */
584 #ifdef RE_ENABLE_I18N
585       if (pstr->mb_cur_max > 1)
586 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
587 #endif /* RE_ENABLE_I18N */
588       pstr->len = pstr->raw_len;
589       pstr->stop = pstr->raw_stop;
590       pstr->valid_len = 0;
591       pstr->raw_mbs_idx = 0;
592       pstr->valid_raw_len = 0;
593       pstr->offsets_needed = 0;
594       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
595 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
596       if (!pstr->mbs_allocated)
597 	pstr->mbs = (unsigned char *) pstr->raw_mbs;
598       offset = idx;
599     }
600 
601   if (BE (offset != 0, 1))
602     {
603       /* Should the already checked characters be kept?  */
604       if (BE (offset < pstr->valid_raw_len, 1))
605 	{
606 	  /* Yes, move them to the front of the buffer.  */
607 #ifdef RE_ENABLE_I18N
608 	  if (BE (pstr->offsets_needed, 0))
609 	    {
610 	      Idx low = 0, high = pstr->valid_len, mid;
611 	      do
612 		{
613 		  mid = (high + low) / 2;
614 		  if (pstr->offsets[mid] > offset)
615 		    high = mid;
616 		  else if (pstr->offsets[mid] < offset)
617 		    low = mid + 1;
618 		  else
619 		    break;
620 		}
621 	      while (low < high);
622 	      if (pstr->offsets[mid] < offset)
623 		++mid;
624 	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
625 							eflags);
626 	      /* This can be quite complicated, so handle specially
627 		 only the common and easy case where the character with
628 		 different length representation of lower and upper
629 		 case is present at or after offset.  */
630 	      if (pstr->valid_len > offset
631 		  && mid == offset && pstr->offsets[mid] == offset)
632 		{
633 		  memmove (pstr->wcs, pstr->wcs + offset,
634 			   (pstr->valid_len - offset) * sizeof (wint_t));
635 		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
636 		  pstr->valid_len -= offset;
637 		  pstr->valid_raw_len -= offset;
638 		  for (low = 0; low < pstr->valid_len; low++)
639 		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
640 		}
641 	      else
642 		{
643 		  /* Otherwise, just find out how long the partial multibyte
644 		     character at offset is and fill it with WEOF/255.  */
645 		  pstr->len = pstr->raw_len - idx + offset;
646 		  pstr->stop = pstr->raw_stop - idx + offset;
647 		  pstr->offsets_needed = 0;
648 		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
649 		    --mid;
650 		  while (mid < pstr->valid_len)
651 		    if (pstr->wcs[mid] != WEOF)
652 		      break;
653 		    else
654 		      ++mid;
655 		  if (mid == pstr->valid_len)
656 		    pstr->valid_len = 0;
657 		  else
658 		    {
659 		      pstr->valid_len = pstr->offsets[mid] - offset;
660 		      if (pstr->valid_len)
661 			{
662 			  for (low = 0; low < pstr->valid_len; ++low)
663 			    pstr->wcs[low] = WEOF;
664 			  memset (pstr->mbs, 255, pstr->valid_len);
665 			}
666 		    }
667 		  pstr->valid_raw_len = pstr->valid_len;
668 		}
669 	    }
670 	  else
671 #endif
672 	    {
673 	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
674 							eflags);
675 #ifdef RE_ENABLE_I18N
676 	      if (pstr->mb_cur_max > 1)
677 		memmove (pstr->wcs, pstr->wcs + offset,
678 			 (pstr->valid_len - offset) * sizeof (wint_t));
679 #endif /* RE_ENABLE_I18N */
680 	      if (BE (pstr->mbs_allocated, 0))
681 		memmove (pstr->mbs, pstr->mbs + offset,
682 			 pstr->valid_len - offset);
683 	      pstr->valid_len -= offset;
684 	      pstr->valid_raw_len -= offset;
685 #if DEBUG
686 	      assert (pstr->valid_len > 0);
687 #endif
688 	    }
689 	}
690       else
691 	{
692 #ifdef RE_ENABLE_I18N
693 	  /* No, skip all characters until IDX.  */
694 	  Idx prev_valid_len = pstr->valid_len;
695 
696 	  if (BE (pstr->offsets_needed, 0))
697 	    {
698 	      pstr->len = pstr->raw_len - idx + offset;
699 	      pstr->stop = pstr->raw_stop - idx + offset;
700 	      pstr->offsets_needed = 0;
701 	    }
702 #endif
703 	  pstr->valid_len = 0;
704 #ifdef RE_ENABLE_I18N
705 	  if (pstr->mb_cur_max > 1)
706 	    {
707 	      Idx wcs_idx;
708 	      wint_t wc = WEOF;
709 
710 	      if (pstr->is_utf8)
711 		{
712 		  const unsigned char *raw, *p, *end;
713 
714 		  /* Special case UTF-8.  Multi-byte chars start with any
715 		     byte other than 0x80 - 0xbf.  */
716 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
717 		  end = raw + (offset - pstr->mb_cur_max);
718 		  if (end < pstr->raw_mbs)
719 		    end = pstr->raw_mbs;
720 		  p = raw + offset - 1;
721 #ifdef _LIBC
722 		  /* We know the wchar_t encoding is UCS4, so for the simple
723 		     case, ASCII characters, skip the conversion step.  */
724 		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
725 		    {
726 		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
727 		      /* pstr->valid_len = 0; */
728 		      wc = (wchar_t) *p;
729 		    }
730 		  else
731 #endif
732 		    for (; p >= end; --p)
733 		      if ((*p & 0xc0) != 0x80)
734 			{
735 			  mbstate_t cur_state;
736 			  wchar_t wc2;
737 			  Idx mlen = raw + pstr->len - p;
738 			  unsigned char buf[6];
739 			  size_t mbclen;
740 
741 			  if (BE (pstr->trans != NULL, 0))
742 			    {
743 			      int i = mlen < 6 ? mlen : 6;
744 			      while (--i >= 0)
745 				buf[i] = pstr->trans[p[i]];
746 			    }
747 			  /* XXX Don't use mbrtowc, we know which conversion
748 			     to use (UTF-8 -> UCS4).  */
749 			  memset (&cur_state, 0, sizeof (cur_state));
750 			  mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
751 					      &cur_state);
752 			  if (raw + offset - p <= mbclen
753 			      && mbclen < (size_t) -2)
754 			    {
755 			      memset (&pstr->cur_state, '\0',
756 				      sizeof (mbstate_t));
757 			      pstr->valid_len = mbclen - (raw + offset - p);
758 			      wc = wc2;
759 			    }
760 			  break;
761 			}
762 		}
763 
764 	      if (wc == WEOF)
765 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
766 	      if (wc == WEOF)
767 		pstr->tip_context
768 		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
769 	      else
770 		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
771 				      && IS_WIDE_WORD_CHAR (wc))
772 				     ? CONTEXT_WORD
773 				     : ((IS_WIDE_NEWLINE (wc)
774 					 && pstr->newline_anchor)
775 					? CONTEXT_NEWLINE : 0));
776 	      if (BE (pstr->valid_len, 0))
777 		{
778 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
779 		    pstr->wcs[wcs_idx] = WEOF;
780 		  if (pstr->mbs_allocated)
781 		    memset (pstr->mbs, 255, pstr->valid_len);
782 		}
783 	      pstr->valid_raw_len = pstr->valid_len;
784 	    }
785 	  else
786 #endif /* RE_ENABLE_I18N */
787 	    {
788 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
789 	      pstr->valid_raw_len = 0;
790 	      if (pstr->trans)
791 		c = pstr->trans[c];
792 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
793 				   ? CONTEXT_WORD
794 				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
795 				      ? CONTEXT_NEWLINE : 0));
796 	    }
797 	}
798       if (!BE (pstr->mbs_allocated, 0))
799 	pstr->mbs += offset;
800     }
801   pstr->raw_mbs_idx = idx;
802   pstr->len -= offset;
803   pstr->stop -= offset;
804 
805   /* Then build the buffers.  */
806 #ifdef RE_ENABLE_I18N
807   if (pstr->mb_cur_max > 1)
808     {
809       if (pstr->icase)
810 	{
811 	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
812 	  if (BE (ret != REG_NOERROR, 0))
813 	    return ret;
814 	}
815       else
816 	build_wcs_buffer (pstr);
817     }
818   else
819 #endif /* RE_ENABLE_I18N */
820     if (BE (pstr->mbs_allocated, 0))
821       {
822 	if (pstr->icase)
823 	  build_upper_buffer (pstr);
824 	else if (pstr->trans != NULL)
825 	  re_string_translate_buffer (pstr);
826       }
827     else
828       pstr->valid_len = pstr->len;
829 
830   pstr->cur_idx = 0;
831   return REG_NOERROR;
832 }
833 
834 static unsigned char
internal_function(pure)835 internal_function __attribute ((pure))
836 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
837 {
838   int ch;
839   Idx off;
840 
841   /* Handle the common (easiest) cases first.  */
842   if (BE (!pstr->mbs_allocated, 1))
843     return re_string_peek_byte (pstr, idx);
844 
845 #ifdef RE_ENABLE_I18N
846   if (pstr->mb_cur_max > 1
847       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
848     return re_string_peek_byte (pstr, idx);
849 #endif
850 
851   off = pstr->cur_idx + idx;
852 #ifdef RE_ENABLE_I18N
853   if (pstr->offsets_needed)
854     off = pstr->offsets[off];
855 #endif
856 
857   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
858 
859 #ifdef RE_ENABLE_I18N
860   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
861      this function returns CAPITAL LETTER I instead of first byte of
862      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
863      since peek_byte_case doesn't advance cur_idx in any way.  */
864   if (pstr->offsets_needed && !isascii (ch))
865     return re_string_peek_byte (pstr, idx);
866 #endif
867 
868   return ch;
869 }
870 
871 static unsigned char
internal_function(pure)872 internal_function __attribute ((pure))
873 re_string_fetch_byte_case (re_string_t *pstr)
874 {
875   if (BE (!pstr->mbs_allocated, 1))
876     return re_string_fetch_byte (pstr);
877 
878 #ifdef RE_ENABLE_I18N
879   if (pstr->offsets_needed)
880     {
881       Idx off;
882       int ch;
883 
884       /* For tr_TR.UTF-8 [[:islower:]] there is
885 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
886 	 in that case the whole multi-byte character and return
887 	 the original letter.  On the other side, with
888 	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
889 	 anything else would complicate things too much.  */
890 
891       if (!re_string_first_byte (pstr, pstr->cur_idx))
892 	return re_string_fetch_byte (pstr);
893 
894       off = pstr->offsets[pstr->cur_idx];
895       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
896 
897       if (! isascii (ch))
898 	return re_string_fetch_byte (pstr);
899 
900       re_string_skip_bytes (pstr,
901 			    re_string_char_size_at (pstr, pstr->cur_idx));
902       return ch;
903     }
904 #endif
905 
906   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
907 }
908 
909 static void
910 internal_function
re_string_destruct(re_string_t * pstr)911 re_string_destruct (re_string_t *pstr)
912 {
913 #ifdef RE_ENABLE_I18N
914   re_free (pstr->wcs);
915   re_free (pstr->offsets);
916 #endif /* RE_ENABLE_I18N  */
917   if (pstr->mbs_allocated)
918     re_free (pstr->mbs);
919 }
920 
921 /* Return the context at IDX in INPUT.  */
922 
923 static unsigned int
924 internal_function
re_string_context_at(const re_string_t * input,Idx idx,int eflags)925 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
926 {
927   int c;
928   if (BE (! REG_VALID_INDEX (idx), 0))
929     /* In this case, we use the value stored in input->tip_context,
930        since we can't know the character in input->mbs[-1] here.  */
931     return input->tip_context;
932   if (BE (idx == input->len, 0))
933     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
934 	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
935 #ifdef RE_ENABLE_I18N
936   if (input->mb_cur_max > 1)
937     {
938       wint_t wc;
939       Idx wc_idx = idx;
940       while(input->wcs[wc_idx] == WEOF)
941 	{
942 #ifdef DEBUG
943 	  /* It must not happen.  */
944 	  assert (REG_VALID_INDEX (wc_idx));
945 #endif
946 	  --wc_idx;
947 	  if (! REG_VALID_INDEX (wc_idx))
948 	    return input->tip_context;
949 	}
950       wc = input->wcs[wc_idx];
951       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
952 	return CONTEXT_WORD;
953       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
954 	      ? CONTEXT_NEWLINE : 0);
955     }
956   else
957 #endif
958     {
959       c = re_string_byte_at (input, idx);
960       if (bitset_contain (input->word_char, c))
961 	return CONTEXT_WORD;
962       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
963     }
964 }
965 
966 /* Functions for set operation.  */
967 
968 static reg_errcode_t
969 internal_function
re_node_set_alloc(re_node_set * set,Idx size)970 re_node_set_alloc (re_node_set *set, Idx size)
971 {
972   set->alloc = size;
973   set->nelem = 0;
974   set->elems = re_malloc (Idx, size);
975   if (BE (set->elems == NULL, 0))
976     return REG_ESPACE;
977   return REG_NOERROR;
978 }
979 
980 static reg_errcode_t
981 internal_function
re_node_set_init_1(re_node_set * set,Idx elem)982 re_node_set_init_1 (re_node_set *set, Idx elem)
983 {
984   set->alloc = 1;
985   set->nelem = 1;
986   set->elems = re_malloc (Idx, 1);
987   if (BE (set->elems == NULL, 0))
988     {
989       set->alloc = set->nelem = 0;
990       return REG_ESPACE;
991     }
992   set->elems[0] = elem;
993   return REG_NOERROR;
994 }
995 
996 static reg_errcode_t
997 internal_function
re_node_set_init_2(re_node_set * set,Idx elem1,Idx elem2)998 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
999 {
1000   set->alloc = 2;
1001   set->elems = re_malloc (Idx, 2);
1002   if (BE (set->elems == NULL, 0))
1003     return REG_ESPACE;
1004   if (elem1 == elem2)
1005     {
1006       set->nelem = 1;
1007       set->elems[0] = elem1;
1008     }
1009   else
1010     {
1011       set->nelem = 2;
1012       if (elem1 < elem2)
1013 	{
1014 	  set->elems[0] = elem1;
1015 	  set->elems[1] = elem2;
1016 	}
1017       else
1018 	{
1019 	  set->elems[0] = elem2;
1020 	  set->elems[1] = elem1;
1021 	}
1022     }
1023   return REG_NOERROR;
1024 }
1025 
1026 static reg_errcode_t
1027 internal_function
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)1028 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1029 {
1030   dest->nelem = src->nelem;
1031   if (src->nelem > 0)
1032     {
1033       dest->alloc = dest->nelem;
1034       dest->elems = re_malloc (Idx, dest->alloc);
1035       if (BE (dest->elems == NULL, 0))
1036 	{
1037 	  dest->alloc = dest->nelem = 0;
1038 	  return REG_ESPACE;
1039 	}
1040       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1041     }
1042   else
1043     re_node_set_init_empty (dest);
1044   return REG_NOERROR;
1045 }
1046 
1047 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1048    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1049    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1050 
1051 static reg_errcode_t
1052 internal_function
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1053 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1054 			   const re_node_set *src2)
1055 {
1056   Idx i1, i2, is, id, delta, sbase;
1057   if (src1->nelem == 0 || src2->nelem == 0)
1058     return REG_NOERROR;
1059 
1060   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1061      conservative estimate.  */
1062   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1063     {
1064       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1065       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1066       if (BE (new_elems == NULL, 0))
1067         return REG_ESPACE;
1068       dest->elems = new_elems;
1069       dest->alloc = new_alloc;
1070     }
1071 
1072   /* Find the items in the intersection of SRC1 and SRC2, and copy
1073      into the top of DEST those that are not already in DEST itself.  */
1074   sbase = dest->nelem + src1->nelem + src2->nelem;
1075   i1 = src1->nelem - 1;
1076   i2 = src2->nelem - 1;
1077   id = dest->nelem - 1;
1078   for (;;)
1079     {
1080       if (src1->elems[i1] == src2->elems[i2])
1081 	{
1082 	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1083 	  while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1084 	    --id;
1085 
1086           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1087             dest->elems[--sbase] = src1->elems[i1];
1088 
1089 	  if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1090 	    break;
1091 	}
1092 
1093       /* Lower the highest of the two items.  */
1094       else if (src1->elems[i1] < src2->elems[i2])
1095 	{
1096 	  if (! REG_VALID_INDEX (--i2))
1097 	    break;
1098 	}
1099       else
1100 	{
1101 	  if (! REG_VALID_INDEX (--i1))
1102 	    break;
1103 	}
1104     }
1105 
1106   id = dest->nelem - 1;
1107   is = dest->nelem + src1->nelem + src2->nelem - 1;
1108   delta = is - sbase + 1;
1109 
1110   /* Now copy.  When DELTA becomes zero, the remaining
1111      DEST elements are already in place; this is more or
1112      less the same loop that is in re_node_set_merge.  */
1113   dest->nelem += delta;
1114   if (delta > 0 && REG_VALID_INDEX (id))
1115     for (;;)
1116       {
1117         if (dest->elems[is] > dest->elems[id])
1118           {
1119             /* Copy from the top.  */
1120             dest->elems[id + delta--] = dest->elems[is--];
1121             if (delta == 0)
1122               break;
1123           }
1124         else
1125           {
1126             /* Slide from the bottom.  */
1127             dest->elems[id + delta] = dest->elems[id];
1128             if (! REG_VALID_INDEX (--id))
1129               break;
1130           }
1131       }
1132 
1133   /* Copy remaining SRC elements.  */
1134   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1135 
1136   return REG_NOERROR;
1137 }
1138 
1139 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1140    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1141 
1142 static reg_errcode_t
1143 internal_function
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1144 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1145 			const re_node_set *src2)
1146 {
1147   Idx i1, i2, id;
1148   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1149     {
1150       dest->alloc = src1->nelem + src2->nelem;
1151       dest->elems = re_malloc (Idx, dest->alloc);
1152       if (BE (dest->elems == NULL, 0))
1153 	return REG_ESPACE;
1154     }
1155   else
1156     {
1157       if (src1 != NULL && src1->nelem > 0)
1158 	return re_node_set_init_copy (dest, src1);
1159       else if (src2 != NULL && src2->nelem > 0)
1160 	return re_node_set_init_copy (dest, src2);
1161       else
1162 	re_node_set_init_empty (dest);
1163       return REG_NOERROR;
1164     }
1165   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1166     {
1167       if (src1->elems[i1] > src2->elems[i2])
1168 	{
1169 	  dest->elems[id++] = src2->elems[i2++];
1170 	  continue;
1171 	}
1172       if (src1->elems[i1] == src2->elems[i2])
1173 	++i2;
1174       dest->elems[id++] = src1->elems[i1++];
1175     }
1176   if (i1 < src1->nelem)
1177     {
1178       memcpy (dest->elems + id, src1->elems + i1,
1179 	     (src1->nelem - i1) * sizeof (Idx));
1180       id += src1->nelem - i1;
1181     }
1182   else if (i2 < src2->nelem)
1183     {
1184       memcpy (dest->elems + id, src2->elems + i2,
1185 	     (src2->nelem - i2) * sizeof (Idx));
1186       id += src2->nelem - i2;
1187     }
1188   dest->nelem = id;
1189   return REG_NOERROR;
1190 }
1191 
1192 /* Calculate the union set of the sets DEST and SRC. And store it to
1193    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1194 
1195 static reg_errcode_t
1196 internal_function
re_node_set_merge(re_node_set * dest,const re_node_set * src)1197 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1198 {
1199   Idx is, id, sbase, delta;
1200   if (src == NULL || src->nelem == 0)
1201     return REG_NOERROR;
1202   if (dest->alloc < 2 * src->nelem + dest->nelem)
1203     {
1204       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1205       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1206       if (BE (new_buffer == NULL, 0))
1207 	return REG_ESPACE;
1208       dest->elems = new_buffer;
1209       dest->alloc = new_alloc;
1210     }
1211 
1212   if (BE (dest->nelem == 0, 0))
1213     {
1214       dest->nelem = src->nelem;
1215       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1216       return REG_NOERROR;
1217     }
1218 
1219   /* Copy into the top of DEST the items of SRC that are not
1220      found in DEST.  Maybe we could binary search in DEST?  */
1221   for (sbase = dest->nelem + 2 * src->nelem,
1222        is = src->nelem - 1, id = dest->nelem - 1;
1223        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1224     {
1225       if (dest->elems[id] == src->elems[is])
1226         is--, id--;
1227       else if (dest->elems[id] < src->elems[is])
1228         dest->elems[--sbase] = src->elems[is--];
1229       else /* if (dest->elems[id] > src->elems[is]) */
1230         --id;
1231     }
1232 
1233   if (REG_VALID_INDEX (is))
1234     {
1235       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1236       sbase -= is + 1;
1237       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1238     }
1239 
1240   id = dest->nelem - 1;
1241   is = dest->nelem + 2 * src->nelem - 1;
1242   delta = is - sbase + 1;
1243   if (delta == 0)
1244     return REG_NOERROR;
1245 
1246   /* Now copy.  When DELTA becomes zero, the remaining
1247      DEST elements are already in place.  */
1248   dest->nelem += delta;
1249   for (;;)
1250     {
1251       if (dest->elems[is] > dest->elems[id])
1252         {
1253 	  /* Copy from the top.  */
1254           dest->elems[id + delta--] = dest->elems[is--];
1255 	  if (delta == 0)
1256 	    break;
1257 	}
1258       else
1259         {
1260           /* Slide from the bottom.  */
1261           dest->elems[id + delta] = dest->elems[id];
1262 	  if (! REG_VALID_INDEX (--id))
1263 	    {
1264 	      /* Copy remaining SRC elements.  */
1265 	      memcpy (dest->elems, dest->elems + sbase,
1266 	              delta * sizeof (Idx));
1267 	      break;
1268 	    }
1269 	}
1270     }
1271 
1272   return REG_NOERROR;
1273 }
1274 
1275 /* Insert the new element ELEM to the re_node_set* SET.
1276    SET should not already have ELEM.
1277    Return true if successful.  */
1278 
1279 static bool
1280 internal_function
re_node_set_insert(re_node_set * set,Idx elem)1281 re_node_set_insert (re_node_set *set, Idx elem)
1282 {
1283   Idx idx;
1284   /* In case the set is empty.  */
1285   if (set->alloc == 0)
1286     return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1287 
1288   if (BE (set->nelem, 0) == 0)
1289     {
1290       /* We already guaranteed above that set->alloc != 0.  */
1291       set->elems[0] = elem;
1292       ++set->nelem;
1293       return true;
1294     }
1295 
1296   /* Realloc if we need.  */
1297   if (set->alloc == set->nelem)
1298     {
1299       Idx *new_elems;
1300       set->alloc = set->alloc * 2;
1301       new_elems = re_realloc (set->elems, Idx, set->alloc);
1302       if (BE (new_elems == NULL, 0))
1303 	return false;
1304       set->elems = new_elems;
1305     }
1306 
1307   /* Move the elements which follows the new element.  Test the
1308      first element separately to skip a check in the inner loop.  */
1309   if (elem < set->elems[0])
1310     {
1311       idx = 0;
1312       for (idx = set->nelem; idx > 0; idx--)
1313         set->elems[idx] = set->elems[idx - 1];
1314     }
1315   else
1316     {
1317       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1318         set->elems[idx] = set->elems[idx - 1];
1319     }
1320 
1321   /* Insert the new element.  */
1322   set->elems[idx] = elem;
1323   ++set->nelem;
1324   return true;
1325 }
1326 
1327 /* Insert the new element ELEM to the re_node_set* SET.
1328    SET should not already have any element greater than or equal to ELEM.
1329    Return true if successful.  */
1330 
1331 static bool
1332 internal_function
re_node_set_insert_last(re_node_set * set,Idx elem)1333 re_node_set_insert_last (re_node_set *set, Idx elem)
1334 {
1335   /* Realloc if we need.  */
1336   if (set->alloc == set->nelem)
1337     {
1338       Idx *new_elems;
1339       set->alloc = (set->alloc + 1) * 2;
1340       new_elems = re_realloc (set->elems, Idx, set->alloc);
1341       if (BE (new_elems == NULL, 0))
1342 	return false;
1343       set->elems = new_elems;
1344     }
1345 
1346   /* Insert the new element.  */
1347   set->elems[set->nelem++] = elem;
1348   return true;
1349 }
1350 
1351 /* Compare two node sets SET1 and SET2.
1352    Return true if SET1 and SET2 are equivalent.  */
1353 
1354 static bool
internal_function(pure)1355 internal_function __attribute ((pure))
1356 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1357 {
1358   Idx i;
1359   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1360     return false;
1361   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1362     if (set1->elems[i] != set2->elems[i])
1363       return false;
1364   return true;
1365 }
1366 
1367 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1368 
1369 static Idx
internal_function(pure)1370 internal_function __attribute ((pure))
1371 re_node_set_contains (const re_node_set *set, Idx elem)
1372 {
1373   __re_size_t idx, right, mid;
1374   if (! REG_VALID_NONZERO_INDEX (set->nelem))
1375     return 0;
1376 
1377   /* Binary search the element.  */
1378   idx = 0;
1379   right = set->nelem - 1;
1380   while (idx < right)
1381     {
1382       mid = (idx + right) / 2;
1383       if (set->elems[mid] < elem)
1384 	idx = mid + 1;
1385       else
1386 	right = mid;
1387     }
1388   return set->elems[idx] == elem ? idx + 1 : 0;
1389 }
1390 
1391 static void
1392 internal_function
re_node_set_remove_at(re_node_set * set,Idx idx)1393 re_node_set_remove_at (re_node_set *set, Idx idx)
1394 {
1395   if (idx < 0 || idx >= set->nelem)
1396     return;
1397   --set->nelem;
1398   for (; idx < set->nelem; idx++)
1399     set->elems[idx] = set->elems[idx + 1];
1400 }
1401 
1402 
1403 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1404    Or return REG_MISSING if an error occurred.  */
1405 
1406 static Idx
1407 internal_function
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)1408 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1409 {
1410   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1411     {
1412       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1413       Idx *new_nexts, *new_indices;
1414       re_node_set *new_edests, *new_eclosures;
1415       re_token_t *new_nodes;
1416       size_t max_object_size =
1417 	MAX (sizeof (re_token_t),
1418 	     MAX (sizeof (re_node_set),
1419 		  sizeof (Idx)));
1420 
1421       /* Avoid overflows.  */
1422       if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1423 	return REG_MISSING;
1424 
1425       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1426       if (BE (new_nodes == NULL, 0))
1427 	return REG_MISSING;
1428       dfa->nodes = new_nodes;
1429       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1430       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1431       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1432       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1433       if (BE (new_nexts == NULL || new_indices == NULL
1434 	      || new_edests == NULL || new_eclosures == NULL, 0))
1435 	return REG_MISSING;
1436       dfa->nexts = new_nexts;
1437       dfa->org_indices = new_indices;
1438       dfa->edests = new_edests;
1439       dfa->eclosures = new_eclosures;
1440       dfa->nodes_alloc = new_nodes_alloc;
1441     }
1442   dfa->nodes[dfa->nodes_len] = token;
1443   dfa->nodes[dfa->nodes_len].constraint = 0;
1444 #ifdef RE_ENABLE_I18N
1445   {
1446   int type = token.type;
1447   dfa->nodes[dfa->nodes_len].accept_mb =
1448     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1449   }
1450 #endif
1451   dfa->nexts[dfa->nodes_len] = REG_MISSING;
1452   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1453   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1454   return dfa->nodes_len++;
1455 }
1456 
1457 static inline re_hashval_t
1458 internal_function
calc_state_hash(const re_node_set * nodes,unsigned int context)1459 calc_state_hash (const re_node_set *nodes, unsigned int context)
1460 {
1461   re_hashval_t hash = nodes->nelem + context;
1462   Idx i;
1463   for (i = 0 ; i < nodes->nelem ; i++)
1464     hash += nodes->elems[i];
1465   return hash;
1466 }
1467 
1468 /* Search for the state whose node_set is equivalent to NODES.
1469    Return the pointer to the state, if we found it in the DFA.
1470    Otherwise create the new one and return it.  In case of an error
1471    return NULL and set the error code in ERR.
1472    Note: - We assume NULL as the invalid state, then it is possible that
1473 	   return value is NULL and ERR is REG_NOERROR.
1474 	 - We never return non-NULL value in case of any errors, it is for
1475 	   optimization.  */
1476 
1477 static re_dfastate_t *
1478 internal_function
re_acquire_state(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes)1479 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1480 		  const re_node_set *nodes)
1481 {
1482   re_hashval_t hash;
1483   re_dfastate_t *new_state;
1484   struct re_state_table_entry *spot;
1485   Idx i;
1486 #ifdef lint
1487   /* Suppress bogus uninitialized-variable warnings.  */
1488   *err = REG_NOERROR;
1489 #endif
1490   if (BE (nodes->nelem == 0, 0))
1491     {
1492       *err = REG_NOERROR;
1493       return NULL;
1494     }
1495   hash = calc_state_hash (nodes, 0);
1496   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1497 
1498   for (i = 0 ; i < spot->num ; i++)
1499     {
1500       re_dfastate_t *state = spot->array[i];
1501       if (hash != state->hash)
1502 	continue;
1503       if (re_node_set_compare (&state->nodes, nodes))
1504 	return state;
1505     }
1506 
1507   /* There are no appropriate state in the dfa, create the new one.  */
1508   new_state = create_ci_newstate (dfa, nodes, hash);
1509   if (BE (new_state == NULL, 0))
1510     *err = REG_ESPACE;
1511 
1512   return new_state;
1513 }
1514 
1515 /* Search for the state whose node_set is equivalent to NODES and
1516    whose context is equivalent to CONTEXT.
1517    Return the pointer to the state, if we found it in the DFA.
1518    Otherwise create the new one and return it.  In case of an error
1519    return NULL and set the error code in ERR.
1520    Note: - We assume NULL as the invalid state, then it is possible that
1521 	   return value is NULL and ERR is REG_NOERROR.
1522 	 - We never return non-NULL value in case of any errors, it is for
1523 	   optimization.  */
1524 
1525 static re_dfastate_t *
1526 internal_function
re_acquire_state_context(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)1527 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1528 			  const re_node_set *nodes, unsigned int context)
1529 {
1530   re_hashval_t hash;
1531   re_dfastate_t *new_state;
1532   struct re_state_table_entry *spot;
1533   Idx i;
1534 #ifdef lint
1535   /* Suppress bogus uninitialized-variable warnings.  */
1536   *err = REG_NOERROR;
1537 #endif
1538   if (nodes->nelem == 0)
1539     {
1540       *err = REG_NOERROR;
1541       return NULL;
1542     }
1543   hash = calc_state_hash (nodes, context);
1544   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1545 
1546   for (i = 0 ; i < spot->num ; i++)
1547     {
1548       re_dfastate_t *state = spot->array[i];
1549       if (state->hash == hash
1550 	  && state->context == context
1551 	  && re_node_set_compare (state->entrance_nodes, nodes))
1552 	return state;
1553     }
1554   /* There are no appropriate state in `dfa', create the new one.  */
1555   new_state = create_cd_newstate (dfa, nodes, context, hash);
1556   if (BE (new_state == NULL, 0))
1557     *err = REG_ESPACE;
1558 
1559   return new_state;
1560 }
1561 
1562 /* Finish initialization of the new state NEWSTATE, and using its hash value
1563    HASH put in the appropriate bucket of DFA's state table.  Return value
1564    indicates the error code if failed.  */
1565 
1566 static reg_errcode_t
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,re_hashval_t hash)1567 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1568 		re_hashval_t hash)
1569 {
1570   struct re_state_table_entry *spot;
1571   reg_errcode_t err;
1572   Idx i;
1573 
1574   newstate->hash = hash;
1575   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1576   if (BE (err != REG_NOERROR, 0))
1577     return REG_ESPACE;
1578   for (i = 0; i < newstate->nodes.nelem; i++)
1579     {
1580       Idx elem = newstate->nodes.elems[i];
1581       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1582 	if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1583 	  return REG_ESPACE;
1584     }
1585 
1586   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1587   if (BE (spot->alloc <= spot->num, 0))
1588     {
1589       Idx new_alloc = 2 * spot->num + 2;
1590       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1591 					      new_alloc);
1592       if (BE (new_array == NULL, 0))
1593 	return REG_ESPACE;
1594       spot->array = new_array;
1595       spot->alloc = new_alloc;
1596     }
1597   spot->array[spot->num++] = newstate;
1598   return REG_NOERROR;
1599 }
1600 
1601 static void
free_state(re_dfastate_t * state)1602 free_state (re_dfastate_t *state)
1603 {
1604   re_node_set_free (&state->non_eps_nodes);
1605   re_node_set_free (&state->inveclosure);
1606   if (state->entrance_nodes != &state->nodes)
1607     {
1608       re_node_set_free (state->entrance_nodes);
1609       re_free (state->entrance_nodes);
1610     }
1611   re_node_set_free (&state->nodes);
1612   re_free (state->word_trtable);
1613   re_free (state->trtable);
1614   re_free (state);
1615 }
1616 
1617 /* Create the new state which is independ of contexts.
1618    Return the new state if succeeded, otherwise return NULL.  */
1619 
1620 static re_dfastate_t *
1621 internal_function
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,re_hashval_t hash)1622 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1623 		    re_hashval_t hash)
1624 {
1625   Idx i;
1626   reg_errcode_t err;
1627   re_dfastate_t *newstate;
1628 
1629   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1630   if (BE (newstate == NULL, 0))
1631     return NULL;
1632   err = re_node_set_init_copy (&newstate->nodes, nodes);
1633   if (BE (err != REG_NOERROR, 0))
1634     {
1635       re_free (newstate);
1636       return NULL;
1637     }
1638 
1639   newstate->entrance_nodes = &newstate->nodes;
1640   for (i = 0 ; i < nodes->nelem ; i++)
1641     {
1642       re_token_t *node = dfa->nodes + nodes->elems[i];
1643       re_token_type_t type = node->type;
1644       if (type == CHARACTER && !node->constraint)
1645 	continue;
1646 #ifdef RE_ENABLE_I18N
1647       newstate->accept_mb |= node->accept_mb;
1648 #endif /* RE_ENABLE_I18N */
1649 
1650       /* If the state has the halt node, the state is a halt state.  */
1651       if (type == END_OF_RE)
1652 	newstate->halt = 1;
1653       else if (type == OP_BACK_REF)
1654 	newstate->has_backref = 1;
1655       else if (type == ANCHOR || node->constraint)
1656 	newstate->has_constraint = 1;
1657     }
1658   err = register_state (dfa, newstate, hash);
1659   if (BE (err != REG_NOERROR, 0))
1660     {
1661       free_state (newstate);
1662       newstate = NULL;
1663     }
1664   return newstate;
1665 }
1666 
1667 /* Create the new state which is depend on the context CONTEXT.
1668    Return the new state if succeeded, otherwise return NULL.  */
1669 
1670 static re_dfastate_t *
1671 internal_function
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,re_hashval_t hash)1672 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1673 		    unsigned int context, re_hashval_t hash)
1674 {
1675   Idx i, nctx_nodes = 0;
1676   reg_errcode_t err;
1677   re_dfastate_t *newstate;
1678 
1679   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1680   if (BE (newstate == NULL, 0))
1681     return NULL;
1682   err = re_node_set_init_copy (&newstate->nodes, nodes);
1683   if (BE (err != REG_NOERROR, 0))
1684     {
1685       re_free (newstate);
1686       return NULL;
1687     }
1688 
1689   newstate->context = context;
1690   newstate->entrance_nodes = &newstate->nodes;
1691 
1692   for (i = 0 ; i < nodes->nelem ; i++)
1693     {
1694       re_token_t *node = dfa->nodes + nodes->elems[i];
1695       re_token_type_t type = node->type;
1696       unsigned int constraint = node->constraint;
1697 
1698       if (type == CHARACTER && !constraint)
1699 	continue;
1700 #ifdef RE_ENABLE_I18N
1701       newstate->accept_mb |= node->accept_mb;
1702 #endif /* RE_ENABLE_I18N */
1703 
1704       /* If the state has the halt node, the state is a halt state.  */
1705       if (type == END_OF_RE)
1706 	newstate->halt = 1;
1707       else if (type == OP_BACK_REF)
1708 	newstate->has_backref = 1;
1709 
1710       if (constraint)
1711 	{
1712 	  if (newstate->entrance_nodes == &newstate->nodes)
1713 	    {
1714 	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1715 	      if (BE (newstate->entrance_nodes == NULL, 0))
1716 		{
1717 		  free_state (newstate);
1718 		  return NULL;
1719 		}
1720 	      re_node_set_init_copy (newstate->entrance_nodes, nodes);
1721 	      nctx_nodes = 0;
1722 	      newstate->has_constraint = 1;
1723 	    }
1724 
1725 	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1726 	    {
1727 	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1728 	      ++nctx_nodes;
1729 	    }
1730 	}
1731     }
1732   err = register_state (dfa, newstate, hash);
1733   if (BE (err != REG_NOERROR, 0))
1734     {
1735       free_state (newstate);
1736       newstate = NULL;
1737     }
1738   return  newstate;
1739 }
1740