1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10          New API code Copyright (c) 2016 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 /* This module contains functions that scan a compiled pattern and change
42 repeats into possessive repeats where possible. */
43 
44 
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48 
49 
50 #include "pcre2_internal.h"
51 
52 
53 /*************************************************
54 *        Tables for auto-possessification        *
55 *************************************************/
56 
57 /* This table is used to check whether auto-possessification is possible
58 between adjacent character-type opcodes. The left-hand (repeated) opcode is
59 used to select the row, and the right-hand opcode is use to select the column.
60 A value of 1 means that auto-possessification is OK. For example, the second
61 value in the first row means that \D+\d can be turned into \D++\d.
62 
63 The Unicode property types (\P and \p) have to be present to fill out the table
64 because of what their opcode values are, but the table values should always be
65 zero because property types are handled separately in the code. The last four
66 columns apply to items that cannot be repeated, so there is no need to have
67 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
68 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
69 
70 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
71 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
72 
73 static const uint8_t autoposstab[APTROWS][APTCOLS] = {
74 /* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
75   { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
76   { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
77   { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
78   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
79   { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
80   { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
81   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
82   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
83   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
84   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
85   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
86   { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
87   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
88   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
89   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
90   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
91   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
92 };
93 
94 #ifdef SUPPORT_UNICODE
95 /* This table is used to check whether auto-possessification is possible
96 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
97 left-hand (repeated) opcode is used to select the row, and the right-hand
98 opcode is used to select the column. The values are as follows:
99 
100   0   Always return FALSE (never auto-possessify)
101   1   Character groups are distinct (possessify if both are OP_PROP)
102   2   Check character categories in the same group (general or particular)
103   3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
104 
105   4   Check left general category vs right particular category
106   5   Check right general category vs left particular category
107 
108   6   Left alphanum vs right general category
109   7   Left space vs right general category
110   8   Left word vs right general category
111 
112   9   Right alphanum vs left general category
113  10   Right space vs left general category
114  11   Right word vs left general category
115 
116  12   Left alphanum vs right particular category
117  13   Left space vs right particular category
118  14   Left word vs right particular category
119 
120  15   Right alphanum vs left particular category
121  16   Right space vs left particular category
122  17   Right word vs left particular category
123 */
124 
125 static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = {
126 /* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
127   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
128   { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
129   { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
130   { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
131   { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
132   { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
133   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
134   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
135   { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
136   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
137   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
138 };
139 
140 /* This table is used to check whether auto-possessification is possible
141 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
142 specifies a general category and the other specifies a particular category. The
143 row is selected by the general category and the column by the particular
144 category. The value is 1 if the particular category is not part of the general
145 category. */
146 
147 static const uint8_t catposstab[7][30] = {
148 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
149   { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
150   { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
151   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
152   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
153   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
154   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
155   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
156 };
157 
158 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
159 a general or particular category. The properties in each row are those
160 that apply to the character set in question. Duplication means that a little
161 unnecessary work is done when checking, but this keeps things much simpler
162 because they can all use the same code. For more details see the comment where
163 this table is used.
164 
165 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
166 "space", but from Perl 5.18 it's included, so both categories are treated the
167 same here. */
168 
169 static const uint8_t posspropstab[3][4] = {
170   { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
171   { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
172   { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
173 };
174 #endif  /* SUPPORT_UNICODE */
175 
176 
177 
178 #ifdef SUPPORT_UNICODE
179 /*************************************************
180 *        Check a character and a property        *
181 *************************************************/
182 
183 /* This function is called by compare_opcodes() when a property item is
184 adjacent to a fixed character.
185 
186 Arguments:
187   c            the character
188   ptype        the property type
189   pdata        the data for the type
190   negated      TRUE if it's a negated property (\P or \p{^)
191 
192 Returns:       TRUE if auto-possessifying is OK
193 */
194 
195 static BOOL
check_char_prop(uint32_t c,unsigned int ptype,unsigned int pdata,BOOL negated)196 check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata,
197   BOOL negated)
198 {
199 const uint32_t *p;
200 const ucd_record *prop = GET_UCD(c);
201 
202 switch(ptype)
203   {
204   case PT_LAMP:
205   return (prop->chartype == ucp_Lu ||
206           prop->chartype == ucp_Ll ||
207           prop->chartype == ucp_Lt) == negated;
208 
209   case PT_GC:
210   return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
211 
212   case PT_PC:
213   return (pdata == prop->chartype) == negated;
214 
215   case PT_SC:
216   return (pdata == prop->script) == negated;
217 
218   /* These are specials */
219 
220   case PT_ALNUM:
221   return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
222           PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
223 
224   /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
225   means that Perl space and POSIX space are now identical. PCRE was changed
226   at release 8.34. */
227 
228   case PT_SPACE:    /* Perl space */
229   case PT_PXSPACE:  /* POSIX space */
230   switch(c)
231     {
232     HSPACE_CASES:
233     VSPACE_CASES:
234     return negated;
235 
236     default:
237     return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
238     }
239   break;  /* Control never reaches here */
240 
241   case PT_WORD:
242   return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
243           PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
244           c == CHAR_UNDERSCORE) == negated;
245 
246   case PT_CLIST:
247   p = PRIV(ucd_caseless_sets) + prop->caseset;
248   for (;;)
249     {
250     if (c < *p) return !negated;
251     if (c == *p++) return negated;
252     }
253   break;  /* Control never reaches here */
254   }
255 
256 return FALSE;
257 }
258 #endif  /* SUPPORT_UNICODE */
259 
260 
261 
262 /*************************************************
263 *        Base opcode of repeated opcodes         *
264 *************************************************/
265 
266 /* Returns the base opcode for repeated single character type opcodes. If the
267 opcode is not a repeated character type, it returns with the original value.
268 
269 Arguments:  c opcode
270 Returns:    base opcode for the type
271 */
272 
273 static PCRE2_UCHAR
get_repeat_base(PCRE2_UCHAR c)274 get_repeat_base(PCRE2_UCHAR c)
275 {
276 return (c > OP_TYPEPOSUPTO)? c :
277        (c >= OP_TYPESTAR)?   OP_TYPESTAR :
278        (c >= OP_NOTSTARI)?   OP_NOTSTARI :
279        (c >= OP_NOTSTAR)?    OP_NOTSTAR :
280        (c >= OP_STARI)?      OP_STARI :
281                              OP_STAR;
282 }
283 
284 
285 /*************************************************
286 *        Fill the character property list        *
287 *************************************************/
288 
289 /* Checks whether the code points to an opcode that can take part in auto-
290 possessification, and if so, fills a list with its properties.
291 
292 Arguments:
293   code        points to start of expression
294   utf         TRUE if in UTF mode
295   fcc         points to the case-flipping table
296   list        points to output list
297               list[0] will be filled with the opcode
298               list[1] will be non-zero if this opcode
299                 can match an empty character string
300               list[2..7] depends on the opcode
301 
302 Returns:      points to the start of the next opcode if *code is accepted
303               NULL if *code is not accepted
304 */
305 
306 static PCRE2_SPTR
get_chr_property_list(PCRE2_SPTR code,BOOL utf,const uint8_t * fcc,uint32_t * list)307 get_chr_property_list(PCRE2_SPTR code, BOOL utf, const uint8_t *fcc,
308   uint32_t *list)
309 {
310 PCRE2_UCHAR c = *code;
311 PCRE2_UCHAR base;
312 PCRE2_SPTR end;
313 uint32_t chr;
314 
315 #ifdef SUPPORT_UNICODE
316 uint32_t *clist_dest;
317 const uint32_t *clist_src;
318 #else
319 (void)utf;    /* Suppress "unused parameter" compiler warning */
320 #endif
321 
322 list[0] = c;
323 list[1] = FALSE;
324 code++;
325 
326 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
327   {
328   base = get_repeat_base(c);
329   c -= (base - OP_STAR);
330 
331   if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
332     code += IMM2_SIZE;
333 
334   list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT &&
335              c != OP_POSPLUS);
336 
337   switch(base)
338     {
339     case OP_STAR:
340     list[0] = OP_CHAR;
341     break;
342 
343     case OP_STARI:
344     list[0] = OP_CHARI;
345     break;
346 
347     case OP_NOTSTAR:
348     list[0] = OP_NOT;
349     break;
350 
351     case OP_NOTSTARI:
352     list[0] = OP_NOTI;
353     break;
354 
355     case OP_TYPESTAR:
356     list[0] = *code;
357     code++;
358     break;
359     }
360   c = list[0];
361   }
362 
363 switch(c)
364   {
365   case OP_NOT_DIGIT:
366   case OP_DIGIT:
367   case OP_NOT_WHITESPACE:
368   case OP_WHITESPACE:
369   case OP_NOT_WORDCHAR:
370   case OP_WORDCHAR:
371   case OP_ANY:
372   case OP_ALLANY:
373   case OP_ANYNL:
374   case OP_NOT_HSPACE:
375   case OP_HSPACE:
376   case OP_NOT_VSPACE:
377   case OP_VSPACE:
378   case OP_EXTUNI:
379   case OP_EODN:
380   case OP_EOD:
381   case OP_DOLL:
382   case OP_DOLLM:
383   return code;
384 
385   case OP_CHAR:
386   case OP_NOT:
387   GETCHARINCTEST(chr, code);
388   list[2] = chr;
389   list[3] = NOTACHAR;
390   return code;
391 
392   case OP_CHARI:
393   case OP_NOTI:
394   list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
395   GETCHARINCTEST(chr, code);
396   list[2] = chr;
397 
398 #ifdef SUPPORT_UNICODE
399   if (chr < 128 || (chr < 256 && !utf))
400     list[3] = fcc[chr];
401   else
402     list[3] = UCD_OTHERCASE(chr);
403 #elif defined SUPPORT_WIDE_CHARS
404   list[3] = (chr < 256) ? fcc[chr] : chr;
405 #else
406   list[3] = fcc[chr];
407 #endif
408 
409   /* The othercase might be the same value. */
410 
411   if (chr == list[3])
412     list[3] = NOTACHAR;
413   else
414     list[4] = NOTACHAR;
415   return code;
416 
417 #ifdef SUPPORT_UNICODE
418   case OP_PROP:
419   case OP_NOTPROP:
420   if (code[0] != PT_CLIST)
421     {
422     list[2] = code[0];
423     list[3] = code[1];
424     return code + 2;
425     }
426 
427   /* Convert only if we have enough space. */
428 
429   clist_src = PRIV(ucd_caseless_sets) + code[1];
430   clist_dest = list + 2;
431   code += 2;
432 
433   do {
434      if (clist_dest >= list + 8)
435        {
436        /* Early return if there is not enough space. This should never
437        happen, since all clists are shorter than 5 character now. */
438        list[2] = code[0];
439        list[3] = code[1];
440        return code;
441        }
442      *clist_dest++ = *clist_src;
443      }
444   while(*clist_src++ != NOTACHAR);
445 
446   /* All characters are stored. The terminating NOTACHAR is copied from the
447   clist itself. */
448 
449   list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
450   return code;
451 #endif
452 
453   case OP_NCLASS:
454   case OP_CLASS:
455 #ifdef SUPPORT_WIDE_CHARS
456   case OP_XCLASS:
457   if (c == OP_XCLASS)
458     end = code + GET(code, 0) - 1;
459   else
460 #endif
461     end = code + 32 / sizeof(PCRE2_UCHAR);
462 
463   switch(*end)
464     {
465     case OP_CRSTAR:
466     case OP_CRMINSTAR:
467     case OP_CRQUERY:
468     case OP_CRMINQUERY:
469     case OP_CRPOSSTAR:
470     case OP_CRPOSQUERY:
471     list[1] = TRUE;
472     end++;
473     break;
474 
475     case OP_CRPLUS:
476     case OP_CRMINPLUS:
477     case OP_CRPOSPLUS:
478     end++;
479     break;
480 
481     case OP_CRRANGE:
482     case OP_CRMINRANGE:
483     case OP_CRPOSRANGE:
484     list[1] = (GET2(end, 1) == 0);
485     end += 1 + 2 * IMM2_SIZE;
486     break;
487     }
488   list[2] = (uint32_t)(end - code);
489   return end;
490   }
491 return NULL;    /* Opcode not accepted */
492 }
493 
494 
495 
496 /*************************************************
497 *    Scan further character sets for match       *
498 *************************************************/
499 
500 /* Checks whether the base and the current opcode have a common character, in
501 which case the base cannot be possessified.
502 
503 Arguments:
504   code        points to the byte code
505   utf         TRUE in UTF mode
506   cb          compile data block
507   base_list   the data list of the base opcode
508   base_end    the end of the data list
509   rec_limit   points to recursion depth counter
510 
511 Returns:      TRUE if the auto-possessification is possible
512 */
513 
514 static BOOL
compare_opcodes(PCRE2_SPTR code,BOOL utf,const compile_block * cb,const uint32_t * base_list,PCRE2_SPTR base_end,int * rec_limit)515 compare_opcodes(PCRE2_SPTR code, BOOL utf, const compile_block *cb,
516   const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
517 {
518 PCRE2_UCHAR c;
519 uint32_t list[8];
520 const uint32_t *chr_ptr;
521 const uint32_t *ochr_ptr;
522 const uint32_t *list_ptr;
523 PCRE2_SPTR next_code;
524 #ifdef SUPPORT_WIDE_CHARS
525 PCRE2_SPTR xclass_flags;
526 #endif
527 const uint8_t *class_bitset;
528 const uint8_t *set1, *set2, *set_end;
529 uint32_t chr;
530 BOOL accepted, invert_bits;
531 BOOL entered_a_group = FALSE;
532 
533 if (--(*rec_limit) <= 0) return FALSE;  /* Recursion has gone too deep */
534 
535 /* Note: the base_list[1] contains whether the current opcode has a greedy
536 (represented by a non-zero value) quantifier. This is a different from
537 other character type lists, which store here that the character iterator
538 matches to an empty string (also represented by a non-zero value). */
539 
540 for(;;)
541   {
542   /* All operations move the code pointer forward.
543   Therefore infinite recursions are not possible. */
544 
545   c = *code;
546 
547   /* Skip over callouts */
548 
549   if (c == OP_CALLOUT)
550     {
551     code += PRIV(OP_lengths)[c];
552     continue;
553     }
554 
555   if (c == OP_CALLOUT_STR)
556     {
557     code += GET(code, 1 + 2*LINK_SIZE);
558     continue;
559     }
560 
561   if (c == OP_ALT)
562     {
563     do code += GET(code, 1); while (*code == OP_ALT);
564     c = *code;
565     }
566 
567   switch(c)
568     {
569     case OP_END:
570     case OP_KETRPOS:
571     /* TRUE only in greedy case. The non-greedy case could be replaced by
572     an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
573     uses more memory, which we cannot get at this stage.) */
574 
575     return base_list[1] != 0;
576 
577     case OP_KET:
578     /* If the bracket is capturing, and referenced by an OP_RECURSE, or
579     it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
580     cannot be converted to a possessive form. */
581 
582     if (base_list[1] == 0) return FALSE;
583 
584     switch(*(code - GET(code, 1)))
585       {
586       case OP_ASSERT:
587       case OP_ASSERT_NOT:
588       case OP_ASSERTBACK:
589       case OP_ASSERTBACK_NOT:
590       case OP_ONCE:
591       case OP_ONCE_NC:
592       /* Atomic sub-patterns and assertions can always auto-possessify their
593       last iterator. However, if the group was entered as a result of checking
594       a previous iterator, this is not possible. */
595 
596       return !entered_a_group;
597       }
598 
599     code += PRIV(OP_lengths)[c];
600     continue;
601 
602     case OP_ONCE:
603     case OP_ONCE_NC:
604     case OP_BRA:
605     case OP_CBRA:
606     next_code = code + GET(code, 1);
607     code += PRIV(OP_lengths)[c];
608 
609     while (*next_code == OP_ALT)
610       {
611       if (!compare_opcodes(code, utf, cb, base_list, base_end, rec_limit))
612         return FALSE;
613       code = next_code + 1 + LINK_SIZE;
614       next_code += GET(next_code, 1);
615       }
616 
617     entered_a_group = TRUE;
618     continue;
619 
620     case OP_BRAZERO:
621     case OP_BRAMINZERO:
622 
623     next_code = code + 1;
624     if (*next_code != OP_BRA && *next_code != OP_CBRA
625         && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
626 
627     do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
628 
629     /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
630 
631     next_code += 1 + LINK_SIZE;
632     if (!compare_opcodes(next_code, utf, cb, base_list, base_end, rec_limit))
633       return FALSE;
634 
635     code += PRIV(OP_lengths)[c];
636     continue;
637 
638     default:
639     break;
640     }
641 
642   /* Check for a supported opcode, and load its properties. */
643 
644   code = get_chr_property_list(code, utf, cb->fcc, list);
645   if (code == NULL) return FALSE;    /* Unsupported */
646 
647   /* If either opcode is a small character list, set pointers for comparing
648   characters from that list with another list, or with a property. */
649 
650   if (base_list[0] == OP_CHAR)
651     {
652     chr_ptr = base_list + 2;
653     list_ptr = list;
654     }
655   else if (list[0] == OP_CHAR)
656     {
657     chr_ptr = list + 2;
658     list_ptr = base_list;
659     }
660 
661   /* Character bitsets can also be compared to certain opcodes. */
662 
663   else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
664 #if PCRE2_CODE_UNIT_WIDTH == 8
665       /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
666       || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
667 #endif
668       )
669     {
670 #if PCRE2_CODE_UNIT_WIDTH == 8
671     if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
672 #else
673     if (base_list[0] == OP_CLASS)
674 #endif
675       {
676       set1 = (uint8_t *)(base_end - base_list[2]);
677       list_ptr = list;
678       }
679     else
680       {
681       set1 = (uint8_t *)(code - list[2]);
682       list_ptr = base_list;
683       }
684 
685     invert_bits = FALSE;
686     switch(list_ptr[0])
687       {
688       case OP_CLASS:
689       case OP_NCLASS:
690       set2 = (uint8_t *)
691         ((list_ptr == list ? code : base_end) - list_ptr[2]);
692       break;
693 
694 #ifdef SUPPORT_WIDE_CHARS
695       case OP_XCLASS:
696       xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
697       if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
698       if ((*xclass_flags & XCL_MAP) == 0)
699         {
700         /* No bits are set for characters < 256. */
701         if (list[1] == 0) return TRUE;
702         /* Might be an empty repeat. */
703         continue;
704         }
705       set2 = (uint8_t *)(xclass_flags + 1);
706       break;
707 #endif
708 
709       case OP_NOT_DIGIT:
710       invert_bits = TRUE;
711       /* Fall through */
712       case OP_DIGIT:
713       set2 = (uint8_t *)(cb->cbits + cbit_digit);
714       break;
715 
716       case OP_NOT_WHITESPACE:
717       invert_bits = TRUE;
718       /* Fall through */
719       case OP_WHITESPACE:
720       set2 = (uint8_t *)(cb->cbits + cbit_space);
721       break;
722 
723       case OP_NOT_WORDCHAR:
724       invert_bits = TRUE;
725       /* Fall through */
726       case OP_WORDCHAR:
727       set2 = (uint8_t *)(cb->cbits + cbit_word);
728       break;
729 
730       default:
731       return FALSE;
732       }
733 
734     /* Because the bit sets are unaligned bytes, we need to perform byte
735     comparison here. */
736 
737     set_end = set1 + 32;
738     if (invert_bits)
739       {
740       do
741         {
742         if ((*set1++ & ~(*set2++)) != 0) return FALSE;
743         }
744       while (set1 < set_end);
745       }
746     else
747       {
748       do
749         {
750         if ((*set1++ & *set2++) != 0) return FALSE;
751         }
752       while (set1 < set_end);
753       }
754 
755     if (list[1] == 0) return TRUE;
756     /* Might be an empty repeat. */
757     continue;
758     }
759 
760   /* Some property combinations also acceptable. Unicode property opcodes are
761   processed specially; the rest can be handled with a lookup table. */
762 
763   else
764     {
765     uint32_t leftop, rightop;
766 
767     leftop = base_list[0];
768     rightop = list[0];
769 
770 #ifdef SUPPORT_UNICODE
771     accepted = FALSE; /* Always set in non-unicode case. */
772     if (leftop == OP_PROP || leftop == OP_NOTPROP)
773       {
774       if (rightop == OP_EOD)
775         accepted = TRUE;
776       else if (rightop == OP_PROP || rightop == OP_NOTPROP)
777         {
778         int n;
779         const uint8_t *p;
780         BOOL same = leftop == rightop;
781         BOOL lisprop = leftop == OP_PROP;
782         BOOL risprop = rightop == OP_PROP;
783         BOOL bothprop = lisprop && risprop;
784 
785         /* There's a table that specifies how each combination is to be
786         processed:
787           0   Always return FALSE (never auto-possessify)
788           1   Character groups are distinct (possessify if both are OP_PROP)
789           2   Check character categories in the same group (general or particular)
790           3   Return TRUE if the two opcodes are not the same
791           ... see comments below
792         */
793 
794         n = propposstab[base_list[2]][list[2]];
795         switch(n)
796           {
797           case 0: break;
798           case 1: accepted = bothprop; break;
799           case 2: accepted = (base_list[3] == list[3]) != same; break;
800           case 3: accepted = !same; break;
801 
802           case 4:  /* Left general category, right particular category */
803           accepted = risprop && catposstab[base_list[3]][list[3]] == same;
804           break;
805 
806           case 5:  /* Right general category, left particular category */
807           accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
808           break;
809 
810           /* This code is logically tricky. Think hard before fiddling with it.
811           The posspropstab table has four entries per row. Each row relates to
812           one of PCRE's special properties such as ALNUM or SPACE or WORD.
813           Only WORD actually needs all four entries, but using repeats for the
814           others means they can all use the same code below.
815 
816           The first two entries in each row are Unicode general categories, and
817           apply always, because all the characters they include are part of the
818           PCRE character set. The third and fourth entries are a general and a
819           particular category, respectively, that include one or more relevant
820           characters. One or the other is used, depending on whether the check
821           is for a general or a particular category. However, in both cases the
822           category contains more characters than the specials that are defined
823           for the property being tested against. Therefore, it cannot be used
824           in a NOTPROP case.
825 
826           Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
827           Underscore is covered by ucp_P or ucp_Po. */
828 
829           case 6:  /* Left alphanum vs right general category */
830           case 7:  /* Left space vs right general category */
831           case 8:  /* Left word vs right general category */
832           p = posspropstab[n-6];
833           accepted = risprop && lisprop ==
834             (list[3] != p[0] &&
835              list[3] != p[1] &&
836             (list[3] != p[2] || !lisprop));
837           break;
838 
839           case 9:   /* Right alphanum vs left general category */
840           case 10:  /* Right space vs left general category */
841           case 11:  /* Right word vs left general category */
842           p = posspropstab[n-9];
843           accepted = lisprop && risprop ==
844             (base_list[3] != p[0] &&
845              base_list[3] != p[1] &&
846             (base_list[3] != p[2] || !risprop));
847           break;
848 
849           case 12:  /* Left alphanum vs right particular category */
850           case 13:  /* Left space vs right particular category */
851           case 14:  /* Left word vs right particular category */
852           p = posspropstab[n-12];
853           accepted = risprop && lisprop ==
854             (catposstab[p[0]][list[3]] &&
855              catposstab[p[1]][list[3]] &&
856             (list[3] != p[3] || !lisprop));
857           break;
858 
859           case 15:  /* Right alphanum vs left particular category */
860           case 16:  /* Right space vs left particular category */
861           case 17:  /* Right word vs left particular category */
862           p = posspropstab[n-15];
863           accepted = lisprop && risprop ==
864             (catposstab[p[0]][base_list[3]] &&
865              catposstab[p[1]][base_list[3]] &&
866             (base_list[3] != p[3] || !risprop));
867           break;
868           }
869         }
870       }
871 
872     else
873 #endif  /* SUPPORT_UNICODE */
874 
875     accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
876            rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
877            autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
878 
879     if (!accepted) return FALSE;
880 
881     if (list[1] == 0) return TRUE;
882     /* Might be an empty repeat. */
883     continue;
884     }
885 
886   /* Control reaches here only if one of the items is a small character list.
887   All characters are checked against the other side. */
888 
889   do
890     {
891     chr = *chr_ptr;
892 
893     switch(list_ptr[0])
894       {
895       case OP_CHAR:
896       ochr_ptr = list_ptr + 2;
897       do
898         {
899         if (chr == *ochr_ptr) return FALSE;
900         ochr_ptr++;
901         }
902       while(*ochr_ptr != NOTACHAR);
903       break;
904 
905       case OP_NOT:
906       ochr_ptr = list_ptr + 2;
907       do
908         {
909         if (chr == *ochr_ptr)
910           break;
911         ochr_ptr++;
912         }
913       while(*ochr_ptr != NOTACHAR);
914       if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
915       break;
916 
917       /* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not*
918       set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
919 
920       case OP_DIGIT:
921       if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE;
922       break;
923 
924       case OP_NOT_DIGIT:
925       if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE;
926       break;
927 
928       case OP_WHITESPACE:
929       if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE;
930       break;
931 
932       case OP_NOT_WHITESPACE:
933       if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE;
934       break;
935 
936       case OP_WORDCHAR:
937       if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE;
938       break;
939 
940       case OP_NOT_WORDCHAR:
941       if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE;
942       break;
943 
944       case OP_HSPACE:
945       switch(chr)
946         {
947         HSPACE_CASES: return FALSE;
948         default: break;
949         }
950       break;
951 
952       case OP_NOT_HSPACE:
953       switch(chr)
954         {
955         HSPACE_CASES: break;
956         default: return FALSE;
957         }
958       break;
959 
960       case OP_ANYNL:
961       case OP_VSPACE:
962       switch(chr)
963         {
964         VSPACE_CASES: return FALSE;
965         default: break;
966         }
967       break;
968 
969       case OP_NOT_VSPACE:
970       switch(chr)
971         {
972         VSPACE_CASES: break;
973         default: return FALSE;
974         }
975       break;
976 
977       case OP_DOLL:
978       case OP_EODN:
979       switch (chr)
980         {
981         case CHAR_CR:
982         case CHAR_LF:
983         case CHAR_VT:
984         case CHAR_FF:
985         case CHAR_NEL:
986 #ifndef EBCDIC
987         case 0x2028:
988         case 0x2029:
989 #endif  /* Not EBCDIC */
990         return FALSE;
991         }
992       break;
993 
994       case OP_EOD:    /* Can always possessify before \z */
995       break;
996 
997 #ifdef SUPPORT_UNICODE
998       case OP_PROP:
999       case OP_NOTPROP:
1000       if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
1001             list_ptr[0] == OP_NOTPROP))
1002         return FALSE;
1003       break;
1004 #endif
1005 
1006       case OP_NCLASS:
1007       if (chr > 255) return FALSE;
1008       /* Fall through */
1009 
1010       case OP_CLASS:
1011       if (chr > 255) break;
1012       class_bitset = (uint8_t *)
1013         ((list_ptr == list ? code : base_end) - list_ptr[2]);
1014       if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
1015       break;
1016 
1017 #ifdef SUPPORT_WIDE_CHARS
1018       case OP_XCLASS:
1019       if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
1020           list_ptr[2] + LINK_SIZE, utf)) return FALSE;
1021       break;
1022 #endif
1023 
1024       default:
1025       return FALSE;
1026       }
1027 
1028     chr_ptr++;
1029     }
1030   while(*chr_ptr != NOTACHAR);
1031 
1032   /* At least one character must be matched from this opcode. */
1033 
1034   if (list[1] == 0) return TRUE;
1035   }
1036 
1037 /* Control never reaches here. There used to be a fail-save return FALSE; here,
1038 but some compilers complain about an unreachable statement. */
1039 }
1040 
1041 
1042 
1043 /*************************************************
1044 *    Scan compiled regex for auto-possession     *
1045 *************************************************/
1046 
1047 /* Replaces single character iterations with their possessive alternatives
1048 if appropriate. This function modifies the compiled opcode! Hitting a
1049 non-existant opcode may indicate a bug in PCRE2, but it can also be caused if a
1050 bad UTF string was compiled with PCRE2_NO_UTF_CHECK.
1051 
1052 Arguments:
1053   code        points to start of the byte code
1054   utf         TRUE in UTF mode
1055   cb          compile data block
1056 
1057 Returns:      0 for success
1058               -1 if a non-existant opcode is encountered
1059 */
1060 
1061 int
PRIV(auto_possessify)1062 PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb)
1063 {
1064 register PCRE2_UCHAR c;
1065 PCRE2_SPTR end;
1066 PCRE2_UCHAR *repeat_opcode;
1067 uint32_t list[8];
1068 int rec_limit;
1069 
1070 for (;;)
1071   {
1072   c = *code;
1073 
1074   if (c > OP_TABLE_LENGTH) return -1;   /* Something gone wrong */
1075 
1076   if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
1077     {
1078     c -= get_repeat_base(c) - OP_STAR;
1079     end = (c <= OP_MINUPTO) ?
1080       get_chr_property_list(code, utf, cb->fcc, list) : NULL;
1081     list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
1082 
1083     rec_limit = 1000;
1084     if (end != NULL && compare_opcodes(end, utf, cb, list, end, &rec_limit))
1085       {
1086       switch(c)
1087         {
1088         case OP_STAR:
1089         *code += OP_POSSTAR - OP_STAR;
1090         break;
1091 
1092         case OP_MINSTAR:
1093         *code += OP_POSSTAR - OP_MINSTAR;
1094         break;
1095 
1096         case OP_PLUS:
1097         *code += OP_POSPLUS - OP_PLUS;
1098         break;
1099 
1100         case OP_MINPLUS:
1101         *code += OP_POSPLUS - OP_MINPLUS;
1102         break;
1103 
1104         case OP_QUERY:
1105         *code += OP_POSQUERY - OP_QUERY;
1106         break;
1107 
1108         case OP_MINQUERY:
1109         *code += OP_POSQUERY - OP_MINQUERY;
1110         break;
1111 
1112         case OP_UPTO:
1113         *code += OP_POSUPTO - OP_UPTO;
1114         break;
1115 
1116         case OP_MINUPTO:
1117         *code += OP_POSUPTO - OP_MINUPTO;
1118         break;
1119         }
1120       }
1121     c = *code;
1122     }
1123   else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
1124     {
1125 #ifdef SUPPORT_WIDE_CHARS
1126     if (c == OP_XCLASS)
1127       repeat_opcode = code + GET(code, 1);
1128     else
1129 #endif
1130       repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
1131 
1132     c = *repeat_opcode;
1133     if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
1134       {
1135       /* end must not be NULL. */
1136       end = get_chr_property_list(code, utf, cb->fcc, list);
1137 
1138       list[1] = (c & 1) == 0;
1139 
1140       rec_limit = 1000;
1141       if (compare_opcodes(end, utf, cb, list, end, &rec_limit))
1142         {
1143         switch (c)
1144           {
1145           case OP_CRSTAR:
1146           case OP_CRMINSTAR:
1147           *repeat_opcode = OP_CRPOSSTAR;
1148           break;
1149 
1150           case OP_CRPLUS:
1151           case OP_CRMINPLUS:
1152           *repeat_opcode = OP_CRPOSPLUS;
1153           break;
1154 
1155           case OP_CRQUERY:
1156           case OP_CRMINQUERY:
1157           *repeat_opcode = OP_CRPOSQUERY;
1158           break;
1159 
1160           case OP_CRRANGE:
1161           case OP_CRMINRANGE:
1162           *repeat_opcode = OP_CRPOSRANGE;
1163           break;
1164           }
1165         }
1166       }
1167     c = *code;
1168     }
1169 
1170   switch(c)
1171     {
1172     case OP_END:
1173     return 0;
1174 
1175     case OP_TYPESTAR:
1176     case OP_TYPEMINSTAR:
1177     case OP_TYPEPLUS:
1178     case OP_TYPEMINPLUS:
1179     case OP_TYPEQUERY:
1180     case OP_TYPEMINQUERY:
1181     case OP_TYPEPOSSTAR:
1182     case OP_TYPEPOSPLUS:
1183     case OP_TYPEPOSQUERY:
1184     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1185     break;
1186 
1187     case OP_TYPEUPTO:
1188     case OP_TYPEMINUPTO:
1189     case OP_TYPEEXACT:
1190     case OP_TYPEPOSUPTO:
1191     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
1192       code += 2;
1193     break;
1194 
1195     case OP_CALLOUT_STR:
1196     code += GET(code, 1 + 2*LINK_SIZE);
1197     break;
1198 
1199 #ifdef SUPPORT_WIDE_CHARS
1200     case OP_XCLASS:
1201     code += GET(code, 1);
1202     break;
1203 #endif
1204 
1205     case OP_MARK:
1206     case OP_PRUNE_ARG:
1207     case OP_SKIP_ARG:
1208     case OP_THEN_ARG:
1209     code += code[1];
1210     break;
1211     }
1212 
1213   /* Add in the fixed length from the table */
1214 
1215   code += PRIV(OP_lengths)[c];
1216 
1217   /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be
1218   followed by a multi-byte character. The length in the table is a minimum, so
1219   we have to arrange to skip the extra code units. */
1220 
1221 #ifdef MAYBE_UTF_MULTI
1222   if (utf) switch(c)
1223     {
1224     case OP_CHAR:
1225     case OP_CHARI:
1226     case OP_NOT:
1227     case OP_NOTI:
1228     case OP_STAR:
1229     case OP_MINSTAR:
1230     case OP_PLUS:
1231     case OP_MINPLUS:
1232     case OP_QUERY:
1233     case OP_MINQUERY:
1234     case OP_UPTO:
1235     case OP_MINUPTO:
1236     case OP_EXACT:
1237     case OP_POSSTAR:
1238     case OP_POSPLUS:
1239     case OP_POSQUERY:
1240     case OP_POSUPTO:
1241     case OP_STARI:
1242     case OP_MINSTARI:
1243     case OP_PLUSI:
1244     case OP_MINPLUSI:
1245     case OP_QUERYI:
1246     case OP_MINQUERYI:
1247     case OP_UPTOI:
1248     case OP_MINUPTOI:
1249     case OP_EXACTI:
1250     case OP_POSSTARI:
1251     case OP_POSPLUSI:
1252     case OP_POSQUERYI:
1253     case OP_POSUPTOI:
1254     case OP_NOTSTAR:
1255     case OP_NOTMINSTAR:
1256     case OP_NOTPLUS:
1257     case OP_NOTMINPLUS:
1258     case OP_NOTQUERY:
1259     case OP_NOTMINQUERY:
1260     case OP_NOTUPTO:
1261     case OP_NOTMINUPTO:
1262     case OP_NOTEXACT:
1263     case OP_NOTPOSSTAR:
1264     case OP_NOTPOSPLUS:
1265     case OP_NOTPOSQUERY:
1266     case OP_NOTPOSUPTO:
1267     case OP_NOTSTARI:
1268     case OP_NOTMINSTARI:
1269     case OP_NOTPLUSI:
1270     case OP_NOTMINPLUSI:
1271     case OP_NOTQUERYI:
1272     case OP_NOTMINQUERYI:
1273     case OP_NOTUPTOI:
1274     case OP_NOTMINUPTOI:
1275     case OP_NOTEXACTI:
1276     case OP_NOTPOSSTARI:
1277     case OP_NOTPOSPLUSI:
1278     case OP_NOTPOSQUERYI:
1279     case OP_NOTPOSUPTOI:
1280     if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
1281     break;
1282     }
1283 #else
1284   (void)(utf);  /* Keep compiler happy by referencing function argument */
1285 #endif  /* SUPPORT_WIDE_CHARS */
1286   }
1287 }
1288 
1289 /* End of pcre2_auto_possess.c */
1290