1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9            Copyright (c) 1997-2014 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15     * Redistributions of source code must retain the above copyright notice,
16       this list of conditions and the following disclaimer.
17 
18     * Redistributions in binary form must reproduce the above copyright
19       notice, this list of conditions and the following disclaimer in the
20       documentation and/or other materials provided with the distribution.
21 
22     * Neither the name of the University of Cambridge nor the names of its
23       contributors may be used to endorse or promote products derived from
24       this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43 
44 
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48 
49 #define NLBLOCK cd             /* Name of the block containing newline information */
50 #define PSSTART start_pattern  /* Name of the field containing the pattern start */
51 #define PSEND   end_pattern    /* Name of the field containing the pattern end; all three are defined before pcre_internal.h is included (presumably for macros declared there -- confirm) */
52 
53 #include "pcre_internal.h"
54 
55 
56 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. We do not need to select pcre16_printint.c specially, because the
59 COMPILE_PCREx macro will already be appropriately set. */
60 
61 #ifdef PCRE_DEBUG
62 /* pcre_printint.c should not include any headers */
63 #define PCRE_INCLUDED
64 #include "pcre_printint.c"
65 #undef PCRE_INCLUDED
66 #endif
67 
68 
/* Macro for setting individual bits in class bitmaps: sets bit number b in the
byte-array bitmap a. The bitmap argument and the whole expansion are
parenthesized so that an argument such as "ptr + 4" indexes the intended byte
and the macro can be used wherever a single expression is expected. Note that b
is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
72 
73 /* Maximum length value to check against when making sure that the integer that
74 holds the compiled pattern length does not overflow. We make it a bit less than
75 INT_MAX (20 less) to allow for adding in group terminating bytes, so that we
76 don't have to check them every time. */
77 
78 #define OFLOW_MAX (INT_MAX - 20)
79 
80 /* Forward declarations (prototypes only) of two static functions, to allow mutual recursion */
81 
82 static int
83   add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84     const pcre_uint32 *, unsigned int);
85 
86 static BOOL
87   compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88     pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89     compile_data *, int *);
90 
91 
92 
93 /*************************************************
94 *      Code parameters and static tables         *
95 *************************************************/
96 
97 /* This value specifies the size of stack workspace that is used during the
98 first pre-compile phase that determines how much memory is required. The regex
99 is partly compiled into this space, but the compiled parts are discarded as
100 soon as they can be, so that hopefully there will never be an overrun. The code
101 does, however, check for an overrun. The largest amount I've seen used is 218,
102 so this number is very generous.
103 
104 The same workspace is used during the second, actual compile phase for
105 remembering forward references to groups so that they can be filled in at the
106 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107 is 4 there is plenty of room for most patterns. However, the memory can get
108 filled up by repetitions of forward references, for example patterns like
109 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110 that the workspace is expanded using malloc() in this situation. The value
111 below is therefore a minimum, and we put a maximum (100 times the minimum) on
112 it for safety. The minimum is now also defined in terms of LINK_SIZE so that the
113 use of malloc() kicks in at the same number of forward references in all cases. */
114 
115 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)  /* Minimum (initial) workspace size */
116 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)  /* Upper limit for the expanded workspace */
117 
118 /* This value determines the size of the initial vector that is used for
119 remembering named groups during the pre-compile. It is allocated on the stack,
120 but if it is too small, it is expanded using malloc(), in a similar way to the
121 workspace. The value is the number of slots in the list, not a byte count. */
122 
123 #define NAMED_GROUP_LIST_SIZE  20
124 
125 /* The overrun tests check for a slightly smaller size so that they detect the
126 overrun before it actually does run off the end of the data block. Presumably this is the amount by which the tested size is reduced; NOTE(review): its units are not visible here -- confirm at the use sites. */
127 
128 #define WORK_SIZE_SAFETY_MARGIN (100)
129 
130 /* Private flags added to firstchar and reqchar: two single-bit flags (bits 0 and 1), followed by two negative special values. */
131 
132 #define REQ_CASELESS    (1 << 0)        /* Indicates caselessness */
133 #define REQ_VARY        (1 << 1)        /* Reqchar followed non-literal item */
134 /* Negative values for the firstchar and reqchar flags */
135 #define REQ_UNSET       (-2)            /* presumably "not yet determined" -- confirm at the use sites */
136 #define REQ_NONE        (-1)            /* presumably "known to have no such character" -- confirm at the use sites */
137 
/* Repeated character flags. The constant has type long and a single bit set
(bit 28). The suffix is written as an upper-case L because a lower-case l is
easily misread as the digit 1; the value and type are unchanged. */

#define UTF_LENGTH     0x10000000L      /* The char contains its length. */
141 
142 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
143 are simple data values; negative values are for special things like \d and so
144 on. Zero means further processing is needed (for things like \x), or the escape
145 is invalid. */
146 
147 #ifndef EBCDIC
148 
149 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
150 in UTF-8 mode. Entries are in character-code order from '0' to 'z', two per line; the literal 7 is the entry for 'a'. */
151 
152 static const short int escapes[] = {
153      0,                       0,
154      0,                       0,
155      0,                       0,
156      0,                       0,
157      0,                       0,
158      CHAR_COLON,              CHAR_SEMICOLON,
159      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
160      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
161      CHAR_COMMERCIAL_AT,      -ESC_A,
162      -ESC_B,                  -ESC_C,
163      -ESC_D,                  -ESC_E,
164      0,                       -ESC_G,
165      -ESC_H,                  0,
166      0,                       -ESC_K,
167      0,                       0,
168      -ESC_N,                  0,
169      -ESC_P,                  -ESC_Q,
170      -ESC_R,                  -ESC_S,
171      0,                       0,
172      -ESC_V,                  -ESC_W,
173      -ESC_X,                  0,
174      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
175      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
176      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
177      CHAR_GRAVE_ACCENT,       7,
178      -ESC_b,                  0,
179      -ESC_d,                  ESC_e,
180      ESC_f,                   0,
181      -ESC_h,                  0,
182      0,                       -ESC_k,
183      0,                       0,
184      ESC_n,                   0,
185      -ESC_p,                  0,
186      ESC_r,                   -ESC_s,
187      ESC_tee,                 0,
188      -ESC_v,                  -ESC_w,
189      0,                       0,
190      -ESC_z
191 };
192 
193 #else
194 
195 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. The row labels are in hexadecimal and advance by 8, matching the eight entries in each row. */
196 
197 static const short int escapes[] = {
198 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
199 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
200 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
201 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
202 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
203 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
204 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
205 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
206 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
207 /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
208 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
209 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
210 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
211 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
212 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
213 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
214 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
215 /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
216 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
217 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
218 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
219 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
220 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
221 };
222 #endif
223 
224 
225 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
226 searched linearly. Put all the names into a single string, in order to reduce
227 the number of relocations when a shared library is dynamically linked. The
228 string is built from string macros so that it works in UTF-8 mode on EBCDIC
229 platforms. Each verbitem describes one verb; the names themselves are held separately in verbnames. */
230 
231 typedef struct verbitem {
232   int   len;                 /* Length of verb name */
233   int   op;                  /* Op when no arg, or -1 if arg mandatory */
234   int   op_arg;              /* Op when arg present, or -1 if not allowed */
235 } verbitem;
236 
237 static const char verbnames[] =  /* All verb names in one string, in the same order as the entries of verbs[] */
238   "\0"                       /* Empty name is a shorthand for MARK */
239   STRING_MARK0
240   STRING_ACCEPT0
241   STRING_COMMIT0
242   STRING_F0
243   STRING_FAIL0
244   STRING_PRUNE0
245   STRING_SKIP0
246   STRING_THEN;               /* Last name; presumably the ...0 macros above embed a trailing NUL while this one relies on the literal's own terminator -- confirm */
247 
248 static const verbitem verbs[] = {       /* One entry per name in verbnames, in the same order */
249   { 0, -1,        OP_MARK },            /* empty name: argument mandatory */
250   { 4, -1,        OP_MARK },            /* MARK: argument mandatory */
251   { 6, OP_ACCEPT, -1 },                 /* ACCEPT: argument not allowed */
252   { 6, OP_COMMIT, -1 },                 /* COMMIT: argument not allowed */
253   { 1, OP_FAIL,   -1 },                 /* F: argument not allowed */
254   { 4, OP_FAIL,   -1 },                 /* FAIL: argument not allowed */
255   { 5, OP_PRUNE,  OP_PRUNE_ARG },       /* PRUNE: argument optional */
256   { 4, OP_SKIP,   OP_SKIP_ARG  },       /* SKIP: argument optional */
257   { 4, OP_THEN,   OP_THEN_ARG  }        /* THEN: argument optional */
258 };
259 
260 static const int verbcount = sizeof(verbs)/sizeof(verbitem);  /* Number of entries in verbs[] */
261 
262 
263 /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
264 another regex library. The start-of-word substitute spells \b(?=\w) as a zero-terminated pattern fragment. */
265 
266 static const pcre_uchar sub_start_of_word[] = {
267   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
268   CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
269 
270 static const pcre_uchar sub_end_of_word[] = {  /* Spells \b(?<=\w), zero-terminated */
271   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
272   CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
273   CHAR_RIGHT_PARENTHESIS, '\0' };
274 
275 
276 /* Tables of names of POSIX character classes and their lengths. The names are
277 now all in a single string, to reduce the number of relocations when a shared
278 library is dynamically loaded. The list of lengths is terminated by a zero
279 length entry. The first three must be alpha, lower, upper, as this is assumed
280 for handling case independence. The indices for graph, print, and punct are
281 needed, so identify them. There are fourteen names, alpha through xdigit. */
282 
283 static const char posix_names[] =
284   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
285   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
286   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
287   STRING_word0  STRING_xdigit;
288 
289 static const pcre_uint8 posix_name_lengths[] = {  /* Name lengths in posix_names order */
290   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };  /* alpha..space are 5, word is 4, xdigit is 6; the final 0 terminates the list */
291 
292 #define PC_GRAPH  8   /* Zero-based position of graph in posix_names */
293 #define PC_PRINT  9   /* Zero-based position of print in posix_names */
294 #define PC_PUNCT 10   /* Zero-based position of punct in posix_names */
295 
296 
297 /* Table of class bit maps for each POSIX class. Each class is formed from a
298 base map, with an optional addition or removal of another map. Then, for some
299 classes, there is some additional tweaking: for [:blank:] the vertical space
300 characters are removed, and for [:alpha:] and [:alnum:] the underscore
301 character is removed. The triples in the table consist of the base map offset,
302 second map offset or -1 if no second map, and a non-negative value for map
303 addition or a negative value for map subtraction (if there are two maps). The
304 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
305 remove vertical space characters, 2 => remove underscore. There is one triple per class, in the same order as posix_names. */
306 
307 static const int posix_class_maps[] = {
308   cbit_word,  cbit_digit, -2,             /* alpha */
309   cbit_lower, -1,          0,             /* lower */
310   cbit_upper, -1,          0,             /* upper */
311   cbit_word,  -1,          2,             /* alnum - word without underscore */
312   cbit_print, cbit_cntrl,  0,             /* ascii */
313   cbit_space, -1,          1,             /* blank - a GNU extension */
314   cbit_cntrl, -1,          0,             /* cntrl */
315   cbit_digit, -1,          0,             /* digit */
316   cbit_graph, -1,          0,             /* graph */
317   cbit_print, -1,          0,             /* print */
318   cbit_punct, -1,          0,             /* punct */
319   cbit_space, -1,          0,             /* space */
320   cbit_word,  -1,          0,             /* word - a Perl extension */
321   cbit_xdigit,-1,          0              /* xdigit */
322 };
323 
324 /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
325 Unicode property escapes. */
326 
327 #ifdef SUPPORT_UCP
328 static const pcre_uchar string_PNd[]  = {
329   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
330   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };           /* spells \P{Nd} */
331 static const pcre_uchar string_pNd[]  = {
332   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
333   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };           /* spells \p{Nd} */
334 static const pcre_uchar string_PXsp[] = {
335   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
336   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };   /* spells \P{Xsp} */
337 static const pcre_uchar string_pXsp[] = {
338   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
339   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };   /* spells \p{Xsp} */
340 static const pcre_uchar string_PXwd[] = {
341   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
342   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };   /* spells \P{Xwd} */
343 static const pcre_uchar string_pXwd[] = {
344   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
345   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };   /* spells \p{Xwd} */
346 
347 static const pcre_uchar *substitutes[] = {  /* Replacement strings in the order \D, \d, \S, \s, \W, \w; NOTE(review): how callers compute the index is not visible here */
348   string_PNd,           /* \D */
349   string_pNd,           /* \d */
350   string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
351   string_pXsp,          /* \s */   /* space and POSIX space are the same. */
352   string_PXwd,          /* \W */
353   string_pXwd           /* \w */
354 };
355 
356 /* The POSIX class substitutes must be in the order of the POSIX class names,
357 defined above, and there are both positive and negative cases. NULL means no
358 general substitute of a Unicode property escape (\p or \P). However, for some
359 POSIX classes (e.g. graph, print, punct) a special property code is compiled
360 directly. */
361 
362 static const pcre_uchar string_pL[] =   {
363   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
364   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };                   /* spells \p{L} */
365 static const pcre_uchar string_pLl[] =  {
366   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
367   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };           /* spells \p{Ll} */
368 static const pcre_uchar string_pLu[] =  {
369   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
370   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };           /* spells \p{Lu} */
371 static const pcre_uchar string_pXan[] = {
372   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
373   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };   /* spells \p{Xan} */
374 static const pcre_uchar string_h[] =    {
375   CHAR_BACKSLASH, CHAR_h, '\0' };                             /* spells \h */
376 static const pcre_uchar string_pXps[] = {
377   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
378   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };   /* spells \p{Xps} */
379 static const pcre_uchar string_PL[] =   {
380   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
381   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };                   /* spells \P{L} */
382 static const pcre_uchar string_PLl[] =  {
383   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
384   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };           /* spells \P{Ll} */
385 static const pcre_uchar string_PLu[] =  {
386   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
387   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };           /* spells \P{Lu} */
388 static const pcre_uchar string_PXan[] = {
389   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
390   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };   /* spells \P{Xan} */
391 static const pcre_uchar string_H[] =    {
392   CHAR_BACKSLASH, CHAR_H, '\0' };                             /* spells \H */
393 static const pcre_uchar string_PXps[] = {
394   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
395   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };   /* spells \P{Xps} */
396 
397 static const pcre_uchar *posix_substitutes[] = {  /* 14 positive entries in posix_names order, then the 14 negated entries in the same order */
398   string_pL,            /* alpha */
399   string_pLl,           /* lower */
400   string_pLu,           /* upper */
401   string_pXan,          /* alnum */
402   NULL,                 /* ascii */
403   string_h,             /* blank */
404   NULL,                 /* cntrl */
405   string_pNd,           /* digit */
406   NULL,                 /* graph */
407   NULL,                 /* print */
408   NULL,                 /* punct */
409   string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
410   string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
411   NULL,                 /* xdigit */
412   /* Negated cases */
413   string_PL,            /* ^alpha */
414   string_PLl,           /* ^lower */
415   string_PLu,           /* ^upper */
416   string_PXan,          /* ^alnum */
417   NULL,                 /* ^ascii */
418   string_H,             /* ^blank */
419   NULL,                 /* ^cntrl */
420   string_PNd,           /* ^digit */
421   NULL,                 /* ^graph */
422   NULL,                 /* ^print */
423   NULL,                 /* ^punct */
424   string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
425   string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
426   NULL                  /* ^xdigit */
427 };
428 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))  /* Number of entries in posix_substitutes (positive and negated together) */
429 #endif
430 
431 #define STRING(a)  # a       /* Turns the argument text, exactly as written, into a string literal */
432 #define XSTRING(s) STRING(s) /* Expands macro s first, then stringizes the result (e.g. the value of MAX_NAME_SIZE) */
433 
434 /* The texts of compile-time error messages. These are "char *" because they
435 are passed to the outside world. Do not ever re-use any error number, because
436 they are documented. Always add a new error instead. Messages marked DEAD below
437 are no longer used. This used to be a table of strings, but in order to reduce
438 the number of relocations needed when a shared library is loaded dynamically,
439 it is now one long string. We cannot use a table of offsets, because the
440 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
441 simply count through to the one we want - this isn't a performance issue
442 because these strings are used only when there is a compilation error.
443 
444 Each substring ends with \0 to insert a null character. This includes the final
445 substring, so that the whole string ends with \0\0, which can be detected when
446 counting through. Each numeric marker comment in the list gives the error number of the message that follows it. */
447 
448 static const char error_texts[] =
449   "no error\0"
450   "\\ at end of pattern\0"
451   "\\c at end of pattern\0"
452   "unrecognized character follows \\\0"
453   "numbers out of order in {} quantifier\0"
454   /* 5 */
455   "number too big in {} quantifier\0"
456   "missing terminating ] for character class\0"
457   "invalid escape sequence in character class\0"
458   "range out of order in character class\0"
459   "nothing to repeat\0"
460   /* 10 */
461   "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
462   "internal error: unexpected repeat\0"
463   "unrecognized character after (? or (?-\0"
464   "POSIX named classes are supported only within a class\0"
465   "missing )\0"
466   /* 15 */
467   "reference to non-existent subpattern\0"
468   "erroffset passed as NULL\0"
469   "unknown option bit(s) set\0"
470   "missing ) after comment\0"
471   "parentheses nested too deeply\0"  /** DEAD **/
472   /* 20 */
473   "regular expression is too large\0"
474   "failed to get memory\0"
475   "unmatched parentheses\0"
476   "internal error: code overflow\0"
477   "unrecognized character after (?<\0"
478   /* 25 */
479   "lookbehind assertion is not fixed length\0"
480   "malformed number or name after (?(\0"
481   "conditional group contains more than two branches\0"
482   "assertion expected after (?(\0"
483   "(?R or (?[+-]digits must be followed by )\0"
484   /* 30 */
485   "unknown POSIX class name\0"
486   "POSIX collating elements are not supported\0"
487   "this version of PCRE is compiled without UTF support\0"
488   "spare error\0"  /** DEAD **/
489   "character value in \\x{} or \\o{} is too large\0"
490   /* 35 */
491   "invalid condition (?(0)\0"
492   "\\C not allowed in lookbehind assertion\0"
493   "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
494   "number after (?C is > 255\0"
495   "closing ) for (?C expected\0"
496   /* 40 */
497   "recursive call could loop indefinitely\0"
498   "unrecognized character after (?P\0"
499   "syntax error in subpattern name (missing terminator)\0"
500   "two named subpatterns have the same name\0"
501   "invalid UTF-8 string\0"
502   /* 45 */
503   "support for \\P, \\p, and \\X has not been compiled\0"
504   "malformed \\P or \\p sequence\0"
505   "unknown property name after \\P or \\p\0"
506   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
507   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
508   /* 50 */
509   "repeated subpattern is too long\0"    /** DEAD **/
510   "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
511   "internal error: overran compiling workspace\0"
512   "internal error: previously-checked referenced subpattern not found\0"
513   "DEFINE group contains more than one branch\0"
514   /* 55 */
515   "repeating a DEFINE group is not allowed\0"  /** DEAD **/
516   "inconsistent NEWLINE options\0"
517   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
518   "a numbered reference must not be zero\0"
519   "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
520   /* 60 */
521   "(*VERB) not recognized or malformed\0"
522   "number is too big\0"
523   "subpattern name expected\0"
524   "digit expected after (?+\0"
525   "] is an invalid data character in JavaScript compatibility mode\0"
526   /* 65 */
527   "different names for subpatterns of the same number are not allowed\0"
528   "(*MARK) must have an argument\0"
529   "this version of PCRE is not compiled with Unicode property support\0"
530   "\\c must be followed by an ASCII character\0"
531   "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
532   /* 70 */
533   "internal error: unknown opcode in find_fixedlength()\0"
534   "\\N is not supported in a class\0"
535   "too many forward references\0"
536   "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
537   "invalid UTF-16 string\0"
538   /* 75 */
539   "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
540   "character value in \\u.... sequence is too large\0"
541   "invalid UTF-32 string\0"
542   "setting UTF is disabled by the application\0"
543   "non-hex character in \\x{} (closing brace missing?)\0"
544   /* 80 */
545   "non-octal character in \\o{} (closing brace missing?)\0"
546   "missing opening brace after \\o\0"
547   "parentheses are too deeply nested\0"
548   "invalid range in character class\0"
549   "group name must start with a non-digit\0"
550   /* 85 */
551   "parentheses are too deeply nested (stack check)\0"
552   "digits missing in \\x{} or \\o{}\0"
553   ;
554 
555 /* Table to identify digits and hex digits. This is used when compiling
556 patterns. Note that the tables in chartables are dependent on the locale, and
557 may mark arbitrary characters as digits - but the PCRE compiling code expects
558 to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
559 a private table here. It costs 256 bytes, but it is a lot faster than doing
560 character value tests (at least in some simple cases I timed), and in some
561 applications one wants PCRE to compile efficiently as well as match
562 efficiently.
563 
564 For convenience, we use the same bit definitions as in chartables:
565 
566   0x04   decimal digit
567   0x08   hexadecimal digit
568 
569 Then we can use ctype_digit and ctype_xdigit in the code. */
570 
571 /* Using a simple comparison for decimal numbers rather than a memory read
572 is much faster, and the resulting code is simpler (the compiler turns it
573 into a subtraction and unsigned comparison). Note that the argument is evaluated twice, so it must not have side effects. */
574 
575 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
576 
577 #ifndef EBCDIC
578 
579 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
580 UTF-8 mode. Entries are 0x0c (decimal and hexadecimal digit) for 0-9, 0x08 (hexadecimal digit only) for A-F and a-f, and zero for every other code. */
581 
582 static const pcre_uint8 digitab[] =
583   {
584   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
585   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
586   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
587   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
588   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
589   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
590   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
591   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
592   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
593   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
594   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
595   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
596   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
597   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
598   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
599   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
600   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
601   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
602   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
603   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
604   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
605   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
606   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
607   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
608   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
609   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
610   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
611   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
612   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
613   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
614   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
615   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
616 
617 #else
618 
619 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. The values have the same meaning as in the ASCII table (0x0c for 0-9, 0x08 for the hexadecimal letters); the rightmost column of the row comments is the hexadecimal code of the first entry of every second row. */
620 
621 static const pcre_uint8 digitab[] =
622   {
623   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
624   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
625   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
626   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
627   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
628   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
629   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
630   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
631   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
632   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
633   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
634   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
635   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
636   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
637   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
638   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
639   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
640   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
641   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
642   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
643   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
644   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
645   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
646   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
647   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
648   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
649   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
650   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
651   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
652   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
653   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
654   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
655 
656 static const pcre_uint8 ebcdic_chartab[] = { /* Partial duplicate of a chartables table, indexed by EBCDIC code. NOTE(review): the meaning of the bits (0x01, 0x02, 0x04, 0x08, 0x10, 0x80) is not defined in this part of the file -- presumably the ctype bits used by chartables; confirm */
657   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
658   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
659   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
660   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
661   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
662   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
663   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
664   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
665   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
666   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
667   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
668   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
669   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
670   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
671   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
672   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
673   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
674   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
675   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
676   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
677   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
678   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
679   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
680   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
681   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
682   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
683   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
684   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
685   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
686   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
687   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
688   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
689 #endif
690 
691 
/* This table is used to check whether auto-possessification is possible
between adjacent character-type opcodes. The left-hand (repeated) opcode is
used to select the row, and the right-hand opcode is used to select the column.
695 A value of 1 means that auto-possessification is OK. For example, the second
696 value in the first row means that \D+\d can be turned into \D++\d.
697 
698 The Unicode property types (\P and \p) have to be present to fill out the table
699 because of what their opcode values are, but the table values should always be
700 zero because property types are handled separately in the code. The last four
701 columns apply to items that cannot be repeated, so there is no need to have
702 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
703 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
704 
/* Table dimensions are derived from the opcode numbering: both axes start at
FIRST_AUTOTAB_OP; rows stop at LAST_AUTOTAB_LEFT_OP and columns stop at
LAST_AUTOTAB_RIGHT_OP. As laid out below, that is 17 rows (\D to \X) and 21
columns (\D to $M). */

#define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
#define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)

/* Entry [left][right] is 1 when the repeated left-hand item can be made
possessive in front of the right-hand item, and 0 otherwise. The \P and \p
rows and columns are all zero. */

static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
/* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
  { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
  { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
  { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
  { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
  { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
  { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
};
728 
729 
730 /* This table is used to check whether auto-possessification is possible
731 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
732 left-hand (repeated) opcode is used to select the row, and the right-hand
733 opcode is used to select the column. The values are as follows:
734 
735   0   Always return FALSE (never auto-possessify)
736   1   Character groups are distinct (possessify if both are OP_PROP)
737   2   Check character categories in the same group (general or particular)
738   3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
739 
740   4   Check left general category vs right particular category
741   5   Check right general category vs left particular category
742 
743   6   Left alphanum vs right general category
744   7   Left space vs right general category
745   8   Left word vs right general category
746 
747   9   Right alphanum vs left general category
748  10   Right space vs left general category
749  11   Right word vs left general category
750 
751  12   Left alphanum vs right particular category
752  13   Left space vs right particular category
753  14   Left word vs right particular category
754 
755  15   Right alphanum vs left particular category
756  16   Right space vs left particular category
757  17   Right word vs left particular category
758 */
759 
/* Reading aid: the check codes come in mirrored pairs. Where [row][col] holds
4, 6, 7, 8, 12, 13 or 14, the transposed entry [col][row] holds 5, 9, 10, 11,
15, 16 or 17 respectively, i.e. the same check with left and right swapped.
The PT_SPACE and PT_PXSPACE rows are identical, as are the SPACE and PXSPACE
columns. */
static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
/* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
  { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
  { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
  { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
  { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
  { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
  { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
  { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
  { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
  { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
  { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
  { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
};
774 
775 /* This table is used to check whether auto-possessification is possible
776 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
777 specifies a general category and the other specifies a particular category. The
778 row is selected by the general category and the column by the particular
779 category. The value is 1 if the particular category is not part of the general
780 category. */
781 
/* Layout: 7 rows, one per general category (C, L, M, N, P, S, Z), and 30
columns, one per particular category in the order shown in the heading line.
Each row is 0 exactly in the columns whose two-letter name begins with the
row's letter, and 1 everywhere else.
NOTE(review): the row and column order presumably matches the numeric values
of the ucp_ general and particular category constants - confirm against their
definitions before reordering anything. */
static const pcre_uint8 catposstab[7][30] = {
/* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
  { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
  { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
};
792 
793 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
794 a general or particular category. The properties in each row are those
795 that apply to the character set in question. Duplication means that a little
796 unnecessary work is done when checking, but this keeps things much simpler
797 because they can all use the same code. For more details see the comment where
798 this table is used.
799 
800 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
801 "space", but from Perl 5.18 it's included, so both categories are treated the
802 same here. */
803 
/* Layout: one row per character set (row 0 ALNUM, row 1 SPACE and PXSPACE,
row 2 WORD), with four ucp_ property values per row.
NOTE(review): the first three values in each row look like general categories
and the fourth like a particular category - confirm against the code that
reads this table. */
static const pcre_uint8 posspropstab[3][4] = {
  { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
  { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
  { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
};
809 
810 /* This table is used when converting repeating opcodes into possessified
811 versions as a result of an explicit possessive quantifier such as ++. A zero
812 value means there is no possessified version - in those cases the item in
813 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
814 because all relevant opcodes are less than that. */
815 
/* Reading aid: only the greedy repeat opcodes (STAR, PLUS, QUERY and
UPTO/RANGE in each opcode family) have a non-zero entry, which is the matching
possessive opcode. Minimizing repeats, EXACT repeats, opcodes that are already
possessive, and all non-repeat opcodes map to zero.
NOTE(review): the table is presumably indexed directly by opcode value, so the
entries must stay in opcode-number order - confirm against the opcode list. */
static const pcre_uint8 opcode_possessify[] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */

  0,                       /* NOTI */
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
  0,                       /* EXACT */
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */

  OP_POSSTARI, 0,          /* STARI, MINSTARI */
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
  0,                       /* EXACTI */
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
  0,                       /* NOTEXACT */
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */

  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
  0,                       /* NOTEXACTI */
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
  0,                       /* TYPEEXACT */
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */

  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */

  0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
  0, 0,                    /* REF, REFI */
  0, 0,                    /* DNREF, DNREFI */
  0, 0                     /* RECURSE, CALLOUT */
};
867 
868 
869 
870 /*************************************************
871 *            Find an error text                  *
872 *************************************************/
873 
874 /* The error texts are now all in one long string, to save on relocations. As
875 some of the text is of unknown length, we can't use a table of offsets.
876 Instead, just count through the strings. This is not a performance issue
877 because it happens only when there has been a compilation error.
878 
879 Argument:   the error number
880 Returns:    pointer to the error string
881 */
882 
883 static const char *
find_error_text(int n)884 find_error_text(int n)
885 {
886 const char *s = error_texts;
887 for (; n > 0; n--)
888   {
889   while (*s++ != CHAR_NULL) {};
890   if (*s == CHAR_NULL) return "Error text not found (please report)";
891   }
892 return s;
893 }
894 
895 
896 
897 /*************************************************
898 *           Expand the workspace                 *
899 *************************************************/
900 
901 /* This function is called during the second compiling phase, if the number of
902 forward references fills the existing workspace, which is originally a block on
903 the stack. A larger block is obtained from malloc() unless the ultimate limit
904 has been reached or the increase will be rather small.
905 
906 Argument: pointer to the compile data block
907 Returns:  0 if all went well, else an error number
908 */
909 
910 static int
expand_workspace(compile_data * cd)911 expand_workspace(compile_data *cd)
912 {
913 pcre_uchar *newspace;
914 int newsize = cd->workspace_size * 2;
915 
916 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
917 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
918     newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
919  return ERR72;
920 
921 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
922 if (newspace == NULL) return ERR21;
923 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
924 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
925 if (cd->workspace_size > COMPILE_WORK_SIZE)
926   (PUBL(free))((void *)cd->start_workspace);
927 cd->start_workspace = newspace;
928 cd->workspace_size = newsize;
929 return 0;
930 }
931 
932 
933 
934 /*************************************************
935 *            Check for counted repeat            *
936 *************************************************/
937 
938 /* This function is called when a '{' is encountered in a place where it might
939 start a quantifier. It looks ahead to see if it really is a quantifier or not.
940 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
941 where the ddds are digits.
942 
943 Arguments:
944   p         pointer to the first char after '{'
945 
946 Returns:    TRUE or FALSE
947 */
948 
949 static BOOL
is_counted_repeat(const pcre_uchar * p)950 is_counted_repeat(const pcre_uchar *p)
951 {
952 if (!IS_DIGIT(*p)) return FALSE;
953 p++;
954 while (IS_DIGIT(*p)) p++;
955 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
956 
957 if (*p++ != CHAR_COMMA) return FALSE;
958 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
959 
960 if (!IS_DIGIT(*p)) return FALSE;
961 p++;
962 while (IS_DIGIT(*p)) p++;
963 
964 return (*p == CHAR_RIGHT_CURLY_BRACKET);
965 }
966 
967 
968 
969 /*************************************************
970 *            Handle escapes                      *
971 *************************************************/
972 
/* This function is called when a \ has been encountered. It either returns a
positive value for a special escape sequence such as \d, or 0 for a data
character, which is placed in chptr. A backreference to group n is returned as
negative n. When UTF-8 is enabled, a data character value greater than 255 may
be returned in chptr. On entry, ptr is pointing at the \. On exit, it is on the
final character of the escape sequence.
979 
980 Arguments:
981   ptrptr         points to the pattern position pointer
982   chptr          points to a returned data character
983   errorcodeptr   points to the errorcode variable
984   bracount       number of previous extracting brackets
985   options        the options bits
986   isclass        TRUE if inside a character class
987 
988 Returns:         zero => a data character
989                  positive => a special escape sequence
990                  negative => a back reference
991                  on error, errorcodeptr is set
992 */
993 
/* Decode the escape sequence whose backslash *ptrptr points at. The outcome is
delivered three ways: the return value (escape code, or negated group number),
*chptr (the data character value), and *errorcodeptr (written only when an
error is detected). *ptrptr is updated to the last character consumed. */
static int
check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
  int bracount, int options, BOOL isclass)
{
/* PCRE_UTF16 has the same value as PCRE_UTF8. */
BOOL utf = (options & PCRE_UTF8) != 0;
const pcre_uchar *ptr = *ptrptr + 1;
pcre_uint32 c;
int escape = 0;
int i;

GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
ptr--;                            /* Set pointer back to the last byte */

/* If backslash is at the end of the pattern, it's an error. */

if (c == CHAR_NULL) *errorcodeptr = ERR1;

/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
in a table. A non-zero result is something that can be returned immediately.
Otherwise further processing may be required. A positive table value is a data
character (stored in c); a negative one is a negated escape code. */

#ifndef EBCDIC  /* ASCII/UTF-8 coding */
/* Not alphanumeric */
else if (c < CHAR_0 || c > CHAR_z) {}
else if ((i = escapes[c - CHAR_0]) != 0)
  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }

#else           /* EBCDIC coding */
/* Not alphanumeric */
else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
else if ((i = escapes[c - 0x48]) != 0)  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
#endif

/* Escapes that need further processing, or are illegal. This branch is reached
only when the table lookup above stored zero in i; the octal (\0...) and
two-digit hex (\xhh) cases below rely on that when they use i as a digit
counter without resetting it. */

else
  {
  const pcre_uchar *oldptr;
  BOOL braced, negated, overflow;
  int s;

  switch (c)
    {
    /* A number of Perl escapes are not handled by PCRE. We give an explicit
    error. */

    case CHAR_l:
    case CHAR_L:
    *errorcodeptr = ERR37;
    break;

    case CHAR_u:
    if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
      {
      /* In JavaScript, \u must be followed by four hexadecimal numbers.
      Otherwise it is a lowercase u letter. */
      if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
        && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
        && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
        && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
        {
        c = 0;
        for (i = 0; i < 4; ++i)
          {
          register pcre_uint32 cc = *(++ptr);
#ifndef EBCDIC  /* ASCII/UTF-8 coding */
          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
#else           /* EBCDIC coding */
          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
#endif
          }

        /* Reject values that do not fit the code unit width / UTF range. */

#if defined COMPILE_PCRE8
        if (c > (utf ? 0x10ffffU : 0xffU))
#elif defined COMPILE_PCRE16
        if (c > (utf ? 0x10ffffU : 0xffffU))
#elif defined COMPILE_PCRE32
        if (utf && c > 0x10ffffU)
#endif
          {
          *errorcodeptr = ERR76;
          }
        else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
        }
      }
    else
      *errorcodeptr = ERR37;
    break;

    case CHAR_U:
    /* In JavaScript, \U is an uppercase U letter. */
    if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
    break;

    /* In a character class, \g is just a literal "g". Outside a character
    class, \g must be followed by one of a number of specific things:

    (1) A number, either plain or braced. If positive, it is an absolute
    backreference. If negative, it is a relative backreference. This is a Perl
    5.10 feature.

    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
    is part of Perl's movement towards a unified syntax for back references. As
    this is synonymous with \k{name}, we fudge it up by pretending it really
    was \k.

    (3) For Oniguruma compatibility we also support \g followed by a name or a
    number either in angle brackets or in single quotes. However, these are
    (possibly recursive) subroutine calls, _not_ backreferences. Just return
    the ESC_g code (cf \k). */

    case CHAR_g:
    if (isclass) break;
    if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
      {
      escape = ESC_g;
      break;
      }

    /* Handle the Perl-compatible cases */

    if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
      {
      /* Scan the braced text: if anything other than '-' or a digit appears
      before the closing brace, treat it as a name and hand it on as \k. */
      const pcre_uchar *p;
      for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
        if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
      if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
        {
        escape = ESC_k;
        break;
        }
      braced = TRUE;
      ptr++;
      }
    else braced = FALSE;

    if (ptr[1] == CHAR_MINUS)
      {
      negated = TRUE;
      ptr++;
      }
    else negated = FALSE;

    /* The integer range is limited by the machine's int representation. The
    guard below is conservative: it stops accumulating while s * 10 + 9 is
    still certain to fit in an int. */
    s = 0;
    overflow = FALSE;
    while (IS_DIGIT(ptr[1]))
      {
      if (s > INT_MAX / 10 - 1) /* Integer overflow */
        {
        overflow = TRUE;
        break;
        }
      s = s * 10 + (int)(*(++ptr) - CHAR_0);
      }
    if (overflow) /* Integer overflow */
      {
      /* Skip the remaining digits so ptr ends on the last one. */
      while (IS_DIGIT(ptr[1]))
        ptr++;
      *errorcodeptr = ERR61;
      break;
      }

    if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
      {
      *errorcodeptr = ERR57;
      break;
      }

    if (s == 0)
      {
      *errorcodeptr = ERR58;
      break;
      }

    if (negated)
      {
      if (s > bracount)
        {
        *errorcodeptr = ERR15;
        break;
        }
      /* Convert the relative reference into an absolute group number: a
      relative value of 1 maps to bracount. */
      s = bracount - (s - 1);
      }

    escape = -s;
    break;

    /* The handling of escape sequences consisting of a string of digits
    starting with one that is not zero is not straightforward. Perl has changed
    over the years. Nowadays \g{} for backreferences and \o{} for octal are
    recommended to avoid the ambiguities in the old syntax.

    Outside a character class, the digits are read as a decimal number. If the
    number is less than 8 (used to be 10), or if there are that many previous
    extracting left brackets, then it is a back reference. Otherwise, up to
    three octal digits are read to form an escaped byte. Thus \123 is likely to
    be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
    the octal value is greater than 377, the least significant 8 bits are
    taken. \8 and \9 are treated as the literal characters 8 and 9.

    Inside a character class, \ followed by a digit is always either a literal
    8 or 9 or an octal number. */

    case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
    case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:

    if (!isclass)
      {
      oldptr = ptr;
      /* The integer range is limited by the machine's int representation. */
      s = (int)(c -CHAR_0);
      overflow = FALSE;
      while (IS_DIGIT(ptr[1]))
        {
        if (s > INT_MAX / 10 - 1) /* Integer overflow */
          {
          overflow = TRUE;
          break;
          }
        s = s * 10 + (int)(*(++ptr) - CHAR_0);
        }
      if (overflow) /* Integer overflow */
        {
        while (IS_DIGIT(ptr[1]))
          ptr++;
        *errorcodeptr = ERR61;
        break;
        }
      if (s < 8 || s <= bracount)  /* Check for back reference */
        {
        escape = -s;
        break;
        }
      ptr = oldptr;      /* Put the pointer back and fall through */
      }

    /* Handle a digit following \ when the number is not a back reference. If
    the first digit is 8 or 9, Perl used to generate a binary zero byte and
    then treat the digit as a following literal. At least by Perl 5.18 this
    changed so as not to insert the binary zero. */

    if ((c = *ptr) >= CHAR_8) break;

    /* Fall through with a digit less than 8 */

    /* \0 always starts an octal number, but we may drop through to here with a
    larger first octal digit. The original code used just to take the least
    significant 8 bits of octal numbers (I think this is what early Perls used
    to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
    but no more than 3 octal digits. The counter i is still zero on arrival
    (nothing above in this switch path changes it), so the loop reads at most
    two further digits. */

    case CHAR_0:
    c -= CHAR_0;
    while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
        c = c * 8 + *(++ptr) - CHAR_0;
#ifdef COMPILE_PCRE8
    if (!utf && c > 0xff) *errorcodeptr = ERR51;
#endif
    break;

    /* \o is a relatively new Perl feature, supporting a more general way of
    specifying character codes in octal. The only supported form is \o{ddd}. */

    case CHAR_o:
    if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
    if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
      {
      ptr += 2;
      c = 0;
      overflow = FALSE;
      while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
        {
        register pcre_uint32 cc = *ptr++;
        if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
#ifdef COMPILE_PCRE32
        /* Shifting left by 3 would lose the top bits of a 32-bit value. */
        if (c >= 0x20000000l) { overflow = TRUE; break; }
#endif
        c = (c << 3) + cc - CHAR_0 ;
#if defined COMPILE_PCRE8
        if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
#elif defined COMPILE_PCRE16
        if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
#elif defined COMPILE_PCRE32
        if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
#endif
        }
      if (overflow)
        {
        while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
        *errorcodeptr = ERR34;
        }
      else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
        {
        if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
        }
      else *errorcodeptr = ERR80;
      }
    break;

    /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
    numbers. Otherwise it is a lowercase x letter. */

    case CHAR_x:
    if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
      {
      if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
        && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
        {
        c = 0;
        for (i = 0; i < 2; ++i)
          {
          register pcre_uint32 cc = *(++ptr);
#ifndef EBCDIC  /* ASCII/UTF-8 coding */
          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
#else           /* EBCDIC coding */
          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
#endif
          }
        }
      }    /* End JavaScript handling */

    /* Handle \x in Perl's style. \x{ddd} is a character number which can be
    greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
    digits. If not, { used to be treated as a data character. However, Perl
    seems to read hex digits up to the first non-such, and ignore the rest, so
    that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
    now gives an error. */

    else
      {
      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
        {
        ptr += 2;
        if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
          {
          *errorcodeptr = ERR86;
          break;
          }
        c = 0;
        overflow = FALSE;
        while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
          {
          register pcre_uint32 cc = *ptr++;
          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */

#ifdef COMPILE_PCRE32
          /* Shifting left by 4 would lose the top bits of a 32-bit value. */
          if (c >= 0x10000000l) { overflow = TRUE; break; }
#endif

#ifndef EBCDIC  /* ASCII/UTF-8 coding */
          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
#else           /* EBCDIC coding */
          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
#endif

#if defined COMPILE_PCRE8
          if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
#elif defined COMPILE_PCRE16
          if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
#elif defined COMPILE_PCRE32
          if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
#endif
          }

        if (overflow)
          {
          while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
          *errorcodeptr = ERR34;
          }

        else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
          {
          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
          }

        /* If the sequence of hex digits does not end with '}', give an error.
        We used just to recognize this construct and fall through to the normal
        \x handling, but nowadays Perl gives an error, which seems much more
        sensible, so we do too. */

        else *errorcodeptr = ERR79;
        }   /* End of \x{} processing */

      /* Read a single-byte hex-defined char (up to two hex digits after \x).
      The counter i is still zero here, as for the octal case above. */

      else
        {
        c = 0;
        while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
          {
          pcre_uint32 cc;                          /* Some compilers don't like */
          cc = *(++ptr);                           /* ++ in initializers */
#ifndef EBCDIC  /* ASCII/UTF-8 coding */
          if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
          c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
#else           /* EBCDIC coding */
          if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
          c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
#endif
          }
        }     /* End of \xdd handling */
      }       /* End of Perl-style \x handling */
    break;

    /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
    An error is given if the byte following \c is not an ASCII character. This
    coding is ASCII-specific, but then the whole concept of \cx is
    ASCII-specific. (However, an EBCDIC equivalent has now been added.) */

    case CHAR_c:
    c = *(++ptr);
    if (c == CHAR_NULL)
      {
      *errorcodeptr = ERR2;
      break;
      }
#ifndef EBCDIC    /* ASCII/UTF-8 coding */
    if (c > 127)  /* Excludes all non-ASCII in either mode */
      {
      *errorcodeptr = ERR68;
      break;
      }
    if (c >= CHAR_a && c <= CHAR_z) c -= 32;
    c ^= 0x40;
#else             /* EBCDIC coding */
    if (c >= CHAR_a && c <= CHAR_z) c += 64;
    c ^= 0xC0;
#endif
    break;

    /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
    other alphanumeric following \ is an error if PCRE_EXTRA was set;
    otherwise, for Perl compatibility, it is a literal. This code looks a bit
    odd, but there used to be some cases other than the default, and there may
    be again in future, so I haven't "optimized" it. */

    default:
    if ((options & PCRE_EXTRA) != 0) switch(c)
      {
      default:
      *errorcodeptr = ERR3;
      break;
      }
    break;
    }
  }

/* Perl supports \N{name} for character names, as well as plain \N for "not
newline". PCRE does not support \N{name}. However, it does support
quantification such as \N{2,3}. */

if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
     !is_counted_repeat(ptr+2))
  *errorcodeptr = ERR37;

/* If PCRE_UCP is set, we change the values for \d etc. */

if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
  escape += (ESC_DU - ESC_D);

/* Set the pointer to the final character before returning. */

*ptrptr = ptr;
*chptr = c;
return escape;
}
1468 
1469 
1470 
1471 #ifdef SUPPORT_UCP
1472 /*************************************************
1473 *               Handle \P and \p                 *
1474 *************************************************/
1475 
1476 /* This function is called after \P or \p has been encountered, provided that
1477 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1478 pointing at the P or p. On exit, it is pointing at the final character of the
1479 escape sequence.
1480 
1481 Argument:
1482   ptrptr         points to the pattern position pointer
1483   negptr         points to a boolean that is set TRUE for negation else FALSE
1484   ptypeptr       points to an unsigned int that is set to the type value
1485   pdataptr       points to an unsigned int that is set to the detailed property value
1486   errorcodeptr   points to the error code variable
1487 
1488 Returns:         TRUE if the type value was found, or FALSE for an invalid type
1489 */
1490 
1491 static BOOL
get_ucp(const pcre_uchar ** ptrptr,BOOL * negptr,unsigned int * ptypeptr,unsigned int * pdataptr,int * errorcodeptr)1492 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1493   unsigned int *pdataptr, int *errorcodeptr)
1494 {
1495 pcre_uchar c;
1496 int i, bot, top;
1497 const pcre_uchar *ptr = *ptrptr;
1498 pcre_uchar name[32];
1499 
1500 c = *(++ptr);
1501 if (c == CHAR_NULL) goto ERROR_RETURN;
1502 
1503 *negptr = FALSE;
1504 
1505 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1506 negation. */
1507 
1508 if (c == CHAR_LEFT_CURLY_BRACKET)
1509   {
1510   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1511     {
1512     *negptr = TRUE;
1513     ptr++;
1514     }
1515   for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1516     {
1517     c = *(++ptr);
1518     if (c == CHAR_NULL) goto ERROR_RETURN;
1519     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1520     name[i] = c;
1521     }
1522   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1523   name[i] = 0;
1524   }
1525 
1526 /* Otherwise there is just one following character */
1527 
1528 else
1529   {
1530   name[0] = c;
1531   name[1] = 0;
1532   }
1533 
1534 *ptrptr = ptr;
1535 
1536 /* Search for a recognized property name using binary chop */
1537 
1538 bot = 0;
1539 top = PRIV(utt_size);
1540 
1541 while (bot < top)
1542   {
1543   int r;
1544   i = (bot + top) >> 1;
1545   r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1546   if (r == 0)
1547     {
1548     *ptypeptr = PRIV(utt)[i].type;
1549     *pdataptr = PRIV(utt)[i].value;
1550     return TRUE;
1551     }
1552   if (r > 0) bot = i + 1; else top = i;
1553   }
1554 
1555 *errorcodeptr = ERR47;
1556 *ptrptr = ptr;
1557 return FALSE;
1558 
1559 ERROR_RETURN:
1560 *errorcodeptr = ERR46;
1561 *ptrptr = ptr;
1562 return FALSE;
1563 }
1564 #endif
1565 
1566 
1567 
1568 /*************************************************
1569 *         Read repeat counts                     *
1570 *************************************************/
1571 
1572 /* Read an item of the form {n,m} and return the values. This is called only
1573 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1574 so the syntax is guaranteed to be correct, but we need to check the values.
1575 
1576 Arguments:
1577   p              pointer to first char after '{'
1578   minp           pointer to int for min
1579   maxp           pointer to int for max
1580                  returned as -1 if no max
1581   errorcodeptr   points to error code variable
1582 
1583 Returns:         pointer to '}' on success;
1584                  current ptr on error, with errorcodeptr set non-zero
1585 */
1586 
1587 static const pcre_uchar *
read_repeat_counts(const pcre_uchar * p,int * minp,int * maxp,int * errorcodeptr)1588 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1589 {
1590 int min = 0;
1591 int max = -1;
1592 
1593 while (IS_DIGIT(*p))
1594   {
1595   min = min * 10 + (int)(*p++ - CHAR_0);
1596   if (min > 65535)
1597     {
1598     *errorcodeptr = ERR5;
1599     return p;
1600     }
1601   }
1602 
1603 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1604   {
1605   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1606     {
1607     max = 0;
1608     while(IS_DIGIT(*p))
1609       {
1610       max = max * 10 + (int)(*p++ - CHAR_0);
1611       if (max > 65535)
1612         {
1613         *errorcodeptr = ERR5;
1614         return p;
1615         }
1616       }
1617     if (max < min)
1618       {
1619       *errorcodeptr = ERR4;
1620       return p;
1621       }
1622     }
1623   }
1624 
1625 *minp = min;
1626 *maxp = max;
1627 return p;
1628 }
1629 
1630 
1631 
1632 /*************************************************
1633 *      Find first significant op code            *
1634 *************************************************/
1635 
1636 /* This is called by several functions that scan a compiled expression looking
1637 for a fixed first character, or an anchoring op code etc. It skips over things
1638 that do not influence this. For some calls, it makes sense to skip negative
1639 forward and all backward assertions, and also the \b assertion; for others it
1640 does not.
1641 
1642 Arguments:
1643   code         pointer to the start of the group
1644   skipassert   TRUE if certain assertions are to be skipped
1645 
1646 Returns:       pointer to the first significant opcode
1647 */
1648 
1649 static const pcre_uchar*
first_significant_code(const pcre_uchar * code,BOOL skipassert)1650 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1651 {
1652 for (;;)
1653   {
1654   switch ((int)*code)
1655     {
1656     case OP_ASSERT_NOT:
1657     case OP_ASSERTBACK:
1658     case OP_ASSERTBACK_NOT:
1659     if (!skipassert) return code;
1660     do code += GET(code, 1); while (*code == OP_ALT);
1661     code += PRIV(OP_lengths)[*code];
1662     break;
1663 
1664     case OP_WORD_BOUNDARY:
1665     case OP_NOT_WORD_BOUNDARY:
1666     if (!skipassert) return code;
1667     /* Fall through */
1668 
1669     case OP_CALLOUT:
1670     case OP_CREF:
1671     case OP_DNCREF:
1672     case OP_RREF:
1673     case OP_DNRREF:
1674     case OP_DEF:
1675     code += PRIV(OP_lengths)[*code];
1676     break;
1677 
1678     default:
1679     return code;
1680     }
1681   }
1682 /* Control never reaches here */
1683 }
1684 
1685 
1686 
1687 /*************************************************
1688 *        Find the fixed length of a branch       *
1689 *************************************************/
1690 
1691 /* Scan a branch and compute the fixed length of subject that will match it,
1692 if the length is fixed. This is needed for dealing with backward assertions.
1693 In UTF8 mode, the result is in characters rather than bytes. The branch is
1694 temporarily terminated with OP_END when this function is called.
1695 
1696 This function is called when a backward assertion is encountered, so that if it
1697 fails, the error message can point to the correct place in the pattern.
1698 However, we cannot do this when the assertion contains subroutine calls,
1699 because they can be forward references. We solve this by remembering this case
1700 and doing the check at the end; a flag specifies which mode we are running in.
1701 
1702 Arguments:
1703   code     points to the start of the pattern (the bracket)
1704   utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
1705   atend    TRUE if called when the pattern is complete
1706   cd       the "compile data" structure
1707 
1708 Returns:   the fixed length,
1709              or -1 if there is no fixed length,
1710              or -2 if \C was encountered (in UTF-8 mode only)
1711              or -3 if an OP_RECURSE item was encountered and atend is FALSE
1712              or -4 if an unknown opcode was encountered (internal error)
1713 */
1714 
1715 static int
find_fixedlength(pcre_uchar * code,BOOL utf,BOOL atend,compile_data * cd)1716 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1717 {
1718 int length = -1;
1719 
1720 register int branchlength = 0;
1721 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1722 
1723 /* Scan along the opcodes for this branch. If we get to the end of the
1724 branch, check the length against that of the other branches. */
1725 
1726 for (;;)
1727   {
1728   int d;
1729   pcre_uchar *ce, *cs;
1730   register pcre_uchar op = *cc;
1731 
1732   switch (op)
1733     {
1734     /* We only need to continue for OP_CBRA (normal capturing bracket) and
1735     OP_BRA (normal non-capturing bracket) because the other variants of these
1736     opcodes are all concerned with unlimited repeated groups, which of course
1737     are not of fixed length. */
1738 
1739     case OP_CBRA:
1740     case OP_BRA:
1741     case OP_ONCE:
1742     case OP_ONCE_NC:
1743     case OP_COND:
1744     d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1745     if (d < 0) return d;
1746     branchlength += d;
1747     do cc += GET(cc, 1); while (*cc == OP_ALT);
1748     cc += 1 + LINK_SIZE;
1749     break;
1750 
1751     /* Reached end of a branch; if it's a ket it is the end of a nested call.
1752     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1753     an ALT. If it is END it's the end of the outer call. All can be handled by
1754     the same code. Note that we must not include the OP_KETRxxx opcodes here,
1755     because they all imply an unlimited repeat. */
1756 
1757     case OP_ALT:
1758     case OP_KET:
1759     case OP_END:
1760     case OP_ACCEPT:
1761     case OP_ASSERT_ACCEPT:
1762     if (length < 0) length = branchlength;
1763       else if (length != branchlength) return -1;
1764     if (*cc != OP_ALT) return length;
1765     cc += 1 + LINK_SIZE;
1766     branchlength = 0;
1767     break;
1768 
1769     /* A true recursion implies not fixed length, but a subroutine call may
1770     be OK. If the subroutine is a forward reference, we can't deal with
1771     it until the end of the pattern, so return -3. */
1772 
1773     case OP_RECURSE:
1774     if (!atend) return -3;
1775     cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1776     do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1777     if (cc > cs && cc < ce) return -1;                    /* Recursion */
1778     d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1779     if (d < 0) return d;
1780     branchlength += d;
1781     cc += 1 + LINK_SIZE;
1782     break;
1783 
1784     /* Skip over assertive subpatterns */
1785 
1786     case OP_ASSERT:
1787     case OP_ASSERT_NOT:
1788     case OP_ASSERTBACK:
1789     case OP_ASSERTBACK_NOT:
1790     do cc += GET(cc, 1); while (*cc == OP_ALT);
1791     cc += PRIV(OP_lengths)[*cc];
1792     break;
1793 
1794     /* Skip over things that don't match chars */
1795 
1796     case OP_MARK:
1797     case OP_PRUNE_ARG:
1798     case OP_SKIP_ARG:
1799     case OP_THEN_ARG:
1800     cc += cc[1] + PRIV(OP_lengths)[*cc];
1801     break;
1802 
1803     case OP_CALLOUT:
1804     case OP_CIRC:
1805     case OP_CIRCM:
1806     case OP_CLOSE:
1807     case OP_COMMIT:
1808     case OP_CREF:
1809     case OP_DEF:
1810     case OP_DNCREF:
1811     case OP_DNRREF:
1812     case OP_DOLL:
1813     case OP_DOLLM:
1814     case OP_EOD:
1815     case OP_EODN:
1816     case OP_FAIL:
1817     case OP_NOT_WORD_BOUNDARY:
1818     case OP_PRUNE:
1819     case OP_REVERSE:
1820     case OP_RREF:
1821     case OP_SET_SOM:
1822     case OP_SKIP:
1823     case OP_SOD:
1824     case OP_SOM:
1825     case OP_THEN:
1826     case OP_WORD_BOUNDARY:
1827     cc += PRIV(OP_lengths)[*cc];
1828     break;
1829 
1830     /* Handle literal characters */
1831 
1832     case OP_CHAR:
1833     case OP_CHARI:
1834     case OP_NOT:
1835     case OP_NOTI:
1836     branchlength++;
1837     cc += 2;
1838 #ifdef SUPPORT_UTF
1839     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1840 #endif
1841     break;
1842 
1843     /* Handle exact repetitions. The count is already in characters, but we
1844     need to skip over a multibyte character in UTF8 mode.  */
1845 
1846     case OP_EXACT:
1847     case OP_EXACTI:
1848     case OP_NOTEXACT:
1849     case OP_NOTEXACTI:
1850     branchlength += (int)GET2(cc,1);
1851     cc += 2 + IMM2_SIZE;
1852 #ifdef SUPPORT_UTF
1853     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1854 #endif
1855     break;
1856 
1857     case OP_TYPEEXACT:
1858     branchlength += GET2(cc,1);
1859     if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1860       cc += 2;
1861     cc += 1 + IMM2_SIZE + 1;
1862     break;
1863 
1864     /* Handle single-char matchers */
1865 
1866     case OP_PROP:
1867     case OP_NOTPROP:
1868     cc += 2;
1869     /* Fall through */
1870 
1871     case OP_HSPACE:
1872     case OP_VSPACE:
1873     case OP_NOT_HSPACE:
1874     case OP_NOT_VSPACE:
1875     case OP_NOT_DIGIT:
1876     case OP_DIGIT:
1877     case OP_NOT_WHITESPACE:
1878     case OP_WHITESPACE:
1879     case OP_NOT_WORDCHAR:
1880     case OP_WORDCHAR:
1881     case OP_ANY:
1882     case OP_ALLANY:
1883     branchlength++;
1884     cc++;
1885     break;
1886 
1887     /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1888     otherwise \C is coded as OP_ALLANY. */
1889 
1890     case OP_ANYBYTE:
1891     return -2;
1892 
1893     /* Check a class for variable quantification */
1894 
1895     case OP_CLASS:
1896     case OP_NCLASS:
1897 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1898     case OP_XCLASS:
1899     /* The original code caused an unsigned overflow in 64 bit systems,
1900     so now we use a conditional statement. */
1901     if (op == OP_XCLASS)
1902       cc += GET(cc, 1);
1903     else
1904       cc += PRIV(OP_lengths)[OP_CLASS];
1905 #else
1906     cc += PRIV(OP_lengths)[OP_CLASS];
1907 #endif
1908 
1909     switch (*cc)
1910       {
1911       case OP_CRSTAR:
1912       case OP_CRMINSTAR:
1913       case OP_CRPLUS:
1914       case OP_CRMINPLUS:
1915       case OP_CRQUERY:
1916       case OP_CRMINQUERY:
1917       case OP_CRPOSSTAR:
1918       case OP_CRPOSPLUS:
1919       case OP_CRPOSQUERY:
1920       return -1;
1921 
1922       case OP_CRRANGE:
1923       case OP_CRMINRANGE:
1924       case OP_CRPOSRANGE:
1925       if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1926       branchlength += (int)GET2(cc,1);
1927       cc += 1 + 2 * IMM2_SIZE;
1928       break;
1929 
1930       default:
1931       branchlength++;
1932       }
1933     break;
1934 
1935     /* Anything else is variable length */
1936 
1937     case OP_ANYNL:
1938     case OP_BRAMINZERO:
1939     case OP_BRAPOS:
1940     case OP_BRAPOSZERO:
1941     case OP_BRAZERO:
1942     case OP_CBRAPOS:
1943     case OP_EXTUNI:
1944     case OP_KETRMAX:
1945     case OP_KETRMIN:
1946     case OP_KETRPOS:
1947     case OP_MINPLUS:
1948     case OP_MINPLUSI:
1949     case OP_MINQUERY:
1950     case OP_MINQUERYI:
1951     case OP_MINSTAR:
1952     case OP_MINSTARI:
1953     case OP_MINUPTO:
1954     case OP_MINUPTOI:
1955     case OP_NOTMINPLUS:
1956     case OP_NOTMINPLUSI:
1957     case OP_NOTMINQUERY:
1958     case OP_NOTMINQUERYI:
1959     case OP_NOTMINSTAR:
1960     case OP_NOTMINSTARI:
1961     case OP_NOTMINUPTO:
1962     case OP_NOTMINUPTOI:
1963     case OP_NOTPLUS:
1964     case OP_NOTPLUSI:
1965     case OP_NOTPOSPLUS:
1966     case OP_NOTPOSPLUSI:
1967     case OP_NOTPOSQUERY:
1968     case OP_NOTPOSQUERYI:
1969     case OP_NOTPOSSTAR:
1970     case OP_NOTPOSSTARI:
1971     case OP_NOTPOSUPTO:
1972     case OP_NOTPOSUPTOI:
1973     case OP_NOTQUERY:
1974     case OP_NOTQUERYI:
1975     case OP_NOTSTAR:
1976     case OP_NOTSTARI:
1977     case OP_NOTUPTO:
1978     case OP_NOTUPTOI:
1979     case OP_PLUS:
1980     case OP_PLUSI:
1981     case OP_POSPLUS:
1982     case OP_POSPLUSI:
1983     case OP_POSQUERY:
1984     case OP_POSQUERYI:
1985     case OP_POSSTAR:
1986     case OP_POSSTARI:
1987     case OP_POSUPTO:
1988     case OP_POSUPTOI:
1989     case OP_QUERY:
1990     case OP_QUERYI:
1991     case OP_REF:
1992     case OP_REFI:
1993     case OP_DNREF:
1994     case OP_DNREFI:
1995     case OP_SBRA:
1996     case OP_SBRAPOS:
1997     case OP_SCBRA:
1998     case OP_SCBRAPOS:
1999     case OP_SCOND:
2000     case OP_SKIPZERO:
2001     case OP_STAR:
2002     case OP_STARI:
2003     case OP_TYPEMINPLUS:
2004     case OP_TYPEMINQUERY:
2005     case OP_TYPEMINSTAR:
2006     case OP_TYPEMINUPTO:
2007     case OP_TYPEPLUS:
2008     case OP_TYPEPOSPLUS:
2009     case OP_TYPEPOSQUERY:
2010     case OP_TYPEPOSSTAR:
2011     case OP_TYPEPOSUPTO:
2012     case OP_TYPEQUERY:
2013     case OP_TYPESTAR:
2014     case OP_TYPEUPTO:
2015     case OP_UPTO:
2016     case OP_UPTOI:
2017     return -1;
2018 
2019     /* Catch unrecognized opcodes so that when new ones are added they
2020     are not forgotten, as has happened in the past. */
2021 
2022     default:
2023     return -4;
2024     }
2025   }
2026 /* Control never gets here */
2027 }
2028 
2029 
2030 
2031 /*************************************************
2032 *    Scan compiled regex for specific bracket    *
2033 *************************************************/
2034 
2035 /* This little function scans through a compiled pattern until it finds a
2036 capturing bracket with the given number, or, if the number is negative, an
2037 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2038 so that it can be called from pcre_study() when finding the minimum matching
2039 length.
2040 
2041 Arguments:
2042   code        points to start of expression
2043   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2044   number      the required bracket number or negative to find a lookbehind
2045 
2046 Returns:      pointer to the opcode for the bracket, or NULL if not found
2047 */
2048 
2049 const pcre_uchar *
PRIV(find_bracket)2050 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2051 {
2052 for (;;)
2053   {
2054   register pcre_uchar c = *code;
2055 
2056   if (c == OP_END) return NULL;
2057 
2058   /* XCLASS is used for classes that cannot be represented just by a bit
2059   map. This includes negated single high-valued characters. The length in
2060   the table is zero; the actual length is stored in the compiled code. */
2061 
2062   if (c == OP_XCLASS) code += GET(code, 1);
2063 
2064   /* Handle recursion */
2065 
2066   else if (c == OP_REVERSE)
2067     {
2068     if (number < 0) return (pcre_uchar *)code;
2069     code += PRIV(OP_lengths)[c];
2070     }
2071 
2072   /* Handle capturing bracket */
2073 
2074   else if (c == OP_CBRA || c == OP_SCBRA ||
2075            c == OP_CBRAPOS || c == OP_SCBRAPOS)
2076     {
2077     int n = (int)GET2(code, 1+LINK_SIZE);
2078     if (n == number) return (pcre_uchar *)code;
2079     code += PRIV(OP_lengths)[c];
2080     }
2081 
2082   /* Otherwise, we can get the item's length from the table, except that for
2083   repeated character types, we have to test for \p and \P, which have an extra
2084   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2085   must add in its length. */
2086 
2087   else
2088     {
2089     switch(c)
2090       {
2091       case OP_TYPESTAR:
2092       case OP_TYPEMINSTAR:
2093       case OP_TYPEPLUS:
2094       case OP_TYPEMINPLUS:
2095       case OP_TYPEQUERY:
2096       case OP_TYPEMINQUERY:
2097       case OP_TYPEPOSSTAR:
2098       case OP_TYPEPOSPLUS:
2099       case OP_TYPEPOSQUERY:
2100       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2101       break;
2102 
2103       case OP_TYPEUPTO:
2104       case OP_TYPEMINUPTO:
2105       case OP_TYPEEXACT:
2106       case OP_TYPEPOSUPTO:
2107       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2108         code += 2;
2109       break;
2110 
2111       case OP_MARK:
2112       case OP_PRUNE_ARG:
2113       case OP_SKIP_ARG:
2114       case OP_THEN_ARG:
2115       code += code[1];
2116       break;
2117       }
2118 
2119     /* Add in the fixed length from the table */
2120 
2121     code += PRIV(OP_lengths)[c];
2122 
2123   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2124   a multi-byte character. The length in the table is a minimum, so we have to
2125   arrange to skip the extra bytes. */
2126 
2127 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2128     if (utf) switch(c)
2129       {
2130       case OP_CHAR:
2131       case OP_CHARI:
2132       case OP_EXACT:
2133       case OP_EXACTI:
2134       case OP_UPTO:
2135       case OP_UPTOI:
2136       case OP_MINUPTO:
2137       case OP_MINUPTOI:
2138       case OP_POSUPTO:
2139       case OP_POSUPTOI:
2140       case OP_STAR:
2141       case OP_STARI:
2142       case OP_MINSTAR:
2143       case OP_MINSTARI:
2144       case OP_POSSTAR:
2145       case OP_POSSTARI:
2146       case OP_PLUS:
2147       case OP_PLUSI:
2148       case OP_MINPLUS:
2149       case OP_MINPLUSI:
2150       case OP_POSPLUS:
2151       case OP_POSPLUSI:
2152       case OP_QUERY:
2153       case OP_QUERYI:
2154       case OP_MINQUERY:
2155       case OP_MINQUERYI:
2156       case OP_POSQUERY:
2157       case OP_POSQUERYI:
2158       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2159       break;
2160       }
2161 #else
2162     (void)(utf);  /* Keep compiler happy by referencing function argument */
2163 #endif
2164     }
2165   }
2166 }
2167 
2168 
2169 
2170 /*************************************************
2171 *   Scan compiled regex for recursion reference  *
2172 *************************************************/
2173 
2174 /* This little function scans through a compiled pattern until it finds an
2175 instance of OP_RECURSE.
2176 
2177 Arguments:
2178   code        points to start of expression
2179   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2180 
2181 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2182 */
2183 
2184 static const pcre_uchar *
find_recurse(const pcre_uchar * code,BOOL utf)2185 find_recurse(const pcre_uchar *code, BOOL utf)
2186 {
2187 for (;;)
2188   {
2189   register pcre_uchar c = *code;
2190   if (c == OP_END) return NULL;
2191   if (c == OP_RECURSE) return code;
2192 
2193   /* XCLASS is used for classes that cannot be represented just by a bit
2194   map. This includes negated single high-valued characters. The length in
2195   the table is zero; the actual length is stored in the compiled code. */
2196 
2197   if (c == OP_XCLASS) code += GET(code, 1);
2198 
2199   /* Otherwise, we can get the item's length from the table, except that for
2200   repeated character types, we have to test for \p and \P, which have an extra
2201   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2202   must add in its length. */
2203 
2204   else
2205     {
2206     switch(c)
2207       {
2208       case OP_TYPESTAR:
2209       case OP_TYPEMINSTAR:
2210       case OP_TYPEPLUS:
2211       case OP_TYPEMINPLUS:
2212       case OP_TYPEQUERY:
2213       case OP_TYPEMINQUERY:
2214       case OP_TYPEPOSSTAR:
2215       case OP_TYPEPOSPLUS:
2216       case OP_TYPEPOSQUERY:
2217       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2218       break;
2219 
2220       case OP_TYPEPOSUPTO:
2221       case OP_TYPEUPTO:
2222       case OP_TYPEMINUPTO:
2223       case OP_TYPEEXACT:
2224       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2225         code += 2;
2226       break;
2227 
2228       case OP_MARK:
2229       case OP_PRUNE_ARG:
2230       case OP_SKIP_ARG:
2231       case OP_THEN_ARG:
2232       code += code[1];
2233       break;
2234       }
2235 
2236     /* Add in the fixed length from the table */
2237 
2238     code += PRIV(OP_lengths)[c];
2239 
2240     /* In UTF-8 mode, opcodes that are followed by a character may be followed
2241     by a multi-byte character. The length in the table is a minimum, so we have
2242     to arrange to skip the extra bytes. */
2243 
2244 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2245     if (utf) switch(c)
2246       {
2247       case OP_CHAR:
2248       case OP_CHARI:
2249       case OP_NOT:
2250       case OP_NOTI:
2251       case OP_EXACT:
2252       case OP_EXACTI:
2253       case OP_NOTEXACT:
2254       case OP_NOTEXACTI:
2255       case OP_UPTO:
2256       case OP_UPTOI:
2257       case OP_NOTUPTO:
2258       case OP_NOTUPTOI:
2259       case OP_MINUPTO:
2260       case OP_MINUPTOI:
2261       case OP_NOTMINUPTO:
2262       case OP_NOTMINUPTOI:
2263       case OP_POSUPTO:
2264       case OP_POSUPTOI:
2265       case OP_NOTPOSUPTO:
2266       case OP_NOTPOSUPTOI:
2267       case OP_STAR:
2268       case OP_STARI:
2269       case OP_NOTSTAR:
2270       case OP_NOTSTARI:
2271       case OP_MINSTAR:
2272       case OP_MINSTARI:
2273       case OP_NOTMINSTAR:
2274       case OP_NOTMINSTARI:
2275       case OP_POSSTAR:
2276       case OP_POSSTARI:
2277       case OP_NOTPOSSTAR:
2278       case OP_NOTPOSSTARI:
2279       case OP_PLUS:
2280       case OP_PLUSI:
2281       case OP_NOTPLUS:
2282       case OP_NOTPLUSI:
2283       case OP_MINPLUS:
2284       case OP_MINPLUSI:
2285       case OP_NOTMINPLUS:
2286       case OP_NOTMINPLUSI:
2287       case OP_POSPLUS:
2288       case OP_POSPLUSI:
2289       case OP_NOTPOSPLUS:
2290       case OP_NOTPOSPLUSI:
2291       case OP_QUERY:
2292       case OP_QUERYI:
2293       case OP_NOTQUERY:
2294       case OP_NOTQUERYI:
2295       case OP_MINQUERY:
2296       case OP_MINQUERYI:
2297       case OP_NOTMINQUERY:
2298       case OP_NOTMINQUERYI:
2299       case OP_POSQUERY:
2300       case OP_POSQUERYI:
2301       case OP_NOTPOSQUERY:
2302       case OP_NOTPOSQUERYI:
2303       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2304       break;
2305       }
2306 #else
2307     (void)(utf);  /* Keep compiler happy by referencing function argument */
2308 #endif
2309     }
2310   }
2311 }
2312 
2313 
2314 
2315 /*************************************************
2316 *    Scan compiled branch for non-emptiness      *
2317 *************************************************/
2318 
2319 /* This function scans through a branch of a compiled pattern to see whether it
2320 can match the empty string or not. It is called from could_be_empty()
2321 below and from compile_branch() when checking for an unlimited repeat of a
2322 group that can match nothing. Note that first_significant_code() skips over
2323 backward and negative forward assertions when its final argument is TRUE. If we
2324 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2325 bracket whose current branch will already have been scanned.
2326 
2327 Arguments:
2328   code        points to start of search
2329   endcode     points to where to stop
2330   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2331   cd          contains pointers to tables etc.
2332   recurses    chain of recurse_check to catch mutual recursion
2333 
2334 Returns:      TRUE if what is matched could be empty
2335 */
2336 
2337 typedef struct recurse_check {
2338   struct recurse_check *prev;
2339   const pcre_uchar *group;
2340 } recurse_check;
2341 
2342 static BOOL
could_be_empty_branch(const pcre_uchar * code,const pcre_uchar * endcode,BOOL utf,compile_data * cd,recurse_check * recurses)2343 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2344   BOOL utf, compile_data *cd, recurse_check *recurses)
2345 {
2346 register pcre_uchar c;
2347 recurse_check this_recurse;
2348 
2349 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2350      code < endcode;
2351      code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2352   {
2353   const pcre_uchar *ccode;
2354 
2355   c = *code;
2356 
2357   /* Skip over forward assertions; the other assertions are skipped by
2358   first_significant_code() with a TRUE final argument. */
2359 
2360   if (c == OP_ASSERT)
2361     {
2362     do code += GET(code, 1); while (*code == OP_ALT);
2363     c = *code;
2364     continue;
2365     }
2366 
2367   /* For a recursion/subroutine call, if its end has been reached, which
2368   implies a backward reference subroutine call, we can scan it. If it's a
2369   forward reference subroutine call, we can't. To detect forward reference
2370   we have to scan up the list that is kept in the workspace. This function is
2371   called only when doing the real compile, not during the pre-compile that
2372   measures the size of the compiled pattern. */
2373 
2374   if (c == OP_RECURSE)
2375     {
2376     const pcre_uchar *scode = cd->start_code + GET(code, 1);
2377     const pcre_uchar *endgroup = scode;
2378     BOOL empty_branch;
2379 
2380     /* Test for forward reference or uncompleted reference. This is disabled
2381     when called to scan a completed pattern by setting cd->start_workspace to
2382     NULL. */
2383 
2384     if (cd->start_workspace != NULL)
2385       {
2386       const pcre_uchar *tcode;
2387       for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2388         if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2389       if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2390       }
2391 
2392     /* If the reference is to a completed group, we need to detect whether this
2393     is a recursive call, as otherwise there will be an infinite loop. If it is
2394     a recursion, just skip over it. Simple recursions are easily detected. For
2395     mutual recursions we keep a chain on the stack. */
2396 
2397     do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2398     if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
2399     else
2400       {
2401       recurse_check *r = recurses;
2402       for (r = recurses; r != NULL; r = r->prev)
2403         if (r->group == scode) break;
2404       if (r != NULL) continue;   /* Mutual recursion */
2405       }
2406 
2407     /* Completed reference; scan the referenced group, remembering it on the
2408     stack chain to detect mutual recursions. */
2409 
2410     empty_branch = FALSE;
2411     this_recurse.prev = recurses;
2412     this_recurse.group = scode;
2413 
2414     do
2415       {
2416       if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2417         {
2418         empty_branch = TRUE;
2419         break;
2420         }
2421       scode += GET(scode, 1);
2422       }
2423     while (*scode == OP_ALT);
2424 
2425     if (!empty_branch) return FALSE;  /* All branches are non-empty */
2426     continue;
2427     }
2428 
2429   /* Groups with zero repeats can of course be empty; skip them. */
2430 
2431   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2432       c == OP_BRAPOSZERO)
2433     {
2434     code += PRIV(OP_lengths)[c];
2435     do code += GET(code, 1); while (*code == OP_ALT);
2436     c = *code;
2437     continue;
2438     }
2439 
2440   /* A nested group that is already marked as "could be empty" can just be
2441   skipped. */
2442 
2443   if (c == OP_SBRA  || c == OP_SBRAPOS ||
2444       c == OP_SCBRA || c == OP_SCBRAPOS)
2445     {
2446     do code += GET(code, 1); while (*code == OP_ALT);
2447     c = *code;
2448     continue;
2449     }
2450 
2451   /* For other groups, scan the branches. */
2452 
2453   if (c == OP_BRA  || c == OP_BRAPOS ||
2454       c == OP_CBRA || c == OP_CBRAPOS ||
2455       c == OP_ONCE || c == OP_ONCE_NC ||
2456       c == OP_COND)
2457     {
2458     BOOL empty_branch;
2459     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
2460 
2461     /* If a conditional group has only one branch, there is a second, implied,
2462     empty branch, so just skip over the conditional, because it could be empty.
2463     Otherwise, scan the individual branches of the group. */
2464 
2465     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2466       code += GET(code, 1);
2467     else
2468       {
2469       empty_branch = FALSE;
2470       do
2471         {
2472         if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL))
2473           empty_branch = TRUE;
2474         code += GET(code, 1);
2475         }
2476       while (*code == OP_ALT);
2477       if (!empty_branch) return FALSE;   /* All branches are non-empty */
2478       }
2479 
2480     c = *code;
2481     continue;
2482     }
2483 
2484   /* Handle the other opcodes */
2485 
2486   switch (c)
2487     {
2488     /* Check for quantifiers after a class. XCLASS is used for classes that
2489     cannot be represented just by a bit map. This includes negated single
2490     high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2491     actual length is stored in the compiled code, so we must update "code"
2492     here. */
2493 
2494 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2495     case OP_XCLASS:
2496     ccode = code += GET(code, 1);
2497     goto CHECK_CLASS_REPEAT;
2498 #endif
2499 
2500     case OP_CLASS:
2501     case OP_NCLASS:
2502     ccode = code + PRIV(OP_lengths)[OP_CLASS];
2503 
2504 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2505     CHECK_CLASS_REPEAT:
2506 #endif
2507 
2508     switch (*ccode)
2509       {
2510       case OP_CRSTAR:            /* These could be empty; continue */
2511       case OP_CRMINSTAR:
2512       case OP_CRQUERY:
2513       case OP_CRMINQUERY:
2514       case OP_CRPOSSTAR:
2515       case OP_CRPOSQUERY:
2516       break;
2517 
2518       default:                   /* Non-repeat => class must match */
2519       case OP_CRPLUS:            /* These repeats aren't empty */
2520       case OP_CRMINPLUS:
2521       case OP_CRPOSPLUS:
2522       return FALSE;
2523 
2524       case OP_CRRANGE:
2525       case OP_CRMINRANGE:
2526       case OP_CRPOSRANGE:
2527       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2528       break;
2529       }
2530     break;
2531 
2532     /* Opcodes that must match a character */
2533 
2534     case OP_ANY:
2535     case OP_ALLANY:
2536     case OP_ANYBYTE:
2537 
2538     case OP_PROP:
2539     case OP_NOTPROP:
2540     case OP_ANYNL:
2541 
2542     case OP_NOT_HSPACE:
2543     case OP_HSPACE:
2544     case OP_NOT_VSPACE:
2545     case OP_VSPACE:
2546     case OP_EXTUNI:
2547 
2548     case OP_NOT_DIGIT:
2549     case OP_DIGIT:
2550     case OP_NOT_WHITESPACE:
2551     case OP_WHITESPACE:
2552     case OP_NOT_WORDCHAR:
2553     case OP_WORDCHAR:
2554 
2555     case OP_CHAR:
2556     case OP_CHARI:
2557     case OP_NOT:
2558     case OP_NOTI:
2559 
2560     case OP_PLUS:
2561     case OP_PLUSI:
2562     case OP_MINPLUS:
2563     case OP_MINPLUSI:
2564 
2565     case OP_NOTPLUS:
2566     case OP_NOTPLUSI:
2567     case OP_NOTMINPLUS:
2568     case OP_NOTMINPLUSI:
2569 
2570     case OP_POSPLUS:
2571     case OP_POSPLUSI:
2572     case OP_NOTPOSPLUS:
2573     case OP_NOTPOSPLUSI:
2574 
2575     case OP_EXACT:
2576     case OP_EXACTI:
2577     case OP_NOTEXACT:
2578     case OP_NOTEXACTI:
2579 
2580     case OP_TYPEPLUS:
2581     case OP_TYPEMINPLUS:
2582     case OP_TYPEPOSPLUS:
2583     case OP_TYPEEXACT:
2584 
2585     return FALSE;
2586 
2587     /* These are going to continue, as they may be empty, but we have to
2588     fudge the length for the \p and \P cases. */
2589 
2590     case OP_TYPESTAR:
2591     case OP_TYPEMINSTAR:
2592     case OP_TYPEPOSSTAR:
2593     case OP_TYPEQUERY:
2594     case OP_TYPEMINQUERY:
2595     case OP_TYPEPOSQUERY:
2596     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2597     break;
2598 
2599     /* Same for these */
2600 
2601     case OP_TYPEUPTO:
2602     case OP_TYPEMINUPTO:
2603     case OP_TYPEPOSUPTO:
2604     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2605       code += 2;
2606     break;
2607 
2608     /* End of branch */
2609 
2610     case OP_KET:
2611     case OP_KETRMAX:
2612     case OP_KETRMIN:
2613     case OP_KETRPOS:
2614     case OP_ALT:
2615     return TRUE;
2616 
2617     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2618     MINUPTO, and POSUPTO and their caseless and negative versions may be
2619     followed by a multibyte character. */
2620 
2621 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2622     case OP_STAR:
2623     case OP_STARI:
2624     case OP_NOTSTAR:
2625     case OP_NOTSTARI:
2626 
2627     case OP_MINSTAR:
2628     case OP_MINSTARI:
2629     case OP_NOTMINSTAR:
2630     case OP_NOTMINSTARI:
2631 
2632     case OP_POSSTAR:
2633     case OP_POSSTARI:
2634     case OP_NOTPOSSTAR:
2635     case OP_NOTPOSSTARI:
2636 
2637     case OP_QUERY:
2638     case OP_QUERYI:
2639     case OP_NOTQUERY:
2640     case OP_NOTQUERYI:
2641 
2642     case OP_MINQUERY:
2643     case OP_MINQUERYI:
2644     case OP_NOTMINQUERY:
2645     case OP_NOTMINQUERYI:
2646 
2647     case OP_POSQUERY:
2648     case OP_POSQUERYI:
2649     case OP_NOTPOSQUERY:
2650     case OP_NOTPOSQUERYI:
2651 
2652     if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2653     break;
2654 
2655     case OP_UPTO:
2656     case OP_UPTOI:
2657     case OP_NOTUPTO:
2658     case OP_NOTUPTOI:
2659 
2660     case OP_MINUPTO:
2661     case OP_MINUPTOI:
2662     case OP_NOTMINUPTO:
2663     case OP_NOTMINUPTOI:
2664 
2665     case OP_POSUPTO:
2666     case OP_POSUPTOI:
2667     case OP_NOTPOSUPTO:
2668     case OP_NOTPOSUPTOI:
2669 
2670     if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2671     break;
2672 #endif
2673 
2674     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2675     string. */
2676 
2677     case OP_MARK:
2678     case OP_PRUNE_ARG:
2679     case OP_SKIP_ARG:
2680     case OP_THEN_ARG:
2681     code += code[1];
2682     break;
2683 
2684     /* None of the remaining opcodes are required to match a character. */
2685 
2686     default:
2687     break;
2688     }
2689   }
2690 
2691 return TRUE;
2692 }
2693 
2694 
2695 
2696 /*************************************************
2697 *    Scan compiled regex for non-emptiness       *
2698 *************************************************/
2699 
2700 /* This function is called to check for left recursive calls. We want to check
2701 the current branch of the current pattern to see if it could match the empty
2702 string. If it could, we must look outwards for branches at other levels,
2703 stopping when we pass beyond the bracket which is the subject of the recursion.
2704 This function is called only during the real compile, not during the
2705 pre-compile.
2706 
2707 Arguments:
2708   code        points to start of the recursion
2709   endcode     points to where to stop (current RECURSE item)
2710   bcptr       points to the chain of current (unclosed) branch starts
2711   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2712   cd          pointers to tables etc
2713 
2714 Returns:      TRUE if what is matched could be empty
2715 */
2716 
2717 static BOOL
could_be_empty(const pcre_uchar * code,const pcre_uchar * endcode,branch_chain * bcptr,BOOL utf,compile_data * cd)2718 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2719   branch_chain *bcptr, BOOL utf, compile_data *cd)
2720 {
2721 while (bcptr != NULL && bcptr->current_branch >= code)
2722   {
2723   if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2724     return FALSE;
2725   bcptr = bcptr->outer;
2726   }
2727 return TRUE;
2728 }
2729 
2730 
2731 
2732 /*************************************************
2733 *        Base opcode of repeated opcodes         *
2734 *************************************************/
2735 
2736 /* Returns the base opcode for repeated single character type opcodes. If the
2737 opcode is not a repeated character type, it returns with the original value.
2738 
2739 Arguments:  c opcode
2740 Returns:    base opcode for the type
2741 */
2742 
2743 static pcre_uchar
get_repeat_base(pcre_uchar c)2744 get_repeat_base(pcre_uchar c)
2745 {
2746 return (c > OP_TYPEPOSUPTO)? c :
2747        (c >= OP_TYPESTAR)?   OP_TYPESTAR :
2748        (c >= OP_NOTSTARI)?   OP_NOTSTARI :
2749        (c >= OP_NOTSTAR)?    OP_NOTSTAR :
2750        (c >= OP_STARI)?      OP_STARI :
2751                              OP_STAR;
2752 }
2753 
2754 
2755 
#ifdef SUPPORT_UCP
/*************************************************
*        Check a character and a property        *
*************************************************/

/* This function is called by check_auto_possessive() when a property item
is adjacent to a fixed character. It works out whether the character has the
property described by ptype/pdata and compares that with "negated". The result
is TRUE when the two agree, that is, when the character cannot be matched by
the (possibly negated) property item. An unrecognized property type gives
FALSE.

Arguments:
  c            the character
  ptype        the property type
  pdata        the data for the type
  negated      TRUE if it's a negated property (\P or \p{^)

Returns:       TRUE if auto-possessifying is OK
*/

static BOOL
check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
  BOOL negated)
{
const ucd_record *ucd = GET_UCD(c);
const pcre_uint32 *set;
BOOL has_prop;

switch(ptype)
  {
  case PT_LAMP:     /* Any cased letter: Lu, Ll or Lt */
  has_prop = ucd->chartype == ucp_Lu ||
             ucd->chartype == ucp_Ll ||
             ucd->chartype == ucp_Lt;
  break;

  case PT_GC:       /* General category */
  has_prop = pdata == PRIV(ucp_gentype)[ucd->chartype];
  break;

  case PT_PC:       /* Particular category */
  has_prop = pdata == ucd->chartype;
  break;

  case PT_SC:       /* Script */
  has_prop = pdata == ucd->script;
  break;

  /* The remaining types are PCRE's special properties. */

  case PT_ALNUM:
  has_prop = PRIV(ucp_gentype)[ucd->chartype] == ucp_L ||
             PRIV(ucp_gentype)[ucd->chartype] == ucp_N;
  break;

  /* Since Perl 5.18, Perl space includes VT, which makes it the same as POSIX
  space; PCRE followed at release 8.34. Hence both types share one test: the
  explicit horizontal and vertical space characters, or anything in general
  category Z. */

  case PT_SPACE:    /* Perl space */
  case PT_PXSPACE:  /* POSIX space */
  switch(c)
    {
    HSPACE_CASES:
    VSPACE_CASES:
    return negated;

    default:
    has_prop = PRIV(ucp_gentype)[ucd->chartype] == ucp_Z;
    break;
    }
  break;

  case PT_WORD:
  has_prop = PRIV(ucp_gentype)[ucd->chartype] == ucp_L ||
             PRIV(ucp_gentype)[ucd->chartype] == ucp_N ||
             c == CHAR_UNDERSCORE;
  break;

  /* The caseless set is found through the character's own record; pdata is
  not consulted. The scan stops at the first entry that is greater than c,
  which presumably relies on each set being stored in ascending order and
  ending with a value larger than any character. */

  case PT_CLIST:
  for (set = PRIV(ucd_caseless_sets) + ucd->caseset; c >= *set; set++)
    {
    if (c == *set) return negated;
    }
  return !negated;

  default:
  return FALSE;
  }

return has_prop == negated;
}
#endif  /* SUPPORT_UCP */
2837 
2838 
2839 
2840 /*************************************************
2841 *        Fill the character property list        *
2842 *************************************************/
2843 
2844 /* Checks whether the code points to an opcode that can take part in auto-
2845 possessification, and if so, fills a list with its properties.
2846 
2847 Arguments:
2848   code        points to start of expression
2849   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2850   fcc         points to case-flipping table
2851   list        points to output list
2852               list[0] will be filled with the opcode
2853               list[1] will be non-zero if this opcode
2854                 can match an empty character string
2855               list[2..7] depends on the opcode
2856 
2857 Returns:      points to the start of the next opcode if *code is accepted
2858               NULL if *code is not accepted
2859 */
2860 
2861 static const pcre_uchar *
get_chr_property_list(const pcre_uchar * code,BOOL utf,const pcre_uint8 * fcc,pcre_uint32 * list)2862 get_chr_property_list(const pcre_uchar *code, BOOL utf,
2863   const pcre_uint8 *fcc, pcre_uint32 *list)
2864 {
2865 pcre_uchar c = *code;
2866 pcre_uchar base;
2867 const pcre_uchar *end;
2868 pcre_uint32 chr;
2869 
2870 #ifdef SUPPORT_UCP
2871 pcre_uint32 *clist_dest;
2872 const pcre_uint32 *clist_src;
2873 #else
2874 utf = utf;  /* Suppress "unused parameter" compiler warning */
2875 #endif
2876 
2877 list[0] = c;
2878 list[1] = FALSE;
2879 code++;
2880 
2881 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2882   {
2883   base = get_repeat_base(c);
2884   c -= (base - OP_STAR);
2885 
2886   if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2887     code += IMM2_SIZE;
2888 
2889   list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2890 
2891   switch(base)
2892     {
2893     case OP_STAR:
2894     list[0] = OP_CHAR;
2895     break;
2896 
2897     case OP_STARI:
2898     list[0] = OP_CHARI;
2899     break;
2900 
2901     case OP_NOTSTAR:
2902     list[0] = OP_NOT;
2903     break;
2904 
2905     case OP_NOTSTARI:
2906     list[0] = OP_NOTI;
2907     break;
2908 
2909     case OP_TYPESTAR:
2910     list[0] = *code;
2911     code++;
2912     break;
2913     }
2914   c = list[0];
2915   }
2916 
2917 switch(c)
2918   {
2919   case OP_NOT_DIGIT:
2920   case OP_DIGIT:
2921   case OP_NOT_WHITESPACE:
2922   case OP_WHITESPACE:
2923   case OP_NOT_WORDCHAR:
2924   case OP_WORDCHAR:
2925   case OP_ANY:
2926   case OP_ALLANY:
2927   case OP_ANYNL:
2928   case OP_NOT_HSPACE:
2929   case OP_HSPACE:
2930   case OP_NOT_VSPACE:
2931   case OP_VSPACE:
2932   case OP_EXTUNI:
2933   case OP_EODN:
2934   case OP_EOD:
2935   case OP_DOLL:
2936   case OP_DOLLM:
2937   return code;
2938 
2939   case OP_CHAR:
2940   case OP_NOT:
2941   GETCHARINCTEST(chr, code);
2942   list[2] = chr;
2943   list[3] = NOTACHAR;
2944   return code;
2945 
2946   case OP_CHARI:
2947   case OP_NOTI:
2948   list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
2949   GETCHARINCTEST(chr, code);
2950   list[2] = chr;
2951 
2952 #ifdef SUPPORT_UCP
2953   if (chr < 128 || (chr < 256 && !utf))
2954     list[3] = fcc[chr];
2955   else
2956     list[3] = UCD_OTHERCASE(chr);
2957 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
2958   list[3] = (chr < 256) ? fcc[chr] : chr;
2959 #else
2960   list[3] = fcc[chr];
2961 #endif
2962 
2963   /* The othercase might be the same value. */
2964 
2965   if (chr == list[3])
2966     list[3] = NOTACHAR;
2967   else
2968     list[4] = NOTACHAR;
2969   return code;
2970 
2971 #ifdef SUPPORT_UCP
2972   case OP_PROP:
2973   case OP_NOTPROP:
2974   if (code[0] != PT_CLIST)
2975     {
2976     list[2] = code[0];
2977     list[3] = code[1];
2978     return code + 2;
2979     }
2980 
2981   /* Convert only if we have enough space. */
2982 
2983   clist_src = PRIV(ucd_caseless_sets) + code[1];
2984   clist_dest = list + 2;
2985   code += 2;
2986 
2987   do {
2988      if (clist_dest >= list + 8)
2989        {
2990        /* Early return if there is not enough space. This should never
2991        happen, since all clists are shorter than 5 character now. */
2992        list[2] = code[0];
2993        list[3] = code[1];
2994        return code;
2995        }
2996      *clist_dest++ = *clist_src;
2997      }
2998   while(*clist_src++ != NOTACHAR);
2999 
3000   /* All characters are stored. The terminating NOTACHAR
3001   is copied form the clist itself. */
3002 
3003   list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3004   return code;
3005 #endif
3006 
3007   case OP_NCLASS:
3008   case OP_CLASS:
3009 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3010   case OP_XCLASS:
3011   if (c == OP_XCLASS)
3012     end = code + GET(code, 0) - 1;
3013   else
3014 #endif
3015     end = code + 32 / sizeof(pcre_uchar);
3016 
3017   switch(*end)
3018     {
3019     case OP_CRSTAR:
3020     case OP_CRMINSTAR:
3021     case OP_CRQUERY:
3022     case OP_CRMINQUERY:
3023     case OP_CRPOSSTAR:
3024     case OP_CRPOSQUERY:
3025     list[1] = TRUE;
3026     end++;
3027     break;
3028 
3029     case OP_CRPLUS:
3030     case OP_CRMINPLUS:
3031     case OP_CRPOSPLUS:
3032     end++;
3033     break;
3034 
3035     case OP_CRRANGE:
3036     case OP_CRMINRANGE:
3037     case OP_CRPOSRANGE:
3038     list[1] = (GET2(end, 1) == 0);
3039     end += 1 + 2 * IMM2_SIZE;
3040     break;
3041     }
3042   list[2] = (pcre_uint32)(end - code);
3043   return end;
3044   }
3045 return NULL;    /* Opcode not accepted */
3046 }
3047 
3048 
3049 
3050 /*************************************************
3051 *    Scan further character sets for match       *
3052 *************************************************/
3053 
3054 /* Checks whether the base and the current opcode have a common character, in
3055 which case the base cannot be possessified.
3056 
3057 Arguments:
3058   code        points to the byte code
3059   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3060   cd          static compile data
3061   base_list   the data list of the base opcode
3062 
3063 Returns:      TRUE if the auto-possessification is possible
3064 */
3065 
3066 static BOOL
compare_opcodes(const pcre_uchar * code,BOOL utf,const compile_data * cd,const pcre_uint32 * base_list,const pcre_uchar * base_end)3067 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3068   const pcre_uint32 *base_list, const pcre_uchar *base_end)
3069 {
3070 pcre_uchar c;
3071 pcre_uint32 list[8];
3072 const pcre_uint32 *chr_ptr;
3073 const pcre_uint32 *ochr_ptr;
3074 const pcre_uint32 *list_ptr;
3075 const pcre_uchar *next_code;
3076 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3077 const pcre_uchar *xclass_flags;
3078 #endif
3079 const pcre_uint8 *class_bitset;
3080 const pcre_uint8 *set1, *set2, *set_end;
3081 pcre_uint32 chr;
3082 BOOL accepted, invert_bits;
3083 BOOL entered_a_group = FALSE;
3084 
3085 /* Note: the base_list[1] contains whether the current opcode has greedy
3086 (represented by a non-zero value) quantifier. This is a different from
3087 other character type lists, which stores here that the character iterator
3088 matches to an empty string (also represented by a non-zero value). */
3089 
3090 for(;;)
3091   {
3092   /* All operations move the code pointer forward.
3093   Therefore infinite recursions are not possible. */
3094 
3095   c = *code;
3096 
3097   /* Skip over callouts */
3098 
3099   if (c == OP_CALLOUT)
3100     {
3101     code += PRIV(OP_lengths)[c];
3102     continue;
3103     }
3104 
3105   if (c == OP_ALT)
3106     {
3107     do code += GET(code, 1); while (*code == OP_ALT);
3108     c = *code;
3109     }
3110 
3111   switch(c)
3112     {
3113     case OP_END:
3114     case OP_KETRPOS:
3115     /* TRUE only in greedy case. The non-greedy case could be replaced by
3116     an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3117     uses more memory, which we cannot get at this stage.) */
3118 
3119     return base_list[1] != 0;
3120 
3121     case OP_KET:
3122     /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3123     it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3124     cannot be converted to a possessive form. */
3125 
3126     if (base_list[1] == 0) return FALSE;
3127 
3128     switch(*(code - GET(code, 1)))
3129       {
3130       case OP_ASSERT:
3131       case OP_ASSERT_NOT:
3132       case OP_ASSERTBACK:
3133       case OP_ASSERTBACK_NOT:
3134       case OP_ONCE:
3135       case OP_ONCE_NC:
3136       /* Atomic sub-patterns and assertions can always auto-possessify their
3137       last iterator. However, if the group was entered as a result of checking
3138       a previous iterator, this is not possible. */
3139 
3140       return !entered_a_group;
3141       }
3142 
3143     code += PRIV(OP_lengths)[c];
3144     continue;
3145 
3146     case OP_ONCE:
3147     case OP_ONCE_NC:
3148     case OP_BRA:
3149     case OP_CBRA:
3150     next_code = code + GET(code, 1);
3151     code += PRIV(OP_lengths)[c];
3152 
3153     while (*next_code == OP_ALT)
3154       {
3155       if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE;
3156       code = next_code + 1 + LINK_SIZE;
3157       next_code += GET(next_code, 1);
3158       }
3159 
3160     entered_a_group = TRUE;
3161     continue;
3162 
3163     case OP_BRAZERO:
3164     case OP_BRAMINZERO:
3165 
3166     next_code = code + 1;
3167     if (*next_code != OP_BRA && *next_code != OP_CBRA
3168         && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3169 
3170     do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3171 
3172     /* The bracket content will be checked by the
3173     OP_BRA/OP_CBRA case above. */
3174     next_code += 1 + LINK_SIZE;
3175     if (!compare_opcodes(next_code, utf, cd, base_list, base_end))
3176       return FALSE;
3177 
3178     code += PRIV(OP_lengths)[c];
3179     continue;
3180 
3181     default:
3182     break;
3183     }
3184 
3185   /* Check for a supported opcode, and load its properties. */
3186 
3187   code = get_chr_property_list(code, utf, cd->fcc, list);
3188   if (code == NULL) return FALSE;    /* Unsupported */
3189 
3190   /* If either opcode is a small character list, set pointers for comparing
3191   characters from that list with another list, or with a property. */
3192 
3193   if (base_list[0] == OP_CHAR)
3194     {
3195     chr_ptr = base_list + 2;
3196     list_ptr = list;
3197     }
3198   else if (list[0] == OP_CHAR)
3199     {
3200     chr_ptr = list + 2;
3201     list_ptr = base_list;
3202     }
3203 
3204   /* Character bitsets can also be compared to certain opcodes. */
3205 
3206   else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3207 #ifdef COMPILE_PCRE8
3208       /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3209       || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3210 #endif
3211       )
3212     {
3213 #ifdef COMPILE_PCRE8
3214     if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3215 #else
3216     if (base_list[0] == OP_CLASS)
3217 #endif
3218       {
3219       set1 = (pcre_uint8 *)(base_end - base_list[2]);
3220       list_ptr = list;
3221       }
3222     else
3223       {
3224       set1 = (pcre_uint8 *)(code - list[2]);
3225       list_ptr = base_list;
3226       }
3227 
3228     invert_bits = FALSE;
3229     switch(list_ptr[0])
3230       {
3231       case OP_CLASS:
3232       case OP_NCLASS:
3233       set2 = (pcre_uint8 *)
3234         ((list_ptr == list ? code : base_end) - list_ptr[2]);
3235       break;
3236 
3237 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3238       case OP_XCLASS:
3239       xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
3240       if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
3241       if ((*xclass_flags & XCL_MAP) == 0)
3242         {
3243         /* No bits are set for characters < 256. */
3244         if (list[1] == 0) return TRUE;
3245         /* Might be an empty repeat. */
3246         continue;
3247         }
3248       set2 = (pcre_uint8 *)(xclass_flags + 1);
3249       break;
3250 #endif
3251 
3252       case OP_NOT_DIGIT:
3253       invert_bits = TRUE;
3254       /* Fall through */
3255       case OP_DIGIT:
3256       set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
3257       break;
3258 
3259       case OP_NOT_WHITESPACE:
3260       invert_bits = TRUE;
3261       /* Fall through */
3262       case OP_WHITESPACE:
3263       set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
3264       break;
3265 
3266       case OP_NOT_WORDCHAR:
3267       invert_bits = TRUE;
3268       /* Fall through */
3269       case OP_WORDCHAR:
3270       set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
3271       break;
3272 
3273       default:
3274       return FALSE;
3275       }
3276 
3277     /* Because the sets are unaligned, we need
3278     to perform byte comparison here. */
3279     set_end = set1 + 32;
3280     if (invert_bits)
3281       {
3282       do
3283         {
3284         if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3285         }
3286       while (set1 < set_end);
3287       }
3288     else
3289       {
3290       do
3291         {
3292         if ((*set1++ & *set2++) != 0) return FALSE;
3293         }
3294       while (set1 < set_end);
3295       }
3296 
3297     if (list[1] == 0) return TRUE;
3298     /* Might be an empty repeat. */
3299     continue;
3300     }
3301 
3302   /* Some property combinations also acceptable. Unicode property opcodes are
3303   processed specially; the rest can be handled with a lookup table. */
3304 
3305   else
3306     {
3307     pcre_uint32 leftop, rightop;
3308 
3309     leftop = base_list[0];
3310     rightop = list[0];
3311 
3312 #ifdef SUPPORT_UCP
3313     accepted = FALSE; /* Always set in non-unicode case. */
3314     if (leftop == OP_PROP || leftop == OP_NOTPROP)
3315       {
3316       if (rightop == OP_EOD)
3317         accepted = TRUE;
3318       else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3319         {
3320         int n;
3321         const pcre_uint8 *p;
3322         BOOL same = leftop == rightop;
3323         BOOL lisprop = leftop == OP_PROP;
3324         BOOL risprop = rightop == OP_PROP;
3325         BOOL bothprop = lisprop && risprop;
3326 
3327         /* There's a table that specifies how each combination is to be
3328         processed:
3329           0   Always return FALSE (never auto-possessify)
3330           1   Character groups are distinct (possessify if both are OP_PROP)
3331           2   Check character categories in the same group (general or particular)
3332           3   Return TRUE if the two opcodes are not the same
3333           ... see comments below
3334         */
3335 
3336         n = propposstab[base_list[2]][list[2]];
3337         switch(n)
3338           {
3339           case 0: break;
3340           case 1: accepted = bothprop; break;
3341           case 2: accepted = (base_list[3] == list[3]) != same; break;
3342           case 3: accepted = !same; break;
3343 
3344           case 4:  /* Left general category, right particular category */
3345           accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3346           break;
3347 
3348           case 5:  /* Right general category, left particular category */
3349           accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3350           break;
3351 
3352           /* This code is logically tricky. Think hard before fiddling with it.
3353           The posspropstab table has four entries per row. Each row relates to
3354           one of PCRE's special properties such as ALNUM or SPACE or WORD.
3355           Only WORD actually needs all four entries, but using repeats for the
3356           others means they can all use the same code below.
3357 
3358           The first two entries in each row are Unicode general categories, and
3359           apply always, because all the characters they include are part of the
3360           PCRE character set. The third and fourth entries are a general and a
3361           particular category, respectively, that include one or more relevant
3362           characters. One or the other is used, depending on whether the check
3363           is for a general or a particular category. However, in both cases the
3364           category contains more characters than the specials that are defined
3365           for the property being tested against. Therefore, it cannot be used
3366           in a NOTPROP case.
3367 
3368           Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3369           Underscore is covered by ucp_P or ucp_Po. */
3370 
3371           case 6:  /* Left alphanum vs right general category */
3372           case 7:  /* Left space vs right general category */
3373           case 8:  /* Left word vs right general category */
3374           p = posspropstab[n-6];
3375           accepted = risprop && lisprop ==
3376             (list[3] != p[0] &&
3377              list[3] != p[1] &&
3378             (list[3] != p[2] || !lisprop));
3379           break;
3380 
3381           case 9:   /* Right alphanum vs left general category */
3382           case 10:  /* Right space vs left general category */
3383           case 11:  /* Right word vs left general category */
3384           p = posspropstab[n-9];
3385           accepted = lisprop && risprop ==
3386             (base_list[3] != p[0] &&
3387              base_list[3] != p[1] &&
3388             (base_list[3] != p[2] || !risprop));
3389           break;
3390 
3391           case 12:  /* Left alphanum vs right particular category */
3392           case 13:  /* Left space vs right particular category */
3393           case 14:  /* Left word vs right particular category */
3394           p = posspropstab[n-12];
3395           accepted = risprop && lisprop ==
3396             (catposstab[p[0]][list[3]] &&
3397              catposstab[p[1]][list[3]] &&
3398             (list[3] != p[3] || !lisprop));
3399           break;
3400 
3401           case 15:  /* Right alphanum vs left particular category */
3402           case 16:  /* Right space vs left particular category */
3403           case 17:  /* Right word vs left particular category */
3404           p = posspropstab[n-15];
3405           accepted = lisprop && risprop ==
3406             (catposstab[p[0]][base_list[3]] &&
3407              catposstab[p[1]][base_list[3]] &&
3408             (base_list[3] != p[3] || !risprop));
3409           break;
3410           }
3411         }
3412       }
3413 
3414     else
3415 #endif  /* SUPPORT_UCP */
3416 
3417     accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3418            rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3419            autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3420 
3421     if (!accepted) return FALSE;
3422 
3423     if (list[1] == 0) return TRUE;
3424     /* Might be an empty repeat. */
3425     continue;
3426     }
3427 
3428   /* Control reaches here only if one of the items is a small character list.
3429   All characters are checked against the other side. */
3430 
3431   do
3432     {
3433     chr = *chr_ptr;
3434 
3435     switch(list_ptr[0])
3436       {
3437       case OP_CHAR:
3438       ochr_ptr = list_ptr + 2;
3439       do
3440         {
3441         if (chr == *ochr_ptr) return FALSE;
3442         ochr_ptr++;
3443         }
3444       while(*ochr_ptr != NOTACHAR);
3445       break;
3446 
3447       case OP_NOT:
3448       ochr_ptr = list_ptr + 2;
3449       do
3450         {
3451         if (chr == *ochr_ptr)
3452           break;
3453         ochr_ptr++;
3454         }
3455       while(*ochr_ptr != NOTACHAR);
3456       if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
3457       break;
3458 
3459       /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3460       set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3461 
3462       case OP_DIGIT:
3463       if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3464       break;
3465 
3466       case OP_NOT_DIGIT:
3467       if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3468       break;
3469 
3470       case OP_WHITESPACE:
3471       if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3472       break;
3473 
3474       case OP_NOT_WHITESPACE:
3475       if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3476       break;
3477 
3478       case OP_WORDCHAR:
3479       if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3480       break;
3481 
3482       case OP_NOT_WORDCHAR:
3483       if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3484       break;
3485 
3486       case OP_HSPACE:
3487       switch(chr)
3488         {
3489         HSPACE_CASES: return FALSE;
3490         default: break;
3491         }
3492       break;
3493 
3494       case OP_NOT_HSPACE:
3495       switch(chr)
3496         {
3497         HSPACE_CASES: break;
3498         default: return FALSE;
3499         }
3500       break;
3501 
3502       case OP_ANYNL:
3503       case OP_VSPACE:
3504       switch(chr)
3505         {
3506         VSPACE_CASES: return FALSE;
3507         default: break;
3508         }
3509       break;
3510 
3511       case OP_NOT_VSPACE:
3512       switch(chr)
3513         {
3514         VSPACE_CASES: break;
3515         default: return FALSE;
3516         }
3517       break;
3518 
3519       case OP_DOLL:
3520       case OP_EODN:
3521       switch (chr)
3522         {
3523         case CHAR_CR:
3524         case CHAR_LF:
3525         case CHAR_VT:
3526         case CHAR_FF:
3527         case CHAR_NEL:
3528 #ifndef EBCDIC
3529         case 0x2028:
3530         case 0x2029:
3531 #endif  /* Not EBCDIC */
3532         return FALSE;
3533         }
3534       break;
3535 
3536       case OP_EOD:    /* Can always possessify before \z */
3537       break;
3538 
3539 #ifdef SUPPORT_UCP
3540       case OP_PROP:
3541       case OP_NOTPROP:
3542       if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3543             list_ptr[0] == OP_NOTPROP))
3544         return FALSE;
3545       break;
3546 #endif
3547 
3548       case OP_NCLASS:
3549       if (chr > 255) return FALSE;
3550       /* Fall through */
3551 
3552       case OP_CLASS:
3553       if (chr > 255) break;
3554       class_bitset = (pcre_uint8 *)
3555         ((list_ptr == list ? code : base_end) - list_ptr[2]);
3556       if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3557       break;
3558 
3559 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3560       case OP_XCLASS:
3561       if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3562           list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3563       break;
3564 #endif
3565 
3566       default:
3567       return FALSE;
3568       }
3569 
3570     chr_ptr++;
3571     }
3572   while(*chr_ptr != NOTACHAR);
3573 
3574   /* At least one character must be matched from this opcode. */
3575 
3576   if (list[1] == 0) return TRUE;
3577   }
3578 
3579 /* Control never reaches here. There used to be a fail-save return FALSE; here,
3580 but some compilers complain about an unreachable statement. */
3581 
3582 }
3583 
3584 
3585 
3586 /*************************************************
3587 *    Scan compiled regex for auto-possession     *
3588 *************************************************/
3589 
3590 /* Replaces single character iterations with their possessive alternatives
3591 if appropriate. This function modifies the compiled opcode!
3592 
3593 Arguments:
3594   code        points to start of the byte code
3595   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3596   cd          static compile data
3597 
3598 Returns:      nothing
3599 */
3600 
3601 static void
auto_possessify(pcre_uchar * code,BOOL utf,const compile_data * cd)3602 auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3603 {
3604 register pcre_uchar c;
3605 const pcre_uchar *end;
3606 pcre_uchar *repeat_opcode;
3607 pcre_uint32 list[8];
3608 
3609 for (;;)
3610   {
3611   c = *code;
3612 
3613   if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3614     {
3615     c -= get_repeat_base(c) - OP_STAR;
3616     end = (c <= OP_MINUPTO) ?
3617       get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3618     list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3619 
3620     if (end != NULL && compare_opcodes(end, utf, cd, list, end))
3621       {
3622       switch(c)
3623         {
3624         case OP_STAR:
3625         *code += OP_POSSTAR - OP_STAR;
3626         break;
3627 
3628         case OP_MINSTAR:
3629         *code += OP_POSSTAR - OP_MINSTAR;
3630         break;
3631 
3632         case OP_PLUS:
3633         *code += OP_POSPLUS - OP_PLUS;
3634         break;
3635 
3636         case OP_MINPLUS:
3637         *code += OP_POSPLUS - OP_MINPLUS;
3638         break;
3639 
3640         case OP_QUERY:
3641         *code += OP_POSQUERY - OP_QUERY;
3642         break;
3643 
3644         case OP_MINQUERY:
3645         *code += OP_POSQUERY - OP_MINQUERY;
3646         break;
3647 
3648         case OP_UPTO:
3649         *code += OP_POSUPTO - OP_UPTO;
3650         break;
3651 
3652         case OP_MINUPTO:
3653         *code += OP_POSUPTO - OP_MINUPTO;
3654         break;
3655         }
3656       }
3657     c = *code;
3658     }
3659   else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3660     {
3661 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3662     if (c == OP_XCLASS)
3663       repeat_opcode = code + GET(code, 1);
3664     else
3665 #endif
3666       repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3667 
3668     c = *repeat_opcode;
3669     if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3670       {
3671       /* end must not be NULL. */
3672       end = get_chr_property_list(code, utf, cd->fcc, list);
3673 
3674       list[1] = (c & 1) == 0;
3675 
3676       if (compare_opcodes(end, utf, cd, list, end))
3677         {
3678         switch (c)
3679           {
3680           case OP_CRSTAR:
3681           case OP_CRMINSTAR:
3682           *repeat_opcode = OP_CRPOSSTAR;
3683           break;
3684 
3685           case OP_CRPLUS:
3686           case OP_CRMINPLUS:
3687           *repeat_opcode = OP_CRPOSPLUS;
3688           break;
3689 
3690           case OP_CRQUERY:
3691           case OP_CRMINQUERY:
3692           *repeat_opcode = OP_CRPOSQUERY;
3693           break;
3694 
3695           case OP_CRRANGE:
3696           case OP_CRMINRANGE:
3697           *repeat_opcode = OP_CRPOSRANGE;
3698           break;
3699           }
3700         }
3701       }
3702     c = *code;
3703     }
3704 
3705   switch(c)
3706     {
3707     case OP_END:
3708     return;
3709 
3710     case OP_TYPESTAR:
3711     case OP_TYPEMINSTAR:
3712     case OP_TYPEPLUS:
3713     case OP_TYPEMINPLUS:
3714     case OP_TYPEQUERY:
3715     case OP_TYPEMINQUERY:
3716     case OP_TYPEPOSSTAR:
3717     case OP_TYPEPOSPLUS:
3718     case OP_TYPEPOSQUERY:
3719     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3720     break;
3721 
3722     case OP_TYPEUPTO:
3723     case OP_TYPEMINUPTO:
3724     case OP_TYPEEXACT:
3725     case OP_TYPEPOSUPTO:
3726     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3727       code += 2;
3728     break;
3729 
3730 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3731     case OP_XCLASS:
3732     code += GET(code, 1);
3733     break;
3734 #endif
3735 
3736     case OP_MARK:
3737     case OP_PRUNE_ARG:
3738     case OP_SKIP_ARG:
3739     case OP_THEN_ARG:
3740     code += code[1];
3741     break;
3742     }
3743 
3744   /* Add in the fixed length from the table */
3745 
3746   code += PRIV(OP_lengths)[c];
3747 
3748   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3749   a multi-byte character. The length in the table is a minimum, so we have to
3750   arrange to skip the extra bytes. */
3751 
3752 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3753   if (utf) switch(c)
3754     {
3755     case OP_CHAR:
3756     case OP_CHARI:
3757     case OP_NOT:
3758     case OP_NOTI:
3759     case OP_STAR:
3760     case OP_MINSTAR:
3761     case OP_PLUS:
3762     case OP_MINPLUS:
3763     case OP_QUERY:
3764     case OP_MINQUERY:
3765     case OP_UPTO:
3766     case OP_MINUPTO:
3767     case OP_EXACT:
3768     case OP_POSSTAR:
3769     case OP_POSPLUS:
3770     case OP_POSQUERY:
3771     case OP_POSUPTO:
3772     case OP_STARI:
3773     case OP_MINSTARI:
3774     case OP_PLUSI:
3775     case OP_MINPLUSI:
3776     case OP_QUERYI:
3777     case OP_MINQUERYI:
3778     case OP_UPTOI:
3779     case OP_MINUPTOI:
3780     case OP_EXACTI:
3781     case OP_POSSTARI:
3782     case OP_POSPLUSI:
3783     case OP_POSQUERYI:
3784     case OP_POSUPTOI:
3785     case OP_NOTSTAR:
3786     case OP_NOTMINSTAR:
3787     case OP_NOTPLUS:
3788     case OP_NOTMINPLUS:
3789     case OP_NOTQUERY:
3790     case OP_NOTMINQUERY:
3791     case OP_NOTUPTO:
3792     case OP_NOTMINUPTO:
3793     case OP_NOTEXACT:
3794     case OP_NOTPOSSTAR:
3795     case OP_NOTPOSPLUS:
3796     case OP_NOTPOSQUERY:
3797     case OP_NOTPOSUPTO:
3798     case OP_NOTSTARI:
3799     case OP_NOTMINSTARI:
3800     case OP_NOTPLUSI:
3801     case OP_NOTMINPLUSI:
3802     case OP_NOTQUERYI:
3803     case OP_NOTMINQUERYI:
3804     case OP_NOTUPTOI:
3805     case OP_NOTMINUPTOI:
3806     case OP_NOTEXACTI:
3807     case OP_NOTPOSSTARI:
3808     case OP_NOTPOSPLUSI:
3809     case OP_NOTPOSQUERYI:
3810     case OP_NOTPOSUPTOI:
3811     if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3812     break;
3813     }
3814 #else
3815   (void)(utf);  /* Keep compiler happy by referencing function argument */
3816 #endif
3817   }
3818 }
3819 
3820 
3821 
3822 /*************************************************
3823 *           Check for POSIX class syntax         *
3824 *************************************************/
3825 
3826 /* This function is called when the sequence "[:" or "[." or "[=" is
3827 encountered in a character class. It checks whether this is followed by a
3828 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3829 reach an unescaped ']' without the special preceding character, return FALSE.
3830 
3831 Originally, this function only recognized a sequence of letters between the
3832 terminators, but it seems that Perl recognizes any sequence of characters,
3833 though of course unknown POSIX names are subsequently rejected. Perl gives an
3834 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3835 didn't consider this to be a POSIX class. Likewise for [:1234:].
3836 
3837 The problem in trying to be exactly like Perl is in the handling of escapes. We
3838 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3839 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3840 below handles the special case of \], but does not try to do any other escape
3841 processing. This makes it different from Perl for cases such as [:l\ower:]
3842 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
3843 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
3844 I think.
3845 
3846 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3847 It seems that the appearance of a nested POSIX class supersedes an apparent
3848 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3849 a digit.
3850 
3851 In Perl, unescaped square brackets may also appear as part of class names. For
3852 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3853 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3854 seem right at all. PCRE does not allow closing square brackets in POSIX class
3855 names.
3856 
3857 Arguments:
3858   ptr      pointer to the initial [
3859   endptr   where to return the end pointer
3860 
3861 Returns:   TRUE or FALSE
3862 */
3863 
3864 static BOOL
check_posix_syntax(const pcre_uchar * ptr,const pcre_uchar ** endptr)3865 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3866 {
3867 pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */
3868 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
3869 for (++ptr; *ptr != CHAR_NULL; ptr++)
3870   {
3871   if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3872     ptr++;
3873   else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3874   else
3875     {
3876     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3877       {
3878       *endptr = ptr;
3879       return TRUE;
3880       }
3881     if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
3882          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3883           ptr[1] == CHAR_EQUALS_SIGN) &&
3884         check_posix_syntax(ptr, endptr))
3885       return FALSE;
3886     }
3887   }
3888 return FALSE;
3889 }
3890 
3891 
3892 
3893 
3894 /*************************************************
3895 *          Check POSIX class name                *
3896 *************************************************/
3897 
3898 /* This function is called to check the name given in a POSIX-style class entry
3899 such as [:alnum:].
3900 
3901 Arguments:
3902   ptr        points to the first letter
3903   len        the length of the name
3904 
3905 Returns:     a value representing the name, or -1 if unknown
3906 */
3907 
3908 static int
check_posix_name(const pcre_uchar * ptr,int len)3909 check_posix_name(const pcre_uchar *ptr, int len)
3910 {
3911 const char *pn = posix_names;
3912 register int yield = 0;
3913 while (posix_name_lengths[yield] != 0)
3914   {
3915   if (len == posix_name_lengths[yield] &&
3916     STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3917   pn += posix_name_lengths[yield] + 1;
3918   yield++;
3919   }
3920 return -1;
3921 }
3922 
3923 
3924 /*************************************************
3925 *    Adjust OP_RECURSE items in repeated group   *
3926 *************************************************/
3927 
3928 /* OP_RECURSE items contain an offset from the start of the regex to the group
3929 that is referenced. This means that groups can be replicated for fixed
3930 repetition simply by copying (because the recursion is allowed to refer to
3931 earlier groups that are outside the current group). However, when a group is
3932 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3933 inserted before it, after it has been compiled. This means that any OP_RECURSE
3934 items within it that refer to the group itself or any contained groups have to
3935 have their offsets adjusted. That one of the jobs of this function. Before it
3936 is called, the partially compiled regex must be temporarily terminated with
3937 OP_END.
3938 
3939 This function has been extended with the possibility of forward references for
3940 recursions and subroutine calls. It must also check the list of such references
3941 for the group we are dealing with. If it finds that one of the recursions in
3942 the current group is on this list, it adjusts the offset in the list, not the
3943 value in the reference (which is a group number).
3944 
3945 Arguments:
3946   group      points to the start of the group
3947   adjust     the amount by which the group is to be moved
3948   utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
3949   cd         contains pointers to tables etc.
3950   save_hwm   the hwm forward reference pointer at the start of the group
3951 
3952 Returns:     nothing
3953 */
3954 
3955 static void
adjust_recurse(pcre_uchar * group,int adjust,BOOL utf,compile_data * cd,pcre_uchar * save_hwm)3956 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
3957   pcre_uchar *save_hwm)
3958 {
3959 pcre_uchar *ptr = group;
3960 
3961 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
3962   {
3963   int offset;
3964   pcre_uchar *hc;
3965 
3966   /* See if this recursion is on the forward reference list. If so, adjust the
3967   reference. */
3968 
3969   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
3970     {
3971     offset = (int)GET(hc, 0);
3972     if (cd->start_code + offset == ptr + 1)
3973       {
3974       PUT(hc, 0, offset + adjust);
3975       break;
3976       }
3977     }
3978 
3979   /* Otherwise, adjust the recursion offset if it's after the start of this
3980   group. */
3981 
3982   if (hc >= cd->hwm)
3983     {
3984     offset = (int)GET(ptr, 1);
3985     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
3986     }
3987 
3988   ptr += 1 + LINK_SIZE;
3989   }
3990 }
3991 
3992 
3993 
3994 /*************************************************
3995 *        Insert an automatic callout point       *
3996 *************************************************/
3997 
3998 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
3999 callout points before each pattern item.
4000 
4001 Arguments:
4002   code           current code pointer
4003   ptr            current pattern pointer
4004   cd             pointers to tables etc
4005 
4006 Returns:         new code pointer
4007 */
4008 
4009 static pcre_uchar *
auto_callout(pcre_uchar * code,const pcre_uchar * ptr,compile_data * cd)4010 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
4011 {
4012 *code++ = OP_CALLOUT;
4013 *code++ = 255;
4014 PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
4015 PUT(code, LINK_SIZE, 0);                       /* Default length */
4016 return code + 2 * LINK_SIZE;
4017 }
4018 
4019 
4020 
4021 /*************************************************
4022 *         Complete a callout item                *
4023 *************************************************/
4024 
4025 /* A callout item contains the length of the next item in the pattern, which
4026 we can't fill in till after we have reached the relevant point. This is used
4027 for both automatic and manual callouts.
4028 
4029 Arguments:
4030   previous_callout   points to previous callout item
4031   ptr                current pattern pointer
4032   cd                 pointers to tables etc
4033 
4034 Returns:             nothing
4035 */
4036 
4037 static void
complete_callout(pcre_uchar * previous_callout,const pcre_uchar * ptr,compile_data * cd)4038 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
4039 {
4040 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
4041 PUT(previous_callout, 2 + LINK_SIZE, length);
4042 }
4043 
4044 
4045 
4046 #ifdef SUPPORT_UCP
4047 /*************************************************
4048 *           Get othercase range                  *
4049 *************************************************/
4050 
4051 /* This function is passed the start and end of a class range, in UTF-8 mode
4052 with UCP support. It searches up the characters, looking for ranges of
4053 characters in the "other" case. Each call returns the next one, updating the
4054 start address. A character with multiple other cases is returned on its own
4055 with a special return value.
4056 
4057 Arguments:
4058   cptr        points to starting character value; updated
4059   d           end value
4060   ocptr       where to put start of othercase range
4061   odptr       where to put end of othercase range
4062 
4063 Yield:        -1 when no more
4064                0 when a range is returned
4065               >0 the CASESET offset for char with multiple other cases
4066                 in this case, ocptr contains the original
4067 */
4068 
4069 static int
get_othercase_range(pcre_uint32 * cptr,pcre_uint32 d,pcre_uint32 * ocptr,pcre_uint32 * odptr)4070 get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
4071   pcre_uint32 *odptr)
4072 {
4073 pcre_uint32 c, othercase, next;
4074 unsigned int co;
4075 
4076 /* Find the first character that has an other case. If it has multiple other
4077 cases, return its case offset value. */
4078 
4079 for (c = *cptr; c <= d; c++)
4080   {
4081   if ((co = UCD_CASESET(c)) != 0)
4082     {
4083     *ocptr = c++;   /* Character that has the set */
4084     *cptr = c;      /* Rest of input range */
4085     return (int)co;
4086     }
4087   if ((othercase = UCD_OTHERCASE(c)) != c) break;
4088   }
4089 
4090 if (c > d) return -1;  /* Reached end of range */
4091 
4092 /* Found a character that has a single other case. Search for the end of the
4093 range, which is either the end of the input range, or a character that has zero
4094 or more than one other cases. */
4095 
4096 *ocptr = othercase;
4097 next = othercase + 1;
4098 
4099 for (++c; c <= d; c++)
4100   {
4101   if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
4102   next++;
4103   }
4104 
4105 *odptr = next - 1;     /* End of othercase range */
4106 *cptr = c;             /* Rest of input range */
4107 return 0;
4108 }
4109 #endif  /* SUPPORT_UCP */
4110 
4111 
4112 
4113 /*************************************************
4114 *        Add a character or range to a class     *
4115 *************************************************/
4116 
4117 /* This function packages up the logic of adding a character or range of
4118 characters to a class. The character values in the arguments will be within the
4119 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4120 mutually recursive with the function immediately below.
4121 
4122 Arguments:
4123   classbits     the bit map for characters < 256
4124   uchardptr     points to the pointer for extra data
4125   options       the options word
4126   cd            contains pointers to tables etc.
4127   start         start of range character
4128   end           end of range character
4129 
4130 Returns:        the number of < 256 characters added
4131                 the pointer to extra data is updated
4132 */
4133 
4134 static int
add_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,pcre_uint32 start,pcre_uint32 end)4135 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4136   compile_data *cd, pcre_uint32 start, pcre_uint32 end)
4137 {
4138 pcre_uint32 c;
4139 pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
4140 int n8 = 0;
4141 
4142 /* If caseless matching is required, scan the range and process alternate
4143 cases. In Unicode, there are 8-bit characters that have alternate cases that
4144 are greater than 255 and vice-versa. Sometimes we can just extend the original
4145 range. */
4146 
4147 if ((options & PCRE_CASELESS) != 0)
4148   {
4149 #ifdef SUPPORT_UCP
4150   if ((options & PCRE_UTF8) != 0)
4151     {
4152     int rc;
4153     pcre_uint32 oc, od;
4154 
4155     options &= ~PCRE_CASELESS;   /* Remove for recursive calls */
4156     c = start;
4157 
4158     while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4159       {
4160       /* Handle a single character that has more than one other case. */
4161 
4162       if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
4163         PRIV(ucd_caseless_sets) + rc, oc);
4164 
4165       /* Do nothing if the other case range is within the original range. */
4166 
4167       else if (oc >= start && od <= end) continue;
4168 
4169       /* Extend the original range if there is overlap, noting that if oc < c, we
4170       can't have od > end because a subrange is always shorter than the basic
4171       range. Otherwise, use a recursive call to add the additional range. */
4172 
4173       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4174       else if (od > end && oc <= end + 1) end = od;       /* Extend upwards */
4175       else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
4176       }
4177     }
4178   else
4179 #endif  /* SUPPORT_UCP */
4180 
4181   /* Not UTF-mode, or no UCP */
4182 
4183   for (c = start; c <= classbits_end; c++)
4184     {
4185     SETBIT(classbits, cd->fcc[c]);
4186     n8++;
4187     }
4188   }
4189 
4190 /* Now handle the original range. Adjust the final value according to the bit
4191 length - this means that the same lists of (e.g.) horizontal spaces can be used
4192 in all cases. */
4193 
4194 #if defined COMPILE_PCRE8
4195 #ifdef SUPPORT_UTF
4196   if ((options & PCRE_UTF8) == 0)
4197 #endif
4198   if (end > 0xff) end = 0xff;
4199 
4200 #elif defined COMPILE_PCRE16
4201 #ifdef SUPPORT_UTF
4202   if ((options & PCRE_UTF16) == 0)
4203 #endif
4204   if (end > 0xffff) end = 0xffff;
4205 
4206 #endif /* COMPILE_PCRE[8|16] */
4207 
4208 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
4209 
4210 for (c = start; c <= classbits_end; c++)
4211   {
4212   /* Regardless of start, c will always be <= 255. */
4213   SETBIT(classbits, c);
4214   n8++;
4215   }
4216 
4217 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4218 if (start <= 0xff) start = 0xff + 1;
4219 
4220 if (end >= start)
4221   {
4222   pcre_uchar *uchardata = *uchardptr;
4223 #ifdef SUPPORT_UTF
4224   if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
4225     {
4226     if (start < end)
4227       {
4228       *uchardata++ = XCL_RANGE;
4229       uchardata += PRIV(ord2utf)(start, uchardata);
4230       uchardata += PRIV(ord2utf)(end, uchardata);
4231       }
4232     else if (start == end)
4233       {
4234       *uchardata++ = XCL_SINGLE;
4235       uchardata += PRIV(ord2utf)(start, uchardata);
4236       }
4237     }
4238   else
4239 #endif  /* SUPPORT_UTF */
4240 
4241   /* Without UTF support, character values are constrained by the bit length,
4242   and can only be > 256 for 16-bit and 32-bit libraries. */
4243 
4244 #ifdef COMPILE_PCRE8
4245     {}
4246 #else
4247   if (start < end)
4248     {
4249     *uchardata++ = XCL_RANGE;
4250     *uchardata++ = start;
4251     *uchardata++ = end;
4252     }
4253   else if (start == end)
4254     {
4255     *uchardata++ = XCL_SINGLE;
4256     *uchardata++ = start;
4257     }
4258 #endif
4259 
4260   *uchardptr = uchardata;   /* Updata extra data pointer */
4261   }
4262 #endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
4263 
4264 return n8;    /* Number of 8-bit characters */
4265 }
4266 
4267 
4268 
4269 
4270 /*************************************************
4271 *        Add a list of characters to a class     *
4272 *************************************************/
4273 
4274 /* This function is used for adding a list of case-equivalent characters to a
4275 class, and also for adding a list of horizontal or vertical whitespace. If the
4276 list is in order (which it should be), ranges of characters are detected and
4277 handled appropriately. This function is mutually recursive with the function
4278 above.
4279 
4280 Arguments:
4281   classbits     the bit map for characters < 256
4282   uchardptr     points to the pointer for extra data
4283   options       the options word
4284   cd            contains pointers to tables etc.
4285   p             points to row of 32-bit values, terminated by NOTACHAR
4286   except        character to omit; this is used when adding lists of
4287                   case-equivalent characters to avoid including the one we
4288                   already know about
4289 
4290 Returns:        the number of < 256 characters added
4291                 the pointer to extra data is updated
4292 */
4293 
4294 static int
add_list_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,const pcre_uint32 * p,unsigned int except)4295 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4296   compile_data *cd, const pcre_uint32 *p, unsigned int except)
4297 {
4298 int n8 = 0;
4299 while (p[0] < NOTACHAR)
4300   {
4301   int n = 0;
4302   if (p[0] != except)
4303     {
4304     while(p[n+1] == p[0] + n + 1) n++;
4305     n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4306     }
4307   p += n + 1;
4308   }
4309 return n8;
4310 }
4311 
4312 
4313 
4314 /*************************************************
4315 *    Add characters not in a list to a class     *
4316 *************************************************/
4317 
4318 /* This function is used for adding the complement of a list of horizontal or
4319 vertical whitespace to a class. The list must be in order.
4320 
4321 Arguments:
4322   classbits     the bit map for characters < 256
4323   uchardptr     points to the pointer for extra data
4324   options       the options word
4325   cd            contains pointers to tables etc.
4326   p             points to row of 32-bit values, terminated by NOTACHAR
4327 
4328 Returns:        the number of < 256 characters added
4329                 the pointer to extra data is updated
4330 */
4331 
4332 static int
add_not_list_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,const pcre_uint32 * p)4333 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4334   int options, compile_data *cd, const pcre_uint32 *p)
4335 {
4336 BOOL utf = (options & PCRE_UTF8) != 0;
4337 int n8 = 0;
4338 if (p[0] > 0)
4339   n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4340 while (p[0] < NOTACHAR)
4341   {
4342   while (p[1] == p[0] + 1) p++;
4343   n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4344     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4345   p++;
4346   }
4347 return n8;
4348 }
4349 
4350 
4351 
4352 /*************************************************
4353 *           Compile one branch                   *
4354 *************************************************/
4355 
4356 /* Scan the pattern, compiling it into a vector. If the options are
4357 changed during the branch, the pointer is used to change the external options
4358 bits. This function is used during the pre-compile phase when we are trying
4359 to find out the amount of memory needed, as well as during the real compile
4360 phase. The value of lengthptr distinguishes the two phases.
4361 
4362 Arguments:
4363   optionsptr        pointer to the option bits
4364   codeptr           points to the pointer to the current code point
4365   ptrptr            points to the current pattern pointer
4366   errorcodeptr      points to error code variable
4367   firstcharptr      place to put the first required character
4368   firstcharflagsptr place to put the first character flags, or a negative number
4369   reqcharptr        place to put the last required character
4370   reqcharflagsptr   place to put the last required character flags, or a negative number
4371   bcptr             points to current branch chain
4372   cond_depth        conditional nesting depth
4373   cd                contains pointers to tables etc.
4374   lengthptr         NULL during the real compile phase
4375                     points to length accumulator during pre-compile phase
4376 
4377 Returns:            TRUE on success
4378                     FALSE, with *errorcodeptr set non-zero on error
4379 */
4380 
4381 static BOOL
compile_branch(int * optionsptr,pcre_uchar ** codeptr,const pcre_uchar ** ptrptr,int * errorcodeptr,pcre_uint32 * firstcharptr,pcre_int32 * firstcharflagsptr,pcre_uint32 * reqcharptr,pcre_int32 * reqcharflagsptr,branch_chain * bcptr,int cond_depth,compile_data * cd,int * lengthptr)4382 compile_branch(int *optionsptr, pcre_uchar **codeptr,
4383   const pcre_uchar **ptrptr, int *errorcodeptr,
4384   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4385   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4386   branch_chain *bcptr, int cond_depth,
4387   compile_data *cd, int *lengthptr)
4388 {
4389 int repeat_type, op_type;
4390 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
4391 int bravalue = 0;
4392 int greedy_default, greedy_non_default;
4393 pcre_uint32 firstchar, reqchar;
4394 pcre_int32 firstcharflags, reqcharflags;
4395 pcre_uint32 zeroreqchar, zerofirstchar;
4396 pcre_int32 zeroreqcharflags, zerofirstcharflags;
4397 pcre_int32 req_caseopt, reqvary, tempreqvary;
4398 int options = *optionsptr;               /* May change dynamically */
4399 int after_manual_callout = 0;
4400 int length_prevgroup = 0;
4401 register pcre_uint32 c;
4402 int escape;
4403 register pcre_uchar *code = *codeptr;
4404 pcre_uchar *last_code = code;
4405 pcre_uchar *orig_code = code;
4406 pcre_uchar *tempcode;
4407 BOOL inescq = FALSE;
4408 BOOL groupsetfirstchar = FALSE;
4409 const pcre_uchar *ptr = *ptrptr;
4410 const pcre_uchar *tempptr;
4411 const pcre_uchar *nestptr = NULL;
4412 pcre_uchar *previous = NULL;
4413 pcre_uchar *previous_callout = NULL;
4414 pcre_uchar *save_hwm = NULL;
4415 pcre_uint8 classbits[32];
4416 
4417 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4418 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4419 dynamically as we process the pattern. */
4420 
4421 #ifdef SUPPORT_UTF
4422 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4423 BOOL utf = (options & PCRE_UTF8) != 0;
4424 #ifndef COMPILE_PCRE32
4425 pcre_uchar utf_chars[6];
4426 #endif
4427 #else
4428 BOOL utf = FALSE;
4429 #endif
4430 
4431 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4432 class_uchardata always so that it can be passed to add_to_class() always,
4433 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4434 alternative calls for the different cases. */
4435 
4436 pcre_uchar *class_uchardata;
4437 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4438 BOOL xclass;
4439 pcre_uchar *class_uchardata_base;
4440 #endif
4441 
4442 #ifdef PCRE_DEBUG
4443 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4444 #endif
4445 
4446 /* Set up the default and non-default settings for greediness */
4447 
4448 greedy_default = ((options & PCRE_UNGREEDY) != 0);
4449 greedy_non_default = greedy_default ^ 1;
4450 
4451 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
4452 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4453 matches a non-fixed first char; reqchar just remains unset if we never
4454 find one.
4455 
4456 When we hit a repeat whose minimum is zero, we may have to adjust these values
4457 to take the zero repeat into account. This is implemented by setting them to
4458 zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
4459 item types that can be repeated set these backoff variables appropriately. */
4460 
4461 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4462 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4463 
4464 /* The variable req_caseopt contains either the REQ_CASELESS value
4465 or zero, according to the current setting of the caseless flag. The
4466 REQ_CASELESS leaves the lower 28 bits empty. It is added into the
4467 firstchar or reqchar variables to record the case status of the
4468 value. This is used only for ASCII characters. */
4469 
4470 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4471 
4472 /* Switch on next character until the end of the branch */
4473 
4474 for (;; ptr++)
4475   {
4476   BOOL negate_class;
4477   BOOL should_flip_negation;
4478   BOOL possessive_quantifier;
4479   BOOL is_quantifier;
4480   BOOL is_recurse;
4481   BOOL reset_bracount;
4482   int class_has_8bitchar;
4483   int class_one_char;
4484 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4485   BOOL xclass_has_prop;
4486 #endif
4487   int newoptions;
4488   int recno;
4489   int refsign;
4490   int skipbytes;
4491   pcre_uint32 subreqchar, subfirstchar;
4492   pcre_int32 subreqcharflags, subfirstcharflags;
4493   int terminator;
4494   unsigned int mclength;
4495   unsigned int tempbracount;
4496   pcre_uint32 ec;
4497   pcre_uchar mcbuffer[8];
4498 
4499   /* Get next character in the pattern */
4500 
4501   c = *ptr;
4502 
4503   /* If we are at the end of a nested substitution, revert to the outer level
4504   string. Nesting only happens one level deep. */
4505 
4506   if (c == CHAR_NULL && nestptr != NULL)
4507     {
4508     ptr = nestptr;
4509     nestptr = NULL;
4510     c = *ptr;
4511     }
4512 
4513   /* If we are in the pre-compile phase, accumulate the length used for the
4514   previous cycle of this loop. */
4515 
4516   if (lengthptr != NULL)
4517     {
4518 #ifdef PCRE_DEBUG
4519     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
4520 #endif
4521     if (code > cd->start_workspace + cd->workspace_size -
4522         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
4523       {
4524       *errorcodeptr = ERR52;
4525       goto FAILED;
4526       }
4527 
4528     /* There is at least one situation where code goes backwards: this is the
4529     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4530     the class is simply eliminated. However, it is created first, so we have to
4531     allow memory for it. Therefore, don't ever reduce the length at this point.
4532     */
4533 
4534     if (code < last_code) code = last_code;
4535 
4536     /* Paranoid check for integer overflow */
4537 
4538     if (OFLOW_MAX - *lengthptr < code - last_code)
4539       {
4540       *errorcodeptr = ERR20;
4541       goto FAILED;
4542       }
4543 
4544     *lengthptr += (int)(code - last_code);
4545     DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4546       (int)(code - last_code), c, c));
4547 
4548     /* If "previous" is set and it is not at the start of the work space, move
4549     it back to there, in order to avoid filling up the work space. Otherwise,
4550     if "previous" is NULL, reset the current code pointer to the start. */
4551 
4552     if (previous != NULL)
4553       {
4554       if (previous > orig_code)
4555         {
4556         memmove(orig_code, previous, IN_UCHARS(code - previous));
4557         code -= previous - orig_code;
4558         previous = orig_code;
4559         }
4560       }
4561     else code = orig_code;
4562 
4563     /* Remember where this code item starts so we can pick up the length
4564     next time round. */
4565 
4566     last_code = code;
4567     }
4568 
4569   /* In the real compile phase, just check the workspace used by the forward
4570   reference list. */
4571 
4572   else if (cd->hwm > cd->start_workspace + cd->workspace_size -
4573            WORK_SIZE_SAFETY_MARGIN)
4574     {
4575     *errorcodeptr = ERR52;
4576     goto FAILED;
4577     }
4578 
4579   /* If in \Q...\E, check for the end; if not, we have a literal */
4580 
4581   if (inescq && c != CHAR_NULL)
4582     {
4583     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4584       {
4585       inescq = FALSE;
4586       ptr++;
4587       continue;
4588       }
4589     else
4590       {
4591       if (previous_callout != NULL)
4592         {
4593         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
4594           complete_callout(previous_callout, ptr, cd);
4595         previous_callout = NULL;
4596         }
4597       if ((options & PCRE_AUTO_CALLOUT) != 0)
4598         {
4599         previous_callout = code;
4600         code = auto_callout(code, ptr, cd);
4601         }
4602       goto NORMAL_CHAR;
4603       }
4604     /* Control does not reach here. */
4605     }
4606 
4607   /* In extended mode, skip white space and comments. We need a loop in order
4608   to check for more white space and more comments after a comment. */
4609 
4610   if ((options & PCRE_EXTENDED) != 0)
4611     {
4612     for (;;)
4613       {
4614       while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4615       if (c != CHAR_NUMBER_SIGN) break;
4616       ptr++;
4617       while (*ptr != CHAR_NULL)
4618         {
4619         if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
4620           {                          /* IS_NEWLINE sets cd->nllen. */
4621           ptr += cd->nllen;
4622           break;
4623           }
4624         ptr++;
4625 #ifdef SUPPORT_UTF
4626         if (utf) FORWARDCHAR(ptr);
4627 #endif
4628         }
4629       c = *ptr;     /* Either NULL or the char after a newline */
4630       }
4631     }
4632 
4633   /* See if the next thing is a quantifier. */
4634 
4635   is_quantifier =
4636     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4637     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4638 
4639   /* Fill in length of a previous callout, except when the next thing is a
4640   quantifier or when processing a property substitution string in UCP mode. */
4641 
4642   if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4643        after_manual_callout-- <= 0)
4644     {
4645     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
4646       complete_callout(previous_callout, ptr, cd);
4647     previous_callout = NULL;
4648     }
4649 
4650   /* Create auto callout, except for quantifiers, or while processing property
4651   strings that are substituted for \w etc in UCP mode. */
4652 
4653   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4654     {
4655     previous_callout = code;
4656     code = auto_callout(code, ptr, cd);
4657     }
4658 
4659   /* Process the next pattern item. */
4660 
4661   switch(c)
4662     {
4663     /* ===================================================================*/
4664     case CHAR_NULL:                /* The branch terminates at string end */
4665     case CHAR_VERTICAL_LINE:       /* or | or ) */
4666     case CHAR_RIGHT_PARENTHESIS:
4667     *firstcharptr = firstchar;
4668     *firstcharflagsptr = firstcharflags;
4669     *reqcharptr = reqchar;
4670     *reqcharflagsptr = reqcharflags;
4671     *codeptr = code;
4672     *ptrptr = ptr;
4673     if (lengthptr != NULL)
4674       {
4675       if (OFLOW_MAX - *lengthptr < code - last_code)
4676         {
4677         *errorcodeptr = ERR20;
4678         goto FAILED;
4679         }
4680       *lengthptr += (int)(code - last_code);   /* To include callout length */
4681       DPRINTF((">> end branch\n"));
4682       }
4683     return TRUE;
4684 
4685 
4686     /* ===================================================================*/
4687     /* Handle single-character metacharacters. In multiline mode, ^ disables
4688     the setting of any following char as a first character. */
4689 
4690     case CHAR_CIRCUMFLEX_ACCENT:
4691     previous = NULL;
4692     if ((options & PCRE_MULTILINE) != 0)
4693       {
4694       if (firstcharflags == REQ_UNSET)
4695         zerofirstcharflags = firstcharflags = REQ_NONE;
4696       *code++ = OP_CIRCM;
4697       }
4698     else *code++ = OP_CIRC;
4699     break;
4700 
4701     case CHAR_DOLLAR_SIGN:
4702     previous = NULL;
4703     *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4704     break;
4705 
4706     /* There can never be a first char if '.' is first, whatever happens about
4707     repeats. The value of reqchar doesn't change either. */
4708 
4709     case CHAR_DOT:
4710     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4711     zerofirstchar = firstchar;
4712     zerofirstcharflags = firstcharflags;
4713     zeroreqchar = reqchar;
4714     zeroreqcharflags = reqcharflags;
4715     previous = code;
4716     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4717     break;
4718 
4719 
4720     /* ===================================================================*/
4721     /* Character classes. If the included characters are all < 256, we build a
4722     32-byte bitmap of the permitted characters, except in the special case
4723     where there is only one such character. For negated classes, we build the
4724     map as usual, then invert it at the end. However, we use a different opcode
4725     so that data characters > 255 can be handled correctly.
4726 
4727     If the class contains characters outside the 0-255 range, a different
4728     opcode is compiled. It may optionally have a bit map for characters < 256,
4729     but those above are explicitly listed afterwards. A flag byte tells
4730     whether the bitmap is present, and whether this is a negated class or not.
4731 
4732     In JavaScript compatibility mode, an isolated ']' causes an error. In
4733     default (Perl) mode, it is treated as a data character. */
4734 
4735     case CHAR_RIGHT_SQUARE_BRACKET:
4736     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4737       {
4738       *errorcodeptr = ERR64;
4739       goto FAILED;
4740       }
4741     goto NORMAL_CHAR;
4742 
4743     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4744     used for "start of word" and "end of word". As these are otherwise illegal
4745     sequences, we don't break anything by recognizing them. They are replaced
4746     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4747     erroneous and are handled by the normal code below. */
4748 
4749     case CHAR_LEFT_SQUARE_BRACKET:
4750     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4751       {
4752       nestptr = ptr + 7;
4753       ptr = sub_start_of_word - 1;
4754       continue;
4755       }
4756 
4757     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4758       {
4759       nestptr = ptr + 7;
4760       ptr = sub_end_of_word - 1;
4761       continue;
4762       }
4763 
4764     /* Handle a real character class. */
4765 
4766     previous = code;
4767 
4768     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4769     they are encountered at the top level, so we'll do that too. */
4770 
4771     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4772          ptr[1] == CHAR_EQUALS_SIGN) &&
4773         check_posix_syntax(ptr, &tempptr))
4774       {
4775       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4776       goto FAILED;
4777       }
4778 
4779     /* If the first character is '^', set the negation flag and skip it. Also,
4780     if the first few characters (either before or after ^) are \Q\E or \E we
4781     skip them too. This makes for compatibility with Perl. */
4782 
4783     negate_class = FALSE;
4784     for (;;)
4785       {
4786       c = *(++ptr);
4787       if (c == CHAR_BACKSLASH)
4788         {
4789         if (ptr[1] == CHAR_E)
4790           ptr++;
4791         else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4792           ptr += 3;
4793         else
4794           break;
4795         }
4796       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4797         negate_class = TRUE;
4798       else break;
4799       }
4800 
4801     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4802     an initial ']' is taken as a data character -- the code below handles
4803     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4804     [^] must match any character, so generate OP_ALLANY. */
4805 
4806     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4807         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4808       {
4809       *code++ = negate_class? OP_ALLANY : OP_FAIL;
4810       if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4811       zerofirstchar = firstchar;
4812       zerofirstcharflags = firstcharflags;
4813       break;
4814       }
4815 
4816     /* If a class contains a negative special such as \S, we need to flip the
4817     negation flag at the end, so that support for characters > 255 works
4818     correctly (they are all included in the class). */
4819 
4820     should_flip_negation = FALSE;
4821 
4822     /* Extended class (xclass) will be used when characters > 255
4823     might match. */
4824 
4825 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4826     xclass = FALSE;
4827     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
4828     class_uchardata_base = class_uchardata;   /* Save the start */
4829 #endif
4830 
4831     /* For optimization purposes, we track some properties of the class:
4832     class_has_8bitchar will be non-zero if the class contains at least one <
4833     256 character; class_one_char will be 1 if the class contains just one
4834     character; xclass_has_prop will be TRUE if unicode property checks
4835     are present in the class. */
4836 
4837     class_has_8bitchar = 0;
4838     class_one_char = 0;
4839 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4840     xclass_has_prop = FALSE;
4841 #endif
4842 
4843     /* Initialize the 32-char bit map to all zeros. We build the map in a
4844     temporary bit of memory, in case the class contains fewer than two
4845     8-bit characters because in that case the compiled code doesn't use the bit
4846     map. */
4847 
4848     memset(classbits, 0, 32 * sizeof(pcre_uint8));
4849 
4850     /* Process characters until ] is reached. By writing this as a "do" it
4851     means that an initial ] is taken as a data character. At the start of the
4852     loop, c contains the first byte of the character. */
4853 
4854     if (c != CHAR_NULL) do
4855       {
4856       const pcre_uchar *oldptr;
4857 
4858 #ifdef SUPPORT_UTF
4859       if (utf && HAS_EXTRALEN(c))
4860         {                           /* Braces are required because the */
4861         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
4862         }
4863 #endif
4864 
4865 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4866       /* In the pre-compile phase, accumulate the length of any extra
4867       data and reset the pointer. This is so that very large classes that
4868       contain a zillion > 255 characters no longer overwrite the work space
4869       (which is on the stack). We have to remember that there was XCLASS data,
4870       however. */
4871 
4872       if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4873         {
4874         xclass = TRUE;
4875         *lengthptr += (int)(class_uchardata - class_uchardata_base);
4876         class_uchardata = class_uchardata_base;
4877         }
4878 #endif
4879 
4880       /* Inside \Q...\E everything is literal except \E */
4881 
4882       if (inescq)
4883         {
4884         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
4885           {
4886           inescq = FALSE;                   /* Reset literal state */
4887           ptr++;                            /* Skip the 'E' */
4888           continue;                         /* Carry on with next */
4889           }
4890         goto CHECK_RANGE;                   /* Could be range if \E follows */
4891         }
4892 
4893       /* Handle POSIX class names. Perl allows a negation extension of the
4894       form [:^name:]. A square bracket that doesn't match the syntax is
4895       treated as a literal. We also recognize the POSIX constructions
4896       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4897       5.6 and 5.8 do. */
4898 
4899       if (c == CHAR_LEFT_SQUARE_BRACKET &&
4900           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4901            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4902         {
4903         BOOL local_negate = FALSE;
4904         int posix_class, taboffset, tabopt;
4905         register const pcre_uint8 *cbits = cd->cbits;
4906         pcre_uint8 pbits[32];
4907 
4908         if (ptr[1] != CHAR_COLON)
4909           {
4910           *errorcodeptr = ERR31;
4911           goto FAILED;
4912           }
4913 
4914         ptr += 2;
4915         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4916           {
4917           local_negate = TRUE;
4918           should_flip_negation = TRUE;  /* Note negative special */
4919           ptr++;
4920           }
4921 
4922         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4923         if (posix_class < 0)
4924           {
4925           *errorcodeptr = ERR30;
4926           goto FAILED;
4927           }
4928 
4929         /* If matching is caseless, upper and lower are converted to
4930         alpha. This relies on the fact that the class table starts with
4931         alpha, lower, upper as the first 3 entries. */
4932 
4933         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
4934           posix_class = 0;
4935 
4936         /* When PCRE_UCP is set, some of the POSIX classes are converted to
4937         different escape sequences that use Unicode properties \p or \P. Others
4938         that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
4939         directly. */
4940 
4941 #ifdef SUPPORT_UCP
4942         if ((options & PCRE_UCP) != 0)
4943           {
4944           unsigned int ptype = 0;
4945           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
4946 
4947           /* The posix_substitutes table specifies which POSIX classes can be
4948           converted to \p or \P items. */
4949 
4950           if (posix_substitutes[pc] != NULL)
4951             {
4952             nestptr = tempptr + 1;
4953             ptr = posix_substitutes[pc] - 1;
4954             continue;
4955             }
4956 
4957           /* There are three other classes that generate special property calls
4958           that are recognized only in an XCLASS. */
4959 
4960           else switch(posix_class)
4961             {
4962             case PC_GRAPH:
4963             ptype = PT_PXGRAPH;
4964             /* Fall through */
4965             case PC_PRINT:
4966             if (ptype == 0) ptype = PT_PXPRINT;
4967             /* Fall through */
4968             case PC_PUNCT:
4969             if (ptype == 0) ptype = PT_PXPUNCT;
4970             *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
4971             *class_uchardata++ = ptype;
4972             *class_uchardata++ = 0;
4973             xclass_has_prop = TRUE;
4974             ptr = tempptr + 1;
4975             continue;
4976 
4977             /* For all other POSIX classes, no special action is taken in UCP
4978             mode. Fall through to the non_UCP case. */
4979 
4980             default:
4981             break;
4982             }
4983           }
4984 #endif
4985         /* In the non-UCP case, or when UCP makes no difference, we build the
4986         bit map for the POSIX class in a chunk of local store because we may be
4987         adding and subtracting from it, and we don't want to subtract bits that
4988         may be in the main map already. At the end we or the result into the
4989         bit map that is being built. */
4990 
4991         posix_class *= 3;
4992 
4993         /* Copy in the first table (always present) */
4994 
4995         memcpy(pbits, cbits + posix_class_maps[posix_class],
4996           32 * sizeof(pcre_uint8));
4997 
4998         /* If there is a second table, add or remove it as required. */
4999 
5000         taboffset = posix_class_maps[posix_class + 1];
5001         tabopt = posix_class_maps[posix_class + 2];
5002 
5003         if (taboffset >= 0)
5004           {
5005           if (tabopt >= 0)
5006             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
5007           else
5008             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
5009           }
5010 
5011         /* Now see if we need to remove any special characters. An option
5012         value of 1 removes vertical space and 2 removes underscore. */
5013 
5014         if (tabopt < 0) tabopt = -tabopt;
5015         if (tabopt == 1) pbits[1] &= ~0x3c;
5016           else if (tabopt == 2) pbits[11] &= 0x7f;
5017 
5018         /* Add the POSIX table or its complement into the main table that is
5019         being built and we are done. */
5020 
5021         if (local_negate)
5022           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
5023         else
5024           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
5025 
5026         ptr = tempptr + 1;
5027         /* Every class contains at least one < 256 character. */
5028         class_has_8bitchar = 1;
5029         /* Every class contains at least two characters. */
5030         class_one_char = 2;
5031         continue;    /* End of POSIX syntax handling */
5032         }
5033 
5034       /* Backslash may introduce a single character, or it may introduce one
5035       of the specials, which just set a flag. The sequence \b is a special
5036       case. Inside a class (and only there) it is treated as backspace. We
5037       assume that other escapes have more than one character in them, so
5038       speculatively set both class_has_8bitchar and class_one_char bigger
5039       than one. Unrecognized escapes fall through and are either treated
5040       as literal characters (by default), or are faulted if
5041       PCRE_EXTRA is set. */
5042 
5043       if (c == CHAR_BACKSLASH)
5044         {
5045         escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
5046           TRUE);
5047         if (*errorcodeptr != 0) goto FAILED;
5048         if (escape == 0) c = ec;
5049         else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
5050         else if (escape == ESC_N)          /* \N is not supported in a class */
5051           {
5052           *errorcodeptr = ERR71;
5053           goto FAILED;
5054           }
5055         else if (escape == ESC_Q)            /* Handle start of quoted string */
5056           {
5057           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5058             {
5059             ptr += 2; /* avoid empty string */
5060             }
5061           else inescq = TRUE;
5062           continue;
5063           }
5064         else if (escape == ESC_E) continue;  /* Ignore orphan \E */
5065 
5066         else
5067           {
5068           register const pcre_uint8 *cbits = cd->cbits;
5069           /* Every class contains at least two < 256 characters. */
5070           class_has_8bitchar++;
5071           /* Every class contains at least two characters. */
5072           class_one_char += 2;
5073 
5074           switch (escape)
5075             {
5076 #ifdef SUPPORT_UCP
5077             case ESC_du:     /* These are the values given for \d etc */
5078             case ESC_DU:     /* when PCRE_UCP is set. We replace the */
5079             case ESC_wu:     /* escape sequence with an appropriate \p */
5080             case ESC_WU:     /* or \P to test Unicode properties instead */
5081             case ESC_su:     /* of the default ASCII testing. */
5082             case ESC_SU:
5083             nestptr = ptr;
5084             ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
5085             class_has_8bitchar--;                /* Undo! */
5086             continue;
5087 #endif
5088             case ESC_d:
5089             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
5090             continue;
5091 
5092             case ESC_D:
5093             should_flip_negation = TRUE;
5094             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
5095             continue;
5096 
5097             case ESC_w:
5098             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
5099             continue;
5100 
5101             case ESC_W:
5102             should_flip_negation = TRUE;
5103             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5104             continue;
5105 
5106             /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5107             5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5108             previously set by something earlier in the character class.
5109             Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5110             we could just adjust the appropriate bit. From PCRE 8.34 we no
5111             longer treat \s and \S specially. */
5112 
5113             case ESC_s:
5114             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
5115             continue;
5116 
5117             case ESC_S:
5118             should_flip_negation = TRUE;
5119             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
5120             continue;
5121 
5122             /* The rest apply in both UCP and non-UCP cases. */
5123 
5124             case ESC_h:
5125             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5126               PRIV(hspace_list), NOTACHAR);
5127             continue;
5128 
5129             case ESC_H:
5130             (void)add_not_list_to_class(classbits, &class_uchardata, options,
5131               cd, PRIV(hspace_list));
5132             continue;
5133 
5134             case ESC_v:
5135             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5136               PRIV(vspace_list), NOTACHAR);
5137             continue;
5138 
5139             case ESC_V:
5140             (void)add_not_list_to_class(classbits, &class_uchardata, options,
5141               cd, PRIV(vspace_list));
5142             continue;
5143 
5144 #ifdef SUPPORT_UCP
5145             case ESC_p:
5146             case ESC_P:
5147               {
5148               BOOL negated;
5149               unsigned int ptype = 0, pdata = 0;
5150               if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
5151                 goto FAILED;
5152               *class_uchardata++ = ((escape == ESC_p) != negated)?
5153                 XCL_PROP : XCL_NOTPROP;
5154               *class_uchardata++ = ptype;
5155               *class_uchardata++ = pdata;
5156               xclass_has_prop = TRUE;
5157               class_has_8bitchar--;                /* Undo! */
5158               continue;
5159               }
5160 #endif
5161             /* Unrecognized escapes are faulted if PCRE is running in its
5162             strict mode. By default, for compatibility with Perl, they are
5163             treated as literals. */
5164 
5165             default:
5166             if ((options & PCRE_EXTRA) != 0)
5167               {
5168               *errorcodeptr = ERR7;
5169               goto FAILED;
5170               }
5171             class_has_8bitchar--;    /* Undo the speculative increase. */
5172             class_one_char -= 2;     /* Undo the speculative increase. */
5173             c = *ptr;                /* Get the final character and fall through */
5174             break;
5175             }
5176           }
5177 
5178         /* Fall through if the escape just defined a single character (c >= 0).
5179         This may be greater than 256. */
5180 
5181         escape = 0;
5182 
5183         }   /* End of backslash handling */
5184 
5185       /* A character may be followed by '-' to form a range. However, Perl does
5186       not permit ']' to be the end of the range. A '-' character at the end is
5187       treated as a literal. Perl ignores orphaned \E sequences entirely. The
5188       code for handling \Q and \E is messy. */
5189 
5190       CHECK_RANGE:
5191       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5192         {
5193         inescq = FALSE;
5194         ptr += 2;
5195         }
5196       oldptr = ptr;
5197 
5198       /* Remember if \r or \n were explicitly used */
5199 
5200       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5201 
5202       /* Check for range */
5203 
5204       if (!inescq && ptr[1] == CHAR_MINUS)
5205         {
5206         pcre_uint32 d;
5207         ptr += 2;
5208         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
5209 
5210         /* If we hit \Q (not followed by \E) at this point, go into escaped
5211         mode. */
5212 
5213         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
5214           {
5215           ptr += 2;
5216           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
5217             { ptr += 2; continue; }
5218           inescq = TRUE;
5219           break;
5220           }
5221 
5222         /* Minus (hyphen) at the end of a class is treated as a literal, so put
5223         back the pointer and jump to handle the character that preceded it. */
5224 
5225         if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
5226           {
5227           ptr = oldptr;
5228           goto CLASS_SINGLE_CHARACTER;
5229           }
5230 
5231         /* Otherwise, we have a potential range; pick up the next character */
5232 
5233 #ifdef SUPPORT_UTF
5234         if (utf)
5235           {                           /* Braces are required because the */
5236           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
5237           }
5238         else
5239 #endif
5240         d = *ptr;  /* Not UTF-8 mode */
5241 
5242         /* The second part of a range can be a single-character escape
5243         sequence, but not any of the other escapes. Perl treats a hyphen as a
5244         literal in such circumstances. However, in Perl's warning mode, a
5245         warning is given, so PCRE now faults it as it is almost certainly a
5246         mistake on the user's part. */
5247 
5248         if (!inescq)
5249           {
5250           if (d == CHAR_BACKSLASH)
5251             {
5252             int descape;
5253             descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5254             if (*errorcodeptr != 0) goto FAILED;
5255 
5256             /* 0 means a character was put into d; \b is backspace; any other
5257             special causes an error. */
5258 
5259             if (descape != 0)
5260               {
5261               if (descape == ESC_b) d = CHAR_BS; else
5262                 {
5263                 *errorcodeptr = ERR83;
5264                 goto FAILED;
5265                 }
5266               }
5267             }
5268 
5269           /* A hyphen followed by a POSIX class is treated in the same way. */
5270 
5271           else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5272                    (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5273                     ptr[1] == CHAR_EQUALS_SIGN) &&
5274                    check_posix_syntax(ptr, &tempptr))
5275             {
5276             *errorcodeptr = ERR83;
5277             goto FAILED;
5278             }
5279           }
5280 
5281         /* Check that the two values are in the correct order. Optimize
5282         one-character ranges. */
5283 
5284         if (d < c)
5285           {
5286           *errorcodeptr = ERR8;
5287           goto FAILED;
5288           }
5289         if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
5290 
5291         /* We have found a character range, so single character optimizations
5292         cannot be done anymore. Any value greater than 1 indicates that there
5293         is more than one character. */
5294 
5295         class_one_char = 2;
5296 
5297         /* Remember an explicit \r or \n, and add the range to the class. */
5298 
5299         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5300 
5301         class_has_8bitchar +=
5302           add_to_class(classbits, &class_uchardata, options, cd, c, d);
5303 
5304         continue;   /* Go get the next char in the class */
5305         }
5306 
5307       /* Handle a single character - we can get here for a normal non-escape
5308       char, or after \ that introduces a single character or for an apparent
5309       range that isn't. Only the value 1 matters for class_one_char, so don't
5310       increase it if it is already 2 or more ... just in case there's a class
5311       with a zillion characters in it. */
5312 
5313       CLASS_SINGLE_CHARACTER:
5314       if (class_one_char < 2) class_one_char++;
5315 
5316       /* If class_one_char is 1, we have the first single character in the
5317       class, and there have been no prior ranges, or XCLASS items generated by
5318       escapes. If this is the final character in the class, we can optimize by
5319       turning the item into a 1-character OP_CHAR[I] if it's positive, or
5320       OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
5321       to be set. Otherwise, there can be no first char if this item is first,
5322       whatever repeat count may follow. In the case of reqchar, save the
5323       previous value for reinstating. */
5324 
5325       if (!inescq && class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5326         {
5327         ptr++;
5328         zeroreqchar = reqchar;
5329         zeroreqcharflags = reqcharflags;
5330 
5331         if (negate_class)
5332           {
5333 #ifdef SUPPORT_UCP
5334           int d;
5335 #endif
5336           if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5337           zerofirstchar = firstchar;
5338           zerofirstcharflags = firstcharflags;
5339 
5340           /* For caseless UTF-8 mode when UCP support is available, check
5341           whether this character has more than one other case. If so, generate
5342           a special OP_NOTPROP item instead of OP_NOTI. */
5343 
5344 #ifdef SUPPORT_UCP
5345           if (utf && (options & PCRE_CASELESS) != 0 &&
5346               (d = UCD_CASESET(c)) != 0)
5347             {
5348             *code++ = OP_NOTPROP;
5349             *code++ = PT_CLIST;
5350             *code++ = d;
5351             }
5352           else
5353 #endif
5354           /* Char has only one other case, or UCP not available */
5355 
5356             {
5357             *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
5358 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5359             if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5360               code += PRIV(ord2utf)(c, code);
5361             else
5362 #endif
5363               *code++ = c;
5364             }
5365 
5366           /* We are finished with this character class */
5367 
5368           goto END_CLASS;
5369           }
5370 
5371         /* For a single, positive character, get the value into mcbuffer, and
5372         then we can handle this with the normal one-character code. */
5373 
5374 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5375         if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5376           mclength = PRIV(ord2utf)(c, mcbuffer);
5377         else
5378 #endif
5379           {
5380           mcbuffer[0] = c;
5381           mclength = 1;
5382           }
5383         goto ONE_CHAR;
5384         }       /* End of 1-char optimization */
5385 
5386       /* There is more than one character in the class, or an XCLASS item
5387       has been generated. Add this character to the class. */
5388 
5389       class_has_8bitchar +=
5390         add_to_class(classbits, &class_uchardata, options, cd, c, c);
5391       }
5392 
5393     /* Loop until ']' reached. This "while" is the end of the "do" far above.
5394     If we are at the end of an internal nested string, revert to the outer
5395     string. */
5396 
5397     while (((c = *(++ptr)) != CHAR_NULL ||
5398            (nestptr != NULL &&
5399              (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5400            (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5401 
5402     /* Check for missing terminating ']' */
5403 
5404     if (c == CHAR_NULL)
5405       {
5406       *errorcodeptr = ERR6;
5407       goto FAILED;
5408       }
5409 
5410     /* We will need an XCLASS if data has been placed in class_uchardata. In
5411     the second phase this is a sufficient test. However, in the pre-compile
5412     phase, class_uchardata gets emptied to prevent workspace overflow, so
5413     only if the very last character in the class needs XCLASS will it contain
5414     anything at this point. For this reason, xclass gets set TRUE above when
5415     class_uchardata is emptied, and that's why this code is the way it is here
5416     instead of just doing a test on class_uchardata below. */
5417 
5418 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5419     if (class_uchardata > class_uchardata_base) xclass = TRUE;
5420 #endif
5421 
5422     /* If this is the first thing in the branch, there can be no first char
5423     setting, whatever the repeat count. Any reqchar setting must remain
5424     unchanged after any kind of repeat. */
5425 
5426     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5427     zerofirstchar = firstchar;
5428     zerofirstcharflags = firstcharflags;
5429     zeroreqchar = reqchar;
5430     zeroreqcharflags = reqcharflags;
5431 
5432     /* If there are characters with values > 255, we have to compile an
5433     extended class, with its own opcode, unless there was a negated special
5434     such as \S in the class, and PCRE_UCP is not set, because in that case all
5435     characters > 255 are in the class, so any that were explicitly given as
5436     well can be ignored. If (when there are explicit characters > 255 that must
5437     be listed) there are no characters < 256, we can omit the bitmap in the
5438     actual compiled code. */
5439 
5440 #ifdef SUPPORT_UTF
5441     if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
5442 #elif !defined COMPILE_PCRE8
5443     if (xclass && !should_flip_negation)
5444 #endif
5445 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5446       {
5447       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
5448       *code++ = OP_XCLASS;
5449       code += LINK_SIZE;
5450       *code = negate_class? XCL_NOT:0;
5451       if (xclass_has_prop) *code |= XCL_HASPROP;
5452 
5453       /* If the map is required, move up the extra data to make room for it;
5454       otherwise just move the code pointer to the end of the extra data. */
5455 
5456       if (class_has_8bitchar > 0)
5457         {
5458         *code++ |= XCL_MAP;
5459         memmove(code + (32 / sizeof(pcre_uchar)), code,
5460           IN_UCHARS(class_uchardata - code));
5461         if (negate_class && !xclass_has_prop)
5462           for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5463         memcpy(code, classbits, 32);
5464         code = class_uchardata + (32 / sizeof(pcre_uchar));
5465         }
5466       else code = class_uchardata;
5467 
5468       /* Now fill in the complete length of the item */
5469 
5470       PUT(previous, 1, (int)(code - previous));
5471       break;   /* End of class handling */
5472       }
5473 #endif
5474 
5475     /* If there are no characters > 255, or they are all to be included or
5476     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5477     whole class was negated and whether there were negative specials such as \S
5478     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5479     negating it if necessary. */
5480 
5481     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5482     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
5483       {
5484       if (negate_class)
5485         for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5486       memcpy(code, classbits, 32);
5487       }
5488     code += 32 / sizeof(pcre_uchar);
5489 
5490     END_CLASS:
5491     break;
5492 
5493 
5494     /* ===================================================================*/
5495     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5496     has been tested above. */
5497 
5498     case CHAR_LEFT_CURLY_BRACKET:
5499     if (!is_quantifier) goto NORMAL_CHAR;
5500     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5501     if (*errorcodeptr != 0) goto FAILED;
5502     goto REPEAT;
5503 
5504     case CHAR_ASTERISK:
5505     repeat_min = 0;
5506     repeat_max = -1;
5507     goto REPEAT;
5508 
5509     case CHAR_PLUS:
5510     repeat_min = 1;
5511     repeat_max = -1;
5512     goto REPEAT;
5513 
5514     case CHAR_QUESTION_MARK:
5515     repeat_min = 0;
5516     repeat_max = 1;
5517 
5518     REPEAT:
5519     if (previous == NULL)
5520       {
5521       *errorcodeptr = ERR9;
5522       goto FAILED;
5523       }
5524 
5525     if (repeat_min == 0)
5526       {
5527       firstchar = zerofirstchar;    /* Adjust for zero repeat */
5528       firstcharflags = zerofirstcharflags;
5529       reqchar = zeroreqchar;        /* Ditto */
5530       reqcharflags = zeroreqcharflags;
5531       }
5532 
5533     /* Remember whether this is a variable length repeat */
5534 
5535     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5536 
5537     op_type = 0;                    /* Default single-char op codes */
5538     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
5539 
5540     /* Save start of previous item, in case we have to move it up in order to
5541     insert something before it. */
5542 
5543     tempcode = previous;
5544 
5545     /* Before checking for a possessive quantifier, we must skip over
5546     whitespace and comments in extended mode because Perl allows white space at
5547     this point. */
5548 
5549     if ((options & PCRE_EXTENDED) != 0)
5550       {
5551       const pcre_uchar *p = ptr + 1;
5552       for (;;)
5553         {
5554         while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5555         if (*p != CHAR_NUMBER_SIGN) break;
5556         p++;
5557         while (*p != CHAR_NULL)
5558           {
5559           if (IS_NEWLINE(p))         /* For non-fixed-length newline cases, */
5560             {                        /* IS_NEWLINE sets cd->nllen. */
5561             p += cd->nllen;
5562             break;
5563             }
5564           p++;
5565 #ifdef SUPPORT_UTF
5566           if (utf) FORWARDCHAR(p);
5567 #endif
5568           }           /* Loop for comment characters */
5569         }             /* Loop for multiple comments */
5570       ptr = p - 1;    /* Character before the next significant one. */
5571       }
5572 
5573     /* If the next character is '+', we have a possessive quantifier. This
5574     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5575     If the next character is '?' this is a minimizing repeat, by default,
5576     but if PCRE_UNGREEDY is set, it works the other way round. We change the
5577     repeat type to the non-default. */
5578 
5579     if (ptr[1] == CHAR_PLUS)
5580       {
5581       repeat_type = 0;                  /* Force greedy */
5582       possessive_quantifier = TRUE;
5583       ptr++;
5584       }
5585     else if (ptr[1] == CHAR_QUESTION_MARK)
5586       {
5587       repeat_type = greedy_non_default;
5588       ptr++;
5589       }
5590     else repeat_type = greedy_default;
5591 
5592     /* If previous was a recursion call, wrap it in atomic brackets so that
5593     previous becomes the atomic group. All recursions were so wrapped in the
5594     past, but it no longer happens for non-repeated recursions. In fact, the
5595     repeated ones could be re-implemented independently so as not to need this,
5596     but for the moment we rely on the code for repeating groups. */
5597 
5598     if (*previous == OP_RECURSE)
5599       {
5600       memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5601       *previous = OP_ONCE;
5602       PUT(previous, 1, 2 + 2*LINK_SIZE);
5603       previous[2 + 2*LINK_SIZE] = OP_KET;
5604       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5605       code += 2 + 2 * LINK_SIZE;
5606       length_prevgroup = 3 + 3*LINK_SIZE;
5607 
5608       /* When actually compiling, we need to check whether this was a forward
5609       reference, and if so, adjust the offset. */
5610 
5611       if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5612         {
5613         int offset = GET(cd->hwm, -LINK_SIZE);
5614         if (offset == previous + 1 - cd->start_code)
5615           PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5616         }
5617       }
5618 
5619     /* Now handle repetition for the different types of item. */
5620 
5621     /* If previous was a character or negated character match, abolish the item
5622     and generate a repeat item instead. If a char item has a minimum of more
5623     than one, ensure that it is set in reqchar - it might not be if a sequence
5624     such as x{3} is the first thing in a branch because the x will have gone
5625     into firstchar instead.  */
5626 
5627     if (*previous == OP_CHAR || *previous == OP_CHARI
5628         || *previous == OP_NOT || *previous == OP_NOTI)
5629       {
5630       switch (*previous)
5631         {
5632         default: /* Make compiler happy. */
5633         case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
5634         case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5635         case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
5636         case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
5637         }
5638 
5639       /* Deal with UTF characters that take up more than one character. It's
5640       easier to write this out separately than try to macrify it. Use c to
5641       hold the length of the character in bytes, plus UTF_LENGTH to flag that
5642       it's a length rather than a small character. */
5643 
5644 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5645       if (utf && NOT_FIRSTCHAR(code[-1]))
5646         {
5647         pcre_uchar *lastchar = code - 1;
5648         BACKCHAR(lastchar);
5649         c = (int)(code - lastchar);     /* Length of UTF-8 character */
5650         memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5651         c |= UTF_LENGTH;                /* Flag c as a length */
5652         }
5653       else
5654 #endif /* SUPPORT_UTF */
5655 
5656       /* Handle the case of a single character - either with no UTF support, or
5657       with UTF disabled, or for a single character UTF character. */
5658         {
5659         c = code[-1];
5660         if (*previous <= OP_CHARI && repeat_min > 1)
5661           {
5662           reqchar = c;
5663           reqcharflags = req_caseopt | cd->req_varyopt;
5664           }
5665         }
5666 
5667       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
5668       }
5669 
5670     /* If previous was a character type match (\d or similar), abolish it and
5671     create a suitable repeat item. The code is shared with single-character
5672     repeats by setting op_type to add a suitable offset into repeat_type. Note
5673     that the Unicode property types will be present only when SUPPORT_UCP is
5674     defined, but we don't wrap the little bits of code here because it just
5675     makes it horribly messy. */
5676 
5677     else if (*previous < OP_EODN)
5678       {
5679       pcre_uchar *oldcode;
5680       int prop_type, prop_value;
5681       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
5682       c = *previous;
5683 
5684       OUTPUT_SINGLE_REPEAT:
5685       if (*previous == OP_PROP || *previous == OP_NOTPROP)
5686         {
5687         prop_type = previous[1];
5688         prop_value = previous[2];
5689         }
5690       else prop_type = prop_value = -1;
5691 
5692       oldcode = code;
5693       code = previous;                  /* Usually overwrite previous item */
5694 
5695       /* If the maximum is zero then the minimum must also be zero; Perl allows
5696       this case, so we do too - by simply omitting the item altogether. */
5697 
5698       if (repeat_max == 0) goto END_REPEAT;
5699 
5700       /* Combine the op_type with the repeat_type */
5701 
5702       repeat_type += op_type;
5703 
5704       /* A minimum of zero is handled either as the special case * or ?, or as
5705       an UPTO, with the maximum given. */
5706 
5707       if (repeat_min == 0)
5708         {
5709         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5710           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5711         else
5712           {
5713           *code++ = OP_UPTO + repeat_type;
5714           PUT2INC(code, 0, repeat_max);
5715           }
5716         }
5717 
5718       /* A repeat minimum of 1 is optimized into some special cases. If the
5719       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5720       left in place and, if the maximum is greater than 1, we use OP_UPTO with
5721       one less than the maximum. */
5722 
5723       else if (repeat_min == 1)
5724         {
5725         if (repeat_max == -1)
5726           *code++ = OP_PLUS + repeat_type;
5727         else
5728           {
5729           code = oldcode;                 /* leave previous item in place */
5730           if (repeat_max == 1) goto END_REPEAT;
5731           *code++ = OP_UPTO + repeat_type;
5732           PUT2INC(code, 0, repeat_max - 1);
5733           }
5734         }
5735 
5736       /* The case {n,n} is just an EXACT, while the general case {n,m} is
5737       handled as an EXACT followed by an UPTO. */
5738 
5739       else
5740         {
5741         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
5742         PUT2INC(code, 0, repeat_min);
5743 
5744         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5745         we have to insert the character for the previous code. For a repeated
5746         Unicode property match, there are two extra bytes that define the
5747         required property. In UTF-8 mode, long characters have their length in
5748         c, with the UTF_LENGTH bit as a flag. */
5749 
5750         if (repeat_max < 0)
5751           {
5752 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5753           if (utf && (c & UTF_LENGTH) != 0)
5754             {
5755             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5756             code += c & 7;
5757             }
5758           else
5759 #endif
5760             {
5761             *code++ = c;
5762             if (prop_type >= 0)
5763               {
5764               *code++ = prop_type;
5765               *code++ = prop_value;
5766               }
5767             }
5768           *code++ = OP_STAR + repeat_type;
5769           }
5770 
5771         /* Else insert an UPTO if the max is greater than the min, again
5772         preceded by the character, for the previously inserted code. If the
5773         UPTO is just for 1 instance, we can use QUERY instead. */
5774 
5775         else if (repeat_max != repeat_min)
5776           {
5777 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5778           if (utf && (c & UTF_LENGTH) != 0)
5779             {
5780             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5781             code += c & 7;
5782             }
5783           else
5784 #endif
5785           *code++ = c;
5786           if (prop_type >= 0)
5787             {
5788             *code++ = prop_type;
5789             *code++ = prop_value;
5790             }
5791           repeat_max -= repeat_min;
5792 
5793           if (repeat_max == 1)
5794             {
5795             *code++ = OP_QUERY + repeat_type;
5796             }
5797           else
5798             {
5799             *code++ = OP_UPTO + repeat_type;
5800             PUT2INC(code, 0, repeat_max);
5801             }
5802           }
5803         }
5804 
5805       /* The character or character type itself comes last in all cases. */
5806 
5807 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5808       if (utf && (c & UTF_LENGTH) != 0)
5809         {
5810         memcpy(code, utf_chars, IN_UCHARS(c & 7));
5811         code += c & 7;
5812         }
5813       else
5814 #endif
5815       *code++ = c;
5816 
5817       /* For a repeated Unicode property match, there are two extra bytes that
5818       define the required property. */
5819 
5820 #ifdef SUPPORT_UCP
5821       if (prop_type >= 0)
5822         {
5823         *code++ = prop_type;
5824         *code++ = prop_value;
5825         }
5826 #endif
5827       }
5828 
5829     /* If previous was a character class or a back reference, we put the repeat
5830     stuff after it, but just skip the item if the repeat was {0,0}. */
5831 
5832     else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5833 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5834              *previous == OP_XCLASS ||
5835 #endif
5836              *previous == OP_REF   || *previous == OP_REFI ||
5837              *previous == OP_DNREF || *previous == OP_DNREFI)
5838       {
5839       if (repeat_max == 0)
5840         {
5841         code = previous;
5842         goto END_REPEAT;
5843         }
5844 
5845       if (repeat_min == 0 && repeat_max == -1)
5846         *code++ = OP_CRSTAR + repeat_type;
5847       else if (repeat_min == 1 && repeat_max == -1)
5848         *code++ = OP_CRPLUS + repeat_type;
5849       else if (repeat_min == 0 && repeat_max == 1)
5850         *code++ = OP_CRQUERY + repeat_type;
5851       else
5852         {
5853         *code++ = OP_CRRANGE + repeat_type;
5854         PUT2INC(code, 0, repeat_min);
5855         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
5856         PUT2INC(code, 0, repeat_max);
5857         }
5858       }
5859 
5860     /* If previous was a bracket group, we may have to replicate it in certain
5861     cases. Note that at this point we can encounter only the "basic" bracket
5862     opcodes such as BRA and CBRA, as this is the place where they get converted
5863     into the more special varieties such as BRAPOS and SBRA. A test for >=
5864     OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5865     ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
5866     Originally, PCRE did not allow repetition of assertions, but now it does,
5867     for Perl compatibility. */
5868 
5869     else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5870       {
5871       register int i;
5872       int len = (int)(code - previous);
5873       pcre_uchar *bralink = NULL;
5874       pcre_uchar *brazeroptr = NULL;
5875 
5876       /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5877       we just ignore the repeat. */
5878 
5879       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5880         goto END_REPEAT;
5881 
5882       /* There is no sense in actually repeating assertions. The only potential
5883       use of repetition is in cases when the assertion is optional. Therefore,
5884       if the minimum is greater than zero, just ignore the repeat. If the
5885       maximum is not zero or one, set it to 1. */
5886 
5887       if (*previous < OP_ONCE)    /* Assertion */
5888         {
5889         if (repeat_min > 0) goto END_REPEAT;
5890         if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5891         }
5892 
5893       /* The case of a zero minimum is special because of the need to stick
5894       OP_BRAZERO in front of it, and because the group appears once in the
5895       data, whereas in other cases it appears the minimum number of times. For
5896       this reason, it is simplest to treat this case separately, as otherwise
5897       the code gets far too messy. There are several special subcases when the
5898       minimum is zero. */
5899 
5900       if (repeat_min == 0)
5901         {
5902         /* If the maximum is also zero, we used to just omit the group from the
5903         output altogether, like this:
5904 
5905         ** if (repeat_max == 0)
5906         **   {
5907         **   code = previous;
5908         **   goto END_REPEAT;
5909         **   }
5910 
5911         However, that fails when a group or a subgroup within it is referenced
5912         as a subroutine from elsewhere in the pattern, so now we stick in
5913         OP_SKIPZERO in front of it so that it is skipped on execution. As we
5914         don't have a list of which groups are referenced, we cannot do this
5915         selectively.
5916 
5917         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5918         and do no more at this point. However, we do need to adjust any
5919         OP_RECURSE calls inside the group that refer to the group itself or any
5920         internal or forward referenced group, because the offset is from the
5921         start of the whole regex. Temporarily terminate the pattern while doing
5922         this. */
5923 
5924         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
5925           {
5926           *code = OP_END;
5927           adjust_recurse(previous, 1, utf, cd, save_hwm);
5928           memmove(previous + 1, previous, IN_UCHARS(len));
5929           code++;
5930           if (repeat_max == 0)
5931             {
5932             *previous++ = OP_SKIPZERO;
5933             goto END_REPEAT;
5934             }
5935           brazeroptr = previous;    /* Save for possessive optimizing */
5936           *previous++ = OP_BRAZERO + repeat_type;
5937           }
5938 
5939         /* If the maximum is greater than 1 and limited, we have to replicate
5940         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
5941         The first one has to be handled carefully because it's the original
5942         copy, which has to be moved up. The remainder can be handled by code
5943         that is common with the non-zero minimum case below. We have to
5944         adjust the value of repeat_max, since one less copy is required. Once
5945         again, we may have to adjust any OP_RECURSE calls inside the group. */
5946 
5947         else
5948           {
5949           int offset;
5950           *code = OP_END;
5951           adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5952           memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5953           code += 2 + LINK_SIZE;
5954           *previous++ = OP_BRAZERO + repeat_type;
5955           *previous++ = OP_BRA;
5956 
5957           /* We chain together the bracket offset fields that have to be
5958           filled in later when the ends of the brackets are reached. */
5959 
5960           offset = (bralink == NULL)? 0 : (int)(previous - bralink);
5961           bralink = previous;
5962           PUTINC(previous, 0, offset);
5963           }
5964 
5965         repeat_max--;
5966         }
5967 
5968       /* If the minimum is greater than zero, replicate the group as many
5969       times as necessary, and adjust the maximum to the number of subsequent
5970       copies that we need. If we set a first char from the group, and didn't
5971       set a required char, copy the latter from the former. If there are any
5972       forward reference subroutine calls in the group, there will be entries on
5973       the workspace list; replicate these with an appropriate increment. */
5974 
5975       else
5976         {
5977         if (repeat_min > 1)
5978           {
5979           /* In the pre-compile phase, we don't actually do the replication. We
5980           just adjust the length as if we had. Do some paranoid checks for
5981           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
5982           integer type when available, otherwise double. */
5983 
5984           if (lengthptr != NULL)
5985             {
5986             int delta = (repeat_min - 1)*length_prevgroup;
5987             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
5988                   (INT64_OR_DOUBLE)length_prevgroup >
5989                     (INT64_OR_DOUBLE)INT_MAX ||
5990                 OFLOW_MAX - *lengthptr < delta)
5991               {
5992               *errorcodeptr = ERR20;
5993               goto FAILED;
5994               }
5995             *lengthptr += delta;
5996             }
5997 
5998           /* This is compiling for real. If there is a set first byte for
5999           the group, and we have not yet set a "required byte", set it. Make
6000           sure there is enough workspace for copying forward references before
6001           doing the copy. */
6002 
6003           else
6004             {
6005             if (groupsetfirstchar && reqcharflags < 0)
6006               {
6007               reqchar = firstchar;
6008               reqcharflags = firstcharflags;
6009               }
6010 
6011             for (i = 1; i < repeat_min; i++)
6012               {
6013               pcre_uchar *hc;
6014               pcre_uchar *this_hwm = cd->hwm;
6015               memcpy(code, previous, IN_UCHARS(len));
6016 
6017               while (cd->hwm > cd->start_workspace + cd->workspace_size -
6018                      WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
6019                 {
6020                 size_t save_offset = save_hwm - cd->start_workspace;
6021                 size_t this_offset = this_hwm - cd->start_workspace;
6022                 *errorcodeptr = expand_workspace(cd);
6023                 if (*errorcodeptr != 0) goto FAILED;
6024                 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
6025                 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
6026                 }
6027 
6028               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
6029                 {
6030                 PUT(cd->hwm, 0, GET(hc, 0) + len);
6031                 cd->hwm += LINK_SIZE;
6032                 }
6033               save_hwm = this_hwm;
6034               code += len;
6035               }
6036             }
6037           }
6038 
6039         if (repeat_max > 0) repeat_max -= repeat_min;
6040         }
6041 
6042       /* This code is common to both the zero and non-zero minimum cases. If
6043       the maximum is limited, it replicates the group in a nested fashion,
6044       remembering the bracket starts on a stack. In the case of a zero minimum,
6045       the first one was set up above. In all cases the repeat_max now specifies
6046       the number of additional copies needed. Again, we must remember to
6047       replicate entries on the forward reference list. */
6048 
6049       if (repeat_max >= 0)
6050         {
6051         /* In the pre-compile phase, we don't actually do the replication. We
6052         just adjust the length as if we had. For each repetition we must add 1
6053         to the length for BRAZERO and for all but the last repetition we must
        add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
6055         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
6056         a 64-bit integer type when available, otherwise double. */
6057 
6058         if (lengthptr != NULL && repeat_max > 0)
6059           {
6060           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
6061                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
6062           if ((INT64_OR_DOUBLE)repeat_max *
6063                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
6064                   > (INT64_OR_DOUBLE)INT_MAX ||
6065               OFLOW_MAX - *lengthptr < delta)
6066             {
6067             *errorcodeptr = ERR20;
6068             goto FAILED;
6069             }
6070           *lengthptr += delta;
6071           }
6072 
6073         /* This is compiling for real */
6074 
6075         else for (i = repeat_max - 1; i >= 0; i--)
6076           {
6077           pcre_uchar *hc;
6078           pcre_uchar *this_hwm = cd->hwm;
6079 
6080           *code++ = OP_BRAZERO + repeat_type;
6081 
6082           /* All but the final copy start a new nesting, maintaining the
6083           chain of brackets outstanding. */
6084 
6085           if (i != 0)
6086             {
6087             int offset;
6088             *code++ = OP_BRA;
6089             offset = (bralink == NULL)? 0 : (int)(code - bralink);
6090             bralink = code;
6091             PUTINC(code, 0, offset);
6092             }
6093 
6094           memcpy(code, previous, IN_UCHARS(len));
6095 
6096           /* Ensure there is enough workspace for forward references before
6097           copying them. */
6098 
6099           while (cd->hwm > cd->start_workspace + cd->workspace_size -
6100                  WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
6101             {
6102             size_t save_offset = save_hwm - cd->start_workspace;
6103             size_t this_offset = this_hwm - cd->start_workspace;
6104             *errorcodeptr = expand_workspace(cd);
6105             if (*errorcodeptr != 0) goto FAILED;
6106             save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
6107             this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
6108             }
6109 
6110           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
6111             {
6112             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
6113             cd->hwm += LINK_SIZE;
6114             }
6115           save_hwm = this_hwm;
6116           code += len;
6117           }
6118 
6119         /* Now chain through the pending brackets, and fill in their length
6120         fields (which are holding the chain links pro tem). */
6121 
6122         while (bralink != NULL)
6123           {
6124           int oldlinkoffset;
6125           int offset = (int)(code - bralink + 1);
6126           pcre_uchar *bra = code - offset;
6127           oldlinkoffset = GET(bra, 1);
6128           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
6129           *code++ = OP_KET;
6130           PUTINC(code, 0, offset);
6131           PUT(bra, 1, offset);
6132           }
6133         }
6134 
6135       /* If the maximum is unlimited, set a repeater in the final copy. For
6136       ONCE brackets, that's all we need to do. However, possessively repeated
6137       ONCE brackets can be converted into non-capturing brackets, as the
6138       behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6139       deal with possessive ONCEs specially.
6140 
6141       Otherwise, when we are doing the actual compile phase, check to see
6142       whether this group is one that could match an empty string. If so,
6143       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6144       that runtime checking can be done. [This check is also applied to ONCE
6145       groups at runtime, but in a different way.]
6146 
6147       Then, if the quantifier was possessive and the bracket is not a
6148       conditional, we convert the BRA code to the POS form, and the KET code to
6149       KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6150       subpattern at both the start and at the end.) The use of special opcodes
6151       makes it possible to reduce greatly the stack usage in pcre_exec(). If
6152       the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6153 
6154       Then, if the minimum number of matches is 1 or 0, cancel the possessive
6155       flag so that the default action below, of wrapping everything inside
6156       atomic brackets, does not happen. When the minimum is greater than 1,
6157       there will be earlier copies of the group, and so we still have to wrap
6158       the whole thing. */
6159 
6160       else
6161         {
6162         pcre_uchar *ketcode = code - 1 - LINK_SIZE;
6163         pcre_uchar *bracode = ketcode - GET(ketcode, 1);
6164 
6165         /* Convert possessive ONCE brackets to non-capturing */
6166 
6167         if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
6168             possessive_quantifier) *bracode = OP_BRA;
6169 
6170         /* For non-possessive ONCE brackets, all we need to do is to
6171         set the KET. */
6172 
6173         if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
6174           *ketcode = OP_KETRMAX + repeat_type;
6175 
6176         /* Handle non-ONCE brackets and possessive ONCEs (which have been
6177         converted to non-capturing above). */
6178 
6179         else
6180           {
6181           /* In the compile phase, check for empty string matching. */
6182 
6183           if (lengthptr == NULL)
6184             {
6185             pcre_uchar *scode = bracode;
6186             do
6187               {
6188               if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
6189                 {
6190                 *bracode += OP_SBRA - OP_BRA;
6191                 break;
6192                 }
6193               scode += GET(scode, 1);
6194               }
6195             while (*scode == OP_ALT);
6196             }
6197 
6198           /* Handle possessive quantifiers. */
6199 
6200           if (possessive_quantifier)
6201             {
6202             /* For COND brackets, we wrap the whole thing in a possessively
6203             repeated non-capturing bracket, because we have not invented POS
6204             versions of the COND opcodes. Because we are moving code along, we
6205             must ensure that any pending recursive references are updated. */
6206 
6207             if (*bracode == OP_COND || *bracode == OP_SCOND)
6208               {
6209               int nlen = (int)(code - bracode);
6210               *code = OP_END;
6211               adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
6212               memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
6213               code += 1 + LINK_SIZE;
6214               nlen += 1 + LINK_SIZE;
6215               *bracode = OP_BRAPOS;
6216               *code++ = OP_KETRPOS;
6217               PUTINC(code, 0, nlen);
6218               PUT(bracode, 1, nlen);
6219               }
6220 
6221             /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6222 
6223             else
6224               {
6225               *bracode += 1;              /* Switch to xxxPOS opcodes */
6226               *ketcode = OP_KETRPOS;
6227               }
6228 
6229             /* If the minimum is zero, mark it as possessive, then unset the
6230             possessive flag when the minimum is 0 or 1. */
6231 
6232             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6233             if (repeat_min < 2) possessive_quantifier = FALSE;
6234             }
6235 
6236           /* Non-possessive quantifier */
6237 
6238           else *ketcode = OP_KETRMAX + repeat_type;
6239           }
6240         }
6241       }
6242 
6243     /* If previous is OP_FAIL, it was generated by an empty class [] in
6244     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
6245     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
6246     error above. We can just ignore the repeat in JS case. */
6247 
6248     else if (*previous == OP_FAIL) goto END_REPEAT;
6249 
6250     /* Else there's some kind of shambles */
6251 
6252     else
6253       {
6254       *errorcodeptr = ERR11;
6255       goto FAILED;
6256       }
6257 
6258     /* If the character following a repeat is '+', possessive_quantifier is
6259     TRUE. For some opcodes, there are special alternative opcodes for this
6260     case. For anything else, we wrap the entire repeated item inside OP_ONCE
6261     brackets. Logically, the '+' notation is just syntactic sugar, taken from
6262     Sun's Java package, but the special opcodes can optimize it.
6263 
6264     Some (but not all) possessively repeated subpatterns have already been
6265     completely handled in the code just above. For them, possessive_quantifier
6266     is always FALSE at this stage. Note that the repeated item starts at
6267     tempcode, not at previous, which might be the first part of a string whose
6268     (former) last char we repeated. */
6269 
6270     if (possessive_quantifier)
6271       {
6272       int len;
6273 
6274       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6275       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6276       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6277       remains is greater than zero, there's a further opcode that can be
6278       handled. If not, do nothing, leaving the EXACT alone. */
6279 
6280       switch(*tempcode)
6281         {
6282         case OP_TYPEEXACT:
6283         tempcode += PRIV(OP_lengths)[*tempcode] +
6284           ((tempcode[1 + IMM2_SIZE] == OP_PROP
6285           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6286         break;
6287 
6288         /* CHAR opcodes are used for exacts whose count is 1. */
6289 
6290         case OP_CHAR:
6291         case OP_CHARI:
6292         case OP_NOT:
6293         case OP_NOTI:
6294         case OP_EXACT:
6295         case OP_EXACTI:
6296         case OP_NOTEXACT:
6297         case OP_NOTEXACTI:
6298         tempcode += PRIV(OP_lengths)[*tempcode];
6299 #ifdef SUPPORT_UTF
6300         if (utf && HAS_EXTRALEN(tempcode[-1]))
6301           tempcode += GET_EXTRALEN(tempcode[-1]);
6302 #endif
6303         break;
6304 
6305         /* For the class opcodes, the repeat operator appears at the end;
6306         adjust tempcode to point to it. */
6307 
6308         case OP_CLASS:
6309         case OP_NCLASS:
6310         tempcode += 1 + 32/sizeof(pcre_uchar);
6311         break;
6312 
6313 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6314         case OP_XCLASS:
6315         tempcode += GET(tempcode, 1);
6316         break;
6317 #endif
6318         }
6319 
6320       /* If tempcode is equal to code (which points to the end of the repeated
6321       item), it means we have skipped an EXACT item but there is no following
6322       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6323       all other cases, tempcode will be pointing to the repeat opcode, and will
6324       be less than code, so the value of len will be greater than 0. */
6325 
6326       len = (int)(code - tempcode);
6327       if (len > 0)
6328         {
6329         unsigned int repcode = *tempcode;
6330 
6331         /* There is a table for possessifying opcodes, all of which are less
6332         than OP_CALLOUT. A zero entry means there is no possessified version.
6333         */
6334 
6335         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6336           *tempcode = opcode_possessify[repcode];
6337 
        /* For opcodes without a special possessified version, wrap the item in
6339         ONCE brackets. Because we are moving code along, we must ensure that any
6340         pending recursive references are updated. */
6341 
6342         else
6343           {
6344           *code = OP_END;
6345           adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
6346           memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6347           code += 1 + LINK_SIZE;
6348           len += 1 + LINK_SIZE;
6349           tempcode[0] = OP_ONCE;
6350           *code++ = OP_KET;
6351           PUTINC(code, 0, len);
6352           PUT(tempcode, 1, len);
6353           }
6354         }
6355 
6356 #ifdef NEVER
6357       if (len > 0) switch (*tempcode)
6358         {
6359         case OP_STAR:  *tempcode = OP_POSSTAR; break;
6360         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
6361         case OP_QUERY: *tempcode = OP_POSQUERY; break;
6362         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
6363 
6364         case OP_STARI:  *tempcode = OP_POSSTARI; break;
6365         case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
6366         case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
6367         case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
6368 
6369         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
6370         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
6371         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
6372         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
6373 
6374         case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
6375         case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
6376         case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
6377         case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
6378 
6379         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
6380         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
6381         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6382         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
6383 
6384         case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
6385         case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
6386         case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
6387         case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
6388 
6389         /* Because we are moving code along, we must ensure that any
6390         pending recursive references are updated. */
6391 
6392         default:
6393         *code = OP_END;
6394         adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
6395         memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6396         code += 1 + LINK_SIZE;
6397         len += 1 + LINK_SIZE;
6398         tempcode[0] = OP_ONCE;
6399         *code++ = OP_KET;
6400         PUTINC(code, 0, len);
6401         PUT(tempcode, 1, len);
6402         break;
6403         }
6404 #endif
6405       }
6406 
    /* In all cases we no longer have a previous item. We also set the
6408     "follows varying string" flag for subsequently encountered reqchars if
6409     it isn't already set and we have just passed a varying length item. */
6410 
6411     END_REPEAT:
6412     previous = NULL;
6413     cd->req_varyopt |= reqvary;
6414     break;
6415 
6416 
6417     /* ===================================================================*/
6418     /* Start of nested parenthesized sub-expression, or comment or lookahead or
6419     lookbehind or option setting or condition or all the other extended
6420     parenthesis forms.  */
6421 
6422     case CHAR_LEFT_PARENTHESIS:
6423     newoptions = options;
6424     skipbytes = 0;
6425     bravalue = OP_CBRA;
6426     save_hwm = cd->hwm;
6427     reset_bracount = FALSE;
6428 
6429     /* First deal with various "verbs" that can be introduced by '*'. */
6430 
6431     ptr++;
6432     if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
6433          || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6434       {
6435       int i, namelen;
6436       int arglen = 0;
6437       const char *vn = verbnames;
6438       const pcre_uchar *name = ptr + 1;
6439       const pcre_uchar *arg = NULL;
6440       previous = NULL;
6441       ptr++;
6442       while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
6443       namelen = (int)(ptr - name);
6444 
6445       /* It appears that Perl allows any characters whatsoever, other than
6446       a closing parenthesis, to appear in arguments, so we no longer insist on
6447       letters, digits, and underscores. */
6448 
6449       if (*ptr == CHAR_COLON)
6450         {
6451         arg = ++ptr;
6452         while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6453         arglen = (int)(ptr - arg);
6454         if ((unsigned int)arglen > MAX_MARK)
6455           {
6456           *errorcodeptr = ERR75;
6457           goto FAILED;
6458           }
6459         }
6460 
6461       if (*ptr != CHAR_RIGHT_PARENTHESIS)
6462         {
6463         *errorcodeptr = ERR60;
6464         goto FAILED;
6465         }
6466 
6467       /* Scan the table of verb names */
6468 
6469       for (i = 0; i < verbcount; i++)
6470         {
6471         if (namelen == verbs[i].len &&
6472             STRNCMP_UC_C8(name, vn, namelen) == 0)
6473           {
6474           int setverb;
6475 
6476           /* Check for open captures before ACCEPT and convert it to
6477           ASSERT_ACCEPT if in an assertion. */
6478 
6479           if (verbs[i].op == OP_ACCEPT)
6480             {
6481             open_capitem *oc;
6482             if (arglen != 0)
6483               {
6484               *errorcodeptr = ERR59;
6485               goto FAILED;
6486               }
6487             cd->had_accept = TRUE;
6488             for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6489               {
6490               *code++ = OP_CLOSE;
6491               PUT2INC(code, 0, oc->number);
6492               }
6493             setverb = *code++ =
6494               (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6495 
6496             /* Do not set firstchar after *ACCEPT */
6497             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6498             }
6499 
6500           /* Handle other cases with/without an argument */
6501 
6502           else if (arglen == 0)
6503             {
6504             if (verbs[i].op < 0)   /* Argument is mandatory */
6505               {
6506               *errorcodeptr = ERR66;
6507               goto FAILED;
6508               }
6509             setverb = *code++ = verbs[i].op;
6510             }
6511 
6512           else
6513             {
6514             if (verbs[i].op_arg < 0)   /* Argument is forbidden */
6515               {
6516               *errorcodeptr = ERR59;
6517               goto FAILED;
6518               }
6519             setverb = *code++ = verbs[i].op_arg;
6520             *code++ = arglen;
6521             memcpy(code, arg, IN_UCHARS(arglen));
6522             code += arglen;
6523             *code++ = 0;
6524             }
6525 
6526           switch (setverb)
6527             {
6528             case OP_THEN:
6529             case OP_THEN_ARG:
6530             cd->external_flags |= PCRE_HASTHEN;
6531             break;
6532 
6533             case OP_PRUNE:
6534             case OP_PRUNE_ARG:
6535             case OP_SKIP:
6536             case OP_SKIP_ARG:
6537             cd->had_pruneorskip = TRUE;
6538             break;
6539             }
6540 
6541           break;  /* Found verb, exit loop */
6542           }
6543 
6544         vn += verbs[i].len + 1;
6545         }
6546 
6547       if (i < verbcount) continue;    /* Successfully handled a verb */
6548       *errorcodeptr = ERR60;          /* Verb not recognized */
6549       goto FAILED;
6550       }
6551 
6552     /* Deal with the extended parentheses; all are introduced by '?', and the
6553     appearance of any of them means that this is not a capturing group. */
6554 
6555     else if (*ptr == CHAR_QUESTION_MARK)
6556       {
6557       int i, set, unset, namelen;
6558       int *optset;
6559       const pcre_uchar *name;
6560       pcre_uchar *slot;
6561 
6562       switch (*(++ptr))
6563         {
6564         case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
6565         ptr++;
6566         while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6567         if (*ptr == CHAR_NULL)
6568           {
6569           *errorcodeptr = ERR18;
6570           goto FAILED;
6571           }
6572         continue;
6573 
6574 
6575         /* ------------------------------------------------------------ */
6576         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
6577         reset_bracount = TRUE;
6578         /* Fall through */
6579 
6580         /* ------------------------------------------------------------ */
6581         case CHAR_COLON:          /* Non-capturing bracket */
6582         bravalue = OP_BRA;
6583         ptr++;
6584         break;
6585 
6586 
6587         /* ------------------------------------------------------------ */
6588         case CHAR_LEFT_PARENTHESIS:
6589         bravalue = OP_COND;       /* Conditional group */
6590         tempptr = ptr;
6591 
6592         /* A condition can be an assertion, a number (referring to a numbered
6593         group's having been set), a name (referring to a named group), or 'R',
6594         referring to recursion. R<digits> and R&name are also permitted for
6595         recursion tests.
6596 
        There are several ways of testing a named group: (?(name)) is used by
        Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
6599 
6600         There is one unfortunate ambiguity, caused by history. 'R' can be the
6601         recursive thing or the name 'R' (and similarly for 'R' followed by
6602         digits). We look for a name first; if not found, we try the other case.
6603 
6604         For compatibility with auto-callouts, we allow a callout to be
6605         specified before a condition that is an assertion. First, check for the
6606         syntax of a callout; if found, adjust the temporary pointer that is
6607         used to check for an assertion condition. That's all that is needed! */
6608 
6609         if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6610           {
6611           for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6612           if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6613             tempptr += i + 1;
6614           }
6615 
6616         /* For conditions that are assertions, check the syntax, and then exit
6617         the switch. This will take control down to where bracketed groups,
6618         including assertions, are processed. */
6619 
6620         if (tempptr[1] == CHAR_QUESTION_MARK &&
6621               (tempptr[2] == CHAR_EQUALS_SIGN ||
6622                tempptr[2] == CHAR_EXCLAMATION_MARK ||
6623                tempptr[2] == CHAR_LESS_THAN_SIGN))
6624           break;
6625 
6626         /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6627         need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6628 
6629         code[1+LINK_SIZE] = OP_CREF;
6630         skipbytes = 1+IMM2_SIZE;
6631         refsign = -1;     /* => not a number */
6632         namelen = -1;     /* => not a name; must set to avoid warning */
6633         name = NULL;      /* Always set to avoid warning */
6634         recno = 0;        /* Always set to avoid warning */
6635 
6636         /* Check for a test for recursion in a named group. */
6637 
6638         ptr++;
6639         if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6640           {
6641           terminator = -1;
6642           ptr += 2;
6643           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
6644           }
6645 
6646         /* Check for a test for a named group's having been set, using the Perl
6647         syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6648         syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6649 
6650         else if (*ptr == CHAR_LESS_THAN_SIGN)
6651           {
6652           terminator = CHAR_GREATER_THAN_SIGN;
6653           ptr++;
6654           }
6655         else if (*ptr == CHAR_APOSTROPHE)
6656           {
6657           terminator = CHAR_APOSTROPHE;
6658           ptr++;
6659           }
6660         else
6661           {
6662           terminator = CHAR_NULL;
6663           if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6664             else if (IS_DIGIT(*ptr)) refsign = 0;
6665           }
6666 
6667         /* Handle a number */
6668 
6669         if (refsign >= 0)
6670           {
6671           while (IS_DIGIT(*ptr))
6672             {
6673             recno = recno * 10 + (int)(*ptr - CHAR_0);
6674             ptr++;
6675             }
6676           }
6677 
6678         /* Otherwise we expect to read a name; anything else is an error. When
6679         a name is one of a number of duplicates, a different opcode is used and
6680         it needs more memory. Unfortunately we cannot tell whether a name is a
6681         duplicate in the first pass, so we have to allow for more memory. */
6682 
6683         else
6684           {
6685           if (IS_DIGIT(*ptr))
6686             {
6687             *errorcodeptr = ERR84;
6688             goto FAILED;
6689             }
6690           if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
6691             {
6692             *errorcodeptr = ERR28;   /* Assertion expected */
6693             goto FAILED;
6694             }
6695           name = ptr++;
6696           while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6697             {
6698             ptr++;
6699             }
6700           namelen = (int)(ptr - name);
6701           if (lengthptr != NULL && (options & PCRE_DUPNAMES) != 0)
6702             *lengthptr += IMM2_SIZE;
6703           }
6704 
6705         /* Check the terminator */
6706 
6707         if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6708             *ptr++ != CHAR_RIGHT_PARENTHESIS)
6709           {
6710           ptr--;                  /* Error offset */
6711           *errorcodeptr = ERR26;  /* Malformed number or name */
6712           goto FAILED;
6713           }
6714 
6715         /* Do no further checking in the pre-compile phase. */
6716 
6717         if (lengthptr != NULL) break;
6718 
6719         /* In the real compile we do the work of looking for the actual
6720         reference. If refsign is not negative, it means we have a number in
6721         recno. */
6722 
6723         if (refsign >= 0)
6724           {
6725           if (recno <= 0)
6726             {
6727             *errorcodeptr = ERR35;
6728             goto FAILED;
6729             }
6730           if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6731             cd->bracount - recno + 1 : recno + cd->bracount;
6732           if (recno <= 0 || recno > cd->final_bracount)
6733             {
6734             *errorcodeptr = ERR15;
6735             goto FAILED;
6736             }
6737           PUT2(code, 2+LINK_SIZE, recno);
6738           break;
6739           }
6740 
6741         /* Otherwise look for the name. */
6742 
6743         slot = cd->name_table;
6744         for (i = 0; i < cd->names_found; i++)
6745           {
6746           if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
6747           slot += cd->name_entry_size;
6748           }
6749 
6750         /* Found the named subpattern. If the name is duplicated, add one to
6751         the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6752         appropriate data values. Otherwise, just insert the unique subpattern
6753         number. */
6754 
6755         if (i < cd->names_found)
6756           {
6757           int offset = i++;
6758           int count = 1;
6759           recno = GET2(slot, 0);   /* Number from first found */
6760           for (; i < cd->names_found; i++)
6761             {
6762             slot += cd->name_entry_size;
6763             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 ||
6764               (slot+IMM2_SIZE)[namelen] != 0) break;
6765             count++;
6766             }
6767 
6768           if (count > 1)
6769             {
6770             PUT2(code, 2+LINK_SIZE, offset);
6771             PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6772             skipbytes += IMM2_SIZE;
6773             code[1+LINK_SIZE]++;
6774             }
6775           else  /* Not a duplicated name */
6776             {
6777             PUT2(code, 2+LINK_SIZE, recno);
6778             }
6779           }
6780 
6781         /* If terminator == CHAR_NULL it means that the name followed directly
6782         after the opening parenthesis [e.g. (?(abc)...] and in this case there
6783         are some further alternatives to try. For the cases where terminator !=
6784         CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
6785         we have now checked all the possibilities, so give an error. */
6786 
6787         else if (terminator != CHAR_NULL)
6788           {
6789           *errorcodeptr = ERR15;
6790           goto FAILED;
6791           }
6792 
6793         /* Check for (?(R) for recursion. Allow digits after R to specify a
6794         specific group number. */
6795 
6796         else if (*name == CHAR_R)
6797           {
6798           recno = 0;
6799           for (i = 1; i < namelen; i++)
6800             {
6801             if (!IS_DIGIT(name[i]))
6802               {
6803               *errorcodeptr = ERR15;
6804               goto FAILED;
6805               }
6806             recno = recno * 10 + name[i] - CHAR_0;
6807             }
6808           if (recno == 0) recno = RREF_ANY;
6809           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
6810           PUT2(code, 2+LINK_SIZE, recno);
6811           }
6812 
6813         /* Similarly, check for the (?(DEFINE) "condition", which is always
6814         false. */
6815 
6816         else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
6817           {
6818           code[1+LINK_SIZE] = OP_DEF;
6819           skipbytes = 1;
6820           }
6821 
6822         /* Reference to an unidentified subpattern. */
6823 
6824         else
6825           {
6826           *errorcodeptr = ERR15;
6827           goto FAILED;
6828           }
6829         break;
6830 
6831 
6832         /* ------------------------------------------------------------ */
6833         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
6834         bravalue = OP_ASSERT;
6835         cd->assert_depth += 1;
6836         ptr++;
6837         break;
6838 
6839         /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6840         thing to do, but Perl allows all assertions to be quantified, and when
6841         they contain capturing parentheses there may be a potential use for
6842         this feature. Not that that applies to a quantified (?!) but we allow
6843         it for uniformity. */
6844 
6845         /* ------------------------------------------------------------ */
6846         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
6847         ptr++;
6848         if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
6849              ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
6850             (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
6851           {
6852           *code++ = OP_FAIL;
6853           previous = NULL;
6854           continue;
6855           }
6856         bravalue = OP_ASSERT_NOT;
6857         cd->assert_depth += 1;
6858         break;
6859 
6860 
6861         /* ------------------------------------------------------------ */
6862         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
6863         switch (ptr[1])
6864           {
6865           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
6866           bravalue = OP_ASSERTBACK;
6867           cd->assert_depth += 1;
6868           ptr += 2;
6869           break;
6870 
6871           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
6872           bravalue = OP_ASSERTBACK_NOT;
6873           cd->assert_depth += 1;
6874           ptr += 2;
6875           break;
6876 
6877           default:                /* Could be name define, else bad */
6878           if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
6879             goto DEFINE_NAME;
6880           ptr++;                  /* Correct offset for error */
6881           *errorcodeptr = ERR24;
6882           goto FAILED;
6883           }
6884         break;
6885 
6886 
6887         /* ------------------------------------------------------------ */
6888         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
6889         bravalue = OP_ONCE;
6890         ptr++;
6891         break;
6892 
6893 
6894         /* ------------------------------------------------------------ */
6895         case CHAR_C:                 /* Callout - may be followed by digits; */
6896         previous_callout = code;     /* Save for later completion */
6897         after_manual_callout = 1;    /* Skip one item before completing */
6898         *code++ = OP_CALLOUT;
6899           {
6900           int n = 0;
6901           ptr++;
6902           while(IS_DIGIT(*ptr))
6903             n = n * 10 + *ptr++ - CHAR_0;
6904           if (*ptr != CHAR_RIGHT_PARENTHESIS)
6905             {
6906             *errorcodeptr = ERR39;
6907             goto FAILED;
6908             }
6909           if (n > 255)
6910             {
6911             *errorcodeptr = ERR38;
6912             goto FAILED;
6913             }
6914           *code++ = n;
6915           PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
6916           PUT(code, LINK_SIZE, 0);                          /* Default length */
6917           code += 2 * LINK_SIZE;
6918           }
6919         previous = NULL;
6920         continue;
6921 
6922 
6923         /* ------------------------------------------------------------ */
6924         case CHAR_P:              /* Python-style named subpattern handling */
6925         if (*(++ptr) == CHAR_EQUALS_SIGN ||
6926             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
6927           {
6928           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
6929           terminator = CHAR_RIGHT_PARENTHESIS;
6930           goto NAMED_REF_OR_RECURSE;
6931           }
6932         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
6933           {
6934           *errorcodeptr = ERR41;
6935           goto FAILED;
6936           }
6937         /* Fall through to handle (?P< as (?< is handled */
6938 
6939 
6940         /* ------------------------------------------------------------ */
6941         DEFINE_NAME:    /* Come here from (?< handling */
6942         case CHAR_APOSTROPHE:
6943         terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6944           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6945         name = ++ptr;
6946         if (IS_DIGIT(*ptr))
6947           {
6948           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
6949           goto FAILED;
6950           }
6951         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6952         namelen = (int)(ptr - name);
6953 
6954         /* In the pre-compile phase, do a syntax check, remember the longest
6955         name, and then remember the group in a vector, expanding it if
6956         necessary. Duplicates for the same number are skipped; other duplicates
6957         are checked for validity. In the actual compile, there is nothing to
6958         do. */
6959 
6960         if (lengthptr != NULL)
6961           {
6962           named_group *ng;
6963           pcre_uint32 number = cd->bracount + 1;
6964 
6965           if (*ptr != (pcre_uchar)terminator)
6966             {
6967             *errorcodeptr = ERR42;
6968             goto FAILED;
6969             }
6970 
6971           if (cd->names_found >= MAX_NAME_COUNT)
6972             {
6973             *errorcodeptr = ERR49;
6974             goto FAILED;
6975             }
6976 
6977           if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
6978             {
6979             cd->name_entry_size = namelen + IMM2_SIZE + 1;
6980             if (namelen > MAX_NAME_SIZE)
6981               {
6982               *errorcodeptr = ERR48;
6983               goto FAILED;
6984               }
6985             }
6986 
6987           /* Scan the list to check for duplicates. For duplicate names, if the
6988           number is the same, break the loop, which causes the name to be
6989           discarded; otherwise, if DUPNAMES is not set, give an error.
6990           If it is set, allow the name with a different number, but continue
6991           scanning in case this is a duplicate with the same number. For
6992           non-duplicate names, give an error if the number is duplicated. */
6993 
6994           ng = cd->named_groups;
6995           for (i = 0; i < cd->names_found; i++, ng++)
6996             {
6997             if (namelen == ng->length &&
6998                 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
6999               {
7000               if (ng->number == number) break;
7001               if ((options & PCRE_DUPNAMES) == 0)
7002                 {
7003                 *errorcodeptr = ERR43;
7004                 goto FAILED;
7005                 }
7006               cd->dupnames = TRUE;  /* Duplicate names exist */
7007               }
7008             else if (ng->number == number)
7009               {
7010               *errorcodeptr = ERR65;
7011               goto FAILED;
7012               }
7013             }
7014 
7015           if (i >= cd->names_found)     /* Not a duplicate with same number */
7016             {
7017             /* Increase the list size if necessary */
7018 
7019             if (cd->names_found >= cd->named_group_list_size)
7020               {
7021               int newsize = cd->named_group_list_size * 2;
7022               named_group *newspace = (PUBL(malloc))
7023                 (newsize * sizeof(named_group));
7024 
7025               if (newspace == NULL)
7026                 {
7027                 *errorcodeptr = ERR21;
7028                 goto FAILED;
7029                 }
7030 
7031               memcpy(newspace, cd->named_groups,
7032                 cd->named_group_list_size * sizeof(named_group));
7033               if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
7034                 (PUBL(free))((void *)cd->named_groups);
7035               cd->named_groups = newspace;
7036               cd->named_group_list_size = newsize;
7037               }
7038 
7039             cd->named_groups[cd->names_found].name = name;
7040             cd->named_groups[cd->names_found].length = namelen;
7041             cd->named_groups[cd->names_found].number = number;
7042             cd->names_found++;
7043             }
7044           }
7045 
7046         ptr++;                    /* Move past > or ' in both passes. */
7047         goto NUMBERED_GROUP;
7048 
7049 
7050         /* ------------------------------------------------------------ */
7051         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
7052         terminator = CHAR_RIGHT_PARENTHESIS;
7053         is_recurse = TRUE;
7054         /* Fall through */
7055 
7056         /* We come here from the Python syntax above that handles both
7057         references (?P=name) and recursion (?P>name), as well as falling
7058         through from the Perl recursion syntax (?&name). We also come here from
7059         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
7060         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
7061 
7062         NAMED_REF_OR_RECURSE:
7063         name = ++ptr;
7064         if (IS_DIGIT(*ptr))
7065           {
7066           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7067           goto FAILED;
7068           }
7069         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7070         namelen = (int)(ptr - name);
7071 
7072         /* In the pre-compile phase, do a syntax check. We used to just set
7073         a dummy reference number, because it was not used in the first pass.
7074         However, with the change of recursive back references to be atomic,
7075         we have to look for the number so that this state can be identified, as
7076         otherwise the incorrect length is computed. If it's not a backwards
7077         reference, the dummy number will do. */
7078 
7079         if (lengthptr != NULL)
7080           {
7081           named_group *ng;
7082 
7083           if (namelen == 0)
7084             {
7085             *errorcodeptr = ERR62;
7086             goto FAILED;
7087             }
7088           if (*ptr != (pcre_uchar)terminator)
7089             {
7090             *errorcodeptr = ERR42;
7091             goto FAILED;
7092             }
7093           if (namelen > MAX_NAME_SIZE)
7094             {
7095             *errorcodeptr = ERR48;
7096             goto FAILED;
7097             }
7098 
7099           /* The name table does not exist in the first pass; instead we must
7100           scan the list of names encountered so far in order to get the
7101           number. If the name is not found, set the value to 0 for a forward
7102           reference. */
7103 
7104           ng = cd->named_groups;
7105           for (i = 0; i < cd->names_found; i++, ng++)
7106             {
7107             if (namelen == ng->length &&
7108                 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7109               break;
7110             }
7111           recno = (i < cd->names_found)? ng->number : 0;
7112 
7113           /* Count named back references. */
7114 
7115           if (!is_recurse) cd->namedrefcount++;
7116 
7117           /* If duplicate names are permitted, we have to allow for a named
7118           reference to a duplicated name (this cannot be determined until the
7119           second pass). This needs an extra 16-bit data item. */
7120 
7121           if ((options & PCRE_DUPNAMES) != 0) *lengthptr += IMM2_SIZE;
7122           }
7123 
7124         /* In the real compile, search the name table. We check the name
7125         first, and then check that we have reached the end of the name in the
7126         table. That way, if the name is longer than any in the table, the
7127         comparison will fail without reading beyond the table entry. */
7128 
7129         else
7130           {
7131           slot = cd->name_table;
7132           for (i = 0; i < cd->names_found; i++)
7133             {
7134             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
7135                 slot[IMM2_SIZE+namelen] == 0)
7136               break;
7137             slot += cd->name_entry_size;
7138             }
7139 
7140           if (i < cd->names_found)
7141             {
7142             recno = GET2(slot, 0);
7143             }
7144           else
7145             {
7146             *errorcodeptr = ERR15;
7147             goto FAILED;
7148             }
7149           }
7150 
7151         /* In both phases, for recursions, we can now go to the code that
7152         handles numerical recursion. */
7153 
7154         if (is_recurse) goto HANDLE_RECURSION;
7155 
7156         /* In the second pass we must see if the name is duplicated. If so, we
7157         generate a different opcode. */
7158 
7159         if (lengthptr == NULL && cd->dupnames)
7160           {
7161           int count = 1;
7162           unsigned int index = i;
7163           pcre_uchar *cslot = slot + cd->name_entry_size;
7164 
7165           for (i++; i < cd->names_found; i++)
7166             {
7167             if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
7168 
7169 
7170             count++;
7171             cslot += cd->name_entry_size;
7172             }
7173 
7174           if (count > 1)
7175             {
7176             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7177             previous = code;
7178             *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7179             PUT2INC(code, 0, index);
7180             PUT2INC(code, 0, count);
7181 
7182             /* Process each potentially referenced group. */
7183 
7184             for (; slot < cslot; slot += cd->name_entry_size)
7185               {
7186               open_capitem *oc;
7187               recno = GET2(slot, 0);
7188               cd->backref_map |= (recno < 32)? (1 << recno) : 1;
7189               if (recno > cd->top_backref) cd->top_backref = recno;
7190 
7191               /* Check to see if this back reference is recursive, that is, it
7192               is inside the group that it references. A flag is set so that the
7193               group can be made atomic. */
7194 
7195               for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7196                 {
7197                 if (oc->number == recno)
7198                   {
7199                   oc->flag = TRUE;
7200                   break;
7201                   }
7202                 }
7203               }
7204 
7205             continue;  /* End of back ref handling */
7206             }
7207           }
7208 
7209         /* First pass, or a non-duplicated name. */
7210 
7211         goto HANDLE_REFERENCE;
7212 
7213 
7214         /* ------------------------------------------------------------ */
7215         case CHAR_R:              /* Recursion */
7216         ptr++;                    /* Same as (?0)      */
7217         /* Fall through */
7218 
7219 
7220         /* ------------------------------------------------------------ */
7221         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
7222         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
7223         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
7224           {
7225           const pcre_uchar *called;
7226           terminator = CHAR_RIGHT_PARENTHESIS;
7227 
7228           /* Come here from the \g<...> and \g'...' code (Oniguruma
7229           compatibility). However, the syntax has been checked to ensure that
7230           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
7231           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
7232           ever be taken. */
7233 
7234           HANDLE_NUMERICAL_RECURSION:
7235 
7236           if ((refsign = *ptr) == CHAR_PLUS)
7237             {
7238             ptr++;
7239             if (!IS_DIGIT(*ptr))
7240               {
7241               *errorcodeptr = ERR63;
7242               goto FAILED;
7243               }
7244             }
7245           else if (refsign == CHAR_MINUS)
7246             {
7247             if (!IS_DIGIT(ptr[1]))
7248               goto OTHER_CHAR_AFTER_QUERY;
7249             ptr++;
7250             }
7251 
7252           recno = 0;
7253           while(IS_DIGIT(*ptr))
7254             recno = recno * 10 + *ptr++ - CHAR_0;
7255 
7256           if (*ptr != (pcre_uchar)terminator)
7257             {
7258             *errorcodeptr = ERR29;
7259             goto FAILED;
7260             }
7261 
7262           if (refsign == CHAR_MINUS)
7263             {
7264             if (recno == 0)
7265               {
7266               *errorcodeptr = ERR58;
7267               goto FAILED;
7268               }
7269             recno = cd->bracount - recno + 1;
7270             if (recno <= 0)
7271               {
7272               *errorcodeptr = ERR15;
7273               goto FAILED;
7274               }
7275             }
7276           else if (refsign == CHAR_PLUS)
7277             {
7278             if (recno == 0)
7279               {
7280               *errorcodeptr = ERR58;
7281               goto FAILED;
7282               }
7283             recno += cd->bracount;
7284             }
7285 
7286           /* Come here from code above that handles a named recursion */
7287 
7288           HANDLE_RECURSION:
7289 
7290           previous = code;
7291           called = cd->start_code;
7292 
7293           /* When we are actually compiling, find the bracket that is being
7294           referenced. Temporarily end the regex in case it doesn't exist before
7295           this point. If we end up with a forward reference, first check that
7296           the bracket does occur later so we can give the error (and position)
7297           now. Then remember this forward reference in the workspace so it can
7298           be filled in at the end. */
7299 
7300           if (lengthptr == NULL)
7301             {
7302             *code = OP_END;
7303             if (recno != 0)
7304               called = PRIV(find_bracket)(cd->start_code, utf, recno);
7305 
7306             /* Forward reference */
7307 
7308             if (called == NULL)
7309               {
7310               if (recno > cd->final_bracount)
7311                 {
7312                 *errorcodeptr = ERR15;
7313                 goto FAILED;
7314                 }
7315 
7316               /* Fudge the value of "called" so that when it is inserted as an
7317               offset below, what is actually inserted is the reference number
7318               of the group. Then remember the forward reference. */
7319 
7320               called = cd->start_code + recno;
7321               if (cd->hwm >= cd->start_workspace + cd->workspace_size -
7322                   WORK_SIZE_SAFETY_MARGIN)
7323                 {
7324                 *errorcodeptr = expand_workspace(cd);
7325                 if (*errorcodeptr != 0) goto FAILED;
7326                 }
7327               PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
7328               }
7329 
7330             /* If not a forward reference, and the subpattern is still open,
7331             this is a recursive call. We check to see if this is a left
7332             recursion that could loop for ever, and diagnose that case. We
7333             must not, however, do this check if we are in a conditional
7334             subpattern because the condition might be testing for recursion in
7335             a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
7336             Forever loops are also detected at runtime, so those that occur in
7337             conditional subpatterns will be picked up then. */
7338 
7339             else if (GET(called, 1) == 0 && cond_depth <= 0 &&
7340                      could_be_empty(called, code, bcptr, utf, cd))
7341               {
7342               *errorcodeptr = ERR40;
7343               goto FAILED;
7344               }
7345             }
7346 
7347           /* Insert the recursion/subroutine item. It does not have a set first
7348           character (relevant if it is repeated, because it will then be
7349           wrapped with ONCE brackets). */
7350 
7351           *code = OP_RECURSE;
7352           PUT(code, 1, (int)(called - cd->start_code));
7353           code += 1 + LINK_SIZE;
7354           groupsetfirstchar = FALSE;
7355           }
7356 
7357         /* Can't determine a first byte now */
7358 
7359         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7360         continue;
7361 
7362 
7363         /* ------------------------------------------------------------ */
7364         default:              /* Other characters: check option setting */
7365         OTHER_CHAR_AFTER_QUERY:
7366         set = unset = 0;
7367         optset = &set;
7368 
7369         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
7370           {
7371           switch (*ptr++)
7372             {
7373             case CHAR_MINUS: optset = &unset; break;
7374 
7375             case CHAR_J:    /* Record that it changed in the external options */
7376             *optset |= PCRE_DUPNAMES;
7377             cd->external_flags |= PCRE_JCHANGED;
7378             break;
7379 
7380             case CHAR_i: *optset |= PCRE_CASELESS; break;
7381             case CHAR_m: *optset |= PCRE_MULTILINE; break;
7382             case CHAR_s: *optset |= PCRE_DOTALL; break;
7383             case CHAR_x: *optset |= PCRE_EXTENDED; break;
7384             case CHAR_U: *optset |= PCRE_UNGREEDY; break;
7385             case CHAR_X: *optset |= PCRE_EXTRA; break;
7386 
7387             default:  *errorcodeptr = ERR12;
7388                       ptr--;    /* Correct the offset */
7389                       goto FAILED;
7390             }
7391           }
7392 
7393         /* Set up the changed option bits, but don't change anything yet. */
7394 
7395         newoptions = (options | set) & (~unset);
7396 
7397         /* If the options ended with ')' this is not the start of a nested
7398         group with option changes, so the options change at this level. If this
7399         item is right at the start of the pattern, the options can be
7400         abstracted and made external in the pre-compile phase, and ignored in
7401         the compile phase. This can be helpful when matching -- for instance in
7402         caseless checking of required bytes.
7403 
7404         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
7405         definitely *not* at the start of the pattern because something has been
7406         compiled. In the pre-compile phase, however, the code pointer can have
7407         that value after the start, because it gets reset as code is discarded
7408         during the pre-compile. However, this can happen only at top level - if
7409         we are within parentheses, the starting BRA will still be present. At
7410         any parenthesis level, the length value can be used to test if anything
7411         has been compiled at that level. Thus, a test for both these conditions
7412         is necessary to ensure we correctly detect the start of the pattern in
7413         both phases.
7414 
7415         If we are not at the pattern start, reset the greedy defaults and the
7416         case value for firstchar and reqchar. */
7417 
7418         if (*ptr == CHAR_RIGHT_PARENTHESIS)
7419           {
7420           if (code == cd->start_code + 1 + LINK_SIZE &&
7421                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
7422             {
7423             cd->external_options = newoptions;
7424             }
7425           else
7426             {
7427             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
7428             greedy_non_default = greedy_default ^ 1;
7429             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
7430             }
7431 
7432           /* Change options at this level, and pass them back for use
7433           in subsequent branches. */
7434 
7435           *optionsptr = options = newoptions;
7436           previous = NULL;       /* This item can't be repeated */
7437           continue;              /* It is complete */
7438           }
7439 
7440         /* If the options ended with ':' we are heading into a nested group
7441         with possible change of options. Such groups are non-capturing and are
7442         not assertions of any kind. All we need to do is skip over the ':';
7443         the newoptions value is handled below. */
7444 
7445         bravalue = OP_BRA;
7446         ptr++;
7447         }     /* End of switch for character following (? */
7448       }       /* End of (? handling */
7449 
7450     /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
7451     is set, all unadorned brackets become non-capturing and behave like (?:...)
7452     brackets. */
7453 
7454     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
7455       {
7456       bravalue = OP_BRA;
7457       }
7458 
7459     /* Else we have a capturing group. */
7460 
7461     else
7462       {
7463       NUMBERED_GROUP:
7464       cd->bracount += 1;
7465       PUT2(code, 1+LINK_SIZE, cd->bracount);
7466       skipbytes = IMM2_SIZE;
7467       }
7468 
7469     /* Process nested bracketed regex. First check for parentheses nested too
7470     deeply. */
7471 
7472     if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
7473       {
7474       *errorcodeptr = ERR82;
7475       goto FAILED;
7476       }
7477 
7478     /* Assertions used not to be repeatable, but this was changed for Perl
7479     compatibility, so all kinds can now be repeated. We copy code into a
7480     non-register variable (tempcode) in order to be able to pass its address
7481     because some compilers complain otherwise. */
7482 
7483     previous = code;                      /* For handling repetition */
7484     *code = bravalue;
7485     tempcode = code;
7486     tempreqvary = cd->req_varyopt;        /* Save value before bracket */
7487     tempbracount = cd->bracount;          /* Save value before bracket */
7488     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
7489 
7490     if (!compile_regex(
7491          newoptions,                      /* The complete new option state */
7492          &tempcode,                       /* Where to put code (updated) */
7493          &ptr,                            /* Input pointer (updated) */
7494          errorcodeptr,                    /* Where to put an error message */
7495          (bravalue == OP_ASSERTBACK ||
7496           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
7497          reset_bracount,                  /* True if (?| group */
7498          skipbytes,                       /* Skip over bracket number */
7499          cond_depth +
7500            ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
7501          &subfirstchar,                   /* For possible first char */
7502          &subfirstcharflags,
7503          &subreqchar,                     /* For possible last char */
7504          &subreqcharflags,
7505          bcptr,                           /* Current branch chain */
7506          cd,                              /* Tables block */
7507          (lengthptr == NULL)? NULL :      /* Actual compile phase */
7508            &length_prevgroup              /* Pre-compile phase */
7509          ))
7510       goto FAILED;
7511 
7512     cd->parens_depth -= 1;
7513 
7514     /* If this was an atomic group and there are no capturing groups within it,
7515     generate OP_ONCE_NC instead of OP_ONCE. */
7516 
7517     if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
7518       *code = OP_ONCE_NC;
7519 
7520     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
7521       cd->assert_depth -= 1;
7522 
7523     /* At the end of compiling, code is still pointing to the start of the
7524     group, while tempcode has been updated to point past the end of the group.
7525     The pattern pointer (ptr) is on the bracket.
7526 
7527     If this is a conditional bracket, check that there are no more than
7528     two branches in the group, or just one if it's a DEFINE group. We do this
7529     in the real compile phase, not in the pre-pass, where the whole group may
7530     not be available. */
7531 
7532     if (bravalue == OP_COND && lengthptr == NULL)
7533       {
7534       pcre_uchar *tc = code;
7535       int condcount = 0;
7536 
7537       do {
7538          condcount++;
7539          tc += GET(tc,1);
7540          }
7541       while (*tc != OP_KET);
7542 
7543       /* A DEFINE group is never obeyed inline (the "condition" is always
7544       false). It must have only one branch. */
7545 
7546       if (code[LINK_SIZE+1] == OP_DEF)
7547         {
7548         if (condcount > 1)
7549           {
7550           *errorcodeptr = ERR54;
7551           goto FAILED;
7552           }
7553         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
7554         }
7555 
7556       /* A "normal" conditional group. If there is just one branch, we must not
7557       make use of its firstchar or reqchar, because this is equivalent to an
7558       empty second branch. */
7559 
7560       else
7561         {
7562         if (condcount > 2)
7563           {
7564           *errorcodeptr = ERR27;
7565           goto FAILED;
7566           }
7567         if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE;
7568         }
7569       }
7570 
7571     /* Error if hit end of pattern */
7572 
7573     if (*ptr != CHAR_RIGHT_PARENTHESIS)
7574       {
7575       *errorcodeptr = ERR14;
7576       goto FAILED;
7577       }
7578 
7579     /* In the pre-compile phase, update the length by the length of the group,
7580     less the brackets at either end. Then reduce the compiled code to just a
7581     set of non-capturing brackets so that it doesn't use much memory if it is
7582     duplicated by a quantifier.*/
7583 
7584     if (lengthptr != NULL)
7585       {
7586       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7587         {
7588         *errorcodeptr = ERR20;
7589         goto FAILED;
7590         }
7591       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7592       code++;   /* This already contains bravalue */
7593       PUTINC(code, 0, 1 + LINK_SIZE);
7594       *code++ = OP_KET;
7595       PUTINC(code, 0, 1 + LINK_SIZE);
7596       break;    /* No need to waste time with special character handling */
7597       }
7598 
7599     /* Otherwise update the main code pointer to the end of the group. */
7600 
7601     code = tempcode;
7602 
7603     /* For a DEFINE group, required and first character settings are not
7604     relevant. */
7605 
7606     if (bravalue == OP_DEF) break;
7607 
7608     /* Handle updating of the required and first characters for other types of
7609     group. Update for normal brackets of all kinds, and conditions with two
7610     branches (see code above). If the bracket is followed by a quantifier with
7611     zero repeat, we have to back off. Hence the definition of zeroreqchar and
7612     zerofirstchar outside the main loop so that they can be accessed for the
7613     back off. */
7614 
7615     zeroreqchar = reqchar;
7616     zeroreqcharflags = reqcharflags;
7617     zerofirstchar = firstchar;
7618     zerofirstcharflags = firstcharflags;
7619     groupsetfirstchar = FALSE;
7620 
7621     if (bravalue >= OP_ONCE)
7622       {
7623       /* If we have not yet set a firstchar in this branch, take it from the
7624       subpattern, remembering that it was set here so that a repeat of more
7625       than one can replicate it as reqchar if necessary. If the subpattern has
7626       no firstchar, set "none" for the whole branch. In both cases, a zero
7627       repeat forces firstchar to "none". */
7628 
7629       if (firstcharflags == REQ_UNSET)
7630         {
7631         if (subfirstcharflags >= 0)
7632           {
7633           firstchar = subfirstchar;
7634           firstcharflags = subfirstcharflags;
7635           groupsetfirstchar = TRUE;
7636           }
7637         else firstcharflags = REQ_NONE;
7638         zerofirstcharflags = REQ_NONE;
7639         }
7640 
7641       /* If firstchar was previously set, convert the subpattern's firstchar
7642       into reqchar if there wasn't one, using the vary flag that was in
7643       existence beforehand. */
7644 
7645       else if (subfirstcharflags >= 0 && subreqcharflags < 0)
7646         {
7647         subreqchar = subfirstchar;
7648         subreqcharflags = subfirstcharflags | tempreqvary;
7649         }
7650 
7651       /* If the subpattern set a required byte (or set a first byte that isn't
7652       really the first byte - see above), set it. */
7653 
7654       if (subreqcharflags >= 0)
7655         {
7656         reqchar = subreqchar;
7657         reqcharflags = subreqcharflags;
7658         }
7659       }
7660 
7661     /* For a forward assertion, we take the reqchar, if set. This can be
7662     helpful if the pattern that follows the assertion doesn't set a different
7663     char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
7664     for an assertion, however because it leads to incorrect effect for patterns
7665     such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
7666     of a firstchar. This is overcome by a scan at the end if there's no
7667     firstchar, looking for an asserted first char. */
7668 
7669     else if (bravalue == OP_ASSERT && subreqcharflags >= 0)
7670       {
7671       reqchar = subreqchar;
7672       reqcharflags = subreqcharflags;
7673       }
7674     break;     /* End of processing '(' */
7675 
7676 
7677     /* ===================================================================*/
7678     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
7679     are arranged to be the negation of the corresponding OP_values in the
7680     default case when PCRE_UCP is not set. For the back references, the values
7681     are negative the reference number. Only back references and those types
7682     that consume a character may be repeated. We can test for values between
7683     ESC_b and ESC_Z for the latter; this may have to change if any new ones are
7684     ever created. */
7685 
7686     case CHAR_BACKSLASH:
7687     tempptr = ptr;
7688     escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
7689     if (*errorcodeptr != 0) goto FAILED;
7690 
7691     if (escape == 0)                  /* The escape coded a single character */
7692       c = ec;
7693     else
7694       {
7695       if (escape == ESC_Q)            /* Handle start of quoted string */
7696         {
7697         if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
7698           ptr += 2;               /* avoid empty string */
7699             else inescq = TRUE;
7700         continue;
7701         }
7702 
7703       if (escape == ESC_E) continue;  /* Perl ignores an orphan \E */
7704 
7705       /* For metasequences that actually match a character, we disable the
7706       setting of a first character if it hasn't already been set. */
7707 
7708       if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
7709         firstcharflags = REQ_NONE;
7710 
7711       /* Set values to reset to if this is followed by a zero repeat. */
7712 
7713       zerofirstchar = firstchar;
7714       zerofirstcharflags = firstcharflags;
7715       zeroreqchar = reqchar;
7716       zeroreqcharflags = reqcharflags;
7717 
7718       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
7719       is a subroutine call by number (Oniguruma syntax). In fact, the value
7720       ESC_g is returned only for these cases. So we don't need to check for <
7721       or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
7722       -n, and for the Perl syntax \g{name} the result is ESC_k (as
7723       that is a synonym for a named back reference). */
7724 
7725       if (escape == ESC_g)
7726         {
7727         const pcre_uchar *p;
7728         pcre_uint32 cf;
7729 
7730         save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
7731         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7732           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7733 
7734         /* These two statements stop the compiler for warning about possibly
7735         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
7736         fact, because we do the check for a number below, the paths that
7737         would actually be in error are never taken. */
7738 
7739         skipbytes = 0;
7740         reset_bracount = FALSE;
7741 
7742         /* If it's not a signed or unsigned number, treat it as a name. */
7743 
7744         cf = ptr[1];
7745         if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
7746           {
7747           is_recurse = TRUE;
7748           goto NAMED_REF_OR_RECURSE;
7749           }
7750 
7751         /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
7752         or a digit. */
7753 
7754         p = ptr + 2;
7755         while (IS_DIGIT(*p)) p++;
7756         if (*p != (pcre_uchar)terminator)
7757           {
7758           *errorcodeptr = ERR57;
7759           break;
7760           }
7761         ptr++;
7762         goto HANDLE_NUMERICAL_RECURSION;
7763         }
7764 
7765       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
7766       We also support \k{name} (.NET syntax).  */
7767 
7768       if (escape == ESC_k)
7769         {
7770         if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
7771           ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
7772           {
7773           *errorcodeptr = ERR69;
7774           break;
7775           }
7776         is_recurse = FALSE;
7777         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7778           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
7779           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
7780         goto NAMED_REF_OR_RECURSE;
7781         }
7782 
7783       /* Back references are handled specially; must disable firstchar if
7784       not set to cope with cases like (?=(\w+))\1: which would otherwise set
7785       ':' later. */
7786 
7787       if (escape < 0)
7788         {
7789         open_capitem *oc;
7790         recno = -escape;
7791 
7792         /* Come here from named backref handling when the reference is to a
7793         single group (i.e. not to a duplicated name. */
7794 
7795         HANDLE_REFERENCE:
7796         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7797         previous = code;
7798         *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
7799         PUT2INC(code, 0, recno);
7800         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
7801         if (recno > cd->top_backref) cd->top_backref = recno;
7802 
7803         /* Check to see if this back reference is recursive, that it, it
7804         is inside the group that it references. A flag is set so that the
7805         group can be made atomic. */
7806 
7807         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7808           {
7809           if (oc->number == recno)
7810             {
7811             oc->flag = TRUE;
7812             break;
7813             }
7814           }
7815         }
7816 
7817       /* So are Unicode property matches, if supported. */
7818 
7819 #ifdef SUPPORT_UCP
7820       else if (escape == ESC_P || escape == ESC_p)
7821         {
7822         BOOL negated;
7823         unsigned int ptype = 0, pdata = 0;
7824         if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
7825           goto FAILED;
7826         previous = code;
7827         *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
7828         *code++ = ptype;
7829         *code++ = pdata;
7830         }
7831 #else
7832 
7833       /* If Unicode properties are not supported, \X, \P, and \p are not
7834       allowed. */
7835 
7836       else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
7837         {
7838         *errorcodeptr = ERR45;
7839         goto FAILED;
7840         }
7841 #endif
7842 
7843       /* For the rest (including \X when Unicode properties are supported), we
7844       can obtain the OP value by negating the escape value in the default
7845       situation when PCRE_UCP is not set. When it *is* set, we substitute
7846       Unicode property tests. Note that \b and \B do a one-character
7847       lookbehind, and \A also behaves as if it does. */
7848 
7849       else
7850         {
7851         if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
7852              cd->max_lookbehind == 0)
7853           cd->max_lookbehind = 1;
7854 #ifdef SUPPORT_UCP
7855         if (escape >= ESC_DU && escape <= ESC_wu)
7856           {
7857           nestptr = ptr + 1;                   /* Where to resume */
7858           ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
7859           }
7860         else
7861 #endif
7862         /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
7863         so that it works in DFA mode and in lookbehinds. */
7864 
7865           {
7866           previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
7867           *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
7868           }
7869         }
7870       continue;
7871       }
7872 
7873     /* We have a data character whose value is in c. In UTF-8 mode it may have
7874     a value > 127. We set its representation in the length/buffer, and then
7875     handle it as a data character. */
7876 
7877 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
7878     if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
7879       mclength = PRIV(ord2utf)(c, mcbuffer);
7880     else
7881 #endif
7882 
7883      {
7884      mcbuffer[0] = c;
7885      mclength = 1;
7886      }
7887     goto ONE_CHAR;
7888 
7889 
7890     /* ===================================================================*/
7891     /* Handle a literal character. It is guaranteed not to be whitespace or #
7892     when the extended flag is set. If we are in a UTF mode, it may be a
7893     multi-unit literal character. */
7894 
7895     default:
7896     NORMAL_CHAR:
7897     mclength = 1;
7898     mcbuffer[0] = c;
7899 
7900 #ifdef SUPPORT_UTF
7901     if (utf && HAS_EXTRALEN(c))
7902       ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
7903 #endif
7904 
7905     /* At this point we have the character's bytes in mcbuffer, and the length
7906     in mclength. When not in UTF-8 mode, the length is always 1. */
7907 
7908     ONE_CHAR:
7909     previous = code;
7910 
7911     /* For caseless UTF-8 mode when UCP support is available, check whether
7912     this character has more than one other case. If so, generate a special
7913     OP_PROP item instead of OP_CHARI. */
7914 
7915 #ifdef SUPPORT_UCP
7916     if (utf && (options & PCRE_CASELESS) != 0)
7917       {
7918       GETCHAR(c, mcbuffer);
7919       if ((c = UCD_CASESET(c)) != 0)
7920         {
7921         *code++ = OP_PROP;
7922         *code++ = PT_CLIST;
7923         *code++ = c;
7924         if (firstcharflags == REQ_UNSET)
7925           firstcharflags = zerofirstcharflags = REQ_NONE;
7926         break;
7927         }
7928       }
7929 #endif
7930 
7931     /* Caseful matches, or not one of the multicase characters. */
7932 
7933     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
7934     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
7935 
7936     /* Remember if \r or \n were seen */
7937 
7938     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
7939       cd->external_flags |= PCRE_HASCRORLF;
7940 
7941     /* Set the first and required bytes appropriately. If no previous first
7942     byte, set it from this character, but revert to none on a zero repeat.
7943     Otherwise, leave the firstchar value alone, and don't change it on a zero
7944     repeat. */
7945 
7946     if (firstcharflags == REQ_UNSET)
7947       {
7948       zerofirstcharflags = REQ_NONE;
7949       zeroreqchar = reqchar;
7950       zeroreqcharflags = reqcharflags;
7951 
7952       /* If the character is more than one byte long, we can set firstchar
7953       only if it is not to be matched caselessly. */
7954 
7955       if (mclength == 1 || req_caseopt == 0)
7956         {
7957         firstchar = mcbuffer[0] | req_caseopt;
7958         firstchar = mcbuffer[0];
7959         firstcharflags = req_caseopt;
7960 
7961         if (mclength != 1)
7962           {
7963           reqchar = code[-1];
7964           reqcharflags = cd->req_varyopt;
7965           }
7966         }
7967       else firstcharflags = reqcharflags = REQ_NONE;
7968       }
7969 
7970     /* firstchar was previously set; we can set reqchar only if the length is
7971     1 or the matching is caseful. */
7972 
7973     else
7974       {
7975       zerofirstchar = firstchar;
7976       zerofirstcharflags = firstcharflags;
7977       zeroreqchar = reqchar;
7978       zeroreqcharflags = reqcharflags;
7979       if (mclength == 1 || req_caseopt == 0)
7980         {
7981         reqchar = code[-1];
7982         reqcharflags = req_caseopt | cd->req_varyopt;
7983         }
7984       }
7985 
7986     break;            /* End of literal character handling */
7987     }
7988   }                   /* end of big loop */
7989 
7990 
7991 /* Control never reaches here by falling through, only by a goto for all the
7992 error states. Pass back the position in the pattern so that it can be displayed
7993 to the user for diagnosing the error. */
7994 
7995 FAILED:
7996 *ptrptr = ptr;
7997 return FALSE;
7998 }
7999 
8000 
8001 
8002 /*************************************************
8003 *     Compile sequence of alternatives           *
8004 *************************************************/
8005 
8006 /* On entry, ptr is pointing past the bracket character, but on return it
8007 points to the closing bracket, or vertical bar, or end of string. The code
8008 variable is pointing at the byte into which the BRA operator has been stored.
8009 This function is used during the pre-compile phase when we are trying to find
8010 out the amount of memory needed, as well as during the real compile phase. The
8011 value of lengthptr distinguishes the two phases.
8012 
8013 Arguments:
8014   options           option bits, including any changes for this subpattern
8015   codeptr           -> the address of the current code pointer
8016   ptrptr            -> the address of the current pattern pointer
8017   errorcodeptr      -> pointer to error code variable
8018   lookbehind        TRUE if this is a lookbehind assertion
8019   reset_bracount    TRUE to reset the count for each branch
8020   skipbytes         skip this many bytes at start (for brackets and OP_COND)
8021   cond_depth        depth of nesting for conditional subpatterns
8022   firstcharptr      place to put the first required character
8023   firstcharflagsptr place to put the first character flags, or a negative number
8024   reqcharptr        place to put the last required character
8025   reqcharflagsptr   place to put the last required character flags, or a negative number
8026   bcptr             pointer to the chain of currently open branches
8027   cd                points to the data block with tables pointers etc.
8028   lengthptr         NULL during the real compile phase
8029                     points to length accumulator during pre-compile phase
8030 
8031 Returns:            TRUE on success
8032 */
8033 
8034 static BOOL
compile_regex(int options,pcre_uchar ** codeptr,const pcre_uchar ** ptrptr,int * errorcodeptr,BOOL lookbehind,BOOL reset_bracount,int skipbytes,int cond_depth,pcre_uint32 * firstcharptr,pcre_int32 * firstcharflagsptr,pcre_uint32 * reqcharptr,pcre_int32 * reqcharflagsptr,branch_chain * bcptr,compile_data * cd,int * lengthptr)8035 compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
8036   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
8037   int cond_depth,
8038   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
8039   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
8040   branch_chain *bcptr, compile_data *cd, int *lengthptr)
8041 {
8042 const pcre_uchar *ptr = *ptrptr;
8043 pcre_uchar *code = *codeptr;
8044 pcre_uchar *last_branch = code;
8045 pcre_uchar *start_bracket = code;
8046 pcre_uchar *reverse_count = NULL;
8047 open_capitem capitem;
8048 int capnumber = 0;
8049 pcre_uint32 firstchar, reqchar;
8050 pcre_int32 firstcharflags, reqcharflags;
8051 pcre_uint32 branchfirstchar, branchreqchar;
8052 pcre_int32 branchfirstcharflags, branchreqcharflags;
8053 int length;
8054 unsigned int orig_bracount;
8055 unsigned int max_bracount;
8056 branch_chain bc;
8057 
8058 /* If set, call the external function that checks for stack availability. */
8059 
8060 if (PUBL(stack_guard) != NULL && PUBL(stack_guard)())
8061   {
8062   *errorcodeptr= ERR85;
8063   return FALSE;
8064   }
8065 
8066 /* Miscellaneous initialization */
8067 
8068 bc.outer = bcptr;
8069 bc.current_branch = code;
8070 
8071 firstchar = reqchar = 0;
8072 firstcharflags = reqcharflags = REQ_UNSET;
8073 
8074 /* Accumulate the length for use in the pre-compile phase. Start with the
8075 length of the BRA and KET and any extra bytes that are required at the
8076 beginning. We accumulate in a local variable to save frequent testing of
8077 lenthptr for NULL. We cannot do this by looking at the value of code at the
8078 start and end of each alternative, because compiled items are discarded during
8079 the pre-compile phase so that the work space is not exceeded. */
8080 
8081 length = 2 + 2*LINK_SIZE + skipbytes;
8082 
8083 /* WARNING: If the above line is changed for any reason, you must also change
8084 the code that abstracts option settings at the start of the pattern and makes
8085 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
8086 pre-compile phase to find out whether anything has yet been compiled or not. */
8087 
8088 /* If this is a capturing subpattern, add to the chain of open capturing items
8089 so that we can detect them if (*ACCEPT) is encountered. This is also used to
8090 detect groups that contain recursive back references to themselves. Note that
8091 only OP_CBRA need be tested here; changing this opcode to one of its variants,
8092 e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
8093 
8094 if (*code == OP_CBRA)
8095   {
8096   capnumber = GET2(code, 1 + LINK_SIZE);
8097   capitem.number = capnumber;
8098   capitem.next = cd->open_caps;
8099   capitem.flag = FALSE;
8100   cd->open_caps = &capitem;
8101   }
8102 
8103 /* Offset is set zero to mark that this bracket is still open */
8104 
8105 PUT(code, 1, 0);
8106 code += 1 + LINK_SIZE + skipbytes;
8107 
8108 /* Loop for each alternative branch */
8109 
8110 orig_bracount = max_bracount = cd->bracount;
8111 for (;;)
8112   {
8113   /* For a (?| group, reset the capturing bracket count so that each branch
8114   uses the same numbers. */
8115 
8116   if (reset_bracount) cd->bracount = orig_bracount;
8117 
8118   /* Set up dummy OP_REVERSE if lookbehind assertion */
8119 
8120   if (lookbehind)
8121     {
8122     *code++ = OP_REVERSE;
8123     reverse_count = code;
8124     PUTINC(code, 0, 0);
8125     length += 1 + LINK_SIZE;
8126     }
8127 
8128   /* Now compile the branch; in the pre-compile phase its length gets added
8129   into the length. */
8130 
8131   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
8132         &branchfirstcharflags, &branchreqchar, &branchreqcharflags, &bc,
8133         cond_depth, cd, (lengthptr == NULL)? NULL : &length))
8134     {
8135     *ptrptr = ptr;
8136     return FALSE;
8137     }
8138 
8139   /* Keep the highest bracket count in case (?| was used and some branch
8140   has fewer than the rest. */
8141 
8142   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
8143 
8144   /* In the real compile phase, there is some post-processing to be done. */
8145 
8146   if (lengthptr == NULL)
8147     {
8148     /* If this is the first branch, the firstchar and reqchar values for the
8149     branch become the values for the regex. */
8150 
8151     if (*last_branch != OP_ALT)
8152       {
8153       firstchar = branchfirstchar;
8154       firstcharflags = branchfirstcharflags;
8155       reqchar = branchreqchar;
8156       reqcharflags = branchreqcharflags;
8157       }
8158 
8159     /* If this is not the first branch, the first char and reqchar have to
8160     match the values from all the previous branches, except that if the
8161     previous value for reqchar didn't have REQ_VARY set, it can still match,
8162     and we set REQ_VARY for the regex. */
8163 
8164     else
8165       {
8166       /* If we previously had a firstchar, but it doesn't match the new branch,
8167       we have to abandon the firstchar for the regex, but if there was
8168       previously no reqchar, it takes on the value of the old firstchar. */
8169 
8170       if (firstcharflags >= 0 &&
8171           (firstcharflags != branchfirstcharflags || firstchar != branchfirstchar))
8172         {
8173         if (reqcharflags < 0)
8174           {
8175           reqchar = firstchar;
8176           reqcharflags = firstcharflags;
8177           }
8178         firstcharflags = REQ_NONE;
8179         }
8180 
8181       /* If we (now or from before) have no firstchar, a firstchar from the
8182       branch becomes a reqchar if there isn't a branch reqchar. */
8183 
8184       if (firstcharflags < 0 && branchfirstcharflags >= 0 && branchreqcharflags < 0)
8185         {
8186         branchreqchar = branchfirstchar;
8187         branchreqcharflags = branchfirstcharflags;
8188         }
8189 
8190       /* Now ensure that the reqchars match */
8191 
8192       if (((reqcharflags & ~REQ_VARY) != (branchreqcharflags & ~REQ_VARY)) ||
8193           reqchar != branchreqchar)
8194         reqcharflags = REQ_NONE;
8195       else
8196         {
8197         reqchar = branchreqchar;
8198         reqcharflags |= branchreqcharflags; /* To "or" REQ_VARY */
8199         }
8200       }
8201 
8202     /* If lookbehind, check that this branch matches a fixed-length string, and
8203     put the length into the OP_REVERSE item. Temporarily mark the end of the
8204     branch with OP_END. If the branch contains OP_RECURSE, the result is -3
8205     because there may be forward references that we can't check here. Set a
8206     flag to cause another lookbehind check at the end. Why not do it all at the
8207     end? Because common, erroneous checks are picked up here and the offset of
8208     the problem can be shown. */
8209 
8210     if (lookbehind)
8211       {
8212       int fixed_length;
8213       *code = OP_END;
8214       fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
8215         FALSE, cd);
8216       DPRINTF(("fixed length = %d\n", fixed_length));
8217       if (fixed_length == -3)
8218         {
8219         cd->check_lookbehind = TRUE;
8220         }
8221       else if (fixed_length < 0)
8222         {
8223         *errorcodeptr = (fixed_length == -2)? ERR36 :
8224                         (fixed_length == -4)? ERR70: ERR25;
8225         *ptrptr = ptr;
8226         return FALSE;
8227         }
8228       else
8229         {
8230         if (fixed_length > cd->max_lookbehind)
8231           cd->max_lookbehind = fixed_length;
8232         PUT(reverse_count, 0, fixed_length);
8233         }
8234       }
8235     }
8236 
8237   /* Reached end of expression, either ')' or end of pattern. In the real
8238   compile phase, go back through the alternative branches and reverse the chain
8239   of offsets, with the field in the BRA item now becoming an offset to the
8240   first alternative. If there are no alternatives, it points to the end of the
8241   group. The length in the terminating ket is always the length of the whole
8242   bracketed item. Return leaving the pointer at the terminating char. */
8243 
8244   if (*ptr != CHAR_VERTICAL_LINE)
8245     {
8246     if (lengthptr == NULL)
8247       {
8248       int branch_length = (int)(code - last_branch);
8249       do
8250         {
8251         int prev_length = GET(last_branch, 1);
8252         PUT(last_branch, 1, branch_length);
8253         branch_length = prev_length;
8254         last_branch -= branch_length;
8255         }
8256       while (branch_length > 0);
8257       }
8258 
8259     /* Fill in the ket */
8260 
8261     *code = OP_KET;
8262     PUT(code, 1, (int)(code - start_bracket));
8263     code += 1 + LINK_SIZE;
8264 
8265     /* If it was a capturing subpattern, check to see if it contained any
8266     recursive back references. If so, we must wrap it in atomic brackets.
8267     Because we are moving code along, we must ensure that any pending recursive
8268     references are updated. In any event, remove the block from the chain. */
8269 
8270     if (capnumber > 0)
8271       {
8272       if (cd->open_caps->flag)
8273         {
8274         *code = OP_END;
8275         adjust_recurse(start_bracket, 1 + LINK_SIZE,
8276           (options & PCRE_UTF8) != 0, cd, cd->hwm);
8277         memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
8278           IN_UCHARS(code - start_bracket));
8279         *start_bracket = OP_ONCE;
8280         code += 1 + LINK_SIZE;
8281         PUT(start_bracket, 1, (int)(code - start_bracket));
8282         *code = OP_KET;
8283         PUT(code, 1, (int)(code - start_bracket));
8284         code += 1 + LINK_SIZE;
8285         length += 2 + 2*LINK_SIZE;
8286         }
8287       cd->open_caps = cd->open_caps->next;
8288       }
8289 
8290     /* Retain the highest bracket number, in case resetting was used. */
8291 
8292     cd->bracount = max_bracount;
8293 
8294     /* Set values to pass back */
8295 
8296     *codeptr = code;
8297     *ptrptr = ptr;
8298     *firstcharptr = firstchar;
8299     *firstcharflagsptr = firstcharflags;
8300     *reqcharptr = reqchar;
8301     *reqcharflagsptr = reqcharflags;
8302     if (lengthptr != NULL)
8303       {
8304       if (OFLOW_MAX - *lengthptr < length)
8305         {
8306         *errorcodeptr = ERR20;
8307         return FALSE;
8308         }
8309       *lengthptr += length;
8310       }
8311     return TRUE;
8312     }
8313 
8314   /* Another branch follows. In the pre-compile phase, we can move the code
8315   pointer back to where it was for the start of the first branch. (That is,
8316   pretend that each branch is the only one.)
8317 
8318   In the real compile phase, insert an ALT node. Its length field points back
8319   to the previous branch while the bracket remains open. At the end the chain
8320   is reversed. It's done like this so that the start of the bracket has a
8321   zero offset until it is closed, making it possible to detect recursion. */
8322 
8323   if (lengthptr != NULL)
8324     {
8325     code = *codeptr + 1 + LINK_SIZE + skipbytes;
8326     length += 1 + LINK_SIZE;
8327     }
8328   else
8329     {
8330     *code = OP_ALT;
8331     PUT(code, 1, (int)(code - last_branch));
8332     bc.current_branch = last_branch = code;
8333     code += 1 + LINK_SIZE;
8334     }
8335 
8336   ptr++;
8337   }
8338 /* Control never reaches here */
8339 }
8340 
8341 
8342 
8343 
8344 /*************************************************
8345 *          Check for anchored expression         *
8346 *************************************************/
8347 
8348 /* Try to find out if this is an anchored regular expression. Consider each
8349 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8350 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8351 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8352 be found, because ^ generates OP_CIRCM in that mode.
8353 
8354 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8355 This is the code for \G, which means "match at start of match position, taking
8356 into account the match offset".
8357 
8358 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8359 because that will try the rest of the pattern at all possible matching points,
8360 so there is no point trying again.... er ....
8361 
8362 .... except when the .* appears inside capturing parentheses, and there is a
8363 subsequent back reference to those parentheses. We haven't enough information
8364 to catch that case precisely.
8365 
8366 At first, the best we could do was to detect when .* was in capturing brackets
8367 and the highest back reference was greater than or equal to that level.
8368 However, by keeping a bitmap of the first 31 back references, we can catch some
8369 of the more common cases more precisely.
8370 
8371 ... A second exception is when the .* appears inside an atomic group, because
8372 this prevents the number of characters it matches from being adjusted.
8373 
8374 Arguments:
8375   code           points to start of expression (the bracket)
8376   bracket_map    a bitmap of which brackets we are inside while testing; this
8377                   handles up to substring 31; after that we just have to take
8378                   the less precise approach
8379   cd             points to the compile data block
8380   atomcount      atomic group level
8381 
8382 Returns:     TRUE or FALSE
8383 */
8384 
8385 static BOOL
is_anchored(register const pcre_uchar * code,unsigned int bracket_map,compile_data * cd,int atomcount)8386 is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
8387   compile_data *cd, int atomcount)
8388 {
8389 do {
8390    const pcre_uchar *scode = first_significant_code(
8391      code + PRIV(OP_lengths)[*code], FALSE);
8392    register int op = *scode;
8393 
8394    /* Non-capturing brackets */
8395 
8396    if (op == OP_BRA  || op == OP_BRAPOS ||
8397        op == OP_SBRA || op == OP_SBRAPOS)
8398      {
8399      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8400      }
8401 
8402    /* Capturing brackets */
8403 
8404    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8405             op == OP_SCBRA || op == OP_SCBRAPOS)
8406      {
8407      int n = GET2(scode, 1+LINK_SIZE);
8408      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8409      if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
8410      }
8411 
8412    /* Positive forward assertions and conditions */
8413 
8414    else if (op == OP_ASSERT || op == OP_COND)
8415      {
8416      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8417      }
8418 
8419    /* Atomic groups */
8420 
8421    else if (op == OP_ONCE || op == OP_ONCE_NC)
8422      {
8423      if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
8424        return FALSE;
8425      }
8426 
8427    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8428    it isn't in brackets that are or may be referenced or inside an atomic
8429    group. */
8430 
8431    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8432              op == OP_TYPEPOSSTAR))
8433      {
8434      if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
8435          atomcount > 0 || cd->had_pruneorskip)
8436        return FALSE;
8437      }
8438 
8439    /* Check for explicit anchoring */
8440 
8441    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8442 
8443    code += GET(code, 1);
8444    }
8445 while (*code == OP_ALT);   /* Loop for each alternative */
8446 return TRUE;
8447 }
8448 
8449 
8450 
8451 /*************************************************
8452 *         Check for starting with ^ or .*        *
8453 *************************************************/
8454 
8455 /* This is called to find out if every branch starts with ^ or .* so that
8456 "first char" processing can be done to speed things up in multiline
8457 matching and for non-DOTALL patterns that start with .* (which must start at
8458 the beginning or after \n). As in the case of is_anchored() (see above), we
8459 have to take account of back references to capturing brackets that contain .*
8460 because in that case we can't make the assumption. Also, the appearance of .*
8461 inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
8462 count, because once again the assumption no longer holds.
8463 
8464 Arguments:
8465   code           points to start of expression (the bracket)
8466   bracket_map    a bitmap of which brackets we are inside while testing; this
8467                   handles up to substring 31; after that we just have to take
8468                   the less precise approach
8469   cd             points to the compile data
8470   atomcount      atomic group level
8471 
8472 Returns:         TRUE or FALSE
8473 */
8474 
8475 static BOOL
is_startline(const pcre_uchar * code,unsigned int bracket_map,compile_data * cd,int atomcount)8476 is_startline(const pcre_uchar *code, unsigned int bracket_map,
8477   compile_data *cd, int atomcount)
8478 {
8479 do {
8480    const pcre_uchar *scode = first_significant_code(
8481      code + PRIV(OP_lengths)[*code], FALSE);
8482    register int op = *scode;
8483 
8484    /* If we are at the start of a conditional assertion group, *both* the
8485    conditional assertion *and* what follows the condition must satisfy the test
8486    for start of line. Other kinds of condition fail. Note that there may be an
8487    auto-callout at the start of a condition. */
8488 
8489    if (op == OP_COND)
8490      {
8491      scode += 1 + LINK_SIZE;
8492      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8493      switch (*scode)
8494        {
8495        case OP_CREF:
8496        case OP_DNCREF:
8497        case OP_RREF:
8498        case OP_DNRREF:
8499        case OP_DEF:
8500        return FALSE;
8501 
8502        default:     /* Assertion */
8503        if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8504        do scode += GET(scode, 1); while (*scode == OP_ALT);
8505        scode += 1 + LINK_SIZE;
8506        break;
8507        }
8508      scode = first_significant_code(scode, FALSE);
8509      op = *scode;
8510      }
8511 
8512    /* Non-capturing brackets */
8513 
8514    if (op == OP_BRA  || op == OP_BRAPOS ||
8515        op == OP_SBRA || op == OP_SBRAPOS)
8516      {
8517      if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8518      }
8519 
8520    /* Capturing brackets */
8521 
8522    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8523             op == OP_SCBRA || op == OP_SCBRAPOS)
8524      {
8525      int n = GET2(scode, 1+LINK_SIZE);
8526      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8527      if (!is_startline(scode, new_map, cd, atomcount)) return FALSE;
8528      }
8529 
8530    /* Positive forward assertions */
8531 
8532    else if (op == OP_ASSERT)
8533      {
8534      if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8535      }
8536 
8537    /* Atomic brackets */
8538 
8539    else if (op == OP_ONCE || op == OP_ONCE_NC)
8540      {
8541      if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE;
8542      }
8543 
8544    /* .* means "start at start or after \n" if it isn't in atomic brackets or
8545    brackets that may be referenced, as long as the pattern does not contain
8546    *PRUNE or *SKIP, because these break the feature. Consider, for example,
8547    /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
8548    start of a line. */
8549 
8550    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8551      {
8552      if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
8553          atomcount > 0 || cd->had_pruneorskip)
8554        return FALSE;
8555      }
8556 
8557    /* Check for explicit circumflex; anything else gives a FALSE result. Note
8558    in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
8559    because the number of characters matched by .* cannot be adjusted inside
8560    them. */
8561 
8562    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8563 
8564    /* Move on to the next alternative */
8565 
8566    code += GET(code, 1);
8567    }
8568 while (*code == OP_ALT);  /* Loop for each alternative */
8569 return TRUE;
8570 }
8571 
8572 
8573 
8574 /*************************************************
8575 *       Check for asserted fixed first char      *
8576 *************************************************/
8577 
8578 /* During compilation, the "first char" settings from forward assertions are
8579 discarded, because they can cause conflicts with actual literals that follow.
8580 However, if we end up without a first char setting for an unanchored pattern,
8581 it is worth scanning the regex to see if there is an initial asserted first
8582 char. If all branches start with the same asserted char, or with a
8583 non-conditional bracket all of whose alternatives start with the same asserted
8584 char (recurse ad lib), then we return that char, with the flags set to zero or
8585 REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
8586 
8587 Arguments:
8588   code       points to start of expression (the bracket)
8589   flags      points to the first char flags, or to REQ_NONE
8590   inassert   TRUE if in an assertion
8591 
8592 Returns:     the fixed first char, or 0 with REQ_NONE in flags
8593 */
8594 
8595 static pcre_uint32
find_firstassertedchar(const pcre_uchar * code,pcre_int32 * flags,BOOL inassert)8596 find_firstassertedchar(const pcre_uchar *code, pcre_int32 *flags,
8597   BOOL inassert)
8598 {
8599 register pcre_uint32 c = 0;
8600 int cflags = REQ_NONE;
8601 
8602 *flags = REQ_NONE;
8603 do {
8604    pcre_uint32 d;
8605    int dflags;
8606    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8607              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8608    const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
8609      TRUE);
8610    register pcre_uchar op = *scode;
8611 
8612    switch(op)
8613      {
8614      default:
8615      return 0;
8616 
8617      case OP_BRA:
8618      case OP_BRAPOS:
8619      case OP_CBRA:
8620      case OP_SCBRA:
8621      case OP_CBRAPOS:
8622      case OP_SCBRAPOS:
8623      case OP_ASSERT:
8624      case OP_ONCE:
8625      case OP_ONCE_NC:
8626      d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
8627      if (dflags < 0)
8628        return 0;
8629      if (cflags < 0) { c = d; cflags = dflags; } else if (c != d || cflags != dflags) return 0;
8630      break;
8631 
8632      case OP_EXACT:
8633      scode += IMM2_SIZE;
8634      /* Fall through */
8635 
8636      case OP_CHAR:
8637      case OP_PLUS:
8638      case OP_MINPLUS:
8639      case OP_POSPLUS:
8640      if (!inassert) return 0;
8641      if (cflags < 0) { c = scode[1]; cflags = 0; }
8642        else if (c != scode[1]) return 0;
8643      break;
8644 
8645      case OP_EXACTI:
8646      scode += IMM2_SIZE;
8647      /* Fall through */
8648 
8649      case OP_CHARI:
8650      case OP_PLUSI:
8651      case OP_MINPLUSI:
8652      case OP_POSPLUSI:
8653      if (!inassert) return 0;
8654      if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8655        else if (c != scode[1]) return 0;
8656      break;
8657      }
8658 
8659    code += GET(code, 1);
8660    }
8661 while (*code == OP_ALT);
8662 
8663 *flags = cflags;
8664 return c;
8665 }
8666 
8667 
8668 
8669 /*************************************************
8670 *     Add an entry to the name/number table      *
8671 *************************************************/
8672 
8673 /* This function is called between compiling passes to add an entry to the
8674 name/number table, maintaining alphabetical order. Checking for permitted
8675 and forbidden duplicates has already been done.
8676 
8677 Arguments:
8678   cd           the compile data block
8679   name         the name to add
8680   length       the length of the name
8681   groupno      the group number
8682 
8683 Returns:       nothing
8684 */
8685 
8686 static void
add_name(compile_data * cd,const pcre_uchar * name,int length,unsigned int groupno)8687 add_name(compile_data *cd, const pcre_uchar *name, int length,
8688   unsigned int groupno)
8689 {
8690 int i;
8691 pcre_uchar *slot = cd->name_table;
8692 
8693 for (i = 0; i < cd->names_found; i++)
8694   {
8695   int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));
8696   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8697     crc = -1; /* Current name is a substring */
8698 
8699   /* Make space in the table and break the loop for an earlier name. For a
8700   duplicate or later name, carry on. We do this for duplicates so that in the
8701   simple case (when ?(| is not used) they are in order of their numbers. In all
8702   cases they are in the order in which they appear in the pattern. */
8703 
8704   if (crc < 0)
8705     {
8706     memmove(slot + cd->name_entry_size, slot,
8707       IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
8708     break;
8709     }
8710 
8711   /* Continue the loop for a later or duplicate name */
8712 
8713   slot += cd->name_entry_size;
8714   }
8715 
8716 PUT2(slot, 0, groupno);
8717 memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
8718 slot[IMM2_SIZE + length] = 0;
8719 cd->names_found++;
8720 }
8721 
8722 
8723 
8724 /*************************************************
8725 *        Compile a Regular Expression            *
8726 *************************************************/
8727 
8728 /* This function takes a string and returns a pointer to a block of store
8729 holding a compiled version of the expression. The original API for this
8730 function had no error code return variable; it is retained for backwards
8731 compatibility. The new function is given a new name.
8732 
8733 Arguments:
8734   pattern       the regular expression
8735   options       various option bits
8736   errorcodeptr  pointer to error code variable (pcre_compile2() only)
8737                   can be NULL if you don't want a code value
8738   errorptr      pointer to pointer to error text
8739   erroroffset   ptr offset in pattern where error was detected
8740   tables        pointer to character tables or NULL
8741 
8742 Returns:        pointer to compiled data block, or NULL on error,
8743                 with errorptr and erroroffset set
8744 */
8745 
/* Exactly one of the definitions below is compiled, selected by the library
width that is being built. Each one just forwards its arguments to the
corresponding compile2 function, passing NULL as the error code pointer. */

#if defined COMPILE_PCRE8
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile(const char *pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}
#elif defined COMPILE_PCRE16
PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}
#elif defined COMPILE_PCRE32
PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}
#endif
8768 
8769 
8770 #if defined COMPILE_PCRE8
8771 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile2(const char * pattern,int options,int * errorcodeptr,const char ** errorptr,int * erroroffset,const unsigned char * tables)8772 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
8773   const char **errorptr, int *erroroffset, const unsigned char *tables)
8774 #elif defined COMPILE_PCRE16
8775 PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
8776 pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
8777   const char **errorptr, int *erroroffset, const unsigned char *tables)
8778 #elif defined COMPILE_PCRE32
8779 PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
8780 pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr,
8781   const char **errorptr, int *erroroffset, const unsigned char *tables)
8782 #endif
8783 {
8784 REAL_PCRE *re;
8785 int length = 1;  /* For final END opcode */
8786 pcre_int32 firstcharflags, reqcharflags;
8787 pcre_uint32 firstchar, reqchar;
8788 pcre_uint32 limit_match = PCRE_UINT32_MAX;
8789 pcre_uint32 limit_recursion = PCRE_UINT32_MAX;
8790 int newline;
8791 int errorcode = 0;
8792 int skipatstart = 0;
8793 BOOL utf;
8794 BOOL never_utf = FALSE;
8795 size_t size;
8796 pcre_uchar *code;
8797 const pcre_uchar *codestart;
8798 const pcre_uchar *ptr;
8799 compile_data compile_block;
8800 compile_data *cd = &compile_block;
8801 
8802 /* This space is used for "compiling" into during the first phase, when we are
8803 computing the amount of memory that is needed. Compiled items are thrown away
8804 as soon as possible, so that a fairly large buffer should be sufficient for
8805 this purpose. The same space is used in the second phase for remembering where
8806 to fill in forward references to subpatterns. That may overflow, in which case
8807 new memory is obtained from malloc(). */
8808 
8809 pcre_uchar cworkspace[COMPILE_WORK_SIZE];
8810 
8811 /* This vector is used for remembering name groups during the pre-compile. In a
8812 similar way to cworkspace, it can be expanded using malloc() if necessary. */
8813 
8814 named_group named_groups[NAMED_GROUP_LIST_SIZE];
8815 
8816 /* Set this early so that early errors get offset 0. */
8817 
8818 ptr = (const pcre_uchar *)pattern;
8819 
8820 /* We can't pass back an error message if errorptr is NULL; I guess the best we
8821 can do is just return NULL, but we can set a code value if there is a code
8822 pointer. */
8823 
8824 if (errorptr == NULL)
8825   {
8826   if (errorcodeptr != NULL) *errorcodeptr = 99;
8827   return NULL;
8828   }
8829 
8830 *errorptr = NULL;
8831 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
8832 
8833 /* However, we can give a message for this error */
8834 
8835 if (erroroffset == NULL)
8836   {
8837   errorcode = ERR16;
8838   goto PCRE_EARLY_ERROR_RETURN2;
8839   }
8840 
8841 *erroroffset = 0;
8842 
8843 /* Set up pointers to the individual character tables */
8844 
8845 if (tables == NULL) tables = PRIV(default_tables);
8846 cd->lcc = tables + lcc_offset;
8847 cd->fcc = tables + fcc_offset;
8848 cd->cbits = tables + cbits_offset;
8849 cd->ctypes = tables + ctypes_offset;
8850 
8851 /* Check that all undefined public option bits are zero */
8852 
8853 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
8854   {
8855   errorcode = ERR17;
8856   goto PCRE_EARLY_ERROR_RETURN;
8857   }
8858 
8859 /* If PCRE_NEVER_UTF is set, remember it. */
8860 
8861 if ((options & PCRE_NEVER_UTF) != 0) never_utf = TRUE;
8862 
8863 /* Check for global one-time settings at the start of the pattern, and remember
8864 the offset for later. */
8865 
8866 cd->external_flags = 0;   /* Initialize here for LIMIT_MATCH/RECURSION */
8867 
8868 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
8869        ptr[skipatstart+1] == CHAR_ASTERISK)
8870   {
8871   int newnl = 0;
8872   int newbsr = 0;
8873 
8874 /* For completeness and backward compatibility, (*UTFn) is supported in the
8875 relevant libraries, but (*UTF) is generic and always supported. Note that
8876 PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
8877 
8878 #ifdef COMPILE_PCRE8
8879   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)
8880     { skipatstart += 7; options |= PCRE_UTF8; continue; }
8881 #endif
8882 #ifdef COMPILE_PCRE16
8883   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF16_RIGHTPAR, 6) == 0)
8884     { skipatstart += 8; options |= PCRE_UTF16; continue; }
8885 #endif
8886 #ifdef COMPILE_PCRE32
8887   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF32_RIGHTPAR, 6) == 0)
8888     { skipatstart += 8; options |= PCRE_UTF32; continue; }
8889 #endif
8890 
8891   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 4) == 0)
8892     { skipatstart += 6; options |= PCRE_UTF8; continue; }
8893   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
8894     { skipatstart += 6; options |= PCRE_UCP; continue; }
8895   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
8896     { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; }
8897   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
8898     { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
8899 
8900   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_MATCH_EQ, 12) == 0)
8901     {
8902     pcre_uint32 c = 0;
8903     int p = skipatstart + 14;
8904     while (isdigit(ptr[p]))
8905       {
8906       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow */
8907       c = c*10 + ptr[p++] - CHAR_0;
8908       }
8909     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
8910     if (c < limit_match)
8911       {
8912       limit_match = c;
8913       cd->external_flags |= PCRE_MLSET;
8914       }
8915     skipatstart = p;
8916     continue;
8917     }
8918 
8919   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_RECURSION_EQ, 16) == 0)
8920     {
8921     pcre_uint32 c = 0;
8922     int p = skipatstart + 18;
8923     while (isdigit(ptr[p]))
8924       {
8925       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow check */
8926       c = c*10 + ptr[p++] - CHAR_0;
8927       }
8928     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
8929     if (c < limit_recursion)
8930       {
8931       limit_recursion = c;
8932       cd->external_flags |= PCRE_RLSET;
8933       }
8934     skipatstart = p;
8935     continue;
8936     }
8937 
8938   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
8939     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
8940   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3)  == 0)
8941     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
8942   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5)  == 0)
8943     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
8944   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
8945     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
8946   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
8947     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
8948 
8949   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
8950     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
8951   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
8952     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
8953 
8954   if (newnl != 0)
8955     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
8956   else if (newbsr != 0)
8957     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
8958   else break;
8959   }
8960 
8961 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
8962 utf = (options & PCRE_UTF8) != 0;
8963 if (utf && never_utf)
8964   {
8965   errorcode = ERR78;
8966   goto PCRE_EARLY_ERROR_RETURN2;
8967   }
8968 
8969 /* Can't support UTF unless PCRE has been compiled to include the code. The
8970 return of an error code from PRIV(valid_utf)() is a new feature, introduced in
8971 release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
8972 not used here. */
8973 
8974 #ifdef SUPPORT_UTF
8975 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
8976      (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
8977   {
8978 #if defined COMPILE_PCRE8
8979   errorcode = ERR44;
8980 #elif defined COMPILE_PCRE16
8981   errorcode = ERR74;
8982 #elif defined COMPILE_PCRE32
8983   errorcode = ERR77;
8984 #endif
8985   goto PCRE_EARLY_ERROR_RETURN2;
8986   }
8987 #else
8988 if (utf)
8989   {
8990   errorcode = ERR32;
8991   goto PCRE_EARLY_ERROR_RETURN;
8992   }
8993 #endif
8994 
8995 /* Can't support UCP unless PCRE has been compiled to include the code. */
8996 
8997 #ifndef SUPPORT_UCP
8998 if ((options & PCRE_UCP) != 0)
8999   {
9000   errorcode = ERR67;
9001   goto PCRE_EARLY_ERROR_RETURN;
9002   }
9003 #endif
9004 
9005 /* Check validity of \R options. */
9006 
9007 if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
9008      (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
9009   {
9010   errorcode = ERR56;
9011   goto PCRE_EARLY_ERROR_RETURN;
9012   }
9013 
9014 /* Handle different types of newline. The three bits give seven cases. The
9015 current code allows for fixed one- or two-byte sequences, plus "any" and
9016 "anycrlf". */
9017 
9018 switch (options & PCRE_NEWLINE_BITS)
9019   {
9020   case 0: newline = NEWLINE; break;   /* Build-time default */
9021   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
9022   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
9023   case PCRE_NEWLINE_CR+
9024        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
9025   case PCRE_NEWLINE_ANY: newline = -1; break;
9026   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
9027   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
9028   }
9029 
9030 if (newline == -2)
9031   {
9032   cd->nltype = NLTYPE_ANYCRLF;
9033   }
9034 else if (newline < 0)
9035   {
9036   cd->nltype = NLTYPE_ANY;
9037   }
9038 else
9039   {
9040   cd->nltype = NLTYPE_FIXED;
9041   if (newline > 255)
9042     {
9043     cd->nllen = 2;
9044     cd->nl[0] = (newline >> 8) & 255;
9045     cd->nl[1] = newline & 255;
9046     }
9047   else
9048     {
9049     cd->nllen = 1;
9050     cd->nl[0] = newline;
9051     }
9052   }
9053 
9054 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9055 references to help in deciding whether (.*) can be treated as anchored or not.
9056 */
9057 
9058 cd->top_backref = 0;
9059 cd->backref_map = 0;
9060 
9061 /* Reflect pattern for debugging output */
9062 
9063 DPRINTF(("------------------------------------------------------------------\n"));
9064 #ifdef PCRE_DEBUG
9065 print_puchar(stdout, (PCRE_PUCHAR)pattern);
9066 #endif
9067 DPRINTF(("\n"));
9068 
9069 /* Pretend to compile the pattern while actually just accumulating the length
9070 of memory required. This behaviour is triggered by passing a non-NULL final
9071 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
9072 to compile parts of the pattern into; the compiled code is discarded when it is
9073 no longer needed, so hopefully this workspace will never overflow, though there
9074 is a test for its doing so. */
9075 
9076 cd->bracount = cd->final_bracount = 0;
9077 cd->names_found = 0;
9078 cd->name_entry_size = 0;
9079 cd->name_table = NULL;
9080 cd->dupnames = FALSE;
9081 cd->namedrefcount = 0;
9082 cd->start_code = cworkspace;
9083 cd->hwm = cworkspace;
9084 cd->start_workspace = cworkspace;
9085 cd->workspace_size = COMPILE_WORK_SIZE;
9086 cd->named_groups = named_groups;
9087 cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
9088 cd->start_pattern = (const pcre_uchar *)pattern;
9089 cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
9090 cd->req_varyopt = 0;
9091 cd->parens_depth = 0;
9092 cd->assert_depth = 0;
9093 cd->max_lookbehind = 0;
9094 cd->external_options = options;
9095 cd->open_caps = NULL;
9096 
9097 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
9098 don't need to look at the result of the function here. The initial options have
9099 been put into the cd block so that they can be changed if an option setting is
9100 found within the regex right at the beginning. Bringing initial option settings
9101 outside can help speed up starting point checks. */
9102 
9103 ptr += skipatstart;
9104 code = cworkspace;
9105 *code = OP_BRA;
9106 
9107 (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
9108   FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
9109   cd, &length);
9110 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
9111 
9112 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
9113   (int)(cd->hwm - cworkspace)));
9114 
9115 if (length > MAX_PATTERN_SIZE)
9116   {
9117   errorcode = ERR20;
9118   goto PCRE_EARLY_ERROR_RETURN;
9119   }
9120 
9121 /* If there are groups with duplicate names and there are also references by
9122 name, we must allow for the possibility of named references to duplicated
9123 groups. These require an extra data item each. */
9124 
9125 if (cd->dupnames && cd->namedrefcount > 0)
9126   length += cd->namedrefcount * IMM2_SIZE * sizeof(pcre_uchar);
9127 
9128 /* Compute the size of the data block for storing the compiled pattern. Integer
9129 overflow should no longer be possible because nowadays we limit the maximum
9130 value of cd->names_found and cd->name_entry_size. */
9131 
9132 size = sizeof(REAL_PCRE) +
9133   (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
9134 
9135 /* Get the memory. */
9136 
9137 re = (REAL_PCRE *)(PUBL(malloc))(size);
9138 if (re == NULL)
9139   {
9140   errorcode = ERR21;
9141   goto PCRE_EARLY_ERROR_RETURN;
9142   }
9143 
9144 /* Put in the magic number, and save the sizes, initial options, internal
9145 flags, and character table pointer. NULL is used for the default character
9146 tables. The nullpad field is at the end; it's there to help in the case when a
9147 regex compiled on a system with 4-byte pointers is run on another with 8-byte
9148 pointers. */
9149 
9150 re->magic_number = MAGIC_NUMBER;
9151 re->size = (int)size;
9152 re->options = cd->external_options;
9153 re->flags = cd->external_flags;
9154 re->limit_match = limit_match;
9155 re->limit_recursion = limit_recursion;
9156 re->first_char = 0;
9157 re->req_char = 0;
9158 re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
9159 re->name_entry_size = cd->name_entry_size;
9160 re->name_count = cd->names_found;
9161 re->ref_count = 0;
9162 re->tables = (tables == PRIV(default_tables))? NULL : tables;
9163 re->nullpad = NULL;
9164 #ifdef COMPILE_PCRE32
9165 re->dummy = 0;
9166 #else
9167 re->dummy1 = re->dummy2 = re->dummy3 = 0;
9168 #endif
9169 
9170 /* The starting points of the name/number translation table and of the code are
9171 passed around in the compile data block. The start/end pattern and initial
9172 options are already set from the pre-compile phase, as is the name_entry_size
9173 field. Reset the bracket count and the names_found field. Also reset the hwm
9174 field; this time it's used for remembering forward references to subpatterns.
9175 */
9176 
9177 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
9178 cd->parens_depth = 0;
9179 cd->assert_depth = 0;
9180 cd->bracount = 0;
9181 cd->max_lookbehind = 0;
9182 cd->name_table = (pcre_uchar *)re + re->name_table_offset;
9183 codestart = cd->name_table + re->name_entry_size * re->name_count;
9184 cd->start_code = codestart;
9185 cd->hwm = (pcre_uchar *)(cd->start_workspace);
9186 cd->req_varyopt = 0;
9187 cd->had_accept = FALSE;
9188 cd->had_pruneorskip = FALSE;
9189 cd->check_lookbehind = FALSE;
9190 cd->open_caps = NULL;
9191 
9192 /* If any named groups were found, create the name/number table from the list
9193 created in the first pass. */
9194 
9195 if (cd->names_found > 0)
9196   {
9197   int i = cd->names_found;
9198   named_group *ng = cd->named_groups;
9199   cd->names_found = 0;
9200   for (; i > 0; i--, ng++)
9201     add_name(cd, ng->name, ng->length, ng->number);
9202   if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
9203     (PUBL(free))((void *)cd->named_groups);
9204   }
9205 
9206 /* Set up a starting, non-extracting bracket, then compile the expression. On
9207 error, errorcode will be set non-zero, so we don't need to look at the result
9208 of the function here. */
9209 
9210 ptr = (const pcre_uchar *)pattern + skipatstart;
9211 code = (pcre_uchar *)codestart;
9212 *code = OP_BRA;
9213 (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
9214   &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, cd, NULL);
9215 re->top_bracket = cd->bracount;
9216 re->top_backref = cd->top_backref;
9217 re->max_lookbehind = cd->max_lookbehind;
9218 re->flags = cd->external_flags | PCRE_MODE;
9219 
9220 if (cd->had_accept)
9221   {
9222   reqchar = 0;              /* Must disable after (*ACCEPT) */
9223   reqcharflags = REQ_NONE;
9224   }
9225 
9226 /* If not reached end of pattern on success, there's an excess bracket. */
9227 
9228 if (errorcode == 0 && *ptr != CHAR_NULL) errorcode = ERR22;
9229 
9230 /* Fill in the terminating state and check for disastrous overflow, but
9231 if debugging, leave the test till after things are printed out. */
9232 
9233 *code++ = OP_END;
9234 
9235 #ifndef PCRE_DEBUG
9236 if (code - codestart > length) errorcode = ERR23;
9237 #endif
9238 
9239 #ifdef SUPPORT_VALGRIND
9240 /* If the estimated length exceeds the really used length, mark the extra
9241 allocated memory as unaddressable, so that any out-of-bound reads can be
9242 detected. */
9243 VALGRIND_MAKE_MEM_NOACCESS(code, (length - (code - codestart)) * sizeof(pcre_uchar));
9244 #endif
9245 
9246 /* Fill in any forward references that are required. There may be repeated
9247 references; optimize for them, as searching a large regex takes time. */
9248 
9249 if (cd->hwm > cd->start_workspace)
9250   {
9251   int prev_recno = -1;
9252   const pcre_uchar *groupptr = NULL;
9253   while (errorcode == 0 && cd->hwm > cd->start_workspace)
9254     {
9255     int offset, recno;
9256     cd->hwm -= LINK_SIZE;
9257     offset = GET(cd->hwm, 0);
9258     recno = GET(codestart, offset);
9259     if (recno != prev_recno)
9260       {
9261       groupptr = PRIV(find_bracket)(codestart, utf, recno);
9262       prev_recno = recno;
9263       }
9264     if (groupptr == NULL) errorcode = ERR53;
9265       else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
9266     }
9267   }
9268 
9269 /* If the workspace had to be expanded, free the new memory. Set the pointer to
9270 NULL to indicate that forward references have been filled in. */
9271 
9272 if (cd->workspace_size > COMPILE_WORK_SIZE)
9273   (PUBL(free))((void *)cd->start_workspace);
9274 cd->start_workspace = NULL;
9275 
9276 /* Give an error if there's a back reference to a non-existent capturing
9277 subpattern. */
9278 
9279 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
9280 
9281 /* Unless disabled, check whether any single character iterators can be
9282 auto-possessified. The function overwrites the appropriate opcode values, so
9283 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
9284 used in this code because at least one compiler gives a warning about loss of
9285 "const" attribute if the cast (pcre_uchar *)codestart is used directly in the
9286 function call. */
9287 
9288 if ((options & PCRE_NO_AUTO_POSSESS) == 0)
9289   {
9290   pcre_uchar *temp = (pcre_uchar *)codestart;
9291   auto_possessify(temp, utf, cd);
9292   }
9293 
9294 /* If there were any lookbehind assertions that contained OP_RECURSE
9295 (recursions or subroutine calls), a flag is set for them to be checked here,
9296 because they may contain forward references. Actual recursions cannot be fixed
9297 length, but subroutine calls can. It is done like this so that those without
9298 OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
9299 exceptional ones forgo this. We scan the pattern to check that they are fixed
9300 length, and set their lengths. */
9301 
9302 if (cd->check_lookbehind)
9303   {
9304   pcre_uchar *cc = (pcre_uchar *)codestart;
9305 
9306   /* Loop, searching for OP_REVERSE items, and process those that do not have
9307   their length set. (Actually, it will also re-process any that have a length
9308   of zero, but that is a pathological case, and it does no harm.) When we find
9309   one, we temporarily terminate the branch it is in while we scan it. */
9310 
9311   for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
9312        cc != NULL;
9313        cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
9314     {
9315     if (GET(cc, 1) == 0)
9316       {
9317       int fixed_length;
9318       pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
9319       int end_op = *be;
9320       *be = OP_END;
9321       fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
9322         cd);
9323       *be = end_op;
9324       DPRINTF(("fixed length = %d\n", fixed_length));
9325       if (fixed_length < 0)
9326         {
9327         errorcode = (fixed_length == -2)? ERR36 :
9328                     (fixed_length == -4)? ERR70 : ERR25;
9329         break;
9330         }
9331       if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
9332       PUT(cc, 1, fixed_length);
9333       }
9334     cc += 1 + LINK_SIZE;
9335     }
9336   }
9337 
9338 /* Failed to compile, or error while post-processing */
9339 
9340 if (errorcode != 0)
9341   {
9342   (PUBL(free))(re);
9343   PCRE_EARLY_ERROR_RETURN:
9344   *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
9345   PCRE_EARLY_ERROR_RETURN2:
9346   *errorptr = find_error_text(errorcode);
9347   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
9348   return NULL;
9349   }
9350 
9351 /* If the anchored option was not passed, set the flag if we can determine that
9352 the pattern is anchored by virtue of ^ characters or \A or anything else, such
9353 as starting with non-atomic .* when DOTALL is set and there are no occurrences
9354 of *PRUNE or *SKIP.
9355 
9356 Otherwise, if we know what the first byte has to be, save it, because that
9357 speeds up unanchored matches no end. If not, see if we can set the
9358 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
9359 start with ^, and also when all branches start with non-atomic .* for
9360 non-DOTALL matches when *PRUNE and *SKIP are not present. */
9361 
9362 if ((re->options & PCRE_ANCHORED) == 0)
9363   {
9364   if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
9365   else
9366     {
9367     if (firstcharflags < 0)
9368       firstchar = find_firstassertedchar(codestart, &firstcharflags, FALSE);
9369     if (firstcharflags >= 0)   /* Remove caseless flag for non-caseable chars */
9370       {
9371 #if defined COMPILE_PCRE8
9372       re->first_char = firstchar & 0xff;
9373 #elif defined COMPILE_PCRE16
9374       re->first_char = firstchar & 0xffff;
9375 #elif defined COMPILE_PCRE32
9376       re->first_char = firstchar;
9377 #endif
9378       if ((firstcharflags & REQ_CASELESS) != 0)
9379         {
9380 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9381         /* We ignore non-ASCII first chars in 8 bit mode. */
9382         if (utf)
9383           {
9384           if (re->first_char < 128)
9385             {
9386             if (cd->fcc[re->first_char] != re->first_char)
9387               re->flags |= PCRE_FCH_CASELESS;
9388             }
9389           else if (UCD_OTHERCASE(re->first_char) != re->first_char)
9390             re->flags |= PCRE_FCH_CASELESS;
9391           }
9392         else
9393 #endif
9394         if (MAX_255(re->first_char)
9395             && cd->fcc[re->first_char] != re->first_char)
9396           re->flags |= PCRE_FCH_CASELESS;
9397         }
9398 
9399       re->flags |= PCRE_FIRSTSET;
9400       }
9401 
9402     else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE;
9403     }
9404   }
9405 
9406 /* For an anchored pattern, we use the "required byte" only if it follows a
9407 variable length item in the regex. Remove the caseless flag for non-caseable
9408 bytes. */
9409 
9410 if (reqcharflags >= 0 &&
9411      ((re->options & PCRE_ANCHORED) == 0 || (reqcharflags & REQ_VARY) != 0))
9412   {
9413 #if defined COMPILE_PCRE8
9414   re->req_char = reqchar & 0xff;
9415 #elif defined COMPILE_PCRE16
9416   re->req_char = reqchar & 0xffff;
9417 #elif defined COMPILE_PCRE32
9418   re->req_char = reqchar;
9419 #endif
9420   if ((reqcharflags & REQ_CASELESS) != 0)
9421     {
9422 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9423     /* We ignore non-ASCII required chars in 8 bit mode. */
9424     if (utf)
9425       {
9426       if (re->req_char < 128)
9427         {
9428         if (cd->fcc[re->req_char] != re->req_char)
9429           re->flags |= PCRE_RCH_CASELESS;
9430         }
9431       else if (UCD_OTHERCASE(re->req_char) != re->req_char)
9432         re->flags |= PCRE_RCH_CASELESS;
9433       }
9434     else
9435 #endif
9436     if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
9437       re->flags |= PCRE_RCH_CASELESS;
9438     }
9439 
9440   re->flags |= PCRE_REQCHSET;
9441   }
9442 
9443 /* Print out the compiled data if debugging is enabled. This is never the
9444 case when building a production library. */
9445 
9446 #ifdef PCRE_DEBUG
9447 printf("Length = %d top_bracket = %d top_backref = %d\n",
9448   length, re->top_bracket, re->top_backref);
9449 
9450 printf("Options=%08x\n", re->options);
9451 
9452 if ((re->flags & PCRE_FIRSTSET) != 0)
9453   {
9454   pcre_uchar ch = re->first_char;
9455   const char *caseless =
9456     ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
9457   if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
9458     else printf("First char = \\x%02x%s\n", ch, caseless);
9459   }
9460 
9461 if ((re->flags & PCRE_REQCHSET) != 0)
9462   {
9463   pcre_uchar ch = re->req_char;
9464   const char *caseless =
9465     ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
9466   if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
9467     else printf("Req char = \\x%02x%s\n", ch, caseless);
9468   }
9469 
9470 #if defined COMPILE_PCRE8
9471 pcre_printint((pcre *)re, stdout, TRUE);
9472 #elif defined COMPILE_PCRE16
9473 pcre16_printint((pcre *)re, stdout, TRUE);
9474 #elif defined COMPILE_PCRE32
9475 pcre32_printint((pcre *)re, stdout, TRUE);
9476 #endif
9477 
9478 /* This check is done here in the debugging case so that the code that
9479 was compiled can be seen. */
9480 
9481 if (code - codestart > length)
9482   {
9483   (PUBL(free))(re);
9484   *errorptr = find_error_text(ERR23);
9485   *erroroffset = ptr - (pcre_uchar *)pattern;
9486   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
9487   return NULL;
9488   }
9489 #endif   /* PCRE_DEBUG */
9490 
9491 /* Check for a pattern that can match an empty string, so that this information
9492 can be provided to applications. */
9493 
9494 do
9495   {
9496   if (could_be_empty_branch(codestart, code, utf, cd, NULL))
9497     {
9498     re->flags |= PCRE_MATCH_EMPTY;
9499     break;
9500     }
9501   codestart += GET(codestart, 1);
9502   }
9503 while (*codestart == OP_ALT);
9504 
9505 #if defined COMPILE_PCRE8
9506 return (pcre *)re;
9507 #elif defined COMPILE_PCRE16
9508 return (pcre16 *)re;
9509 #elif defined COMPILE_PCRE32
9510 return (pcre32 *)re;
9511 #endif
9512 }
9513 
9514 /* End of pcre_compile.c */
9515 
9516