1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10          New API code Copyright (c) 2016 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 /* This module contains mode-dependent macro and structure definitions. The
43 file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined.
44 These mode-dependent items are kept in a separate file so that they can also be
45 #included multiple times for different code unit widths by pcre2test in order
46 to have access to the hidden structures at all supported widths.
47 
Some of the mode-dependent macros are required at different widths for
different parts of the pcre2test code (in particular, the included
pcre2_printint.c file). We undefine them here so that they can be re-defined
for multiple inclusions. Not all of these are used in pcre2test, but it's
easier just to undefine them all. */
53 
/* Link (offset) and 16-bit immediate value access */
#undef GET
#undef PUT
#undef PUTINC
#undef MAX_PATTERN_SIZE
#undef IMM2_SIZE
#undef GET2
#undef PUT2
#undef PUT2INC

/* Code unit <-> byte size conversion */
#undef BYTES2CU
#undef CU2BYTES

/* Limits and 256-entry table access */
#undef MAX_255
#undef MAX_MARK
#undef TABLE_GET

/* UTF code unit properties */
#undef MAX_UTF_SINGLE_CU
#undef HAS_EXTRALEN
#undef GET_EXTRALEN
#undef NOT_FIRSTCU

/* Character fetch and store */
#undef GETCHAR
#undef GETCHARTEST
#undef GETCHARINC
#undef GETCHARINCTEST
#undef GETCHARLEN
#undef GETCHARLENTEST
#undef PUTCHAR

/* Pointer adjustment to character boundaries */
#undef BACKCHAR
#undef FORWARDCHAR
#undef FORWARDCHARTEST
#undef ACROSSCHAR
82 
83 
84 
85 /* -------------------------- MACROS ----------------------------- */
86 
87 /* PCRE keeps offsets in its compiled code as at least 16-bit quantities
88 (always stored in big-endian order in 8-bit mode) by default. These are used,
89 for example, to link from the start of a subpattern to its alternatives and its
90 end. The use of 16 bits per offset limits the size of an 8-bit compiled regex
91 to around 64K, which is big enough for almost everybody. However, I received a
92 request for an even bigger limit. For this reason, and also to make the code
93 easier to maintain, the storing and loading of offsets from the compiled code
94 unit string is now handled by the macros that are defined here.
95 
96 The macros are controlled by the value of LINK_SIZE. This defaults to 2, but
97 values of 3 or 4 are also supported. */
98 
99 /* ------------------- 8-bit support  ------------------ */
100 
101 #if PCRE2_CODE_UNIT_WIDTH == 8
102 
103 #if LINK_SIZE == 2
/* 8-bit code units, LINK_SIZE 2: a link occupies two code units, most
significant first. PUT stores bits 15..0 of d at a[n] and a[n+1]; GET reads
them back as an unsigned int. The arguments a and n are evaluated more than
once, so they must not have side effects. Every argument is parenthesized so
that pointer expressions such as "code + 1" can be passed safely. */

#define PUT(a,n,d)   \
  (((a)[n] = (PCRE2_UCHAR)((d) >> 8)), \
   ((a)[(n)+1] = (PCRE2_UCHAR)((d) & 255)))
#define GET(a,n) \
  (((unsigned int)(a)[n] << 8) | (unsigned int)(a)[(n)+1])
#define MAX_PATTERN_SIZE (1 << 16)
110 
111 #elif LINK_SIZE == 3
/* 8-bit code units, LINK_SIZE 3: a link occupies three code units, most
significant first. The arguments a and n are evaluated more than once. All
arguments are parenthesized so that pointer expressions can be passed. */

#define PUT(a,n,d)       \
  (((a)[n] = (PCRE2_UCHAR)((d) >> 16)),    \
   ((a)[(n)+1] = (PCRE2_UCHAR)((d) >> 8)), \
   ((a)[(n)+2] = (PCRE2_UCHAR)((d) & 255)))
#define GET(a,n) \
  (((unsigned int)(a)[n] << 16) | ((unsigned int)(a)[(n)+1] << 8) | \
   (unsigned int)(a)[(n)+2])
#define MAX_PATTERN_SIZE (1 << 24)
119 
120 #elif LINK_SIZE == 4
/* 8-bit code units, LINK_SIZE 4: a link occupies four code units, most
significant first. Each code unit is converted to unsigned int before it is
shifted: shifting the int-promoted value left by 24 would overflow a signed int
whenever the top code unit is 0x80 or more. The arguments a and n are evaluated
more than once. */

#define PUT(a,n,d)        \
  (((a)[n] = (PCRE2_UCHAR)((d) >> 24)),     \
   ((a)[(n)+1] = (PCRE2_UCHAR)((d) >> 16)), \
   ((a)[(n)+2] = (PCRE2_UCHAR)((d) >> 8)),  \
   ((a)[(n)+3] = (PCRE2_UCHAR)((d) & 255)))
#define GET(a,n) \
  (((unsigned int)(a)[n] << 24) | ((unsigned int)(a)[(n)+1] << 16) | \
   ((unsigned int)(a)[(n)+2] << 8) | (unsigned int)(a)[(n)+3])
#define MAX_PATTERN_SIZE (1 << 30)   /* Keep it positive */
129 
130 #else
131 #error LINK_SIZE must be 2, 3, or 4
132 #endif
133 
134 
135 /* ------------------- 16-bit support  ------------------ */
136 
137 #elif PCRE2_CODE_UNIT_WIDTH == 16
138 
139 #if LINK_SIZE == 2
/* 16-bit code units with a requested LINK_SIZE of 2: one code unit holds the
whole link, so LINK_SIZE is re-defined as 1 (it now counts code units). The
array argument is parenthesized so that pointer expressions can be passed. */

#undef LINK_SIZE
#define LINK_SIZE 1
#define PUT(a,n,d)   \
  ((a)[n] = (d))
#define GET(a,n) \
  ((a)[n])
#define MAX_PATTERN_SIZE (1 << 16)
147 
148 #elif LINK_SIZE == 3 || LINK_SIZE == 4
/* 16-bit code units with a requested LINK_SIZE of 3 or 4: a link occupies two
code units, most significant first, so LINK_SIZE is re-defined as 2. Each code
unit is converted to unsigned int before shifting: shifting the int-promoted
value left by 16 would overflow a signed int whenever the high code unit is
0x8000 or more. The arguments a and n are evaluated more than once. */

#undef LINK_SIZE
#define LINK_SIZE 2
#define PUT(a,n,d)   \
  (((a)[n] = (PCRE2_UCHAR)((d) >> 16)), \
   ((a)[(n)+1] = (PCRE2_UCHAR)((d) & 65535)))
#define GET(a,n) \
  (((unsigned int)(a)[n] << 16) | (unsigned int)(a)[(n)+1])
#define MAX_PATTERN_SIZE (1 << 30)  /* Keep it positive */
157 
158 #else
159 #error LINK_SIZE must be 2, 3, or 4
160 #endif
161 
162 
163 /* ------------------- 32-bit support  ------------------ */
164 
165 #elif PCRE2_CODE_UNIT_WIDTH == 32
/* 32-bit code units: a link always fits in one code unit, so LINK_SIZE is
forced to 1 whatever value was requested. The array argument is parenthesized
so that pointer expressions can be passed. */

#undef LINK_SIZE
#define LINK_SIZE 1
#define PUT(a,n,d)   \
  ((a)[n] = (d))
#define GET(a,n) \
  ((a)[n])
#define MAX_PATTERN_SIZE (1 << 30)  /* Keep it positive */
173 
174 #else
175 #error Unsupported compiling mode
176 #endif
177 
178 
179 /* --------------- Other mode-specific macros ----------------- */
180 
/* PCRE uses some other (at least) 16-bit quantities that do not change when
the size of offsets changes. These are used for repeat counts and for other
things such as capturing parenthesis numbers in back references.
184 
185 Define the number of code units required to hold a 16-bit count/offset, and
186 macros to load and store such a value. For reasons that I do not understand,
187 the expression in the 8-bit GET2 macro is treated by gcc as a signed
188 expression, even when a is declared as unsigned. It seems that any kind of
189 arithmetic results in a signed value. Hence the cast. */
190 
/* IMM2_SIZE is the number of code units that hold a 16-bit value. GET2 loads
such a value from a[n...] and PUT2 stores one. In 8-bit mode the value is split
over two code units, most significant first. Arguments and whole expansions are
parenthesized so that the macros can be given pointer expressions and can be
used inside larger expressions. The arguments a and n may be evaluated more
than once. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define IMM2_SIZE 2
#define GET2(a,n) (unsigned int)(((a)[n] << 8) | (a)[(n)+1])
#define PUT2(a,n,d) ((a)[n] = (d) >> 8, (a)[(n)+1] = (d) & 255)

#else  /* Code units are 16 or 32 bits */
#define IMM2_SIZE 1
#define GET2(a,n) ((a)[n])
#define PUT2(a,n,d) ((a)[n] = (d))
#endif
201 
202 /* Other macros that are different for 8-bit mode. The MAX_255 macro checks
203 whether its argument is less than 256. The maximum length of a MARK name must
204 fit in one code unit; currently it is set to 255 or 65535. The TABLE_GET macro
205 is used to access elements of tables containing exactly 256 items. When code
206 points can be greater than 255, a check is needed before accessing these
207 tables. */
208 
#if PCRE2_CODE_UNIT_WIDTH == 8

/* Every 8-bit code unit is below 256, so no range test is needed before
indexing a 256-entry table, and a MARK name length must fit in 8 bits. */

#define MAX_MARK 0xffu
#define MAX_255(c) TRUE
#define TABLE_GET(c, table, default) ((table)[c])
#if defined SUPPORT_UNICODE
#define SUPPORT_WIDE_CHARS
#endif  /* SUPPORT_UNICODE */

#else  /* Code units are 16 or 32 bits */

/* Values can exceed 255, so TABLE_GET falls back to "default" instead of
indexing past the end of a 256-entry table. Note that c is evaluated twice. */

#define MAX_MARK 0xffffu
#define MAX_255(c) ((c) < 256u)
#define TABLE_GET(c, table, default) (MAX_255(c)? (table)[c] : (default))
#define SUPPORT_WIDE_CHARS

#endif
223 
224 
225 
226 /* ----------------- Character-handling macros ----------------- */
227 
228 /* There is a proposed future special "UTF-21" mode, in which only the lowest
229 21 bits of a 32-bit character are interpreted as UTF, with the remaining 11
230 high-order bits available to the application for other uses. In preparation for
231 the future implementation of this mode, there are macros that load a data item
232 and, if in this special mode, mask it to 21 bits. These macros all have names
233 starting with UCHAR21. In all other modes, including the normal 32-bit
234 library, the macros all have the same simple definitions. When the new mode is
235 implemented, it is expected that these definitions will be varied appropriately
236 using #ifdef when compiling the library that supports the special mode. */
237 
/* Plain loads of one code unit. The INC variants also advance the pointer
after the load. All four are currently unmasked. */

#define UCHAR21(eptr)        ((eptr)[0])
#define UCHAR21TEST(eptr)    ((eptr)[0])
#define UCHAR21INC(eptr)     (*((eptr)++))
#define UCHAR21INCTEST(eptr) (*((eptr)++))
242 
243 /* When UTF encoding is being used, a character is no longer just a single
244 byte in 8-bit mode or a single short in 16-bit mode. The macros for character
245 handling generate simple sequences when used in the basic mode, and more
246 complicated ones for UTF characters. GETCHARLENTEST and other macros are not
247 used when UTF is not supported. To make sure they can never even appear when
248 UTF support is omitted, we don't even define them. */
249 
250 #ifndef SUPPORT_UNICODE
251 
/* Without Unicode support every character is exactly one code unit, so the
fetch macros are plain loads and PUTCHAR is a plain store that yields 1 (the
number of code units written). The pointer argument is parenthesized so that
an expression such as "ptr + 1" is dereferenced as a whole. Each fetch macro
expands to a complete statement, including its terminating semicolon.

The following macros are deliberately NOT defined in this configuration, so
that any accidental use is a compile-time error:

  MAX_UTF_SINGLE_CU, HAS_EXTRALEN(c), GET_EXTRALEN(c), NOT_FIRSTCU(c),
  GETCHARLENTEST(c, eptr, len), BACKCHAR(eptr), FORWARDCHAR(eptr),
  FORWARDCHARTEST(eptr,end), ACROSSCHAR(condition, eptr, action) */

#define GETCHAR(c, eptr) c = *(eptr);
#define GETCHARTEST(c, eptr) c = *(eptr);
#define GETCHARINC(c, eptr) c = *(eptr)++;
#define GETCHARINCTEST(c, eptr) c = *(eptr)++;
#define GETCHARLEN(c, eptr, len) c = *(eptr);
#define PUTCHAR(c, p) (*(p) = (c), 1)
267 
268 #else   /* SUPPORT_UNICODE */
269 
270 /* ------------------- 8-bit support  ------------------ */
271 
272 #if PCRE2_CODE_UNIT_WIDTH == 8
#define MAYBE_UTF_MULTI          /* UTF chars may use multiple code units */

/* The largest UTF code point that can be encoded as a single code unit. */

#define MAX_UTF_SINGLE_CU 0x7f

/* Returns TRUE, if the given value is not the first code unit of a UTF
sequence, that is, if its top two bits are 10. */

#define NOT_FIRSTCU(c) (((c) & 0xc0u) == 0x80u)

/* Tests whether the code point needs extra characters to decode. */

#define HAS_EXTRALEN(c) HASUTF8EXTRALEN(c)

/* Yields the number of additional code units, taken from a private table that
is indexed by the low six bits of c. The result is meaningful only for a value
that needs extra code units; for any other value the behaviour is undefined.
(The original comment names this condition IS_MULTICHAR(c); no macro of that
name is defined in this file, whereas HAS_EXTRALEN(c) is.) */

#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3fu])
292 
/* The six macros below fetch the character at eptr into c. A first code unit
of 0xc0 or more starts a multi-unit sequence, which is decoded by the GETUTF8
family of macros (defined elsewhere). Each macro expands to complete statements
including the final semicolon. The pointer argument is parenthesized wherever
it is used so that an expression such as "ptr + 1" is handled as a whole; for
the INC variants it must be a modifiable lvalue. The TEST variants consult a
variable called "utf" that must be in scope at the point of use.

Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */

#define GETCHAR(c, eptr) \
  c = *(eptr); \
  if (c >= 0xc0u) GETUTF8(c, (eptr));

/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
pointer. */

#define GETCHARTEST(c, eptr) \
  c = *(eptr); \
  if (utf && c >= 0xc0u) GETUTF8(c, (eptr));

/* Get the next UTF-8 character, advancing the pointer. This is called when we
know we are in UTF-8 mode. */

#define GETCHARINC(c, eptr) \
  c = *(eptr)++; \
  if (c >= 0xc0u) GETUTF8INC(c, (eptr));

/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-8 mode. */

#define GETCHARINCTEST(c, eptr) \
  c = *(eptr)++; \
  if (utf && c >= 0xc0u) GETUTF8INC(c, (eptr));

/* Get the next UTF-8 character, not advancing the pointer, incrementing length
if there are extra bytes. This is called when we know we are in UTF-8 mode. */

#define GETCHARLEN(c, eptr, len) \
  c = *(eptr); \
  if (c >= 0xc0u) GETUTF8LEN(c, (eptr), len);

/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
pointer, incrementing length if there are extra bytes. This is called when we
do not know if we are in UTF-8 mode. */

#define GETCHARLENTEST(c, eptr, len) \
  c = *(eptr); \
  if (utf && c >= 0xc0u) GETUTF8LEN(c, (eptr), len);
335 
/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-8 mode - we don't put a test within the macro
because almost all calls are already within a block of UTF-8 only code.

The pointer argument must be a modifiable lvalue. It is parenthesized at every
use so that an lvalue expression such as "*pp" is both dereferenced and
stepped correctly (without the parentheses, "*pp--" would step pp itself). */

#define BACKCHAR(eptr) while((*(eptr) & 0xc0u) == 0x80u) (eptr)--

/* Same as above, just in the other direction. FORWARDCHARTEST additionally
stops when the pointer reaches "end". */
#define FORWARDCHAR(eptr) while((*(eptr) & 0xc0u) == 0x80u) (eptr)++
#define FORWARDCHARTEST(eptr,end) \
  while((eptr) < (end) && (*(eptr) & 0xc0u) == 0x80u) (eptr)++
345 
/* A customizable form of the pointer-stepping macros above: while "condition"
holds and the code unit that eptr points to is a UTF-8 continuation unit (top
two bits 10), "action" is executed. The test must be applied to the code unit
*eptr, not to the pointer value itself. "action" is supplied by the caller and
is expected to move eptr; a terminating semicolon follows the macro call. */
#define ACROSSCHAR(condition, eptr, action) \
  while((condition) && (*(eptr) & 0xc0u) == 0x80u) action
349 
/* Deposit a character into memory, returning the number of code units. When
"utf" (a variable that must be in scope) is set and c is greater than
MAX_UTF_SINGLE_CU, the private ord2utf function does the encoding; otherwise c
is stored as a single code unit. The arguments are parenthesized so that an
expression such as "a | b" is compared and stored as a whole. c and p may be
evaluated more than once. */

#define PUTCHAR(c, p) ((utf && (c) > MAX_UTF_SINGLE_CU)? \
  PRIV(ord2utf)(c,p) : (*(p) = (c), 1))
354 
355 
356 /* ------------------- 16-bit support  ------------------ */
357 
358 #elif PCRE2_CODE_UNIT_WIDTH == 16
#define MAYBE_UTF_MULTI          /* UTF chars may use multiple code units */

/* The largest UTF code point that can be encoded as a single code unit. */

#define MAX_UTF_SINGLE_CU 65535

/* Tests whether the code point needs extra characters to decode: true for a
code unit in the range 0xd800-0xdbff. */

#define HAS_EXTRALEN(c) (((c) & 0xfc00u) == 0xd800u)

/* Returns TRUE, if the given value is not the first code unit of a UTF
sequence: true for a code unit in the range 0xdc00-0xdfff. */

#define NOT_FIRSTCU(c) (((c) & 0xfc00u) == 0xdc00u)

/* Yields the number of additional code units, which is always 1 here. The
result is meaningful only when HAS_EXTRALEN(c) is true. (The original comment
names this condition IS_MULTICHAR(c); no macro of that name is defined in this
file.) */

#define GET_EXTRALEN(c) 1
378 
/* Character fetching for UTF-16. A code unit in the range 0xd800-0xdbff is
the first half of a surrogate pair; its low ten bits and the low ten bits of
the following code unit are combined, and 0x10000 is added, to give the code
point. Each GETCHARxxx macro expands to complete statements including the
final semicolon. The pointer argument is parenthesized wherever it is used so
that an expression such as "ptr + 1" is handled as a whole; for the INC
variants it must be a modifiable lvalue. The TEST variants consult a variable
called "utf" that must be in scope at the point of use.

Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer. */

#define GETUTF16(c, eptr) \
   { c = ((((c) & 0x3ffu) << 10) | ((eptr)[1] & 0x3ffu)) + 0x10000u; }

/* Get the next UTF-16 character, not advancing the pointer. This is called when
we know we are in UTF-16 mode. */

#define GETCHAR(c, eptr) \
  c = *(eptr); \
  if ((c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);

/* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the
pointer. */

#define GETCHARTEST(c, eptr) \
  c = *(eptr); \
  if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);

/* Base macro to pick up the low surrogate of a UTF-16 character, advancing
the pointer. */

#define GETUTF16INC(c, eptr) \
   { c = ((((c) & 0x3ffu) << 10) | (*(eptr)++ & 0x3ffu)) + 0x10000u; }

/* Get the next UTF-16 character, advancing the pointer. This is called when we
know we are in UTF-16 mode. */

#define GETCHARINC(c, eptr) \
  c = *(eptr)++; \
  if ((c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);

/* Get the next character, testing for UTF-16 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-16 mode. */

#define GETCHARINCTEST(c, eptr) \
  c = *(eptr)++; \
  if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);

/* Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer, incrementing the length. */

#define GETUTF16LEN(c, eptr, len) \
   { c = ((((c) & 0x3ffu) << 10) | ((eptr)[1] & 0x3ffu)) + 0x10000u; (len)++; }

/* Get the next UTF-16 character, not advancing the pointer, incrementing
length if there is a low surrogate. This is called when we know we are in
UTF-16 mode. */

#define GETCHARLEN(c, eptr, len) \
  c = *(eptr); \
  if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);

/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the
pointer, incrementing length if there is a low surrogate. This is called when
we do not know if we are in UTF-16 mode. */

#define GETCHARLENTEST(c, eptr, len) \
  c = *(eptr); \
  if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);
440 
/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-16 mode - we don't put a test within the
macro because almost all calls are already within a block of UTF-16 only
code.

In UTF-16 at most one step is needed, so these macros are a single test
rather than a loop. Each one is wrapped in do { } while (0) so that it is a
single statement: a bare "if" would capture the "else" of an enclosing
"if (x) BACKCHAR(p); else ...". The 32-bit definitions of these macros are
also do { } while (0) statements, so callers already have to use them as
statements followed by a semicolon. The pointer argument must be a modifiable
lvalue and is parenthesized at every use. */

#define BACKCHAR(eptr) \
  do { if ((*(eptr) & 0xfc00u) == 0xdc00u) (eptr)--; } while (0)

/* Same as above, just in the other direction. FORWARDCHARTEST additionally
does nothing when the pointer has reached "end". */
#define FORWARDCHAR(eptr) \
  do { if ((*(eptr) & 0xfc00u) == 0xdc00u) (eptr)++; } while (0)
#define FORWARDCHARTEST(eptr,end) \
  do { if ((eptr) < (end) && (*(eptr) & 0xfc00u) == 0xdc00u) (eptr)++; } while (0)
451 
/* A customizable form of the pointer-stepping macros above: if "condition"
holds and the code unit that eptr points to is in the range 0xdc00-0xdfff (the
second half of a surrogate pair), "action" is executed. The test must be
applied to the code unit *eptr, not to the pointer value itself. "action" is
supplied by the caller; a terminating semicolon follows the macro call. */
#define ACROSSCHAR(condition, eptr, action) \
  if ((condition) && (*(eptr) & 0xfc00u) == 0xdc00u) action
455 
/* Deposit a character into memory, returning the number of code units. When
"utf" (a variable that must be in scope) is set and c is greater than
MAX_UTF_SINGLE_CU, the private ord2utf function does the encoding; otherwise c
is stored as a single code unit. The arguments are parenthesized so that an
expression such as "a | b" is compared and stored as a whole. c and p may be
evaluated more than once. */

#define PUTCHAR(c, p) ((utf && (c) > MAX_UTF_SINGLE_CU)? \
  PRIV(ord2utf)(c,p) : (*(p) = (c), 1))
460 
461 
462 /* ------------------- 32-bit support  ------------------ */
463 
464 #else
465 
/* These are trivial for the 32-bit library, since all UTF-32 characters fit
into one PCRE2_UCHAR unit. */

#define MAX_UTF_SINGLE_CU (0x10ffffu)
#define HAS_EXTRALEN(c) (0)
#define GET_EXTRALEN(c) (0)
#define NOT_FIRSTCU(c) (0)

/* Get the next UTF-32 character, not advancing the pointer. This is called when
we know we are in UTF-32 mode. */

#define GETCHAR(c, eptr) \
  c = *(eptr);

/* Get the next UTF-32 character, testing for UTF-32 mode, and not advancing the
pointer. */

#define GETCHARTEST(c, eptr) \
  c = *(eptr);

/* Get the next UTF-32 character, advancing the pointer. This is called when we
know we are in UTF-32 mode. */

#define GETCHARINC(c, eptr) \
  c = *((eptr)++);

/* Get the next character, testing for UTF-32 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-32 mode. */

#define GETCHARINCTEST(c, eptr) \
  c = *((eptr)++);

/* Get the next UTF-32 character, not advancing the pointer, not incrementing
length (since all UTF-32 is of length 1). This is called when we know we are in
UTF-32 mode. */

#define GETCHARLEN(c, eptr, len) \
  GETCHAR(c, eptr)

/* Get the next UTF-32 character, testing for UTF-32 mode, not advancing the
pointer, not incrementing the length (since all UTF-32 is of length 1).
This is called when we do not know if we are in UTF-32 mode. */

#define GETCHARLENTEST(c, eptr, len) \
  GETCHARTEST(c, eptr)

/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-32 mode - we don't put a test within the
macro because almost all calls are already within a block of UTF-32 only
code.

These are all no-ops since all UTF-32 characters fit into one PCRE2_UCHAR. */

#define BACKCHAR(eptr) do { } while (0)

/* Same as above, just in the other direction. */

#define FORWARDCHAR(eptr) do { } while (0)
#define FORWARDCHARTEST(eptr,end) do { } while (0)

/* Same as above, but it allows a fully customizable form. Neither the
condition nor the action is evaluated. */

#define ACROSSCHAR(condition, eptr, action) do { } while (0)

/* Deposit a character into memory, returning the number of code units, which
is always 1. The arguments are parenthesized, as in the GETCHAR macros above,
so that a pointer expression such as "p + 1" is dereferenced as a whole. */

#define PUTCHAR(c, p) (*(p) = (c), 1)
533 
534 #endif  /* UTF-32 character handling */
535 #endif  /* SUPPORT_UNICODE */
536 
537 
538 /* Mode-dependent macros that have the same definition in all modes. */
539 
/* CU2BYTES converts a count of code units into bytes and BYTES2CU does the
reverse (using integer division). PUTINC and PUT2INC store a value with PUT or
PUT2 and then advance the pointer a past it. Both are wrapped in parentheses
so that each behaves as a single expression whose value is the advanced
pointer; a must be a modifiable lvalue. */

#define CU2BYTES(x)     ((x)*((PCRE2_CODE_UNIT_WIDTH/8)))
#define BYTES2CU(x)     ((x)/((PCRE2_CODE_UNIT_WIDTH/8)))
#define PUTINC(a,n,d)   (PUT(a,n,d), (a) += LINK_SIZE)
#define PUT2INC(a,n,d)  (PUT2(a,n,d), (a) += IMM2_SIZE)
544 
545 
546 /* ----------------------- HIDDEN STRUCTURES ----------------------------- */
547 
548 /* NOTE: All these structures *must* start with a pcre2_memctl structure. The
549 code that uses them is simpler because it assumes this. */
550 
551 /* The real general context structure. At present it holds only data for custom
552 memory control. */
553 
554 typedef struct pcre2_real_general_context {
555   pcre2_memctl memctl;
556 } pcre2_real_general_context;
557 
558 /* The real compile context structure */
559 
560 typedef struct pcre2_real_compile_context {
561   pcre2_memctl memctl;
562   int (*stack_guard)(uint32_t, void *);
563   void *stack_guard_data;
564   const uint8_t *tables;
565   PCRE2_SIZE max_pattern_length;
566   uint16_t bsr_convention;
567   uint16_t newline_convention;
568   uint32_t parens_nest_limit;
569 } pcre2_real_compile_context;
570 
571 /* The real match context structure. */
572 
573 typedef struct pcre2_real_match_context {
574   pcre2_memctl memctl;
575 #ifdef HEAP_MATCH_RECURSE
576   pcre2_memctl stack_memctl;
577 #endif
578 #ifdef SUPPORT_JIT
579   pcre2_jit_callback jit_callback;
580   void *jit_callback_data;
581 #endif
582   int    (*callout)(pcre2_callout_block *, void *);
583   void    *callout_data;
584   PCRE2_SIZE offset_limit;
585   uint32_t match_limit;
586   uint32_t recursion_limit;
587 } pcre2_real_match_context;
588 
589 /* The real compiled code structure. The type for the blocksize field is
590 defined specially because it is required in pcre2_serialize_decode() when
591 copying the size from possibly unaligned memory into a variable of the same
592 type. Use a macro rather than a typedef to avoid compiler warnings when this
593 file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the
594 largest lookbehind that is supported. (OP_REVERSE in a pattern has a 16-bit
595 argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field
596 here.) */
597 
598 #undef  CODE_BLOCKSIZE_TYPE
599 #define CODE_BLOCKSIZE_TYPE size_t
600 
601 #undef  LOOKBEHIND_MAX
602 #define LOOKBEHIND_MAX UINT16_MAX
603 
604 typedef struct pcre2_real_code {
605   pcre2_memctl memctl;            /* Memory control fields */
606   const uint8_t *tables;          /* The character tables */
607   void    *executable_jit;        /* Pointer to JIT code */
608   uint8_t  start_bitmap[32];      /* Bitmap for starting code unit < 256 */
609   CODE_BLOCKSIZE_TYPE blocksize;  /* Total (bytes) that was malloc-ed */
610   uint32_t magic_number;          /* Paranoid and endianness check */
611   uint32_t compile_options;       /* Options passed to pcre2_compile() */
612   uint32_t overall_options;       /* Options after processing the pattern */
613   uint32_t flags;                 /* Various state flags */
614   uint32_t limit_match;           /* Limit set in the pattern */
615   uint32_t limit_recursion;       /* Limit set in the pattern */
616   uint32_t first_codeunit;        /* Starting code unit */
617   uint32_t last_codeunit;         /* This codeunit must be seen */
618   uint16_t bsr_convention;        /* What \R matches */
619   uint16_t newline_convention;    /* What is a newline? */
620   uint16_t max_lookbehind;        /* Longest lookbehind (characters) */
621   uint16_t minlength;             /* Minimum length of match */
622   uint16_t top_bracket;           /* Highest numbered group */
623   uint16_t top_backref;           /* Highest numbered back reference */
624   uint16_t name_entry_size;       /* Size (code units) of table entries */
625   uint16_t name_count;            /* Number of name entries in the table */
626 } pcre2_real_code;
627 
628 /* The real match data structure. */
629 
630 typedef struct pcre2_real_match_data {
631   pcre2_memctl     memctl;
632   const pcre2_real_code *code;    /* The pattern used for the match */
633   PCRE2_SPTR       subject;       /* The subject that was matched */
634   PCRE2_SPTR       mark;          /* Pointer to last mark */
635   PCRE2_SIZE       leftchar;      /* Offset to leftmost code unit */
636   PCRE2_SIZE       rightchar;     /* Offset to rightmost code unit */
637   PCRE2_SIZE       startchar;     /* Offset to starting code unit */
638   uint16_t         matchedby;     /* Type of match (normal, JIT, DFA) */
639   uint16_t         oveccount;     /* Number of pairs */
640   int              rc;            /* The return code from the match */
641   PCRE2_SIZE       ovector[1];    /* The first field */
642 } pcre2_real_match_data;
643 
644 
645 /* ----------------------- PRIVATE STRUCTURES ----------------------------- */
646 
647 /* These structures are not needed for pcre2test. */
648 
649 #ifndef PCRE2_PCRE2TEST
650 
651 /* Structure for checking for mutual recursion when scanning compiled code. */
652 
653 typedef struct recurse_check {
654   struct recurse_check *prev;
655   PCRE2_SPTR group;
656 } recurse_check;
657 
658 /* Structure for building a cache when filling in recursion offsets. */
659 
660 typedef struct recurse_cache {
661   PCRE2_SPTR group;
662   int recno;
663 } recurse_cache;
664 
665 /* Structure for maintaining a chain of pointers to the currently incomplete
666 branches, for testing for left recursion while compiling. */
667 
668 typedef struct branch_chain {
669   struct branch_chain *outer;
670   PCRE2_UCHAR *current_branch;
671 } branch_chain;
672 
673 /* Structure for building a list of named groups during the first pass of
674 compiling. */
675 
676 typedef struct named_group {
677   PCRE2_SPTR   name;          /* Points to the name in the pattern */
678   uint32_t     number;        /* Group number */
679   uint16_t     length;        /* Length of the name */
680   uint16_t     isdup;         /* TRUE if a duplicate */
681 } named_group;
682 
683 /* Structure for passing "static" information around between the functions
684 doing the compiling, so that they are thread-safe. */
685 
686 typedef struct compile_block {
687   pcre2_real_compile_context *cx;  /* Points to the compile context */
688   const uint8_t *lcc;              /* Points to lower casing table */
689   const uint8_t *fcc;              /* Points to case-flipping table */
690   const uint8_t *cbits;            /* Points to character type table */
691   const uint8_t *ctypes;           /* Points to table of type maps */
692   PCRE2_SPTR start_workspace;      /* The start of working space */
693   PCRE2_SPTR start_code;           /* The start of the compiled code */
694   PCRE2_SPTR start_pattern;        /* The start of the pattern */
695   PCRE2_SPTR end_pattern;          /* The end of the pattern */
696   PCRE2_SPTR nestptr[2];           /* Pointer(s) saved for string substitution */
697   PCRE2_UCHAR *name_table;         /* The name/number table */
698   size_t workspace_size;           /* Size of workspace */
699   uint16_t names_found;            /* Number of entries so far */
700   uint16_t name_entry_size;        /* Size of each entry */
701   open_capitem *open_caps;         /* Chain of open capture items */
702   named_group *named_groups;       /* Points to vector in pre-compile */
703   uint32_t named_group_list_size;  /* Number of entries in the list */
704   uint32_t external_options;       /* External (initial) options */
705   uint32_t external_flags;         /* External flag bits to be set */
706   uint32_t bracount;               /* Count of capturing parens as we compile */
707   uint32_t final_bracount;         /* Saved value after first pass */
708   uint32_t *groupinfo;             /* Group info vector */
709   uint32_t top_backref;            /* Maximum back reference */
710   uint32_t backref_map;            /* Bitmap of low back refs */
711   uint32_t nltype;                 /* Newline type */
712   uint32_t nllen;                  /* Newline string length */
713   PCRE2_UCHAR nl[4];               /* Newline string when fixed length */
714   int  max_lookbehind;             /* Maximum lookbehind (characters) */
715   int  parens_depth;               /* Depth of nested parentheses */
716   int  assert_depth;               /* Depth of nested assertions */
717   int  req_varyopt;                /* "After variable item" flag for reqbyte */
718   BOOL had_accept;                 /* (*ACCEPT) encountered */
719   BOOL had_pruneorskip;            /* (*PRUNE) or (*SKIP) encountered */
720   BOOL had_recurse;                /* Had a recursion or subroutine call */
721   BOOL check_lookbehind;           /* Lookbehinds need later checking */
722   BOOL dupnames;                   /* Duplicate names exist */
723   BOOL iscondassert;               /* Next assert is a condition */
724 } compile_block;
725 
726 /* Structure for keeping the properties of the in-memory stack used
727 by the JIT matcher. */
728 
729 typedef struct pcre2_real_jit_stack {
730   pcre2_memctl memctl;
731   void* stack;
732 } pcre2_real_jit_stack;
733 
734 /* Structure for keeping a chain of heap blocks used for saving ovectors
735 during pattern recursion when the ovector is larger than can be saved on
736 the system stack. */
737 
738 typedef struct ovecsave_frame {
739   struct ovecsave_frame *next;     /* Next frame on free chain */
740   PCRE2_SIZE saved_ovec[1];        /* First vector element */
741 } ovecsave_frame;
742 
743 /* Structure for items in a linked list that represents an explicit recursive
744 call within the pattern; used by pcre_match(). */
745 
746 typedef struct recursion_info {
747   struct recursion_info *prevrec;  /* Previous recursion record (or NULL) */
748   unsigned int group_num;          /* Number of group that was called */
749   PCRE2_SIZE *ovec_save;           /* Pointer to saved ovector frame */
750   uint32_t saved_capture_last;     /* Last capture number */
751   PCRE2_SPTR subject_position;     /* Position at start of recursion */
752 } recursion_info;
753 
754 /* A similar structure for pcre_dfa_match(). */
755 
756 typedef struct dfa_recursion_info {
757   struct dfa_recursion_info *prevrec;
758   PCRE2_SPTR subject_position;
759   uint32_t group_num;
760 } dfa_recursion_info;
761 
762 /* Structure for building a chain of data for holding the values of the subject
763 pointer at the start of each subpattern, so as to detect when an empty string
764 has been matched by a subpattern - to break infinite loops; used by
765 pcre2_match(). */
766 
767 typedef struct eptrblock {
768   struct eptrblock *epb_prev;
769   PCRE2_SPTR epb_saved_eptr;
770 } eptrblock;
771 
772 /* Structure for passing "static" information around between the functions
773 doing traditional NFA matching (pcre2_match() and friends). */
774 
775 typedef struct match_block {
776   pcre2_memctl memctl;            /* For general use */
777 #ifdef HEAP_MATCH_RECURSE
778   pcre2_memctl stack_memctl;      /* For "stack" frames */
779 #endif
780   uint32_t match_call_count;      /* As it says */
781   uint32_t match_limit;           /* As it says */
782   uint32_t match_limit_recursion; /* As it says */
783   BOOL hitend;                    /* Hit the end of the subject at some point */
784   BOOL hasthen;                   /* Pattern contains (*THEN) */
785   const uint8_t *lcc;             /* Points to lower casing table */
786   const uint8_t *fcc;             /* Points to case-flipping table */
787   const uint8_t *ctypes;          /* Points to table of type maps */
788   PCRE2_SIZE *ovector;            /* Pointer to the offset vector */
789   PCRE2_SIZE offset_end;          /* One past the end */
790   PCRE2_SIZE offset_max;          /* The maximum usable for return data */
791   PCRE2_SIZE start_offset;        /* The start offset value */
792   PCRE2_SIZE end_offset_top;      /* Highwater mark at end of match */
793   uint16_t partial;               /* PARTIAL options */
794   uint16_t bsr_convention;        /* \R interpretation */
795   uint16_t name_count;            /* Number of names in name table */
796   uint16_t name_entry_size;       /* Size of entry in names table */
797   PCRE2_SPTR name_table;          /* Table of group names */
798   PCRE2_SPTR start_code;          /* For use when recursing */
799   PCRE2_SPTR start_subject;       /* Start of the subject string */
800   PCRE2_SPTR end_subject;         /* End of the subject string */
801   PCRE2_SPTR start_match_ptr;     /* Start of matched string */
802   PCRE2_SPTR end_match_ptr;       /* Subject position at end match */
803   PCRE2_SPTR start_used_ptr;      /* Earliest consulted character */
804   PCRE2_SPTR last_used_ptr;       /* Latest consulted character */
805   PCRE2_SPTR mark;                /* Mark pointer to pass back on success */
806   PCRE2_SPTR nomatch_mark;        /* Mark pointer to pass back on failure */
807   PCRE2_SPTR once_target;         /* Where to back up to for atomic groups */
808   uint32_t moptions;              /* Match options */
809   uint32_t poptions;              /* Pattern options */
810   uint32_t capture_last;          /* Most recent capture number + overflow flag */
811   uint32_t skip_arg_count;        /* For counting SKIP_ARGs */
812   uint32_t ignore_skip_arg;       /* For re-run when SKIP arg name not found */
813   uint32_t match_function_type;   /* Set for certain special calls of match() */
814   uint32_t nltype;                /* Newline type */
815   uint32_t nllen;                 /* Newline string length */
816   PCRE2_UCHAR nl[4];              /* Newline string when fixed */
817   eptrblock *eptrchain;           /* Chain of eptrblocks for tail recursions */
818   recursion_info *recursive;      /* Linked list of recursion data */
819   ovecsave_frame *ovecsave_chain; /* Linked list of free ovecsave blocks */
820   void  *callout_data;            /* To pass back to callouts */
821   int (*callout)(pcre2_callout_block *,void *);  /* Callout function or NULL */
822 #ifdef HEAP_MATCH_RECURSE
823   void  *match_frames_base;       /* For remembering malloc'd frames */
824 #endif
825 } match_block;
826 
827 /* A similar structure is used for the same purpose by the DFA matching
828 functions. */
829 
830 typedef struct dfa_match_block {
831   pcre2_memctl memctl;            /* For general use */
832   PCRE2_SPTR start_code;          /* Start of the compiled pattern */
833   PCRE2_SPTR start_subject ;      /* Start of the subject string */
834   PCRE2_SPTR end_subject;         /* End of subject string */
835   PCRE2_SPTR start_used_ptr;      /* Earliest consulted character */
836   PCRE2_SPTR last_used_ptr;       /* Latest consulted character */
837   const uint8_t *tables;          /* Character tables */
838   PCRE2_SIZE start_offset;        /* The start offset value */
839   uint32_t moptions;              /* Match options */
840   uint32_t poptions;              /* Pattern options */
841   uint32_t nltype;                /* Newline type */
842   uint32_t nllen;                 /* Newline string length */
843   PCRE2_UCHAR nl[4];              /* Newline string when fixed */
844   uint16_t bsr_convention;        /* \R interpretation */
845   void *callout_data;             /* To pass back to callouts */
846   int (*callout)(pcre2_callout_block *,void *);  /* Callout function or NULL */
847   dfa_recursion_info *recursive;  /* Linked list of recursion data */
848 } dfa_match_block;
849 
850 #endif  /* PCRE2_PCRE2TEST */
851 
852 /* End of pcre2_intmodedep.h */
853