1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2014 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
/* The three names below are defined ahead of the inclusion of pcre_internal.h
so that they are already in place when that header is read. NOTE(review):
presumably generic macros in the header expand them to reach this module's
match data block and its subject start/end fields - confirm in the header. */

#define NLBLOCK md /* Block containing newline information */
#define PSSTART start_subject /* Field containing processed string start */
#define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
/* Undefine some potentially clashing cpp symbols. Both names are used further
down in this file as identifiers inside match(): as ordinary local variables in
the recursive build, and as macros that map onto heap frame fields when
NO_RECURSE is defined. */

#undef min
#undef max
58
/* The md->capture_last field uses the lower 16 bits for the last captured
substring (which can never be greater than 65535) and a bit in the top half
to mean "capture vector overflowed". This odd way of doing things was
implemented when it was realized that preserving and restoring the overflow bit
whenever the last capture number was saved/restored made for a neater
interface, and doing it this way saved on (a) another variable, which would
have increased the stack frame size (a big NO-NO in PCRE) and (b) another
separate set of save/restore instructions. The following defines are used in
implementing this.

CAPLMASK and OVFLMASK are complementary halves of a 32-bit value, and OVFLBIT
is the lowest bit of the OVFLMASK half. */

#define CAPLMASK 0x0000ffff /* The bits used for last_capture */
#define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
#define OVFLBIT 0x00010000 /* The bit that is set for overflow */
72
/* Values for setting in md->match_function_type to indicate two special types
of call to match(). We do it this way to save on using another stack variable,
as stack usage is to be discouraged. match() sets the field back to zero once
it has acted on MATCH_CBEGROUP, so zero is the "nothing special" state. */

#define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
#define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
79
/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH 1
#define MATCH_NOMATCH 0

/* Special internal returns from the match() function. Make them sufficiently
negative to avoid the external error codes. */

#define MATCH_ACCEPT (-999)
#define MATCH_KETRPOS (-998)
#define MATCH_ONCE (-997)

/* The next 5 must be kept together and in sequence so that a test that checks
for any one of them can use a range. MATCH_BACKTRACK_MIN and
MATCH_BACKTRACK_MAX name the two ends of that range (-996 to -992); if a value
is ever added at either end, the corresponding alias must be moved with it. */

#define MATCH_COMMIT (-996)
#define MATCH_PRUNE (-995)
#define MATCH_SKIP (-994)
#define MATCH_SKIP_ARG (-993)
#define MATCH_THEN (-992)
#define MATCH_BACKTRACK_MAX MATCH_THEN
#define MATCH_BACKTRACK_MIN MATCH_COMMIT
101
/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
because the offset vector is always a multiple of 3 long. The stacksave[]
arrays declared for match() further down (both the heap frame field and the
ordinary local) are dimensioned with this value. */

#define REC_STACK_SAVE_MAX 30
107
/* Minimum and maximum repeat counts for the common repeats, held as two
parallel tables of 11 entries each. In rep_max, a value of 0 stands for
"no upper limit" (infinity). */

static const char rep_min[] = {
  0, 0,     /* [0], [1] */
  1, 1,     /* [2], [3] */
  0, 0,     /* [4], [5] */
  0, 0,     /* [6], [7] */
  0,        /* [8] */
  1,        /* [9] */
  0         /* [10] */
};

static const char rep_max[] = {
  0, 0,     /* [0], [1] */
  0, 0,     /* [2], [3] */
  1, 1,     /* [4], [5] */
  0, 0,     /* [6], [7] */
  0,        /* [8] */
  0,        /* [9] */
  1         /* [10] */
};
112
113 #ifdef PCRE_DEBUG
114 /*************************************************
115 * Debugging function to print chars *
116 *************************************************/
117
118 /* Print a sequence of chars in printable format, stopping at the end of the
119 subject if the requested.
120
121 Arguments:
122 p points to characters
123 length number to print
124 is_subject TRUE if printing from within md->start_subject
125 md pointer to matching data block, if is_subject is TRUE
126
127 Returns: nothing
128 */
129
130 static void
pchars(const pcre_uchar * p,int length,BOOL is_subject,match_data * md)131 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
132 {
133 pcre_uint32 c;
134 BOOL utf = md->utf;
135 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
136 while (length-- > 0)
137 if (isprint(c = UCHAR21INCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
138 }
139 #endif
140
141
142
143 /*************************************************
144 * Match a back-reference *
145 *************************************************/
146
147 /* Normally, if a back reference hasn't been set, the length that is passed is
148 negative, so the match always fails. However, in JavaScript compatibility mode,
149 the length passed is zero. Note that in caseless UTF-8 mode, the number of
150 subject bytes matched may be different to the number of reference bytes.
151
152 Arguments:
153 offset index into the offset vector
154 eptr pointer into the subject
155 length length of reference to be matched (number of bytes)
156 md points to match data block
157 caseless TRUE if caseless
158
159 Returns: >= 0 the number of subject bytes matched
160 -1 no match
161 -2 partial match; always given if at end subject
162 */
163
164 static int
match_ref(int offset,register PCRE_PUCHAR eptr,int length,match_data * md,BOOL caseless)165 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
166 BOOL caseless)
167 {
168 PCRE_PUCHAR eptr_start = eptr;
169 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
170 #if defined SUPPORT_UTF && defined SUPPORT_UCP
171 BOOL utf = md->utf;
172 #endif
173
174 #ifdef PCRE_DEBUG
175 if (eptr >= md->end_subject)
176 printf("matching subject <null>");
177 else
178 {
179 printf("matching subject ");
180 pchars(eptr, length, TRUE, md);
181 }
182 printf(" against backref ");
183 pchars(p, length, FALSE, md);
184 printf("\n");
185 #endif
186
187 /* Always fail if reference not set (and not JavaScript compatible - in that
188 case the length is passed as zero). */
189
190 if (length < 0) return -1;
191
192 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
193 properly if Unicode properties are supported. Otherwise, we can check only
194 ASCII characters. */
195
196 if (caseless)
197 {
198 #if defined SUPPORT_UTF && defined SUPPORT_UCP
199 if (utf)
200 {
201 /* Match characters up to the end of the reference. NOTE: the number of
202 data units matched may differ, because in UTF-8 there are some characters
203 whose upper and lower case versions code have different numbers of bytes.
204 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
205 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
206 sequence of two of the latter. It is important, therefore, to check the
207 length along the reference, not along the subject (earlier code did this
208 wrong). */
209
210 PCRE_PUCHAR endptr = p + length;
211 while (p < endptr)
212 {
213 pcre_uint32 c, d;
214 const ucd_record *ur;
215 if (eptr >= md->end_subject) return -2; /* Partial match */
216 GETCHARINC(c, eptr);
217 GETCHARINC(d, p);
218 ur = GET_UCD(d);
219 if (c != d && c != d + ur->other_case)
220 {
221 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
222 for (;;)
223 {
224 if (c < *pp) return -1;
225 if (c == *pp++) break;
226 }
227 }
228 }
229 }
230 else
231 #endif
232
233 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
234 is no UCP support. */
235 {
236 while (length-- > 0)
237 {
238 pcre_uint32 cc, cp;
239 if (eptr >= md->end_subject) return -2; /* Partial match */
240 cc = UCHAR21TEST(eptr);
241 cp = UCHAR21TEST(p);
242 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
243 p++;
244 eptr++;
245 }
246 }
247 }
248
249 /* In the caseful case, we can just compare the bytes, whether or not we
250 are in UTF-8 mode. */
251
252 else
253 {
254 while (length-- > 0)
255 {
256 if (eptr >= md->end_subject) return -2; /* Partial match */
257 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1;
258 }
259 }
260
261 return (int)(eptr - eptr_start);
262 }
263
264
265
266 /***************************************************************************
267 ****************************************************************************
268 RECURSION IN THE match() FUNCTION
269
270 The match() function is highly recursive, though not every recursive call
271 increases the recursive depth. Nevertheless, some regular expressions can cause
272 it to recurse to a great depth. I was writing for Unix, so I just let it call
273 itself recursively. This uses the stack for saving everything that has to be
274 saved for a recursive call. On Unix, the stack can be large, and this works
275 fine.
276
277 It turns out that on some non-Unix-like systems there are problems with
278 programs that use a lot of stack. (This despite the fact that every last chip
279 has oodles of memory these days, and techniques for extending the stack have
280 been known for decades.) So....
281
There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
calls by keeping local variables that need to be preserved in blocks of memory
obtained from malloc() instead of on the stack. Macros are used to achieve this
so that the actual code doesn't look very different to what it always used to.
287
288 The original heap-recursive code used longjmp(). However, it seems that this
289 can be very slow on some operating systems. Following a suggestion from Stan
290 Switzer, the use of longjmp() has been abolished, at the cost of having to
291 provide a unique number for each call to RMATCH. There is no way of generating
292 a sequence of numbers at compile time in C. I have given them names, to make
293 them stand out more clearly.
294
295 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
296 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
297 tests. Furthermore, not using longjmp() means that local dynamic variables
298 don't have indeterminate values; this has meant that the frame size can be
299 reduced because the result can be "passed back" by straight setting of the
300 variable instead of being passed in the frame.
301 ****************************************************************************
302 ***************************************************************************/
303
/* Identifying numbers for the individual RMATCH calls, running consecutively
from 1. When this list is changed, the code at HEAP_RETURN below must be
updated in sync. */

enum {
  RM1 = 1,
         RM2,  RM3,  RM4,  RM5,  RM6,  RM7,
  RM8,   RM9,  RM10, RM11, RM12, RM13, RM14,
  RM15,  RM16, RM17, RM18, RM19, RM20, RM21,
  RM22,  RM23, RM24, RM25, RM26, RM27, RM28,
  RM29,  RM30, RM31, RM32, RM33, RM34, RM35,
  RM36,  RM37, RM38, RM39, RM40, RM41, RM42,
  RM43,  RM44, RM45, RM46, RM47, RM48, RM49,
  RM50,  RM51, RM52, RM53, RM54, RM55, RM56,
  RM57,  RM58, RM59, RM60, RM61, RM62, RM63,
  RM64,  RM65, RM66, RM67
};
314
/* These versions of the macros use the stack, as normal. There are debugging
versions and production versions. Note that the "rw" argument of RMATCH isn't
actually used in this definition.

RMATCH(ra,rb,rc,rd,re,rw) calls match() recursively and leaves the result in
rrc. The arguments ra, rb, rc, rd and re are passed on as match()'s eptr,
ecode, offset_top, md and eptrb. Neither mstart nor rdepth is an argument of
the macro: both are picked up by name from the scope in which the macro is
expanded, and rdepth is passed on incremented by one. RRETURN(ra) is a plain
"return ra". The debugging versions wrap the same actions in a brace-enclosed
block that also prints __LINE__. REGISTER expands to the "register" keyword in
this build. */

#ifndef NO_RECURSE
#define REGISTER register

#ifdef PCRE_DEBUG
#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d\n", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
#define RRETURN(ra) return ra
#endif
339
340 #else
341
342
/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes. REGISTER expands to nothing in this
build. */

#define REGISTER

/* RMATCH simulates a recursive call of match(). It works like this:

  (1) The frame that follows the current one in the chain (Xnextframe) is
      reused if there is one. Otherwise a new frame is obtained via
      PUBL(stack_malloc) and linked onto the current frame; if that fails, the
      macro leaves through RRETURN(PCRE_ERROR_NOMEMORY).
  (2) The resumption point rw is recorded in the current frame's Xwhere.
  (3) The "arguments" ra, rb, rc and re, together with the current mstart, are
      copied into the new frame, whose recursion depth is set to one more than
      the current frame's.
  (4) The new frame becomes the current one and control jumps to HEAP_RECURSE.

The label L_<rw> that is generated at the end of the macro is the place where
execution resumes after the simulated call; this is why every use of RMATCH
needs its own unique rw value. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = frame->Xnextframe;\
  if (newframe == NULL)\
    {\
    newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
    if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
    newframe->Xnextframe = NULL;\
    frame->Xnextframe = newframe;\
    }\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN simulates a return from match(). It steps back to the previous
frame; if there is one, the result is put into rrc and control jumps to
HEAP_RETURN, otherwise this is the outermost level and a real return is done.
The frame that is being left is not freed by this macro. Note that ra is
evaluated only after "frame" has been switched to the previous frame, so it
must not be an expression that involves any of the frame-resident
variables. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
385
386
/* Structure for remembering the local variables in a private frame. There is
one field, with an X prefix, for each argument or local variable of match()
that has to keep its value across an RMATCH "call"; when NO_RECURSE is defined,
match() accesses these fields through macros of the unprefixed names. */

typedef struct heapframe {
  struct heapframe *Xprevframe;   /* Frame of the "caller"; NULL at the base */
  struct heapframe *Xnextframe;   /* Frame kept for re-use by the next RMATCH */

  /* Function arguments that may change */

  PCRE_PUCHAR Xeptr;
  const pcre_uchar *Xecode;
  PCRE_PUCHAR Xmstart;
  int Xoffset_top;
  eptrblock *Xeptrb;
  unsigned int Xrdepth;

  /* Function local variables */

  PCRE_PUCHAR Xcallpat;
#ifdef SUPPORT_UTF
  PCRE_PUCHAR Xcharptr;
#endif
  PCRE_PUCHAR Xdata;
  PCRE_PUCHAR Xnext;
  PCRE_PUCHAR Xpp;
  PCRE_PUCHAR Xprev;
  PCRE_PUCHAR Xsaved_eptr;

  recursion_info Xnew_recursive;

  BOOL Xcur_is_word;
  BOOL Xcondition;
  BOOL Xprev_is_word;

#ifdef SUPPORT_UCP
  int Xprop_type;
  unsigned int Xprop_value;
  int Xprop_fail_result;
  int Xoclength;
  pcre_uchar Xocchars[6];
#endif

  int Xcodelink;
  int Xctype;
  unsigned int Xfc;
  int Xfi;
  int Xlength;
  int Xmax;
  int Xmin;
  unsigned int Xnumber;
  int Xoffset;
  unsigned int Xop;
  pcre_int32 Xsave_capture_last;
  int Xsave_offset1, Xsave_offset2, Xsave_offset3;
  int Xstacksave[REC_STACK_SAVE_MAX];

  eptrblock Xnewptrb;

  /* Where to jump back to: the RMATCH number that was recorded by the RMATCH
  macro when this frame made its "call" */

  int Xwhere;

} heapframe;
449
450 #endif
451
452
453 /***************************************************************************
454 ***************************************************************************/
455
456
457
458 /*************************************************
459 * Match from current position *
460 *************************************************/
461
462 /* This function is called recursively in many circumstances. Whenever it
463 returns a negative (error) response, the outer incarnation must also return the
464 same response. */
465
/* These macros pack up tests that are used for partial matching, and which
appear several times in the code. We set the "hit end" flag if the pointer is
at the end of the subject and also past the start of the subject (i.e.
something has been matched). For hard partial matching, we then return
immediately. The second one is used when we already know we are past the end of
the subject.

In both macros a non-zero md->partial means that partial matching is in force,
and a value greater than one selects the immediate return. Both refer to md and
eptr by name and make use of RRETURN, so they can be expanded only where those
are meaningful, that is, inside match(). Each one expands to a bare "if"
statement followed by a braced block, not to a single wrapped statement, so
take care if one is ever used as the body of an outer "if" that has an
"else". */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }
487
488
489 /* Performance note: It might be tempting to extract commonly used fields from
490 the md structure (e.g. utf, end_subject) into individual variables to improve
491 performance. Tests using gcc on a SPARC disproved this; in the first case, it
492 made performance worse.
493
494 Arguments:
495 eptr pointer to current character in subject
496 ecode pointer to current position in compiled code
497 mstart pointer to the current match start position (can be modified
498 by encountering \K)
499 offset_top current top pointer
500 md pointer to "static" info for the match
501 eptrb pointer to chain of blocks containing eptr at start of
502 brackets - for testing for empty matches
503 rdepth the recursion depth
504
505 Returns: MATCH_MATCH if matched ) these values are >= 0
506 MATCH_NOMATCH if failed to match )
507 a negative MATCH_xxx value for PRUNE, SKIP, etc
508 a negative PCRE_ERROR_xxx value if aborted by an error condition
509 (e.g. stopped by repeated call or recursion limit)
510 */
511
512 static int
match(REGISTER PCRE_PUCHAR eptr,REGISTER const pcre_uchar * ecode,PCRE_PUCHAR mstart,int offset_top,match_data * md,eptrblock * eptrb,unsigned int rdepth)513 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
514 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
515 unsigned int rdepth)
516 {
517 /* These variables do not need to be preserved over recursion in this function,
518 so they can be ordinary variables in all cases. Mark some of them with
519 "register" because they are used a lot in loops. */
520
521 register int rrc; /* Returns from recursive calls */
522 register int i; /* Used for loops not involving calls to RMATCH() */
523 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
524 register BOOL utf; /* Local copy of UTF flag for speed */
525
526 BOOL minimize, possessive; /* Quantifier options */
527 BOOL caseless;
528 int condcode;
529
530 /* When recursion is not being used, all "local" variables that have to be
531 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
532 frame on the stack here; subsequent instantiations are obtained from the heap
533 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
534 the top-level on the stack rather than malloc-ing them all gives a performance
535 boost in many cases where there is not much "recursion". */
536
537 #ifdef NO_RECURSE
538 heapframe *frame = (heapframe *)md->match_frames_base;
539
540 /* Copy in the original argument variables */
541
542 frame->Xeptr = eptr;
543 frame->Xecode = ecode;
544 frame->Xmstart = mstart;
545 frame->Xoffset_top = offset_top;
546 frame->Xeptrb = eptrb;
547 frame->Xrdepth = rdepth;
548
/* This is where control jumps back to in order to effect "recursion" */
550
551 HEAP_RECURSE:
552
553 /* Macros make the argument variables come from the current frame */
554
555 #define eptr frame->Xeptr
556 #define ecode frame->Xecode
557 #define mstart frame->Xmstart
558 #define offset_top frame->Xoffset_top
559 #define eptrb frame->Xeptrb
560 #define rdepth frame->Xrdepth
561
562 /* Ditto for the local variables */
563
564 #ifdef SUPPORT_UTF
565 #define charptr frame->Xcharptr
566 #endif
567 #define callpat frame->Xcallpat
568 #define codelink frame->Xcodelink
569 #define data frame->Xdata
570 #define next frame->Xnext
571 #define pp frame->Xpp
572 #define prev frame->Xprev
573 #define saved_eptr frame->Xsaved_eptr
574
575 #define new_recursive frame->Xnew_recursive
576
577 #define cur_is_word frame->Xcur_is_word
578 #define condition frame->Xcondition
579 #define prev_is_word frame->Xprev_is_word
580
581 #ifdef SUPPORT_UCP
582 #define prop_type frame->Xprop_type
583 #define prop_value frame->Xprop_value
584 #define prop_fail_result frame->Xprop_fail_result
585 #define oclength frame->Xoclength
586 #define occhars frame->Xocchars
587 #endif
588
589 #define ctype frame->Xctype
590 #define fc frame->Xfc
591 #define fi frame->Xfi
592 #define length frame->Xlength
593 #define max frame->Xmax
594 #define min frame->Xmin
595 #define number frame->Xnumber
596 #define offset frame->Xoffset
597 #define op frame->Xop
598 #define save_capture_last frame->Xsave_capture_last
599 #define save_offset1 frame->Xsave_offset1
600 #define save_offset2 frame->Xsave_offset2
601 #define save_offset3 frame->Xsave_offset3
602 #define stacksave frame->Xstacksave
603
604 #define newptrb frame->Xnewptrb
605
606 /* When recursion is being used, local variables are allocated on the stack and
607 get preserved during recursion in the normal way. In this environment, fi and
608 i, and fc and c, can be the same variables. */
609
610 #else /* NO_RECURSE not defined */
611 #define fi i
612 #define fc c
613
614 /* Many of the following variables are used only in small blocks of the code.
615 My normal style of coding would have declared them within each of those blocks.
616 However, in order to accommodate the version of this code that uses an external
617 "stack" implemented on the heap, it is easier to declare them all here, so the
618 declarations can be cut out in a block. The only declarations within blocks
619 below are for variables that do not have to be preserved over a recursive call
620 to RMATCH(). */
621
622 #ifdef SUPPORT_UTF
623 const pcre_uchar *charptr;
624 #endif
625 const pcre_uchar *callpat;
626 const pcre_uchar *data;
627 const pcre_uchar *next;
628 PCRE_PUCHAR pp;
629 const pcre_uchar *prev;
630 PCRE_PUCHAR saved_eptr;
631
632 recursion_info new_recursive;
633
634 BOOL cur_is_word;
635 BOOL condition;
636 BOOL prev_is_word;
637
638 #ifdef SUPPORT_UCP
639 int prop_type;
640 unsigned int prop_value;
641 int prop_fail_result;
642 int oclength;
643 pcre_uchar occhars[6];
644 #endif
645
646 int codelink;
647 int ctype;
648 int length;
649 int max;
650 int min;
651 unsigned int number;
652 int offset;
653 unsigned int op;
654 pcre_int32 save_capture_last;
655 int save_offset1, save_offset2, save_offset3;
656 int stacksave[REC_STACK_SAVE_MAX];
657
658 eptrblock newptrb;
659
660 /* There is a special fudge for calling match() in a way that causes it to
661 measure the size of its basic stack frame when the stack is being used for
662 recursion. The second argument (ecode) being NULL triggers this behaviour. It
663 cannot normally ever be NULL. The return is the negated value of the frame
664 size. */
665
666 if (ecode == NULL)
667 {
668 if (rdepth == 0)
669 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
670 else
671 {
672 int len = (char *)&rdepth - (char *)eptr;
673 return (len > 0)? -len : len;
674 }
675 }
676 #endif /* NO_RECURSE */
677
678 /* To save space on the stack and in the heap frame, I have doubled up on some
679 of the local variables that are used only in localised parts of the code, but
680 still need to be preserved over recursive calls of match(). These macros define
681 the alternative names that are used. */
682
683 #define allow_zero cur_is_word
684 #define cbegroup condition
685 #define code_offset codelink
686 #define condassert condition
687 #define matched_once prev_is_word
688 #define foc number
689 #define save_mark data
690
/* These statements are here to stop the compiler complaining about
uninitialized variables. */
693
694 #ifdef SUPPORT_UCP
695 prop_value = 0;
696 prop_fail_result = 0;
697 #endif
698
699
700 /* This label is used for tail recursion, which is used in a few cases even
701 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
702 used. Thanks to Ian Taylor for noticing this possibility and sending the
703 original patch. */
704
705 TAIL_RECURSE:
706
707 /* OK, now we can get on with the real code of the function. Recursive calls
708 are specified by the macro RMATCH and RRETURN is used to return. When
709 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
710 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
711 defined). However, RMATCH isn't like a function call because it's quite a
712 complicated macro. It has to be used in one particular way. This shouldn't,
713 however, impact performance when true recursion is being used. */
714
715 #ifdef SUPPORT_UTF
716 utf = md->utf; /* Local copy of the flag */
717 #else
718 utf = FALSE;
719 #endif
720
721 /* First check that we haven't called match() too many times, or that we
722 haven't exceeded the recursive call limit. */
723
724 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
725 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
726
727 /* At the start of a group with an unlimited repeat that may match an empty
728 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
729 done this way to save having to use another function argument, which would take
730 up space on the stack. See also MATCH_CONDASSERT below.
731
732 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
733 such remembered pointers, to be checked when we hit the closing ket, in order
734 to break infinite loops that match no characters. When match() is called in
735 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
736 NOT be used with tail recursion, because the memory block that is used is on
737 the stack, so a new one may be required for each match(). */
738
739 if (md->match_function_type == MATCH_CBEGROUP)
740 {
741 newptrb.epb_saved_eptr = eptr;
742 newptrb.epb_prev = eptrb;
743 eptrb = &newptrb;
744 md->match_function_type = 0;
745 }
746
747 /* Now start processing the opcodes. */
748
749 for (;;)
750 {
751 minimize = possessive = FALSE;
752 op = *ecode;
753
754 switch(op)
755 {
756 case OP_MARK:
757 md->nomatch_mark = ecode + 2;
758 md->mark = NULL; /* In case previously set by assertion */
759 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
760 eptrb, RM55);
761 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
762 md->mark == NULL) md->mark = ecode + 2;
763
764 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
765 argument, and we must check whether that argument matches this MARK's
766 argument. It is passed back in md->start_match_ptr (an overloading of that
767 variable). If it does match, we reset that variable to the current subject
768 position and return MATCH_SKIP. Otherwise, pass back the return code
769 unaltered. */
770
771 else if (rrc == MATCH_SKIP_ARG &&
772 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
773 {
774 md->start_match_ptr = eptr;
775 RRETURN(MATCH_SKIP);
776 }
777 RRETURN(rrc);
778
779 case OP_FAIL:
780 RRETURN(MATCH_NOMATCH);
781
782 case OP_COMMIT:
783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
784 eptrb, RM52);
785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
786 RRETURN(MATCH_COMMIT);
787
788 case OP_PRUNE:
789 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
790 eptrb, RM51);
791 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
792 RRETURN(MATCH_PRUNE);
793
794 case OP_PRUNE_ARG:
795 md->nomatch_mark = ecode + 2;
796 md->mark = NULL; /* In case previously set by assertion */
797 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
798 eptrb, RM56);
799 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
800 md->mark == NULL) md->mark = ecode + 2;
801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 RRETURN(MATCH_PRUNE);
803
804 case OP_SKIP:
805 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
806 eptrb, RM53);
807 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
808 md->start_match_ptr = eptr; /* Pass back current position */
809 RRETURN(MATCH_SKIP);
810
811 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
812 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
813 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
814 that failed and any that precede it (either they also failed, or were not
815 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
816 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
817 set to the count of the one that failed. */
818
819 case OP_SKIP_ARG:
820 md->skip_arg_count++;
821 if (md->skip_arg_count <= md->ignore_skip_arg)
822 {
823 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
824 break;
825 }
826 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
827 eptrb, RM57);
828 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
829
830 /* Pass back the current skip name by overloading md->start_match_ptr and
831 returning the special MATCH_SKIP_ARG return code. This will either be
832 caught by a matching MARK, or get to the top, where it causes a rematch
833 with md->ignore_skip_arg set to the value of md->skip_arg_count. */
834
835 md->start_match_ptr = ecode + 2;
836 RRETURN(MATCH_SKIP_ARG);
837
838 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
839 the branch in which it occurs can be determined. Overload the start of
840 match pointer to do this. */
841
842 case OP_THEN:
843 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
844 eptrb, RM54);
845 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
846 md->start_match_ptr = ecode;
847 RRETURN(MATCH_THEN);
848
849 case OP_THEN_ARG:
850 md->nomatch_mark = ecode + 2;
851 md->mark = NULL; /* In case previously set by assertion */
852 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
853 md, eptrb, RM58);
854 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
855 md->mark == NULL) md->mark = ecode + 2;
856 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
857 md->start_match_ptr = ecode;
858 RRETURN(MATCH_THEN);
859
860 /* Handle an atomic group that does not contain any capturing parentheses.
861 This can be handled like an assertion. Prior to 8.13, all atomic groups
862 were handled this way. In 8.13, the code was changed as below for ONCE, so
863 that backups pass through the group and thereby reset captured values.
864 However, this uses a lot more stack, so in 8.20, atomic groups that do not
865 contain any captures generate OP_ONCE_NC, which can be handled in the old,
866 less stack intensive way.
867
868 Check the alternative branches in turn - the matching won't pass the KET
869 for this kind of subpattern. If any one branch matches, we carry on as at
870 the end of a normal bracket, leaving the subject pointer, but resetting
871 the start-of-match value in case it was changed by \K. */
872
873 case OP_ONCE_NC:
874 prev = ecode;
875 saved_eptr = eptr;
876 save_mark = md->mark;
877 do
878 {
879 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
880 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
881 {
882 mstart = md->start_match_ptr;
883 break;
884 }
885 if (rrc == MATCH_THEN)
886 {
887 next = ecode + GET(ecode,1);
888 if (md->start_match_ptr < next &&
889 (*ecode == OP_ALT || *next == OP_ALT))
890 rrc = MATCH_NOMATCH;
891 }
892
893 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
894 ecode += GET(ecode,1);
895 md->mark = save_mark;
896 }
897 while (*ecode == OP_ALT);
898
899 /* If hit the end of the group (which could be repeated), fail */
900
901 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
902
903 /* Continue as from after the group, updating the offsets high water
904 mark, since extracts may have been taken. */
905
906 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
907
908 offset_top = md->end_offset_top;
909 eptr = md->end_match_ptr;
910
911 /* For a non-repeating ket, just continue at this level. This also
912 happens for a repeating ket if no characters were matched in the group.
913 This is the forcible breaking of infinite loops as implemented in Perl
914 5.005. */
915
916 if (*ecode == OP_KET || eptr == saved_eptr)
917 {
918 ecode += 1+LINK_SIZE;
919 break;
920 }
921
922 /* The repeating kets try the rest of the pattern or restart from the
923 preceding bracket, in the appropriate order. The second "call" of match()
924 uses tail recursion, to avoid using another stack frame. */
925
926 if (*ecode == OP_KETRMIN)
927 {
928 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
929 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
930 ecode = prev;
931 goto TAIL_RECURSE;
932 }
933 else /* OP_KETRMAX */
934 {
935 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
936 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
937 ecode += 1 + LINK_SIZE;
938 goto TAIL_RECURSE;
939 }
940 /* Control never gets here */
941
942 /* Handle a capturing bracket, other than those that are possessive with an
943 unlimited repeat. If there is space in the offset vector, save the current
944 subject position in the working slot at the top of the vector. We mustn't
945 change the current values of the data slot, because they may be set from a
946 previous iteration of this group, and be referred to by a reference inside
947 the group. A failure to match might occur after the group has succeeded,
948 if something later on doesn't match. For this reason, we need to restore
949 the working value and also the values of the final offsets, in case they
950 were set by a previous iteration of the same bracket.
951
952 If there isn't enough space in the offset vector, treat this as if it were
953 a non-capturing bracket. Don't worry about setting the flag for the error
954 case here; that is handled in the code for KET. */
955
956 case OP_CBRA:
957 case OP_SCBRA:
958 number = GET2(ecode, 1+LINK_SIZE);
959 offset = number << 1;
960
961 #ifdef PCRE_DEBUG
962 printf("start bracket %d\n", number);
963 printf("subject=");
964 pchars(eptr, 16, TRUE, md);
965 printf("\n");
966 #endif
967
968 if (offset < md->offset_max)
969 {
970 save_offset1 = md->offset_vector[offset];
971 save_offset2 = md->offset_vector[offset+1];
972 save_offset3 = md->offset_vector[md->offset_end - number];
973 save_capture_last = md->capture_last;
974 save_mark = md->mark;
975
976 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
977 md->offset_vector[md->offset_end - number] =
978 (int)(eptr - md->start_subject);
979
980 for (;;)
981 {
982 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
983 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
984 eptrb, RM1);
985 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
986
987 /* If we backed up to a THEN, check whether it is within the current
988 branch by comparing the address of the THEN that is passed back with
989 the end of the branch. If it is within the current branch, and the
990 branch is one of two or more alternatives (it either starts or ends
991 with OP_ALT), we have reached the limit of THEN's action, so convert
992 the return code to NOMATCH, which will cause normal backtracking to
993 happen from now on. Otherwise, THEN is passed back to an outer
994 alternative. This implements Perl's treatment of parenthesized groups,
995 where a group not containing | does not affect the current alternative,
996 that is, (X) is NOT the same as (X|(*F)). */
997
998 if (rrc == MATCH_THEN)
999 {
1000 next = ecode + GET(ecode,1);
1001 if (md->start_match_ptr < next &&
1002 (*ecode == OP_ALT || *next == OP_ALT))
1003 rrc = MATCH_NOMATCH;
1004 }
1005
1006 /* Anything other than NOMATCH is passed back. */
1007
1008 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1009 md->capture_last = save_capture_last;
1010 ecode += GET(ecode, 1);
1011 md->mark = save_mark;
1012 if (*ecode != OP_ALT) break;
1013 }
1014
1015 DPRINTF(("bracket %d failed\n", number));
1016 md->offset_vector[offset] = save_offset1;
1017 md->offset_vector[offset+1] = save_offset2;
1018 md->offset_vector[md->offset_end - number] = save_offset3;
1019
1020 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1021
1022 RRETURN(rrc);
1023 }
1024
1025 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1026 as a non-capturing bracket. */
1027
1028 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1029 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1030
1031 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1032
1033 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1034 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1035
1036 /* Non-capturing or atomic group, except for possessive with unlimited
1037 repeat and ONCE group with no captures. Loop for all the alternatives.
1038
1039 When we get to the final alternative within the brackets, we used to return
1040 the result of a recursive call to match() whatever happened so it was
1041 possible to reduce stack usage by turning this into a tail recursion,
1042 except in the case of a possibly empty group. However, now that there is
1043 the possibility of (*THEN) occurring in the final alternative, this
1044 optimization is no longer always possible.
1045
1046 We can optimize if we know there are no (*THEN)s in the pattern; at present
1047 this is the best that can be done.
1048
1049 MATCH_ONCE is returned when the end of an atomic group is successfully
1050 reached, but subsequent matching fails. It passes back up the tree (causing
1051 captured values to be reset) until the original atomic group level is
1052 reached. This is tested by comparing md->once_target with the start of the
1053 group. At this point, the return is converted into MATCH_NOMATCH so that
1054 previous backup points can be taken. */
1055
1056 case OP_ONCE:
1057 case OP_BRA:
1058 case OP_SBRA:
1059 DPRINTF(("start non-capturing bracket\n"));
1060
1061 for (;;)
1062 {
1063 if (op >= OP_SBRA || op == OP_ONCE)
1064 md->match_function_type = MATCH_CBEGROUP;
1065
1066 /* If this is not a possibly empty group, and there are no (*THEN)s in
1067 the pattern, and this is the final alternative, optimize as described
1068 above. */
1069
1070 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1071 {
1072 ecode += PRIV(OP_lengths)[*ecode];
1073 goto TAIL_RECURSE;
1074 }
1075
1076 /* In all other cases, we have to make another call to match(). */
1077
1078 save_mark = md->mark;
1079 save_capture_last = md->capture_last;
1080 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1081 RM2);
1082
1083 /* See comment in the code for capturing groups above about handling
1084 THEN. */
1085
1086 if (rrc == MATCH_THEN)
1087 {
1088 next = ecode + GET(ecode,1);
1089 if (md->start_match_ptr < next &&
1090 (*ecode == OP_ALT || *next == OP_ALT))
1091 rrc = MATCH_NOMATCH;
1092 }
1093
1094 if (rrc != MATCH_NOMATCH)
1095 {
1096 if (rrc == MATCH_ONCE)
1097 {
1098 const pcre_uchar *scode = ecode;
1099 if (*scode != OP_ONCE) /* If not at start, find it */
1100 {
1101 while (*scode == OP_ALT) scode += GET(scode, 1);
1102 scode -= GET(scode, 1);
1103 }
1104 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1105 }
1106 RRETURN(rrc);
1107 }
1108 ecode += GET(ecode, 1);
1109 md->mark = save_mark;
1110 if (*ecode != OP_ALT) break;
1111 md->capture_last = save_capture_last;
1112 }
1113
1114 RRETURN(MATCH_NOMATCH);
1115
1116 /* Handle possessive capturing brackets with an unlimited repeat. We come
1117 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1118 handled similarly to the normal case above. However, the matching is
1119 different. The end of these brackets will always be OP_KETRPOS, which
1120 returns MATCH_KETRPOS without going further in the pattern. By this means
1121 we can handle the group by iteration rather than recursion, thereby
1122 reducing the amount of stack needed. */
1123
1124 case OP_CBRAPOS:
1125 case OP_SCBRAPOS:
1126 allow_zero = FALSE;
1127
1128 POSSESSIVE_CAPTURE:
1129 number = GET2(ecode, 1+LINK_SIZE);
1130 offset = number << 1;
1131
1132 #ifdef PCRE_DEBUG
1133 printf("start possessive bracket %d\n", number);
1134 printf("subject=");
1135 pchars(eptr, 16, TRUE, md);
1136 printf("\n");
1137 #endif
1138
1139 if (offset < md->offset_max)
1140 {
1141 matched_once = FALSE;
1142 code_offset = (int)(ecode - md->start_code);
1143
1144 save_offset1 = md->offset_vector[offset];
1145 save_offset2 = md->offset_vector[offset+1];
1146 save_offset3 = md->offset_vector[md->offset_end - number];
1147 save_capture_last = md->capture_last;
1148
1149 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1150
1151 /* Each time round the loop, save the current subject position for use
1152 when the group matches. For MATCH_MATCH, the group has matched, so we
1153 restart it with a new subject starting position, remembering that we had
1154 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1155 usual. If we haven't matched any alternatives in any iteration, check to
1156 see if a previous iteration matched. If so, the group has matched;
1157 continue from afterwards. Otherwise it has failed; restore the previous
1158 capture values before returning NOMATCH. */
1159
1160 for (;;)
1161 {
1162 md->offset_vector[md->offset_end - number] =
1163 (int)(eptr - md->start_subject);
1164 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1165 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1166 eptrb, RM63);
1167 if (rrc == MATCH_KETRPOS)
1168 {
1169 offset_top = md->end_offset_top;
1170 ecode = md->start_code + code_offset;
1171 save_capture_last = md->capture_last;
1172 matched_once = TRUE;
1173 mstart = md->start_match_ptr; /* In case \K changed it */
1174 if (eptr == md->end_match_ptr) /* Matched an empty string */
1175 {
1176 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1177 break;
1178 }
1179 eptr = md->end_match_ptr;
1180 continue;
1181 }
1182
1183 /* See comment in the code for capturing groups above about handling
1184 THEN. */
1185
1186 if (rrc == MATCH_THEN)
1187 {
1188 next = ecode + GET(ecode,1);
1189 if (md->start_match_ptr < next &&
1190 (*ecode == OP_ALT || *next == OP_ALT))
1191 rrc = MATCH_NOMATCH;
1192 }
1193
1194 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1195 md->capture_last = save_capture_last;
1196 ecode += GET(ecode, 1);
1197 if (*ecode != OP_ALT) break;
1198 }
1199
1200 if (!matched_once)
1201 {
1202 md->offset_vector[offset] = save_offset1;
1203 md->offset_vector[offset+1] = save_offset2;
1204 md->offset_vector[md->offset_end - number] = save_offset3;
1205 }
1206
1207 if (allow_zero || matched_once)
1208 {
1209 ecode += 1 + LINK_SIZE;
1210 break;
1211 }
1212
1213 RRETURN(MATCH_NOMATCH);
1214 }
1215
1216 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1217 as a non-capturing bracket. */
1218
1219 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1220 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1221
1222 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1223
1224 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1225 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1226
1227 /* Non-capturing possessive bracket with unlimited repeat. We come here
1228 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1229 without the capturing complication. It is written out separately for speed
1230 and cleanliness. */
1231
1232 case OP_BRAPOS:
1233 case OP_SBRAPOS:
1234 allow_zero = FALSE;
1235
1236 POSSESSIVE_NON_CAPTURE:
1237 matched_once = FALSE;
1238 code_offset = (int)(ecode - md->start_code);
1239 save_capture_last = md->capture_last;
1240
1241 for (;;)
1242 {
1243 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1244 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1245 eptrb, RM48);
1246 if (rrc == MATCH_KETRPOS)
1247 {
1248 offset_top = md->end_offset_top;
1249 ecode = md->start_code + code_offset;
1250 matched_once = TRUE;
1251 mstart = md->start_match_ptr; /* In case \K reset it */
1252 if (eptr == md->end_match_ptr) /* Matched an empty string */
1253 {
1254 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1255 break;
1256 }
1257 eptr = md->end_match_ptr;
1258 continue;
1259 }
1260
1261 /* See comment in the code for capturing groups above about handling
1262 THEN. */
1263
1264 if (rrc == MATCH_THEN)
1265 {
1266 next = ecode + GET(ecode,1);
1267 if (md->start_match_ptr < next &&
1268 (*ecode == OP_ALT || *next == OP_ALT))
1269 rrc = MATCH_NOMATCH;
1270 }
1271
1272 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1273 ecode += GET(ecode, 1);
1274 if (*ecode != OP_ALT) break;
1275 md->capture_last = save_capture_last;
1276 }
1277
1278 if (matched_once || allow_zero)
1279 {
1280 ecode += 1 + LINK_SIZE;
1281 break;
1282 }
1283 RRETURN(MATCH_NOMATCH);
1284
1285 /* Control never reaches here. */
1286
1287 /* Conditional group: compilation checked that there are no more than two
1288 branches. If the condition is false, skipping the first branch takes us
1289 past the end of the item if there is only one branch, but that's exactly
1290 what we want. */
1291
1292 case OP_COND:
1293 case OP_SCOND:
1294
1295 /* The variable codelink will be added to ecode when the condition is
1296 false, to get to the second branch. Setting it to the offset to the ALT
1297 or KET, then incrementing ecode achieves this effect. We now have ecode
1298 pointing to the condition or callout. */
1299
1300 codelink = GET(ecode, 1); /* Offset to the second branch */
1301 ecode += 1 + LINK_SIZE; /* From this opcode */
1302
1303 /* Because of the way auto-callout works during compile, a callout item is
1304 inserted between OP_COND and an assertion condition. */
1305
1306 if (*ecode == OP_CALLOUT)
1307 {
1308 if (PUBL(callout) != NULL)
1309 {
1310 PUBL(callout_block) cb;
1311 cb.version = 2; /* Version 2 of the callout block */
1312 cb.callout_number = ecode[1];
1313 cb.offset_vector = md->offset_vector;
1314 #if defined COMPILE_PCRE8
1315 cb.subject = (PCRE_SPTR)md->start_subject;
1316 #elif defined COMPILE_PCRE16
1317 cb.subject = (PCRE_SPTR16)md->start_subject;
1318 #elif defined COMPILE_PCRE32
1319 cb.subject = (PCRE_SPTR32)md->start_subject;
1320 #endif
1321 cb.subject_length = (int)(md->end_subject - md->start_subject);
1322 cb.start_match = (int)(mstart - md->start_subject);
1323 cb.current_position = (int)(eptr - md->start_subject);
1324 cb.pattern_position = GET(ecode, 2);
1325 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1326 cb.capture_top = offset_top/2;
1327 cb.capture_last = md->capture_last & CAPLMASK;
1328 /* Internal change requires this for API compatibility. */
1329 if (cb.capture_last == 0) cb.capture_last = -1;
1330 cb.callout_data = md->callout_data;
1331 cb.mark = md->nomatch_mark;
1332 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1333 if (rrc < 0) RRETURN(rrc);
1334 }
1335
1336 /* Advance ecode past the callout, so it now points to the condition. We
1337 must adjust codelink so that the value of ecode+codelink is unchanged. */
1338
1339 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1340 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1341 }
1342
1343 /* Test the various possible conditions */
1344
1345 condition = FALSE;
1346 switch(condcode = *ecode)
1347 {
1348 case OP_RREF: /* Numbered group recursion test */
1349 if (md->recursive != NULL) /* Not recursing => FALSE */
1350 {
1351 unsigned int recno = GET2(ecode, 1); /* Recursion group number*/
1352 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1353 }
1354 break;
1355
1356 case OP_DNRREF: /* Duplicate named group recursion test */
1357 if (md->recursive != NULL)
1358 {
1359 int count = GET2(ecode, 1 + IMM2_SIZE);
1360 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1361 while (count-- > 0)
1362 {
1363 unsigned int recno = GET2(slot, 0);
1364 condition = recno == md->recursive->group_num;
1365 if (condition) break;
1366 slot += md->name_entry_size;
1367 }
1368 }
1369 break;
1370
1371 case OP_CREF: /* Numbered group used test */
1372 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1373 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1374 break;
1375
1376 case OP_DNCREF: /* Duplicate named group used test */
1377 {
1378 int count = GET2(ecode, 1 + IMM2_SIZE);
1379 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1380 while (count-- > 0)
1381 {
1382 offset = GET2(slot, 0) << 1;
1383 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1384 if (condition) break;
1385 slot += md->name_entry_size;
1386 }
1387 }
1388 break;
1389
1390 case OP_DEF: /* DEFINE - always false */
1391 break;
1392
1393 /* The condition is an assertion. Call match() to evaluate it - setting
1394 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end
1395 of an assertion. */
1396
1397 default:
1398 md->match_function_type = MATCH_CONDASSERT;
1399 RMATCH(eptr, ecode, offset_top, md, NULL, RM3);
1400 if (rrc == MATCH_MATCH)
1401 {
1402 if (md->end_offset_top > offset_top)
1403 offset_top = md->end_offset_top; /* Captures may have happened */
1404 condition = TRUE;
1405
1406 /* Advance ecode past the assertion to the start of the first branch,
1407 but adjust it so that the general choosing code below works. */
1408
1409 ecode += GET(ecode, 1);
1410 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1411 ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
1412 }
1413
1414 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1415 assertion; it is therefore treated as NOMATCH. Any other return is an
1416 error. */
1417
1418 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1419 {
1420 RRETURN(rrc); /* Need braces because of following else */
1421 }
1422 break;
1423 }
1424
1425 /* Choose branch according to the condition */
1426
1427 ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
1428
1429 /* We are now at the branch that is to be obeyed. As there is only one, we
1430 can use tail recursion to avoid using another stack frame, except when
1431 there is unlimited repeat of a possibly empty group. In the latter case, a
1432 recursive call to match() is always required, unless the second alternative
1433 doesn't exist, in which case we can just plough on. Note that, for
1434 compatibility with Perl, the | in a conditional group is NOT treated as
1435 creating two alternatives. If a THEN is encountered in the branch, it
1436 propagates out to the enclosing alternative (unless nested in a deeper set
1437 of alternatives, of course). */
1438
1439 if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
1440 {
1441 if (op != OP_SCOND)
1442 {
1443 goto TAIL_RECURSE;
1444 }
1445
1446 md->match_function_type = MATCH_CBEGROUP;
1447 RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1448 RRETURN(rrc);
1449 }
1450
1451 /* Condition false & no alternative; continue after the group. */
1452
1453 else
1454 {
1455 }
1456 break;
1457
1458
1459 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1460 to close any currently open capturing brackets. */
1461
1462 case OP_CLOSE:
1463 number = GET2(ecode, 1); /* Must be less than 65536 */
1464 offset = number << 1;
1465
1466 #ifdef PCRE_DEBUG
1467 printf("end bracket %d at *ACCEPT", number);
1468 printf("\n");
1469 #endif
1470
1471 md->capture_last = (md->capture_last & OVFLMASK) | number;
1472 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1473 {
1474 md->offset_vector[offset] =
1475 md->offset_vector[md->offset_end - number];
1476 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1477 if (offset_top <= offset) offset_top = offset + 2;
1478 }
1479 ecode += 1 + IMM2_SIZE;
1480 break;
1481
1482
1483 /* End of the pattern, either real or forced. */
1484
1485 case OP_END:
1486 case OP_ACCEPT:
1487 case OP_ASSERT_ACCEPT:
1488
1489 /* If we have matched an empty string, fail if not in an assertion and not
1490 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1491 is set and we have matched at the start of the subject. In both cases,
1492 backtracking will then try other alternatives, if any. */
1493
1494 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1495 md->recursive == NULL &&
1496 (md->notempty ||
1497 (md->notempty_atstart &&
1498 mstart == md->start_subject + md->start_offset)))
1499 RRETURN(MATCH_NOMATCH);
1500
1501 /* Otherwise, we have a match. */
1502
1503 md->end_match_ptr = eptr; /* Record where we ended */
1504 md->end_offset_top = offset_top; /* and how many extracts were taken */
1505 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1506
1507 /* For some reason, the macros don't work properly if an expression is
1508 given as the argument to RRETURN when the heap is in use. */
1509
1510 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1511 RRETURN(rrc);
1512
1513 /* Assertion brackets. Check the alternative branches in turn - the
1514 matching won't pass the KET for an assertion. If any one branch matches,
1515 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1516 start of each branch to move the current point backwards, so the code at
1517 this level is identical to the lookahead case. When the assertion is part
1518 of a condition, we want to return immediately afterwards. The caller of
1519 this incarnation of the match() function will have set MATCH_CONDASSERT in
1520 md->match_function_type, and one of these opcodes will be the first opcode
1521 that is processed. We use a local variable that is preserved over calls to
1522 match() to remember this case. */
1523
1524 case OP_ASSERT:
1525 case OP_ASSERTBACK:
1526 save_mark = md->mark;
1527 if (md->match_function_type == MATCH_CONDASSERT)
1528 {
1529 condassert = TRUE;
1530 md->match_function_type = 0;
1531 }
1532 else condassert = FALSE;
1533
1534 /* Loop for each branch */
1535
1536 do
1537 {
1538 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1539
1540 /* A match means that the assertion is true; break out of the loop
1541 that matches its alternatives. */
1542
1543 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1544 {
1545 mstart = md->start_match_ptr; /* In case \K reset it */
1546 break;
1547 }
1548
1549 /* If not matched, restore the previous mark setting. */
1550
1551 md->mark = save_mark;
1552
1553 /* See comment in the code for capturing groups above about handling
1554 THEN. */
1555
1556 if (rrc == MATCH_THEN)
1557 {
1558 next = ecode + GET(ecode,1);
1559 if (md->start_match_ptr < next &&
1560 (*ecode == OP_ALT || *next == OP_ALT))
1561 rrc = MATCH_NOMATCH;
1562 }
1563
1564 /* Anything other than NOMATCH causes the entire assertion to fail,
1565 passing back the return code. This includes COMMIT, SKIP, PRUNE and an
1566 uncaptured THEN, which means they take their normal effect. This
1567 consistent approach does not always have exactly the same effect as in
1568 Perl. */
1569
1570 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1571 ecode += GET(ecode, 1);
1572 }
1573 while (*ecode == OP_ALT); /* Continue for next alternative */
1574
1575 /* If we have tried all the alternative branches, the assertion has
1576 failed. If not, we broke out after a match. */
1577
1578 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1579
1580 /* If checking an assertion for a condition, return MATCH_MATCH. */
1581
1582 if (condassert) RRETURN(MATCH_MATCH);
1583
1584 /* Continue from after a successful assertion, updating the offsets high
1585 water mark, since extracts may have been taken during the assertion. */
1586
1587 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1588 ecode += 1 + LINK_SIZE;
1589 offset_top = md->end_offset_top;
1590 continue;
1591
1592 /* Negative assertion: all branches must fail to match for the assertion to
1593 succeed. */
1594
1595 case OP_ASSERT_NOT:
1596 case OP_ASSERTBACK_NOT:
1597 save_mark = md->mark;
1598 if (md->match_function_type == MATCH_CONDASSERT)
1599 {
1600 condassert = TRUE;
1601 md->match_function_type = 0;
1602 }
1603 else condassert = FALSE;
1604
1605 /* Loop for each alternative branch. */
1606
1607 do
1608 {
1609 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1610 md->mark = save_mark; /* Always restore the mark setting */
1611
1612 switch(rrc)
1613 {
1614 case MATCH_MATCH: /* A successful match means */
1615 case MATCH_ACCEPT: /* the assertion has failed. */
1616 RRETURN(MATCH_NOMATCH);
1617
1618 case MATCH_NOMATCH: /* Carry on with next branch */
1619 break;
1620
1621 /* See comment in the code for capturing groups above about handling
1622 THEN. */
1623
1624 case MATCH_THEN:
1625 next = ecode + GET(ecode,1);
1626 if (md->start_match_ptr < next &&
1627 (*ecode == OP_ALT || *next == OP_ALT))
1628 {
1629 rrc = MATCH_NOMATCH;
1630 break;
1631 }
1632 /* Otherwise fall through. */
1633
1634 /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
1635 assertion to fail to match, without considering any more alternatives.
1636 Failing to match means the assertion is true. This is a consistent
1637 approach, but does not always have the same effect as in Perl. */
1638
1639 case MATCH_COMMIT:
1640 case MATCH_SKIP:
1641 case MATCH_SKIP_ARG:
1642 case MATCH_PRUNE:
1643 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1644 goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
1645
1646 /* Anything else is an error */
1647
1648 default:
1649 RRETURN(rrc);
1650 }
1651
1652 /* Continue with next branch */
1653
1654 ecode += GET(ecode,1);
1655 }
1656 while (*ecode == OP_ALT);
1657
1658 /* All branches in the assertion failed to match. */
1659
1660 NEG_ASSERT_TRUE:
1661 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1662 ecode += 1 + LINK_SIZE; /* Continue with current branch */
1663 continue;
1664
1665 /* Move the subject pointer back. This occurs only at the start of
1666 each branch of a lookbehind assertion. If we are too close to the start to
1667 move back, this match function fails. When working with UTF-8 we move
1668 back a number of characters, not bytes. */
1669
1670 case OP_REVERSE:
1671 #ifdef SUPPORT_UTF
1672 if (utf)
1673 {
1674 i = GET(ecode, 1);
1675 while (i-- > 0)
1676 {
1677 eptr--;
1678 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1679 BACKCHAR(eptr);
1680 }
1681 }
1682 else
1683 #endif
1684
1685 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1686
1687 {
1688 eptr -= GET(ecode, 1);
1689 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1690 }
1691
1692 /* Save the earliest consulted character, then skip to next op code */
1693
1694 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1695 ecode += 1 + LINK_SIZE;
1696 break;
1697
1698 /* The callout item calls an external function, if one is provided, passing
1699 details of the match so far. This is mainly for debugging, though the
1700 function is able to force a failure. */
1701
1702 case OP_CALLOUT:
1703 if (PUBL(callout) != NULL)
1704 {
1705 PUBL(callout_block) cb;
1706 cb.version = 2; /* Version 2 of the callout block */
1707 cb.callout_number = ecode[1];
1708 cb.offset_vector = md->offset_vector;
1709 #if defined COMPILE_PCRE8
1710 cb.subject = (PCRE_SPTR)md->start_subject;
1711 #elif defined COMPILE_PCRE16
1712 cb.subject = (PCRE_SPTR16)md->start_subject;
1713 #elif defined COMPILE_PCRE32
1714 cb.subject = (PCRE_SPTR32)md->start_subject;
1715 #endif
1716 cb.subject_length = (int)(md->end_subject - md->start_subject);
1717 cb.start_match = (int)(mstart - md->start_subject);
1718 cb.current_position = (int)(eptr - md->start_subject);
1719 cb.pattern_position = GET(ecode, 2);
1720 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1721 cb.capture_top = offset_top/2;
1722 cb.capture_last = md->capture_last & CAPLMASK;
1723 /* Internal change requires this for API compatibility. */
1724 if (cb.capture_last == 0) cb.capture_last = -1;
1725 cb.callout_data = md->callout_data;
1726 cb.mark = md->nomatch_mark;
1727 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1728 if (rrc < 0) RRETURN(rrc);
1729 }
1730 ecode += 2 + 2*LINK_SIZE;
1731 break;
1732
1733 /* Recursion either matches the current regex, or some subexpression. The
1734 offset data is the offset to the starting bracket from the start of the
1735 whole pattern. (This is so that it works from duplicated subpatterns.)
1736
1737 The state of the capturing groups is preserved over recursion, and
1738 re-instated afterwards. We don't know how many are started and not yet
1739 finished (offset_top records the completed total) so we just have to save
1740 all the potential data. There may be up to 65535 such values, which is too
1741 large to put on the stack, but using malloc for small numbers seems
1742 expensive. As a compromise, the stack is used when there are no more than
1743 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1744
1745 There are also other values that have to be saved. We use a chained
1746 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1747 for the original version of this logic. It has, however, been hacked around
1748 a lot, so he is not to blame for the current way it works. */
1749
1750 case OP_RECURSE:
1751   {
1752   recursion_info *ri;
1753   unsigned int recno;
1754   /* Find the called group; its number is 0 for the whole pattern. */
1755   callpat = md->start_code + GET(ecode, 1);
1756   recno = (callpat == md->start_code)? 0 :
1757     GET2(callpat, 1 + LINK_SIZE);
1758
1759   /* Check for repeating a recursion without advancing the subject pointer.
1760   This should catch convoluted mutual recursions. (Some simple cases are
1761   caught at compile time.) */
1762
1763   for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1764     if (recno == ri->group_num && eptr == ri->subject_position)
1765       RRETURN(PCRE_ERROR_RECURSELOOP);
1766
1767   /* Add to "recursing stack" */
1768
1769   new_recursive.group_num = recno;
1770   new_recursive.saved_capture_last = md->capture_last;
1771   new_recursive.subject_position = eptr;
1772   new_recursive.prevrec = md->recursive;
1773   md->recursive = &new_recursive;
1774
1775   /* Where to continue from afterwards */
1776
1777   ecode += 1 + LINK_SIZE;
1778
1779   /* Now save the offset data: in stacksave when it is big enough, otherwise
1780   in a block from PUBL(malloc) that every exit below has to free. */
1781   new_recursive.saved_max = md->offset_end;
1782   if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1783     new_recursive.offset_save = stacksave;
1784   else
1785     {
1786     new_recursive.offset_save =
1787       (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1788     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1789     }
1790   memcpy(new_recursive.offset_save, md->offset_vector,
1791     new_recursive.saved_max * sizeof(int));
1792
1793   /* OK, now we can do the recursion. After processing each alternative,
1794   restore the offset data and the last captured value. If there were nested
1795   recursions, md->recursive might be changed, so reset it before looping.
1796   */
1797
1798   DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1799   cbegroup = (*callpat >= OP_SBRA);
1800   do
1801     {
1802     if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1803     RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1804       md, eptrb, RM6);
1805     memcpy(md->offset_vector, new_recursive.offset_save,
1806       new_recursive.saved_max * sizeof(int));
1807     md->capture_last = new_recursive.saved_capture_last;
1808     md->recursive = new_recursive.prevrec;
1809     if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1810       {
1811       DPRINTF(("Recursion matched\n"));
1812       if (new_recursive.offset_save != stacksave)
1813         (PUBL(free))(new_recursive.offset_save);
1814
1815       /* Set where we got to in the subject, and reset the start in case
1816       it was changed by \K. This *is* propagated back out of a recursion,
1817       for Perl compatibility. */
1818
1819       eptr = md->end_match_ptr;
1820       mstart = md->start_match_ptr;
1821       goto RECURSION_MATCHED;        /* Exit loop; end processing */
1822       }
1823
1824     /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1825     recursion; they cause a NOMATCH for the entire recursion (the codes form
1826     a testable range). A heap copy of the offsets is released here as well. */
1827     if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
1828       {
1829       if (new_recursive.offset_save != stacksave)
1830         (PUBL(free))(new_recursive.offset_save);
1831       RRETURN(MATCH_NOMATCH);
1832       }
1833     /* Any return code other than NOMATCH is an error. */
1834     if (rrc != MATCH_NOMATCH)
1835       {
1836       DPRINTF(("Recursion gave error %d\n", rrc));
1837       if (new_recursive.offset_save != stacksave)
1838         (PUBL(free))(new_recursive.offset_save);
1839       RRETURN(rrc);
1840       }
1841     md->recursive = &new_recursive;  /* Back on the stack for the next try */
1842     callpat += GET(callpat, 1);      /* Move to the next alternative */
1843     }
1844   while (*callpat == OP_ALT);
1845
1846   DPRINTF(("Recursion didn't match\n"));
1847   md->recursive = new_recursive.prevrec;
1848   if (new_recursive.offset_save != stacksave)
1849     (PUBL(free))(new_recursive.offset_save);
1850   RRETURN(MATCH_NOMATCH);
1851   }
1852
1853 RECURSION_MATCHED:
1854 break;
1855
1856 /* An alternation is the end of a branch; scan along to find the end of the
1857 bracketed group and go to there. */
1858
1859 case OP_ALT:
1860 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1861 break;
1862
1863 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1864 indicating that it may occur zero times. It may repeat infinitely, or not
1865 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1866 with fixed upper repeat limits are compiled as a number of copies, with the
1867 optional ones preceded by BRAZERO or BRAMINZERO. BRAZERO tries the group
1868 first and steps past it only if that attempt returns MATCH_NOMATCH. */
1869 case OP_BRAZERO:
1870 next = ecode + 1; /* The item that follows this opcode */
1871 RMATCH(eptr, next, offset_top, md, eptrb, RM10); /* Try with the group */
1872 if (rrc != MATCH_NOMATCH) RRETURN(rrc); /* Match or error: pass it up */
1873 do next += GET(next, 1); while (*next == OP_ALT); /* Past all branches */
1874 ecode = next + 1 + LINK_SIZE; /* Carry on after the group */
1875 break;
1876
1877 case OP_BRAMINZERO: /* Try the rest of the pattern before the group */
1878 next = ecode + 1; /* The item that follows this opcode */
1879 do next += GET(next, 1); while (*next == OP_ALT); /* Past all branches */
1880 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1881 if (rrc != MATCH_NOMATCH) RRETURN(rrc); /* Match or error: pass it up */
1882 ecode++; /* Skipping failed: step onto the group itself */
1883 break;
1884
1885 case OP_SKIPZERO:
1886 next = ecode+1;
1887 do next += GET(next,1); while (*next == OP_ALT);
1888 ecode = next + 1 + LINK_SIZE;
1889 break;
1890
1891 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1892 here; just jump to the group, with allow_zero set TRUE. */
1893
1894 case OP_BRAPOSZERO:
1895 op = *(++ecode); /* Advance onto the group's own opcode */
1896 allow_zero = TRUE;
1897 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1898 goto POSSESSIVE_NON_CAPTURE; /* Any other opcode found after BRAPOSZERO */
1899
1900 /* End of a group, repeated or non-repeating. */
1901
1902 case OP_KET:
1903 case OP_KETRMIN:
1904 case OP_KETRMAX:
1905 case OP_KETRPOS:
1906 prev = ecode - GET(ecode, 1); /* Opcode that the ket's link leads back to */
1907
1908 /* If this was a group that remembered the subject start, in order to break
1909 infinite repeats of empty string matches, retrieve the subject start from
1910 the chain. Otherwise, set it NULL. */
1911
1912 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1913 {
1914 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1915 eptrb = eptrb->epb_prev; /* Backup to previous group */
1916 }
1917 else saved_eptr = NULL;
1918
1919 /* If we are at the end of an assertion group or a non-capturing atomic
1920 group, stop matching and return MATCH_MATCH, but record the current high
1921 water mark for use by positive assertions. We also need to record the match
1922 start in case it was changed by \K. */
1923
1924 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1925 *prev == OP_ONCE_NC)
1926 {
1927 md->end_match_ptr = eptr; /* For ONCE_NC */
1928 md->end_offset_top = offset_top;
1929 md->start_match_ptr = mstart;
1930 RRETURN(MATCH_MATCH); /* Sets md->mark */
1931 }
1932
1933 /* For capturing groups we have to check the group number back at the start
1934 and if necessary complete handling an extraction by setting the offsets and
1935 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1936 into group 0, so it won't be picked up here. Instead, we catch it when the
1937 OP_END is reached. Other recursion is handled here. We just have to record
1938 the current subject position and start match pointer and give a MATCH
1939 return. */
1940
1941 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1942 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1943 {
1944 number = GET2(prev, 1+LINK_SIZE); /* Group number */
1945 offset = number << 1; /* Index of the group's pair of offsets */
1946
1947 #ifdef PCRE_DEBUG
1948 printf("end bracket %d", number);
1949 printf("\n");
1950 #endif
1951
1952 /* Handle a recursively called group. */
1953
1954 if (md->recursive != NULL && md->recursive->group_num == number)
1955 {
1956 md->end_match_ptr = eptr;
1957 md->start_match_ptr = mstart;
1958 RRETURN(MATCH_MATCH);
1959 }
1960
1961 /* Deal with capturing */
1962
1963 md->capture_last = (md->capture_last & OVFLMASK) | number;
1964 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1965 {
1966 /* If offset is greater than offset_top, it means that we are
1967 "skipping" a capturing group, and that group's offsets must be marked
1968 unset. In earlier versions of PCRE, all the offsets were unset at the
1969 start of matching, but this doesn't work because atomic groups and
1970 assertions can cause a value to be set that should later be unset.
1971 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1972 part of the atomic group, but this is not on the final matching path,
1973 so must be unset when 2 is set. (If there is no group 2, there is no
1974 problem, because offset_top will then be 2, indicating no capture.) */
1975
1976 if (offset > offset_top)
1977 {
1978 register int *iptr = md->offset_vector + offset_top;
1979 register int *iend = md->offset_vector + offset;
1980 while (iptr < iend) *iptr++ = -1;
1981 }
1982
1983 /* Now make the extraction: the start value is copied from the slot at
1984 md->offset_end - number, and the end is the current subject position. */
1985 md->offset_vector[offset] =
1986 md->offset_vector[md->offset_end - number];
1987 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1988 if (offset_top <= offset) offset_top = offset + 2;
1989 }
1990 }
1991
1992 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1993 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1994 at a time from the outer level, thus saving stack. This must precede the
1995 empty string test - in this case that test is done at the outer level. */
1996
1997 if (*ecode == OP_KETRPOS)
1998 {
1999 md->start_match_ptr = mstart; /* In case \K reset it */
2000 md->end_match_ptr = eptr;
2001 md->end_offset_top = offset_top;
2002 RRETURN(MATCH_KETRPOS);
2003 }
2004
2005 /* For an ordinary non-repeating ket, just continue at this level. This
2006 also happens for a repeating ket if no characters were matched in the
2007 group. This is the forcible breaking of infinite loops as implemented in
2008 Perl 5.005. For a non-repeating atomic group that includes captures,
2009 establish a backup point by processing the rest of the pattern at a lower
2010 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
2011 original OP_ONCE level, thereby bypassing intermediate backup points, but
2012 resetting any captures that happened along the way. */
2013
2014 if (*ecode == OP_KET || eptr == saved_eptr)
2015 {
2016 if (*prev == OP_ONCE)
2017 {
2018 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
2019 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2020 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2021 RRETURN(MATCH_ONCE);
2022 }
2023 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2024 break;
2025 }
2026
2027 /* The normal repeating kets try the rest of the pattern or restart from
2028 the preceding bracket, in the appropriate order. In the second case, we can
2029 use tail recursion to avoid using another stack frame, unless we have
2030 an atomic group or an unlimited repeat of a group that can match an empty
2031 string. */
2032
2033 if (*ecode == OP_KETRMIN)
2034 {
2035 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2036 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2037 if (*prev == OP_ONCE)
2038 {
2039 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2040 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2041 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2042 RRETURN(MATCH_ONCE);
2043 }
2044 if (*prev >= OP_SBRA) /* Could match an empty string */
2045 {
2046 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2047 RRETURN(rrc);
2048 }
2049 ecode = prev; /* Repeat the group without a new frame */
2050 goto TAIL_RECURSE;
2051 }
2052 else /* OP_KETRMAX */
2053 {
2054 RMATCH(eptr, prev, offset_top, md, eptrb, RM13); /* Try another repeat */
2055 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2056 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2057 if (*prev == OP_ONCE)
2058 {
2059 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2060 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2061 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2062 RRETURN(MATCH_ONCE);
2063 }
2064 ecode += 1 + LINK_SIZE; /* Rest of the pattern, without a new frame */
2065 goto TAIL_RECURSE;
2066 }
2067 /* Control never gets here */
2068
2069 /* Circumflex outside multiline mode: it cannot match at the start of the
2070 subject when notbol is set; otherwise it behaves exactly like OP_SOD. */
2071 case OP_CIRC:
2072 if (eptr == md->start_subject && md->notbol) RRETURN(MATCH_NOMATCH);
2073
2074 /* Start of subject assertion (OP_CIRC drops through to here) */
2075
2076 case OP_SOD:
2077 if (md->start_subject == eptr) { ecode++; break; }
2078 RRETURN(MATCH_NOMATCH);
2079
2080
2081 /* Multiline mode: start of subject unless notbol, or after any newline. */
2082
2083 case OP_CIRCM:
2084 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2085 if (eptr != md->start_subject && /* Away from the start, fail ... */
2086 (eptr == md->end_subject || !WAS_NEWLINE(eptr))) /* at end or not after NL */
2087 RRETURN(MATCH_NOMATCH);
2088 ecode++;
2089 break;
2090
2091 /* Succeeds only at md->start_subject + md->start_offset */
2092
2093 case OP_SOM:
2094 if (eptr == md->start_subject + md->start_offset)
2095   { ecode++; break; }
2096 RRETURN(MATCH_NOMATCH);
2097
2098 /* Make the current subject position the reported start of the match */
2099
2100 case OP_SET_SOM:
2101 ecode++;
2102 mstart = eptr;
2103 break;
2104
2105 /* Multiline mode: assert before any newline, or before end of subject
2106 unless noteol is set. */
2107
2108 case OP_DOLLM:
2109 if (eptr < md->end_subject)
2110 {
2111 if (!IS_NEWLINE(eptr))
2112 {
2113 if (md->partial != 0 && /* Partial matching requested, */
2114 eptr + 1 >= md->end_subject && /* at most one code unit left, */
2115 NLBLOCK->nltype == NLTYPE_FIXED && /* fixed newline ... */
2116 NLBLOCK->nllen == 2 && /* ... of length two, */
2117 UCHAR21TEST(eptr) == NLBLOCK->nl[0]) /* and its first unit is here */
2118 {
2119 md->hitend = TRUE;
2120 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2121 }
2122 RRETURN(MATCH_NOMATCH);
2123 }
2124 }
2125 else
2126 {
2127 if (md->noteol) RRETURN(MATCH_NOMATCH);
2128 SCHECK_PARTIAL();
2129 }
2130 ecode++;
2131 break;
2132
2133 /* Not multiline mode: assert before a terminating newline or before end of
2134 subject unless noteol is set. */
2135
2136 case OP_DOLL:
2137 if (md->noteol) RRETURN(MATCH_NOMATCH);
2138 if (!md->endonly) goto ASSERT_NL_OR_EOS; /* Same test as OP_EODN below */
2139
2140 /* ... else fall through for endonly */
2141
2142 /* End of subject assertion (\z) */
2143
2144 case OP_EOD:
2145 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2146 SCHECK_PARTIAL();
2147 ecode++;
2148 break;
2149
2150 /* End of subject or ending \n assertion (\Z) */
2151
2152 case OP_EODN:
2153 ASSERT_NL_OR_EOS:
2154 if (eptr < md->end_subject && /* Fail unless at the end, or at a newline */
2155 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) /* that ends it */
2156 {
2157 if (md->partial != 0 && /* Partial matching requested, */
2158 eptr + 1 >= md->end_subject && /* at most one code unit left, */
2159 NLBLOCK->nltype == NLTYPE_FIXED && /* fixed newline ... */
2160 NLBLOCK->nllen == 2 && /* ... of length two, */
2161 UCHAR21TEST(eptr) == NLBLOCK->nl[0]) /* and its first unit is here */
2162 {
2163 md->hitend = TRUE;
2164 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2165 }
2166 RRETURN(MATCH_NOMATCH);
2167 }
2168
2169 /* Either at end of string or \n before end. */
2170
2171 SCHECK_PARTIAL();
2172 ecode++;
2173 break;
2174
2175 /* Word boundary assertions */
2176
2177 case OP_NOT_WORD_BOUNDARY:
2178 case OP_WORD_BOUNDARY:
2179 {
2180
2181 /* Find out if the previous and current characters are "word" characters.
2182 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2183 be "non-word" characters. Remember the earliest consulted character for
2184 partial matching. */
2185
2186 #ifdef SUPPORT_UTF
2187 if (utf)
2188 {
2189 /* Get status of previous character */
2190
2191 if (eptr == md->start_subject) prev_is_word = FALSE; else
2192 {
2193 PCRE_PUCHAR lastptr = eptr - 1;
2194 BACKCHAR(lastptr);
2195 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2196 GETCHAR(c, lastptr);
2197 #ifdef SUPPORT_UCP
2198 if (md->use_ucp)
2199 {
2200 if (c == '_') prev_is_word = TRUE; else
2201 {
2202 int cat = UCD_CATEGORY(c);
2203 prev_is_word = (cat == ucp_L || cat == ucp_N);
2204 }
2205 }
2206 else
2207 #endif
2208 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2209 }
2210
2211 /* Get status of next character; at the end of the subject there is none,
2212 so it counts as a non-word character after the partial-match check. */
2213 if (eptr >= md->end_subject)
2214 {
2215 SCHECK_PARTIAL();
2216 cur_is_word = FALSE;
2217 }
2218 else
2219 {
2220 GETCHAR(c, eptr);
2221 #ifdef SUPPORT_UCP
2222 if (md->use_ucp)
2223 {
2224 if (c == '_') cur_is_word = TRUE; else
2225 {
2226 int cat = UCD_CATEGORY(c);
2227 cur_is_word = (cat == ucp_L || cat == ucp_N);
2228 }
2229 }
2230 else
2231 #endif
2232 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2233 }
2234 }
2235 else
2236 #endif
2237
2238 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2239 consistency with the behaviour of \w we do use it in this case. */
2240
2241 {
2242 /* Get status of previous character */
2243
2244 if (eptr == md->start_subject) prev_is_word = FALSE; else
2245 {
2246 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2247 #ifdef SUPPORT_UCP
2248 if (md->use_ucp)
2249 {
2250 c = eptr[-1];
2251 if (c == '_') prev_is_word = TRUE; else
2252 {
2253 int cat = UCD_CATEGORY(c);
2254 prev_is_word = (cat == ucp_L || cat == ucp_N);
2255 }
2256 }
2257 else
2258 #endif
2259 prev_is_word = MAX_255(eptr[-1])
2260 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2261 }
2262
2263 /* Get status of next character */
2264
2265 if (eptr >= md->end_subject)
2266 {
2267 SCHECK_PARTIAL();
2268 cur_is_word = FALSE;
2269 }
2270 else
2271 #ifdef SUPPORT_UCP
2272 if (md->use_ucp)
2273 {
2274 c = *eptr;
2275 if (c == '_') cur_is_word = TRUE; else
2276 {
2277 int cat = UCD_CATEGORY(c);
2278 cur_is_word = (cat == ucp_L || cat == ucp_N);
2279 }
2280 }
2281 else
2282 #endif
2283 cur_is_word = MAX_255(*eptr)
2284 && ((md->ctypes[*eptr] & ctype_word) != 0);
2285 }
2286
2287 /* Now see if the situation is what we want: OP_WORD_BOUNDARY fails when
2288 both sides have the same status, OP_NOT_WORD_BOUNDARY when they differ. */
2289 if ((*ecode++ == OP_WORD_BOUNDARY)?
2290 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2291 RRETURN(MATCH_NOMATCH);
2292 }
2293 break;
2294
2295 /* Match any single character type except newline; have to take care with
2296 CRLF newlines and partial matching. */
2297
2298 case OP_ANY:
2299 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2300 if (md->partial != 0 && /* Partial matching requested, */
2301 eptr + 1 >= md->end_subject && /* at most one code unit left, */
2302 NLBLOCK->nltype == NLTYPE_FIXED && /* fixed newline ... */
2303 NLBLOCK->nllen == 2 && /* ... of length two, */
2304 UCHAR21TEST(eptr) == NLBLOCK->nl[0]) /* and its first unit is here */
2305 {
2306 md->hitend = TRUE;
2307 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2308 }
2309
2310 /* Fall through */
2311
2312 /* Match any single character whatsoever. */
2313
2314 case OP_ALLANY:
2315 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2316 { /* not be updated before SCHECK_PARTIAL. */
2317 SCHECK_PARTIAL();
2318 RRETURN(MATCH_NOMATCH);
2319 }
2320 eptr++;
2321 #ifdef SUPPORT_UTF
2322 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2323 #endif
2324 ecode++;
2325 break;
2326
2327 /* Consume exactly one code unit, whatever it is: no UTF character assembly
2328 and no newline test is done here, so PCRE_DOTALL makes no difference. */
2329
2330 case OP_ANYBYTE:
2331 if (eptr < md->end_subject)
2332   {
2333   eptr++;
2334   ecode++;
2335   break;
2336   }
2337 SCHECK_PARTIAL();           /* eptr must not have been advanced before this */
2338 RRETURN(MATCH_NOMATCH);
2339
2340 case OP_NOT_DIGIT:
2341 if (eptr < md->end_subject) { GETCHARINCTEST(c, eptr); }
2342 else
2343   {
2344   SCHECK_PARTIAL();
2345   RRETURN(MATCH_NOMATCH);
2346   }
2347 /* Fail on a digit; ctypes is consulted only for values below 256. */
2348 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2349 if (c < 256)
2350 #endif
2351   {
2352   if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2353   }
2354 ecode++;
2355 break;
2356
2357 case OP_DIGIT:
2358 if (eptr < md->end_subject) { GETCHARINCTEST(c, eptr); }
2359 else
2360   {
2361   SCHECK_PARTIAL();
2362   RRETURN(MATCH_NOMATCH);
2363   }
2364
2365 /* Fail unless the ctypes entry has the digit bit. The range test is compiled
2366 in only when SUPPORT_UTF is defined or COMPILE_PCRE8 is not. */
2367 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2368 if (c > 255) RRETURN(MATCH_NOMATCH);
2369 #endif
2370 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2371 ecode++;
2372 break;
2373
2374 case OP_NOT_WHITESPACE:
2375 if (eptr < md->end_subject) { GETCHARINCTEST(c, eptr); }
2376 else
2377   {
2378   SCHECK_PARTIAL();
2379   RRETURN(MATCH_NOMATCH);
2380   }
2381 /* Fail on a space character; ctypes is consulted only below 256. */
2382 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2383 if (c < 256)
2384 #endif
2385   {
2386   if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2387   }
2388 ecode++;
2389 break;
2390
2391 case OP_WHITESPACE:
2392 if (eptr < md->end_subject) { GETCHARINCTEST(c, eptr); }
2393 else
2394   {
2395   SCHECK_PARTIAL();
2396   RRETURN(MATCH_NOMATCH);
2397   }
2398
2399 /* Fail unless the ctypes entry has the space bit. The range test is compiled
2400 in only when SUPPORT_UTF is defined or COMPILE_PCRE8 is not. */
2401 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2402 if (c > 255) RRETURN(MATCH_NOMATCH);
2403 #endif
2404 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2405 ecode++;
2406 break;
2407
2408 case OP_NOT_WORDCHAR:
2409 if (eptr < md->end_subject) { GETCHARINCTEST(c, eptr); }
2410 else
2411   {
2412   SCHECK_PARTIAL();
2413   RRETURN(MATCH_NOMATCH);
2414   }
2415 /* Fail on a word character; ctypes is consulted only below 256. */
2416 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2417 if (c < 256)
2418 #endif
2419   {
2420   if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
2421   }
2422 ecode++;
2423 break;
2424
2425 case OP_WORDCHAR:
2426 if (eptr < md->end_subject) { GETCHARINCTEST(c, eptr); }
2427 else
2428   {
2429   SCHECK_PARTIAL();
2430   RRETURN(MATCH_NOMATCH);
2431   }
2432
2433 /* Fail unless the ctypes entry has the word bit. The range test is compiled
2434 in only when SUPPORT_UTF is defined or COMPILE_PCRE8 is not. */
2435 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2436 if (c > 255) RRETURN(MATCH_NOMATCH);
2437 #endif
2438 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
2439 ecode++;
2440 break;
2441
2442 case OP_ANYNL: /* Match one newline sequence */
2443 if (eptr >= md->end_subject)
2444 {
2445 SCHECK_PARTIAL();
2446 RRETURN(MATCH_NOMATCH);
2447 }
2448 GETCHARINCTEST(c, eptr);
2449 switch(c)
2450 {
2451 default: RRETURN(MATCH_NOMATCH);
2452
2453 case CHAR_CR: /* CR alone, or CR followed by LF taken as one unit */
2454 if (eptr >= md->end_subject)
2455 {
2456 SCHECK_PARTIAL();
2457 }
2458 else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++;
2459 break;
2460
2461 case CHAR_LF:
2462 break;
2463
2464 case CHAR_VT:
2465 case CHAR_FF:
2466 case CHAR_NEL:
2467 #ifndef EBCDIC
2468 case 0x2028:
2469 case 0x2029:
2470 #endif /* Not EBCDIC */
2471 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); /* Rejected when bsr_anycrlf is set */
2472 break;
2473 }
2474 ecode++;
2475 break;
2476
2477 case OP_NOT_HSPACE:
2478 if (eptr < md->end_subject) { GETCHARINCTEST(c, eptr); }
2479 else
2480   {
2481   SCHECK_PARTIAL();
2482   RRETURN(MATCH_NOMATCH);
2483   }
2484 switch(c)
2485   {
2486   default: break;                        /* Anything else is accepted */
2487   HSPACE_CASES: RRETURN(MATCH_NOMATCH);  /* Byte and multibyte cases */
2488   }
2489 ecode++;
2490 break;
2491
2492 case OP_HSPACE:
2493 if (eptr < md->end_subject) { GETCHARINCTEST(c, eptr); }
2494 else
2495   {
2496   SCHECK_PARTIAL();
2497   RRETURN(MATCH_NOMATCH);
2498   }
2499 switch(c)
2500   {
2501   default: RRETURN(MATCH_NOMATCH);       /* Anything else is rejected */
2502   HSPACE_CASES: break;                   /* Byte and multibyte cases */
2503   }
2504 ecode++;
2505 break;
2506
2507 case OP_NOT_VSPACE:
2508 if (eptr < md->end_subject) { GETCHARINCTEST(c, eptr); }
2509 else
2510   {
2511   SCHECK_PARTIAL();
2512   RRETURN(MATCH_NOMATCH);
2513   }
2514 switch(c)
2515   {
2516   default: break;                        /* Anything else is accepted */
2517   VSPACE_CASES: RRETURN(MATCH_NOMATCH);  /* Listed by the VSPACE_CASES macro */
2518   }
2519 ecode++;
2520 break;
2521
2522 case OP_VSPACE:
2523 if (eptr < md->end_subject) { GETCHARINCTEST(c, eptr); }
2524 else
2525   {
2526   SCHECK_PARTIAL();
2527   RRETURN(MATCH_NOMATCH);
2528   }
2529 switch(c)
2530   {
2531   default: RRETURN(MATCH_NOMATCH);       /* Anything else is rejected */
2532   VSPACE_CASES: break;                   /* Listed by the VSPACE_CASES macro */
2533   }
2534 ecode++;
2535 break;
2536
2537 #ifdef SUPPORT_UCP
2538 /* Check the next character by Unicode property. We will get here only
2539 if the support is in the binary; otherwise a compile-time error occurs. */
2540
2541 case OP_PROP:
2542 case OP_NOTPROP:
2543 if (eptr >= md->end_subject)
2544 {
2545 SCHECK_PARTIAL();
2546 RRETURN(MATCH_NOMATCH);
2547 }
2548 GETCHARINCTEST(c, eptr);
2549 {
2550 const pcre_uint32 *cp;
2551 const ucd_record *prop = GET_UCD(c);
2552
2553 switch(ecode[1]) /* Property type; ecode[2] is its value where one is used */
2554 {
2555 case PT_ANY:
2556 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2557 break;
2558
2559 case PT_LAMP:
2560 if ((prop->chartype == ucp_Lu ||
2561 prop->chartype == ucp_Ll ||
2562 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2563 RRETURN(MATCH_NOMATCH);
2564 break;
2565
2566 case PT_GC:
2567 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2568 RRETURN(MATCH_NOMATCH);
2569 break;
2570
2571 case PT_PC:
2572 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2573 RRETURN(MATCH_NOMATCH);
2574 break;
2575
2576 case PT_SC:
2577 if ((ecode[2] != prop->script) == (op == OP_PROP))
2578 RRETURN(MATCH_NOMATCH);
2579 break;
2580
2581 /* These are specials */
2582
2583 case PT_ALNUM:
2584 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2585 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2586 RRETURN(MATCH_NOMATCH);
2587 break;
2588
2589 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2590 which means that Perl space and POSIX space are now identical. PCRE
2591 was changed at release 8.34. */
2592
2593 case PT_SPACE: /* Perl space */
2594 case PT_PXSPACE: /* POSIX space */
2595 switch(c)
2596 {
2597 HSPACE_CASES:
2598 VSPACE_CASES:
2599 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2600 break;
2601
2602 default:
2603 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
2604 (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
2605 break;
2606 }
2607 break;
2608
2609 case PT_WORD:
2610 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2611 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2612 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2613 RRETURN(MATCH_NOMATCH);
2614 break;
2615
2616 case PT_CLIST:
2617 cp = PRIV(ucd_caseless_sets) + ecode[2]; /* Start of the list to scan */
2618 for (;;)
2619 {
2620 if (c < *cp) /* Stop: c is below this entry, so it was not found */
2621 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2622 if (c == *cp++) /* Found c in the list */
2623 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2624 }
2625 break;
2626
2627 case PT_UCNC:
2628 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2629 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2630 c >= 0xe000) == (op == OP_NOTPROP))
2631 RRETURN(MATCH_NOMATCH);
2632 break;
2633
2634 /* This should never occur */
2635
2636 default:
2637 RRETURN(PCRE_ERROR_INTERNAL);
2638 }
2639
2640 ecode += 3; /* Opcode plus the two data items read above */
2641 }
2642 break;
2643
2644 /* Match an extended Unicode sequence. We will get here only if the support
2645 is in the binary; otherwise a compile-time error occurs. */
2646
2647 case OP_EXTUNI:
2648 if (eptr >= md->end_subject)
2649 {
2650 SCHECK_PARTIAL();
2651 RRETURN(MATCH_NOMATCH);
2652 }
2653 else
2654 {
2655 int lgb, rgb; /* UCD_GRAPHBREAK values of the left and right characters */
2656 GETCHARINCTEST(c, eptr);
2657 lgb = UCD_GRAPHBREAK(c);
2658 while (eptr < md->end_subject)
2659 {
2660 int len = 1;
2661 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2662 rgb = UCD_GRAPHBREAK(c);
2663 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; /* Bit clear: stop here */
2664 lgb = rgb;
2665 eptr += len; /* Take the character into the sequence */
2666 }
2667 }
2668 CHECK_PARTIAL();
2669 ecode++;
2670 break;
2671 #endif /* SUPPORT_UCP */
2672
2673
2674 /* Match a back reference, possibly repeatedly. Look past the end of the
2675 item to see if there is repeat information following. The code is similar
2676 to that for character classes, but repeated for efficiency. Then obey
2677 similar code to character type repeats - written out again for speed.
2678 However, if the referenced string is the empty string, always treat
2679 it as matched, any number of times (otherwise there could be infinite
2680 loops). If the reference is unset, there are two possibilities:
2681
2682 (a) In the default, Perl-compatible state, set the length negative;
2683 this ensures that every attempt at a match fails. We can't just fail
2684 here, because of the possibility of quantifiers with zero minima.
2685
2686 (b) If the JavaScript compatibility flag is set, set the length to zero
2687 so that the back reference matches an empty string.
2688
2689 Otherwise, set the length to the length of what was matched by the
2690 referenced subpattern.
2691
2692 The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
2693 or to a non-duplicated named group. For a duplicated named group, OP_DNREF
2694 and OP_DNREFI are used. In this case we must scan the list of groups to
2695 which the name refers, and use the first one that is set. */
2696
2697 case OP_DNREF:
2698 case OP_DNREFI:
2699 caseless = op == OP_DNREFI;
2700 {
2701 int count = GET2(ecode, 1+IMM2_SIZE); /* Number of name table entries to scan */
2702 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
2703 ecode += 1 + 2*IMM2_SIZE;
2704
2705 /* Setting the default length first and initializing 'offset' avoids
2706 compiler warnings in the REF_REPEAT code. */
2707
2708 length = (md->jscript_compat)? 0 : -1;
2709 offset = 0;
2710
2711 while (count-- > 0)
2712 {
2713 offset = GET2(slot, 0) << 1; /* Doubled group number from the entry */
2714 if (offset < offset_top && md->offset_vector[offset] >= 0)
2715 {
2716 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2717 break;
2718 }
2719 slot += md->name_entry_size;
2720 }
2721 }
2722 goto REF_REPEAT;
2723
2724 case OP_REF:
2725 case OP_REFI:
2726 caseless = op == OP_REFI;
2727 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2728 ecode += 1 + IMM2_SIZE;
2729 if (offset >= offset_top || md->offset_vector[offset] < 0)
2730 length = (md->jscript_compat)? 0 : -1;
2731 else
2732 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2733
2734 /* Set up for repetition, or handle the non-repeated case */
2735
2736 REF_REPEAT:
2737 switch (*ecode)
2738 {
2739 case OP_CRSTAR:
2740 case OP_CRMINSTAR:
2741 case OP_CRPLUS:
2742 case OP_CRMINPLUS:
2743 case OP_CRQUERY:
2744 case OP_CRMINQUERY:
2745 c = *ecode++ - OP_CRSTAR;
2746 minimize = (c & 1) != 0;
2747 min = rep_min[c]; /* Pick up values from tables; */
2748 max = rep_max[c]; /* zero for max => infinity */
2749 if (max == 0) max = INT_MAX;
2750 break;
2751
2752 case OP_CRRANGE:
2753 case OP_CRMINRANGE:
2754 minimize = (*ecode == OP_CRMINRANGE);
2755 min = GET2(ecode, 1);
2756 max = GET2(ecode, 1 + IMM2_SIZE);
2757 if (max == 0) max = INT_MAX;
2758 ecode += 1 + 2 * IMM2_SIZE;
2759 break;
2760
2761 default: /* No repeat follows */
2762 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2763 {
2764 if (length == -2) eptr = md->end_subject; /* Partial match */
2765 CHECK_PARTIAL();
2766 RRETURN(MATCH_NOMATCH);
2767 }
2768 eptr += length;
2769 continue; /* With the main loop */
2770 }
2771
2772 /* Handle repeated back references. If the length of the reference is
2773 zero, just continue with the main loop. If the length is negative, it
2774 means the reference is unset in non-JavaScript-compatible mode. If the
2775 minimum is zero, we can continue at the same level without recursion. For
2776 any other minimum, carrying on will result in NOMATCH. */
2777
2778 if (length == 0) continue;
2779 if (length < 0 && min == 0) continue;
2780
2781 /* First, ensure the minimum number of matches are present. We get back
2782 the length of the reference string explicitly rather than passing the
2783 address of eptr, so that eptr can be a register variable. */
2784
2785 for (i = 1; i <= min; i++)
2786 {
2787 int slength;
2788 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2789 {
2790 if (slength == -2) eptr = md->end_subject; /* Partial match */
2791 CHECK_PARTIAL();
2792 RRETURN(MATCH_NOMATCH);
2793 }
2794 eptr += slength;
2795 }
2796
2797 /* If min = max, continue at the same level without recursion.
2798 They are not both allowed to be zero. */
2799
2800 if (min == max) continue;
2801
2802 /* If minimizing, keep trying and advancing the pointer */
2803
2804 if (minimize)
2805 {
2806 for (fi = min;; fi++)
2807 {
2808 int slength;
2809 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14); /* Rest of pattern first */
2810 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2811 if (fi >= max) RRETURN(MATCH_NOMATCH);
2812 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2813 {
2814 if (slength == -2) eptr = md->end_subject; /* Partial match */
2815 CHECK_PARTIAL();
2816 RRETURN(MATCH_NOMATCH);
2817 }
2818 eptr += slength;
2819 }
2820 /* Control never gets here */
2821 }
2822
2823 /* If maximizing, find the longest string and work backwards */
2824
2825 else
2826 {
2827 pp = eptr; /* Position after the minimum repeats; do not back up past it */
2828 for (i = min; i < max; i++)
2829 {
2830 int slength;
2831 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2832 {
2833 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2834 the soft partial matching case. */
2835
2836 if (slength == -2 && md->partial != 0 &&
2837 md->end_subject > md->start_used_ptr)
2838 {
2839 md->hitend = TRUE;
2840 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2841 }
2842 break;
2843 }
2844 eptr += slength;
2845 }
2846
2847 while (eptr >= pp)
2848 {
2849 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2850 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2851 eptr -= length; /* Give back one repeat, stepping by the reference length */
2852 }
2853 RRETURN(MATCH_NOMATCH);
2854 }
2855 /* Control never gets here */
2856
2857 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2858 used when all the characters in the class have values in the range 0-255,
2859 and either the matching is caseful, or the characters are in the range
2860 0-127 when UTF-8 processing is enabled. The only difference between
2861 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2862 encountered.
2863
2864 First, look past the end of the item to see if there is repeat information
2865 following. Then obey similar code to character type repeats - written out
2866 again for speed. */
2867
2868 case OP_NCLASS:
2869 case OP_CLASS:
2870 {
2871 /* The data variable is saved across frames, so the byte map needs to
2872 be stored there. */
2873 #define BYTE_MAP ((pcre_uint8 *)data)
2874 data = ecode + 1; /* Save for matching */
2875 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2876
2877 switch (*ecode)
2878 {
2879 case OP_CRSTAR:
2880 case OP_CRMINSTAR:
2881 case OP_CRPLUS:
2882 case OP_CRMINPLUS:
2883 case OP_CRQUERY:
2884 case OP_CRMINQUERY:
2885 case OP_CRPOSSTAR:
2886 case OP_CRPOSPLUS:
2887 case OP_CRPOSQUERY:
2888 c = *ecode++ - OP_CRSTAR;
2889 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
2890 else possessive = TRUE;
2891 min = rep_min[c]; /* Pick up values from tables; */
2892 max = rep_max[c]; /* zero for max => infinity */
2893 if (max == 0) max = INT_MAX;
2894 break;
2895
2896 case OP_CRRANGE:
2897 case OP_CRMINRANGE:
2898 case OP_CRPOSRANGE:
2899 minimize = (*ecode == OP_CRMINRANGE);
2900 possessive = (*ecode == OP_CRPOSRANGE);
2901 min = GET2(ecode, 1);
2902 max = GET2(ecode, 1 + IMM2_SIZE);
2903 if (max == 0) max = INT_MAX;
2904 ecode += 1 + 2 * IMM2_SIZE;
2905 break;
2906
2907 default: /* No repeat follows */
2908 min = max = 1;
2909 break;
2910 }
2911
2912 /* First, ensure the minimum number of matches are present. */
2913
2914 #ifdef SUPPORT_UTF
2915 if (utf)
2916 {
2917 for (i = 1; i <= min; i++)
2918 {
2919 if (eptr >= md->end_subject)
2920 {
2921 SCHECK_PARTIAL();
2922 RRETURN(MATCH_NOMATCH);
2923 }
2924 GETCHARINC(c, eptr);
2925 if (c > 255)
2926 {
2927 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2928 }
2929 else
2930 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2931 }
2932 }
2933 else
2934 #endif
2935 /* Not UTF mode */
2936 {
2937 for (i = 1; i <= min; i++)
2938 {
2939 if (eptr >= md->end_subject)
2940 {
2941 SCHECK_PARTIAL();
2942 RRETURN(MATCH_NOMATCH);
2943 }
2944 c = *eptr++;
2945 #ifndef COMPILE_PCRE8
2946 if (c > 255)
2947 {
2948 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2949 }
2950 else
2951 #endif
2952 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2953 }
2954 }
2955
2956 /* If max == min we can continue with the main loop without the
2957 need to recurse. */
2958
2959 if (min == max) continue;
2960
2961 /* If minimizing, keep testing the rest of the expression and advancing
2962 the pointer while it matches the class. */
2963
2964 if (minimize)
2965 {
2966 #ifdef SUPPORT_UTF
2967 if (utf)
2968 {
2969 for (fi = min;; fi++)
2970 {
2971 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2972 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2973 if (fi >= max) RRETURN(MATCH_NOMATCH);
2974 if (eptr >= md->end_subject)
2975 {
2976 SCHECK_PARTIAL();
2977 RRETURN(MATCH_NOMATCH);
2978 }
2979 GETCHARINC(c, eptr);
2980 if (c > 255)
2981 {
2982 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2983 }
2984 else
2985 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2986 }
2987 }
2988 else
2989 #endif
2990 /* Not UTF mode */
2991 {
2992 for (fi = min;; fi++)
2993 {
2994 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2995 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2996 if (fi >= max) RRETURN(MATCH_NOMATCH);
2997 if (eptr >= md->end_subject)
2998 {
2999 SCHECK_PARTIAL();
3000 RRETURN(MATCH_NOMATCH);
3001 }
3002 c = *eptr++;
3003 #ifndef COMPILE_PCRE8
3004 if (c > 255)
3005 {
3006 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3007 }
3008 else
3009 #endif
3010 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3011 }
3012 }
3013 /* Control never gets here */
3014 }
3015
3016 /* If maximizing, find the longest possible run, then work backwards. */
3017
3018 else
3019 {
3020 pp = eptr;
3021
3022 #ifdef SUPPORT_UTF
3023 if (utf)
3024 {
3025 for (i = min; i < max; i++)
3026 {
3027 int len = 1;
3028 if (eptr >= md->end_subject)
3029 {
3030 SCHECK_PARTIAL();
3031 break;
3032 }
3033 GETCHARLEN(c, eptr, len);
3034 if (c > 255)
3035 {
3036 if (op == OP_CLASS) break;
3037 }
3038 else
3039 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3040 eptr += len;
3041 }
3042
3043 if (possessive) continue; /* No backtracking */
3044
3045 for (;;)
3046 {
3047 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3048 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3049 if (eptr-- == pp) break; /* Stop if tried at original pos */
3050 BACKCHAR(eptr);
3051 }
3052 }
3053 else
3054 #endif
3055 /* Not UTF mode */
3056 {
3057 for (i = min; i < max; i++)
3058 {
3059 if (eptr >= md->end_subject)
3060 {
3061 SCHECK_PARTIAL();
3062 break;
3063 }
3064 c = *eptr;
3065 #ifndef COMPILE_PCRE8
3066 if (c > 255)
3067 {
3068 if (op == OP_CLASS) break;
3069 }
3070 else
3071 #endif
3072 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3073 eptr++;
3074 }
3075
3076 if (possessive) continue; /* No backtracking */
3077
3078 while (eptr >= pp)
3079 {
3080 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3081 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3082 eptr--;
3083 }
3084 }
3085
3086 RRETURN(MATCH_NOMATCH);
3087 }
3088 #undef BYTE_MAP
3089 }
3090 /* Control never gets here */
3091
3092
3093 /* Match an extended character class. In the 8-bit library, this opcode is
3094 encountered only when UTF-8 mode is supported. In the 16-bit and
3095 32-bit libraries, codepoints greater than 255 may be encountered even when
3096 UTF is not supported. */
3097
3098 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3099 case OP_XCLASS:
3100 {
3101 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3102 ecode += GET(ecode, 1); /* Advance past the item */
3103
3104 switch (*ecode)
3105 {
3106 case OP_CRSTAR:
3107 case OP_CRMINSTAR:
3108 case OP_CRPLUS:
3109 case OP_CRMINPLUS:
3110 case OP_CRQUERY:
3111 case OP_CRMINQUERY:
3112 case OP_CRPOSSTAR:
3113 case OP_CRPOSPLUS:
3114 case OP_CRPOSQUERY:
3115 c = *ecode++ - OP_CRSTAR;
3116 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
3117 else possessive = TRUE;
3118 min = rep_min[c]; /* Pick up values from tables; */
3119 max = rep_max[c]; /* zero for max => infinity */
3120 if (max == 0) max = INT_MAX;
3121 break;
3122
3123 case OP_CRRANGE:
3124 case OP_CRMINRANGE:
3125 case OP_CRPOSRANGE:
3126 minimize = (*ecode == OP_CRMINRANGE);
3127 possessive = (*ecode == OP_CRPOSRANGE);
3128 min = GET2(ecode, 1);
3129 max = GET2(ecode, 1 + IMM2_SIZE);
3130 if (max == 0) max = INT_MAX;
3131 ecode += 1 + 2 * IMM2_SIZE;
3132 break;
3133
3134 default: /* No repeat follows */
3135 min = max = 1;
3136 break;
3137 }
3138
3139 /* First, ensure the minimum number of matches are present. */
3140
3141 for (i = 1; i <= min; i++)
3142 {
3143 if (eptr >= md->end_subject)
3144 {
3145 SCHECK_PARTIAL();
3146 RRETURN(MATCH_NOMATCH);
3147 }
3148 GETCHARINCTEST(c, eptr);
3149 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3150 }
3151
3152 /* If max == min we can continue with the main loop without the
3153 need to recurse. */
3154
3155 if (min == max) continue;
3156
3157 /* If minimizing, keep testing the rest of the expression and advancing
3158 the pointer while it matches the class. */
3159
3160 if (minimize)
3161 {
3162 for (fi = min;; fi++)
3163 {
3164 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3165 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3166 if (fi >= max) RRETURN(MATCH_NOMATCH);
3167 if (eptr >= md->end_subject)
3168 {
3169 SCHECK_PARTIAL();
3170 RRETURN(MATCH_NOMATCH);
3171 }
3172 GETCHARINCTEST(c, eptr);
3173 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3174 }
3175 /* Control never gets here */
3176 }
3177
3178 /* If maximizing, find the longest possible run, then work backwards. */
3179
3180 else
3181 {
3182 pp = eptr;
3183 for (i = min; i < max; i++)
3184 {
3185 int len = 1;
3186 if (eptr >= md->end_subject)
3187 {
3188 SCHECK_PARTIAL();
3189 break;
3190 }
3191 #ifdef SUPPORT_UTF
3192 GETCHARLENTEST(c, eptr, len);
3193 #else
3194 c = *eptr;
3195 #endif
3196 if (!PRIV(xclass)(c, data, utf)) break;
3197 eptr += len;
3198 }
3199
3200 if (possessive) continue; /* No backtracking */
3201
3202 for(;;)
3203 {
3204 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3205 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3206 if (eptr-- == pp) break; /* Stop if tried at original pos */
3207 #ifdef SUPPORT_UTF
3208 if (utf) BACKCHAR(eptr);
3209 #endif
3210 }
3211 RRETURN(MATCH_NOMATCH);
3212 }
3213
3214 /* Control never gets here */
3215 }
3216 #endif /* End of XCLASS */
3217
3218 /* Match a single character, casefully */
3219
3220 case OP_CHAR:
3221 #ifdef SUPPORT_UTF
3222 if (utf)
3223 {
3224 length = 1;
3225 ecode++;
3226 GETCHARLEN(fc, ecode, length);
3227 if (length > md->end_subject - eptr)
3228 {
3229 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3230 RRETURN(MATCH_NOMATCH);
3231 }
3232 while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH);
3233 }
3234 else
3235 #endif
3236 /* Not UTF mode */
3237 {
3238 if (md->end_subject - eptr < 1)
3239 {
3240 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3241 RRETURN(MATCH_NOMATCH);
3242 }
3243 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3244 ecode += 2;
3245 }
3246 break;
3247
3248 /* Match a single character, caselessly. If we are at the end of the
3249 subject, give up immediately. */
3250
3251 case OP_CHARI:
3252 if (eptr >= md->end_subject)
3253 {
3254 SCHECK_PARTIAL();
3255 RRETURN(MATCH_NOMATCH);
3256 }
3257
3258 #ifdef SUPPORT_UTF
3259 if (utf)
3260 {
3261 length = 1;
3262 ecode++;
3263 GETCHARLEN(fc, ecode, length);
3264
3265 /* If the pattern character's value is < 128, we have only one byte, and
3266 we know that its other case must also be one byte long, so we can use the
3267 fast lookup table. We know that there is at least one byte left in the
3268 subject. */
3269
3270 if (fc < 128)
3271 {
3272 pcre_uint32 cc = UCHAR21(eptr);
3273 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3274 ecode++;
3275 eptr++;
3276 }
3277
3278 /* Otherwise we must pick up the subject character. Note that we cannot
3279 use the value of "length" to check for sufficient bytes left, because the
3280 other case of the character may have more or fewer bytes. */
3281
3282 else
3283 {
3284 pcre_uint32 dc;
3285 GETCHARINC(dc, eptr);
3286 ecode += length;
3287
3288 /* If we have Unicode property support, we can use it to test the other
3289 case of the character, if there is one. */
3290
3291 if (fc != dc)
3292 {
3293 #ifdef SUPPORT_UCP
3294 if (dc != UCD_OTHERCASE(fc))
3295 #endif
3296 RRETURN(MATCH_NOMATCH);
3297 }
3298 }
3299 }
3300 else
3301 #endif /* SUPPORT_UTF */
3302
3303 /* Not UTF mode */
3304 {
3305 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3306 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3307 eptr++;
3308 ecode += 2;
3309 }
3310 break;
3311
3312 /* Match a single character repeatedly. */
3313
3314 case OP_EXACT:
3315 case OP_EXACTI:
3316 min = max = GET2(ecode, 1);
3317 ecode += 1 + IMM2_SIZE;
3318 goto REPEATCHAR;
3319
3320 case OP_POSUPTO:
3321 case OP_POSUPTOI:
3322 possessive = TRUE;
3323 /* Fall through */
3324
3325 case OP_UPTO:
3326 case OP_UPTOI:
3327 case OP_MINUPTO:
3328 case OP_MINUPTOI:
3329 min = 0;
3330 max = GET2(ecode, 1);
3331 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3332 ecode += 1 + IMM2_SIZE;
3333 goto REPEATCHAR;
3334
3335 case OP_POSSTAR:
3336 case OP_POSSTARI:
3337 possessive = TRUE;
3338 min = 0;
3339 max = INT_MAX;
3340 ecode++;
3341 goto REPEATCHAR;
3342
3343 case OP_POSPLUS:
3344 case OP_POSPLUSI:
3345 possessive = TRUE;
3346 min = 1;
3347 max = INT_MAX;
3348 ecode++;
3349 goto REPEATCHAR;
3350
3351 case OP_POSQUERY:
3352 case OP_POSQUERYI:
3353 possessive = TRUE;
3354 min = 0;
3355 max = 1;
3356 ecode++;
3357 goto REPEATCHAR;
3358
3359 case OP_STAR:
3360 case OP_STARI:
3361 case OP_MINSTAR:
3362 case OP_MINSTARI:
3363 case OP_PLUS:
3364 case OP_PLUSI:
3365 case OP_MINPLUS:
3366 case OP_MINPLUSI:
3367 case OP_QUERY:
3368 case OP_QUERYI:
3369 case OP_MINQUERY:
3370 case OP_MINQUERYI:
3371 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3372 minimize = (c & 1) != 0;
3373 min = rep_min[c]; /* Pick up values from tables; */
3374 max = rep_max[c]; /* zero for max => infinity */
3375 if (max == 0) max = INT_MAX;
3376
3377 /* Common code for all repeated single-character matches. We first check
3378 for the minimum number of characters. If the minimum equals the maximum, we
3379 are done. Otherwise, if minimizing, check the rest of the pattern for a
3380 match; if there isn't one, advance up to the maximum, one character at a
3381 time.
3382
3383 If maximizing, advance up to the maximum number of matching characters,
3384 until eptr is past the end of the maximum run. If possessive, we are
3385 then done (no backing up). Otherwise, match at this position; anything
3386 other than no match is immediately returned. For nomatch, back up one
3387 character, unless we are matching \R and the last thing matched was
3388 \r\n, in which case, back up two bytes. When we reach the first optional
3389 character position, we can save stack by doing a tail recurse.
3390
3391 The various UTF/non-UTF and caseful/caseless cases are handled separately,
3392 for speed. */
3393
3394 REPEATCHAR:
3395 #ifdef SUPPORT_UTF
3396 if (utf)
3397 {
3398 length = 1;
3399 charptr = ecode;
3400 GETCHARLEN(fc, ecode, length);
3401 ecode += length;
3402
3403 /* Handle multibyte character matching specially here. There is
3404 support for caseless matching if UCP support is present. */
3405
3406 if (length > 1)
3407 {
3408 #ifdef SUPPORT_UCP
3409 pcre_uint32 othercase;
3410 if (op >= OP_STARI && /* Caseless */
3411 (othercase = UCD_OTHERCASE(fc)) != fc)
3412 oclength = PRIV(ord2utf)(othercase, occhars);
3413 else oclength = 0;
3414 #endif /* SUPPORT_UCP */
3415
3416 for (i = 1; i <= min; i++)
3417 {
3418 if (eptr <= md->end_subject - length &&
3419 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3420 #ifdef SUPPORT_UCP
3421 else if (oclength > 0 &&
3422 eptr <= md->end_subject - oclength &&
3423 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3424 #endif /* SUPPORT_UCP */
3425 else
3426 {
3427 CHECK_PARTIAL();
3428 RRETURN(MATCH_NOMATCH);
3429 }
3430 }
3431
3432 if (min == max) continue;
3433
3434 if (minimize)
3435 {
3436 for (fi = min;; fi++)
3437 {
3438 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3439 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3440 if (fi >= max) RRETURN(MATCH_NOMATCH);
3441 if (eptr <= md->end_subject - length &&
3442 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3443 #ifdef SUPPORT_UCP
3444 else if (oclength > 0 &&
3445 eptr <= md->end_subject - oclength &&
3446 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3447 #endif /* SUPPORT_UCP */
3448 else
3449 {
3450 CHECK_PARTIAL();
3451 RRETURN(MATCH_NOMATCH);
3452 }
3453 }
3454 /* Control never gets here */
3455 }
3456
3457 else /* Maximize */
3458 {
3459 pp = eptr;
3460 for (i = min; i < max; i++)
3461 {
3462 if (eptr <= md->end_subject - length &&
3463 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3464 #ifdef SUPPORT_UCP
3465 else if (oclength > 0 &&
3466 eptr <= md->end_subject - oclength &&
3467 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3468 #endif /* SUPPORT_UCP */
3469 else
3470 {
3471 CHECK_PARTIAL();
3472 break;
3473 }
3474 }
3475
3476 if (possessive) continue; /* No backtracking */
3477 for(;;)
3478 {
3479 if (eptr == pp) goto TAIL_RECURSE;
3480 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3481 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3482 #ifdef SUPPORT_UCP
3483 eptr--;
3484 BACKCHAR(eptr);
3485 #else /* without SUPPORT_UCP */
3486 eptr -= length;
3487 #endif /* SUPPORT_UCP */
3488 }
3489 }
3490 /* Control never gets here */
3491 }
3492
3493 /* If the length of a UTF-8 character is 1, we fall through here, and
3494 obey the code as for non-UTF-8 characters below, though in this case the
3495 value of fc will always be < 128. */
3496 }
3497 else
3498 #endif /* SUPPORT_UTF */
3499 /* When not in UTF-8 mode, load a single-byte character. */
3500 fc = *ecode++;
3501
3502 /* The value of fc at this point is always one character, though we may
3503 or may not be in UTF mode. The code is duplicated for the caseless and
3504 caseful cases, for speed, since matching characters is likely to be quite
3505 common. First, ensure the minimum number of matches are present. If min =
3506 max, continue at the same level without recursing. Otherwise, if
3507 minimizing, keep trying the rest of the expression and advancing one
3508 matching character if failing, up to the maximum. Alternatively, if
3509 maximizing, find the maximum number of characters and work backwards. */
3510
3511 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3512 max, (char *)eptr));
3513
3514 if (op >= OP_STARI) /* Caseless */
3515 {
3516 #ifdef COMPILE_PCRE8
3517 /* fc must be < 128 if UTF is enabled. */
3518 foc = md->fcc[fc];
3519 #else
3520 #ifdef SUPPORT_UTF
3521 #ifdef SUPPORT_UCP
3522 if (utf && fc > 127)
3523 foc = UCD_OTHERCASE(fc);
3524 #else
3525 if (utf && fc > 127)
3526 foc = fc;
3527 #endif /* SUPPORT_UCP */
3528 else
3529 #endif /* SUPPORT_UTF */
3530 foc = TABLE_GET(fc, md->fcc, fc);
3531 #endif /* COMPILE_PCRE8 */
3532
3533 for (i = 1; i <= min; i++)
3534 {
3535 pcre_uint32 cc; /* Faster than pcre_uchar */
3536 if (eptr >= md->end_subject)
3537 {
3538 SCHECK_PARTIAL();
3539 RRETURN(MATCH_NOMATCH);
3540 }
3541 cc = UCHAR21TEST(eptr);
3542 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3543 eptr++;
3544 }
3545 if (min == max) continue;
3546 if (minimize)
3547 {
3548 for (fi = min;; fi++)
3549 {
3550 pcre_uint32 cc; /* Faster than pcre_uchar */
3551 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3552 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3553 if (fi >= max) RRETURN(MATCH_NOMATCH);
3554 if (eptr >= md->end_subject)
3555 {
3556 SCHECK_PARTIAL();
3557 RRETURN(MATCH_NOMATCH);
3558 }
3559 cc = UCHAR21TEST(eptr);
3560 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3561 eptr++;
3562 }
3563 /* Control never gets here */
3564 }
3565 else /* Maximize */
3566 {
3567 pp = eptr;
3568 for (i = min; i < max; i++)
3569 {
3570 pcre_uint32 cc; /* Faster than pcre_uchar */
3571 if (eptr >= md->end_subject)
3572 {
3573 SCHECK_PARTIAL();
3574 break;
3575 }
3576 cc = UCHAR21TEST(eptr);
3577 if (fc != cc && foc != cc) break;
3578 eptr++;
3579 }
3580 if (possessive) continue; /* No backtracking */
3581 for (;;)
3582 {
3583 if (eptr == pp) goto TAIL_RECURSE;
3584 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3585 eptr--;
3586 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3587 }
3588 /* Control never gets here */
3589 }
3590 }
3591
3592 /* Caseful comparisons (includes all multi-byte characters) */
3593
3594 else
3595 {
3596 for (i = 1; i <= min; i++)
3597 {
3598 if (eptr >= md->end_subject)
3599 {
3600 SCHECK_PARTIAL();
3601 RRETURN(MATCH_NOMATCH);
3602 }
3603 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3604 }
3605
3606 if (min == max) continue;
3607
3608 if (minimize)
3609 {
3610 for (fi = min;; fi++)
3611 {
3612 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3613 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3614 if (fi >= max) RRETURN(MATCH_NOMATCH);
3615 if (eptr >= md->end_subject)
3616 {
3617 SCHECK_PARTIAL();
3618 RRETURN(MATCH_NOMATCH);
3619 }
3620 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3621 }
3622 /* Control never gets here */
3623 }
3624 else /* Maximize */
3625 {
3626 pp = eptr;
3627 for (i = min; i < max; i++)
3628 {
3629 if (eptr >= md->end_subject)
3630 {
3631 SCHECK_PARTIAL();
3632 break;
3633 }
3634 if (fc != UCHAR21TEST(eptr)) break;
3635 eptr++;
3636 }
3637 if (possessive) continue; /* No backtracking */
3638 for (;;)
3639 {
3640 if (eptr == pp) goto TAIL_RECURSE;
3641 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3642 eptr--;
3643 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3644 }
3645 /* Control never gets here */
3646 }
3647 }
3648 /* Control never gets here */
3649
3650 /* Match a negated single one-byte character. The character we are
3651 checking can be multibyte. */
3652
3653 case OP_NOT:
3654 case OP_NOTI:
3655 if (eptr >= md->end_subject)
3656 {
3657 SCHECK_PARTIAL();
3658 RRETURN(MATCH_NOMATCH);
3659 }
3660 #ifdef SUPPORT_UTF
3661 if (utf)
3662 {
3663 register pcre_uint32 ch, och;
3664
3665 ecode++;
3666 GETCHARINC(ch, ecode);
3667 GETCHARINC(c, eptr);
3668
3669 if (op == OP_NOT)
3670 {
3671 if (ch == c) RRETURN(MATCH_NOMATCH);
3672 }
3673 else
3674 {
3675 #ifdef SUPPORT_UCP
3676 if (ch > 127)
3677 och = UCD_OTHERCASE(ch);
3678 #else
3679 if (ch > 127)
3680 och = ch;
3681 #endif /* SUPPORT_UCP */
3682 else
3683 och = TABLE_GET(ch, md->fcc, ch);
3684 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3685 }
3686 }
3687 else
3688 #endif
3689 {
3690 register pcre_uint32 ch = ecode[1];
3691 c = *eptr++;
3692 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3693 RRETURN(MATCH_NOMATCH);
3694 ecode += 2;
3695 }
3696 break;
3697
3698 /* Match a negated single one-byte character repeatedly. This is almost a
3699 repeat of the code for a repeated single character, but I haven't found a
3700 nice way of commoning these up that doesn't require a test of the
3701 positive/negative option for each character match. Maybe that wouldn't add
3702 very much to the time taken, but character matching *is* what this is all
3703 about... */
3704
3705 case OP_NOTEXACT:
3706 case OP_NOTEXACTI:
3707 min = max = GET2(ecode, 1);
3708 ecode += 1 + IMM2_SIZE;
3709 goto REPEATNOTCHAR;
3710
3711 case OP_NOTUPTO:
3712 case OP_NOTUPTOI:
3713 case OP_NOTMINUPTO:
3714 case OP_NOTMINUPTOI:
3715 min = 0;
3716 max = GET2(ecode, 1);
3717 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3718 ecode += 1 + IMM2_SIZE;
3719 goto REPEATNOTCHAR;
3720
3721 case OP_NOTPOSSTAR:
3722 case OP_NOTPOSSTARI:
3723 possessive = TRUE;
3724 min = 0;
3725 max = INT_MAX;
3726 ecode++;
3727 goto REPEATNOTCHAR;
3728
3729 case OP_NOTPOSPLUS:
3730 case OP_NOTPOSPLUSI:
3731 possessive = TRUE;
3732 min = 1;
3733 max = INT_MAX;
3734 ecode++;
3735 goto REPEATNOTCHAR;
3736
3737 case OP_NOTPOSQUERY:
3738 case OP_NOTPOSQUERYI:
3739 possessive = TRUE;
3740 min = 0;
3741 max = 1;
3742 ecode++;
3743 goto REPEATNOTCHAR;
3744
3745 case OP_NOTPOSUPTO:
3746 case OP_NOTPOSUPTOI:
3747 possessive = TRUE;
3748 min = 0;
3749 max = GET2(ecode, 1);
3750 ecode += 1 + IMM2_SIZE;
3751 goto REPEATNOTCHAR;
3752
3753 case OP_NOTSTAR:
3754 case OP_NOTSTARI:
3755 case OP_NOTMINSTAR:
3756 case OP_NOTMINSTARI:
3757 case OP_NOTPLUS:
3758 case OP_NOTPLUSI:
3759 case OP_NOTMINPLUS:
3760 case OP_NOTMINPLUSI:
3761 case OP_NOTQUERY:
3762 case OP_NOTQUERYI:
3763 case OP_NOTMINQUERY:
3764 case OP_NOTMINQUERYI:
3765 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3766 minimize = (c & 1) != 0;
3767 min = rep_min[c]; /* Pick up values from tables; */
3768 max = rep_max[c]; /* zero for max => infinity */
3769 if (max == 0) max = INT_MAX;
3770
3771 /* Common code for all repeated single-byte matches. */
3772
3773 REPEATNOTCHAR:
3774 GETCHARINCTEST(fc, ecode);
3775
3776 /* The code is duplicated for the caseless and caseful cases, for speed,
3777 since matching characters is likely to be quite common. First, ensure the
3778 minimum number of matches are present. If min = max, continue at the same
3779 level without recursing. Otherwise, if minimizing, keep trying the rest of
3780 the expression and advancing one matching character if failing, up to the
3781 maximum. Alternatively, if maximizing, find the maximum number of
3782 characters and work backwards. */
3783
3784 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3785 max, (char *)eptr));
3786
3787 if (op >= OP_NOTSTARI) /* Caseless */
3788 {
3789 #ifdef SUPPORT_UTF
3790 #ifdef SUPPORT_UCP
3791 if (utf && fc > 127)
3792 foc = UCD_OTHERCASE(fc);
3793 #else
3794 if (utf && fc > 127)
3795 foc = fc;
3796 #endif /* SUPPORT_UCP */
3797 else
3798 #endif /* SUPPORT_UTF */
3799 foc = TABLE_GET(fc, md->fcc, fc);
3800
3801 #ifdef SUPPORT_UTF
3802 if (utf)
3803 {
3804 register pcre_uint32 d;
3805 for (i = 1; i <= min; i++)
3806 {
3807 if (eptr >= md->end_subject)
3808 {
3809 SCHECK_PARTIAL();
3810 RRETURN(MATCH_NOMATCH);
3811 }
3812 GETCHARINC(d, eptr);
3813 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3814 }
3815 }
3816 else
3817 #endif /* SUPPORT_UTF */
3818 /* Not UTF mode */
3819 {
3820 for (i = 1; i <= min; i++)
3821 {
3822 if (eptr >= md->end_subject)
3823 {
3824 SCHECK_PARTIAL();
3825 RRETURN(MATCH_NOMATCH);
3826 }
3827 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3828 eptr++;
3829 }
3830 }
3831
3832 if (min == max) continue;
3833
3834 if (minimize)
3835 {
3836 #ifdef SUPPORT_UTF
3837 if (utf)
3838 {
3839 register pcre_uint32 d;
3840 for (fi = min;; fi++)
3841 {
3842 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3843 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3844 if (fi >= max) RRETURN(MATCH_NOMATCH);
3845 if (eptr >= md->end_subject)
3846 {
3847 SCHECK_PARTIAL();
3848 RRETURN(MATCH_NOMATCH);
3849 }
3850 GETCHARINC(d, eptr);
3851 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3852 }
3853 }
3854 else
3855 #endif /*SUPPORT_UTF */
3856 /* Not UTF mode */
3857 {
3858 for (fi = min;; fi++)
3859 {
3860 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3861 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3862 if (fi >= max) RRETURN(MATCH_NOMATCH);
3863 if (eptr >= md->end_subject)
3864 {
3865 SCHECK_PARTIAL();
3866 RRETURN(MATCH_NOMATCH);
3867 }
3868 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3869 eptr++;
3870 }
3871 }
3872 /* Control never gets here */
3873 }
3874
3875 /* Maximize case */
3876
3877 else
3878 {
3879 pp = eptr;
3880
3881 #ifdef SUPPORT_UTF
3882 if (utf)
3883 {
3884 register pcre_uint32 d;
3885 for (i = min; i < max; i++)
3886 {
3887 int len = 1;
3888 if (eptr >= md->end_subject)
3889 {
3890 SCHECK_PARTIAL();
3891 break;
3892 }
3893 GETCHARLEN(d, eptr, len);
3894 if (fc == d || (unsigned int)foc == d) break;
3895 eptr += len;
3896 }
3897 if (possessive) continue; /* No backtracking */
3898 for(;;)
3899 {
3900 if (eptr == pp) goto TAIL_RECURSE;
3901 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3902 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3903 eptr--;
3904 BACKCHAR(eptr);
3905 }
3906 }
3907 else
3908 #endif /* SUPPORT_UTF */
3909 /* Not UTF mode */
3910 {
3911 for (i = min; i < max; i++)
3912 {
3913 if (eptr >= md->end_subject)
3914 {
3915 SCHECK_PARTIAL();
3916 break;
3917 }
3918 if (fc == *eptr || foc == *eptr) break;
3919 eptr++;
3920 }
3921 if (possessive) continue; /* No backtracking */
3922 for (;;)
3923 {
3924 if (eptr == pp) goto TAIL_RECURSE;
3925 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3926 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3927 eptr--;
3928 }
3929 }
3930 /* Control never gets here */
3931 }
3932 }
3933
3934 /* Caseful comparisons */
3935
3936 else
3937 {
3938 #ifdef SUPPORT_UTF
3939 if (utf)
3940 {
3941 register pcre_uint32 d;
3942 for (i = 1; i <= min; i++)
3943 {
3944 if (eptr >= md->end_subject)
3945 {
3946 SCHECK_PARTIAL();
3947 RRETURN(MATCH_NOMATCH);
3948 }
3949 GETCHARINC(d, eptr);
3950 if (fc == d) RRETURN(MATCH_NOMATCH);
3951 }
3952 }
3953 else
3954 #endif
3955 /* Not UTF mode */
3956 {
3957 for (i = 1; i <= min; i++)
3958 {
3959 if (eptr >= md->end_subject)
3960 {
3961 SCHECK_PARTIAL();
3962 RRETURN(MATCH_NOMATCH);
3963 }
3964 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3965 }
3966 }
3967
3968 if (min == max) continue;
3969
3970 if (minimize)
3971 {
3972 #ifdef SUPPORT_UTF
3973 if (utf)
3974 {
3975 register pcre_uint32 d;
3976 for (fi = min;; fi++)
3977 {
3978 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3979 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3980 if (fi >= max) RRETURN(MATCH_NOMATCH);
3981 if (eptr >= md->end_subject)
3982 {
3983 SCHECK_PARTIAL();
3984 RRETURN(MATCH_NOMATCH);
3985 }
3986 GETCHARINC(d, eptr);
3987 if (fc == d) RRETURN(MATCH_NOMATCH);
3988 }
3989 }
3990 else
3991 #endif
3992 /* Not UTF mode */
3993 {
3994 for (fi = min;; fi++)
3995 {
3996 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3997 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3998 if (fi >= max) RRETURN(MATCH_NOMATCH);
3999 if (eptr >= md->end_subject)
4000 {
4001 SCHECK_PARTIAL();
4002 RRETURN(MATCH_NOMATCH);
4003 }
4004 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
4005 }
4006 }
4007 /* Control never gets here */
4008 }
4009
4010 /* Maximize case */
4011
4012 else
4013 {
4014 pp = eptr;
4015
4016 #ifdef SUPPORT_UTF
4017 if (utf)
4018 {
4019 register pcre_uint32 d;
4020 for (i = min; i < max; i++)
4021 {
4022 int len = 1;
4023 if (eptr >= md->end_subject)
4024 {
4025 SCHECK_PARTIAL();
4026 break;
4027 }
4028 GETCHARLEN(d, eptr, len);
4029 if (fc == d) break;
4030 eptr += len;
4031 }
4032 if (possessive) continue; /* No backtracking */
4033 for(;;)
4034 {
4035 if (eptr == pp) goto TAIL_RECURSE;
4036 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
4037 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4038 eptr--;
4039 BACKCHAR(eptr);
4040 }
4041 }
4042 else
4043 #endif
4044 /* Not UTF mode */
4045 {
4046 for (i = min; i < max; i++)
4047 {
4048 if (eptr >= md->end_subject)
4049 {
4050 SCHECK_PARTIAL();
4051 break;
4052 }
4053 if (fc == *eptr) break;
4054 eptr++;
4055 }
4056 if (possessive) continue; /* No backtracking */
4057 for (;;)
4058 {
4059 if (eptr == pp) goto TAIL_RECURSE;
4060 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4061 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4062 eptr--;
4063 }
4064 }
4065 /* Control never gets here */
4066 }
4067 }
4068 /* Control never gets here */
4069
4070 /* Match a single character type repeatedly; several different opcodes
4071 share code. This is very similar to the code for single characters, but we
4072 repeat it in the interests of efficiency. */
4073
4074 case OP_TYPEEXACT:
4075 min = max = GET2(ecode, 1);
4076 minimize = TRUE;
4077 ecode += 1 + IMM2_SIZE;
4078 goto REPEATTYPE;
4079
4080 case OP_TYPEUPTO:
4081 case OP_TYPEMINUPTO:
4082 min = 0;
4083 max = GET2(ecode, 1);
4084 minimize = *ecode == OP_TYPEMINUPTO;
4085 ecode += 1 + IMM2_SIZE;
4086 goto REPEATTYPE;
4087
4088 case OP_TYPEPOSSTAR:
4089 possessive = TRUE;
4090 min = 0;
4091 max = INT_MAX;
4092 ecode++;
4093 goto REPEATTYPE;
4094
4095 case OP_TYPEPOSPLUS:
4096 possessive = TRUE;
4097 min = 1;
4098 max = INT_MAX;
4099 ecode++;
4100 goto REPEATTYPE;
4101
4102 case OP_TYPEPOSQUERY:
4103 possessive = TRUE;
4104 min = 0;
4105 max = 1;
4106 ecode++;
4107 goto REPEATTYPE;
4108
4109 case OP_TYPEPOSUPTO:
4110 possessive = TRUE;
4111 min = 0;
4112 max = GET2(ecode, 1);
4113 ecode += 1 + IMM2_SIZE;
4114 goto REPEATTYPE;
4115
4116 case OP_TYPESTAR:
4117 case OP_TYPEMINSTAR:
4118 case OP_TYPEPLUS:
4119 case OP_TYPEMINPLUS:
4120 case OP_TYPEQUERY:
4121 case OP_TYPEMINQUERY:
4122 c = *ecode++ - OP_TYPESTAR;
4123 minimize = (c & 1) != 0;
4124 min = rep_min[c]; /* Pick up values from tables; */
4125 max = rep_max[c]; /* zero for max => infinity */
4126 if (max == 0) max = INT_MAX;
4127
4128 /* Common code for all repeated single character type matches. Note that
4129 in UTF-8 mode, '.' matches a character of any length, but for the other
4130 character types, the valid characters are all one-byte long. */
4131
4132 REPEATTYPE:
4133 ctype = *ecode++; /* Code for the character type */
4134
4135 #ifdef SUPPORT_UCP
4136 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4137 {
4138 prop_fail_result = ctype == OP_NOTPROP;
4139 prop_type = *ecode++;
4140 prop_value = *ecode++;
4141 }
4142 else prop_type = -1;
4143 #endif
4144
4145 /* First, ensure the minimum number of matches are present. Use inline
4146 code for maximizing the speed, and do the type test once at the start
4147 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4148 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4149 and single-bytes. */
4150
4151 if (min > 0)
4152 {
4153 #ifdef SUPPORT_UCP
4154 if (prop_type >= 0)
4155 {
4156 switch(prop_type)
4157 {
4158 case PT_ANY:
4159 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4160 for (i = 1; i <= min; i++)
4161 {
4162 if (eptr >= md->end_subject)
4163 {
4164 SCHECK_PARTIAL();
4165 RRETURN(MATCH_NOMATCH);
4166 }
4167 GETCHARINCTEST(c, eptr);
4168 }
4169 break;
4170
4171 case PT_LAMP:
4172 for (i = 1; i <= min; i++)
4173 {
4174 int chartype;
4175 if (eptr >= md->end_subject)
4176 {
4177 SCHECK_PARTIAL();
4178 RRETURN(MATCH_NOMATCH);
4179 }
4180 GETCHARINCTEST(c, eptr);
4181 chartype = UCD_CHARTYPE(c);
4182 if ((chartype == ucp_Lu ||
4183 chartype == ucp_Ll ||
4184 chartype == ucp_Lt) == prop_fail_result)
4185 RRETURN(MATCH_NOMATCH);
4186 }
4187 break;
4188
4189 case PT_GC:
4190 for (i = 1; i <= min; i++)
4191 {
4192 if (eptr >= md->end_subject)
4193 {
4194 SCHECK_PARTIAL();
4195 RRETURN(MATCH_NOMATCH);
4196 }
4197 GETCHARINCTEST(c, eptr);
4198 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4199 RRETURN(MATCH_NOMATCH);
4200 }
4201 break;
4202
4203 case PT_PC:
4204 for (i = 1; i <= min; i++)
4205 {
4206 if (eptr >= md->end_subject)
4207 {
4208 SCHECK_PARTIAL();
4209 RRETURN(MATCH_NOMATCH);
4210 }
4211 GETCHARINCTEST(c, eptr);
4212 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4213 RRETURN(MATCH_NOMATCH);
4214 }
4215 break;
4216
4217 case PT_SC:
4218 for (i = 1; i <= min; i++)
4219 {
4220 if (eptr >= md->end_subject)
4221 {
4222 SCHECK_PARTIAL();
4223 RRETURN(MATCH_NOMATCH);
4224 }
4225 GETCHARINCTEST(c, eptr);
4226 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4227 RRETURN(MATCH_NOMATCH);
4228 }
4229 break;
4230
4231 case PT_ALNUM:
4232 for (i = 1; i <= min; i++)
4233 {
4234 int category;
4235 if (eptr >= md->end_subject)
4236 {
4237 SCHECK_PARTIAL();
4238 RRETURN(MATCH_NOMATCH);
4239 }
4240 GETCHARINCTEST(c, eptr);
4241 category = UCD_CATEGORY(c);
4242 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4243 RRETURN(MATCH_NOMATCH);
4244 }
4245 break;
4246
4247 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4248 which means that Perl space and POSIX space are now identical. PCRE
4249 was changed at release 8.34. */
4250
4251 case PT_SPACE: /* Perl space */
4252 case PT_PXSPACE: /* POSIX space */
4253 for (i = 1; i <= min; i++)
4254 {
4255 if (eptr >= md->end_subject)
4256 {
4257 SCHECK_PARTIAL();
4258 RRETURN(MATCH_NOMATCH);
4259 }
4260 GETCHARINCTEST(c, eptr);
4261 switch(c)
4262 {
4263 HSPACE_CASES:
4264 VSPACE_CASES:
4265 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4266 break;
4267
4268 default:
4269 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
4270 RRETURN(MATCH_NOMATCH);
4271 break;
4272 }
4273 }
4274 break;
4275
4276 case PT_WORD:
4277 for (i = 1; i <= min; i++)
4278 {
4279 int category;
4280 if (eptr >= md->end_subject)
4281 {
4282 SCHECK_PARTIAL();
4283 RRETURN(MATCH_NOMATCH);
4284 }
4285 GETCHARINCTEST(c, eptr);
4286 category = UCD_CATEGORY(c);
4287 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4288 == prop_fail_result)
4289 RRETURN(MATCH_NOMATCH);
4290 }
4291 break;
4292
4293 case PT_CLIST:
4294 for (i = 1; i <= min; i++)
4295 {
4296 const pcre_uint32 *cp;
4297 if (eptr >= md->end_subject)
4298 {
4299 SCHECK_PARTIAL();
4300 RRETURN(MATCH_NOMATCH);
4301 }
4302 GETCHARINCTEST(c, eptr);
4303 cp = PRIV(ucd_caseless_sets) + prop_value;
4304 for (;;)
4305 {
4306 if (c < *cp)
4307 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4308 if (c == *cp++)
4309 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4310 }
4311 }
4312 break;
4313
4314 case PT_UCNC:
4315 for (i = 1; i <= min; i++)
4316 {
4317 if (eptr >= md->end_subject)
4318 {
4319 SCHECK_PARTIAL();
4320 RRETURN(MATCH_NOMATCH);
4321 }
4322 GETCHARINCTEST(c, eptr);
4323 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4324 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4325 c >= 0xe000) == prop_fail_result)
4326 RRETURN(MATCH_NOMATCH);
4327 }
4328 break;
4329
4330 /* This should not occur */
4331
4332 default:
4333 RRETURN(PCRE_ERROR_INTERNAL);
4334 }
4335 }
4336
4337 /* Match extended Unicode sequences. We will get here only if the
4338 support is in the binary; otherwise a compile-time error occurs. */
4339
4340 else if (ctype == OP_EXTUNI)
4341 {
4342 for (i = 1; i <= min; i++)
4343 {
4344 if (eptr >= md->end_subject)
4345 {
4346 SCHECK_PARTIAL();
4347 RRETURN(MATCH_NOMATCH);
4348 }
4349 else
4350 {
4351 int lgb, rgb;
4352 GETCHARINCTEST(c, eptr);
4353 lgb = UCD_GRAPHBREAK(c);
4354 while (eptr < md->end_subject)
4355 {
4356 int len = 1;
4357 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4358 rgb = UCD_GRAPHBREAK(c);
4359 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4360 lgb = rgb;
4361 eptr += len;
4362 }
4363 }
4364 CHECK_PARTIAL();
4365 }
4366 }
4367
4368 else
4369 #endif /* SUPPORT_UCP */
4370
4371 /* Handle all other cases when UTF mode is in effect */
4372
4373 #ifdef SUPPORT_UTF
4374 if (utf) switch(ctype)
4375 {
4376 case OP_ANY:
4377 for (i = 1; i <= min; i++)
4378 {
4379 if (eptr >= md->end_subject)
4380 {
4381 SCHECK_PARTIAL();
4382 RRETURN(MATCH_NOMATCH);
4383 }
4384 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4385 if (md->partial != 0 &&
4386 eptr + 1 >= md->end_subject &&
4387 NLBLOCK->nltype == NLTYPE_FIXED &&
4388 NLBLOCK->nllen == 2 &&
4389 UCHAR21(eptr) == NLBLOCK->nl[0])
4390 {
4391 md->hitend = TRUE;
4392 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4393 }
4394 eptr++;
4395 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4396 }
4397 break;
4398
4399 case OP_ALLANY:
4400 for (i = 1; i <= min; i++)
4401 {
4402 if (eptr >= md->end_subject)
4403 {
4404 SCHECK_PARTIAL();
4405 RRETURN(MATCH_NOMATCH);
4406 }
4407 eptr++;
4408 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4409 }
4410 break;
4411
4412 case OP_ANYBYTE:
4413 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4414 eptr += min;
4415 break;
4416
4417 case OP_ANYNL:
4418 for (i = 1; i <= min; i++)
4419 {
4420 if (eptr >= md->end_subject)
4421 {
4422 SCHECK_PARTIAL();
4423 RRETURN(MATCH_NOMATCH);
4424 }
4425 GETCHARINC(c, eptr);
4426 switch(c)
4427 {
4428 default: RRETURN(MATCH_NOMATCH);
4429
4430 case CHAR_CR:
4431 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
4432 break;
4433
4434 case CHAR_LF:
4435 break;
4436
4437 case CHAR_VT:
4438 case CHAR_FF:
4439 case CHAR_NEL:
4440 #ifndef EBCDIC
4441 case 0x2028:
4442 case 0x2029:
4443 #endif /* Not EBCDIC */
4444 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4445 break;
4446 }
4447 }
4448 break;
4449
4450 case OP_NOT_HSPACE:
4451 for (i = 1; i <= min; i++)
4452 {
4453 if (eptr >= md->end_subject)
4454 {
4455 SCHECK_PARTIAL();
4456 RRETURN(MATCH_NOMATCH);
4457 }
4458 GETCHARINC(c, eptr);
4459 switch(c)
4460 {
4461 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4462 default: break;
4463 }
4464 }
4465 break;
4466
4467 case OP_HSPACE:
4468 for (i = 1; i <= min; i++)
4469 {
4470 if (eptr >= md->end_subject)
4471 {
4472 SCHECK_PARTIAL();
4473 RRETURN(MATCH_NOMATCH);
4474 }
4475 GETCHARINC(c, eptr);
4476 switch(c)
4477 {
4478 HSPACE_CASES: break; /* Byte and multibyte cases */
4479 default: RRETURN(MATCH_NOMATCH);
4480 }
4481 }
4482 break;
4483
4484 case OP_NOT_VSPACE:
4485 for (i = 1; i <= min; i++)
4486 {
4487 if (eptr >= md->end_subject)
4488 {
4489 SCHECK_PARTIAL();
4490 RRETURN(MATCH_NOMATCH);
4491 }
4492 GETCHARINC(c, eptr);
4493 switch(c)
4494 {
4495 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4496 default: break;
4497 }
4498 }
4499 break;
4500
4501 case OP_VSPACE:
4502 for (i = 1; i <= min; i++)
4503 {
4504 if (eptr >= md->end_subject)
4505 {
4506 SCHECK_PARTIAL();
4507 RRETURN(MATCH_NOMATCH);
4508 }
4509 GETCHARINC(c, eptr);
4510 switch(c)
4511 {
4512 VSPACE_CASES: break;
4513 default: RRETURN(MATCH_NOMATCH);
4514 }
4515 }
4516 break;
4517
4518 case OP_NOT_DIGIT:
4519 for (i = 1; i <= min; i++)
4520 {
4521 if (eptr >= md->end_subject)
4522 {
4523 SCHECK_PARTIAL();
4524 RRETURN(MATCH_NOMATCH);
4525 }
4526 GETCHARINC(c, eptr);
4527 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4528 RRETURN(MATCH_NOMATCH);
4529 }
4530 break;
4531
4532 case OP_DIGIT:
4533 for (i = 1; i <= min; i++)
4534 {
4535 pcre_uint32 cc;
4536 if (eptr >= md->end_subject)
4537 {
4538 SCHECK_PARTIAL();
4539 RRETURN(MATCH_NOMATCH);
4540 }
4541 cc = UCHAR21(eptr);
4542 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4543 RRETURN(MATCH_NOMATCH);
4544 eptr++;
4545 /* No need to skip more bytes - we know it's a 1-byte character */
4546 }
4547 break;
4548
4549 case OP_NOT_WHITESPACE:
4550 for (i = 1; i <= min; i++)
4551 {
4552 pcre_uint32 cc;
4553 if (eptr >= md->end_subject)
4554 {
4555 SCHECK_PARTIAL();
4556 RRETURN(MATCH_NOMATCH);
4557 }
4558 cc = UCHAR21(eptr);
4559 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4560 RRETURN(MATCH_NOMATCH);
4561 eptr++;
4562 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4563 }
4564 break;
4565
4566 case OP_WHITESPACE:
4567 for (i = 1; i <= min; i++)
4568 {
4569 pcre_uint32 cc;
4570 if (eptr >= md->end_subject)
4571 {
4572 SCHECK_PARTIAL();
4573 RRETURN(MATCH_NOMATCH);
4574 }
4575 cc = UCHAR21(eptr);
4576 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4577 RRETURN(MATCH_NOMATCH);
4578 eptr++;
4579 /* No need to skip more bytes - we know it's a 1-byte character */
4580 }
4581 break;
4582
4583 case OP_NOT_WORDCHAR:
4584 for (i = 1; i <= min; i++)
4585 {
4586 pcre_uint32 cc;
4587 if (eptr >= md->end_subject)
4588 {
4589 SCHECK_PARTIAL();
4590 RRETURN(MATCH_NOMATCH);
4591 }
4592 cc = UCHAR21(eptr);
4593 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4594 RRETURN(MATCH_NOMATCH);
4595 eptr++;
4596 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4597 }
4598 break;
4599
4600 case OP_WORDCHAR:
4601 for (i = 1; i <= min; i++)
4602 {
4603 pcre_uint32 cc;
4604 if (eptr >= md->end_subject)
4605 {
4606 SCHECK_PARTIAL();
4607 RRETURN(MATCH_NOMATCH);
4608 }
4609 cc = UCHAR21(eptr);
4610 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4611 RRETURN(MATCH_NOMATCH);
4612 eptr++;
4613 /* No need to skip more bytes - we know it's a 1-byte character */
4614 }
4615 break;
4616
4617 default:
4618 RRETURN(PCRE_ERROR_INTERNAL);
4619 } /* End switch(ctype) */
4620
4621 else
4622 #endif /* SUPPORT_UTF */
4623
4624 /* Code for the non-UTF case for minimum matching of operators other
4625 than OP_PROP and OP_NOTPROP. */
4626
4627 switch(ctype)
4628 {
4629 case OP_ANY:
4630 for (i = 1; i <= min; i++)
4631 {
4632 if (eptr >= md->end_subject)
4633 {
4634 SCHECK_PARTIAL();
4635 RRETURN(MATCH_NOMATCH);
4636 }
4637 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4638 if (md->partial != 0 &&
4639 eptr + 1 >= md->end_subject &&
4640 NLBLOCK->nltype == NLTYPE_FIXED &&
4641 NLBLOCK->nllen == 2 &&
4642 *eptr == NLBLOCK->nl[0])
4643 {
4644 md->hitend = TRUE;
4645 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4646 }
4647 eptr++;
4648 }
4649 break;
4650
4651 case OP_ALLANY:
4652 if (eptr > md->end_subject - min)
4653 {
4654 SCHECK_PARTIAL();
4655 RRETURN(MATCH_NOMATCH);
4656 }
4657 eptr += min;
4658 break;
4659
4660 case OP_ANYBYTE:
4661 if (eptr > md->end_subject - min)
4662 {
4663 SCHECK_PARTIAL();
4664 RRETURN(MATCH_NOMATCH);
4665 }
4666 eptr += min;
4667 break;
4668
4669 case OP_ANYNL:
4670 for (i = 1; i <= min; i++)
4671 {
4672 if (eptr >= md->end_subject)
4673 {
4674 SCHECK_PARTIAL();
4675 RRETURN(MATCH_NOMATCH);
4676 }
4677 switch(*eptr++)
4678 {
4679 default: RRETURN(MATCH_NOMATCH);
4680
4681 case CHAR_CR:
4682 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4683 break;
4684
4685 case CHAR_LF:
4686 break;
4687
4688 case CHAR_VT:
4689 case CHAR_FF:
4690 case CHAR_NEL:
4691 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4692 case 0x2028:
4693 case 0x2029:
4694 #endif
4695 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4696 break;
4697 }
4698 }
4699 break;
4700
4701 case OP_NOT_HSPACE:
4702 for (i = 1; i <= min; i++)
4703 {
4704 if (eptr >= md->end_subject)
4705 {
4706 SCHECK_PARTIAL();
4707 RRETURN(MATCH_NOMATCH);
4708 }
4709 switch(*eptr++)
4710 {
4711 default: break;
4712 HSPACE_BYTE_CASES:
4713 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4714 HSPACE_MULTIBYTE_CASES:
4715 #endif
4716 RRETURN(MATCH_NOMATCH);
4717 }
4718 }
4719 break;
4720
4721 case OP_HSPACE:
4722 for (i = 1; i <= min; i++)
4723 {
4724 if (eptr >= md->end_subject)
4725 {
4726 SCHECK_PARTIAL();
4727 RRETURN(MATCH_NOMATCH);
4728 }
4729 switch(*eptr++)
4730 {
4731 default: RRETURN(MATCH_NOMATCH);
4732 HSPACE_BYTE_CASES:
4733 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4734 HSPACE_MULTIBYTE_CASES:
4735 #endif
4736 break;
4737 }
4738 }
4739 break;
4740
4741 case OP_NOT_VSPACE:
4742 for (i = 1; i <= min; i++)
4743 {
4744 if (eptr >= md->end_subject)
4745 {
4746 SCHECK_PARTIAL();
4747 RRETURN(MATCH_NOMATCH);
4748 }
4749 switch(*eptr++)
4750 {
4751 VSPACE_BYTE_CASES:
4752 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4753 VSPACE_MULTIBYTE_CASES:
4754 #endif
4755 RRETURN(MATCH_NOMATCH);
4756 default: break;
4757 }
4758 }
4759 break;
4760
4761 case OP_VSPACE:
4762 for (i = 1; i <= min; i++)
4763 {
4764 if (eptr >= md->end_subject)
4765 {
4766 SCHECK_PARTIAL();
4767 RRETURN(MATCH_NOMATCH);
4768 }
4769 switch(*eptr++)
4770 {
4771 default: RRETURN(MATCH_NOMATCH);
4772 VSPACE_BYTE_CASES:
4773 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4774 VSPACE_MULTIBYTE_CASES:
4775 #endif
4776 break;
4777 }
4778 }
4779 break;
4780
4781 case OP_NOT_DIGIT:
4782 for (i = 1; i <= min; i++)
4783 {
4784 if (eptr >= md->end_subject)
4785 {
4786 SCHECK_PARTIAL();
4787 RRETURN(MATCH_NOMATCH);
4788 }
4789 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4790 RRETURN(MATCH_NOMATCH);
4791 eptr++;
4792 }
4793 break;
4794
4795 case OP_DIGIT:
4796 for (i = 1; i <= min; i++)
4797 {
4798 if (eptr >= md->end_subject)
4799 {
4800 SCHECK_PARTIAL();
4801 RRETURN(MATCH_NOMATCH);
4802 }
4803 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4804 RRETURN(MATCH_NOMATCH);
4805 eptr++;
4806 }
4807 break;
4808
4809 case OP_NOT_WHITESPACE:
4810 for (i = 1; i <= min; i++)
4811 {
4812 if (eptr >= md->end_subject)
4813 {
4814 SCHECK_PARTIAL();
4815 RRETURN(MATCH_NOMATCH);
4816 }
4817 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4818 RRETURN(MATCH_NOMATCH);
4819 eptr++;
4820 }
4821 break;
4822
4823 case OP_WHITESPACE:
4824 for (i = 1; i <= min; i++)
4825 {
4826 if (eptr >= md->end_subject)
4827 {
4828 SCHECK_PARTIAL();
4829 RRETURN(MATCH_NOMATCH);
4830 }
4831 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4832 RRETURN(MATCH_NOMATCH);
4833 eptr++;
4834 }
4835 break;
4836
4837 case OP_NOT_WORDCHAR:
4838 for (i = 1; i <= min; i++)
4839 {
4840 if (eptr >= md->end_subject)
4841 {
4842 SCHECK_PARTIAL();
4843 RRETURN(MATCH_NOMATCH);
4844 }
4845 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4846 RRETURN(MATCH_NOMATCH);
4847 eptr++;
4848 }
4849 break;
4850
4851 case OP_WORDCHAR:
4852 for (i = 1; i <= min; i++)
4853 {
4854 if (eptr >= md->end_subject)
4855 {
4856 SCHECK_PARTIAL();
4857 RRETURN(MATCH_NOMATCH);
4858 }
4859 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4860 RRETURN(MATCH_NOMATCH);
4861 eptr++;
4862 }
4863 break;
4864
4865 default:
4866 RRETURN(PCRE_ERROR_INTERNAL);
4867 }
4868 }
4869
4870 /* If min = max, continue at the same level without recursing */
4871
4872 if (min == max) continue;
4873
4874 /* If minimizing, we have to test the rest of the pattern before each
4875 subsequent match. Again, separate the UTF case for speed, and also
4876 separate the UCP cases. */
4877
4878 if (minimize)
4879 {
4880 #ifdef SUPPORT_UCP
4881 if (prop_type >= 0)
4882 {
4883 switch(prop_type)
4884 {
4885 case PT_ANY:
4886 for (fi = min;; fi++)
4887 {
4888 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4889 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4890 if (fi >= max) RRETURN(MATCH_NOMATCH);
4891 if (eptr >= md->end_subject)
4892 {
4893 SCHECK_PARTIAL();
4894 RRETURN(MATCH_NOMATCH);
4895 }
4896 GETCHARINCTEST(c, eptr);
4897 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4898 }
4899 /* Control never gets here */
4900
4901 case PT_LAMP:
4902 for (fi = min;; fi++)
4903 {
4904 int chartype;
4905 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4906 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4907 if (fi >= max) RRETURN(MATCH_NOMATCH);
4908 if (eptr >= md->end_subject)
4909 {
4910 SCHECK_PARTIAL();
4911 RRETURN(MATCH_NOMATCH);
4912 }
4913 GETCHARINCTEST(c, eptr);
4914 chartype = UCD_CHARTYPE(c);
4915 if ((chartype == ucp_Lu ||
4916 chartype == ucp_Ll ||
4917 chartype == ucp_Lt) == prop_fail_result)
4918 RRETURN(MATCH_NOMATCH);
4919 }
4920 /* Control never gets here */
4921
4922 case PT_GC:
4923 for (fi = min;; fi++)
4924 {
4925 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4926 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4927 if (fi >= max) RRETURN(MATCH_NOMATCH);
4928 if (eptr >= md->end_subject)
4929 {
4930 SCHECK_PARTIAL();
4931 RRETURN(MATCH_NOMATCH);
4932 }
4933 GETCHARINCTEST(c, eptr);
4934 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4935 RRETURN(MATCH_NOMATCH);
4936 }
4937 /* Control never gets here */
4938
4939 case PT_PC:
4940 for (fi = min;; fi++)
4941 {
4942 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4943 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4944 if (fi >= max) RRETURN(MATCH_NOMATCH);
4945 if (eptr >= md->end_subject)
4946 {
4947 SCHECK_PARTIAL();
4948 RRETURN(MATCH_NOMATCH);
4949 }
4950 GETCHARINCTEST(c, eptr);
4951 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4952 RRETURN(MATCH_NOMATCH);
4953 }
4954 /* Control never gets here */
4955
4956 case PT_SC:
4957 for (fi = min;; fi++)
4958 {
4959 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4960 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4961 if (fi >= max) RRETURN(MATCH_NOMATCH);
4962 if (eptr >= md->end_subject)
4963 {
4964 SCHECK_PARTIAL();
4965 RRETURN(MATCH_NOMATCH);
4966 }
4967 GETCHARINCTEST(c, eptr);
4968 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4969 RRETURN(MATCH_NOMATCH);
4970 }
4971 /* Control never gets here */
4972
4973 case PT_ALNUM:
4974 for (fi = min;; fi++)
4975 {
4976 int category;
4977 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4978 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4979 if (fi >= max) RRETURN(MATCH_NOMATCH);
4980 if (eptr >= md->end_subject)
4981 {
4982 SCHECK_PARTIAL();
4983 RRETURN(MATCH_NOMATCH);
4984 }
4985 GETCHARINCTEST(c, eptr);
4986 category = UCD_CATEGORY(c);
4987 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4988 RRETURN(MATCH_NOMATCH);
4989 }
4990 /* Control never gets here */
4991
4992 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4993 which means that Perl space and POSIX space are now identical. PCRE
4994 was changed at release 8.34. */
4995
4996 case PT_SPACE: /* Perl space */
4997 case PT_PXSPACE: /* POSIX space */
4998 for (fi = min;; fi++)
4999 {
5000 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
5001 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5002 if (fi >= max) RRETURN(MATCH_NOMATCH);
5003 if (eptr >= md->end_subject)
5004 {
5005 SCHECK_PARTIAL();
5006 RRETURN(MATCH_NOMATCH);
5007 }
5008 GETCHARINCTEST(c, eptr);
5009 switch(c)
5010 {
5011 HSPACE_CASES:
5012 VSPACE_CASES:
5013 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
5014 break;
5015
5016 default:
5017 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5018 RRETURN(MATCH_NOMATCH);
5019 break;
5020 }
5021 }
5022 /* Control never gets here */
5023
5024 case PT_WORD:
5025 for (fi = min;; fi++)
5026 {
5027 int category;
5028 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5029 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5030 if (fi >= max) RRETURN(MATCH_NOMATCH);
5031 if (eptr >= md->end_subject)
5032 {
5033 SCHECK_PARTIAL();
5034 RRETURN(MATCH_NOMATCH);
5035 }
5036 GETCHARINCTEST(c, eptr);
5037 category = UCD_CATEGORY(c);
5038 if ((category == ucp_L ||
5039 category == ucp_N ||
5040 c == CHAR_UNDERSCORE)
5041 == prop_fail_result)
5042 RRETURN(MATCH_NOMATCH);
5043 }
5044 /* Control never gets here */
5045
5046 case PT_CLIST:
5047 for (fi = min;; fi++)
5048 {
5049 const pcre_uint32 *cp;
5050 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5051 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5052 if (fi >= max) RRETURN(MATCH_NOMATCH);
5053 if (eptr >= md->end_subject)
5054 {
5055 SCHECK_PARTIAL();
5056 RRETURN(MATCH_NOMATCH);
5057 }
5058 GETCHARINCTEST(c, eptr);
5059 cp = PRIV(ucd_caseless_sets) + prop_value;
5060 for (;;)
5061 {
5062 if (c < *cp)
5063 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5064 if (c == *cp++)
5065 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5066 }
5067 }
5068 /* Control never gets here */
5069
5070 case PT_UCNC:
5071 for (fi = min;; fi++)
5072 {
5073 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
5074 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5075 if (fi >= max) RRETURN(MATCH_NOMATCH);
5076 if (eptr >= md->end_subject)
5077 {
5078 SCHECK_PARTIAL();
5079 RRETURN(MATCH_NOMATCH);
5080 }
5081 GETCHARINCTEST(c, eptr);
5082 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5083 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5084 c >= 0xe000) == prop_fail_result)
5085 RRETURN(MATCH_NOMATCH);
5086 }
5087 /* Control never gets here */
5088
5089 /* This should never occur */
5090 default:
5091 RRETURN(PCRE_ERROR_INTERNAL);
5092 }
5093 }
5094
5095 /* Match extended Unicode sequences. We will get here only if the
5096 support is in the binary; otherwise a compile-time error occurs. */
5097
5098 else if (ctype == OP_EXTUNI)
5099 {
5100 for (fi = min;; fi++)
5101 {
5102 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5103 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5104 if (fi >= max) RRETURN(MATCH_NOMATCH);
5105 if (eptr >= md->end_subject)
5106 {
5107 SCHECK_PARTIAL();
5108 RRETURN(MATCH_NOMATCH);
5109 }
5110 else
5111 {
5112 int lgb, rgb;
5113 GETCHARINCTEST(c, eptr);
5114 lgb = UCD_GRAPHBREAK(c);
5115 while (eptr < md->end_subject)
5116 {
5117 int len = 1;
5118 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5119 rgb = UCD_GRAPHBREAK(c);
5120 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5121 lgb = rgb;
5122 eptr += len;
5123 }
5124 }
5125 CHECK_PARTIAL();
5126 }
5127 }
5128 else
5129 #endif /* SUPPORT_UCP */
5130
5131 #ifdef SUPPORT_UTF
5132 if (utf)
5133 {
5134 for (fi = min;; fi++)
5135 {
5136 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5137 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5138 if (fi >= max) RRETURN(MATCH_NOMATCH);
5139 if (eptr >= md->end_subject)
5140 {
5141 SCHECK_PARTIAL();
5142 RRETURN(MATCH_NOMATCH);
5143 }
5144 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5145 RRETURN(MATCH_NOMATCH);
5146 GETCHARINC(c, eptr);
5147 switch(ctype)
5148 {
5149 case OP_ANY: /* This is the non-NL case */
5150 if (md->partial != 0 && /* Take care with CRLF partial */
5151 eptr >= md->end_subject &&
5152 NLBLOCK->nltype == NLTYPE_FIXED &&
5153 NLBLOCK->nllen == 2 &&
5154 c == NLBLOCK->nl[0])
5155 {
5156 md->hitend = TRUE;
5157 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5158 }
5159 break;
5160
5161 case OP_ALLANY:
5162 case OP_ANYBYTE:
5163 break;
5164
5165 case OP_ANYNL:
5166 switch(c)
5167 {
5168 default: RRETURN(MATCH_NOMATCH);
5169 case CHAR_CR:
5170 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
5171 break;
5172
5173 case CHAR_LF:
5174 break;
5175
5176 case CHAR_VT:
5177 case CHAR_FF:
5178 case CHAR_NEL:
5179 #ifndef EBCDIC
5180 case 0x2028:
5181 case 0x2029:
5182 #endif /* Not EBCDIC */
5183 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5184 break;
5185 }
5186 break;
5187
5188 case OP_NOT_HSPACE:
5189 switch(c)
5190 {
5191 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5192 default: break;
5193 }
5194 break;
5195
5196 case OP_HSPACE:
5197 switch(c)
5198 {
5199 HSPACE_CASES: break;
5200 default: RRETURN(MATCH_NOMATCH);
5201 }
5202 break;
5203
5204 case OP_NOT_VSPACE:
5205 switch(c)
5206 {
5207 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5208 default: break;
5209 }
5210 break;
5211
5212 case OP_VSPACE:
5213 switch(c)
5214 {
5215 VSPACE_CASES: break;
5216 default: RRETURN(MATCH_NOMATCH);
5217 }
5218 break;
5219
5220 case OP_NOT_DIGIT:
5221 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5222 RRETURN(MATCH_NOMATCH);
5223 break;
5224
5225 case OP_DIGIT:
5226 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5227 RRETURN(MATCH_NOMATCH);
5228 break;
5229
5230 case OP_NOT_WHITESPACE:
5231 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5232 RRETURN(MATCH_NOMATCH);
5233 break;
5234
5235 case OP_WHITESPACE:
5236 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5237 RRETURN(MATCH_NOMATCH);
5238 break;
5239
5240 case OP_NOT_WORDCHAR:
5241 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5242 RRETURN(MATCH_NOMATCH);
5243 break;
5244
5245 case OP_WORDCHAR:
5246 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5247 RRETURN(MATCH_NOMATCH);
5248 break;
5249
5250 default:
5251 RRETURN(PCRE_ERROR_INTERNAL);
5252 }
5253 }
5254 }
5255 else
5256 #endif
5257 /* Not UTF mode */
5258 {
5259 for (fi = min;; fi++)
5260 {
5261 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5262 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5263 if (fi >= max) RRETURN(MATCH_NOMATCH);
5264 if (eptr >= md->end_subject)
5265 {
5266 SCHECK_PARTIAL();
5267 RRETURN(MATCH_NOMATCH);
5268 }
5269 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5270 RRETURN(MATCH_NOMATCH);
5271 c = *eptr++;
5272 switch(ctype)
5273 {
5274 case OP_ANY: /* This is the non-NL case */
5275 if (md->partial != 0 && /* Take care with CRLF partial */
5276 eptr >= md->end_subject &&
5277 NLBLOCK->nltype == NLTYPE_FIXED &&
5278 NLBLOCK->nllen == 2 &&
5279 c == NLBLOCK->nl[0])
5280 {
5281 md->hitend = TRUE;
5282 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5283 }
5284 break;
5285
5286 case OP_ALLANY:
5287 case OP_ANYBYTE:
5288 break;
5289
5290 case OP_ANYNL:
5291 switch(c)
5292 {
5293 default: RRETURN(MATCH_NOMATCH);
5294 case CHAR_CR:
5295 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5296 break;
5297
5298 case CHAR_LF:
5299 break;
5300
5301 case CHAR_VT:
5302 case CHAR_FF:
5303 case CHAR_NEL:
5304 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5305 case 0x2028:
5306 case 0x2029:
5307 #endif
5308 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5309 break;
5310 }
5311 break;
5312
5313 case OP_NOT_HSPACE:
5314 switch(c)
5315 {
5316 default: break;
5317 HSPACE_BYTE_CASES:
5318 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5319 HSPACE_MULTIBYTE_CASES:
5320 #endif
5321 RRETURN(MATCH_NOMATCH);
5322 }
5323 break;
5324
5325 case OP_HSPACE:
5326 switch(c)
5327 {
5328 default: RRETURN(MATCH_NOMATCH);
5329 HSPACE_BYTE_CASES:
5330 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5331 HSPACE_MULTIBYTE_CASES:
5332 #endif
5333 break;
5334 }
5335 break;
5336
5337 case OP_NOT_VSPACE:
5338 switch(c)
5339 {
5340 default: break;
5341 VSPACE_BYTE_CASES:
5342 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5343 VSPACE_MULTIBYTE_CASES:
5344 #endif
5345 RRETURN(MATCH_NOMATCH);
5346 }
5347 break;
5348
5349 case OP_VSPACE:
5350 switch(c)
5351 {
5352 default: RRETURN(MATCH_NOMATCH);
5353 VSPACE_BYTE_CASES:
5354 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5355 VSPACE_MULTIBYTE_CASES:
5356 #endif
5357 break;
5358 }
5359 break;
5360
5361 case OP_NOT_DIGIT:
5362 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5363 break;
5364
5365 case OP_DIGIT:
5366 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5367 break;
5368
5369 case OP_NOT_WHITESPACE:
5370 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5371 break;
5372
5373 case OP_WHITESPACE:
5374 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5375 break;
5376
5377 case OP_NOT_WORDCHAR:
5378 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5379 break;
5380
5381 case OP_WORDCHAR:
5382 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5383 break;
5384
5385 default:
5386 RRETURN(PCRE_ERROR_INTERNAL);
5387 }
5388 }
5389 }
5390 /* Control never gets here */
5391 }
5392
5393 /* If maximizing, it is worth using inline code for speed, doing the type
5394 test once at the start (i.e. keep it out of the loop). Again, keep the
5395 UTF and UCP code separate. */
5396
5397 else
5398 {
5399 pp = eptr; /* Remember where we started */
5400
5401 #ifdef SUPPORT_UCP
5402 if (prop_type >= 0)
5403 {
5404 switch(prop_type)
5405 {
5406 case PT_ANY:
5407 for (i = min; i < max; i++)
5408 {
5409 int len = 1;
5410 if (eptr >= md->end_subject)
5411 {
5412 SCHECK_PARTIAL();
5413 break;
5414 }
5415 GETCHARLENTEST(c, eptr, len);
5416 if (prop_fail_result) break;
5417 eptr+= len;
5418 }
5419 break;
5420
5421 case PT_LAMP:
5422 for (i = min; i < max; i++)
5423 {
5424 int chartype;
5425 int len = 1;
5426 if (eptr >= md->end_subject)
5427 {
5428 SCHECK_PARTIAL();
5429 break;
5430 }
5431 GETCHARLENTEST(c, eptr, len);
5432 chartype = UCD_CHARTYPE(c);
5433 if ((chartype == ucp_Lu ||
5434 chartype == ucp_Ll ||
5435 chartype == ucp_Lt) == prop_fail_result)
5436 break;
5437 eptr+= len;
5438 }
5439 break;
5440
5441 case PT_GC:
5442 for (i = min; i < max; i++)
5443 {
5444 int len = 1;
5445 if (eptr >= md->end_subject)
5446 {
5447 SCHECK_PARTIAL();
5448 break;
5449 }
5450 GETCHARLENTEST(c, eptr, len);
5451 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5452 eptr+= len;
5453 }
5454 break;
5455
5456 case PT_PC:
5457 for (i = min; i < max; i++)
5458 {
5459 int len = 1;
5460 if (eptr >= md->end_subject)
5461 {
5462 SCHECK_PARTIAL();
5463 break;
5464 }
5465 GETCHARLENTEST(c, eptr, len);
5466 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5467 eptr+= len;
5468 }
5469 break;
5470
5471 case PT_SC:
5472 for (i = min; i < max; i++)
5473 {
5474 int len = 1;
5475 if (eptr >= md->end_subject)
5476 {
5477 SCHECK_PARTIAL();
5478 break;
5479 }
5480 GETCHARLENTEST(c, eptr, len);
5481 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5482 eptr+= len;
5483 }
5484 break;
5485
5486 case PT_ALNUM:
5487 for (i = min; i < max; i++)
5488 {
5489 int category;
5490 int len = 1;
5491 if (eptr >= md->end_subject)
5492 {
5493 SCHECK_PARTIAL();
5494 break;
5495 }
5496 GETCHARLENTEST(c, eptr, len);
5497 category = UCD_CATEGORY(c);
5498 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5499 break;
5500 eptr+= len;
5501 }
5502 break;
5503
5504 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5505 which means that Perl space and POSIX space are now identical. PCRE
5506 was changed at release 8.34. */
5507
5508 case PT_SPACE: /* Perl space */
5509 case PT_PXSPACE: /* POSIX space */
5510 for (i = min; i < max; i++)
5511 {
5512 int len = 1;
5513 if (eptr >= md->end_subject)
5514 {
5515 SCHECK_PARTIAL();
5516 break;
5517 }
5518 GETCHARLENTEST(c, eptr, len);
5519 switch(c)
5520 {
5521 HSPACE_CASES:
5522 VSPACE_CASES:
5523 if (prop_fail_result) goto ENDLOOP99; /* Break the loop */
5524 break;
5525
5526 default:
5527 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5528 goto ENDLOOP99; /* Break the loop */
5529 break;
5530 }
5531 eptr+= len;
5532 }
5533 ENDLOOP99:
5534 break;
5535
5536 case PT_WORD:
5537 for (i = min; i < max; i++)
5538 {
5539 int category;
5540 int len = 1;
5541 if (eptr >= md->end_subject)
5542 {
5543 SCHECK_PARTIAL();
5544 break;
5545 }
5546 GETCHARLENTEST(c, eptr, len);
5547 category = UCD_CATEGORY(c);
5548 if ((category == ucp_L || category == ucp_N ||
5549 c == CHAR_UNDERSCORE) == prop_fail_result)
5550 break;
5551 eptr+= len;
5552 }
5553 break;
5554
5555 case PT_CLIST:
5556 for (i = min; i < max; i++)
5557 {
5558 const pcre_uint32 *cp;
5559 int len = 1;
5560 if (eptr >= md->end_subject)
5561 {
5562 SCHECK_PARTIAL();
5563 break;
5564 }
5565 GETCHARLENTEST(c, eptr, len);
5566 cp = PRIV(ucd_caseless_sets) + prop_value;
5567 for (;;)
5568 {
5569 if (c < *cp)
5570 { if (prop_fail_result) break; else goto GOT_MAX; }
5571 if (c == *cp++)
5572 { if (prop_fail_result) goto GOT_MAX; else break; }
5573 }
5574 eptr += len;
5575 }
5576 GOT_MAX:
5577 break;
5578
5579 case PT_UCNC:
5580 for (i = min; i < max; i++)
5581 {
5582 int len = 1;
5583 if (eptr >= md->end_subject)
5584 {
5585 SCHECK_PARTIAL();
5586 break;
5587 }
5588 GETCHARLENTEST(c, eptr, len);
5589 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5590 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5591 c >= 0xe000) == prop_fail_result)
5592 break;
5593 eptr += len;
5594 }
5595 break;
5596
5597 default:
5598 RRETURN(PCRE_ERROR_INTERNAL);
5599 }
5600
5601 /* eptr is now past the end of the maximum run */
5602
5603 if (possessive) continue; /* No backtracking */
5604 for(;;)
5605 {
5606 if (eptr == pp) goto TAIL_RECURSE;
5607 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5608 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5609 eptr--;
5610 if (utf) BACKCHAR(eptr);
5611 }
5612 }
5613
5614 /* Match extended Unicode grapheme clusters. We will get here only if the
5615 support is in the binary; otherwise a compile-time error occurs. */
5616
5617 else if (ctype == OP_EXTUNI)
5618 {
5619 for (i = min; i < max; i++)
5620 {
5621 if (eptr >= md->end_subject)
5622 {
5623 SCHECK_PARTIAL();
5624 break;
5625 }
5626 else
5627 {
5628 int lgb, rgb;
5629 GETCHARINCTEST(c, eptr);
5630 lgb = UCD_GRAPHBREAK(c);
5631 while (eptr < md->end_subject)
5632 {
5633 int len = 1;
5634 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5635 rgb = UCD_GRAPHBREAK(c);
5636 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5637 lgb = rgb;
5638 eptr += len;
5639 }
5640 }
5641 CHECK_PARTIAL();
5642 }
5643
5644 /* eptr is now past the end of the maximum run */
5645
5646 if (possessive) continue; /* No backtracking */
5647
5648 for(;;)
5649 {
5650 int lgb, rgb;
5651 PCRE_PUCHAR fptr;
5652
5653 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5654 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5655 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5656
5657 /* Backtracking over an extended grapheme cluster involves inspecting
5658 the previous two characters (if present) to see if a break is
5659 permitted between them. */
5660
5661 eptr--;
5662 if (!utf) c = *eptr; else
5663 {
5664 BACKCHAR(eptr);
5665 GETCHAR(c, eptr);
5666 }
5667 rgb = UCD_GRAPHBREAK(c);
5668
5669 for (;;)
5670 {
5671 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5672 fptr = eptr - 1;
5673 if (!utf) c = *fptr; else
5674 {
5675 BACKCHAR(fptr);
5676 GETCHAR(c, fptr);
5677 }
5678 lgb = UCD_GRAPHBREAK(c);
5679 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5680 eptr = fptr;
5681 rgb = lgb;
5682 }
5683 }
5684 }
5685
5686 else
5687 #endif /* SUPPORT_UCP */
5688
5689 #ifdef SUPPORT_UTF
5690 if (utf)
5691 {
5692 switch(ctype)
5693 {
5694 case OP_ANY:
5695 for (i = min; i < max; i++)
5696 {
5697 if (eptr >= md->end_subject)
5698 {
5699 SCHECK_PARTIAL();
5700 break;
5701 }
5702 if (IS_NEWLINE(eptr)) break;
5703 if (md->partial != 0 && /* Take care with CRLF partial */
5704 eptr + 1 >= md->end_subject &&
5705 NLBLOCK->nltype == NLTYPE_FIXED &&
5706 NLBLOCK->nllen == 2 &&
5707 UCHAR21(eptr) == NLBLOCK->nl[0])
5708 {
5709 md->hitend = TRUE;
5710 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5711 }
5712 eptr++;
5713 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5714 }
5715 break;
5716
5717 case OP_ALLANY:
5718 if (max < INT_MAX)
5719 {
5720 for (i = min; i < max; i++)
5721 {
5722 if (eptr >= md->end_subject)
5723 {
5724 SCHECK_PARTIAL();
5725 break;
5726 }
5727 eptr++;
5728 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5729 }
5730 }
5731 else
5732 {
5733 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5734 SCHECK_PARTIAL();
5735 }
5736 break;
5737
5738 /* The byte case is the same as non-UTF8 */
5739
5740 case OP_ANYBYTE:
5741 c = max - min;
5742 if (c > (unsigned int)(md->end_subject - eptr))
5743 {
5744 eptr = md->end_subject;
5745 SCHECK_PARTIAL();
5746 }
5747 else eptr += c;
5748 break;
5749
5750 case OP_ANYNL:
5751 for (i = min; i < max; i++)
5752 {
5753 int len = 1;
5754 if (eptr >= md->end_subject)
5755 {
5756 SCHECK_PARTIAL();
5757 break;
5758 }
5759 GETCHARLEN(c, eptr, len);
5760 if (c == CHAR_CR)
5761 {
5762 if (++eptr >= md->end_subject) break;
5763 if (UCHAR21(eptr) == CHAR_LF) eptr++;
5764 }
5765 else
5766 {
5767 if (c != CHAR_LF &&
5768 (md->bsr_anycrlf ||
5769 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5770 #ifndef EBCDIC
5771 && c != 0x2028 && c != 0x2029
5772 #endif /* Not EBCDIC */
5773 )))
5774 break;
5775 eptr += len;
5776 }
5777 }
5778 break;
5779
5780 case OP_NOT_HSPACE:
5781 case OP_HSPACE:
5782 for (i = min; i < max; i++)
5783 {
5784 BOOL gotspace;
5785 int len = 1;
5786 if (eptr >= md->end_subject)
5787 {
5788 SCHECK_PARTIAL();
5789 break;
5790 }
5791 GETCHARLEN(c, eptr, len);
5792 switch(c)
5793 {
5794 HSPACE_CASES: gotspace = TRUE; break;
5795 default: gotspace = FALSE; break;
5796 }
5797 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5798 eptr += len;
5799 }
5800 break;
5801
5802 case OP_NOT_VSPACE:
5803 case OP_VSPACE:
5804 for (i = min; i < max; i++)
5805 {
5806 BOOL gotspace;
5807 int len = 1;
5808 if (eptr >= md->end_subject)
5809 {
5810 SCHECK_PARTIAL();
5811 break;
5812 }
5813 GETCHARLEN(c, eptr, len);
5814 switch(c)
5815 {
5816 VSPACE_CASES: gotspace = TRUE; break;
5817 default: gotspace = FALSE; break;
5818 }
5819 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5820 eptr += len;
5821 }
5822 break;
5823
5824 case OP_NOT_DIGIT:
5825 for (i = min; i < max; i++)
5826 {
5827 int len = 1;
5828 if (eptr >= md->end_subject)
5829 {
5830 SCHECK_PARTIAL();
5831 break;
5832 }
5833 GETCHARLEN(c, eptr, len);
5834 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5835 eptr+= len;
5836 }
5837 break;
5838
5839 case OP_DIGIT:
5840 for (i = min; i < max; i++)
5841 {
5842 int len = 1;
5843 if (eptr >= md->end_subject)
5844 {
5845 SCHECK_PARTIAL();
5846 break;
5847 }
5848 GETCHARLEN(c, eptr, len);
5849 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5850 eptr+= len;
5851 }
5852 break;
5853
5854 case OP_NOT_WHITESPACE:
5855 for (i = min; i < max; i++)
5856 {
5857 int len = 1;
5858 if (eptr >= md->end_subject)
5859 {
5860 SCHECK_PARTIAL();
5861 break;
5862 }
5863 GETCHARLEN(c, eptr, len);
5864 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5865 eptr+= len;
5866 }
5867 break;
5868
5869 case OP_WHITESPACE:
5870 for (i = min; i < max; i++)
5871 {
5872 int len = 1;
5873 if (eptr >= md->end_subject)
5874 {
5875 SCHECK_PARTIAL();
5876 break;
5877 }
5878 GETCHARLEN(c, eptr, len);
5879 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5880 eptr+= len;
5881 }
5882 break;
5883
5884 case OP_NOT_WORDCHAR:
5885 for (i = min; i < max; i++)
5886 {
5887 int len = 1;
5888 if (eptr >= md->end_subject)
5889 {
5890 SCHECK_PARTIAL();
5891 break;
5892 }
5893 GETCHARLEN(c, eptr, len);
5894 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5895 eptr+= len;
5896 }
5897 break;
5898
5899 case OP_WORDCHAR:
5900 for (i = min; i < max; i++)
5901 {
5902 int len = 1;
5903 if (eptr >= md->end_subject)
5904 {
5905 SCHECK_PARTIAL();
5906 break;
5907 }
5908 GETCHARLEN(c, eptr, len);
5909 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5910 eptr+= len;
5911 }
5912 break;
5913
5914 default:
5915 RRETURN(PCRE_ERROR_INTERNAL);
5916 }
5917
5918 if (possessive) continue; /* No backtracking */
5919 for(;;)
5920 {
5921 if (eptr == pp) goto TAIL_RECURSE;
5922 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5923 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5924 eptr--;
5925 BACKCHAR(eptr);
5926 if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL &&
5927 UCHAR21(eptr - 1) == CHAR_CR) eptr--;
5928 }
5929 }
5930 else
5931 #endif /* SUPPORT_UTF */
5932 /* Not UTF mode */
5933 {
5934 switch(ctype)
5935 {
5936 case OP_ANY:
5937 for (i = min; i < max; i++)
5938 {
5939 if (eptr >= md->end_subject)
5940 {
5941 SCHECK_PARTIAL();
5942 break;
5943 }
5944 if (IS_NEWLINE(eptr)) break;
5945 if (md->partial != 0 && /* Take care with CRLF partial */
5946 eptr + 1 >= md->end_subject &&
5947 NLBLOCK->nltype == NLTYPE_FIXED &&
5948 NLBLOCK->nllen == 2 &&
5949 *eptr == NLBLOCK->nl[0])
5950 {
5951 md->hitend = TRUE;
5952 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5953 }
5954 eptr++;
5955 }
5956 break;
5957
5958 case OP_ALLANY:
5959 case OP_ANYBYTE:
5960 c = max - min;
5961 if (c > (unsigned int)(md->end_subject - eptr))
5962 {
5963 eptr = md->end_subject;
5964 SCHECK_PARTIAL();
5965 }
5966 else eptr += c;
5967 break;
5968
5969 case OP_ANYNL:
5970 for (i = min; i < max; i++)
5971 {
5972 if (eptr >= md->end_subject)
5973 {
5974 SCHECK_PARTIAL();
5975 break;
5976 }
5977 c = *eptr;
5978 if (c == CHAR_CR)
5979 {
5980 if (++eptr >= md->end_subject) break;
5981 if (*eptr == CHAR_LF) eptr++;
5982 }
5983 else
5984 {
5985 if (c != CHAR_LF && (md->bsr_anycrlf ||
5986 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5987 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5988 && c != 0x2028 && c != 0x2029
5989 #endif
5990 ))) break;
5991 eptr++;
5992 }
5993 }
5994 break;
5995
5996 case OP_NOT_HSPACE:
5997 for (i = min; i < max; i++)
5998 {
5999 if (eptr >= md->end_subject)
6000 {
6001 SCHECK_PARTIAL();
6002 break;
6003 }
6004 switch(*eptr)
6005 {
6006 default: eptr++; break;
6007 HSPACE_BYTE_CASES:
6008 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6009 HSPACE_MULTIBYTE_CASES:
6010 #endif
6011 goto ENDLOOP00;
6012 }
6013 }
6014 ENDLOOP00:
6015 break;
6016
6017 case OP_HSPACE:
6018 for (i = min; i < max; i++)
6019 {
6020 if (eptr >= md->end_subject)
6021 {
6022 SCHECK_PARTIAL();
6023 break;
6024 }
6025 switch(*eptr)
6026 {
6027 default: goto ENDLOOP01;
6028 HSPACE_BYTE_CASES:
6029 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6030 HSPACE_MULTIBYTE_CASES:
6031 #endif
6032 eptr++; break;
6033 }
6034 }
6035 ENDLOOP01:
6036 break;
6037
6038 case OP_NOT_VSPACE:
6039 for (i = min; i < max; i++)
6040 {
6041 if (eptr >= md->end_subject)
6042 {
6043 SCHECK_PARTIAL();
6044 break;
6045 }
6046 switch(*eptr)
6047 {
6048 default: eptr++; break;
6049 VSPACE_BYTE_CASES:
6050 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6051 VSPACE_MULTIBYTE_CASES:
6052 #endif
6053 goto ENDLOOP02;
6054 }
6055 }
6056 ENDLOOP02:
6057 break;
6058
6059 case OP_VSPACE:
6060 for (i = min; i < max; i++)
6061 {
6062 if (eptr >= md->end_subject)
6063 {
6064 SCHECK_PARTIAL();
6065 break;
6066 }
6067 switch(*eptr)
6068 {
6069 default: goto ENDLOOP03;
6070 VSPACE_BYTE_CASES:
6071 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6072 VSPACE_MULTIBYTE_CASES:
6073 #endif
6074 eptr++; break;
6075 }
6076 }
6077 ENDLOOP03:
6078 break;
6079
6080 case OP_NOT_DIGIT:
6081 for (i = min; i < max; i++)
6082 {
6083 if (eptr >= md->end_subject)
6084 {
6085 SCHECK_PARTIAL();
6086 break;
6087 }
6088 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6089 eptr++;
6090 }
6091 break;
6092
6093 case OP_DIGIT:
6094 for (i = min; i < max; i++)
6095 {
6096 if (eptr >= md->end_subject)
6097 {
6098 SCHECK_PARTIAL();
6099 break;
6100 }
6101 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6102 eptr++;
6103 }
6104 break;
6105
6106 case OP_NOT_WHITESPACE:
6107 for (i = min; i < max; i++)
6108 {
6109 if (eptr >= md->end_subject)
6110 {
6111 SCHECK_PARTIAL();
6112 break;
6113 }
6114 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6115 eptr++;
6116 }
6117 break;
6118
6119 case OP_WHITESPACE:
6120 for (i = min; i < max; i++)
6121 {
6122 if (eptr >= md->end_subject)
6123 {
6124 SCHECK_PARTIAL();
6125 break;
6126 }
6127 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6128 eptr++;
6129 }
6130 break;
6131
6132 case OP_NOT_WORDCHAR:
6133 for (i = min; i < max; i++)
6134 {
6135 if (eptr >= md->end_subject)
6136 {
6137 SCHECK_PARTIAL();
6138 break;
6139 }
6140 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6141 eptr++;
6142 }
6143 break;
6144
6145 case OP_WORDCHAR:
6146 for (i = min; i < max; i++)
6147 {
6148 if (eptr >= md->end_subject)
6149 {
6150 SCHECK_PARTIAL();
6151 break;
6152 }
6153 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6154 eptr++;
6155 }
6156 break;
6157
6158 default:
6159 RRETURN(PCRE_ERROR_INTERNAL);
6160 }
6161
6162 if (possessive) continue; /* No backtracking */
6163 for (;;)
6164 {
6165 if (eptr == pp) goto TAIL_RECURSE;
6166 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6167 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6168 eptr--;
6169 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6170 eptr[-1] == CHAR_CR) eptr--;
6171 }
6172 }
6173
6174 /* Control never gets here */
6175 }
6176
6177 /* There's been some horrible disaster. Arrival here can only mean there is
6178 something seriously wrong in the code above or the OP_xxx definitions. */
6179
6180 default:
6181 DPRINTF(("Unknown opcode %d\n", *ecode));
6182 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6183 }
6184
6185 /* Do not stick any code in here without much thought; it is assumed
6186 that "continue" in the code above comes out to here to repeat the main
6187 loop. */
6188
6189 } /* End of main loop */
6190 /* Control never reaches here */
6191
6192
6193 /* When compiling to use the heap rather than the stack for recursive calls to
6194 match(), the RRETURN() macro jumps here. The number that is saved in
6195 frame->Xwhere indicates which label we actually want to return to. */
6196
6197 #ifdef NO_RECURSE
6198 #define LBL(val) case val: goto L_RM##val;
6199 HEAP_RETURN:
6200 switch (frame->Xwhere)
6201 {
6202 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6203 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6204 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6205 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6206 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6207 LBL(65) LBL(66)
6208 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6209 LBL(20) LBL(21)
6210 #endif
6211 #ifdef SUPPORT_UTF
6212 LBL(16) LBL(18)
6213 LBL(22) LBL(23) LBL(28) LBL(30)
6214 LBL(32) LBL(34) LBL(42) LBL(46)
6215 #ifdef SUPPORT_UCP
6216 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6217 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
6218 #endif /* SUPPORT_UCP */
6219 #endif /* SUPPORT_UTF */
6220 default:
6221 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6222 return PCRE_ERROR_INTERNAL;
6223 }
6224 #undef LBL
6225 #endif /* NO_RECURSE */
6226 }
6227
6228
6229 /***************************************************************************
6230 ****************************************************************************
6231 RECURSION IN THE match() FUNCTION
6232
6233 Undefine all the macros that were defined above to handle this. */
6234
6235 #ifdef NO_RECURSE
6236 #undef eptr
6237 #undef ecode
6238 #undef mstart
6239 #undef offset_top
6240 #undef eptrb
6241 #undef flags
6242
6243 #undef callpat
6244 #undef charptr
6245 #undef data
6246 #undef next
6247 #undef pp
6248 #undef prev
6249 #undef saved_eptr
6250
6251 #undef new_recursive
6252
6253 #undef cur_is_word
6254 #undef condition
6255 #undef prev_is_word
6256
6257 #undef ctype
6258 #undef length
6259 #undef max
6260 #undef min
6261 #undef number
6262 #undef offset
6263 #undef op
6264 #undef save_capture_last
6265 #undef save_offset1
6266 #undef save_offset2
6267 #undef save_offset3
6268 #undef stacksave
6269
6270 #undef newptrb
6271
6272 #endif
6273
6274 /* These two are defined as macros in both cases */
6275
6276 #undef fc
6277 #undef fi
6278
6279 /***************************************************************************
6280 ***************************************************************************/
6281
6282
6283 #ifdef NO_RECURSE
6284 /*************************************************
6285 * Release allocated heap frames *
6286 *************************************************/
6287
6288 /* This function releases all the allocated frames. The base frame is on the
6289 machine stack, and so must not be freed.
6290
6291 Argument: the address of the base frame
6292 Returns: nothing
6293 */
6294
6295 static void
release_match_heapframes(heapframe * frame_base)6296 release_match_heapframes (heapframe *frame_base)
6297 {
6298 heapframe *nextframe = frame_base->Xnextframe;
6299 while (nextframe != NULL)
6300 {
6301 heapframe *oldframe = nextframe;
6302 nextframe = nextframe->Xnextframe;
6303 (PUBL(stack_free))(oldframe);
6304 }
6305 }
6306 #endif
6307
6308
6309 /*************************************************
6310 * Execute a Regular Expression *
6311 *************************************************/
6312
6313 /* This function applies a compiled re to a subject string and picks out
6314 portions of the string if it matches. Two elements in the vector are set for
6315 each substring: the offsets to the start and end of the substring.
6316
6317 Arguments:
6318 argument_re points to the compiled expression
6319 extra_data points to extra data or is NULL
6320 subject points to the subject string
6321 length length of subject string (may contain binary zeros)
6322 start_offset where to start in the subject string
6323 options option bits
6324 offsets points to a vector of ints to be filled in with offsets
6325 offsetcount the number of elements in the vector
6326
6327 Returns: > 0 => success; value is the number of elements filled in
6328 = 0 => success, but offsets is not big enough
6329 -1 => failed to match
6330 < -1 => some kind of unexpected problem
6331 */
6332
6333 #if defined COMPILE_PCRE8
6334 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre_exec(const pcre * argument_re,const pcre_extra * extra_data,PCRE_SPTR subject,int length,int start_offset,int options,int * offsets,int offsetcount)6335 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6336 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6337 int offsetcount)
6338 #elif defined COMPILE_PCRE16
6339 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6340 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6341 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6342 int offsetcount)
6343 #elif defined COMPILE_PCRE32
6344 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6345 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6346 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6347 int offsetcount)
6348 #endif
6349 {
6350 int rc, ocount, arg_offset_max;
6351 int newline;
6352 BOOL using_temporary_offsets = FALSE;
6353 BOOL anchored;
6354 BOOL startline;
6355 BOOL firstline;
6356 BOOL utf;
6357 BOOL has_first_char = FALSE;
6358 BOOL has_req_char = FALSE;
6359 pcre_uchar first_char = 0;
6360 pcre_uchar first_char2 = 0;
6361 pcre_uchar req_char = 0;
6362 pcre_uchar req_char2 = 0;
6363 match_data match_block;
6364 match_data *md = &match_block;
6365 const pcre_uint8 *tables;
6366 const pcre_uint8 *start_bits = NULL;
6367 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6368 PCRE_PUCHAR end_subject;
6369 PCRE_PUCHAR start_partial = NULL;
6370 PCRE_PUCHAR match_partial = NULL;
6371 PCRE_PUCHAR req_char_ptr = start_match - 1;
6372
6373 const pcre_study_data *study;
6374 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6375
6376 #ifdef NO_RECURSE
6377 heapframe frame_zero;
6378 frame_zero.Xprevframe = NULL; /* Marks the top level */
6379 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6380 md->match_frames_base = &frame_zero;
6381 #endif
6382
6383 /* Check for the special magic call that measures the size of the stack used
6384 per recursive call of match(). Without the funny casting for sizeof, a Windows
6385 compiler gave this error: "unary minus operator applied to unsigned type,
6386 result still unsigned". Hopefully the cast fixes that. */
6387
6388 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6389 start_offset == -999)
6390 #ifdef NO_RECURSE
6391 return -((int)sizeof(heapframe));
6392 #else
6393 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6394 #endif
6395
6396 /* Plausibility checks */
6397
6398 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6399 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6400 return PCRE_ERROR_NULL;
6401 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6402 if (length < 0) return PCRE_ERROR_BADLENGTH;
6403 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6404
6405 /* Check that the first field in the block is the magic number. If it is not,
6406 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6407 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6408 means that the pattern is likely compiled with different endianness. */
6409
6410 if (re->magic_number != MAGIC_NUMBER)
6411 return re->magic_number == REVERSED_MAGIC_NUMBER?
6412 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6413 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6414
6415 /* These two settings are used in the code for checking a UTF-8 string that
6416 follows immediately afterwards. Other values in the md block are used only
6417 during "normal" pcre_exec() processing, not when the JIT support is in use,
6418 so they are set up later. */
6419
6420 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6421 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6422 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6423 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6424
6425 /* Check a UTF-8 string if required. Pass back the character offset and error
6426 code for an invalid string if a results vector is available. */
6427
6428 #ifdef SUPPORT_UTF
6429 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6430 {
6431 int erroroffset;
6432 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6433 if (errorcode != 0)
6434 {
6435 if (offsetcount >= 2)
6436 {
6437 offsets[0] = erroroffset;
6438 offsets[1] = errorcode;
6439 }
6440 #if defined COMPILE_PCRE8
6441 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6442 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6443 #elif defined COMPILE_PCRE16
6444 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6445 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6446 #elif defined COMPILE_PCRE32
6447 return PCRE_ERROR_BADUTF32;
6448 #endif
6449 }
6450 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6451 /* Check that a start_offset points to the start of a UTF character. */
6452 if (start_offset > 0 && start_offset < length &&
6453 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6454 return PCRE_ERROR_BADUTF8_OFFSET;
6455 #endif
6456 }
6457 #endif
6458
6459 /* If the pattern was successfully studied with JIT support, run the JIT
6460 executable instead of the rest of this function. Most options must be set at
6461 compile time for the JIT code to be usable. Fallback to the normal code path if
6462 an unsupported flag is set. */
6463
6464 #ifdef SUPPORT_JIT
6465 if (extra_data != NULL
6466 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6467 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6468 && extra_data->executable_jit != NULL
6469 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6470 {
6471 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6472 start_offset, options, offsets, offsetcount);
6473
  /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial
  matching mode is not compiled. In this case we simply fall back to the
  interpreter. */
6476
6477 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6478 }
6479 #endif
6480
6481 /* Carry on with non-JIT matching. This information is for finding all the
6482 numbers associated with a given name, for condition testing. */
6483
6484 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6485 md->name_count = re->name_count;
6486 md->name_entry_size = re->name_entry_size;
6487
6488 /* Fish out the optional data from the extra_data structure, first setting
6489 the default values. */
6490
6491 study = NULL;
6492 md->match_limit = MATCH_LIMIT;
6493 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6494 md->callout_data = NULL;
6495
6496 /* The table pointer is always in native byte order. */
6497
6498 tables = re->tables;
6499
6500 /* The two limit values override the defaults, whatever their value. */
6501
6502 if (extra_data != NULL)
6503 {
6504 unsigned long int flags = extra_data->flags;
6505 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6506 study = (const pcre_study_data *)extra_data->study_data;
6507 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6508 md->match_limit = extra_data->match_limit;
6509 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6510 md->match_limit_recursion = extra_data->match_limit_recursion;
6511 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6512 md->callout_data = extra_data->callout_data;
6513 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6514 }
6515
6516 /* Limits in the regex override only if they are smaller. */
6517
6518 if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
6519 md->match_limit = re->limit_match;
6520
6521 if ((re->flags & PCRE_RLSET) != 0 &&
6522 re->limit_recursion < md->match_limit_recursion)
6523 md->match_limit_recursion = re->limit_recursion;
6524
6525 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6526 is a feature that makes it possible to save compiled regex and re-use them
6527 in other programs later. */
6528
6529 if (tables == NULL) tables = PRIV(default_tables);
6530
6531 /* Set up other data */
6532
6533 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6534 startline = (re->flags & PCRE_STARTLINE) != 0;
6535 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6536
6537 /* The code starts after the real_pcre block and the capture name table. */
6538
6539 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6540 re->name_count * re->name_entry_size;
6541
6542 md->start_subject = (PCRE_PUCHAR)subject;
6543 md->start_offset = start_offset;
6544 md->end_subject = md->start_subject + length;
6545 end_subject = md->end_subject;
6546
6547 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6548 md->use_ucp = (re->options & PCRE_UCP) != 0;
6549 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6550 md->ignore_skip_arg = 0;
6551
6552 /* Some options are unpacked into BOOL variables in the hope that testing
6553 them will be faster than individual option bits. */
6554
6555 md->notbol = (options & PCRE_NOTBOL) != 0;
6556 md->noteol = (options & PCRE_NOTEOL) != 0;
6557 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6558 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6559
6560 md->hitend = FALSE;
6561 md->mark = md->nomatch_mark = NULL; /* In case never set */
6562
6563 md->recursive = NULL; /* No recursion at top level */
6564 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6565
6566 md->lcc = tables + lcc_offset;
6567 md->fcc = tables + fcc_offset;
6568 md->ctypes = tables + ctypes_offset;
6569
6570 /* Handle different \R options. */
6571
6572 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6573 {
6574 case 0:
6575 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6576 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6577 else
6578 #ifdef BSR_ANYCRLF
6579 md->bsr_anycrlf = TRUE;
6580 #else
6581 md->bsr_anycrlf = FALSE;
6582 #endif
6583 break;
6584
6585 case PCRE_BSR_ANYCRLF:
6586 md->bsr_anycrlf = TRUE;
6587 break;
6588
6589 case PCRE_BSR_UNICODE:
6590 md->bsr_anycrlf = FALSE;
6591 break;
6592
6593 default: return PCRE_ERROR_BADNEWLINE;
6594 }
6595
6596 /* Handle different types of newline. The three bits give eight cases. If
6597 nothing is set at run time, whatever was used at compile time applies. */
6598
6599 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6600 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6601 {
6602 case 0: newline = NEWLINE; break; /* Compile-time default */
6603 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6604 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6605 case PCRE_NEWLINE_CR+
6606 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6607 case PCRE_NEWLINE_ANY: newline = -1; break;
6608 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6609 default: return PCRE_ERROR_BADNEWLINE;
6610 }
6611
6612 if (newline == -2)
6613 {
6614 md->nltype = NLTYPE_ANYCRLF;
6615 }
6616 else if (newline < 0)
6617 {
6618 md->nltype = NLTYPE_ANY;
6619 }
6620 else
6621 {
6622 md->nltype = NLTYPE_FIXED;
6623 if (newline > 255)
6624 {
6625 md->nllen = 2;
6626 md->nl[0] = (newline >> 8) & 255;
6627 md->nl[1] = newline & 255;
6628 }
6629 else
6630 {
6631 md->nllen = 1;
6632 md->nl[0] = newline;
6633 }
6634 }
6635
6636 /* Partial matching was originally supported only for a restricted set of
6637 regexes; from release 8.00 there are no restrictions, but the bits are still
6638 defined (though never set). So there's no harm in leaving this code. */
6639
6640 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6641 return PCRE_ERROR_BADPARTIAL;
6642
6643 /* If the expression has got more back references than the offsets supplied can
6644 hold, we get a temporary chunk of working store to use during the matching.
6645 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6646 of 3. */
6647
6648 ocount = offsetcount - (offsetcount % 3);
6649 arg_offset_max = (2*ocount)/3;
6650
6651 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6652 {
6653 ocount = re->top_backref * 3 + 3;
6654 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6655 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6656 using_temporary_offsets = TRUE;
6657 DPRINTF(("Got memory to hold back references\n"));
6658 }
6659 else md->offset_vector = offsets;
6660 md->offset_end = ocount;
6661 md->offset_max = (2*ocount)/3;
6662 md->capture_last = 0;
6663
6664 /* Reset the working variable associated with each extraction. These should
6665 never be used unless previously set, but they get saved and restored, and so we
6666 initialize them to avoid reading uninitialized locations. Also, unset the
6667 offsets for the matched string. This is really just for tidiness with callouts,
6668 in case they inspect these fields. */
6669
6670 if (md->offset_vector != NULL)
6671 {
6672 register int *iptr = md->offset_vector + ocount;
6673 register int *iend = iptr - re->top_bracket;
6674 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6675 while (--iptr >= iend) *iptr = -1;
6676 md->offset_vector[0] = md->offset_vector[1] = -1;
6677 }
6678
6679 /* Set up the first character to match, if available. The first_char value is
6680 never set for an anchored regular expression, but the anchoring may be forced
6681 at run time, so we have to test for anchoring. The first char may be unset for
6682 an unanchored pattern, of course. If there's no first char and the pattern was
6683 studied, there may be a bitmap of possible first characters. */
6684
6685 if (!anchored)
6686 {
6687 if ((re->flags & PCRE_FIRSTSET) != 0)
6688 {
6689 has_first_char = TRUE;
6690 first_char = first_char2 = (pcre_uchar)(re->first_char);
6691 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6692 {
6693 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6694 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6695 if (utf && first_char > 127)
6696 first_char2 = UCD_OTHERCASE(first_char);
6697 #endif
6698 }
6699 }
6700 else
6701 if (!startline && study != NULL &&
6702 (study->flags & PCRE_STUDY_MAPPED) != 0)
6703 start_bits = study->start_bits;
6704 }
6705
6706 /* For anchored or unanchored matches, there may be a "last known required
6707 character" set. */
6708
6709 if ((re->flags & PCRE_REQCHSET) != 0)
6710 {
6711 has_req_char = TRUE;
6712 req_char = req_char2 = (pcre_uchar)(re->req_char);
6713 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6714 {
6715 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6716 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6717 if (utf && req_char > 127)
6718 req_char2 = UCD_OTHERCASE(req_char);
6719 #endif
6720 }
6721 }
6722
6723
6724 /* ==========================================================================*/
6725
/* Loop for handling unanchored repeated matching attempts; for anchored
regexes the loop runs just once. */
6728
6729 for(;;)
6730 {
6731 PCRE_PUCHAR save_end_subject = end_subject;
6732 PCRE_PUCHAR new_start_match;
6733
6734 /* If firstline is TRUE, the start of the match is constrained to the first
6735 line of a multiline string. That is, the match must be before or at the first
6736 newline. Implement this by temporarily adjusting end_subject so that we stop
6737 scanning at a newline. If the match fails at the newline, later code breaks
6738 this loop. */
6739
6740 if (firstline)
6741 {
6742 PCRE_PUCHAR t = start_match;
6743 #ifdef SUPPORT_UTF
6744 if (utf)
6745 {
6746 while (t < md->end_subject && !IS_NEWLINE(t))
6747 {
6748 t++;
6749 ACROSSCHAR(t < end_subject, *t, t++);
6750 }
6751 }
6752 else
6753 #endif
6754 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6755 end_subject = t;
6756 }
6757
6758 /* There are some optimizations that avoid running the match if a known
6759 starting point is not found, or if a known later character is not present.
6760 However, there is an option that disables these, for testing and for ensuring
6761 that all callouts do actually occur. The option can be set in the regex by
6762 (*NO_START_OPT) or passed in match-time options. */
6763
6764 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6765 {
6766 /* Advance to a unique first char if there is one. */
6767
6768 if (has_first_char)
6769 {
6770 pcre_uchar smc;
6771
6772 if (first_char != first_char2)
6773 while (start_match < end_subject &&
6774 (smc = UCHAR21TEST(start_match)) != first_char && smc != first_char2)
6775 start_match++;
6776 else
6777 while (start_match < end_subject && UCHAR21TEST(start_match) != first_char)
6778 start_match++;
6779 }
6780
6781 /* Or to just after a linebreak for a multiline match */
6782
6783 else if (startline)
6784 {
6785 if (start_match > md->start_subject + start_offset)
6786 {
6787 #ifdef SUPPORT_UTF
6788 if (utf)
6789 {
6790 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6791 {
6792 start_match++;
6793 ACROSSCHAR(start_match < end_subject, *start_match,
6794 start_match++);
6795 }
6796 }
6797 else
6798 #endif
6799 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6800 start_match++;
6801
6802 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6803 and we are now at a LF, advance the match position by one more character.
6804 */
6805
6806 if (start_match[-1] == CHAR_CR &&
6807 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6808 start_match < end_subject &&
6809 UCHAR21TEST(start_match) == CHAR_NL)
6810 start_match++;
6811 }
6812 }
6813
6814 /* Or to a non-unique first byte after study */
6815
6816 else if (start_bits != NULL)
6817 {
6818 while (start_match < end_subject)
6819 {
6820 register pcre_uint32 c = UCHAR21TEST(start_match);
6821 #ifndef COMPILE_PCRE8
6822 if (c > 255) c = 255;
6823 #endif
6824 if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
6825 start_match++;
6826 }
6827 }
6828 } /* Starting optimizations */
6829
6830 /* Restore fudged end_subject */
6831
6832 end_subject = save_end_subject;
6833
6834 /* The following two optimizations are disabled for partial matching or if
6835 disabling is explicitly requested. */
6836
6837 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6838 {
6839 /* If the pattern was studied, a minimum subject length may be set. This is
6840 a lower bound; there may be no string of exactly that length that matches the
6841 pattern. Although the value is, strictly, in characters, we treat it as
6842 bytes to avoid spending too much time in this optimization. */
6843
6844 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6845 (pcre_uint32)(end_subject - start_match) < study->minlength)
6846 {
6847 rc = MATCH_NOMATCH;
6848 break;
6849 }
6850
6851 /* If req_char is set, we know that that character must appear in the
6852 subject for the match to succeed. If the first character is set, req_char
6853 must be later in the subject; otherwise the test starts at the match point.
6854 This optimization can save a huge amount of backtracking in patterns with
6855 nested unlimited repeats that aren't going to match. Writing separate code
6856 for cased/caseless versions makes it go faster, as does using an
6857 autoincrement and backing off on a match.
6858
6859 HOWEVER: when the subject string is very, very long, searching to its end
6860 can take a long time, and give bad performance on quite ordinary patterns.
6861 This showed up when somebody was matching something like /^\d+C/ on a
6862 32-megabyte string... so we don't do this when the string is sufficiently
6863 long. */
6864
6865 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6866 {
6867 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6868
6869 /* We don't need to repeat the search if we haven't yet reached the
6870 place we found it at last time. */
6871
6872 if (p > req_char_ptr)
6873 {
6874 if (req_char != req_char2)
6875 {
6876 while (p < end_subject)
6877 {
6878 register pcre_uint32 pp = UCHAR21INCTEST(p);
6879 if (pp == req_char || pp == req_char2) { p--; break; }
6880 }
6881 }
6882 else
6883 {
6884 while (p < end_subject)
6885 {
6886 if (UCHAR21INCTEST(p) == req_char) { p--; break; }
6887 }
6888 }
6889
6890 /* If we can't find the required character, break the matching loop,
6891 forcing a match failure. */
6892
6893 if (p >= end_subject)
6894 {
6895 rc = MATCH_NOMATCH;
6896 break;
6897 }
6898
6899 /* If we have found the required character, save the point where we
6900 found it, so that we don't search again next time round the loop if
6901 the start hasn't passed this character yet. */
6902
6903 req_char_ptr = p;
6904 }
6905 }
6906 }
6907
6908 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6909 printf(">>>> Match against: ");
6910 pchars(start_match, end_subject - start_match, TRUE, md);
6911 printf("\n");
6912 #endif
6913
6914 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6915 first starting point for which a partial match was found. */
6916
6917 md->start_match_ptr = start_match;
6918 md->start_used_ptr = start_match;
6919 md->match_call_count = 0;
6920 md->match_function_type = 0;
6921 md->end_offset_top = 0;
6922 md->skip_arg_count = 0;
6923 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6924 if (md->hitend && start_partial == NULL)
6925 {
6926 start_partial = md->start_used_ptr;
6927 match_partial = start_match;
6928 }
6929
6930 switch(rc)
6931 {
6932 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6933 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6934 entirely. The only way we can do that is to re-do the match at the same
6935 point, with a flag to force SKIP with an argument to be ignored. Just
6936 treating this case as NOMATCH does not work because it does not check other
6937 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6938
6939 case MATCH_SKIP_ARG:
6940 new_start_match = start_match;
6941 md->ignore_skip_arg = md->skip_arg_count;
6942 break;
6943
6944 /* SKIP passes back the next starting point explicitly, but if it is no
6945 greater than the match we have just done, treat it as NOMATCH. */
6946
6947 case MATCH_SKIP:
6948 if (md->start_match_ptr > start_match)
6949 {
6950 new_start_match = md->start_match_ptr;
6951 break;
6952 }
6953 /* Fall through */
6954
6955 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6956 exactly like PRUNE. Unset ignore SKIP-with-argument. */
6957
6958 case MATCH_NOMATCH:
6959 case MATCH_PRUNE:
6960 case MATCH_THEN:
6961 md->ignore_skip_arg = 0;
6962 new_start_match = start_match + 1;
6963 #ifdef SUPPORT_UTF
6964 if (utf)
6965 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6966 new_start_match++);
6967 #endif
6968 break;
6969
6970 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6971
6972 case MATCH_COMMIT:
6973 rc = MATCH_NOMATCH;
6974 goto ENDLOOP;
6975
6976 /* Any other return is either a match, or some kind of error. */
6977
6978 default:
6979 goto ENDLOOP;
6980 }
6981
6982 /* Control reaches here for the various types of "no match at this point"
6983 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6984
6985 rc = MATCH_NOMATCH;
6986
6987 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6988 newline in the subject (though it may continue over the newline). Therefore,
6989 if we have just failed to match, starting at a newline, do not continue. */
6990
6991 if (firstline && IS_NEWLINE(start_match)) break;
6992
6993 /* Advance to new matching position */
6994
6995 start_match = new_start_match;
6996
6997 /* Break the loop if the pattern is anchored or if we have passed the end of
6998 the subject. */
6999
7000 if (anchored || start_match > end_subject) break;
7001
7002 /* If we have just passed a CR and we are now at a LF, and the pattern does
7003 not contain any explicit matches for \r or \n, and the newline option is CRLF
7004 or ANY or ANYCRLF, advance the match position by one more character. In
7005 normal matching start_match will always be greater than the first position at
7006 this stage, but a failed *SKIP can cause a return at the same point, which is
7007 why the first test exists. */
7008
7009 if (start_match > (PCRE_PUCHAR)subject + start_offset &&
7010 start_match[-1] == CHAR_CR &&
7011 start_match < end_subject &&
7012 *start_match == CHAR_NL &&
7013 (re->flags & PCRE_HASCRORLF) == 0 &&
7014 (md->nltype == NLTYPE_ANY ||
7015 md->nltype == NLTYPE_ANYCRLF ||
7016 md->nllen == 2))
7017 start_match++;
7018
7019 md->mark = NULL; /* Reset for start of next match attempt */
7020 } /* End of for(;;) "bumpalong" loop */
7021
7022 /* ==========================================================================*/
7023
7024 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
7025 conditions is true:
7026
7027 (1) The pattern is anchored or the match was failed by (*COMMIT);
7028
7029 (2) We are past the end of the subject;
7030
7031 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
7032 this option requests that a match occur at or before the first newline in
7033 the subject.
7034
7035 When we have a match and the offset vector is big enough to deal with any
7036 backreferences, captured substring offsets will already be set up. In the case
7037 where we had to get some local store to hold offsets for backreference
7038 processing, copy those that we can. In this case there need not be overflow if
7039 certain parts of the pattern were not used, even though there are more
7040 capturing parentheses than vector slots. */
7041
7042 ENDLOOP:
7043
7044 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
7045 {
7046 if (using_temporary_offsets)
7047 {
7048 if (arg_offset_max >= 4)
7049 {
7050 memcpy(offsets + 2, md->offset_vector + 2,
7051 (arg_offset_max - 2) * sizeof(int));
7052 DPRINTF(("Copied offsets from temporary memory\n"));
7053 }
7054 if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT;
7055 DPRINTF(("Freeing temporary memory\n"));
7056 (PUBL(free))(md->offset_vector);
7057 }
7058
7059 /* Set the return code to the number of captured strings, or 0 if there were
7060 too many to fit into the vector. */
7061
7062 rc = ((md->capture_last & OVFLBIT) != 0 &&
7063 md->end_offset_top >= arg_offset_max)?
7064 0 : md->end_offset_top/2;
7065
7066 /* If there is space in the offset vector, set any unused pairs at the end of
7067 the pattern to -1 for backwards compatibility. It is documented that this
7068 happens. In earlier versions, the whole set of potential capturing offsets
7069 was set to -1 each time round the loop, but this is handled differently now.
7070 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
7071 those at the end that need unsetting here. We can't just unset them all at
7072 the start of the whole thing because they may get set in one branch that is
7073 not the final matching branch. */
7074
7075 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
7076 {
7077 register int *iptr, *iend;
7078 int resetcount = 2 + re->top_bracket * 2;
7079 if (resetcount > offsetcount) resetcount = offsetcount;
7080 iptr = offsets + md->end_offset_top;
7081 iend = offsets + resetcount;
7082 while (iptr < iend) *iptr++ = -1;
7083 }
7084
7085 /* If there is space, set up the whole thing as substring 0. The value of
7086 md->start_match_ptr might be modified if \K was encountered on the successful
7087 matching path. */
7088
7089 if (offsetcount < 2) rc = 0; else
7090 {
7091 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
7092 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
7093 }
7094
7095 /* Return MARK data if requested */
7096
7097 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7098 *(extra_data->mark) = (pcre_uchar *)md->mark;
7099 DPRINTF((">>>> returning %d\n", rc));
7100 #ifdef NO_RECURSE
7101 release_match_heapframes(&frame_zero);
7102 #endif
7103 return rc;
7104 }
7105
7106 /* Control gets here if there has been an error, or if the overall match
7107 attempt has failed at all permitted starting positions. */
7108
7109 if (using_temporary_offsets)
7110 {
7111 DPRINTF(("Freeing temporary memory\n"));
7112 (PUBL(free))(md->offset_vector);
7113 }
7114
7115 /* For anything other than nomatch or partial match, just return the code. */
7116
7117 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
7118 {
7119 DPRINTF((">>>> error: returning %d\n", rc));
7120 #ifdef NO_RECURSE
7121 release_match_heapframes(&frame_zero);
7122 #endif
7123 return rc;
7124 }
7125
7126 /* Handle partial matches - disable any mark data */
7127
7128 if (match_partial != NULL)
7129 {
7130 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
7131 md->mark = NULL;
7132 if (offsetcount > 1)
7133 {
7134 offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
7135 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
7136 if (offsetcount > 2)
7137 offsets[2] = (int)(match_partial - (PCRE_PUCHAR)subject);
7138 }
7139 rc = PCRE_ERROR_PARTIAL;
7140 }
7141
7142 /* This is the classic nomatch case */
7143
7144 else
7145 {
7146 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
7147 rc = PCRE_ERROR_NOMATCH;
7148 }
7149
7150 /* Return the MARK data if it has been requested. */
7151
7152 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7153 *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark;
7154 #ifdef NO_RECURSE
7155 release_match_heapframes(&frame_zero);
7156 #endif
7157 return rc;
7158 }
7159
7160 /* End of pcre_exec.c */
7161