1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 #ifdef HAVE_CONFIG_H
42 #include "config.h"
43 #endif
44
45 #include <stdio.h>
46 #include <string.h>
47
48 #define PCRE2_CODE_UNIT_WIDTH 0
49 #include "pcre2.h"
50
51 /*
52 Letter characters:
53 \xe6\x92\xad = 0x64ad = 25773 (kanji)
54 Non-letter characters:
55 \xc2\xa1 = 0xa1 = (Inverted Exclamation Mark)
56 \xf3\xa9\xb7\x80 = 0xe9dc0 = 957888
57 \xed\xa0\x80 = 55296 = 0xd800 (Invalid UTF character)
58 \xed\xb0\x80 = 56320 = 0xdc00 (Invalid UTF character)
59 Newlines:
60 \xc2\x85 = 0x85 = 133 (NExt Line = NEL)
61 \xe2\x80\xa8 = 0x2028 = 8232 (Line Separator)
62 Othercase pairs:
63 \xc3\xa9 = 0xe9 = 233 (e')
64 \xc3\x89 = 0xc9 = 201 (E')
65 \xc3\xa1 = 0xe1 = 225 (a')
66 \xc3\x81 = 0xc1 = 193 (A')
67 \x53 = 0x53 = S
68 \x73 = 0x73 = s
69 \xc5\xbf = 0x17f = 383 (long S)
70 \xc8\xba = 0x23a = 570
71 \xe2\xb1\xa5 = 0x2c65 = 11365
72 \xe1\xbd\xb8 = 0x1f78 = 8056
73 \xe1\xbf\xb8 = 0x1ff8 = 8184
74 \xf0\x90\x90\x80 = 0x10400 = 66560
75 \xf0\x90\x90\xa8 = 0x10428 = 66600
76 \xc7\x84 = 0x1c4 = 452
77 \xc7\x85 = 0x1c5 = 453
78 \xc7\x86 = 0x1c6 = 454
79 Caseless sets:
80 ucp_Armenian - \x{531}-\x{556} -> \x{561}-\x{586}
81 ucp_Coptic - \x{2c80}-\x{2ce3} -> caseless: XOR 0x1
82 ucp_Latin - \x{ff21}-\x{ff3a} -> \x{ff41}-\x{ff5a}
83
84 Mark property:
85 \xcc\x8d = 0x30d = 781
86 Special:
87 \xc2\x80 = 0x80 = 128 (lowest 2 byte character)
88 \xdf\xbf = 0x7ff = 2047 (highest 2 byte character)
89 \xe0\xa0\x80 = 0x800 = 2048 (lowest 3 byte character)
90 \xef\xbf\xbf = 0xffff = 65535 (highest 3 byte character)
91 \xf0\x90\x80\x80 = 0x10000 = 65536 (lowest 4 byte character)
92 \xf4\x8f\xbf\xbf = 0x10ffff = 1114111 (highest allowed utf character)
93 */
94
/* Test drivers. They are static, so their definitions must appear elsewhere
   in this translation unit. Each returns an int that main() folds into the
   process exit status (any non-zero result makes the exit status non-zero). */
static int regression_tests(void);
static int invalid_utf8_regression_tests(void);
static int invalid_utf16_regression_tests(void);
static int invalid_utf32_regression_tests(void);

/* Program entry point.

   Queries the library for JIT support through whichever code unit width is
   enabled (8, then 16, then 32 is tried at preprocessing time). When JIT is
   not available a message is printed and 1 is returned. Otherwise all four
   test drivers are run and the bitwise OR of their results is returned. */
int main(void)
{
	/* PCRE2_CONFIG_JIT is documented to store a uint32_t, so the output
	   variable uses exactly that type. uint32_t is expected to be visible
	   through pcre2.h, whose API is declared in terms of it. */
	uint32_t jit = 0;
	int failed;

#if defined SUPPORT_PCRE2_8
	pcre2_config_8(PCRE2_CONFIG_JIT, &jit);
#elif defined SUPPORT_PCRE2_16
	pcre2_config_16(PCRE2_CONFIG_JIT, &jit);
#elif defined SUPPORT_PCRE2_32
	pcre2_config_32(PCRE2_CONFIG_JIT, &jit);
#endif

	/* jit keeps its initial 0 if no width is enabled or the query wrote nothing. */
	if (!jit) {
		/* NOTE(review): the message names "pcre_jit_test"; kept unchanged. */
		printf("JIT must be enabled to run pcre_jit_test\n");
		return 1;
	}

	/* Every driver always runs (no short-circuit). Separate statements are
	   used because the operands of a single '|' expression may be evaluated
	   in any order, which would make the order of the test output depend on
	   the compiler. */
	failed = regression_tests();
	failed |= invalid_utf8_regression_tests();
	failed |= invalid_utf16_regression_tests();
	failed |= invalid_utf32_regression_tests();

	return failed;
}
119
120 /* --------------------------------------------------------------------------------------- */
121
122 #if !(defined SUPPORT_PCRE2_8) && !(defined SUPPORT_PCRE2_16) && !(defined SUPPORT_PCRE2_32)
123 #error SUPPORT_PCRE2_8 or SUPPORT_PCRE2_16 or SUPPORT_PCRE2_32 must be defined
124 #endif
125
/* Short names for the compile option combinations used in the
   compile_options column of the test table.
   Letters: C = PCRE2_CASELESS, M = PCRE2_MULTILINE, U = PCRE2_UTF,
   P = PCRE2_UCP. */

/* Single options. */
#define M	(PCRE2_MULTILINE)
#define U	(PCRE2_UTF)

/* Combinations, built from the shorthands above. */
#define MP	(M | PCRE2_UCP)
#define MU	(M | U)
#define MUP	(MU | PCRE2_UCP)
#define CM	(PCRE2_CASELESS | M)
#define CMU	(PCRE2_CASELESS | MU)
#define CMUP	(PCRE2_CASELESS | MUP)
134
/* Default newline convention used by most rows of the test table. */
#define A PCRE2_NEWLINE_ANYCRLF

/* The "newline" column packs two settings into one int: the newline
   convention in the low 16 bits and the \R (BSR) convention above them,
   e.g. PCRE2_NEWLINE_CRLF | BSR(PCRE2_BSR_UNICODE). */

/* Pack: move a BSR value into the upper part of the word. */
#define BSR(value) ((value) << 16)

/* Unpack: recover each half of a packed word. */
#define GET_BSR(packed) ((packed) >> 16)
#define GET_NEWLINE(packed) ((packed) & 0xffff)
140
/* Per-test flags. In the test table they are OR'ed into the start_offset
   column (e.g. "1 | F_NOMATCH"), so that column carries both a number in
   its low 16 bits and flag bits above them. */

/* Low 16 bits of start_offset; presumably selects the plain offset. */
#define OFFSET_MASK	((1 << 16) - 1)

/* Flag bits. F_NOMATCH and F_PROPERTY are used by the visible table rows
   (presumably "no match expected" and "needs Unicode property support");
   how the other flags are consumed is not visible from this part of the file. */
#define F_NO8		(1 << 16)
#define F_NO16		(1 << 17)
/* NOTE(review): F_NO32 has the same value as F_NO16 in the original
   (0x020000); preserved here. Confirm whether a distinct bit was intended. */
#define F_NO32		(1 << 17)
#define F_NOMATCH	(1 << 18)
#define F_DIFF		(1 << 19)
#define F_FORCECONV	(1 << 20)
#define F_PROPERTY	(1 << 21)
149
/* One row of the regression test table. Rows are written as positional
   initializers, so the member order below must not change. */
struct regression_test_case {
	int compile_options,	/* PCRE2 compile option bits (MU, CMU, PCRE2_CASELESS, ...) */
	    newline,		/* newline convention, optionally OR'ed with BSR(...) */
	    match_options,	/* PCRE2 match option bits (e.g. PCRE2_NOTBOL) */
	    start_offset;	/* start offset OR'ed with F_* flags */
	const char *pattern,	/* regular expression source text */
	           *input;	/* subject string the pattern is applied to */
};
158
159 static struct regression_test_case regression_test_cases[] = {
160 /* Constant strings. */
161 { MU, A, 0, 0, "AbC", "AbAbC" },
162 { MU, A, 0, 0, "ACCEPT", "AACACCACCEACCEPACCEPTACCEPTT" },
163 { CMU, A, 0, 0, "aA#\xc3\xa9\xc3\x81", "aA#Aa#\xc3\x89\xc3\xa1" },
164 { M, A, 0, 0, "[^a]", "aAbB" },
165 { CM, A, 0, 0, "[^m]", "mMnN" },
166 { M, A, 0, 0, "a[^b][^#]", "abacd" },
167 { CM, A, 0, 0, "A[^B][^E]", "abacd" },
168 { CMU, A, 0, 0, "[^x][^#]", "XxBll" },
169 { MU, A, 0, 0, "[^a]", "aaa\xc3\xa1#Ab" },
170 { CMU, A, 0, 0, "[^A]", "aA\xe6\x92\xad" },
171 { MU, A, 0, 0, "\\W(\\W)?\\w", "\r\n+bc" },
172 { MU, A, 0, 0, "\\W(\\W)?\\w", "\n\r+bc" },
173 { MU, A, 0, 0, "\\W(\\W)?\\w", "\r\r+bc" },
174 { MU, A, 0, 0, "\\W(\\W)?\\w", "\n\n+bc" },
175 { MU, A, 0, 0, "[axd]", "sAXd" },
176 { CMU, A, 0, 0, "[axd]", "sAXd" },
177 { CMU, A, 0, 0 | F_NOMATCH, "[^axd]", "DxA" },
178 { MU, A, 0, 0, "[a-dA-C]", "\xe6\x92\xad\xc3\xa9.B" },
179 { MU, A, 0, 0, "[^a-dA-C]", "\xe6\x92\xad\xc3\xa9" },
180 { CMU, A, 0, 0, "[^\xc3\xa9]", "\xc3\xa9\xc3\x89." },
181 { MU, A, 0, 0, "[^\xc3\xa9]", "\xc3\xa9\xc3\x89." },
182 { MU, A, 0, 0, "[^a]", "\xc2\x80[]" },
183 { CMU, A, 0, 0, "\xf0\x90\x90\xa7", "\xf0\x90\x91\x8f" },
184 { CM, A, 0, 0, "1a2b3c4", "1a2B3c51A2B3C4" },
185 { PCRE2_CASELESS, 0, 0, 0, "\xff#a", "\xff#\xff\xfe##\xff#A" },
186 { PCRE2_CASELESS, 0, 0, 0, "\xfe", "\xff\xfc#\xfe\xfe" },
187 { PCRE2_CASELESS, 0, 0, 0, "a1", "Aa1" },
188 #ifndef NEVER_BACKSLASH_C
189 { M, A, 0, 0, "\\Ca", "cda" },
190 { CM, A, 0, 0, "\\Ca", "CDA" },
191 { M, A, 0, 0 | F_NOMATCH, "\\Cx", "cda" },
192 { CM, A, 0, 0 | F_NOMATCH, "\\Cx", "CDA" },
193 #endif /* !NEVER_BACKSLASH_C */
194 { CMUP, A, 0, 0, "\xf0\x90\x90\x80\xf0\x90\x90\xa8", "\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
195 { CMUP, A, 0, 0, "\xf0\x90\x90\x80{2}", "\xf0\x90\x90\x80#\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
196 { CMUP, A, 0, 0, "\xf0\x90\x90\xa8{2}", "\xf0\x90\x90\x80#\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
197 { CMUP, A, 0, 0, "\xe1\xbd\xb8\xe1\xbf\xb8", "\xe1\xbf\xb8\xe1\xbd\xb8" },
198 { M, A, 0, 0, "[3-57-9]", "5" },
199 { PCRE2_AUTO_CALLOUT, A, 0, 0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890",
200 "12345678901234567890123456789012345678901234567890123456789012345678901234567890" },
201
202 /* Assertions. */
203 { MU, A, 0, 0, "\\b[^A]", "A_B#" },
204 { M, A, 0, 0 | F_NOMATCH, "\\b\\W", "\n*" },
205 { MU, A, 0, 0, "\\B[^,]\\b[^s]\\b", "#X" },
206 { MP, A, 0, 0, "\\B", "_\xa1" },
207 { MP, A, 0, 0 | F_PROPERTY, "\\b_\\b[,A]\\B", "_," },
208 { MUP, A, 0, 0, "\\b", "\xe6\x92\xad!" },
209 { MUP, A, 0, 0, "\\B", "_\xc2\xa1\xc3\xa1\xc2\x85" },
210 { MUP, A, 0, 0, "\\b[^A]\\B[^c]\\b[^_]\\B", "_\xc3\xa1\xe2\x80\xa8" },
211 { MUP, A, 0, 0, "\\b\\w+\\B", "\xc3\x89\xc2\xa1\xe6\x92\xad\xc3\x81\xc3\xa1" },
212 { MU, A, 0, 0 | F_NOMATCH, "\\b.", "\xcd\xbe" },
213 { CMUP, A, 0, 0, "\\By", "\xf0\x90\x90\xa8y" },
214 { M, A, 0, 0 | F_NOMATCH, "\\R^", "\n" },
215 { M, A, 0, 1 | F_NOMATCH, "^", "\n" },
216 { 0, 0, 0, 0, "^ab", "ab" },
217 { 0, 0, 0, 0 | F_NOMATCH, "^ab", "aab" },
218 { M, PCRE2_NEWLINE_CRLF, 0, 0, "^a", "\r\raa\n\naa\r\naa" },
219 { MU, A, 0, 0, "^-", "\xe2\x80\xa8--\xc2\x85-\r\n-" },
220 { M, PCRE2_NEWLINE_ANY, 0, 0, "^-", "a--b--\x85--" },
221 { MU, PCRE2_NEWLINE_ANY, 0, 0, "^-", "a--\xe2\x80\xa8--" },
222 { MU, PCRE2_NEWLINE_ANY, 0, 0, "^-", "a--\xc2\x85--" },
223 { 0, 0, 0, 0, "ab$", "ab" },
224 { 0, 0, 0, 0 | F_NOMATCH, "ab$", "abab\n\n" },
225 { PCRE2_DOLLAR_ENDONLY, 0, 0, 0 | F_NOMATCH, "ab$", "abab\r\n" },
226 { M, PCRE2_NEWLINE_CRLF, 0, 0, "a$", "\r\raa\n\naa\r\naa" },
227 { M, PCRE2_NEWLINE_ANY, 0, 0, "a$", "aaa" },
228 { MU, PCRE2_NEWLINE_ANYCRLF, 0, 0, "#$", "#\xc2\x85###\r#" },
229 { MU, PCRE2_NEWLINE_ANY, 0, 0, "#$", "#\xe2\x80\xa9" },
230 { 0, PCRE2_NEWLINE_ANY, PCRE2_NOTBOL, 0 | F_NOMATCH, "^a", "aa\naa" },
231 { M, PCRE2_NEWLINE_ANY, PCRE2_NOTBOL, 0, "^a", "aa\naa" },
232 { 0, PCRE2_NEWLINE_ANY, PCRE2_NOTEOL, 0 | F_NOMATCH, "a$", "aa\naa" },
233 { 0, PCRE2_NEWLINE_ANY, PCRE2_NOTEOL, 0 | F_NOMATCH, "a$", "aa\r\n" },
234 { U | PCRE2_DOLLAR_ENDONLY, PCRE2_NEWLINE_ANY, 0, 0 | F_PROPERTY, "\\p{Any}{2,}$", "aa\r\n" },
235 { M, PCRE2_NEWLINE_ANY, PCRE2_NOTEOL, 0, "a$", "aa\naa" },
236 { 0, PCRE2_NEWLINE_CR, 0, 0, ".\\Z", "aaa" },
237 { U, PCRE2_NEWLINE_CR, 0, 0, "a\\Z", "aaa\r" },
238 { 0, PCRE2_NEWLINE_CR, 0, 0, ".\\Z", "aaa\n" },
239 { 0, PCRE2_NEWLINE_CRLF, 0, 0, ".\\Z", "aaa\r" },
240 { U, PCRE2_NEWLINE_CRLF, 0, 0, ".\\Z", "aaa\n" },
241 { 0, PCRE2_NEWLINE_CRLF, 0, 0, ".\\Z", "aaa\r\n" },
242 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa" },
243 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r" },
244 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\n" },
245 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r\n" },
246 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\xe2\x80\xa8" },
247 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa" },
248 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r" },
249 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\n" },
250 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r\n" },
251 { U, PCRE2_NEWLINE_ANY, 0, 0, ".\\Z", "aaa\xc2\x85" },
252 { U, PCRE2_NEWLINE_ANY, 0, 0, ".\\Z", "aaa\xe2\x80\xa8" },
253 { M, A, 0, 0, "\\Aa", "aaa" },
254 { M, A, 0, 1 | F_NOMATCH, "\\Aa", "aaa" },
255 { M, A, 0, 1, "\\Ga", "aaa" },
256 { M, A, 0, 1 | F_NOMATCH, "\\Ga", "aba" },
257 { M, A, 0, 0, "a\\z", "aaa" },
258 { M, A, 0, 0 | F_NOMATCH, "a\\z", "aab" },
259
260 /* Brackets and alternatives. */
261 { MU, A, 0, 0, "(ab|bb|cd)", "bacde" },
262 { MU, A, 0, 0, "(?:ab|a)(bc|c)", "ababc" },
263 { MU, A, 0, 0, "((ab|(cc))|(bb)|(?:cd|efg))", "abac" },
264 { CMU, A, 0, 0, "((aB|(Cc))|(bB)|(?:cd|EFg))", "AcCe" },
265 { MU, A, 0, 0, "((ab|(cc))|(bb)|(?:cd|ebg))", "acebebg" },
266 { MU, A, 0, 0, "(?:(a)|(?:b))(cc|(?:d|e))(a|b)k", "accabdbbccbk" },
267 { MU, A, 0, 0, "\xc7\x82|\xc6\x82", "\xf1\x83\x82\x82\xc7\x82\xc7\x83" },
268 { MU, A, 0, 0, "=\xc7\x82|#\xc6\x82", "\xf1\x83\x82\x82=\xc7\x82\xc7\x83" },
269 { MU, A, 0, 0, "\xc7\x82\xc7\x83|\xc6\x82\xc6\x82", "\xf1\x83\x82\x82\xc7\x82\xc7\x83" },
270 { MU, A, 0, 0, "\xc6\x82\xc6\x82|\xc7\x83\xc7\x83|\xc8\x84\xc8\x84", "\xf1\x83\x82\x82\xc8\x84\xc8\x84" },
271 { U, A, 0, 0, "\xe1\x81\x80|\xe2\x82\x80|\xe4\x84\x80", "\xdf\xbf\xc2\x80\xe4\x84\x80" },
272 { U, A, 0, 0, "(?:\xe1\x81\x80|\xe2\x82\x80|\xe4\x84\x80)#", "\xdf\xbf\xc2\x80#\xe4\x84\x80#" },
273 { CM, A, 0, 0, "ab|cd", "CD" },
274 { CM, A, 0, 0, "a1277|a1377|bX487", "bx487" },
275 { CM, A, 0, 0, "a1277|a1377|bx487", "bX487" },
276
277 /* Greedy and non-greedy ? operators. */
278 { MU, A, 0, 0, "(?:a)?a", "laab" },
279 { CMU, A, 0, 0, "(A)?A", "llaab" },
280 { MU, A, 0, 0, "(a)?\?a", "aab" }, /* ?? is the prefix of trygraphs in GCC. */
281 { MU, A, 0, 0, "(a)?a", "manm" },
282 { CMU, A, 0, 0, "(a|b)?\?d((?:e)?)", "ABABdx" },
283 { MU, A, 0, 0, "(a|b)?\?d((?:e)?)", "abcde" },
284 { MU, A, 0, 0, "((?:ab)?\?g|b(?:g(nn|d)?\?)?)?\?(?:n)?m", "abgnbgnnbgdnmm" },
285
286 /* Greedy and non-greedy + operators */
287 { MU, A, 0, 0, "(aa)+aa", "aaaaaaa" },
288 { MU, A, 0, 0, "(aa)+?aa", "aaaaaaa" },
289 { MU, A, 0, 0, "(?:aba|ab|a)+l", "ababamababal" },
290 { MU, A, 0, 0, "(?:aba|ab|a)+?l", "ababamababal" },
291 { MU, A, 0, 0, "(a(?:bc|cb|b|c)+?|ss)+e", "accssabccbcacbccbbXaccssabccbcacbccbbe" },
292 { MU, A, 0, 0, "(a(?:bc|cb|b|c)+|ss)+?e", "accssabccbcacbccbbXaccssabccbcacbccbbe" },
293 { MU, A, 0, 0, "(?:(b(c)+?)+)?\?(?:(bc)+|(cb)+)+(?:m)+", "bccbcccbcbccbcbPbccbcccbcbccbcbmmn" },
294
295 /* Greedy and non-greedy * operators */
296 { CMU, A, 0, 0, "(?:AA)*AB", "aaaaaaamaaaaaaab" },
297 { MU, A, 0, 0, "(?:aa)*?ab", "aaaaaaamaaaaaaab" },
298 { MU, A, 0, 0, "(aa|ab)*ab", "aaabaaab" },
299 { CMU, A, 0, 0, "(aa|Ab)*?aB", "aaabaaab" },
300 { MU, A, 0, 0, "(a|b)*(?:a)*(?:b)*m", "abbbaaababanabbbaaababamm" },
301 { MU, A, 0, 0, "(a|b)*?(?:a)*?(?:b)*?m", "abbbaaababanabbbaaababamm" },
302 { M, A, 0, 0, "a(a(\\1*)a|(b)b+){0}a", "aa" },
303 { M, A, 0, 0, "((?:a|)*){0}a", "a" },
304
305 /* Combining ? + * operators */
306 { MU, A, 0, 0, "((bm)+)?\?(?:a)*(bm)+n|((am)+?)?(?:a)+(am)*n", "bmbmabmamaaamambmaman" },
307 { MU, A, 0, 0, "(((ab)?cd)*ef)+g", "abcdcdefcdefefmabcdcdefcdefefgg" },
308 { MU, A, 0, 0, "(((ab)?\?cd)*?ef)+?g", "abcdcdefcdefefmabcdcdefcdefefgg" },
309 { MU, A, 0, 0, "(?:(ab)?c|(?:ab)+?d)*g", "ababcdccababddg" },
310 { MU, A, 0, 0, "(?:(?:ab)?\?c|(ab)+d)*?g", "ababcdccababddg" },
311
312 /* Single character iterators. */
313 { MU, A, 0, 0, "(a+aab)+aaaab", "aaaabcaaaabaabcaabcaaabaaaab" },
314 { MU, A, 0, 0, "(a*a*aab)+x", "aaaaabaabaaabmaabx" },
315 { MU, A, 0, 0, "(a*?(b|ab)a*?)+x", "aaaabcxbbaabaacbaaabaabax" },
316 { MU, A, 0, 0, "(a+(ab|ad)a+)+x", "aaabaaaadaabaaabaaaadaaax" },
317 { MU, A, 0, 0, "(a?(a)a?)+(aaa)", "abaaabaaaaaaaa" },
318 { MU, A, 0, 0, "(a?\?(a)a?\?)+(b)", "aaaacaaacaacacbaaab" },
319 { MU, A, 0, 0, "(a{0,4}(b))+d", "aaaaaabaabcaaaaabaaaaabd" },
320 { MU, A, 0, 0, "(a{0,4}?[^b])+d+(a{0,4}[^b])d+", "aaaaadaaaacaadddaaddd" },
321 { MU, A, 0, 0, "(ba{2})+c", "baabaaabacbaabaac" },
322 { MU, A, 0, 0, "(a*+bc++)+", "aaabbcaaabcccab" },
323 { MU, A, 0, 0, "(a?+[^b])+", "babaacacb" },
324 { MU, A, 0, 0, "(a{0,3}+b)(a{0,3}+b)(a{0,3}+)[^c]", "abaabaaacbaabaaaac" },
325 { CMU, A, 0, 0, "([a-c]+[d-f]+?)+?g", "aBdacdehAbDaFgA" },
326 { CMU, A, 0, 0, "[c-f]+k", "DemmFke" },
327 { MU, A, 0, 0, "([DGH]{0,4}M)+", "GGDGHDGMMHMDHHGHM" },
328 { MU, A, 0, 0, "([a-c]{4,}s)+", "abasabbasbbaabsbba" },
329 { CMU, A, 0, 0, "[ace]{3,7}", "AcbDAcEEcEd" },
330 { CMU, A, 0, 0, "[ace]{3,7}?", "AcbDAcEEcEd" },
331 { CMU, A, 0, 0, "[ace]{3,}", "AcbDAcEEcEd" },
332 { CMU, A, 0, 0, "[ace]{3,}?", "AcbDAcEEcEd" },
333 { MU, A, 0, 0, "[ckl]{2,}?g", "cdkkmlglglkcg" },
334 { CMU, A, 0, 0, "[ace]{5}?", "AcCebDAcEEcEd" },
335 { MU, A, 0, 0, "([AbC]{3,5}?d)+", "BACaAbbAEAACCbdCCbdCCAAbb" },
336 { MU, A, 0, 0, "([^ab]{0,}s){2}", "abaabcdsABamsDDs" },
337 { MU, A, 0, 0, "\\b\\w+\\B", "x,a_cd" },
338 { MUP, A, 0, 0, "\\b[^\xc2\xa1]+\\B", "\xc3\x89\xc2\xa1\xe6\x92\xad\xc3\x81\xc3\xa1" },
339 { CMU, A, 0, 0, "[^b]+(a*)([^c]?d{3})", "aaaaddd" },
340 { CMUP, A, 0, 0, "\xe1\xbd\xb8{2}", "\xe1\xbf\xb8#\xe1\xbf\xb8\xe1\xbd\xb8" },
341 { CMU, A, 0, 0, "[^\xf0\x90\x90\x80]{2,4}@", "\xf0\x90\x90\xa8\xf0\x90\x90\x80###\xf0\x90\x90\x80@@@" },
342 { CMU, A, 0, 0, "[^\xe1\xbd\xb8][^\xc3\xa9]", "\xe1\xbd\xb8\xe1\xbf\xb8\xc3\xa9\xc3\x89#" },
343 { MU, A, 0, 0, "[^\xe1\xbd\xb8][^\xc3\xa9]", "\xe1\xbd\xb8\xe1\xbf\xb8\xc3\xa9\xc3\x89#" },
344 { MU, A, 0, 0, "[^\xe1\xbd\xb8]{3,}?", "##\xe1\xbd\xb8#\xe1\xbd\xb8#\xc3\x89#\xe1\xbd\xb8" },
345 { MU, A, 0, 0, "\\d+123", "987654321,01234" },
346 { MU, A, 0, 0, "abcd*|\\w+xy", "aaaaa,abxyz" },
347 { MU, A, 0, 0, "(?:abc|((?:amc|\\b\\w*xy)))", "aaaaa,abxyz" },
348 { MU, A, 0, 0, "a(?R)|([a-z]++)#", ".abcd.abcd#."},
349 { MU, A, 0, 0, "a(?R)|([a-z]++)#", ".abcd.mbcd#."},
350 { MU, A, 0, 0, ".[ab]*.", "xx" },
351 { MU, A, 0, 0, ".[ab]*a", "xxa" },
352 { MU, A, 0, 0, ".[ab]?.", "xx" },
353 { MU, A, 0, 0, "_[ab]+_*a", "_aa" },
354
355 /* Bracket repeats with limit. */
356 { MU, A, 0, 0, "(?:(ab){2}){5}M", "abababababababababababM" },
357 { MU, A, 0, 0, "(?:ab|abab){1,5}M", "abababababababababababM" },
358 { MU, A, 0, 0, "(?>ab|abab){1,5}M", "abababababababababababM" },
359 { MU, A, 0, 0, "(?:ab|abab){1,5}?M", "abababababababababababM" },
360 { MU, A, 0, 0, "(?>ab|abab){1,5}?M", "abababababababababababM" },
361 { MU, A, 0, 0, "(?:(ab){1,4}?){1,3}?M", "abababababababababababababM" },
362 { MU, A, 0, 0, "(?:(ab){1,4}){1,3}abababababababababababM", "ababababababababababababM" },
363 { MU, A, 0, 0 | F_NOMATCH, "(?:(ab){1,4}){1,3}abababababababababababM", "abababababababababababM" },
364 { MU, A, 0, 0, "(ab){4,6}?M", "abababababababM" },
365
366 /* Basic character sets. */
367 { MU, A, 0, 0, "(?:\\s)+(?:\\S)+", "ab \t\xc3\xa9\xe6\x92\xad " },
368 { MU, A, 0, 0, "(\\w)*(k)(\\W)?\?", "abcdef abck11" },
369 { MU, A, 0, 0, "\\((\\d)+\\)\\D", "a() (83 (8)2 (9)ab" },
370 { MU, A, 0, 0, "\\w(\\s|(?:\\d)*,)+\\w\\wb", "a 5, 4,, bb 5, 4,, aab" },
371 { MU, A, 0, 0, "(\\v+)(\\V+)", "\x0e\xc2\x85\xe2\x80\xa8\x0b\x09\xe2\x80\xa9" },
372 { MU, A, 0, 0, "(\\h+)(\\H+)", "\xe2\x80\xa8\xe2\x80\x80\x20\xe2\x80\x8a\xe2\x81\x9f\xe3\x80\x80\x09\x20\xc2\xa0\x0a" },
373 { MU, A, 0, 0, "x[bcef]+", "xaxdxecbfg" },
374 { MU, A, 0, 0, "x[bcdghij]+", "xaxexfxdgbjk" },
375 { MU, A, 0, 0, "x[^befg]+", "xbxexacdhg" },
376 { MU, A, 0, 0, "x[^bcdl]+", "xlxbxaekmd" },
377 { MU, A, 0, 0, "x[^bcdghi]+", "xbxdxgxaefji" },
378 { MU, A, 0, 0, "x[B-Fb-f]+", "xaxAxgxbfBFG" },
379 { CMU, A, 0, 0, "\\x{e9}+", "#\xf0\x90\x90\xa8\xc3\xa8\xc3\xa9\xc3\x89\xc3\x88" },
380 { CMU, A, 0, 0, "[^\\x{e9}]+", "\xc3\xa9#\xf0\x90\x90\xa8\xc3\xa8\xc3\x88\xc3\x89" },
381 { MU, A, 0, 0, "[\\x02\\x7e]+", "\xc3\x81\xe1\xbf\xb8\xf0\x90\x90\xa8\x01\x02\x7e\x7f" },
382 { MU, A, 0, 0, "[^\\x02\\x7e]+", "\x02\xc3\x81\xe1\xbf\xb8\xf0\x90\x90\xa8\x01\x7f\x7e" },
383 { MU, A, 0, 0, "[\\x{81}-\\x{7fe}]+", "#\xe1\xbf\xb8\xf0\x90\x90\xa8\xc2\x80\xc2\x81\xdf\xbe\xdf\xbf" },
384 { MU, A, 0, 0, "[^\\x{81}-\\x{7fe}]+", "\xc2\x81#\xe1\xbf\xb8\xf0\x90\x90\xa8\xc2\x80\xdf\xbf\xdf\xbe" },
385 { MU, A, 0, 0, "[\\x{801}-\\x{fffe}]+", "#\xc3\xa9\xf0\x90\x90\x80\xe0\xa0\x80\xe0\xa0\x81\xef\xbf\xbe\xef\xbf\xbf" },
386 { MU, A, 0, 0, "[^\\x{801}-\\x{fffe}]+", "\xe0\xa0\x81#\xc3\xa9\xf0\x90\x90\x80\xe0\xa0\x80\xef\xbf\xbf\xef\xbf\xbe" },
387 { MU, A, 0, 0, "[\\x{10001}-\\x{10fffe}]+", "#\xc3\xa9\xe2\xb1\xa5\xf0\x90\x80\x80\xf0\x90\x80\x81\xf4\x8f\xbf\xbe\xf4\x8f\xbf\xbf" },
388 { MU, A, 0, 0, "[^\\x{10001}-\\x{10fffe}]+", "\xf0\x90\x80\x81#\xc3\xa9\xe2\xb1\xa5\xf0\x90\x80\x80\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbe" },
389 { CMU, A, 0, 0 | F_NOMATCH, "^[\\x{0100}-\\x{017f}]", " " },
390
391 /* Unicode properties. */
392 { MUP, A, 0, 0, "[1-5\xc3\xa9\\w]", "\xc3\xa1_" },
393 { MUP, A, 0, 0 | F_PROPERTY, "[\xc3\x81\\p{Ll}]", "A_\xc3\x89\xc3\xa1" },
394 { MUP, A, 0, 0, "[\\Wd-h_x-z]+", "a\xc2\xa1#_yhzdxi" },
395 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[\\P{Any}]", "abc" },
396 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[^\\p{Any}]", "abc" },
397 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[\\P{Any}\xc3\xa1-\xc3\xa8]", "abc" },
398 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[^\\p{Any}\xc3\xa1-\xc3\xa8]", "abc" },
399 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[\xc3\xa1-\xc3\xa8\\P{Any}]", "abc" },
400 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[^\xc3\xa1-\xc3\xa8\\p{Any}]", "abc" },
401 { MUP, A, 0, 0 | F_PROPERTY, "[\xc3\xa1-\xc3\xa8\\p{Any}]", "abc" },
402 { MUP, A, 0, 0 | F_PROPERTY, "[^\xc3\xa1-\xc3\xa8\\P{Any}]", "abc" },
403 { MUP, A, 0, 0, "[b-\xc3\xa9\\s]", "a\xc\xe6\x92\xad" },
404 { CMUP, A, 0, 0, "[\xc2\x85-\xc2\x89\xc3\x89]", "\xc2\x84\xc3\xa9" },
405 { MUP, A, 0, 0, "[^b-d^&\\s]{3,}", "db^ !a\xe2\x80\xa8_ae" },
406 { MUP, A, 0, 0 | F_PROPERTY, "[^\\S\\P{Any}][\\sN]{1,3}[\\P{N}]{4}", "\xe2\x80\xaa\xa N\x9\xc3\xa9_0" },
407 { MU, A, 0, 0 | F_PROPERTY, "[^\\P{L}\x9!D-F\xa]{2,3}", "\x9,.DF\xa.CG\xc3\x81" },
408 { CMUP, A, 0, 0, "[\xc3\xa1-\xc3\xa9_\xe2\x80\xa0-\xe2\x80\xaf]{1,5}[^\xe2\x80\xa0-\xe2\x80\xaf]", "\xc2\xa1\xc3\x89\xc3\x89\xe2\x80\xaf_\xe2\x80\xa0" },
409 { MUP, A, 0, 0 | F_PROPERTY, "[\xc3\xa2-\xc3\xa6\xc3\x81-\xc3\x84\xe2\x80\xa8-\xe2\x80\xa9\xe6\x92\xad\\p{Zs}]{2,}", "\xe2\x80\xa7\xe2\x80\xa9\xe6\x92\xad \xe6\x92\xae" },
410 { MUP, A, 0, 0 | F_PROPERTY, "[\\P{L&}]{2}[^\xc2\x85-\xc2\x89\\p{Ll}\\p{Lu}]{2}", "\xc3\xa9\xe6\x92\xad.a\xe6\x92\xad|\xc2\x8a#" },
411 { PCRE2_UCP, 0, 0, 0 | F_PROPERTY, "[a-b\\s]{2,5}[^a]", "AB baaa" },
412 { MUP, 0, 0, 0 | F_NOMATCH, "[^\\p{Hangul}\\p{Z}]", " " },
413
414 /* Possible empty brackets. */
415 { MU, A, 0, 0, "(?:|ab||bc|a)+d", "abcxabcabd" },
416 { MU, A, 0, 0, "(|ab||bc|a)+d", "abcxabcabd" },
417 { MU, A, 0, 0, "(?:|ab||bc|a)*d", "abcxabcabd" },
418 { MU, A, 0, 0, "(|ab||bc|a)*d", "abcxabcabd" },
419 { MU, A, 0, 0, "(?:|ab||bc|a)+?d", "abcxabcabd" },
420 { MU, A, 0, 0, "(|ab||bc|a)+?d", "abcxabcabd" },
421 { MU, A, 0, 0, "(?:|ab||bc|a)*?d", "abcxabcabd" },
422 { MU, A, 0, 0, "(|ab||bc|a)*?d", "abcxabcabd" },
423 { MU, A, 0, 0, "(((a)*?|(?:ba)+)+?|(?:|c|ca)*)*m", "abaacaccabacabalabaacaccabacabamm" },
424 { MU, A, 0, 0, "(?:((?:a)*|(ba)+?)+|(|c|ca)*?)*?m", "abaacaccabacabalabaacaccabacabamm" },
425
426 /* Start offset. */
427 { MU, A, 0, 3, "(\\d|(?:\\w)*\\w)+", "0ac01Hb" },
428 { MU, A, 0, 4 | F_NOMATCH, "(\\w\\W\\w)+", "ab#d" },
429 { MU, A, 0, 2 | F_NOMATCH, "(\\w\\W\\w)+", "ab#d" },
430 { MU, A, 0, 1, "(\\w\\W\\w)+", "ab#d" },
431
432 /* Newline. */
433 { M, PCRE2_NEWLINE_CRLF, 0, 0, "\\W{0,2}[^#]{3}", "\r\n#....." },
434 { M, PCRE2_NEWLINE_CR, 0, 0, "\\W{0,2}[^#]{3}", "\r\n#....." },
435 { M, PCRE2_NEWLINE_CRLF, 0, 0, "\\W{1,3}[^#]", "\r\n##...." },
436 { MU, A, PCRE2_NO_UTF_CHECK, 1, "^.a", "\n\x80\nxa" },
437 { MU, A, 0, 1, "^", "\r\n" },
438 { M, PCRE2_NEWLINE_CRLF, 0, 1 | F_NOMATCH, "^", "\r\n" },
439 { M, PCRE2_NEWLINE_CRLF, 0, 1, "^", "\r\na" },
440
441 /* Any character except newline or any newline. */
442 { 0, PCRE2_NEWLINE_CRLF, 0, 0, ".", "\r" },
443 { U, PCRE2_NEWLINE_CRLF, 0, 0, ".(.).", "a\xc3\xa1\r\n\n\r\r" },
444 { 0, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".(.)", "a\rb\nc\r\n\xc2\x85\xe2\x80\xa8" },
445 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".(.)", "a\rb\nc\r\n\xc2\x85\xe2\x80\xa8" },
446 { U, PCRE2_NEWLINE_ANY, 0, 0, "(.).", "a\rb\nc\r\n\xc2\x85\xe2\x80\xa9$de" },
447 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0 | F_NOMATCH, ".(.).", "\xe2\x80\xa8\nb\r" },
448 { 0, PCRE2_NEWLINE_ANY, 0, 0, "(.)(.)", "#\x85#\r#\n#\r\n#\x84" },
449 { U, PCRE2_NEWLINE_ANY, 0, 0, "(.+)#", "#\rMn\xc2\x85#\n###" },
450 { 0, BSR(PCRE2_BSR_ANYCRLF), 0, 0, "\\R", "\r" },
451 { 0, BSR(PCRE2_BSR_ANYCRLF), 0, 0, "\\R", "\x85#\r\n#" },
452 { U, BSR(PCRE2_BSR_UNICODE), 0, 0, "\\R", "ab\xe2\x80\xa8#c" },
453 { U, BSR(PCRE2_BSR_UNICODE), 0, 0, "\\R", "ab\r\nc" },
454 { U, PCRE2_NEWLINE_CRLF | BSR(PCRE2_BSR_UNICODE), 0, 0, "(\\R.)+", "\xc2\x85\r\n#\xe2\x80\xa8\n\r\n\r" },
455 { MU, A, 0, 0 | F_NOMATCH, "\\R+", "ab" },
456 { MU, A, 0, 0, "\\R+", "ab\r\n\r" },
457 { MU, A, 0, 0, "\\R*", "ab\r\n\r" },
458 { MU, A, 0, 0, "\\R*", "\r\n\r" },
459 { MU, A, 0, 0, "\\R{2,4}", "\r\nab\r\r" },
460 { MU, A, 0, 0, "\\R{2,4}", "\r\nab\n\n\n\r\r\r" },
461 { MU, A, 0, 0, "\\R{2,}", "\r\nab\n\n\n\r\r\r" },
462 { MU, A, 0, 0, "\\R{0,3}", "\r\n\r\n\r\n\r\n\r\n" },
463 { MU, A, 0, 0 | F_NOMATCH, "\\R+\\R\\R", "\r\n\r\n" },
464 { MU, A, 0, 0, "\\R+\\R\\R", "\r\r\r" },
465 { MU, A, 0, 0, "\\R*\\R\\R", "\n\r" },
466 { MU, A, 0, 0 | F_NOMATCH, "\\R{2,4}\\R\\R", "\r\r\r" },
467 { MU, A, 0, 0, "\\R{2,4}\\R\\R", "\r\r\r\r" },
468
469 /* Atomic groups (no fallback from "next" direction). */
470 { MU, A, 0, 0 | F_NOMATCH, "(?>ab)ab", "bab" },
471 { MU, A, 0, 0 | F_NOMATCH, "(?>(ab))ab", "bab" },
472 { MU, A, 0, 0, "(?>ab)+abc(?>de)*def(?>gh)?ghe(?>ij)+?k(?>lm)*?n(?>op)?\?op",
473 "bababcdedefgheijijklmlmnop" },
474 { MU, A, 0, 0, "(?>a(b)+a|(ab)?\?(b))an", "abban" },
475 { MU, A, 0, 0, "(?>ab+a|(?:ab)?\?b)an", "abban" },
476 { MU, A, 0, 0, "((?>ab|ad|)*?)(?>|c)*abad", "abababcababad" },
477 { MU, A, 0, 0, "(?>(aa|b|)*+(?>(##)|###)*d|(aa)(?>(baa)?)m)", "aabaa#####da" },
478 { MU, A, 0, 0, "((?>a|)+?)b", "aaacaaab" },
479 { MU, A, 0, 0, "(?>x|)*$", "aaa" },
480 { MU, A, 0, 0, "(?>(x)|)*$", "aaa" },
481 { MU, A, 0, 0, "(?>x|())*$", "aaa" },
482 { MU, A, 0, 0, "((?>[cxy]a|[a-d])*?)b", "aaa+ aaab" },
483 { MU, A, 0, 0, "((?>[cxy](a)|[a-d])*?)b", "aaa+ aaab" },
484 { MU, A, 0, 0, "(?>((?>(a+))))bab|(?>((?>(a+))))bb", "aaaabaaabaabab" },
485 { MU, A, 0, 0, "(?>(?>a+))bab|(?>(?>a+))bb", "aaaabaaabaabab" },
486 { MU, A, 0, 0, "(?>(a)c|(?>(c)|(a))a)b*?bab", "aaaabaaabaabab" },
487 { MU, A, 0, 0, "(?>ac|(?>c|a)a)b*?bab", "aaaabaaabaabab" },
488 { MU, A, 0, 0, "(?>(b)b|(a))*b(?>(c)|d)?x", "ababcaaabdbx" },
489 { MU, A, 0, 0, "(?>bb|a)*b(?>c|d)?x", "ababcaaabdbx" },
490 { MU, A, 0, 0, "(?>(bb)|a)*b(?>c|(d))?x", "ababcaaabdbx" },
491 { MU, A, 0, 0, "(?>(a))*?(?>(a))+?(?>(a))??x", "aaaaaacccaaaaabax" },
492 { MU, A, 0, 0, "(?>a)*?(?>a)+?(?>a)??x", "aaaaaacccaaaaabax" },
493 { MU, A, 0, 0, "(?>(a)|)*?(?>(a)|)+?(?>(a)|)??x", "aaaaaacccaaaaabax" },
494 { MU, A, 0, 0, "(?>a|)*?(?>a|)+?(?>a|)??x", "aaaaaacccaaaaabax" },
495 { MU, A, 0, 0, "(?>a(?>(a{0,2}))*?b|aac)+b", "aaaaaaacaaaabaaaaacaaaabaacaaabb" },
496 { CM, A, 0, 0, "(?>((?>a{32}|b+|(a*))?(?>c+|d*)?\?)+e)+?f", "aaccebbdde bbdaaaccebbdee bbdaaaccebbdeef" },
497 { MU, A, 0, 0, "(?>(?:(?>aa|a||x)+?b|(?>aa|a||(x))+?c)?(?>[ad]{0,2})*?d)+d", "aaacdbaabdcabdbaaacd aacaabdbdcdcaaaadaabcbaadd" },
498 { MU, A, 0, 0, "(?>(?:(?>aa|a||(x))+?b|(?>aa|a||x)+?c)?(?>[ad]{0,2})*?d)+d", "aaacdbaabdcabdbaaacd aacaabdbdcdcaaaadaabcbaadd" },
499 { MU, A, 0, 0 | F_PROPERTY, "\\X", "\xcc\x8d\xcc\x8d" },
500 { MU, A, 0, 0 | F_PROPERTY, "\\X", "\xcc\x8d\xcc\x8d#\xcc\x8d\xcc\x8d" },
501 { MU, A, 0, 0 | F_PROPERTY, "\\X+..", "\xcc\x8d#\xcc\x8d#\xcc\x8d\xcc\x8d" },
502 { MU, A, 0, 0 | F_PROPERTY, "\\X{2,4}", "abcdef" },
503 { MU, A, 0, 0 | F_PROPERTY, "\\X{2,4}?", "abcdef" },
504 { MU, A, 0, 0 | F_NOMATCH | F_PROPERTY, "\\X{2,4}..", "#\xcc\x8d##" },
505 { MU, A, 0, 0 | F_PROPERTY, "\\X{2,4}..", "#\xcc\x8d#\xcc\x8d##" },
506 { MU, A, 0, 0, "(c(ab)?+ab)+", "cabcababcab" },
507 { MU, A, 0, 0, "(?>(a+)b)+aabab", "aaaabaaabaabab" },
508
509 /* Possessive quantifiers. */
510 { MU, A, 0, 0, "(?:a|b)++m", "mababbaaxababbaam" },
511 { MU, A, 0, 0, "(?:a|b)*+m", "mababbaaxababbaam" },
512 { MU, A, 0, 0, "(?:a|b)*+m", "ababbaaxababbaam" },
513 { MU, A, 0, 0, "(a|b)++m", "mababbaaxababbaam" },
514 { MU, A, 0, 0, "(a|b)*+m", "mababbaaxababbaam" },
515 { MU, A, 0, 0, "(a|b)*+m", "ababbaaxababbaam" },
516 { MU, A, 0, 0, "(a|b(*ACCEPT))++m", "maaxab" },
517 { MU, A, 0, 0, "(?:b*)++m", "bxbbxbbbxm" },
518 { MU, A, 0, 0, "(?:b*)++m", "bxbbxbbbxbbm" },
519 { MU, A, 0, 0, "(?:b*)*+m", "bxbbxbbbxm" },
520 { MU, A, 0, 0, "(?:b*)*+m", "bxbbxbbbxbbm" },
521 { MU, A, 0, 0, "(b*)++m", "bxbbxbbbxm" },
522 { MU, A, 0, 0, "(b*)++m", "bxbbxbbbxbbm" },
523 { MU, A, 0, 0, "(b*)*+m", "bxbbxbbbxm" },
524 { MU, A, 0, 0, "(b*)*+m", "bxbbxbbbxbbm" },
525 { MU, A, 0, 0, "(?:a|(b))++m", "mababbaaxababbaam" },
526 { MU, A, 0, 0, "(?:(a)|b)*+m", "mababbaaxababbaam" },
527 { MU, A, 0, 0, "(?:(a)|(b))*+m", "ababbaaxababbaam" },
528 { MU, A, 0, 0, "(a|(b))++m", "mababbaaxababbaam" },
529 { MU, A, 0, 0, "((a)|b)*+m", "mababbaaxababbaam" },
530 { MU, A, 0, 0, "((a)|(b))*+m", "ababbaaxababbaam" },
531 { MU, A, 0, 0, "(a|(b)(*ACCEPT))++m", "maaxab" },
532 { MU, A, 0, 0, "(?:(b*))++m", "bxbbxbbbxm" },
533 { MU, A, 0, 0, "(?:(b*))++m", "bxbbxbbbxbbm" },
534 { MU, A, 0, 0, "(?:(b*))*+m", "bxbbxbbbxm" },
535 { MU, A, 0, 0, "(?:(b*))*+m", "bxbbxbbbxbbm" },
536 { MU, A, 0, 0, "((b*))++m", "bxbbxbbbxm" },
537 { MU, A, 0, 0, "((b*))++m", "bxbbxbbbxbbm" },
538 { MU, A, 0, 0, "((b*))*+m", "bxbbxbbbxm" },
539 { MU, A, 0, 0, "((b*))*+m", "bxbbxbbbxbbm" },
540 { MU, A, 0, 0 | F_NOMATCH, "(?>(b{2,4}))(?:(?:(aa|c))++m|(?:(aa|c))+n)", "bbaacaaccaaaacxbbbmbn" },
541 { MU, A, 0, 0, "((?:b)++a)+(cd)*+m", "bbababbacdcdnbbababbacdcdm" },
542 { MU, A, 0, 0, "((?:(b))++a)+((c)d)*+m", "bbababbacdcdnbbababbacdcdm" },
543 { MU, A, 0, 0, "(?:(?:(?:ab)*+k)++(?:n(?:cd)++)*+)*+m", "ababkkXababkkabkncXababkkabkncdcdncdXababkkabkncdcdncdkkabkncdXababkkabkncdcdncdkkabkncdm" },
544 { MU, A, 0, 0, "(?:((ab)*+(k))++(n(?:c(d))++)*+)*+m", "ababkkXababkkabkncXababkkabkncdcdncdXababkkabkncdcdncdkkabkncdXababkkabkncdcdncdkkabkncdm" },
545
546 /* Back references. */
547 { MU, A, 0, 0, "(aa|bb)(\\1*)(ll|)(\\3*)bbbbbbc", "aaaaaabbbbbbbbc" },
548 { CMU, A, 0, 0, "(aa|bb)(\\1+)(ll|)(\\3+)bbbbbbc", "bBbbBbCbBbbbBbbcbbBbbbBBbbC" },
549 { CM, A, 0, 0, "(a{2,4})\\1", "AaAaaAaA" },
550 { MU, A, 0, 0, "(aa|bb)(\\1?)aa(\\1?)(ll|)(\\4+)bbc", "aaaaaaaabbaabbbbaabbbbc" },
551 { MU, A, 0, 0, "(aa|bb)(\\1{0,5})(ll|)(\\3{0,5})cc", "bbxxbbbbxxaaaaaaaaaaaaaaaacc" },
552 { MU, A, 0, 0, "(aa|bb)(\\1{3,5})(ll|)(\\3{3,5})cc", "bbbbbbbbbbbbaaaaaaccbbbbbbbbbbbbbbcc" },
553 { MU, A, 0, 0, "(aa|bb)(\\1{3,})(ll|)(\\3{3,})cc", "bbbbbbbbbbbbaaaaaaccbbbbbbbbbbbbbbcc" },
554 { MU, A, 0, 0, "(\\w+)b(\\1+)c", "GabGaGaDbGaDGaDc" },
555 { MU, A, 0, 0, "(?:(aa)|b)\\1?b", "bb" },
556 { CMU, A, 0, 0, "(aa|bb)(\\1*?)aa(\\1+?)", "bBBbaaAAaaAAaa" },
557 { MU, A, 0, 0, "(aa|bb)(\\1*?)(dd|)cc(\\3+?)", "aaaaaccdd" },
558 { CMU, A, 0, 0, "(?:(aa|bb)(\\1?\?)cc){2}(\\1?\?)", "aAaABBbbAAaAcCaAcCaA" },
559 { MU, A, 0, 0, "(?:(aa|bb)(\\1{3,5}?)){2}(dd|)(\\3{3,5}?)", "aaaaaabbbbbbbbbbaaaaaaaaaaaaaa" },
560 { CM, A, 0, 0, "(?:(aa|bb)(\\1{3,}?)){2}(dd|)(\\3{3,}?)", "aaaaaabbbbbbbbbbaaaaaaaaaaaaaa" },
561 { MU, A, 0, 0, "(?:(aa|bb)(\\1{0,3}?)){2}(dd|)(\\3{0,3}?)b(\\1{0,3}?)(\\1{0,3})", "aaaaaaaaaaaaaaabaaaaa" },
562 { MU, A, 0, 0, "(a(?:\\1|)a){3}b", "aaaaaaaaaaab" },
563 { M, A, 0, 0, "(a?)b(\\1\\1*\\1+\\1?\\1*?\\1+?\\1??\\1*+\\1++\\1?+\\1{4}\\1{3,5}\\1{4,}\\1{0,5}\\1{3,5}?\\1{4,}?\\1{0,5}?\\1{3,5}+\\1{4,}+\\1{0,5}+#){2}d", "bb#b##d" },
564 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{2,}", ".www." },
565 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{0,2}", "wwwww." },
566 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{1,2}ww", "wwww" },
567 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{1,2}ww", "wwwww" },
568 { PCRE2_UCP, 0, 0, 0 | F_PROPERTY, "(\\P{N})\\1{2,}", ".www." },
569 { CMUP, A, 0, 0, "(\xf0\x90\x90\x80)\\1", "\xf0\x90\x90\xa8\xf0\x90\x90\xa8" },
570 { MU | PCRE2_DUPNAMES, A, 0, 0 | F_NOMATCH, "\\k<A>{1,3}(?<A>aa)(?<A>bb)", "aabb" },
571 { MU | PCRE2_DUPNAMES | PCRE2_MATCH_UNSET_BACKREF, A, 0, 0, "\\k<A>{1,3}(?<A>aa)(?<A>bb)", "aabb" },
572 { MU | PCRE2_DUPNAMES | PCRE2_MATCH_UNSET_BACKREF, A, 0, 0, "\\k<A>*(?<A>aa)(?<A>bb)", "aabb" },
573 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?<A>aa)(?<A>bb)\\k<A>{0,3}aaaaaa", "aabbaaaaaa" },
574 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?<A>aa)(?<A>bb)\\k<A>{2,5}bb", "aabbaaaabb" },
575 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>{0,3}m", "aaaaaaaabbbbaabbbbm" },
576 { MU | PCRE2_DUPNAMES, A, 0, 0 | F_NOMATCH, "\\k<A>{1,3}?(?<A>aa)(?<A>bb)", "aabb" },
577 { MU | PCRE2_DUPNAMES | PCRE2_MATCH_UNSET_BACKREF, A, 0, 0, "\\k<A>{1,3}?(?<A>aa)(?<A>bb)", "aabb" },
578 { MU | PCRE2_DUPNAMES, A, 0, 0, "\\k<A>*?(?<A>aa)(?<A>bb)", "aabb" },
579 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>{0,3}?m", "aaaaaabbbbbbaabbbbbbbbbbm" },
580 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>*?m", "aaaaaabbbbbbaabbbbbbbbbbm" },
581 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>{2,3}?", "aaaabbbbaaaabbbbbbbbbb" },
582 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{0,3}M", "aaaaaaaabbbbaabbbbm" },
583 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{1,3}M", "aaaaaaaabbbbaabbbbm" },
584 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{0,3}?M", "aaaaaabbbbbbaabbbbbbbbbbm" },
585 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{2,3}?", "aaaabbbbaaaabbbbbbbbbb" },
586
587 /* Assertions. */
588 { MU, A, 0, 0, "(?=xx|yy|zz)\\w{4}", "abczzdefg" },
589 { MU, A, 0, 0, "(?=((\\w+)b){3}|ab)", "dbbbb ab" },
590 { MU, A, 0, 0, "(?!ab|bc|cd)[a-z]{2}", "Xabcdef" },
591 { MU, A, 0, 0, "(?<=aaa|aa|a)a", "aaa" },
592 { MU, A, 0, 2, "(?<=aaa|aa|a)a", "aaa" },
593 { M, A, 0, 0, "(?<=aaa|aa|a)a", "aaa" },
594 { M, A, 0, 2, "(?<=aaa|aa|a)a", "aaa" },
595 { MU, A, 0, 0, "(\\d{2})(?!\\w+c|(((\\w?)m){2}n)+|\\1)", "x5656" },
596 { MU, A, 0, 0, "((?=((\\d{2,6}\\w){2,}))\\w{5,20}K){2,}", "567v09708K12l00M00 567v09708K12l00M00K45K" },
597 { MU, A, 0, 0, "(?=(?:(?=\\S+a)\\w*(b)){3})\\w+\\d", "bba bbab nbbkba nbbkba0kl" },
598 { MU, A, 0, 0, "(?>a(?>(b+))a(?=(..)))*?k", "acabbcabbaabacabaabbakk" },
599 { MU, A, 0, 0, "((?(?=(a))a)+k)", "bbak" },
600 { MU, A, 0, 0, "((?(?=a)a)+k)", "bbak" },
601 { MU, A, 0, 0 | F_NOMATCH, "(?=(?>(a))m)amk", "a k" },
602 { MU, A, 0, 0 | F_NOMATCH, "(?!(?>(a))m)amk", "a k" },
603 { MU, A, 0, 0 | F_NOMATCH, "(?>(?=(a))am)amk", "a k" },
604 { MU, A, 0, 0, "(?=(?>a|(?=(?>(b+))a|c)[a-c]+)*?m)[a-cm]+k", "aaam bbam baaambaam abbabba baaambaamk" },
605 { MU, A, 0, 0, "(?> ?\?\\b(?(?=\\w{1,4}(a))m)\\w{0,8}bc){2,}?", "bca ssbc mabd ssbc mabc" },
606 { MU, A, 0, 0, "(?:(?=ab)?[^n][^n])+m", "ababcdabcdcdabnababcdabcdcdabm" },
607 { MU, A, 0, 0, "(?:(?=a(b))?[^n][^n])+m", "ababcdabcdcdabnababcdabcdcdabm" },
608 { MU, A, 0, 0, "(?:(?=.(.))??\\1.)+m", "aabbbcbacccanaabbbcbacccam" },
609 { MU, A, 0, 0, "(?:(?=.)??[a-c])+m", "abacdcbacacdcaccam" },
610 { MU, A, 0, 0, "((?!a)?(?!([^a]))?)+$", "acbab" },
611 { MU, A, 0, 0, "((?!a)?\?(?!([^a]))?\?)+$", "acbab" },
612 { MU, A, 0, 0, "a(?=(?C)\\B(?C`x`))b", "ab" },
613 { MU, A, 0, 0, "a(?!(?C)\\B(?C`x`))bb|ab", "abb" },
614 { MU, A, 0, 0, "a(?=\\b|(?C)\\B(?C`x`))b", "ab" },
615 { MU, A, 0, 0, "a(?!\\b|(?C)\\B(?C`x`))bb|ab", "abb" },
616 { MU, A, 0, 0, "c(?(?=(?C)\\B(?C`x`))ab|a)", "cab" },
617 { MU, A, 0, 0, "c(?(?!(?C)\\B(?C`x`))ab|a)", "cab" },
618 { MU, A, 0, 0, "c(?(?=\\b|(?C)\\B(?C`x`))ab|a)", "cab" },
619 { MU, A, 0, 0, "c(?(?!\\b|(?C)\\B(?C`x`))ab|a)", "cab" },
620 { MU, A, 0, 0, "a(?=)b", "ab" },
621 { MU, A, 0, 0 | F_NOMATCH, "a(?!)b", "ab" },
622
623 /* Not empty, ACCEPT, FAIL */
624 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "a*", "bcx" },
625 { MU, A, PCRE2_NOTEMPTY, 0, "a*", "bcaad" },
626 { MU, A, PCRE2_NOTEMPTY, 0, "a*?", "bcaad" },
627 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0, "a*", "bcaad" },
628 { MU, A, 0, 0, "a(*ACCEPT)b", "ab" },
629 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "a*(*ACCEPT)b", "bcx" },
630 { MU, A, PCRE2_NOTEMPTY, 0, "a*(*ACCEPT)b", "bcaad" },
631 { MU, A, PCRE2_NOTEMPTY, 0, "a*?(*ACCEPT)b", "bcaad" },
632 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "(?:z|a*(*ACCEPT)b)", "bcx" },
633 { MU, A, PCRE2_NOTEMPTY, 0, "(?:z|a*(*ACCEPT)b)", "bcaad" },
634 { MU, A, PCRE2_NOTEMPTY, 0, "(?:z|a*?(*ACCEPT)b)", "bcaad" },
635 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0, "a*(*ACCEPT)b", "bcx" },
636 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0 | F_NOMATCH, "a*(*ACCEPT)b", "" },
637 { MU, A, 0, 0, "((a(*ACCEPT)b))", "ab" },
638 { MU, A, 0, 0, "(a(*FAIL)a|a)", "aaa" },
639 { MU, A, 0, 0, "(?=ab(*ACCEPT)b)a", "ab" },
640 { MU, A, 0, 0, "(?=(?:x|ab(*ACCEPT)b))", "ab" },
641 { MU, A, 0, 0, "(?=(a(b(*ACCEPT)b)))a", "ab" },
642 { MU, A, PCRE2_NOTEMPTY, 0, "(?=a*(*ACCEPT))c", "c" },
643 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "(?=A)", "AB" },
644
645 /* Conditional blocks. */
646 { MU, A, 0, 0, "(?(?=(a))a|b)+k", "ababbalbbadabak" },
647 { MU, A, 0, 0, "(?(?!(b))a|b)+k", "ababbalbbadabak" },
648 { MU, A, 0, 0, "(?(?=a)a|b)+k", "ababbalbbadabak" },
649 { MU, A, 0, 0, "(?(?!b)a|b)+k", "ababbalbbadabak" },
650 { MU, A, 0, 0, "(?(?=(a))a*|b*)+k", "ababbalbbadabak" },
651 { MU, A, 0, 0, "(?(?!(b))a*|b*)+k", "ababbalbbadabak" },
652 { MU, A, 0, 0, "(?(?!(b))(?:aaaaaa|a)|(?:bbbbbb|b))+aaaak", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb aaaaaaak" },
653 { MU, A, 0, 0, "(?(?!b)(?:aaaaaa|a)|(?:bbbbbb|b))+aaaak", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb aaaaaaak" },
654 { MU, A, 0, 0 | F_DIFF, "(?(?!(b))(?:aaaaaa|a)|(?:bbbbbb|b))+bbbbk", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb bbbbbbbk" },
655 { MU, A, 0, 0, "(?(?!b)(?:aaaaaa|a)|(?:bbbbbb|b))+bbbbk", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb bbbbbbbk" },
656 { MU, A, 0, 0, "(?(?=a)a*|b*)+k", "ababbalbbadabak" },
657 { MU, A, 0, 0, "(?(?!b)a*|b*)+k", "ababbalbbadabak" },
658 { MU, A, 0, 0, "(?(?=a)ab)", "a" },
659 { MU, A, 0, 0, "(?(?<!b)c)", "b" },
660 { MU, A, 0, 0, "(?(DEFINE)a(b))", "a" },
661 { MU, A, 0, 0, "a(?(DEFINE)(?:b|(?:c?)+)*)", "a" },
662 { MU, A, 0, 0, "(?(?=.[a-c])[k-l]|[A-D])", "kdB" },
663 { MU, A, 0, 0, "(?(?!.{0,4}[cd])(aa|bb)|(cc|dd))+", "aabbccddaa" },
664 { MU, A, 0, 0, "(?(?=[^#@]*@)(aaab|aa|aba)|(aba|aab)){3,}", "aaabaaaba#aaabaaaba#aaabaaaba@" },
665 { MU, A, 0, 0, "((?=\\w{5})\\w(?(?=\\w*k)\\d|[a-f_])*\\w\\s)+", "mol m10kk m088k _f_a_ mbkkl" },
666 { MU, A, 0, 0, "(c)?\?(?(1)a|b)", "cdcaa" },
667 { MU, A, 0, 0, "(c)?\?(?(1)a|b)", "cbb" },
668 { MU, A, 0, 0 | F_DIFF, "(?(?=(a))(aaaa|a?))+aak", "aaaaab aaaaak" },
669 { MU, A, 0, 0, "(?(?=a)(aaaa|a?))+aak", "aaaaab aaaaak" },
670 { MU, A, 0, 0, "(?(?!(b))(aaaa|a?))+aak", "aaaaab aaaaak" },
671 { MU, A, 0, 0, "(?(?!b)(aaaa|a?))+aak", "aaaaab aaaaak" },
672 { MU, A, 0, 0 | F_DIFF, "(?(?=(a))a*)+aak", "aaaaab aaaaak" },
673 { MU, A, 0, 0, "(?(?=a)a*)+aak", "aaaaab aaaaak" },
674 { MU, A, 0, 0, "(?(?!(b))a*)+aak", "aaaaab aaaaak" },
675 { MU, A, 0, 0, "(?(?!b)a*)+aak", "aaaaab aaaaak" },
676 { MU, A, 0, 0, "(?(?=(?=(?!(x))a)aa)aaa|(?(?=(?!y)bb)bbb))*k", "abaabbaaabbbaaabbb abaabbaaabbbaaabbbk" },
677 { MU, A, 0, 0, "(?P<Name>a)?(?P<Name2>b)?(?(Name)c|d)*l", "bc ddd abccabccl" },
678 { MU, A, 0, 0, "(?P<Name>a)?(?P<Name2>b)?(?(Name)c|d)+?dd", "bcabcacdb bdddd" },
679 { MU, A, 0, 0, "(?P<Name>a)?(?P<Name2>b)?(?(Name)c|d)+l", "ababccddabdbccd abcccl" },
680 { MU, A, 0, 0, "((?:a|aa)(?(1)aaa))x", "aax" },
681 { MU, A, 0, 0, "(?(?!)a|b)", "ab" },
682 { MU, A, 0, 0, "(?(?!)a)", "ab" },
683 { MU, A, 0, 0 | F_NOMATCH, "(?(?!)a|b)", "ac" },
684
685 /* Set start of match. */
686 { MU, A, 0, 0, "(?:\\Ka)*aaaab", "aaaaaaaa aaaaaaabb" },
687 { MU, A, 0, 0, "(?>\\Ka\\Ka)*aaaab", "aaaaaaaa aaaaaaaaaabb" },
688 { MU, A, 0, 0, "a+\\K(?<=\\Gaa)a", "aaaaaa" },
689 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "a\\K(*ACCEPT)b", "aa" },
690 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0, "a\\K(*ACCEPT)b", "aa" },
691
692 /* First line. */
693 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_PROPERTY, "\\p{Any}a", "bb\naaa" },
694 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH | F_PROPERTY, "\\p{Any}a", "bb\r\naaa" },
695 { MU | PCRE2_FIRSTLINE, A, 0, 0, "(?<=a)", "a" },
696 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "[^a][^b]", "ab" },
697 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "a", "\na" },
698 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "[abc]", "\na" },
699 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "^a", "\na" },
700 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "^(?<=\n)", "\na" },
701 { MU | PCRE2_FIRSTLINE, A, 0, 0, "\xf0\x90\x90\x80", "\xf0\x90\x90\x80" },
702 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_ANY, 0, 0 | F_NOMATCH, "#", "\xc2\x85#" },
703 { M | PCRE2_FIRSTLINE, PCRE2_NEWLINE_ANY, 0, 0 | F_NOMATCH, "#", "\x85#" },
704 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_ANY, 0, 0 | F_NOMATCH, "^#", "\xe2\x80\xa8#" },
705 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0 | F_PROPERTY, "\\p{Any}", "\r\na" },
706 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0, ".", "\r" },
707 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0, "a", "\ra" },
708 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0 | F_NOMATCH, "ba", "bbb\r\nba" },
709 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0 | F_NOMATCH | F_PROPERTY, "\\p{Any}{4}|a", "\r\na" },
710 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 1, ".", "\r\n" },
711 { PCRE2_FIRSTLINE | PCRE2_DOTALL, PCRE2_NEWLINE_LF, 0, 0 | F_NOMATCH, "ab.", "ab" },
712 { MU | PCRE2_FIRSTLINE, A, 0, 1 | F_NOMATCH, "^[a-d0-9]", "\nxx\nd" },
713 { PCRE2_FIRSTLINE | PCRE2_DOTALL, PCRE2_NEWLINE_ANY, 0, 0, "....a", "012\n0a" },
714 { MU | PCRE2_FIRSTLINE, A, 0, 0, "[aC]", "a" },
715
716 /* Recurse. */
717 { MU, A, 0, 0, "(a)(?1)", "aa" },
718 { MU, A, 0, 0, "((a))(?1)", "aa" },
719 { MU, A, 0, 0, "(b|a)(?1)", "aa" },
720 { MU, A, 0, 0, "(b|(a))(?1)", "aa" },
721 { MU, A, 0, 0 | F_NOMATCH, "((a)(b)(?:a*))(?1)", "aba" },
722 { MU, A, 0, 0, "((a)(b)(?:a*))(?1)", "abab" },
723 { MU, A, 0, 0, "((a+)c(?2))b(?1)", "aacaabaca" },
724 { MU, A, 0, 0, "((?2)b|(a)){2}(?1)", "aabab" },
725 { MU, A, 0, 0, "(?1)(a)*+(?2)(b(?1))", "aababa" },
726 { MU, A, 0, 0, "(?1)(((a(*ACCEPT)))b)", "axaa" },
727 { MU, A, 0, 0, "(?1)(?(DEFINE) (((ac(*ACCEPT)))b) )", "akaac" },
728 { MU, A, 0, 0, "(a+)b(?1)b\\1", "abaaabaaaaa" },
729 { MU, A, 0, 0, "(?(DEFINE)(aa|a))(?1)ab", "aab" },
730 { MU, A, 0, 0, "(?(DEFINE)(a\\Kb))(?1)+ababc", "abababxabababc" },
731 { MU, A, 0, 0, "(a\\Kb)(?1)+ababc", "abababxababababc" },
732 { MU, A, 0, 0 | F_NOMATCH, "(a\\Kb)(?1)+ababc", "abababxababababxc" },
733 { MU, A, 0, 0, "b|<(?R)*>", "<<b>" },
734 { MU, A, 0, 0, "(a\\K){0}(?:(?1)b|ac)", "ac" },
735 { MU, A, 0, 0, "(?(DEFINE)(a(?2)|b)(b(?1)|(a)))(?:(?1)|(?2))m", "ababababnababababaam" },
736 { MU, A, 0, 0, "(a)((?(R)a|b))(?2)", "aabbabaa" },
737 { MU, A, 0, 0, "(a)((?(R2)a|b))(?2)", "aabbabaa" },
738 { MU, A, 0, 0, "(a)((?(R1)a|b))(?2)", "ababba" },
739 { MU, A, 0, 0, "(?(R0)aa|bb(?R))", "abba aabb bbaa" },
740 { MU, A, 0, 0, "((?(R)(?:aaaa|a)|(?:(aaaa)|(a)))+)(?1)$", "aaaaaaaaaa aaaa" },
741 { MU, A, 0, 0, "(?P<Name>a(?(R&Name)a|b))(?1)", "aab abb abaa" },
742 { MU, A, 0, 0, "((?(R)a|(?1)){3})", "XaaaaaaaaaX" },
743 { MU, A, 0, 0, "((?:(?(R)a|(?1))){3})", "XaaaaaaaaaX" },
744 { MU, A, 0, 0, "((?(R)a|(?1)){1,3})aaaaaa", "aaaaaaaaXaaaaaaaaa" },
745 { MU, A, 0, 0, "((?(R)a|(?1)){1,3}?)M", "aaaM" },
746 { MU, A, 0, 0, "((.)(?:.|\\2(?1))){0}#(?1)#", "#aabbccdde# #aabbccddee#" },
747 { MU, A, 0, 0, "((.)(?:\\2|\\2{4}b)){0}#(?:(?1))+#", "#aaaab# #aaaaab#" },
748
749 /* 16 bit specific tests. */
750 { CM, A, 0, 0 | F_FORCECONV, "\xc3\xa1", "\xc3\x81\xc3\xa1" },
751 { CM, A, 0, 0 | F_FORCECONV, "\xe1\xbd\xb8", "\xe1\xbf\xb8\xe1\xbd\xb8" },
752 { CM, A, 0, 0 | F_FORCECONV, "[\xc3\xa1]", "\xc3\x81\xc3\xa1" },
753 { CM, A, 0, 0 | F_FORCECONV, "[\xe1\xbd\xb8]", "\xe1\xbf\xb8\xe1\xbd\xb8" },
754 { CM, A, 0, 0 | F_FORCECONV, "[a-\xed\xb0\x80]", "A" },
755 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[a-\\x{dc00}]", "B" },
756 { CM, A, 0, 0 | F_NO8 | F_NOMATCH | F_FORCECONV, "[b-\\x{dc00}]", "a" },
757 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "\xed\xa0\x80\\x{d800}\xed\xb0\x80\\x{dc00}", "\xed\xa0\x80\xed\xa0\x80\xed\xb0\x80\xed\xb0\x80" },
758 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[\xed\xa0\x80\\x{d800}]{1,2}?[\xed\xb0\x80\\x{dc00}]{1,2}?#", "\xed\xa0\x80\xed\xa0\x80\xed\xb0\x80\xed\xb0\x80#" },
759 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80\xed\xb0\x80#]{0,3}(?<=\xed\xb0\x80.)", "\xed\xa0\x80#\xed\xa0\x80##\xed\xb0\x80\xed\xa0\x80" },
760 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80-\xed\xb3\xbf]", "\xed\x9f\xbf\xed\xa0\x83" },
761 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80-\xed\xb3\xbf]", "\xed\xb4\x80\xed\xb3\xb0" },
762 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[\\x{d800}-\\x{dcff}]", "\xed\x9f\xbf\xed\xa0\x83" },
763 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[\\x{d800}-\\x{dcff}]", "\xed\xb4\x80\xed\xb3\xb0" },
764 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80-\xef\xbf\xbf]+[\x1-\xed\xb0\x80]+#", "\xed\xa0\x85\xc3\x81\xed\xa0\x85\xef\xbf\xb0\xc2\x85\xed\xa9\x89#" },
765 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80][\xed\xb0\x80]{2,}", "\xed\xa0\x80\xed\xb0\x80\xed\xa0\x80\xed\xb0\x80\xed\xb0\x80\xed\xb0\x80" },
766 { M, A, 0, 0 | F_FORCECONV, "[^\xed\xb0\x80]{3,}?", "##\xed\xb0\x80#\xed\xb0\x80#\xc3\x89#\xed\xb0\x80" },
767 { M, A, 0, 0 | F_NO8 | F_FORCECONV, "[^\\x{dc00}]{3,}?", "##\xed\xb0\x80#\xed\xb0\x80#\xc3\x89#\xed\xb0\x80" },
768 { CM, A, 0, 0 | F_FORCECONV, ".\\B.", "\xed\xa0\x80\xed\xb0\x80" },
769 { CM, A, 0, 0 | F_FORCECONV, "\\D+(?:\\d+|.)\\S+(?:\\s+|.)\\W+(?:\\w+|.)\xed\xa0\x80\xed\xa0\x80", "\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80" },
770 { CM, A, 0, 0 | F_FORCECONV, "\\d*\\s*\\w*\xed\xa0\x80\xed\xa0\x80", "\xed\xa0\x80\xed\xa0\x80" },
771 { CM, A, 0, 0 | F_FORCECONV | F_NOMATCH, "\\d*?\\D*?\\s*?\\S*?\\w*?\\W*?##", "\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80#" },
772 { CM | PCRE2_EXTENDED, A, 0, 0 | F_FORCECONV, "\xed\xa0\x80 \xed\xb0\x80 !", "\xed\xa0\x80\xed\xb0\x80!" },
773 { CM, A, 0, 0 | F_FORCECONV, "\xed\xa0\x80+#[^#]+\xed\xa0\x80", "\xed\xa0\x80#a\xed\xa0\x80" },
774 { CM, A, 0, 0 | F_FORCECONV, "(\xed\xa0\x80+)#\\1", "\xed\xa0\x80\xed\xa0\x80#\xed\xa0\x80\xed\xa0\x80" },
775 { M, PCRE2_NEWLINE_ANY, 0, 0 | F_NO8 | F_FORCECONV, "^-", "a--\xe2\x80\xa8--" },
776 { 0, BSR(PCRE2_BSR_UNICODE), 0, 0 | F_NO8 | F_FORCECONV, "\\R", "ab\xe2\x80\xa8" },
777 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\v", "ab\xe2\x80\xa9" },
778 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\h", "ab\xe1\xa0\x8e" },
779 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\v+?\\V+?#", "\xe2\x80\xa9\xe2\x80\xa9\xef\xbf\xbf\xef\xbf\xbf#" },
780 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\h+?\\H+?#", "\xe1\xa0\x8e\xe1\xa0\x8e\xef\xbf\xbf\xef\xbf\xbf#" },
781
782 /* Partial matching. */
783 { MU, A, PCRE2_PARTIAL_SOFT, 0, "ab", "a" },
784 { MU, A, PCRE2_PARTIAL_SOFT, 0, "ab|a", "a" },
785 { MU, A, PCRE2_PARTIAL_HARD, 0, "ab|a", "a" },
786 { MU, A, PCRE2_PARTIAL_SOFT, 0, "\\b#", "a" },
787 { MU, A, PCRE2_PARTIAL_SOFT, 0, "(?<=a)b", "a" },
788 { MU, A, PCRE2_PARTIAL_SOFT, 0, "abc|(?<=xxa)bc", "xxab" },
789 { MU, A, PCRE2_PARTIAL_SOFT, 0, "a\\B", "a" },
790 { MU, A, PCRE2_PARTIAL_HARD, 0, "a\\b", "a" },
791
792 /* (*MARK) verb. */
793 { MU, A, 0, 0, "a(*MARK:aa)a", "ababaa" },
794 { MU, A, 0, 0 | F_NOMATCH, "a(*:aa)a", "abab" },
795 { MU, A, 0, 0, "a(*:aa)(b(*:bb)b|bc)", "abc" },
796 { MU, A, 0, 0 | F_NOMATCH, "a(*:1)x|b(*:2)y", "abc" },
797 { MU, A, 0, 0, "(?>a(*:aa))b|ac", "ac" },
798 { MU, A, 0, 0, "(?(DEFINE)(a(*:aa)))(?1)", "a" },
799 { MU, A, 0, 0 | F_NOMATCH, "(?(DEFINE)((a)(*:aa)))(?1)b", "aa" },
800 { MU, A, 0, 0, "(?(DEFINE)(a(*:aa)))a(?1)b|aac", "aac" },
801 { MU, A, 0, 0, "(a(*:aa)){0}(?:b(?1)b|c)+c", "babbab cc" },
802 { MU, A, 0, 0, "(a(*:aa)){0}(?:b(?1)b)+", "babba" },
803 { MU, A, 0, 0 | F_NOMATCH, "(a(*:aa)){0}(?:b(?1)b)+", "ba" },
804 { MU, A, 0, 0, "(a\\K(*:aa)){0}(?:b(?1)b|c)+c", "babbab cc" },
805 { MU, A, 0, 0, "(a\\K(*:aa)){0}(?:b(?1)b)+", "babba" },
806 { MU, A, 0, 0 | F_NOMATCH, "(a\\K(*:aa)){0}(?:b(?1)b)+", "ba" },
807 { MU, A, 0, 0 | F_NOMATCH, "(*:mark)m", "a" },
808
809 /* (*COMMIT) verb. */
810 { MU, A, 0, 0 | F_NOMATCH, "a(*COMMIT)b", "ac" },
811 { MU, A, 0, 0, "aa(*COMMIT)b", "xaxaab" },
812 { MU, A, 0, 0 | F_NOMATCH, "a(*COMMIT)(*:msg)b|ac", "ac" },
813 { MU, A, 0, 0 | F_NOMATCH, "(a(*COMMIT)b)++", "abac" },
814 { MU, A, 0, 0 | F_NOMATCH, "((a)(*COMMIT)b)++", "abac" },
815 { MU, A, 0, 0 | F_NOMATCH, "(?=a(*COMMIT)b)ab|ad", "ad" },
816
817 /* (*PRUNE) verb. */
818 { MU, A, 0, 0, "aa\\K(*PRUNE)b", "aaab" },
819 { MU, A, 0, 0, "aa(*PRUNE:bb)b|a", "aa" },
820 { MU, A, 0, 0, "(a)(a)(*PRUNE)b|(a)", "aa" },
821 { MU, A, 0, 0, "(a)(a)(a)(a)(a)(a)(a)(a)(*PRUNE)b|(a)", "aaaaaaaa" },
822 { MU, A, PCRE2_PARTIAL_SOFT, 0, "a(*PRUNE)a|", "a" },
823 { MU, A, PCRE2_PARTIAL_SOFT, 0, "a(*PRUNE)a|m", "a" },
824 { MU, A, 0, 0 | F_NOMATCH, "(?=a(*PRUNE)b)ab|ad", "ad" },
825 { MU, A, 0, 0, "a(*COMMIT)(*PRUNE)d|bc", "abc" },
826 { MU, A, 0, 0, "(?=a(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
827 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?=a(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
828 { MU, A, 0, 0, "(?=(a)(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
829 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?=(a)(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
830 { MU, A, 0, 0, "(a(*COMMIT)b){0}a(?1)(*PRUNE)c|bc", "abc" },
831 { MU, A, 0, 0 | F_NOMATCH, "(a(*COMMIT)b){0}a(*COMMIT)(?1)(*PRUNE)c|bc", "abc" },
832 { MU, A, 0, 0, "(a(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
833 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(a(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
834 { MU, A, 0, 0, "((a)(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
835 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)((a)(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
836 { MU, A, 0, 0, "(?>a(*COMMIT)b)*abab(*PRUNE)d|ba", "ababab" },
837 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)*abab(*PRUNE)d|ba", "ababab" },
838 { MU, A, 0, 0, "(?>a(*COMMIT)b)+abab(*PRUNE)d|ba", "ababab" },
839 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)+abab(*PRUNE)d|ba", "ababab" },
840 { MU, A, 0, 0, "(?>a(*COMMIT)b)?ab(*PRUNE)d|ba", "aba" },
841 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)?ab(*PRUNE)d|ba", "aba" },
842 { MU, A, 0, 0, "(?>a(*COMMIT)b)*?n(*PRUNE)d|ba", "abababn" },
843 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)*?n(*PRUNE)d|ba", "abababn" },
844 { MU, A, 0, 0, "(?>a(*COMMIT)b)+?n(*PRUNE)d|ba", "abababn" },
845 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)+?n(*PRUNE)d|ba", "abababn" },
846 { MU, A, 0, 0, "(?>a(*COMMIT)b)??n(*PRUNE)d|bn", "abn" },
847 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)??n(*PRUNE)d|bn", "abn" },
848
849 /* (*SKIP) verb. */
850 { MU, A, 0, 0 | F_NOMATCH, "(?=a(*SKIP)b)ab|ad", "ad" },
851 { MU, A, 0, 0, "(\\w+(*SKIP)#)", "abcd,xyz#," },
852 { MU, A, 0, 0, "\\w+(*SKIP)#|mm", "abcd,xyz#," },
853 { MU, A, 0, 0 | F_NOMATCH, "b+(?<=(*SKIP)#c)|b+", "#bbb" },
854
855 /* (*THEN) verb. */
856 { MU, A, 0, 0, "((?:a(*THEN)|aab)(*THEN)c|a+)+m", "aabcaabcaabcaabcnacm" },
857 { MU, A, 0, 0 | F_NOMATCH, "((?:a(*THEN)|aab)(*THEN)c|a+)+m", "aabcm" },
858 { MU, A, 0, 0, "((?:a(*THEN)|aab)c|a+)+m", "aabcaabcnmaabcaabcm" },
859 { MU, A, 0, 0, "((?:a|aab)(*THEN)c|a+)+m", "aam" },
860 { MU, A, 0, 0, "((?:a(*COMMIT)|aab)(*THEN)c|a+)+m", "aam" },
861 { MU, A, 0, 0, "(?(?=a(*THEN)b)ab|ad)", "ad" },
862 { MU, A, 0, 0, "(?(?!a(*THEN)b)ad|add)", "add" },
863 { MU, A, 0, 0 | F_NOMATCH, "(?(?=a)a(*THEN)b|ad)", "ad" },
864 { MU, A, 0, 0, "(?!(?(?=a)ab|b(*THEN)d))bn|bnn", "bnn" },
865 { MU, A, 0, 0, "(?=(*THEN: ))* ", " " },
866 { MU, A, 0, 0, "a(*THEN)(?R) |", "a" },
867
868 /* Recurse and control verbs. */
869 { MU, A, 0, 0, "(a(*ACCEPT)b){0}a(?1)b", "aacaabb" },
870 { MU, A, 0, 0, "((a)\\2(*ACCEPT)b){0}a(?1)b", "aaacaaabb" },
871 { MU, A, 0, 0, "((ab|a(*ACCEPT)x)+|ababababax){0}_(?1)_", "_ababababax_ _ababababa_" },
872 { MU, A, 0, 0, "((.)(?:A(*ACCEPT)|(?1)\\2)){0}_(?1)_", "_bcdaAdcb_bcdaAdcb_" },
873 { MU, A, 0, 0, "((*MARK:m)(?:a|a(*COMMIT)b|aa)){0}_(?1)_", "_ab_" },
874 { MU, A, 0, 0, "((*MARK:m)(?:a|a(*COMMIT)b|aa)){0}_(?1)_|(_aa_)", "_aa_" },
875 { MU, A, 0, 0, "(a(*COMMIT)(?:b|bb)|c(*ACCEPT)d|dd){0}_(?1)+_", "_ax_ _cd_ _abbb_ _abcd_ _abbcdd_" },
876 { MU, A, 0, 0, "((.)(?:.|(*COMMIT)\\2{3}(*ACCEPT).*|.*)){0}_(?1){0,4}_", "_aaaabbbbccccddd_ _aaaabbbbccccdddd_" },
877
878 #ifdef SUPPORT_UNICODE
879 /* Script runs and iterations. */
880 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)*#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
881 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)+#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
882 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)*?#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
883 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)+?#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
884 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)*+#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
885 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)++#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
886 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)?#", "!ab!abc!ab!ab#" },
887 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)??#", "!ab!abc!ab!ab#" },
888 #endif
889
890 /* Deep recursion. */
891 { MU, A, 0, 0, "((((?:(?:(?:\\w)+)?)*|(?>\\w)+?)+|(?>\\w)?\?)*)?\\s", "aaaaa+ " },
892 { MU, A, 0, 0, "(?:((?:(?:(?:\\w*?)+)??|(?>\\w)?|\\w*+)*)+)+?\\s", "aa+ " },
893 { MU, A, 0, 0, "((a?)+)+b", "aaaaaaaaaaaa b" },
894
895 /* Deep recursion: Stack limit reached. */
896 { M, A, 0, 0 | F_NOMATCH, "a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?aaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaa" },
897 { M, A, 0, 0 | F_NOMATCH, "(?:a+)+b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
898 { M, A, 0, 0 | F_NOMATCH, "(?:a+?)+?b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
899 { M, A, 0, 0 | F_NOMATCH, "(?:a*)*b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
900 { M, A, 0, 0 | F_NOMATCH, "(?:a*?)*?b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
901
902 { 0, 0, 0, 0, NULL, NULL }
903 };
904
905 #ifdef SUPPORT_PCRE2_8
callback8(void * arg)906 static pcre2_jit_stack_8* callback8(void *arg)
907 {
908 return (pcre2_jit_stack_8 *)arg;
909 }
910 #endif
911
912 #ifdef SUPPORT_PCRE2_16
callback16(void * arg)913 static pcre2_jit_stack_16* callback16(void *arg)
914 {
915 return (pcre2_jit_stack_16 *)arg;
916 }
917 #endif
918
919 #ifdef SUPPORT_PCRE2_32
callback32(void * arg)920 static pcre2_jit_stack_32* callback32(void *arg)
921 {
922 return (pcre2_jit_stack_32 *)arg;
923 }
924 #endif
925
926 #ifdef SUPPORT_PCRE2_8
927 static pcre2_jit_stack_8 *stack8;
928
getstack8(void)929 static pcre2_jit_stack_8 *getstack8(void)
930 {
931 if (!stack8)
932 stack8 = pcre2_jit_stack_create_8(1, 1024 * 1024, NULL);
933 return stack8;
934 }
935
/* With a match context: assigns the shared 8 bit JIT stack to it (through
   callback8). With NULL: frees the shared stack, if one exists, and resets
   the cached pointer. */
static void setstack8(pcre2_match_context_8 *mcontext)
{
	if (mcontext != NULL) {
		pcre2_jit_stack_assign_8(mcontext, callback8, getstack8());
		return;
	}

	/* No context given: release the shared stack and forget about it. */
	if (stack8 != NULL) {
		pcre2_jit_stack_free_8(stack8);
		stack8 = NULL;
	}
}
947 #endif /* SUPPORT_PCRE2_8 */
948
949 #ifdef SUPPORT_PCRE2_16
950 static pcre2_jit_stack_16 *stack16;
951
getstack16(void)952 static pcre2_jit_stack_16 *getstack16(void)
953 {
954 if (!stack16)
955 stack16 = pcre2_jit_stack_create_16(1, 1024 * 1024, NULL);
956 return stack16;
957 }
958
/* With a match context: assigns the shared 16 bit JIT stack to it (through
   callback16). With NULL: frees the shared stack, if one exists, and resets
   the cached pointer. */
static void setstack16(pcre2_match_context_16 *mcontext)
{
	if (mcontext != NULL) {
		pcre2_jit_stack_assign_16(mcontext, callback16, getstack16());
		return;
	}

	/* No context given: release the shared stack and forget about it. */
	if (stack16 != NULL) {
		pcre2_jit_stack_free_16(stack16);
		stack16 = NULL;
	}
}
970 #endif /* SUPPORT_PCRE2_16 */
971
972 #ifdef SUPPORT_PCRE2_32
973 static pcre2_jit_stack_32 *stack32;
974
getstack32(void)975 static pcre2_jit_stack_32 *getstack32(void)
976 {
977 if (!stack32)
978 stack32 = pcre2_jit_stack_create_32(1, 1024 * 1024, NULL);
979 return stack32;
980 }
981
/* With a match context: assigns the shared 32 bit JIT stack to it (through
   callback32). With NULL: frees the shared stack, if one exists, and resets
   the cached pointer. */
static void setstack32(pcre2_match_context_32 *mcontext)
{
	if (mcontext != NULL) {
		pcre2_jit_stack_assign_32(mcontext, callback32, getstack32());
		return;
	}

	/* No context given: release the shared stack and forget about it. */
	if (stack32 != NULL) {
		pcre2_jit_stack_free_32(stack32);
		stack32 = NULL;
	}
}
993 #endif /* SUPPORT_PCRE2_32 */
994
995 #ifdef SUPPORT_PCRE2_16
996
/* Converts a zero terminated UTF-8 string to a zero terminated UTF-16 string.

   input       zero terminated source bytes
   output      destination buffer
   offsetmap   may be NULL; otherwise it receives, for every converted
               character, the byte offset of that character in input, and
               finally the offset at which the conversion stopped. A
               character stored as a surrogate pair takes two slots, of which
               only the first is written.
   max_length  capacity of output in 16 bit units, terminator included

   Returns the number of 16 bit units stored, not counting the terminator.
   Nothing at all is written when max_length is 0.

   The decoder is intentionally lax: continuation bytes are not validated and
   encoded surrogates (0xed 0xa0..0xbf ..) are passed through as single units,
   which the 16 bit specific test cases depend on. It is, however, guaranteed
   to consume at least one byte per iteration and never to read beyond the
   terminating zero of input. */
static int convert_utf8_to_utf16(PCRE2_SPTR8 input, PCRE2_UCHAR16 *output, int *offsetmap, int max_length)
{
	PCRE2_SPTR8 iptr = input;
	PCRE2_UCHAR16 *optr = output;
	unsigned int c;
	int seq_len;
	int i;

	if (max_length == 0)
		return 0;

	while (*iptr && max_length > 1) {
		if (offsetmap)
			*offsetmap++ = (int)(iptr - input);

		/* Sequence length implied by the lead byte. Stray continuation bytes
		   (0x80-0xbf) and the invalid lead bytes 0xf8-0xff count as single
		   characters, so the input pointer always moves forward. */
		c = *iptr;
		if (c < 0xc0 || c >= 0xf8)
			seq_len = 1;
		else if (c < 0xe0)
			seq_len = 2;
		else if (c < 0xf0)
			seq_len = 3;
		else
			seq_len = 4;

		/* A sequence cut short by the terminating zero must neither be read
		   nor skipped past the end of the string: keep the lead byte only. */
		for (i = 1; i < seq_len; i++) {
			if (iptr[i] == 0) {
				seq_len = 1;
				break;
			}
		}

		switch (seq_len) {
		case 2:
			c = ((c & 0x1f) << 6) | (iptr[1] & 0x3f);
			break;
		case 3:
			c = ((c & 0x0f) << 12) | ((iptr[1] & 0x3f) << 6) | (iptr[2] & 0x3f);
			break;
		case 4:
			c = ((c & 0x07) << 18) | ((iptr[1] & 0x3f) << 12) | ((iptr[2] & 0x3f) << 6) | (iptr[3] & 0x3f);
			break;
		default:
			/* Single byte: c already holds its value. */
			break;
		}
		iptr += seq_len;

		if (c < 65536) {
			*optr++ = (PCRE2_UCHAR16)c;
			max_length--;
		} else if (max_length <= 2) {
			/* No room left for a surrogate pair plus the terminator. */
			*optr = '\0';
			return (int)(optr - output);
		} else {
			c -= 0x10000;
			*optr++ = (PCRE2_UCHAR16)(0xd800 | ((c >> 10) & 0x3ff));
			*optr++ = (PCRE2_UCHAR16)(0xdc00 | (c & 0x3ff));
			max_length -= 2;
			/* The low surrogate has no offset of its own: skip its slot. */
			if (offsetmap)
				offsetmap++;
		}
	}
	if (offsetmap)
		*offsetmap = (int)(iptr - input);
	*optr = '\0';
	return (int)(optr - output);
}
1044
/* Widens a zero terminated 8 bit string to 16 bit units one byte at a time,
   without any decoding. At most max_length - 1 units are copied and a
   terminating zero is appended. Returns the number of units copied; when
   max_length is 0 nothing is written and 0 is returned. */
static int copy_char8_to_char16(PCRE2_SPTR8 input, PCRE2_UCHAR16 *output, int max_length)
{
	int count = 0;

	if (max_length == 0)
		return 0;

	/* "count + 1 < max_length" keeps one unit free for the terminator. */
	while (input[count] != 0 && count + 1 < max_length) {
		output[count] = input[count];
		count++;
	}

	output[count] = 0;
	return count;
}
1060
1061 #define REGTEST_MAX_LENGTH16 4096
1062 static PCRE2_UCHAR16 regtest_buf16[REGTEST_MAX_LENGTH16];
1063 static int regtest_offsetmap16[REGTEST_MAX_LENGTH16];
1064
1065 #endif /* SUPPORT_PCRE2_16 */
1066
1067 #ifdef SUPPORT_PCRE2_32
1068
/* Converts a zero terminated UTF-8 string to a zero terminated UTF-32 string.

   input       zero terminated source bytes
   output      destination buffer
   offsetmap   may be NULL; otherwise it receives, for every converted
               character, the byte offset of that character in input, and
               finally the offset at which the conversion stopped
   max_length  capacity of output in 32 bit units, terminator included

   Returns the number of 32 bit units stored, not counting the terminator.
   Nothing at all is written when max_length is 0.

   The decoder is intentionally lax: continuation bytes are not validated and
   encoded surrogates are passed through unchanged. It is, however, guaranteed
   to consume at least one byte per iteration and never to read beyond the
   terminating zero of input. */
static int convert_utf8_to_utf32(PCRE2_SPTR8 input, PCRE2_UCHAR32 *output, int *offsetmap, int max_length)
{
	PCRE2_SPTR8 iptr = input;
	PCRE2_UCHAR32 *optr = output;
	unsigned int c;
	int seq_len;
	int i;

	if (max_length == 0)
		return 0;

	while (*iptr && max_length > 1) {
		if (offsetmap)
			*offsetmap++ = (int)(iptr - input);

		/* Sequence length implied by the lead byte. Stray continuation bytes
		   (0x80-0xbf) and the invalid lead bytes 0xf8-0xff count as single
		   characters, so the input pointer always moves forward. */
		c = *iptr;
		if (c < 0xc0 || c >= 0xf8)
			seq_len = 1;
		else if (c < 0xe0)
			seq_len = 2;
		else if (c < 0xf0)
			seq_len = 3;
		else
			seq_len = 4;

		/* A sequence cut short by the terminating zero must neither be read
		   nor skipped past the end of the string: keep the lead byte only. */
		for (i = 1; i < seq_len; i++) {
			if (iptr[i] == 0) {
				seq_len = 1;
				break;
			}
		}

		switch (seq_len) {
		case 2:
			c = ((c & 0x1f) << 6) | (iptr[1] & 0x3f);
			break;
		case 3:
			c = ((c & 0x0f) << 12) | ((iptr[1] & 0x3f) << 6) | (iptr[2] & 0x3f);
			break;
		case 4:
			c = ((c & 0x07) << 18) | ((iptr[1] & 0x3f) << 12) | ((iptr[2] & 0x3f) << 6) | (iptr[3] & 0x3f);
			break;
		default:
			/* Single byte: c already holds its value. */
			break;
		}
		iptr += seq_len;

		*optr++ = c;
		max_length--;
	}
	if (offsetmap)
		*offsetmap = (int)(iptr - input);
	*optr = 0;
	return (int)(optr - output);
}
1104
/* Widens a zero terminated 8 bit string to 32 bit units one byte at a time,
   without any decoding. At most max_length - 1 units are copied and a
   terminating zero is appended. Returns the number of units copied; when
   max_length is 0 nothing is written and 0 is returned. */
static int copy_char8_to_char32(PCRE2_SPTR8 input, PCRE2_UCHAR32 *output, int max_length)
{
	int count = 0;

	if (max_length == 0)
		return 0;

	/* "count + 1 < max_length" keeps one unit free for the terminator. */
	while (input[count] != 0 && count + 1 < max_length) {
		output[count] = input[count];
		count++;
	}

	output[count] = 0;
	return count;
}
1120
1121 #define REGTEST_MAX_LENGTH32 4096
1122 static PCRE2_UCHAR32 regtest_buf32[REGTEST_MAX_LENGTH32];
1123 static int regtest_offsetmap32[REGTEST_MAX_LENGTH32];
1124
1125 #endif /* SUPPORT_PCRE2_32 */
1126
/* Returns 1 when every byte of the zero terminated string is below 128 (an
   empty string qualifies), and 0 as soon as a byte with a larger value is
   found. */
static int check_ascii(const char *input)
{
	const unsigned char *byte;

	for (byte = (const unsigned char *)input; *byte != 0; byte++) {
		if (*byte >= 0x80)
			return 0;
	}
	return 1;
}
1137
1138 #define OVECTOR_SIZE 15
1139
regression_tests(void)1140 static int regression_tests(void)
1141 {
1142 struct regression_test_case *current = regression_test_cases;
1143 int error;
1144 PCRE2_SIZE err_offs;
1145 int is_successful;
1146 int is_ascii;
1147 int total = 0;
1148 int successful = 0;
1149 int successful_row = 0;
1150 int counter = 0;
1151 int jit_compile_mode;
1152 int utf = 0;
1153 int disabled_options = 0;
1154 int i;
1155 #ifdef SUPPORT_PCRE2_8
1156 pcre2_code_8 *re8;
1157 pcre2_compile_context_8 *ccontext8;
1158 pcre2_match_data_8 *mdata8_1;
1159 pcre2_match_data_8 *mdata8_2;
1160 pcre2_match_context_8 *mcontext8;
1161 PCRE2_SIZE *ovector8_1 = NULL;
1162 PCRE2_SIZE *ovector8_2 = NULL;
1163 int return_value8[2];
1164 #endif
1165 #ifdef SUPPORT_PCRE2_16
1166 pcre2_code_16 *re16;
1167 pcre2_compile_context_16 *ccontext16;
1168 pcre2_match_data_16 *mdata16_1;
1169 pcre2_match_data_16 *mdata16_2;
1170 pcre2_match_context_16 *mcontext16;
1171 PCRE2_SIZE *ovector16_1 = NULL;
1172 PCRE2_SIZE *ovector16_2 = NULL;
1173 int return_value16[2];
1174 int length16;
1175 #endif
1176 #ifdef SUPPORT_PCRE2_32
1177 pcre2_code_32 *re32;
1178 pcre2_compile_context_32 *ccontext32;
1179 pcre2_match_data_32 *mdata32_1;
1180 pcre2_match_data_32 *mdata32_2;
1181 pcre2_match_context_32 *mcontext32;
1182 PCRE2_SIZE *ovector32_1 = NULL;
1183 PCRE2_SIZE *ovector32_2 = NULL;
1184 int return_value32[2];
1185 int length32;
1186 #endif
1187
1188 #if defined SUPPORT_PCRE2_8
1189 PCRE2_UCHAR8 cpu_info[128];
1190 #elif defined SUPPORT_PCRE2_16
1191 PCRE2_UCHAR16 cpu_info[128];
1192 #elif defined SUPPORT_PCRE2_32
1193 PCRE2_UCHAR32 cpu_info[128];
1194 #endif
1195 #if defined SUPPORT_UNICODE && ((defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + defined(SUPPORT_PCRE2_32)) >= 2)
1196 int return_value;
1197 #endif
1198
1199 /* This test compares the behaviour of interpreter and JIT. Although disabling
1200 utf or ucp may make tests fail, if the pcre_exec result is the SAME, it is
1201 still considered successful from pcre_jit_test point of view. */
1202
1203 #if defined SUPPORT_PCRE2_8
1204 pcre2_config_8(PCRE2_CONFIG_JITTARGET, &cpu_info);
1205 #elif defined SUPPORT_PCRE2_16
1206 pcre2_config_16(PCRE2_CONFIG_JITTARGET, &cpu_info);
1207 #elif defined SUPPORT_PCRE2_32
1208 pcre2_config_32(PCRE2_CONFIG_JITTARGET, &cpu_info);
1209 #endif
1210
1211 printf("Running JIT regression tests\n");
1212 printf(" target CPU of SLJIT compiler: ");
1213 for (i = 0; cpu_info[i]; i++)
1214 printf("%c", (char)(cpu_info[i]));
1215 printf("\n");
1216
1217 #if defined SUPPORT_PCRE2_8
1218 pcre2_config_8(PCRE2_CONFIG_UNICODE, &utf);
1219 #elif defined SUPPORT_PCRE2_16
1220 pcre2_config_16(PCRE2_CONFIG_UNICODE, &utf);
1221 #elif defined SUPPORT_PCRE2_32
1222 pcre2_config_32(PCRE2_CONFIG_UNICODE, &utf);
1223 #endif
1224
1225 if (!utf)
1226 disabled_options |= PCRE2_UTF;
1227 #ifdef SUPPORT_PCRE2_8
1228 printf(" in 8 bit mode with UTF-8 %s:\n", utf ? "enabled" : "disabled");
1229 #endif
1230 #ifdef SUPPORT_PCRE2_16
1231 printf(" in 16 bit mode with UTF-16 %s:\n", utf ? "enabled" : "disabled");
1232 #endif
1233 #ifdef SUPPORT_PCRE2_32
1234 printf(" in 32 bit mode with UTF-32 %s:\n", utf ? "enabled" : "disabled");
1235 #endif
1236
1237 while (current->pattern) {
1238 /* printf("\nPattern: %s :\n", current->pattern); */
1239 total++;
1240 is_ascii = 0;
1241 if (!(current->start_offset & F_PROPERTY))
1242 is_ascii = check_ascii(current->pattern) && check_ascii(current->input);
1243
1244 if (current->match_options & PCRE2_PARTIAL_SOFT)
1245 jit_compile_mode = PCRE2_JIT_PARTIAL_SOFT;
1246 else if (current->match_options & PCRE2_PARTIAL_HARD)
1247 jit_compile_mode = PCRE2_JIT_PARTIAL_HARD;
1248 else
1249 jit_compile_mode = PCRE2_JIT_COMPLETE;
1250 error = 0;
1251 #ifdef SUPPORT_PCRE2_8
1252 re8 = NULL;
1253 ccontext8 = pcre2_compile_context_create_8(NULL);
1254 if (ccontext8) {
1255 if (GET_NEWLINE(current->newline))
1256 pcre2_set_newline_8(ccontext8, GET_NEWLINE(current->newline));
1257 if (GET_BSR(current->newline))
1258 pcre2_set_bsr_8(ccontext8, GET_BSR(current->newline));
1259
1260 if (!(current->start_offset & F_NO8)) {
1261 re8 = pcre2_compile_8((PCRE2_SPTR8)current->pattern, PCRE2_ZERO_TERMINATED,
1262 current->compile_options & ~disabled_options,
1263 &error, &err_offs, ccontext8);
1264
1265 if (!re8 && (utf || is_ascii))
1266 printf("\n8 bit: Cannot compile pattern \"%s\": %d\n", current->pattern, error);
1267 }
1268 pcre2_compile_context_free_8(ccontext8);
1269 }
1270 else
1271 printf("\n8 bit: Cannot allocate compile context\n");
1272 #endif
1273 #ifdef SUPPORT_PCRE2_16
1274 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1275 convert_utf8_to_utf16((PCRE2_SPTR8)current->pattern, regtest_buf16, NULL, REGTEST_MAX_LENGTH16);
1276 else
1277 copy_char8_to_char16((PCRE2_SPTR8)current->pattern, regtest_buf16, REGTEST_MAX_LENGTH16);
1278
1279 re16 = NULL;
1280 ccontext16 = pcre2_compile_context_create_16(NULL);
1281 if (ccontext16) {
1282 if (GET_NEWLINE(current->newline))
1283 pcre2_set_newline_16(ccontext16, GET_NEWLINE(current->newline));
1284 if (GET_BSR(current->newline))
1285 pcre2_set_bsr_16(ccontext16, GET_BSR(current->newline));
1286
1287 if (!(current->start_offset & F_NO16)) {
1288 re16 = pcre2_compile_16(regtest_buf16, PCRE2_ZERO_TERMINATED,
1289 current->compile_options & ~disabled_options,
1290 &error, &err_offs, ccontext16);
1291
1292 if (!re16 && (utf || is_ascii))
1293 printf("\n16 bit: Cannot compile pattern \"%s\": %d\n", current->pattern, error);
1294 }
1295 pcre2_compile_context_free_16(ccontext16);
1296 }
1297 else
1298 printf("\n16 bit: Cannot allocate compile context\n");
1299 #endif
1300 #ifdef SUPPORT_PCRE2_32
1301 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1302 convert_utf8_to_utf32((PCRE2_SPTR8)current->pattern, regtest_buf32, NULL, REGTEST_MAX_LENGTH32);
1303 else
1304 copy_char8_to_char32((PCRE2_SPTR8)current->pattern, regtest_buf32, REGTEST_MAX_LENGTH32);
1305
1306 re32 = NULL;
1307 ccontext32 = pcre2_compile_context_create_32(NULL);
1308 if (ccontext32) {
1309 if (GET_NEWLINE(current->newline))
1310 pcre2_set_newline_32(ccontext32, GET_NEWLINE(current->newline));
1311 if (GET_BSR(current->newline))
1312 pcre2_set_bsr_32(ccontext32, GET_BSR(current->newline));
1313
1314 if (!(current->start_offset & F_NO32)) {
1315 re32 = pcre2_compile_32(regtest_buf32, PCRE2_ZERO_TERMINATED,
1316 current->compile_options & ~disabled_options,
1317 &error, &err_offs, ccontext32);
1318
1319 if (!re32 && (utf || is_ascii))
1320 printf("\n32 bit: Cannot compile pattern \"%s\": %d\n", current->pattern, error);
1321 }
1322 pcre2_compile_context_free_32(ccontext32);
1323 }
1324 else
1325 printf("\n32 bit: Cannot allocate compile context\n");
1326 #endif
1327
1328 counter++;
1329 if ((counter & 0x3) != 0) {
1330 #ifdef SUPPORT_PCRE2_8
1331 setstack8(NULL);
1332 #endif
1333 #ifdef SUPPORT_PCRE2_16
1334 setstack16(NULL);
1335 #endif
1336 #ifdef SUPPORT_PCRE2_32
1337 setstack32(NULL);
1338 #endif
1339 }
1340
1341 #ifdef SUPPORT_PCRE2_8
1342 return_value8[0] = -1000;
1343 return_value8[1] = -1000;
1344 mdata8_1 = pcre2_match_data_create_8(OVECTOR_SIZE, NULL);
1345 mdata8_2 = pcre2_match_data_create_8(OVECTOR_SIZE, NULL);
1346 mcontext8 = pcre2_match_context_create_8(NULL);
1347 if (!mdata8_1 || !mdata8_2 || !mcontext8) {
1348 printf("\n8 bit: Cannot allocate match data\n");
1349 pcre2_match_data_free_8(mdata8_1);
1350 pcre2_match_data_free_8(mdata8_2);
1351 pcre2_match_context_free_8(mcontext8);
1352 pcre2_code_free_8(re8);
1353 re8 = NULL;
1354 } else {
1355 ovector8_1 = pcre2_get_ovector_pointer_8(mdata8_1);
1356 ovector8_2 = pcre2_get_ovector_pointer_8(mdata8_2);
1357 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1358 ovector8_1[i] = -2;
1359 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1360 ovector8_2[i] = -2;
1361 pcre2_set_match_limit_8(mcontext8, 10000000);
1362 }
1363 if (re8) {
1364 return_value8[1] = pcre2_match_8(re8, (PCRE2_SPTR8)current->input, strlen(current->input),
1365 current->start_offset & OFFSET_MASK, current->match_options, mdata8_2, mcontext8);
1366
1367 if (pcre2_jit_compile_8(re8, jit_compile_mode)) {
1368 printf("\n8 bit: JIT compiler does not support \"%s\"\n", current->pattern);
1369 } else if ((counter & 0x1) != 0) {
1370 setstack8(mcontext8);
1371 return_value8[0] = pcre2_match_8(re8, (PCRE2_SPTR8)current->input, strlen(current->input),
1372 current->start_offset & OFFSET_MASK, current->match_options, mdata8_1, mcontext8);
1373 } else {
1374 pcre2_jit_stack_assign_8(mcontext8, NULL, getstack8());
1375 return_value8[0] = pcre2_jit_match_8(re8, (PCRE2_SPTR8)current->input, strlen(current->input),
1376 current->start_offset & OFFSET_MASK, current->match_options, mdata8_1, mcontext8);
1377 }
1378 }
1379 #endif
1380
1381 #ifdef SUPPORT_PCRE2_16
1382 return_value16[0] = -1000;
1383 return_value16[1] = -1000;
1384 mdata16_1 = pcre2_match_data_create_16(OVECTOR_SIZE, NULL);
1385 mdata16_2 = pcre2_match_data_create_16(OVECTOR_SIZE, NULL);
1386 mcontext16 = pcre2_match_context_create_16(NULL);
1387 if (!mdata16_1 || !mdata16_2 || !mcontext16) {
1388 printf("\n16 bit: Cannot allocate match data\n");
1389 pcre2_match_data_free_16(mdata16_1);
1390 pcre2_match_data_free_16(mdata16_2);
1391 pcre2_match_context_free_16(mcontext16);
1392 pcre2_code_free_16(re16);
1393 re16 = NULL;
1394 } else {
1395 ovector16_1 = pcre2_get_ovector_pointer_16(mdata16_1);
1396 ovector16_2 = pcre2_get_ovector_pointer_16(mdata16_2);
1397 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1398 ovector16_1[i] = -2;
1399 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1400 ovector16_2[i] = -2;
1401 pcre2_set_match_limit_16(mcontext16, 10000000);
1402 }
1403 if (re16) {
1404 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1405 length16 = convert_utf8_to_utf16((PCRE2_SPTR8)current->input, regtest_buf16, regtest_offsetmap16, REGTEST_MAX_LENGTH16);
1406 else
1407 length16 = copy_char8_to_char16((PCRE2_SPTR8)current->input, regtest_buf16, REGTEST_MAX_LENGTH16);
1408
1409 return_value16[1] = pcre2_match_16(re16, regtest_buf16, length16,
1410 current->start_offset & OFFSET_MASK, current->match_options, mdata16_2, mcontext16);
1411
1412 if (pcre2_jit_compile_16(re16, jit_compile_mode)) {
1413 printf("\n16 bit: JIT compiler does not support \"%s\"\n", current->pattern);
1414 } else if ((counter & 0x1) != 0) {
1415 setstack16(mcontext16);
1416 return_value16[0] = pcre2_match_16(re16, regtest_buf16, length16,
1417 current->start_offset & OFFSET_MASK, current->match_options, mdata16_1, mcontext16);
1418 } else {
1419 pcre2_jit_stack_assign_16(mcontext16, NULL, getstack16());
1420 return_value16[0] = pcre2_jit_match_16(re16, regtest_buf16, length16,
1421 current->start_offset & OFFSET_MASK, current->match_options, mdata16_1, mcontext16);
1422 }
1423 }
1424 #endif
1425
1426 #ifdef SUPPORT_PCRE2_32
1427 return_value32[0] = -1000;
1428 return_value32[1] = -1000;
1429 mdata32_1 = pcre2_match_data_create_32(OVECTOR_SIZE, NULL);
1430 mdata32_2 = pcre2_match_data_create_32(OVECTOR_SIZE, NULL);
1431 mcontext32 = pcre2_match_context_create_32(NULL);
1432 if (!mdata32_1 || !mdata32_2 || !mcontext32) {
1433 printf("\n32 bit: Cannot allocate match data\n");
1434 pcre2_match_data_free_32(mdata32_1);
1435 pcre2_match_data_free_32(mdata32_2);
1436 pcre2_match_context_free_32(mcontext32);
1437 pcre2_code_free_32(re32);
1438 re32 = NULL;
1439 } else {
1440 ovector32_1 = pcre2_get_ovector_pointer_32(mdata32_1);
1441 ovector32_2 = pcre2_get_ovector_pointer_32(mdata32_2);
1442 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1443 ovector32_1[i] = -2;
1444 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1445 ovector32_2[i] = -2;
1446 pcre2_set_match_limit_32(mcontext32, 10000000);
1447 }
1448 if (re32) {
1449 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1450 length32 = convert_utf8_to_utf32((PCRE2_SPTR8)current->input, regtest_buf32, regtest_offsetmap32, REGTEST_MAX_LENGTH32);
1451 else
1452 length32 = copy_char8_to_char32((PCRE2_SPTR8)current->input, regtest_buf32, REGTEST_MAX_LENGTH32);
1453
1454 return_value32[1] = pcre2_match_32(re32, regtest_buf32, length32,
1455 current->start_offset & OFFSET_MASK, current->match_options, mdata32_2, mcontext32);
1456
1457 if (pcre2_jit_compile_32(re32, jit_compile_mode)) {
1458 printf("\n32 bit: JIT compiler does not support \"%s\"\n", current->pattern);
1459 } else if ((counter & 0x1) != 0) {
1460 setstack32(mcontext32);
1461 return_value32[0] = pcre2_match_32(re32, regtest_buf32, length32,
1462 current->start_offset & OFFSET_MASK, current->match_options, mdata32_1, mcontext32);
1463 } else {
1464 pcre2_jit_stack_assign_32(mcontext32, NULL, getstack32());
1465 return_value32[0] = pcre2_jit_match_32(re32, regtest_buf32, length32,
1466 current->start_offset & OFFSET_MASK, current->match_options, mdata32_1, mcontext32);
1467 }
1468 }
1469 #endif
1470
1471 /* printf("[%d-%d-%d|%d-%d|%d-%d|%d-%d]%s",
1472 return_value8[0], return_value16[0], return_value32[0],
1473 (int)ovector8_1[0], (int)ovector8_1[1],
1474 (int)ovector16_1[0], (int)ovector16_1[1],
1475 (int)ovector32_1[0], (int)ovector32_1[1],
1476 (current->compile_options & PCRE2_CASELESS) ? "C" : ""); */
1477
1478 /* If F_DIFF is set, just run the test, but do not compare the results.
1479 Segfaults can still be captured. */
1480
1481 is_successful = 1;
1482 if (!(current->start_offset & F_DIFF)) {
1483 #if defined SUPPORT_UNICODE && ((defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + defined(SUPPORT_PCRE2_32)) >= 2)
1484 if (!(current->start_offset & F_FORCECONV)) {
1485
1486 /* All results must be the same. */
1487 #ifdef SUPPORT_PCRE2_8
1488 if ((return_value = return_value8[0]) != return_value8[1]) {
1489 printf("\n8 bit: Return value differs(J8:%d,I8:%d): [%d] '%s' @ '%s'\n",
1490 return_value8[0], return_value8[1], total, current->pattern, current->input);
1491 is_successful = 0;
1492 } else
1493 #endif
1494 #ifdef SUPPORT_PCRE2_16
1495 if ((return_value = return_value16[0]) != return_value16[1]) {
1496 printf("\n16 bit: Return value differs(J16:%d,I16:%d): [%d] '%s' @ '%s'\n",
1497 return_value16[0], return_value16[1], total, current->pattern, current->input);
1498 is_successful = 0;
1499 } else
1500 #endif
1501 #ifdef SUPPORT_PCRE2_32
1502 if ((return_value = return_value32[0]) != return_value32[1]) {
1503 printf("\n32 bit: Return value differs(J32:%d,I32:%d): [%d] '%s' @ '%s'\n",
1504 return_value32[0], return_value32[1], total, current->pattern, current->input);
1505 is_successful = 0;
1506 } else
1507 #endif
1508 #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_16
1509 if (return_value8[0] != return_value16[0]) {
1510 printf("\n8 and 16 bit: Return value differs(J8:%d,J16:%d): [%d] '%s' @ '%s'\n",
1511 return_value8[0], return_value16[0],
1512 total, current->pattern, current->input);
1513 is_successful = 0;
1514 } else
1515 #endif
1516 #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_32
1517 if (return_value8[0] != return_value32[0]) {
1518 printf("\n8 and 32 bit: Return value differs(J8:%d,J32:%d): [%d] '%s' @ '%s'\n",
1519 return_value8[0], return_value32[0],
1520 total, current->pattern, current->input);
1521 is_successful = 0;
1522 } else
1523 #endif
1524 #if defined SUPPORT_PCRE2_16 && defined SUPPORT_PCRE2_32
1525 if (return_value16[0] != return_value32[0]) {
1526 printf("\n16 and 32 bit: Return value differs(J16:%d,J32:%d): [%d] '%s' @ '%s'\n",
1527 return_value16[0], return_value32[0],
1528 total, current->pattern, current->input);
1529 is_successful = 0;
1530 } else
1531 #endif
1532 if (return_value >= 0 || return_value == PCRE2_ERROR_PARTIAL) {
1533 if (return_value == PCRE2_ERROR_PARTIAL) {
1534 return_value = 2;
1535 } else {
1536 return_value *= 2;
1537 }
1538 #ifdef SUPPORT_PCRE2_8
1539 return_value8[0] = return_value;
1540 #endif
1541 #ifdef SUPPORT_PCRE2_16
1542 return_value16[0] = return_value;
1543 #endif
1544 #ifdef SUPPORT_PCRE2_32
1545 return_value32[0] = return_value;
1546 #endif
1547 /* Transform back the results. */
1548 if (current->compile_options & PCRE2_UTF) {
1549 #ifdef SUPPORT_PCRE2_16
1550 for (i = 0; i < return_value; ++i) {
1551 if (ovector16_1[i] != PCRE2_UNSET)
1552 ovector16_1[i] = regtest_offsetmap16[ovector16_1[i]];
1553 if (ovector16_2[i] != PCRE2_UNSET)
1554 ovector16_2[i] = regtest_offsetmap16[ovector16_2[i]];
1555 }
1556 #endif
1557 #ifdef SUPPORT_PCRE2_32
1558 for (i = 0; i < return_value; ++i) {
1559 if (ovector32_1[i] != PCRE2_UNSET)
1560 ovector32_1[i] = regtest_offsetmap32[ovector32_1[i]];
1561 if (ovector32_2[i] != PCRE2_UNSET)
1562 ovector32_2[i] = regtest_offsetmap32[ovector32_2[i]];
1563 }
1564 #endif
1565 }
1566
1567 for (i = 0; i < return_value; ++i) {
1568 #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_16
1569 if (ovector8_1[i] != ovector8_2[i] || ovector8_1[i] != ovector16_1[i] || ovector8_1[i] != ovector16_2[i]) {
1570 printf("\n8 and 16 bit: Ovector[%d] value differs(J8:%d,I8:%d,J16:%d,I16:%d): [%d] '%s' @ '%s' \n",
1571 i, (int)ovector8_1[i], (int)ovector8_2[i], (int)ovector16_1[i], (int)ovector16_2[i],
1572 total, current->pattern, current->input);
1573 is_successful = 0;
1574 }
1575 #endif
1576 #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_32
1577 if (ovector8_1[i] != ovector8_2[i] || ovector8_1[i] != ovector32_1[i] || ovector8_1[i] != ovector32_2[i]) {
1578 printf("\n8 and 32 bit: Ovector[%d] value differs(J8:%d,I8:%d,J32:%d,I32:%d): [%d] '%s' @ '%s' \n",
1579 i, (int)ovector8_1[i], (int)ovector8_2[i], (int)ovector32_1[i], (int)ovector32_2[i],
1580 total, current->pattern, current->input);
1581 is_successful = 0;
1582 }
1583 #endif
1584 #if defined SUPPORT_PCRE2_16 && defined SUPPORT_PCRE2_32
1585 if (ovector16_1[i] != ovector16_2[i] || ovector16_1[i] != ovector32_1[i] || ovector16_1[i] != ovector32_2[i]) {
1586 printf("\n16 and 32 bit: Ovector[%d] value differs(J16:%d,I16:%d,J32:%d,I32:%d): [%d] '%s' @ '%s' \n",
1587 i, (int)ovector16_1[i], (int)ovector16_2[i], (int)ovector32_1[i], (int)ovector32_2[i],
1588 total, current->pattern, current->input);
1589 is_successful = 0;
1590 }
1591 #endif
1592 }
1593 }
1594 } else
1595 #endif /* more than one of SUPPORT_PCRE2_8, SUPPORT_PCRE2_16 and SUPPORT_PCRE2_32 */
1596 {
1597 #ifdef SUPPORT_PCRE2_8
1598 if (return_value8[0] != return_value8[1]) {
1599 printf("\n8 bit: Return value differs(%d:%d): [%d] '%s' @ '%s'\n",
1600 return_value8[0], return_value8[1], total, current->pattern, current->input);
1601 is_successful = 0;
1602 } else if (return_value8[0] >= 0 || return_value8[0] == PCRE2_ERROR_PARTIAL) {
1603 if (return_value8[0] == PCRE2_ERROR_PARTIAL)
1604 return_value8[0] = 2;
1605 else
1606 return_value8[0] *= 2;
1607
1608 for (i = 0; i < return_value8[0]; ++i)
1609 if (ovector8_1[i] != ovector8_2[i]) {
1610 printf("\n8 bit: Ovector[%d] value differs(%d:%d): [%d] '%s' @ '%s'\n",
1611 i, (int)ovector8_1[i], (int)ovector8_2[i], total, current->pattern, current->input);
1612 is_successful = 0;
1613 }
1614 }
1615 #endif
1616
1617 #ifdef SUPPORT_PCRE2_16
1618 if (return_value16[0] != return_value16[1]) {
1619 printf("\n16 bit: Return value differs(%d:%d): [%d] '%s' @ '%s'\n",
1620 return_value16[0], return_value16[1], total, current->pattern, current->input);
1621 is_successful = 0;
1622 } else if (return_value16[0] >= 0 || return_value16[0] == PCRE2_ERROR_PARTIAL) {
1623 if (return_value16[0] == PCRE2_ERROR_PARTIAL)
1624 return_value16[0] = 2;
1625 else
1626 return_value16[0] *= 2;
1627
1628 for (i = 0; i < return_value16[0]; ++i)
1629 if (ovector16_1[i] != ovector16_2[i]) {
1630 printf("\n16 bit: Ovector[%d] value differs(%d:%d): [%d] '%s' @ '%s'\n",
1631 i, (int)ovector16_1[i], (int)ovector16_2[i], total, current->pattern, current->input);
1632 is_successful = 0;
1633 }
1634 }
1635 #endif
1636
1637 #ifdef SUPPORT_PCRE2_32
1638 if (return_value32[0] != return_value32[1]) {
1639 printf("\n32 bit: Return value differs(%d:%d): [%d] '%s' @ '%s'\n",
1640 return_value32[0], return_value32[1], total, current->pattern, current->input);
1641 is_successful = 0;
1642 } else if (return_value32[0] >= 0 || return_value32[0] == PCRE2_ERROR_PARTIAL) {
1643 if (return_value32[0] == PCRE2_ERROR_PARTIAL)
1644 return_value32[0] = 2;
1645 else
1646 return_value32[0] *= 2;
1647
1648 for (i = 0; i < return_value32[0]; ++i)
1649 if (ovector32_1[i] != ovector32_2[i]) {
1650 printf("\n32 bit: Ovector[%d] value differs(%d:%d): [%d] '%s' @ '%s'\n",
1651 i, (int)ovector32_1[i], (int)ovector32_2[i], total, current->pattern, current->input);
1652 is_successful = 0;
1653 }
1654 }
1655 #endif
1656 }
1657 }
1658
1659 if (is_successful) {
1660 #ifdef SUPPORT_PCRE2_8
1661 if (!(current->start_offset & F_NO8) && (utf || is_ascii)) {
1662 if (return_value8[0] < 0 && !(current->start_offset & F_NOMATCH)) {
1663 printf("8 bit: Test should match: [%d] '%s' @ '%s'\n",
1664 total, current->pattern, current->input);
1665 is_successful = 0;
1666 }
1667
1668 if (return_value8[0] >= 0 && (current->start_offset & F_NOMATCH)) {
1669 printf("8 bit: Test should not match: [%d] '%s' @ '%s'\n",
1670 total, current->pattern, current->input);
1671 is_successful = 0;
1672 }
1673 }
1674 #endif
1675 #ifdef SUPPORT_PCRE2_16
1676 if (!(current->start_offset & F_NO16) && (utf || is_ascii)) {
1677 if (return_value16[0] < 0 && !(current->start_offset & F_NOMATCH)) {
1678 printf("16 bit: Test should match: [%d] '%s' @ '%s'\n",
1679 total, current->pattern, current->input);
1680 is_successful = 0;
1681 }
1682
1683 if (return_value16[0] >= 0 && (current->start_offset & F_NOMATCH)) {
1684 printf("16 bit: Test should not match: [%d] '%s' @ '%s'\n",
1685 total, current->pattern, current->input);
1686 is_successful = 0;
1687 }
1688 }
1689 #endif
1690 #ifdef SUPPORT_PCRE2_32
1691 if (!(current->start_offset & F_NO32) && (utf || is_ascii)) {
1692 if (return_value32[0] < 0 && !(current->start_offset & F_NOMATCH)) {
1693 printf("32 bit: Test should match: [%d] '%s' @ '%s'\n",
1694 total, current->pattern, current->input);
1695 is_successful = 0;
1696 }
1697
1698 if (return_value32[0] >= 0 && (current->start_offset & F_NOMATCH)) {
1699 printf("32 bit: Test should not match: [%d] '%s' @ '%s'\n",
1700 total, current->pattern, current->input);
1701 is_successful = 0;
1702 }
1703 }
1704 #endif
1705 }
1706
1707 if (is_successful) {
1708 #ifdef SUPPORT_PCRE2_8
1709 if (re8 && !(current->start_offset & F_NO8) && pcre2_get_mark_8(mdata8_1) != pcre2_get_mark_8(mdata8_2)) {
1710 printf("8 bit: Mark value mismatch: [%d] '%s' @ '%s'\n",
1711 total, current->pattern, current->input);
1712 is_successful = 0;
1713 }
1714 #endif
1715 #ifdef SUPPORT_PCRE2_16
1716 if (re16 && !(current->start_offset & F_NO16) && pcre2_get_mark_16(mdata16_1) != pcre2_get_mark_16(mdata16_2)) {
1717 printf("16 bit: Mark value mismatch: [%d] '%s' @ '%s'\n",
1718 total, current->pattern, current->input);
1719 is_successful = 0;
1720 }
1721 #endif
1722 #ifdef SUPPORT_PCRE2_32
1723 if (re32 && !(current->start_offset & F_NO32) && pcre2_get_mark_32(mdata32_1) != pcre2_get_mark_32(mdata32_2)) {
1724 printf("32 bit: Mark value mismatch: [%d] '%s' @ '%s'\n",
1725 total, current->pattern, current->input);
1726 is_successful = 0;
1727 }
1728 #endif
1729 }
1730
1731 #ifdef SUPPORT_PCRE2_8
1732 pcre2_code_free_8(re8);
1733 pcre2_match_data_free_8(mdata8_1);
1734 pcre2_match_data_free_8(mdata8_2);
1735 pcre2_match_context_free_8(mcontext8);
1736 #endif
1737 #ifdef SUPPORT_PCRE2_16
1738 pcre2_code_free_16(re16);
1739 pcre2_match_data_free_16(mdata16_1);
1740 pcre2_match_data_free_16(mdata16_2);
1741 pcre2_match_context_free_16(mcontext16);
1742 #endif
1743 #ifdef SUPPORT_PCRE2_32
1744 pcre2_code_free_32(re32);
1745 pcre2_match_data_free_32(mdata32_1);
1746 pcre2_match_data_free_32(mdata32_2);
1747 pcre2_match_context_free_32(mcontext32);
1748 #endif
1749
1750 if (is_successful) {
1751 successful++;
1752 successful_row++;
1753 printf(".");
1754 if (successful_row >= 60) {
1755 successful_row = 0;
1756 printf("\n");
1757 }
1758 } else
1759 successful_row = 0;
1760
1761 fflush(stdout);
1762 current++;
1763 }
1764 #ifdef SUPPORT_PCRE2_8
1765 setstack8(NULL);
1766 #endif
1767 #ifdef SUPPORT_PCRE2_16
1768 setstack16(NULL);
1769 #endif
1770 #ifdef SUPPORT_PCRE2_32
1771 setstack32(NULL);
1772 #endif
1773
1774 if (total == successful) {
1775 printf("\nAll JIT regression tests are successfully passed.\n");
1776 return 0;
1777 } else {
1778 printf("\nSuccessful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
1779 return 1;
1780 }
1781 }
1782
1783 #if defined SUPPORT_UNICODE
1784
/* Compares the outcome of one match attempt with the expected outcome.

   pattern_index  number printed in diagnostics to identify the test case
   type           label printed in diagnostics
   result         value returned by the match function
   match_start    expected ovector[0]; when negative, the attempt is
                  expected to return -1 and the ovector is not inspected
   match_end      expected ovector[1]
   ovector        offsets written by the match attempt

   Returns 0 when the outcome is the expected one. Otherwise a diagnostic
   is printed to stdout and 1 is returned. */

static int check_invalid_utf_result(int pattern_index, const char *type, int result,
	int match_start, int match_end, PCRE2_SIZE *ovector)
{
	PCRE2_SIZE expected_start, expected_end;

	/* Cases with a negative match_start must not match at all. */
	if (match_start < 0) {
		if (result == -1)
			return 0;

		printf("Pattern[%d] %s result is not -1.\n", pattern_index, type);
		return 1;
	}

	/* From here on a successful match is required. */
	if (!(result > 0)) {
		printf("Pattern[%d] %s result (%d) is not greater than 0.\n", pattern_index, type, result);
		return 1;
	}

	expected_start = (PCRE2_SIZE)match_start;
	expected_end = (PCRE2_SIZE)match_end;

	/* The start offset is verified before the end offset, so only the first
	   difference is reported. */
	if (expected_start != ovector[0]) {
		printf("Pattern[%d] %s ovector[0] is unexpected (%d instead of %d)\n",
			pattern_index, type, (int)ovector[0], match_start);
		return 1;
	}

	if (expected_end != ovector[1]) {
		printf("Pattern[%d] %s ovector[1] is unexpected (%d instead of %d)\n",
			pattern_index, type, (int)ovector[1], match_end);
		return 1;
	}

	return 0;
}
1815
1816 #endif /* SUPPORT_UNICODE */
1817
1818 #if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_8
1819
/* Shorthand option sets that keep the rows of the invalid UTF-8 test table
   short. They are removed with #undef right after the table.

   UDA  compile options: PCRE2_UTF, PCRE2_DOTALL and PCRE2_ANCHORED
   CI   JIT compile options: PCRE2_JIT_COMPLETE and PCRE2_JIT_INVALID_UTF
   CPI  the CI options plus PCRE2_JIT_PARTIAL_SOFT */

#define UDA (PCRE2_ANCHORED | PCRE2_DOTALL | PCRE2_UTF)
#define CI (PCRE2_JIT_INVALID_UTF | PCRE2_JIT_COMPLETE)
#define CPI (CI | PCRE2_JIT_PARTIAL_SOFT)
1823
/* One row of the invalid UTF-8 regression test table. The member order is
   significant because the table uses positional initializers. */

struct invalid_utf8_regression_test_case {
	/* Options passed to pcre2_compile_8() and to pcre2_jit_compile_8(). */
	int compile_options, jit_compile_options;

	/* Start offset of the match, relative to the beginning of the
	   untrimmed input (skip_left is subtracted before matching). */
	int start_offset;

	/* Number of bytes trimmed from the front and from the back of the
	   input before it is passed to the matcher. */
	int skip_left, skip_right;

	/* Expected ovector[0] and ovector[1]. A negative match_start means
	   that the match attempt is expected to return -1. */
	int match_start, match_end;

	/* Up to two patterns that are run against the same input; a NULL
	   entry is not run. */
	const char *pattern[2];

	/* Zero terminated subject string. */
	const char *input;
};
1835
/* Only the address of this object is used: it is stored as pattern[1] of
   the "^.a" row of the invalid UTF-8 test table instead of a second
   pattern. NOTE(review): presumably the test driver recognizes this address
   as a request for special newline handling - confirm in
   invalid_utf8_regression_tests(). */

static const char invalid_utf8_newline_cr = '\0';
1837
1838 static const struct invalid_utf8_regression_test_case invalid_utf8_regression_test_cases[] = {
1839 { UDA, CI, 0, 0, 0, 0, 4, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
1840 { UDA, CI, 0, 0, 0, 0, 4, { ".", NULL }, "\xf0\x90\x80\x80" },
1841 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf4\x90\x80\x80" },
1842 { UDA, CI, 0, 0, 1, -1, -1, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
1843 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x90\x80\x7f" },
1844 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x90\x80\xc0" },
1845 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x8f\xbf\xbf" },
1846 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xef\xbf\xbf#" },
1847 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xef\xbf\xbf" },
1848 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xe0\xa0\x80#" },
1849 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xe0\xa0\x80" },
1850 { UDA, CI, 0, 0, 2, -1, -1, { ".", NULL }, "\xef\xbf\xbf#" },
1851 { UDA, CI, 0, 0, 1, -1, -1, { ".", NULL }, "\xef\xbf\xbf" },
1852 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xef\xbf\x7f#" },
1853 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xef\xbf\xc0" },
1854 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x9f\xbf#" },
1855 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x9f\xbf" },
1856 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xed\x9f\xbf#" },
1857 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xed\xa0\x80#" },
1858 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xee\x80\x80#" },
1859 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xed\xbf\xbf#" },
1860 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf##" },
1861 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf#" },
1862 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf" },
1863 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80##" },
1864 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80#" },
1865 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80" },
1866 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x80##" },
1867 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xdf\xc0##" },
1868 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x80" },
1869 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xdf\xc0" },
1870 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xc1\xbf##" },
1871 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xc1\xbf" },
1872 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\x80###" },
1873 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\x80" },
1874 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf8###" },
1875 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf8" },
1876 { UDA, CI, 0, 0, 0, 0, 1, { ".", NULL }, "\x7f" },
1877
1878 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf4\x8f\xbf\xbf#" },
1879 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\xa0\x80\x80\xf4\xa0\x80\x80" },
1880 { UDA, CPI, 4, 1, 1, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf" },
1881 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "#\xef\xbf\xbf#" },
1882 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "#\xe0\xa0\x80#" },
1883 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf0\x90\x80\x80#" },
1884 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf3\xbf\xbf\xbf#" },
1885 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf0\x8f\xbf\xbf\xf0\x8f\xbf\xbf" },
1886 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf5\x80\x80\x80\xf5\x80\x80\x80" },
1887 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x90\x80\x80\xf4\x90\x80\x80" },
1888 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xff\xf4\x8f\xbf\xff" },
1889 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xff\xbf\xf4\x8f\xff\xbf" },
1890 { UDA, CPI, 4, 0, 1, -1, -1, { "\\B", "\\b" }, "\xef\x80\x80\x80\xef\x80\x80" },
1891 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80\x80\x80\x80\x80\x80\x80\x80" },
1892 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xe0\x9f\xbf\xe0\x9f\xbf#" },
1893 { UDA, CPI, 4, 2, 2, -1, -1, { "\\B", "\\b" }, "#\xe0\xa0\x80\xe0\xa0\x80#" },
1894 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xf0\x80\x80\xf0\x80\x80#" },
1895 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xed\xa0\x80\xed\xa0\x80#" },
1896 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "##\xdf\xbf#" },
1897 { UDA, CPI, 4, 2, 0, 2, 2, { "\\B", NULL }, "##\xdf\xbf#" },
1898 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "##\xc2\x80#" },
1899 { UDA, CPI, 4, 2, 0, 2, 2, { "\\B", NULL }, "##\xc2\x80#" },
1900 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xc1\xbf\xc1\xbf##" },
1901 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xdf\xc0\xdf\xc0##" },
1902 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xe0\x80\xe0\x80##" },
1903
1904 { UDA, CPI, 3, 0, 0, 3, 3, { "\\B", NULL }, "\xef\xbf\xbf#" },
1905 { UDA, CPI, 3, 0, 0, 3, 3, { "\\B", NULL }, "\xe0\xa0\x80#" },
1906 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xe0\x9f\xbf\xe0\x9f\xbf" },
1907 { UDA, CPI, 3, 1, 1, -1, -1, { "\\B", "\\b" }, "\xef\xbf\xbf\xef\xbf\xbf" },
1908 { UDA, CPI, 3, 0, 1, -1, -1, { "\\B", "\\b" }, "\xdf\x80\x80\xdf\x80" },
1909 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\xbf\xff\xef\xbf\xff" },
1910 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\xff\xbf\xef\xff\xbf" },
1911 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xed\xbf\xbf\xed\xbf\xbf" },
1912
1913 { UDA, CPI, 2, 0, 0, 2, 2, { "\\B", NULL }, "\xdf\xbf#" },
1914 { UDA, CPI, 2, 0, 0, 2, 2, { "\\B", NULL }, "\xc2\x80#" },
1915 { UDA, CPI, 2, 1, 1, -1, -1, { "\\B", "\\b" }, "\xdf\xbf\xdf\xbf" },
1916 { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xc1\xbf\xc1\xbf" },
1917 { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xe0\x80\xe0\x80" },
1918 { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xdf\xff\xdf\xff" },
1919 { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xff\xbf\xff\xbf" },
1920
1921 { UDA, CPI, 1, 0, 0, 1, 1, { "\\B", NULL }, "\x7f#" },
1922 { UDA, CPI, 1, 0, 0, 1, 1, { "\\B", NULL }, "\x01#" },
1923 { UDA, CPI, 1, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80\x80" },
1924 { UDA, CPI, 1, 0, 0, -1, -1, { "\\B", "\\b" }, "\xb0\xb0" },
1925
1926 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { "(.)\\1", NULL }, "aA" },
1927 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, -1, { "(.)\\1", NULL }, "a\xff" },
1928 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
1929 { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
1930 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, -1, { "(.)\\1", NULL }, "\xc2\x80\x80" },
1931 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 6, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
1932 { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
1933 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 8, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
1934 { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
1935
1936 { UDA, CPI, 0, 0, 0, 0, 1, { "\\X", NULL }, "A" },
1937 { UDA, CPI, 0, 0, 0, -1, -1, { "\\X", NULL }, "\xff" },
1938 { UDA, CPI, 0, 0, 0, 0, 2, { "\\X", NULL }, "\xc3\xa1" },
1939 { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xc3\xa1" },
1940 { UDA, CPI, 0, 0, 0, -1, -1, { "\\X", NULL }, "\xc3\x7f" },
1941 { UDA, CPI, 0, 0, 0, 0, 3, { "\\X", NULL }, "\xe1\xbd\xb8" },
1942 { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xe1\xbd\xb8" },
1943 { UDA, CPI, 0, 0, 0, 0, 4, { "\\X", NULL }, "\xf0\x90\x90\x80" },
1944 { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xf0\x90\x90\x80" },
1945
1946 { UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "#" },
1947 { UDA, CPI, 0, 0, 0, 0, 4, { "[^#]", NULL }, "\xf4\x8f\xbf\xbf" },
1948 { UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "\xf4\x90\x80\x80" },
1949 { UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "\xc1\x80" },
1950
1951 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { "^\\W", NULL }, " \x0a#"},
1952 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 14, 15, { "^\\W", NULL }, " \xc0\x8a#\xe0\x80\x8a#\xf0\x80\x80\x8a#\x0a#"},
1953 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf8\x0a#"},
1954 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xc3\x0a#"},
1955 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf1\x0a#"},
1956 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xf2\xbf\x0a#"},
1957 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 5, 6, { "^\\W", NULL }, " \xf2\xbf\xbf\x0a#"},
1958 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xef\x0a#"},
1959 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xef\xbf\x0a#"},
1960 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 5, 6, { "^\\W", NULL }, " \x85#\xc2\x85#"},
1961 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 7, 8, { "^\\W", NULL }, " \xe2\x80\xf8\xe2\x80\xa8#"},
1962
1963 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "\xe2\x80\xf8\xe2\x80\xa8#"},
1964 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 3, 4, { "#", NULL }, "\xe2\x80\xf8#\xe2\x80\xa8#"},
1965 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "abcd\xc2\x85#"},
1966 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 1, 2, { "#", NULL }, "\x85#\xc2\x85#"},
1967 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 5, 6, { "#", NULL }, "\xef,\x80,\xf8#\x0a"},
1968 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "\xef,\x80,\xf8\x0a#"},
1969
1970 { PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 4, 8, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7#\xc7\x85#" },
1971 { PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 7, 11, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7\x80\x80\x80#\xc7\x85#" },
1972 { PCRE2_UTF, CI, 0, 0, 0, 4, 8, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7#\xc7\x85#" },
1973 { PCRE2_UTF, CI, 0, 0, 0, 7, 11, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7\x80\x80\x80#\xc7\x85#" },
1974
1975 { PCRE2_UTF | PCRE2_UCP, CI, 0, 0, 0, -1, -1, { "[\\s]", NULL }, "\xed\xa0\x80" },
1976
1977 /* These two are not invalid UTF tests, but this infrastructure fits better for them. */
1978 { 0, PCRE2_JIT_COMPLETE, 0, 0, 1, -1, -1, { "\\X{2}", NULL }, "\r\n\n" },
1979 { 0, PCRE2_JIT_COMPLETE, 0, 0, 1, -1, -1, { "\\R{2}", NULL }, "\r\n\n" },
1980
1981 { PCRE2_UTF | PCRE2_MULTILINE, CI, 0, 0, 0, -1, -1, { "^.a", &invalid_utf8_newline_cr }, "\xc3\xa7#a" },
1982
1983 { 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
1984 };
1985
1986 #undef UDA
1987 #undef CI
1988 #undef CPI
1989
/* Runs one pattern of an invalid UTF-8 test case through the JIT compiler
   and checks the match result.

   current        the test case (options, patterns, input, expected offsets)
   pattern_index  number printed in diagnostics to identify the test case
   i              selects current->pattern[i]
   ccontext       compile context passed to pcre2_compile_8()
   mdata          match data block which receives the match results

   The input is trimmed by skip_left bytes at the front and skip_right bytes
   at the back, and the start offset is adjusted by skip_left. The pattern
   is matched with pcre2_jit_match_8(): once without match options when
   PCRE2_JIT_COMPLETE is set in jit_compile_options, and once with
   PCRE2_PARTIAL_SOFT when PCRE2_JIT_PARTIAL_SOFT is set.

   Returns 1 when current->pattern[i] is NULL (nothing to run) or when every
   executed match produced the expected result. Returns 0 after printing a
   diagnostic otherwise. Diagnostics for compile failures identify the
   pattern as "Pattern[pattern_index:i]". */

static int run_invalid_utf8_test(const struct invalid_utf8_regression_test_case *current,
	int pattern_index, int i, pcre2_compile_context_8 *ccontext, pcre2_match_data_8 *mdata)
{
	pcre2_code_8 *code;
	const PCRE2_UCHAR8 *subject;
	int result, errorcode;
	int is_successful = 0;
	PCRE2_SIZE length, start, erroroffset;
	PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_8(mdata);

	if (current->pattern[i] == NULL)
		return 1;

	code = pcre2_compile_8((const PCRE2_UCHAR8 *)current->pattern[i], PCRE2_ZERO_TERMINATED,
		current->compile_options, &errorcode, &erroroffset, ccontext);

	if (!code) {
		/* Nothing has been allocated yet, so no cleanup is needed. */
		printf("Pattern[%d:%d] cannot be compiled. Error code: %d, offset: %d\n",
			pattern_index, i, errorcode, (int)erroroffset);
		return 0;
	}

	if (pcre2_jit_compile_8(code, current->jit_compile_options) != 0) {
		printf("Pattern[%d:%d] cannot be compiled by the JIT compiler.\n", pattern_index, i);
		goto cleanup;
	}

	/* Both match attempts use the same trimmed view of the input. */
	subject = (const PCRE2_UCHAR8 *)(current->input + current->skip_left);
	length = (PCRE2_SIZE)(strlen(current->input) - current->skip_left - current->skip_right);
	start = (PCRE2_SIZE)(current->start_offset - current->skip_left);

	if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
		result = pcre2_jit_match_8(code, subject, length, start, 0, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "match", result,
				current->match_start, current->match_end, ovector))
			goto cleanup;
	}

	if (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT) {
		result = pcre2_jit_match_8(code, subject, length, start, PCRE2_PARTIAL_SOFT, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "partial match", result,
				current->match_start, current->match_end, ovector))
			goto cleanup;
	}

	is_successful = 1;

cleanup:
	/* Single exit point for every path that owns the compiled pattern. */
	pcre2_code_free_8(code);
	return is_successful;
}
2040
invalid_utf8_regression_tests(void)2041 static int invalid_utf8_regression_tests(void)
2042 {
2043 const struct invalid_utf8_regression_test_case *current;
2044 pcre2_compile_context_8 *ccontext;
2045 pcre2_match_data_8 *mdata;
2046 int total = 0, successful = 0;
2047 int result;
2048
2049 printf("\nRunning invalid-utf8 JIT regression tests\n");
2050
2051 ccontext = pcre2_compile_context_create_8(NULL);
2052 pcre2_set_newline_8(ccontext, PCRE2_NEWLINE_ANY);
2053 mdata = pcre2_match_data_create_8(4, NULL);
2054
2055 for (current = invalid_utf8_regression_test_cases; current->pattern[0]; current++) {
2056 /* printf("\nPattern: %s :\n", current->pattern); */
2057 total++;
2058
2059 result = 1;
2060 if (current->pattern[1] != &invalid_utf8_newline_cr)
2061 {
2062 if (!run_invalid_utf8_test(current, total - 1, 0, ccontext, mdata))
2063 result = 0;
2064 if (!run_invalid_utf8_test(current, total - 1, 1, ccontext, mdata))
2065 result = 0;
2066 } else {
2067 pcre2_set_newline_8(ccontext, PCRE2_NEWLINE_CR);
2068 if (!run_invalid_utf8_test(current, total - 1, 0, ccontext, mdata))
2069 result = 0;
2070 pcre2_set_newline_8(ccontext, PCRE2_NEWLINE_ANY);
2071 }
2072
2073 if (result) {
2074 successful++;
2075 }
2076
2077 printf(".");
2078 if ((total % 60) == 0)
2079 printf("\n");
2080 }
2081
2082 if ((total % 60) != 0)
2083 printf("\n");
2084
2085 pcre2_match_data_free_8(mdata);
2086 pcre2_compile_context_free_8(ccontext);
2087
2088 if (total == successful) {
2089 printf("\nAll invalid UTF8 JIT regression tests are successfully passed.\n");
2090 return 0;
2091 } else {
2092 printf("\nInvalid UTF8 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
2093 return 1;
2094 }
2095 }
2096
2097 #else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_8 */
2098
/* Stand-in used when Unicode or 8-bit support is not compiled in: there is
nothing to run, so report "no failures". */
static int invalid_utf8_regression_tests(void)
{
	const int failures = 0;

	return failures;
}
2103
2104 #endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_8 */
2105
2106 #if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_16
2107
/* Option shorthands for the 16-bit test table; they are #undef'd after it. */
/* UDA: compile options - UTF, DOTALL and ANCHORED. */
#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
/* CI: JIT options - complete matching with invalid UTF enabled. */
#define CI (PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF)
/* CPI: the same as CI plus soft partial matching. */
#define CPI (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_INVALID_UTF)
2111
/* One invalid-UTF-16 JIT regression test. The table entries are positional
initializers, so the field order here is the column order of the table. */
struct invalid_utf16_regression_test_case {
	/* Options passed to pcre2_compile_16(). */
	int compile_options;
	/* Options passed to pcre2_jit_compile_16(); the PCRE2_JIT_COMPLETE and
	PCRE2_JIT_PARTIAL_SOFT bits also select which match modes are run. */
	int jit_compile_options;
	/* Start offset relative to the full input; skip_left is subtracted
	before it is passed to pcre2_jit_match_16(). */
	int start_offset;
	/* Number of code units dropped from the start of the input. */
	int skip_left;
	/* Number of code units dropped from the end of the input. */
	int skip_right;
	/* Expected values handed to check_invalid_utf_result() together with
	the match result and the ovector. NOTE(review): -1, -1 presumably means
	"no match expected" - confirm against check_invalid_utf_result(). */
	int match_start;
	int match_end;
	/* Up to two zero-terminated patterns; a NULL entry is skipped, and a
	NULL pattern[0] terminates the table. */
	const PCRE2_UCHAR16 *pattern[2];
	/* Zero-terminated subject; its length is counted up to the first 0. */
	const PCRE2_UCHAR16 *input;
};
2123
2124 static PCRE2_UCHAR16 allany16[] = { '.', 0 };
2125 static PCRE2_UCHAR16 non_word_boundary16[] = { '\\', 'B', 0 };
2126 static PCRE2_UCHAR16 word_boundary16[] = { '\\', 'b', 0 };
2127 static PCRE2_UCHAR16 backreference16[] = { '(', '.', ')', '\\', '1', 0 };
2128 static PCRE2_UCHAR16 grapheme16[] = { '\\', 'X', 0 };
2129 static PCRE2_UCHAR16 nothashmark16[] = { '[', '^', '#', ']', 0 };
2130 static PCRE2_UCHAR16 afternl16[] = { '^', '\\', 'W', 0 };
2131 static PCRE2_UCHAR16 generic16[] = { '#', 0xd800, 0xdc00, '#', 0 };
2132 static PCRE2_UCHAR16 test16_1[] = { 0xd7ff, 0xe000, 0xffff, 0x01, '#', 0 };
2133 static PCRE2_UCHAR16 test16_2[] = { 0xd800, 0xdc00, 0xd800, 0xdc00, 0 };
2134 static PCRE2_UCHAR16 test16_3[] = { 0xdbff, 0xdfff, 0xdbff, 0xdfff, 0 };
2135 static PCRE2_UCHAR16 test16_4[] = { 0xd800, 0xdbff, 0xd800, 0xdbff, 0 };
2136 static PCRE2_UCHAR16 test16_5[] = { '#', 0xd800, 0xdc00, '#', 0 };
2137 static PCRE2_UCHAR16 test16_6[] = { 'a', 'A', 0xdc28, 0 };
2138 static PCRE2_UCHAR16 test16_7[] = { 0xd801, 0xdc00, 0xd801, 0xdc28, 0 };
2139 static PCRE2_UCHAR16 test16_8[] = { '#', 0xd800, 0xdc00, 0 };
2140 static PCRE2_UCHAR16 test16_9[] = { ' ', 0x2028, '#', 0 };
2141 static PCRE2_UCHAR16 test16_10[] = { ' ', 0xdc00, 0xd800, 0x2028, '#', 0 };
2142 static PCRE2_UCHAR16 test16_11[] = { 0xdc00, 0xdc00, 0xd800, 0xdc00, 0xdc00, '#', 0xd800, 0xdc00, '#', 0 };
2143 static PCRE2_UCHAR16 test16_12[] = { '#', 0xd800, 0xdc00, 0xd800, '#', 0xd800, 0xdc00, 0xdc00, 0xdc00, '#', 0xd800, 0xdc00, '#', 0 };
2144
/* Invalid UTF-16 test table. Columns, in order:
  compile_options, jit_compile_options, start_offset, skip_left, skip_right,
  match_start, match_end, { pattern[0], pattern[1] }, input
Offsets are in 16-bit code units. The final all-zero entry, whose pattern[0]
is NULL, terminates the table. */
static const struct invalid_utf16_regression_test_case invalid_utf16_regression_test_cases[] = {
	{ UDA, CI, 0, 0, 0, 0, 1, { allany16, NULL }, test16_1 },
	{ UDA, CI, 1, 0, 0, 1, 2, { allany16, NULL }, test16_1 },
	{ UDA, CI, 2, 0, 0, 2, 3, { allany16, NULL }, test16_1 },
	{ UDA, CI, 3, 0, 0, 3, 4, { allany16, NULL }, test16_1 },
	{ UDA, CI, 0, 0, 0, 0, 2, { allany16, NULL }, test16_2 },
	{ UDA, CI, 0, 0, 3, -1, -1, { allany16, NULL }, test16_2 },
	{ UDA, CI, 1, 0, 0, -1, -1, { allany16, NULL }, test16_2 },
	{ UDA, CI, 0, 0, 0, 0, 2, { allany16, NULL }, test16_3 },
	{ UDA, CI, 0, 0, 3, -1, -1, { allany16, NULL }, test16_3 },
	{ UDA, CI, 1, 0, 0, -1, -1, { allany16, NULL }, test16_3 },

	{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary16, NULL }, test16_1 },
	{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_1 },
	{ UDA, CPI, 3, 0, 0, 3, 3, { non_word_boundary16, NULL }, test16_1 },
	{ UDA, CPI, 4, 0, 0, 4, 4, { non_word_boundary16, NULL }, test16_1 },
	{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_2 },
	{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_3 },
	{ UDA, CPI, 2, 1, 1, -1, -1, { non_word_boundary16, word_boundary16 }, test16_2 },
	{ UDA, CPI, 2, 1, 1, -1, -1, { non_word_boundary16, word_boundary16 }, test16_3 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_4 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_5 },

	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference16, NULL }, test16_6 },
	{ UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference16, NULL }, test16_6 },
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { backreference16, NULL }, test16_7 },
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { backreference16, NULL }, test16_7 },

	{ UDA, CPI, 0, 0, 0, 0, 1, { grapheme16, NULL }, test16_6 },
	{ UDA, CPI, 1, 0, 0, 1, 2, { grapheme16, NULL }, test16_6 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme16, NULL }, test16_6 },
	{ UDA, CPI, 0, 0, 0, 0, 2, { grapheme16, NULL }, test16_7 },
	{ UDA, CPI, 2, 0, 0, 2, 4, { grapheme16, NULL }, test16_7 },
	{ UDA, CPI, 1, 0, 0, -1, -1, { grapheme16, NULL }, test16_7 },

	{ UDA, CPI, 0, 0, 0, -1, -1, { nothashmark16, NULL }, test16_8 },
	{ UDA, CPI, 1, 0, 0, 1, 3, { nothashmark16, NULL }, test16_8 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark16, NULL }, test16_8 },

	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl16, NULL }, test16_9 },
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { afternl16, NULL }, test16_10 },

	{ PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 5, 9, { generic16, NULL }, test16_11 },
	{ PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 9, 13, { generic16, NULL }, test16_12 },
	{ PCRE2_UTF, CI, 0, 0, 0, 5, 9, { generic16, NULL }, test16_11 },
	{ PCRE2_UTF, CI, 0, 0, 0, 9, 13, { generic16, NULL }, test16_12 },

	/* Terminator. */
	{ 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
};
2194
2195 #undef UDA
2196 #undef CI
2197 #undef CPI
2198
/* Compile pattern[i] of one invalid-UTF-16 test case, JIT-compile it, and run
the JIT matcher in every mode requested by the case's jit_compile_options.

Arguments:
  current        the test case (options, offsets, patterns and input)
  pattern_index  index of the test case, used only in diagnostics
  i              which entry of current->pattern to run
  ccontext       compile context passed to pcre2_compile_16()
  mdata          match data block whose ovector is checked after each match

Returns 1 when pattern[i] is NULL (nothing to run) or when every requested
match mode is accepted by check_invalid_utf_result(). Returns 0 when the
pattern cannot be compiled or JIT-compiled (a "Pattern[case:pattern]" message
is printed) or as soon as check_invalid_utf_result() returns non-zero. */
static int run_invalid_utf16_test(const struct invalid_utf16_regression_test_case *current,
	int pattern_index, int i, pcre2_compile_context_16 *ccontext, pcre2_match_data_16 *mdata)
{
	pcre2_code_16 *code;
	int result, errorcode;
	PCRE2_SIZE length, erroroffset;
	const PCRE2_UCHAR16 *input;
	PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_16(mdata);

	if (current->pattern[i] == NULL)
		return 1;

	code = pcre2_compile_16(current->pattern[i], PCRE2_ZERO_TERMINATED,
		current->compile_options, &errorcode, &erroroffset, ccontext);

	if (!code) {
		/* Report the real sub-pattern index, not a fixed 0. */
		printf("Pattern[%d:%d] cannot be compiled. Error offset: %d\n", pattern_index, i, (int)erroroffset);
		return 0;
	}

	if (pcre2_jit_compile_16(code, current->jit_compile_options) != 0) {
		printf("Pattern[%d:%d] cannot be compiled by the JIT compiler.\n", pattern_index, i);
		pcre2_code_free_16(code);
		return 0;
	}

	/* Count the code units of the zero-terminated input. */
	input = current->input;
	length = 0;

	while (*input++ != 0)
		length++;

	/* The subject handed to the matcher is the input with skip_left code
	units dropped from the front and skip_right dropped from the end. */
	length -= current->skip_left + current->skip_right;

	if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
		/* start_offset is relative to the full input, so rebase it. */
		result = pcre2_jit_match_16(code, (current->input + current->skip_left),
			length, current->start_offset - current->skip_left, 0, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector)) {
			pcre2_code_free_16(code);
			return 0;
		}
	}

	if (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT) {
		result = pcre2_jit_match_16(code, (current->input + current->skip_left),
			length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector)) {
			pcre2_code_free_16(code);
			return 0;
		}
	}

	pcre2_code_free_16(code);
	return 1;
}
2256
invalid_utf16_regression_tests(void)2257 static int invalid_utf16_regression_tests(void)
2258 {
2259 const struct invalid_utf16_regression_test_case *current;
2260 pcre2_compile_context_16 *ccontext;
2261 pcre2_match_data_16 *mdata;
2262 int total = 0, successful = 0;
2263 int result;
2264
2265 printf("\nRunning invalid-utf16 JIT regression tests\n");
2266
2267 ccontext = pcre2_compile_context_create_16(NULL);
2268 pcre2_set_newline_16(ccontext, PCRE2_NEWLINE_ANY);
2269 mdata = pcre2_match_data_create_16(4, NULL);
2270
2271 for (current = invalid_utf16_regression_test_cases; current->pattern[0]; current++) {
2272 /* printf("\nPattern: %s :\n", current->pattern); */
2273 total++;
2274
2275 result = 1;
2276 if (!run_invalid_utf16_test(current, total - 1, 0, ccontext, mdata))
2277 result = 0;
2278 if (!run_invalid_utf16_test(current, total - 1, 1, ccontext, mdata))
2279 result = 0;
2280
2281 if (result) {
2282 successful++;
2283 }
2284
2285 printf(".");
2286 if ((total % 60) == 0)
2287 printf("\n");
2288 }
2289
2290 if ((total % 60) != 0)
2291 printf("\n");
2292
2293 pcre2_match_data_free_16(mdata);
2294 pcre2_compile_context_free_16(ccontext);
2295
2296 if (total == successful) {
2297 printf("\nAll invalid UTF16 JIT regression tests are successfully passed.\n");
2298 return 0;
2299 } else {
2300 printf("\nInvalid UTF16 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
2301 return 1;
2302 }
2303 }
2304
2305 #else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_16 */
2306
/* Stand-in used when Unicode or 16-bit support is not compiled in: there is
nothing to run, so report "no failures". */
static int invalid_utf16_regression_tests(void)
{
	const int failures = 0;

	return failures;
}
2311
2312 #endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_16 */
2313
2314 #if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_32
2315
/* Option shorthands for the 32-bit test table; they are #undef'd after it. */
/* UDA: compile options - UTF, DOTALL and ANCHORED. */
#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
/* CI: JIT options - complete matching with invalid UTF enabled. */
#define CI (PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF)
/* CPI: the same as CI plus soft partial matching. */
#define CPI (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_INVALID_UTF)
2319
/* One invalid-UTF-32 JIT regression test. The table entries are positional
initializers, so the field order here is the column order of the table. */
struct invalid_utf32_regression_test_case {
	/* Options passed to pcre2_compile_32(). */
	int compile_options;
	/* Options passed to pcre2_jit_compile_32(); the PCRE2_JIT_COMPLETE and
	PCRE2_JIT_PARTIAL_SOFT bits also select which match modes are run. */
	int jit_compile_options;
	/* Start offset relative to the full input; skip_left is subtracted
	before it is passed to pcre2_jit_match_32(). */
	int start_offset;
	/* Number of code units dropped from the start of the input. */
	int skip_left;
	/* Number of code units dropped from the end of the input. */
	int skip_right;
	/* Expected values handed to check_invalid_utf_result() together with
	the match result and the ovector. NOTE(review): -1, -1 presumably means
	"no match expected" - confirm against check_invalid_utf_result(). */
	int match_start;
	int match_end;
	/* Up to two zero-terminated patterns; a NULL entry is skipped, and a
	NULL pattern[0] terminates the table. */
	const PCRE2_UCHAR32 *pattern[2];
	/* Zero-terminated subject; its length is counted up to the first 0. */
	const PCRE2_UCHAR32 *input;
};
2331
2332 static PCRE2_UCHAR32 allany32[] = { '.', 0 };
2333 static PCRE2_UCHAR32 non_word_boundary32[] = { '\\', 'B', 0 };
2334 static PCRE2_UCHAR32 word_boundary32[] = { '\\', 'b', 0 };
2335 static PCRE2_UCHAR32 backreference32[] = { '(', '.', ')', '\\', '1', 0 };
2336 static PCRE2_UCHAR32 grapheme32[] = { '\\', 'X', 0 };
2337 static PCRE2_UCHAR32 nothashmark32[] = { '[', '^', '#', ']', 0 };
2338 static PCRE2_UCHAR32 afternl32[] = { '^', '\\', 'W', 0 };
2339 static PCRE2_UCHAR32 test32_1[] = { 0x10ffff, 0x10ffff, 0x110000, 0x110000, 0x10ffff, 0 };
2340 static PCRE2_UCHAR32 test32_2[] = { 0xd7ff, 0xe000, 0xd800, 0xdfff, 0xe000, 0xdfff, 0xd800, 0 };
2341 static PCRE2_UCHAR32 test32_3[] = { 'a', 'A', 0x110000, 0 };
2342 static PCRE2_UCHAR32 test32_4[] = { '#', 0x10ffff, 0x110000, 0 };
2343 static PCRE2_UCHAR32 test32_5[] = { ' ', 0x2028, '#', 0 };
2344 static PCRE2_UCHAR32 test32_6[] = { ' ', 0x110000, 0x2028, '#', 0 };
2345
/* Invalid UTF-32 test table. Columns, in order:
  compile_options, jit_compile_options, start_offset, skip_left, skip_right,
  match_start, match_end, { pattern[0], pattern[1] }, input
Offsets are in 32-bit code units. The final all-zero entry, whose pattern[0]
is NULL, terminates the table. */
static const struct invalid_utf32_regression_test_case invalid_utf32_regression_test_cases[] = {
	{ UDA, CI, 0, 0, 0, 0, 1, { allany32, NULL }, test32_1 },
	{ UDA, CI, 2, 0, 0, -1, -1, { allany32, NULL }, test32_1 },
	{ UDA, CI, 0, 0, 0, 0, 1, { allany32, NULL }, test32_2 },
	{ UDA, CI, 1, 0, 0, 1, 2, { allany32, NULL }, test32_2 },
	{ UDA, CI, 2, 0, 0, -1, -1, { allany32, NULL }, test32_2 },
	{ UDA, CI, 3, 0, 0, -1, -1, { allany32, NULL }, test32_2 },

	{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary32, NULL }, test32_1 },
	{ UDA, CPI, 3, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_1 },
	{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary32, NULL }, test32_2 },
	{ UDA, CPI, 3, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_2 },
	{ UDA, CPI, 6, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_2 },

	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference32, NULL }, test32_3 },
	{ UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference32, NULL }, test32_3 },

	{ UDA, CPI, 0, 0, 0, 0, 1, { grapheme32, NULL }, test32_1 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme32, NULL }, test32_1 },
	{ UDA, CPI, 1, 0, 0, 1, 2, { grapheme32, NULL }, test32_2 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme32, NULL }, test32_2 },
	{ UDA, CPI, 3, 0, 0, -1, -1, { grapheme32, NULL }, test32_2 },
	{ UDA, CPI, 4, 0, 0, 4, 5, { grapheme32, NULL }, test32_2 },

	{ UDA, CPI, 0, 0, 0, -1, -1, { nothashmark32, NULL }, test32_4 },
	{ UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_4 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_4 },
	{ UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_2 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_2 },

	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl32, NULL }, test32_5 },
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { afternl32, NULL }, test32_6 },

	/* Terminator. */
	{ 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
};
2381
2382 #undef UDA
2383 #undef CI
2384 #undef CPI
2385
/* Compile pattern[i] of one invalid-UTF-32 test case, JIT-compile it, and run
the JIT matcher in every mode requested by the case's jit_compile_options.

Arguments:
  current        the test case (options, offsets, patterns and input)
  pattern_index  index of the test case, used only in diagnostics
  i              which entry of current->pattern to run
  ccontext       compile context passed to pcre2_compile_32()
  mdata          match data block whose ovector is checked after each match

Returns 1 when pattern[i] is NULL (nothing to run) or when every requested
match mode is accepted by check_invalid_utf_result(). Returns 0 when the
pattern cannot be compiled or JIT-compiled (a "Pattern[case:pattern]" message
is printed) or as soon as check_invalid_utf_result() returns non-zero. */
static int run_invalid_utf32_test(const struct invalid_utf32_regression_test_case *current,
	int pattern_index, int i, pcre2_compile_context_32 *ccontext, pcre2_match_data_32 *mdata)
{
	pcre2_code_32 *code;
	int result, errorcode;
	PCRE2_SIZE length, erroroffset;
	const PCRE2_UCHAR32 *input;
	PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_32(mdata);

	if (current->pattern[i] == NULL)
		return 1;

	code = pcre2_compile_32(current->pattern[i], PCRE2_ZERO_TERMINATED,
		current->compile_options, &errorcode, &erroroffset, ccontext);

	if (!code) {
		/* Report the real sub-pattern index, not a fixed 0. */
		printf("Pattern[%d:%d] cannot be compiled. Error offset: %d\n", pattern_index, i, (int)erroroffset);
		return 0;
	}

	if (pcre2_jit_compile_32(code, current->jit_compile_options) != 0) {
		printf("Pattern[%d:%d] cannot be compiled by the JIT compiler.\n", pattern_index, i);
		pcre2_code_free_32(code);
		return 0;
	}

	/* Count the code units of the zero-terminated input. */
	input = current->input;
	length = 0;

	while (*input++ != 0)
		length++;

	/* The subject handed to the matcher is the input with skip_left code
	units dropped from the front and skip_right dropped from the end. */
	length -= current->skip_left + current->skip_right;

	if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
		/* start_offset is relative to the full input, so rebase it. */
		result = pcre2_jit_match_32(code, (current->input + current->skip_left),
			length, current->start_offset - current->skip_left, 0, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector)) {
			pcre2_code_free_32(code);
			return 0;
		}
	}

	if (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT) {
		result = pcre2_jit_match_32(code, (current->input + current->skip_left),
			length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector)) {
			pcre2_code_free_32(code);
			return 0;
		}
	}

	pcre2_code_free_32(code);
	return 1;
}
2443
invalid_utf32_regression_tests(void)2444 static int invalid_utf32_regression_tests(void)
2445 {
2446 const struct invalid_utf32_regression_test_case *current;
2447 pcre2_compile_context_32 *ccontext;
2448 pcre2_match_data_32 *mdata;
2449 int total = 0, successful = 0;
2450 int result;
2451
2452 printf("\nRunning invalid-utf32 JIT regression tests\n");
2453
2454 ccontext = pcre2_compile_context_create_32(NULL);
2455 pcre2_set_newline_32(ccontext, PCRE2_NEWLINE_ANY);
2456 mdata = pcre2_match_data_create_32(4, NULL);
2457
2458 for (current = invalid_utf32_regression_test_cases; current->pattern[0]; current++) {
2459 /* printf("\nPattern: %s :\n", current->pattern); */
2460 total++;
2461
2462 result = 1;
2463 if (!run_invalid_utf32_test(current, total - 1, 0, ccontext, mdata))
2464 result = 0;
2465 if (!run_invalid_utf32_test(current, total - 1, 1, ccontext, mdata))
2466 result = 0;
2467
2468 if (result) {
2469 successful++;
2470 }
2471
2472 printf(".");
2473 if ((total % 60) == 0)
2474 printf("\n");
2475 }
2476
2477 if ((total % 60) != 0)
2478 printf("\n");
2479
2480 pcre2_match_data_free_32(mdata);
2481 pcre2_compile_context_free_32(ccontext);
2482
2483 if (total == successful) {
2484 printf("\nAll invalid UTF32 JIT regression tests are successfully passed.\n");
2485 return 0;
2486 } else {
2487 printf("\nInvalid UTF32 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
2488 return 1;
2489 }
2490 }
2491
2492 #else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_32 */
2493
/* Stand-in used when Unicode or 32-bit support is not compiled in: there is
nothing to run, so report "no failures". */
static int invalid_utf32_regression_tests(void)
{
	const int failures = 0;

	return failures;
}
2498
2499 #endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_32 */
2500
2501 /* End of pcre2_jit_test.c */
2502