1# This set of tests is for UTF-8 support and Unicode property support, with 2# relevance only for the 8-bit library. 3 4# The next 5 patterns have UTF-8 errors 5 6/[�]/utf 7 8/�/utf 9 10/���xxx/utf 11 12/Â��������/utf 13 14/Â��������/match_invalid_utf 15 16# Now test subjects 17 18/badutf/utf 19\= Expect UTF-8 errors 20 X\xdf 21 XX\xef 22 XXX\xef\x80 23 X\xf7 24 XX\xf7\x80 25 XXX\xf7\x80\x80 26 \xfb 27 \xfb\x80 28 \xfb\x80\x80 29 \xfb\x80\x80\x80 30 \xfd 31 \xfd\x80 32 \xfd\x80\x80 33 \xfd\x80\x80\x80 34 \xfd\x80\x80\x80\x80 35 \xdf\x7f 36 \xef\x7f\x80 37 \xef\x80\x7f 38 \xf7\x7f\x80\x80 39 \xf7\x80\x7f\x80 40 \xf7\x80\x80\x7f 41 \xfb\x7f\x80\x80\x80 42 \xfb\x80\x7f\x80\x80 43 \xfb\x80\x80\x7f\x80 44 \xfb\x80\x80\x80\x7f 45 \xfd\x7f\x80\x80\x80\x80 46 \xfd\x80\x7f\x80\x80\x80 47 \xfd\x80\x80\x7f\x80\x80 48 \xfd\x80\x80\x80\x7f\x80 49 \xfd\x80\x80\x80\x80\x7f 50 \xed\xa0\x80 51 \xc0\x8f 52 \xe0\x80\x8f 53 \xf0\x80\x80\x8f 54 \xf8\x80\x80\x80\x8f 55 \xfc\x80\x80\x80\x80\x8f 56 \x80 57 \xfe 58 \xff 59 60/badutf/utf 61\= Expect UTF-8 errors 62 XX\xfb\x80\x80\x80\x80 63 XX\xfd\x80\x80\x80\x80\x80 64 XX\xf7\xbf\xbf\xbf 65 66/shortutf/utf 67\= Expect UTF-8 errors 68 XX\xdf\=ph 69 XX\xef\=ph 70 XX\xef\x80\=ph 71 \xf7\=ph 72 \xf7\x80\=ph 73 \xf7\x80\x80\=ph 74 \xfb\=ph 75 \xfb\x80\=ph 76 \xfb\x80\x80\=ph 77 \xfb\x80\x80\x80\=ph 78 \xfd\=ph 79 \xfd\x80\=ph 80 \xfd\x80\x80\=ph 81 \xfd\x80\x80\x80\=ph 82 \xfd\x80\x80\x80\x80\=ph 83 84/anything/utf 85\= Expect UTF-8 errors 86 X\xc0\x80 87 XX\xc1\x8f 88 XXX\xe0\x9f\x80 89 \xf0\x8f\x80\x80 90 \xf8\x87\x80\x80\x80 91 \xfc\x83\x80\x80\x80\x80 92 \xfe\x80\x80\x80\x80\x80 93 \xff\x80\x80\x80\x80\x80 94 \xf8\x88\x80\x80\x80 95 \xf9\x87\x80\x80\x80 96 \xfc\x84\x80\x80\x80\x80 97 \xfd\x83\x80\x80\x80\x80 98\= Expect no match 99 \xc3\x8f 100 \xe0\xaf\x80 101 \xe1\x80\x80 102 \xf0\x9f\x80\x80 103 \xf1\x8f\x80\x80 104 \xf8\x88\x80\x80\x80\=no_utf_check 105 \xf9\x87\x80\x80\x80\=no_utf_check 106 \xfc\x84\x80\x80\x80\x80\=no_utf_check 107 \xfd\x83\x80\x80\x80\x80\=no_utf_check 108 109# Similar tests with offsets 110 111/badutf/utf 112\= Expect UTF-8 errors 113 X\xdfabcd 114 X\xdfabcd\=offset=1 115\= Expect no match 116 X\xdfabcd\=offset=2 117 118/(?<=x)badutf/utf 119\= Expect UTF-8 errors 120 X\xdfabcd 121 X\xdfabcd\=offset=1 122 X\xdfabcd\=offset=2 123 X\xdfabcd\xdf\=offset=3 124\= Expect no match 125 X\xdfabcd\=offset=3 126 127/(?<=xx)badutf/utf 128\= Expect UTF-8 errors 129 X\xdfabcd 130 X\xdfabcd\=offset=1 131 X\xdfabcd\=offset=2 132 X\xdfabcd\=offset=3 133 134/(?<=xxxx)badutf/utf 135\= Expect UTF-8 errors 136 X\xdfabcd 137 X\xdfabcd\=offset=1 138 X\xdfabcd\=offset=2 139 X\xdfabcd\=offset=3 140 X\xdfabc\xdf\=offset=6 141 X\xdfabc\xdf\=offset=7 142\= Expect no match 143 X\xdfabcd\=offset=6 144 145/\x{100}/IB,utf 146 147/\x{1000}/IB,utf 148 149/\x{10000}/IB,utf 150 151/\x{100000}/IB,utf 152 153/\x{10ffff}/IB,utf 154 155/[\x{ff}]/IB,utf 156 157/[\x{100}]/IB,utf 158 159/\x80/IB,utf 160 161/\xff/IB,utf 162 163/\x{D55c}\x{ad6d}\x{C5B4}/IB,utf 164 \x{D55c}\x{ad6d}\x{C5B4} 165 166/\x{65e5}\x{672c}\x{8a9e}/IB,utf 167 \x{65e5}\x{672c}\x{8a9e} 168 169/\x{80}/IB,utf 170 171/\x{084}/IB,utf 172 173/\x{104}/IB,utf 174 175/\x{861}/IB,utf 176 177/\x{212ab}/IB,utf 178 179/[^ab\xC0-\xF0]/IB,utf 180 \x{f1} 181 \x{bf} 182 \x{100} 183 \x{1000} 184\= Expect no match 185 \x{c0} 186 \x{f0} 187 188/Ā{3,4}/IB,utf 189 \x{100}\x{100}\x{100}\x{100\x{100} 190 191/(\x{100}+|x)/IB,utf 192 193/(\x{100}*a|x)/IB,utf 194 195/(\x{100}{0,2}a|x)/IB,utf 196 197/(\x{100}{1,2}a|x)/IB,utf 198 199/\x{100}/IB,utf 200 201/a\x{100}\x{101}*/IB,utf 202 203/a\x{100}\x{101}+/IB,utf 204 205/[^\x{c4}]/IB 206 207/[\x{100}]/IB,utf 208 \x{100} 209 Z\x{100} 210 \x{100}Z 211 212/[\xff]/IB,utf 213 >\x{ff}< 214 215/[^\xff]/IB,utf 216 217/\x{100}abc(xyz(?1))/IB,utf 218 219/\777/I,utf 220 \x{1ff} 221 \777 222 223/\x{100}+\x{200}/IB,utf 224 225/\x{100}+X/IB,utf 226 227/^[\QĀ\E-\QŐ\E/B,utf 228 229# This tests the stricter UTF-8 check according to RFC 3629. 230 231/X/utf 232\= Expect UTF-8 errors 233 \x{d800} 234 \x{da00} 235 \x{dfff} 236 \x{110000} 237 \x{2000000} 238 \x{7fffffff} 239\= Expect no match 240 \x{d800}\=no_utf_check 241 \x{da00}\=no_utf_check 242 \x{dfff}\=no_utf_check 243 \x{110000}\=no_utf_check 244 \x{2000000}\=no_utf_check 245 \x{7fffffff}\=no_utf_check 246 247/(*UTF8)\x{1234}/ 248 abcd\x{1234}pqr 249 250/(*CRLF)(*UTF)(*BSR_UNICODE)a\Rb/I 251 252/\h/I,utf 253 ABC\x{09} 254 ABC\x{20} 255 ABC\x{a0} 256 ABC\x{1680} 257 ABC\x{180e} 258 ABC\x{2000} 259 ABC\x{202f} 260 ABC\x{205f} 261 ABC\x{3000} 262 263/\v/I,utf 264 ABC\x{0a} 265 ABC\x{0b} 266 ABC\x{0c} 267 ABC\x{0d} 268 ABC\x{85} 269 ABC\x{2028} 270 271/\h*A/I,utf 272 CDBABC 273 274/\v+A/I,utf 275 276/\s?xxx\s/I,utf 277 278/\sxxx\s/I,utf,tables=2 279 AB\x{85}xxx\x{a0}XYZ 280 AB\x{a0}xxx\x{85}XYZ 281 282/\S \S/I,utf,tables=2 283 \x{a2} \x{84} 284 A Z 285 286/a+/utf 287 a\x{123}aa\=offset=1 288 a\x{123}aa\=offset=3 289 a\x{123}aa\=offset=4 290\= Expect bad offset value 291 a\x{123}aa\=offset=6 292\= Expect bad UTF-8 offset 293 a\x{123}aa\=offset=2 294\= Expect no match 295 a\x{123}aa\=offset=5 296 297/\x{1234}+/Ii,utf 298 299/\x{1234}+?/Ii,utf 300 301/\x{1234}++/Ii,utf 302 303/\x{1234}{2}/Ii,utf 304 305/[^\x{c4}]/IB,utf 306 307/X+\x{200}/IB,utf 308 309/\R/I,utf 310 311/\777/IB,utf 312 313/\w+\x{C4}/B,utf 314 a\x{C4}\x{C4} 315 316/\w+\x{C4}/B,utf,tables=2 317 a\x{C4}\x{C4} 318 319/\W+\x{C4}/B,utf 320 !\x{C4} 321 322/\W+\x{C4}/B,utf,tables=2 323 !\x{C4} 324 325/\W+\x{A1}/B,utf 326 !\x{A1} 327 328/\W+\x{A1}/B,utf,tables=2 329 !\x{A1} 330 331/X\s+\x{A0}/B,utf 332 X\x20\x{A0}\x{A0} 333 334/X\s+\x{A0}/B,utf,tables=2 335 X\x20\x{A0}\x{A0} 336 337/\S+\x{A0}/B,utf 338 X\x{A0}\x{A0} 339 340/\S+\x{A0}/B,utf,tables=2 341 X\x{A0}\x{A0} 342 343/\x{a0}+\s!/B,utf 344 \x{a0}\x20! 345 346/\x{a0}+\s!/B,utf,tables=2 347 \x{a0}\x20! 348 349/A/utf 350 \x{ff000041} 351 \x{7f000041} 352 353/(*UTF8)abc/never_utf 354 355/abc/utf,never_utf 356 357/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf 358 359/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf 360 361/AB\x{1fb0}/IB,utf 362 363/AB\x{1fb0}/IBi,utf 364 365/\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf 366 \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f} 367 \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f} 368 369/[ⱥ]/Bi,utf 370 371/[^ⱥ]/Bi,utf 372 373/\h/I 374 375/\v/I 376 377/\R/I 378 379/[[:blank:]]/B,ucp 380 381/\x{212a}+/Ii,utf 382 KKkk\x{212a} 383 384/s+/Ii,utf 385 SSss\x{17f} 386 387/\x{100}*A/IB,utf 388 A 389 390/\x{100}*\d(?R)/IB,utf 391 392/[Z\x{100}]/IB,utf 393 Z\x{100} 394 \x{100} 395 \x{100}Z 396 397/[z-\x{100}]/IB,utf 398 399/[z\Qa-d]Ā\E]/IB,utf 400 \x{100} 401 Ā 402 403/[ab\x{100}]abc(xyz(?1))/IB,utf 404 405/\x{100}*\s/IB,utf 406 407/\x{100}*\d/IB,utf 408 409/\x{100}*\w/IB,utf 410 411/\x{100}*\D/IB,utf 412 413/\x{100}*\S/IB,utf 414 415/\x{100}*\W/IB,utf 416 417/[\x{105}-\x{109}]/IBi,utf 418 \x{104} 419 \x{105} 420 \x{109} 421\= Expect no match 422 \x{100} 423 \x{10a} 424 425/[z-\x{100}]/IBi,utf 426 Z 427 z 428 \x{39c} 429 \x{178} 430 | 431 \x{80} 432 \x{ff} 433 \x{100} 434 \x{101} 435\= Expect no match 436 \x{102} 437 Y 438 y 439 440/[z-\x{100}]/IBi,utf 441 442/\x{3a3}B/IBi,utf 443 444/abc/utf,replace=� 445 abc 446 447/(?<=(a)(?-1))x/I,utf 448 a\x80zx\=offset=3 449 450/[\W\p{Any}]/B 451 abc 452 123 453 454/[\W\pL]/B 455 abc 456\= Expect no match 457 123 458 459/(*:*++++++++++++''''''''''''''''''''+''+++'+++x+++++++++++++++++++++++++++++++++++(++++++++++++++++++++:++++++%++:''''''''''''''''''''''''+++++++++++++++++++++++++++++++++++++++++++++++++++++-++++++++k+++++++''''+++'+++++++++++++++++++++++''''++++++++++++':ƿ)/utf 460 461/[\s[:^ascii:]]/B,ucp 462 463# A special extra option allows excaped surrogate code points in 8-bit mode, 464# but subjects containing them must not be UTF-checked. 465 466/\x{d800}/I,utf,allow_surrogate_escapes 467 \x{d800}\=no_utf_check 468 469/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes 470 \x{dfff}\x{df01}\=no_utf_check 471 472# This has different starting code units in 8-bit mode. 473 474/^[^ab]/IB,utf 475 c 476 \x{ff} 477 \x{100} 478\= Expect no match 479 aaa 480 481# Offsets are different in 8-bit mode. 482 483/(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout 484 123abcáyzabcdef789abcሴqr 485 486# Check name length with non-ASCII characters 487 488/(?'ABáC678901234567890123456789012'...)/utf 489 490/(?'ABáC6789012345678901234567890123'...)/utf 491 492/(?'ABZC6789012345678901234567890123'...)/utf 493 494/(?(n/utf 495 496/(?(á/utf 497 498# Invalid UTF-8 tests 499 500/.../g,match_invalid_utf 501 abcd\x80wxzy\x80pqrs 502 abcd\x{80}wxzy\x80pqrs 503 504/abc/match_invalid_utf 505 ab\x80ab\=ph 506\= Expect no match 507 ab\x80cdef\=ph 508 509/ab$/match_invalid_utf 510 ab\x80cdeab 511\= Expect no match 512 ab\x80cde 513 514/.../g,match_invalid_utf 515 abcd\x{80}wxzy\x80pqrs 516 517/(?<=x)../g,match_invalid_utf 518 abcd\x{80}wxzy\x80pqrs 519 abcd\x{80}wxzy\x80xpqrs 520 521/X$/match_invalid_utf 522\= Expect no match 523 X\xc4 524 525/(?<=..)X/match_invalid_utf,aftertext 526 AB\x80AQXYZ 527 AB\x80AQXYZ\=offset=5 528 AB\x80\x80AXYZXC\=offset=5 529\= Expect no match 530 AB\x80XYZ 531 AB\x80XYZ\=offset=3 532 AB\xfeXYZ 533 AB\xffXYZ\=offset=3 534 AB\x80AXYZ 535 AB\x80AXYZ\=offset=4 536 AB\x80\x80AXYZ\=offset=5 537 538/.../match_invalid_utf 539 AB\xc4CCC 540\= Expect no match 541 A\x{d800}B 542 A\x{110000}B 543 A\xc4B 544 545/\bX/match_invalid_utf 546 A\x80X 547 548/\BX/match_invalid_utf 549\= Expect no match 550 A\x80X 551 552/(?<=...)X/match_invalid_utf 553 AAA\x80BBBXYZ 554\= Expect no match 555 AAA\x80BXYZ 556 AAA\x80BBXYZ 557 558# ------------------------------------- 559 560/(*UTF)(?=\x{123})/I 561 562/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf 563 564/[,]/BI,utf 565 566/[\x{fff4}-\x{ffff8}]/I,utf 567 568/[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf 569 570/[\xff\x{ffff}]/I,utf 571 572/[\xff\x{ff}]/I,utf 573 abc\x{ff}def 574 575/[\xff\x{ff}]/I 576 abc\x{ff}def 577 578/[Ss]/I 579 580/[Ss]/I,utf 581 582/(?:\x{ff}|\x{3000})/I,utf 583 584/x/utf 585 abxyz 586 \x80\=startchar 587 abc\x80\=startchar 588 abc\x80\=startchar,offset=3 589 590/\x{c1}+\x{e1}/iIB,ucp 591 \x{c1}\x{c1}\x{c1} 592 \x{e1}\x{e1}\x{e1} 593 594/a|\x{c1}/iI,ucp 595 \x{e1}xxx 596 597/a|\x{c1}/iI,utf 598 \x{e1}xxx 599 600/\x{c1}|\x{e1}/iI,ucp 601 602/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended 603 X\x{e1}Y 604 605/X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended 606 X\x{c1}Y 607 608# Without UTF or UCP characters > 127 have only one case in the default locale. 609 610/X(\x{e1})Y/replace=>\U$1<,substitute_extended 611 X\x{e1}Y 612 613/A/utf,match_invalid_utf,caseless 614 \xe5A 615 616/\bch\b/utf,match_invalid_utf 617 qchq\=ph 618 qchq\=ps 619 620# End of testinput10 621