1 /* 2 * Copyright (C) 2014 The Android Open Source Project 3 * Copyright (c) 1999, 2021, Oracle and/or its affiliates. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. Oracle designates this 9 * particular file as subject to the "Classpath" exception as provided 10 * by Oracle in the LICENSE file that accompanied this code. 11 * 12 * This code is distributed in the hope that it will be useful, but WITHOUT 13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 * version 2 for more details (a copy is included in the LICENSE file that 16 * accompanied this code). 17 * 18 * You should have received a copy of the GNU General Public License version 19 * 2 along with this work; if not, write to the Free Software Foundation, 20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 21 * 22 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 23 * or visit www.oracle.com if you need additional information or have any 24 * questions. 25 */ 26 27 package java.util.regex; 28 29 import android.compat.Compatibility; 30 import android.compat.annotation.ChangeId; 31 import android.compat.annotation.EnabledSince; 32 33 import com.android.icu.util.regex.PatternNative; 34 35 import dalvik.annotation.compat.VersionCodes; 36 import dalvik.system.VMRuntime; 37 import java.util.ArrayList; 38 import java.util.Iterator; 39 import java.util.NoSuchElementException; 40 import java.util.Spliterator; 41 import java.util.Spliterators; 42 import java.util.function.Predicate; 43 import java.util.stream.Stream; 44 import java.util.stream.StreamSupport; 45 import libcore.util.EmptyArray; 46 47 // Android-changed: Document that named capturing is only available from API 26. 48 // Android-changed: Android always uses unicode character classes. 49 // Android-changed: Remove reference to Character.codePointOf(String) until it's implemented. 50 // Android-changed: UNICODE_CHARACTER_CLASS causes IllegalArgumentException on Android. 51 // Android-changed: POSIX character classes are Unicode-aware. 52 // Android-changed: Throw PatternSyntaxException for non-existent back references. 53 // Android-changed: Remove "Compatibility Properties of Unicode Regular Expression" table. 54 // Android-changed: Remove supported \b{g} Unicode extended grapheme cluster boundary. 55 // Android-changed: Prefix "Is" is supported since Android 10. http://b/110364810 56 /** 57 * A compiled representation of a regular expression. 58 * 59 * <p> A regular expression, specified as a string, must first be compiled into 60 * an instance of this class. The resulting pattern can then be used to create 61 * a {@link Matcher} object that can match arbitrary {@linkplain 62 * java.lang.CharSequence character sequences} against the regular 63 * expression. All of the state involved in performing a match resides in the 64 * matcher, so many matchers can share the same pattern. 65 * 66 * <p> A typical invocation sequence is thus 67 * 68 * <blockquote><pre> 69 * Pattern p = Pattern.{@link #compile compile}("a*b"); 70 * Matcher m = p.{@link #matcher matcher}("aaaaab"); 71 * boolean b = m.{@link Matcher#matches matches}();</pre></blockquote> 72 * 73 * <p> A {@link #matches matches} method is defined by this class as a 74 * convenience for when a regular expression is used just once. This method 75 * compiles an expression and matches an input sequence against it in a single 76 * invocation. The statement 77 * 78 * <blockquote><pre> 79 * boolean b = Pattern.matches("a*b", "aaaaab");</pre></blockquote> 80 * 81 * is equivalent to the three statements above, though for repeated matches it 82 * is less efficient since it does not allow the compiled pattern to be reused. 83 * 84 * <p> Instances of this class are immutable and are safe for use by multiple 85 * concurrent threads. Instances of the {@link Matcher} class are not safe for 86 * such use. 87 * 88 * 89 * <h2><a id="sum">Summary of regular-expression constructs</a></h2> 90 * 91 * <table class="borderless"> 92 * <caption style="display:none">Regular expression constructs, and what they match</caption> 93 * <thead style="text-align:left"> 94 * <tr> 95 * <th id="construct">Construct</th> 96 * <th id="matches">Matches</th> 97 * </tr> 98 * </thead> 99 * <tbody style="text-align:left"> 100 * 101 * <tr><th colspan="2" style="padding-top:20px" id="characters">Characters</th></tr> 102 * 103 * <tr><th style="vertical-align:top; font-weight: normal" id="x"><i>x</i></th> 104 * <td headers="matches characters x">The character <i>x</i></td></tr> 105 * <tr><th style="vertical-align:top; font-weight: normal" id="backslash">{@code \\}</th> 106 * <td headers="matches characters backslash">The backslash character</td></tr> 107 * <tr><th style="vertical-align:top; font-weight: normal" id="octal_n">{@code \0}<i>n</i></th> 108 * <td headers="matches characters octal_n">The character with octal value {@code 0}<i>n</i> 109 * (0 {@code <=} <i>n</i> {@code <=} 7)</td></tr> 110 * <tr><th style="vertical-align:top; font-weight: normal" id="octal_nn">{@code \0}<i>nn</i></th> 111 * <td headers="matches characters octal_nn">The character with octal value {@code 0}<i>nn</i> 112 * (0 {@code <=} <i>n</i> {@code <=} 7)</td></tr> 113 * <tr><th style="vertical-align:top; font-weight: normal" id="octal_nnn">{@code \0}<i>mnn</i></th> 114 * <td headers="matches characters octal_nnn">The character with octal value {@code 0}<i>mnn</i> 115 * (0 {@code <=} <i>m</i> {@code <=} 3, 116 * 0 {@code <=} <i>n</i> {@code <=} 7)</td></tr> 117 * <tr><th style="vertical-align:top; font-weight: normal" id="hex_hh">{@code \x}<i>hh</i></th> 118 * <td headers="matches characters hex_hh">The character with hexadecimal value {@code 0x}<i>hh</i></td></tr> 119 * <tr><th style="vertical-align:top; font-weight: normal" id="hex_hhhh"><code>\u</code><i>hhhh</i></th> 120 * <td headers="matches characters hex_hhhh">The character with hexadecimal value {@code 0x}<i>hhhh</i></td></tr> 121 * <tr><th style="vertical-align:top; font-weight: normal" id="hex_h_h"><code>\x</code><i>{h...h}</i></th> 122 * <td headers="matches characters hex_h_h">The character with hexadecimal value {@code 0x}<i>h...h</i> 123 * ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT} 124 * <= {@code 0x}<i>h...h</i> <= 125 * {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr> 126 * <tr><th style="vertical-align:top; font-weight: normal" id="unicode_name"><code>\N{</code><i>name</i><code>}</code></th> 127 * <td headers="matches characters unicode_name">The character with Unicode character name <i>'name'</i></td></tr> 128 * <tr><th style="vertical-align:top; font-weight:normal" id="tab">{@code \t}</th> 129 * <td headers="matches characters tab">The tab character (<code>'\u0009'</code>)</td></tr> 130 * <tr><th style="vertical-align:top; font-weight:normal" id="newline">{@code \n}</th> 131 * <td headers="matches characters newline">The newline (line feed) character (<code>'\u000A'</code>)</td></tr> 132 * <tr><th style="vertical-align:top; font-weight:normal" id="return">{@code \r}</th> 133 * <td headers="matches characters return">The carriage-return character (<code>'\u000D'</code>)</td></tr> 134 * <tr><th style="vertical-align:top; font-weight:normal" id="form_feed">{@code \f}</th> 135 * <td headers="matches characters form_feed">The form-feed character (<code>'\u000C'</code>)</td></tr> 136 * <tr><th style="vertical-align:top; font-weight:normal" id="bell">{@code \a}</th> 137 * <td headers="matches characters bell">The alert (bell) character (<code>'\u0007'</code>)</td></tr> 138 * <tr><th style="vertical-align:top; font-weight:normal" id="escape">{@code \e}</th> 139 * <td headers="matches characters escape">The escape character (<code>'\u001B'</code>)</td></tr> 140 * <tr><th style="vertical-align:top; font-weight:normal" id="ctrl_x">{@code \c}<i>x</i></th> 141 * <td headers="matches characters ctrl_x">The control character corresponding to <i>x</i></td></tr> 142 * 143 * <tr><th colspan="2" style="padding-top:20px" id="classes">Character classes</th></tr> 144 * 145 * <tr><th style="vertical-align:top; font-weight:normal" id="simple">{@code [abc]}</th> 146 * <td headers="matches classes simple">{@code a}, {@code b}, or {@code c} (simple class)</td></tr> 147 * <tr><th style="vertical-align:top; font-weight:normal" id="negation">{@code [^abc]}</th> 148 * <td headers="matches classes negation">Any character except {@code a}, {@code b}, or {@code c} (negation)</td></tr> 149 * <tr><th style="vertical-align:top; font-weight:normal" id="range">{@code [a-zA-Z]}</th> 150 * <td headers="matches classes range">{@code a} through {@code z} 151 * or {@code A} through {@code Z}, inclusive (range)</td></tr> 152 * <tr><th style="vertical-align:top; font-weight:normal" id="union">{@code [a-d[m-p]]}</th> 153 * <td headers="matches classes union">{@code a} through {@code d}, 154 * or {@code m} through {@code p}: {@code [a-dm-p]} (union)</td></tr> 155 * <tr><th style="vertical-align:top; font-weight:normal" id="intersection">{@code [a-z&&[def]]}</th> 156 * <td headers="matches classes intersection">{@code d}, {@code e}, or {@code f} (intersection)</tr> 157 * <tr><th style="vertical-align:top; font-weight:normal" id="subtraction1">{@code [a-z&&[^bc]]}</th> 158 * <td headers="matches classes subtraction1">{@code a} through {@code z}, 159 * except for {@code b} and {@code c}: {@code [ad-z]} (subtraction)</td></tr> 160 * <tr><th style="vertical-align:top; font-weight:normal" id="subtraction2">{@code [a-z&&[^m-p]]}</th> 161 * <td headers="matches classes subtraction2">{@code a} through {@code z}, 162 * and not {@code m} through {@code p}: {@code [a-lq-z]}(subtraction)</td></tr> 163 * 164 * <tr><th colspan="2" style="padding-top:20px" id="predef">Predefined character classes</th></tr> 165 * 166 * <tr><th style="vertical-align:top; font-weight:normal" id="any">{@code .}</th> 167 * <td headers="matches predef any">Any character (may or may not match <a href="#lt">line terminators</a>)</td></tr> 168 * <tr><th style="vertical-align:top; font-weight:normal" id="digit">{@code \d}</th> 169 * <td headers="matches predef digit">A digit: {@code \p{IsDigit}}</td></tr> 170 * <tr><th style="vertical-align:top; font-weight:normal" id="non_digit">{@code \D}</th> 171 * <td headers="matches predef non_digit">A non-digit: {@code [^\d]}</td></tr> 172 * <tr><th style="vertical-align:top; font-weight:normal" id="horiz_white">{@code \h}</th> 173 * <td headers="matches predef horiz_white">A horizontal whitespace character: 174 * <code>[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]</code></td></tr> 175 * <tr><th style="vertical-align:top; font-weight:normal" id="non_horiz_white">{@code \H}</th> 176 * <td headers="matches predef non_horiz_white">A non-horizontal whitespace character: {@code [^\h]}</td></tr> 177 * <tr><th style="vertical-align:top; font-weight:normal" id="white">{@code \s}</th> 178 * <td headers="matches predef white">A whitespace character: {@code \p{IsWhite_Space}}</td></tr> 179 * <tr><th style="vertical-align:top; font-weight:normal" id="non_white">{@code \S}</th> 180 * <td headers="matches predef non_white">A non-whitespace character: {@code [^\s]}</td></tr> 181 * <tr><th style="vertical-align:top; font-weight:normal" id="vert_white">{@code \v}</th> 182 * <td headers="matches predef vert_white">A vertical whitespace character: <code>[\n\x0B\f\r\x85\u2028\u2029]</code> 183 * </td></tr> 184 * <tr><th style="vertical-align:top; font-weight:normal" id="non_vert_white">{@code \V}</th> 185 * <td headers="matches predef non_vert_white">A non-vertical whitespace character: {@code [^\v]}</td></tr> 186 * <tr><th style="vertical-align:top; font-weight:normal" id="word">{@code \w}</th> 187 * <td headers="matches predef word">A word character: {@code [\p{alpha}\p{gc=Mark}\p{digit}\p{gc=Connector_Punctuation}\p{Join_Control}]}</td></tr> 188 * <tr><th style="vertical-align:top; font-weight:normal" id="non_word">{@code \W}</th> 189 * <td headers="matches predef non_word">A non-word character: {@code [^\w]}</td></tr> 190 * 191 * <tr><th colspan="2" style="padding-top:20px" id="posix"><b>POSIX character classes (Unicode-aware)</b></th></tr> 192 * 193 * <tr><th style="vertical-align:top; font-weight:normal" id="Lower">{@code \p{Lower}}</th> 194 * <td headers="matches posix Lower">A lower-case alphabetic character: {@code \p{IsLowercase}}</td></tr> 195 * <tr><th style="vertical-align:top; font-weight:normal" id="Upper">{@code \p{Upper}}</th> 196 * <td headers="matches posix Upper">An upper-case alphabetic character:{@code \p{IsUppercase}}</td></tr> 197 * <tr><th style="vertical-align:top; font-weight:normal" id="ASCII">{@code \p{ASCII}}</th> 198 * <td headers="matches posix ASCII">All ASCII:{@code [\x00-\x7F]}</td></tr> 199 * <tr><th style="vertical-align:top; font-weight:normal" id="Alpha">{@code \p{Alpha}}</th> 200 * <td headers="matches posix Alpha">An alphabetic character:{@code [\p{IsAlphabetic}]}</td></tr> 201 * <tr><th style="vertical-align:top; font-weight:normal" id="Digit">{@code \p{IsDigit}}</th> 202 * <td headers="matches posix Digit">A decimal digit: {@code \p{gc=Decimal_Number}}</td></tr> 203 * <tr><th style="vertical-align:top; font-weight:normal" id="Alnum">{@code \p{Alnum}}</th> 204 * <td headers="matches posix Alnum">An alphanumeric character:{@code [\p{Alpha}\p{Digit}]}</td></tr> 205 * <tr><th style="vertical-align:top; font-weight:normal" id="Punct">{@code \p{Punct}}</th> 206 * <td headers="matches posix Punct">Punctuation: {@code \p{IsPunctuation}}</td></tr> 207 * <tr><th style="vertical-align:top; font-weight:normal" id="Graph">{@code \p{Graph}}</th> 208 * <td headers="matches posix Graph">A visible character: 209 * {@code [^p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]}</td></tr> 210 * <tr><th style="vertical-align:top; font-weight:normal" id="Print">{@code \p{Print}}</th> 211 * <td headers="matches posix Print">A printable character: {@code [\p{Graph}\p{Blank}&&[^\p{Cntrl}]]}</td></tr> 212 * <tr><th style="vertical-align:top; font-weight:normal" id="Blank">{@code \p{Blank}}</th> 213 * <td headers="matches posix Blank">A space or a tab: {@code [\p{gc=Space_Separator}\N{CHARACTER TABULATION}]}</td></tr> 214 * <tr><th style="vertical-align:top; font-weight:normal" id="Cntrl">{@code \p{Cntrl}}</th> 215 * <td headers="matches posix Cntrl">A control character: {@code \p{gc=Control}}</td></tr> 216 * <tr><th style="vertical-align:top; font-weight:normal" id="XDigit">{@code \p{XDigit}}</th> 217 * <td headers="matches posix XDigit">A hexadecimal digit: {@code [\p{gc=Decimal_Number}\p{IsHex_Digit}]}</td></tr> 218 * <tr><th style="vertical-align:top; font-weight:normal" id="Space">{@code \p{Space}}</th> 219 * <td headers="matches posix Space">A whitespace character: {@code \p{IsWhite_Space}}</td></tr> 220 * <tr><th style="vertical-align:top; font-weight:normal" id="PosixCompatible">POSIX-Compatible expression</th> 221 * <td headers="matches posix PosixCompatible">See <a href="http://www.unicode.org/reports/tr18/#Compatibility_Properties">Unicode documentation</a></td></tr> 222 * 223 * <tr><th colspan="2" style="padding-top:20px" id="java">java.lang.Character classes (simple <a href="#jcc">java character type</a>)</th></tr> 224 * 225 * <tr><th style="vertical-align:top; font-weight:normal" id="javaLowerCase">{@code \p{javaLowerCase}}</th> 226 * <td headers="matches java javaLowerCase">Equivalent to java.lang.Character.isLowerCase()</td></tr> 227 * <tr><th style="vertical-align:top; font-weight:normal" id="javaUpperCase">{@code \p{javaUpperCase}}</th> 228 * <td headers="matches java javaUpperCase">Equivalent to java.lang.Character.isUpperCase()</td></tr> 229 * <tr><th style="vertical-align:top; font-weight:normal" id="javaWhitespace">{@code \p{javaWhitespace}}</th> 230 * <td headers="matches java javaWhitespace">Equivalent to java.lang.Character.isWhitespace()</td></tr> 231 * <tr><th style="vertical-align:top; font-weight:normal" id="javaMirrored">{@code \p{javaMirrored}}</th> 232 * <td headers="matches java javaMirrored">Equivalent to java.lang.Character.isMirrored()</td></tr> 233 * 234 * <tr><th colspan="2" style="padding-top:20px" id="unicode">Classes for Unicode scripts, blocks, categories and binary properties</th></tr> 235 * 236 * <tr><th style="vertical-align:top; font-weight:normal" id="IsLatin">{@code \p{IsLatin}}</th> 237 * <td headers="matches unicode IsLatin">A Latin script character (<a href="#usc">script</a>)</td></tr> 238 * <tr><th style="vertical-align:top; font-weight:normal" id="InGreek">{@code \p{InGreek}}</th> 239 * <td headers="matches unicode InGreek">A character in the Greek block (<a href="#ubc">block</a>)</td></tr> 240 * <tr><th style="vertical-align:top; font-weight:normal" id="Lu">{@code \p{Lu}}</th> 241 * <td headers="matches unicode Lu">An uppercase letter (<a href="#ucc">category</a>)</td></tr> 242 * <tr><th style="vertical-align:top; font-weight:normal" id="IsAlphabetic">{@code \p{IsAlphabetic}}</th> 243 * <td headers="matches unicode IsAlphabetic">An alphabetic character (<a href="#ubpc">binary property</a>)</td></tr> 244 * <tr><th style="vertical-align:top; font-weight:normal" id="Sc">{@code \p{Sc}}</th> 245 * <td headers="matches unicode Sc">A currency symbol</td></tr> 246 * <tr><th style="vertical-align:top; font-weight:normal" id="not_InGreek">{@code \P{InGreek}}</th> 247 * <td headers="matches unicode not_InGreek">Any character except one in the Greek block (negation)</td></tr> 248 * <tr><th style="vertical-align:top; font-weight:normal" id="not_uppercase">{@code [\p{L}&&[^\p{Lu}]]}</th> 249 * <td headers="matches unicode not_uppercase">Any letter except an uppercase letter (subtraction)</td></tr> 250 * 251 * <tr><th colspan="2" style="padding-top:20px" id="bounds">Boundary matchers</th></tr> 252 * 253 * <tr><th style="vertical-align:top; font-weight:normal" id="begin_line">{@code ^}</th> 254 * <td headers="matches bounds begin_line">The beginning of a line</td></tr> 255 * <tr><th style="vertical-align:top; font-weight:normal" id="end_line">{@code $}</th> 256 * <td headers="matches bounds end_line">The end of a line</td></tr> 257 * <tr><th style="vertical-align:top; font-weight:normal" id="word_boundary">{@code \b}</th> 258 * <td headers="matches bounds word_boundary">A word boundary</td></tr> 259 * <tr><th style="vertical-align:top; font-weight:normal" id="non_word_boundary">{@code \B}</th> 260 * <td headers="matches bounds non_word_boundary">A non-word boundary</td></tr> 261 * <tr><th style="vertical-align:top; font-weight:normal" id="begin_input">{@code \A}</th> 262 * <td headers="matches bounds begin_input">The beginning of the input</td></tr> 263 * <tr><th style="vertical-align:top; font-weight:normal" id="end_prev_match">{@code \G}</th> 264 * <td headers="matches bounds end_prev_match">The end of the previous match</td></tr> 265 * <tr><th style="vertical-align:top; font-weight:normal" id="end_input_except_term">{@code \Z}</th> 266 * <td headers="matches bounds end_input_except_term">The end of the input but for the final 267 * <a href="#lt">terminator</a>, if any</td></tr> 268 * <tr><th style="vertical-align:top; font-weight:normal" id="end_input">{@code \z}</th> 269 * <td headers="matches bounds end_input">The end of the input</td></tr> 270 * 271 * <tr><th colspan="2" style="padding-top:20px" id="linebreak">Linebreak matcher</th></tr> 272 * 273 * <tr><th style="vertical-align:top; font-weight:normal" id="any_unicode_linebreak">{@code \R}</th> 274 * <td headers="matches linebreak any_unicode_linebreak">Any Unicode linebreak sequence, is equivalent to 275 * <code>\u000D\u000A|[\u000A\u000B\u000C\u000D\u0085\u2028\u2029] 276 * </code></td></tr> 277 * 278 * <tr><th colspan="2" style="padding-top:20px" id="grapheme">Unicode Extended Grapheme matcher</th></tr> 279 * 280 * <tr><th style="vertical-align:top; font-weight:normal" id="grapheme_any">{@code \X}</th> 281 * <td headers="matches grapheme grapheme_any">Any Unicode extended grapheme cluster</td></tr> 282 * 283 * <tr><th colspan="2" style="padding-top:20px" id="greedy">Greedy quantifiers</th></tr> 284 * 285 * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_once_or_not"><i>X</i>{@code ?}</th> 286 * <td headers="matches greedy greedy_once_or_not"><i>X</i>, once or not at all</td></tr> 287 * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_zero_or_more"><i>X</i>{@code *}</th> 288 * <td headers="matches greedy greedy_zero_or_more"><i>X</i>, zero or more times</td></tr> 289 * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_one_or_more"><i>X</i>{@code +}</th> 290 * <td headers="matches greedy greedy_one_or_more"><i>X</i>, one or more times</td></tr> 291 * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_exactly"><i>X</i><code>{</code><i>n</i><code>}</code></th> 292 * <td headers="matches greedy greedy_exactly"><i>X</i>, exactly <i>n</i> times</td></tr> 293 * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_at_least"><i>X</i><code>{</code><i>n</i>{@code ,}}</th> 294 * <td headers="matches greedy greedy_at_least"><i>X</i>, at least <i>n</i> times</td></tr> 295 * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_at_least_up_to"><i>X</i><code>{</code><i>n</i>{@code ,}<i>m</i><code>}</code></th> 296 * <td headers="matches greedy greedy_at_least_up_to"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr> 297 * 298 * <tr><th colspan="2" style="padding-top:20px" id="reluc">Reluctant quantifiers</th></tr> 299 * 300 * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_once_or_not"><i>X</i>{@code ??}</th> 301 * <td headers="matches reluc reluc_once_or_not"><i>X</i>, once or not at all</td></tr> 302 * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_zero_or_more"><i>X</i>{@code *?}</th> 303 * <td headers="matches reluc reluc_zero_or_more"><i>X</i>, zero or more times</td></tr> 304 * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_one_or_more"><i>X</i>{@code +?}</th> 305 * <td headers="matches reluc reluc_one_or_more"><i>X</i>, one or more times</td></tr> 306 * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_exactly"><i>X</i><code>{</code><i>n</i><code>}?</code></th> 307 * <td headers="matches reluc reluc_exactly"><i>X</i>, exactly <i>n</i> times</td></tr> 308 * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_at_least"><i>X</i><code>{</code><i>n</i><code>,}?</code></th> 309 * <td headers="matches reluc reluc_at_least"><i>X</i>, at least <i>n</i> times</td></tr> 310 * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_at_least_up_to"><i>X</i><code>{</code><i>n</i>{@code ,}<i>m</i><code>}?</code></th> 311 * <td headers="matches reluc reluc_at_least_up_to"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr> 312 * 313 * <tr><th colspan="2" style="padding-top:20px" id="poss">Possessive quantifiers</th></tr> 314 * 315 * <tr><th style="vertical-align:top; font-weight:normal" id="poss_once_or_not"><i>X</i>{@code ?+}</th> 316 * <td headers="matches poss poss_once_or_not"><i>X</i>, once or not at all</td></tr> 317 * <tr><th style="vertical-align:top; font-weight:normal" id="poss_zero_or_more"><i>X</i>{@code *+}</th> 318 * <td headers="matches poss poss_zero_or_more"><i>X</i>, zero or more times</td></tr> 319 * <tr><th style="vertical-align:top; font-weight:normal" id="poss_one_or_more"><i>X</i>{@code ++}</th> 320 * <td headers="matches poss poss_one_or_more"><i>X</i>, one or more times</td></tr> 321 * <tr><th style="vertical-align:top; font-weight:normal" id="poss_exactly"><i>X</i><code>{</code><i>n</i><code>}+</code></th> 322 * <td headers="matches poss poss_exactly"><i>X</i>, exactly <i>n</i> times</td></tr> 323 * <tr><th style="vertical-align:top; font-weight:normal" id="poss_at_least"><i>X</i><code>{</code><i>n</i><code>,}+</code></th> 324 * <td headers="matches poss poss_at_least"><i>X</i>, at least <i>n</i> times</td></tr> 325 * <tr><th style="vertical-align:top; font-weight:normal" id="poss_at_least_up_to"><i>X</i><code>{</code><i>n</i>{@code ,}<i>m</i><code>}+</code></th> 326 * <td headers="matches poss poss_at_least_up_to"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr> 327 * 328 * <tr><th colspan="2" style="padding-top:20px" id="logical">Logical operators</th></tr> 329 * 330 * <tr><th style="vertical-align:top; font-weight:normal" id="concat"><i>XY</i></th> 331 * <td headers="matches logical concat"><i>X</i> followed by <i>Y</i></td></tr> 332 * <tr><th style="vertical-align:top; font-weight:normal" id="alternate"><i>X</i>{@code |}<i>Y</i></th> 333 * <td headers="matches logical alternate">Either <i>X</i> or <i>Y</i></td></tr> 334 * <tr><th style="vertical-align:top; font-weight:normal" id="group">{@code (}<i>X</i>{@code )}</th> 335 * <td headers="matches logical group">X, as a <a href="#cg">capturing group</a></td></tr> 336 * 337 * <tr><th colspan="2" style="padding-top:20px" id="backref">Back references</th></tr> 338 * 339 * <tr><th style="vertical-align:top; font-weight:normal" id="back_nth">{@code \}<i>n</i></th> 340 * <td headers="matches backref back_nth">Whatever the <i>n</i><sup>th</sup> 341 * <a href="#cg">capturing group</a> matched</td></tr> 342 * <tr><th style="vertical-align:top; font-weight:normal" id="back_named">{@code \}<i>k</i><<i>name</i>></th> 343 * <td headers="matches backref back_named">Whatever the 344 * <a href="#groupname">named-capturing group</a> "name" matched. Only available for API 26 or above</td></tr> 345 * 346 * <tr><th colspan="2" style="padding-top:20px" id="quote">Quotation</th></tr> 347 * 348 * <tr><th style="vertical-align:top; font-weight:normal" id="quote_follow">{@code \}</th> 349 * <td headers="matches quote quote_follow">Nothing, but quotes the following character</td></tr> 350 * <tr><th style="vertical-align:top; font-weight:normal" id="quote_begin">{@code \Q}</th> 351 * <td headers="matches quote quote_begin">Nothing, but quotes all characters until {@code \E}</td></tr> 352 * <tr><th style="vertical-align:top; font-weight:normal" id="quote_end">{@code \E}</th> 353 * <td headers="matches quote quote_end">Nothing, but ends quoting started by {@code \Q}</td></tr> 354 * <!-- Metachars: !$()*+.<>?[\]^{|} --> 355 * 356 * <tr><th colspan="2" style="padding-top:20px" id="special">Special constructs (named-capturing and non-capturing)</th></tr> 357 * 358 * <tr><th style="vertical-align:top; font-weight:normal" id="named_group"><code>(?<<a href="#groupname">name</a>></code><i>X</i>{@code )}</th> 359 * <td headers="matches special named_group"><i>X</i>, as a named-capturing group. Only available for API 26 or above.</td></tr> 360 * <tr><th style="vertical-align:top; font-weight:normal" id="non_capture_group">{@code (?:}<i>X</i>{@code )}</th> 361 * <td headers="matches special non_capture_group"><i>X</i>, as a non-capturing group</td></tr> 362 * <tr><th style="vertical-align:top; font-weight:normal" id="flags"><code>(?idmsux-idmsux) </code></th> 363 * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> 364 * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> <a href="#UNICODE_CHARACTER_CLASS">U</a> 365 * on - off</td></tr> 366 * <tr><th style="vertical-align:top; font-weight:normal" id="non_capture_group_flags">{@code (?idmsuxU-idmsuxU:}<i>X</i>{@code )} </th> 367 * <td headers="matches special non_capture_group_flags"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the 368 * given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a> 369 * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a > 370 * <a href="#COMMENTS">x</a> <a href="#UNICODE_CHARACTER_CLASS">U</a> on - off</td></tr> 371 * <tr><th style="vertical-align:top; font-weight:normal" id="pos_lookahead">{@code (?=}<i>X</i>{@code )}</th> 372 * <td headers="matches special pos_lookahead"><i>X</i>, via zero-width positive lookahead</td></tr> 373 * <tr><th style="vertical-align:top; font-weight:normal" id="neg_lookahead">{@code (?!}<i>X</i>{@code )}</th> 374 * <td headers="matches special neg_lookahead"><i>X</i>, via zero-width negative lookahead</td></tr> 375 * <tr><th style="vertical-align:top; font-weight:normal" id="pos_lookbehind">{@code (?<=}<i>X</i>{@code )}</th> 376 * <td headers="matches special pos_lookbehind"><i>X</i>, via zero-width positive lookbehind</td></tr> 377 * <tr><th style="vertical-align:top; font-weight:normal" id="neg_lookbehind">{@code (?<!}<i>X</i>{@code )}</th> 378 * <td headers="matches special neg_lookbehind"><i>X</i>, via zero-width negative lookbehind</td></tr> 379 * <tr><th style="vertical-align:top; font-weight:normal" id="indep_non_capture_group">{@code (?>}<i>X</i>{@code )}</th> 380 * <td headers="matches special indep_non_capture_group"><i>X</i>, as an independent, non-capturing group</td></tr> 381 * 382 * </tbody> 383 * </table> 384 * 385 * <hr> 386 * 387 * 388 * <h2><a id="bs">Backslashes, escapes, and quoting</a></h2> 389 * 390 * <p> The backslash character ({@code '\'}) serves to introduce escaped 391 * constructs, as defined in the table above, as well as to quote characters 392 * that otherwise would be interpreted as unescaped constructs. Thus the 393 * expression {@code \\} matches a single backslash and <code>\{</code> matches a 394 * left brace. 395 * 396 * <p> It is an error to use a backslash prior to any alphabetic character that 397 * does not denote an escaped construct; these are reserved for future 398 * extensions to the regular-expression language. A backslash may be used 399 * prior to a non-alphabetic character regardless of whether that character is 400 * part of an unescaped construct. 401 * 402 * <p> Backslashes within string literals in Java source code are interpreted 403 * as required by 404 * <cite>The Java Language Specification</cite> 405 * as either Unicode escapes (section {@jls 3.3}) or other character escapes (section {@jls 3.10.6}) 406 * It is therefore necessary to double backslashes in string 407 * literals that represent regular expressions to protect them from 408 * interpretation by the Java bytecode compiler. The string literal 409 * <code>"\b"</code>, for example, matches a single backspace character when 410 * interpreted as a regular expression, while {@code "\\b"} matches a 411 * word boundary. The string literal {@code "\(hello\)"} is illegal 412 * and leads to a compile-time error; in order to match the string 413 * {@code (hello)} the string literal {@code "\\(hello\\)"} 414 * must be used. 415 * 416 * <h2><a id="cc">Character Classes</a></h2> 417 * 418 * <p> Character classes may appear within other character classes, and 419 * may be composed by the union operator (implicit) and the intersection 420 * operator ({@code &&}). 421 * The union operator denotes a class that contains every character that is 422 * in at least one of its operand classes. The intersection operator 423 * denotes a class that contains every character that is in both of its 424 * operand classes. 425 * 426 * <p> The precedence of character-class operators is as follows, from 427 * highest to lowest: 428 * 429 * <table class="striped" style="margin-left: 2em;"> 430 * <caption style="display:none">Precedence of character class operators.</caption> 431 * <thead> 432 * <tr><th scope="col">Precedence<th scope="col">Name<th scope="col">Example 433 * </thead> 434 * <tbody> 435 * <tr><th scope="row">1</th> 436 * <td>Literal escape </td> 437 * <td>{@code \x}</td></tr> 438 * <tr><th scope="row">2</th> 439 * <td>Grouping</td> 440 * <td>{@code [...]}</td></tr> 441 * <tr><th scope="row">3</th> 442 * <td>Range</td> 443 * <td>{@code a-z}</td></tr> 444 * <tr><th scope="row">4</th> 445 * <td>Union</td> 446 * <td>{@code [a-e][i-u]}</td></tr> 447 * <tr><th scope="row">5</th> 448 * <td>Intersection</td> 449 * <td>{@code [a-z&&[aeiou]]}</td></tr> 450 * </tbody> 451 * </table> 452 * 453 * <p> Note that a different set of metacharacters are in effect inside 454 * a character class than outside a character class. For instance, the 455 * regular expression {@code .} loses its special meaning inside a 456 * character class, while the expression {@code -} becomes a range 457 * forming metacharacter. 458 * 459 * <h2><a id="lt">Line terminators</a></h2> 460 * 461 * <p> A <i>line terminator</i> is a one- or two-character sequence that marks 462 * the end of a line of the input character sequence. The following are 463 * recognized as line terminators: 464 * 465 * <ul> 466 * 467 * <li> A newline (line feed) character ({@code '\n'}), 468 * 469 * <li> A carriage-return character followed immediately by a newline 470 * character ({@code "\r\n"}), 471 * 472 * <li> A standalone carriage-return character ({@code '\r'}), 473 * 474 * <li> A next-line character (<code>'\u0085'</code>), 475 * 476 * <li> A line-separator character (<code>'\u2028'</code>), or 477 * 478 * <li> A paragraph-separator character (<code>'\u2029'</code>). 479 * 480 * </ul> 481 * <p>If {@link #UNIX_LINES} mode is activated, then the only line terminators 482 * recognized are newline characters. 483 * 484 * <p> The regular expression {@code .} matches any character except a line 485 * terminator unless the {@link #DOTALL} flag is specified. 486 * 487 * <p> By default, the regular expressions {@code ^} and {@code $} ignore 488 * line terminators and only match at the beginning and the end, respectively, 489 * of the entire input sequence. If {@link #MULTILINE} mode is activated then 490 * {@code ^} matches at the beginning of input and after any line terminator 491 * except at the end of input. When in {@link #MULTILINE} mode {@code $} 492 * matches just before a line terminator or the end of the input sequence. 493 * 494 * <h2><a id="cg">Groups and capturing</a></h2> 495 * 496 * <h3><a id="gnumber">Group number</a></h3> 497 * <p> Capturing groups are numbered by counting their opening parentheses from 498 * left to right. In the expression {@code ((A)(B(C)))}, for example, there 499 * are four such groups: </p> 500 * 501 * <ol style="margin-left:2em;"> 502 * <li> {@code ((A)(B(C)))} 503 * <li> {@code (A)} 504 * <li> {@code (B(C))} 505 * <li> {@code (C)} 506 * </ol> 507 * 508 * <p> Group zero always stands for the entire expression. 509 * 510 * <p> Capturing groups are so named because, during a match, each subsequence 511 * of the input sequence that matches such a group is saved. The captured 512 * subsequence may be used later in the expression, via a back reference, and 513 * may also be retrieved from the matcher once the match operation is complete. 514 * 515 * <h3><a id="groupname">Group name</a></h3> 516 * <p>The constructs and APIs are available since API level 26. A capturing group 517 * can also be assigned a "name", a {@code named-capturing group}, 518 * and then be back-referenced later by the "name". Group names are composed of 519 * the following characters. The first character must be a {@code letter}. 520 * 521 * <ul> 522 * <li> The uppercase letters {@code 'A'} through {@code 'Z'} 523 * (<code>'\u0041'</code> through <code>'\u005a'</code>), 524 * <li> The lowercase letters {@code 'a'} through {@code 'z'} 525 * (<code>'\u0061'</code> through <code>'\u007a'</code>), 526 * <li> The digits {@code '0'} through {@code '9'} 527 * (<code>'\u0030'</code> through <code>'\u0039'</code>), 528 * </ul> 529 * 530 * <p> A {@code named-capturing group} is still numbered as described in 531 * <a href="#gnumber">Group number</a>. 532 * 533 * <p> The captured input associated with a group is always the subsequence 534 * that the group most recently matched. If a group is evaluated a second time 535 * because of quantification then its previously-captured value, if any, will 536 * be retained if the second evaluation fails. Matching the string 537 * {@code "aba"} against the expression {@code (a(b)?)+}, for example, leaves 538 * group two set to {@code "b"}. All captured input is discarded at the 539 * beginning of each match. 540 * 541 * <p> Groups beginning with {@code (?} are either pure, <i>non-capturing</i> groups 542 * that do not capture text and do not count towards the group total, or 543 * <i>named-capturing</i> group. 544 * 545 * <h2> Unicode support </h2> 546 * 547 * <p> This class is in conformance with Level 1 of <a 548 * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical 549 * Standard #18: Unicode Regular Expressions</i></a>, plus RL2.1 550 * Canonical Equivalents and RL2.2 Extended Grapheme Clusters. 551 * <p> 552 * <b>Unicode escape sequences</b> such as <code>\u2014</code> in Java source code 553 * are processed as described in section {@jls 3.3} of 554 * <cite>The Java Language Specification</cite>. 555 * Such escape sequences are also implemented directly by the regular-expression 556 * parser so that Unicode escapes can be used in expressions that are read from 557 * files or from the keyboard. Thus the strings <code>"\u2014"</code> and 558 * {@code "\\u2014"}, while not equal, compile into the same pattern, which 559 * matches the character with hexadecimal value {@code 0x2014}. 560 * <p> 561 * A Unicode character can also be represented by using its <b>Hex notation</b> 562 * (hexadecimal code point value) directly as described in construct 563 * <code>\x{...}</code>, for example a supplementary character U+2011F can be 564 * specified as <code>\x{2011F}</code>, instead of two consecutive Unicode escape 565 * sequences of the surrogate pair <code>\uD840</code><code>\uDD1F</code>. 566 * <p> 567 * <b>Unicode character names</b> are supported by the named character construct 568 * <code>\N{</code>...<code>}</code>, for example, <code>\N{WHITE SMILING FACE}</code> 569 * specifies character <code>\u263A</code>. The character names supported 570 * by this class are the valid Unicode character names matched by 571 * {@code java.lang.Character.codePointOf(String) Character.codePointOf(name)}. 572 * <p> 573 * <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters"> 574 * <b>Unicode extended grapheme clusters</b></a> are supported by the grapheme 575 * cluster matcher {@code \X}. 576 * <p> 577 * Unicode scripts, blocks, categories and binary properties are written with 578 * the {@code \p} and {@code \P} constructs as in Perl. 579 * <code>\p{</code><i>prop</i><code>}</code> matches if 580 * the input has the property <i>prop</i>, while <code>\P{</code><i>prop</i><code>}</code> 581 * does not match if the input has that property. 582 * <p> 583 * Scripts, blocks, categories and binary properties can be used both inside 584 * and outside of a character class. 585 * 586 * <p> 587 * <b><a id="usc">Scripts</a></b> are specified either with the prefix {@code Is} supported since 588 * Android 10, as in {@code IsHiragana}, or by using the {@code script} keyword (or its short 589 * form {@code sc}) as in {@code script=Hiragana} or {@code sc=Hiragana}. 590 * <p> 591 * The script names supported by {@code Pattern} are the valid script names 592 * accepted and defined by 593 * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}. 594 * 595 * <p> 596 * <b><a id="ubc">Blocks</a></b> are specified with the prefix {@code In}, as in 597 * {@code InMongolian}, or by using the keyword {@code block} (or its short 598 * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}. 599 * <p> 600 * The block names supported by {@code Pattern} are the valid block names 601 * accepted and defined by 602 * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}. 603 * <p> 604 * 605 * <b><a id="ucc">Categories</a></b> may be specified with the optional prefix {@code Is}: 606 * Both {@code \p{IsL}} supported since Android 10 and {@code \p{L}} denote the category of Unicode 607 * letters. Same as scripts and blocks, categories can also be specified 608 * by using the keyword {@code general_category} (or its short form 609 * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}. 610 * <p> 611 * The supported categories are those of 612 * <a href="http://www.unicode.org/standard/standard.html"> 613 * <i>The Unicode Standard</i></a> in the version specified by the 614 * {@link java.lang.Character Character} class. The category names are those 615 * defined in the Standard, both normative and informative. 616 * <p> 617 * 618 * <b><a id="ubpc">Binary properties</a></b> are specified with the prefix {@code Is} since 619 * Android 10, as in {@code IsAlphabetic}. The prefix {@code Is} isn't needed before Android 10, 620 * as in {@code Alphabetic}. The supported binary properties by {@code Pattern} 621 * are 622 * <ul> 623 * <li> Alphabetic 624 * <li> Ideographic 625 * <li> Letter 626 * <li> Lowercase 627 * <li> Uppercase 628 * <li> Titlecase 629 * <li> Punctuation 630 * <Li> Control 631 * <li> White_Space 632 * <li> Digit 633 * <li> Hex_Digit 634 * <li> Join_Control 635 * <li> Noncharacter_Code_Point 636 * <li> Assigned 637 * </ul> 638 * <p> 639 * The <b>Predefined Character classes</b> and <b>POSIX character classes</b> 640 * are in conformance with the recommendation of <i>Annex C: Compatibility Properties</i> 641 * of <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical Standard #18: 642 * Unicode Regular Expressions</i></a>. 643 * 644 * <p> 645 * <a id="jcc"> 646 * Categories that behave like the java.lang.Character 647 * boolean is<i>methodname</i> methods (except for the deprecated ones) are 648 * available through the same <code>\p{</code><i>prop</i><code>}</code> syntax where 649 * the specified property has the name <code>java<i>methodname</i></code></a>. 650 * 651 * <h3> Behavior starting from API level 10 (Android 2.3) </h3> 652 * 653 * <p> Starting from Android 2.3 Gingerbread, ICU4C becomes the backend of the regular expression 654 * implementation. Android could behave differently compared with other regex implementation, e.g. 655 * literal right brace ('}') has to be escaped on Android.</p> 656 * 657 * <p> Some other behavior differences can be found in the 658 * <a href="https://unicode-org.github.io/icu/userguide/strings/regexp.html#differences-with-java-regular-expressions"> 659 * ICU documentation</a>. </p> 660 * 661 * <h2> Comparison to Perl 5 </h2> 662 * 663 * <p>The {@code Pattern} engine performs traditional NFA-based matching 664 * with ordered alternation as occurs in Perl 5. 665 * 666 * <p> Perl constructs not supported by this class: </p> 667 * 668 * <ul> 669 * <li><p> The backreference constructs, <code>\g{</code><i>n</i><code>}</code> for 670 * the <i>n</i><sup>th</sup><a href="#cg">capturing group</a> and 671 * <code>\g{</code><i>name</i><code>}</code> for 672 * <a href="#groupname">named-capturing group</a>. 673 * </p></li> 674 * 675 * <li><p> The conditional constructs 676 * {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code )} and 677 * {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code |}<i>Y</i>{@code )}, 678 * </p></li> 679 * 680 * <li><p> The embedded code constructs <code>(?{</code><i>code</i><code>})</code> 681 * and <code>(??{</code><i>code</i><code>})</code>,</p></li> 682 * 683 * <li><p> The embedded comment syntax {@code (?#comment)}, and </p></li> 684 * 685 * <li><p> The preprocessing operations {@code \l} <code>\u</code>, 686 * {@code \L}, and {@code \U}. </p></li> 687 * 688 * </ul> 689 * 690 * <p> Constructs supported by this class but not by Perl: </p> 691 * 692 * <ul> 693 * 694 * <li><p> Character-class union and intersection as described 695 * <a href="#cc">above</a>.</p></li> 696 * 697 * </ul> 698 * 699 * <p> Notable differences from Perl: </p> 700 * 701 * <ul> 702 * 703 * <li><p> In Perl, {@code \1} through {@code \9} are always interpreted 704 * as back references; a backslash-escaped number greater than {@code 9} is 705 * treated as a back reference if at least that many subexpressions exist, 706 * otherwise it is interpreted, if possible, as an octal escape. In this 707 * class octal escapes must always begin with a zero. In this class, 708 * {@link #compile(String)} throws {@link PatternSyntaxException} for any 709 * non-existent back references. Please use {@code \Q} and {@code \E} to 710 * quote any digit literals followed by back references. 711 * </p></li> 712 * 713 * <li><p> Perl uses the {@code g} flag to request a match that resumes 714 * where the last match left off. This functionality is provided implicitly 715 * by the {@link Matcher} class: Repeated invocations of the {@link 716 * Matcher#find find} method will resume where the last match left off, 717 * unless the matcher is reset. </p></li> 718 * 719 * <li><p> In Perl, embedded flags at the top level of an expression affect 720 * the whole expression. In this class, embedded flags always take effect 721 * at the point at which they appear, whether they are at the top level or 722 * within a group; in the latter case, flags are restored at the end of the 723 * group just as in Perl. </p></li> 724 * 725 * </ul> 726 * 727 * 728 * <p> For a more precise description of the behavior of regular expression 729 * constructs, please see <a href="http://www.oreilly.com/catalog/regex3/"> 730 * <i>Mastering Regular Expressions, 3rd Edition</i>, Jeffrey E. F. Friedl, 731 * O'Reilly and Associates, 2006.</a> 732 * </p> 733 * 734 * @see java.lang.String#split(String, int) 735 * @see java.lang.String#split(String) 736 * 737 * @author Mike McCloskey 738 * @author Mark Reinhold 739 * @author JSR-51 Expert Group 740 * @since 1.4 741 */ 742 743 public final class Pattern 744 implements java.io.Serializable 745 { 746 747 /* 748 * Regular expression modifier values. Instead of being passed as 749 * arguments, they can also be passed as inline modifiers. 750 * For example, the following statements have the same effect. 751 * 752 * Pattern p1 = Pattern.compile("abc", Pattern.CASE_INSENSITIVE|Pattern.MULTILINE); 753 * Pattern p2 = Pattern.compile("(?im)abc", 0); 754 */ 755 756 /** 757 * Enables Unix lines mode. 758 * 759 * <p> In this mode, only the {@code '\n'} line terminator is recognized 760 * in the behavior of {@code .}, {@code ^}, and {@code $}. 761 * 762 * <p> Unix lines mode can also be enabled via the embedded flag 763 * expression {@code (?d)}. 764 */ 765 public static final int UNIX_LINES = 0x01; 766 767 // Android-changed: CASE_INSENSITIVE is Unicode-aware on Android. 768 /** 769 * Enables case-insensitive matching. 770 * 771 * <p> Case-insensitive matching is Unicode-aware on Android. 772 * 773 * <p> Case-insensitive matching can also be enabled via the embedded flag 774 * expression {@code (?i)}. 775 * 776 * <p> Specifying this flag may impose a slight performance penalty. </p> 777 */ 778 public static final int CASE_INSENSITIVE = 0x02; 779 780 /** 781 * Permits whitespace and comments in pattern. 782 * 783 * <p> In this mode, whitespace is ignored, and embedded comments starting 784 * with {@code #} are ignored until the end of a line. 785 * 786 * <p> Comments mode can also be enabled via the embedded flag 787 * expression {@code (?x)}. 788 */ 789 public static final int COMMENTS = 0x04; 790 791 /** 792 * Enables multiline mode. 793 * 794 * <p> In multiline mode the expressions {@code ^} and {@code $} match 795 * just after or just before, respectively, a line terminator or the end of 796 * the input sequence. By default these expressions only match at the 797 * beginning and the end of the entire input sequence. 798 * 799 * <p> Multiline mode can also be enabled via the embedded flag 800 * expression {@code (?m)}. </p> 801 */ 802 public static final int MULTILINE = 0x08; 803 804 /** 805 * Enables literal parsing of the pattern. 806 * 807 * <p> When this flag is specified then the input string that specifies 808 * the pattern is treated as a sequence of literal characters. 809 * Metacharacters or escape sequences in the input sequence will be 810 * given no special meaning. 811 * 812 * <p>The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on 813 * matching when used in conjunction with this flag. The other flags 814 * become superfluous. 815 * 816 * <p> There is no embedded flag character for enabling literal parsing. 817 * @since 1.5 818 */ 819 public static final int LITERAL = 0x10; 820 821 /** 822 * Enables dotall mode. 823 * 824 * <p> In dotall mode, the expression {@code .} matches any character, 825 * including a line terminator. By default this expression does not match 826 * line terminators. 827 * 828 * <p> Dotall mode can also be enabled via the embedded flag 829 * expression {@code (?s)}. (The {@code s} is a mnemonic for 830 * "single-line" mode, which is what this is called in Perl.) </p> 831 */ 832 public static final int DOTALL = 0x20; 833 834 // Android-changed: UNICODE_CASE flag is ignored. 835 /** 836 * Enables Unicode-aware case folding. This flag is ignoredon Android. 837 * When {@link #CASE_INSENSITIVE} flag is provided, case-insensitive 838 * matching is always done in a manner consistent with the Unicode Standard. 839 * 840 * <p> The embedded flag {@code (?u)} is ignored. 841 * 842 * <p> Specifying this flag may impose a performance penalty. </p> 843 */ 844 public static final int UNICODE_CASE = 0x40; 845 846 // Android-changed: Android does not support CANON_EQ flag. 847 /** 848 * This flag is not supported on Android. 849 */ 850 public static final int CANON_EQ = 0x80; 851 852 // Android-changed: Android always uses unicode character classes. 853 /** 854 * This flag is not supported on Android, and Unicode character classes are always 855 * used. 856 * <p> 857 * See the Unicode version of 858 * <i>Predefined character classes</i> and <i>POSIX character classes</i> 859 * are in conformance with 860 * <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical 861 * Standard #18: Unicode Regular Expressions</i></a> 862 * <i>Annex C: Compatibility Properties</i>. 863 * <p> 864 * @since 1.7 865 */ 866 public static final int UNICODE_CHARACTER_CLASS = 0x100; 867 868 /** 869 * Contains all possible flags for compile(regex, flags). 870 */ 871 private static final int ALL_FLAGS = CASE_INSENSITIVE | MULTILINE | 872 // Android-changed: CANON_EQ and UNICODE_CHARACTER_CLASS flags aren't supported. 873 // DOTALL | UNICODE_CASE | CANON_EQ | UNIX_LINES | LITERAL | 874 // UNICODE_CHARACTER_CLASS | COMMENTS; 875 DOTALL | UNICODE_CASE | UNIX_LINES | LITERAL | COMMENTS; 876 877 /* Pattern has only two serialized components: The pattern string 878 * and the flags, which are all that is needed to recompile the pattern 879 * when it is deserialized. 880 */ 881 882 /** use serialVersionUID from Merlin b59 for interoperability */ 883 @java.io.Serial 884 private static final long serialVersionUID = 5073258162644648461L; 885 886 /** 887 * The original regular-expression pattern string. 888 * 889 * @serial 890 */ 891 // Android-changed: reimplement matching logic natively via ICU. 892 // private String pattern; 893 private final String pattern; 894 895 /** 896 * The original pattern flags. 897 * 898 * @serial 899 */ 900 // Android-changed: reimplement matching logic natively via ICU. 901 // private int flags; 902 private final int flags; 903 904 905 // BEGIN Android-changed: reimplement matching logic natively via ICU. 906 // We only need some tie-ins to native memory, instead of a large number 907 // of fields on the .java side. 908 /* package */ transient PatternNative nativePattern; 909 // END Android-changed: reimplement matching logic natively via ICU. 910 911 /** 912 * Compiles the given regular expression into a pattern. 913 * 914 * @param regex 915 * The expression to be compiled 916 * @return the given regular expression compiled into a pattern 917 * @throws PatternSyntaxException 918 * If the expression's syntax is invalid 919 */ compile(String regex)920 public static Pattern compile(String regex) { 921 return new Pattern(regex, 0); 922 } 923 924 // Android-changed: Android doesn't support CANON_EQ and UNICODE_CHARACTER_CLASS flags. 925 /** 926 * Compiles the given regular expression into a pattern with the given 927 * flags. 928 * 929 * @param regex 930 * The expression to be compiled 931 * 932 * @param flags 933 * Match flags, a bit mask that may include 934 * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL}, 935 * {@link #UNICODE_CASE}, {@link #UNIX_LINES}, {@link #LITERAL}, 936 * and {@link #COMMENTS} 937 * 938 * @return the given regular expression compiled into a pattern with the given flags 939 * @throws IllegalArgumentException 940 * If bit values other than those corresponding to the defined 941 * match flags are set in {@code flags} 942 * 943 * @throws PatternSyntaxException 944 * If the expression's syntax is invalid 945 */ compile(String regex, int flags)946 public static Pattern compile(String regex, int flags) { 947 return new Pattern(regex, flags); 948 } 949 950 /** 951 * Returns the regular expression from which this pattern was compiled. 952 * 953 * @return The source of this pattern 954 */ pattern()955 public String pattern() { 956 return pattern; 957 } 958 959 /** 960 * <p>Returns the string representation of this pattern. This 961 * is the regular expression from which this pattern was 962 * compiled.</p> 963 * 964 * @return The string representation of this pattern 965 * @since 1.5 966 */ toString()967 public String toString() { 968 return pattern; 969 } 970 971 /** 972 * Creates a matcher that will match the given input against this pattern. 973 * 974 * @param input 975 * The character sequence to be matched 976 * 977 * @return A new matcher for this pattern 978 */ matcher(CharSequence input)979 public Matcher matcher(CharSequence input) { 980 // Android-removed: Pattern is eagerly compiled() upon construction. 981 /* 982 if (!compiled) { 983 synchronized(this) { 984 if (!compiled) 985 compile(); 986 } 987 } 988 */ 989 Matcher m = new Matcher(this, input); 990 return m; 991 } 992 993 /** 994 * Returns this pattern's match flags. 995 * 996 * @return The match flags specified when this pattern was compiled 997 */ flags()998 public int flags() { 999 // Android-changed: We don't need the temporary pattern flags0. 1000 // return flags0; 1001 return flags; 1002 } 1003 1004 /** 1005 * Compiles the given regular expression and attempts to match the given 1006 * input against it. 1007 * 1008 * <p> An invocation of this convenience method of the form 1009 * 1010 * <blockquote><pre> 1011 * Pattern.matches(regex, input);</pre></blockquote> 1012 * 1013 * behaves in exactly the same way as the expression 1014 * 1015 * <blockquote><pre> 1016 * Pattern.compile(regex).matcher(input).matches()</pre></blockquote> 1017 * 1018 * <p> If a pattern is to be used multiple times, compiling it once and reusing 1019 * it will be more efficient than invoking this method each time. </p> 1020 * 1021 * @param regex 1022 * The expression to be compiled 1023 * 1024 * @param input 1025 * The character sequence to be matched 1026 * @return whether or not the regular expression matches on the input 1027 * @throws PatternSyntaxException 1028 * If the expression's syntax is invalid 1029 */ matches(String regex, CharSequence input)1030 public static boolean matches(String regex, CharSequence input) { 1031 Pattern p = Pattern.compile(regex); 1032 Matcher m = p.matcher(input); 1033 return m.matches(); 1034 } 1035 1036 // Android-changed: Adopt split() behavior change only for apps targeting API > 28. 1037 // http://b/109659282#comment7 1038 /** 1039 * Splits the given input sequence around matches of this pattern. 1040 * 1041 * <p> The array returned by this method contains each substring of the 1042 * input sequence that is terminated by another subsequence that matches 1043 * this pattern or is terminated by the end of the input sequence. The 1044 * substrings in the array are in the order in which they occur in the 1045 * input. If this pattern does not match any subsequence of the input then 1046 * the resulting array has just one element, namely the input sequence in 1047 * string form. 1048 * 1049 * <p> When there is a positive-width match at the beginning of the input 1050 * sequence then an empty leading substring is included at the beginning 1051 * of the resulting array. A zero-width match at the beginning however 1052 * can only produce such an empty leading substring for apps running on or 1053 * targeting API versions <= 28. 1054 * 1055 * <p> The {@code limit} parameter controls the number of times the 1056 * pattern is applied and therefore affects the length of the resulting 1057 * array. 1058 * <ul> 1059 * <li><p> 1060 * If the <i>limit</i> is positive then the pattern will be applied 1061 * at most <i>limit</i> - 1 times, the array's length will be 1062 * no greater than <i>limit</i>, and the array's last entry will contain 1063 * all input beyond the last matched delimiter.</p></li> 1064 * 1065 * <li><p> 1066 * If the <i>limit</i> is zero then the pattern will be applied as 1067 * many times as possible, the array can have any length, and trailing 1068 * empty strings will be discarded.</p></li> 1069 * 1070 * <li><p> 1071 * If the <i>limit</i> is negative then the pattern will be applied 1072 * as many times as possible and the array can have any length.</p></li> 1073 * </ul> 1074 * 1075 * <p> The input {@code "boo:and:foo"}, for example, yields the following 1076 * results with these parameters: 1077 * 1078 * <table class="plain" style="margin-left:2em;"> 1079 * <caption style="display:none">Split example showing regex, limit, and result</caption> 1080 * <thead> 1081 * <tr> 1082 * <th scope="col">Regex</th> 1083 * <th scope="col">Limit</th> 1084 * <th scope="col">Result</th> 1085 * </tr> 1086 * </thead> 1087 * <tbody> 1088 * <tr><th scope="row" rowspan="3" style="font-weight:normal">:</th> 1089 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">2</th> 1090 * <td>{@code { "boo", "and:foo" }}</td></tr> 1091 * <tr><!-- : --> 1092 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th> 1093 * <td>{@code { "boo", "and", "foo" }}</td></tr> 1094 * <tr><!-- : --> 1095 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-2</th> 1096 * <td>{@code { "boo", "and", "foo" }}</td></tr> 1097 * <tr><th scope="row" rowspan="3" style="font-weight:normal">o</th> 1098 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th> 1099 * <td>{@code { "b", "", ":and:f", "", "" }}</td></tr> 1100 * <tr><!-- o --> 1101 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-2</th> 1102 * <td>{@code { "b", "", ":and:f", "", "" }}</td></tr> 1103 * <tr><!-- o --> 1104 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">0</th> 1105 * <td>{@code { "b", "", ":and:f" }}</td></tr> 1106 * </tbody> 1107 * </table> 1108 * 1109 * @param input 1110 * The character sequence to be split 1111 * 1112 * @param limit 1113 * The result threshold, as described above 1114 * 1115 * @return The array of strings computed by splitting the input 1116 * around matches of this pattern 1117 */ split(CharSequence input, int limit)1118 public String[] split(CharSequence input, int limit) { 1119 // BEGIN Android-added: fastSplit() to speed up simple cases. 1120 String[] fast = fastSplit(pattern, input.toString(), limit); 1121 if (fast != null) { 1122 return fast; 1123 } 1124 // END Android-added: fastSplit() to speed up simple cases. 1125 int index = 0; 1126 boolean matchLimited = limit > 0; 1127 ArrayList<String> matchList = new ArrayList<>(); 1128 Matcher m = matcher(input); 1129 1130 // Add segments before each match found 1131 while(m.find()) { 1132 if (!matchLimited || matchList.size() < limit - 1) { 1133 if (index == 0 && index == m.start() && m.start() == m.end()) { 1134 // no empty leading substring included for zero-width match 1135 // at the beginning of the input char sequence. 1136 // BEGIN Android-changed: split() compat behavior for apps targeting <= 28. 1137 // continue; 1138 int targetSdkVersion = VMRuntime.getRuntime().getTargetSdkVersion(); 1139 if (targetSdkVersion > 28) { 1140 continue; 1141 } 1142 // END Android-changed: split() compat behavior for apps targeting <= 28. 1143 } 1144 String match = input.subSequence(index, m.start()).toString(); 1145 matchList.add(match); 1146 index = m.end(); 1147 } else if (matchList.size() == limit - 1) { // last one 1148 String match = input.subSequence(index, 1149 input.length()).toString(); 1150 matchList.add(match); 1151 index = m.end(); 1152 } 1153 } 1154 1155 // If no match was found, return this 1156 if (index == 0) 1157 return new String[] {input.toString()}; 1158 1159 // Add remaining segment 1160 if (!matchLimited || matchList.size() < limit) 1161 matchList.add(input.subSequence(index, input.length()).toString()); 1162 1163 // Construct result 1164 int resultSize = matchList.size(); 1165 if (limit == 0) 1166 while (resultSize > 0 && matchList.get(resultSize-1).isEmpty()) 1167 resultSize--; 1168 String[] result = new String[resultSize]; 1169 return matchList.subList(0, resultSize).toArray(result); 1170 } 1171 1172 // BEGIN Android-added: fastSplit() to speed up simple cases. 1173 private static final String FASTSPLIT_METACHARACTERS = "\\?*+[](){}^$.|"; 1174 1175 /** 1176 * Returns a result equivalent to {@code s.split(separator, limit)} if it's able 1177 * to compute it more cheaply than native impl, or null if the caller should fall back to 1178 * using native impl. 1179 * 1180 * fastpath will work if the regex is a 1181 * (1)one-char String and this character is not one of the 1182 * RegEx's meta characters ".$|()[{^?*+\\", or 1183 * (2)two-char String and the first char is the backslash and 1184 * the second is one of regEx's meta characters ".$|()[{^?*+\\". 1185 * @hide 1186 */ fastSplit(String re, String input, int limit)1187 public static String[] fastSplit(String re, String input, int limit) { 1188 // Can we do it cheaply? 1189 int len = re.length(); 1190 if (len == 0) { 1191 return null; 1192 } 1193 char ch = re.charAt(0); 1194 if (len == 1) { 1195 if (Character.isSurrogate(ch)) { 1196 // Single surrogate is an invalid UTF-16 sequence. 1197 return null; 1198 } else if (FASTSPLIT_METACHARACTERS.indexOf(ch) != -1) { 1199 // We don't allow a single metacharacter. 1200 return null; 1201 } 1202 // pass through 1203 } else if (len == 2 && ch == '\\') { 1204 // We're looking for a quoted character. 1205 // Quoted metacharacters are effectively single non-metacharacters. 1206 ch = re.charAt(1); 1207 if (FASTSPLIT_METACHARACTERS.indexOf(ch) == -1) { 1208 return null; 1209 } 1210 } else { 1211 return null; 1212 } 1213 1214 // We can do this cheaply... 1215 1216 // Unlike Perl, which considers the result of splitting the empty string to be the empty 1217 // array, Java returns an array containing the empty string. 1218 if (input.isEmpty()) { 1219 return new String[] { "" }; 1220 } 1221 1222 // Count separators 1223 int separatorCount = 0; 1224 int begin = 0; 1225 int end; 1226 while (separatorCount + 1 != limit && (end = input.indexOf(ch, begin)) != -1) { 1227 ++separatorCount; 1228 begin = end + 1; 1229 } 1230 int lastPartEnd = input.length(); 1231 if (limit == 0 && begin == lastPartEnd) { 1232 // Last part is empty for limit == 0, remove all trailing empty matches. 1233 if (separatorCount == lastPartEnd) { 1234 // Input contains only separators. 1235 return EmptyArray.STRING; 1236 } 1237 // Find the beginning of trailing separators. 1238 do { 1239 --begin; 1240 } while (input.charAt(begin - 1) == ch); 1241 // Reduce separatorCount and fix lastPartEnd. 1242 separatorCount -= input.length() - begin; 1243 lastPartEnd = begin; 1244 } 1245 1246 // Collect the result parts. 1247 String[] result = new String[separatorCount + 1]; 1248 begin = 0; 1249 for (int i = 0; i != separatorCount; ++i) { 1250 end = input.indexOf(ch, begin); 1251 result[i] = input.substring(begin, end); 1252 begin = end + 1; 1253 } 1254 // Add last part. 1255 result[separatorCount] = input.substring(begin, lastPartEnd); 1256 return result; 1257 } 1258 // END Android-added: fastSplit() to speed up simple cases. 1259 1260 /** 1261 * Splits the given input sequence around matches of this pattern. 1262 * 1263 * <p> This method works as if by invoking the two-argument {@link 1264 * #split(java.lang.CharSequence, int) split} method with the given input 1265 * sequence and a limit argument of zero. Trailing empty strings are 1266 * therefore not included in the resulting array. </p> 1267 * 1268 * <p> The input {@code "boo:and:foo"}, for example, yields the following 1269 * results with these expressions: 1270 * 1271 * <table class="plain" style="margin-left:2em"> 1272 * <caption style="display:none">Split examples showing regex and result</caption> 1273 * <thead> 1274 * <tr> 1275 * <th scope="col">Regex</th> 1276 * <th scope="col">Result</th> 1277 * </tr> 1278 * </thead> 1279 * <tbody> 1280 * <tr><th scope="row" style="text-weight:normal">:</th> 1281 * <td>{@code { "boo", "and", "foo" }}</td></tr> 1282 * <tr><th scope="row" style="text-weight:normal">o</th> 1283 * <td>{@code { "b", "", ":and:f" }}</td></tr> 1284 * </tbody> 1285 * </table> 1286 * 1287 * 1288 * @param input 1289 * The character sequence to be split 1290 * 1291 * @return The array of strings computed by splitting the input 1292 * around matches of this pattern 1293 */ split(CharSequence input)1294 public String[] split(CharSequence input) { 1295 return split(input, 0); 1296 } 1297 1298 /** 1299 * Returns a literal pattern {@code String} for the specified 1300 * {@code String}. 1301 * 1302 * <p>This method produces a {@code String} that can be used to 1303 * create a {@code Pattern} that would match the string 1304 * {@code s} as if it were a literal pattern.</p> Metacharacters 1305 * or escape sequences in the input sequence will be given no special 1306 * meaning. 1307 * 1308 * @param s The string to be literalized 1309 * @return A literal string replacement 1310 * @since 1.5 1311 */ quote(String s)1312 public static String quote(String s) { 1313 int slashEIndex = s.indexOf("\\E"); 1314 if (slashEIndex == -1) 1315 return "\\Q" + s + "\\E"; 1316 1317 int lenHint = s.length(); 1318 lenHint = (lenHint < Integer.MAX_VALUE - 8 - lenHint) ? 1319 (lenHint << 1) : (Integer.MAX_VALUE - 8); 1320 1321 StringBuilder sb = new StringBuilder(lenHint); 1322 sb.append("\\Q"); 1323 int current = 0; 1324 do { 1325 sb.append(s, current, slashEIndex) 1326 .append("\\E\\\\E\\Q"); 1327 current = slashEIndex + 2; 1328 } while ((slashEIndex = s.indexOf("\\E", current)) != -1); 1329 1330 return sb.append(s, current, s.length()) 1331 .append("\\E") 1332 .toString(); 1333 } 1334 1335 /** 1336 * Recompile the Pattern instance from a stream. The original pattern 1337 * string is read in and the object tree is recompiled from it. 1338 */ 1339 @java.io.Serial readObject(java.io.ObjectInputStream s)1340 private void readObject(java.io.ObjectInputStream s) 1341 throws java.io.IOException, ClassNotFoundException { 1342 1343 // Read in all fields 1344 s.defaultReadObject(); 1345 1346 // Android-removed: reimplement matching logic natively via ICU. 1347 /* 1348 // reset the flags 1349 flags0 = flags; 1350 1351 // Initialize counts 1352 capturingGroupCount = 1; 1353 localCount = 0; 1354 localTCNCount = 0; 1355 */ 1356 1357 // Android-changed: Pattern is eagerly compiled() upon construction. 1358 /* 1359 // if length > 0, the Pattern is lazily compiled 1360 if (pattern.isEmpty()) { 1361 root = new Start(lastAccept); 1362 matchRoot = lastAccept; 1363 compiled = true; 1364 } 1365 */ 1366 compile(); 1367 } 1368 1369 // Android-changed: reimplement matching logic natively via ICU. 1370 // Dropped documentation reference to Start and LastNode implementation 1371 // details which do not apply on Android. 1372 /** 1373 * This private constructor is used to create all Patterns. The pattern 1374 * string and match flags are all that is needed to completely describe 1375 * a Pattern. 1376 */ Pattern(String p, int f)1377 private Pattern(String p, int f) { 1378 // BEGIN Android-added: CANON_EQ and UNICODE_CHARACTER_CLASS flags are not supported. 1379 if ((f & CANON_EQ) != 0) { 1380 throw new IllegalArgumentException("CANON_EQ flag isn't supported"); 1381 } 1382 if ((f & UNICODE_CHARACTER_CLASS) != 0) { 1383 throw new IllegalArgumentException("UNICODE_CHARACTER_CLASS flag not supported"); 1384 } 1385 // END Android-added: CANON_EQ and UNICODE_CHARACTER_CLASS flags are not supported. 1386 if ((f & ~ALL_FLAGS) != 0) { 1387 throw new IllegalArgumentException("Unknown flag 0x" 1388 + Integer.toHexString(f)); 1389 } 1390 pattern = p; 1391 flags = f; 1392 1393 // Android-changed: Pattern is eagerly compiled() upon construction. 1394 // BEGIN Android-changed: Reimplement matching logic via ICU4C, and shouldn't overflow. 1395 /* 1396 // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present 1397 if ((flags & UNICODE_CHARACTER_CLASS) != 0) 1398 flags |= UNICODE_CASE; 1399 1400 // 'flags' for compiling 1401 flags0 = flags; 1402 1403 // Reset group index count 1404 capturingGroupCount = 1; 1405 localCount = 0; 1406 localTCNCount = 0; 1407 1408 if (!pattern.isEmpty()) { 1409 try { 1410 compile(); 1411 } catch (StackOverflowError soe) { 1412 throw error("Stack overflow during pattern compilation"); 1413 } 1414 } else { 1415 root = new Start(lastAccept); 1416 matchRoot = lastAccept; 1417 } 1418 */ 1419 compile(); 1420 // END Android-changed: Reimplement matching logic via ICU4C, and shouldn't overflow. 1421 } 1422 1423 // BEGIN Android-removed: Reimplement matching logic via ICU4C. 1424 /** 1425 * The pattern is converted to normalized form ({@link 1426 * java.text.Normalizer.Form#NFC NFC}, canonical decomposition, 1427 * followed by canonical composition for the character class 1428 * part, and {@link java.text.Normalizer.Form#NFD NFD}, 1429 * canonical decomposition for the rest), and then a pure 1430 * group is constructed to match canonical equivalences of the 1431 * characters. 1432 * 1433 private static String normalize(String pattern) { 1434 int plen = pattern.length(); 1435 StringBuilder pbuf = new StringBuilder(plen); 1436 char last = 0; 1437 int lastStart = 0; 1438 char cc = 0; 1439 for (int i = 0; i < plen;) { 1440 char c = pattern.charAt(i); 1441 if (cc == 0 && // top level 1442 c == '\\' && i + 1 < plen && pattern.charAt(i + 1) == '\\') { 1443 i += 2; last = 0; 1444 continue; 1445 } 1446 if (c == '[' && last != '\\') { 1447 if (cc == 0) { 1448 if (lastStart < i) 1449 normalizeSlice(pattern, lastStart, i, pbuf); 1450 lastStart = i; 1451 } 1452 cc++; 1453 } else if (c == ']' && last != '\\') { 1454 cc--; 1455 if (cc == 0) { 1456 normalizeClazz(pattern, lastStart, i + 1, pbuf); 1457 lastStart = i + 1; 1458 } 1459 } 1460 last = c; 1461 i++; 1462 } 1463 assert (cc == 0); 1464 if (lastStart < plen) 1465 normalizeSlice(pattern, lastStart, plen, pbuf); 1466 return pbuf.toString(); 1467 } 1468 1469 private static void normalizeSlice(String src, int off, int limit, 1470 StringBuilder dst) 1471 { 1472 int len = src.length(); 1473 int off0 = off; 1474 while (off < limit && ASCII.isAscii(src.charAt(off))) { 1475 off++; 1476 } 1477 if (off == limit) { 1478 dst.append(src, off0, limit); 1479 return; 1480 } 1481 off--; 1482 if (off < off0) 1483 off = off0; 1484 else 1485 dst.append(src, off0, off); 1486 while (off < limit) { 1487 int ch0 = src.codePointAt(off); 1488 if (".$|()[]{}^?*+\\".indexOf(ch0) != -1) { 1489 dst.append((char)ch0); 1490 off++; 1491 continue; 1492 } 1493 int j = Grapheme.nextBoundary(src, off, limit); 1494 int ch1; 1495 String seq = src.substring(off, j); 1496 String nfd = Normalizer.normalize(seq, Normalizer.Form.NFD); 1497 off = j; 1498 if (nfd.codePointCount(0, nfd.length()) > 1) { 1499 ch0 = nfd.codePointAt(0); 1500 ch1 = nfd.codePointAt(Character.charCount(ch0)); 1501 if (Character.getType(ch1) == Character.NON_SPACING_MARK) { 1502 Set<String> altns = new LinkedHashSet<>(); 1503 altns.add(seq); 1504 produceEquivalentAlternation(nfd, altns); 1505 dst.append("(?:"); 1506 altns.forEach( s -> dst.append(s).append('|')); 1507 dst.delete(dst.length() - 1, dst.length()); 1508 dst.append(")"); 1509 continue; 1510 } 1511 } 1512 String nfc = Normalizer.normalize(seq, Normalizer.Form.NFC); 1513 if (!seq.equals(nfc) && !nfd.equals(nfc)) 1514 dst.append("(?:" + seq + "|" + nfd + "|" + nfc + ")"); 1515 else if (!seq.equals(nfd)) 1516 dst.append("(?:" + seq + "|" + nfd + ")"); 1517 else 1518 dst.append(seq); 1519 } 1520 } 1521 1522 private static void normalizeClazz(String src, int off, int limit, 1523 StringBuilder dst) 1524 { 1525 dst.append(Normalizer.normalize(src.substring(off, limit), Form.NFC)); 1526 } 1527 1528 /** 1529 * Given a specific sequence composed of a regular character and 1530 * combining marks that follow it, produce the alternation that will 1531 * match all canonical equivalences of that sequence. 1532 * 1533 private static void produceEquivalentAlternation(String src, 1534 Set<String> dst) 1535 { 1536 int len = countChars(src, 0, 1); 1537 if (src.length() == len) { 1538 dst.add(src); // source has one character. 1539 return; 1540 } 1541 String base = src.substring(0,len); 1542 String combiningMarks = src.substring(len); 1543 String[] perms = producePermutations(combiningMarks); 1544 // Add combined permutations 1545 for(int x = 0; x < perms.length; x++) { 1546 String next = base + perms[x]; 1547 dst.add(next); 1548 next = composeOneStep(next); 1549 if (next != null) { 1550 produceEquivalentAlternation(next, dst); 1551 } 1552 } 1553 } 1554 1555 /** 1556 * Returns an array of strings that have all the possible 1557 * permutations of the characters in the input string. 1558 * This is used to get a list of all possible orderings 1559 * of a set of combining marks. Note that some of the permutations 1560 * are invalid because of combining class collisions, and these 1561 * possibilities must be removed because they are not canonically 1562 * equivalent. 1563 * 1564 private static String[] producePermutations(String input) { 1565 if (input.length() == countChars(input, 0, 1)) 1566 return new String[] {input}; 1567 1568 if (input.length() == countChars(input, 0, 2)) { 1569 int c0 = Character.codePointAt(input, 0); 1570 int c1 = Character.codePointAt(input, Character.charCount(c0)); 1571 if (getClass(c1) == getClass(c0)) { 1572 return new String[] {input}; 1573 } 1574 String[] result = new String[2]; 1575 result[0] = input; 1576 StringBuilder sb = new StringBuilder(2); 1577 sb.appendCodePoint(c1); 1578 sb.appendCodePoint(c0); 1579 result[1] = sb.toString(); 1580 return result; 1581 } 1582 1583 int length = 1; 1584 int nCodePoints = countCodePoints(input); 1585 for(int x=1; x<nCodePoints; x++) 1586 length = length * (x+1); 1587 1588 String[] temp = new String[length]; 1589 1590 int combClass[] = new int[nCodePoints]; 1591 for(int x=0, i=0; x<nCodePoints; x++) { 1592 int c = Character.codePointAt(input, i); 1593 combClass[x] = getClass(c); 1594 i += Character.charCount(c); 1595 } 1596 1597 // For each char, take it out and add the permutations 1598 // of the remaining chars 1599 int index = 0; 1600 int len; 1601 // offset maintains the index in code units. 1602 loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { 1603 len = countChars(input, offset, 1); 1604 for(int y=x-1; y>=0; y--) { 1605 if (combClass[y] == combClass[x]) { 1606 continue loop; 1607 } 1608 } 1609 StringBuilder sb = new StringBuilder(input); 1610 String otherChars = sb.delete(offset, offset+len).toString(); 1611 String[] subResult = producePermutations(otherChars); 1612 1613 String prefix = input.substring(offset, offset+len); 1614 for (String sre : subResult) 1615 temp[index++] = prefix + sre; 1616 } 1617 String[] result = new String[index]; 1618 System.arraycopy(temp, 0, result, 0, index); 1619 return result; 1620 } 1621 1622 private static int getClass(int c) { 1623 return sun.text.Normalizer.getCombiningClass(c); 1624 } 1625 1626 /** 1627 * Attempts to compose input by combining the first character 1628 * with the first combining mark following it. Returns a String 1629 * that is the composition of the leading character with its first 1630 * combining mark followed by the remaining combining marks. Returns 1631 * null if the first two characters cannot be further composed. 1632 * 1633 private static String composeOneStep(String input) { 1634 int len = countChars(input, 0, 2); 1635 String firstTwoCharacters = input.substring(0, len); 1636 String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC); 1637 if (result.equals(firstTwoCharacters)) 1638 return null; 1639 else { 1640 String remainder = input.substring(len); 1641 return result + remainder; 1642 } 1643 } 1644 1645 /** 1646 * Preprocess any \Q...\E sequences in `temp', meta-quoting them. 1647 * See the description of `quotemeta' in perlfunc(1). 1648 * 1649 private void RemoveQEQuoting() { 1650 final int pLen = patternLength; 1651 int i = 0; 1652 while (i < pLen-1) { 1653 if (temp[i] != '\\') 1654 i += 1; 1655 else if (temp[i + 1] != 'Q') 1656 i += 2; 1657 else 1658 break; 1659 } 1660 if (i >= pLen - 1) // No \Q sequence found 1661 return; 1662 int j = i; 1663 i += 2; 1664 int newTempLen; 1665 try { 1666 newTempLen = Math.addExact(j + 2, Math.multiplyExact(3, pLen - i)); 1667 } catch (ArithmeticException ae) { 1668 throw new OutOfMemoryError("Required pattern length too large"); 1669 } 1670 int[] newtemp = new int[newTempLen]; 1671 System.arraycopy(temp, 0, newtemp, 0, j); 1672 1673 boolean inQuote = true; 1674 boolean beginQuote = true; 1675 while (i < pLen) { 1676 int c = temp[i++]; 1677 if (!ASCII.isAscii(c) || ASCII.isAlpha(c)) { 1678 newtemp[j++] = c; 1679 } else if (ASCII.isDigit(c)) { 1680 if (beginQuote) { 1681 /* 1682 * A unicode escape \[0xu] could be before this quote, 1683 * and we don't want this numeric char to processed as 1684 * part of the escape. 1685 * 1686 newtemp[j++] = '\\'; 1687 newtemp[j++] = 'x'; 1688 newtemp[j++] = '3'; 1689 } 1690 newtemp[j++] = c; 1691 } else if (c != '\\') { 1692 if (inQuote) newtemp[j++] = '\\'; 1693 newtemp[j++] = c; 1694 } else if (inQuote) { 1695 if (temp[i] == 'E') { 1696 i++; 1697 inQuote = false; 1698 } else { 1699 newtemp[j++] = '\\'; 1700 newtemp[j++] = '\\'; 1701 } 1702 } else { 1703 if (temp[i] == 'Q') { 1704 i++; 1705 inQuote = true; 1706 beginQuote = true; 1707 continue; 1708 } else { 1709 newtemp[j++] = c; 1710 if (i != pLen) 1711 newtemp[j++] = temp[i++]; 1712 } 1713 } 1714 1715 beginQuote = false; 1716 } 1717 1718 patternLength = j; 1719 temp = Arrays.copyOf(newtemp, j + 2); // double zero termination 1720 } 1721 1722 /** 1723 * Copies regular expression to an int array and invokes the parsing 1724 * of the expression which will create the object tree. 1725 * 1726 private void compile() { 1727 // Handle canonical equivalences 1728 if (has(CANON_EQ) && !has(LITERAL)) { 1729 normalizedPattern = normalize(pattern); 1730 } else { 1731 normalizedPattern = pattern; 1732 } 1733 patternLength = normalizedPattern.length(); 1734 1735 // Copy pattern to int array for convenience 1736 // Use double zero to terminate pattern 1737 temp = new int[patternLength + 2]; 1738 1739 hasSupplementary = false; 1740 int c, count = 0; 1741 // Convert all chars into code points 1742 for (int x = 0; x < patternLength; x += Character.charCount(c)) { 1743 c = normalizedPattern.codePointAt(x); 1744 if (isSupplementary(c)) { 1745 hasSupplementary = true; 1746 } 1747 temp[count++] = c; 1748 } 1749 1750 patternLength = count; // patternLength now in code points 1751 1752 if (! has(LITERAL)) 1753 RemoveQEQuoting(); 1754 1755 // Allocate all temporary objects here. 1756 buffer = new int[32]; 1757 groupNodes = new GroupHead[10]; 1758 namedGroups = null; 1759 topClosureNodes = new ArrayList<>(10); 1760 1761 if (has(LITERAL)) { 1762 // Literal pattern handling 1763 matchRoot = newSlice(temp, patternLength, hasSupplementary); 1764 matchRoot.next = lastAccept; 1765 } else { 1766 // Start recursive descent parsing 1767 matchRoot = expr(lastAccept); 1768 // Check extra pattern characters 1769 if (patternLength != cursor) { 1770 if (peek() == ')') { 1771 throw error("Unmatched closing ')'"); 1772 } else { 1773 throw error("Unexpected internal error"); 1774 } 1775 } 1776 } 1777 1778 // Peephole optimization 1779 if (matchRoot instanceof Slice) { 1780 root = BnM.optimize(matchRoot); 1781 if (root == matchRoot) { 1782 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot); 1783 } 1784 } else if (matchRoot instanceof Begin || matchRoot instanceof First) { 1785 root = matchRoot; 1786 } else { 1787 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot); 1788 } 1789 1790 // Optimize the greedy Loop to prevent exponential backtracking, IF there 1791 // is no group ref in this pattern. With a non-negative localTCNCount value, 1792 // the greedy type Loop, Curly will skip the backtracking for any starting 1793 // position "i" that failed in the past. 1794 if (!hasGroupRef) { 1795 for (Node node : topClosureNodes) { 1796 if (node instanceof Loop) { 1797 // non-deterministic-greedy-group 1798 ((Loop)node).posIndex = localTCNCount++; 1799 } 1800 } 1801 } 1802 1803 // Release temporary storage 1804 temp = null; 1805 buffer = null; 1806 groupNodes = null; 1807 patternLength = 0; 1808 compiled = true; 1809 topClosureNodes = null; 1810 } 1811 1812 Map<String, Integer> namedGroups() { 1813 Map<String, Integer> groups = namedGroups; 1814 if (groups == null) { 1815 namedGroups = groups = new HashMap<>(2); 1816 } 1817 return groups; 1818 } 1819 1820 /** 1821 * Used to accumulate information about a subtree of the object graph 1822 * so that optimizations can be applied to the subtree. 1823 * 1824 static final class TreeInfo { 1825 int minLength; 1826 int maxLength; 1827 boolean maxValid; 1828 boolean deterministic; 1829 1830 TreeInfo() { 1831 reset(); 1832 } 1833 void reset() { 1834 minLength = 0; 1835 maxLength = 0; 1836 maxValid = true; 1837 deterministic = true; 1838 } 1839 } 1840 1841 /* 1842 * The following private methods are mainly used to improve the 1843 * readability of the code. In order to let the Java compiler easily 1844 * inline them, we should not put many assertions or error checks in them. 1845 * 1846 1847 /** 1848 * Indicates whether a particular flag is set or not. 1849 * 1850 private boolean has(int f) { 1851 return (flags0 & f) != 0; 1852 } 1853 1854 /** 1855 * Match next character, signal error if failed. 1856 * 1857 private void accept(int ch, String s) { 1858 int testChar = temp[cursor++]; 1859 if (has(COMMENTS)) 1860 testChar = parsePastWhitespace(testChar); 1861 if (ch != testChar) { 1862 throw error(s); 1863 } 1864 } 1865 1866 /** 1867 * Mark the end of pattern with a specific character. 1868 * 1869 private void mark(int c) { 1870 temp[patternLength] = c; 1871 } 1872 1873 /** 1874 * Peek the next character, and do not advance the cursor. 1875 * 1876 private int peek() { 1877 int ch = temp[cursor]; 1878 if (has(COMMENTS)) 1879 ch = peekPastWhitespace(ch); 1880 return ch; 1881 } 1882 1883 /** 1884 * Read the next character, and advance the cursor by one. 1885 * 1886 private int read() { 1887 int ch = temp[cursor++]; 1888 if (has(COMMENTS)) 1889 ch = parsePastWhitespace(ch); 1890 return ch; 1891 } 1892 1893 /** 1894 * Read the next character, and advance the cursor by one, 1895 * ignoring the COMMENTS setting 1896 * 1897 private int readEscaped() { 1898 int ch = temp[cursor++]; 1899 return ch; 1900 } 1901 1902 /** 1903 * Advance the cursor by one, and peek the next character. 1904 * 1905 private int next() { 1906 int ch = temp[++cursor]; 1907 if (has(COMMENTS)) 1908 ch = peekPastWhitespace(ch); 1909 return ch; 1910 } 1911 1912 /** 1913 * Advance the cursor by one, and peek the next character, 1914 * ignoring the COMMENTS setting 1915 * 1916 private int nextEscaped() { 1917 int ch = temp[++cursor]; 1918 return ch; 1919 } 1920 1921 /** 1922 * If in xmode peek past whitespace and comments. 1923 * 1924 private int peekPastWhitespace(int ch) { 1925 while (ASCII.isSpace(ch) || ch == '#') { 1926 while (ASCII.isSpace(ch)) 1927 ch = temp[++cursor]; 1928 if (ch == '#') { 1929 ch = peekPastLine(); 1930 } 1931 } 1932 return ch; 1933 } 1934 1935 /** 1936 * If in xmode parse past whitespace and comments. 1937 * 1938 private int parsePastWhitespace(int ch) { 1939 while (ASCII.isSpace(ch) || ch == '#') { 1940 while (ASCII.isSpace(ch)) 1941 ch = temp[cursor++]; 1942 if (ch == '#') 1943 ch = parsePastLine(); 1944 } 1945 return ch; 1946 } 1947 1948 /** 1949 * xmode parse past comment to end of line. 1950 * 1951 private int parsePastLine() { 1952 int ch = temp[cursor++]; 1953 while (ch != 0 && !isLineSeparator(ch)) 1954 ch = temp[cursor++]; 1955 if (ch == 0 && cursor > patternLength) { 1956 cursor = patternLength; 1957 ch = temp[cursor++]; 1958 } 1959 return ch; 1960 } 1961 1962 /** 1963 * xmode peek past comment to end of line. 1964 * 1965 private int peekPastLine() { 1966 int ch = temp[++cursor]; 1967 while (ch != 0 && !isLineSeparator(ch)) 1968 ch = temp[++cursor]; 1969 if (ch == 0 && cursor > patternLength) { 1970 cursor = patternLength; 1971 ch = temp[cursor]; 1972 } 1973 return ch; 1974 } 1975 1976 /** 1977 * Determines if character is a line separator in the current mode 1978 * 1979 private boolean isLineSeparator(int ch) { 1980 if (has(UNIX_LINES)) { 1981 return ch == '\n'; 1982 } else { 1983 return (ch == '\n' || 1984 ch == '\r' || 1985 (ch|1) == '\u2029' || 1986 ch == '\u0085'); 1987 } 1988 } 1989 1990 /** 1991 * Read the character after the next one, and advance the cursor by two. 1992 * 1993 private int skip() { 1994 int i = cursor; 1995 int ch = temp[i+1]; 1996 cursor = i + 2; 1997 return ch; 1998 } 1999 2000 /** 2001 * Unread one next character, and retreat cursor by one. 2002 * 2003 private void unread() { 2004 cursor--; 2005 } 2006 2007 /** 2008 * Internal method used for handling all syntax errors. The pattern is 2009 * displayed with a pointer to aid in locating the syntax error. 2010 * 2011 private PatternSyntaxException error(String s) { 2012 return new PatternSyntaxException(s, normalizedPattern, cursor - 1); 2013 } 2014 2015 /** 2016 * Determines if there is any supplementary character or unpaired 2017 * surrogate in the specified range. 2018 * 2019 private boolean findSupplementary(int start, int end) { 2020 for (int i = start; i < end; i++) { 2021 if (isSupplementary(temp[i])) 2022 return true; 2023 } 2024 return false; 2025 } 2026 2027 /** 2028 * Determines if the specified code point is a supplementary 2029 * character or unpaired surrogate. 2030 * 2031 private static final boolean isSupplementary(int ch) { 2032 return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT || 2033 Character.isSurrogate((char)ch); 2034 } 2035 2036 /** 2037 * The following methods handle the main parsing. They are sorted 2038 * according to their precedence order, the lowest one first. 2039 * 2040 2041 /** 2042 * The expression is parsed with branch nodes added for alternations. 2043 * This may be called recursively to parse sub expressions that may 2044 * contain alternations. 2045 * 2046 private Node expr(Node end) { 2047 Node prev = null; 2048 Node firstTail = null; 2049 Branch branch = null; 2050 Node branchConn = null; 2051 2052 for (;;) { 2053 Node node = sequence(end); 2054 Node nodeTail = root; //double return 2055 if (prev == null) { 2056 prev = node; 2057 firstTail = nodeTail; 2058 } else { 2059 // Branch 2060 if (branchConn == null) { 2061 branchConn = new BranchConn(); 2062 branchConn.next = end; 2063 } 2064 if (node == end) { 2065 // if the node returned from sequence() is "end" 2066 // we have an empty expr, set a null atom into 2067 // the branch to indicate to go "next" directly. 2068 node = null; 2069 } else { 2070 // the "tail.next" of each atom goes to branchConn 2071 nodeTail.next = branchConn; 2072 } 2073 if (prev == branch) { 2074 branch.add(node); 2075 } else { 2076 if (prev == end) { 2077 prev = null; 2078 } else { 2079 // replace the "end" with "branchConn" at its tail.next 2080 // when put the "prev" into the branch as the first atom. 2081 firstTail.next = branchConn; 2082 } 2083 prev = branch = new Branch(prev, node, branchConn); 2084 } 2085 } 2086 if (peek() != '|') { 2087 return prev; 2088 } 2089 next(); 2090 } 2091 } 2092 2093 @SuppressWarnings("fallthrough") 2094 /** 2095 * Parsing of sequences between alternations. 2096 * 2097 private Node sequence(Node end) { 2098 Node head = null; 2099 Node tail = null; 2100 Node node; 2101 LOOP: 2102 for (;;) { 2103 int ch = peek(); 2104 switch (ch) { 2105 case '(': 2106 // Because group handles its own closure, 2107 // we need to treat it differently 2108 node = group0(); 2109 // Check for comment or flag group 2110 if (node == null) 2111 continue; 2112 if (head == null) 2113 head = node; 2114 else 2115 tail.next = node; 2116 // Double return: Tail was returned in root 2117 tail = root; 2118 continue; 2119 case '[': 2120 if (has(CANON_EQ) && !has(LITERAL)) 2121 node = new NFCCharProperty(clazz(true)); 2122 else 2123 node = newCharProperty(clazz(true)); 2124 break; 2125 case '\\': 2126 ch = nextEscaped(); 2127 if (ch == 'p' || ch == 'P') { 2128 boolean oneLetter = true; 2129 boolean comp = (ch == 'P'); 2130 ch = next(); // Consume { if present 2131 if (ch != '{') { 2132 unread(); 2133 } else { 2134 oneLetter = false; 2135 } 2136 // node = newCharProperty(family(oneLetter, comp)); 2137 if (has(CANON_EQ) && !has(LITERAL)) 2138 node = new NFCCharProperty(family(oneLetter, comp)); 2139 else 2140 node = newCharProperty(family(oneLetter, comp)); 2141 } else { 2142 unread(); 2143 node = atom(); 2144 } 2145 break; 2146 case '^': 2147 next(); 2148 if (has(MULTILINE)) { 2149 if (has(UNIX_LINES)) 2150 node = new UnixCaret(); 2151 else 2152 node = new Caret(); 2153 } else { 2154 node = new Begin(); 2155 } 2156 break; 2157 case '$': 2158 next(); 2159 if (has(UNIX_LINES)) 2160 node = new UnixDollar(has(MULTILINE)); 2161 else 2162 node = new Dollar(has(MULTILINE)); 2163 break; 2164 case '.': 2165 next(); 2166 if (has(DOTALL)) { 2167 node = new CharProperty(ALL()); 2168 } else { 2169 if (has(UNIX_LINES)) { 2170 node = new CharProperty(UNIXDOT()); 2171 } else { 2172 node = new CharProperty(DOT()); 2173 } 2174 } 2175 break; 2176 case '|': 2177 case ')': 2178 break LOOP; 2179 case ']': // Now interpreting dangling ] and } as literals 2180 case '}': 2181 node = atom(); 2182 break; 2183 case '?': 2184 case '*': 2185 case '+': 2186 next(); 2187 throw error("Dangling meta character '" + ((char)ch) + "'"); 2188 case 0: 2189 if (cursor >= patternLength) { 2190 break LOOP; 2191 } 2192 // Fall through 2193 default: 2194 node = atom(); 2195 break; 2196 } 2197 2198 node = closure(node); 2199 /* save the top dot-greedy nodes (.*, .+) as well 2200 if (node instanceof GreedyCharProperty && 2201 ((GreedyCharProperty)node).cp instanceof Dot) { 2202 topClosureNodes.add(node); 2203 } 2204 * 2205 if (head == null) { 2206 head = tail = node; 2207 } else { 2208 tail.next = node; 2209 tail = node; 2210 } 2211 } 2212 if (head == null) { 2213 return end; 2214 } 2215 tail.next = end; 2216 root = tail; //double return 2217 return head; 2218 } 2219 2220 @SuppressWarnings("fallthrough") 2221 /** 2222 * Parse and add a new Single or Slice. 2223 * 2224 private Node atom() { 2225 int first = 0; 2226 int prev = -1; 2227 boolean hasSupplementary = false; 2228 int ch = peek(); 2229 for (;;) { 2230 switch (ch) { 2231 case '*': 2232 case '+': 2233 case '?': 2234 case '{': 2235 if (first > 1) { 2236 cursor = prev; // Unwind one character 2237 first--; 2238 } 2239 break; 2240 case '$': 2241 case '.': 2242 case '^': 2243 case '(': 2244 case '[': 2245 case '|': 2246 case ')': 2247 break; 2248 case '\\': 2249 ch = nextEscaped(); 2250 if (ch == 'p' || ch == 'P') { // Property 2251 if (first > 0) { // Slice is waiting; handle it first 2252 unread(); 2253 break; 2254 } else { // No slice; just return the family node 2255 boolean comp = (ch == 'P'); 2256 boolean oneLetter = true; 2257 ch = next(); // Consume { if present 2258 if (ch != '{') 2259 unread(); 2260 else 2261 oneLetter = false; 2262 if (has(CANON_EQ) && !has(LITERAL)) 2263 return new NFCCharProperty(family(oneLetter, comp)); 2264 else 2265 return newCharProperty(family(oneLetter, comp)); 2266 } 2267 } 2268 unread(); 2269 prev = cursor; 2270 ch = escape(false, first == 0, false); 2271 if (ch >= 0) { 2272 append(ch, first); 2273 first++; 2274 if (isSupplementary(ch)) { 2275 hasSupplementary = true; 2276 } 2277 ch = peek(); 2278 continue; 2279 } else if (first == 0) { 2280 return root; 2281 } 2282 // Unwind meta escape sequence 2283 cursor = prev; 2284 break; 2285 case 0: 2286 if (cursor >= patternLength) { 2287 break; 2288 } 2289 // Fall through 2290 default: 2291 prev = cursor; 2292 append(ch, first); 2293 first++; 2294 if (isSupplementary(ch)) { 2295 hasSupplementary = true; 2296 } 2297 ch = next(); 2298 continue; 2299 } 2300 break; 2301 } 2302 if (first == 1) { 2303 return newCharProperty(single(buffer[0])); 2304 } else { 2305 return newSlice(buffer, first, hasSupplementary); 2306 } 2307 } 2308 2309 private void append(int ch, int index) { 2310 int len = buffer.length; 2311 if (index - len >= 0) { 2312 len = ArraysSupport.newLength(len, 2313 1 + index - len, /* minimum growth * / 2314 len /* preferred growth * /); 2315 buffer = Arrays.copyOf(buffer, len); 2316 } 2317 buffer[index] = ch; 2318 } 2319 2320 /** 2321 * Parses a backref greedily, taking as many numbers as it 2322 * can. The first digit is always treated as a backref, but 2323 * multi digit numbers are only treated as a backref if at 2324 * least that many backrefs exist at this point in the regex. 2325 * 2326 private Node ref(int refNum) { 2327 boolean done = false; 2328 while(!done) { 2329 int ch = peek(); 2330 switch (ch) { 2331 case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' -> { 2332 int newRefNum = (refNum * 10) + (ch - '0'); 2333 // Add another number if it doesn't make a group 2334 // that doesn't exist 2335 if (capturingGroupCount - 1 < newRefNum) { 2336 done = true; 2337 break; 2338 } 2339 refNum = newRefNum; 2340 read(); 2341 } 2342 default -> done = true; 2343 } 2344 } 2345 hasGroupRef = true; 2346 if (has(CASE_INSENSITIVE)) 2347 return new CIBackRef(refNum, has(UNICODE_CASE)); 2348 else 2349 return new BackRef(refNum); 2350 } 2351 2352 /** 2353 * Parses an escape sequence to determine the actual value that needs 2354 * to be matched. 2355 * If -1 is returned and create was true a new object was added to the tree 2356 * to handle the escape sequence. 2357 * If the returned value is greater than zero, it is the value that 2358 * matches the escape sequence. 2359 * 2360 private int escape(boolean inclass, boolean create, boolean isrange) { 2361 int ch = skip(); 2362 switch (ch) { 2363 case '0': 2364 return o(); 2365 case '1': 2366 case '2': 2367 case '3': 2368 case '4': 2369 case '5': 2370 case '6': 2371 case '7': 2372 case '8': 2373 case '9': 2374 if (inclass) break; 2375 if (create) { 2376 root = ref((ch - '0')); 2377 } 2378 return -1; 2379 case 'A': 2380 if (inclass) break; 2381 if (create) root = new Begin(); 2382 return -1; 2383 case 'B': 2384 if (inclass) break; 2385 if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS)); 2386 return -1; 2387 case 'C': 2388 break; 2389 case 'D': 2390 if (create) { 2391 predicate = has(UNICODE_CHARACTER_CLASS) ? 2392 CharPredicates.DIGIT() : CharPredicates.ASCII_DIGIT(); 2393 predicate = predicate.negate(); 2394 if (!inclass) 2395 root = newCharProperty(predicate); 2396 } 2397 return -1; 2398 case 'E': 2399 case 'F': 2400 break; 2401 case 'G': 2402 if (inclass) break; 2403 if (create) root = new LastMatch(); 2404 return -1; 2405 case 'H': 2406 if (create) { 2407 predicate = HorizWS().negate(); 2408 if (!inclass) 2409 root = newCharProperty(predicate); 2410 } 2411 return -1; 2412 case 'I': 2413 case 'J': 2414 case 'K': 2415 case 'L': 2416 case 'M': 2417 break; 2418 case 'N': 2419 return N(); 2420 case 'O': 2421 case 'P': 2422 case 'Q': 2423 break; 2424 case 'R': 2425 if (inclass) break; 2426 if (create) root = new LineEnding(); 2427 return -1; 2428 case 'S': 2429 if (create) { 2430 predicate = has(UNICODE_CHARACTER_CLASS) ? 2431 CharPredicates.WHITE_SPACE() : CharPredicates.ASCII_SPACE(); 2432 predicate = predicate.negate(); 2433 if (!inclass) 2434 root = newCharProperty(predicate); 2435 } 2436 return -1; 2437 case 'T': 2438 case 'U': 2439 break; 2440 case 'V': 2441 if (create) { 2442 predicate = VertWS().negate(); 2443 if (!inclass) 2444 root = newCharProperty(predicate); 2445 } 2446 return -1; 2447 case 'W': 2448 if (create) { 2449 predicate = has(UNICODE_CHARACTER_CLASS) ? 2450 CharPredicates.WORD() : CharPredicates.ASCII_WORD(); 2451 predicate = predicate.negate(); 2452 if (!inclass) 2453 root = newCharProperty(predicate); 2454 } 2455 return -1; 2456 case 'X': 2457 if (inclass) break; 2458 if (create) { 2459 root = new XGrapheme(); 2460 } 2461 return -1; 2462 case 'Y': 2463 break; 2464 case 'Z': 2465 if (inclass) break; 2466 if (create) { 2467 if (has(UNIX_LINES)) 2468 root = new UnixDollar(false); 2469 else 2470 root = new Dollar(false); 2471 } 2472 return -1; 2473 case 'a': 2474 return '\007'; 2475 case 'b': 2476 if (inclass) break; 2477 if (create) { 2478 if (peek() == '{') { 2479 if (skip() == 'g') { 2480 if (read() == '}') { 2481 root = new GraphemeBound(); 2482 return -1; 2483 } 2484 break; // error missing trailing } 2485 } 2486 unread(); unread(); 2487 } 2488 root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS)); 2489 } 2490 return -1; 2491 case 'c': 2492 return c(); 2493 case 'd': 2494 if (create) { 2495 predicate = has(UNICODE_CHARACTER_CLASS) ? 2496 CharPredicates.DIGIT() : CharPredicates.ASCII_DIGIT(); 2497 if (!inclass) 2498 root = newCharProperty(predicate); 2499 } 2500 return -1; 2501 case 'e': 2502 return '\033'; 2503 case 'f': 2504 return '\f'; 2505 case 'g': 2506 break; 2507 case 'h': 2508 if (create) { 2509 predicate = HorizWS(); 2510 if (!inclass) 2511 root = newCharProperty(predicate); 2512 } 2513 return -1; 2514 case 'i': 2515 case 'j': 2516 break; 2517 case 'k': 2518 if (inclass) 2519 break; 2520 if (read() != '<') 2521 throw error("\\k is not followed by '<' for named capturing group"); 2522 String name = groupname(read()); 2523 if (!namedGroups().containsKey(name)) 2524 throw error("named capturing group <" + name + "> does not exist"); 2525 if (create) { 2526 hasGroupRef = true; 2527 if (has(CASE_INSENSITIVE)) 2528 root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE)); 2529 else 2530 root = new BackRef(namedGroups().get(name)); 2531 } 2532 return -1; 2533 case 'l': 2534 case 'm': 2535 break; 2536 case 'n': 2537 return '\n'; 2538 case 'o': 2539 case 'p': 2540 case 'q': 2541 break; 2542 case 'r': 2543 return '\r'; 2544 case 's': 2545 if (create) { 2546 predicate = has(UNICODE_CHARACTER_CLASS) ? 2547 CharPredicates.WHITE_SPACE() : CharPredicates.ASCII_SPACE(); 2548 if (!inclass) 2549 root = newCharProperty(predicate); 2550 } 2551 return -1; 2552 case 't': 2553 return '\t'; 2554 case 'u': 2555 return u(); 2556 case 'v': 2557 // '\v' was implemented as VT/0x0B in releases < 1.8 (though 2558 // undocumented). In JDK8 '\v' is specified as a predefined 2559 // character class for all vertical whitespace characters. 2560 // So [-1, root=VertWS node] pair is returned (instead of a 2561 // single 0x0B). This breaks the range if '\v' is used as 2562 // the start or end value, such as [\v-...] or [...-\v], in 2563 // which a single definite value (0x0B) is expected. For 2564 // compatibility concern '\013'/0x0B is returned if isrange. 2565 if (isrange) 2566 return '\013'; 2567 if (create) { 2568 predicate = VertWS(); 2569 if (!inclass) 2570 root = newCharProperty(predicate); 2571 } 2572 return -1; 2573 case 'w': 2574 if (create) { 2575 predicate = has(UNICODE_CHARACTER_CLASS) ? 2576 CharPredicates.WORD() : CharPredicates.ASCII_WORD(); 2577 if (!inclass) 2578 root = newCharProperty(predicate); 2579 } 2580 return -1; 2581 case 'x': 2582 return x(); 2583 case 'y': 2584 break; 2585 case 'z': 2586 if (inclass) break; 2587 if (create) root = new End(); 2588 return -1; 2589 default: 2590 return ch; 2591 } 2592 throw error("Illegal/unsupported escape sequence"); 2593 } 2594 2595 /** 2596 * Parse a character class, and return the node that matches it. 2597 * 2598 * Consumes a ] on the way out if consume is true. Usually consume 2599 * is true except for the case of [abc&&def] where def is a separate 2600 * right hand node with "understood" brackets. 2601 * 2602 private CharPredicate clazz(boolean consume) { 2603 CharPredicate prev = null; 2604 CharPredicate curr = null; 2605 BitClass bits = new BitClass(); 2606 2607 boolean isNeg = false; 2608 boolean hasBits = false; 2609 int ch = next(); 2610 2611 // Negates if first char in a class, otherwise literal 2612 if (ch == '^' && temp[cursor-1] == '[') { 2613 ch = next(); 2614 isNeg = true; 2615 } 2616 for (;;) { 2617 switch (ch) { 2618 case '[': 2619 curr = clazz(true); 2620 if (prev == null) 2621 prev = curr; 2622 else 2623 prev = prev.union(curr); 2624 ch = peek(); 2625 continue; 2626 case '&': 2627 ch = next(); 2628 if (ch == '&') { 2629 ch = next(); 2630 CharPredicate right = null; 2631 while (ch != ']' && ch != '&') { 2632 if (ch == '[') { 2633 if (right == null) 2634 right = clazz(true); 2635 else 2636 right = right.union(clazz(true)); 2637 } else { // abc&&def 2638 unread(); 2639 if (right == null) { 2640 right = clazz(false); 2641 } else { 2642 right = right.union(clazz(false)); 2643 } 2644 } 2645 ch = peek(); 2646 } 2647 if (hasBits) { 2648 // bits used, union has high precedence 2649 if (prev == null) { 2650 prev = curr = bits; 2651 } else { 2652 prev = prev.union(bits); 2653 } 2654 hasBits = false; 2655 } 2656 if (right != null) 2657 curr = right; 2658 if (prev == null) { 2659 if (right == null) 2660 throw error("Bad class syntax"); 2661 else 2662 prev = right; 2663 } else { 2664 prev = prev.and(curr); 2665 } 2666 } else { 2667 // treat as a literal & 2668 unread(); 2669 break; 2670 } 2671 continue; 2672 case 0: 2673 if (cursor >= patternLength) 2674 throw error("Unclosed character class"); 2675 break; 2676 case ']': 2677 if (prev != null || hasBits) { 2678 if (consume) 2679 next(); 2680 if (prev == null) 2681 prev = bits; 2682 else if (hasBits) 2683 prev = prev.union(bits); 2684 if (isNeg) 2685 return prev.negate(); 2686 return prev; 2687 } 2688 break; 2689 default: 2690 break; 2691 } 2692 curr = range(bits); 2693 if (curr == null) { // the bits used 2694 hasBits = true; 2695 } else { 2696 if (prev == null) 2697 prev = curr; 2698 else if (prev != curr) 2699 prev = prev.union(curr); 2700 } 2701 ch = peek(); 2702 } 2703 } 2704 2705 private CharPredicate bitsOrSingle(BitClass bits, int ch) { 2706 /* Bits can only handle codepoints in [u+0000-u+00ff] range. 2707 Use "single" node instead of bits when dealing with unicode 2708 case folding for codepoints listed below. 2709 (1)Uppercase out of range: u+00ff, u+00b5 2710 toUpperCase(u+00ff) -> u+0178 2711 toUpperCase(u+00b5) -> u+039c 2712 (2)LatinSmallLetterLongS u+17f 2713 toUpperCase(u+017f) -> u+0053 2714 (3)LatinSmallLetterDotlessI u+131 2715 toUpperCase(u+0131) -> u+0049 2716 (4)LatinCapitalLetterIWithDotAbove u+0130 2717 toLowerCase(u+0130) -> u+0069 2718 (5)KelvinSign u+212a 2719 toLowerCase(u+212a) ==> u+006B 2720 (6)AngstromSign u+212b 2721 toLowerCase(u+212b) ==> u+00e5 2722 * 2723 if (ch < 256 && 2724 !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) && 2725 (ch == 0xff || ch == 0xb5 || 2726 ch == 0x49 || ch == 0x69 || //I and i 2727 ch == 0x53 || ch == 0x73 || //S and s 2728 ch == 0x4b || ch == 0x6b || //K and k 2729 ch == 0xc5 || ch == 0xe5))) { //A+ring 2730 bits.add(ch, flags0); 2731 return null; 2732 } 2733 return single(ch); 2734 } 2735 2736 /** 2737 * Returns a suitably optimized, single character predicate 2738 * 2739 private CharPredicate single(final int ch) { 2740 if (has(CASE_INSENSITIVE)) { 2741 int lower, upper; 2742 if (has(UNICODE_CASE)) { 2743 upper = Character.toUpperCase(ch); 2744 lower = Character.toLowerCase(upper); 2745 // Unicode case insensitive matches 2746 if (upper != lower) 2747 return SingleU(lower); 2748 } else if (ASCII.isAscii(ch)) { 2749 lower = ASCII.toLower(ch); 2750 upper = ASCII.toUpper(ch); 2751 // Case insensitive matches a given BMP character 2752 if (lower != upper) 2753 return SingleI(lower, upper); 2754 } 2755 } 2756 if (isSupplementary(ch)) 2757 return SingleS(ch); 2758 return Single(ch); // Match a given BMP character 2759 } 2760 2761 /** 2762 * Parse a single character or a character range in a character class 2763 * and return its representative node. 2764 * 2765 private CharPredicate range(BitClass bits) { 2766 int ch = peek(); 2767 if (ch == '\\') { 2768 ch = nextEscaped(); 2769 if (ch == 'p' || ch == 'P') { // A property 2770 boolean comp = (ch == 'P'); 2771 boolean oneLetter = true; 2772 // Consume { if present 2773 ch = next(); 2774 if (ch != '{') 2775 unread(); 2776 else 2777 oneLetter = false; 2778 return family(oneLetter, comp); 2779 } else { // ordinary escape 2780 boolean isrange = temp[cursor+1] == '-'; 2781 unread(); 2782 ch = escape(true, true, isrange); 2783 if (ch == -1) 2784 return predicate; 2785 } 2786 } else { 2787 next(); 2788 } 2789 if (ch >= 0) { 2790 if (peek() == '-') { 2791 int endRange = temp[cursor+1]; 2792 if (endRange == '[') { 2793 return bitsOrSingle(bits, ch); 2794 } 2795 if (endRange != ']') { 2796 next(); 2797 int m = peek(); 2798 if (m == '\\') { 2799 m = escape(true, false, true); 2800 } else { 2801 next(); 2802 } 2803 if (m < ch) { 2804 throw error("Illegal character range"); 2805 } 2806 if (has(CASE_INSENSITIVE)) { 2807 if (has(UNICODE_CASE)) 2808 return CIRangeU(ch, m); 2809 return CIRange(ch, m); 2810 } else { 2811 return Range(ch, m); 2812 } 2813 } 2814 } 2815 return bitsOrSingle(bits, ch); 2816 } 2817 throw error("Unexpected character '"+((char)ch)+"'"); 2818 } 2819 2820 /** 2821 * Parses a Unicode character family and returns its representative node. 2822 * 2823 private CharPredicate family(boolean singleLetter, boolean isComplement) { 2824 next(); 2825 String name; 2826 CharPredicate p = null; 2827 2828 if (singleLetter) { 2829 int c = temp[cursor]; 2830 if (!Character.isSupplementaryCodePoint(c)) { 2831 name = String.valueOf((char)c); 2832 } else { 2833 name = new String(temp, cursor, 1); 2834 } 2835 read(); 2836 } else { 2837 int i = cursor; 2838 mark('}'); 2839 while(read() != '}') { 2840 } 2841 mark('\000'); 2842 int j = cursor; 2843 if (j > patternLength) 2844 throw error("Unclosed character family"); 2845 if (i + 1 >= j) 2846 throw error("Empty character family"); 2847 name = new String(temp, i, j-i-1); 2848 } 2849 2850 int i = name.indexOf('='); 2851 if (i != -1) { 2852 // property construct \p{name=value} 2853 String value = name.substring(i + 1); 2854 name = name.substring(0, i).toLowerCase(Locale.ENGLISH); 2855 switch (name) { 2856 case "sc": 2857 case "script": 2858 p = CharPredicates.forUnicodeScript(value); 2859 break; 2860 case "blk": 2861 case "block": 2862 p = CharPredicates.forUnicodeBlock(value); 2863 break; 2864 case "gc": 2865 case "general_category": 2866 p = CharPredicates.forProperty(value, has(CASE_INSENSITIVE)); 2867 break; 2868 default: 2869 break; 2870 } 2871 if (p == null) 2872 throw error("Unknown Unicode property {name=<" + name + ">, " 2873 + "value=<" + value + ">}"); 2874 2875 } else { 2876 if (name.startsWith("In")) { 2877 // \p{InBlockName} 2878 p = CharPredicates.forUnicodeBlock(name.substring(2)); 2879 } else if (name.startsWith("Is")) { 2880 // \p{IsGeneralCategory} and \p{IsScriptName} 2881 String shortName = name.substring(2); 2882 p = CharPredicates.forUnicodeProperty(shortName, has(CASE_INSENSITIVE)); 2883 if (p == null) 2884 p = CharPredicates.forProperty(shortName, has(CASE_INSENSITIVE)); 2885 if (p == null) 2886 p = CharPredicates.forUnicodeScript(shortName); 2887 } else { 2888 if (has(UNICODE_CHARACTER_CLASS)) 2889 p = CharPredicates.forPOSIXName(name, has(CASE_INSENSITIVE)); 2890 if (p == null) 2891 p = CharPredicates.forProperty(name, has(CASE_INSENSITIVE)); 2892 } 2893 if (p == null) 2894 throw error("Unknown character property name {" + name + "}"); 2895 } 2896 if (isComplement) { 2897 // it might be too expensive to detect if a complement of 2898 // CharProperty can match "certain" supplementary. So just 2899 // go with StartS. 2900 hasSupplementary = true; 2901 p = p.negate(); 2902 } 2903 return p; 2904 } 2905 2906 private CharProperty newCharProperty(CharPredicate p) { 2907 if (p == null) 2908 return null; 2909 if (p instanceof BmpCharPredicate) 2910 return new BmpCharProperty((BmpCharPredicate)p); 2911 else { 2912 hasSupplementary = true; 2913 return new CharProperty(p); 2914 } 2915 } 2916 2917 /** 2918 * Parses and returns the name of a "named capturing group", the trailing 2919 * ">" is consumed after parsing. 2920 * 2921 private String groupname(int ch) { 2922 StringBuilder sb = new StringBuilder(); 2923 if (!ASCII.isAlpha(ch)) 2924 throw error("capturing group name does not start with a Latin letter"); 2925 do { 2926 sb.append((char) ch); 2927 } while (ASCII.isAlnum(ch=read())); 2928 if (ch != '>') 2929 throw error("named capturing group is missing trailing '>'"); 2930 return sb.toString(); 2931 } 2932 2933 /** 2934 * Parses a group and returns the head node of a set of nodes that process 2935 * the group. Sometimes a double return system is used where the tail is 2936 * returned in root. 2937 * 2938 private Node group0() { 2939 boolean capturingGroup = false; 2940 Node head; 2941 Node tail; 2942 int save = flags0; 2943 int saveTCNCount = topClosureNodes.size(); 2944 root = null; 2945 int ch = next(); 2946 if (ch == '?') { 2947 ch = skip(); 2948 switch (ch) { 2949 case ':' -> { // (?:xxx) pure group 2950 head = createGroup(true); 2951 tail = root; 2952 head.next = expr(tail); 2953 } 2954 case '=', '!' -> { // (?=xxx) and (?!xxx) lookahead 2955 head = createGroup(true); 2956 tail = root; 2957 head.next = expr(tail); 2958 if (ch == '=') { 2959 head = tail = new Pos(head); 2960 } else { 2961 head = tail = new Neg(head); 2962 } 2963 } 2964 case '>' -> { // (?>xxx) independent group 2965 head = createGroup(true); 2966 tail = root; 2967 head.next = expr(tail); 2968 head = tail = new Ques(head, Qtype.INDEPENDENT); 2969 } 2970 case '<' -> { // (?<xxx) look behind 2971 ch = read(); 2972 if (ch != '=' && ch != '!') { 2973 // named captured group 2974 String name = groupname(ch); 2975 if (namedGroups().containsKey(name)) 2976 throw error("Named capturing group <" + name 2977 + "> is already defined"); 2978 capturingGroup = true; 2979 head = createGroup(false); 2980 tail = root; 2981 namedGroups().put(name, capturingGroupCount - 1); 2982 head.next = expr(tail); 2983 break; 2984 } 2985 int start = cursor; 2986 head = createGroup(true); 2987 tail = root; 2988 head.next = expr(tail); 2989 tail.next = LookBehindEndNode.INSTANCE; 2990 TreeInfo info = new TreeInfo(); 2991 head.study(info); 2992 if (info.maxValid == false) { 2993 throw error("Look-behind group does not have " 2994 + "an obvious maximum length"); 2995 } 2996 boolean hasSupplementary = findSupplementary(start, patternLength); 2997 if (ch == '=') { 2998 head = tail = (hasSupplementary ? 2999 new BehindS(head, info.maxLength, 3000 info.minLength) : 3001 new Behind(head, info.maxLength, 3002 info.minLength)); 3003 } else { // if (ch == '!') 3004 head = tail = (hasSupplementary ? 3005 new NotBehindS(head, info.maxLength, 3006 info.minLength) : 3007 new NotBehind(head, info.maxLength, 3008 info.minLength)); 3009 } 3010 // clear all top-closure-nodes inside lookbehind 3011 if (saveTCNCount < topClosureNodes.size()) 3012 topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear(); 3013 } 3014 case '$', '@' -> throw error("Unknown group type"); 3015 default -> { // (?xxx:) inlined match flags 3016 unread(); 3017 addFlag(); 3018 ch = read(); 3019 if (ch == ')') { 3020 return null; // Inline modifier only 3021 } 3022 if (ch != ':') { 3023 throw error("Unknown inline modifier"); 3024 } 3025 head = createGroup(true); 3026 tail = root; 3027 head.next = expr(tail); 3028 } 3029 } 3030 } else { // (xxx) a regular group 3031 capturingGroup = true; 3032 head = createGroup(false); 3033 tail = root; 3034 head.next = expr(tail); 3035 } 3036 3037 accept(')', "Unclosed group"); 3038 flags0 = save; 3039 3040 // Check for quantifiers 3041 Node node = closure(head); 3042 if (node == head) { // No closure 3043 root = tail; 3044 return node; // Dual return 3045 } 3046 if (head == tail) { // Zero length assertion 3047 root = node; 3048 return node; // Dual return 3049 } 3050 3051 // have group closure, clear all inner closure nodes from the 3052 // top list (no backtracking stopper optimization for inner 3053 if (saveTCNCount < topClosureNodes.size()) 3054 topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear(); 3055 3056 if (node instanceof Ques ques) { 3057 if (ques.type == Qtype.POSSESSIVE) { 3058 root = node; 3059 return node; 3060 } 3061 tail.next = new BranchConn(); 3062 tail = tail.next; 3063 if (ques.type == Qtype.GREEDY) { 3064 head = new Branch(head, null, tail); 3065 } else { // Reluctant quantifier 3066 head = new Branch(null, head, tail); 3067 } 3068 root = tail; 3069 return head; 3070 } else if (node instanceof Curly curly) { 3071 if (curly.type == Qtype.POSSESSIVE) { 3072 root = node; 3073 return node; 3074 } 3075 // Discover if the group is deterministic 3076 TreeInfo info = new TreeInfo(); 3077 if (head.study(info)) { // Deterministic 3078 GroupTail temp = (GroupTail) tail; 3079 head = root = new GroupCurly(head.next, curly.cmin, 3080 curly.cmax, curly.type, 3081 ((GroupTail)tail).localIndex, 3082 ((GroupTail)tail).groupIndex, 3083 capturingGroup); 3084 return head; 3085 } else { // Non-deterministic 3086 int temp = ((GroupHead) head).localIndex; 3087 Loop loop; 3088 if (curly.type == Qtype.GREEDY) { 3089 loop = new Loop(this.localCount, temp); 3090 // add the max_reps greedy to the top-closure-node list 3091 if (curly.cmax == MAX_REPS) 3092 topClosureNodes.add(loop); 3093 } else { // Reluctant Curly 3094 loop = new LazyLoop(this.localCount, temp); 3095 } 3096 Prolog prolog = new Prolog(loop); 3097 this.localCount += 1; 3098 loop.cmin = curly.cmin; 3099 loop.cmax = curly.cmax; 3100 loop.body = head; 3101 tail.next = loop; 3102 root = loop; 3103 return prolog; // Dual return 3104 } 3105 } 3106 throw error("Internal logic error"); 3107 } 3108 3109 /** 3110 * Create group head and tail nodes using double return. If the group is 3111 * created with anonymous true then it is a pure group and should not 3112 * affect group counting. 3113 * 3114 private Node createGroup(boolean anonymous) { 3115 int localIndex = localCount++; 3116 int groupIndex = 0; 3117 if (!anonymous) 3118 groupIndex = capturingGroupCount++; 3119 GroupHead head = new GroupHead(localIndex); 3120 root = new GroupTail(localIndex, groupIndex); 3121 3122 // for debug/print only, head.match does NOT need the "tail" info 3123 head.tail = (GroupTail)root; 3124 3125 if (!anonymous && groupIndex < 10) 3126 groupNodes[groupIndex] = head; 3127 return head; 3128 } 3129 3130 @SuppressWarnings("fallthrough") 3131 /** 3132 * Parses inlined match flags and set them appropriately. 3133 * 3134 private void addFlag() { 3135 int ch = peek(); 3136 for (;;) { 3137 switch (ch) { 3138 case 'i': 3139 flags0 |= CASE_INSENSITIVE; 3140 break; 3141 case 'm': 3142 flags0 |= MULTILINE; 3143 break; 3144 case 's': 3145 flags0 |= DOTALL; 3146 break; 3147 case 'd': 3148 flags0 |= UNIX_LINES; 3149 break; 3150 case 'u': 3151 flags0 |= UNICODE_CASE; 3152 break; 3153 case 'c': 3154 flags0 |= CANON_EQ; 3155 break; 3156 case 'x': 3157 flags0 |= COMMENTS; 3158 break; 3159 case 'U': 3160 flags0 |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE); 3161 break; 3162 case '-': // subFlag then fall through 3163 ch = next(); 3164 subFlag(); 3165 default: 3166 return; 3167 } 3168 ch = next(); 3169 } 3170 } 3171 3172 @SuppressWarnings("fallthrough") 3173 /** 3174 * Parses the second part of inlined match flags and turns off 3175 * flags appropriately. 3176 * 3177 private void subFlag() { 3178 int ch = peek(); 3179 for (;;) { 3180 switch (ch) { 3181 case 'i': 3182 flags0 &= ~CASE_INSENSITIVE; 3183 break; 3184 case 'm': 3185 flags0 &= ~MULTILINE; 3186 break; 3187 case 's': 3188 flags0 &= ~DOTALL; 3189 break; 3190 case 'd': 3191 flags0 &= ~UNIX_LINES; 3192 break; 3193 case 'u': 3194 flags0 &= ~UNICODE_CASE; 3195 break; 3196 case 'c': 3197 flags0 &= ~CANON_EQ; 3198 break; 3199 case 'x': 3200 flags0 &= ~COMMENTS; 3201 break; 3202 case 'U': 3203 flags0 &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE); 3204 break; 3205 default: 3206 return; 3207 } 3208 ch = next(); 3209 } 3210 } 3211 3212 static final int MAX_REPS = 0x7FFFFFFF; 3213 3214 static enum Qtype { 3215 GREEDY, LAZY, POSSESSIVE, INDEPENDENT 3216 } 3217 3218 private Qtype qtype() { 3219 int ch = next(); 3220 if (ch == '?') { 3221 next(); 3222 return Qtype.LAZY; 3223 } else if (ch == '+') { 3224 next(); 3225 return Qtype.POSSESSIVE; 3226 } 3227 return Qtype.GREEDY; 3228 } 3229 3230 private Node curly(Node prev, int cmin) { 3231 Qtype qtype = qtype(); 3232 if (qtype == Qtype.GREEDY) { 3233 if (prev instanceof BmpCharProperty) { 3234 return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin); 3235 } else if (prev instanceof CharProperty) { 3236 return new CharPropertyGreedy((CharProperty)prev, cmin); 3237 } 3238 } 3239 return new Curly(prev, cmin, MAX_REPS, qtype); 3240 } 3241 3242 /** 3243 * Processes repetition. If the next character peeked is a quantifier 3244 * then new nodes must be appended to handle the repetition. 3245 * Prev could be a single or a group, so it could be a chain of nodes. 3246 * 3247 private Node closure(Node prev) { 3248 int ch = peek(); 3249 switch (ch) { 3250 case '?': 3251 return new Ques(prev, qtype()); 3252 case '*': 3253 return curly(prev, 0); 3254 case '+': 3255 return curly(prev, 1); 3256 case '{': 3257 ch = skip(); 3258 if (ASCII.isDigit(ch)) { 3259 int cmin = 0, cmax; 3260 try { 3261 do { 3262 cmin = Math.addExact(Math.multiplyExact(cmin, 10), 3263 ch - '0'); 3264 } while (ASCII.isDigit(ch = read())); 3265 if (ch == ',') { 3266 ch = read(); 3267 if (ch == '}') { 3268 unread(); 3269 return curly(prev, cmin); 3270 } else { 3271 cmax = 0; 3272 while (ASCII.isDigit(ch)) { 3273 cmax = Math.addExact(Math.multiplyExact(cmax, 10), 3274 ch - '0'); 3275 ch = read(); 3276 } 3277 } 3278 } else { 3279 cmax = cmin; 3280 } 3281 } catch (ArithmeticException ae) { 3282 throw error("Illegal repetition range"); 3283 } 3284 if (ch != '}') 3285 throw error("Unclosed counted closure"); 3286 if (cmax < cmin) 3287 throw error("Illegal repetition range"); 3288 unread(); 3289 return (cmin == 0 && cmax == 1) 3290 ? new Ques(prev, qtype()) 3291 : new Curly(prev, cmin, cmax, qtype()); 3292 } else { 3293 throw error("Illegal repetition"); 3294 } 3295 default: 3296 return prev; 3297 } 3298 } 3299 3300 /** 3301 * Utility method for parsing control escape sequences. 3302 * 3303 private int c() { 3304 if (cursor < patternLength) { 3305 return read() ^ 64; 3306 } 3307 throw error("Illegal control escape sequence"); 3308 } 3309 3310 /** 3311 * Utility method for parsing octal escape sequences. 3312 * 3313 private int o() { 3314 int n = read(); 3315 if (((n-'0')|('7'-n)) >= 0) { 3316 int m = read(); 3317 if (((m-'0')|('7'-m)) >= 0) { 3318 int o = read(); 3319 if ((((o-'0')|('7'-o)) >= 0) && (((n-'0')|('3'-n)) >= 0)) { 3320 return (n - '0') * 64 + (m - '0') * 8 + (o - '0'); 3321 } 3322 unread(); 3323 return (n - '0') * 8 + (m - '0'); 3324 } 3325 unread(); 3326 return (n - '0'); 3327 } 3328 throw error("Illegal octal escape sequence"); 3329 } 3330 3331 /** 3332 * Utility method for parsing hexadecimal escape sequences. 3333 * 3334 private int x() { 3335 int n = read(); 3336 if (ASCII.isHexDigit(n)) { 3337 int m = read(); 3338 if (ASCII.isHexDigit(m)) { 3339 return ASCII.toDigit(n) * 16 + ASCII.toDigit(m); 3340 } 3341 } else if (n == '{' && ASCII.isHexDigit(peek())) { 3342 int ch = 0; 3343 while (ASCII.isHexDigit(n = read())) { 3344 ch = (ch << 4) + ASCII.toDigit(n); 3345 if (ch > Character.MAX_CODE_POINT) 3346 throw error("Hexadecimal codepoint is too big"); 3347 } 3348 if (n != '}') 3349 throw error("Unclosed hexadecimal escape sequence"); 3350 return ch; 3351 } 3352 throw error("Illegal hexadecimal escape sequence"); 3353 } 3354 3355 /** 3356 * Utility method for parsing unicode escape sequences. 3357 * 3358 private int cursor() { 3359 return cursor; 3360 } 3361 3362 private void setcursor(int pos) { 3363 cursor = pos; 3364 } 3365 3366 private int uxxxx() { 3367 int n = 0; 3368 for (int i = 0; i < 4; i++) { 3369 int ch = read(); 3370 if (!ASCII.isHexDigit(ch)) { 3371 throw error("Illegal Unicode escape sequence"); 3372 } 3373 n = n * 16 + ASCII.toDigit(ch); 3374 } 3375 return n; 3376 } 3377 3378 private int u() { 3379 int n = uxxxx(); 3380 if (Character.isHighSurrogate((char)n)) { 3381 int cur = cursor(); 3382 if (read() == '\\' && read() == 'u') { 3383 int n2 = uxxxx(); 3384 if (Character.isLowSurrogate((char)n2)) 3385 return Character.toCodePoint((char)n, (char)n2); 3386 } 3387 setcursor(cur); 3388 } 3389 return n; 3390 } 3391 3392 private int N() { 3393 if (read() == '{') { 3394 int i = cursor; 3395 while (read() != '}') { 3396 if (cursor >= patternLength) 3397 throw error("Unclosed character name escape sequence"); 3398 } 3399 String name = new String(temp, i, cursor - i - 1); 3400 try { 3401 return Character.codePointOf(name); 3402 } catch (IllegalArgumentException x) { 3403 throw error("Unknown character name [" + name + "]"); 3404 } 3405 } 3406 throw error("Illegal character name escape sequence"); 3407 } 3408 3409 // 3410 // Utility methods for code point support 3411 // 3412 private static final int countChars(CharSequence seq, int index, 3413 int lengthInCodePoints) { 3414 // optimization 3415 if (lengthInCodePoints == 1 && index >= 0 && index < seq.length() && 3416 !Character.isHighSurrogate(seq.charAt(index))) { 3417 return 1; 3418 } 3419 int length = seq.length(); 3420 int x = index; 3421 if (lengthInCodePoints >= 0) { 3422 assert ((length == 0 && index == 0) || index >= 0 && index < length); 3423 for (int i = 0; x < length && i < lengthInCodePoints; i++) { 3424 if (Character.isHighSurrogate(seq.charAt(x++))) { 3425 if (x < length && Character.isLowSurrogate(seq.charAt(x))) { 3426 x++; 3427 } 3428 } 3429 } 3430 return x - index; 3431 } 3432 3433 assert (index >= 0 && index <= length); 3434 if (index == 0) { 3435 return 0; 3436 } 3437 int len = -lengthInCodePoints; 3438 for (int i = 0; x > 0 && i < len; i++) { 3439 if (Character.isLowSurrogate(seq.charAt(--x))) { 3440 if (x > 0 && Character.isHighSurrogate(seq.charAt(x-1))) { 3441 x--; 3442 } 3443 } 3444 } 3445 return index - x; 3446 } 3447 3448 private static final int countCodePoints(CharSequence seq) { 3449 int length = seq.length(); 3450 int n = 0; 3451 for (int i = 0; i < length; ) { 3452 n++; 3453 if (Character.isHighSurrogate(seq.charAt(i++))) { 3454 if (i < length && Character.isLowSurrogate(seq.charAt(i))) { 3455 i++; 3456 } 3457 } 3458 } 3459 return n; 3460 } 3461 3462 /** 3463 * Creates a bit vector for matching Latin-1 values. A normal BitClass 3464 * never matches values above Latin-1, and a complemented BitClass always 3465 * matches values above Latin-1. 3466 * 3467 static final class BitClass implements BmpCharPredicate { 3468 final boolean[] bits; 3469 BitClass() { 3470 bits = new boolean[256]; 3471 } 3472 BitClass add(int c, int flags) { 3473 assert c >= 0 && c <= 255; 3474 if ((flags & CASE_INSENSITIVE) != 0) { 3475 if (ASCII.isAscii(c)) { 3476 bits[ASCII.toUpper(c)] = true; 3477 bits[ASCII.toLower(c)] = true; 3478 } else if ((flags & UNICODE_CASE) != 0) { 3479 bits[Character.toLowerCase(c)] = true; 3480 bits[Character.toUpperCase(c)] = true; 3481 } 3482 } 3483 bits[c] = true; 3484 return this; 3485 } 3486 public boolean is(int ch) { 3487 return ch < 256 && bits[ch]; 3488 } 3489 } 3490 3491 3492 /** 3493 * Utility method for creating a string slice matcher. 3494 * 3495 private Node newSlice(int[] buf, int count, boolean hasSupplementary) { 3496 int[] tmp = new int[count]; 3497 if (has(CASE_INSENSITIVE)) { 3498 if (has(UNICODE_CASE)) { 3499 for (int i = 0; i < count; i++) { 3500 tmp[i] = Character.toLowerCase( 3501 Character.toUpperCase(buf[i])); 3502 } 3503 return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp); 3504 } 3505 for (int i = 0; i < count; i++) { 3506 tmp[i] = ASCII.toLower(buf[i]); 3507 } 3508 return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp); 3509 } 3510 for (int i = 0; i < count; i++) { 3511 tmp[i] = buf[i]; 3512 } 3513 return hasSupplementary ? new SliceS(tmp) : new Slice(tmp); 3514 } 3515 */ 3516 // END Android-removed: Reimplement matching logic via ICU4C. 3517 3518 // BEGIN Android-changed: reimplement matching logic natively via ICU. 3519 // Use native implementation instead of > 3000 lines of helper methods. compile()3520 private void compile() throws PatternSyntaxException { 3521 if (pattern == null) { 3522 throw new NullPointerException("pattern == null"); 3523 } 3524 3525 String icuPattern = pattern; 3526 if ((flags & LITERAL) != 0) { 3527 icuPattern = quote(pattern); 3528 } 3529 3530 // These are the flags natively supported by ICU. 3531 // They even have the same value in native code. 3532 int icuFlags = flags & (CASE_INSENSITIVE | COMMENTS | MULTILINE | DOTALL | UNIX_LINES); 3533 nativePattern = PatternNative.create(icuPattern, icuFlags); 3534 } 3535 // END Android-changed: reimplement matching logic natively via ICU. 3536 3537 // BEGIN Android-removed: Reimplement matching logic via ICU4C. 3538 /** 3539 * Node to anchor at the beginning of input. This object implements the 3540 * match for a \A sequence, and the caret anchor will use this if not in 3541 * multiline mode. 3542 * 3543 static final class Begin extends Node { 3544 boolean match(Matcher matcher, int i, CharSequence seq) { 3545 int fromIndex = (matcher.anchoringBounds) ? 3546 matcher.from : 0; 3547 if (i == fromIndex && next.match(matcher, i, seq)) { 3548 matcher.first = i; 3549 matcher.groups[0] = i; 3550 matcher.groups[1] = matcher.last; 3551 return true; 3552 } else { 3553 return false; 3554 } 3555 } 3556 } 3557 3558 /** 3559 * Node to anchor at the end of input. This is the absolute end, so this 3560 * should not match at the last newline before the end as $ will. 3561 * 3562 static final class End extends Node { 3563 boolean match(Matcher matcher, int i, CharSequence seq) { 3564 int endIndex = (matcher.anchoringBounds) ? 3565 matcher.to : matcher.getTextLength(); 3566 if (i == endIndex) { 3567 matcher.hitEnd = true; 3568 return next.match(matcher, i, seq); 3569 } 3570 return false; 3571 } 3572 } 3573 3574 /** 3575 * Node to anchor at the beginning of a line. This is essentially the 3576 * object to match for the multiline ^. 3577 * 3578 static final class Caret extends Node { 3579 boolean match(Matcher matcher, int i, CharSequence seq) { 3580 int startIndex = matcher.from; 3581 int endIndex = matcher.to; 3582 if (!matcher.anchoringBounds) { 3583 startIndex = 0; 3584 endIndex = matcher.getTextLength(); 3585 } 3586 // Perl does not match ^ at end of input even after newline 3587 if (i == endIndex) { 3588 matcher.hitEnd = true; 3589 return false; 3590 } 3591 if (i > startIndex) { 3592 char ch = seq.charAt(i-1); 3593 if (ch != '\n' && ch != '\r' 3594 && (ch|1) != '\u2029' 3595 && ch != '\u0085' ) { 3596 return false; 3597 } 3598 // Should treat /r/n as one newline 3599 if (ch == '\r' && seq.charAt(i) == '\n') 3600 return false; 3601 } 3602 return next.match(matcher, i, seq); 3603 } 3604 } 3605 3606 /** 3607 * Node to anchor at the beginning of a line when in unixdot mode. 3608 * 3609 static final class UnixCaret extends Node { 3610 boolean match(Matcher matcher, int i, CharSequence seq) { 3611 int startIndex = matcher.from; 3612 int endIndex = matcher.to; 3613 if (!matcher.anchoringBounds) { 3614 startIndex = 0; 3615 endIndex = matcher.getTextLength(); 3616 } 3617 // Perl does not match ^ at end of input even after newline 3618 if (i == endIndex) { 3619 matcher.hitEnd = true; 3620 return false; 3621 } 3622 if (i > startIndex) { 3623 char ch = seq.charAt(i-1); 3624 if (ch != '\n') { 3625 return false; 3626 } 3627 } 3628 return next.match(matcher, i, seq); 3629 } 3630 } 3631 3632 /** 3633 * Node to match the location where the last match ended. 3634 * This is used for the \G construct. 3635 * 3636 static final class LastMatch extends Node { 3637 boolean match(Matcher matcher, int i, CharSequence seq) { 3638 if (i != matcher.oldLast) 3639 return false; 3640 return next.match(matcher, i, seq); 3641 } 3642 } 3643 3644 /** 3645 * Node to anchor at the end of a line or the end of input based on the 3646 * multiline mode. 3647 * 3648 * When not in multiline mode, the $ can only match at the very end 3649 * of the input, unless the input ends in a line terminator in which 3650 * it matches right before the last line terminator. 3651 * 3652 * Note that \r\n is considered an atomic line terminator. 3653 * 3654 * Like ^ the $ operator matches at a position, it does not match the 3655 * line terminators themselves. 3656 * 3657 static final class Dollar extends Node { 3658 boolean multiline; 3659 Dollar(boolean mul) { 3660 multiline = mul; 3661 } 3662 boolean match(Matcher matcher, int i, CharSequence seq) { 3663 int endIndex = (matcher.anchoringBounds) ? 3664 matcher.to : matcher.getTextLength(); 3665 if (!multiline) { 3666 if (i < endIndex - 2) 3667 return false; 3668 if (i == endIndex - 2) { 3669 char ch = seq.charAt(i); 3670 if (ch != '\r') 3671 return false; 3672 ch = seq.charAt(i + 1); 3673 if (ch != '\n') 3674 return false; 3675 } 3676 } 3677 // Matches before any line terminator; also matches at the 3678 // end of input 3679 // Before line terminator: 3680 // If multiline, we match here no matter what 3681 // If not multiline, fall through so that the end 3682 // is marked as hit; this must be a /r/n or a /n 3683 // at the very end so the end was hit; more input 3684 // could make this not match here 3685 if (i < endIndex) { 3686 char ch = seq.charAt(i); 3687 if (ch == '\n') { 3688 // No match between \r\n 3689 if (i > 0 && seq.charAt(i-1) == '\r') 3690 return false; 3691 if (multiline) 3692 return next.match(matcher, i, seq); 3693 } else if (ch == '\r' || ch == '\u0085' || 3694 (ch|1) == '\u2029') { 3695 if (multiline) 3696 return next.match(matcher, i, seq); 3697 } else { // No line terminator, no match 3698 return false; 3699 } 3700 } 3701 // Matched at current end so hit end 3702 matcher.hitEnd = true; 3703 // If a $ matches because of end of input, then more input 3704 // could cause it to fail! 3705 matcher.requireEnd = true; 3706 return next.match(matcher, i, seq); 3707 } 3708 boolean study(TreeInfo info) { 3709 next.study(info); 3710 return info.deterministic; 3711 } 3712 } 3713 3714 /** 3715 * Node to anchor at the end of a line or the end of input based on the 3716 * multiline mode when in unix lines mode. 3717 * 3718 static final class UnixDollar extends Node { 3719 boolean multiline; 3720 UnixDollar(boolean mul) { 3721 multiline = mul; 3722 } 3723 boolean match(Matcher matcher, int i, CharSequence seq) { 3724 int endIndex = (matcher.anchoringBounds) ? 3725 matcher.to : matcher.getTextLength(); 3726 if (i < endIndex) { 3727 char ch = seq.charAt(i); 3728 if (ch == '\n') { 3729 // If not multiline, then only possible to 3730 // match at very end or one before end 3731 if (multiline == false && i != endIndex - 1) 3732 return false; 3733 // If multiline return next.match without setting 3734 // matcher.hitEnd 3735 if (multiline) 3736 return next.match(matcher, i, seq); 3737 } else { 3738 return false; 3739 } 3740 } 3741 // Matching because at the end or 1 before the end; 3742 // more input could change this so set hitEnd 3743 matcher.hitEnd = true; 3744 // If a $ matches because of end of input, then more input 3745 // could cause it to fail! 3746 matcher.requireEnd = true; 3747 return next.match(matcher, i, seq); 3748 } 3749 boolean study(TreeInfo info) { 3750 next.study(info); 3751 return info.deterministic; 3752 } 3753 } 3754 3755 /** 3756 * Node class that matches a Unicode line ending '\R' 3757 * 3758 static final class LineEnding extends Node { 3759 boolean match(Matcher matcher, int i, CharSequence seq) { 3760 // (u+000Du+000A|[u+000Au+000Bu+000Cu+000Du+0085u+2028u+2029]) 3761 if (i < matcher.to) { 3762 int ch = seq.charAt(i); 3763 if (ch == 0x0A || ch == 0x0B || ch == 0x0C || 3764 ch == 0x85 || ch == 0x2028 || ch == 0x2029) 3765 return next.match(matcher, i + 1, seq); 3766 if (ch == 0x0D) { 3767 i++; 3768 if (i < matcher.to) { 3769 if (seq.charAt(i) == 0x0A && 3770 next.match(matcher, i + 1, seq)) { 3771 return true; 3772 } 3773 } else { 3774 matcher.hitEnd = true; 3775 } 3776 return next.match(matcher, i, seq); 3777 } 3778 } else { 3779 matcher.hitEnd = true; 3780 } 3781 return false; 3782 } 3783 boolean study(TreeInfo info) { 3784 info.minLength++; 3785 info.maxLength += 2; 3786 return next.study(info); 3787 } 3788 } 3789 3790 /** 3791 * Abstract node class to match one character satisfying some 3792 * boolean property. 3793 * 3794 static class CharProperty extends Node { 3795 final CharPredicate predicate; 3796 3797 CharProperty (CharPredicate predicate) { 3798 this.predicate = predicate; 3799 } 3800 boolean match(Matcher matcher, int i, CharSequence seq) { 3801 if (i < matcher.to) { 3802 int ch = Character.codePointAt(seq, i); 3803 i += Character.charCount(ch); 3804 if (i <= matcher.to) { 3805 return predicate.is(ch) && 3806 next.match(matcher, i, seq); 3807 } 3808 } 3809 matcher.hitEnd = true; 3810 return false; 3811 } 3812 boolean study(TreeInfo info) { 3813 info.minLength++; 3814 info.maxLength++; 3815 return next.study(info); 3816 } 3817 } 3818 3819 /** 3820 * Optimized version of CharProperty that works only for 3821 * properties never satisfied by Supplementary characters. 3822 * 3823 private static class BmpCharProperty extends CharProperty { 3824 BmpCharProperty (BmpCharPredicate predicate) { 3825 super(predicate); 3826 } 3827 boolean match(Matcher matcher, int i, CharSequence seq) { 3828 if (i < matcher.to) { 3829 return predicate.is(seq.charAt(i)) && 3830 next.match(matcher, i + 1, seq); 3831 } else { 3832 matcher.hitEnd = true; 3833 return false; 3834 } 3835 } 3836 } 3837 3838 private static class NFCCharProperty extends Node { 3839 CharPredicate predicate; 3840 NFCCharProperty (CharPredicate predicate) { 3841 this.predicate = predicate; 3842 } 3843 3844 boolean match(Matcher matcher, int i, CharSequence seq) { 3845 if (i < matcher.to) { 3846 int ch0 = Character.codePointAt(seq, i); 3847 int n = Character.charCount(ch0); 3848 int j = Grapheme.nextBoundary(seq, i, matcher.to); 3849 if (i + n == j) { // single cp grapheme, assume nfc 3850 if (predicate.is(ch0)) 3851 return next.match(matcher, j, seq); 3852 } else { 3853 while (i + n < j) { 3854 String nfc = Normalizer.normalize( 3855 seq.toString().substring(i, j), Normalizer.Form.NFC); 3856 if (nfc.codePointCount(0, nfc.length()) == 1) { 3857 if (predicate.is(nfc.codePointAt(0)) && 3858 next.match(matcher, j, seq)) { 3859 return true; 3860 } 3861 } 3862 3863 ch0 = Character.codePointBefore(seq, j); 3864 j -= Character.charCount(ch0); 3865 } 3866 } 3867 if (j < matcher.to) 3868 return false; 3869 } 3870 matcher.hitEnd = true; 3871 return false; 3872 } 3873 3874 boolean study(TreeInfo info) { 3875 info.minLength++; 3876 info.deterministic = false; 3877 return next.study(info); 3878 } 3879 } 3880 3881 /** 3882 * Node class that matches an unicode extended grapheme cluster 3883 * 3884 static class XGrapheme extends Node { 3885 boolean match(Matcher matcher, int i, CharSequence seq) { 3886 if (i < matcher.to) { 3887 i = Grapheme.nextBoundary(seq, i, matcher.to); 3888 return next.match(matcher, i, seq); 3889 } 3890 matcher.hitEnd = true; 3891 return false; 3892 } 3893 3894 boolean study(TreeInfo info) { 3895 info.minLength++; 3896 info.deterministic = false; 3897 return next.study(info); 3898 } 3899 } 3900 3901 /** 3902 * Node class that handles grapheme boundaries 3903 * 3904 static class GraphemeBound extends Node { 3905 boolean match(Matcher matcher, int i, CharSequence seq) { 3906 int startIndex = matcher.from; 3907 int endIndex = matcher.to; 3908 if (matcher.transparentBounds) { 3909 startIndex = 0; 3910 endIndex = matcher.getTextLength(); 3911 } 3912 if (i == startIndex) { 3913 // continue with return below 3914 } else if (i < endIndex) { 3915 if (Character.isSurrogatePair(seq.charAt(i - 1), seq.charAt(i))) { 3916 return false; 3917 } 3918 if (Grapheme.nextBoundary(seq, matcher.last, endIndex) > i) { 3919 return false; 3920 } 3921 } else { 3922 matcher.hitEnd = true; 3923 matcher.requireEnd = true; 3924 } 3925 return next.match(matcher, i, seq); 3926 } 3927 } 3928 3929 /** 3930 * Base class for all Slice nodes 3931 * 3932 static class SliceNode extends Node { 3933 int[] buffer; 3934 SliceNode(int[] buf) { 3935 buffer = buf; 3936 } 3937 boolean study(TreeInfo info) { 3938 info.minLength += buffer.length; 3939 info.maxLength += buffer.length; 3940 return next.study(info); 3941 } 3942 } 3943 3944 /** 3945 * Node class for a case sensitive/BMP-only sequence of literal 3946 * characters. 3947 * 3948 static class Slice extends SliceNode { 3949 Slice(int[] buf) { 3950 super(buf); 3951 } 3952 boolean match(Matcher matcher, int i, CharSequence seq) { 3953 int[] buf = buffer; 3954 int len = buf.length; 3955 for (int j=0; j<len; j++) { 3956 if ((i+j) >= matcher.to) { 3957 matcher.hitEnd = true; 3958 return false; 3959 } 3960 if (buf[j] != seq.charAt(i+j)) 3961 return false; 3962 } 3963 return next.match(matcher, i+len, seq); 3964 } 3965 } 3966 3967 /** 3968 * Node class for a case_insensitive/BMP-only sequence of literal 3969 * characters. 3970 * 3971 static class SliceI extends SliceNode { 3972 SliceI(int[] buf) { 3973 super(buf); 3974 } 3975 boolean match(Matcher matcher, int i, CharSequence seq) { 3976 int[] buf = buffer; 3977 int len = buf.length; 3978 for (int j=0; j<len; j++) { 3979 if ((i+j) >= matcher.to) { 3980 matcher.hitEnd = true; 3981 return false; 3982 } 3983 int c = seq.charAt(i+j); 3984 if (buf[j] != c && 3985 buf[j] != ASCII.toLower(c)) 3986 return false; 3987 } 3988 return next.match(matcher, i+len, seq); 3989 } 3990 } 3991 3992 /** 3993 * Node class for a unicode_case_insensitive/BMP-only sequence of 3994 * literal characters. Uses unicode case folding. 3995 * 3996 static final class SliceU extends SliceNode { 3997 SliceU(int[] buf) { 3998 super(buf); 3999 } 4000 boolean match(Matcher matcher, int i, CharSequence seq) { 4001 int[] buf = buffer; 4002 int len = buf.length; 4003 for (int j=0; j<len; j++) { 4004 if ((i+j) >= matcher.to) { 4005 matcher.hitEnd = true; 4006 return false; 4007 } 4008 int c = seq.charAt(i+j); 4009 if (buf[j] != c && 4010 buf[j] != Character.toLowerCase(Character.toUpperCase(c))) 4011 return false; 4012 } 4013 return next.match(matcher, i+len, seq); 4014 } 4015 } 4016 4017 /** 4018 * Node class for a case sensitive sequence of literal characters 4019 * including supplementary characters. 4020 * 4021 static final class SliceS extends Slice { 4022 SliceS(int[] buf) { 4023 super(buf); 4024 } 4025 boolean match(Matcher matcher, int i, CharSequence seq) { 4026 int[] buf = buffer; 4027 int x = i; 4028 for (int j = 0; j < buf.length; j++) { 4029 if (x >= matcher.to) { 4030 matcher.hitEnd = true; 4031 return false; 4032 } 4033 int c = Character.codePointAt(seq, x); 4034 if (buf[j] != c) 4035 return false; 4036 x += Character.charCount(c); 4037 if (x > matcher.to) { 4038 matcher.hitEnd = true; 4039 return false; 4040 } 4041 } 4042 return next.match(matcher, x, seq); 4043 } 4044 } 4045 4046 /** 4047 * Node class for a case insensitive sequence of literal characters 4048 * including supplementary characters. 4049 * 4050 static class SliceIS extends SliceNode { 4051 SliceIS(int[] buf) { 4052 super(buf); 4053 } 4054 int toLower(int c) { 4055 return ASCII.toLower(c); 4056 } 4057 boolean match(Matcher matcher, int i, CharSequence seq) { 4058 int[] buf = buffer; 4059 int x = i; 4060 for (int j = 0; j < buf.length; j++) { 4061 if (x >= matcher.to) { 4062 matcher.hitEnd = true; 4063 return false; 4064 } 4065 int c = Character.codePointAt(seq, x); 4066 if (buf[j] != c && buf[j] != toLower(c)) 4067 return false; 4068 x += Character.charCount(c); 4069 if (x > matcher.to) { 4070 matcher.hitEnd = true; 4071 return false; 4072 } 4073 } 4074 return next.match(matcher, x, seq); 4075 } 4076 } 4077 4078 /** 4079 * Node class for a case insensitive sequence of literal characters. 4080 * Uses unicode case folding. 4081 * 4082 static final class SliceUS extends SliceIS { 4083 SliceUS(int[] buf) { 4084 super(buf); 4085 } 4086 int toLower(int c) { 4087 return Character.toLowerCase(Character.toUpperCase(c)); 4088 } 4089 } 4090 4091 /** 4092 * The 0 or 1 quantifier. This one class implements all three types. 4093 * 4094 static final class Ques extends Node { 4095 Node atom; 4096 Qtype type; 4097 Ques(Node node, Qtype type) { 4098 this.atom = node; 4099 this.type = type; 4100 } 4101 boolean match(Matcher matcher, int i, CharSequence seq) { 4102 switch (type) { 4103 case GREEDY: 4104 return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq)) 4105 || next.match(matcher, i, seq); 4106 case LAZY: 4107 return next.match(matcher, i, seq) 4108 || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq)); 4109 case POSSESSIVE: 4110 if (atom.match(matcher, i, seq)) i = matcher.last; 4111 return next.match(matcher, i, seq); 4112 default: 4113 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq); 4114 } 4115 } 4116 boolean study(TreeInfo info) { 4117 if (type != Qtype.INDEPENDENT) { 4118 int minL = info.minLength; 4119 atom.study(info); 4120 info.minLength = minL; 4121 info.deterministic = false; 4122 return next.study(info); 4123 } else { 4124 atom.study(info); 4125 return next.study(info); 4126 } 4127 } 4128 } 4129 4130 /** 4131 * Handles the greedy style repetition with the specified minimum 4132 * and the maximum equal to MAX_REPS, for *, + and {N,} quantifiers. 4133 * 4134 static class CharPropertyGreedy extends Node { 4135 final CharPredicate predicate; 4136 final int cmin; 4137 4138 CharPropertyGreedy(CharProperty cp, int cmin) { 4139 this.predicate = cp.predicate; 4140 this.cmin = cmin; 4141 } 4142 boolean match(Matcher matcher, int i, CharSequence seq) { 4143 int starti = i; 4144 int n = 0; 4145 int to = matcher.to; 4146 // greedy, all the way down 4147 while (i < to) { 4148 int ch = Character.codePointAt(seq, i); 4149 int len = Character.charCount(ch); 4150 if (i + len > to) { 4151 // the region cut off the high half of a surrogate pair 4152 matcher.hitEnd = true; 4153 ch = seq.charAt(i); 4154 len = 1; 4155 } 4156 if (!predicate.is(ch)) 4157 break; 4158 i += len; 4159 n++; 4160 } 4161 if (i >= to) { 4162 matcher.hitEnd = true; 4163 } 4164 while (n >= cmin) { 4165 if (next.match(matcher, i, seq)) 4166 return true; 4167 if (n == cmin) 4168 return false; 4169 // backing off if match fails 4170 int ch = Character.codePointBefore(seq, i); 4171 // check if the region cut off the low half of a surrogate pair 4172 i = Math.max(starti, i - Character.charCount(ch)); 4173 n--; 4174 } 4175 return false; 4176 } 4177 4178 boolean study(TreeInfo info) { 4179 info.minLength += cmin; 4180 if (info.maxValid) { 4181 info.maxLength += MAX_REPS; 4182 } 4183 info.deterministic = false; 4184 return next.study(info); 4185 } 4186 } 4187 4188 static final class BmpCharPropertyGreedy extends CharPropertyGreedy { 4189 4190 BmpCharPropertyGreedy(BmpCharProperty bcp, int cmin) { 4191 super(bcp, cmin); 4192 } 4193 4194 boolean match(Matcher matcher, int i, CharSequence seq) { 4195 int n = 0; 4196 int to = matcher.to; 4197 while (i < to && predicate.is(seq.charAt(i))) { 4198 i++; n++; 4199 } 4200 if (i >= to) { 4201 matcher.hitEnd = true; 4202 } 4203 while (n >= cmin) { 4204 if (next.match(matcher, i, seq)) 4205 return true; 4206 i--; n--; // backing off if match fails 4207 } 4208 return false; 4209 } 4210 } 4211 4212 /** 4213 * Handles the curly-brace style repetition with a specified minimum and 4214 * maximum occurrences. The * quantifier is handled as a special case. 4215 * This class handles the three types. 4216 * 4217 static final class Curly extends Node { 4218 Node atom; 4219 Qtype type; 4220 int cmin; 4221 int cmax; 4222 4223 Curly(Node node, int cmin, int cmax, Qtype type) { 4224 this.atom = node; 4225 this.type = type; 4226 this.cmin = cmin; 4227 this.cmax = cmax; 4228 } 4229 boolean match(Matcher matcher, int i, CharSequence seq) { 4230 int j; 4231 for (j = 0; j < cmin; j++) { 4232 if (atom.match(matcher, i, seq)) { 4233 i = matcher.last; 4234 continue; 4235 } 4236 return false; 4237 } 4238 if (type == Qtype.GREEDY) 4239 return match0(matcher, i, j, seq); 4240 else if (type == Qtype.LAZY) 4241 return match1(matcher, i, j, seq); 4242 else 4243 return match2(matcher, i, j, seq); 4244 } 4245 // Greedy match. 4246 // i is the index to start matching at 4247 // j is the number of atoms that have matched 4248 boolean match0(Matcher matcher, int i, int j, CharSequence seq) { 4249 if (j >= cmax) { 4250 // We have matched the maximum... continue with the rest of 4251 // the regular expression 4252 return next.match(matcher, i, seq); 4253 } 4254 int backLimit = j; 4255 while (atom.match(matcher, i, seq)) { 4256 // k is the length of this match 4257 int k = matcher.last - i; 4258 if (k == 0) // Zero length match 4259 break; 4260 // Move up index and number matched 4261 i = matcher.last; 4262 j++; 4263 // We are greedy so match as many as we can 4264 while (j < cmax) { 4265 if (!atom.match(matcher, i, seq)) 4266 break; 4267 if (i + k != matcher.last) { 4268 if (match0(matcher, matcher.last, j+1, seq)) 4269 return true; 4270 break; 4271 } 4272 i += k; 4273 j++; 4274 } 4275 // Handle backing off if match fails 4276 while (j >= backLimit) { 4277 if (next.match(matcher, i, seq)) 4278 return true; 4279 i -= k; 4280 j--; 4281 } 4282 return false; 4283 } 4284 return next.match(matcher, i, seq); 4285 } 4286 // Reluctant match. At this point, the minimum has been satisfied. 4287 // i is the index to start matching at 4288 // j is the number of atoms that have matched 4289 boolean match1(Matcher matcher, int i, int j, CharSequence seq) { 4290 for (;;) { 4291 // Try finishing match without consuming any more 4292 if (next.match(matcher, i, seq)) 4293 return true; 4294 // At the maximum, no match found 4295 if (j >= cmax) 4296 return false; 4297 // Okay, must try one more atom 4298 if (!atom.match(matcher, i, seq)) 4299 return false; 4300 // If we haven't moved forward then must break out 4301 if (i == matcher.last) 4302 return false; 4303 // Move up index and number matched 4304 i = matcher.last; 4305 j++; 4306 } 4307 } 4308 boolean match2(Matcher matcher, int i, int j, CharSequence seq) { 4309 for (; j < cmax; j++) { 4310 if (!atom.match(matcher, i, seq)) 4311 break; 4312 if (i == matcher.last) 4313 break; 4314 i = matcher.last; 4315 } 4316 return next.match(matcher, i, seq); 4317 } 4318 boolean study(TreeInfo info) { 4319 // Save original info 4320 int minL = info.minLength; 4321 int maxL = info.maxLength; 4322 boolean maxV = info.maxValid; 4323 boolean detm = info.deterministic; 4324 info.reset(); 4325 4326 atom.study(info); 4327 4328 int temp = info.minLength * cmin + minL; 4329 if (temp < minL) { 4330 temp = 0xFFFFFFF; // arbitrary large number 4331 } 4332 info.minLength = temp; 4333 4334 if (maxV & info.maxValid) { 4335 temp = info.maxLength * cmax + maxL; 4336 info.maxLength = temp; 4337 if (temp < maxL) { 4338 info.maxValid = false; 4339 } 4340 } else { 4341 info.maxValid = false; 4342 } 4343 4344 if (info.deterministic && cmin == cmax) 4345 info.deterministic = detm; 4346 else 4347 info.deterministic = false; 4348 return next.study(info); 4349 } 4350 } 4351 4352 /** 4353 * Handles the curly-brace style repetition with a specified minimum and 4354 * maximum occurrences in deterministic cases. This is an iterative 4355 * optimization over the Prolog and Loop system which would handle this 4356 * in a recursive way. The * quantifier is handled as a special case. 4357 * If capture is true then this class saves group settings and ensures 4358 * that groups are unset when backing off of a group match. 4359 * 4360 static final class GroupCurly extends Node { 4361 Node atom; 4362 Qtype type; 4363 int cmin; 4364 int cmax; 4365 int localIndex; 4366 int groupIndex; 4367 boolean capture; 4368 4369 GroupCurly(Node node, int cmin, int cmax, Qtype type, int local, 4370 int group, boolean capture) { 4371 this.atom = node; 4372 this.type = type; 4373 this.cmin = cmin; 4374 this.cmax = cmax; 4375 this.localIndex = local; 4376 this.groupIndex = group; 4377 this.capture = capture; 4378 } 4379 boolean match(Matcher matcher, int i, CharSequence seq) { 4380 int[] groups = matcher.groups; 4381 int[] locals = matcher.locals; 4382 int save0 = locals[localIndex]; 4383 int save1 = 0; 4384 int save2 = 0; 4385 4386 if (capture) { 4387 save1 = groups[groupIndex]; 4388 save2 = groups[groupIndex+1]; 4389 } 4390 4391 // Notify GroupTail there is no need to setup group info 4392 // because it will be set here 4393 locals[localIndex] = -1; 4394 4395 boolean ret = true; 4396 for (int j = 0; j < cmin; j++) { 4397 if (atom.match(matcher, i, seq)) { 4398 if (capture) { 4399 groups[groupIndex] = i; 4400 groups[groupIndex+1] = matcher.last; 4401 } 4402 i = matcher.last; 4403 } else { 4404 ret = false; 4405 break; 4406 } 4407 } 4408 if (ret) { 4409 if (type == Qtype.GREEDY) { 4410 ret = match0(matcher, i, cmin, seq); 4411 } else if (type == Qtype.LAZY) { 4412 ret = match1(matcher, i, cmin, seq); 4413 } else { 4414 ret = match2(matcher, i, cmin, seq); 4415 } 4416 } 4417 if (!ret) { 4418 locals[localIndex] = save0; 4419 if (capture) { 4420 groups[groupIndex] = save1; 4421 groups[groupIndex+1] = save2; 4422 } 4423 } 4424 return ret; 4425 } 4426 // Aggressive group match 4427 boolean match0(Matcher matcher, int i, int j, CharSequence seq) { 4428 // don't back off passing the starting "j" 4429 int min = j; 4430 int[] groups = matcher.groups; 4431 int save0 = 0; 4432 int save1 = 0; 4433 if (capture) { 4434 save0 = groups[groupIndex]; 4435 save1 = groups[groupIndex+1]; 4436 } 4437 for (;;) { 4438 if (j >= cmax) 4439 break; 4440 if (!atom.match(matcher, i, seq)) 4441 break; 4442 int k = matcher.last - i; 4443 if (k <= 0) { 4444 if (capture) { 4445 groups[groupIndex] = i; 4446 groups[groupIndex+1] = i + k; 4447 } 4448 i = i + k; 4449 break; 4450 } 4451 for (;;) { 4452 if (capture) { 4453 groups[groupIndex] = i; 4454 groups[groupIndex+1] = i + k; 4455 } 4456 i = i + k; 4457 if (++j >= cmax) 4458 break; 4459 if (!atom.match(matcher, i, seq)) 4460 break; 4461 if (i + k != matcher.last) { 4462 if (match0(matcher, i, j, seq)) 4463 return true; 4464 break; 4465 } 4466 } 4467 while (j > min) { 4468 if (next.match(matcher, i, seq)) { 4469 if (capture) { 4470 groups[groupIndex+1] = i; 4471 groups[groupIndex] = i - k; 4472 } 4473 return true; 4474 } 4475 // backing off 4476 i = i - k; 4477 if (capture) { 4478 groups[groupIndex+1] = i; 4479 groups[groupIndex] = i - k; 4480 } 4481 j--; 4482 4483 } 4484 break; 4485 } 4486 if (capture) { 4487 groups[groupIndex] = save0; 4488 groups[groupIndex+1] = save1; 4489 } 4490 return next.match(matcher, i, seq); 4491 } 4492 // Reluctant matching 4493 boolean match1(Matcher matcher, int i, int j, CharSequence seq) { 4494 for (;;) { 4495 if (next.match(matcher, i, seq)) 4496 return true; 4497 if (j >= cmax) 4498 return false; 4499 if (!atom.match(matcher, i, seq)) 4500 return false; 4501 if (i == matcher.last) 4502 return false; 4503 if (capture) { 4504 matcher.groups[groupIndex] = i; 4505 matcher.groups[groupIndex+1] = matcher.last; 4506 } 4507 i = matcher.last; 4508 j++; 4509 } 4510 } 4511 // Possessive matching 4512 boolean match2(Matcher matcher, int i, int j, CharSequence seq) { 4513 for (; j < cmax; j++) { 4514 if (!atom.match(matcher, i, seq)) { 4515 break; 4516 } 4517 if (capture) { 4518 matcher.groups[groupIndex] = i; 4519 matcher.groups[groupIndex+1] = matcher.last; 4520 } 4521 if (i == matcher.last) { 4522 break; 4523 } 4524 i = matcher.last; 4525 } 4526 return next.match(matcher, i, seq); 4527 } 4528 boolean study(TreeInfo info) { 4529 // Save original info 4530 int minL = info.minLength; 4531 int maxL = info.maxLength; 4532 boolean maxV = info.maxValid; 4533 boolean detm = info.deterministic; 4534 info.reset(); 4535 4536 atom.study(info); 4537 4538 int temp = info.minLength * cmin + minL; 4539 if (temp < minL) { 4540 temp = 0xFFFFFFF; // Arbitrary large number 4541 } 4542 info.minLength = temp; 4543 4544 if (maxV & info.maxValid) { 4545 temp = info.maxLength * cmax + maxL; 4546 info.maxLength = temp; 4547 if (temp < maxL) { 4548 info.maxValid = false; 4549 } 4550 } else { 4551 info.maxValid = false; 4552 } 4553 4554 if (info.deterministic && cmin == cmax) { 4555 info.deterministic = detm; 4556 } else { 4557 info.deterministic = false; 4558 } 4559 return next.study(info); 4560 } 4561 } 4562 4563 /** 4564 * A Guard node at the end of each atom node in a Branch. It 4565 * serves the purpose of chaining the "match" operation to 4566 * "next" but not the "study", so we can collect the TreeInfo 4567 * of each atom node without including the TreeInfo of the 4568 * "next". 4569 * 4570 static final class BranchConn extends Node { 4571 BranchConn() {} 4572 boolean match(Matcher matcher, int i, CharSequence seq) { 4573 return next.match(matcher, i, seq); 4574 } 4575 boolean study(TreeInfo info) { 4576 return info.deterministic; 4577 } 4578 } 4579 4580 /** 4581 * Handles the branching of alternations. Note this is also used for 4582 * the ? quantifier to branch between the case where it matches once 4583 * and where it does not occur. 4584 * 4585 static final class Branch extends Node { 4586 Node[] atoms = new Node[2]; 4587 int size = 2; 4588 Node conn; 4589 Branch(Node first, Node second, Node branchConn) { 4590 conn = branchConn; 4591 atoms[0] = first; 4592 atoms[1] = second; 4593 } 4594 4595 void add(Node node) { 4596 if (size >= atoms.length) { 4597 Node[] tmp = new Node[atoms.length*2]; 4598 System.arraycopy(atoms, 0, tmp, 0, atoms.length); 4599 atoms = tmp; 4600 } 4601 atoms[size++] = node; 4602 } 4603 4604 boolean match(Matcher matcher, int i, CharSequence seq) { 4605 for (int n = 0; n < size; n++) { 4606 if (atoms[n] == null) { 4607 if (conn.next.match(matcher, i, seq)) 4608 return true; 4609 } else if (atoms[n].match(matcher, i, seq)) { 4610 return true; 4611 } 4612 } 4613 return false; 4614 } 4615 4616 boolean study(TreeInfo info) { 4617 int minL = info.minLength; 4618 int maxL = info.maxLength; 4619 boolean maxV = info.maxValid; 4620 4621 int minL2 = Integer.MAX_VALUE; //arbitrary large enough num 4622 int maxL2 = -1; 4623 for (int n = 0; n < size; n++) { 4624 info.reset(); 4625 if (atoms[n] != null) 4626 atoms[n].study(info); 4627 minL2 = Math.min(minL2, info.minLength); 4628 maxL2 = Math.max(maxL2, info.maxLength); 4629 maxV = (maxV & info.maxValid); 4630 } 4631 4632 minL += minL2; 4633 maxL += maxL2; 4634 4635 info.reset(); 4636 conn.next.study(info); 4637 4638 info.minLength += minL; 4639 info.maxLength += maxL; 4640 info.maxValid &= maxV; 4641 info.deterministic = false; 4642 return false; 4643 } 4644 } 4645 4646 /** 4647 * The GroupHead saves the location where the group begins in the locals 4648 * and restores them when the match is done. 4649 * 4650 * The matchRef is used when a reference to this group is accessed later 4651 * in the expression. The locals will have a negative value in them to 4652 * indicate that we do not want to unset the group if the reference 4653 * doesn't match. 4654 * 4655 static final class GroupHead extends Node { 4656 int localIndex; 4657 GroupTail tail; // for debug/print only, match does not need to know 4658 GroupHead(int localCount) { 4659 localIndex = localCount; 4660 } 4661 boolean match(Matcher matcher, int i, CharSequence seq) { 4662 int save = matcher.locals[localIndex]; 4663 matcher.locals[localIndex] = i; 4664 boolean ret = next.match(matcher, i, seq); 4665 matcher.locals[localIndex] = save; 4666 return ret; 4667 } 4668 } 4669 4670 /** 4671 * The GroupTail handles the setting of group beginning and ending 4672 * locations when groups are successfully matched. It must also be able to 4673 * unset groups that have to be backed off of. 4674 * 4675 * The GroupTail node is also used when a previous group is referenced, 4676 * and in that case no group information needs to be set. 4677 * 4678 static final class GroupTail extends Node { 4679 int localIndex; 4680 int groupIndex; 4681 GroupTail(int localCount, int groupCount) { 4682 localIndex = localCount; 4683 groupIndex = groupCount + groupCount; 4684 } 4685 boolean match(Matcher matcher, int i, CharSequence seq) { 4686 int tmp = matcher.locals[localIndex]; 4687 if (tmp >= 0) { // This is the normal group case. 4688 // Save the group so we can unset it if it 4689 // backs off of a match. 4690 int groupStart = matcher.groups[groupIndex]; 4691 int groupEnd = matcher.groups[groupIndex+1]; 4692 4693 matcher.groups[groupIndex] = tmp; 4694 matcher.groups[groupIndex+1] = i; 4695 if (next.match(matcher, i, seq)) { 4696 return true; 4697 } 4698 matcher.groups[groupIndex] = groupStart; 4699 matcher.groups[groupIndex+1] = groupEnd; 4700 return false; 4701 } else { 4702 // This is a group reference case. We don't need to save any 4703 // group info because it isn't really a group. 4704 matcher.last = i; 4705 return true; 4706 } 4707 } 4708 } 4709 4710 /** 4711 * This sets up a loop to handle a recursive quantifier structure. 4712 * 4713 static final class Prolog extends Node { 4714 Loop loop; 4715 Prolog(Loop loop) { 4716 this.loop = loop; 4717 } 4718 boolean match(Matcher matcher, int i, CharSequence seq) { 4719 return loop.matchInit(matcher, i, seq); 4720 } 4721 boolean study(TreeInfo info) { 4722 return loop.study(info); 4723 } 4724 } 4725 4726 /** 4727 * Handles the repetition count for a greedy Curly. The matchInit 4728 * is called from the Prolog to save the index of where the group 4729 * beginning is stored. A zero length group check occurs in the 4730 * normal match but is skipped in the matchInit. 4731 * 4732 static class Loop extends Node { 4733 Node body; 4734 int countIndex; // local count index in matcher locals 4735 int beginIndex; // group beginning index 4736 int cmin, cmax; 4737 int posIndex; 4738 Loop(int countIndex, int beginIndex) { 4739 this.countIndex = countIndex; 4740 this.beginIndex = beginIndex; 4741 this.posIndex = -1; 4742 } 4743 boolean match(Matcher matcher, int i, CharSequence seq) { 4744 // Avoid infinite loop in zero-length case. 4745 if (i > matcher.locals[beginIndex]) { 4746 int count = matcher.locals[countIndex]; 4747 4748 // This block is for before we reach the minimum 4749 // iterations required for the loop to match 4750 if (count < cmin) { 4751 matcher.locals[countIndex] = count + 1; 4752 boolean b = body.match(matcher, i, seq); 4753 // If match failed we must backtrack, so 4754 // the loop count should NOT be incremented 4755 if (!b) 4756 matcher.locals[countIndex] = count; 4757 // Return success or failure since we are under 4758 // minimum 4759 return b; 4760 } 4761 // This block is for after we have the minimum 4762 // iterations required for the loop to match 4763 if (count < cmax) { 4764 // Let's check if we have already tried and failed 4765 // at this starting position "i" in the past. 4766 // If yes, then just return false wihtout trying 4767 // again, to stop the exponential backtracking. 4768 if (posIndex != -1 && 4769 matcher.localsPos[posIndex].contains(i)) { 4770 return next.match(matcher, i, seq); 4771 } 4772 matcher.locals[countIndex] = count + 1; 4773 boolean b = body.match(matcher, i, seq); 4774 // If match failed we must backtrack, so 4775 // the loop count should NOT be incremented 4776 if (b) 4777 return true; 4778 matcher.locals[countIndex] = count; 4779 // save the failed position 4780 if (posIndex != -1) { 4781 matcher.localsPos[posIndex].add(i); 4782 } 4783 } 4784 } 4785 return next.match(matcher, i, seq); 4786 } 4787 boolean matchInit(Matcher matcher, int i, CharSequence seq) { 4788 int save = matcher.locals[countIndex]; 4789 boolean ret; 4790 if (posIndex != -1 && matcher.localsPos[posIndex] == null) { 4791 matcher.localsPos[posIndex] = new IntHashSet(); 4792 } 4793 if (0 < cmin) { 4794 matcher.locals[countIndex] = 1; 4795 ret = body.match(matcher, i, seq); 4796 } else if (0 < cmax) { 4797 matcher.locals[countIndex] = 1; 4798 ret = body.match(matcher, i, seq); 4799 if (ret == false) 4800 ret = next.match(matcher, i, seq); 4801 } else { 4802 ret = next.match(matcher, i, seq); 4803 } 4804 matcher.locals[countIndex] = save; 4805 return ret; 4806 } 4807 boolean study(TreeInfo info) { 4808 info.maxValid = false; 4809 info.deterministic = false; 4810 return false; 4811 } 4812 } 4813 4814 /** 4815 * Handles the repetition count for a reluctant Curly. The matchInit 4816 * is called from the Prolog to save the index of where the group 4817 * beginning is stored. A zero length group check occurs in the 4818 * normal match but is skipped in the matchInit. 4819 * 4820 static final class LazyLoop extends Loop { 4821 LazyLoop(int countIndex, int beginIndex) { 4822 super(countIndex, beginIndex); 4823 } 4824 boolean match(Matcher matcher, int i, CharSequence seq) { 4825 // Check for zero length group 4826 if (i > matcher.locals[beginIndex]) { 4827 int count = matcher.locals[countIndex]; 4828 if (count < cmin) { 4829 matcher.locals[countIndex] = count + 1; 4830 boolean result = body.match(matcher, i, seq); 4831 // If match failed we must backtrack, so 4832 // the loop count should NOT be incremented 4833 if (!result) 4834 matcher.locals[countIndex] = count; 4835 return result; 4836 } 4837 if (next.match(matcher, i, seq)) 4838 return true; 4839 if (count < cmax) { 4840 matcher.locals[countIndex] = count + 1; 4841 boolean result = body.match(matcher, i, seq); 4842 // If match failed we must backtrack, so 4843 // the loop count should NOT be incremented 4844 if (!result) 4845 matcher.locals[countIndex] = count; 4846 return result; 4847 } 4848 return false; 4849 } 4850 return next.match(matcher, i, seq); 4851 } 4852 boolean matchInit(Matcher matcher, int i, CharSequence seq) { 4853 int save = matcher.locals[countIndex]; 4854 boolean ret = false; 4855 if (0 < cmin) { 4856 matcher.locals[countIndex] = 1; 4857 ret = body.match(matcher, i, seq); 4858 } else if (next.match(matcher, i, seq)) { 4859 ret = true; 4860 } else if (0 < cmax) { 4861 matcher.locals[countIndex] = 1; 4862 ret = body.match(matcher, i, seq); 4863 } 4864 matcher.locals[countIndex] = save; 4865 return ret; 4866 } 4867 boolean study(TreeInfo info) { 4868 info.maxValid = false; 4869 info.deterministic = false; 4870 return false; 4871 } 4872 } 4873 4874 /** 4875 * Refers to a group in the regular expression. Attempts to match 4876 * whatever the group referred to last matched. 4877 * 4878 static class BackRef extends Node { 4879 int groupIndex; 4880 BackRef(int groupCount) { 4881 super(); 4882 groupIndex = groupCount + groupCount; 4883 } 4884 boolean match(Matcher matcher, int i, CharSequence seq) { 4885 int j = matcher.groups[groupIndex]; 4886 int k = matcher.groups[groupIndex+1]; 4887 4888 int groupSize = k - j; 4889 // If the referenced group didn't match, neither can this 4890 if (j < 0) 4891 return false; 4892 4893 // If there isn't enough input left no match 4894 if (i + groupSize > matcher.to) { 4895 matcher.hitEnd = true; 4896 return false; 4897 } 4898 // Check each new char to make sure it matches what the group 4899 // referenced matched last time around 4900 for (int index=0; index<groupSize; index++) 4901 if (seq.charAt(i+index) != seq.charAt(j+index)) 4902 return false; 4903 4904 return next.match(matcher, i+groupSize, seq); 4905 } 4906 boolean study(TreeInfo info) { 4907 info.maxValid = false; 4908 return next.study(info); 4909 } 4910 } 4911 4912 static class CIBackRef extends Node { 4913 int groupIndex; 4914 boolean doUnicodeCase; 4915 CIBackRef(int groupCount, boolean doUnicodeCase) { 4916 super(); 4917 groupIndex = groupCount + groupCount; 4918 this.doUnicodeCase = doUnicodeCase; 4919 } 4920 boolean match(Matcher matcher, int i, CharSequence seq) { 4921 int j = matcher.groups[groupIndex]; 4922 int k = matcher.groups[groupIndex+1]; 4923 4924 int groupSize = k - j; 4925 4926 // If the referenced group didn't match, neither can this 4927 if (j < 0) 4928 return false; 4929 4930 // If there isn't enough input left no match 4931 if (i + groupSize > matcher.to) { 4932 matcher.hitEnd = true; 4933 return false; 4934 } 4935 4936 // Check each new char to make sure it matches what the group 4937 // referenced matched last time around 4938 int x = i; 4939 for (int index=0; index<groupSize; index++) { 4940 int c1 = Character.codePointAt(seq, x); 4941 int c2 = Character.codePointAt(seq, j); 4942 if (c1 != c2) { 4943 if (doUnicodeCase) { 4944 int cc1 = Character.toUpperCase(c1); 4945 int cc2 = Character.toUpperCase(c2); 4946 if (cc1 != cc2 && 4947 Character.toLowerCase(cc1) != 4948 Character.toLowerCase(cc2)) 4949 return false; 4950 } else { 4951 if (ASCII.toLower(c1) != ASCII.toLower(c2)) 4952 return false; 4953 } 4954 } 4955 x += Character.charCount(c1); 4956 j += Character.charCount(c2); 4957 } 4958 4959 return next.match(matcher, i+groupSize, seq); 4960 } 4961 boolean study(TreeInfo info) { 4962 info.maxValid = false; 4963 return next.study(info); 4964 } 4965 } 4966 4967 /** 4968 * Searches until the next instance of its atom. This is useful for 4969 * finding the atom efficiently without passing an instance of it 4970 * (greedy problem) and without a lot of wasted search time (reluctant 4971 * problem). 4972 * 4973 static final class First extends Node { 4974 Node atom; 4975 First(Node node) { 4976 this.atom = BnM.optimize(node); 4977 } 4978 boolean match(Matcher matcher, int i, CharSequence seq) { 4979 if (atom instanceof BnM) { 4980 return atom.match(matcher, i, seq) 4981 && next.match(matcher, matcher.last, seq); 4982 } 4983 for (;;) { 4984 if (i > matcher.to) { 4985 matcher.hitEnd = true; 4986 return false; 4987 } 4988 if (atom.match(matcher, i, seq)) { 4989 return next.match(matcher, matcher.last, seq); 4990 } 4991 i += countChars(seq, i, 1); 4992 matcher.first++; 4993 } 4994 } 4995 boolean study(TreeInfo info) { 4996 atom.study(info); 4997 info.maxValid = false; 4998 info.deterministic = false; 4999 return next.study(info); 5000 } 5001 } 5002 5003 /** 5004 * Zero width positive lookahead. 5005 * 5006 static final class Pos extends Node { 5007 Node cond; 5008 Pos(Node cond) { 5009 this.cond = cond; 5010 } 5011 boolean match(Matcher matcher, int i, CharSequence seq) { 5012 int savedTo = matcher.to; 5013 boolean conditionMatched; 5014 5015 // Relax transparent region boundaries for lookahead 5016 if (matcher.transparentBounds) 5017 matcher.to = matcher.getTextLength(); 5018 try { 5019 conditionMatched = cond.match(matcher, i, seq); 5020 } finally { 5021 // Reinstate region boundaries 5022 matcher.to = savedTo; 5023 } 5024 return conditionMatched && next.match(matcher, i, seq); 5025 } 5026 } 5027 5028 /** 5029 * Zero width negative lookahead. 5030 * 5031 static final class Neg extends Node { 5032 Node cond; 5033 Neg(Node cond) { 5034 this.cond = cond; 5035 } 5036 boolean match(Matcher matcher, int i, CharSequence seq) { 5037 int savedTo = matcher.to; 5038 boolean conditionMatched; 5039 5040 // Relax transparent region boundaries for lookahead 5041 if (matcher.transparentBounds) 5042 matcher.to = matcher.getTextLength(); 5043 try { 5044 if (i < matcher.to) { 5045 conditionMatched = !cond.match(matcher, i, seq); 5046 } else { 5047 // If a negative lookahead succeeds then more input 5048 // could cause it to fail! 5049 matcher.requireEnd = true; 5050 conditionMatched = !cond.match(matcher, i, seq); 5051 } 5052 } finally { 5053 // Reinstate region boundaries 5054 matcher.to = savedTo; 5055 } 5056 return conditionMatched && next.match(matcher, i, seq); 5057 } 5058 } 5059 5060 /** 5061 * For use with lookbehinds; matches the position where the lookbehind 5062 * was encountered. 5063 * 5064 static class LookBehindEndNode extends Node { 5065 private LookBehindEndNode() {} // Singleton 5066 5067 static LookBehindEndNode INSTANCE = new LookBehindEndNode(); 5068 5069 boolean match(Matcher matcher, int i, CharSequence seq) { 5070 return i == matcher.lookbehindTo; 5071 } 5072 } 5073 5074 /** 5075 * Zero width positive lookbehind. 5076 * 5077 static class Behind extends Node { 5078 Node cond; 5079 int rmax, rmin; 5080 Behind(Node cond, int rmax, int rmin) { 5081 this.cond = cond; 5082 this.rmax = rmax; 5083 this.rmin = rmin; 5084 } 5085 5086 boolean match(Matcher matcher, int i, CharSequence seq) { 5087 int savedFrom = matcher.from; 5088 boolean conditionMatched = false; 5089 int startIndex = (!matcher.transparentBounds) ? 5090 matcher.from : 0; 5091 int from = Math.max(i - rmax, startIndex); 5092 // Set end boundary 5093 int savedLBT = matcher.lookbehindTo; 5094 matcher.lookbehindTo = i; 5095 // Relax transparent region boundaries for lookbehind 5096 if (matcher.transparentBounds) 5097 matcher.from = 0; 5098 for (int j = i - rmin; !conditionMatched && j >= from; j--) { 5099 conditionMatched = cond.match(matcher, j, seq); 5100 } 5101 matcher.from = savedFrom; 5102 matcher.lookbehindTo = savedLBT; 5103 return conditionMatched && next.match(matcher, i, seq); 5104 } 5105 } 5106 5107 /** 5108 * Zero width positive lookbehind, including supplementary 5109 * characters or unpaired surrogates. 5110 * 5111 static final class BehindS extends Behind { 5112 BehindS(Node cond, int rmax, int rmin) { 5113 super(cond, rmax, rmin); 5114 } 5115 boolean match(Matcher matcher, int i, CharSequence seq) { 5116 int rmaxChars = countChars(seq, i, -rmax); 5117 int rminChars = countChars(seq, i, -rmin); 5118 int savedFrom = matcher.from; 5119 int startIndex = (!matcher.transparentBounds) ? 5120 matcher.from : 0; 5121 boolean conditionMatched = false; 5122 int from = Math.max(i - rmaxChars, startIndex); 5123 // Set end boundary 5124 int savedLBT = matcher.lookbehindTo; 5125 matcher.lookbehindTo = i; 5126 // Relax transparent region boundaries for lookbehind 5127 if (matcher.transparentBounds) 5128 matcher.from = 0; 5129 5130 for (int j = i - rminChars; 5131 !conditionMatched && j >= from; 5132 j -= j>from ? countChars(seq, j, -1) : 1) { 5133 conditionMatched = cond.match(matcher, j, seq); 5134 } 5135 matcher.from = savedFrom; 5136 matcher.lookbehindTo = savedLBT; 5137 return conditionMatched && next.match(matcher, i, seq); 5138 } 5139 } 5140 5141 /** 5142 * Zero width negative lookbehind. 5143 * 5144 static class NotBehind extends Node { 5145 Node cond; 5146 int rmax, rmin; 5147 NotBehind(Node cond, int rmax, int rmin) { 5148 this.cond = cond; 5149 this.rmax = rmax; 5150 this.rmin = rmin; 5151 } 5152 5153 boolean match(Matcher matcher, int i, CharSequence seq) { 5154 int savedLBT = matcher.lookbehindTo; 5155 int savedFrom = matcher.from; 5156 boolean conditionMatched = false; 5157 int startIndex = (!matcher.transparentBounds) ? 5158 matcher.from : 0; 5159 int from = Math.max(i - rmax, startIndex); 5160 matcher.lookbehindTo = i; 5161 // Relax transparent region boundaries for lookbehind 5162 if (matcher.transparentBounds) 5163 matcher.from = 0; 5164 for (int j = i - rmin; !conditionMatched && j >= from; j--) { 5165 conditionMatched = cond.match(matcher, j, seq); 5166 } 5167 // Reinstate region boundaries 5168 matcher.from = savedFrom; 5169 matcher.lookbehindTo = savedLBT; 5170 return !conditionMatched && next.match(matcher, i, seq); 5171 } 5172 } 5173 5174 /** 5175 * Zero width negative lookbehind, including supplementary 5176 * characters or unpaired surrogates. 5177 * 5178 static final class NotBehindS extends NotBehind { 5179 NotBehindS(Node cond, int rmax, int rmin) { 5180 super(cond, rmax, rmin); 5181 } 5182 boolean match(Matcher matcher, int i, CharSequence seq) { 5183 int rmaxChars = countChars(seq, i, -rmax); 5184 int rminChars = countChars(seq, i, -rmin); 5185 int savedFrom = matcher.from; 5186 int savedLBT = matcher.lookbehindTo; 5187 boolean conditionMatched = false; 5188 int startIndex = (!matcher.transparentBounds) ? 5189 matcher.from : 0; 5190 int from = Math.max(i - rmaxChars, startIndex); 5191 matcher.lookbehindTo = i; 5192 // Relax transparent region boundaries for lookbehind 5193 if (matcher.transparentBounds) 5194 matcher.from = 0; 5195 for (int j = i - rminChars; 5196 !conditionMatched && j >= from; 5197 j -= j>from ? countChars(seq, j, -1) : 1) { 5198 conditionMatched = cond.match(matcher, j, seq); 5199 } 5200 //Reinstate region boundaries 5201 matcher.from = savedFrom; 5202 matcher.lookbehindTo = savedLBT; 5203 return !conditionMatched && next.match(matcher, i, seq); 5204 } 5205 } 5206 5207 /** 5208 * Handles word boundaries. Includes a field to allow this one class to 5209 * deal with the different types of word boundaries we can match. The word 5210 * characters include underscores, letters, and digits. Non spacing marks 5211 * can are also part of a word if they have a base character, otherwise 5212 * they are ignored for purposes of finding word boundaries. 5213 * 5214 static final class Bound extends Node { 5215 static int LEFT = 0x1; 5216 static int RIGHT= 0x2; 5217 static int BOTH = 0x3; 5218 static int NONE = 0x4; 5219 int type; 5220 boolean useUWORD; 5221 Bound(int n, boolean useUWORD) { 5222 type = n; 5223 this.useUWORD = useUWORD; 5224 } 5225 5226 boolean isWord(int ch) { 5227 return useUWORD ? CharPredicates.WORD().is(ch) 5228 : (ch == '_' || Character.isLetterOrDigit(ch)); 5229 } 5230 5231 int check(Matcher matcher, int i, CharSequence seq) { 5232 int ch; 5233 boolean left = false; 5234 int startIndex = matcher.from; 5235 int endIndex = matcher.to; 5236 if (matcher.transparentBounds) { 5237 startIndex = 0; 5238 endIndex = matcher.getTextLength(); 5239 } 5240 if (i > startIndex) { 5241 ch = Character.codePointBefore(seq, i); 5242 left = (isWord(ch) || 5243 ((Character.getType(ch) == Character.NON_SPACING_MARK) 5244 && hasBaseCharacter(matcher, i-1, seq))); 5245 } 5246 boolean right = false; 5247 if (i < endIndex) { 5248 ch = Character.codePointAt(seq, i); 5249 right = (isWord(ch) || 5250 ((Character.getType(ch) == Character.NON_SPACING_MARK) 5251 && hasBaseCharacter(matcher, i, seq))); 5252 } else { 5253 // Tried to access char past the end 5254 matcher.hitEnd = true; 5255 // The addition of another char could wreck a boundary 5256 matcher.requireEnd = true; 5257 } 5258 return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE); 5259 } 5260 boolean match(Matcher matcher, int i, CharSequence seq) { 5261 return (check(matcher, i, seq) & type) > 0 5262 && next.match(matcher, i, seq); 5263 } 5264 } 5265 5266 /** 5267 * Non spacing marks only count as word characters in bounds calculations 5268 * if they have a base character. 5269 * 5270 private static boolean hasBaseCharacter(Matcher matcher, int i, 5271 CharSequence seq) 5272 { 5273 int start = (!matcher.transparentBounds) ? 5274 matcher.from : 0; 5275 for (int x=i; x >= start; x--) { 5276 int ch = Character.codePointAt(seq, x); 5277 if (Character.isLetterOrDigit(ch)) 5278 return true; 5279 if (Character.getType(ch) == Character.NON_SPACING_MARK) 5280 continue; 5281 return false; 5282 } 5283 return false; 5284 } 5285 5286 /** 5287 * Attempts to match a slice in the input using the Boyer-Moore string 5288 * matching algorithm. The algorithm is based on the idea that the 5289 * pattern can be shifted farther ahead in the search text if it is 5290 * matched right to left. 5291 * <p> 5292 * The pattern is compared to the input one character at a time, from 5293 * the rightmost character in the pattern to the left. If the characters 5294 * all match the pattern has been found. If a character does not match, 5295 * the pattern is shifted right a distance that is the maximum of two 5296 * functions, the bad character shift and the good suffix shift. This 5297 * shift moves the attempted match position through the input more 5298 * quickly than a naive one position at a time check. 5299 * <p> 5300 * The bad character shift is based on the character from the text that 5301 * did not match. If the character does not appear in the pattern, the 5302 * pattern can be shifted completely beyond the bad character. If the 5303 * character does occur in the pattern, the pattern can be shifted to 5304 * line the pattern up with the next occurrence of that character. 5305 * <p> 5306 * The good suffix shift is based on the idea that some subset on the right 5307 * side of the pattern has matched. When a bad character is found, the 5308 * pattern can be shifted right by the pattern length if the subset does 5309 * not occur again in pattern, or by the amount of distance to the 5310 * next occurrence of the subset in the pattern. 5311 * 5312 * Boyer-Moore search methods adapted from code by Amy Yu. 5313 * 5314 static class BnM extends Node { 5315 int[] buffer; 5316 int[] lastOcc; 5317 int[] optoSft; 5318 5319 /** 5320 * Pre calculates arrays needed to generate the bad character 5321 * shift and the good suffix shift. Only the last seven bits 5322 * are used to see if chars match; This keeps the tables small 5323 * and covers the heavily used ASCII range, but occasionally 5324 * results in an aliased match for the bad character shift. 5325 * 5326 static Node optimize(Node node) { 5327 if (!(node instanceof Slice)) { 5328 return node; 5329 } 5330 5331 int[] src = ((Slice) node).buffer; 5332 int patternLength = src.length; 5333 // The BM algorithm requires a bit of overhead; 5334 // If the pattern is short don't use it, since 5335 // a shift larger than the pattern length cannot 5336 // be used anyway. 5337 if (patternLength < 4) { 5338 return node; 5339 } 5340 int i, j; 5341 int[] lastOcc = new int[128]; 5342 int[] optoSft = new int[patternLength]; 5343 // Precalculate part of the bad character shift 5344 // It is a table for where in the pattern each 5345 // lower 7-bit value occurs 5346 for (i = 0; i < patternLength; i++) { 5347 lastOcc[src[i]&0x7F] = i + 1; 5348 } 5349 // Precalculate the good suffix shift 5350 // i is the shift amount being considered 5351 NEXT: for (i = patternLength; i > 0; i--) { 5352 // j is the beginning index of suffix being considered 5353 for (j = patternLength - 1; j >= i; j--) { 5354 // Testing for good suffix 5355 if (src[j] == src[j-i]) { 5356 // src[j..len] is a good suffix 5357 optoSft[j-1] = i; 5358 } else { 5359 // No match. The array has already been 5360 // filled up with correct values before. 5361 continue NEXT; 5362 } 5363 } 5364 // This fills up the remaining of optoSft 5365 // any suffix can not have larger shift amount 5366 // then its sub-suffix. Why??? 5367 while (j > 0) { 5368 optoSft[--j] = i; 5369 } 5370 } 5371 // Set the guard value because of unicode compression 5372 optoSft[patternLength-1] = 1; 5373 if (node instanceof SliceS) 5374 return new BnMS(src, lastOcc, optoSft, node.next); 5375 return new BnM(src, lastOcc, optoSft, node.next); 5376 } 5377 BnM(int[] src, int[] lastOcc, int[] optoSft, Node next) { 5378 this.buffer = src; 5379 this.lastOcc = lastOcc; 5380 this.optoSft = optoSft; 5381 this.next = next; 5382 } 5383 boolean match(Matcher matcher, int i, CharSequence seq) { 5384 int[] src = buffer; 5385 int patternLength = src.length; 5386 int last = matcher.to - patternLength; 5387 5388 // Loop over all possible match positions in text 5389 NEXT: while (i <= last) { 5390 // Loop over pattern from right to left 5391 for (int j = patternLength - 1; j >= 0; j--) { 5392 int ch = seq.charAt(i+j); 5393 if (ch != src[j]) { 5394 // Shift search to the right by the maximum of the 5395 // bad character shift and the good suffix shift 5396 i += Math.max(j + 1 - lastOcc[ch&0x7F], optoSft[j]); 5397 continue NEXT; 5398 } 5399 } 5400 // Entire pattern matched starting at i 5401 matcher.first = i; 5402 boolean ret = next.match(matcher, i + patternLength, seq); 5403 if (ret) { 5404 matcher.first = i; 5405 matcher.groups[0] = matcher.first; 5406 matcher.groups[1] = matcher.last; 5407 return true; 5408 } 5409 i++; 5410 } 5411 // BnM is only used as the leading node in the unanchored case, 5412 // and it replaced its Start() which always searches to the end 5413 // if it doesn't find what it's looking for, so hitEnd is true. 5414 matcher.hitEnd = true; 5415 return false; 5416 } 5417 boolean study(TreeInfo info) { 5418 info.minLength += buffer.length; 5419 info.maxValid = false; 5420 return next.study(info); 5421 } 5422 } 5423 5424 /** 5425 * Supplementary support version of BnM(). Unpaired surrogates are 5426 * also handled by this class. 5427 * 5428 static final class BnMS extends BnM { 5429 int lengthInChars; 5430 5431 BnMS(int[] src, int[] lastOcc, int[] optoSft, Node next) { 5432 super(src, lastOcc, optoSft, next); 5433 for (int cp : buffer) { 5434 lengthInChars += Character.charCount(cp); 5435 } 5436 } 5437 boolean match(Matcher matcher, int i, CharSequence seq) { 5438 int[] src = buffer; 5439 int patternLength = src.length; 5440 int last = matcher.to - lengthInChars; 5441 5442 // Loop over all possible match positions in text 5443 NEXT: while (i <= last) { 5444 // Loop over pattern from right to left 5445 int ch; 5446 for (int j = countChars(seq, i, patternLength), x = patternLength - 1; 5447 j > 0; j -= Character.charCount(ch), x--) { 5448 ch = Character.codePointBefore(seq, i+j); 5449 if (ch != src[x]) { 5450 // Shift search to the right by the maximum of the 5451 // bad character shift and the good suffix shift 5452 int n = Math.max(x + 1 - lastOcc[ch&0x7F], optoSft[x]); 5453 i += countChars(seq, i, n); 5454 continue NEXT; 5455 } 5456 } 5457 // Entire pattern matched starting at i 5458 matcher.first = i; 5459 boolean ret = next.match(matcher, i + lengthInChars, seq); 5460 if (ret) { 5461 matcher.first = i; 5462 matcher.groups[0] = matcher.first; 5463 matcher.groups[1] = matcher.last; 5464 return true; 5465 } 5466 i += countChars(seq, i, 1); 5467 } 5468 matcher.hitEnd = true; 5469 return false; 5470 } 5471 } 5472 5473 @FunctionalInterface 5474 static interface CharPredicate { 5475 boolean is(int ch); 5476 5477 default CharPredicate and(CharPredicate p) { 5478 return ch -> is(ch) && p.is(ch); 5479 } 5480 default CharPredicate union(CharPredicate p) { 5481 return ch -> is(ch) || p.is(ch); 5482 } 5483 default CharPredicate union(CharPredicate p1, 5484 CharPredicate p2) { 5485 return ch -> is(ch) || p1.is(ch) || p2.is(ch); 5486 } 5487 default CharPredicate negate() { 5488 return ch -> !is(ch); 5489 } 5490 } 5491 5492 static interface BmpCharPredicate extends CharPredicate { 5493 5494 default CharPredicate and(CharPredicate p) { 5495 if (p instanceof BmpCharPredicate) 5496 return (BmpCharPredicate)(ch -> is(ch) && p.is(ch)); 5497 return ch -> is(ch) && p.is(ch); 5498 } 5499 default CharPredicate union(CharPredicate p) { 5500 if (p instanceof BmpCharPredicate) 5501 return (BmpCharPredicate)(ch -> is(ch) || p.is(ch)); 5502 return ch -> is(ch) || p.is(ch); 5503 } 5504 static CharPredicate union(CharPredicate... predicates) { 5505 CharPredicate cp = ch -> { 5506 for (CharPredicate p : predicates) { 5507 if (!p.is(ch)) 5508 return false; 5509 } 5510 return true; 5511 }; 5512 for (CharPredicate p : predicates) { 5513 if (! (p instanceof BmpCharPredicate)) 5514 return cp; 5515 } 5516 return (BmpCharPredicate)cp; 5517 } 5518 } 5519 5520 /** 5521 * matches a Perl vertical whitespace 5522 * 5523 static BmpCharPredicate VertWS() { 5524 return cp -> (cp >= 0x0A && cp <= 0x0D) || 5525 cp == 0x85 || cp == 0x2028 || cp == 0x2029; 5526 } 5527 5528 /** 5529 * matches a Perl horizontal whitespace 5530 * 5531 static BmpCharPredicate HorizWS() { 5532 return cp -> 5533 cp == 0x09 || cp == 0x20 || cp == 0xa0 || cp == 0x1680 || 5534 cp == 0x180e || cp >= 0x2000 && cp <= 0x200a || cp == 0x202f || 5535 cp == 0x205f || cp == 0x3000; 5536 } 5537 5538 /** 5539 * for the Unicode category ALL and the dot metacharacter when 5540 * in dotall mode. 5541 * 5542 static CharPredicate ALL() { 5543 return ch -> true; 5544 } 5545 5546 /** 5547 * for the dot metacharacter when dotall is not enabled. 5548 * 5549 static CharPredicate DOT() { 5550 return ch -> 5551 (ch != '\n' && ch != '\r' 5552 && (ch|1) != '\u2029' 5553 && ch != '\u0085'); 5554 } 5555 5556 /** 5557 * the dot metacharacter when dotall is not enabled but UNIX_LINES is enabled. 5558 * 5559 static CharPredicate UNIXDOT() { 5560 return ch -> ch != '\n'; 5561 } 5562 5563 /** 5564 * Indicate that matches a Supplementary Unicode character 5565 * 5566 static CharPredicate SingleS(int c) { 5567 return ch -> ch == c; 5568 } 5569 5570 /** 5571 * A bmp/optimized predicate of single 5572 * 5573 static BmpCharPredicate Single(int c) { 5574 return ch -> ch == c; 5575 } 5576 5577 /** 5578 * Case insensitive matches a given BMP character 5579 * 5580 static BmpCharPredicate SingleI(int lower, int upper) { 5581 return ch -> ch == lower || ch == upper; 5582 } 5583 5584 /** 5585 * Unicode case insensitive matches a given Unicode character 5586 * 5587 static CharPredicate SingleU(int lower) { 5588 return ch -> lower == ch || 5589 lower == Character.toLowerCase(Character.toUpperCase(ch)); 5590 } 5591 5592 private static boolean inRange(int lower, int ch, int upper) { 5593 return lower <= ch && ch <= upper; 5594 } 5595 5596 /** 5597 * Characters within a explicit value range 5598 * 5599 static CharPredicate Range(int lower, int upper) { 5600 if (upper < Character.MIN_HIGH_SURROGATE || 5601 lower > Character.MAX_LOW_SURROGATE && 5602 upper < Character.MIN_SUPPLEMENTARY_CODE_POINT) 5603 return (BmpCharPredicate)(ch -> inRange(lower, ch, upper)); 5604 return ch -> inRange(lower, ch, upper); 5605 } 5606 5607 /** 5608 * Characters within a explicit value range in a case insensitive manner. 5609 * 5610 static CharPredicate CIRange(int lower, int upper) { 5611 return ch -> inRange(lower, ch, upper) || 5612 ASCII.isAscii(ch) && 5613 (inRange(lower, ASCII.toUpper(ch), upper) || 5614 inRange(lower, ASCII.toLower(ch), upper)); 5615 } 5616 5617 static CharPredicate CIRangeU(int lower, int upper) { 5618 return ch -> { 5619 if (inRange(lower, ch, upper)) 5620 return true; 5621 int up = Character.toUpperCase(ch); 5622 return inRange(lower, up, upper) || 5623 inRange(lower, Character.toLowerCase(up), upper); 5624 }; 5625 } 5626 5627 /** 5628 * This must be the very first initializer. 5629 * 5630 static final Node accept = new Node(); 5631 5632 static final Node lastAccept = new LastNode(); 5633 */ 5634 // END Android-removed: Reimplement matching logic via ICU4C. 5635 5636 /** 5637 * Creates a predicate that tests if this pattern is found in a given input 5638 * string. 5639 * 5640 * @apiNote 5641 * This method creates a predicate that behaves as if it creates a matcher 5642 * from the input sequence and then calls {@code find}, for example a 5643 * predicate of the form: 5644 * <pre>{@code 5645 * s -> matcher(s).find(); 5646 * }</pre> 5647 * 5648 * @return The predicate which can be used for finding a match on a 5649 * subsequence of a string 5650 * @since 1.8 5651 * @see Matcher#find 5652 */ asPredicate()5653 public Predicate<String> asPredicate() { 5654 return s -> matcher(s).find(); 5655 } 5656 5657 /** 5658 * Creates a predicate that tests if this pattern matches a given input string. 5659 * 5660 * @apiNote 5661 * This method creates a predicate that behaves as if it creates a matcher 5662 * from the input sequence and then calls {@code matches}, for example a 5663 * predicate of the form: 5664 * <pre>{@code 5665 * s -> matcher(s).matches(); 5666 * }</pre> 5667 * 5668 * @return The predicate which can be used for matching an input string 5669 * against this pattern. 5670 * @since 11 5671 * @see Matcher#matches 5672 */ asMatchPredicate()5673 public Predicate<String> asMatchPredicate() { 5674 return s -> matcher(s).matches(); 5675 } 5676 5677 /** 5678 * Creates a stream from the given input sequence around matches of this 5679 * pattern. 5680 * 5681 * <p> The stream returned by this method contains each substring of the 5682 * input sequence that is terminated by another subsequence that matches 5683 * this pattern or is terminated by the end of the input sequence. The 5684 * substrings in the stream are in the order in which they occur in the 5685 * input. Trailing empty strings will be discarded and not encountered in 5686 * the stream. 5687 * 5688 * <p> If this pattern does not match any subsequence of the input then 5689 * the resulting stream has just one element, namely the input sequence in 5690 * string form. 5691 * 5692 * <p> When there is a positive-width match at the beginning of the input 5693 * sequence then an empty leading substring is included at the beginning 5694 * of the stream. A zero-width match at the beginning however never produces 5695 * such empty leading substring. 5696 * 5697 * <p> If the input sequence is mutable, it must remain constant during the 5698 * execution of the terminal stream operation. Otherwise, the result of the 5699 * terminal stream operation is undefined. 5700 * 5701 * @param input 5702 * The character sequence to be split 5703 * 5704 * @return The stream of strings computed by splitting the input 5705 * around matches of this pattern 5706 * @see #split(CharSequence) 5707 * @since 1.8 5708 */ splitAsStream(final CharSequence input)5709 public Stream<String> splitAsStream(final CharSequence input) { 5710 class MatcherIterator implements Iterator<String> { 5711 private Matcher matcher; 5712 // The start position of the next sub-sequence of input 5713 // when current == input.length there are no more elements 5714 private int current; 5715 // null if the next element, if any, needs to obtained 5716 private String nextElement; 5717 // > 0 if there are N next empty elements 5718 private int emptyElementCount; 5719 5720 public String next() { 5721 if (!hasNext()) 5722 throw new NoSuchElementException(); 5723 5724 if (emptyElementCount == 0) { 5725 String n = nextElement; 5726 nextElement = null; 5727 return n; 5728 } else { 5729 emptyElementCount--; 5730 return ""; 5731 } 5732 } 5733 5734 public boolean hasNext() { 5735 if (matcher == null) { 5736 matcher = matcher(input); 5737 // If the input is an empty string then the result can only be a 5738 // stream of the input. Induce that by setting the empty 5739 // element count to 1 5740 // Android-changed: Keep old behavior on Android 13 or below. http://b/286499139 5741 // emptyElementCount = input.length() == 0 ? 1 : 0; 5742 if (input.length() == 0 5743 && VMRuntime.getSdkVersion() >= VersionCodes.UPSIDE_DOWN_CAKE 5744 && Compatibility.isChangeEnabled( 5745 SPLIT_AS_STREAM_RETURNS_SINGLE_EMPTY_STRING)) { 5746 emptyElementCount = 1; 5747 } else { 5748 emptyElementCount = 0; 5749 } 5750 } 5751 if (nextElement != null || emptyElementCount > 0) 5752 return true; 5753 5754 if (current == input.length()) 5755 return false; 5756 5757 // Consume the next matching element 5758 // Count sequence of matching empty elements 5759 while (matcher.find()) { 5760 nextElement = input.subSequence(current, matcher.start()).toString(); 5761 current = matcher.end(); 5762 if (!nextElement.isEmpty()) { 5763 return true; 5764 } else if (current > 0) { // no empty leading substring for zero-width 5765 // match at the beginning of the input 5766 emptyElementCount++; 5767 } 5768 } 5769 5770 // Consume last matching element 5771 nextElement = input.subSequence(current, input.length()).toString(); 5772 current = input.length(); 5773 if (!nextElement.isEmpty()) { 5774 return true; 5775 } else { 5776 // Ignore a terminal sequence of matching empty elements 5777 emptyElementCount = 0; 5778 nextElement = null; 5779 return false; 5780 } 5781 } 5782 } 5783 return StreamSupport.stream(Spliterators.spliteratorUnknownSize( 5784 new MatcherIterator(), Spliterator.ORDERED | Spliterator.NONNULL), false); 5785 } 5786 5787 // Android-added: Backward-compatible flag for splitAsStream() API. 5788 /** 5789 * Since Android 14, {@link Pattern#splitAsStream(CharSequence)} return a stream of a single 5790 * empty String as described in the API documentation. Previously, given an empty string input, 5791 * the method returns an empty stream. 5792 * 5793 * This flag is enabled for apps targeting Android 14+. 5794 * 5795 * @hide 5796 */ 5797 @ChangeId 5798 @EnabledSince(targetSdkVersion = VersionCodes.UPSIDE_DOWN_CAKE) 5799 public static final long SPLIT_AS_STREAM_RETURNS_SINGLE_EMPTY_STRING = 288845345L; 5800 5801 } 5802