1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  * Copyright (c) 1999, 2013, Oracle and/or its affiliates. All rights reserved.
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This code is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU General Public License version 2 only, as
8  * published by the Free Software Foundation.  Oracle designates this
9  * particular file as subject to the "Classpath" exception as provided
10  * by Oracle in the LICENSE file that accompanied this code.
11  *
12  * This code is distributed in the hope that it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15  * version 2 for more details (a copy is included in the LICENSE file that
16  * accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License version
19  * 2 along with this work; if not, write to the Free Software Foundation,
20  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21  *
22  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
23  * or visit www.oracle.com if you need additional information or have any
24  * questions.
25  */
26 
27 package java.util.regex;
28 
29 import libcore.util.NativeAllocationRegistry;
30 
31 /**
 * An engine that performs match operations on a {@link java.lang.CharSequence
 * character sequence} by interpreting a {@link Pattern}.
34  *
35  * <p> A matcher is created from a pattern by invoking the pattern's {@link
36  * Pattern#matcher matcher} method.  Once created, a matcher can be used to
37  * perform three different kinds of match operations:
38  *
39  * <ul>
40  *
41  *   <li><p> The {@link #matches matches} method attempts to match the entire
42  *   input sequence against the pattern.  </p></li>
43  *
44  *   <li><p> The {@link #lookingAt lookingAt} method attempts to match the
45  *   input sequence, starting at the beginning, against the pattern.  </p></li>
46  *
47  *   <li><p> The {@link #find find} method scans the input sequence looking for
48  *   the next subsequence that matches the pattern.  </p></li>
49  *
50  * </ul>
51  *
52  * <p> Each of these methods returns a boolean indicating success or failure.
53  * More information about a successful match can be obtained by querying the
54  * state of the matcher.
55  *
56  * <p> A matcher finds matches in a subset of its input called the
57  * <i>region</i>. By default, the region contains all of the matcher's input.
 * The region can be modified via the {@link #region region} method and queried
59  * via the {@link #regionStart regionStart} and {@link #regionEnd regionEnd}
60  * methods. The way that the region boundaries interact with some pattern
61  * constructs can be changed. See {@link #useAnchoringBounds
62  * useAnchoringBounds} and {@link #useTransparentBounds useTransparentBounds}
63  * for more details.
64  *
65  * <p> This class also defines methods for replacing matched subsequences with
66  * new strings whose contents can, if desired, be computed from the match
67  * result.  The {@link #appendReplacement appendReplacement} and {@link
68  * #appendTail appendTail} methods can be used in tandem in order to collect
69  * the result into an existing string buffer, or the more convenient {@link
70  * #replaceAll replaceAll} method can be used to create a string in which every
71  * matching subsequence in the input sequence is replaced.
72  *
73  * <p> The explicit state of a matcher includes the start and end indices of
74  * the most recent successful match.  It also includes the start and end
75  * indices of the input subsequence captured by each <a
76  * href="Pattern.html#cg">capturing group</a> in the pattern as well as a total
77  * count of such subsequences.  As a convenience, methods are also provided for
78  * returning these captured subsequences in string form.
79  *
80  * <p> The explicit state of a matcher is initially undefined; attempting to
81  * query any part of it before a successful match will cause an {@link
82  * IllegalStateException} to be thrown.  The explicit state of a matcher is
83  * recomputed by every match operation.
84  *
85  * <p> The implicit state of a matcher includes the input character sequence as
86  * well as the <i>append position</i>, which is initially zero and is updated
87  * by the {@link #appendReplacement appendReplacement} method.
88  *
89  * <p> A matcher may be reset explicitly by invoking its {@link #reset()}
90  * method or, if a new input sequence is desired, its {@link
91  * #reset(java.lang.CharSequence) reset(CharSequence)} method.  Resetting a
92  * matcher discards its explicit state information and sets the append position
93  * to zero.
94  *
95  * <p> Instances of this class are not safe for use by multiple concurrent
96  * threads. </p>
97  *
98  *
99  * @author      Mike McCloskey
100  * @author      Mark Reinhold
101  * @author      JSR-51 Expert Group
102  * @since       1.4
103  * @spec        JSR-51
104  */
105 
106 public final class Matcher implements MatchResult {
/**
 * The Pattern object that created this Matcher. Replaced whenever
 * {@code usePattern} installs a new pattern.
 */
private Pattern pattern;

/**
 * The address of the native peer, as returned by {@code openImpl}.
 * Uses of this must be manually synchronized to avoid native crashes.
 */
private long address;

/**
 * If non-null, a Runnable that can be used to explicitly deallocate address.
 * It is obtained from {@code registry.registerNativeAllocation} and is run by
 * {@code usePattern} before a replacement native peer is opened.
 */
private Runnable nativeFinalizer;

/**
 * Registry with which every native peer opened by a Matcher is registered.
 * NOTE(review): presumably pairs the peer with the native free function and
 * size reported by getNativeFinalizer()/nativeSize() - confirm against
 * NativeAllocationRegistry.
 */
private static final NativeAllocationRegistry registry = new NativeAllocationRegistry(
        Matcher.class.getClassLoader(), getNativeFinalizer(), nativeSize());

/**
 * Holds the input text.
 */
private String input;

/**
 * Holds the start of the region, or 0 if the matching should start at the
 * beginning of the text.
 */
private int regionStart;

/**
 * Holds the end of the region, or input.length() if the matching should
 * go until the end of the input.
 */
private int regionEnd;

/**
 * Holds the position where the next append operation will take place.
 */
private int appendPos;

/**
 * Reflects whether a match has been found during the most recent find
 * operation.
 */
private boolean matchFound;

/**
 * Holds the offsets for the most recent match. Two slots are kept per group:
 * the start offset of group g is at index 2*g and its end offset at index
 * 2*g + 1, with group 0 being the whole match. A value of -1 is treated as
 * "group did not match" by the group accessors.
 */
private int[] matchOffsets;

/**
 * Reflects whether the bounds of the region are anchoring.
 */
private boolean anchoringBounds = true;

/**
 * Reflects whether the bounds of the region are transparent.
 */
private boolean transparentBounds;
168 
169     /**
170      * All matchers have the state used by Pattern during a match.
171      */
Matcher(Pattern parent, CharSequence text) {
    // Install the pattern first (this opens the native peer), then point the
    // matcher at the supplied input.
    this.usePattern(parent);
    this.reset(text);
}
176 
177     /**
178      * Returns the pattern that is interpreted by this matcher.
179      *
180      * @return  The pattern for which this matcher was created
181      */
pattern()182     public Pattern pattern() {
183         return pattern;
184     }
185 
186     /**
187      * Returns the match state of this matcher as a {@link MatchResult}.
188      * The result is unaffected by subsequent operations performed upon this
189      * matcher.
190      *
191      * @return  a <code>MatchResult</code> with the state of this matcher
192      * @since 1.5
193      */
toMatchResult()194     public MatchResult toMatchResult() {
195         ensureMatch();
196         return new OffsetBasedMatchResult(input, matchOffsets);
197     }
198 
199     /**
200       * Changes the <tt>Pattern</tt> that this <tt>Matcher</tt> uses to
201       * find matches with.
202       *
203       * <p> This method causes this matcher to lose information
204       * about the groups of the last match that occurred. The
205       * matcher's position in the input is maintained and its
206       * last append position is unaffected.</p>
207       *
208       * @param  newPattern
209       *         The new pattern used by this matcher
210       * @return  This matcher
211       * @throws  IllegalArgumentException
212       *          If newPattern is <tt>null</tt>
213       * @since 1.5
214       */
usePattern(Pattern newPattern)215     public Matcher usePattern(Pattern newPattern) {
216         if (newPattern == null) {
217             throw new IllegalArgumentException("newPattern == null");
218         }
219 
220         this.pattern = newPattern;
221 
222         synchronized (this) {
223             if (nativeFinalizer != null) {
224                 nativeFinalizer.run();
225                 address = 0; // In case openImpl throws.
226                 nativeFinalizer = null;
227             }
228             address = openImpl(pattern.address);
229             nativeFinalizer = registry.registerNativeAllocation(this, address);
230         }
231 
232         if (input != null) {
233             resetForInput();
234         }
235 
236         matchOffsets = new int[(groupCount() + 1) * 2];
237         matchFound = false;
238         return this;
239     }
240 
241     /**
242      * Returns the offset after the last character matched.  </p>
243      *
244      * @return  The offset after the last character matched
245      *
246      * @throws  IllegalStateException
247      *          If no match has yet been attempted,
248      *          or if the previous match operation failed
249      */
end()250     public int end() {
251         return end(0);
252     }
253 
254     /**
255      * Returns the offset after the last character of the subsequence
256      * captured by the given group during the previous match operation.
257      *
258      * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
259      * to right, starting at one.  Group zero denotes the entire pattern, so
260      * the expression <i>m.</i><tt>end(0)</tt> is equivalent to
261      * <i>m.</i><tt>end()</tt>.  </p>
262      *
263      * @param  group
264      *         The index of a capturing group in this matcher's pattern
265      *
266      * @return  The offset after the last character captured by the group,
267      *          or <tt>-1</tt> if the match was successful
268      *          but the group itself did not match anything
269      *
270      * @throws  IllegalStateException
271      *          If no match has yet been attempted,
272      *          or if the previous match operation failed
273      *
274      * @throws  IndexOutOfBoundsException
275      *          If there is no capturing group in the pattern
276      *          with the given index
277      */
end(int group)278     public int end(int group) {
279         ensureMatch();
280         return matchOffsets[(group * 2) + 1];
281     }
282 
283     /**
284      * Returns the offset after the last character of the subsequence
285      * captured by the given <a href="Pattern.html#groupname">named-capturing
286      * group</a> during the previous match operation.
287      *
288      * @param  name
289      *         The name of a named-capturing group in this matcher's pattern
290      *
291      * @return  The offset after the last character captured by the group,
292      *          or {@code -1} if the match was successful
293      *          but the group itself did not match anything
294      *
295      * @throws  IllegalStateException
296      *          If no match has yet been attempted,
297      *          or if the previous match operation failed
298      *
299      * @throws  IllegalArgumentException
300      *          If there is no capturing group in the pattern
301      *          with the given name
302      * @since 1.8
303      */
end(String name)304     public int end(String name) {
305         ensureMatch();
306         return matchOffsets[getMatchedGroupIndex(pattern.address, name) * 2 + 1];
307     }
308 
309 
310     /**
311      * Returns the input subsequence matched by the previous match.
312      *
313      * <p> For a matcher <i>m</i> with input sequence <i>s</i>,
314      * the expressions <i>m.</i><tt>group()</tt> and
315      * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(),</tt>&nbsp;<i>m.</i><tt>end())</tt>
316      * are equivalent.  </p>
317      *
318      * <p> Note that some patterns, for example <tt>a*</tt>, match the empty
319      * string.  This method will return the empty string when the pattern
320      * successfully matches the empty string in the input.  </p>
321      *
322      * @return The (possibly empty) subsequence matched by the previous match,
323      *         in string form
324      *
325      * @throws  IllegalStateException
326      *          If no match has yet been attempted,
327      *          or if the previous match operation failed
328      */
group()329     public String group() {
330         return group(0);
331     }
332 
333     /**
334      * Returns the input subsequence captured by the given group during the
335      * previous match operation.
336      *
337      * <p> For a matcher <i>m</i>, input sequence <i>s</i>, and group index
338      * <i>g</i>, the expressions <i>m.</i><tt>group(</tt><i>g</i><tt>)</tt> and
339      * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(</tt><i>g</i><tt>),</tt>&nbsp;<i>m.</i><tt>end(</tt><i>g</i><tt>))</tt>
340      * are equivalent.  </p>
341      *
342      * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
343      * to right, starting at one.  Group zero denotes the entire pattern, so
344      * the expression <tt>m.group(0)</tt> is equivalent to <tt>m.group()</tt>.
345      * </p>
346      *
347      * <p> If the match was successful but the group specified failed to match
348      * any part of the input sequence, then <tt>null</tt> is returned. Note
349      * that some groups, for example <tt>(a*)</tt>, match the empty string.
350      * This method will return the empty string when such a group successfully
351      * matches the empty string in the input.  </p>
352      *
353      * @param  group
354      *         The index of a capturing group in this matcher's pattern
355      *
356      * @return  The (possibly empty) subsequence captured by the group
357      *          during the previous match, or <tt>null</tt> if the group
358      *          failed to match part of the input
359      *
360      * @throws  IllegalStateException
361      *          If no match has yet been attempted,
362      *          or if the previous match operation failed
363      *
364      * @throws  IndexOutOfBoundsException
365      *          If there is no capturing group in the pattern
366      *          with the given index
367      */
group(int group)368     public String group(int group) {
369         ensureMatch();
370         int from = matchOffsets[group * 2];
371         int to = matchOffsets[(group * 2) + 1];
372         if (from == -1 || to == -1) {
373             return null;
374         } else {
375             return input.substring(from, to);
376         }
377     }
378 
379     /**
380      * Returns the input subsequence captured by the given
381      * <a href="Pattern.html#groupname">named-capturing group</a> during the previous
382      * match operation.
383      *
384      * <p> If the match was successful but the group specified failed to match
385      * any part of the input sequence, then <tt>null</tt> is returned. Note
386      * that some groups, for example <tt>(a*)</tt>, match the empty string.
387      * This method will return the empty string when such a group successfully
388      * matches the empty string in the input.  </p>
389      *
390      * @param  name
391      *         The name of a named-capturing group in this matcher's pattern
392      *
393      * @return  The (possibly empty) subsequence captured by the named group
394      *          during the previous match, or <tt>null</tt> if the group
395      *          failed to match part of the input
396      *
397      * @throws  IllegalStateException
398      *          If no match has yet been attempted,
399      *          or if the previous match operation failed
400      *
401      * @throws  IllegalArgumentException
402      *          If there is no capturing group in the pattern
403      *          with the given name
404      * @since 1.7
405      */
group(String name)406     public String group(String name) {
407         ensureMatch();
408         int group = getMatchedGroupIndex(pattern.address, name);
409         int from = matchOffsets[group * 2];
410         int to = matchOffsets[(group * 2) + 1];
411         if (from == -1 || to == -1) {
412             return null;
413         } else {
414             return input.substring(from, to);
415         }
416     }
417 
418     /**
419      * Returns the number of capturing groups in this matcher's pattern.
420      *
421      * <p> Group zero denotes the entire pattern by convention. It is not
422      * included in this count.
423      *
424      * <p> Any non-negative integer smaller than or equal to the value
425      * returned by this method is guaranteed to be a valid group index for
426      * this matcher.  </p>
427      *
428      * @return The number of capturing groups in this matcher's pattern
429      */
groupCount()430     public int groupCount() {
431         synchronized (this) {
432             return groupCountImpl(address);
433         }
434     }
435 
436     /**
437      * Attempts to match the entire region against the pattern.
438      *
439      * <p> If the match succeeds then more information can be obtained via the
440      * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods.  </p>
441      *
442      * @return  <tt>true</tt> if, and only if, the entire region sequence
443      *          matches this matcher's pattern
444      */
matches()445     public boolean matches() {
446         synchronized (this) {
447             matchFound = matchesImpl(address, matchOffsets);
448         }
449         return matchFound;
450     }
451 
452     /**
453      * Attempts to find the next subsequence of the input sequence that matches
454      * the pattern.
455      *
456      * <p> This method starts at the beginning of this matcher's region, or, if
457      * a previous invocation of the method was successful and the matcher has
458      * not since been reset, at the first character not matched by the previous
459      * match.
460      *
461      * <p> If the match succeeds then more information can be obtained via the
462      * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods.  </p>
463      *
464      * @return  <tt>true</tt> if, and only if, a subsequence of the input
465      *          sequence matches this matcher's pattern
466      */
find()467     public boolean find() {
468         synchronized (this) {
469             matchFound = findNextImpl(address, matchOffsets);
470         }
471         return matchFound;
472     }
473 
474     /**
475      * Resets this matcher and then attempts to find the next subsequence of
476      * the input sequence that matches the pattern, starting at the specified
477      * index.
478      *
479      * <p> If the match succeeds then more information can be obtained via the
480      * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods, and subsequent
481      * invocations of the {@link #find()} method will start at the first
482      * character not matched by this match.  </p>
483      *
484      * @throws  IndexOutOfBoundsException
485      *          If start is less than zero or if start is greater than the
486      *          length of the input sequence.
487      *
488      * @return  <tt>true</tt> if, and only if, a subsequence of the input
489      *          sequence starting at the given index matches this matcher's
490      *          pattern
491      */
find(int start)492     public boolean find(int start) {
493         if (start < 0 || start > input.length()) {
494             throw new IndexOutOfBoundsException("start=" + start + "; length=" + input.length());
495         }
496 
497         synchronized (this) {
498             matchFound = findImpl(address, start, matchOffsets);
499         }
500         return matchFound;
501     }
502 
503     /**
504      * Attempts to match the input sequence, starting at the beginning of the
505      * region, against the pattern.
506      *
507      * <p> Like the {@link #matches matches} method, this method always starts
508      * at the beginning of the region; unlike that method, it does not
509      * require that the entire region be matched.
510      *
511      * <p> If the match succeeds then more information can be obtained via the
512      * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods.  </p>
513      *
514      * @return  <tt>true</tt> if, and only if, a prefix of the input
515      *          sequence matches this matcher's pattern
516      */
lookingAt()517     public boolean lookingAt() {
518         synchronized (this) {
519             matchFound = lookingAtImpl(address, matchOffsets);
520         }
521         return matchFound;
522     }
523 
524     /**
525      * Returns a literal replacement <code>String</code> for the specified
526      * <code>String</code>.
527      *
528      * This method produces a <code>String</code> that will work
529      * as a literal replacement <code>s</code> in the
530      * <code>appendReplacement</code> method of the {@link Matcher} class.
531      * The <code>String</code> produced will match the sequence of characters
532      * in <code>s</code> treated as a literal sequence. Slashes ('\') and
533      * dollar signs ('$') will be given no special meaning.
534      *
535      * @param  s The string to be literalized
536      * @return  A literal string replacement
537      * @since 1.5
538      */
quoteReplacement(String s)539     public static String quoteReplacement(String s) {
540         if ((s.indexOf('\\') == -1) && (s.indexOf('$') == -1))
541             return s;
542         StringBuilder sb = new StringBuilder();
543         for (int i=0; i<s.length(); i++) {
544             char c = s.charAt(i);
545             if (c == '\\' || c == '$') {
546                 sb.append('\\');
547             }
548             sb.append(c);
549         }
550         return sb.toString();
551     }
552 
553     /**
554      * Implements a non-terminal append-and-replace step.
555      *
556      * <p> This method performs the following actions: </p>
557      *
558      * <ol>
559      *
560      *   <li><p> It reads characters from the input sequence, starting at the
561      *   append position, and appends them to the given string buffer.  It
562      *   stops after reading the last character preceding the previous match,
563      *   that is, the character at index {@link
564      *   #start()}&nbsp;<tt>-</tt>&nbsp;<tt>1</tt>.  </p></li>
565      *
566      *   <li><p> It appends the given replacement string to the string buffer.
567      *   </p></li>
568      *
569      *   <li><p> It sets the append position of this matcher to the index of
570      *   the last character matched, plus one, that is, to {@link #end()}.
571      *   </p></li>
572      *
573      * </ol>
574      *
     * <p> The replacement string may contain references to subsequences
     * captured during the previous match: Each occurrence of
     * <tt>${</tt><i>name</i><tt>}</tt> or <tt>$</tt><i>g</i> will be replaced
     * by the result of evaluating the corresponding
     * {@link #group(String) group(name)} or {@link #group(int) group(g)}
     * respectively. For <tt>$</tt><i>g</i>,
579      * the first number after the <tt>$</tt> is always treated as part of
580      * the group reference. Subsequent numbers are incorporated into g if
581      * they would form a legal group reference. Only the numerals '0'
582      * through '9' are considered as potential components of the group
583      * reference. If the second group matched the string <tt>"foo"</tt>, for
584      * example, then passing the replacement string <tt>"$2bar"</tt> would
585      * cause <tt>"foobar"</tt> to be appended to the string buffer. A dollar
586      * sign (<tt>$</tt>) may be included as a literal in the replacement
587      * string by preceding it with a backslash (<tt>\$</tt>).
588      *
589      * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
590      * the replacement string may cause the results to be different than if it
591      * were being treated as a literal replacement string. Dollar signs may be
592      * treated as references to captured subsequences as described above, and
593      * backslashes are used to escape literal characters in the replacement
594      * string.
595      *
596      * <p> This method is intended to be used in a loop together with the
597      * {@link #appendTail appendTail} and {@link #find find} methods.  The
598      * following code, for example, writes <tt>one dog two dogs in the
599      * yard</tt> to the standard-output stream: </p>
600      *
601      * <blockquote><pre>
602      * Pattern p = Pattern.compile("cat");
603      * Matcher m = p.matcher("one cat two cats in the yard");
604      * StringBuffer sb = new StringBuffer();
605      * while (m.find()) {
606      *     m.appendReplacement(sb, "dog");
607      * }
608      * m.appendTail(sb);
609      * System.out.println(sb.toString());</pre></blockquote>
610      *
611      * @param  sb
612      *         The target string buffer
613      *
614      * @param  replacement
615      *         The replacement string
616      *
617      * @return  This matcher
618      *
619      * @throws  IllegalStateException
620      *          If no match has yet been attempted,
621      *          or if the previous match operation failed
622      *
623      * @throws  IllegalArgumentException
624      *          If the replacement string refers to a named-capturing
625      *          group that does not exist in the pattern
626      *
627      * @throws  IndexOutOfBoundsException
628      *          If the replacement string refers to a capturing group
629      *          that does not exist in the pattern
630      */
appendReplacement(StringBuffer sb, String replacement)631     public Matcher appendReplacement(StringBuffer sb, String replacement) {
632         sb.append(input.substring(appendPos, start()));
633         appendEvaluated(sb, replacement);
634         appendPos = end();
635 
636         return this;
637     }
638 
639     /**
640      * Internal helper method to append a given string to a given string buffer.
641      * If the string contains any references to groups, these are replaced by
642      * the corresponding group's contents.
643      *
644      * @param buffer the string buffer.
645      * @param s the string to append.
646      */
appendEvaluated(StringBuffer buffer, String s)647     private void appendEvaluated(StringBuffer buffer, String s) {
648         boolean escape = false;
649         boolean dollar = false;
650         boolean escapeNamedGroup = false;
651         int escapeNamedGroupStart = -1;
652 
653         for (int i = 0; i < s.length(); i++) {
654             char c = s.charAt(i);
655             if (c == '\\' && !escape) {
656                 escape = true;
657             } else if (c == '$' && !escape) {
658                 dollar = true;
659             } else if (c >= '0' && c <= '9' && dollar) {
660                 buffer.append(group(c - '0'));
661                 dollar = false;
662             } else if (c == '{' && dollar) {
663                 escapeNamedGroup = true;
664                 escapeNamedGroupStart = i;
665             } else if (c == '}' && dollar && escapeNamedGroup) {
666                 String namedGroupName =
667                     s.substring(escapeNamedGroupStart + 1, i);
668                 buffer.append(group(namedGroupName));
669                 dollar = false;
670                 escapeNamedGroup = false;
671             } else if (c != '}' && dollar && escapeNamedGroup) {
672                 continue;
673             } else {
674                 buffer.append(c);
675                 dollar = false;
676                 escape = false;
677                 escapeNamedGroup = false;
678             }
679         }
680 
681         if (escapeNamedGroup) {
682             throw new IllegalArgumentException("Missing ending brace '}' from replacement string");
683         }
684 
685         if (escape) {
686             throw new ArrayIndexOutOfBoundsException(s.length());
687         }
688     }
689 
690 
691     /**
692      * Implements a terminal append-and-replace step.
693      *
694      * <p> This method reads characters from the input sequence, starting at
695      * the append position, and appends them to the given string buffer.  It is
696      * intended to be invoked after one or more invocations of the {@link
697      * #appendReplacement appendReplacement} method in order to copy the
698      * remainder of the input sequence.  </p>
699      *
700      * @param  sb
701      *         The target string buffer
702      *
703      * @return  The target string buffer
704      */
appendTail(StringBuffer sb)705     public StringBuffer appendTail(StringBuffer sb) {
706         if (appendPos < regionEnd) {
707             sb.append(input.substring(appendPos, regionEnd));
708         }
709         return sb;
710     }
711 
712     /**
713      * Replaces every subsequence of the input sequence that matches the
714      * pattern with the given replacement string.
715      *
716      * <p> This method first resets this matcher.  It then scans the input
717      * sequence looking for matches of the pattern.  Characters that are not
718      * part of any match are appended directly to the result string; each match
719      * is replaced in the result by the replacement string.  The replacement
720      * string may contain references to captured subsequences as in the {@link
721      * #appendReplacement appendReplacement} method.
722      *
723      * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
724      * the replacement string may cause the results to be different than if it
725      * were being treated as a literal replacement string. Dollar signs may be
726      * treated as references to captured subsequences as described above, and
727      * backslashes are used to escape literal characters in the replacement
728      * string.
729      *
730      * <p> Given the regular expression <tt>a*b</tt>, the input
731      * <tt>"aabfooaabfooabfoob"</tt>, and the replacement string
732      * <tt>"-"</tt>, an invocation of this method on a matcher for that
733      * expression would yield the string <tt>"-foo-foo-foo-"</tt>.
734      *
735      * <p> Invoking this method changes this matcher's state.  If the matcher
736      * is to be used in further matching operations then it should first be
737      * reset.  </p>
738      *
739      * @param  replacement
740      *         The replacement string
741      *
742      * @return  The string constructed by replacing each matching subsequence
743      *          by the replacement string, substituting captured subsequences
744      *          as needed
745      */
replaceAll(String replacement)746     public String replaceAll(String replacement) {
747         reset();
748         StringBuffer buffer = new StringBuffer(input.length());
749         while (find()) {
750             appendReplacement(buffer, replacement);
751         }
752         return appendTail(buffer).toString();
753     }
754 
755     /**
756      * Replaces the first subsequence of the input sequence that matches the
757      * pattern with the given replacement string.
758      *
759      * <p> This method first resets this matcher.  It then scans the input
760      * sequence looking for a match of the pattern.  Characters that are not
761      * part of the match are appended directly to the result string; the match
762      * is replaced in the result by the replacement string.  The replacement
763      * string may contain references to captured subsequences as in the {@link
764      * #appendReplacement appendReplacement} method.
765      *
766      * <p>Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
767      * the replacement string may cause the results to be different than if it
768      * were being treated as a literal replacement string. Dollar signs may be
769      * treated as references to captured subsequences as described above, and
770      * backslashes are used to escape literal characters in the replacement
771      * string.
772      *
773      * <p> Given the regular expression <tt>dog</tt>, the input
774      * <tt>"zzzdogzzzdogzzz"</tt>, and the replacement string
775      * <tt>"cat"</tt>, an invocation of this method on a matcher for that
776      * expression would yield the string <tt>"zzzcatzzzdogzzz"</tt>.  </p>
777      *
778      * <p> Invoking this method changes this matcher's state.  If the matcher
779      * is to be used in further matching operations then it should first be
780      * reset.  </p>
781      *
782      * @param  replacement
783      *         The replacement string
784      * @return  The string constructed by replacing the first matching
785      *          subsequence by the replacement string, substituting captured
786      *          subsequences as needed
787      */
replaceFirst(String replacement)788     public String replaceFirst(String replacement) {
789         reset();
790         StringBuffer buffer = new StringBuffer(input.length());
791         if (find()) {
792             appendReplacement(buffer, replacement);
793         }
794         return appendTail(buffer).toString();
795     }
796 
797     /**
798      * Sets the limits of this matcher's region. The region is the part of the
799      * input sequence that will be searched to find a match. Invoking this
800      * method resets the matcher, and then sets the region to start at the
801      * index specified by the <code>start</code> parameter and end at the
802      * index specified by the <code>end</code> parameter.
803      *
804      * <p>Depending on the transparency and anchoring being used (see
805      * {@link #useTransparentBounds useTransparentBounds} and
806      * {@link #useAnchoringBounds useAnchoringBounds}), certain constructs such
807      * as anchors may behave differently at or around the boundaries of the
808      * region.
809      *
810      * @param  start
811      *         The index to start searching at (inclusive)
812      * @param  end
813      *         The index to end searching at (exclusive)
814      * @throws  IndexOutOfBoundsException
815      *          If start or end is less than zero, if
816      *          start is greater than the length of the input sequence, if
817      *          end is greater than the length of the input sequence, or if
818      *          start is greater than end.
819      * @return  this matcher
820      * @since 1.5
821      */
region(int start, int end)822     public Matcher region(int start, int end) {
823         return reset(input, start, end);
824     }
825 
826     /**
827      * Reports the start index of this matcher's region. The
828      * searches this matcher conducts are limited to finding matches
829      * within {@link #regionStart regionStart} (inclusive) and
830      * {@link #regionEnd regionEnd} (exclusive).
831      *
832      * @return  The starting point of this matcher's region
833      * @since 1.5
834      */
regionStart()835     public int regionStart() {
836         return regionStart;
837     }
838 
839     /**
840      * Reports the end index (exclusive) of this matcher's region.
841      * The searches this matcher conducts are limited to finding matches
842      * within {@link #regionStart regionStart} (inclusive) and
843      * {@link #regionEnd regionEnd} (exclusive).
844      *
845      * @return  the ending point of this matcher's region
846      * @since 1.5
847      */
regionEnd()848     public int regionEnd() {
849         return regionEnd;
850     }
851 
852     /**
853      * Queries the transparency of region bounds for this matcher.
854      *
855      * <p> This method returns <tt>true</tt> if this matcher uses
856      * <i>transparent</i> bounds, <tt>false</tt> if it uses <i>opaque</i>
857      * bounds.
858      *
859      * <p> See {@link #useTransparentBounds useTransparentBounds} for a
860      * description of transparent and opaque bounds.
861      *
862      * <p> By default, a matcher uses opaque region boundaries.
863      *
864      * @return <tt>true</tt> iff this matcher is using transparent bounds,
865      *         <tt>false</tt> otherwise.
866      * @see java.util.regex.Matcher#useTransparentBounds(boolean)
867      * @since 1.5
868      */
hasTransparentBounds()869     public boolean hasTransparentBounds() {
870         return transparentBounds;
871     }
872 
873     /**
874      * Sets the transparency of region bounds for this matcher.
875      *
876      * <p> Invoking this method with an argument of <tt>true</tt> will set this
877      * matcher to use <i>transparent</i> bounds. If the boolean
878      * argument is <tt>false</tt>, then <i>opaque</i> bounds will be used.
879      *
880      * <p> Using transparent bounds, the boundaries of this
881      * matcher's region are transparent to lookahead, lookbehind,
882      * and boundary matching constructs. Those constructs can see beyond the
883      * boundaries of the region to see if a match is appropriate.
884      *
885      * <p> Using opaque bounds, the boundaries of this matcher's
886      * region are opaque to lookahead, lookbehind, and boundary matching
887      * constructs that may try to see beyond them. Those constructs cannot
888      * look past the boundaries so they will fail to match anything outside
889      * of the region.
890      *
891      * <p> By default, a matcher uses opaque bounds.
892      *
893      * @param  value a boolean indicating whether to use opaque or transparent
894      *         regions
895      * @return this matcher
896      * @see java.util.regex.Matcher#hasTransparentBounds
897      * @since 1.5
898      */
useTransparentBounds(boolean value)899     public Matcher useTransparentBounds(boolean value) {
900         synchronized (this) {
901             transparentBounds = value;
902             useTransparentBoundsImpl(address, value);
903         }
904         return this;
905     }
906 
907     /**
908      * Queries the anchoring of region bounds for this matcher.
909      *
910      * <p> This method returns <tt>true</tt> if this matcher uses
911      * <i>anchoring</i> bounds, <tt>false</tt> otherwise.
912      *
913      * <p> See {@link #useAnchoringBounds useAnchoringBounds} for a
914      * description of anchoring bounds.
915      *
916      * <p> By default, a matcher uses anchoring region boundaries.
917      *
918      * @return <tt>true</tt> iff this matcher is using anchoring bounds,
919      *         <tt>false</tt> otherwise.
920      * @see java.util.regex.Matcher#useAnchoringBounds(boolean)
921      * @since 1.5
922      */
hasAnchoringBounds()923     public boolean hasAnchoringBounds() {
924         return anchoringBounds;
925     }
926 
927     /**
928      * Sets the anchoring of region bounds for this matcher.
929      *
930      * <p> Invoking this method with an argument of <tt>true</tt> will set this
931      * matcher to use <i>anchoring</i> bounds. If the boolean
932      * argument is <tt>false</tt>, then <i>non-anchoring</i> bounds will be
933      * used.
934      *
935      * <p> Using anchoring bounds, the boundaries of this
936      * matcher's region match anchors such as ^ and $.
937      *
938      * <p> Without anchoring bounds, the boundaries of this
939      * matcher's region will not match anchors such as ^ and $.
940      *
941      * <p> By default, a matcher uses anchoring region boundaries.
942      *
943      * @param  value a boolean indicating whether or not to use anchoring bounds.
944      * @return this matcher
945      * @see java.util.regex.Matcher#hasAnchoringBounds
946      * @since 1.5
947      */
useAnchoringBounds(boolean value)948     public Matcher useAnchoringBounds(boolean value) {
949         synchronized (this) {
950             anchoringBounds = value;
951             useAnchoringBoundsImpl(address, value);
952         }
953         return this;
954     }
955 
956     /**
957      * <p>Returns the string representation of this matcher. The
958      * string representation of a <code>Matcher</code> contains information
959      * that may be useful for debugging. The exact format is unspecified.
960      *
961      * @return  The string representation of this matcher
962      * @since 1.5
963      */
toString()964     public String toString() {
965         StringBuilder sb = new StringBuilder();
966         sb.append("java.util.regex.Matcher");
967         sb.append("[pattern=" + pattern());
968         sb.append(" region=");
969         sb.append(regionStart() + "," + regionEnd());
970         sb.append(" lastmatch=");
971         if (matchFound && (group() != null)) {
972             sb.append(group());
973         }
974         sb.append("]");
975         return sb.toString();
976     }
977 
978     /**
979      * <p>Returns true if the end of input was hit by the search engine in
980      * the last match operation performed by this matcher.
981      *
982      * <p>When this method returns true, then it is possible that more input
983      * would have changed the result of the last search.
984      *
985      * @return  true iff the end of input was hit in the last match; false
986      *          otherwise
987      * @since 1.5
988      */
hitEnd()989     public boolean hitEnd() {
990         synchronized (this) {
991             return hitEndImpl(address);
992         }
993     }
994 
995 
996     /**
997      * <p>Returns true if more input could change a positive match into a
998      * negative one.
999      *
1000      * <p>If this method returns true, and a match was found, then more
1001      * input could cause the match to be lost. If this method returns false
1002      * and a match was found, then more input might change the match but the
1003      * match won't be lost. If a match was not found, then requireEnd has no
1004      * meaning.
1005      *
1006      * @return  true iff more input could change a positive match into a
1007      *          negative one.
1008      * @since 1.5
1009      */
requireEnd()1010     public boolean requireEnd() {
1011         synchronized (this) {
1012             return requireEndImpl(address);
1013         }
1014     }
1015 
1016     /**
1017      * Resets this matcher.
1018      *
1019      * <p> Resetting a matcher discards all of its explicit state information
1020      * and sets its append position to zero. The matcher's region is set to the
1021      * default region, which is its entire character sequence. The anchoring
1022      * and transparency of this matcher's region boundaries are unaffected.
1023      *
1024      * @return  This matcher
1025      */
reset()1026     public Matcher reset() {
1027         return reset(input, 0, input.length());
1028     }
1029 
1030     /**
1031      * Resets this matcher with a new input sequence.
1032      *
1033      * <p> Resetting a matcher discards all of its explicit state information
1034      * and sets its append position to zero.  The matcher's region is set to
1035      * the default region, which is its entire character sequence.  The
1036      * anchoring and transparency of this matcher's region boundaries are
1037      * unaffected.
1038      *
1039      * @param  input
1040      *         The new input character sequence
1041      *
1042      * @return  This matcher
1043      */
reset(CharSequence input)1044     public Matcher reset(CharSequence input) {
1045         return reset(input, 0, input.length());
1046     }
1047 
1048     /**
1049      * Resets the Matcher. A new input sequence and a new region can be
1050      * specified. Results of a previous find get lost. The next attempt to find
1051      * an occurrence of the Pattern in the string will start at the beginning of
1052      * the region. This is the internal version of reset() to which the several
1053      * public versions delegate.
1054      *
1055      * @param input
1056      *            the input sequence.
1057      * @param start
1058      *            the start of the region.
1059      * @param end
1060      *            the end of the region.
1061      *
1062      * @return the matcher itself.
1063      */
reset(CharSequence input, int start, int end)1064     private Matcher reset(CharSequence input, int start, int end) {
1065         if (input == null) {
1066             throw new IllegalArgumentException("input == null");
1067         }
1068 
1069         if (start < 0 || end < 0 || start > input.length() || end > input.length() || start > end) {
1070             throw new IndexOutOfBoundsException();
1071         }
1072 
1073         this.input = input.toString();
1074         this.regionStart = start;
1075         this.regionEnd = end;
1076         resetForInput();
1077 
1078         matchFound = false;
1079         appendPos = 0;
1080 
1081         return this;
1082     }
1083 
resetForInput()1084     private void resetForInput() {
1085         synchronized (this) {
1086             setInputImpl(address, input, regionStart, regionEnd);
1087             useAnchoringBoundsImpl(address, anchoringBounds);
1088             useTransparentBoundsImpl(address, transparentBounds);
1089         }
1090     }
1091 
1092     /**
1093      * Makes sure that a successful match has been made. Is invoked internally
1094      * from various places in the class.
1095      *
1096      * @throws IllegalStateException
1097      *             if no successful match has been made.
1098      */
ensureMatch()1099     private void ensureMatch() {
1100         if (!matchFound) {
1101             throw new IllegalStateException("No successful match so far");
1102         }
1103     }
1104 
1105     /**
     * Returns the start index of the previous match.
1107      *
1108      * @return  The index of the first character matched
1109      *
1110      * @throws  IllegalStateException
1111      *          If no match has yet been attempted,
1112      *          or if the previous match operation failed
1113      */
start()1114     public int start() {
1115         return start(0);
1116     }
1117 
1118     /**
1119      * Returns the start index of the subsequence captured by the given group
1120      * during the previous match operation.
1121      *
1122      * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
1123      * to right, starting at one.  Group zero denotes the entire pattern, so
1124      * the expression <i>m.</i><tt>start(0)</tt> is equivalent to
1125      * <i>m.</i><tt>start()</tt>.  </p>
1126      *
1127      * @param  group
1128      *         The index of a capturing group in this matcher's pattern
1129      *
1130      * @return  The index of the first character captured by the group,
1131      *          or <tt>-1</tt> if the match was successful but the group
1132      *          itself did not match anything
1133      *
1134      * @throws  IllegalStateException
1135      *          If no match has yet been attempted,
1136      *          or if the previous match operation failed
1137      *
1138      * @throws  IndexOutOfBoundsException
1139      *          If there is no capturing group in the pattern
1140      *          with the given index
1141      */
start(int group)1142     public int start(int group) throws IllegalStateException {
1143         ensureMatch();
1144         return matchOffsets[group * 2];
1145     }
1146 
1147 
1148     /**
1149      * Returns the start index of the subsequence captured by the given
1150      * <a href="Pattern.html#groupname">named-capturing group</a> during the
1151      * previous match operation.
1152      *
1153      * @param  name
1154      *         The name of a named-capturing group in this matcher's pattern
1155      *
1156      * @return  The index of the first character captured by the group,
1157      *          or {@code -1} if the match was successful but the group
1158      *          itself did not match anything
1159      *
1160      * @throws  IllegalStateException
1161      *          If no match has yet been attempted,
1162      *          or if the previous match operation failed
1163      *
1164      * @throws  IllegalArgumentException
1165      *          If there is no capturing group in the pattern
1166      *          with the given name
1167      * @since 1.8
1168      */
start(String name)1169     public int start(String name) {
1170         ensureMatch();
1171         return matchOffsets[getMatchedGroupIndex(pattern.address, name) * 2];
1172     }
1173 
getMatchedGroupIndex(long patternAddr, String name)1174     private static int getMatchedGroupIndex(long patternAddr, String name) {
1175         int result = getMatchedGroupIndex0(patternAddr, name);
1176         if (result < 0) {
1177             throw new IllegalArgumentException("No capturing group in the pattern " +
1178                                                "with the name " + name);
1179         }
1180         return result;
1181     }
1182 
getMatchedGroupIndex0(long patternAddr, String name)1183     private static native int getMatchedGroupIndex0(long patternAddr, String name);
findImpl(long addr, int startIndex, int[] offsets)1184     private static native boolean findImpl(long addr, int startIndex, int[] offsets);
findNextImpl(long addr, int[] offsets)1185     private static native boolean findNextImpl(long addr, int[] offsets);
getNativeFinalizer()1186     private static native long getNativeFinalizer();
groupCountImpl(long addr)1187     private static native int groupCountImpl(long addr);
hitEndImpl(long addr)1188     private static native boolean hitEndImpl(long addr);
lookingAtImpl(long addr, int[] offsets)1189     private static native boolean lookingAtImpl(long addr, int[] offsets);
matchesImpl(long addr, int[] offsets)1190     private static native boolean matchesImpl(long addr, int[] offsets);
nativeSize()1191     private static native int nativeSize();
openImpl(long patternAddr)1192     private static native long openImpl(long patternAddr);
requireEndImpl(long addr)1193     private static native boolean requireEndImpl(long addr);
setInputImpl(long addr, String s, int start, int end)1194     private static native void setInputImpl(long addr, String s, int start, int end);
useAnchoringBoundsImpl(long addr, boolean value)1195     private static native void useAnchoringBoundsImpl(long addr, boolean value);
useTransparentBoundsImpl(long addr, boolean value)1196     private static native void useTransparentBoundsImpl(long addr, boolean value);
1197 
1198     /**
1199      * A trivial match result implementation that's based on an array of integers
     * representing match offsets. The array is of the form
     * {@code { start0, end0, start1, end1, ... }}, where each consecutive pair of elements
     * holds the start and end offset of one group; pair 0 belongs to the whole match.
1203      */
1204     static final class OffsetBasedMatchResult implements MatchResult {
1205         private final String input;
1206         private final int[] offsets;
1207 
OffsetBasedMatchResult(String input, int[] offsets)1208         OffsetBasedMatchResult(String input, int[] offsets) {
1209             this.input = input;
1210             this.offsets = offsets.clone();
1211         }
1212 
1213         @Override
start()1214         public int start() {
1215             return start(0);
1216         }
1217 
1218         @Override
start(int group)1219         public int start(int group) {
1220             return offsets[2 * group];
1221         }
1222 
1223         @Override
end()1224         public int end() {
1225             return end(0);
1226         }
1227 
1228         @Override
end(int group)1229         public int end(int group) {
1230             return offsets[2 * group + 1];
1231         }
1232 
1233         @Override
group()1234         public String group() {
1235             return group(0);
1236         }
1237 
1238         @Override
group(int group)1239         public String group(int group) {
1240             final int start = start(group);
1241             final int end = end(group);
1242             if (start == -1 || end == -1) {
1243                 return null;
1244             }
1245 
1246             return input.substring(start, end);
1247         }
1248 
1249         @Override
groupCount()1250         public int groupCount() {
1251             return (offsets.length / 2) - 1;
1252         }
1253     }
1254 }
1255