1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package android.text.method;
18 
19 import android.annotation.NonNull;
20 import android.compat.annotation.UnsupportedAppUsage;
21 import android.icu.lang.UCharacter;
22 import android.icu.lang.UProperty;
23 import android.icu.text.BreakIterator;
24 import android.icu.util.ULocale;
25 import android.os.Build;
26 import android.text.CharSequenceCharacterIterator;
27 import android.text.Selection;
28 import android.text.TextUtils;
29 
30 import java.util.Locale;
31 
32 /**
33  * Walks through cursor positions at word boundaries. Internally uses
34  * {@link BreakIterator#getWordInstance()}, and caches {@link CharSequence}
35  * for performance reasons.
36  *
37  * Also provides methods to determine word boundaries.
38  * {@hide}
39  */
40 public class WordIterator implements Selection.PositionIterator {
41     // Size of the window for the word iterator, should be greater than the longest word's length
42     private static final int WINDOW_WIDTH = 50;
43 
44     private int mStart, mEnd;
45     private CharSequence mCharSeq;
46     private final BreakIterator mIterator;
47 
48     /**
49      * Constructs a WordIterator using the default locale.
50      */
WordIterator()51     public WordIterator() {
52         this(Locale.getDefault());
53     }
54 
55     /**
56      * Constructs a new WordIterator for the specified locale.
57      * @param locale The locale to be used for analyzing the text.
58      */
59     @UnsupportedAppUsage
WordIterator(Locale locale)60     public WordIterator(Locale locale) {
61         mIterator = BreakIterator.getWordInstance(locale);
62     }
63 
64     /**
65      * Constructs a new WordIterator for the specified locale.
66      * @param locale The locale to be used for analyzing the text.
67      */
WordIterator(ULocale locale)68     public WordIterator(ULocale locale) {
69         mIterator = BreakIterator.getWordInstance(locale);
70     }
71 
72     @UnsupportedAppUsage
setCharSequence(@onNull CharSequence charSequence, int start, int end)73     public void setCharSequence(@NonNull CharSequence charSequence, int start, int end) {
74         if (0 <= start && end <= charSequence.length()) {
75             mCharSeq = charSequence;
76             mStart = Math.max(0, start - WINDOW_WIDTH);
77             mEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH);
78             mIterator.setText(new CharSequenceCharacterIterator(charSequence, mStart, mEnd));
79         } else {
80             throw new IndexOutOfBoundsException("input indexes are outside the CharSequence");
81         }
82     }
83 
84     /** {@inheritDoc} */
85     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
preceding(int offset)86     public int preceding(int offset) {
87         checkOffsetIsValid(offset);
88         while (true) {
89             offset = mIterator.preceding(offset);
90             if (offset == BreakIterator.DONE || isOnLetterOrDigit(offset)) {
91                 return offset;
92             }
93         }
94     }
95 
96     /** {@inheritDoc} */
97     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
following(int offset)98     public int following(int offset) {
99         checkOffsetIsValid(offset);
100         while (true) {
101             offset = mIterator.following(offset);
102             if (offset == BreakIterator.DONE || isAfterLetterOrDigit(offset)) {
103                 return offset;
104             }
105         }
106     }
107 
108     /** {@inheritDoc} */
109     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
isBoundary(int offset)110     public boolean isBoundary(int offset) {
111         checkOffsetIsValid(offset);
112         return mIterator.isBoundary(offset);
113     }
114 
115     /**
116      * Returns the position of next boundary after the given offset. Returns
117      * {@code DONE} if there is no boundary after the given offset.
118      *
119      * @param offset the given start position to search from.
120      * @return the position of the last boundary preceding the given offset.
121      */
122     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
nextBoundary(int offset)123     public int nextBoundary(int offset) {
124         checkOffsetIsValid(offset);
125         return mIterator.following(offset);
126     }
127 
128     /**
129      * Returns the position of boundary preceding the given offset or
130      * {@code DONE} if the given offset specifies the starting position.
131      *
132      * @param offset the given start position to search from.
133      * @return the position of the last boundary preceding the given offset.
134      */
135     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
prevBoundary(int offset)136     public int prevBoundary(int offset) {
137         checkOffsetIsValid(offset);
138         return mIterator.preceding(offset);
139     }
140 
141     /** If <code>offset</code> is within a word, returns the index of the first character of that
142      * word, otherwise returns BreakIterator.DONE.
143      *
144      * The offsets that are considered to be part of a word are the indexes of its characters,
145      * <i>as well as</i> the index of its last character plus one.
146      * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
147      *
148      * Valid range for offset is [0..textLength] (note the inclusive upper bound).
149      * The returned value is within [0..offset] or BreakIterator.DONE.
150      *
151      * @throws IllegalArgumentException is offset is not valid.
152      */
153     @UnsupportedAppUsage
getBeginning(int offset)154     public int getBeginning(int offset) {
155         // TODO: Check if usage of this can be updated to getBeginning(offset, true) if
156         // so this method can be removed.
157         return getBeginning(offset, false);
158     }
159 
160     /**
161      * If <code>offset</code> is within a word, returns the index of the last character of that
162      * word plus one, otherwise returns BreakIterator.DONE.
163      *
164      * The offsets that are considered to be part of a word are the indexes of its characters,
165      * <i>as well as</i> the index of its last character plus one.
166      * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
167      *
168      * Valid range for offset is [0..textLength] (note the inclusive upper bound).
169      * The returned value is within [offset..textLength] or BreakIterator.DONE.
170      *
171      * @throws IllegalArgumentException is offset is not valid.
172      */
173     @UnsupportedAppUsage
getEnd(int offset)174     public int getEnd(int offset) {
175         // TODO: Check if usage of this can be updated to getEnd(offset, true), if
176         // so this method can be removed.
177         return getEnd(offset, false);
178     }
179 
180     /**
181      * If the <code>offset</code> is within a word or on a word boundary that can only be
182      * considered the start of a word (e.g. _word where "_" is any character that would not
183      * be considered part of the word) then this returns the index of the first character of
184      * that word.
185      *
186      * If the offset is on a word boundary that can be considered the start and end of a
187      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
188      * between AA and BB, this would return the start of the previous word, AA.
189      *
190      * Returns BreakIterator.DONE if there is no previous boundary.
191      *
192      * @throws IllegalArgumentException is offset is not valid.
193      */
194     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
getPrevWordBeginningOnTwoWordsBoundary(int offset)195     public int getPrevWordBeginningOnTwoWordsBoundary(int offset) {
196         return getBeginning(offset, true);
197     }
198 
199     /**
200      * If the <code>offset</code> is within a word or on a word boundary that can only be
201      * considered the end of a word (e.g. word_ where "_" is any character that would not
202      * be considered part of the word) then this returns the index of the last character
203      * plus one of that word.
204      *
205      * If the offset is on a word boundary that can be considered the start and end of a
206      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
207      * between AA and BB, this would return the end of the next word, BB.
208      *
209      * Returns BreakIterator.DONE if there is no next boundary.
210      *
211      * @throws IllegalArgumentException is offset is not valid.
212      */
213     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
getNextWordEndOnTwoWordBoundary(int offset)214     public int getNextWordEndOnTwoWordBoundary(int offset) {
215         return getEnd(offset, true);
216     }
217 
218     /**
219      * If the <code>offset</code> is within a word or on a word boundary that can only be
220      * considered the start of a word (e.g. _word where "_" is any character that would not
221      * be considered part of the word) then this returns the index of the first character of
222      * that word.
223      *
224      * If the offset is on a word boundary that can be considered the start and end of a
225      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
226      * between AA and BB, and getPrevWordBeginningOnTwoWordsBoundary is true then this would
227      * return the start of the previous word, AA. Otherwise it would return the current offset,
228      * the start of BB.
229      *
230      * Returns BreakIterator.DONE if there is no previous boundary.
231      *
232      * @throws IllegalArgumentException is offset is not valid.
233      */
getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary)234     private int getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary) {
235         checkOffsetIsValid(offset);
236 
237         if (isOnLetterOrDigit(offset)) {
238             if (mIterator.isBoundary(offset)
239                     && (!isAfterLetterOrDigit(offset)
240                             || !getPrevWordBeginningOnTwoWordsBoundary)) {
241                 return offset;
242             } else {
243                 return mIterator.preceding(offset);
244             }
245         } else {
246             if (isAfterLetterOrDigit(offset)) {
247                 return mIterator.preceding(offset);
248             }
249         }
250         return BreakIterator.DONE;
251     }
252 
253     /**
254      * If the <code>offset</code> is within a word or on a word boundary that can only be
255      * considered the end of a word (e.g. word_ where "_" is any character that would not be
256      * considered part of the word) then this returns the index of the last character plus one
257      * of that word.
258      *
259      * If the offset is on a word boundary that can be considered the start and end of a
260      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
261      * between AA and BB, and getNextWordEndOnTwoWordBoundary is true then this would return
262      * the end of the next word, BB. Otherwise it would return the current offset, the end
263      * of AA.
264      *
265      * Returns BreakIterator.DONE if there is no next boundary.
266      *
267      * @throws IllegalArgumentException is offset is not valid.
268      */
getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary)269     private int getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary) {
270         checkOffsetIsValid(offset);
271 
272         if (isAfterLetterOrDigit(offset)) {
273             if (mIterator.isBoundary(offset)
274                     && (!isOnLetterOrDigit(offset) || !getNextWordEndOnTwoWordBoundary)) {
275                 return offset;
276             } else {
277                 return mIterator.following(offset);
278             }
279         } else {
280             if (isOnLetterOrDigit(offset)) {
281                 return mIterator.following(offset);
282             }
283         }
284         return BreakIterator.DONE;
285     }
286 
287     /**
288      * If <code>offset</code> is within a group of punctuation as defined by {@link
289      * TextUtils#isPunctuation(int)}, returns the index of the first character of that group,
290      * otherwise returns BreakIterator.DONE.
291      *
292      * @param offset the offset to search from.
293      */
294     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
getPunctuationBeginning(int offset)295     public int getPunctuationBeginning(int offset) {
296         checkOffsetIsValid(offset);
297         while (offset != BreakIterator.DONE && !isPunctuationStartBoundary(offset)) {
298             offset = prevBoundary(offset);
299         }
300         // No need to shift offset, prevBoundary handles that.
301         return offset;
302     }
303 
304     /**
305      * If <code>offset</code> is within a group of punctuation as defined by {@link
306      * TextUtils#isPunctuation(int)}, returns the index of the last character of that group plus
307      * one, otherwise returns BreakIterator.DONE.
308      *
309      * @param offset the offset to search from.
310      */
311     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
getPunctuationEnd(int offset)312     public int getPunctuationEnd(int offset) {
313         checkOffsetIsValid(offset);
314         while (offset != BreakIterator.DONE && !isPunctuationEndBoundary(offset)) {
315             offset = nextBoundary(offset);
316         }
317         // No need to shift offset, nextBoundary handles that.
318         return offset;
319     }
320 
321     /**
322      * Indicates if the provided offset is after a punctuation character as defined by {@link
323      * TextUtils#isPunctuation(int)}.
324      *
325      * @param offset the offset to check from.
326      * @return Whether the offset is after a punctuation character.
327      */
328     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
isAfterPunctuation(int offset)329     public boolean isAfterPunctuation(int offset) {
330         if (mStart < offset && offset <= mEnd) {
331             final int codePoint = Character.codePointBefore(mCharSeq, offset);
332             return TextUtils.isPunctuation(codePoint);
333         }
334         return false;
335     }
336 
337     /**
338      * Indicates if the provided offset is at a punctuation character as defined by {@link
339      * TextUtils#isPunctuation(int)}.
340      *
341      * @param offset the offset to check from.
342      * @return Whether the offset is at a punctuation character.
343      */
344     @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553)
isOnPunctuation(int offset)345     public boolean isOnPunctuation(int offset) {
346         if (mStart <= offset && offset < mEnd) {
347             final int codePoint = Character.codePointAt(mCharSeq, offset);
348             return TextUtils.isPunctuation(codePoint);
349         }
350         return false;
351     }
352 
353     /**
354      * Indicates if the codepoint is a mid-word-only punctuation.
355      *
356      * At the moment, this is locale-independent, and includes all the characters in
357      * the MidLetter, MidNumLet, and Single_Quote class of Unicode word breaking algorithm (see
358      * UAX #29 "Unicode Text Segmentation" at http://unicode.org/reports/tr29/). These are all the
359      * characters that according to the rules WB6 and WB7 of UAX #29 prevent word breaks if they are
360      * in the middle of a word, but they become word breaks if they happen at the end of a word
361      * (accroding to rule WB999 that breaks word in any place that is not prohibited otherwise).
362      *
363      * @param locale the locale to consider the codepoint in. Presently ignored.
364      * @param codePoint the codepoint to check.
365      * @return True if the codepoint is a mid-word punctuation.
366      */
isMidWordPunctuation(Locale locale, int codePoint)367     public static boolean isMidWordPunctuation(Locale locale, int codePoint) {
368         final int wb = UCharacter.getIntPropertyValue(codePoint, UProperty.WORD_BREAK);
369         return (wb == UCharacter.WordBreak.MIDLETTER
370                 || wb == UCharacter.WordBreak.MIDNUMLET
371                 || wb == UCharacter.WordBreak.SINGLE_QUOTE);
372     }
373 
isPunctuationStartBoundary(int offset)374     private boolean isPunctuationStartBoundary(int offset) {
375         return isOnPunctuation(offset) && !isAfterPunctuation(offset);
376     }
377 
isPunctuationEndBoundary(int offset)378     private boolean isPunctuationEndBoundary(int offset) {
379         return !isOnPunctuation(offset) && isAfterPunctuation(offset);
380     }
381 
isAfterLetterOrDigit(int offset)382     private boolean isAfterLetterOrDigit(int offset) {
383         if (mStart < offset && offset <= mEnd) {
384             final int codePoint = Character.codePointBefore(mCharSeq, offset);
385             if (Character.isLetterOrDigit(codePoint)) return true;
386         }
387         return false;
388     }
389 
isOnLetterOrDigit(int offset)390     private boolean isOnLetterOrDigit(int offset) {
391         if (mStart <= offset && offset < mEnd) {
392             final int codePoint = Character.codePointAt(mCharSeq, offset);
393             if (Character.isLetterOrDigit(codePoint)) return true;
394         }
395         return false;
396     }
397 
checkOffsetIsValid(int offset)398     private void checkOffsetIsValid(int offset) {
399         if (!(mStart <= offset && offset <= mEnd)) {
400             throw new IllegalArgumentException("Invalid offset: " + (offset) +
401                     ". Valid range is [" + mStart + ", " + mEnd + "]");
402         }
403     }
404 }
405