1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package android.text.method;
18 
19 import android.annotation.NonNull;
20 import android.annotation.UnsupportedAppUsage;
21 import android.icu.lang.UCharacter;
22 import android.icu.lang.UProperty;
23 import android.icu.text.BreakIterator;
24 import android.text.CharSequenceCharacterIterator;
25 import android.text.Selection;
26 
27 import java.util.Locale;
28 
29 /**
30  * Walks through cursor positions at word boundaries. Internally uses
31  * {@link BreakIterator#getWordInstance()}, and caches {@link CharSequence}
32  * for performance reasons.
33  *
34  * Also provides methods to determine word boundaries.
35  * {@hide}
36  */
37 public class WordIterator implements Selection.PositionIterator {
38     // Size of the window for the word iterator, should be greater than the longest word's length
39     private static final int WINDOW_WIDTH = 50;
40 
41     private int mStart, mEnd;
42     private CharSequence mCharSeq;
43     private final BreakIterator mIterator;
44 
45     /**
46      * Constructs a WordIterator using the default locale.
47      */
WordIterator()48     public WordIterator() {
49         this(Locale.getDefault());
50     }
51 
52     /**
53      * Constructs a new WordIterator for the specified locale.
54      * @param locale The locale to be used for analyzing the text.
55      */
56     @UnsupportedAppUsage
WordIterator(Locale locale)57     public WordIterator(Locale locale) {
58         mIterator = BreakIterator.getWordInstance(locale);
59     }
60 
61     @UnsupportedAppUsage
setCharSequence(@onNull CharSequence charSequence, int start, int end)62     public void setCharSequence(@NonNull CharSequence charSequence, int start, int end) {
63         if (0 <= start && end <= charSequence.length()) {
64             mCharSeq = charSequence;
65             mStart = Math.max(0, start - WINDOW_WIDTH);
66             mEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH);
67             mIterator.setText(new CharSequenceCharacterIterator(charSequence, mStart, mEnd));
68         } else {
69             throw new IndexOutOfBoundsException("input indexes are outside the CharSequence");
70         }
71     }
72 
73     /** {@inheritDoc} */
74     @UnsupportedAppUsage
preceding(int offset)75     public int preceding(int offset) {
76         checkOffsetIsValid(offset);
77         while (true) {
78             offset = mIterator.preceding(offset);
79             if (offset == BreakIterator.DONE || isOnLetterOrDigit(offset)) {
80                 return offset;
81             }
82         }
83     }
84 
85     /** {@inheritDoc} */
86     @UnsupportedAppUsage
following(int offset)87     public int following(int offset) {
88         checkOffsetIsValid(offset);
89         while (true) {
90             offset = mIterator.following(offset);
91             if (offset == BreakIterator.DONE || isAfterLetterOrDigit(offset)) {
92                 return offset;
93             }
94         }
95     }
96 
97     /** {@inheritDoc} */
98     @UnsupportedAppUsage
isBoundary(int offset)99     public boolean isBoundary(int offset) {
100         checkOffsetIsValid(offset);
101         return mIterator.isBoundary(offset);
102     }
103 
104     /**
105      * Returns the position of next boundary after the given offset. Returns
106      * {@code DONE} if there is no boundary after the given offset.
107      *
108      * @param offset the given start position to search from.
109      * @return the position of the last boundary preceding the given offset.
110      */
111     @UnsupportedAppUsage
nextBoundary(int offset)112     public int nextBoundary(int offset) {
113         checkOffsetIsValid(offset);
114         return mIterator.following(offset);
115     }
116 
117     /**
118      * Returns the position of boundary preceding the given offset or
119      * {@code DONE} if the given offset specifies the starting position.
120      *
121      * @param offset the given start position to search from.
122      * @return the position of the last boundary preceding the given offset.
123      */
124     @UnsupportedAppUsage
prevBoundary(int offset)125     public int prevBoundary(int offset) {
126         checkOffsetIsValid(offset);
127         return mIterator.preceding(offset);
128     }
129 
130     /** If <code>offset</code> is within a word, returns the index of the first character of that
131      * word, otherwise returns BreakIterator.DONE.
132      *
133      * The offsets that are considered to be part of a word are the indexes of its characters,
134      * <i>as well as</i> the index of its last character plus one.
135      * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
136      *
137      * Valid range for offset is [0..textLength] (note the inclusive upper bound).
138      * The returned value is within [0..offset] or BreakIterator.DONE.
139      *
140      * @throws IllegalArgumentException is offset is not valid.
141      */
142     @UnsupportedAppUsage
getBeginning(int offset)143     public int getBeginning(int offset) {
144         // TODO: Check if usage of this can be updated to getBeginning(offset, true) if
145         // so this method can be removed.
146         return getBeginning(offset, false);
147     }
148 
149     /**
150      * If <code>offset</code> is within a word, returns the index of the last character of that
151      * word plus one, otherwise returns BreakIterator.DONE.
152      *
153      * The offsets that are considered to be part of a word are the indexes of its characters,
154      * <i>as well as</i> the index of its last character plus one.
155      * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
156      *
157      * Valid range for offset is [0..textLength] (note the inclusive upper bound).
158      * The returned value is within [offset..textLength] or BreakIterator.DONE.
159      *
160      * @throws IllegalArgumentException is offset is not valid.
161      */
162     @UnsupportedAppUsage
getEnd(int offset)163     public int getEnd(int offset) {
164         // TODO: Check if usage of this can be updated to getEnd(offset, true), if
165         // so this method can be removed.
166         return getEnd(offset, false);
167     }
168 
169     /**
170      * If the <code>offset</code> is within a word or on a word boundary that can only be
171      * considered the start of a word (e.g. _word where "_" is any character that would not
172      * be considered part of the word) then this returns the index of the first character of
173      * that word.
174      *
175      * If the offset is on a word boundary that can be considered the start and end of a
176      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
177      * between AA and BB, this would return the start of the previous word, AA.
178      *
179      * Returns BreakIterator.DONE if there is no previous boundary.
180      *
181      * @throws IllegalArgumentException is offset is not valid.
182      */
183     @UnsupportedAppUsage
getPrevWordBeginningOnTwoWordsBoundary(int offset)184     public int getPrevWordBeginningOnTwoWordsBoundary(int offset) {
185         return getBeginning(offset, true);
186     }
187 
188     /**
189      * If the <code>offset</code> is within a word or on a word boundary that can only be
190      * considered the end of a word (e.g. word_ where "_" is any character that would not
191      * be considered part of the word) then this returns the index of the last character
192      * plus one of that word.
193      *
194      * If the offset is on a word boundary that can be considered the start and end of a
195      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
196      * between AA and BB, this would return the end of the next word, BB.
197      *
198      * Returns BreakIterator.DONE if there is no next boundary.
199      *
200      * @throws IllegalArgumentException is offset is not valid.
201      */
202     @UnsupportedAppUsage
getNextWordEndOnTwoWordBoundary(int offset)203     public int getNextWordEndOnTwoWordBoundary(int offset) {
204         return getEnd(offset, true);
205     }
206 
207     /**
208      * If the <code>offset</code> is within a word or on a word boundary that can only be
209      * considered the start of a word (e.g. _word where "_" is any character that would not
210      * be considered part of the word) then this returns the index of the first character of
211      * that word.
212      *
213      * If the offset is on a word boundary that can be considered the start and end of a
214      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
215      * between AA and BB, and getPrevWordBeginningOnTwoWordsBoundary is true then this would
216      * return the start of the previous word, AA. Otherwise it would return the current offset,
217      * the start of BB.
218      *
219      * Returns BreakIterator.DONE if there is no previous boundary.
220      *
221      * @throws IllegalArgumentException is offset is not valid.
222      */
getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary)223     private int getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary) {
224         checkOffsetIsValid(offset);
225 
226         if (isOnLetterOrDigit(offset)) {
227             if (mIterator.isBoundary(offset)
228                     && (!isAfterLetterOrDigit(offset)
229                             || !getPrevWordBeginningOnTwoWordsBoundary)) {
230                 return offset;
231             } else {
232                 return mIterator.preceding(offset);
233             }
234         } else {
235             if (isAfterLetterOrDigit(offset)) {
236                 return mIterator.preceding(offset);
237             }
238         }
239         return BreakIterator.DONE;
240     }
241 
242     /**
243      * If the <code>offset</code> is within a word or on a word boundary that can only be
244      * considered the end of a word (e.g. word_ where "_" is any character that would not be
245      * considered part of the word) then this returns the index of the last character plus one
246      * of that word.
247      *
248      * If the offset is on a word boundary that can be considered the start and end of a
249      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
250      * between AA and BB, and getNextWordEndOnTwoWordBoundary is true then this would return
251      * the end of the next word, BB. Otherwise it would return the current offset, the end
252      * of AA.
253      *
254      * Returns BreakIterator.DONE if there is no next boundary.
255      *
256      * @throws IllegalArgumentException is offset is not valid.
257      */
getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary)258     private int getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary) {
259         checkOffsetIsValid(offset);
260 
261         if (isAfterLetterOrDigit(offset)) {
262             if (mIterator.isBoundary(offset)
263                     && (!isOnLetterOrDigit(offset) || !getNextWordEndOnTwoWordBoundary)) {
264                 return offset;
265             } else {
266                 return mIterator.following(offset);
267             }
268         } else {
269             if (isOnLetterOrDigit(offset)) {
270                 return mIterator.following(offset);
271             }
272         }
273         return BreakIterator.DONE;
274     }
275 
276     /**
277      * If <code>offset</code> is within a group of punctuation as defined
278      * by {@link #isPunctuation(int)}, returns the index of the first character
279      * of that group, otherwise returns BreakIterator.DONE.
280      *
281      * @param offset the offset to search from.
282      */
283     @UnsupportedAppUsage
getPunctuationBeginning(int offset)284     public int getPunctuationBeginning(int offset) {
285         checkOffsetIsValid(offset);
286         while (offset != BreakIterator.DONE && !isPunctuationStartBoundary(offset)) {
287             offset = prevBoundary(offset);
288         }
289         // No need to shift offset, prevBoundary handles that.
290         return offset;
291     }
292 
293     /**
294      * If <code>offset</code> is within a group of punctuation as defined
295      * by {@link #isPunctuation(int)}, returns the index of the last character
296      * of that group plus one, otherwise returns BreakIterator.DONE.
297      *
298      * @param offset the offset to search from.
299      */
300     @UnsupportedAppUsage
getPunctuationEnd(int offset)301     public int getPunctuationEnd(int offset) {
302         checkOffsetIsValid(offset);
303         while (offset != BreakIterator.DONE && !isPunctuationEndBoundary(offset)) {
304             offset = nextBoundary(offset);
305         }
306         // No need to shift offset, nextBoundary handles that.
307         return offset;
308     }
309 
310     /**
311      * Indicates if the provided offset is after a punctuation character
312      * as defined by {@link #isPunctuation(int)}.
313      *
314      * @param offset the offset to check from.
315      * @return Whether the offset is after a punctuation character.
316      */
317     @UnsupportedAppUsage
isAfterPunctuation(int offset)318     public boolean isAfterPunctuation(int offset) {
319         if (mStart < offset && offset <= mEnd) {
320             final int codePoint = Character.codePointBefore(mCharSeq, offset);
321             return isPunctuation(codePoint);
322         }
323         return false;
324     }
325 
326     /**
327      * Indicates if the provided offset is at a punctuation character
328      * as defined by {@link #isPunctuation(int)}.
329      *
330      * @param offset the offset to check from.
331      * @return Whether the offset is at a punctuation character.
332      */
333     @UnsupportedAppUsage
isOnPunctuation(int offset)334     public boolean isOnPunctuation(int offset) {
335         if (mStart <= offset && offset < mEnd) {
336             final int codePoint = Character.codePointAt(mCharSeq, offset);
337             return isPunctuation(codePoint);
338         }
339         return false;
340     }
341 
342     /**
343      * Indicates if the codepoint is a mid-word-only punctuation.
344      *
345      * At the moment, this is locale-independent, and includes all the characters in
346      * the MidLetter, MidNumLet, and Single_Quote class of Unicode word breaking algorithm (see
347      * UAX #29 "Unicode Text Segmentation" at http://unicode.org/reports/tr29/). These are all the
348      * characters that according to the rules WB6 and WB7 of UAX #29 prevent word breaks if they are
349      * in the middle of a word, but they become word breaks if they happen at the end of a word
350      * (accroding to rule WB999 that breaks word in any place that is not prohibited otherwise).
351      *
352      * @param locale the locale to consider the codepoint in. Presently ignored.
353      * @param codePoint the codepoint to check.
354      * @return True if the codepoint is a mid-word punctuation.
355      */
isMidWordPunctuation(Locale locale, int codePoint)356     public static boolean isMidWordPunctuation(Locale locale, int codePoint) {
357         final int wb = UCharacter.getIntPropertyValue(codePoint, UProperty.WORD_BREAK);
358         return (wb == UCharacter.WordBreak.MIDLETTER
359                 || wb == UCharacter.WordBreak.MIDNUMLET
360                 || wb == UCharacter.WordBreak.SINGLE_QUOTE);
361     }
362 
isPunctuationStartBoundary(int offset)363     private boolean isPunctuationStartBoundary(int offset) {
364         return isOnPunctuation(offset) && !isAfterPunctuation(offset);
365     }
366 
isPunctuationEndBoundary(int offset)367     private boolean isPunctuationEndBoundary(int offset) {
368         return !isOnPunctuation(offset) && isAfterPunctuation(offset);
369     }
370 
isPunctuation(int cp)371     private static boolean isPunctuation(int cp) {
372         final int type = Character.getType(cp);
373         return (type == Character.CONNECTOR_PUNCTUATION
374                 || type == Character.DASH_PUNCTUATION
375                 || type == Character.END_PUNCTUATION
376                 || type == Character.FINAL_QUOTE_PUNCTUATION
377                 || type == Character.INITIAL_QUOTE_PUNCTUATION
378                 || type == Character.OTHER_PUNCTUATION
379                 || type == Character.START_PUNCTUATION);
380     }
381 
isAfterLetterOrDigit(int offset)382     private boolean isAfterLetterOrDigit(int offset) {
383         if (mStart < offset && offset <= mEnd) {
384             final int codePoint = Character.codePointBefore(mCharSeq, offset);
385             if (Character.isLetterOrDigit(codePoint)) return true;
386         }
387         return false;
388     }
389 
isOnLetterOrDigit(int offset)390     private boolean isOnLetterOrDigit(int offset) {
391         if (mStart <= offset && offset < mEnd) {
392             final int codePoint = Character.codePointAt(mCharSeq, offset);
393             if (Character.isLetterOrDigit(codePoint)) return true;
394         }
395         return false;
396     }
397 
checkOffsetIsValid(int offset)398     private void checkOffsetIsValid(int offset) {
399         if (!(mStart <= offset && offset <= mEnd)) {
400             throw new IllegalArgumentException("Invalid offset: " + (offset) +
401                     ". Valid range is [" + mStart + ", " + mEnd + "]");
402         }
403     }
404 }
405