1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package android.text.method;
18 
19 import android.annotation.NonNull;
20 import android.icu.lang.UCharacter;
21 import android.icu.lang.UProperty;
22 import android.icu.text.BreakIterator;
23 import android.text.CharSequenceCharacterIterator;
24 import android.text.Selection;
25 
26 import java.util.Locale;
27 
28 /**
29  * Walks through cursor positions at word boundaries. Internally uses
30  * {@link BreakIterator#getWordInstance()}, and caches {@link CharSequence}
31  * for performance reasons.
32  *
33  * Also provides methods to determine word boundaries.
34  * {@hide}
35  */
36 public class WordIterator implements Selection.PositionIterator {
37     // Size of the window for the word iterator, should be greater than the longest word's length
38     private static final int WINDOW_WIDTH = 50;
39 
40     private int mStart, mEnd;
41     private CharSequence mCharSeq;
42     private final BreakIterator mIterator;
43 
44     /**
45      * Constructs a WordIterator using the default locale.
46      */
WordIterator()47     public WordIterator() {
48         this(Locale.getDefault());
49     }
50 
51     /**
52      * Constructs a new WordIterator for the specified locale.
53      * @param locale The locale to be used for analyzing the text.
54      */
WordIterator(Locale locale)55     public WordIterator(Locale locale) {
56         mIterator = BreakIterator.getWordInstance(locale);
57     }
58 
setCharSequence(@onNull CharSequence charSequence, int start, int end)59     public void setCharSequence(@NonNull CharSequence charSequence, int start, int end) {
60         if (0 <= start && end <= charSequence.length()) {
61             mCharSeq = charSequence;
62             mStart = Math.max(0, start - WINDOW_WIDTH);
63             mEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH);
64             mIterator.setText(new CharSequenceCharacterIterator(charSequence, mStart, mEnd));
65         } else {
66             throw new IndexOutOfBoundsException("input indexes are outside the CharSequence");
67         }
68     }
69 
70     /** {@inheritDoc} */
preceding(int offset)71     public int preceding(int offset) {
72         checkOffsetIsValid(offset);
73         while (true) {
74             offset = mIterator.preceding(offset);
75             if (offset == BreakIterator.DONE || isOnLetterOrDigit(offset)) {
76                 return offset;
77             }
78         }
79     }
80 
81     /** {@inheritDoc} */
following(int offset)82     public int following(int offset) {
83         checkOffsetIsValid(offset);
84         while (true) {
85             offset = mIterator.following(offset);
86             if (offset == BreakIterator.DONE || isAfterLetterOrDigit(offset)) {
87                 return offset;
88             }
89         }
90     }
91 
92     /** {@inheritDoc} */
isBoundary(int offset)93     public boolean isBoundary(int offset) {
94         checkOffsetIsValid(offset);
95         return mIterator.isBoundary(offset);
96     }
97 
98     /**
99      * Returns the position of next boundary after the given offset. Returns
100      * {@code DONE} if there is no boundary after the given offset.
101      *
102      * @param offset the given start position to search from.
103      * @return the position of the last boundary preceding the given offset.
104      */
nextBoundary(int offset)105     public int nextBoundary(int offset) {
106         checkOffsetIsValid(offset);
107         return mIterator.following(offset);
108     }
109 
110     /**
111      * Returns the position of boundary preceding the given offset or
112      * {@code DONE} if the given offset specifies the starting position.
113      *
114      * @param offset the given start position to search from.
115      * @return the position of the last boundary preceding the given offset.
116      */
prevBoundary(int offset)117     public int prevBoundary(int offset) {
118         checkOffsetIsValid(offset);
119         return mIterator.preceding(offset);
120     }
121 
122     /** If <code>offset</code> is within a word, returns the index of the first character of that
123      * word, otherwise returns BreakIterator.DONE.
124      *
125      * The offsets that are considered to be part of a word are the indexes of its characters,
126      * <i>as well as</i> the index of its last character plus one.
127      * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
128      *
129      * Valid range for offset is [0..textLength] (note the inclusive upper bound).
130      * The returned value is within [0..offset] or BreakIterator.DONE.
131      *
132      * @throws IllegalArgumentException is offset is not valid.
133      */
getBeginning(int offset)134     public int getBeginning(int offset) {
135         // TODO: Check if usage of this can be updated to getBeginning(offset, true) if
136         // so this method can be removed.
137         return getBeginning(offset, false);
138     }
139 
140     /**
141      * If <code>offset</code> is within a word, returns the index of the last character of that
142      * word plus one, otherwise returns BreakIterator.DONE.
143      *
144      * The offsets that are considered to be part of a word are the indexes of its characters,
145      * <i>as well as</i> the index of its last character plus one.
146      * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
147      *
148      * Valid range for offset is [0..textLength] (note the inclusive upper bound).
149      * The returned value is within [offset..textLength] or BreakIterator.DONE.
150      *
151      * @throws IllegalArgumentException is offset is not valid.
152      */
getEnd(int offset)153     public int getEnd(int offset) {
154         // TODO: Check if usage of this can be updated to getEnd(offset, true), if
155         // so this method can be removed.
156         return getEnd(offset, false);
157     }
158 
159     /**
160      * If the <code>offset</code> is within a word or on a word boundary that can only be
161      * considered the start of a word (e.g. _word where "_" is any character that would not
162      * be considered part of the word) then this returns the index of the first character of
163      * that word.
164      *
165      * If the offset is on a word boundary that can be considered the start and end of a
166      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
167      * between AA and BB, this would return the start of the previous word, AA.
168      *
169      * Returns BreakIterator.DONE if there is no previous boundary.
170      *
171      * @throws IllegalArgumentException is offset is not valid.
172      */
getPrevWordBeginningOnTwoWordsBoundary(int offset)173     public int getPrevWordBeginningOnTwoWordsBoundary(int offset) {
174         return getBeginning(offset, true);
175     }
176 
177     /**
178      * If the <code>offset</code> is within a word or on a word boundary that can only be
179      * considered the end of a word (e.g. word_ where "_" is any character that would not
180      * be considered part of the word) then this returns the index of the last character
181      * plus one of that word.
182      *
183      * If the offset is on a word boundary that can be considered the start and end of a
184      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
185      * between AA and BB, this would return the end of the next word, BB.
186      *
187      * Returns BreakIterator.DONE if there is no next boundary.
188      *
189      * @throws IllegalArgumentException is offset is not valid.
190      */
getNextWordEndOnTwoWordBoundary(int offset)191     public int getNextWordEndOnTwoWordBoundary(int offset) {
192         return getEnd(offset, true);
193     }
194 
195     /**
196      * If the <code>offset</code> is within a word or on a word boundary that can only be
197      * considered the start of a word (e.g. _word where "_" is any character that would not
198      * be considered part of the word) then this returns the index of the first character of
199      * that word.
200      *
201      * If the offset is on a word boundary that can be considered the start and end of a
202      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
203      * between AA and BB, and getPrevWordBeginningOnTwoWordsBoundary is true then this would
204      * return the start of the previous word, AA. Otherwise it would return the current offset,
205      * the start of BB.
206      *
207      * Returns BreakIterator.DONE if there is no previous boundary.
208      *
209      * @throws IllegalArgumentException is offset is not valid.
210      */
getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary)211     private int getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary) {
212         checkOffsetIsValid(offset);
213 
214         if (isOnLetterOrDigit(offset)) {
215             if (mIterator.isBoundary(offset)
216                     && (!isAfterLetterOrDigit(offset)
217                             || !getPrevWordBeginningOnTwoWordsBoundary)) {
218                 return offset;
219             } else {
220                 return mIterator.preceding(offset);
221             }
222         } else {
223             if (isAfterLetterOrDigit(offset)) {
224                 return mIterator.preceding(offset);
225             }
226         }
227         return BreakIterator.DONE;
228     }
229 
230     /**
231      * If the <code>offset</code> is within a word or on a word boundary that can only be
232      * considered the end of a word (e.g. word_ where "_" is any character that would not be
233      * considered part of the word) then this returns the index of the last character plus one
234      * of that word.
235      *
236      * If the offset is on a word boundary that can be considered the start and end of a
237      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
238      * between AA and BB, and getNextWordEndOnTwoWordBoundary is true then this would return
239      * the end of the next word, BB. Otherwise it would return the current offset, the end
240      * of AA.
241      *
242      * Returns BreakIterator.DONE if there is no next boundary.
243      *
244      * @throws IllegalArgumentException is offset is not valid.
245      */
getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary)246     private int getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary) {
247         checkOffsetIsValid(offset);
248 
249         if (isAfterLetterOrDigit(offset)) {
250             if (mIterator.isBoundary(offset)
251                     && (!isOnLetterOrDigit(offset) || !getNextWordEndOnTwoWordBoundary)) {
252                 return offset;
253             } else {
254                 return mIterator.following(offset);
255             }
256         } else {
257             if (isOnLetterOrDigit(offset)) {
258                 return mIterator.following(offset);
259             }
260         }
261         return BreakIterator.DONE;
262     }
263 
264     /**
265      * If <code>offset</code> is within a group of punctuation as defined
266      * by {@link #isPunctuation(int)}, returns the index of the first character
267      * of that group, otherwise returns BreakIterator.DONE.
268      *
269      * @param offset the offset to search from.
270      */
getPunctuationBeginning(int offset)271     public int getPunctuationBeginning(int offset) {
272         checkOffsetIsValid(offset);
273         while (offset != BreakIterator.DONE && !isPunctuationStartBoundary(offset)) {
274             offset = prevBoundary(offset);
275         }
276         // No need to shift offset, prevBoundary handles that.
277         return offset;
278     }
279 
280     /**
281      * If <code>offset</code> is within a group of punctuation as defined
282      * by {@link #isPunctuation(int)}, returns the index of the last character
283      * of that group plus one, otherwise returns BreakIterator.DONE.
284      *
285      * @param offset the offset to search from.
286      */
getPunctuationEnd(int offset)287     public int getPunctuationEnd(int offset) {
288         checkOffsetIsValid(offset);
289         while (offset != BreakIterator.DONE && !isPunctuationEndBoundary(offset)) {
290             offset = nextBoundary(offset);
291         }
292         // No need to shift offset, nextBoundary handles that.
293         return offset;
294     }
295 
296     /**
297      * Indicates if the provided offset is after a punctuation character
298      * as defined by {@link #isPunctuation(int)}.
299      *
300      * @param offset the offset to check from.
301      * @return Whether the offset is after a punctuation character.
302      */
isAfterPunctuation(int offset)303     public boolean isAfterPunctuation(int offset) {
304         if (mStart < offset && offset <= mEnd) {
305             final int codePoint = Character.codePointBefore(mCharSeq, offset);
306             return isPunctuation(codePoint);
307         }
308         return false;
309     }
310 
311     /**
312      * Indicates if the provided offset is at a punctuation character
313      * as defined by {@link #isPunctuation(int)}.
314      *
315      * @param offset the offset to check from.
316      * @return Whether the offset is at a punctuation character.
317      */
isOnPunctuation(int offset)318     public boolean isOnPunctuation(int offset) {
319         if (mStart <= offset && offset < mEnd) {
320             final int codePoint = Character.codePointAt(mCharSeq, offset);
321             return isPunctuation(codePoint);
322         }
323         return false;
324     }
325 
326     /**
327      * Indicates if the codepoint is a mid-word-only punctuation.
328      *
329      * At the moment, this is locale-independent, and includes all the characters in
330      * the MidLetter, MidNumLet, and Single_Quote class of Unicode word breaking algorithm (see
331      * UAX #29 "Unicode Text Segmentation" at http://unicode.org/reports/tr29/). These are all the
332      * characters that according to the rules WB6 and WB7 of UAX #29 prevent word breaks if they are
333      * in the middle of a word, but they become word breaks if they happen at the end of a word
334      * (accroding to rule WB999 that breaks word in any place that is not prohibited otherwise).
335      *
336      * @param locale the locale to consider the codepoint in. Presently ignored.
337      * @param codePoint the codepoint to check.
338      * @return True if the codepoint is a mid-word punctuation.
339      */
isMidWordPunctuation(Locale locale, int codePoint)340     public static boolean isMidWordPunctuation(Locale locale, int codePoint) {
341         final int wb = UCharacter.getIntPropertyValue(codePoint, UProperty.WORD_BREAK);
342         return (wb == UCharacter.WordBreak.MIDLETTER
343                 || wb == UCharacter.WordBreak.MIDNUMLET
344                 || wb == UCharacter.WordBreak.SINGLE_QUOTE);
345     }
346 
isPunctuationStartBoundary(int offset)347     private boolean isPunctuationStartBoundary(int offset) {
348         return isOnPunctuation(offset) && !isAfterPunctuation(offset);
349     }
350 
isPunctuationEndBoundary(int offset)351     private boolean isPunctuationEndBoundary(int offset) {
352         return !isOnPunctuation(offset) && isAfterPunctuation(offset);
353     }
354 
isPunctuation(int cp)355     private static boolean isPunctuation(int cp) {
356         final int type = Character.getType(cp);
357         return (type == Character.CONNECTOR_PUNCTUATION
358                 || type == Character.DASH_PUNCTUATION
359                 || type == Character.END_PUNCTUATION
360                 || type == Character.FINAL_QUOTE_PUNCTUATION
361                 || type == Character.INITIAL_QUOTE_PUNCTUATION
362                 || type == Character.OTHER_PUNCTUATION
363                 || type == Character.START_PUNCTUATION);
364     }
365 
isAfterLetterOrDigit(int offset)366     private boolean isAfterLetterOrDigit(int offset) {
367         if (mStart < offset && offset <= mEnd) {
368             final int codePoint = Character.codePointBefore(mCharSeq, offset);
369             if (Character.isLetterOrDigit(codePoint)) return true;
370         }
371         return false;
372     }
373 
isOnLetterOrDigit(int offset)374     private boolean isOnLetterOrDigit(int offset) {
375         if (mStart <= offset && offset < mEnd) {
376             final int codePoint = Character.codePointAt(mCharSeq, offset);
377             if (Character.isLetterOrDigit(codePoint)) return true;
378         }
379         return false;
380     }
381 
checkOffsetIsValid(int offset)382     private void checkOffsetIsValid(int offset) {
383         if (!(mStart <= offset && offset <= mEnd)) {
384             throw new IllegalArgumentException("Invalid offset: " + (offset) +
385                     ". Valid range is [" + mStart + ", " + mEnd + "]");
386         }
387     }
388 }
389