1 
2 /*
3  * Copyright (C) 2011 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  *      http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 package android.text.method;
19 
20 import android.text.Selection;
21 import android.text.SpannableStringBuilder;
22 
23 import android.icu.text.BreakIterator;
24 import java.util.Locale;
25 
26 /**
27  * Walks through cursor positions at word boundaries. Internally uses
28  * {@link BreakIterator#getWordInstance()}, and caches {@link CharSequence}
29  * for performance reasons.
30  *
31  * Also provides methods to determine word boundaries.
32  * {@hide}
33  */
34 public class WordIterator implements Selection.PositionIterator {
35     // Size of the window for the word iterator, should be greater than the longest word's length
36     private static final int WINDOW_WIDTH = 50;
37 
38     private String mString;
39     private int mOffsetShift;
40 
41     private BreakIterator mIterator;
42 
43     /**
44      * Constructs a WordIterator using the default locale.
45      */
WordIterator()46     public WordIterator() {
47         this(Locale.getDefault());
48     }
49 
50     /**
51      * Constructs a new WordIterator for the specified locale.
52      * @param locale The locale to be used when analysing the text.
53      */
WordIterator(Locale locale)54     public WordIterator(Locale locale) {
55         mIterator = BreakIterator.getWordInstance(locale);
56     }
57 
setCharSequence(CharSequence charSequence, int start, int end)58     public void setCharSequence(CharSequence charSequence, int start, int end) {
59         mOffsetShift = Math.max(0, start - WINDOW_WIDTH);
60         final int windowEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH);
61 
62         if (charSequence instanceof SpannableStringBuilder) {
63             mString = ((SpannableStringBuilder) charSequence).substring(mOffsetShift, windowEnd);
64         } else {
65             mString = charSequence.subSequence(mOffsetShift, windowEnd).toString();
66         }
67         mIterator.setText(mString);
68     }
69 
70     /** {@inheritDoc} */
preceding(int offset)71     public int preceding(int offset) {
72         int shiftedOffset = offset - mOffsetShift;
73         do {
74             shiftedOffset = mIterator.preceding(shiftedOffset);
75             if (shiftedOffset == BreakIterator.DONE) {
76                 return BreakIterator.DONE;
77             }
78             if (isOnLetterOrDigit(shiftedOffset)) {
79                 return shiftedOffset + mOffsetShift;
80             }
81         } while (true);
82     }
83 
84     /** {@inheritDoc} */
following(int offset)85     public int following(int offset) {
86         int shiftedOffset = offset - mOffsetShift;
87         do {
88             shiftedOffset = mIterator.following(shiftedOffset);
89             if (shiftedOffset == BreakIterator.DONE) {
90                 return BreakIterator.DONE;
91             }
92             if (isAfterLetterOrDigit(shiftedOffset)) {
93                 return shiftedOffset + mOffsetShift;
94             }
95         } while (true);
96     }
97 
98     /** {@inheritDoc} */
isBoundary(int offset)99     public boolean isBoundary(int offset) {
100         int shiftedOffset = offset - mOffsetShift;
101         checkOffsetIsValid(shiftedOffset);
102         return mIterator.isBoundary(shiftedOffset);
103     }
104 
105     /**
106      * Returns the position of next boundary after the given offset. Returns
107      * {@code DONE} if there is no boundary after the given offset.
108      *
109      * @param offset the given start position to search from.
110      * @return the position of the last boundary preceding the given offset.
111      */
nextBoundary(int offset)112     public int nextBoundary(int offset) {
113         int shiftedOffset = offset - mOffsetShift;
114         shiftedOffset = mIterator.following(shiftedOffset);
115         if (shiftedOffset == BreakIterator.DONE) {
116             return BreakIterator.DONE;
117         }
118         return shiftedOffset + mOffsetShift;
119     }
120 
121     /**
122      * Returns the position of boundary preceding the given offset or
123      * {@code DONE} if the given offset specifies the starting position.
124      *
125      * @param offset the given start position to search from.
126      * @return the position of the last boundary preceding the given offset.
127      */
prevBoundary(int offset)128     public int prevBoundary(int offset) {
129         int shiftedOffset = offset - mOffsetShift;
130         shiftedOffset = mIterator.preceding(shiftedOffset);
131         if (shiftedOffset == BreakIterator.DONE) {
132             return BreakIterator.DONE;
133         }
134         return shiftedOffset + mOffsetShift;
135     }
136 
137     /** If <code>offset</code> is within a word, returns the index of the first character of that
138      * word, otherwise returns BreakIterator.DONE.
139      *
140      * The offsets that are considered to be part of a word are the indexes of its characters,
141      * <i>as well as</i> the index of its last character plus one.
142      * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
143      *
144      * Valid range for offset is [0..textLength] (note the inclusive upper bound).
145      * The returned value is within [0..offset] or BreakIterator.DONE.
146      *
147      * @throws IllegalArgumentException is offset is not valid.
148      */
getBeginning(int offset)149     public int getBeginning(int offset) {
150         // TODO: Check if usage of this can be updated to getBeginning(offset, true) if
151         // so this method can be removed.
152         return getBeginning(offset, false);
153     }
154 
155     /**
156      * If <code>offset</code> is within a word, returns the index of the last character of that
157      * word plus one, otherwise returns BreakIterator.DONE.
158      *
159      * The offsets that are considered to be part of a word are the indexes of its characters,
160      * <i>as well as</i> the index of its last character plus one.
161      * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
162      *
163      * Valid range for offset is [0..textLength] (note the inclusive upper bound).
164      * The returned value is within [offset..textLength] or BreakIterator.DONE.
165      *
166      * @throws IllegalArgumentException is offset is not valid.
167      */
getEnd(int offset)168     public int getEnd(int offset) {
169         // TODO: Check if usage of this can be updated to getEnd(offset, true), if
170         // so this method can be removed.
171         return getEnd(offset, false);
172     }
173 
174     /**
175      * If the <code>offset</code> is within a word or on a word boundary that can only be
176      * considered the start of a word (e.g. _word where "_" is any character that would not
177      * be considered part of the word) then this returns the index of the first character of
178      * that word.
179      *
180      * If the offset is on a word boundary that can be considered the start and end of a
181      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
182      * between AA and BB, this would return the start of the previous word, AA.
183      *
184      * Returns BreakIterator.DONE if there is no previous boundary.
185      *
186      * @throws IllegalArgumentException is offset is not valid.
187      */
getPrevWordBeginningOnTwoWordsBoundary(int offset)188     public int getPrevWordBeginningOnTwoWordsBoundary(int offset) {
189         return getBeginning(offset, true);
190     }
191 
192     /**
193      * If the <code>offset</code> is within a word or on a word boundary that can only be
194      * considered the end of a word (e.g. word_ where "_" is any character that would not
195      * be considered part of the word) then this returns the index of the last character
196      * plus one of that word.
197      *
198      * If the offset is on a word boundary that can be considered the start and end of a
199      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
200      * between AA and BB, this would return the end of the next word, BB.
201      *
202      * Returns BreakIterator.DONE if there is no next boundary.
203      *
204      * @throws IllegalArgumentException is offset is not valid.
205      */
getNextWordEndOnTwoWordBoundary(int offset)206     public int getNextWordEndOnTwoWordBoundary(int offset) {
207         return getEnd(offset, true);
208     }
209 
210     /**
211      * If the <code>offset</code> is within a word or on a word boundary that can only be
212      * considered the start of a word (e.g. _word where "_" is any character that would not
213      * be considered part of the word) then this returns the index of the first character of
214      * that word.
215      *
216      * If the offset is on a word boundary that can be considered the start and end of a
217      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
218      * between AA and BB, and getPrevWordBeginningOnTwoWordsBoundary is true then this would
219      * return the start of the previous word, AA. Otherwise it would return the current offset,
220      * the start of BB.
221      *
222      * Returns BreakIterator.DONE if there is no previous boundary.
223      *
224      * @throws IllegalArgumentException is offset is not valid.
225      */
getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary)226     private int getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary) {
227         final int shiftedOffset = offset - mOffsetShift;
228         checkOffsetIsValid(shiftedOffset);
229 
230         if (isOnLetterOrDigit(shiftedOffset)) {
231             if (mIterator.isBoundary(shiftedOffset)
232                     && (!isAfterLetterOrDigit(shiftedOffset)
233                             || !getPrevWordBeginningOnTwoWordsBoundary)) {
234                 return shiftedOffset + mOffsetShift;
235             } else {
236                 return mIterator.preceding(shiftedOffset) + mOffsetShift;
237             }
238         } else {
239             if (isAfterLetterOrDigit(shiftedOffset)) {
240                 return mIterator.preceding(shiftedOffset) + mOffsetShift;
241             }
242         }
243         return BreakIterator.DONE;
244     }
245 
246     /**
247      * If the <code>offset</code> is within a word or on a word boundary that can only be
248      * considered the end of a word (e.g. word_ where "_" is any character that would not be
249      * considered part of the word) then this returns the index of the last character plus one
250      * of that word.
251      *
252      * If the offset is on a word boundary that can be considered the start and end of a
253      * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
254      * between AA and BB, and getNextWordEndOnTwoWordBoundary is true then this would return
255      * the end of the next word, BB. Otherwise it would return the current offset, the end
256      * of AA.
257      *
258      * Returns BreakIterator.DONE if there is no next boundary.
259      *
260      * @throws IllegalArgumentException is offset is not valid.
261      */
getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary)262     private int getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary) {
263         final int shiftedOffset = offset - mOffsetShift;
264         checkOffsetIsValid(shiftedOffset);
265 
266         if (isAfterLetterOrDigit(shiftedOffset)) {
267             if (mIterator.isBoundary(shiftedOffset)
268                     && (!isOnLetterOrDigit(shiftedOffset) || !getNextWordEndOnTwoWordBoundary)) {
269                 return shiftedOffset + mOffsetShift;
270             } else {
271                 return mIterator.following(shiftedOffset) + mOffsetShift;
272             }
273         } else {
274             if (isOnLetterOrDigit(shiftedOffset)) {
275                 return mIterator.following(shiftedOffset) + mOffsetShift;
276             }
277         }
278         return BreakIterator.DONE;
279     }
280 
281     /**
282      * If <code>offset</code> is within a group of punctuation as defined
283      * by {@link #isPunctuation(int)}, returns the index of the first character
284      * of that group, otherwise returns BreakIterator.DONE.
285      *
286      * @param offset the offset to search from.
287      */
getPunctuationBeginning(int offset)288     public int getPunctuationBeginning(int offset) {
289         while (offset != BreakIterator.DONE && !isPunctuationStartBoundary(offset)) {
290             offset = prevBoundary(offset);
291         }
292         // No need to shift offset, prevBoundary handles that.
293         return offset;
294     }
295 
296     /**
297      * If <code>offset</code> is within a group of punctuation as defined
298      * by {@link #isPunctuation(int)}, returns the index of the last character
299      * of that group plus one, otherwise returns BreakIterator.DONE.
300      *
301      * @param offset the offset to search from.
302      */
getPunctuationEnd(int offset)303     public int getPunctuationEnd(int offset) {
304         while (offset != BreakIterator.DONE && !isPunctuationEndBoundary(offset)) {
305             offset = nextBoundary(offset);
306         }
307         // No need to shift offset, nextBoundary handles that.
308         return offset;
309     }
310 
311     /**
312      * Indicates if the provided offset is after a punctuation character
313      * as defined by {@link #isPunctuation(int)}.
314      *
315      * @param offset the offset to check from.
316      * @return Whether the offset is after a punctuation character.
317      */
isAfterPunctuation(int offset)318     public boolean isAfterPunctuation(int offset) {
319         final int shiftedOffset = offset - mOffsetShift;
320         if (shiftedOffset >= 1 && shiftedOffset <= mString.length()) {
321             final int codePoint = mString.codePointBefore(shiftedOffset);
322             return isPunctuation(codePoint);
323         }
324         return false;
325     }
326 
327     /**
328      * Indicates if the provided offset is at a punctuation character
329      * as defined by {@link #isPunctuation(int)}.
330      *
331      * @param offset the offset to check from.
332      * @return Whether the offset is at a punctuation character.
333      */
isOnPunctuation(int offset)334     public boolean isOnPunctuation(int offset) {
335         final int shiftedOffset = offset - mOffsetShift;
336         if (shiftedOffset >= 0 && shiftedOffset < mString.length()) {
337             final int codePoint = mString.codePointAt(shiftedOffset);
338             return isPunctuation(codePoint);
339         }
340         return false;
341     }
342 
isPunctuationStartBoundary(int offset)343     private boolean isPunctuationStartBoundary(int offset) {
344         return isOnPunctuation(offset) && !isAfterPunctuation(offset);
345     }
346 
isPunctuationEndBoundary(int offset)347     private boolean isPunctuationEndBoundary(int offset) {
348         return !isOnPunctuation(offset) && isAfterPunctuation(offset);
349     }
350 
isPunctuation(int cp)351     private boolean isPunctuation(int cp) {
352         int type = Character.getType(cp);
353         return (type == Character.CONNECTOR_PUNCTUATION ||
354                 type == Character.DASH_PUNCTUATION ||
355                 type == Character.END_PUNCTUATION ||
356                 type == Character.FINAL_QUOTE_PUNCTUATION ||
357                 type == Character.INITIAL_QUOTE_PUNCTUATION ||
358                 type == Character.OTHER_PUNCTUATION ||
359                 type == Character.START_PUNCTUATION);
360     }
361 
isAfterLetterOrDigit(int shiftedOffset)362     private boolean isAfterLetterOrDigit(int shiftedOffset) {
363         if (shiftedOffset >= 1 && shiftedOffset <= mString.length()) {
364             final int codePoint = mString.codePointBefore(shiftedOffset);
365             if (Character.isLetterOrDigit(codePoint)) return true;
366         }
367         return false;
368     }
369 
isOnLetterOrDigit(int shiftedOffset)370     private boolean isOnLetterOrDigit(int shiftedOffset) {
371         if (shiftedOffset >= 0 && shiftedOffset < mString.length()) {
372             final int codePoint = mString.codePointAt(shiftedOffset);
373             if (Character.isLetterOrDigit(codePoint)) return true;
374         }
375         return false;
376     }
377 
checkOffsetIsValid(int shiftedOffset)378     private void checkOffsetIsValid(int shiftedOffset) {
379         if (shiftedOffset < 0 || shiftedOffset > mString.length()) {
380             throw new IllegalArgumentException("Invalid offset: " + (shiftedOffset + mOffsetShift) +
381                     ". Valid range is [" + mOffsetShift + ", " + (mString.length() + mOffsetShift) +
382                     "]");
383         }
384     }
385 }
386