/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package android.text.method;

import android.annotation.NonNull;
import android.icu.lang.UCharacter;
import android.icu.lang.UProperty;
import android.icu.text.BreakIterator;
import android.text.CharSequenceCharacterIterator;
import android.text.Selection;

import java.util.Locale;

/**
 * Walks through cursor positions at word boundaries. Internally uses
 * {@link BreakIterator#getWordInstance()}, and caches {@link CharSequence}
 * for performance reasons.
 *
 * Also provides methods to determine word boundaries.
 *
 * All offsets accepted by this class are indexes into the {@link CharSequence} given to
 * {@link #setCharSequence(CharSequence, int, int)} and must lie inside the window computed by
 * that call.
 * {@hide}
 */
public class WordIterator implements Selection.PositionIterator {
    // Size of the window for the word iterator, should be greater than the longest word's length
    private static final int WINDOW_WIDTH = 50;

    // Inclusive lower bound of the window currently handed to the break iterator.
    private int mStart;
    // Upper bound of the window; a valid offset (see checkOffsetIsValid) but not a valid
    // character index.
    private int mEnd;
    // Text given to the last setCharSequence call; null until that method has been called.
    private CharSequence mCharSeq;
    private final BreakIterator mIterator;

    /**
     * Constructs a WordIterator using the default locale.
     */
    public WordIterator() {
        this(Locale.getDefault());
    }

    /**
     * Constructs a new WordIterator for the specified locale.
     * @param locale The locale to be used for analyzing the text.
     */
    public WordIterator(Locale locale) {
        mIterator = BreakIterator.getWordInstance(locale);
    }

    /**
     * Sets the text to analyze and the window of that text the iterator works on.
     *
     * The window is the requested range widened by {@code WINDOW_WIDTH} characters on each
     * side and clamped to {@code [0, charSequence.length()]}. Every offset later passed to this
     * object must be inside that window.
     *
     * Note: {@code start} is not checked against {@code end}; callers are expected to pass
     * {@code start <= end}.
     *
     * @param charSequence the text to iterate over.
     * @param start the beginning of the range of interest.
     * @param end the end of the range of interest.
     * @throws IndexOutOfBoundsException if {@code start} is negative or {@code end} is greater
     *         than the length of {@code charSequence}.
     */
    public void setCharSequence(@NonNull CharSequence charSequence, int start, int end) {
        if (start < 0 || end > charSequence.length()) {
            throw new IndexOutOfBoundsException("input indexes are outside the CharSequence");
        }
        mCharSeq = charSequence;
        mStart = Math.max(0, start - WINDOW_WIDTH);
        mEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH);
        mIterator.setText(new CharSequenceCharacterIterator(charSequence, mStart, mEnd));
    }

    /**
     * Returns the closest boundary before the given offset that is located on a letter or a
     * digit, or {@code BreakIterator.DONE} if the break iterator runs out of boundaries first.
     *
     * @param offset the position to search backwards from.
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int preceding(int offset) {
        checkOffsetIsValid(offset);
        int boundary = offset;
        do {
            boundary = mIterator.preceding(boundary);
            // Boundaries that are not the start of a run of letters/digits are skipped.
        } while (boundary != BreakIterator.DONE && !isOnLetterOrDigit(boundary));
        return boundary;
    }

    /**
     * Returns the closest boundary after the given offset that is located right after a letter
     * or a digit, or {@code BreakIterator.DONE} if the break iterator runs out of boundaries
     * first.
     *
     * @param offset the position to search forwards from.
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int following(int offset) {
        checkOffsetIsValid(offset);
        int boundary = offset;
        do {
            boundary = mIterator.following(boundary);
            // Boundaries that do not end a run of letters/digits are skipped.
        } while (boundary != BreakIterator.DONE && !isAfterLetterOrDigit(boundary));
        return boundary;
    }

    /**
     * Indicates whether the given offset is a boundary for the underlying break iterator.
     *
     * @param offset the position to test.
     * @return the result of {@link BreakIterator#isBoundary(int)} for this offset.
     * @throws IllegalArgumentException if offset is not valid.
     */
    public boolean isBoundary(int offset) {
        checkOffsetIsValid(offset);
        return mIterator.isBoundary(offset);
    }

    /**
     * Returns the position of next boundary after the given offset. Returns
     * {@code DONE} if there is no boundary after the given offset.
     *
     * @param offset the given start position to search from.
     * @return the position of the first boundary following the given offset.
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int nextBoundary(int offset) {
        checkOffsetIsValid(offset);
        return mIterator.following(offset);
    }

    /**
     * Returns the position of boundary preceding the given offset or
     * {@code DONE} if the given offset specifies the starting position.
     *
     * @param offset the given start position to search from.
     * @return the position of the last boundary preceding the given offset.
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int prevBoundary(int offset) {
        checkOffsetIsValid(offset);
        return mIterator.preceding(offset);
    }

    /** If <code>offset</code> is within a word, returns the index of the first character of that
     * word, otherwise returns BreakIterator.DONE.
     *
     * The offsets that are considered to be part of a word are the indexes of its characters,
     * <i>as well as</i> the index of its last character plus one.
     * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
     *
     * Valid range for offset is [0..textLength] (note the inclusive upper bound).
     * The returned value is within [0..offset] or BreakIterator.DONE.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int getBeginning(int offset) {
        // TODO: Check if usage of this can be updated to getBeginning(offset, true) if
        // so this method can be removed.
        return getBeginning(offset, false);
    }

    /**
     * If <code>offset</code> is within a word, returns the index of the last character of that
     * word plus one, otherwise returns BreakIterator.DONE.
     *
     * The offsets that are considered to be part of a word are the indexes of its characters,
     * <i>as well as</i> the index of its last character plus one.
     * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
     *
     * Valid range for offset is [0..textLength] (note the inclusive upper bound).
     * The returned value is within [offset..textLength] or BreakIterator.DONE.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int getEnd(int offset) {
        // TODO: Check if usage of this can be updated to getEnd(offset, true), if
        // so this method can be removed.
        return getEnd(offset, false);
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the start of a word (e.g. _word where "_" is any character that would not
     * be considered part of the word) then this returns the index of the first character of
     * that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, this would return the start of the previous word, AA.
     *
     * Returns BreakIterator.DONE if there is no previous boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int getPrevWordBeginningOnTwoWordsBoundary(int offset) {
        return getBeginning(offset, true);
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the end of a word (e.g. word_ where "_" is any character that would not
     * be considered part of the word) then this returns the index of the last character
     * plus one of that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, this would return the end of the next word, BB.
     *
     * Returns BreakIterator.DONE if there is no next boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int getNextWordEndOnTwoWordBoundary(int offset) {
        return getEnd(offset, true);
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the start of a word (e.g. _word where "_" is any character that would not
     * be considered part of the word) then this returns the index of the first character of
     * that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, and getPrevWordBeginningOnTwoWordsBoundary is true then this would
     * return the start of the previous word, AA. Otherwise it would return the current offset,
     * the start of BB.
     *
     * Returns BreakIterator.DONE if there is no previous boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    private int getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary) {
        checkOffsetIsValid(offset);

        if (isOnLetterOrDigit(offset)) {
            // A word character follows the offset: the offset itself is the answer when it is
            // a boundary, unless it also ends a word and the caller asked for the previous one.
            final boolean answerIsOffset = mIterator.isBoundary(offset)
                    && (!isAfterLetterOrDigit(offset)
                            || !getPrevWordBeginningOnTwoWordsBoundary);
            return answerIsOffset ? offset : mIterator.preceding(offset);
        }
        if (isAfterLetterOrDigit(offset)) {
            // The offset sits right after the last character of a word.
            return mIterator.preceding(offset);
        }
        // No word character on either side of the offset.
        return BreakIterator.DONE;
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the end of a word (e.g. word_ where "_" is any character that would not be
     * considered part of the word) then this returns the index of the last character plus one
     * of that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, and getNextWordEndOnTwoWordBoundary is true then this would return
     * the end of the next word, BB. Otherwise it would return the current offset, the end
     * of AA.
     *
     * Returns BreakIterator.DONE if there is no next boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    private int getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary) {
        checkOffsetIsValid(offset);

        if (isAfterLetterOrDigit(offset)) {
            // A word character precedes the offset: the offset itself is the answer when it is
            // a boundary, unless it also starts a word and the caller asked for the next one.
            final boolean answerIsOffset = mIterator.isBoundary(offset)
                    && (!isOnLetterOrDigit(offset) || !getNextWordEndOnTwoWordBoundary);
            return answerIsOffset ? offset : mIterator.following(offset);
        }
        if (isOnLetterOrDigit(offset)) {
            // The offset sits on the first character of a word.
            return mIterator.following(offset);
        }
        // No word character on either side of the offset.
        return BreakIterator.DONE;
    }

    /**
     * If <code>offset</code> is within a group of punctuation as defined
     * by {@link #isPunctuation(int)}, returns the index of the first character
     * of that group, otherwise returns BreakIterator.DONE.
     *
     * @param offset the offset to search from.
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int getPunctuationBeginning(int offset) {
        checkOffsetIsValid(offset);
        int position = offset;
        while (position != BreakIterator.DONE && !isPunctuationStartBoundary(position)) {
            position = prevBoundary(position);
        }
        // No need to shift offset, prevBoundary handles that.
        return position;
    }

    /**
     * If <code>offset</code> is within a group of punctuation as defined
     * by {@link #isPunctuation(int)}, returns the index of the last character
     * of that group plus one, otherwise returns BreakIterator.DONE.
     *
     * @param offset the offset to search from.
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int getPunctuationEnd(int offset) {
        checkOffsetIsValid(offset);
        int position = offset;
        while (position != BreakIterator.DONE && !isPunctuationEndBoundary(position)) {
            position = nextBoundary(position);
        }
        // No need to shift offset, nextBoundary handles that.
        return position;
    }

    /**
     * Indicates if the provided offset is after a punctuation character
     * as defined by {@link #isPunctuation(int)}.
     *
     * @param offset the offset to check from.
     * @return Whether the offset is after a punctuation character. Always false when there is
     *         no character before the offset inside the current window.
     */
    public boolean isAfterPunctuation(int offset) {
        if (offset <= mStart || offset > mEnd) {
            return false;
        }
        return isPunctuation(Character.codePointBefore(mCharSeq, offset));
    }

    /**
     * Indicates if the provided offset is at a punctuation character
     * as defined by {@link #isPunctuation(int)}.
     *
     * @param offset the offset to check from.
     * @return Whether the offset is at a punctuation character. Always false when there is
     *         no character at the offset inside the current window.
     */
    public boolean isOnPunctuation(int offset) {
        if (offset < mStart || offset >= mEnd) {
            return false;
        }
        return isPunctuation(Character.codePointAt(mCharSeq, offset));
    }

    /**
     * Indicates if the codepoint is a mid-word-only punctuation.
     *
     * At the moment, this is locale-independent, and includes all the characters in
     * the MidLetter, MidNumLet, and Single_Quote class of Unicode word breaking algorithm (see
     * UAX #29 "Unicode Text Segmentation" at http://unicode.org/reports/tr29/). These are all the
     * characters that according to the rules WB6 and WB7 of UAX #29 prevent word breaks if they are
     * in the middle of a word, but they become word breaks if they happen at the end of a word
     * (according to rule WB999 that breaks word in any place that is not prohibited otherwise).
     *
     * @param locale the locale to consider the codepoint in. Presently ignored.
     * @param codePoint the codepoint to check.
     * @return True if the codepoint is a mid-word punctuation.
     */
    public static boolean isMidWordPunctuation(Locale locale, int codePoint) {
        final int wordBreak = UCharacter.getIntPropertyValue(codePoint, UProperty.WORD_BREAK);
        return wordBreak == UCharacter.WordBreak.MIDLETTER
                || wordBreak == UCharacter.WordBreak.MIDNUMLET
                || wordBreak == UCharacter.WordBreak.SINGLE_QUOTE;
    }

    /** Returns true if a punctuation character is at the offset but none is right before it. */
    private boolean isPunctuationStartBoundary(int offset) {
        return isOnPunctuation(offset) && !isAfterPunctuation(offset);
    }

    /** Returns true if a punctuation character is right before the offset but none is at it. */
    private boolean isPunctuationEndBoundary(int offset) {
        return !isOnPunctuation(offset) && isAfterPunctuation(offset);
    }

    /**
     * Returns true if the general category of the code point, as reported by
     * {@link Character#getType(int)}, is one of the punctuation categories.
     */
    private static boolean isPunctuation(int cp) {
        switch (Character.getType(cp)) {
            case Character.CONNECTOR_PUNCTUATION:
            case Character.DASH_PUNCTUATION:
            case Character.END_PUNCTUATION:
            case Character.FINAL_QUOTE_PUNCTUATION:
            case Character.INITIAL_QUOTE_PUNCTUATION:
            case Character.OTHER_PUNCTUATION:
            case Character.START_PUNCTUATION:
                return true;
            default:
                return false;
        }
    }

    /**
     * Returns true if the code point ending right before the offset is a letter or a digit.
     * Returns false when the offset is outside of (mStart, mEnd].
     */
    private boolean isAfterLetterOrDigit(int offset) {
        if (offset <= mStart || offset > mEnd) {
            return false;
        }
        return Character.isLetterOrDigit(Character.codePointBefore(mCharSeq, offset));
    }

    /**
     * Returns true if the code point starting at the offset is a letter or a digit.
     * Returns false when the offset is outside of [mStart, mEnd).
     */
    private boolean isOnLetterOrDigit(int offset) {
        if (offset < mStart || offset >= mEnd) {
            return false;
        }
        return Character.isLetterOrDigit(Character.codePointAt(mCharSeq, offset));
    }

    /**
     * Verifies that the offset is inside the current window, both bounds included.
     *
     * @throws IllegalArgumentException if the offset is outside of [mStart, mEnd].
     */
    private void checkOffsetIsValid(int offset) {
        if (offset < mStart || offset > mEnd) {
            throw new IllegalArgumentException("Invalid offset: " + offset
                    + ". Valid range is [" + mStart + ", " + mEnd + "]");
        }
    }
}