1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package android.text.method; 18 19 import android.annotation.NonNull; 20 import android.compat.annotation.UnsupportedAppUsage; 21 import android.icu.lang.UCharacter; 22 import android.icu.lang.UProperty; 23 import android.icu.text.BreakIterator; 24 import android.icu.util.ULocale; 25 import android.os.Build; 26 import android.text.CharSequenceCharacterIterator; 27 import android.text.Selection; 28 import android.text.TextUtils; 29 30 import java.util.Locale; 31 32 /** 33 * Walks through cursor positions at word boundaries. Internally uses 34 * {@link BreakIterator#getWordInstance()}, and caches {@link CharSequence} 35 * for performance reasons. 36 * 37 * Also provides methods to determine word boundaries. 38 * {@hide} 39 */ 40 public class WordIterator implements Selection.PositionIterator { 41 // Size of the window for the word iterator, should be greater than the longest word's length 42 private static final int WINDOW_WIDTH = 50; 43 44 private int mStart, mEnd; 45 private CharSequence mCharSeq; 46 private final BreakIterator mIterator; 47 48 /** 49 * Constructs a WordIterator using the default locale. 50 */ WordIterator()51 public WordIterator() { 52 this(Locale.getDefault()); 53 } 54 55 /** 56 * Constructs a new WordIterator for the specified locale. 57 * @param locale The locale to be used for analyzing the text. 58 */ 59 @UnsupportedAppUsage WordIterator(Locale locale)60 public WordIterator(Locale locale) { 61 mIterator = BreakIterator.getWordInstance(locale); 62 } 63 64 /** 65 * Constructs a new WordIterator for the specified locale. 66 * @param locale The locale to be used for analyzing the text. 67 */ WordIterator(ULocale locale)68 public WordIterator(ULocale locale) { 69 mIterator = BreakIterator.getWordInstance(locale); 70 } 71 72 @UnsupportedAppUsage setCharSequence(@onNull CharSequence charSequence, int start, int end)73 public void setCharSequence(@NonNull CharSequence charSequence, int start, int end) { 74 if (0 <= start && end <= charSequence.length()) { 75 mCharSeq = charSequence; 76 mStart = Math.max(0, start - WINDOW_WIDTH); 77 mEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH); 78 mIterator.setText(new CharSequenceCharacterIterator(charSequence, mStart, mEnd)); 79 } else { 80 throw new IndexOutOfBoundsException("input indexes are outside the CharSequence"); 81 } 82 } 83 84 /** {@inheritDoc} */ 85 @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553) preceding(int offset)86 public int preceding(int offset) { 87 checkOffsetIsValid(offset); 88 while (true) { 89 offset = mIterator.preceding(offset); 90 if (offset == BreakIterator.DONE || isOnLetterOrDigit(offset)) { 91 return offset; 92 } 93 } 94 } 95 96 /** {@inheritDoc} */ 97 @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553) following(int offset)98 public int following(int offset) { 99 checkOffsetIsValid(offset); 100 while (true) { 101 offset = mIterator.following(offset); 102 if (offset == BreakIterator.DONE || isAfterLetterOrDigit(offset)) { 103 return offset; 104 } 105 } 106 } 107 108 /** {@inheritDoc} */ 109 @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553) isBoundary(int offset)110 public boolean isBoundary(int offset) { 111 checkOffsetIsValid(offset); 112 return mIterator.isBoundary(offset); 113 } 114 115 /** 116 * Returns the position of next boundary after the given offset. Returns 117 * {@code DONE} if there is no boundary after the given offset. 118 * 119 * @param offset the given start position to search from. 120 * @return the position of the last boundary preceding the given offset. 121 */ 122 @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553) nextBoundary(int offset)123 public int nextBoundary(int offset) { 124 checkOffsetIsValid(offset); 125 return mIterator.following(offset); 126 } 127 128 /** 129 * Returns the position of boundary preceding the given offset or 130 * {@code DONE} if the given offset specifies the starting position. 131 * 132 * @param offset the given start position to search from. 133 * @return the position of the last boundary preceding the given offset. 134 */ 135 @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553) prevBoundary(int offset)136 public int prevBoundary(int offset) { 137 checkOffsetIsValid(offset); 138 return mIterator.preceding(offset); 139 } 140 141 /** If <code>offset</code> is within a word, returns the index of the first character of that 142 * word, otherwise returns BreakIterator.DONE. 143 * 144 * The offsets that are considered to be part of a word are the indexes of its characters, 145 * <i>as well as</i> the index of its last character plus one. 146 * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned. 147 * 148 * Valid range for offset is [0..textLength] (note the inclusive upper bound). 149 * The returned value is within [0..offset] or BreakIterator.DONE. 150 * 151 * @throws IllegalArgumentException is offset is not valid. 152 */ 153 @UnsupportedAppUsage getBeginning(int offset)154 public int getBeginning(int offset) { 155 // TODO: Check if usage of this can be updated to getBeginning(offset, true) if 156 // so this method can be removed. 157 return getBeginning(offset, false); 158 } 159 160 /** 161 * If <code>offset</code> is within a word, returns the index of the last character of that 162 * word plus one, otherwise returns BreakIterator.DONE. 163 * 164 * The offsets that are considered to be part of a word are the indexes of its characters, 165 * <i>as well as</i> the index of its last character plus one. 166 * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned. 167 * 168 * Valid range for offset is [0..textLength] (note the inclusive upper bound). 169 * The returned value is within [offset..textLength] or BreakIterator.DONE. 170 * 171 * @throws IllegalArgumentException is offset is not valid. 172 */ 173 @UnsupportedAppUsage getEnd(int offset)174 public int getEnd(int offset) { 175 // TODO: Check if usage of this can be updated to getEnd(offset, true), if 176 // so this method can be removed. 177 return getEnd(offset, false); 178 } 179 180 /** 181 * If the <code>offset</code> is within a word or on a word boundary that can only be 182 * considered the start of a word (e.g. _word where "_" is any character that would not 183 * be considered part of the word) then this returns the index of the first character of 184 * that word. 185 * 186 * If the offset is on a word boundary that can be considered the start and end of a 187 * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary 188 * between AA and BB, this would return the start of the previous word, AA. 189 * 190 * Returns BreakIterator.DONE if there is no previous boundary. 191 * 192 * @throws IllegalArgumentException is offset is not valid. 193 */ 194 @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553) getPrevWordBeginningOnTwoWordsBoundary(int offset)195 public int getPrevWordBeginningOnTwoWordsBoundary(int offset) { 196 return getBeginning(offset, true); 197 } 198 199 /** 200 * If the <code>offset</code> is within a word or on a word boundary that can only be 201 * considered the end of a word (e.g. word_ where "_" is any character that would not 202 * be considered part of the word) then this returns the index of the last character 203 * plus one of that word. 204 * 205 * If the offset is on a word boundary that can be considered the start and end of a 206 * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary 207 * between AA and BB, this would return the end of the next word, BB. 208 * 209 * Returns BreakIterator.DONE if there is no next boundary. 210 * 211 * @throws IllegalArgumentException is offset is not valid. 212 */ 213 @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553) getNextWordEndOnTwoWordBoundary(int offset)214 public int getNextWordEndOnTwoWordBoundary(int offset) { 215 return getEnd(offset, true); 216 } 217 218 /** 219 * If the <code>offset</code> is within a word or on a word boundary that can only be 220 * considered the start of a word (e.g. _word where "_" is any character that would not 221 * be considered part of the word) then this returns the index of the first character of 222 * that word. 223 * 224 * If the offset is on a word boundary that can be considered the start and end of a 225 * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary 226 * between AA and BB, and getPrevWordBeginningOnTwoWordsBoundary is true then this would 227 * return the start of the previous word, AA. Otherwise it would return the current offset, 228 * the start of BB. 229 * 230 * Returns BreakIterator.DONE if there is no previous boundary. 231 * 232 * @throws IllegalArgumentException is offset is not valid. 233 */ getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary)234 private int getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary) { 235 checkOffsetIsValid(offset); 236 237 if (isOnLetterOrDigit(offset)) { 238 if (mIterator.isBoundary(offset) 239 && (!isAfterLetterOrDigit(offset) 240 || !getPrevWordBeginningOnTwoWordsBoundary)) { 241 return offset; 242 } else { 243 return mIterator.preceding(offset); 244 } 245 } else { 246 if (isAfterLetterOrDigit(offset)) { 247 return mIterator.preceding(offset); 248 } 249 } 250 return BreakIterator.DONE; 251 } 252 253 /** 254 * If the <code>offset</code> is within a word or on a word boundary that can only be 255 * considered the end of a word (e.g. word_ where "_" is any character that would not be 256 * considered part of the word) then this returns the index of the last character plus one 257 * of that word. 258 * 259 * If the offset is on a word boundary that can be considered the start and end of a 260 * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary 261 * between AA and BB, and getNextWordEndOnTwoWordBoundary is true then this would return 262 * the end of the next word, BB. Otherwise it would return the current offset, the end 263 * of AA. 264 * 265 * Returns BreakIterator.DONE if there is no next boundary. 266 * 267 * @throws IllegalArgumentException is offset is not valid. 268 */ getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary)269 private int getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary) { 270 checkOffsetIsValid(offset); 271 272 if (isAfterLetterOrDigit(offset)) { 273 if (mIterator.isBoundary(offset) 274 && (!isOnLetterOrDigit(offset) || !getNextWordEndOnTwoWordBoundary)) { 275 return offset; 276 } else { 277 return mIterator.following(offset); 278 } 279 } else { 280 if (isOnLetterOrDigit(offset)) { 281 return mIterator.following(offset); 282 } 283 } 284 return BreakIterator.DONE; 285 } 286 287 /** 288 * If <code>offset</code> is within a group of punctuation as defined by {@link 289 * TextUtils#isPunctuation(int)}, returns the index of the first character of that group, 290 * otherwise returns BreakIterator.DONE. 291 * 292 * @param offset the offset to search from. 293 */ 294 @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553) getPunctuationBeginning(int offset)295 public int getPunctuationBeginning(int offset) { 296 checkOffsetIsValid(offset); 297 while (offset != BreakIterator.DONE && !isPunctuationStartBoundary(offset)) { 298 offset = prevBoundary(offset); 299 } 300 // No need to shift offset, prevBoundary handles that. 301 return offset; 302 } 303 304 /** 305 * If <code>offset</code> is within a group of punctuation as defined by {@link 306 * TextUtils#isPunctuation(int)}, returns the index of the last character of that group plus 307 * one, otherwise returns BreakIterator.DONE. 308 * 309 * @param offset the offset to search from. 310 */ 311 @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553) getPunctuationEnd(int offset)312 public int getPunctuationEnd(int offset) { 313 checkOffsetIsValid(offset); 314 while (offset != BreakIterator.DONE && !isPunctuationEndBoundary(offset)) { 315 offset = nextBoundary(offset); 316 } 317 // No need to shift offset, nextBoundary handles that. 318 return offset; 319 } 320 321 /** 322 * Indicates if the provided offset is after a punctuation character as defined by {@link 323 * TextUtils#isPunctuation(int)}. 324 * 325 * @param offset the offset to check from. 326 * @return Whether the offset is after a punctuation character. 327 */ 328 @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553) isAfterPunctuation(int offset)329 public boolean isAfterPunctuation(int offset) { 330 if (mStart < offset && offset <= mEnd) { 331 final int codePoint = Character.codePointBefore(mCharSeq, offset); 332 return TextUtils.isPunctuation(codePoint); 333 } 334 return false; 335 } 336 337 /** 338 * Indicates if the provided offset is at a punctuation character as defined by {@link 339 * TextUtils#isPunctuation(int)}. 340 * 341 * @param offset the offset to check from. 342 * @return Whether the offset is at a punctuation character. 343 */ 344 @UnsupportedAppUsage(maxTargetSdk = Build.VERSION_CODES.R, trackingBug = 170729553) isOnPunctuation(int offset)345 public boolean isOnPunctuation(int offset) { 346 if (mStart <= offset && offset < mEnd) { 347 final int codePoint = Character.codePointAt(mCharSeq, offset); 348 return TextUtils.isPunctuation(codePoint); 349 } 350 return false; 351 } 352 353 /** 354 * Indicates if the codepoint is a mid-word-only punctuation. 355 * 356 * At the moment, this is locale-independent, and includes all the characters in 357 * the MidLetter, MidNumLet, and Single_Quote class of Unicode word breaking algorithm (see 358 * UAX #29 "Unicode Text Segmentation" at http://unicode.org/reports/tr29/). These are all the 359 * characters that according to the rules WB6 and WB7 of UAX #29 prevent word breaks if they are 360 * in the middle of a word, but they become word breaks if they happen at the end of a word 361 * (accroding to rule WB999 that breaks word in any place that is not prohibited otherwise). 362 * 363 * @param locale the locale to consider the codepoint in. Presently ignored. 364 * @param codePoint the codepoint to check. 365 * @return True if the codepoint is a mid-word punctuation. 366 */ isMidWordPunctuation(Locale locale, int codePoint)367 public static boolean isMidWordPunctuation(Locale locale, int codePoint) { 368 final int wb = UCharacter.getIntPropertyValue(codePoint, UProperty.WORD_BREAK); 369 return (wb == UCharacter.WordBreak.MIDLETTER 370 || wb == UCharacter.WordBreak.MIDNUMLET 371 || wb == UCharacter.WordBreak.SINGLE_QUOTE); 372 } 373 isPunctuationStartBoundary(int offset)374 private boolean isPunctuationStartBoundary(int offset) { 375 return isOnPunctuation(offset) && !isAfterPunctuation(offset); 376 } 377 isPunctuationEndBoundary(int offset)378 private boolean isPunctuationEndBoundary(int offset) { 379 return !isOnPunctuation(offset) && isAfterPunctuation(offset); 380 } 381 isAfterLetterOrDigit(int offset)382 private boolean isAfterLetterOrDigit(int offset) { 383 if (mStart < offset && offset <= mEnd) { 384 final int codePoint = Character.codePointBefore(mCharSeq, offset); 385 if (Character.isLetterOrDigit(codePoint)) return true; 386 } 387 return false; 388 } 389 isOnLetterOrDigit(int offset)390 private boolean isOnLetterOrDigit(int offset) { 391 if (mStart <= offset && offset < mEnd) { 392 final int codePoint = Character.codePointAt(mCharSeq, offset); 393 if (Character.isLetterOrDigit(codePoint)) return true; 394 } 395 return false; 396 } 397 checkOffsetIsValid(int offset)398 private void checkOffsetIsValid(int offset) { 399 if (!(mStart <= offset && offset <= mEnd)) { 400 throw new IllegalArgumentException("Invalid offset: " + (offset) + 401 ". Valid range is [" + mStart + ", " + mEnd + "]"); 402 } 403 } 404 } 405