/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package android.text.method;

import android.text.Selection;
import android.text.SpannableStringBuilder;

import java.text.BreakIterator;
import java.util.Locale;

/**
 * Walks through cursor positions at word boundaries. Internally uses
 * {@link BreakIterator#getWordInstance()}, and caches a window of the
 * {@link CharSequence} for performance reasons.
 *
 * Also provides methods to determine word boundaries.
 *
 * All offsets accepted and returned by the public methods are expressed in the coordinates
 * of the sequence passed to {@link #setCharSequence(CharSequence, int, int)}; the translation
 * to and from the cached window is handled internally. {@code setCharSequence} must be
 * called before any query method, since the cached window does not exist until then.
 * {@hide}
 */
public class WordIterator implements Selection.PositionIterator {
    // Size of the window for the word iterator, should be greater than the longest word's length
    private static final int WINDOW_WIDTH = 50;

    // Cached window of the text being analysed; assigned by setCharSequence.
    private String mString;
    // Index, in the original sequence, of the first character of mString.
    private int mOffsetShift;

    private final BreakIterator mIterator;

    /**
     * Constructs a WordIterator using the default locale.
     */
    public WordIterator() {
        this(Locale.getDefault());
    }

    /**
     * Constructs a new WordIterator for the specified locale.
     * @param locale The locale to be used when analysing the text.
     */
    public WordIterator(Locale locale) {
        mIterator = BreakIterator.getWordInstance(locale);
    }

    /**
     * Caches the part of {@code charSequence} surrounding {@code [start, end]} and points the
     * underlying {@link BreakIterator} at it.
     *
     * The cached window extends {@code WINDOW_WIDTH} characters on either side of the given
     * range, clamped to the bounds of the sequence.
     *
     * @param charSequence the text to analyse.
     * @param start start of the range of interest.
     * @param end end of the range of interest.
     */
    public void setCharSequence(CharSequence charSequence, int start, int end) {
        mOffsetShift = Math.max(0, start - WINDOW_WIDTH);
        final int windowEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH);

        if (charSequence instanceof SpannableStringBuilder) {
            // SpannableStringBuilder can hand out the window as a String directly.
            mString = ((SpannableStringBuilder) charSequence).substring(mOffsetShift, windowEnd);
        } else {
            mString = charSequence.subSequence(mOffsetShift, windowEnd).toString();
        }
        mIterator.setText(mString);
    }

    /**
     * {@inheritDoc}
     *
     * Only boundaries that are immediately followed by a letter or digit (i.e. word starts)
     * are reported; other boundaries are skipped.
     *
     * @return the closest such boundary before {@code offset}, or {@code BreakIterator.DONE}
     *         if there is none in the cached window.
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int preceding(int offset) {
        int shiftedOffset = offset - mOffsetShift;
        // Validate up front so that a bad offset is reported in the caller's coordinates,
        // consistently with isBoundary/getBeginning/getEnd.
        checkOffsetIsValid(shiftedOffset);
        while (true) {
            shiftedOffset = mIterator.preceding(shiftedOffset);
            if (shiftedOffset == BreakIterator.DONE) {
                return BreakIterator.DONE;
            }
            if (isOnLetterOrDigit(shiftedOffset)) {
                return shiftedOffset + mOffsetShift;
            }
        }
    }

    /**
     * {@inheritDoc}
     *
     * Only boundaries that are immediately preceded by a letter or digit (i.e. word ends)
     * are reported; other boundaries are skipped.
     *
     * @return the closest such boundary after {@code offset}, or {@code BreakIterator.DONE}
     *         if there is none in the cached window.
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int following(int offset) {
        int shiftedOffset = offset - mOffsetShift;
        // Validate up front so that a bad offset is reported in the caller's coordinates,
        // consistently with isBoundary/getBeginning/getEnd.
        checkOffsetIsValid(shiftedOffset);
        while (true) {
            shiftedOffset = mIterator.following(shiftedOffset);
            if (shiftedOffset == BreakIterator.DONE) {
                return BreakIterator.DONE;
            }
            if (isAfterLetterOrDigit(shiftedOffset)) {
                return shiftedOffset + mOffsetShift;
            }
        }
    }

    /**
     * Indicates whether the given offset is a boundary according to the underlying
     * {@link BreakIterator}.
     *
     * @param offset the offset to check.
     * @return whether the offset is a boundary.
     * @throws IllegalArgumentException if offset is not valid.
     */
    public boolean isBoundary(int offset) {
        int shiftedOffset = offset - mOffsetShift;
        checkOffsetIsValid(shiftedOffset);
        return mIterator.isBoundary(shiftedOffset);
    }

    /**
     * Returns the position of next boundary after the given offset. Returns
     * {@code DONE} if there is no boundary after the given offset.
     *
     * @param offset the given start position to search from.
     * @return the position of the first boundary following the given offset.
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int nextBoundary(int offset) {
        final int shiftedOffset = offset - mOffsetShift;
        checkOffsetIsValid(shiftedOffset);
        return unshift(mIterator.following(shiftedOffset));
    }

    /**
     * Returns the position of boundary preceding the given offset or
     * {@code DONE} if the given offset specifies the starting position.
     *
     * @param offset the given start position to search from.
     * @return the position of the last boundary preceding the given offset.
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int prevBoundary(int offset) {
        final int shiftedOffset = offset - mOffsetShift;
        checkOffsetIsValid(shiftedOffset);
        return unshift(mIterator.preceding(shiftedOffset));
    }

    /** If <code>offset</code> is within a word, returns the index of the first character of that
     * word, otherwise returns BreakIterator.DONE.
     *
     * The offsets that are considered to be part of a word are the indexes of its characters,
     * <i>as well as</i> the index of its last character plus one.
     * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
     *
     * Valid range for offset is [0..textLength] (note the inclusive upper bound).
     * The returned value is within [0..offset] or BreakIterator.DONE.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int getBeginning(int offset) {
        // TODO: Check if usage of this can be updated to getBeginning(offset, true) if
        // so this method can be removed.
        return getBeginning(offset, false);
    }

    /**
     * If <code>offset</code> is within a word, returns the index of the last character of that
     * word plus one, otherwise returns BreakIterator.DONE.
     *
     * The offsets that are considered to be part of a word are the indexes of its characters,
     * <i>as well as</i> the index of its last character plus one.
     * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
     *
     * Valid range for offset is [0..textLength] (note the inclusive upper bound).
     * The returned value is within [offset..textLength] or BreakIterator.DONE.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int getEnd(int offset) {
        // TODO: Check if usage of this can be updated to getEnd(offset, true), if
        // so this method can be removed.
        return getEnd(offset, false);
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the start of a word (e.g. _word where "_" is any character that would not
     * be considered part of the word) then this returns the index of the first character of
     * that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, this would return the start of the previous word, AA.
     *
     * Returns BreakIterator.DONE if there is no previous boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int getPrevWordBeginningOnTwoWordsBoundary(int offset) {
        return getBeginning(offset, true);
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the end of a word (e.g. word_ where "_" is any character that would not
     * be considered part of the word) then this returns the index of the last character
     * plus one of that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, this would return the end of the next word, BB.
     *
     * Returns BreakIterator.DONE if there is no next boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int getNextWordEndOnTwoWordBoundary(int offset) {
        return getEnd(offset, true);
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the start of a word (e.g. _word where "_" is any character that would not
     * be considered part of the word) then this returns the index of the first character of
     * that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, and getPrevWordBeginningOnTwoWordsBoundary is true then this would
     * return the start of the previous word, AA. Otherwise it would return the current offset,
     * the start of BB.
     *
     * Returns BreakIterator.DONE if there is no previous boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    private int getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary) {
        final int shiftedOffset = offset - mOffsetShift;
        checkOffsetIsValid(shiftedOffset);

        if (isOnLetterOrDigit(shiftedOffset)) {
            // Already at the start of a word: stay here unless we are also at the end of a
            // previous word and the caller asked for that previous word instead.
            if (mIterator.isBoundary(shiftedOffset)
                    && (!isAfterLetterOrDigit(shiftedOffset)
                            || !getPrevWordBeginningOnTwoWordsBoundary)) {
                return shiftedOffset + mOffsetShift;
            }
            return unshift(mIterator.preceding(shiftedOffset));
        }
        if (isAfterLetterOrDigit(shiftedOffset)) {
            // Just past the last character of a word.
            return unshift(mIterator.preceding(shiftedOffset));
        }
        return BreakIterator.DONE;
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the end of a word (e.g. word_ where "_" is any character that would not be
     * considered part of the word) then this returns the index of the last character plus one
     * of that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, and getNextWordEndOnTwoWordBoundary is true then this would return
     * the end of the next word, BB. Otherwise it would return the current offset, the end
     * of AA.
     *
     * Returns BreakIterator.DONE if there is no next boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    private int getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary) {
        final int shiftedOffset = offset - mOffsetShift;
        checkOffsetIsValid(shiftedOffset);

        if (isAfterLetterOrDigit(shiftedOffset)) {
            // Already at the end of a word: stay here unless we are also at the start of a
            // following word and the caller asked for that following word instead.
            if (mIterator.isBoundary(shiftedOffset)
                    && (!isOnLetterOrDigit(shiftedOffset) || !getNextWordEndOnTwoWordBoundary)) {
                return shiftedOffset + mOffsetShift;
            }
            return unshift(mIterator.following(shiftedOffset));
        }
        if (isOnLetterOrDigit(shiftedOffset)) {
            // On the first character of a word.
            return unshift(mIterator.following(shiftedOffset));
        }
        return BreakIterator.DONE;
    }

    /**
     * Searches backwards from <code>offset</code>, boundary by boundary, for the beginning of
     * a group of punctuation as defined by {@link #isPunctuation(int)}: a position that is on
     * a punctuation character and is not preceded by one. <code>offset</code> itself is
     * checked first. Returns BreakIterator.DONE if no such position is found.
     *
     * @param offset the offset to search from.
     * @throws IllegalArgumentException if the search has to move from an offset that is not
     *         valid.
     */
    public int getPunctuationBeginning(int offset) {
        while (offset != BreakIterator.DONE && !isPunctuationStartBoundary(offset)) {
            offset = prevBoundary(offset);
        }
        // No need to shift offset, prevBoundary handles that.
        return offset;
    }

    /**
     * Searches forwards from <code>offset</code>, boundary by boundary, for the end of a group
     * of punctuation as defined by {@link #isPunctuation(int)}: a position that follows a
     * punctuation character and is not itself on one. <code>offset</code> itself is checked
     * first. Returns BreakIterator.DONE if no such position is found.
     *
     * @param offset the offset to search from.
     * @throws IllegalArgumentException if the search has to move from an offset that is not
     *         valid.
     */
    public int getPunctuationEnd(int offset) {
        while (offset != BreakIterator.DONE && !isPunctuationEndBoundary(offset)) {
            offset = nextBoundary(offset);
        }
        // No need to shift offset, nextBoundary handles that.
        return offset;
    }

    /**
     * Indicates if the provided offset is after a punctuation character
     * as defined by {@link #isPunctuation(int)}.
     *
     * @param offset the offset to check from.
     * @return Whether the offset is after a punctuation character. Always {@code false} when
     *         there is no character before the offset in the cached window.
     */
    public boolean isAfterPunctuation(int offset) {
        final int shiftedOffset = offset - mOffsetShift;
        if (shiftedOffset >= 1 && shiftedOffset <= mString.length()) {
            final int codePoint = mString.codePointBefore(shiftedOffset);
            return isPunctuation(codePoint);
        }
        return false;
    }

    /**
     * Indicates if the provided offset is at a punctuation character
     * as defined by {@link #isPunctuation(int)}.
     *
     * @param offset the offset to check from.
     * @return Whether the offset is at a punctuation character. Always {@code false} when
     *         there is no character at the offset in the cached window.
     */
    public boolean isOnPunctuation(int offset) {
        final int shiftedOffset = offset - mOffsetShift;
        if (shiftedOffset >= 0 && shiftedOffset < mString.length()) {
            final int codePoint = mString.codePointAt(shiftedOffset);
            return isPunctuation(codePoint);
        }
        return false;
    }

    /** Whether a punctuation group starts at the given (unshifted) offset. */
    private boolean isPunctuationStartBoundary(int offset) {
        return isOnPunctuation(offset) && !isAfterPunctuation(offset);
    }

    /** Whether a punctuation group ends at the given (unshifted) offset. */
    private boolean isPunctuationEndBoundary(int offset) {
        return !isOnPunctuation(offset) && isAfterPunctuation(offset);
    }

    /** Whether the code point belongs to one of the Unicode punctuation categories. */
    private boolean isPunctuation(int cp) {
        int type = Character.getType(cp);
        return (type == Character.CONNECTOR_PUNCTUATION ||
                type == Character.DASH_PUNCTUATION ||
                type == Character.END_PUNCTUATION ||
                type == Character.FINAL_QUOTE_PUNCTUATION ||
                type == Character.INITIAL_QUOTE_PUNCTUATION ||
                type == Character.OTHER_PUNCTUATION ||
                type == Character.START_PUNCTUATION);
    }

    /** Whether the code point ending at the given window offset is a letter or digit. */
    private boolean isAfterLetterOrDigit(int shiftedOffset) {
        if (shiftedOffset >= 1 && shiftedOffset <= mString.length()) {
            final int codePoint = mString.codePointBefore(shiftedOffset);
            if (Character.isLetterOrDigit(codePoint)) return true;
        }
        return false;
    }

    /** Whether the code point starting at the given window offset is a letter or digit. */
    private boolean isOnLetterOrDigit(int shiftedOffset) {
        if (shiftedOffset >= 0 && shiftedOffset < mString.length()) {
            final int codePoint = mString.codePointAt(shiftedOffset);
            if (Character.isLetterOrDigit(codePoint)) return true;
        }
        return false;
    }

    /**
     * Converts a position reported by the underlying iterator (window coordinates) back to
     * the caller's coordinates, leaving the {@code BreakIterator.DONE} sentinel untouched so
     * that it is never turned into a plausible-looking offset by the shift.
     */
    private int unshift(int shiftedOffset) {
        return shiftedOffset == BreakIterator.DONE
                ? BreakIterator.DONE
                : shiftedOffset + mOffsetShift;
    }

    /**
     * Verifies that the given window offset lies within {@code [0, mString.length()]}.
     *
     * @throws IllegalArgumentException if it does not; the message reports the offset and the
     *         valid range in the caller's (unshifted) coordinates.
     */
    private void checkOffsetIsValid(int shiftedOffset) {
        if (shiftedOffset < 0 || shiftedOffset > mString.length()) {
            throw new IllegalArgumentException("Invalid offset: " + (shiftedOffset + mOffsetShift) +
                    ". Valid range is [" + mOffsetShift + ", " + (mString.length() + mOffsetShift) +
                    "]");
        }
    }
}