1 /* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package android.text; 18 19 import static android.text.TextDirectionHeuristics.FIRSTSTRONG_LTR; 20 21 import android.annotation.Nullable; 22 import android.view.View; 23 24 import com.android.internal.annotations.VisibleForTesting; 25 26 import java.util.Locale; 27 28 /** 29 * Utility class for formatting text for display in a potentially opposite-directionality context 30 * without garbling. The directionality of the context is set at formatter creation and the 31 * directionality of the text can be either estimated or passed in when known. 32 * 33 * <p>To support versions lower than {@link android.os.Build.VERSION_CODES#JELLY_BEAN_MR2}, 34 * you can use the support library's {@link android.support.v4.text.BidiFormatter} class. 35 * 36 * <p>These APIs provides the following functionality: 37 * <p> 38 * 1. Bidi Wrapping 39 * When text in one language is mixed into a document in another, opposite-directionality language, 40 * e.g. when an English business name is embedded in some Hebrew text, both the inserted string 41 * and the text surrounding it may be displayed incorrectly unless the inserted string is explicitly 42 * separated from the surrounding text in a "wrapper" that: 43 * <p> 44 * - Declares its directionality so that the string is displayed correctly. 
* This can be done in Unicode bidi formatting codes by {@link #unicodeWrap} and similar methods.
 * <p>
 * - Isolates the string's directionality, so it does not unduly affect the surrounding content.
 * Currently, this can only be done using invisible Unicode characters of the same direction as
 * the context (LRM or RLM) in addition to the directionality declaration above, thus "resetting"
 * the directionality to that of the context. The "reset" may need to be done at both ends of the
 * string. Without "reset" after the string, the string will "stick" to a number or logically
 * separate opposite-direction text that happens to follow it in-line (even if separated by
 * neutral content like spaces and punctuation). Without "reset" before the string, the same can
 * happen there, but only with more opposite-direction text, not a number. One approach is to
 * "reset" the direction only after each string, on the theory that if the preceding opposite-
 * direction text is itself bidi-wrapped, the "reset" after it will prevent the sticking. (Doing
 * the "reset" only before each string definitely does not work because we do not want to require
 * bidi-wrapping numbers, and a bidi-wrapped opposite-direction string could be followed by a
 * number.) Still, the safest policy is to do the "reset" on both ends of each string, since RTL
 * message translations often contain untranslated Latin-script brand names and technical terms,
 * and one of these can be followed by a bidi-wrapped inserted value. On the other hand, when one
 * has such a message, it is best to do the "reset" manually in the message translation itself,
 * since the message's opposite-direction text could be followed by an inserted number, which we
 * would not bidi-wrap anyway. That reasoning favors "reset" only after the string; note, however,
 * that a formatter built with default options also "resets" before the string, which can be
 * turned off with {@link Builder#stereoReset(boolean)}.
In an 65 * alternative to "reset", recent additions to the HTML, CSS, and Unicode standards allow the 66 * isolation to be part of the directionality declaration. This form of isolation is better than 67 * "reset" because it takes less space, does not require knowing the context directionality, has a 68 * gentler effect than "reset", and protects both ends of the string. However, we do not yet allow 69 * using it because required platforms do not yet support it. 70 * <p> 71 * Providing these wrapping services is the basic purpose of the bidi formatter. 72 * <p> 73 * 2. Directionality estimation 74 * How does one know whether a string about to be inserted into surrounding text has the same 75 * directionality? Well, in many cases, one knows that this must be the case when writing the code 76 * doing the insertion, e.g. when a localized message is inserted into a localized page. In such 77 * cases there is no need to involve the bidi formatter at all. In some other cases, it need not be 78 * the same as the context, but is either constant (e.g. urls are always LTR) or otherwise known. 79 * In the remaining cases, e.g. when the string is user-entered or comes from a database, the 80 * language of the string (and thus its directionality) is not known a priori, and must be 81 * estimated at run-time. The bidi formatter can do this automatically using the default 82 * first-strong estimation algorithm. It can also be configured to use a custom directionality 83 * estimation object. 84 */ 85 public final class BidiFormatter { 86 87 /** 88 * The default text direction heuristic. 89 */ 90 private static TextDirectionHeuristic DEFAULT_TEXT_DIRECTION_HEURISTIC = FIRSTSTRONG_LTR; 91 92 /** 93 * Unicode "Left-To-Right Embedding" (LRE) character. 94 */ 95 private static final char LRE = '\u202A'; 96 97 /** 98 * Unicode "Right-To-Left Embedding" (RLE) character. 
99 */ 100 private static final char RLE = '\u202B'; 101 102 /** 103 * Unicode "Pop Directional Formatting" (PDF) character. 104 */ 105 private static final char PDF = '\u202C'; 106 107 /** 108 * Unicode "Left-To-Right Mark" (LRM) character. 109 */ 110 private static final char LRM = '\u200E'; 111 112 /* 113 * Unicode "Right-To-Left Mark" (RLM) character. 114 */ 115 private static final char RLM = '\u200F'; 116 117 /* 118 * String representation of LRM 119 */ 120 private static final String LRM_STRING = Character.toString(LRM); 121 122 /* 123 * String representation of RLM 124 */ 125 private static final String RLM_STRING = Character.toString(RLM); 126 127 /** 128 * Empty string constant. 129 */ 130 private static final String EMPTY_STRING = ""; 131 132 /** 133 * A class for building a BidiFormatter with non-default options. 134 */ 135 public static final class Builder { 136 private boolean mIsRtlContext; 137 private int mFlags; 138 private TextDirectionHeuristic mTextDirectionHeuristic; 139 140 /** 141 * Constructor. 142 * 143 */ Builder()144 public Builder() { 145 initialize(isRtlLocale(Locale.getDefault())); 146 } 147 148 /** 149 * Constructor. 150 * 151 * @param rtlContext Whether the context directionality is RTL. 152 */ Builder(boolean rtlContext)153 public Builder(boolean rtlContext) { 154 initialize(rtlContext); 155 } 156 157 /** 158 * Constructor. 159 * 160 * @param locale The context locale. 161 */ Builder(Locale locale)162 public Builder(Locale locale) { 163 initialize(isRtlLocale(locale)); 164 } 165 166 /** 167 * Initializes the builder with the given context directionality and default options. 168 * 169 * @param isRtlContext Whether the context is RTL or not. 
170 */ initialize(boolean isRtlContext)171 private void initialize(boolean isRtlContext) { 172 mIsRtlContext = isRtlContext; 173 mTextDirectionHeuristic = DEFAULT_TEXT_DIRECTION_HEURISTIC; 174 mFlags = DEFAULT_FLAGS; 175 } 176 177 /** 178 * Specifies whether the BidiFormatter to be built should also "reset" directionality before 179 * a string being bidi-wrapped, not just after it. The default is true. 180 */ stereoReset(boolean stereoReset)181 public Builder stereoReset(boolean stereoReset) { 182 if (stereoReset) { 183 mFlags |= FLAG_STEREO_RESET; 184 } else { 185 mFlags &= ~FLAG_STEREO_RESET; 186 } 187 return this; 188 } 189 190 /** 191 * Specifies the default directionality estimation algorithm to be used by the BidiFormatter. 192 * By default, uses the first-strong heuristic. 193 * 194 * @param heuristic the {@code TextDirectionHeuristic} to use. 195 * @return the builder itself. 196 */ setTextDirectionHeuristic(TextDirectionHeuristic heuristic)197 public Builder setTextDirectionHeuristic(TextDirectionHeuristic heuristic) { 198 mTextDirectionHeuristic = heuristic; 199 return this; 200 } 201 202 /** 203 * @return A BidiFormatter with the specified options. 
204 */ build()205 public BidiFormatter build() { 206 if (mFlags == DEFAULT_FLAGS && 207 mTextDirectionHeuristic == DEFAULT_TEXT_DIRECTION_HEURISTIC) { 208 return BidiFormatter.getDefaultInstanceFromContext(mIsRtlContext); 209 } 210 return new BidiFormatter(mIsRtlContext, mFlags, mTextDirectionHeuristic); 211 } 212 } 213 214 // 215 private static final int FLAG_STEREO_RESET = 2; 216 private static final int DEFAULT_FLAGS = FLAG_STEREO_RESET; 217 218 private static final BidiFormatter DEFAULT_LTR_INSTANCE = new BidiFormatter( 219 false /* LTR context */, 220 DEFAULT_FLAGS, 221 DEFAULT_TEXT_DIRECTION_HEURISTIC); 222 223 private static final BidiFormatter DEFAULT_RTL_INSTANCE = new BidiFormatter( 224 true /* RTL context */, 225 DEFAULT_FLAGS, 226 DEFAULT_TEXT_DIRECTION_HEURISTIC); 227 228 private final boolean mIsRtlContext; 229 private final int mFlags; 230 private final TextDirectionHeuristic mDefaultTextDirectionHeuristic; 231 232 /** 233 * Factory for creating an instance of BidiFormatter for the default locale directionality. 234 * 235 * This does not create any new objects, and returns already existing static instances. 236 * 237 */ getInstance()238 public static BidiFormatter getInstance() { 239 return getDefaultInstanceFromContext(isRtlLocale(Locale.getDefault())); 240 } 241 242 /** 243 * Factory for creating an instance of BidiFormatter given the context directionality. 244 * 245 * This does not create any new objects, and returns already existing static instances. 246 * 247 * @param rtlContext Whether the context directionality is RTL. 248 */ getInstance(boolean rtlContext)249 public static BidiFormatter getInstance(boolean rtlContext) { 250 return getDefaultInstanceFromContext(rtlContext); 251 } 252 253 /** 254 * Factory for creating an instance of BidiFormatter given the context locale. 255 * 256 * This does not create any new objects, and returns already existing static instances. 257 * 258 * @param locale The context locale. 
259 */ getInstance(Locale locale)260 public static BidiFormatter getInstance(Locale locale) { 261 return getDefaultInstanceFromContext(isRtlLocale(locale)); 262 } 263 264 /** 265 * @param isRtlContext Whether the context directionality is RTL or not. 266 * @param flags The option flags. 267 * @param heuristic The default text direction heuristic. 268 */ BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristic heuristic)269 private BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristic heuristic) { 270 mIsRtlContext = isRtlContext; 271 mFlags = flags; 272 mDefaultTextDirectionHeuristic = heuristic; 273 } 274 275 /** 276 * @return Whether the context directionality is RTL 277 */ isRtlContext()278 public boolean isRtlContext() { 279 return mIsRtlContext; 280 } 281 282 /** 283 * @return Whether directionality "reset" should also be done before a string being 284 * bidi-wrapped, not just after it. 285 */ getStereoReset()286 public boolean getStereoReset() { 287 return (mFlags & FLAG_STEREO_RESET) != 0; 288 } 289 290 /** 291 * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the 292 * overall or the exit directionality of a given string is opposite to the context directionality. 293 * Putting this after the string (including its directionality declaration wrapping) prevents it 294 * from "sticking" to other opposite-directionality text or a number appearing after it inline 295 * with only neutral content in between. Otherwise returns the empty string. While the exit 296 * directionality is determined by scanning the end of the string, the overall directionality is 297 * given explicitly by a heuristic to estimate the {@code str}'s directionality. 298 * 299 * @param str CharSequence after which the mark may need to appear. 300 * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s 301 * directionality. 
302 * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context; 303 * else, the empty string. 304 * 305 * @hide 306 */ markAfter(CharSequence str, TextDirectionHeuristic heuristic)307 public String markAfter(CharSequence str, TextDirectionHeuristic heuristic) { 308 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 309 // getExitDir() is called only if needed (short-circuit). 310 if (!mIsRtlContext && (isRtl || getExitDir(str) == DIR_RTL)) { 311 return LRM_STRING; 312 } 313 if (mIsRtlContext && (!isRtl || getExitDir(str) == DIR_LTR)) { 314 return RLM_STRING; 315 } 316 return EMPTY_STRING; 317 } 318 319 /** 320 * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the 321 * overall or the entry directionality of a given string is opposite to the context 322 * directionality. Putting this before the string (including its directionality declaration 323 * wrapping) prevents it from "sticking" to other opposite-directionality text appearing before 324 * it inline with only neutral content in between. Otherwise returns the empty string. While the 325 * entry directionality is determined by scanning the beginning of the string, the overall 326 * directionality is given explicitly by a heuristic to estimate the {@code str}'s directionality. 327 * 328 * @param str CharSequence before which the mark may need to appear. 329 * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s 330 * directionality. 331 * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context; 332 * else, the empty string. 333 * 334 * @hide 335 */ markBefore(CharSequence str, TextDirectionHeuristic heuristic)336 public String markBefore(CharSequence str, TextDirectionHeuristic heuristic) { 337 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 338 // getEntryDir() is called only if needed (short-circuit). 
339 if (!mIsRtlContext && (isRtl || getEntryDir(str) == DIR_RTL)) { 340 return LRM_STRING; 341 } 342 if (mIsRtlContext && (!isRtl || getEntryDir(str) == DIR_LTR)) { 343 return RLM_STRING; 344 } 345 return EMPTY_STRING; 346 } 347 348 /** 349 * Estimates the directionality of a string using the default text direction heuristic. 350 * 351 * @param str String whose directionality is to be estimated. 352 * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns 353 * false. 354 */ isRtl(String str)355 public boolean isRtl(String str) { 356 return isRtl((CharSequence) str); 357 } 358 359 /** 360 * Operates like {@link #isRtl(String)}, but takes a CharSequence instead of a string 361 * 362 * @param str CharSequence whose directionality is to be estimated. 363 * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns 364 * false. 365 */ isRtl(CharSequence str)366 public boolean isRtl(CharSequence str) { 367 return mDefaultTextDirectionHeuristic.isRtl(str, 0, str.length()); 368 } 369 370 /** 371 * Formats a string of given directionality for use in plain-text output of the context 372 * directionality, so an opposite-directionality string is neither garbled nor garbles its 373 * surroundings. This makes use of Unicode bidi formatting characters. 374 * <p> 375 * The algorithm: In case the given directionality doesn't match the context directionality, wraps 376 * the string with Unicode bidi formatting characters: RLE+{@code str}+PDF for RTL text, or 377 * LRE+{@code str}+PDF for LTR text. 378 * <p> 379 * If {@code isolate}, directionally isolates the string so that it does not garble its 380 * surroundings. Currently, this is done by "resetting" the directionality after the string by 381 * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when 382 * either the overall directionality or the exit directionality of the string is opposite to 383 * that of the context. 
Unless the formatter was built using 384 * {@link Builder#stereoReset(boolean)} with a {@code false} argument, also prepends a Unicode 385 * bidi mark matching the context directionality when either the overall directionality or the 386 * entry directionality of the string is opposite to that of the context. Note that as opposed 387 * to the overall directionality, the entry and exit directionalities are determined from the 388 * string itself. 389 * <p> 390 * Does *not* do HTML-escaping. 391 * 392 * @param str The input string. 393 * @param heuristic The algorithm to be used to estimate the string's overall direction. 394 * See {@link TextDirectionHeuristics} for pre-defined heuristics. 395 * @param isolate Whether to directionally isolate the string to prevent it from garbling the 396 * content around it 397 * @return Input string after applying the above processing. {@code null} if {@code str} is 398 * {@code null}. 399 */ unicodeWrap(@ullable String str, TextDirectionHeuristic heuristic, boolean isolate)400 public @Nullable String unicodeWrap(@Nullable String str, TextDirectionHeuristic heuristic, 401 boolean isolate) { 402 if (str == null) return null; 403 return unicodeWrap((CharSequence) str, heuristic, isolate).toString(); 404 } 405 406 /** 407 * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but takes a 408 * CharSequence instead of a string 409 * 410 * @param str The input CharSequence. 411 * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction. 412 * See {@link TextDirectionHeuristics} for pre-defined heuristics. 413 * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling 414 * the content around it 415 * @return Input CharSequence after applying the above processing. {@code null} if {@code str} 416 * is {@code null}. 
417 */ unicodeWrap(@ullable CharSequence str, TextDirectionHeuristic heuristic, boolean isolate)418 public @Nullable CharSequence unicodeWrap(@Nullable CharSequence str, 419 TextDirectionHeuristic heuristic, boolean isolate) { 420 if (str == null) return null; 421 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 422 SpannableStringBuilder result = new SpannableStringBuilder(); 423 if (getStereoReset() && isolate) { 424 result.append(markBefore(str, 425 isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR)); 426 } 427 if (isRtl != mIsRtlContext) { 428 result.append(isRtl ? RLE : LRE); 429 result.append(str); 430 result.append(PDF); 431 } else { 432 result.append(str); 433 } 434 if (isolate) { 435 result.append(markAfter(str, 436 isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR)); 437 } 438 return result; 439 } 440 441 /** 442 * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but assumes 443 * {@code isolate} is true. 444 * 445 * @param str The input string. 446 * @param heuristic The algorithm to be used to estimate the string's overall direction. 447 * See {@link TextDirectionHeuristics} for pre-defined heuristics. 448 * @return Input string after applying the above processing. 449 */ unicodeWrap(String str, TextDirectionHeuristic heuristic)450 public String unicodeWrap(String str, TextDirectionHeuristic heuristic) { 451 return unicodeWrap(str, heuristic, true /* isolate */); 452 } 453 454 /** 455 * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but 456 * assumes {@code isolate} is true. 457 * 458 * @param str The input CharSequence. 459 * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction. 460 * See {@link TextDirectionHeuristics} for pre-defined heuristics. 461 * @return Input CharSequence after applying the above processing. 
462 */ unicodeWrap(CharSequence str, TextDirectionHeuristic heuristic)463 public CharSequence unicodeWrap(CharSequence str, TextDirectionHeuristic heuristic) { 464 return unicodeWrap(str, heuristic, true /* isolate */); 465 } 466 467 468 /** 469 * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the 470 * formatter's default direction estimation algorithm. 471 * 472 * @param str The input string. 473 * @param isolate Whether to directionally isolate the string to prevent it from garbling the 474 * content around it 475 * @return Input string after applying the above processing. 476 */ unicodeWrap(String str, boolean isolate)477 public String unicodeWrap(String str, boolean isolate) { 478 return unicodeWrap(str, mDefaultTextDirectionHeuristic, isolate); 479 } 480 481 /** 482 * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but uses 483 * the formatter's default direction estimation algorithm. 484 * 485 * @param str The input CharSequence. 486 * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling 487 * the content around it 488 * @return Input CharSequence after applying the above processing. 489 */ unicodeWrap(CharSequence str, boolean isolate)490 public CharSequence unicodeWrap(CharSequence str, boolean isolate) { 491 return unicodeWrap(str, mDefaultTextDirectionHeuristic, isolate); 492 } 493 494 /** 495 * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the 496 * formatter's default direction estimation algorithm and assumes {@code isolate} is true. 497 * 498 * @param str The input string. 499 * @return Input string after applying the above processing. 
500 */ unicodeWrap(String str)501 public String unicodeWrap(String str) { 502 return unicodeWrap(str, mDefaultTextDirectionHeuristic, true /* isolate */); 503 } 504 505 /** 506 * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but uses 507 * the formatter's default direction estimation algorithm and assumes {@code isolate} is true. 508 * 509 * @param str The input CharSequence. 510 * @return Input CharSequence after applying the above processing. 511 */ unicodeWrap(CharSequence str)512 public CharSequence unicodeWrap(CharSequence str) { 513 return unicodeWrap(str, mDefaultTextDirectionHeuristic, true /* isolate */); 514 } 515 getDefaultInstanceFromContext(boolean isRtlContext)516 private static BidiFormatter getDefaultInstanceFromContext(boolean isRtlContext) { 517 return isRtlContext ? DEFAULT_RTL_INSTANCE : DEFAULT_LTR_INSTANCE; 518 } 519 520 /** 521 * Helper method to return true if the Locale directionality is RTL. 522 * 523 * @param locale The Locale whose directionality will be checked to be RTL or LTR 524 * @return true if the {@code locale} directionality is RTL. False otherwise. 525 */ isRtlLocale(Locale locale)526 private static boolean isRtlLocale(Locale locale) { 527 return (TextUtils.getLayoutDirectionFromLocale(locale) == View.LAYOUT_DIRECTION_RTL); 528 } 529 530 /** 531 * Enum for directionality type. 532 */ 533 private static final int DIR_LTR = -1; 534 private static final int DIR_UNKNOWN = 0; 535 private static final int DIR_RTL = +1; 536 537 /** 538 * Returns the directionality of the last character with strong directionality in the string, or 539 * DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards from the end of 540 * the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a 541 * strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a 542 * string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. 
The intended use is to check 543 * whether a logically separate item that starts with a number or a character of the string's 544 * exit directionality and follows this string inline (not counting any neutral characters in 545 * between) would "stick" to it in an opposite-directionality context, thus being displayed in 546 * an incorrect position. An LRM or RLM character (the one of the context's directionality) 547 * between the two will prevent such sticking. 548 * 549 * @param str the string to check. 550 */ getExitDir(CharSequence str)551 private static int getExitDir(CharSequence str) { 552 return new DirectionalityEstimator(str, false /* isHtml */).getExitDir(); 553 } 554 555 /** 556 * Returns the directionality of the first character with strong directionality in the string, 557 * or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an 558 * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after 559 * RLE/RLO. The results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF 560 * characters. The intended use is to check whether a logically separate item that ends with a 561 * character of the string's entry directionality and precedes the string inline (not counting 562 * any neutral characters in between) would "stick" to it in an opposite-directionality context, 563 * thus being displayed in an incorrect position. An LRM or RLM character (the one of the 564 * context's directionality) between the two will prevent such sticking. 565 * 566 * @param str the string to check. 567 */ getEntryDir(CharSequence str)568 private static int getEntryDir(CharSequence str) { 569 return new DirectionalityEstimator(str, false /* isHtml */).getEntryDir(); 570 } 571 572 /** 573 * An object that estimates the directionality of a given string by various methods. 
574 * 575 * @hide 576 */ 577 @VisibleForTesting 578 public static class DirectionalityEstimator { 579 580 // Internal static variables and constants. 581 582 /** 583 * Size of the bidi character class cache. The results of the Character.getDirectionality() 584 * calls on the lowest DIR_TYPE_CACHE_SIZE codepoints are kept in an array for speed. 585 * The 0x700 value is designed to leave all the European and Near Eastern languages in the 586 * cache. It can be reduced to 0x180, restricting the cache to the Western European 587 * languages. 588 */ 589 private static final int DIR_TYPE_CACHE_SIZE = 0x700; 590 591 /** 592 * The bidi character class cache. 593 */ 594 private static final byte DIR_TYPE_CACHE[]; 595 596 static { 597 DIR_TYPE_CACHE = new byte[DIR_TYPE_CACHE_SIZE]; 598 for (int i = 0; i < DIR_TYPE_CACHE_SIZE; i++) { 599 // Calling Character.getDirectionality() is OK here, since new emojis start after 600 // the end of our cache. 601 DIR_TYPE_CACHE[i] = Character.getDirectionality(i); 602 } 603 } 604 605 /** 606 * Return Character directionality. Same as {@link Character#getDirectionality(int)} except 607 * it can override values for newest emoji that are not covered by ICU. 608 */ getDirectionality(int codePoint)609 public static byte getDirectionality(int codePoint) { 610 return Character.getDirectionality(codePoint); 611 } 612 613 // Internal instance variables. 614 615 /** 616 * The text to be scanned. 617 */ 618 private final CharSequence text; 619 620 /** 621 * Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and 622 * entities when looking for the next / preceding dir type. 623 */ 624 private final boolean isHtml; 625 626 /** 627 * The length of the text in chars. 628 */ 629 private final int length; 630 631 /** 632 * The current position in the text. 633 */ 634 private int charIndex; 635 636 /** 637 * The char encountered by the last dirTypeForward or dirTypeBackward call. 
If it 638 * encountered a supplementary codepoint, this contains a char that is not a valid 639 * codepoint. This is ok, because this member is only used to detect some well-known ASCII 640 * syntax, e.g. "http://" and the beginning of an HTML tag or entity. 641 */ 642 private char lastChar; 643 644 /** 645 * Constructor. 646 * 647 * @param text The string to scan. 648 * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over 649 * tags and entities. 650 */ DirectionalityEstimator(CharSequence text, boolean isHtml)651 DirectionalityEstimator(CharSequence text, boolean isHtml) { 652 this.text = text; 653 this.isHtml = isHtml; 654 length = text.length(); 655 } 656 657 /** 658 * Returns the directionality of the first character with strong directionality in the 659 * string, or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an 660 * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL 661 * after RLE/RLO. The results are undefined for a string containing unbalanced 662 * LRE/RLE/LRO/RLO/PDF characters. 663 */ getEntryDir()664 int getEntryDir() { 665 // The reason for this method name, as opposed to getFirstStrongDir(), is that 666 // "first strong" is a commonly used description of Unicode's estimation algorithm, 667 // but the two must treat formatting characters quite differently. Thus, we are staying 668 // away from both "first" and "last" in these method names to avoid confusion. 
669 charIndex = 0; 670 int embeddingLevel = 0; 671 int embeddingLevelDir = DIR_UNKNOWN; 672 int firstNonEmptyEmbeddingLevel = 0; 673 while (charIndex < length && firstNonEmptyEmbeddingLevel == 0) { 674 switch (dirTypeForward()) { 675 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: 676 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: 677 ++embeddingLevel; 678 embeddingLevelDir = DIR_LTR; 679 break; 680 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: 681 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: 682 ++embeddingLevel; 683 embeddingLevelDir = DIR_RTL; 684 break; 685 case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT: 686 --embeddingLevel; 687 // To restore embeddingLevelDir to its previous value, we would need a 688 // stack, which we want to avoid. Thus, at this point we do not know the 689 // current embedding's directionality. 690 embeddingLevelDir = DIR_UNKNOWN; 691 break; 692 case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL: 693 break; 694 case Character.DIRECTIONALITY_LEFT_TO_RIGHT: 695 if (embeddingLevel == 0) { 696 return DIR_LTR; 697 } 698 firstNonEmptyEmbeddingLevel = embeddingLevel; 699 break; 700 case Character.DIRECTIONALITY_RIGHT_TO_LEFT: 701 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC: 702 if (embeddingLevel == 0) { 703 return DIR_RTL; 704 } 705 firstNonEmptyEmbeddingLevel = embeddingLevel; 706 break; 707 default: 708 firstNonEmptyEmbeddingLevel = embeddingLevel; 709 break; 710 } 711 } 712 713 // We have either found a non-empty embedding or scanned the entire string finding 714 // neither a non-empty embedding nor a strong character outside of an embedding. 715 if (firstNonEmptyEmbeddingLevel == 0) { 716 // We have not found a non-empty embedding. Thus, the string contains neither a 717 // non-empty embedding nor a strong character outside of an embedding. 718 return DIR_UNKNOWN; 719 } 720 721 // We have found a non-empty embedding. 
722 if (embeddingLevelDir != DIR_UNKNOWN) { 723 // We know the directionality of the non-empty embedding. 724 return embeddingLevelDir; 725 } 726 727 // We do not remember the directionality of the non-empty embedding we found. So, we go 728 // backwards to find the start of the non-empty embedding and get its directionality. 729 while (charIndex > 0) { 730 switch (dirTypeBackward()) { 731 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: 732 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: 733 if (firstNonEmptyEmbeddingLevel == embeddingLevel) { 734 return DIR_LTR; 735 } 736 --embeddingLevel; 737 break; 738 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: 739 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: 740 if (firstNonEmptyEmbeddingLevel == embeddingLevel) { 741 return DIR_RTL; 742 } 743 --embeddingLevel; 744 break; 745 case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT: 746 ++embeddingLevel; 747 break; 748 } 749 } 750 // We should never get here. 751 return DIR_UNKNOWN; 752 } 753 754 /** 755 * Returns the directionality of the last character with strong directionality in the 756 * string, or DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards 757 * from the end of the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its 758 * matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results 759 * are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. 760 */ getExitDir()761 int getExitDir() { 762 // The reason for this method name, as opposed to getLastStrongDir(), is that "last 763 // strong" sounds like the exact opposite of "first strong", which is a commonly used 764 // description of Unicode's estimation algorithm (getUnicodeDir() above), but the two 765 // must treat formatting characters quite differently. Thus, we are staying away from 766 // both "first" and "last" in these method names to avoid confusion. 
767 charIndex = length; 768 int embeddingLevel = 0; 769 int lastNonEmptyEmbeddingLevel = 0; 770 while (charIndex > 0) { 771 switch (dirTypeBackward()) { 772 case Character.DIRECTIONALITY_LEFT_TO_RIGHT: 773 if (embeddingLevel == 0) { 774 return DIR_LTR; 775 } 776 if (lastNonEmptyEmbeddingLevel == 0) { 777 lastNonEmptyEmbeddingLevel = embeddingLevel; 778 } 779 break; 780 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: 781 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: 782 if (lastNonEmptyEmbeddingLevel == embeddingLevel) { 783 return DIR_LTR; 784 } 785 --embeddingLevel; 786 break; 787 case Character.DIRECTIONALITY_RIGHT_TO_LEFT: 788 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC: 789 if (embeddingLevel == 0) { 790 return DIR_RTL; 791 } 792 if (lastNonEmptyEmbeddingLevel == 0) { 793 lastNonEmptyEmbeddingLevel = embeddingLevel; 794 } 795 break; 796 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: 797 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: 798 if (lastNonEmptyEmbeddingLevel == embeddingLevel) { 799 return DIR_RTL; 800 } 801 --embeddingLevel; 802 break; 803 case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT: 804 ++embeddingLevel; 805 break; 806 case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL: 807 break; 808 default: 809 if (lastNonEmptyEmbeddingLevel == 0) { 810 lastNonEmptyEmbeddingLevel = embeddingLevel; 811 } 812 break; 813 } 814 } 815 return DIR_UNKNOWN; 816 } 817 818 // Internal methods 819 820 /** 821 * Gets the bidi character class, i.e. Character.getDirectionality(), of a given char, using 822 * a cache for speed. Not designed for supplementary codepoints, whose results we do not 823 * cache. 824 */ getCachedDirectionality(char c)825 private static byte getCachedDirectionality(char c) { 826 return c < DIR_TYPE_CACHE_SIZE ? DIR_TYPE_CACHE[c] : getDirectionality(c); 827 } 828 829 /** 830 * Returns the Character.DIRECTIONALITY_... value of the next codepoint and advances 831 * charIndex. 
If isHtml, and the codepoint is '<' or '&', advances through the tag/entity, 832 * and returns Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to 833 * figure out the actual character, and return its dirtype, but treating it as whitespace is 834 * good enough for our purposes. 835 * 836 * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0. 837 */ dirTypeForward()838 byte dirTypeForward() { 839 lastChar = text.charAt(charIndex); 840 if (Character.isHighSurrogate(lastChar)) { 841 int codePoint = Character.codePointAt(text, charIndex); 842 charIndex += Character.charCount(codePoint); 843 return getDirectionality(codePoint); 844 } 845 charIndex++; 846 byte dirType = getCachedDirectionality(lastChar); 847 if (isHtml) { 848 // Process tags and entities. 849 if (lastChar == '<') { 850 dirType = skipTagForward(); 851 } else if (lastChar == '&') { 852 dirType = skipEntityForward(); 853 } 854 } 855 return dirType; 856 } 857 858 /** 859 * Returns the Character.DIRECTIONALITY_... value of the preceding codepoint and advances 860 * charIndex backwards. If isHtml, and the codepoint is the end of a complete HTML tag or 861 * entity, advances over the whole tag/entity and returns 862 * Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to figure out the 863 * actual character, and return its dirtype, but treating it as whitespace is good enough 864 * for our purposes. 865 * 866 * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0. 867 */ dirTypeBackward()868 byte dirTypeBackward() { 869 lastChar = text.charAt(charIndex - 1); 870 if (Character.isLowSurrogate(lastChar)) { 871 int codePoint = Character.codePointBefore(text, charIndex); 872 charIndex -= Character.charCount(codePoint); 873 return getDirectionality(codePoint); 874 } 875 charIndex--; 876 byte dirType = getCachedDirectionality(lastChar); 877 if (isHtml) { 878 // Process tags and entities. 
879 if (lastChar == '>') { 880 dirType = skipTagBackward(); 881 } else if (lastChar == ';') { 882 dirType = skipEntityBackward(); 883 } 884 } 885 return dirType; 886 } 887 888 /** 889 * Advances charIndex forward through an HTML tag (after the opening < has already been 890 * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching >, 891 * does not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the 892 * < that hadn't been part of a tag after all). 893 */ skipTagForward()894 private byte skipTagForward() { 895 int initialCharIndex = charIndex; 896 while (charIndex < length) { 897 lastChar = text.charAt(charIndex++); 898 if (lastChar == '>') { 899 // The end of the tag. 900 return Character.DIRECTIONALITY_WHITESPACE; 901 } 902 if (lastChar == '"' || lastChar == '\'') { 903 // Skip over a quoted attribute value inside the tag. 904 char quote = lastChar; 905 while (charIndex < length && (lastChar = text.charAt(charIndex++)) != quote) {} 906 } 907 } 908 // The original '<' wasn't the start of a tag after all. 909 charIndex = initialCharIndex; 910 lastChar = '<'; 911 return Character.DIRECTIONALITY_OTHER_NEUTRALS; 912 } 913 914 /** 915 * Advances charIndex backward through an HTML tag (after the closing > has already been 916 * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching <, does 917 * not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the > 918 * that hadn't been part of a tag after all). Nevertheless, the running time for calling 919 * skipTagBackward() in a loop remains linear in the size of the text, even for a text like 920 * ">>>>", because skipTagBackward() also stops looking for a matching < 921 * when it encounters another >. 922 */ skipTagBackward()923 private byte skipTagBackward() { 924 int initialCharIndex = charIndex; 925 while (charIndex > 0) { 926 lastChar = text.charAt(--charIndex); 927 if (lastChar == '<') { 928 // The start of the tag. 
929 return Character.DIRECTIONALITY_WHITESPACE; 930 } 931 if (lastChar == '>') { 932 break; 933 } 934 if (lastChar == '"' || lastChar == '\'') { 935 // Skip over a quoted attribute value inside the tag. 936 char quote = lastChar; 937 while (charIndex > 0 && (lastChar = text.charAt(--charIndex)) != quote) {} 938 } 939 } 940 // The original '>' wasn't the end of a tag after all. 941 charIndex = initialCharIndex; 942 lastChar = '>'; 943 return Character.DIRECTIONALITY_OTHER_NEUTRALS; 944 } 945 946 /** 947 * Advances charIndex forward through an HTML character entity tag (after the opening 948 * & has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be 949 * best to figure out the actual character and return its dirtype, but this is good enough. 950 */ skipEntityForward()951 private byte skipEntityForward() { 952 while (charIndex < length && (lastChar = text.charAt(charIndex++)) != ';') {} 953 return Character.DIRECTIONALITY_WHITESPACE; 954 } 955 956 /** 957 * Advances charIndex backward through an HTML character entity tag (after the closing ; 958 * has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be best 959 * to figure out the actual character and return its dirtype, but this is good enough. 960 * If there is no matching &, does not change charIndex and returns 961 * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the ';' that did not start an entity after 962 * all). Nevertheless, the running time for calling skipEntityBackward() in a loop remains 963 * linear in the size of the text, even for a text like ";;;;;;;", because skipTagBackward() 964 * also stops looking for a matching & when it encounters another ;. 
965 */ skipEntityBackward()966 private byte skipEntityBackward() { 967 int initialCharIndex = charIndex; 968 while (charIndex > 0) { 969 lastChar = text.charAt(--charIndex); 970 if (lastChar == '&') { 971 return Character.DIRECTIONALITY_WHITESPACE; 972 } 973 if (lastChar == ';') { 974 break; 975 } 976 } 977 charIndex = initialCharIndex; 978 lastChar = ';'; 979 return Character.DIRECTIONALITY_OTHER_NEUTRALS; 980 } 981 } 982 } 983