1 /* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package android.text; 18 19 import static android.text.TextDirectionHeuristics.FIRSTSTRONG_LTR; 20 21 import android.annotation.Nullable; 22 import android.view.View; 23 24 import java.util.Locale; 25 26 /** 27 * Utility class for formatting text for display in a potentially opposite-directionality context 28 * without garbling. The directionality of the context is set at formatter creation and the 29 * directionality of the text can be either estimated or passed in when known. 30 * 31 * <p>To support versions lower than {@link android.os.Build.VERSION_CODES#JELLY_BEAN_MR2}, 32 * you can use the support library's {@link android.support.v4.text.BidiFormatter} class. 33 * 34 * <p>These APIs provide the following functionality: 35 * <p> 36 * 1. Bidi Wrapping 37 * When text in one language is mixed into a document in another, opposite-directionality language, 38 * e.g. when an English business name is embedded in some Hebrew text, both the inserted string 39 * and the text surrounding it may be displayed incorrectly unless the inserted string is explicitly 40 * separated from the surrounding text in a "wrapper" that: 41 * <p> 42 * - Declares its directionality so that the string is displayed correctly. This can be done in 43 * Unicode bidi formatting codes by {@link #unicodeWrap} and similar methods. 
44 * <p> 45 * - Isolates the string's directionality, so it does not unduly affect the surrounding content. 46 * Currently, this can only be done using invisible Unicode characters of the same direction as 47 * the context (LRM or RLM) in addition to the directionality declaration above, thus "resetting" 48 * the directionality to that of the context. The "reset" may need to be done at both ends of the 49 * string. Without "reset" after the string, the string will "stick" to a number or logically 50 * separate opposite-direction text that happens to follow it in-line (even if separated by 51 * neutral content like spaces and punctuation). Without "reset" before the string, the same can 52 * happen there, but only with more opposite-direction text, not a number. One approach is to 53 * "reset" the direction only after each string, on the theory that if the preceding opposite- 54 * direction text is itself bidi-wrapped, the "reset" after it will prevent the sticking. (Doing 55 * the "reset" only before each string definitely does not work because we do not want to require 56 * bidi-wrapping numbers, and a bidi-wrapped opposite-direction string could be followed by a 57 * number.) Still, the safest policy is to do the "reset" on both ends of each string, since RTL 58 * message translations often contain untranslated Latin-script brand names and technical terms, 59 * and one of these can be followed by a bidi-wrapped inserted value. On the other hand, when one 60 * has such a message, it is best to do the "reset" manually in the message translation itself, 61 * since the message's opposite-direction text could be followed by an inserted number, which we 62 * would not bidi-wrap anyway. Thus, "reset" only after the string is the current default. In an 63 * alternative to "reset", recent additions to the HTML, CSS, and Unicode standards allow the 64 * isolation to be part of the directionality declaration. 
This form of isolation is better than 65 * "reset" because it takes less space, does not require knowing the context directionality, has a 66 * gentler effect than "reset", and protects both ends of the string. However, we do not yet allow 67 * using it because required platforms do not yet support it. 68 * <p> 69 * Providing these wrapping services is the basic purpose of the bidi formatter. 70 * <p> 71 * 2. Directionality estimation 72 * How does one know whether a string about to be inserted into surrounding text has the same 73 * directionality? Well, in many cases, one knows that this must be the case when writing the code 74 * doing the insertion, e.g. when a localized message is inserted into a localized page. In such 75 * cases there is no need to involve the bidi formatter at all. In some other cases, it need not be 76 * the same as the context, but is either constant (e.g. urls are always LTR) or otherwise known. 77 * In the remaining cases, e.g. when the string is user-entered or comes from a database, the 78 * language of the string (and thus its directionality) is not known a priori, and must be 79 * estimated at run-time. The bidi formatter can do this automatically using the default 80 * first-strong estimation algorithm. It can also be configured to use a custom directionality 81 * estimation object. 82 */ 83 public final class BidiFormatter { 84 85 /** 86 * The default text direction heuristic. 87 */ 88 private static TextDirectionHeuristic DEFAULT_TEXT_DIRECTION_HEURISTIC = FIRSTSTRONG_LTR; 89 90 /** 91 * Unicode "Left-To-Right Embedding" (LRE) character. 92 */ 93 private static final char LRE = '\u202A'; 94 95 /** 96 * Unicode "Right-To-Left Embedding" (RLE) character. 97 */ 98 private static final char RLE = '\u202B'; 99 100 /** 101 * Unicode "Pop Directional Formatting" (PDF) character. 102 */ 103 private static final char PDF = '\u202C'; 104 105 /** 106 * Unicode "Left-To-Right Mark" (LRM) character. 
107 */ 108 private static final char LRM = '\u200E'; 109 110 /* 111 * Unicode "Right-To-Left Mark" (RLM) character. 112 */ 113 private static final char RLM = '\u200F'; 114 115 /* 116 * String representation of LRM 117 */ 118 private static final String LRM_STRING = Character.toString(LRM); 119 120 /* 121 * String representation of RLM 122 */ 123 private static final String RLM_STRING = Character.toString(RLM); 124 125 /** 126 * Empty string constant. 127 */ 128 private static final String EMPTY_STRING = ""; 129 130 /** 131 * A class for building a BidiFormatter with non-default options. 132 */ 133 public static final class Builder { 134 private boolean mIsRtlContext; 135 private int mFlags; 136 private TextDirectionHeuristic mTextDirectionHeuristic; 137 138 /** 139 * Constructor. 140 * 141 */ Builder()142 public Builder() { 143 initialize(isRtlLocale(Locale.getDefault())); 144 } 145 146 /** 147 * Constructor. 148 * 149 * @param rtlContext Whether the context directionality is RTL. 150 */ Builder(boolean rtlContext)151 public Builder(boolean rtlContext) { 152 initialize(rtlContext); 153 } 154 155 /** 156 * Constructor. 157 * 158 * @param locale The context locale. 159 */ Builder(Locale locale)160 public Builder(Locale locale) { 161 initialize(isRtlLocale(locale)); 162 } 163 164 /** 165 * Initializes the builder with the given context directionality and default options. 166 * 167 * @param isRtlContext Whether the context is RTL or not. 168 */ initialize(boolean isRtlContext)169 private void initialize(boolean isRtlContext) { 170 mIsRtlContext = isRtlContext; 171 mTextDirectionHeuristic = DEFAULT_TEXT_DIRECTION_HEURISTIC; 172 mFlags = DEFAULT_FLAGS; 173 } 174 175 /** 176 * Specifies whether the BidiFormatter to be built should also "reset" directionality before 177 * a string being bidi-wrapped, not just after it. The default is true. 
178 */ stereoReset(boolean stereoReset)179 public Builder stereoReset(boolean stereoReset) { 180 if (stereoReset) { 181 mFlags |= FLAG_STEREO_RESET; 182 } else { 183 mFlags &= ~FLAG_STEREO_RESET; 184 } 185 return this; 186 } 187 188 /** 189 * Specifies the default directionality estimation algorithm to be used by the BidiFormatter. 190 * By default, uses the first-strong heuristic. 191 * 192 * @param heuristic the {@code TextDirectionHeuristic} to use. 193 * @return the builder itself. 194 */ setTextDirectionHeuristic(TextDirectionHeuristic heuristic)195 public Builder setTextDirectionHeuristic(TextDirectionHeuristic heuristic) { 196 mTextDirectionHeuristic = heuristic; 197 return this; 198 } 199 200 /** 201 * @return A BidiFormatter with the specified options. 202 */ build()203 public BidiFormatter build() { 204 if (mFlags == DEFAULT_FLAGS && 205 mTextDirectionHeuristic == DEFAULT_TEXT_DIRECTION_HEURISTIC) { 206 return BidiFormatter.getDefaultInstanceFromContext(mIsRtlContext); 207 } 208 return new BidiFormatter(mIsRtlContext, mFlags, mTextDirectionHeuristic); 209 } 210 } 211 212 // 213 private static final int FLAG_STEREO_RESET = 2; 214 private static final int DEFAULT_FLAGS = FLAG_STEREO_RESET; 215 216 private static final BidiFormatter DEFAULT_LTR_INSTANCE = new BidiFormatter( 217 false /* LTR context */, 218 DEFAULT_FLAGS, 219 DEFAULT_TEXT_DIRECTION_HEURISTIC); 220 221 private static final BidiFormatter DEFAULT_RTL_INSTANCE = new BidiFormatter( 222 true /* RTL context */, 223 DEFAULT_FLAGS, 224 DEFAULT_TEXT_DIRECTION_HEURISTIC); 225 226 private final boolean mIsRtlContext; 227 private final int mFlags; 228 private final TextDirectionHeuristic mDefaultTextDirectionHeuristic; 229 230 /** 231 * Factory for creating an instance of BidiFormatter for the default locale directionality. 232 * 233 * This does not create any new objects, and returns already existing static instances. 
234 * 235 */ getInstance()236 public static BidiFormatter getInstance() { 237 return getDefaultInstanceFromContext(isRtlLocale(Locale.getDefault())); 238 } 239 240 /** 241 * Factory for creating an instance of BidiFormatter given the context directionality. 242 * 243 * This does not create any new objects, and returns already existing static instances. 244 * 245 * @param rtlContext Whether the context directionality is RTL. 246 */ getInstance(boolean rtlContext)247 public static BidiFormatter getInstance(boolean rtlContext) { 248 return getDefaultInstanceFromContext(rtlContext); 249 } 250 251 /** 252 * Factory for creating an instance of BidiFormatter given the context locale. 253 * 254 * This does not create any new objects, and returns already existing static instances. 255 * 256 * @param locale The context locale. 257 */ getInstance(Locale locale)258 public static BidiFormatter getInstance(Locale locale) { 259 return getDefaultInstanceFromContext(isRtlLocale(locale)); 260 } 261 262 /** 263 * @param isRtlContext Whether the context directionality is RTL or not. 264 * @param flags The option flags. 265 * @param heuristic The default text direction heuristic. 266 */ BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristic heuristic)267 private BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristic heuristic) { 268 mIsRtlContext = isRtlContext; 269 mFlags = flags; 270 mDefaultTextDirectionHeuristic = heuristic; 271 } 272 273 /** 274 * @return Whether the context directionality is RTL 275 */ isRtlContext()276 public boolean isRtlContext() { 277 return mIsRtlContext; 278 } 279 280 /** 281 * @return Whether directionality "reset" should also be done before a string being 282 * bidi-wrapped, not just after it. 
283 */ getStereoReset()284 public boolean getStereoReset() { 285 return (mFlags & FLAG_STEREO_RESET) != 0; 286 } 287 288 /** 289 * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the 290 * overall or the exit directionality of a given string is opposite to the context directionality. 291 * Putting this after the string (including its directionality declaration wrapping) prevents it 292 * from "sticking" to other opposite-directionality text or a number appearing after it inline 293 * with only neutral content in between. Otherwise returns the empty string. While the exit 294 * directionality is determined by scanning the end of the string, the overall directionality is 295 * given explicitly by a heuristic to estimate the {@code str}'s directionality. 296 * 297 * @param str CharSequence after which the mark may need to appear. 298 * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s 299 * directionality. 300 * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context; 301 * else, the empty string. 302 * 303 * @hide 304 */ markAfter(CharSequence str, TextDirectionHeuristic heuristic)305 public String markAfter(CharSequence str, TextDirectionHeuristic heuristic) { 306 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 307 // getExitDir() is called only if needed (short-circuit). 308 if (!mIsRtlContext && (isRtl || getExitDir(str) == DIR_RTL)) { 309 return LRM_STRING; 310 } 311 if (mIsRtlContext && (!isRtl || getExitDir(str) == DIR_LTR)) { 312 return RLM_STRING; 313 } 314 return EMPTY_STRING; 315 } 316 317 /** 318 * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the 319 * overall or the entry directionality of a given string is opposite to the context 320 * directionality. 
Putting this before the string (including its directionality declaration 321 * wrapping) prevents it from "sticking" to other opposite-directionality text appearing before 322 * it inline with only neutral content in between. Otherwise returns the empty string. While the 323 * entry directionality is determined by scanning the beginning of the string, the overall 324 * directionality is given explicitly by a heuristic to estimate the {@code str}'s directionality. 325 * 326 * @param str CharSequence before which the mark may need to appear. 327 * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s 328 * directionality. 329 * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context; 330 * else, the empty string. 331 * 332 * @hide 333 */ markBefore(CharSequence str, TextDirectionHeuristic heuristic)334 public String markBefore(CharSequence str, TextDirectionHeuristic heuristic) { 335 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 336 // getEntryDir() is called only if needed (short-circuit). 337 if (!mIsRtlContext && (isRtl || getEntryDir(str) == DIR_RTL)) { 338 return LRM_STRING; 339 } 340 if (mIsRtlContext && (!isRtl || getEntryDir(str) == DIR_LTR)) { 341 return RLM_STRING; 342 } 343 return EMPTY_STRING; 344 } 345 346 /** 347 * Estimates the directionality of a string using the default text direction heuristic. 348 * 349 * @param str String whose directionality is to be estimated. 350 * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns 351 * false. 352 */ isRtl(String str)353 public boolean isRtl(String str) { 354 return isRtl((CharSequence) str); 355 } 356 357 /** 358 * Operates like {@link #isRtl(String)}, but takes a CharSequence instead of a string 359 * 360 * @param str CharSequence whose directionality is to be estimated. 361 * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns 362 * false. 
363 */ isRtl(CharSequence str)364 public boolean isRtl(CharSequence str) { 365 return mDefaultTextDirectionHeuristic.isRtl(str, 0, str.length()); 366 } 367 368 /** 369 * Formats a string of given directionality for use in plain-text output of the context 370 * directionality, so an opposite-directionality string is neither garbled nor garbles its 371 * surroundings. This makes use of Unicode bidi formatting characters. 372 * <p> 373 * The algorithm: In case the given directionality doesn't match the context directionality, wraps 374 * the string with Unicode bidi formatting characters: RLE+{@code str}+PDF for RTL text, or 375 * LRE+{@code str}+PDF for LTR text. 376 * <p> 377 * If {@code isolate}, directionally isolates the string so that it does not garble its 378 * surroundings. Currently, this is done by "resetting" the directionality after the string by 379 * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when 380 * either the overall directionality or the exit directionality of the string is opposite to 381 * that of the context. Unless the formatter was built using 382 * {@link Builder#stereoReset(boolean)} with a {@code false} argument, also prepends a Unicode 383 * bidi mark matching the context directionality when either the overall directionality or the 384 * entry directionality of the string is opposite to that of the context. Note that as opposed 385 * to the overall directionality, the entry and exit directionalities are determined from the 386 * string itself. 387 * <p> 388 * Does *not* do HTML-escaping. 389 * 390 * @param str The input string. 391 * @param heuristic The algorithm to be used to estimate the string's overall direction. 392 * See {@link TextDirectionHeuristics} for pre-defined heuristics. 393 * @param isolate Whether to directionally isolate the string to prevent it from garbling the 394 * content around it 395 * @return Input string after applying the above processing. 
{@code null} if {@code str} is 396 * {@code null}. 397 */ unicodeWrap(@ullable String str, TextDirectionHeuristic heuristic, boolean isolate)398 public @Nullable String unicodeWrap(@Nullable String str, TextDirectionHeuristic heuristic, 399 boolean isolate) { 400 if (str == null) return null; 401 return unicodeWrap((CharSequence) str, heuristic, isolate).toString(); 402 } 403 404 /** 405 * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but takes a 406 * CharSequence instead of a string 407 * 408 * @param str The input CharSequence. 409 * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction. 410 * See {@link TextDirectionHeuristics} for pre-defined heuristics. 411 * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling 412 * the content around it 413 * @return Input CharSequence after applying the above processing. {@code null} if {@code str} 414 * is {@code null}. 415 */ unicodeWrap(@ullable CharSequence str, TextDirectionHeuristic heuristic, boolean isolate)416 public @Nullable CharSequence unicodeWrap(@Nullable CharSequence str, 417 TextDirectionHeuristic heuristic, boolean isolate) { 418 if (str == null) return null; 419 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 420 SpannableStringBuilder result = new SpannableStringBuilder(); 421 if (getStereoReset() && isolate) { 422 result.append(markBefore(str, 423 isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR)); 424 } 425 if (isRtl != mIsRtlContext) { 426 result.append(isRtl ? RLE : LRE); 427 result.append(str); 428 result.append(PDF); 429 } else { 430 result.append(str); 431 } 432 if (isolate) { 433 result.append(markAfter(str, 434 isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR)); 435 } 436 return result; 437 } 438 439 /** 440 * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but assumes 441 * {@code isolate} is true. 
442 * 443 * @param str The input string. 444 * @param heuristic The algorithm to be used to estimate the string's overall direction. 445 * See {@link TextDirectionHeuristics} for pre-defined heuristics. 446 * @return Input string after applying the above processing. 447 */ unicodeWrap(String str, TextDirectionHeuristic heuristic)448 public String unicodeWrap(String str, TextDirectionHeuristic heuristic) { 449 return unicodeWrap(str, heuristic, true /* isolate */); 450 } 451 452 /** 453 * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but 454 * assumes {@code isolate} is true. 455 * 456 * @param str The input CharSequence. 457 * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction. 458 * See {@link TextDirectionHeuristics} for pre-defined heuristics. 459 * @return Input CharSequence after applying the above processing. 460 */ unicodeWrap(CharSequence str, TextDirectionHeuristic heuristic)461 public CharSequence unicodeWrap(CharSequence str, TextDirectionHeuristic heuristic) { 462 return unicodeWrap(str, heuristic, true /* isolate */); 463 } 464 465 466 /** 467 * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the 468 * formatter's default direction estimation algorithm. 469 * 470 * @param str The input string. 471 * @param isolate Whether to directionally isolate the string to prevent it from garbling the 472 * content around it 473 * @return Input string after applying the above processing. 474 */ unicodeWrap(String str, boolean isolate)475 public String unicodeWrap(String str, boolean isolate) { 476 return unicodeWrap(str, mDefaultTextDirectionHeuristic, isolate); 477 } 478 479 /** 480 * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but uses 481 * the formatter's default direction estimation algorithm. 482 * 483 * @param str The input CharSequence. 
484 * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling 485 * the content around it 486 * @return Input CharSequence after applying the above processing. 487 */ unicodeWrap(CharSequence str, boolean isolate)488 public CharSequence unicodeWrap(CharSequence str, boolean isolate) { 489 return unicodeWrap(str, mDefaultTextDirectionHeuristic, isolate); 490 } 491 492 /** 493 * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the 494 * formatter's default direction estimation algorithm and assumes {@code isolate} is true. 495 * 496 * @param str The input string. 497 * @return Input string after applying the above processing. 498 */ unicodeWrap(String str)499 public String unicodeWrap(String str) { 500 return unicodeWrap(str, mDefaultTextDirectionHeuristic, true /* isolate */); 501 } 502 503 /** 504 * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but uses 505 * the formatter's default direction estimation algorithm and assumes {@code isolate} is true. 506 * 507 * @param str The input CharSequence. 508 * @return Input CharSequence after applying the above processing. 509 */ unicodeWrap(CharSequence str)510 public CharSequence unicodeWrap(CharSequence str) { 511 return unicodeWrap(str, mDefaultTextDirectionHeuristic, true /* isolate */); 512 } 513 getDefaultInstanceFromContext(boolean isRtlContext)514 private static BidiFormatter getDefaultInstanceFromContext(boolean isRtlContext) { 515 return isRtlContext ? DEFAULT_RTL_INSTANCE : DEFAULT_LTR_INSTANCE; 516 } 517 518 /** 519 * Helper method to return true if the Locale directionality is RTL. 520 * 521 * @param locale The Locale whose directionality will be checked to be RTL or LTR 522 * @return true if the {@code locale} directionality is RTL. False otherwise. 
523 */ isRtlLocale(Locale locale)524 private static boolean isRtlLocale(Locale locale) { 525 return (TextUtils.getLayoutDirectionFromLocale(locale) == View.LAYOUT_DIRECTION_RTL); 526 } 527 528 /** 529 * Enum for directionality type. 530 */ 531 private static final int DIR_LTR = -1; 532 private static final int DIR_UNKNOWN = 0; 533 private static final int DIR_RTL = +1; 534 535 /** 536 * Returns the directionality of the last character with strong directionality in the string, or 537 * DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards from the end of 538 * the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a 539 * strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a 540 * string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. The intended use is to check 541 * whether a logically separate item that starts with a number or a character of the string's 542 * exit directionality and follows this string inline (not counting any neutral characters in 543 * between) would "stick" to it in an opposite-directionality context, thus being displayed in 544 * an incorrect position. An LRM or RLM character (the one of the context's directionality) 545 * between the two will prevent such sticking. 546 * 547 * @param str the string to check. 548 */ getExitDir(CharSequence str)549 private static int getExitDir(CharSequence str) { 550 return new DirectionalityEstimator(str, false /* isHtml */).getExitDir(); 551 } 552 553 /** 554 * Returns the directionality of the first character with strong directionality in the string, 555 * or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an 556 * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after 557 * RLE/RLO. The results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF 558 * characters. 
The intended use is to check whether a logically separate item that ends with a 559 * character of the string's entry directionality and precedes the string inline (not counting 560 * any neutral characters in between) would "stick" to it in an opposite-directionality context, 561 * thus being displayed in an incorrect position. An LRM or RLM character (the one of the 562 * context's directionality) between the two will prevent such sticking. 563 * 564 * @param str the string to check. 565 */ getEntryDir(CharSequence str)566 private static int getEntryDir(CharSequence str) { 567 return new DirectionalityEstimator(str, false /* isHtml */).getEntryDir(); 568 } 569 570 /** 571 * An object that estimates the directionality of a given string by various methods. 572 * 573 */ 574 private static class DirectionalityEstimator { 575 576 // Internal static variables and constants. 577 578 /** 579 * Size of the bidi character class cache. The results of the Character.getDirectionality() 580 * calls on the lowest DIR_TYPE_CACHE_SIZE codepoints are kept in an array for speed. 581 * The 0x700 value is designed to leave all the European and Near Eastern languages in the 582 * cache. It can be reduced to 0x180, restricting the cache to the Western European 583 * languages. 584 */ 585 private static final int DIR_TYPE_CACHE_SIZE = 0x700; 586 587 /** 588 * The bidi character class cache. 589 */ 590 private static final byte DIR_TYPE_CACHE[]; 591 592 static { 593 DIR_TYPE_CACHE = new byte[DIR_TYPE_CACHE_SIZE]; 594 for (int i = 0; i < DIR_TYPE_CACHE_SIZE; i++) { 595 // Calling Character.getDirectionality() is OK here, since new emojis start after 596 // the end of our cache. 597 DIR_TYPE_CACHE[i] = Character.getDirectionality(i); 598 } 599 } 600 getDirectionality(int codePoint)601 private static byte getDirectionality(int codePoint) { 602 if (Emoji.isNewEmoji(codePoint)) { 603 // TODO: Fix or remove once emoji-data.text 5.0 is in ICU or update to 6.0. 
604 return Character.DIRECTIONALITY_OTHER_NEUTRALS; 605 } else { 606 return Character.getDirectionality(codePoint); 607 } 608 } 609 610 // Internal instance variables. 611 612 /** 613 * The text to be scanned. 614 */ 615 private final CharSequence text; 616 617 /** 618 * Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and 619 * entities when looking for the next / preceding dir type. 620 */ 621 private final boolean isHtml; 622 623 /** 624 * The length of the text in chars. 625 */ 626 private final int length; 627 628 /** 629 * The current position in the text. 630 */ 631 private int charIndex; 632 633 /** 634 * The char encountered by the last dirTypeForward or dirTypeBackward call. If it 635 * encountered a supplementary codepoint, this contains a char that is not a valid 636 * codepoint. This is ok, because this member is only used to detect some well-known ASCII 637 * syntax, e.g. "http://" and the beginning of an HTML tag or entity. 638 */ 639 private char lastChar; 640 641 /** 642 * Constructor. 643 * 644 * @param text The string to scan. 645 * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over 646 * tags and entities. 647 */ DirectionalityEstimator(CharSequence text, boolean isHtml)648 DirectionalityEstimator(CharSequence text, boolean isHtml) { 649 this.text = text; 650 this.isHtml = isHtml; 651 length = text.length(); 652 } 653 654 /** 655 * Returns the directionality of the first character with strong directionality in the 656 * string, or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an 657 * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL 658 * after RLE/RLO. The results are undefined for a string containing unbalanced 659 * LRE/RLE/LRO/RLO/PDF characters. 
660 */ getEntryDir()661 int getEntryDir() { 662 // The reason for this method name, as opposed to getFirstStrongDir(), is that 663 // "first strong" is a commonly used description of Unicode's estimation algorithm, 664 // but the two must treat formatting characters quite differently. Thus, we are staying 665 // away from both "first" and "last" in these method names to avoid confusion. 666 charIndex = 0; 667 int embeddingLevel = 0; 668 int embeddingLevelDir = DIR_UNKNOWN; 669 int firstNonEmptyEmbeddingLevel = 0; 670 while (charIndex < length && firstNonEmptyEmbeddingLevel == 0) { 671 switch (dirTypeForward()) { 672 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: 673 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: 674 ++embeddingLevel; 675 embeddingLevelDir = DIR_LTR; 676 break; 677 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: 678 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: 679 ++embeddingLevel; 680 embeddingLevelDir = DIR_RTL; 681 break; 682 case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT: 683 --embeddingLevel; 684 // To restore embeddingLevelDir to its previous value, we would need a 685 // stack, which we want to avoid. Thus, at this point we do not know the 686 // current embedding's directionality. 
687 embeddingLevelDir = DIR_UNKNOWN; 688 break; 689 case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL: 690 break; 691 case Character.DIRECTIONALITY_LEFT_TO_RIGHT: 692 if (embeddingLevel == 0) { 693 return DIR_LTR; 694 } 695 firstNonEmptyEmbeddingLevel = embeddingLevel; 696 break; 697 case Character.DIRECTIONALITY_RIGHT_TO_LEFT: 698 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC: 699 if (embeddingLevel == 0) { 700 return DIR_RTL; 701 } 702 firstNonEmptyEmbeddingLevel = embeddingLevel; 703 break; 704 default: 705 firstNonEmptyEmbeddingLevel = embeddingLevel; 706 break; 707 } 708 } 709 710 // We have either found a non-empty embedding or scanned the entire string finding 711 // neither a non-empty embedding nor a strong character outside of an embedding. 712 if (firstNonEmptyEmbeddingLevel == 0) { 713 // We have not found a non-empty embedding. Thus, the string contains neither a 714 // non-empty embedding nor a strong character outside of an embedding. 715 return DIR_UNKNOWN; 716 } 717 718 // We have found a non-empty embedding. 719 if (embeddingLevelDir != DIR_UNKNOWN) { 720 // We know the directionality of the non-empty embedding. 721 return embeddingLevelDir; 722 } 723 724 // We do not remember the directionality of the non-empty embedding we found. So, we go 725 // backwards to find the start of the non-empty embedding and get its directionality. 
726 while (charIndex > 0) { 727 switch (dirTypeBackward()) { 728 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: 729 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: 730 if (firstNonEmptyEmbeddingLevel == embeddingLevel) { 731 return DIR_LTR; 732 } 733 --embeddingLevel; 734 break; 735 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: 736 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: 737 if (firstNonEmptyEmbeddingLevel == embeddingLevel) { 738 return DIR_RTL; 739 } 740 --embeddingLevel; 741 break; 742 case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT: 743 ++embeddingLevel; 744 break; 745 } 746 } 747 // We should never get here. 748 return DIR_UNKNOWN; 749 } 750 751 /** 752 * Returns the directionality of the last character with strong directionality in the 753 * string, or DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards 754 * from the end of the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its 755 * matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results 756 * are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. 757 */ getExitDir()758 int getExitDir() { 759 // The reason for this method name, as opposed to getLastStrongDir(), is that "last 760 // strong" sounds like the exact opposite of "first strong", which is a commonly used 761 // description of Unicode's estimation algorithm (getUnicodeDir() above), but the two 762 // must treat formatting characters quite differently. Thus, we are staying away from 763 // both "first" and "last" in these method names to avoid confusion. 
764 charIndex = length; 765 int embeddingLevel = 0; 766 int lastNonEmptyEmbeddingLevel = 0; 767 while (charIndex > 0) { 768 switch (dirTypeBackward()) { 769 case Character.DIRECTIONALITY_LEFT_TO_RIGHT: 770 if (embeddingLevel == 0) { 771 return DIR_LTR; 772 } 773 if (lastNonEmptyEmbeddingLevel == 0) { 774 lastNonEmptyEmbeddingLevel = embeddingLevel; 775 } 776 break; 777 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: 778 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: 779 if (lastNonEmptyEmbeddingLevel == embeddingLevel) { 780 return DIR_LTR; 781 } 782 --embeddingLevel; 783 break; 784 case Character.DIRECTIONALITY_RIGHT_TO_LEFT: 785 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC: 786 if (embeddingLevel == 0) { 787 return DIR_RTL; 788 } 789 if (lastNonEmptyEmbeddingLevel == 0) { 790 lastNonEmptyEmbeddingLevel = embeddingLevel; 791 } 792 break; 793 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: 794 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: 795 if (lastNonEmptyEmbeddingLevel == embeddingLevel) { 796 return DIR_RTL; 797 } 798 --embeddingLevel; 799 break; 800 case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT: 801 ++embeddingLevel; 802 break; 803 case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL: 804 break; 805 default: 806 if (lastNonEmptyEmbeddingLevel == 0) { 807 lastNonEmptyEmbeddingLevel = embeddingLevel; 808 } 809 break; 810 } 811 } 812 return DIR_UNKNOWN; 813 } 814 815 // Internal methods 816 817 /** 818 * Gets the bidi character class, i.e. Character.getDirectionality(), of a given char, using 819 * a cache for speed. Not designed for supplementary codepoints, whose results we do not 820 * cache. 821 */ getCachedDirectionality(char c)822 private static byte getCachedDirectionality(char c) { 823 return c < DIR_TYPE_CACHE_SIZE ? DIR_TYPE_CACHE[c] : getDirectionality(c); 824 } 825 826 /** 827 * Returns the Character.DIRECTIONALITY_... value of the next codepoint and advances 828 * charIndex. 
If isHtml, and the codepoint is '<' or '&', advances through the tag/entity, 829 * and returns Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to 830 * figure out the actual character, and return its dirtype, but treating it as whitespace is 831 * good enough for our purposes. 832 * 833 * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0. 834 */ dirTypeForward()835 byte dirTypeForward() { 836 lastChar = text.charAt(charIndex); 837 if (Character.isHighSurrogate(lastChar)) { 838 int codePoint = Character.codePointAt(text, charIndex); 839 charIndex += Character.charCount(codePoint); 840 return getDirectionality(codePoint); 841 } 842 charIndex++; 843 byte dirType = getCachedDirectionality(lastChar); 844 if (isHtml) { 845 // Process tags and entities. 846 if (lastChar == '<') { 847 dirType = skipTagForward(); 848 } else if (lastChar == '&') { 849 dirType = skipEntityForward(); 850 } 851 } 852 return dirType; 853 } 854 855 /** 856 * Returns the Character.DIRECTIONALITY_... value of the preceding codepoint and advances 857 * charIndex backwards. If isHtml, and the codepoint is the end of a complete HTML tag or 858 * entity, advances over the whole tag/entity and returns 859 * Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to figure out the 860 * actual character, and return its dirtype, but treating it as whitespace is good enough 861 * for our purposes. 862 * 863 * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0. 864 */ dirTypeBackward()865 byte dirTypeBackward() { 866 lastChar = text.charAt(charIndex - 1); 867 if (Character.isLowSurrogate(lastChar)) { 868 int codePoint = Character.codePointBefore(text, charIndex); 869 charIndex -= Character.charCount(codePoint); 870 return getDirectionality(codePoint); 871 } 872 charIndex--; 873 byte dirType = getCachedDirectionality(lastChar); 874 if (isHtml) { 875 // Process tags and entities. 
876 if (lastChar == '>') { 877 dirType = skipTagBackward(); 878 } else if (lastChar == ';') { 879 dirType = skipEntityBackward(); 880 } 881 } 882 return dirType; 883 } 884 885 /** 886 * Advances charIndex forward through an HTML tag (after the opening < has already been 887 * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching >, 888 * does not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the 889 * < that hadn't been part of a tag after all). 890 */ skipTagForward()891 private byte skipTagForward() { 892 int initialCharIndex = charIndex; 893 while (charIndex < length) { 894 lastChar = text.charAt(charIndex++); 895 if (lastChar == '>') { 896 // The end of the tag. 897 return Character.DIRECTIONALITY_WHITESPACE; 898 } 899 if (lastChar == '"' || lastChar == '\'') { 900 // Skip over a quoted attribute value inside the tag. 901 char quote = lastChar; 902 while (charIndex < length && (lastChar = text.charAt(charIndex++)) != quote) {} 903 } 904 } 905 // The original '<' wasn't the start of a tag after all. 906 charIndex = initialCharIndex; 907 lastChar = '<'; 908 return Character.DIRECTIONALITY_OTHER_NEUTRALS; 909 } 910 911 /** 912 * Advances charIndex backward through an HTML tag (after the closing > has already been 913 * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching <, does 914 * not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the > 915 * that hadn't been part of a tag after all). Nevertheless, the running time for calling 916 * skipTagBackward() in a loop remains linear in the size of the text, even for a text like 917 * ">>>>", because skipTagBackward() also stops looking for a matching < 918 * when it encounters another >. 919 */ skipTagBackward()920 private byte skipTagBackward() { 921 int initialCharIndex = charIndex; 922 while (charIndex > 0) { 923 lastChar = text.charAt(--charIndex); 924 if (lastChar == '<') { 925 // The start of the tag. 
926 return Character.DIRECTIONALITY_WHITESPACE; 927 } 928 if (lastChar == '>') { 929 break; 930 } 931 if (lastChar == '"' || lastChar == '\'') { 932 // Skip over a quoted attribute value inside the tag. 933 char quote = lastChar; 934 while (charIndex > 0 && (lastChar = text.charAt(--charIndex)) != quote) {} 935 } 936 } 937 // The original '>' wasn't the end of a tag after all. 938 charIndex = initialCharIndex; 939 lastChar = '>'; 940 return Character.DIRECTIONALITY_OTHER_NEUTRALS; 941 } 942 943 /** 944 * Advances charIndex forward through an HTML character entity tag (after the opening 945 * & has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be 946 * best to figure out the actual character and return its dirtype, but this is good enough. 947 */ skipEntityForward()948 private byte skipEntityForward() { 949 while (charIndex < length && (lastChar = text.charAt(charIndex++)) != ';') {} 950 return Character.DIRECTIONALITY_WHITESPACE; 951 } 952 953 /** 954 * Advances charIndex backward through an HTML character entity tag (after the closing ; 955 * has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be best 956 * to figure out the actual character and return its dirtype, but this is good enough. 957 * If there is no matching &, does not change charIndex and returns 958 * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the ';' that did not start an entity after 959 * all). Nevertheless, the running time for calling skipEntityBackward() in a loop remains 960 * linear in the size of the text, even for a text like ";;;;;;;", because skipTagBackward() 961 * also stops looking for a matching & when it encounters another ;. 
962 */ skipEntityBackward()963 private byte skipEntityBackward() { 964 int initialCharIndex = charIndex; 965 while (charIndex > 0) { 966 lastChar = text.charAt(--charIndex); 967 if (lastChar == '&') { 968 return Character.DIRECTIONALITY_WHITESPACE; 969 } 970 if (lastChar == ';') { 971 break; 972 } 973 } 974 charIndex = initialCharIndex; 975 lastChar = ';'; 976 return Character.DIRECTIONALITY_OTHER_NEUTRALS; 977 } 978 } 979 } 980