1 /* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package androidx.core.text; 18 19 import static androidx.core.text.TextDirectionHeuristicsCompat.FIRSTSTRONG_LTR; 20 21 import android.text.SpannableStringBuilder; 22 23 import androidx.core.view.ViewCompat; 24 25 import java.util.Locale; 26 27 /** 28 * Utility class for formatting text for display in a potentially opposite-directionality context 29 * without garbling. The directionality of the context is set at formatter creation and the 30 * directionality of the text can be either estimated or passed in when known. Provides the 31 * following functionality: 32 * <p> 33 * 1. Bidi Wrapping 34 * When text in one language is mixed into a document in another, opposite-directionality language, 35 * e.g. when an English business name is embedded in a Hebrew web page, both the inserted string 36 * and the text surrounding it may be displayed incorrectly unless the inserted string is explicitly 37 * separated from the surrounding text in a "wrapper" that: 38 * <p> 39 * - Declares its directionality so that the string is displayed correctly. This can be done in 40 * Unicode bidi formatting codes by {@link #unicodeWrap} and similar methods. 41 * <p> 42 * - Isolates the string's directionality, so it does not unduly affect the surrounding content. 43 * Currently, this can only be done using invisible Unicode characters of the same direction as 44 * the context (LRM or RLM) in addition to the directionality declaration above, thus "resetting" 45 * the directionality to that of the context. The "reset" may need to be done at both ends of the 46 * string. Without "reset" after the string, the string will "stick" to a number or logically 47 * separate opposite-direction text that happens to follow it in-line (even if separated by 48 * neutral content like spaces and punctuation). Without "reset" before the string, the same can 49 * happen there, but only with more opposite-direction text, not a number. One approach is to 50 * "reset" the direction only after each string, on the theory that if the preceding opposite- 51 * direction text is itself bidi-wrapped, the "reset" after it will prevent the sticking. (Doing 52 * the "reset" only before each string definitely does not work because we do not want to require 53 * bidi-wrapping numbers, and a bidi-wrapped opposite-direction string could be followed by a 54 * number.) Still, the safest policy is to do the "reset" on both ends of each string, since RTL 55 * message translations often contain untranslated Latin-script brand names and technical terms, 56 * and one of these can be followed by a bidi-wrapped inserted value. On the other hand, when one 57 * has such a message, it is best to do the "reset" manually in the message translation itself, 58 * since the message's opposite-direction text could be followed by an inserted number, which we 59 * would not bidi-wrap anyway. Thus, "reset" only after the string is the current default. In an 60 * alternative to "reset", recent additions to the HTML, CSS, and Unicode standards allow the 61 * isolation to be part of the directionality declaration. This form of isolation is better than 62 * "reset" because it takes less space, does not require knowing the context directionality, has a 63 * gentler effect than "reset", and protects both ends of the string. However, we do not yet allow 64 * using it because required platforms do not yet support it. 65 * <p> 66 * Providing these wrapping services is the basic purpose of the bidi formatter. 67 * <p> 68 * 2. Directionality estimation 69 * How does one know whether a string about to be inserted into surrounding text has the same 70 * directionality? Well, in many cases, one knows that this must be the case when writing the code 71 * doing the insertion, e.g. when a localized message is inserted into a localized page. In such 72 * cases there is no need to involve the bidi formatter at all. In some other cases, it need not be 73 * the same as the context, but is either constant (e.g. urls are always LTR) or otherwise known. 74 * In the remaining cases, e.g. when the string is user-entered or comes from a database, the 75 * language of the string (and thus its directionality) is not known a priori, and must be 76 * estimated at run-time. The bidi formatter can do this automatically using the default 77 * first-strong estimation algorithm. It can also be configured to use a custom directionality 78 * estimation object. 79 */ 80 public final class BidiFormatter { 81 82 /** 83 * The default text direction heuristic. 84 */ 85 static final TextDirectionHeuristicCompat DEFAULT_TEXT_DIRECTION_HEURISTIC = FIRSTSTRONG_LTR; 86 87 /** 88 * Unicode "Left-To-Right Embedding" (LRE) character. 89 */ 90 private static final char LRE = '\u202A'; 91 92 /** 93 * Unicode "Right-To-Left Embedding" (RLE) character. 94 */ 95 private static final char RLE = '\u202B'; 96 97 /** 98 * Unicode "Pop Directional Formatting" (PDF) character. 99 */ 100 private static final char PDF = '\u202C'; 101 102 /** 103 * Unicode "Left-To-Right Mark" (LRM) character. 104 */ 105 private static final char LRM = '\u200E'; 106 107 /* 108 * Unicode "Right-To-Left Mark" (RLM) character. 109 */ 110 private static final char RLM = '\u200F'; 111 112 /* 113 * String representation of LRM 114 */ 115 private static final String LRM_STRING = Character.toString(LRM); 116 117 /* 118 * String representation of RLM 119 */ 120 private static final String RLM_STRING = Character.toString(RLM); 121 122 /** 123 * Empty string constant. 124 */ 125 private static final String EMPTY_STRING = ""; 126 127 /** 128 * A class for building a BidiFormatter with non-default options. 129 */ 130 public static final class Builder { 131 private boolean mIsRtlContext; 132 private int mFlags; 133 private TextDirectionHeuristicCompat mTextDirectionHeuristicCompat; 134 135 /** 136 * Constructor. 137 * 138 */ Builder()139 public Builder() { 140 initialize(isRtlLocale(Locale.getDefault())); 141 } 142 143 /** 144 * Constructor. 145 * 146 * @param rtlContext Whether the context directionality is RTL. 147 */ Builder(boolean rtlContext)148 public Builder(boolean rtlContext) { 149 initialize(rtlContext); 150 } 151 152 /** 153 * Constructor. 154 * 155 * @param locale The context locale. 156 */ Builder(Locale locale)157 public Builder(Locale locale) { 158 initialize(isRtlLocale(locale)); 159 } 160 161 /** 162 * Initializes the builder with the given context directionality and default options. 163 * 164 * @param isRtlContext Whether the context is RTL or not. 165 */ initialize(boolean isRtlContext)166 private void initialize(boolean isRtlContext) { 167 mIsRtlContext = isRtlContext; 168 mTextDirectionHeuristicCompat = DEFAULT_TEXT_DIRECTION_HEURISTIC; 169 mFlags = DEFAULT_FLAGS; 170 } 171 172 /** 173 * Specifies whether the BidiFormatter to be built should also "reset" directionality before 174 * a string being bidi-wrapped, not just after it. The default is true. 175 */ stereoReset(boolean stereoReset)176 public Builder stereoReset(boolean stereoReset) { 177 if (stereoReset) { 178 mFlags |= FLAG_STEREO_RESET; 179 } else { 180 mFlags &= ~FLAG_STEREO_RESET; 181 } 182 return this; 183 } 184 185 /** 186 * Specifies the default directionality estimation algorithm to be used by the BidiFormatter. 187 * By default, uses the first-strong heuristic. 188 * 189 * @param heuristic the {@code TextDirectionHeuristic} to use. 190 * @return the builder itself. 191 */ setTextDirectionHeuristic(TextDirectionHeuristicCompat heuristic)192 public Builder setTextDirectionHeuristic(TextDirectionHeuristicCompat heuristic) { 193 mTextDirectionHeuristicCompat = heuristic; 194 return this; 195 } 196 getDefaultInstanceFromContext(boolean isRtlContext)197 private static BidiFormatter getDefaultInstanceFromContext(boolean isRtlContext) { 198 return isRtlContext ? DEFAULT_RTL_INSTANCE : DEFAULT_LTR_INSTANCE; 199 } 200 201 /** 202 * @return A BidiFormatter with the specified options. 203 */ build()204 public BidiFormatter build() { 205 if (mFlags == DEFAULT_FLAGS && 206 mTextDirectionHeuristicCompat == DEFAULT_TEXT_DIRECTION_HEURISTIC) { 207 return getDefaultInstanceFromContext(mIsRtlContext); 208 } 209 return new BidiFormatter(mIsRtlContext, mFlags, mTextDirectionHeuristicCompat); 210 } 211 } 212 213 // 214 private static final int FLAG_STEREO_RESET = 2; 215 private static final int DEFAULT_FLAGS = FLAG_STEREO_RESET; 216 217 static final BidiFormatter DEFAULT_LTR_INSTANCE = new BidiFormatter( 218 false /* LTR context */, 219 DEFAULT_FLAGS, 220 DEFAULT_TEXT_DIRECTION_HEURISTIC); 221 222 static final BidiFormatter DEFAULT_RTL_INSTANCE = new BidiFormatter( 223 true /* RTL context */, 224 DEFAULT_FLAGS, 225 DEFAULT_TEXT_DIRECTION_HEURISTIC); 226 227 private final boolean mIsRtlContext; 228 private final int mFlags; 229 private final TextDirectionHeuristicCompat mDefaultTextDirectionHeuristicCompat; 230 231 /** 232 * Factory for creating an instance of BidiFormatter for the default locale directionality. 233 * 234 */ getInstance()235 public static BidiFormatter getInstance() { 236 return new Builder().build(); 237 } 238 239 /** 240 * Factory for creating an instance of BidiFormatter given the context directionality. 241 * 242 * @param rtlContext Whether the context directionality is RTL. 243 */ getInstance(boolean rtlContext)244 public static BidiFormatter getInstance(boolean rtlContext) { 245 return new Builder(rtlContext).build(); 246 } 247 248 /** 249 * Factory for creating an instance of BidiFormatter given the context locale. 250 * 251 * @param locale The context locale. 252 */ getInstance(Locale locale)253 public static BidiFormatter getInstance(Locale locale) { 254 return new Builder(locale).build(); 255 } 256 257 /** 258 * @param isRtlContext Whether the context directionality is RTL or not. 259 * @param flags The option flags. 260 * @param heuristic The default text direction heuristic. 261 */ BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristicCompat heuristic)262 BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristicCompat heuristic) { 263 mIsRtlContext = isRtlContext; 264 mFlags = flags; 265 mDefaultTextDirectionHeuristicCompat = heuristic; 266 } 267 268 /** 269 * @return Whether the context directionality is RTL 270 */ isRtlContext()271 public boolean isRtlContext() { 272 return mIsRtlContext; 273 } 274 275 /** 276 * @return Whether directionality "reset" should also be done before a string being 277 * bidi-wrapped, not just after it. 278 */ getStereoReset()279 public boolean getStereoReset() { 280 return (mFlags & FLAG_STEREO_RESET) != 0; 281 } 282 283 /** 284 * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the 285 * overall or the exit directionality of a given CharSequence is opposite to the context 286 * directionality. Putting this after the CharSequence (including its directionality 287 * declaration wrapping) prevents it from "sticking" to other opposite-directionality text or a 288 * number appearing after it inline with only neutral content in between. Otherwise returns 289 * the empty string. While the exit directionality is determined by scanning the end of the 290 * CharSequence, the overall directionality is given explicitly by a heuristic to estimate the 291 * {@code str}'s directionality. 292 * 293 * @param str CharSequence after which the mark may need to appear. 294 * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s 295 * directionality. 296 * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context; 297 * else, the empty . 298 */ markAfter(CharSequence str, TextDirectionHeuristicCompat heuristic)299 private String markAfter(CharSequence str, TextDirectionHeuristicCompat heuristic) { 300 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 301 // getExitDir() is called only if needed (short-circuit). 302 if (!mIsRtlContext && (isRtl || getExitDir(str) == DIR_RTL)) { 303 return LRM_STRING; 304 } 305 if (mIsRtlContext && (!isRtl || getExitDir(str) == DIR_LTR)) { 306 return RLM_STRING; 307 } 308 return EMPTY_STRING; 309 } 310 311 /** 312 * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the 313 * overall or the entry directionality of a given CharSequence is opposite to the context 314 * directionality. Putting this before the CharSequence (including its directionality 315 * declaration wrapping) prevents it from "sticking" to other opposite-directionality text 316 * appearing before it inline with only neutral content in between. Otherwise returns the 317 * empty string. While the entry directionality is determined by scanning the beginning of the 318 * CharSequence, the overall directionality is given explicitly by a heuristic to estimate the 319 * {@code str}'s directionality. 320 * 321 * @param str CharSequence before which the mark may need to appear. 322 * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s 323 * directionality. 324 * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context; 325 * else, the empty string. 326 */ markBefore(CharSequence str, TextDirectionHeuristicCompat heuristic)327 private String markBefore(CharSequence str, TextDirectionHeuristicCompat heuristic) { 328 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 329 // getEntryDir() is called only if needed (short-circuit). 330 if (!mIsRtlContext && (isRtl || getEntryDir(str) == DIR_RTL)) { 331 return LRM_STRING; 332 } 333 if (mIsRtlContext && (!isRtl || getEntryDir(str) == DIR_LTR)) { 334 return RLM_STRING; 335 } 336 return EMPTY_STRING; 337 } 338 339 /** 340 * Estimates the directionality of a string using the default text direction heuristic. 341 * 342 * @param str String whose directionality is to be estimated. 343 * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns 344 * false. 345 */ isRtl(String str)346 public boolean isRtl(String str) { 347 return isRtl((CharSequence) str); 348 } 349 350 /** 351 * Operates like {@link #isRtl(String)}, but takes a CharSequence instead of a string. 352 * 353 * @param str CharSequence whose directionality is to be estimated. 354 * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns 355 * false. 356 */ isRtl(CharSequence str)357 public boolean isRtl(CharSequence str) { 358 return mDefaultTextDirectionHeuristicCompat.isRtl(str, 0, str.length()); 359 } 360 361 /** 362 * Formats a string of given directionality for use in plain-text output of the context 363 * directionality, so an opposite-directionality string is neither garbled nor garbles its 364 * surroundings. This makes use of Unicode bidi formatting characters. 365 * <p> 366 * The algorithm: In case the given directionality doesn't match the context directionality, wraps 367 * the string with Unicode bidi formatting characters: RLE+{@code str}+PDF for RTL text, or 368 * LRE+{@code str}+PDF for LTR text. 369 * <p> 370 * If {@code isolate}, directionally isolates the string so that it does not garble its 371 * surroundings. Currently, this is done by "resetting" the directionality after the string by 372 * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when 373 * either the overall directionality or the exit directionality of the string is opposite to 374 * that of the context. Unless the formatter was built using 375 * {@link Builder#stereoReset(boolean)} with a {@code false} argument, also prepends a Unicode 376 * bidi mark matching the context directionality when either the overall directionality or the 377 * entry directionality of the string is opposite to that of the context. Note that as opposed 378 * to the overall directionality, the entry and exit directionalities are determined from the 379 * string itself. 380 * <p> 381 * Does *not* do HTML-escaping. 382 * 383 * @param str The input string. 384 * @param heuristic The algorithm to be used to estimate the string's overall direction. 385 * @param isolate Whether to directionally isolate the string to prevent it from garbling the 386 * content around it 387 * @return Input string after applying the above processing. {@code null} if {@code str} is 388 * {@code null}. 389 */ unicodeWrap(String str, TextDirectionHeuristicCompat heuristic, boolean isolate)390 public String unicodeWrap(String str, TextDirectionHeuristicCompat heuristic, boolean isolate) { 391 if (str == null) return null; 392 return unicodeWrap((CharSequence) str, heuristic, isolate).toString(); 393 } 394 395 /** 396 * Operates like {@link #unicodeWrap(String, 397 * androidx.core.text.TextDirectionHeuristicCompat, boolean)}, but takes a CharSequence 398 * instead of a string 399 * 400 * @param str The input CharSequence. 401 * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction. 402 * See {@link androidx.core.text.TextDirectionHeuristicsCompat} for pre-defined 403 * heuristics. 404 * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling 405 * the content around it 406 * @return Input CharSequence after applying the above processing. {@code null} if {@code str} 407 * is {@code null}. 408 */ unicodeWrap(CharSequence str, TextDirectionHeuristicCompat heuristic, boolean isolate)409 public CharSequence unicodeWrap(CharSequence str, TextDirectionHeuristicCompat heuristic, 410 boolean isolate) { 411 if (str == null) return null; 412 final boolean isRtl = heuristic.isRtl(str, 0, str.length()); 413 SpannableStringBuilder result = new SpannableStringBuilder(); 414 if (getStereoReset() && isolate) { 415 result.append(markBefore(str, 416 isRtl ? TextDirectionHeuristicsCompat.RTL : TextDirectionHeuristicsCompat.LTR)); 417 } 418 if (isRtl != mIsRtlContext) { 419 result.append(isRtl ? RLE : LRE); 420 result.append(str); 421 result.append(PDF); 422 } else { 423 result.append(str); 424 } 425 if (isolate) { 426 result.append(markAfter(str, 427 isRtl ? TextDirectionHeuristicsCompat.RTL : TextDirectionHeuristicsCompat.LTR)); 428 } 429 return result; 430 } 431 432 /** 433 * Operates like {@link #unicodeWrap(String, androidx.core.text.TextDirectionHeuristicCompat, boolean)}, but assumes 434 * {@code isolate} is true. 435 * 436 * @param str The input string. 437 * @param heuristic The algorithm to be used to estimate the string's overall direction. 438 * @return Input string after applying the above processing. 439 */ unicodeWrap(String str, TextDirectionHeuristicCompat heuristic)440 public String unicodeWrap(String str, TextDirectionHeuristicCompat heuristic) { 441 return unicodeWrap(str, heuristic, true /* isolate */); 442 } 443 444 /** 445 * Operates like {@link #unicodeWrap(CharSequence, 446 * androidx.core.text.TextDirectionHeuristicCompat, boolean)}, but assumes {@code isolate} 447 * is true. 448 * 449 * @param str The input CharSequence. 450 * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction. 451 * See {@link androidx.core.text.TextDirectionHeuristicsCompat} for pre-defined 452 * heuristics. 453 * @return Input CharSequence after applying the above processing. 454 */ unicodeWrap(CharSequence str, TextDirectionHeuristicCompat heuristic)455 public CharSequence unicodeWrap(CharSequence str, TextDirectionHeuristicCompat heuristic) { 456 return unicodeWrap(str, heuristic, true /* isolate */); 457 } 458 459 /** 460 * Operates like {@link #unicodeWrap(String, androidx.core.text.TextDirectionHeuristicCompat, boolean)}, but uses the 461 * formatter's default direction estimation algorithm. 462 * 463 * @param str The input string. 464 * @param isolate Whether to directionally isolate the string to prevent it from garbling the 465 * content around it 466 * @return Input string after applying the above processing. 467 */ unicodeWrap(String str, boolean isolate)468 public String unicodeWrap(String str, boolean isolate) { 469 return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, isolate); 470 } 471 472 /** 473 * Operates like {@link #unicodeWrap(CharSequence, 474 * androidx.core.text.TextDirectionHeuristicCompat, boolean)}, but uses the formatter's 475 * default direction estimation algorithm. 476 * 477 * @param str The input CharSequence. 478 * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling 479 * the content around it 480 * @return Input CharSequence after applying the above processing. 481 */ unicodeWrap(CharSequence str, boolean isolate)482 public CharSequence unicodeWrap(CharSequence str, boolean isolate) { 483 return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, isolate); 484 } 485 486 /** 487 * Operates like {@link #unicodeWrap(String, androidx.core.text.TextDirectionHeuristicCompat, boolean)}, but uses the 488 * formatter's default direction estimation algorithm and assumes {@code isolate} is true. 489 * 490 * @param str The input string. 491 * @return Input string after applying the above processing. 492 */ unicodeWrap(String str)493 public String unicodeWrap(String str) { 494 return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, true /* isolate */); 495 } 496 497 /** 498 * Operates like {@link #unicodeWrap(CharSequence, 499 * androidx.core.text.TextDirectionHeuristicCompat, boolean)}, but uses the formatter's 500 * default direction estimation algorithm and assumes {@code isolate} is true. 501 * 502 * @param str The input CharSequence. 503 * @return Input CharSequence after applying the above processing. 504 */ unicodeWrap(CharSequence str)505 public CharSequence unicodeWrap(CharSequence str) { 506 return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, true /* isolate */); 507 } 508 509 /** 510 * Helper method to return true if the Locale directionality is RTL. 511 * 512 * @param locale The Locale whose directionality will be checked to be RTL or LTR 513 * @return true if the {@code locale} directionality is RTL. False otherwise. 514 */ isRtlLocale(Locale locale)515 static boolean isRtlLocale(Locale locale) { 516 return (TextUtilsCompat.getLayoutDirectionFromLocale(locale) == ViewCompat.LAYOUT_DIRECTION_RTL); 517 } 518 519 /** 520 * Enum for directionality type. 521 */ 522 private static final int DIR_LTR = -1; 523 private static final int DIR_UNKNOWN = 0; 524 private static final int DIR_RTL = +1; 525 526 /** 527 * Returns the directionality of the last character with strong directionality in the string, or 528 * DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards from the end of 529 * the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a 530 * strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a 531 * string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. The intended use is to check 532 * whether a logically separate item that starts with a number or a character of the string's 533 * exit directionality and follows this string inline (not counting any neutral characters in 534 * between) would "stick" to it in an opposite-directionality context, thus being displayed in 535 * an incorrect position. An LRM or RLM character (the one of the context's directionality) 536 * between the two will prevent such sticking. 537 * 538 * @param str the string to check. 539 */ getExitDir(CharSequence str)540 private static int getExitDir(CharSequence str) { 541 return new DirectionalityEstimator(str, false /* isHtml */).getExitDir(); 542 } 543 544 /** 545 * Returns the directionality of the first character with strong directionality in the string, 546 * or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an 547 * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after 548 * RLE/RLO. The results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF 549 * characters. The intended use is to check whether a logically separate item that ends with a 550 * character of the string's entry directionality and precedes the string inline (not counting 551 * any neutral characters in between) would "stick" to it in an opposite-directionality context, 552 * thus being displayed in an incorrect position. An LRM or RLM character (the one of the 553 * context's directionality) between the two will prevent such sticking. 554 * 555 * @param str the string to check. 556 */ getEntryDir(CharSequence str)557 private static int getEntryDir(CharSequence str) { 558 return new DirectionalityEstimator(str, false /* isHtml */).getEntryDir(); 559 } 560 561 /** 562 * An object that estimates the directionality of a given string by various methods. 563 * 564 */ 565 private static class DirectionalityEstimator { 566 567 // Internal static variables and constants. 568 569 /** 570 * Size of the bidi character class cache. The results of the Character.getDirectionality() 571 * calls on the lowest DIR_TYPE_CACHE_SIZE codepoints are kept in an array for speed. 572 * The 0x700 value is designed to leave all the European and Near Eastern languages in the 573 * cache. It can be reduced to 0x180, restricting the cache to the Western European 574 * languages. 575 */ 576 private static final int DIR_TYPE_CACHE_SIZE = 0x700; 577 578 /** 579 * The bidi character class cache. 580 */ 581 private static final byte DIR_TYPE_CACHE[]; 582 583 static { 584 DIR_TYPE_CACHE = new byte[DIR_TYPE_CACHE_SIZE]; 585 for (int i = 0; i < DIR_TYPE_CACHE_SIZE; i++) { 586 DIR_TYPE_CACHE[i] = Character.getDirectionality(i); 587 } 588 } 589 590 // Internal instance variables. 591 592 /** 593 * The text to be scanned. 594 */ 595 private final CharSequence text; 596 597 /** 598 * Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and 599 * entities when looking for the next / preceding dir type. 600 */ 601 private final boolean isHtml; 602 603 /** 604 * The length of the text in chars. 605 */ 606 private final int length; 607 608 /** 609 * The current position in the text. 610 */ 611 private int charIndex; 612 613 /** 614 * The char encountered by the last dirTypeForward or dirTypeBackward call. If it 615 * encountered a supplementary codepoint, this contains a char that is not a valid 616 * codepoint. This is ok, because this member is only used to detect some well-known ASCII 617 * syntax, e.g. "http://" and the beginning of an HTML tag or entity. 618 */ 619 private char lastChar; 620 621 /** 622 * Constructor. 623 * 624 * @param text The string to scan. 625 * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over 626 * tags and entities. 627 */ DirectionalityEstimator(CharSequence text, boolean isHtml)628 DirectionalityEstimator(CharSequence text, boolean isHtml) { 629 this.text = text; 630 this.isHtml = isHtml; 631 length = text.length(); 632 } 633 634 /** 635 * Returns the directionality of the first character with strong directionality in the 636 * string, or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an 637 * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL 638 * after RLE/RLO. The results are undefined for a string containing unbalanced 639 * LRE/RLE/LRO/RLO/PDF characters. 640 */ getEntryDir()641 int getEntryDir() { 642 // The reason for this method name, as opposed to getFirstStrongDir(), is that 643 // "first strong" is a commonly used description of Unicode's estimation algorithm, 644 // but the two must treat formatting characters quite differently. Thus, we are staying 645 // away from both "first" and "last" in these method names to avoid confusion. 646 charIndex = 0; 647 int embeddingLevel = 0; 648 int embeddingLevelDir = DIR_UNKNOWN; 649 int firstNonEmptyEmbeddingLevel = 0; 650 while (charIndex < length && firstNonEmptyEmbeddingLevel == 0) { 651 switch (dirTypeForward()) { 652 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: 653 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: 654 ++embeddingLevel; 655 embeddingLevelDir = DIR_LTR; 656 break; 657 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: 658 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: 659 ++embeddingLevel; 660 embeddingLevelDir = DIR_RTL; 661 break; 662 case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT: 663 --embeddingLevel; 664 // To restore embeddingLevelDir to its previous value, we would need a 665 // stack, which we want to avoid. Thus, at this point we do not know the 666 // current embedding's directionality. 667 embeddingLevelDir = DIR_UNKNOWN; 668 break; 669 case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL: 670 break; 671 case Character.DIRECTIONALITY_LEFT_TO_RIGHT: 672 if (embeddingLevel == 0) { 673 return DIR_LTR; 674 } 675 firstNonEmptyEmbeddingLevel = embeddingLevel; 676 break; 677 case Character.DIRECTIONALITY_RIGHT_TO_LEFT: 678 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC: 679 if (embeddingLevel == 0) { 680 return DIR_RTL; 681 } 682 firstNonEmptyEmbeddingLevel = embeddingLevel; 683 break; 684 default: 685 firstNonEmptyEmbeddingLevel = embeddingLevel; 686 break; 687 } 688 } 689 690 // We have either found a non-empty embedding or scanned the entire string finding 691 // neither a non-empty embedding nor a strong character outside of an embedding. 692 if (firstNonEmptyEmbeddingLevel == 0) { 693 // We have not found a non-empty embedding. Thus, the string contains neither a 694 // non-empty embedding nor a strong character outside of an embedding. 695 return DIR_UNKNOWN; 696 } 697 698 // We have found a non-empty embedding. 699 if (embeddingLevelDir != DIR_UNKNOWN) { 700 // We know the directionality of the non-empty embedding. 701 return embeddingLevelDir; 702 } 703 704 // We do not remember the directionality of the non-empty embedding we found. So, we go 705 // backwards to find the start of the non-empty embedding and get its directionality. 706 while (charIndex > 0) { 707 switch (dirTypeBackward()) { 708 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: 709 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: 710 if (firstNonEmptyEmbeddingLevel == embeddingLevel) { 711 return DIR_LTR; 712 } 713 --embeddingLevel; 714 break; 715 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: 716 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: 717 if (firstNonEmptyEmbeddingLevel == embeddingLevel) { 718 return DIR_RTL; 719 } 720 --embeddingLevel; 721 break; 722 case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT: 723 ++embeddingLevel; 724 break; 725 } 726 } 727 // We should never get here. 728 return DIR_UNKNOWN; 729 } 730 731 /** 732 * Returns the directionality of the last character with strong directionality in the 733 * string, or DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards 734 * from the end of the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its 735 * matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results 736 * are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. 737 */ getExitDir()738 int getExitDir() { 739 // The reason for this method name, as opposed to getLastStrongDir(), is that "last 740 // strong" sounds like the exact opposite of "first strong", which is a commonly used 741 // description of Unicode's estimation algorithm (getUnicodeDir() above), but the two 742 // must treat formatting characters quite differently. Thus, we are staying away from 743 // both "first" and "last" in these method names to avoid confusion. 744 charIndex = length; 745 int embeddingLevel = 0; 746 int lastNonEmptyEmbeddingLevel = 0; 747 while (charIndex > 0) { 748 switch (dirTypeBackward()) { 749 case Character.DIRECTIONALITY_LEFT_TO_RIGHT: 750 if (embeddingLevel == 0) { 751 return DIR_LTR; 752 } 753 if (lastNonEmptyEmbeddingLevel == 0) { 754 lastNonEmptyEmbeddingLevel = embeddingLevel; 755 } 756 break; 757 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: 758 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: 759 if (lastNonEmptyEmbeddingLevel == embeddingLevel) { 760 return DIR_LTR; 761 } 762 --embeddingLevel; 763 break; 764 case Character.DIRECTIONALITY_RIGHT_TO_LEFT: 765 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC: 766 if (embeddingLevel == 0) { 767 return DIR_RTL; 768 } 769 if (lastNonEmptyEmbeddingLevel == 0) { 770 lastNonEmptyEmbeddingLevel = embeddingLevel; 771 } 772 break; 773 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: 774 case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: 775 if (lastNonEmptyEmbeddingLevel == embeddingLevel) { 776 return DIR_RTL; 777 } 778 --embeddingLevel; 779 break; 780 case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT: 781 ++embeddingLevel; 782 break; 783 case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL: 784 break; 785 default: 786 if (lastNonEmptyEmbeddingLevel == 0) { 787 lastNonEmptyEmbeddingLevel = embeddingLevel; 788 } 789 break; 790 } 791 } 792 return DIR_UNKNOWN; 793 } 794 795 // Internal methods 796 797 /** 798 * Gets the bidi character class, i.e. Character.getDirectionality(), of a given char, using 799 * a cache for speed. Not designed for supplementary codepoints, whose results we do not 800 * cache. 801 */ getCachedDirectionality(char c)802 private static byte getCachedDirectionality(char c) { 803 return c < DIR_TYPE_CACHE_SIZE ? DIR_TYPE_CACHE[c] : Character.getDirectionality(c); 804 } 805 806 /** 807 * Returns the Character.DIRECTIONALITY_... value of the next codepoint and advances 808 * charIndex. If isHtml, and the codepoint is '<' or '&', advances through the tag/entity, 809 * and returns Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to 810 * figure out the actual character, and return its dirtype, but treating it as whitespace is 811 * good enough for our purposes. 812 * 813 * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0. 814 */ dirTypeForward()815 byte dirTypeForward() { 816 lastChar = text.charAt(charIndex); 817 if (Character.isHighSurrogate(lastChar)) { 818 int codePoint = Character.codePointAt(text, charIndex); 819 charIndex += Character.charCount(codePoint); 820 return Character.getDirectionality(codePoint); 821 } 822 charIndex++; 823 byte dirType = getCachedDirectionality(lastChar); 824 if (isHtml) { 825 // Process tags and entities. 826 if (lastChar == '<') { 827 dirType = skipTagForward(); 828 } else if (lastChar == '&') { 829 dirType = skipEntityForward(); 830 } 831 } 832 return dirType; 833 } 834 835 /** 836 * Returns the Character.DIRECTIONALITY_... value of the preceding codepoint and advances 837 * charIndex backwards. If isHtml, and the codepoint is the end of a complete HTML tag or 838 * entity, advances over the whole tag/entity and returns 839 * Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to figure out the 840 * actual character, and return its dirtype, but treating it as whitespace is good enough 841 * for our purposes. 842 * 843 * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0. 844 */ dirTypeBackward()845 byte dirTypeBackward() { 846 lastChar = text.charAt(charIndex - 1); 847 if (Character.isLowSurrogate(lastChar)) { 848 int codePoint = Character.codePointBefore(text, charIndex); 849 charIndex -= Character.charCount(codePoint); 850 return Character.getDirectionality(codePoint); 851 } 852 charIndex--; 853 byte dirType = getCachedDirectionality(lastChar); 854 if (isHtml) { 855 // Process tags and entities. 856 if (lastChar == '>') { 857 dirType = skipTagBackward(); 858 } else if (lastChar == ';') { 859 dirType = skipEntityBackward(); 860 } 861 } 862 return dirType; 863 } 864 865 /** 866 * Advances charIndex forward through an HTML tag (after the opening < has already been 867 * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching >, 868 * does not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the 869 * < that hadn't been part of a tag after all). 870 */ skipTagForward()871 private byte skipTagForward() { 872 int initialCharIndex = charIndex; 873 while (charIndex < length) { 874 lastChar = text.charAt(charIndex++); 875 if (lastChar == '>') { 876 // The end of the tag. 877 return Character.DIRECTIONALITY_WHITESPACE; 878 } 879 if (lastChar == '"' || lastChar == '\'') { 880 // Skip over a quoted attribute value inside the tag. 881 char quote = lastChar; 882 while (charIndex < length && (lastChar = text.charAt(charIndex++)) != quote) {} 883 } 884 } 885 // The original '<' wasn't the start of a tag after all. 886 charIndex = initialCharIndex; 887 lastChar = '<'; 888 return Character.DIRECTIONALITY_OTHER_NEUTRALS; 889 } 890 891 /** 892 * Advances charIndex backward through an HTML tag (after the closing > has already been 893 * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching <, does 894 * not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the > 895 * that hadn't been part of a tag after all). Nevertheless, the running time for calling 896 * skipTagBackward() in a loop remains linear in the size of the text, even for a text like 897 * ">>>>", because skipTagBackward() also stops looking for a matching < 898 * when it encounters another >. 899 */ skipTagBackward()900 private byte skipTagBackward() { 901 int initialCharIndex = charIndex; 902 while (charIndex > 0) { 903 lastChar = text.charAt(--charIndex); 904 if (lastChar == '<') { 905 // The start of the tag. 906 return Character.DIRECTIONALITY_WHITESPACE; 907 } 908 if (lastChar == '>') { 909 break; 910 } 911 if (lastChar == '"' || lastChar == '\'') { 912 // Skip over a quoted attribute value inside the tag. 913 char quote = lastChar; 914 while (charIndex > 0 && (lastChar = text.charAt(--charIndex)) != quote) {} 915 } 916 } 917 // The original '>' wasn't the end of a tag after all. 918 charIndex = initialCharIndex; 919 lastChar = '>'; 920 return Character.DIRECTIONALITY_OTHER_NEUTRALS; 921 } 922 923 /** 924 * Advances charIndex forward through an HTML character entity tag (after the opening 925 * & has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be 926 * best to figure out the actual character and return its dirtype, but this is good enough. 927 */ skipEntityForward()928 private byte skipEntityForward() { 929 while (charIndex < length && (lastChar = text.charAt(charIndex++)) != ';') {} 930 return Character.DIRECTIONALITY_WHITESPACE; 931 } 932 933 /** 934 * Advances charIndex backward through an HTML character entity tag (after the closing ; 935 * has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be best 936 * to figure out the actual character and return its dirtype, but this is good enough. 937 * If there is no matching &, does not change charIndex and returns 938 * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the ';' that did not start an entity after 939 * all). Nevertheless, the running time for calling skipEntityBackward() in a loop remains 940 * linear in the size of the text, even for a text like ";;;;;;;", because skipTagBackward() 941 * also stops looking for a matching & when it encounters another ;. 942 */ skipEntityBackward()943 private byte skipEntityBackward() { 944 int initialCharIndex = charIndex; 945 while (charIndex > 0) { 946 lastChar = text.charAt(--charIndex); 947 if (lastChar == '&') { 948 return Character.DIRECTIONALITY_WHITESPACE; 949 } 950 if (lastChar == ';') { 951 break; 952 } 953 } 954 charIndex = initialCharIndex; 955 lastChar = ';'; 956 return Character.DIRECTIONALITY_OTHER_NEUTRALS; 957 } 958 } 959 } 960