1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package android.text;
18 
19 import static android.text.TextDirectionHeuristics.FIRSTSTRONG_LTR;
20 
21 import android.annotation.Nullable;
22 import android.view.View;
23 
24 import java.util.Locale;
25 
26 /**
27  * Utility class for formatting text for display in a potentially opposite-directionality context
28  * without garbling. The directionality of the context is set at formatter creation and the
29  * directionality of the text can be either estimated or passed in when known.
30  *
31  * <p>To support versions lower than {@link android.os.Build.VERSION_CODES#JELLY_BEAN_MR2},
32  * you can use the support library's {@link android.support.v4.text.BidiFormatter} class.
33  *
 * <p>These APIs provide the following functionality:
35  * <p>
36  * 1. Bidi Wrapping
37  * When text in one language is mixed into a document in another, opposite-directionality language,
38  * e.g. when an English business name is embedded in some Hebrew text, both the inserted string
39  * and the text surrounding it may be displayed incorrectly unless the inserted string is explicitly
40  * separated from the surrounding text in a "wrapper" that:
41  * <p>
42  * - Declares its directionality so that the string is displayed correctly. This can be done in
43  *   Unicode bidi formatting codes by {@link #unicodeWrap} and similar methods.
44  * <p>
45  * - Isolates the string's directionality, so it does not unduly affect the surrounding content.
46  *   Currently, this can only be done using invisible Unicode characters of the same direction as
47  *   the context (LRM or RLM) in addition to the directionality declaration above, thus "resetting"
48  *   the directionality to that of the context. The "reset" may need to be done at both ends of the
49  *   string. Without "reset" after the string, the string will "stick" to a number or logically
50  *   separate opposite-direction text that happens to follow it in-line (even if separated by
51  *   neutral content like spaces and punctuation). Without "reset" before the string, the same can
52  *   happen there, but only with more opposite-direction text, not a number. One approach is to
53  *   "reset" the direction only after each string, on the theory that if the preceding opposite-
54  *   direction text is itself bidi-wrapped, the "reset" after it will prevent the sticking. (Doing
55  *   the "reset" only before each string definitely does not work because we do not want to require
56  *   bidi-wrapping numbers, and a bidi-wrapped opposite-direction string could be followed by a
57  *   number.) Still, the safest policy is to do the "reset" on both ends of each string, since RTL
58  *   message translations often contain untranslated Latin-script brand names and technical terms,
59  *   and one of these can be followed by a bidi-wrapped inserted value. On the other hand, when one
60  *   has such a message, it is best to do the "reset" manually in the message translation itself,
61  *   since the message's opposite-direction text could be followed by an inserted number, which we
62  *   would not bidi-wrap anyway. Thus, "reset" only after the string is the current default. In an
63  *   alternative to "reset", recent additions to the HTML, CSS, and Unicode standards allow the
64  *   isolation to be part of the directionality declaration. This form of isolation is better than
65  *   "reset" because it takes less space, does not require knowing the context directionality, has a
66  *   gentler effect than "reset", and protects both ends of the string. However, we do not yet allow
67  *   using it because required platforms do not yet support it.
68  * <p>
69  * Providing these wrapping services is the basic purpose of the bidi formatter.
70  * <p>
71  * 2. Directionality estimation
72  * How does one know whether a string about to be inserted into surrounding text has the same
73  * directionality? Well, in many cases, one knows that this must be the case when writing the code
74  * doing the insertion, e.g. when a localized message is inserted into a localized page. In such
75  * cases there is no need to involve the bidi formatter at all. In some other cases, it need not be
76  * the same as the context, but is either constant (e.g. urls are always LTR) or otherwise known.
77  * In the remaining cases, e.g. when the string is user-entered or comes from a database, the
78  * language of the string (and thus its directionality) is not known a priori, and must be
79  * estimated at run-time. The bidi formatter can do this automatically using the default
80  * first-strong estimation algorithm. It can also be configured to use a custom directionality
81  * estimation object.
82  */
83 public final class BidiFormatter {
84 
85     /**
86      * The default text direction heuristic.
87      */
88     private static TextDirectionHeuristic DEFAULT_TEXT_DIRECTION_HEURISTIC = FIRSTSTRONG_LTR;
89 
90     /**
91      * Unicode "Left-To-Right Embedding" (LRE) character.
92      */
93     private static final char LRE = '\u202A';
94 
95     /**
96      * Unicode "Right-To-Left Embedding" (RLE) character.
97      */
98     private static final char RLE = '\u202B';
99 
100     /**
101      * Unicode "Pop Directional Formatting" (PDF) character.
102      */
103     private static final char PDF = '\u202C';
104 
105     /**
106      *  Unicode "Left-To-Right Mark" (LRM) character.
107      */
108     private static final char LRM = '\u200E';
109 
110     /*
111      * Unicode "Right-To-Left Mark" (RLM) character.
112      */
113     private static final char RLM = '\u200F';
114 
115     /*
116      * String representation of LRM
117      */
118     private static final String LRM_STRING = Character.toString(LRM);
119 
120     /*
121      * String representation of RLM
122      */
123     private static final String RLM_STRING = Character.toString(RLM);
124 
125     /**
126      * Empty string constant.
127      */
128     private static final String EMPTY_STRING = "";
129 
130     /**
131      * A class for building a BidiFormatter with non-default options.
132      */
133     public static final class Builder {
134         private boolean mIsRtlContext;
135         private int mFlags;
136         private TextDirectionHeuristic mTextDirectionHeuristic;
137 
138         /**
139          * Constructor.
140          *
141          */
Builder()142         public Builder() {
143             initialize(isRtlLocale(Locale.getDefault()));
144         }
145 
146         /**
147          * Constructor.
148          *
149          * @param rtlContext Whether the context directionality is RTL.
150          */
Builder(boolean rtlContext)151         public Builder(boolean rtlContext) {
152             initialize(rtlContext);
153         }
154 
155         /**
156          * Constructor.
157          *
158          * @param locale The context locale.
159          */
Builder(Locale locale)160         public Builder(Locale locale) {
161             initialize(isRtlLocale(locale));
162         }
163 
164         /**
165          * Initializes the builder with the given context directionality and default options.
166          *
167          * @param isRtlContext Whether the context is RTL or not.
168          */
initialize(boolean isRtlContext)169         private void initialize(boolean isRtlContext) {
170             mIsRtlContext = isRtlContext;
171             mTextDirectionHeuristic = DEFAULT_TEXT_DIRECTION_HEURISTIC;
172             mFlags = DEFAULT_FLAGS;
173         }
174 
175         /**
176          * Specifies whether the BidiFormatter to be built should also "reset" directionality before
177          * a string being bidi-wrapped, not just after it. The default is true.
178          */
stereoReset(boolean stereoReset)179         public Builder stereoReset(boolean stereoReset) {
180             if (stereoReset) {
181                 mFlags |= FLAG_STEREO_RESET;
182             } else {
183                 mFlags &= ~FLAG_STEREO_RESET;
184             }
185             return this;
186         }
187 
188         /**
189          * Specifies the default directionality estimation algorithm to be used by the BidiFormatter.
190          * By default, uses the first-strong heuristic.
191          *
192          * @param heuristic the {@code TextDirectionHeuristic} to use.
193          * @return the builder itself.
194          */
setTextDirectionHeuristic(TextDirectionHeuristic heuristic)195         public Builder setTextDirectionHeuristic(TextDirectionHeuristic heuristic) {
196             mTextDirectionHeuristic = heuristic;
197             return this;
198         }
199 
200         /**
201          * @return A BidiFormatter with the specified options.
202          */
build()203         public BidiFormatter build() {
204             if (mFlags == DEFAULT_FLAGS &&
205                     mTextDirectionHeuristic == DEFAULT_TEXT_DIRECTION_HEURISTIC) {
206                 return BidiFormatter.getDefaultInstanceFromContext(mIsRtlContext);
207             }
208             return new BidiFormatter(mIsRtlContext, mFlags, mTextDirectionHeuristic);
209         }
210     }
211 
212     //
213     private static final int FLAG_STEREO_RESET = 2;
214     private static final int DEFAULT_FLAGS = FLAG_STEREO_RESET;
215 
216     private static final BidiFormatter DEFAULT_LTR_INSTANCE = new BidiFormatter(
217             false /* LTR context */,
218             DEFAULT_FLAGS,
219             DEFAULT_TEXT_DIRECTION_HEURISTIC);
220 
221     private static final BidiFormatter DEFAULT_RTL_INSTANCE = new BidiFormatter(
222             true /* RTL context */,
223             DEFAULT_FLAGS,
224             DEFAULT_TEXT_DIRECTION_HEURISTIC);
225 
226     private final boolean mIsRtlContext;
227     private final int mFlags;
228     private final TextDirectionHeuristic mDefaultTextDirectionHeuristic;
229 
230     /**
231      * Factory for creating an instance of BidiFormatter for the default locale directionality.
232      *
233      * This does not create any new objects, and returns already existing static instances.
234      *
235      */
getInstance()236     public static BidiFormatter getInstance() {
237         return getDefaultInstanceFromContext(isRtlLocale(Locale.getDefault()));
238     }
239 
240     /**
241      * Factory for creating an instance of BidiFormatter given the context directionality.
242      *
243      * This does not create any new objects, and returns already existing static instances.
244      *
245      * @param rtlContext Whether the context directionality is RTL.
246      */
getInstance(boolean rtlContext)247     public static BidiFormatter getInstance(boolean rtlContext) {
248         return getDefaultInstanceFromContext(rtlContext);
249     }
250 
251     /**
252      * Factory for creating an instance of BidiFormatter given the context locale.
253      *
254      * This does not create any new objects, and returns already existing static instances.
255      *
256      * @param locale The context locale.
257      */
getInstance(Locale locale)258     public static BidiFormatter getInstance(Locale locale) {
259         return getDefaultInstanceFromContext(isRtlLocale(locale));
260     }
261 
262     /**
263      * @param isRtlContext Whether the context directionality is RTL or not.
264      * @param flags The option flags.
265      * @param heuristic The default text direction heuristic.
266      */
BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristic heuristic)267     private BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristic heuristic) {
268         mIsRtlContext = isRtlContext;
269         mFlags = flags;
270         mDefaultTextDirectionHeuristic = heuristic;
271     }
272 
273     /**
274      * @return Whether the context directionality is RTL
275      */
isRtlContext()276     public boolean isRtlContext() {
277         return mIsRtlContext;
278     }
279 
280     /**
281      * @return Whether directionality "reset" should also be done before a string being
282      * bidi-wrapped, not just after it.
283      */
getStereoReset()284     public boolean getStereoReset() {
285         return (mFlags & FLAG_STEREO_RESET) != 0;
286     }
287 
288     /**
289      * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
290      * overall or the exit directionality of a given string is opposite to the context directionality.
291      * Putting this after the string (including its directionality declaration wrapping) prevents it
292      * from "sticking" to other opposite-directionality text or a number appearing after it inline
293      * with only neutral content in between. Otherwise returns the empty string. While the exit
294      * directionality is determined by scanning the end of the string, the overall directionality is
295      * given explicitly by a heuristic to estimate the {@code str}'s directionality.
296      *
297      * @param str CharSequence after which the mark may need to appear.
298      * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
299      *                  directionality.
300      * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
301      *     else, the empty string.
302      *
303      * @hide
304      */
markAfter(CharSequence str, TextDirectionHeuristic heuristic)305     public String markAfter(CharSequence str, TextDirectionHeuristic heuristic) {
306         final boolean isRtl = heuristic.isRtl(str, 0, str.length());
307         // getExitDir() is called only if needed (short-circuit).
308         if (!mIsRtlContext && (isRtl || getExitDir(str) == DIR_RTL)) {
309             return LRM_STRING;
310         }
311         if (mIsRtlContext && (!isRtl || getExitDir(str) == DIR_LTR)) {
312             return RLM_STRING;
313         }
314         return EMPTY_STRING;
315     }
316 
317     /**
318      * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
319      * overall or the entry directionality of a given string is opposite to the context
320      * directionality. Putting this before the string (including its directionality declaration
321      * wrapping) prevents it from "sticking" to other opposite-directionality text appearing before
322      * it inline with only neutral content in between. Otherwise returns the empty string. While the
323      * entry directionality is determined by scanning the beginning of the string, the overall
324      * directionality is given explicitly by a heuristic to estimate the {@code str}'s directionality.
325      *
326      * @param str CharSequence before which the mark may need to appear.
327      * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
328      *                  directionality.
329      * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
330      *     else, the empty string.
331      *
332      * @hide
333      */
markBefore(CharSequence str, TextDirectionHeuristic heuristic)334     public String markBefore(CharSequence str, TextDirectionHeuristic heuristic) {
335         final boolean isRtl = heuristic.isRtl(str, 0, str.length());
336         // getEntryDir() is called only if needed (short-circuit).
337         if (!mIsRtlContext && (isRtl || getEntryDir(str) == DIR_RTL)) {
338             return LRM_STRING;
339         }
340         if (mIsRtlContext && (!isRtl || getEntryDir(str) == DIR_LTR)) {
341             return RLM_STRING;
342         }
343         return EMPTY_STRING;
344     }
345 
346     /**
347      * Estimates the directionality of a string using the default text direction heuristic.
348      *
349      * @param str String whose directionality is to be estimated.
350      * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns
351      *          false.
352      */
isRtl(String str)353     public boolean isRtl(String str) {
354         return isRtl((CharSequence) str);
355     }
356 
357     /**
358      * Operates like {@link #isRtl(String)}, but takes a CharSequence instead of a string
359      *
360      * @param str CharSequence whose directionality is to be estimated.
361      * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns
362      *          false.
363      */
isRtl(CharSequence str)364     public boolean isRtl(CharSequence str) {
365         return mDefaultTextDirectionHeuristic.isRtl(str, 0, str.length());
366     }
367 
368     /**
369      * Formats a string of given directionality for use in plain-text output of the context
370      * directionality, so an opposite-directionality string is neither garbled nor garbles its
371      * surroundings. This makes use of Unicode bidi formatting characters.
372      * <p>
373      * The algorithm: In case the given directionality doesn't match the context directionality, wraps
374      * the string with Unicode bidi formatting characters: RLE+{@code str}+PDF for RTL text, or
375      * LRE+{@code str}+PDF for LTR text.
376      * <p>
377      * If {@code isolate}, directionally isolates the string so that it does not garble its
378      * surroundings. Currently, this is done by "resetting" the directionality after the string by
379      * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when
380      * either the overall directionality or the exit directionality of the string is opposite to
381      * that of the context. Unless the formatter was built using
382      * {@link Builder#stereoReset(boolean)} with a {@code false} argument, also prepends a Unicode
383      * bidi mark matching the context directionality when either the overall directionality or the
384      * entry directionality of the string is opposite to that of the context. Note that as opposed
385      * to the overall directionality, the entry and exit directionalities are determined from the
386      * string itself.
387      * <p>
388      * Does *not* do HTML-escaping.
389      *
390      * @param str The input string.
391      * @param heuristic The algorithm to be used to estimate the string's overall direction.
392      *        See {@link TextDirectionHeuristics} for pre-defined heuristics.
393      * @param isolate Whether to directionally isolate the string to prevent it from garbling the
394      *     content around it
395      * @return Input string after applying the above processing. {@code null} if {@code str} is
396      *     {@code null}.
397      */
unicodeWrap(@ullable String str, TextDirectionHeuristic heuristic, boolean isolate)398     public @Nullable String unicodeWrap(@Nullable String str, TextDirectionHeuristic heuristic,
399             boolean isolate) {
400         if (str == null) return null;
401         return unicodeWrap((CharSequence) str, heuristic, isolate).toString();
402     }
403 
404     /**
405      * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but takes a
406      * CharSequence instead of a string
407      *
408      * @param str The input CharSequence.
409      * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction.
410      *        See {@link TextDirectionHeuristics} for pre-defined heuristics.
411      * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling
412      *     the content around it
413      * @return Input CharSequence after applying the above processing. {@code null} if {@code str}
414      *     is {@code null}.
415      */
unicodeWrap(@ullable CharSequence str, TextDirectionHeuristic heuristic, boolean isolate)416     public @Nullable CharSequence unicodeWrap(@Nullable CharSequence str,
417             TextDirectionHeuristic heuristic, boolean isolate) {
418         if (str == null) return null;
419         final boolean isRtl = heuristic.isRtl(str, 0, str.length());
420         SpannableStringBuilder result = new SpannableStringBuilder();
421         if (getStereoReset() && isolate) {
422             result.append(markBefore(str,
423                     isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR));
424         }
425         if (isRtl != mIsRtlContext) {
426             result.append(isRtl ? RLE : LRE);
427             result.append(str);
428             result.append(PDF);
429         } else {
430             result.append(str);
431         }
432         if (isolate) {
433             result.append(markAfter(str,
434                     isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR));
435         }
436         return result;
437     }
438 
439     /**
440      * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but assumes
441      * {@code isolate} is true.
442      *
443      * @param str The input string.
444      * @param heuristic The algorithm to be used to estimate the string's overall direction.
445      *        See {@link TextDirectionHeuristics} for pre-defined heuristics.
446      * @return Input string after applying the above processing.
447      */
unicodeWrap(String str, TextDirectionHeuristic heuristic)448     public String unicodeWrap(String str, TextDirectionHeuristic heuristic) {
449         return unicodeWrap(str, heuristic, true /* isolate */);
450     }
451 
452     /**
453      * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but
454      * assumes {@code isolate} is true.
455      *
456      * @param str The input CharSequence.
457      * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction.
458      *        See {@link TextDirectionHeuristics} for pre-defined heuristics.
459      * @return Input CharSequence after applying the above processing.
460      */
unicodeWrap(CharSequence str, TextDirectionHeuristic heuristic)461     public CharSequence unicodeWrap(CharSequence str, TextDirectionHeuristic heuristic) {
462         return unicodeWrap(str, heuristic, true /* isolate */);
463     }
464 
465 
466     /**
467      * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the
468      * formatter's default direction estimation algorithm.
469      *
470      * @param str The input string.
471      * @param isolate Whether to directionally isolate the string to prevent it from garbling the
472      *     content around it
473      * @return Input string after applying the above processing.
474      */
unicodeWrap(String str, boolean isolate)475     public String unicodeWrap(String str, boolean isolate) {
476         return unicodeWrap(str, mDefaultTextDirectionHeuristic, isolate);
477     }
478 
479     /**
480      * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but uses
481      * the formatter's default direction estimation algorithm.
482      *
483      * @param str The input CharSequence.
484      * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling
485      *     the content around it
486      * @return Input CharSequence after applying the above processing.
487      */
unicodeWrap(CharSequence str, boolean isolate)488     public CharSequence unicodeWrap(CharSequence str, boolean isolate) {
489         return unicodeWrap(str, mDefaultTextDirectionHeuristic, isolate);
490     }
491 
492     /**
493      * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the
494      * formatter's default direction estimation algorithm and assumes {@code isolate} is true.
495      *
496      * @param str The input string.
497      * @return Input string after applying the above processing.
498      */
unicodeWrap(String str)499     public String unicodeWrap(String str) {
500         return unicodeWrap(str, mDefaultTextDirectionHeuristic, true /* isolate */);
501     }
502 
503     /**
504      * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but uses
505      * the formatter's default direction estimation algorithm and assumes {@code isolate} is true.
506      *
507      * @param str The input CharSequence.
508      * @return Input CharSequence after applying the above processing.
509      */
unicodeWrap(CharSequence str)510     public CharSequence unicodeWrap(CharSequence str) {
511         return unicodeWrap(str, mDefaultTextDirectionHeuristic, true /* isolate */);
512     }
513 
getDefaultInstanceFromContext(boolean isRtlContext)514     private static BidiFormatter getDefaultInstanceFromContext(boolean isRtlContext) {
515         return isRtlContext ? DEFAULT_RTL_INSTANCE : DEFAULT_LTR_INSTANCE;
516     }
517 
518     /**
519      * Helper method to return true if the Locale directionality is RTL.
520      *
521      * @param locale The Locale whose directionality will be checked to be RTL or LTR
522      * @return true if the {@code locale} directionality is RTL. False otherwise.
523      */
isRtlLocale(Locale locale)524     private static boolean isRtlLocale(Locale locale) {
525         return (TextUtils.getLayoutDirectionFromLocale(locale) == View.LAYOUT_DIRECTION_RTL);
526     }
527 
528     /**
529      * Enum for directionality type.
530      */
531     private static final int DIR_LTR = -1;
532     private static final int DIR_UNKNOWN = 0;
533     private static final int DIR_RTL = +1;
534 
535     /**
536      * Returns the directionality of the last character with strong directionality in the string, or
537      * DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards from the end of
538      * the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a
539      * strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a
540      * string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. The intended use is to check
541      * whether a logically separate item that starts with a number or a character of the string's
542      * exit directionality and follows this string inline (not counting any neutral characters in
543      * between) would "stick" to it in an opposite-directionality context, thus being displayed in
544      * an incorrect position. An LRM or RLM character (the one of the context's directionality)
545      * between the two will prevent such sticking.
546      *
547      * @param str the string to check.
548      */
getExitDir(CharSequence str)549     private static int getExitDir(CharSequence str) {
550         return new DirectionalityEstimator(str, false /* isHtml */).getExitDir();
551     }
552 
553     /**
554      * Returns the directionality of the first character with strong directionality in the string,
555      * or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
556      * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after
557      * RLE/RLO. The results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF
558      * characters. The intended use is to check whether a logically separate item that ends with a
559      * character of the string's entry directionality and precedes the string inline (not counting
560      * any neutral characters in between) would "stick" to it in an opposite-directionality context,
561      * thus being displayed in an incorrect position. An LRM or RLM character (the one of the
562      * context's directionality) between the two will prevent such sticking.
563      *
564      * @param str the string to check.
565      */
getEntryDir(CharSequence str)566     private static int getEntryDir(CharSequence str) {
567         return new DirectionalityEstimator(str, false /* isHtml */).getEntryDir();
568     }
569 
570     /**
571      * An object that estimates the directionality of a given string by various methods.
572      *
573      */
574     private static class DirectionalityEstimator {
575 
576         // Internal static variables and constants.
577 
578         /**
579          * Size of the bidi character class cache. The results of the Character.getDirectionality()
580          * calls on the lowest DIR_TYPE_CACHE_SIZE codepoints are kept in an array for speed.
581          * The 0x700 value is designed to leave all the European and Near Eastern languages in the
582          * cache. It can be reduced to 0x180, restricting the cache to the Western European
583          * languages.
584          */
585         private static final int DIR_TYPE_CACHE_SIZE = 0x700;
586 
587         /**
588          * The bidi character class cache.
589          */
590         private static final byte DIR_TYPE_CACHE[];
591 
592         static {
593             DIR_TYPE_CACHE = new byte[DIR_TYPE_CACHE_SIZE];
594             for (int i = 0; i < DIR_TYPE_CACHE_SIZE; i++) {
595                 // Calling Character.getDirectionality() is OK here, since new emojis start after
596                 // the end of our cache.
597                 DIR_TYPE_CACHE[i] = Character.getDirectionality(i);
598             }
599         }
600 
getDirectionality(int codePoint)601         private static byte getDirectionality(int codePoint) {
602             if (Emoji.isNewEmoji(codePoint)) {
603                 // TODO: Fix or remove once emoji-data.text 5.0 is in ICU or update to 6.0.
604                 return Character.DIRECTIONALITY_OTHER_NEUTRALS;
605             } else {
606                 return Character.getDirectionality(codePoint);
607             }
608         }
609 
610         // Internal instance variables.
611 
612         /**
613          * The text to be scanned.
614          */
615         private final CharSequence text;
616 
617         /**
618          * Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and
619          * entities when looking for the next / preceding dir type.
620          */
621         private final boolean isHtml;
622 
623         /**
624          * The length of the text in chars.
625          */
626         private final int length;
627 
628         /**
629          * The current position in the text.
630          */
631         private int charIndex;
632 
633         /**
634          * The char encountered by the last dirTypeForward or dirTypeBackward call. If it
635          * encountered a supplementary codepoint, this contains a char that is not a valid
636          * codepoint. This is ok, because this member is only used to detect some well-known ASCII
637          * syntax, e.g. "http://" and the beginning of an HTML tag or entity.
638          */
639         private char lastChar;
640 
641         /**
642          * Constructor.
643          *
644          * @param text The string to scan.
645          * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over
646          *     tags and entities.
647          */
DirectionalityEstimator(CharSequence text, boolean isHtml)648         DirectionalityEstimator(CharSequence text, boolean isHtml) {
649             this.text = text;
650             this.isHtml = isHtml;
651             length = text.length();
652         }
653 
654         /**
655          * Returns the directionality of the first character with strong directionality in the
656          * string, or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
657          * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL
658          * after RLE/RLO. The results are undefined for a string containing unbalanced
659          * LRE/RLE/LRO/RLO/PDF characters.
660          */
getEntryDir()661         int getEntryDir() {
662             // The reason for this method name, as opposed to getFirstStrongDir(), is that
663             // "first strong" is a commonly used description of Unicode's estimation algorithm,
664             // but the two must treat formatting characters quite differently. Thus, we are staying
665             // away from both "first" and "last" in these method names to avoid confusion.
666             charIndex = 0;
667             int embeddingLevel = 0;
668             int embeddingLevelDir = DIR_UNKNOWN;
669             int firstNonEmptyEmbeddingLevel = 0;
670             while (charIndex < length && firstNonEmptyEmbeddingLevel == 0) {
671                 switch (dirTypeForward()) {
672                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
673                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
674                         ++embeddingLevel;
675                         embeddingLevelDir = DIR_LTR;
676                         break;
677                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
678                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
679                         ++embeddingLevel;
680                         embeddingLevelDir = DIR_RTL;
681                         break;
682                     case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
683                         --embeddingLevel;
684                         // To restore embeddingLevelDir to its previous value, we would need a
685                         // stack, which we want to avoid. Thus, at this point we do not know the
686                         // current embedding's directionality.
687                         embeddingLevelDir = DIR_UNKNOWN;
688                         break;
689                     case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
690                         break;
691                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
692                         if (embeddingLevel == 0) {
693                             return DIR_LTR;
694                         }
695                         firstNonEmptyEmbeddingLevel = embeddingLevel;
696                         break;
697                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
698                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
699                         if (embeddingLevel == 0) {
700                             return DIR_RTL;
701                         }
702                         firstNonEmptyEmbeddingLevel = embeddingLevel;
703                         break;
704                     default:
705                         firstNonEmptyEmbeddingLevel = embeddingLevel;
706                         break;
707                 }
708             }
709 
710             // We have either found a non-empty embedding or scanned the entire string finding
711             // neither a non-empty embedding nor a strong character outside of an embedding.
712             if (firstNonEmptyEmbeddingLevel == 0) {
713                 // We have not found a non-empty embedding. Thus, the string contains neither a
714                 // non-empty embedding nor a strong character outside of an embedding.
715                 return DIR_UNKNOWN;
716             }
717 
718             // We have found a non-empty embedding.
719             if (embeddingLevelDir != DIR_UNKNOWN) {
720                 // We know the directionality of the non-empty embedding.
721                 return embeddingLevelDir;
722             }
723 
724             // We do not remember the directionality of the non-empty embedding we found. So, we go
725             // backwards to find the start of the non-empty embedding and get its directionality.
726             while (charIndex > 0) {
727                 switch (dirTypeBackward()) {
728                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
729                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
730                         if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
731                             return DIR_LTR;
732                         }
733                         --embeddingLevel;
734                         break;
735                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
736                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
737                         if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
738                             return DIR_RTL;
739                         }
740                         --embeddingLevel;
741                         break;
742                     case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
743                         ++embeddingLevel;
744                         break;
745                 }
746             }
747             // We should never get here.
748             return DIR_UNKNOWN;
749         }
750 
751         /**
752          * Returns the directionality of the last character with strong directionality in the
753          * string, or DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards
754          * from the end of the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its
755          * matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results
756          * are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters.
757          */
getExitDir()758         int getExitDir() {
759             // The reason for this method name, as opposed to getLastStrongDir(), is that "last
760             // strong" sounds like the exact opposite of "first strong", which is a commonly used
761             // description of Unicode's estimation algorithm (getUnicodeDir() above), but the two
762             // must treat formatting characters quite differently. Thus, we are staying away from
763             // both "first" and "last" in these method names to avoid confusion.
764             charIndex = length;
765             int embeddingLevel = 0;
766             int lastNonEmptyEmbeddingLevel = 0;
767             while (charIndex > 0) {
768                 switch (dirTypeBackward()) {
769                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
770                         if (embeddingLevel == 0) {
771                             return DIR_LTR;
772                         }
773                         if (lastNonEmptyEmbeddingLevel == 0) {
774                             lastNonEmptyEmbeddingLevel = embeddingLevel;
775                         }
776                         break;
777                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
778                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
779                         if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
780                             return DIR_LTR;
781                         }
782                         --embeddingLevel;
783                         break;
784                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
785                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
786                         if (embeddingLevel == 0) {
787                             return DIR_RTL;
788                         }
789                         if (lastNonEmptyEmbeddingLevel == 0) {
790                             lastNonEmptyEmbeddingLevel = embeddingLevel;
791                         }
792                         break;
793                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
794                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
795                         if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
796                             return DIR_RTL;
797                         }
798                         --embeddingLevel;
799                         break;
800                     case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
801                         ++embeddingLevel;
802                         break;
803                     case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
804                         break;
805                     default:
806                         if (lastNonEmptyEmbeddingLevel == 0) {
807                             lastNonEmptyEmbeddingLevel = embeddingLevel;
808                         }
809                         break;
810                 }
811             }
812             return DIR_UNKNOWN;
813         }
814 
815         // Internal methods
816 
817         /**
818          * Gets the bidi character class, i.e. Character.getDirectionality(), of a given char, using
819          * a cache for speed. Not designed for supplementary codepoints, whose results we do not
820          * cache.
821          */
getCachedDirectionality(char c)822         private static byte getCachedDirectionality(char c) {
823             return c < DIR_TYPE_CACHE_SIZE ? DIR_TYPE_CACHE[c] : getDirectionality(c);
824         }
825 
826         /**
827          * Returns the Character.DIRECTIONALITY_... value of the next codepoint and advances
828          * charIndex. If isHtml, and the codepoint is '<' or '&', advances through the tag/entity,
829          * and returns Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to
830          * figure out the actual character, and return its dirtype, but treating it as whitespace is
831          * good enough for our purposes.
832          *
833          * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0.
834          */
dirTypeForward()835         byte dirTypeForward() {
836             lastChar = text.charAt(charIndex);
837             if (Character.isHighSurrogate(lastChar)) {
838                 int codePoint = Character.codePointAt(text, charIndex);
839                 charIndex += Character.charCount(codePoint);
840                 return getDirectionality(codePoint);
841             }
842             charIndex++;
843             byte dirType = getCachedDirectionality(lastChar);
844             if (isHtml) {
845                 // Process tags and entities.
846                 if (lastChar == '<') {
847                     dirType = skipTagForward();
848                 } else if (lastChar == '&') {
849                     dirType = skipEntityForward();
850                 }
851             }
852             return dirType;
853         }
854 
855         /**
856          * Returns the Character.DIRECTIONALITY_... value of the preceding codepoint and advances
857          * charIndex backwards. If isHtml, and the codepoint is the end of a complete HTML tag or
858          * entity, advances over the whole tag/entity and returns
859          * Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to figure out the
860          * actual character, and return its dirtype, but treating it as whitespace is good enough
861          * for our purposes.
862          *
863          * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0.
864          */
dirTypeBackward()865         byte dirTypeBackward() {
866             lastChar = text.charAt(charIndex - 1);
867             if (Character.isLowSurrogate(lastChar)) {
868                 int codePoint = Character.codePointBefore(text, charIndex);
869                 charIndex -= Character.charCount(codePoint);
870                 return getDirectionality(codePoint);
871             }
872             charIndex--;
873             byte dirType = getCachedDirectionality(lastChar);
874             if (isHtml) {
875                 // Process tags and entities.
876                 if (lastChar == '>') {
877                     dirType = skipTagBackward();
878                 } else if (lastChar == ';') {
879                     dirType = skipEntityBackward();
880                 }
881             }
882             return dirType;
883         }
884 
885         /**
886          * Advances charIndex forward through an HTML tag (after the opening &lt; has already been
887          * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching &gt;,
888          * does not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the
889          * &lt; that hadn't been part of a tag after all).
890          */
skipTagForward()891         private byte skipTagForward() {
892             int initialCharIndex = charIndex;
893             while (charIndex < length) {
894                 lastChar = text.charAt(charIndex++);
895                 if (lastChar == '>') {
896                     // The end of the tag.
897                     return Character.DIRECTIONALITY_WHITESPACE;
898                 }
899                 if (lastChar == '"' || lastChar == '\'') {
900                     // Skip over a quoted attribute value inside the tag.
901                     char quote = lastChar;
902                     while (charIndex < length && (lastChar = text.charAt(charIndex++)) != quote) {}
903                 }
904             }
905             // The original '<' wasn't the start of a tag after all.
906             charIndex = initialCharIndex;
907             lastChar = '<';
908             return Character.DIRECTIONALITY_OTHER_NEUTRALS;
909         }
910 
911         /**
912          * Advances charIndex backward through an HTML tag (after the closing &gt; has already been
913          * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching &lt;, does
914          * not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the &gt;
915          * that hadn't been part of a tag after all). Nevertheless, the running time for calling
916          * skipTagBackward() in a loop remains linear in the size of the text, even for a text like
917          * "&gt;&gt;&gt;&gt;", because skipTagBackward() also stops looking for a matching &lt;
918          * when it encounters another &gt;.
919          */
skipTagBackward()920         private byte skipTagBackward() {
921             int initialCharIndex = charIndex;
922             while (charIndex > 0) {
923                 lastChar = text.charAt(--charIndex);
924                 if (lastChar == '<') {
925                     // The start of the tag.
926                     return Character.DIRECTIONALITY_WHITESPACE;
927                 }
928                 if (lastChar == '>') {
929                     break;
930                 }
931                 if (lastChar == '"' || lastChar == '\'') {
932                     // Skip over a quoted attribute value inside the tag.
933                     char quote = lastChar;
934                     while (charIndex > 0 && (lastChar = text.charAt(--charIndex)) != quote) {}
935                 }
936             }
937             // The original '>' wasn't the end of a tag after all.
938             charIndex = initialCharIndex;
939             lastChar = '>';
940             return Character.DIRECTIONALITY_OTHER_NEUTRALS;
941         }
942 
943         /**
944          * Advances charIndex forward through an HTML character entity tag (after the opening
945          * &amp; has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be
946          * best to figure out the actual character and return its dirtype, but this is good enough.
947          */
skipEntityForward()948         private byte skipEntityForward() {
949             while (charIndex < length && (lastChar = text.charAt(charIndex++)) != ';') {}
950             return Character.DIRECTIONALITY_WHITESPACE;
951         }
952 
953         /**
954          * Advances charIndex backward through an HTML character entity tag (after the closing ;
955          * has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be best
956          * to figure out the actual character and return its dirtype, but this is good enough.
957          * If there is no matching &amp;, does not change charIndex and returns
958          * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the ';' that did not start an entity after
959          * all). Nevertheless, the running time for calling skipEntityBackward() in a loop remains
960          * linear in the size of the text, even for a text like ";;;;;;;", because skipTagBackward()
961          * also stops looking for a matching &amp; when it encounters another ;.
962          */
skipEntityBackward()963         private byte skipEntityBackward() {
964             int initialCharIndex = charIndex;
965             while (charIndex > 0) {
966                 lastChar = text.charAt(--charIndex);
967                 if (lastChar == '&') {
968                     return Character.DIRECTIONALITY_WHITESPACE;
969                 }
970                 if (lastChar == ';') {
971                     break;
972                 }
973             }
974             charIndex = initialCharIndex;
975             lastChar = ';';
976             return Character.DIRECTIONALITY_OTHER_NEUTRALS;
977         }
978     }
979 }
980