1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package android.text;
18 
19 import static android.text.TextDirectionHeuristics.FIRSTSTRONG_LTR;
20 
21 import android.annotation.Nullable;
22 import android.view.View;
23 
24 import com.android.internal.annotations.VisibleForTesting;
25 
26 import java.util.Locale;
27 
28 /**
29  * Utility class for formatting text for display in a potentially opposite-directionality context
30  * without garbling. The directionality of the context is set at formatter creation and the
31  * directionality of the text can be either estimated or passed in when known.
32  *
33  * <p>To support versions lower than {@link android.os.Build.VERSION_CODES#JELLY_BEAN_MR2},
34  * you can use the support library's {@link android.support.v4.text.BidiFormatter} class.
35  *
 * <p>These APIs provide the following functionality:
37  * <p>
38  * 1. Bidi Wrapping
39  * When text in one language is mixed into a document in another, opposite-directionality language,
40  * e.g. when an English business name is embedded in some Hebrew text, both the inserted string
41  * and the text surrounding it may be displayed incorrectly unless the inserted string is explicitly
42  * separated from the surrounding text in a "wrapper" that:
43  * <p>
44  * - Declares its directionality so that the string is displayed correctly. This can be done in
45  *   Unicode bidi formatting codes by {@link #unicodeWrap} and similar methods.
46  * <p>
47  * - Isolates the string's directionality, so it does not unduly affect the surrounding content.
48  *   Currently, this can only be done using invisible Unicode characters of the same direction as
49  *   the context (LRM or RLM) in addition to the directionality declaration above, thus "resetting"
50  *   the directionality to that of the context. The "reset" may need to be done at both ends of the
51  *   string. Without "reset" after the string, the string will "stick" to a number or logically
52  *   separate opposite-direction text that happens to follow it in-line (even if separated by
53  *   neutral content like spaces and punctuation). Without "reset" before the string, the same can
54  *   happen there, but only with more opposite-direction text, not a number. One approach is to
55  *   "reset" the direction only after each string, on the theory that if the preceding opposite-
56  *   direction text is itself bidi-wrapped, the "reset" after it will prevent the sticking. (Doing
57  *   the "reset" only before each string definitely does not work because we do not want to require
58  *   bidi-wrapping numbers, and a bidi-wrapped opposite-direction string could be followed by a
59  *   number.) Still, the safest policy is to do the "reset" on both ends of each string, since RTL
60  *   message translations often contain untranslated Latin-script brand names and technical terms,
61  *   and one of these can be followed by a bidi-wrapped inserted value. On the other hand, when one
62  *   has such a message, it is best to do the "reset" manually in the message translation itself,
63  *   since the message's opposite-direction text could be followed by an inserted number, which we
64  *   would not bidi-wrap anyway. Thus, "reset" only after the string is the current default. In an
65  *   alternative to "reset", recent additions to the HTML, CSS, and Unicode standards allow the
66  *   isolation to be part of the directionality declaration. This form of isolation is better than
67  *   "reset" because it takes less space, does not require knowing the context directionality, has a
68  *   gentler effect than "reset", and protects both ends of the string. However, we do not yet allow
69  *   using it because required platforms do not yet support it.
70  * <p>
71  * Providing these wrapping services is the basic purpose of the bidi formatter.
72  * <p>
73  * 2. Directionality estimation
74  * How does one know whether a string about to be inserted into surrounding text has the same
75  * directionality? Well, in many cases, one knows that this must be the case when writing the code
76  * doing the insertion, e.g. when a localized message is inserted into a localized page. In such
77  * cases there is no need to involve the bidi formatter at all. In some other cases, it need not be
78  * the same as the context, but is either constant (e.g. urls are always LTR) or otherwise known.
79  * In the remaining cases, e.g. when the string is user-entered or comes from a database, the
80  * language of the string (and thus its directionality) is not known a priori, and must be
81  * estimated at run-time. The bidi formatter can do this automatically using the default
82  * first-strong estimation algorithm. It can also be configured to use a custom directionality
83  * estimation object.
84  */
85 public final class BidiFormatter {
86 
87     /**
88      * The default text direction heuristic.
89      */
90     private static TextDirectionHeuristic DEFAULT_TEXT_DIRECTION_HEURISTIC = FIRSTSTRONG_LTR;
91 
92     /**
93      * Unicode "Left-To-Right Embedding" (LRE) character.
94      */
95     private static final char LRE = '\u202A';
96 
97     /**
98      * Unicode "Right-To-Left Embedding" (RLE) character.
99      */
100     private static final char RLE = '\u202B';
101 
102     /**
103      * Unicode "Pop Directional Formatting" (PDF) character.
104      */
105     private static final char PDF = '\u202C';
106 
107     /**
108      *  Unicode "Left-To-Right Mark" (LRM) character.
109      */
110     private static final char LRM = '\u200E';
111 
112     /*
113      * Unicode "Right-To-Left Mark" (RLM) character.
114      */
115     private static final char RLM = '\u200F';
116 
117     /*
118      * String representation of LRM
119      */
120     private static final String LRM_STRING = Character.toString(LRM);
121 
122     /*
123      * String representation of RLM
124      */
125     private static final String RLM_STRING = Character.toString(RLM);
126 
127     /**
128      * Empty string constant.
129      */
130     private static final String EMPTY_STRING = "";
131 
132     /**
133      * A class for building a BidiFormatter with non-default options.
134      */
135     public static final class Builder {
136         private boolean mIsRtlContext;
137         private int mFlags;
138         private TextDirectionHeuristic mTextDirectionHeuristic;
139 
140         /**
141          * Constructor.
142          *
143          */
Builder()144         public Builder() {
145             initialize(isRtlLocale(Locale.getDefault()));
146         }
147 
148         /**
149          * Constructor.
150          *
151          * @param rtlContext Whether the context directionality is RTL.
152          */
Builder(boolean rtlContext)153         public Builder(boolean rtlContext) {
154             initialize(rtlContext);
155         }
156 
157         /**
158          * Constructor.
159          *
160          * @param locale The context locale.
161          */
Builder(Locale locale)162         public Builder(Locale locale) {
163             initialize(isRtlLocale(locale));
164         }
165 
166         /**
167          * Initializes the builder with the given context directionality and default options.
168          *
169          * @param isRtlContext Whether the context is RTL or not.
170          */
initialize(boolean isRtlContext)171         private void initialize(boolean isRtlContext) {
172             mIsRtlContext = isRtlContext;
173             mTextDirectionHeuristic = DEFAULT_TEXT_DIRECTION_HEURISTIC;
174             mFlags = DEFAULT_FLAGS;
175         }
176 
177         /**
178          * Specifies whether the BidiFormatter to be built should also "reset" directionality before
179          * a string being bidi-wrapped, not just after it. The default is true.
180          */
stereoReset(boolean stereoReset)181         public Builder stereoReset(boolean stereoReset) {
182             if (stereoReset) {
183                 mFlags |= FLAG_STEREO_RESET;
184             } else {
185                 mFlags &= ~FLAG_STEREO_RESET;
186             }
187             return this;
188         }
189 
190         /**
191          * Specifies the default directionality estimation algorithm to be used by the BidiFormatter.
192          * By default, uses the first-strong heuristic.
193          *
194          * @param heuristic the {@code TextDirectionHeuristic} to use.
195          * @return the builder itself.
196          */
setTextDirectionHeuristic(TextDirectionHeuristic heuristic)197         public Builder setTextDirectionHeuristic(TextDirectionHeuristic heuristic) {
198             mTextDirectionHeuristic = heuristic;
199             return this;
200         }
201 
202         /**
203          * @return A BidiFormatter with the specified options.
204          */
build()205         public BidiFormatter build() {
206             if (mFlags == DEFAULT_FLAGS &&
207                     mTextDirectionHeuristic == DEFAULT_TEXT_DIRECTION_HEURISTIC) {
208                 return BidiFormatter.getDefaultInstanceFromContext(mIsRtlContext);
209             }
210             return new BidiFormatter(mIsRtlContext, mFlags, mTextDirectionHeuristic);
211         }
212     }
213 
214     //
215     private static final int FLAG_STEREO_RESET = 2;
216     private static final int DEFAULT_FLAGS = FLAG_STEREO_RESET;
217 
218     private static final BidiFormatter DEFAULT_LTR_INSTANCE = new BidiFormatter(
219             false /* LTR context */,
220             DEFAULT_FLAGS,
221             DEFAULT_TEXT_DIRECTION_HEURISTIC);
222 
223     private static final BidiFormatter DEFAULT_RTL_INSTANCE = new BidiFormatter(
224             true /* RTL context */,
225             DEFAULT_FLAGS,
226             DEFAULT_TEXT_DIRECTION_HEURISTIC);
227 
228     private final boolean mIsRtlContext;
229     private final int mFlags;
230     private final TextDirectionHeuristic mDefaultTextDirectionHeuristic;
231 
232     /**
233      * Factory for creating an instance of BidiFormatter for the default locale directionality.
234      *
235      * This does not create any new objects, and returns already existing static instances.
236      *
237      */
getInstance()238     public static BidiFormatter getInstance() {
239         return getDefaultInstanceFromContext(isRtlLocale(Locale.getDefault()));
240     }
241 
242     /**
243      * Factory for creating an instance of BidiFormatter given the context directionality.
244      *
245      * This does not create any new objects, and returns already existing static instances.
246      *
247      * @param rtlContext Whether the context directionality is RTL.
248      */
getInstance(boolean rtlContext)249     public static BidiFormatter getInstance(boolean rtlContext) {
250         return getDefaultInstanceFromContext(rtlContext);
251     }
252 
253     /**
254      * Factory for creating an instance of BidiFormatter given the context locale.
255      *
256      * This does not create any new objects, and returns already existing static instances.
257      *
258      * @param locale The context locale.
259      */
getInstance(Locale locale)260     public static BidiFormatter getInstance(Locale locale) {
261         return getDefaultInstanceFromContext(isRtlLocale(locale));
262     }
263 
264     /**
265      * @param isRtlContext Whether the context directionality is RTL or not.
266      * @param flags The option flags.
267      * @param heuristic The default text direction heuristic.
268      */
BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristic heuristic)269     private BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristic heuristic) {
270         mIsRtlContext = isRtlContext;
271         mFlags = flags;
272         mDefaultTextDirectionHeuristic = heuristic;
273     }
274 
275     /**
276      * @return Whether the context directionality is RTL
277      */
isRtlContext()278     public boolean isRtlContext() {
279         return mIsRtlContext;
280     }
281 
282     /**
283      * @return Whether directionality "reset" should also be done before a string being
284      * bidi-wrapped, not just after it.
285      */
getStereoReset()286     public boolean getStereoReset() {
287         return (mFlags & FLAG_STEREO_RESET) != 0;
288     }
289 
290     /**
291      * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
292      * overall or the exit directionality of a given string is opposite to the context directionality.
293      * Putting this after the string (including its directionality declaration wrapping) prevents it
294      * from "sticking" to other opposite-directionality text or a number appearing after it inline
295      * with only neutral content in between. Otherwise returns the empty string. While the exit
296      * directionality is determined by scanning the end of the string, the overall directionality is
297      * given explicitly by a heuristic to estimate the {@code str}'s directionality.
298      *
299      * @param str CharSequence after which the mark may need to appear.
300      * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
301      *                  directionality.
302      * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
303      *     else, the empty string.
304      *
305      * @hide
306      */
markAfter(CharSequence str, TextDirectionHeuristic heuristic)307     public String markAfter(CharSequence str, TextDirectionHeuristic heuristic) {
308         final boolean isRtl = heuristic.isRtl(str, 0, str.length());
309         // getExitDir() is called only if needed (short-circuit).
310         if (!mIsRtlContext && (isRtl || getExitDir(str) == DIR_RTL)) {
311             return LRM_STRING;
312         }
313         if (mIsRtlContext && (!isRtl || getExitDir(str) == DIR_LTR)) {
314             return RLM_STRING;
315         }
316         return EMPTY_STRING;
317     }
318 
319     /**
320      * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
321      * overall or the entry directionality of a given string is opposite to the context
322      * directionality. Putting this before the string (including its directionality declaration
323      * wrapping) prevents it from "sticking" to other opposite-directionality text appearing before
324      * it inline with only neutral content in between. Otherwise returns the empty string. While the
325      * entry directionality is determined by scanning the beginning of the string, the overall
326      * directionality is given explicitly by a heuristic to estimate the {@code str}'s directionality.
327      *
328      * @param str CharSequence before which the mark may need to appear.
329      * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
330      *                  directionality.
331      * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
332      *     else, the empty string.
333      *
334      * @hide
335      */
markBefore(CharSequence str, TextDirectionHeuristic heuristic)336     public String markBefore(CharSequence str, TextDirectionHeuristic heuristic) {
337         final boolean isRtl = heuristic.isRtl(str, 0, str.length());
338         // getEntryDir() is called only if needed (short-circuit).
339         if (!mIsRtlContext && (isRtl || getEntryDir(str) == DIR_RTL)) {
340             return LRM_STRING;
341         }
342         if (mIsRtlContext && (!isRtl || getEntryDir(str) == DIR_LTR)) {
343             return RLM_STRING;
344         }
345         return EMPTY_STRING;
346     }
347 
348     /**
349      * Estimates the directionality of a string using the default text direction heuristic.
350      *
351      * @param str String whose directionality is to be estimated.
352      * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns
353      *          false.
354      */
isRtl(String str)355     public boolean isRtl(String str) {
356         return isRtl((CharSequence) str);
357     }
358 
359     /**
360      * Operates like {@link #isRtl(String)}, but takes a CharSequence instead of a string
361      *
362      * @param str CharSequence whose directionality is to be estimated.
363      * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns
364      *          false.
365      */
isRtl(CharSequence str)366     public boolean isRtl(CharSequence str) {
367         return mDefaultTextDirectionHeuristic.isRtl(str, 0, str.length());
368     }
369 
370     /**
371      * Formats a string of given directionality for use in plain-text output of the context
372      * directionality, so an opposite-directionality string is neither garbled nor garbles its
373      * surroundings. This makes use of Unicode bidi formatting characters.
374      * <p>
375      * The algorithm: In case the given directionality doesn't match the context directionality, wraps
376      * the string with Unicode bidi formatting characters: RLE+{@code str}+PDF for RTL text, or
377      * LRE+{@code str}+PDF for LTR text.
378      * <p>
379      * If {@code isolate}, directionally isolates the string so that it does not garble its
380      * surroundings. Currently, this is done by "resetting" the directionality after the string by
381      * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when
382      * either the overall directionality or the exit directionality of the string is opposite to
383      * that of the context. Unless the formatter was built using
384      * {@link Builder#stereoReset(boolean)} with a {@code false} argument, also prepends a Unicode
385      * bidi mark matching the context directionality when either the overall directionality or the
386      * entry directionality of the string is opposite to that of the context. Note that as opposed
387      * to the overall directionality, the entry and exit directionalities are determined from the
388      * string itself.
389      * <p>
390      * Does *not* do HTML-escaping.
391      *
392      * @param str The input string.
393      * @param heuristic The algorithm to be used to estimate the string's overall direction.
394      *        See {@link TextDirectionHeuristics} for pre-defined heuristics.
395      * @param isolate Whether to directionally isolate the string to prevent it from garbling the
396      *     content around it
397      * @return Input string after applying the above processing. {@code null} if {@code str} is
398      *     {@code null}.
399      */
unicodeWrap(@ullable String str, TextDirectionHeuristic heuristic, boolean isolate)400     public @Nullable String unicodeWrap(@Nullable String str, TextDirectionHeuristic heuristic,
401             boolean isolate) {
402         if (str == null) return null;
403         return unicodeWrap((CharSequence) str, heuristic, isolate).toString();
404     }
405 
406     /**
407      * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but takes a
408      * CharSequence instead of a string
409      *
410      * @param str The input CharSequence.
411      * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction.
412      *        See {@link TextDirectionHeuristics} for pre-defined heuristics.
413      * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling
414      *     the content around it
415      * @return Input CharSequence after applying the above processing. {@code null} if {@code str}
416      *     is {@code null}.
417      */
unicodeWrap(@ullable CharSequence str, TextDirectionHeuristic heuristic, boolean isolate)418     public @Nullable CharSequence unicodeWrap(@Nullable CharSequence str,
419             TextDirectionHeuristic heuristic, boolean isolate) {
420         if (str == null) return null;
421         final boolean isRtl = heuristic.isRtl(str, 0, str.length());
422         SpannableStringBuilder result = new SpannableStringBuilder();
423         if (getStereoReset() && isolate) {
424             result.append(markBefore(str,
425                     isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR));
426         }
427         if (isRtl != mIsRtlContext) {
428             result.append(isRtl ? RLE : LRE);
429             result.append(str);
430             result.append(PDF);
431         } else {
432             result.append(str);
433         }
434         if (isolate) {
435             result.append(markAfter(str,
436                     isRtl ? TextDirectionHeuristics.RTL : TextDirectionHeuristics.LTR));
437         }
438         return result;
439     }
440 
441     /**
442      * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but assumes
443      * {@code isolate} is true.
444      *
445      * @param str The input string.
446      * @param heuristic The algorithm to be used to estimate the string's overall direction.
447      *        See {@link TextDirectionHeuristics} for pre-defined heuristics.
448      * @return Input string after applying the above processing.
449      */
unicodeWrap(String str, TextDirectionHeuristic heuristic)450     public String unicodeWrap(String str, TextDirectionHeuristic heuristic) {
451         return unicodeWrap(str, heuristic, true /* isolate */);
452     }
453 
454     /**
455      * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but
456      * assumes {@code isolate} is true.
457      *
458      * @param str The input CharSequence.
459      * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction.
460      *        See {@link TextDirectionHeuristics} for pre-defined heuristics.
461      * @return Input CharSequence after applying the above processing.
462      */
unicodeWrap(CharSequence str, TextDirectionHeuristic heuristic)463     public CharSequence unicodeWrap(CharSequence str, TextDirectionHeuristic heuristic) {
464         return unicodeWrap(str, heuristic, true /* isolate */);
465     }
466 
467 
468     /**
469      * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the
470      * formatter's default direction estimation algorithm.
471      *
472      * @param str The input string.
473      * @param isolate Whether to directionally isolate the string to prevent it from garbling the
474      *     content around it
475      * @return Input string after applying the above processing.
476      */
unicodeWrap(String str, boolean isolate)477     public String unicodeWrap(String str, boolean isolate) {
478         return unicodeWrap(str, mDefaultTextDirectionHeuristic, isolate);
479     }
480 
481     /**
482      * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but uses
483      * the formatter's default direction estimation algorithm.
484      *
485      * @param str The input CharSequence.
486      * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling
487      *     the content around it
488      * @return Input CharSequence after applying the above processing.
489      */
unicodeWrap(CharSequence str, boolean isolate)490     public CharSequence unicodeWrap(CharSequence str, boolean isolate) {
491         return unicodeWrap(str, mDefaultTextDirectionHeuristic, isolate);
492     }
493 
494     /**
495      * Operates like {@link #unicodeWrap(String, TextDirectionHeuristic, boolean)}, but uses the
496      * formatter's default direction estimation algorithm and assumes {@code isolate} is true.
497      *
498      * @param str The input string.
499      * @return Input string after applying the above processing.
500      */
unicodeWrap(String str)501     public String unicodeWrap(String str) {
502         return unicodeWrap(str, mDefaultTextDirectionHeuristic, true /* isolate */);
503     }
504 
505     /**
506      * Operates like {@link #unicodeWrap(CharSequence, TextDirectionHeuristic, boolean)}, but uses
507      * the formatter's default direction estimation algorithm and assumes {@code isolate} is true.
508      *
509      * @param str The input CharSequence.
510      * @return Input CharSequence after applying the above processing.
511      */
unicodeWrap(CharSequence str)512     public CharSequence unicodeWrap(CharSequence str) {
513         return unicodeWrap(str, mDefaultTextDirectionHeuristic, true /* isolate */);
514     }
515 
getDefaultInstanceFromContext(boolean isRtlContext)516     private static BidiFormatter getDefaultInstanceFromContext(boolean isRtlContext) {
517         return isRtlContext ? DEFAULT_RTL_INSTANCE : DEFAULT_LTR_INSTANCE;
518     }
519 
520     /**
521      * Helper method to return true if the Locale directionality is RTL.
522      *
523      * @param locale The Locale whose directionality will be checked to be RTL or LTR
524      * @return true if the {@code locale} directionality is RTL. False otherwise.
525      */
isRtlLocale(Locale locale)526     private static boolean isRtlLocale(Locale locale) {
527         return (TextUtils.getLayoutDirectionFromLocale(locale) == View.LAYOUT_DIRECTION_RTL);
528     }
529 
530     /**
531      * Enum for directionality type.
532      */
533     private static final int DIR_LTR = -1;
534     private static final int DIR_UNKNOWN = 0;
535     private static final int DIR_RTL = +1;
536 
537     /**
538      * Returns the directionality of the last character with strong directionality in the string, or
539      * DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards from the end of
540      * the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a
541      * strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a
542      * string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. The intended use is to check
543      * whether a logically separate item that starts with a number or a character of the string's
544      * exit directionality and follows this string inline (not counting any neutral characters in
545      * between) would "stick" to it in an opposite-directionality context, thus being displayed in
546      * an incorrect position. An LRM or RLM character (the one of the context's directionality)
547      * between the two will prevent such sticking.
548      *
549      * @param str the string to check.
550      */
getExitDir(CharSequence str)551     private static int getExitDir(CharSequence str) {
552         return new DirectionalityEstimator(str, false /* isHtml */).getExitDir();
553     }
554 
555     /**
556      * Returns the directionality of the first character with strong directionality in the string,
557      * or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
558      * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after
559      * RLE/RLO. The results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF
560      * characters. The intended use is to check whether a logically separate item that ends with a
561      * character of the string's entry directionality and precedes the string inline (not counting
562      * any neutral characters in between) would "stick" to it in an opposite-directionality context,
563      * thus being displayed in an incorrect position. An LRM or RLM character (the one of the
564      * context's directionality) between the two will prevent such sticking.
565      *
566      * @param str the string to check.
567      */
getEntryDir(CharSequence str)568     private static int getEntryDir(CharSequence str) {
569         return new DirectionalityEstimator(str, false /* isHtml */).getEntryDir();
570     }
571 
572     /**
573      * An object that estimates the directionality of a given string by various methods.
574      *
575      * @hide
576      */
577     @VisibleForTesting
578     public static class DirectionalityEstimator {
579 
580         // Internal static variables and constants.
581 
582         /**
583          * Size of the bidi character class cache. The results of the Character.getDirectionality()
584          * calls on the lowest DIR_TYPE_CACHE_SIZE codepoints are kept in an array for speed.
585          * The 0x700 value is designed to leave all the European and Near Eastern languages in the
586          * cache. It can be reduced to 0x180, restricting the cache to the Western European
587          * languages.
588          */
589         private static final int DIR_TYPE_CACHE_SIZE = 0x700;
590 
591         /**
592          * The bidi character class cache.
593          */
594         private static final byte DIR_TYPE_CACHE[];
595 
596         static {
597             DIR_TYPE_CACHE = new byte[DIR_TYPE_CACHE_SIZE];
598             for (int i = 0; i < DIR_TYPE_CACHE_SIZE; i++) {
599                 // Calling Character.getDirectionality() is OK here, since new emojis start after
600                 // the end of our cache.
601                 DIR_TYPE_CACHE[i] = Character.getDirectionality(i);
602             }
603         }
604 
605         /**
606          * Return Character directionality. Same as {@link Character#getDirectionality(int)} except
607          * it can override values for newest emoji that are not covered by ICU.
608          */
getDirectionality(int codePoint)609         public static byte getDirectionality(int codePoint) {
610             return Character.getDirectionality(codePoint);
611         }
612 
613         // Internal instance variables.
614 
615         /**
616          * The text to be scanned.
617          */
618         private final CharSequence text;
619 
620         /**
621          * Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and
622          * entities when looking for the next / preceding dir type.
623          */
624         private final boolean isHtml;
625 
626         /**
627          * The length of the text in chars.
628          */
629         private final int length;
630 
631         /**
632          * The current position in the text.
633          */
634         private int charIndex;
635 
636         /**
637          * The char encountered by the last dirTypeForward or dirTypeBackward call. If it
638          * encountered a supplementary codepoint, this contains a char that is not a valid
639          * codepoint. This is ok, because this member is only used to detect some well-known ASCII
640          * syntax, e.g. "http://" and the beginning of an HTML tag or entity.
641          */
642         private char lastChar;
643 
644         /**
645          * Constructor.
646          *
647          * @param text The string to scan.
648          * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over
649          *     tags and entities.
650          */
DirectionalityEstimator(CharSequence text, boolean isHtml)651         DirectionalityEstimator(CharSequence text, boolean isHtml) {
652             this.text = text;
653             this.isHtml = isHtml;
654             length = text.length();
655         }
656 
657         /**
658          * Returns the directionality of the first character with strong directionality in the
659          * string, or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
660          * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL
661          * after RLE/RLO. The results are undefined for a string containing unbalanced
662          * LRE/RLE/LRO/RLO/PDF characters.
663          */
getEntryDir()664         int getEntryDir() {
665             // The reason for this method name, as opposed to getFirstStrongDir(), is that
666             // "first strong" is a commonly used description of Unicode's estimation algorithm,
667             // but the two must treat formatting characters quite differently. Thus, we are staying
668             // away from both "first" and "last" in these method names to avoid confusion.
669             charIndex = 0;
670             int embeddingLevel = 0;
671             int embeddingLevelDir = DIR_UNKNOWN;
672             int firstNonEmptyEmbeddingLevel = 0;
673             while (charIndex < length && firstNonEmptyEmbeddingLevel == 0) {
674                 switch (dirTypeForward()) {
675                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
676                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
677                         ++embeddingLevel;
678                         embeddingLevelDir = DIR_LTR;
679                         break;
680                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
681                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
682                         ++embeddingLevel;
683                         embeddingLevelDir = DIR_RTL;
684                         break;
685                     case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
686                         --embeddingLevel;
687                         // To restore embeddingLevelDir to its previous value, we would need a
688                         // stack, which we want to avoid. Thus, at this point we do not know the
689                         // current embedding's directionality.
690                         embeddingLevelDir = DIR_UNKNOWN;
691                         break;
692                     case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
693                         break;
694                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
695                         if (embeddingLevel == 0) {
696                             return DIR_LTR;
697                         }
698                         firstNonEmptyEmbeddingLevel = embeddingLevel;
699                         break;
700                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
701                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
702                         if (embeddingLevel == 0) {
703                             return DIR_RTL;
704                         }
705                         firstNonEmptyEmbeddingLevel = embeddingLevel;
706                         break;
707                     default:
708                         firstNonEmptyEmbeddingLevel = embeddingLevel;
709                         break;
710                 }
711             }
712 
713             // We have either found a non-empty embedding or scanned the entire string finding
714             // neither a non-empty embedding nor a strong character outside of an embedding.
715             if (firstNonEmptyEmbeddingLevel == 0) {
716                 // We have not found a non-empty embedding. Thus, the string contains neither a
717                 // non-empty embedding nor a strong character outside of an embedding.
718                 return DIR_UNKNOWN;
719             }
720 
721             // We have found a non-empty embedding.
722             if (embeddingLevelDir != DIR_UNKNOWN) {
723                 // We know the directionality of the non-empty embedding.
724                 return embeddingLevelDir;
725             }
726 
727             // We do not remember the directionality of the non-empty embedding we found. So, we go
728             // backwards to find the start of the non-empty embedding and get its directionality.
729             while (charIndex > 0) {
730                 switch (dirTypeBackward()) {
731                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
732                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
733                         if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
734                             return DIR_LTR;
735                         }
736                         --embeddingLevel;
737                         break;
738                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
739                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
740                         if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
741                             return DIR_RTL;
742                         }
743                         --embeddingLevel;
744                         break;
745                     case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
746                         ++embeddingLevel;
747                         break;
748                 }
749             }
750             // We should never get here.
751             return DIR_UNKNOWN;
752         }
753 
754         /**
755          * Returns the directionality of the last character with strong directionality in the
756          * string, or DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards
757          * from the end of the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its
758          * matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results
759          * are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters.
760          */
getExitDir()761         int getExitDir() {
762             // The reason for this method name, as opposed to getLastStrongDir(), is that "last
763             // strong" sounds like the exact opposite of "first strong", which is a commonly used
764             // description of Unicode's estimation algorithm (getUnicodeDir() above), but the two
765             // must treat formatting characters quite differently. Thus, we are staying away from
766             // both "first" and "last" in these method names to avoid confusion.
767             charIndex = length;
768             int embeddingLevel = 0;
769             int lastNonEmptyEmbeddingLevel = 0;
770             while (charIndex > 0) {
771                 switch (dirTypeBackward()) {
772                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
773                         if (embeddingLevel == 0) {
774                             return DIR_LTR;
775                         }
776                         if (lastNonEmptyEmbeddingLevel == 0) {
777                             lastNonEmptyEmbeddingLevel = embeddingLevel;
778                         }
779                         break;
780                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
781                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
782                         if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
783                             return DIR_LTR;
784                         }
785                         --embeddingLevel;
786                         break;
787                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
788                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
789                         if (embeddingLevel == 0) {
790                             return DIR_RTL;
791                         }
792                         if (lastNonEmptyEmbeddingLevel == 0) {
793                             lastNonEmptyEmbeddingLevel = embeddingLevel;
794                         }
795                         break;
796                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
797                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
798                         if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
799                             return DIR_RTL;
800                         }
801                         --embeddingLevel;
802                         break;
803                     case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
804                         ++embeddingLevel;
805                         break;
806                     case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
807                         break;
808                     default:
809                         if (lastNonEmptyEmbeddingLevel == 0) {
810                             lastNonEmptyEmbeddingLevel = embeddingLevel;
811                         }
812                         break;
813                 }
814             }
815             return DIR_UNKNOWN;
816         }
817 
818         // Internal methods
819 
820         /**
821          * Gets the bidi character class, i.e. Character.getDirectionality(), of a given char, using
822          * a cache for speed. Not designed for supplementary codepoints, whose results we do not
823          * cache.
824          */
getCachedDirectionality(char c)825         private static byte getCachedDirectionality(char c) {
826             return c < DIR_TYPE_CACHE_SIZE ? DIR_TYPE_CACHE[c] : getDirectionality(c);
827         }
828 
829         /**
830          * Returns the Character.DIRECTIONALITY_... value of the next codepoint and advances
831          * charIndex. If isHtml, and the codepoint is '<' or '&', advances through the tag/entity,
832          * and returns Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to
833          * figure out the actual character, and return its dirtype, but treating it as whitespace is
834          * good enough for our purposes.
835          *
836          * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0.
837          */
dirTypeForward()838         byte dirTypeForward() {
839             lastChar = text.charAt(charIndex);
840             if (Character.isHighSurrogate(lastChar)) {
841                 int codePoint = Character.codePointAt(text, charIndex);
842                 charIndex += Character.charCount(codePoint);
843                 return getDirectionality(codePoint);
844             }
845             charIndex++;
846             byte dirType = getCachedDirectionality(lastChar);
847             if (isHtml) {
848                 // Process tags and entities.
849                 if (lastChar == '<') {
850                     dirType = skipTagForward();
851                 } else if (lastChar == '&') {
852                     dirType = skipEntityForward();
853                 }
854             }
855             return dirType;
856         }
857 
858         /**
859          * Returns the Character.DIRECTIONALITY_... value of the preceding codepoint and advances
860          * charIndex backwards. If isHtml, and the codepoint is the end of a complete HTML tag or
861          * entity, advances over the whole tag/entity and returns
862          * Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to figure out the
863          * actual character, and return its dirtype, but treating it as whitespace is good enough
864          * for our purposes.
865          *
866          * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0.
867          */
dirTypeBackward()868         byte dirTypeBackward() {
869             lastChar = text.charAt(charIndex - 1);
870             if (Character.isLowSurrogate(lastChar)) {
871                 int codePoint = Character.codePointBefore(text, charIndex);
872                 charIndex -= Character.charCount(codePoint);
873                 return getDirectionality(codePoint);
874             }
875             charIndex--;
876             byte dirType = getCachedDirectionality(lastChar);
877             if (isHtml) {
878                 // Process tags and entities.
879                 if (lastChar == '>') {
880                     dirType = skipTagBackward();
881                 } else if (lastChar == ';') {
882                     dirType = skipEntityBackward();
883                 }
884             }
885             return dirType;
886         }
887 
888         /**
889          * Advances charIndex forward through an HTML tag (after the opening &lt; has already been
890          * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching &gt;,
891          * does not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the
892          * &lt; that hadn't been part of a tag after all).
893          */
skipTagForward()894         private byte skipTagForward() {
895             int initialCharIndex = charIndex;
896             while (charIndex < length) {
897                 lastChar = text.charAt(charIndex++);
898                 if (lastChar == '>') {
899                     // The end of the tag.
900                     return Character.DIRECTIONALITY_WHITESPACE;
901                 }
902                 if (lastChar == '"' || lastChar == '\'') {
903                     // Skip over a quoted attribute value inside the tag.
904                     char quote = lastChar;
905                     while (charIndex < length && (lastChar = text.charAt(charIndex++)) != quote) {}
906                 }
907             }
908             // The original '<' wasn't the start of a tag after all.
909             charIndex = initialCharIndex;
910             lastChar = '<';
911             return Character.DIRECTIONALITY_OTHER_NEUTRALS;
912         }
913 
914         /**
915          * Advances charIndex backward through an HTML tag (after the closing &gt; has already been
916          * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching &lt;, does
917          * not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the &gt;
918          * that hadn't been part of a tag after all). Nevertheless, the running time for calling
919          * skipTagBackward() in a loop remains linear in the size of the text, even for a text like
920          * "&gt;&gt;&gt;&gt;", because skipTagBackward() also stops looking for a matching &lt;
921          * when it encounters another &gt;.
922          */
skipTagBackward()923         private byte skipTagBackward() {
924             int initialCharIndex = charIndex;
925             while (charIndex > 0) {
926                 lastChar = text.charAt(--charIndex);
927                 if (lastChar == '<') {
928                     // The start of the tag.
929                     return Character.DIRECTIONALITY_WHITESPACE;
930                 }
931                 if (lastChar == '>') {
932                     break;
933                 }
934                 if (lastChar == '"' || lastChar == '\'') {
935                     // Skip over a quoted attribute value inside the tag.
936                     char quote = lastChar;
937                     while (charIndex > 0 && (lastChar = text.charAt(--charIndex)) != quote) {}
938                 }
939             }
940             // The original '>' wasn't the end of a tag after all.
941             charIndex = initialCharIndex;
942             lastChar = '>';
943             return Character.DIRECTIONALITY_OTHER_NEUTRALS;
944         }
945 
946         /**
947          * Advances charIndex forward through an HTML character entity tag (after the opening
948          * &amp; has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be
949          * best to figure out the actual character and return its dirtype, but this is good enough.
950          */
skipEntityForward()951         private byte skipEntityForward() {
952             while (charIndex < length && (lastChar = text.charAt(charIndex++)) != ';') {}
953             return Character.DIRECTIONALITY_WHITESPACE;
954         }
955 
956         /**
957          * Advances charIndex backward through an HTML character entity tag (after the closing ;
958          * has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be best
959          * to figure out the actual character and return its dirtype, but this is good enough.
960          * If there is no matching &amp;, does not change charIndex and returns
961          * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the ';' that did not start an entity after
962          * all). Nevertheless, the running time for calling skipEntityBackward() in a loop remains
963          * linear in the size of the text, even for a text like ";;;;;;;", because skipTagBackward()
964          * also stops looking for a matching &amp; when it encounters another ;.
965          */
skipEntityBackward()966         private byte skipEntityBackward() {
967             int initialCharIndex = charIndex;
968             while (charIndex > 0) {
969                 lastChar = text.charAt(--charIndex);
970                 if (lastChar == '&') {
971                     return Character.DIRECTIONALITY_WHITESPACE;
972                 }
973                 if (lastChar == ';') {
974                     break;
975                 }
976             }
977             charIndex = initialCharIndex;
978             lastChar = ';';
979             return Character.DIRECTIONALITY_OTHER_NEUTRALS;
980         }
981     }
982 }
983