1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package androidx.core.text;
18 
19 import static androidx.core.text.TextDirectionHeuristicsCompat.FIRSTSTRONG_LTR;
20 
21 import android.text.SpannableStringBuilder;
22 
23 import androidx.core.view.ViewCompat;
24 
25 import java.util.Locale;
26 
27 /**
28  * Utility class for formatting text for display in a potentially opposite-directionality context
29  * without garbling. The directionality of the context is set at formatter creation and the
30  * directionality of the text can be either estimated or passed in when known. Provides the
31  * following functionality:
32  * <p>
33  * 1. Bidi Wrapping
34  * When text in one language is mixed into a document in another, opposite-directionality language,
35  * e.g. when an English business name is embedded in a Hebrew web page, both the inserted string
36  * and the text surrounding it may be displayed incorrectly unless the inserted string is explicitly
37  * separated from the surrounding text in a "wrapper" that:
38  * <p>
39  * - Declares its directionality so that the string is displayed correctly. This can be done in
40  *   Unicode bidi formatting codes by {@link #unicodeWrap} and similar methods.
41  * <p>
42  * - Isolates the string's directionality, so it does not unduly affect the surrounding content.
43  *   Currently, this can only be done using invisible Unicode characters of the same direction as
44  *   the context (LRM or RLM) in addition to the directionality declaration above, thus "resetting"
45  *   the directionality to that of the context. The "reset" may need to be done at both ends of the
46  *   string. Without "reset" after the string, the string will "stick" to a number or logically
47  *   separate opposite-direction text that happens to follow it in-line (even if separated by
48  *   neutral content like spaces and punctuation). Without "reset" before the string, the same can
49  *   happen there, but only with more opposite-direction text, not a number. One approach is to
50  *   "reset" the direction only after each string, on the theory that if the preceding opposite-
51  *   direction text is itself bidi-wrapped, the "reset" after it will prevent the sticking. (Doing
52  *   the "reset" only before each string definitely does not work because we do not want to require
53  *   bidi-wrapping numbers, and a bidi-wrapped opposite-direction string could be followed by a
54  *   number.) Still, the safest policy is to do the "reset" on both ends of each string, since RTL
55  *   message translations often contain untranslated Latin-script brand names and technical terms,
56  *   and one of these can be followed by a bidi-wrapped inserted value. On the other hand, when one
57  *   has such a message, it is best to do the "reset" manually in the message translation itself,
58  *   since the message's opposite-direction text could be followed by an inserted number, which we
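 *   would not bidi-wrap anyway. Thus, "reset" only after the string is a defensible policy. Note,
 *   however, that a formatter built with default options "resets" on both ends of the string; see
 *   {@link Builder#stereoReset(boolean)} to turn the "reset" before the string off. In an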
60  *   alternative to "reset", recent additions to the HTML, CSS, and Unicode standards allow the
61  *   isolation to be part of the directionality declaration. This form of isolation is better than
62  *   "reset" because it takes less space, does not require knowing the context directionality, has a
63  *   gentler effect than "reset", and protects both ends of the string. However, we do not yet allow
64  *   using it because required platforms do not yet support it.
65  * <p>
66  * Providing these wrapping services is the basic purpose of the bidi formatter.
67  * <p>
68  * 2. Directionality estimation
69  * How does one know whether a string about to be inserted into surrounding text has the same
70  * directionality? Well, in many cases, one knows that this must be the case when writing the code
71  * doing the insertion, e.g. when a localized message is inserted into a localized page. In such
72  * cases there is no need to involve the bidi formatter at all. In some other cases, it need not be
73  * the same as the context, but is either constant (e.g. urls are always LTR) or otherwise known.
74  * In the remaining cases, e.g. when the string is user-entered or comes from a database, the
75  * language of the string (and thus its directionality) is not known a priori, and must be
76  * estimated at run-time. The bidi formatter can do this automatically using the default
77  * first-strong estimation algorithm. It can also be configured to use a custom directionality
78  * estimation object.
79  */
80 public final class BidiFormatter {
81 
82     /**
83      * The default text direction heuristic.
84      */
85     static final TextDirectionHeuristicCompat DEFAULT_TEXT_DIRECTION_HEURISTIC = FIRSTSTRONG_LTR;
86 
87     /**
88      * Unicode "Left-To-Right Embedding" (LRE) character.
89      */
90     private static final char LRE = '\u202A';
91 
92     /**
93      * Unicode "Right-To-Left Embedding" (RLE) character.
94      */
95     private static final char RLE = '\u202B';
96 
97     /**
98      * Unicode "Pop Directional Formatting" (PDF) character.
99      */
100     private static final char PDF = '\u202C';
101 
102     /**
103      *  Unicode "Left-To-Right Mark" (LRM) character.
104      */
105     private static final char LRM = '\u200E';
106 
107     /*
108      * Unicode "Right-To-Left Mark" (RLM) character.
109      */
110     private static final char RLM = '\u200F';
111 
112     /*
113      * String representation of LRM
114      */
115     private static final String LRM_STRING = Character.toString(LRM);
116 
117     /*
118      * String representation of RLM
119      */
120     private static final String RLM_STRING = Character.toString(RLM);
121 
122     /**
123      * Empty string constant.
124      */
125     private static final String EMPTY_STRING = "";
126 
127     /**
128      * A class for building a BidiFormatter with non-default options.
129      */
130     public static final class Builder {
131         private boolean mIsRtlContext;
132         private int mFlags;
133         private TextDirectionHeuristicCompat mTextDirectionHeuristicCompat;
134 
135         /**
136          * Constructor.
137          *
138          */
Builder()139         public Builder() {
140             initialize(isRtlLocale(Locale.getDefault()));
141         }
142 
143         /**
144          * Constructor.
145          *
146          * @param rtlContext Whether the context directionality is RTL.
147          */
Builder(boolean rtlContext)148         public Builder(boolean rtlContext) {
149             initialize(rtlContext);
150         }
151 
152         /**
153          * Constructor.
154          *
155          * @param locale The context locale.
156          */
Builder(Locale locale)157         public Builder(Locale locale) {
158             initialize(isRtlLocale(locale));
159         }
160 
161         /**
162          * Initializes the builder with the given context directionality and default options.
163          *
164          * @param isRtlContext Whether the context is RTL or not.
165          */
initialize(boolean isRtlContext)166         private void initialize(boolean isRtlContext) {
167             mIsRtlContext = isRtlContext;
168             mTextDirectionHeuristicCompat = DEFAULT_TEXT_DIRECTION_HEURISTIC;
169             mFlags = DEFAULT_FLAGS;
170         }
171 
172         /**
173          * Specifies whether the BidiFormatter to be built should also "reset" directionality before
174          * a string being bidi-wrapped, not just after it. The default is true.
175          */
stereoReset(boolean stereoReset)176         public Builder stereoReset(boolean stereoReset) {
177             if (stereoReset) {
178                 mFlags |= FLAG_STEREO_RESET;
179             } else {
180                 mFlags &= ~FLAG_STEREO_RESET;
181             }
182             return this;
183         }
184 
185         /**
186          * Specifies the default directionality estimation algorithm to be used by the BidiFormatter.
187          * By default, uses the first-strong heuristic.
188          *
189          * @param heuristic the {@code TextDirectionHeuristic} to use.
190          * @return the builder itself.
191          */
setTextDirectionHeuristic(TextDirectionHeuristicCompat heuristic)192         public Builder setTextDirectionHeuristic(TextDirectionHeuristicCompat heuristic) {
193             mTextDirectionHeuristicCompat = heuristic;
194             return this;
195         }
196 
getDefaultInstanceFromContext(boolean isRtlContext)197         private static BidiFormatter getDefaultInstanceFromContext(boolean isRtlContext) {
198             return isRtlContext ? DEFAULT_RTL_INSTANCE : DEFAULT_LTR_INSTANCE;
199         }
200 
201         /**
202          * @return A BidiFormatter with the specified options.
203          */
build()204         public BidiFormatter build() {
205             if (mFlags == DEFAULT_FLAGS &&
206                     mTextDirectionHeuristicCompat == DEFAULT_TEXT_DIRECTION_HEURISTIC) {
207                 return getDefaultInstanceFromContext(mIsRtlContext);
208             }
209             return new BidiFormatter(mIsRtlContext, mFlags, mTextDirectionHeuristicCompat);
210         }
211     }
212 
213     //
214     private static final int FLAG_STEREO_RESET = 2;
215     private static final int DEFAULT_FLAGS = FLAG_STEREO_RESET;
216 
217     static final BidiFormatter DEFAULT_LTR_INSTANCE = new BidiFormatter(
218             false /* LTR context */,
219             DEFAULT_FLAGS,
220             DEFAULT_TEXT_DIRECTION_HEURISTIC);
221 
222     static final BidiFormatter DEFAULT_RTL_INSTANCE = new BidiFormatter(
223             true /* RTL context */,
224             DEFAULT_FLAGS,
225             DEFAULT_TEXT_DIRECTION_HEURISTIC);
226 
227     private final boolean mIsRtlContext;
228     private final int mFlags;
229     private final TextDirectionHeuristicCompat mDefaultTextDirectionHeuristicCompat;
230 
231     /**
232      * Factory for creating an instance of BidiFormatter for the default locale directionality.
233      *
234      */
getInstance()235     public static BidiFormatter getInstance() {
236         return new Builder().build();
237     }
238 
239     /**
240      * Factory for creating an instance of BidiFormatter given the context directionality.
241      *
242      * @param rtlContext Whether the context directionality is RTL.
243      */
getInstance(boolean rtlContext)244     public static BidiFormatter getInstance(boolean rtlContext) {
245         return new Builder(rtlContext).build();
246     }
247 
248     /**
249      * Factory for creating an instance of BidiFormatter given the context locale.
250      *
251      * @param locale The context locale.
252      */
getInstance(Locale locale)253     public static BidiFormatter getInstance(Locale locale) {
254         return new Builder(locale).build();
255     }
256 
257     /**
258      * @param isRtlContext Whether the context directionality is RTL or not.
259      * @param flags The option flags.
260      * @param heuristic The default text direction heuristic.
261      */
BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristicCompat heuristic)262     BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristicCompat heuristic) {
263         mIsRtlContext = isRtlContext;
264         mFlags = flags;
265         mDefaultTextDirectionHeuristicCompat = heuristic;
266     }
267 
268     /**
269      * @return Whether the context directionality is RTL
270      */
isRtlContext()271     public boolean isRtlContext() {
272         return mIsRtlContext;
273     }
274 
275     /**
276      * @return Whether directionality "reset" should also be done before a string being
277      * bidi-wrapped, not just after it.
278      */
getStereoReset()279     public boolean getStereoReset() {
280         return (mFlags & FLAG_STEREO_RESET) != 0;
281     }
282 
283     /**
284      * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
285      * overall or the exit directionality of a given CharSequence is opposite to the context
286      * directionality. Putting this after the CharSequence (including its directionality
287      * declaration wrapping) prevents it from "sticking" to other opposite-directionality text or a
288      * number appearing after it inline with only neutral content in between. Otherwise returns
289      * the empty string. While the exit directionality is determined by scanning the end of the
290      * CharSequence, the overall directionality is given explicitly by a heuristic to estimate the
291      * {@code str}'s directionality.
292      *
293      * @param str CharSequence after which the mark may need to appear.
294      * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
295      *                  directionality.
296      * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
297      *     else, the empty .
298      */
markAfter(CharSequence str, TextDirectionHeuristicCompat heuristic)299     private String markAfter(CharSequence str, TextDirectionHeuristicCompat heuristic) {
300         final boolean isRtl = heuristic.isRtl(str, 0, str.length());
301         // getExitDir() is called only if needed (short-circuit).
302         if (!mIsRtlContext && (isRtl || getExitDir(str) == DIR_RTL)) {
303             return LRM_STRING;
304         }
305         if (mIsRtlContext && (!isRtl || getExitDir(str) == DIR_LTR)) {
306             return RLM_STRING;
307         }
308         return EMPTY_STRING;
309     }
310 
311     /**
312      * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
313      * overall or the entry directionality of a given CharSequence is opposite to the context
314      * directionality. Putting this before the CharSequence (including its directionality
315      * declaration wrapping) prevents it from "sticking" to other opposite-directionality text
316      * appearing before it inline with only neutral content in between. Otherwise returns the
317      * empty string. While the entry directionality is determined by scanning the beginning of the
318      * CharSequence, the overall directionality is given explicitly by a heuristic to estimate the
319      * {@code str}'s directionality.
320      *
321      * @param str CharSequence before which the mark may need to appear.
322      * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
323      *                  directionality.
324      * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
325      *     else, the empty string.
326      */
markBefore(CharSequence str, TextDirectionHeuristicCompat heuristic)327     private String markBefore(CharSequence str, TextDirectionHeuristicCompat heuristic) {
328         final boolean isRtl = heuristic.isRtl(str, 0, str.length());
329         // getEntryDir() is called only if needed (short-circuit).
330         if (!mIsRtlContext && (isRtl || getEntryDir(str) == DIR_RTL)) {
331             return LRM_STRING;
332         }
333         if (mIsRtlContext && (!isRtl || getEntryDir(str) == DIR_LTR)) {
334             return RLM_STRING;
335         }
336         return EMPTY_STRING;
337     }
338 
339     /**
340      * Estimates the directionality of a string using the default text direction heuristic.
341      *
342      * @param str String whose directionality is to be estimated.
343      * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns
344      *          false.
345      */
isRtl(String str)346     public boolean isRtl(String str) {
347         return isRtl((CharSequence) str);
348     }
349 
350     /**
351      * Operates like {@link #isRtl(String)}, but takes a CharSequence instead of a string.
352      *
353      * @param str CharSequence whose directionality is to be estimated.
354      * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns
355      *          false.
356      */
isRtl(CharSequence str)357     public boolean isRtl(CharSequence str) {
358         return mDefaultTextDirectionHeuristicCompat.isRtl(str, 0, str.length());
359     }
360 
361     /**
362      * Formats a string of given directionality for use in plain-text output of the context
363      * directionality, so an opposite-directionality string is neither garbled nor garbles its
364      * surroundings. This makes use of Unicode bidi formatting characters.
365      * <p>
366      * The algorithm: In case the given directionality doesn't match the context directionality, wraps
367      * the string with Unicode bidi formatting characters: RLE+{@code str}+PDF for RTL text, or
368      * LRE+{@code str}+PDF for LTR text.
369      * <p>
370      * If {@code isolate}, directionally isolates the string so that it does not garble its
371      * surroundings. Currently, this is done by "resetting" the directionality after the string by
372      * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when
373      * either the overall directionality or the exit directionality of the string is opposite to
374      * that of the context. Unless the formatter was built using
375      * {@link Builder#stereoReset(boolean)} with a {@code false} argument, also prepends a Unicode
376      * bidi mark matching the context directionality when either the overall directionality or the
377      * entry directionality of the string is opposite to that of the context. Note that as opposed
378      * to the overall directionality, the entry and exit directionalities are determined from the
379      * string itself.
380      * <p>
381      * Does *not* do HTML-escaping.
382      *
383      * @param str The input string.
384      * @param heuristic The algorithm to be used to estimate the string's overall direction.
385      * @param isolate Whether to directionally isolate the string to prevent it from garbling the
386      *     content around it
387      * @return Input string after applying the above processing. {@code null} if {@code str} is
388      *     {@code null}.
389      */
unicodeWrap(String str, TextDirectionHeuristicCompat heuristic, boolean isolate)390     public String unicodeWrap(String str, TextDirectionHeuristicCompat heuristic, boolean isolate) {
391         if (str == null) return null;
392         return unicodeWrap((CharSequence) str, heuristic, isolate).toString();
393     }
394 
395     /**
396      * Operates like {@link #unicodeWrap(String,
397      * androidx.core.text.TextDirectionHeuristicCompat, boolean)}, but takes a CharSequence
398      * instead of a string
399      *
400      * @param str The input CharSequence.
401      * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction.
402      *        See {@link androidx.core.text.TextDirectionHeuristicsCompat} for pre-defined
403      *        heuristics.
404      * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling
405      *     the content around it
406      * @return Input CharSequence after applying the above processing. {@code null} if {@code str}
407      *     is {@code null}.
408      */
unicodeWrap(CharSequence str, TextDirectionHeuristicCompat heuristic, boolean isolate)409     public CharSequence unicodeWrap(CharSequence str, TextDirectionHeuristicCompat heuristic,
410             boolean isolate) {
411         if (str == null) return null;
412         final boolean isRtl = heuristic.isRtl(str, 0, str.length());
413         SpannableStringBuilder result = new SpannableStringBuilder();
414         if (getStereoReset() && isolate) {
415             result.append(markBefore(str,
416                     isRtl ? TextDirectionHeuristicsCompat.RTL : TextDirectionHeuristicsCompat.LTR));
417         }
418         if (isRtl != mIsRtlContext) {
419             result.append(isRtl ? RLE : LRE);
420             result.append(str);
421             result.append(PDF);
422         } else {
423             result.append(str);
424         }
425         if (isolate) {
426             result.append(markAfter(str,
427                     isRtl ? TextDirectionHeuristicsCompat.RTL : TextDirectionHeuristicsCompat.LTR));
428         }
429         return result;
430     }
431 
432     /**
433      * Operates like {@link #unicodeWrap(String, androidx.core.text.TextDirectionHeuristicCompat, boolean)}, but assumes
434      * {@code isolate} is true.
435      *
436      * @param str The input string.
437      * @param heuristic The algorithm to be used to estimate the string's overall direction.
438      * @return Input string after applying the above processing.
439      */
unicodeWrap(String str, TextDirectionHeuristicCompat heuristic)440     public String unicodeWrap(String str, TextDirectionHeuristicCompat heuristic) {
441         return unicodeWrap(str, heuristic, true /* isolate */);
442     }
443 
444     /**
445      * Operates like {@link #unicodeWrap(CharSequence,
446      * androidx.core.text.TextDirectionHeuristicCompat, boolean)}, but assumes {@code isolate}
447      * is true.
448      *
449      * @param str The input CharSequence.
450      * @param heuristic The algorithm to be used to estimate the CharSequence's overall direction.
451      *        See {@link androidx.core.text.TextDirectionHeuristicsCompat} for pre-defined
452      *        heuristics.
453      * @return Input CharSequence after applying the above processing.
454      */
unicodeWrap(CharSequence str, TextDirectionHeuristicCompat heuristic)455     public CharSequence unicodeWrap(CharSequence str, TextDirectionHeuristicCompat heuristic) {
456         return unicodeWrap(str, heuristic, true /* isolate */);
457     }
458 
459     /**
460      * Operates like {@link #unicodeWrap(String, androidx.core.text.TextDirectionHeuristicCompat, boolean)}, but uses the
461      * formatter's default direction estimation algorithm.
462      *
463      * @param str The input string.
464      * @param isolate Whether to directionally isolate the string to prevent it from garbling the
465      *     content around it
466      * @return Input string after applying the above processing.
467      */
unicodeWrap(String str, boolean isolate)468     public String unicodeWrap(String str, boolean isolate) {
469         return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, isolate);
470     }
471 
472     /**
473      * Operates like {@link #unicodeWrap(CharSequence,
474      * androidx.core.text.TextDirectionHeuristicCompat, boolean)}, but uses the formatter's
475      * default direction estimation algorithm.
476      *
477      * @param str The input CharSequence.
478      * @param isolate Whether to directionally isolate the CharSequence to prevent it from garbling
479      *     the content around it
480      * @return Input CharSequence after applying the above processing.
481      */
unicodeWrap(CharSequence str, boolean isolate)482     public CharSequence unicodeWrap(CharSequence str, boolean isolate) {
483         return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, isolate);
484     }
485 
486     /**
487      * Operates like {@link #unicodeWrap(String, androidx.core.text.TextDirectionHeuristicCompat, boolean)}, but uses the
488      * formatter's default direction estimation algorithm and assumes {@code isolate} is true.
489      *
490      * @param str The input string.
491      * @return Input string after applying the above processing.
492      */
unicodeWrap(String str)493     public String unicodeWrap(String str) {
494         return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, true /* isolate */);
495     }
496 
497     /**
498      * Operates like {@link #unicodeWrap(CharSequence,
499      * androidx.core.text.TextDirectionHeuristicCompat, boolean)}, but uses the formatter's
500      * default direction estimation algorithm and assumes {@code isolate} is true.
501      *
502      * @param str The input CharSequence.
503      * @return Input CharSequence after applying the above processing.
504      */
unicodeWrap(CharSequence str)505     public CharSequence unicodeWrap(CharSequence str) {
506         return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, true /* isolate */);
507     }
508 
509     /**
510      * Helper method to return true if the Locale directionality is RTL.
511      *
512      * @param locale The Locale whose directionality will be checked to be RTL or LTR
513      * @return true if the {@code locale} directionality is RTL. False otherwise.
514      */
isRtlLocale(Locale locale)515     static boolean isRtlLocale(Locale locale) {
516         return (TextUtilsCompat.getLayoutDirectionFromLocale(locale) == ViewCompat.LAYOUT_DIRECTION_RTL);
517     }
518 
519     /**
520      * Enum for directionality type.
521      */
522     private static final int DIR_LTR = -1;
523     private static final int DIR_UNKNOWN = 0;
524     private static final int DIR_RTL = +1;
525 
526     /**
527      * Returns the directionality of the last character with strong directionality in the string, or
528      * DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards from the end of
529      * the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a
530      * strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a
531      * string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. The intended use is to check
532      * whether a logically separate item that starts with a number or a character of the string's
533      * exit directionality and follows this string inline (not counting any neutral characters in
534      * between) would "stick" to it in an opposite-directionality context, thus being displayed in
535      * an incorrect position. An LRM or RLM character (the one of the context's directionality)
536      * between the two will prevent such sticking.
537      *
538      * @param str the string to check.
539      */
getExitDir(CharSequence str)540     private static int getExitDir(CharSequence str) {
541         return new DirectionalityEstimator(str, false /* isHtml */).getExitDir();
542     }
543 
544     /**
545      * Returns the directionality of the first character with strong directionality in the string,
546      * or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
547      * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after
548      * RLE/RLO. The results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF
549      * characters. The intended use is to check whether a logically separate item that ends with a
550      * character of the string's entry directionality and precedes the string inline (not counting
551      * any neutral characters in between) would "stick" to it in an opposite-directionality context,
552      * thus being displayed in an incorrect position. An LRM or RLM character (the one of the
553      * context's directionality) between the two will prevent such sticking.
554      *
555      * @param str the string to check.
556      */
getEntryDir(CharSequence str)557     private static int getEntryDir(CharSequence str) {
558         return new DirectionalityEstimator(str, false /* isHtml */).getEntryDir();
559     }
560 
561     /**
562      * An object that estimates the directionality of a given string by various methods.
563      *
564      */
565     private static class DirectionalityEstimator {
566 
567         // Internal static variables and constants.
568 
569         /**
570          * Size of the bidi character class cache. The results of the Character.getDirectionality()
571          * calls on the lowest DIR_TYPE_CACHE_SIZE codepoints are kept in an array for speed.
572          * The 0x700 value is designed to leave all the European and Near Eastern languages in the
573          * cache. It can be reduced to 0x180, restricting the cache to the Western European
574          * languages.
575          */
576         private static final int DIR_TYPE_CACHE_SIZE = 0x700;
577 
578         /**
579          * The bidi character class cache.
580          */
581         private static final byte DIR_TYPE_CACHE[];
582 
583         static {
584             DIR_TYPE_CACHE = new byte[DIR_TYPE_CACHE_SIZE];
585             for (int i = 0; i < DIR_TYPE_CACHE_SIZE; i++) {
586                 DIR_TYPE_CACHE[i] = Character.getDirectionality(i);
587             }
588         }
589 
590         // Internal instance variables.
591 
592         /**
593          * The text to be scanned.
594          */
595         private final CharSequence text;
596 
597         /**
598          * Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and
599          * entities when looking for the next / preceding dir type.
600          */
601         private final boolean isHtml;
602 
603         /**
604          * The length of the text in chars.
605          */
606         private final int length;
607 
608         /**
609          * The current position in the text.
610          */
611         private int charIndex;
612 
613         /**
614          * The char encountered by the last dirTypeForward or dirTypeBackward call. If it
615          * encountered a supplementary codepoint, this contains a char that is not a valid
616          * codepoint. This is ok, because this member is only used to detect some well-known ASCII
617          * syntax, e.g. "http://" and the beginning of an HTML tag or entity.
618          */
619         private char lastChar;
620 
621         /**
622          * Constructor.
623          *
624          * @param text The string to scan.
625          * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over
626          *     tags and entities.
627          */
DirectionalityEstimator(CharSequence text, boolean isHtml)628         DirectionalityEstimator(CharSequence text, boolean isHtml) {
629             this.text = text;
630             this.isHtml = isHtml;
631             length = text.length();
632         }
633 
634         /**
635          * Returns the directionality of the first character with strong directionality in the
636          * string, or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
637          * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL
638          * after RLE/RLO. The results are undefined for a string containing unbalanced
639          * LRE/RLE/LRO/RLO/PDF characters.
640          */
getEntryDir()641         int getEntryDir() {
642             // The reason for this method name, as opposed to getFirstStrongDir(), is that
643             // "first strong" is a commonly used description of Unicode's estimation algorithm,
644             // but the two must treat formatting characters quite differently. Thus, we are staying
645             // away from both "first" and "last" in these method names to avoid confusion.
646             charIndex = 0;
647             int embeddingLevel = 0;
648             int embeddingLevelDir = DIR_UNKNOWN;
649             int firstNonEmptyEmbeddingLevel = 0;
650             while (charIndex < length && firstNonEmptyEmbeddingLevel == 0) {
651                 switch (dirTypeForward()) {
652                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
653                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
654                         ++embeddingLevel;
655                         embeddingLevelDir = DIR_LTR;
656                         break;
657                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
658                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
659                         ++embeddingLevel;
660                         embeddingLevelDir = DIR_RTL;
661                         break;
662                     case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
663                         --embeddingLevel;
664                         // To restore embeddingLevelDir to its previous value, we would need a
665                         // stack, which we want to avoid. Thus, at this point we do not know the
666                         // current embedding's directionality.
667                         embeddingLevelDir = DIR_UNKNOWN;
668                         break;
669                     case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
670                         break;
671                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
672                         if (embeddingLevel == 0) {
673                             return DIR_LTR;
674                         }
675                         firstNonEmptyEmbeddingLevel = embeddingLevel;
676                         break;
677                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
678                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
679                         if (embeddingLevel == 0) {
680                             return DIR_RTL;
681                         }
682                         firstNonEmptyEmbeddingLevel = embeddingLevel;
683                         break;
684                     default:
685                         firstNonEmptyEmbeddingLevel = embeddingLevel;
686                         break;
687                 }
688             }
689 
690             // We have either found a non-empty embedding or scanned the entire string finding
691             // neither a non-empty embedding nor a strong character outside of an embedding.
692             if (firstNonEmptyEmbeddingLevel == 0) {
693                 // We have not found a non-empty embedding. Thus, the string contains neither a
694                 // non-empty embedding nor a strong character outside of an embedding.
695                 return DIR_UNKNOWN;
696             }
697 
698             // We have found a non-empty embedding.
699             if (embeddingLevelDir != DIR_UNKNOWN) {
700                 // We know the directionality of the non-empty embedding.
701                 return embeddingLevelDir;
702             }
703 
704             // We do not remember the directionality of the non-empty embedding we found. So, we go
705             // backwards to find the start of the non-empty embedding and get its directionality.
706             while (charIndex > 0) {
707                 switch (dirTypeBackward()) {
708                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
709                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
710                         if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
711                             return DIR_LTR;
712                         }
713                         --embeddingLevel;
714                         break;
715                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
716                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
717                         if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
718                             return DIR_RTL;
719                         }
720                         --embeddingLevel;
721                         break;
722                     case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
723                         ++embeddingLevel;
724                         break;
725                 }
726             }
727             // We should never get here.
728             return DIR_UNKNOWN;
729         }
730 
731         /**
732          * Returns the directionality of the last character with strong directionality in the
733          * string, or DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards
734          * from the end of the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its
735          * matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results
736          * are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters.
737          */
getExitDir()738         int getExitDir() {
739             // The reason for this method name, as opposed to getLastStrongDir(), is that "last
740             // strong" sounds like the exact opposite of "first strong", which is a commonly used
741             // description of Unicode's estimation algorithm (getUnicodeDir() above), but the two
742             // must treat formatting characters quite differently. Thus, we are staying away from
743             // both "first" and "last" in these method names to avoid confusion.
744             charIndex = length;
745             int embeddingLevel = 0;
746             int lastNonEmptyEmbeddingLevel = 0;
747             while (charIndex > 0) {
748                 switch (dirTypeBackward()) {
749                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
750                         if (embeddingLevel == 0) {
751                             return DIR_LTR;
752                         }
753                         if (lastNonEmptyEmbeddingLevel == 0) {
754                             lastNonEmptyEmbeddingLevel = embeddingLevel;
755                         }
756                         break;
757                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
758                     case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
759                         if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
760                             return DIR_LTR;
761                         }
762                         --embeddingLevel;
763                         break;
764                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
765                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
766                         if (embeddingLevel == 0) {
767                             return DIR_RTL;
768                         }
769                         if (lastNonEmptyEmbeddingLevel == 0) {
770                             lastNonEmptyEmbeddingLevel = embeddingLevel;
771                         }
772                         break;
773                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
774                     case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
775                         if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
776                             return DIR_RTL;
777                         }
778                         --embeddingLevel;
779                         break;
780                     case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
781                         ++embeddingLevel;
782                         break;
783                     case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
784                         break;
785                     default:
786                         if (lastNonEmptyEmbeddingLevel == 0) {
787                             lastNonEmptyEmbeddingLevel = embeddingLevel;
788                         }
789                         break;
790                 }
791             }
792             return DIR_UNKNOWN;
793         }
794 
795         // Internal methods
796 
797         /**
798          * Gets the bidi character class, i.e. Character.getDirectionality(), of a given char, using
799          * a cache for speed. Not designed for supplementary codepoints, whose results we do not
800          * cache.
801          */
getCachedDirectionality(char c)802         private static byte getCachedDirectionality(char c) {
803             return c < DIR_TYPE_CACHE_SIZE ? DIR_TYPE_CACHE[c] : Character.getDirectionality(c);
804         }
805 
806         /**
807          * Returns the Character.DIRECTIONALITY_... value of the next codepoint and advances
808          * charIndex. If isHtml, and the codepoint is '<' or '&', advances through the tag/entity,
809          * and returns Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to
810          * figure out the actual character, and return its dirtype, but treating it as whitespace is
811          * good enough for our purposes.
812          *
813          * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0.
814          */
dirTypeForward()815         byte dirTypeForward() {
816             lastChar = text.charAt(charIndex);
817             if (Character.isHighSurrogate(lastChar)) {
818                 int codePoint = Character.codePointAt(text, charIndex);
819                 charIndex += Character.charCount(codePoint);
820                 return Character.getDirectionality(codePoint);
821             }
822             charIndex++;
823             byte dirType = getCachedDirectionality(lastChar);
824             if (isHtml) {
825                 // Process tags and entities.
826                 if (lastChar == '<') {
827                     dirType = skipTagForward();
828                 } else if (lastChar == '&') {
829                     dirType = skipEntityForward();
830                 }
831             }
832             return dirType;
833         }
834 
835         /**
836          * Returns the Character.DIRECTIONALITY_... value of the preceding codepoint and advances
837          * charIndex backwards. If isHtml, and the codepoint is the end of a complete HTML tag or
838          * entity, advances over the whole tag/entity and returns
839          * Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to figure out the
840          * actual character, and return its dirtype, but treating it as whitespace is good enough
841          * for our purposes.
842          *
843          * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0.
844          */
dirTypeBackward()845         byte dirTypeBackward() {
846             lastChar = text.charAt(charIndex - 1);
847             if (Character.isLowSurrogate(lastChar)) {
848                 int codePoint = Character.codePointBefore(text, charIndex);
849                 charIndex -= Character.charCount(codePoint);
850                 return Character.getDirectionality(codePoint);
851             }
852             charIndex--;
853             byte dirType = getCachedDirectionality(lastChar);
854             if (isHtml) {
855                 // Process tags and entities.
856                 if (lastChar == '>') {
857                     dirType = skipTagBackward();
858                 } else if (lastChar == ';') {
859                     dirType = skipEntityBackward();
860                 }
861             }
862             return dirType;
863         }
864 
865         /**
866          * Advances charIndex forward through an HTML tag (after the opening &lt; has already been
867          * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching &gt;,
868          * does not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the
869          * &lt; that hadn't been part of a tag after all).
870          */
skipTagForward()871         private byte skipTagForward() {
872             int initialCharIndex = charIndex;
873             while (charIndex < length) {
874                 lastChar = text.charAt(charIndex++);
875                 if (lastChar == '>') {
876                     // The end of the tag.
877                     return Character.DIRECTIONALITY_WHITESPACE;
878                 }
879                 if (lastChar == '"' || lastChar == '\'') {
880                     // Skip over a quoted attribute value inside the tag.
881                     char quote = lastChar;
882                     while (charIndex < length && (lastChar = text.charAt(charIndex++)) != quote) {}
883                 }
884             }
885             // The original '<' wasn't the start of a tag after all.
886             charIndex = initialCharIndex;
887             lastChar = '<';
888             return Character.DIRECTIONALITY_OTHER_NEUTRALS;
889         }
890 
891         /**
892          * Advances charIndex backward through an HTML tag (after the closing &gt; has already been
893          * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching &lt;, does
894          * not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the &gt;
895          * that hadn't been part of a tag after all). Nevertheless, the running time for calling
896          * skipTagBackward() in a loop remains linear in the size of the text, even for a text like
897          * "&gt;&gt;&gt;&gt;", because skipTagBackward() also stops looking for a matching &lt;
898          * when it encounters another &gt;.
899          */
skipTagBackward()900         private byte skipTagBackward() {
901             int initialCharIndex = charIndex;
902             while (charIndex > 0) {
903                 lastChar = text.charAt(--charIndex);
904                 if (lastChar == '<') {
905                     // The start of the tag.
906                     return Character.DIRECTIONALITY_WHITESPACE;
907                 }
908                 if (lastChar == '>') {
909                     break;
910                 }
911                 if (lastChar == '"' || lastChar == '\'') {
912                     // Skip over a quoted attribute value inside the tag.
913                     char quote = lastChar;
914                     while (charIndex > 0 && (lastChar = text.charAt(--charIndex)) != quote) {}
915                 }
916             }
917             // The original '>' wasn't the end of a tag after all.
918             charIndex = initialCharIndex;
919             lastChar = '>';
920             return Character.DIRECTIONALITY_OTHER_NEUTRALS;
921         }
922 
923         /**
924          * Advances charIndex forward through an HTML character entity tag (after the opening
925          * &amp; has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be
926          * best to figure out the actual character and return its dirtype, but this is good enough.
927          */
skipEntityForward()928         private byte skipEntityForward() {
929             while (charIndex < length && (lastChar = text.charAt(charIndex++)) != ';') {}
930             return Character.DIRECTIONALITY_WHITESPACE;
931         }
932 
933         /**
934          * Advances charIndex backward through an HTML character entity tag (after the closing ;
935          * has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be best
936          * to figure out the actual character and return its dirtype, but this is good enough.
937          * If there is no matching &amp;, does not change charIndex and returns
         * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the ';' that did not end an entity after
         * all). Nevertheless, the running time for calling skipEntityBackward() in a loop remains
         * linear in the size of the text, even for a text like ";;;;;;;", because
         * skipEntityBackward() also stops looking for a matching &amp; when it encounters
         * another ;.
942          */
skipEntityBackward()943         private byte skipEntityBackward() {
944             int initialCharIndex = charIndex;
945             while (charIndex > 0) {
946                 lastChar = text.charAt(--charIndex);
947                 if (lastChar == '&') {
948                     return Character.DIRECTIONALITY_WHITESPACE;
949                 }
950                 if (lastChar == ';') {
951                     break;
952                 }
953             }
954             charIndex = initialCharIndex;
955             lastChar = ';';
956             return Character.DIRECTIONALITY_OTHER_NEUTRALS;
957         }
958     }
959 }
960