1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/i18n/rtl.h"
6 
7 #include <stddef.h>
8 #include <stdint.h>
9 
10 #include <algorithm>
11 
12 #include "base/command_line.h"
13 #include "base/files/file_path.h"
14 #include "base/i18n/base_i18n_switches.h"
15 #include "base/logging.h"
16 #include "base/macros.h"
17 #include "base/strings/string_split.h"
18 #include "base/strings/string_util.h"
19 #include "base/strings/sys_string_conversions.h"
20 #include "base/strings/utf_string_conversions.h"
21 #include "build/build_config.h"
22 #include "third_party/icu/source/common/unicode/locid.h"
23 #include "third_party/icu/source/common/unicode/uchar.h"
24 #include "third_party/icu/source/common/unicode/uscript.h"
25 #include "third_party/icu/source/i18n/unicode/coll.h"
26 
27 #if defined(OS_IOS)
28 #include "base/debug/crash_logging.h"
29 #include "base/ios/ios_util.h"
30 #endif
31 
32 namespace {
33 
34 // Extract language, country and variant, but ignore keywords.  For example,
35 // en-US, ca@valencia, ca-ES@valencia.
GetLocaleString(const icu::Locale & locale)36 std::string GetLocaleString(const icu::Locale& locale) {
37   const char* language = locale.getLanguage();
38   const char* country = locale.getCountry();
39   const char* variant = locale.getVariant();
40 
41   std::string result =
42       (language != nullptr && *language != '\0') ? language : "und";
43 
44   if (country != nullptr && *country != '\0') {
45     result += '-';
46     result += country;
47   }
48 
49   if (variant != nullptr && *variant != '\0')
50     result += '@' + base::ToLowerASCII(variant);
51 
52   return result;
53 }
54 
55 // Returns LEFT_TO_RIGHT or RIGHT_TO_LEFT if |character| has strong
56 // directionality, returns UNKNOWN_DIRECTION if it doesn't. Please refer to
57 // http://unicode.org/reports/tr9/ for more information.
GetCharacterDirection(UChar32 character)58 base::i18n::TextDirection GetCharacterDirection(UChar32 character) {
59   static bool has_switch = base::CommandLine::ForCurrentProcess()->HasSwitch(
60       switches::kForceTextDirection);
61   if (has_switch) {
62     base::CommandLine* command_line = base::CommandLine::ForCurrentProcess();
63     std::string force_flag =
64         command_line->GetSwitchValueASCII(switches::kForceTextDirection);
65 
66     if (force_flag == switches::kForceDirectionRTL)
67       return base::i18n::RIGHT_TO_LEFT;
68     if (force_flag == switches::kForceDirectionLTR)
69       return base::i18n::LEFT_TO_RIGHT;
70   }
71   // Now that we have the character, we use ICU in order to query for the
72   // appropriate Unicode BiDi character type.
73   int32_t property = u_getIntPropertyValue(character, UCHAR_BIDI_CLASS);
74   if ((property == U_RIGHT_TO_LEFT) ||
75       (property == U_RIGHT_TO_LEFT_ARABIC) ||
76       (property == U_RIGHT_TO_LEFT_EMBEDDING) ||
77       (property == U_RIGHT_TO_LEFT_OVERRIDE)) {
78     return base::i18n::RIGHT_TO_LEFT;
79   } else if ((property == U_LEFT_TO_RIGHT) ||
80              (property == U_LEFT_TO_RIGHT_EMBEDDING) ||
81              (property == U_LEFT_TO_RIGHT_OVERRIDE)) {
82     return base::i18n::LEFT_TO_RIGHT;
83   }
84   return base::i18n::UNKNOWN_DIRECTION;
85 }
86 
87 }  // namespace
88 
89 namespace base {
90 namespace i18n {
91 
// Represents the locale-specific ICU text direction. UNKNOWN_DIRECTION means
// the value has not been computed yet: ICUIsRTL() fills it in on first use and
// SetICUDefaultLocale() resets it whenever the default locale changes.
static TextDirection g_icu_text_direction = UNKNOWN_DIRECTION;
94 
95 // Convert the ICU default locale to a string.
GetConfiguredLocale()96 std::string GetConfiguredLocale() {
97   return GetLocaleString(icu::Locale::getDefault());
98 }
99 
100 // Convert the ICU canonicalized locale to a string.
GetCanonicalLocale(const std::string & locale)101 std::string GetCanonicalLocale(const std::string& locale) {
102   return GetLocaleString(icu::Locale::createCanonical(locale.c_str()));
103 }
104 
105 // Convert Chrome locale name to ICU locale name
ICULocaleName(const std::string & locale_string)106 std::string ICULocaleName(const std::string& locale_string) {
107   // If not Spanish, just return it.
108   if (locale_string.substr(0, 2) != "es")
109     return locale_string;
110   // Expand es to es-ES.
111   if (LowerCaseEqualsASCII(locale_string, "es"))
112     return "es-ES";
113   // Map es-419 (Latin American Spanish) to es-FOO depending on the system
114   // locale.  If it's es-RR other than es-ES, map to es-RR. Otherwise, map
115   // to es-MX (the most populous in Spanish-speaking Latin America).
116   if (LowerCaseEqualsASCII(locale_string, "es-419")) {
117     const icu::Locale& locale = icu::Locale::getDefault();
118     std::string language = locale.getLanguage();
119     const char* country = locale.getCountry();
120     if (LowerCaseEqualsASCII(language, "es") &&
121       !LowerCaseEqualsASCII(country, "es")) {
122         language += '-';
123         language += country;
124         return language;
125     }
126     return "es-MX";
127   }
128   // Currently, Chrome has only "es" and "es-419", but later we may have
129   // more specific "es-RR".
130   return locale_string;
131 }
132 
SetICUDefaultLocale(const std::string & locale_string)133 void SetICUDefaultLocale(const std::string& locale_string) {
134 #if defined(OS_IOS)
135   static base::debug::CrashKeyString* crash_key_locale =
136       base::debug::AllocateCrashKeyString("icu_locale_input",
137                                           base::debug::CrashKeySize::Size256);
138   base::debug::SetCrashKeyString(crash_key_locale, locale_string);
139 #endif
140   icu::Locale locale(ICULocaleName(locale_string).c_str());
141   UErrorCode error_code = U_ZERO_ERROR;
142   const char* lang = locale.getLanguage();
143   if (lang != nullptr && *lang != '\0') {
144     icu::Locale::setDefault(locale, error_code);
145   } else {
146     LOG(ERROR) << "Failed to set the ICU default locale to " << locale_string
147                << ". Falling back to en-US.";
148     icu::Locale::setDefault(icu::Locale::getUS(), error_code);
149   }
150   g_icu_text_direction = UNKNOWN_DIRECTION;
151 }
152 
IsRTL()153 bool IsRTL() {
154   return ICUIsRTL();
155 }
156 
SetRTLForTesting(bool rtl)157 void SetRTLForTesting(bool rtl) {
158   SetICUDefaultLocale(rtl ? "he" : "en");
159   DCHECK_EQ(rtl, IsRTL());
160 }
161 
ICUIsRTL()162 bool ICUIsRTL() {
163   if (g_icu_text_direction == UNKNOWN_DIRECTION) {
164     const icu::Locale& locale = icu::Locale::getDefault();
165     g_icu_text_direction = GetTextDirectionForLocaleInStartUp(locale.getName());
166   }
167   return g_icu_text_direction == RIGHT_TO_LEFT;
168 }
169 
GetForcedTextDirection()170 TextDirection GetForcedTextDirection() {
171 // On iOS, check for RTL forcing.
172 #if defined(OS_IOS)
173   if (base::ios::IsInForcedRTL())
174     return base::i18n::RIGHT_TO_LEFT;
175 #endif
176 
177   base::CommandLine* command_line = base::CommandLine::ForCurrentProcess();
178   if (command_line->HasSwitch(switches::kForceUIDirection)) {
179     std::string force_flag =
180         command_line->GetSwitchValueASCII(switches::kForceUIDirection);
181 
182     if (force_flag == switches::kForceDirectionLTR)
183       return base::i18n::LEFT_TO_RIGHT;
184 
185     if (force_flag == switches::kForceDirectionRTL)
186       return base::i18n::RIGHT_TO_LEFT;
187   }
188 
189   return base::i18n::UNKNOWN_DIRECTION;
190 }
191 
GetTextDirectionForLocaleInStartUp(const char * locale_name)192 TextDirection GetTextDirectionForLocaleInStartUp(const char* locale_name) {
193   // Check for direction forcing.
194   TextDirection forced_direction = GetForcedTextDirection();
195   if (forced_direction != UNKNOWN_DIRECTION)
196     return forced_direction;
197 
198   // This list needs to be updated in alphabetical order if we add more RTL
199   // locales.
200   static const char kRTLLanguageCodes[][3] = {"ar", "fa", "he", "iw", "ur"};
201   std::vector<StringPiece> locale_split =
202       SplitStringPiece(locale_name, "-_", KEEP_WHITESPACE, SPLIT_WANT_ALL);
203   const StringPiece& language_code = locale_split[0];
204   if (std::binary_search(kRTLLanguageCodes,
205                          kRTLLanguageCodes + arraysize(kRTLLanguageCodes),
206                          language_code))
207     return RIGHT_TO_LEFT;
208   return LEFT_TO_RIGHT;
209 }
210 
GetTextDirectionForLocale(const char * locale_name)211 TextDirection GetTextDirectionForLocale(const char* locale_name) {
212   // Check for direction forcing.
213   TextDirection forced_direction = GetForcedTextDirection();
214   if (forced_direction != UNKNOWN_DIRECTION)
215     return forced_direction;
216 
217   UErrorCode status = U_ZERO_ERROR;
218   ULayoutType layout_dir = uloc_getCharacterOrientation(locale_name, &status);
219   DCHECK(U_SUCCESS(status));
220   // Treat anything other than RTL as LTR.
221   return (layout_dir != ULOC_LAYOUT_RTL) ? LEFT_TO_RIGHT : RIGHT_TO_LEFT;
222 }
223 
GetFirstStrongCharacterDirection(const string16 & text)224 TextDirection GetFirstStrongCharacterDirection(const string16& text) {
225   const UChar* string = text.c_str();
226   size_t length = text.length();
227   size_t position = 0;
228   while (position < length) {
229     UChar32 character;
230     size_t next_position = position;
231     U16_NEXT(string, next_position, length, character);
232     TextDirection direction = GetCharacterDirection(character);
233     if (direction != UNKNOWN_DIRECTION)
234       return direction;
235     position = next_position;
236   }
237   return LEFT_TO_RIGHT;
238 }
239 
GetLastStrongCharacterDirection(const string16 & text)240 TextDirection GetLastStrongCharacterDirection(const string16& text) {
241   const UChar* string = text.c_str();
242   size_t position = text.length();
243   while (position > 0) {
244     UChar32 character;
245     size_t prev_position = position;
246     U16_PREV(string, 0, prev_position, character);
247     TextDirection direction = GetCharacterDirection(character);
248     if (direction != UNKNOWN_DIRECTION)
249       return direction;
250     position = prev_position;
251   }
252   return LEFT_TO_RIGHT;
253 }
254 
GetStringDirection(const string16 & text)255 TextDirection GetStringDirection(const string16& text) {
256   const UChar* string = text.c_str();
257   size_t length = text.length();
258   size_t position = 0;
259 
260   TextDirection result(UNKNOWN_DIRECTION);
261   while (position < length) {
262     UChar32 character;
263     size_t next_position = position;
264     U16_NEXT(string, next_position, length, character);
265     TextDirection direction = GetCharacterDirection(character);
266     if (direction != UNKNOWN_DIRECTION) {
267       if (result != UNKNOWN_DIRECTION && result != direction)
268         return UNKNOWN_DIRECTION;
269       result = direction;
270     }
271     position = next_position;
272   }
273 
274   // Handle the case of a string not containing any strong directionality
275   // characters defaulting to LEFT_TO_RIGHT.
276   if (result == UNKNOWN_DIRECTION)
277     return LEFT_TO_RIGHT;
278 
279   return result;
280 }
281 
282 #if defined(OS_WIN)
AdjustStringForLocaleDirection(string16 * text)283 bool AdjustStringForLocaleDirection(string16* text) {
284   if (!IsRTL() || text->empty())
285     return false;
286 
287   // Marking the string as LTR if the locale is RTL and the string does not
288   // contain strong RTL characters. Otherwise, mark the string as RTL.
289   bool has_rtl_chars = StringContainsStrongRTLChars(*text);
290   if (!has_rtl_chars)
291     WrapStringWithLTRFormatting(text);
292   else
293     WrapStringWithRTLFormatting(text);
294 
295   return true;
296 }
297 
UnadjustStringForLocaleDirection(string16 * text)298 bool UnadjustStringForLocaleDirection(string16* text) {
299   if (!IsRTL() || text->empty())
300     return false;
301 
302   *text = StripWrappingBidiControlCharacters(*text);
303   return true;
304 }
305 #else
AdjustStringForLocaleDirection(string16 * text)306 bool AdjustStringForLocaleDirection(string16* text) {
307   // On OS X & GTK the directionality of a label is determined by the first
308   // strongly directional character.
309   // However, we want to make sure that in an LTR-language-UI all strings are
310   // left aligned and vice versa.
311   // A problem can arise if we display a string which starts with user input.
312   // User input may be of the opposite directionality to the UI. So the whole
313   // string will be displayed in the opposite directionality, e.g. if we want to
314   // display in an LTR UI [such as US English]:
315   //
316   // EMAN_NOISNETXE is now installed.
317   //
318   // Since EXTENSION_NAME begins with a strong RTL char, the label's
319   // directionality will be set to RTL and the string will be displayed visually
320   // as:
321   //
322   // .is now installed EMAN_NOISNETXE
323   //
324   // In order to solve this issue, we prepend an LRM to the string. An LRM is a
325   // strongly directional LTR char.
326   // We also append an LRM at the end, which ensures that we're in an LTR
327   // context.
328 
329   // Unlike Windows, Linux and OS X can correctly display RTL glyphs out of the
330   // box so there is no issue with displaying zero-width bidi control characters
331   // on any system.  Thus no need for the !IsRTL() check here.
332   if (text->empty())
333     return false;
334 
335   bool ui_direction_is_rtl = IsRTL();
336 
337   bool has_rtl_chars = StringContainsStrongRTLChars(*text);
338   if (!ui_direction_is_rtl && has_rtl_chars) {
339     WrapStringWithRTLFormatting(text);
340     text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
341                  kLeftToRightMark);
342     text->push_back(kLeftToRightMark);
343   } else if (ui_direction_is_rtl && has_rtl_chars) {
344     WrapStringWithRTLFormatting(text);
345     text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
346                  kRightToLeftMark);
347     text->push_back(kRightToLeftMark);
348   } else if (ui_direction_is_rtl) {
349     WrapStringWithLTRFormatting(text);
350     text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
351                  kRightToLeftMark);
352     text->push_back(kRightToLeftMark);
353   } else {
354     return false;
355   }
356 
357   return true;
358 }
359 
UnadjustStringForLocaleDirection(string16 * text)360 bool UnadjustStringForLocaleDirection(string16* text) {
361   if (text->empty())
362     return false;
363 
364   size_t begin_index = 0;
365   char16 begin = text->at(begin_index);
366   if (begin == kLeftToRightMark ||
367       begin == kRightToLeftMark) {
368     ++begin_index;
369   }
370 
371   size_t end_index = text->length() - 1;
372   char16 end = text->at(end_index);
373   if (end == kLeftToRightMark ||
374       end == kRightToLeftMark) {
375     --end_index;
376   }
377 
378   string16 unmarked_text =
379       text->substr(begin_index, end_index - begin_index + 1);
380   *text = StripWrappingBidiControlCharacters(unmarked_text);
381   return true;
382 }
383 
384 #endif  // !OS_WIN
385 
EnsureTerminatedDirectionalFormatting(string16 * text)386 void EnsureTerminatedDirectionalFormatting(string16* text) {
387   int count = 0;
388   for (auto c : *text) {
389     if (c == kLeftToRightEmbeddingMark || c == kRightToLeftEmbeddingMark ||
390         c == kLeftToRightOverride || c == kRightToLeftOverride) {
391       ++count;
392     } else if (c == kPopDirectionalFormatting && count > 0) {
393       --count;
394     }
395   }
396   for (int j = 0; j < count; j++)
397     text->push_back(kPopDirectionalFormatting);
398 }
399 
SanitizeUserSuppliedString(string16 * text)400 void SanitizeUserSuppliedString(string16* text) {
401   EnsureTerminatedDirectionalFormatting(text);
402   AdjustStringForLocaleDirection(text);
403 }
404 
StringContainsStrongRTLChars(const string16 & text)405 bool StringContainsStrongRTLChars(const string16& text) {
406   const UChar* string = text.c_str();
407   size_t length = text.length();
408   size_t position = 0;
409   while (position < length) {
410     UChar32 character;
411     size_t next_position = position;
412     U16_NEXT(string, next_position, length, character);
413 
414     // Now that we have the character, we use ICU in order to query for the
415     // appropriate Unicode BiDi character type.
416     int32_t property = u_getIntPropertyValue(character, UCHAR_BIDI_CLASS);
417     if ((property == U_RIGHT_TO_LEFT) || (property == U_RIGHT_TO_LEFT_ARABIC))
418       return true;
419 
420     position = next_position;
421   }
422 
423   return false;
424 }
425 
WrapStringWithLTRFormatting(string16 * text)426 void WrapStringWithLTRFormatting(string16* text) {
427   if (text->empty())
428     return;
429 
430   // Inserting an LRE (Left-To-Right Embedding) mark as the first character.
431   text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
432                kLeftToRightEmbeddingMark);
433 
434   // Inserting a PDF (Pop Directional Formatting) mark as the last character.
435   text->push_back(kPopDirectionalFormatting);
436 }
437 
WrapStringWithRTLFormatting(string16 * text)438 void WrapStringWithRTLFormatting(string16* text) {
439   if (text->empty())
440     return;
441 
442   // Inserting an RLE (Right-To-Left Embedding) mark as the first character.
443   text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
444                kRightToLeftEmbeddingMark);
445 
446   // Inserting a PDF (Pop Directional Formatting) mark as the last character.
447   text->push_back(kPopDirectionalFormatting);
448 }
449 
WrapPathWithLTRFormatting(const FilePath & path,string16 * rtl_safe_path)450 void WrapPathWithLTRFormatting(const FilePath& path,
451                                string16* rtl_safe_path) {
452   // Wrap the overall path with LRE-PDF pair which essentialy marks the
453   // string as a Left-To-Right string.
454   // Inserting an LRE (Left-To-Right Embedding) mark as the first character.
455   rtl_safe_path->push_back(kLeftToRightEmbeddingMark);
456 #if defined(OS_MACOSX)
457     rtl_safe_path->append(UTF8ToUTF16(path.value()));
458 #elif defined(OS_WIN)
459     rtl_safe_path->append(path.value());
460 #else  // defined(OS_POSIX) && !defined(OS_MACOSX)
461     std::wstring wide_path = base::SysNativeMBToWide(path.value());
462     rtl_safe_path->append(WideToUTF16(wide_path));
463 #endif
464   // Inserting a PDF (Pop Directional Formatting) mark as the last character.
465   rtl_safe_path->push_back(kPopDirectionalFormatting);
466 }
467 
// Returns |text| prepared for display as LTR. The string is wrapped with LTR
// formatting whenever the UI is RTL (it may be appended to an RTL string), and
// also in an LTR UI when its first strong character is RTL. Otherwise |text|
// is returned as is.
string16 GetDisplayStringInLTRDirectionality(const string16& text) {
  const bool needs_wrapping =
      IsRTL() || GetFirstStrongCharacterDirection(text) == RIGHT_TO_LEFT;
  if (!needs_wrapping)
    return text;

  string16 wrapped(text);
  WrapStringWithLTRFormatting(&wrapped);
  return wrapped;
}
478 
// Returns a copy of |text| without its first character if that character is
// an embedding or override mark, and without its last character if that
// character is a pop-directional-formatting mark. The two removals are
// independent of each other. An empty |text| is returned unchanged.
string16 StripWrappingBidiControlCharacters(const string16& text) {
  if (text.empty())
    return text;

  // [first, past_last) is the range that is kept.
  size_t first = 0;
  size_t past_last = text.length();

  const char16 leading = text[0];
  const bool starts_with_control = leading == kLeftToRightEmbeddingMark ||
                                   leading == kRightToLeftEmbeddingMark ||
                                   leading == kLeftToRightOverride ||
                                   leading == kRightToLeftOverride;
  if (starts_with_control)
    first = 1;

  if (text[past_last - 1] == kPopDirectionalFormatting)
    --past_last;

  return text.substr(first, past_last - first);
}
494 
495 }  // namespace i18n
496 }  // namespace base
497