1 // Copyright 2018 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_INTL_SUPPORT
6 #error Internationalization is expected to be enabled.
7 #endif  // V8_INTL_SUPPORT
8 
9 #include "src/objects/js-collator.h"
10 
11 #include "src/isolate.h"
12 #include "src/objects-inl.h"
13 #include "src/objects/js-collator-inl.h"
14 #include "unicode/coll.h"
15 #include "unicode/locid.h"
16 #include "unicode/strenum.h"
17 #include "unicode/ucol.h"
18 #include "unicode/uloc.h"
19 
20 namespace v8 {
21 namespace internal {
22 
23 namespace {
24 
25 // TODO(gsathya): Consider internalizing the value strings.
CreateDataPropertyForOptions(Isolate * isolate,Handle<JSObject> options,Handle<String> key,const char * value)26 void CreateDataPropertyForOptions(Isolate* isolate, Handle<JSObject> options,
27                                   Handle<String> key, const char* value) {
28   CHECK_NOT_NULL(value);
29   Handle<String> value_str =
30       isolate->factory()->NewStringFromAsciiChecked(value);
31 
32   // This is a brand new JSObject that shouldn't already have the same
33   // key so this shouldn't fail.
34   CHECK(JSReceiver::CreateDataProperty(isolate, options, key, value_str,
35                                        kDontThrow)
36             .FromJust());
37 }
38 
CreateDataPropertyForOptions(Isolate * isolate,Handle<JSObject> options,Handle<String> key,bool value)39 void CreateDataPropertyForOptions(Isolate* isolate, Handle<JSObject> options,
40                                   Handle<String> key, bool value) {
41   Handle<Object> value_obj = isolate->factory()->ToBoolean(value);
42 
43   // This is a brand new JSObject that shouldn't already have the same
44   // key so this shouldn't fail.
45   CHECK(JSReceiver::CreateDataProperty(isolate, options, key, value_obj,
46                                        kDontThrow)
47             .FromJust());
48 }
49 
50 }  // anonymous namespace
51 
52 // static
ResolvedOptions(Isolate * isolate,Handle<JSCollator> collator)53 Handle<JSObject> JSCollator::ResolvedOptions(Isolate* isolate,
54                                              Handle<JSCollator> collator) {
55   Handle<JSObject> options =
56       isolate->factory()->NewJSObject(isolate->object_function());
57 
58   JSCollator::Usage usage = collator->usage();
59   CreateDataPropertyForOptions(isolate, options,
60                                isolate->factory()->usage_string(),
61                                JSCollator::UsageToString(usage));
62 
63   icu::Collator* icu_collator = collator->icu_collator()->raw();
64   CHECK_NOT_NULL(icu_collator);
65 
66   UErrorCode status = U_ZERO_ERROR;
67   bool numeric =
68       icu_collator->getAttribute(UCOL_NUMERIC_COLLATION, status) == UCOL_ON;
69   CHECK(U_SUCCESS(status));
70   CreateDataPropertyForOptions(isolate, options,
71                                isolate->factory()->numeric_string(), numeric);
72 
73   const char* case_first = nullptr;
74   status = U_ZERO_ERROR;
75   switch (icu_collator->getAttribute(UCOL_CASE_FIRST, status)) {
76     case UCOL_LOWER_FIRST:
77       case_first = "lower";
78       break;
79     case UCOL_UPPER_FIRST:
80       case_first = "upper";
81       break;
82     default:
83       case_first = "false";
84   }
85   CHECK(U_SUCCESS(status));
86   CreateDataPropertyForOptions(
87       isolate, options, isolate->factory()->caseFirst_string(), case_first);
88 
89   const char* sensitivity = nullptr;
90   status = U_ZERO_ERROR;
91   switch (icu_collator->getAttribute(UCOL_STRENGTH, status)) {
92     case UCOL_PRIMARY: {
93       CHECK(U_SUCCESS(status));
94       status = U_ZERO_ERROR;
95       // case level: true + s1 -> case, s1 -> base.
96       if (UCOL_ON == icu_collator->getAttribute(UCOL_CASE_LEVEL, status)) {
97         sensitivity = "case";
98       } else {
99         sensitivity = "base";
100       }
101       CHECK(U_SUCCESS(status));
102       break;
103     }
104     case UCOL_SECONDARY:
105       sensitivity = "accent";
106       break;
107     case UCOL_TERTIARY:
108       sensitivity = "variant";
109       break;
110     case UCOL_QUATERNARY:
111       // We shouldn't get quaternary and identical from ICU, but if we do
112       // put them into variant.
113       sensitivity = "variant";
114       break;
115     default:
116       sensitivity = "variant";
117   }
118   CHECK(U_SUCCESS(status));
119   CreateDataPropertyForOptions(
120       isolate, options, isolate->factory()->sensitivity_string(), sensitivity);
121 
122   status = U_ZERO_ERROR;
123   bool ignore_punctuation = icu_collator->getAttribute(UCOL_ALTERNATE_HANDLING,
124                                                        status) == UCOL_SHIFTED;
125   CHECK(U_SUCCESS(status));
126   CreateDataPropertyForOptions(isolate, options,
127                                isolate->factory()->ignorePunctuation_string(),
128                                ignore_punctuation);
129 
130   status = U_ZERO_ERROR;
131   const char* collation;
132   std::unique_ptr<icu::StringEnumeration> collation_values(
133       icu_collator->getKeywordValues("co", status));
134   // Collation wasn't provided as a keyword to icu, use default.
135   if (status == U_ILLEGAL_ARGUMENT_ERROR) {
136     CreateDataPropertyForOptions(
137         isolate, options, isolate->factory()->collation_string(), "default");
138   } else {
139     CHECK(U_SUCCESS(status));
140     CHECK_NOT_NULL(collation_values.get());
141 
142     int32_t length;
143     status = U_ZERO_ERROR;
144     collation = collation_values->next(&length, status);
145     CHECK(U_SUCCESS(status));
146 
147     // There has to be at least one value.
148     CHECK_NOT_NULL(collation);
149     CreateDataPropertyForOptions(
150         isolate, options, isolate->factory()->collation_string(), collation);
151 
152     status = U_ZERO_ERROR;
153     collation_values->reset(status);
154     CHECK(U_SUCCESS(status));
155   }
156 
157   status = U_ZERO_ERROR;
158   icu::Locale icu_locale = icu_collator->getLocale(ULOC_VALID_LOCALE, status);
159   CHECK(U_SUCCESS(status));
160 
161   char result[ULOC_FULLNAME_CAPACITY];
162   status = U_ZERO_ERROR;
163   uloc_toLanguageTag(icu_locale.getName(), result, ULOC_FULLNAME_CAPACITY,
164                      FALSE, &status);
165   CHECK(U_SUCCESS(status));
166 
167   CreateDataPropertyForOptions(isolate, options,
168                                isolate->factory()->locale_string(), result);
169 
170   return options;
171 }
172 
173 namespace {
174 
LookupUnicodeExtensions(const icu::Locale & icu_locale,const std::set<std::string> & relevant_keys)175 std::map<std::string, std::string> LookupUnicodeExtensions(
176     const icu::Locale& icu_locale, const std::set<std::string>& relevant_keys) {
177   std::map<std::string, std::string> extensions;
178 
179   UErrorCode status = U_ZERO_ERROR;
180   std::unique_ptr<icu::StringEnumeration> keywords(
181       icu_locale.createKeywords(status));
182   if (U_FAILURE(status)) return extensions;
183 
184   if (!keywords) return extensions;
185   char value[ULOC_FULLNAME_CAPACITY];
186 
187   int32_t length;
188   status = U_ZERO_ERROR;
189   for (const char* keyword = keywords->next(&length, status);
190        keyword != nullptr; keyword = keywords->next(&length, status)) {
191     // Ignore failures in ICU and skip to the next keyword.
192     //
193     // This is fine.™
194     if (U_FAILURE(status)) {
195       status = U_ZERO_ERROR;
196       continue;
197     }
198 
199     icu_locale.getKeywordValue(keyword, value, ULOC_FULLNAME_CAPACITY, status);
200 
201     // Ignore failures in ICU and skip to the next keyword.
202     //
203     // This is fine.™
204     if (U_FAILURE(status)) {
205       status = U_ZERO_ERROR;
206       continue;
207     }
208 
209     const char* bcp47_key = uloc_toUnicodeLocaleKey(keyword);
210 
211     // Ignore keywords that we don't recognize - spec allows that.
212     if (bcp47_key && (relevant_keys.find(bcp47_key) != relevant_keys.end())) {
213       const char* bcp47_value = uloc_toUnicodeLocaleType(bcp47_key, value);
214       extensions.insert(
215           std::pair<std::string, std::string>(bcp47_key, bcp47_value));
216     }
217   }
218 
219   return extensions;
220 }
221 
SetCaseFirstOption(icu::Collator * icu_collator,const char * value)222 void SetCaseFirstOption(icu::Collator* icu_collator, const char* value) {
223   CHECK_NOT_NULL(icu_collator);
224   CHECK_NOT_NULL(value);
225   UErrorCode status = U_ZERO_ERROR;
226   if (strcmp(value, "upper") == 0) {
227     icu_collator->setAttribute(UCOL_CASE_FIRST, UCOL_UPPER_FIRST, status);
228   } else if (strcmp(value, "lower") == 0) {
229     icu_collator->setAttribute(UCOL_CASE_FIRST, UCOL_LOWER_FIRST, status);
230   } else {
231     icu_collator->setAttribute(UCOL_CASE_FIRST, UCOL_OFF, status);
232   }
233   CHECK(U_SUCCESS(status));
234 }
235 
236 }  // anonymous namespace
237 
238 // static
InitializeCollator(Isolate * isolate,Handle<JSCollator> collator,Handle<Object> locales,Handle<Object> options_obj)239 MaybeHandle<JSCollator> JSCollator::InitializeCollator(
240     Isolate* isolate, Handle<JSCollator> collator, Handle<Object> locales,
241     Handle<Object> options_obj) {
242   // 1. Let requestedLocales be ? CanonicalizeLocaleList(locales).
243   Handle<JSObject> requested_locales;
244   ASSIGN_RETURN_ON_EXCEPTION(isolate, requested_locales,
245                              Intl::CanonicalizeLocaleListJS(isolate, locales),
246                              JSCollator);
247 
248   // 2. If options is undefined, then
249   if (options_obj->IsUndefined(isolate)) {
250     // 2. a. Let options be ObjectCreate(null).
251     options_obj = isolate->factory()->NewJSObjectWithNullProto();
252   } else {
253     // 3. Else
254     // 3. a. Let options be ? ToObject(options).
255     ASSIGN_RETURN_ON_EXCEPTION(
256         isolate, options_obj,
257         Object::ToObject(isolate, options_obj, "Intl.Collator"), JSCollator);
258   }
259 
260   // At this point, options_obj can either be a JSObject or a JSProxy only.
261   Handle<JSReceiver> options = Handle<JSReceiver>::cast(options_obj);
262 
263   // 4. Let usage be ? GetOption(options, "usage", "string", « "sort",
264   // "search" », "sort").
265   std::vector<const char*> values = {"sort", "search"};
266   std::unique_ptr<char[]> usage_str = nullptr;
267   JSCollator::Usage usage = JSCollator::Usage::SORT;
268   Maybe<bool> found_usage = Intl::GetStringOption(
269       isolate, options, "usage", values, "Intl.Collator", &usage_str);
270   MAYBE_RETURN(found_usage, MaybeHandle<JSCollator>());
271 
272   if (found_usage.FromJust()) {
273     DCHECK_NOT_NULL(usage_str.get());
274     if (strcmp(usage_str.get(), "search") == 0) {
275       usage = JSCollator::Usage::SEARCH;
276     }
277   }
278 
279   // 5. Set collator.[[Usage]] to usage.
280   collator->set_usage(usage);
281 
282   // 6. If usage is "sort", then
283   //    a. Let localeData be %Collator%.[[SortLocaleData]].
284   // 7. Else,
285   //    a. Let localeData be %Collator%.[[SearchLocaleData]].
286   //
287   // The above two spec operations aren't required, the Intl spec is
288   // crazy. See https://github.com/tc39/ecma402/issues/256
289 
290   // TODO(gsathya): This is currently done as part of the
291   // Intl::ResolveLocale call below. Fix this once resolveLocale is
292   // changed to not do the lookup.
293   //
294   // 9. Let matcher be ? GetOption(options, "localeMatcher", "string",
295   // « "lookup", "best fit" », "best fit").
296   // 10. Set opt.[[localeMatcher]] to matcher.
297 
298   // 11. Let numeric be ? GetOption(options, "numeric", "boolean",
299   // undefined, undefined).
300   // 12. If numeric is not undefined, then
301   //    a. Let numeric be ! ToString(numeric).
302   //
303   // Note: We omit the ToString(numeric) operation as it's not
304   // observable. Intl::GetBoolOption returns a Boolean and
305   // ToString(Boolean) is not side-effecting.
306   //
307   // 13. Set opt.[[kn]] to numeric.
308   bool numeric;
309   Maybe<bool> found_numeric = Intl::GetBoolOption(isolate, options, "numeric",
310                                                   "Intl.Collator", &numeric);
311   MAYBE_RETURN(found_numeric, MaybeHandle<JSCollator>());
312 
313   // 14. Let caseFirst be ? GetOption(options, "caseFirst", "string",
314   //     « "upper", "lower", "false" », undefined).
315   // 15. Set opt.[[kf]] to caseFirst.
316   values = {"upper", "lower", "false"};
317   std::unique_ptr<char[]> case_first_str = nullptr;
318   Maybe<bool> found_case_first = Intl::GetStringOption(
319       isolate, options, "caseFirst", values, "Intl.Collator", &case_first_str);
320   MAYBE_RETURN(found_case_first, MaybeHandle<JSCollator>());
321 
322   // The relevant unicode extensions accepted by Collator as specified here:
323   // https://tc39.github.io/ecma402/#sec-intl-collator-internal-slots
324   //
325   // 16. Let relevantExtensionKeys be %Collator%.[[RelevantExtensionKeys]].
326   std::set<std::string> relevant_extension_keys{"co", "kn", "kf"};
327 
328   // We don't pass the relevant_extension_keys to ResolveLocale here
329   // as per the spec.
330   //
331   // In ResolveLocale, the spec makes sure we only pick and use the
332   // relevant extension keys and ignore any other keys. Also, in
333   // ResolveLocale, the spec makes sure that if a given key has both a
334   // value in the options object and an unicode extension value, then
335   // we pick the value provided in the options object.
336   // For example: in the case of `new Intl.Collator('en-u-kn-true', {
337   // numeric: false })` the value `false` is used for the `numeric`
338   // key.
339   //
340   // Instead of performing all this validation in ResolveLocale, we
341   // just perform it inline below. In the future when we port
342   // ResolveLocale to C++, we can make all these validations generic
343   // and move it ResolveLocale.
344   //
345   // 17. Let r be ResolveLocale(%Collator%.[[AvailableLocales]],
346   // requestedLocales, opt, %Collator%.[[RelevantExtensionKeys]],
347   // localeData).
348   // 18. Set collator.[[Locale]] to r.[[locale]].
349   Handle<JSObject> r;
350   ASSIGN_RETURN_ON_EXCEPTION(
351       isolate, r,
352       Intl::ResolveLocale(isolate, "collator", requested_locales, options),
353       JSCollator);
354 
355   Handle<String> locale_with_extension_str =
356       isolate->factory()->NewStringFromStaticChars("localeWithExtension");
357   Handle<Object> locale_with_extension_obj =
358       JSObject::GetDataProperty(r, locale_with_extension_str);
359 
360   // The locale_with_extension has to be a string. Either a user
361   // provided canonicalized string or the default locale.
362   CHECK(locale_with_extension_obj->IsString());
363   Handle<String> locale_with_extension =
364       Handle<String>::cast(locale_with_extension_obj);
365 
366   icu::Locale icu_locale =
367       Intl::CreateICULocale(isolate, locale_with_extension);
368   DCHECK(!icu_locale.isBogus());
369 
370   std::map<std::string, std::string> extensions =
371       LookupUnicodeExtensions(icu_locale, relevant_extension_keys);
372 
373   // 19. Let collation be r.[[co]].
374   //
375   // r.[[co]] is already set as part of the icu::Locale creation as
376   // icu parses unicode extensions and sets the keywords.
377   //
378   // We need to sanitize the keywords based on certain ECMAScript rules.
379   //
380   // As per https://tc39.github.io/ecma402/#sec-intl-collator-internal-slots:
381   // The values "standard" and "search" must not be used as elements
382   // in any [[SortLocaleData]][locale].co and
383   // [[SearchLocaleData]][locale].co list.
384   auto co_extension_it = extensions.find("co");
385   if (co_extension_it != extensions.end()) {
386     const std::string& value = co_extension_it->second;
387     if ((value == "search") || (value == "standard")) {
388       UErrorCode status = U_ZERO_ERROR;
389       icu_locale.setKeywordValue("co", NULL, status);
390       CHECK(U_SUCCESS(status));
391     }
392   }
393 
394   // 20. If collation is null, let collation be "default".
395   // 21. Set collator.[[Collation]] to collation.
396   //
397   // We don't store the collation value as per the above two steps
398   // here. The collation value can be looked up from icu::Collator on
399   // demand, as part of Intl.Collator.prototype.resolvedOptions.
400 
401   UErrorCode status = U_ZERO_ERROR;
402   std::unique_ptr<icu::Collator> icu_collator(
403       icu::Collator::createInstance(icu_locale, status));
404   if (U_FAILURE(status) || icu_collator.get() == nullptr) {
405     status = U_ZERO_ERROR;
406     // Remove extensions and try again.
407     icu::Locale no_extension_locale(icu_locale.getBaseName());
408     icu_collator.reset(
409         icu::Collator::createInstance(no_extension_locale, status));
410 
411     if (U_FAILURE(status) || icu_collator.get() == nullptr) {
412       FATAL("Failed to create ICU collator, are ICU data files missing?");
413     }
414   }
415   DCHECK(U_SUCCESS(status));
416   CHECK_NOT_NULL(icu_collator.get());
417 
418   // 22. If relevantExtensionKeys contains "kn", then
419   //     a. Set collator.[[Numeric]] to ! SameValue(r.[[kn]], "true").
420   //
421   // If the numeric value is passed in through the options object,
422   // then we use it. Otherwise, we check if the numeric value is
423   // passed in through the unicode extensions.
424   status = U_ZERO_ERROR;
425   if (found_numeric.FromJust()) {
426     icu_collator->setAttribute(UCOL_NUMERIC_COLLATION,
427                                numeric ? UCOL_ON : UCOL_OFF, status);
428     CHECK(U_SUCCESS(status));
429   } else {
430     auto kn_extension_it = extensions.find("kn");
431     if (kn_extension_it != extensions.end()) {
432       const std::string& value = kn_extension_it->second;
433 
434       numeric = (value == "true");
435 
436       icu_collator->setAttribute(UCOL_NUMERIC_COLLATION,
437                                  numeric ? UCOL_ON : UCOL_OFF, status);
438       CHECK(U_SUCCESS(status));
439     }
440   }
441 
442   // 23. If relevantExtensionKeys contains "kf", then
443   //     a. Set collator.[[CaseFirst]] to r.[[kf]].
444   //
445   // If the caseFirst value is passed in through the options object,
446   // then we use it. Otherwise, we check if the caseFirst value is
447   // passed in through the unicode extensions.
448   if (found_case_first.FromJust()) {
449     const char* case_first_cstr = case_first_str.get();
450     SetCaseFirstOption(icu_collator.get(), case_first_cstr);
451   } else {
452     auto kf_extension_it = extensions.find("kf");
453     if (kf_extension_it != extensions.end()) {
454       const std::string& value = kf_extension_it->second;
455       SetCaseFirstOption(icu_collator.get(), value.c_str());
456     }
457   }
458 
459   // Normalization is always on, by the spec. We are free to optimize
460   // if the strings are already normalized (but we don't have a way to tell
461   // that right now).
462   status = U_ZERO_ERROR;
463   icu_collator->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
464   CHECK(U_SUCCESS(status));
465 
466   // 24. Let sensitivity be ? GetOption(options, "sensitivity",
467   // "string", « "base", "accent", "case", "variant" », undefined).
468   values = {"base", "accent", "case", "variant"};
469   std::unique_ptr<char[]> sensitivity_str = nullptr;
470   Maybe<bool> found_sensitivity =
471       Intl::GetStringOption(isolate, options, "sensitivity", values,
472                             "Intl.Collator", &sensitivity_str);
473   MAYBE_RETURN(found_sensitivity, MaybeHandle<JSCollator>());
474 
475   // 25. If sensitivity is undefined, then
476   if (!found_sensitivity.FromJust()) {
477     // 25. a. If usage is "sort", then
478     if (usage == Usage::SORT) {
479       // 25. a. i. Let sensitivity be "variant".
480       // 26. Set collator.[[Sensitivity]] to sensitivity.
481       icu_collator->setStrength(icu::Collator::TERTIARY);
482     }
483   } else {
484     DCHECK(found_sensitivity.FromJust());
485     const char* sensitivity_cstr = sensitivity_str.get();
486     DCHECK_NOT_NULL(sensitivity_cstr);
487 
488     // 26. Set collator.[[Sensitivity]] to sensitivity.
489     if (strcmp(sensitivity_cstr, "base") == 0) {
490       icu_collator->setStrength(icu::Collator::PRIMARY);
491     } else if (strcmp(sensitivity_cstr, "accent") == 0) {
492       icu_collator->setStrength(icu::Collator::SECONDARY);
493     } else if (strcmp(sensitivity_cstr, "case") == 0) {
494       icu_collator->setStrength(icu::Collator::PRIMARY);
495       status = U_ZERO_ERROR;
496       icu_collator->setAttribute(UCOL_CASE_LEVEL, UCOL_ON, status);
497       CHECK(U_SUCCESS(status));
498     } else {
499       DCHECK_EQ(0, strcmp(sensitivity_cstr, "variant"));
500       icu_collator->setStrength(icu::Collator::TERTIARY);
501     }
502   }
503 
504   // 27.Let ignorePunctuation be ? GetOption(options,
505   // "ignorePunctuation", "boolean", undefined, false).
506   bool ignore_punctuation;
507   Maybe<bool> found_ignore_punctuation =
508       Intl::GetBoolOption(isolate, options, "ignorePunctuation",
509                           "Intl.Collator", &ignore_punctuation);
510   MAYBE_RETURN(found_ignore_punctuation, MaybeHandle<JSCollator>());
511 
512   // 28. Set collator.[[IgnorePunctuation]] to ignorePunctuation.
513   if (found_ignore_punctuation.FromJust() && ignore_punctuation) {
514     status = U_ZERO_ERROR;
515     icu_collator->setAttribute(UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, status);
516     CHECK(U_SUCCESS(status));
517   }
518 
519   Handle<Managed<icu::Collator>> managed_collator =
520       Managed<icu::Collator>::FromUniquePtr(isolate, 0,
521                                             std::move(icu_collator));
522   collator->set_icu_collator(*managed_collator);
523 
524   // 29. Return collator.
525   return collator;
526 }
527 
528 // static
UsageToString(Usage usage)529 const char* JSCollator::UsageToString(Usage usage) {
530   switch (usage) {
531     case Usage::SORT:
532       return "sort";
533     case Usage::SEARCH:
534       return "search";
535     case Usage::COUNT:
536       UNREACHABLE();
537   }
538 }
539 
540 }  // namespace internal
541 }  // namespace v8
542