V8 API Reference, 7.2.502.16 (for Deno 0.2.4)
js-collator.cc
1 // Copyright 2018 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_INTL_SUPPORT
6 #error Internationalization is expected to be enabled.
7 #endif // V8_INTL_SUPPORT
8 
9 #include "src/objects/js-collator.h"
10 
11 #include "src/isolate.h"
12 #include "src/objects-inl.h"
13 #include "src/objects/js-collator-inl.h"
14 #include "unicode/coll.h"
15 #include "unicode/locid.h"
16 #include "unicode/strenum.h"
17 #include "unicode/ucol.h"
18 #include "unicode/uloc.h"
19 
20 namespace v8 {
21 namespace internal {
22 
23 namespace {
24 
// The two values accepted for the "usage" option of Intl.Collator.
enum class Usage {
  SORT,    // "sort"
  SEARCH,  // "search"
};
29 
// The values accepted for the "sensitivity" option of Intl.Collator, plus a
// marker for "option not supplied".
enum class Sensitivity {
  kBase,       // "base"
  kAccent,     // "accent"
  kCase,       // "case"
  kVariant,    // "variant"
  kUndefined,  // The option was absent.
};
37 
38 // TODO(gsathya): Consider internalizing the value strings.
39 void CreateDataPropertyForOptions(Isolate* isolate, Handle<JSObject> options,
40  Handle<String> key, const char* value) {
41  CHECK_NOT_NULL(value);
42  Handle<String> value_str =
43  isolate->factory()->NewStringFromAsciiChecked(value);
44 
45  // This is a brand new JSObject that shouldn't already have the same
46  // key so this shouldn't fail.
47  CHECK(JSReceiver::CreateDataProperty(isolate, options, key, value_str,
48  kDontThrow)
49  .FromJust());
50 }
51 
52 void CreateDataPropertyForOptions(Isolate* isolate, Handle<JSObject> options,
53  Handle<String> key, bool value) {
54  Handle<Object> value_obj = isolate->factory()->ToBoolean(value);
55 
56  // This is a brand new JSObject that shouldn't already have the same
57  // key so this shouldn't fail.
58  CHECK(JSReceiver::CreateDataProperty(isolate, options, key, value_obj,
59  kDontThrow)
60  .FromJust());
61 }
62 
63 void toLanguageTag(const icu::Locale& locale, char* tag) {
64  UErrorCode status = U_ZERO_ERROR;
65  uloc_toLanguageTag(locale.getName(), tag, ULOC_FULLNAME_CAPACITY, FALSE,
66  &status);
67  CHECK(U_SUCCESS(status));
68 }
69 
70 } // anonymous namespace
71 
72 // static
73 Handle<JSObject> JSCollator::ResolvedOptions(Isolate* isolate,
74  Handle<JSCollator> collator) {
75  Handle<JSObject> options =
76  isolate->factory()->NewJSObject(isolate->object_function());
77 
78  icu::Collator* icu_collator = collator->icu_collator()->raw();
79  CHECK_NOT_NULL(icu_collator);
80 
81  UErrorCode status = U_ZERO_ERROR;
82  bool numeric =
83  icu_collator->getAttribute(UCOL_NUMERIC_COLLATION, status) == UCOL_ON;
84  CHECK(U_SUCCESS(status));
85 
86  const char* case_first = nullptr;
87  status = U_ZERO_ERROR;
88  switch (icu_collator->getAttribute(UCOL_CASE_FIRST, status)) {
89  case UCOL_LOWER_FIRST:
90  case_first = "lower";
91  break;
92  case UCOL_UPPER_FIRST:
93  case_first = "upper";
94  break;
95  default:
96  case_first = "false";
97  }
98  CHECK(U_SUCCESS(status));
99 
100  const char* sensitivity = nullptr;
101  status = U_ZERO_ERROR;
102  switch (icu_collator->getAttribute(UCOL_STRENGTH, status)) {
103  case UCOL_PRIMARY: {
104  CHECK(U_SUCCESS(status));
105  status = U_ZERO_ERROR;
106  // case level: true + s1 -> case, s1 -> base.
107  if (UCOL_ON == icu_collator->getAttribute(UCOL_CASE_LEVEL, status)) {
108  sensitivity = "case";
109  } else {
110  sensitivity = "base";
111  }
112  CHECK(U_SUCCESS(status));
113  break;
114  }
115  case UCOL_SECONDARY:
116  sensitivity = "accent";
117  break;
118  case UCOL_TERTIARY:
119  sensitivity = "variant";
120  break;
121  case UCOL_QUATERNARY:
122  // We shouldn't get quaternary and identical from ICU, but if we do
123  // put them into variant.
124  sensitivity = "variant";
125  break;
126  default:
127  sensitivity = "variant";
128  }
129  CHECK(U_SUCCESS(status));
130 
131  status = U_ZERO_ERROR;
132  bool ignore_punctuation = icu_collator->getAttribute(UCOL_ALTERNATE_HANDLING,
133  status) == UCOL_SHIFTED;
134  CHECK(U_SUCCESS(status));
135 
136  status = U_ZERO_ERROR;
137 
138  icu::Locale icu_locale(icu_collator->getLocale(ULOC_VALID_LOCALE, status));
139  CHECK(U_SUCCESS(status));
140 
141  const char* collation = "default";
142  const char* usage = "sort";
143  const char* collation_key = "co";
144  const char* legacy_collation_key = uloc_toLegacyKey(collation_key);
145  DCHECK_NOT_NULL(legacy_collation_key);
146 
147  char bcp47_locale_tag[ULOC_FULLNAME_CAPACITY];
148  char legacy_collation_value[ULOC_FULLNAME_CAPACITY];
149  status = U_ZERO_ERROR;
150  int32_t length =
151  icu_locale.getKeywordValue(legacy_collation_key, legacy_collation_value,
152  ULOC_FULLNAME_CAPACITY, status);
153 
154  if (length > 0 && U_SUCCESS(status)) {
155  const char* collation_value =
156  uloc_toUnicodeLocaleType(collation_key, legacy_collation_value);
157  CHECK_NOT_NULL(collation_value);
158 
159  if (strcmp(collation_value, "search") == 0) {
160  usage = "search";
161 
162  // Search is disallowed as a collation value per spec. Let's
163  // use `default`, instead.
164  //
165  // https://tc39.github.io/ecma402/#sec-properties-of-intl-collator-instances
166  collation = "default";
167 
168  // We clone the icu::Locale because we don't want the
169  // icu_collator to be affected when we remove the collation key
170  // below.
171  icu::Locale new_icu_locale = icu_locale;
172 
173  // The spec forbids the search as a collation value in the
174  // locale tag, so let's filter it out.
175  status = U_ZERO_ERROR;
176  new_icu_locale.setKeywordValue(legacy_collation_key, nullptr, status);
177  CHECK(U_SUCCESS(status));
178 
179  toLanguageTag(new_icu_locale, bcp47_locale_tag);
180  } else {
181  collation = collation_value;
182  toLanguageTag(icu_locale, bcp47_locale_tag);
183  }
184  } else {
185  toLanguageTag(icu_locale, bcp47_locale_tag);
186  }
187 
188  // 5. For each row of Table 2, except the header row, in table order, do
189  // ...
190  // Table 2: Resolved Options of Collator Instances
191  // Internal Slot Property Extension Key
192  // [[Locale] "locale"
193  // [[Usage] "usage"
194  // [[Sensitivity]] "sensitivity"
195  // [[IgnorePunctuation]] "ignorePunctuation"
196  // [[Collation]] "collation"
197  // [[Numeric]] "numeric" kn
198  // [[CaseFirst]] "caseFirst" kf
199  CreateDataPropertyForOptions(
200  isolate, options, isolate->factory()->locale_string(), bcp47_locale_tag);
201  CreateDataPropertyForOptions(isolate, options,
202  isolate->factory()->usage_string(), usage);
203  CreateDataPropertyForOptions(
204  isolate, options, isolate->factory()->sensitivity_string(), sensitivity);
205  CreateDataPropertyForOptions(isolate, options,
206  isolate->factory()->ignorePunctuation_string(),
207  ignore_punctuation);
208  CreateDataPropertyForOptions(
209  isolate, options, isolate->factory()->collation_string(), collation);
210  CreateDataPropertyForOptions(isolate, options,
211  isolate->factory()->numeric_string(), numeric);
212  CreateDataPropertyForOptions(
213  isolate, options, isolate->factory()->caseFirst_string(), case_first);
214  return options;
215 }
216 
217 namespace {
218 
219 Intl::CaseFirst ToCaseFirst(const char* str) {
220  if (strcmp(str, "upper") == 0) return Intl::CaseFirst::kUpper;
221  if (strcmp(str, "lower") == 0) return Intl::CaseFirst::kLower;
222  if (strcmp(str, "false") == 0) return Intl::CaseFirst::kFalse;
223  return Intl::CaseFirst::kUndefined;
224 }
225 
226 UColAttributeValue ToUColAttributeValue(Intl::CaseFirst case_first) {
227  switch (case_first) {
228  case Intl::CaseFirst::kUpper:
229  return UCOL_UPPER_FIRST;
230  case Intl::CaseFirst::kLower:
231  return UCOL_LOWER_FIRST;
232  case Intl::CaseFirst::kFalse:
233  case Intl::CaseFirst::kUndefined:
234  return UCOL_OFF;
235  }
236 }
237 
238 void SetCaseFirstOption(icu::Collator* icu_collator,
239  Intl::CaseFirst case_first) {
240  CHECK_NOT_NULL(icu_collator);
241  UErrorCode status = U_ZERO_ERROR;
242  icu_collator->setAttribute(UCOL_CASE_FIRST, ToUColAttributeValue(case_first),
243  status);
244  CHECK(U_SUCCESS(status));
245 }
246 
247 } // anonymous namespace
248 
249 // static
250 MaybeHandle<JSCollator> JSCollator::Initialize(Isolate* isolate,
251  Handle<JSCollator> collator,
252  Handle<Object> locales,
253  Handle<Object> options_obj) {
254  // 1. Let requestedLocales be ? CanonicalizeLocaleList(locales).
255  Maybe<std::vector<std::string>> maybe_requested_locales =
256  Intl::CanonicalizeLocaleList(isolate, locales);
257  MAYBE_RETURN(maybe_requested_locales, Handle<JSCollator>());
258  std::vector<std::string> requested_locales =
259  maybe_requested_locales.FromJust();
260 
261  // 2. If options is undefined, then
262  if (options_obj->IsUndefined(isolate)) {
263  // 2. a. Let options be ObjectCreate(null).
264  options_obj = isolate->factory()->NewJSObjectWithNullProto();
265  } else {
266  // 3. Else
267  // 3. a. Let options be ? ToObject(options).
268  ASSIGN_RETURN_ON_EXCEPTION(
269  isolate, options_obj,
270  Object::ToObject(isolate, options_obj, "Intl.Collator"), JSCollator);
271  }
272 
273  // At this point, options_obj can either be a JSObject or a JSProxy only.
274  Handle<JSReceiver> options = Handle<JSReceiver>::cast(options_obj);
275 
276  // 4. Let usage be ? GetOption(options, "usage", "string", « "sort",
277  // "search" », "sort").
278  Maybe<Usage> maybe_usage = Intl::GetStringOption<Usage>(
279  isolate, options, "usage", "Intl.Collator", {"sort", "search"},
280  {Usage::SORT, Usage::SEARCH}, Usage::SORT);
281  MAYBE_RETURN(maybe_usage, MaybeHandle<JSCollator>());
282  Usage usage = maybe_usage.FromJust();
283 
284  // 9. Let matcher be ? GetOption(options, "localeMatcher", "string",
285  // « "lookup", "best fit" », "best fit").
286  // 10. Set opt.[[localeMatcher]] to matcher.
287  Maybe<Intl::MatcherOption> maybe_locale_matcher =
288  Intl::GetLocaleMatcher(isolate, options, "Intl.Collator");
289  MAYBE_RETURN(maybe_locale_matcher, MaybeHandle<JSCollator>());
290  Intl::MatcherOption matcher = maybe_locale_matcher.FromJust();
291 
292  // 11. Let numeric be ? GetOption(options, "numeric", "boolean",
293  // undefined, undefined).
294  // 12. If numeric is not undefined, then
295  // a. Let numeric be ! ToString(numeric).
296  //
297  // Note: We omit the ToString(numeric) operation as it's not
298  // observable. Intl::GetBoolOption returns a Boolean and
299  // ToString(Boolean) is not side-effecting.
300  //
301  // 13. Set opt.[[kn]] to numeric.
302  bool numeric;
303  Maybe<bool> found_numeric = Intl::GetBoolOption(isolate, options, "numeric",
304  "Intl.Collator", &numeric);
305  MAYBE_RETURN(found_numeric, MaybeHandle<JSCollator>());
306 
307  // 14. Let caseFirst be ? GetOption(options, "caseFirst", "string",
308  // « "upper", "lower", "false" », undefined).
309  Maybe<Intl::CaseFirst> maybe_case_first =
310  Intl::GetCaseFirst(isolate, options, "Intl.Collator");
311  MAYBE_RETURN(maybe_case_first, MaybeHandle<JSCollator>());
312  Intl::CaseFirst case_first = maybe_case_first.FromJust();
313 
314  // The relevant unicode extensions accepted by Collator as specified here:
315  // https://tc39.github.io/ecma402/#sec-intl-collator-internal-slots
316  //
317  // 16. Let relevantExtensionKeys be %Collator%.[[RelevantExtensionKeys]].
318  std::set<std::string> relevant_extension_keys{"co", "kn", "kf"};
319 
320  // 17. Let r be ResolveLocale(%Collator%.[[AvailableLocales]],
321  // requestedLocales, opt, %Collator%.[[RelevantExtensionKeys]],
322  // localeData).
323  Intl::ResolvedLocale r =
324  Intl::ResolveLocale(isolate, JSCollator::GetAvailableLocales(),
325  requested_locales, matcher, relevant_extension_keys);
326 
327  // 18. Set collator.[[Locale]] to r.[[locale]].
328  icu::Locale icu_locale = r.icu_locale;
329  DCHECK(!icu_locale.isBogus());
330 
331  std::map<std::string, std::string> extensions = r.extensions;
332 
333  // 19. Let collation be r.[[co]].
334  //
335  // r.[[co]] is already set as part of the icu::Locale creation as
336  // icu parses unicode extensions and sets the keywords.
337  //
338  // We need to sanitize the keywords based on certain ECMAScript rules.
339  //
340  // As per https://tc39.github.io/ecma402/#sec-intl-collator-internal-slots:
341  // The values "standard" and "search" must not be used as elements
342  // in any [[SortLocaleData]][locale].co and
343  // [[SearchLocaleData]][locale].co list.
344  auto co_extension_it = extensions.find("co");
345  if (co_extension_it != extensions.end()) {
346  const std::string& value = co_extension_it->second;
347  if ((value == "search") || (value == "standard")) {
348  UErrorCode status = U_ZERO_ERROR;
349  const char* key = uloc_toLegacyKey("co");
350  icu_locale.setKeywordValue(key, nullptr, status);
351  CHECK(U_SUCCESS(status));
352  }
353  }
354 
355  // 5. Set collator.[[Usage]] to usage.
356  //
357  // 6. If usage is "sort", then
358  // a. Let localeData be %Collator%.[[SortLocaleData]].
359  // 7. Else,
360  // a. Let localeData be %Collator%.[[SearchLocaleData]].
361  //
362  // The Intl spec doesn't allow us to use "search" as an extension
363  // value for collation as per:
364  // https://tc39.github.io/ecma402/#sec-intl-collator-internal-slots
365  //
366  // But the only way to pass the value "search" for collation from
367  // the options object to ICU is to use the 'co' extension keyword.
368  //
369  // This will need to be filtered out when creating the
370  // resolvedOptions object.
371  if (usage == Usage::SEARCH) {
372  const char* key = uloc_toLegacyKey("co");
373  CHECK_NOT_NULL(key);
374  const char* value = uloc_toLegacyType(key, "search");
375  CHECK_NOT_NULL(value);
376  UErrorCode status = U_ZERO_ERROR;
377  icu_locale.setKeywordValue(key, value, status);
378  CHECK(U_SUCCESS(status));
379  }
380 
381  // 20. If collation is null, let collation be "default".
382  // 21. Set collator.[[Collation]] to collation.
383  //
384  // We don't store the collation value as per the above two steps
385  // here. The collation value can be looked up from icu::Collator on
386  // demand, as part of Intl.Collator.prototype.resolvedOptions.
387 
388  UErrorCode status = U_ZERO_ERROR;
389  std::unique_ptr<icu::Collator> icu_collator(
390  icu::Collator::createInstance(icu_locale, status));
391  if (U_FAILURE(status) || icu_collator.get() == nullptr) {
392  status = U_ZERO_ERROR;
393  // Remove extensions and try again.
394  icu::Locale no_extension_locale(icu_locale.getBaseName());
395  icu_collator.reset(
396  icu::Collator::createInstance(no_extension_locale, status));
397 
398  if (U_FAILURE(status) || icu_collator.get() == nullptr) {
399  FATAL("Failed to create ICU collator, are ICU data files missing?");
400  }
401  }
402  DCHECK(U_SUCCESS(status));
403  CHECK_NOT_NULL(icu_collator.get());
404 
405  // 22. If relevantExtensionKeys contains "kn", then
406  // a. Set collator.[[Numeric]] to ! SameValue(r.[[kn]], "true").
407  //
408  // If the numeric value is passed in through the options object,
409  // then we use it. Otherwise, we check if the numeric value is
410  // passed in through the unicode extensions.
411  status = U_ZERO_ERROR;
412  if (found_numeric.FromJust()) {
413  icu_collator->setAttribute(UCOL_NUMERIC_COLLATION,
414  numeric ? UCOL_ON : UCOL_OFF, status);
415  CHECK(U_SUCCESS(status));
416  } else {
417  auto kn_extension_it = extensions.find("kn");
418  if (kn_extension_it != extensions.end()) {
419  const std::string& value = kn_extension_it->second;
420 
421  numeric = (value == "true");
422 
423  icu_collator->setAttribute(UCOL_NUMERIC_COLLATION,
424  numeric ? UCOL_ON : UCOL_OFF, status);
425  CHECK(U_SUCCESS(status));
426  }
427  }
428 
429  // 23. If relevantExtensionKeys contains "kf", then
430  // a. Set collator.[[CaseFirst]] to r.[[kf]].
431  //
432  // If the caseFirst value is passed in through the options object,
433  // then we use it. Otherwise, we check if the caseFirst value is
434  // passed in through the unicode extensions.
435  if (case_first != Intl::CaseFirst::kUndefined) {
436  SetCaseFirstOption(icu_collator.get(), case_first);
437  } else {
438  auto kf_extension_it = extensions.find("kf");
439  if (kf_extension_it != extensions.end()) {
440  const std::string& value = kf_extension_it->second;
441  SetCaseFirstOption(icu_collator.get(), ToCaseFirst(value.c_str()));
442  }
443  }
444 
445  // Normalization is always on, by the spec. We are free to optimize
446  // if the strings are already normalized (but we don't have a way to tell
447  // that right now).
448  status = U_ZERO_ERROR;
449  icu_collator->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
450  CHECK(U_SUCCESS(status));
451 
452  // 24. Let sensitivity be ? GetOption(options, "sensitivity",
453  // "string", « "base", "accent", "case", "variant" », undefined).
454  Maybe<Sensitivity> maybe_sensitivity = Intl::GetStringOption<Sensitivity>(
455  isolate, options, "sensitivity", "Intl.Collator",
456  {"base", "accent", "case", "variant"},
457  {Sensitivity::kBase, Sensitivity::kAccent, Sensitivity::kCase,
458  Sensitivity::kVariant},
459  Sensitivity::kUndefined);
460  MAYBE_RETURN(maybe_sensitivity, MaybeHandle<JSCollator>());
461  Sensitivity sensitivity = maybe_sensitivity.FromJust();
462 
463  // 25. If sensitivity is undefined, then
464  if (sensitivity == Sensitivity::kUndefined) {
465  // 25. a. If usage is "sort", then
466  if (usage == Usage::SORT) {
467  // 25. a. i. Let sensitivity be "variant".
468  sensitivity = Sensitivity::kVariant;
469  }
470  }
471  // 26. Set collator.[[Sensitivity]] to sensitivity.
472  switch (sensitivity) {
473  case Sensitivity::kBase:
474  icu_collator->setStrength(icu::Collator::PRIMARY);
475  break;
476  case Sensitivity::kAccent:
477  icu_collator->setStrength(icu::Collator::SECONDARY);
478  break;
479  case Sensitivity::kCase:
480  icu_collator->setStrength(icu::Collator::PRIMARY);
481  status = U_ZERO_ERROR;
482  icu_collator->setAttribute(UCOL_CASE_LEVEL, UCOL_ON, status);
483  CHECK(U_SUCCESS(status));
484  break;
485  case Sensitivity::kVariant:
486  icu_collator->setStrength(icu::Collator::TERTIARY);
487  break;
488  case Sensitivity::kUndefined:
489  break;
490  }
491 
492  // 27.Let ignorePunctuation be ? GetOption(options,
493  // "ignorePunctuation", "boolean", undefined, false).
494  bool ignore_punctuation;
495  Maybe<bool> found_ignore_punctuation =
496  Intl::GetBoolOption(isolate, options, "ignorePunctuation",
497  "Intl.Collator", &ignore_punctuation);
498  MAYBE_RETURN(found_ignore_punctuation, MaybeHandle<JSCollator>());
499 
500  // 28. Set collator.[[IgnorePunctuation]] to ignorePunctuation.
501  if (found_ignore_punctuation.FromJust() && ignore_punctuation) {
502  status = U_ZERO_ERROR;
503  icu_collator->setAttribute(UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, status);
504  CHECK(U_SUCCESS(status));
505  }
506 
507  Handle<Managed<icu::Collator>> managed_collator =
508  Managed<icu::Collator>::FromUniquePtr(isolate, 0,
509  std::move(icu_collator));
510  collator->set_icu_collator(*managed_collator);
511 
512  // 29. Return collator.
513  return collator;
514 }
515 
516 std::set<std::string> JSCollator::GetAvailableLocales() {
517  int32_t num_locales = 0;
518  const icu::Locale* icu_available_locales =
519  icu::Collator::getAvailableLocales(num_locales);
520  return Intl::BuildLocaleSet(icu_available_locales, num_locales);
521 }
522 
523 } // namespace internal
524 } // namespace v8
Definition: libplatform.h:13