1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2013-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * collationruleparser.cpp
9 *
10 * (replaced the former ucol_tok.cpp)
11 *
12 * created on: 2013apr10
13 * created by: Markus W. Scherer
14 */
15
16 #include "unicode/utypes.h"
17
18 #if !UCONFIG_NO_COLLATION
19
20 #include "unicode/normalizer2.h"
21 #include "unicode/parseerr.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ucol.h"
24 #include "unicode/uloc.h"
25 #include "unicode/unistr.h"
26 #include "unicode/utf16.h"
27 #include "charstr.h"
28 #include "cmemory.h"
29 #include "collation.h"
30 #include "collationdata.h"
31 #include "collationruleparser.h"
32 #include "collationsettings.h"
33 #include "collationtailoring.h"
34 #include "cstring.h"
35 #include "patternprops.h"
36 #include "uassert.h"
37 #include "uvectr32.h"
38
39 U_NAMESPACE_BEGIN
40
41 namespace {
42
43 static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 }; // "[before"
44 const int32_t BEFORE_LENGTH = 7;
45
46 } // namespace
47
~Sink()48 CollationRuleParser::Sink::~Sink() {}
49
50 void
suppressContractions(const UnicodeSet &,const char * &,UErrorCode &)51 CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
52
53 void
optimize(const UnicodeSet &,const char * &,UErrorCode &)54 CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
55
~Importer()56 CollationRuleParser::Importer::~Importer() {}
57
CollationRuleParser(const CollationData * base,UErrorCode & errorCode)58 CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
59 : nfd(*Normalizer2::getNFDInstance(errorCode)),
60 nfc(*Normalizer2::getNFCInstance(errorCode)),
61 rules(NULL), baseData(base), settings(NULL),
62 parseError(NULL), errorReason(NULL),
63 sink(NULL), importer(NULL),
64 ruleIndex(0) {
65 }
66
~CollationRuleParser()67 CollationRuleParser::~CollationRuleParser() {
68 }
69
70 void
parse(const UnicodeString & ruleString,CollationSettings & outSettings,UParseError * outParseError,UErrorCode & errorCode)71 CollationRuleParser::parse(const UnicodeString &ruleString,
72 CollationSettings &outSettings,
73 UParseError *outParseError,
74 UErrorCode &errorCode) {
75 if(U_FAILURE(errorCode)) { return; }
76 settings = &outSettings;
77 parseError = outParseError;
78 if(parseError != NULL) {
79 parseError->line = 0;
80 parseError->offset = -1;
81 parseError->preContext[0] = 0;
82 parseError->postContext[0] = 0;
83 }
84 errorReason = NULL;
85 parse(ruleString, errorCode);
86 }
87
88 void
parse(const UnicodeString & ruleString,UErrorCode & errorCode)89 CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
90 if(U_FAILURE(errorCode)) { return; }
91 rules = &ruleString;
92 ruleIndex = 0;
93
94 while(ruleIndex < rules->length()) {
95 UChar c = rules->charAt(ruleIndex);
96 if(PatternProps::isWhiteSpace(c)) {
97 ++ruleIndex;
98 continue;
99 }
100 switch(c) {
101 case 0x26: // '&'
102 parseRuleChain(errorCode);
103 break;
104 case 0x5b: // '['
105 parseSetting(errorCode);
106 break;
107 case 0x23: // '#' starts a comment, until the end of the line
108 ruleIndex = skipComment(ruleIndex + 1);
109 break;
110 case 0x40: // '@' is equivalent to [backwards 2]
111 settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
112 UCOL_ON, 0, errorCode);
113 ++ruleIndex;
114 break;
115 case 0x21: // '!' used to turn on Thai/Lao character reversal
116 // Accept but ignore. The root collator has contractions
117 // that are equivalent to the character reversal, where appropriate.
118 ++ruleIndex;
119 break;
120 default:
121 setParseError("expected a reset or setting or comment", errorCode);
122 break;
123 }
124 if(U_FAILURE(errorCode)) { return; }
125 }
126 }
127
128 void
parseRuleChain(UErrorCode & errorCode)129 CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
130 int32_t resetStrength = parseResetAndPosition(errorCode);
131 UBool isFirstRelation = TRUE;
132 for(;;) {
133 int32_t result = parseRelationOperator(errorCode);
134 if(U_FAILURE(errorCode)) { return; }
135 if(result < 0) {
136 if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
137 // '#' starts a comment, until the end of the line
138 ruleIndex = skipComment(ruleIndex + 1);
139 continue;
140 }
141 if(isFirstRelation) {
142 setParseError("reset not followed by a relation", errorCode);
143 }
144 return;
145 }
146 int32_t strength = result & STRENGTH_MASK;
147 if(resetStrength < UCOL_IDENTICAL) {
148 // reset-before rule chain
149 if(isFirstRelation) {
150 if(strength != resetStrength) {
151 setParseError("reset-before strength differs from its first relation", errorCode);
152 return;
153 }
154 } else {
155 if(strength < resetStrength) {
156 setParseError("reset-before strength followed by a stronger relation", errorCode);
157 return;
158 }
159 }
160 }
161 int32_t i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the relation operator
162 if((result & STARRED_FLAG) == 0) {
163 parseRelationStrings(strength, i, errorCode);
164 } else {
165 parseStarredCharacters(strength, i, errorCode);
166 }
167 if(U_FAILURE(errorCode)) { return; }
168 isFirstRelation = FALSE;
169 }
170 }
171
172 int32_t
parseResetAndPosition(UErrorCode & errorCode)173 CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
174 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
175 int32_t i = skipWhiteSpace(ruleIndex + 1);
176 int32_t j;
177 UChar c;
178 int32_t resetStrength;
179 if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
180 (j = i + BEFORE_LENGTH) < rules->length() &&
181 PatternProps::isWhiteSpace(rules->charAt(j)) &&
182 ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
183 0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
184 rules->charAt(j + 1) == 0x5d) {
185 // &[before n] with n=1 or 2 or 3
186 resetStrength = UCOL_PRIMARY + (c - 0x31);
187 i = skipWhiteSpace(j + 2);
188 } else {
189 resetStrength = UCOL_IDENTICAL;
190 }
191 if(i >= rules->length()) {
192 setParseError("reset without position", errorCode);
193 return UCOL_DEFAULT;
194 }
195 UnicodeString str;
196 if(rules->charAt(i) == 0x5b) { // '['
197 i = parseSpecialPosition(i, str, errorCode);
198 } else {
199 i = parseTailoringString(i, str, errorCode);
200 }
201 sink->addReset(resetStrength, str, errorReason, errorCode);
202 if(U_FAILURE(errorCode)) { setErrorContext(); }
203 ruleIndex = i;
204 return resetStrength;
205 }
206
207 int32_t
parseRelationOperator(UErrorCode & errorCode)208 CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
209 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
210 ruleIndex = skipWhiteSpace(ruleIndex);
211 if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
212 int32_t strength;
213 int32_t i = ruleIndex;
214 UChar c = rules->charAt(i++);
215 switch(c) {
216 case 0x3c: // '<'
217 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<
218 ++i;
219 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<
220 ++i;
221 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<<
222 ++i;
223 strength = UCOL_QUATERNARY;
224 } else {
225 strength = UCOL_TERTIARY;
226 }
227 } else {
228 strength = UCOL_SECONDARY;
229 }
230 } else {
231 strength = UCOL_PRIMARY;
232 }
233 if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
234 ++i;
235 strength |= STARRED_FLAG;
236 }
237 break;
238 case 0x3b: // ';' same as <<
239 strength = UCOL_SECONDARY;
240 break;
241 case 0x2c: // ',' same as <<<
242 strength = UCOL_TERTIARY;
243 break;
244 case 0x3d: // '='
245 strength = UCOL_IDENTICAL;
246 if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
247 ++i;
248 strength |= STARRED_FLAG;
249 }
250 break;
251 default:
252 return UCOL_DEFAULT;
253 }
254 return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
255 }
256
257 void
parseRelationStrings(int32_t strength,int32_t i,UErrorCode & errorCode)258 CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
259 // Parse
260 // prefix | str / extension
261 // where prefix and extension are optional.
262 UnicodeString prefix, str, extension;
263 i = parseTailoringString(i, str, errorCode);
264 if(U_FAILURE(errorCode)) { return; }
265 UChar next = (i < rules->length()) ? rules->charAt(i) : 0;
266 if(next == 0x7c) { // '|' separates the context prefix from the string.
267 prefix = str;
268 i = parseTailoringString(i + 1, str, errorCode);
269 if(U_FAILURE(errorCode)) { return; }
270 next = (i < rules->length()) ? rules->charAt(i) : 0;
271 }
272 if(next == 0x2f) { // '/' separates the string from the extension.
273 i = parseTailoringString(i + 1, extension, errorCode);
274 }
275 if(!prefix.isEmpty()) {
276 UChar32 prefix0 = prefix.char32At(0);
277 UChar32 c = str.char32At(0);
278 if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
279 setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
280 errorCode);
281 return;
282 }
283 }
284 sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
285 if(U_FAILURE(errorCode)) { setErrorContext(); }
286 ruleIndex = i;
287 }
288
289 void
parseStarredCharacters(int32_t strength,int32_t i,UErrorCode & errorCode)290 CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
291 UnicodeString empty, raw;
292 i = parseString(skipWhiteSpace(i), raw, errorCode);
293 if(U_FAILURE(errorCode)) { return; }
294 if(raw.isEmpty()) {
295 setParseError("missing starred-relation string", errorCode);
296 return;
297 }
298 UChar32 prev = -1;
299 int32_t j = 0;
300 for(;;) {
301 while(j < raw.length()) {
302 UChar32 c = raw.char32At(j);
303 if(!nfd.isInert(c)) {
304 setParseError("starred-relation string is not all NFD-inert", errorCode);
305 return;
306 }
307 sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
308 if(U_FAILURE(errorCode)) {
309 setErrorContext();
310 return;
311 }
312 j += U16_LENGTH(c);
313 prev = c;
314 }
315 if(i >= rules->length() || rules->charAt(i) != 0x2d) { // '-'
316 break;
317 }
318 if(prev < 0) {
319 setParseError("range without start in starred-relation string", errorCode);
320 return;
321 }
322 i = parseString(i + 1, raw, errorCode);
323 if(U_FAILURE(errorCode)) { return; }
324 if(raw.isEmpty()) {
325 setParseError("range without end in starred-relation string", errorCode);
326 return;
327 }
328 UChar32 c = raw.char32At(0);
329 if(c < prev) {
330 setParseError("range start greater than end in starred-relation string", errorCode);
331 return;
332 }
333 // range prev-c
334 UnicodeString s;
335 while(++prev <= c) {
336 if(!nfd.isInert(prev)) {
337 setParseError("starred-relation string range is not all NFD-inert", errorCode);
338 return;
339 }
340 if(U_IS_SURROGATE(prev)) {
341 setParseError("starred-relation string range contains a surrogate", errorCode);
342 return;
343 }
344 if(0xfffd <= prev && prev <= 0xffff) {
345 setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
346 return;
347 }
348 s.setTo(prev);
349 sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
350 if(U_FAILURE(errorCode)) {
351 setErrorContext();
352 return;
353 }
354 }
355 prev = -1;
356 j = U16_LENGTH(c);
357 }
358 ruleIndex = skipWhiteSpace(i);
359 }
360
361 int32_t
parseTailoringString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)362 CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
363 i = parseString(skipWhiteSpace(i), raw, errorCode);
364 if(U_SUCCESS(errorCode) && raw.isEmpty()) {
365 setParseError("missing relation string", errorCode);
366 }
367 return skipWhiteSpace(i);
368 }
369
370 int32_t
parseString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)371 CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
372 if(U_FAILURE(errorCode)) { return i; }
373 raw.remove();
374 while(i < rules->length()) {
375 UChar32 c = rules->charAt(i++);
376 if(isSyntaxChar(c)) {
377 if(c == 0x27) { // apostrophe
378 if(i < rules->length() && rules->charAt(i) == 0x27) {
379 // Double apostrophe, encodes a single one.
380 raw.append((UChar)0x27);
381 ++i;
382 continue;
383 }
384 // Quote literal text until the next single apostrophe.
385 for(;;) {
386 if(i == rules->length()) {
387 setParseError("quoted literal text missing terminating apostrophe", errorCode);
388 return i;
389 }
390 c = rules->charAt(i++);
391 if(c == 0x27) {
392 if(i < rules->length() && rules->charAt(i) == 0x27) {
393 // Double apostrophe inside quoted literal text,
394 // still encodes a single apostrophe.
395 ++i;
396 } else {
397 break;
398 }
399 }
400 raw.append((UChar)c);
401 }
402 } else if(c == 0x5c) { // backslash
403 if(i == rules->length()) {
404 setParseError("backslash escape at the end of the rule string", errorCode);
405 return i;
406 }
407 c = rules->char32At(i);
408 raw.append(c);
409 i += U16_LENGTH(c);
410 } else {
411 // Any other syntax character terminates a string.
412 --i;
413 break;
414 }
415 } else if(PatternProps::isWhiteSpace(c)) {
416 // Unquoted white space terminates a string.
417 --i;
418 break;
419 } else {
420 raw.append((UChar)c);
421 }
422 }
423 for(int32_t j = 0; j < raw.length();) {
424 UChar32 c = raw.char32At(j);
425 if(U_IS_SURROGATE(c)) {
426 setParseError("string contains an unpaired surrogate", errorCode);
427 return i;
428 }
429 if(0xfffd <= c && c <= 0xffff) {
430 setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
431 return i;
432 }
433 j += U16_LENGTH(c);
434 }
435 return i;
436 }
437
namespace {

// Names of the special reset positions accepted inside [brackets] after '&'.
// The array index of a name is added to POS_BASE by parseSpecialPosition(),
// so the order of the entries must not change.
const char *const positions[] = {
    "first tertiary ignorable", "last tertiary ignorable",
    "first secondary ignorable", "last secondary ignorable",
    "first primary ignorable", "last primary ignorable",
    "first variable", "last variable",
    "first regular", "last regular",
    "first implicit", "last implicit",
    "first trailing", "last trailing"
};

}  // namespace
458
459 int32_t
parseSpecialPosition(int32_t i,UnicodeString & str,UErrorCode & errorCode)460 CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
461 if(U_FAILURE(errorCode)) { return 0; }
462 UnicodeString raw;
463 int32_t j = readWords(i + 1, raw);
464 if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) { // words end with ]
465 ++j;
466 for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
467 if(raw == UnicodeString(positions[pos], -1, US_INV)) {
468 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos));
469 return j;
470 }
471 }
472 if(raw == UNICODE_STRING_SIMPLE("top")) {
473 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR));
474 return j;
475 }
476 if(raw == UNICODE_STRING_SIMPLE("variable top")) {
477 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE));
478 return j;
479 }
480 }
481 setParseError("not a valid special reset position", errorCode);
482 return i;
483 }
484
485 void
parseSetting(UErrorCode & errorCode)486 CollationRuleParser::parseSetting(UErrorCode &errorCode) {
487 if(U_FAILURE(errorCode)) { return; }
488 UnicodeString raw;
489 int32_t i = ruleIndex + 1;
490 int32_t j = readWords(i, raw);
491 if(j <= i || raw.isEmpty()) {
492 setParseError("expected a setting/option at '['", errorCode);
493 }
494 if(rules->charAt(j) == 0x5d) { // words end with ]
495 ++j;
496 if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
497 (raw.length() == 7 || raw.charAt(7) == 0x20)) {
498 parseReordering(raw, errorCode);
499 ruleIndex = j;
500 return;
501 }
502 if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
503 settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
504 UCOL_ON, 0, errorCode);
505 ruleIndex = j;
506 return;
507 }
508 UnicodeString v;
509 int32_t valueIndex = raw.lastIndexOf((UChar)0x20);
510 if(valueIndex >= 0) {
511 v.setTo(raw, valueIndex + 1);
512 raw.truncate(valueIndex);
513 }
514 if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
515 int32_t value = UCOL_DEFAULT;
516 UChar c = v.charAt(0);
517 if(0x31 <= c && c <= 0x34) { // 1..4
518 value = UCOL_PRIMARY + (c - 0x31);
519 } else if(c == 0x49) { // 'I'
520 value = UCOL_IDENTICAL;
521 }
522 if(value != UCOL_DEFAULT) {
523 settings->setStrength(value, 0, errorCode);
524 ruleIndex = j;
525 return;
526 }
527 } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
528 UColAttributeValue value = UCOL_DEFAULT;
529 if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
530 value = UCOL_NON_IGNORABLE;
531 } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
532 value = UCOL_SHIFTED;
533 }
534 if(value != UCOL_DEFAULT) {
535 settings->setAlternateHandling(value, 0, errorCode);
536 ruleIndex = j;
537 return;
538 }
539 } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
540 int32_t value = UCOL_DEFAULT;
541 if(v == UNICODE_STRING_SIMPLE("space")) {
542 value = CollationSettings::MAX_VAR_SPACE;
543 } else if(v == UNICODE_STRING_SIMPLE("punct")) {
544 value = CollationSettings::MAX_VAR_PUNCT;
545 } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
546 value = CollationSettings::MAX_VAR_SYMBOL;
547 } else if(v == UNICODE_STRING_SIMPLE("currency")) {
548 value = CollationSettings::MAX_VAR_CURRENCY;
549 }
550 if(value != UCOL_DEFAULT) {
551 settings->setMaxVariable(value, 0, errorCode);
552 settings->variableTop = baseData->getLastPrimaryForGroup(
553 UCOL_REORDER_CODE_FIRST + value);
554 U_ASSERT(settings->variableTop != 0);
555 ruleIndex = j;
556 return;
557 }
558 } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
559 UColAttributeValue value = UCOL_DEFAULT;
560 if(v == UNICODE_STRING_SIMPLE("off")) {
561 value = UCOL_OFF;
562 } else if(v == UNICODE_STRING_SIMPLE("lower")) {
563 value = UCOL_LOWER_FIRST;
564 } else if(v == UNICODE_STRING_SIMPLE("upper")) {
565 value = UCOL_UPPER_FIRST;
566 }
567 if(value != UCOL_DEFAULT) {
568 settings->setCaseFirst(value, 0, errorCode);
569 ruleIndex = j;
570 return;
571 }
572 } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
573 UColAttributeValue value = getOnOffValue(v);
574 if(value != UCOL_DEFAULT) {
575 settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
576 ruleIndex = j;
577 return;
578 }
579 } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
580 UColAttributeValue value = getOnOffValue(v);
581 if(value != UCOL_DEFAULT) {
582 settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
583 ruleIndex = j;
584 return;
585 }
586 } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
587 UColAttributeValue value = getOnOffValue(v);
588 if(value != UCOL_DEFAULT) {
589 settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
590 ruleIndex = j;
591 return;
592 }
593 } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
594 UColAttributeValue value = getOnOffValue(v);
595 if(value != UCOL_DEFAULT) {
596 if(value == UCOL_ON) {
597 setParseError("[hiraganaQ on] is not supported", errorCode);
598 }
599 ruleIndex = j;
600 return;
601 }
602 } else if(raw == UNICODE_STRING_SIMPLE("import")) {
603 CharString lang;
604 lang.appendInvariantChars(v, errorCode);
605 if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
606 // BCP 47 language tag -> ICU locale ID
607 char localeID[ULOC_FULLNAME_CAPACITY];
608 int32_t parsedLength;
609 int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,
610 &parsedLength, &errorCode);
611 if(U_FAILURE(errorCode) ||
612 parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) {
613 errorCode = U_ZERO_ERROR;
614 setParseError("expected language tag in [import langTag]", errorCode);
615 return;
616 }
617 // localeID minus all keywords
618 char baseID[ULOC_FULLNAME_CAPACITY];
619 length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
620 if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
621 errorCode = U_ZERO_ERROR;
622 setParseError("expected language tag in [import langTag]", errorCode);
623 return;
624 }
625 if(length == 3 && uprv_memcmp(baseID, "und", 3) == 0) {
626 uprv_strcpy(baseID, "root");
627 }
628 // @collation=type, or length=0 if not specified
629 char collationType[ULOC_KEYWORDS_CAPACITY];
630 length = uloc_getKeywordValue(localeID, "collation",
631 collationType, ULOC_KEYWORDS_CAPACITY,
632 &errorCode);
633 if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
634 errorCode = U_ZERO_ERROR;
635 setParseError("expected language tag in [import langTag]", errorCode);
636 return;
637 }
638 if(importer == NULL) {
639 setParseError("[import langTag] is not supported", errorCode);
640 } else {
641 UnicodeString importedRules;
642 importer->getRules(baseID, length > 0 ? collationType : "standard",
643 importedRules, errorReason, errorCode);
644 if(U_FAILURE(errorCode)) {
645 if(errorReason == NULL) {
646 errorReason = "[import langTag] failed";
647 }
648 setErrorContext();
649 return;
650 }
651 const UnicodeString *outerRules = rules;
652 int32_t outerRuleIndex = ruleIndex;
653 parse(importedRules, errorCode);
654 if(U_FAILURE(errorCode)) {
655 if(parseError != NULL) {
656 parseError->offset = outerRuleIndex;
657 }
658 }
659 rules = outerRules;
660 ruleIndex = j;
661 }
662 return;
663 }
664 } else if(rules->charAt(j) == 0x5b) { // words end with [
665 UnicodeSet set;
666 j = parseUnicodeSet(j, set, errorCode);
667 if(U_FAILURE(errorCode)) { return; }
668 if(raw == UNICODE_STRING_SIMPLE("optimize")) {
669 sink->optimize(set, errorReason, errorCode);
670 if(U_FAILURE(errorCode)) { setErrorContext(); }
671 ruleIndex = j;
672 return;
673 } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
674 sink->suppressContractions(set, errorReason, errorCode);
675 if(U_FAILURE(errorCode)) { setErrorContext(); }
676 ruleIndex = j;
677 return;
678 }
679 }
680 setParseError("not a valid setting/option", errorCode);
681 }
682
683 void
parseReordering(const UnicodeString & raw,UErrorCode & errorCode)684 CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
685 if(U_FAILURE(errorCode)) { return; }
686 int32_t i = 7; // after "reorder"
687 if(i == raw.length()) {
688 // empty [reorder] with no codes
689 settings->resetReordering();
690 return;
691 }
692 // Parse the codes in [reorder aa bb cc].
693 UVector32 reorderCodes(errorCode);
694 if(U_FAILURE(errorCode)) { return; }
695 CharString word;
696 while(i < raw.length()) {
697 ++i; // skip the word-separating space
698 int32_t limit = raw.indexOf((UChar)0x20, i);
699 if(limit < 0) { limit = raw.length(); }
700 word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
701 if(U_FAILURE(errorCode)) { return; }
702 int32_t code = getReorderCode(word.data());
703 if(code < 0) {
704 setParseError("unknown script or reorder code", errorCode);
705 return;
706 }
707 reorderCodes.addElement(code, errorCode);
708 if(U_FAILURE(errorCode)) { return; }
709 i = limit;
710 }
711 settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
712 }
713
// Names of the special (non-script) reorder groups.
// getReorderCode() returns UCOL_REORDER_CODE_FIRST + index for a match,
// so the order of the entries must not change.
static const char *const gSpecialReorderCodes[] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
717
718 int32_t
getReorderCode(const char * word)719 CollationRuleParser::getReorderCode(const char *word) {
720 for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {
721 if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
722 return UCOL_REORDER_CODE_FIRST + i;
723 }
724 }
725 int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
726 if(script >= 0) {
727 return script;
728 }
729 if(uprv_stricmp(word, "others") == 0) {
730 return UCOL_REORDER_CODE_OTHERS; // same as Zzzz = USCRIPT_UNKNOWN
731 }
732 return -1;
733 }
734
735 UColAttributeValue
getOnOffValue(const UnicodeString & s)736 CollationRuleParser::getOnOffValue(const UnicodeString &s) {
737 if(s == UNICODE_STRING_SIMPLE("on")) {
738 return UCOL_ON;
739 } else if(s == UNICODE_STRING_SIMPLE("off")) {
740 return UCOL_OFF;
741 } else {
742 return UCOL_DEFAULT;
743 }
744 }
745
746 int32_t
parseUnicodeSet(int32_t i,UnicodeSet & set,UErrorCode & errorCode)747 CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
748 // Collect a UnicodeSet pattern between a balanced pair of [brackets].
749 int32_t level = 0;
750 int32_t j = i;
751 for(;;) {
752 if(j == rules->length()) {
753 setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
754 return j;
755 }
756 UChar c = rules->charAt(j++);
757 if(c == 0x5b) { // '['
758 ++level;
759 } else if(c == 0x5d) { // ']'
760 if(--level == 0) { break; }
761 }
762 }
763 set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
764 if(U_FAILURE(errorCode)) {
765 errorCode = U_ZERO_ERROR;
766 setParseError("not a valid UnicodeSet pattern", errorCode);
767 return j;
768 }
769 j = skipWhiteSpace(j);
770 if(j == rules->length() || rules->charAt(j) != 0x5d) {
771 setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
772 return j;
773 }
774 return ++j;
775 }
776
777 int32_t
readWords(int32_t i,UnicodeString & raw) const778 CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
779 static const UChar sp = 0x20;
780 raw.remove();
781 i = skipWhiteSpace(i);
782 for(;;) {
783 if(i >= rules->length()) { return 0; }
784 UChar c = rules->charAt(i);
785 if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) { // syntax except -_
786 if(raw.isEmpty()) { return i; }
787 if(raw.endsWith(&sp, 1)) { // remove trailing space
788 raw.truncate(raw.length() - 1);
789 }
790 return i;
791 }
792 if(PatternProps::isWhiteSpace(c)) {
793 raw.append(sp);
794 i = skipWhiteSpace(i + 1);
795 } else {
796 raw.append(c);
797 ++i;
798 }
799 }
800 }
801
802 int32_t
skipComment(int32_t i) const803 CollationRuleParser::skipComment(int32_t i) const {
804 // skip to past the newline
805 while(i < rules->length()) {
806 UChar c = rules->charAt(i++);
807 // LF or FF or CR or NEL or LS or PS
808 if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
809 // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
810 // NLF (new line function) = CR or LF or CR+LF or NEL.
811 // No need to collect all of CR+LF because a following LF will be ignored anyway.
812 break;
813 }
814 }
815 return i;
816 }
817
818 void
setParseError(const char * reason,UErrorCode & errorCode)819 CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
820 if(U_FAILURE(errorCode)) { return; }
821 // Error code consistent with the old parser (from ca. 2001),
822 // rather than U_PARSE_ERROR;
823 errorCode = U_INVALID_FORMAT_ERROR;
824 errorReason = reason;
825 if(parseError != NULL) { setErrorContext(); }
826 }
827
828 void
setErrorContext()829 CollationRuleParser::setErrorContext() {
830 if(parseError == NULL) { return; }
831
832 // Note: This relies on the calling code maintaining the ruleIndex
833 // at a position that is useful for debugging.
834 // For example, at the beginning of a reset or relation etc.
835 parseError->offset = ruleIndex;
836 parseError->line = 0; // We are not counting line numbers.
837
838 // before ruleIndex
839 int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
840 if(start < 0) {
841 start = 0;
842 } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
843 ++start;
844 }
845 int32_t length = ruleIndex - start;
846 rules->extract(start, length, parseError->preContext);
847 parseError->preContext[length] = 0;
848
849 // starting from ruleIndex
850 length = rules->length() - ruleIndex;
851 if(length >= U_PARSE_CONTEXT_LEN) {
852 length = U_PARSE_CONTEXT_LEN - 1;
853 if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
854 --length;
855 }
856 }
857 rules->extract(ruleIndex, length, parseError->postContext);
858 parseError->postContext[length] = 0;
859 }
860
861 UBool
isSyntaxChar(UChar32 c)862 CollationRuleParser::isSyntaxChar(UChar32 c) {
863 return 0x21 <= c && c <= 0x7e &&
864 (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
865 (0x5b <= c && c <= 0x60) || (0x7b <= c));
866 }
867
868 int32_t
skipWhiteSpace(int32_t i) const869 CollationRuleParser::skipWhiteSpace(int32_t i) const {
870 while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
871 ++i;
872 }
873 return i;
874 }
875
876 U_NAMESPACE_END
877
878 #endif // !UCONFIG_NO_COLLATION
879