1 /*
2 *******************************************************************************
3 * Copyright (C) 2013-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationruleparser.cpp
7 *
8 * (replaced the former ucol_tok.cpp)
9 *
10 * created on: 2013apr10
11 * created by: Markus W. Scherer
12 */
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_COLLATION
17
18 #include "unicode/normalizer2.h"
19 #include "unicode/parseerr.h"
20 #include "unicode/uchar.h"
21 #include "unicode/ucol.h"
22 #include "unicode/uloc.h"
23 #include "unicode/unistr.h"
24 #include "unicode/utf16.h"
25 #include "charstr.h"
26 #include "cmemory.h"
27 #include "collation.h"
28 #include "collationdata.h"
29 #include "collationruleparser.h"
30 #include "collationsettings.h"
31 #include "collationtailoring.h"
32 #include "cstring.h"
33 #include "patternprops.h"
34 #include "uassert.h"
35 #include "uvectr32.h"
36
37 U_NAMESPACE_BEGIN
38
39 namespace {
40
41 static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 }; // "[before"
42 const int32_t BEFORE_LENGTH = 7;
43
44 } // namespace
45
~Sink()46 CollationRuleParser::Sink::~Sink() {}
47
48 void
suppressContractions(const UnicodeSet &,const char * &,UErrorCode &)49 CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
50
51 void
optimize(const UnicodeSet &,const char * &,UErrorCode &)52 CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
53
~Importer()54 CollationRuleParser::Importer::~Importer() {}
55
CollationRuleParser(const CollationData * base,UErrorCode & errorCode)56 CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
57 : nfd(*Normalizer2::getNFDInstance(errorCode)),
58 nfc(*Normalizer2::getNFCInstance(errorCode)),
59 rules(NULL), baseData(base), settings(NULL),
60 parseError(NULL), errorReason(NULL),
61 sink(NULL), importer(NULL),
62 ruleIndex(0) {
63 }
64
~CollationRuleParser()65 CollationRuleParser::~CollationRuleParser() {
66 }
67
68 void
parse(const UnicodeString & ruleString,CollationSettings & outSettings,UParseError * outParseError,UErrorCode & errorCode)69 CollationRuleParser::parse(const UnicodeString &ruleString,
70 CollationSettings &outSettings,
71 UParseError *outParseError,
72 UErrorCode &errorCode) {
73 if(U_FAILURE(errorCode)) { return; }
74 settings = &outSettings;
75 parseError = outParseError;
76 if(parseError != NULL) {
77 parseError->line = 0;
78 parseError->offset = -1;
79 parseError->preContext[0] = 0;
80 parseError->postContext[0] = 0;
81 }
82 errorReason = NULL;
83 parse(ruleString, errorCode);
84 }
85
86 void
parse(const UnicodeString & ruleString,UErrorCode & errorCode)87 CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
88 if(U_FAILURE(errorCode)) { return; }
89 rules = &ruleString;
90 ruleIndex = 0;
91
92 while(ruleIndex < rules->length()) {
93 UChar c = rules->charAt(ruleIndex);
94 if(PatternProps::isWhiteSpace(c)) {
95 ++ruleIndex;
96 continue;
97 }
98 switch(c) {
99 case 0x26: // '&'
100 parseRuleChain(errorCode);
101 break;
102 case 0x5b: // '['
103 parseSetting(errorCode);
104 break;
105 case 0x23: // '#' starts a comment, until the end of the line
106 ruleIndex = skipComment(ruleIndex + 1);
107 break;
108 case 0x40: // '@' is equivalent to [backwards 2]
109 settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
110 UCOL_ON, 0, errorCode);
111 ++ruleIndex;
112 break;
113 case 0x21: // '!' used to turn on Thai/Lao character reversal
114 // Accept but ignore. The root collator has contractions
115 // that are equivalent to the character reversal, where appropriate.
116 ++ruleIndex;
117 break;
118 default:
119 setParseError("expected a reset or setting or comment", errorCode);
120 break;
121 }
122 if(U_FAILURE(errorCode)) { return; }
123 }
124 }
125
126 void
parseRuleChain(UErrorCode & errorCode)127 CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
128 int32_t resetStrength = parseResetAndPosition(errorCode);
129 UBool isFirstRelation = TRUE;
130 for(;;) {
131 int32_t result = parseRelationOperator(errorCode);
132 if(U_FAILURE(errorCode)) { return; }
133 if(result < 0) {
134 if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
135 // '#' starts a comment, until the end of the line
136 ruleIndex = skipComment(ruleIndex + 1);
137 continue;
138 }
139 if(isFirstRelation) {
140 setParseError("reset not followed by a relation", errorCode);
141 }
142 return;
143 }
144 int32_t strength = result & STRENGTH_MASK;
145 if(resetStrength < UCOL_IDENTICAL) {
146 // reset-before rule chain
147 if(isFirstRelation) {
148 if(strength != resetStrength) {
149 setParseError("reset-before strength differs from its first relation", errorCode);
150 return;
151 }
152 } else {
153 if(strength < resetStrength) {
154 setParseError("reset-before strength followed by a stronger relation", errorCode);
155 return;
156 }
157 }
158 }
159 int32_t i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the relation operator
160 if((result & STARRED_FLAG) == 0) {
161 parseRelationStrings(strength, i, errorCode);
162 } else {
163 parseStarredCharacters(strength, i, errorCode);
164 }
165 if(U_FAILURE(errorCode)) { return; }
166 isFirstRelation = FALSE;
167 }
168 }
169
170 int32_t
parseResetAndPosition(UErrorCode & errorCode)171 CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
172 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
173 int32_t i = skipWhiteSpace(ruleIndex + 1);
174 int32_t j;
175 UChar c;
176 int32_t resetStrength;
177 if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
178 (j = i + BEFORE_LENGTH) < rules->length() &&
179 PatternProps::isWhiteSpace(rules->charAt(j)) &&
180 ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
181 0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
182 rules->charAt(j + 1) == 0x5d) {
183 // &[before n] with n=1 or 2 or 3
184 resetStrength = UCOL_PRIMARY + (c - 0x31);
185 i = skipWhiteSpace(j + 2);
186 } else {
187 resetStrength = UCOL_IDENTICAL;
188 }
189 if(i >= rules->length()) {
190 setParseError("reset without position", errorCode);
191 return UCOL_DEFAULT;
192 }
193 UnicodeString str;
194 if(rules->charAt(i) == 0x5b) { // '['
195 i = parseSpecialPosition(i, str, errorCode);
196 } else {
197 i = parseTailoringString(i, str, errorCode);
198 }
199 sink->addReset(resetStrength, str, errorReason, errorCode);
200 if(U_FAILURE(errorCode)) { setErrorContext(); }
201 ruleIndex = i;
202 return resetStrength;
203 }
204
205 int32_t
parseRelationOperator(UErrorCode & errorCode)206 CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
207 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
208 ruleIndex = skipWhiteSpace(ruleIndex);
209 if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
210 int32_t strength;
211 int32_t i = ruleIndex;
212 UChar c = rules->charAt(i++);
213 switch(c) {
214 case 0x3c: // '<'
215 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<
216 ++i;
217 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<
218 ++i;
219 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<<
220 ++i;
221 strength = UCOL_QUATERNARY;
222 } else {
223 strength = UCOL_TERTIARY;
224 }
225 } else {
226 strength = UCOL_SECONDARY;
227 }
228 } else {
229 strength = UCOL_PRIMARY;
230 }
231 if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
232 ++i;
233 strength |= STARRED_FLAG;
234 }
235 break;
236 case 0x3b: // ';' same as <<
237 strength = UCOL_SECONDARY;
238 break;
239 case 0x2c: // ',' same as <<<
240 strength = UCOL_TERTIARY;
241 break;
242 case 0x3d: // '='
243 strength = UCOL_IDENTICAL;
244 if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
245 ++i;
246 strength |= STARRED_FLAG;
247 }
248 break;
249 default:
250 return UCOL_DEFAULT;
251 }
252 return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
253 }
254
255 void
parseRelationStrings(int32_t strength,int32_t i,UErrorCode & errorCode)256 CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
257 // Parse
258 // prefix | str / extension
259 // where prefix and extension are optional.
260 UnicodeString prefix, str, extension;
261 i = parseTailoringString(i, str, errorCode);
262 if(U_FAILURE(errorCode)) { return; }
263 UChar next = (i < rules->length()) ? rules->charAt(i) : 0;
264 if(next == 0x7c) { // '|' separates the context prefix from the string.
265 prefix = str;
266 i = parseTailoringString(i + 1, str, errorCode);
267 if(U_FAILURE(errorCode)) { return; }
268 next = (i < rules->length()) ? rules->charAt(i) : 0;
269 }
270 if(next == 0x2f) { // '/' separates the string from the extension.
271 i = parseTailoringString(i + 1, extension, errorCode);
272 }
273 if(!prefix.isEmpty()) {
274 UChar32 prefix0 = prefix.char32At(0);
275 UChar32 c = str.char32At(0);
276 if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
277 setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
278 errorCode);
279 return;
280 }
281 }
282 sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
283 if(U_FAILURE(errorCode)) { setErrorContext(); }
284 ruleIndex = i;
285 }
286
287 void
parseStarredCharacters(int32_t strength,int32_t i,UErrorCode & errorCode)288 CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
289 UnicodeString empty, raw;
290 i = parseString(skipWhiteSpace(i), raw, errorCode);
291 if(U_FAILURE(errorCode)) { return; }
292 if(raw.isEmpty()) {
293 setParseError("missing starred-relation string", errorCode);
294 return;
295 }
296 UChar32 prev = -1;
297 int32_t j = 0;
298 for(;;) {
299 while(j < raw.length()) {
300 UChar32 c = raw.char32At(j);
301 if(!nfd.isInert(c)) {
302 setParseError("starred-relation string is not all NFD-inert", errorCode);
303 return;
304 }
305 sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
306 if(U_FAILURE(errorCode)) {
307 setErrorContext();
308 return;
309 }
310 j += U16_LENGTH(c);
311 prev = c;
312 }
313 if(i >= rules->length() || rules->charAt(i) != 0x2d) { // '-'
314 break;
315 }
316 if(prev < 0) {
317 setParseError("range without start in starred-relation string", errorCode);
318 return;
319 }
320 i = parseString(i + 1, raw, errorCode);
321 if(U_FAILURE(errorCode)) { return; }
322 if(raw.isEmpty()) {
323 setParseError("range without end in starred-relation string", errorCode);
324 return;
325 }
326 UChar32 c = raw.char32At(0);
327 if(c < prev) {
328 setParseError("range start greater than end in starred-relation string", errorCode);
329 return;
330 }
331 // range prev-c
332 UnicodeString s;
333 while(++prev <= c) {
334 if(!nfd.isInert(prev)) {
335 setParseError("starred-relation string range is not all NFD-inert", errorCode);
336 return;
337 }
338 if(U_IS_SURROGATE(prev)) {
339 setParseError("starred-relation string range contains a surrogate", errorCode);
340 return;
341 }
342 if(0xfffd <= prev && prev <= 0xffff) {
343 setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
344 return;
345 }
346 s.setTo(prev);
347 sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
348 if(U_FAILURE(errorCode)) {
349 setErrorContext();
350 return;
351 }
352 }
353 prev = -1;
354 j = U16_LENGTH(c);
355 }
356 ruleIndex = skipWhiteSpace(i);
357 }
358
359 int32_t
parseTailoringString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)360 CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
361 i = parseString(skipWhiteSpace(i), raw, errorCode);
362 if(U_SUCCESS(errorCode) && raw.isEmpty()) {
363 setParseError("missing relation string", errorCode);
364 }
365 return skipWhiteSpace(i);
366 }
367
368 int32_t
parseString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)369 CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
370 if(U_FAILURE(errorCode)) { return i; }
371 raw.remove();
372 while(i < rules->length()) {
373 UChar32 c = rules->charAt(i++);
374 if(isSyntaxChar(c)) {
375 if(c == 0x27) { // apostrophe
376 if(i < rules->length() && rules->charAt(i) == 0x27) {
377 // Double apostrophe, encodes a single one.
378 raw.append((UChar)0x27);
379 ++i;
380 continue;
381 }
382 // Quote literal text until the next single apostrophe.
383 for(;;) {
384 if(i == rules->length()) {
385 setParseError("quoted literal text missing terminating apostrophe", errorCode);
386 return i;
387 }
388 c = rules->charAt(i++);
389 if(c == 0x27) {
390 if(i < rules->length() && rules->charAt(i) == 0x27) {
391 // Double apostrophe inside quoted literal text,
392 // still encodes a single apostrophe.
393 ++i;
394 } else {
395 break;
396 }
397 }
398 raw.append((UChar)c);
399 }
400 } else if(c == 0x5c) { // backslash
401 if(i == rules->length()) {
402 setParseError("backslash escape at the end of the rule string", errorCode);
403 return i;
404 }
405 c = rules->char32At(i);
406 raw.append(c);
407 i += U16_LENGTH(c);
408 } else {
409 // Any other syntax character terminates a string.
410 --i;
411 break;
412 }
413 } else if(PatternProps::isWhiteSpace(c)) {
414 // Unquoted white space terminates a string.
415 --i;
416 break;
417 } else {
418 raw.append((UChar)c);
419 }
420 }
421 for(int32_t j = 0; j < raw.length();) {
422 UChar32 c = raw.char32At(j);
423 if(U_IS_SURROGATE(c)) {
424 setParseError("string contains an unpaired surrogate", errorCode);
425 return i;
426 }
427 if(0xfffd <= c && c <= 0xffff) {
428 setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
429 return i;
430 }
431 j += U16_LENGTH(c);
432 }
433 return i;
434 }
435
namespace {

// Names of the special reset positions, as written between [brackets].
// parseSpecialPosition() encodes a match as POS_BASE + the index into this
// table, so the order of the entries matters.
static const char *const positions[] = {
    "first tertiary ignorable",  "last tertiary ignorable",
    "first secondary ignorable", "last secondary ignorable",
    "first primary ignorable",   "last primary ignorable",
    "first variable",            "last variable",
    "first regular",             "last regular",
    "first implicit",            "last implicit",
    "first trailing",            "last trailing"
};

}  // namespace
456
457 int32_t
parseSpecialPosition(int32_t i,UnicodeString & str,UErrorCode & errorCode)458 CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
459 if(U_FAILURE(errorCode)) { return 0; }
460 UnicodeString raw;
461 int32_t j = readWords(i + 1, raw);
462 if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) { // words end with ]
463 ++j;
464 for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
465 if(raw == UnicodeString(positions[pos], -1, US_INV)) {
466 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos));
467 return j;
468 }
469 }
470 if(raw == UNICODE_STRING_SIMPLE("top")) {
471 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR));
472 return j;
473 }
474 if(raw == UNICODE_STRING_SIMPLE("variable top")) {
475 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE));
476 return j;
477 }
478 }
479 setParseError("not a valid special reset position", errorCode);
480 return i;
481 }
482
483 void
parseSetting(UErrorCode & errorCode)484 CollationRuleParser::parseSetting(UErrorCode &errorCode) {
485 if(U_FAILURE(errorCode)) { return; }
486 UnicodeString raw;
487 int32_t i = ruleIndex + 1;
488 int32_t j = readWords(i, raw);
489 if(j <= i || raw.isEmpty()) {
490 setParseError("expected a setting/option at '['", errorCode);
491 }
492 if(rules->charAt(j) == 0x5d) { // words end with ]
493 ++j;
494 if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
495 (raw.length() == 7 || raw.charAt(7) == 0x20)) {
496 parseReordering(raw, errorCode);
497 ruleIndex = j;
498 return;
499 }
500 if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
501 settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
502 UCOL_ON, 0, errorCode);
503 ruleIndex = j;
504 return;
505 }
506 UnicodeString v;
507 int32_t valueIndex = raw.lastIndexOf((UChar)0x20);
508 if(valueIndex >= 0) {
509 v.setTo(raw, valueIndex + 1);
510 raw.truncate(valueIndex);
511 }
512 if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
513 int32_t value = UCOL_DEFAULT;
514 UChar c = v.charAt(0);
515 if(0x31 <= c && c <= 0x34) { // 1..4
516 value = UCOL_PRIMARY + (c - 0x31);
517 } else if(c == 0x49) { // 'I'
518 value = UCOL_IDENTICAL;
519 }
520 if(value != UCOL_DEFAULT) {
521 settings->setStrength(value, 0, errorCode);
522 ruleIndex = j;
523 return;
524 }
525 } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
526 UColAttributeValue value = UCOL_DEFAULT;
527 if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
528 value = UCOL_NON_IGNORABLE;
529 } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
530 value = UCOL_SHIFTED;
531 }
532 if(value != UCOL_DEFAULT) {
533 settings->setAlternateHandling(value, 0, errorCode);
534 ruleIndex = j;
535 return;
536 }
537 } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
538 int32_t value = UCOL_DEFAULT;
539 if(v == UNICODE_STRING_SIMPLE("space")) {
540 value = CollationSettings::MAX_VAR_SPACE;
541 } else if(v == UNICODE_STRING_SIMPLE("punct")) {
542 value = CollationSettings::MAX_VAR_PUNCT;
543 } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
544 value = CollationSettings::MAX_VAR_SYMBOL;
545 } else if(v == UNICODE_STRING_SIMPLE("currency")) {
546 value = CollationSettings::MAX_VAR_CURRENCY;
547 }
548 if(value != UCOL_DEFAULT) {
549 settings->setMaxVariable(value, 0, errorCode);
550 settings->variableTop = baseData->getLastPrimaryForGroup(
551 UCOL_REORDER_CODE_FIRST + value);
552 U_ASSERT(settings->variableTop != 0);
553 ruleIndex = j;
554 return;
555 }
556 } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
557 UColAttributeValue value = UCOL_DEFAULT;
558 if(v == UNICODE_STRING_SIMPLE("off")) {
559 value = UCOL_OFF;
560 } else if(v == UNICODE_STRING_SIMPLE("lower")) {
561 value = UCOL_LOWER_FIRST;
562 } else if(v == UNICODE_STRING_SIMPLE("upper")) {
563 value = UCOL_UPPER_FIRST;
564 }
565 if(value != UCOL_DEFAULT) {
566 settings->setCaseFirst(value, 0, errorCode);
567 ruleIndex = j;
568 return;
569 }
570 } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
571 UColAttributeValue value = getOnOffValue(v);
572 if(value != UCOL_DEFAULT) {
573 settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
574 ruleIndex = j;
575 return;
576 }
577 } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
578 UColAttributeValue value = getOnOffValue(v);
579 if(value != UCOL_DEFAULT) {
580 settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
581 ruleIndex = j;
582 return;
583 }
584 } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
585 UColAttributeValue value = getOnOffValue(v);
586 if(value != UCOL_DEFAULT) {
587 settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
588 ruleIndex = j;
589 return;
590 }
591 } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
592 UColAttributeValue value = getOnOffValue(v);
593 if(value != UCOL_DEFAULT) {
594 if(value == UCOL_ON) {
595 setParseError("[hiraganaQ on] is not supported", errorCode);
596 }
597 ruleIndex = j;
598 return;
599 }
600 } else if(raw == UNICODE_STRING_SIMPLE("import")) {
601 CharString lang;
602 lang.appendInvariantChars(v, errorCode);
603 if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
604 // BCP 47 language tag -> ICU locale ID
605 char localeID[ULOC_FULLNAME_CAPACITY];
606 int32_t parsedLength;
607 int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,
608 &parsedLength, &errorCode);
609 if(U_FAILURE(errorCode) ||
610 parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) {
611 errorCode = U_ZERO_ERROR;
612 setParseError("expected language tag in [import langTag]", errorCode);
613 return;
614 }
615 // localeID minus all keywords
616 char baseID[ULOC_FULLNAME_CAPACITY];
617 length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
618 if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
619 errorCode = U_ZERO_ERROR;
620 setParseError("expected language tag in [import langTag]", errorCode);
621 return;
622 }
623 if(length == 3 && uprv_memcmp(baseID, "und", 3) == 0) {
624 uprv_strcpy(baseID, "root");
625 }
626 // @collation=type, or length=0 if not specified
627 char collationType[ULOC_KEYWORDS_CAPACITY];
628 length = uloc_getKeywordValue(localeID, "collation",
629 collationType, ULOC_KEYWORDS_CAPACITY,
630 &errorCode);
631 if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
632 errorCode = U_ZERO_ERROR;
633 setParseError("expected language tag in [import langTag]", errorCode);
634 return;
635 }
636 if(importer == NULL) {
637 setParseError("[import langTag] is not supported", errorCode);
638 } else {
639 UnicodeString importedRules;
640 importer->getRules(baseID, length > 0 ? collationType : "standard",
641 importedRules, errorReason, errorCode);
642 if(U_FAILURE(errorCode)) {
643 if(errorReason == NULL) {
644 errorReason = "[import langTag] failed";
645 }
646 setErrorContext();
647 return;
648 }
649 const UnicodeString *outerRules = rules;
650 int32_t outerRuleIndex = ruleIndex;
651 parse(importedRules, errorCode);
652 if(U_FAILURE(errorCode)) {
653 if(parseError != NULL) {
654 parseError->offset = outerRuleIndex;
655 }
656 }
657 rules = outerRules;
658 ruleIndex = j;
659 }
660 return;
661 }
662 } else if(rules->charAt(j) == 0x5b) { // words end with [
663 UnicodeSet set;
664 j = parseUnicodeSet(j, set, errorCode);
665 if(U_FAILURE(errorCode)) { return; }
666 if(raw == UNICODE_STRING_SIMPLE("optimize")) {
667 sink->optimize(set, errorReason, errorCode);
668 if(U_FAILURE(errorCode)) { setErrorContext(); }
669 ruleIndex = j;
670 return;
671 } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
672 sink->suppressContractions(set, errorReason, errorCode);
673 if(U_FAILURE(errorCode)) { setErrorContext(); }
674 ruleIndex = j;
675 return;
676 }
677 }
678 setParseError("not a valid setting/option", errorCode);
679 }
680
681 void
parseReordering(const UnicodeString & raw,UErrorCode & errorCode)682 CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
683 if(U_FAILURE(errorCode)) { return; }
684 int32_t i = 7; // after "reorder"
685 if(i == raw.length()) {
686 // empty [reorder] with no codes
687 settings->resetReordering();
688 return;
689 }
690 // Parse the codes in [reorder aa bb cc].
691 UVector32 reorderCodes(errorCode);
692 if(U_FAILURE(errorCode)) { return; }
693 CharString word;
694 while(i < raw.length()) {
695 ++i; // skip the word-separating space
696 int32_t limit = raw.indexOf((UChar)0x20, i);
697 if(limit < 0) { limit = raw.length(); }
698 word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
699 if(U_FAILURE(errorCode)) { return; }
700 int32_t code = getReorderCode(word.data());
701 if(code < 0) {
702 setParseError("unknown script or reorder code", errorCode);
703 return;
704 }
705 reorderCodes.addElement(code, errorCode);
706 if(U_FAILURE(errorCode)) { return; }
707 i = limit;
708 }
709 settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
710 }
711
// Names of the special (non-script) reorder codes.
// getReorderCode() maps the entry at index i to UCOL_REORDER_CODE_FIRST + i,
// so the order of the entries matters.
static const char *const gSpecialReorderCodes[] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
715
716 int32_t
getReorderCode(const char * word)717 CollationRuleParser::getReorderCode(const char *word) {
718 for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {
719 if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
720 return UCOL_REORDER_CODE_FIRST + i;
721 }
722 }
723 int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
724 if(script >= 0) {
725 return script;
726 }
727 if(uprv_stricmp(word, "others") == 0) {
728 return UCOL_REORDER_CODE_OTHERS; // same as Zzzz = USCRIPT_UNKNOWN
729 }
730 return -1;
731 }
732
733 UColAttributeValue
getOnOffValue(const UnicodeString & s)734 CollationRuleParser::getOnOffValue(const UnicodeString &s) {
735 if(s == UNICODE_STRING_SIMPLE("on")) {
736 return UCOL_ON;
737 } else if(s == UNICODE_STRING_SIMPLE("off")) {
738 return UCOL_OFF;
739 } else {
740 return UCOL_DEFAULT;
741 }
742 }
743
744 int32_t
parseUnicodeSet(int32_t i,UnicodeSet & set,UErrorCode & errorCode)745 CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
746 // Collect a UnicodeSet pattern between a balanced pair of [brackets].
747 int32_t level = 0;
748 int32_t j = i;
749 for(;;) {
750 if(j == rules->length()) {
751 setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
752 return j;
753 }
754 UChar c = rules->charAt(j++);
755 if(c == 0x5b) { // '['
756 ++level;
757 } else if(c == 0x5d) { // ']'
758 if(--level == 0) { break; }
759 }
760 }
761 set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
762 if(U_FAILURE(errorCode)) {
763 errorCode = U_ZERO_ERROR;
764 setParseError("not a valid UnicodeSet pattern", errorCode);
765 return j;
766 }
767 j = skipWhiteSpace(j);
768 if(j == rules->length() || rules->charAt(j) != 0x5d) {
769 setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
770 return j;
771 }
772 return ++j;
773 }
774
775 int32_t
readWords(int32_t i,UnicodeString & raw) const776 CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
777 static const UChar sp = 0x20;
778 raw.remove();
779 i = skipWhiteSpace(i);
780 for(;;) {
781 if(i >= rules->length()) { return 0; }
782 UChar c = rules->charAt(i);
783 if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) { // syntax except -_
784 if(raw.isEmpty()) { return i; }
785 if(raw.endsWith(&sp, 1)) { // remove trailing space
786 raw.truncate(raw.length() - 1);
787 }
788 return i;
789 }
790 if(PatternProps::isWhiteSpace(c)) {
791 raw.append(0x20);
792 i = skipWhiteSpace(i + 1);
793 } else {
794 raw.append(c);
795 ++i;
796 }
797 }
798 }
799
800 int32_t
skipComment(int32_t i) const801 CollationRuleParser::skipComment(int32_t i) const {
802 // skip to past the newline
803 while(i < rules->length()) {
804 UChar c = rules->charAt(i++);
805 // LF or FF or CR or NEL or LS or PS
806 if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
807 // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
808 // NLF (new line function) = CR or LF or CR+LF or NEL.
809 // No need to collect all of CR+LF because a following LF will be ignored anyway.
810 break;
811 }
812 }
813 return i;
814 }
815
816 void
setParseError(const char * reason,UErrorCode & errorCode)817 CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
818 if(U_FAILURE(errorCode)) { return; }
819 // Error code consistent with the old parser (from ca. 2001),
820 // rather than U_PARSE_ERROR;
821 errorCode = U_INVALID_FORMAT_ERROR;
822 errorReason = reason;
823 if(parseError != NULL) { setErrorContext(); }
824 }
825
826 void
setErrorContext()827 CollationRuleParser::setErrorContext() {
828 if(parseError == NULL) { return; }
829
830 // Note: This relies on the calling code maintaining the ruleIndex
831 // at a position that is useful for debugging.
832 // For example, at the beginning of a reset or relation etc.
833 parseError->offset = ruleIndex;
834 parseError->line = 0; // We are not counting line numbers.
835
836 // before ruleIndex
837 int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
838 if(start < 0) {
839 start = 0;
840 } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
841 ++start;
842 }
843 int32_t length = ruleIndex - start;
844 rules->extract(start, length, parseError->preContext);
845 parseError->preContext[length] = 0;
846
847 // starting from ruleIndex
848 length = rules->length() - ruleIndex;
849 if(length >= U_PARSE_CONTEXT_LEN) {
850 length = U_PARSE_CONTEXT_LEN - 1;
851 if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
852 --length;
853 }
854 }
855 rules->extract(ruleIndex, length, parseError->postContext);
856 parseError->postContext[length] = 0;
857 }
858
859 UBool
isSyntaxChar(UChar32 c)860 CollationRuleParser::isSyntaxChar(UChar32 c) {
861 return 0x21 <= c && c <= 0x7e &&
862 (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
863 (0x5b <= c && c <= 0x60) || (0x7b <= c));
864 }
865
866 int32_t
skipWhiteSpace(int32_t i) const867 CollationRuleParser::skipWhiteSpace(int32_t i) const {
868 while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
869 ++i;
870 }
871 return i;
872 }
873
874 U_NAMESPACE_END
875
876 #endif // !UCONFIG_NO_COLLATION
877