1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2013-2015, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * collationruleparser.cpp
9 *
10 * (replaced the former ucol_tok.cpp)
11 *
12 * created on: 2013apr10
13 * created by: Markus W. Scherer
14 */
15 
16 #include "unicode/utypes.h"
17 
18 #if !UCONFIG_NO_COLLATION
19 
20 #include "unicode/normalizer2.h"
21 #include "unicode/parseerr.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ucol.h"
24 #include "unicode/uloc.h"
25 #include "unicode/unistr.h"
26 #include "unicode/utf16.h"
27 #include "charstr.h"
28 #include "cmemory.h"
29 #include "collation.h"
30 #include "collationdata.h"
31 #include "collationruleparser.h"
32 #include "collationsettings.h"
33 #include "collationtailoring.h"
34 #include "cstring.h"
35 #include "patternprops.h"
36 #include "uassert.h"
37 #include "uvectr32.h"
38 
39 U_NAMESPACE_BEGIN
40 
41 namespace {
42 
43 static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 };  // "[before"
44 const int32_t BEFORE_LENGTH = 7;
45 
46 }  // namespace
47 
~Sink()48 CollationRuleParser::Sink::~Sink() {}
49 
50 void
suppressContractions(const UnicodeSet &,const char * &,UErrorCode &)51 CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
52 
53 void
optimize(const UnicodeSet &,const char * &,UErrorCode &)54 CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
55 
~Importer()56 CollationRuleParser::Importer::~Importer() {}
57 
CollationRuleParser(const CollationData * base,UErrorCode & errorCode)58 CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
59         : nfd(*Normalizer2::getNFDInstance(errorCode)),
60           nfc(*Normalizer2::getNFCInstance(errorCode)),
61           rules(NULL), baseData(base), settings(NULL),
62           parseError(NULL), errorReason(NULL),
63           sink(NULL), importer(NULL),
64           ruleIndex(0) {
65 }
66 
~CollationRuleParser()67 CollationRuleParser::~CollationRuleParser() {
68 }
69 
70 void
parse(const UnicodeString & ruleString,CollationSettings & outSettings,UParseError * outParseError,UErrorCode & errorCode)71 CollationRuleParser::parse(const UnicodeString &ruleString,
72                            CollationSettings &outSettings,
73                            UParseError *outParseError,
74                            UErrorCode &errorCode) {
75     if(U_FAILURE(errorCode)) { return; }
76     settings = &outSettings;
77     parseError = outParseError;
78     if(parseError != NULL) {
79         parseError->line = 0;
80         parseError->offset = -1;
81         parseError->preContext[0] = 0;
82         parseError->postContext[0] = 0;
83     }
84     errorReason = NULL;
85     parse(ruleString, errorCode);
86 }
87 
88 void
parse(const UnicodeString & ruleString,UErrorCode & errorCode)89 CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
90     if(U_FAILURE(errorCode)) { return; }
91     rules = &ruleString;
92     ruleIndex = 0;
93 
94     while(ruleIndex < rules->length()) {
95         UChar c = rules->charAt(ruleIndex);
96         if(PatternProps::isWhiteSpace(c)) {
97             ++ruleIndex;
98             continue;
99         }
100         switch(c) {
101         case 0x26:  // '&'
102             parseRuleChain(errorCode);
103             break;
104         case 0x5b:  // '['
105             parseSetting(errorCode);
106             break;
107         case 0x23:  // '#' starts a comment, until the end of the line
108             ruleIndex = skipComment(ruleIndex + 1);
109             break;
110         case 0x40:  // '@' is equivalent to [backwards 2]
111             settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
112                               UCOL_ON, 0, errorCode);
113             ++ruleIndex;
114             break;
115         case 0x21:  // '!' used to turn on Thai/Lao character reversal
116             // Accept but ignore. The root collator has contractions
117             // that are equivalent to the character reversal, where appropriate.
118             ++ruleIndex;
119             break;
120         default:
121             setParseError("expected a reset or setting or comment", errorCode);
122             break;
123         }
124         if(U_FAILURE(errorCode)) { return; }
125     }
126 }
127 
128 void
parseRuleChain(UErrorCode & errorCode)129 CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
130     int32_t resetStrength = parseResetAndPosition(errorCode);
131     UBool isFirstRelation = TRUE;
132     for(;;) {
133         int32_t result = parseRelationOperator(errorCode);
134         if(U_FAILURE(errorCode)) { return; }
135         if(result < 0) {
136             if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
137                 // '#' starts a comment, until the end of the line
138                 ruleIndex = skipComment(ruleIndex + 1);
139                 continue;
140             }
141             if(isFirstRelation) {
142                 setParseError("reset not followed by a relation", errorCode);
143             }
144             return;
145         }
146         int32_t strength = result & STRENGTH_MASK;
147         if(resetStrength < UCOL_IDENTICAL) {
148             // reset-before rule chain
149             if(isFirstRelation) {
150                 if(strength != resetStrength) {
151                     setParseError("reset-before strength differs from its first relation", errorCode);
152                     return;
153                 }
154             } else {
155                 if(strength < resetStrength) {
156                     setParseError("reset-before strength followed by a stronger relation", errorCode);
157                     return;
158                 }
159             }
160         }
161         int32_t i = ruleIndex + (result >> OFFSET_SHIFT);  // skip over the relation operator
162         if((result & STARRED_FLAG) == 0) {
163             parseRelationStrings(strength, i, errorCode);
164         } else {
165             parseStarredCharacters(strength, i, errorCode);
166         }
167         if(U_FAILURE(errorCode)) { return; }
168         isFirstRelation = FALSE;
169     }
170 }
171 
172 int32_t
parseResetAndPosition(UErrorCode & errorCode)173 CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
174     if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
175     int32_t i = skipWhiteSpace(ruleIndex + 1);
176     int32_t j;
177     UChar c;
178     int32_t resetStrength;
179     if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
180             (j = i + BEFORE_LENGTH) < rules->length() &&
181             PatternProps::isWhiteSpace(rules->charAt(j)) &&
182             ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
183             0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
184             rules->charAt(j + 1) == 0x5d) {
185         // &[before n] with n=1 or 2 or 3
186         resetStrength = UCOL_PRIMARY + (c - 0x31);
187         i = skipWhiteSpace(j + 2);
188     } else {
189         resetStrength = UCOL_IDENTICAL;
190     }
191     if(i >= rules->length()) {
192         setParseError("reset without position", errorCode);
193         return UCOL_DEFAULT;
194     }
195     UnicodeString str;
196     if(rules->charAt(i) == 0x5b) {  // '['
197         i = parseSpecialPosition(i, str, errorCode);
198     } else {
199         i = parseTailoringString(i, str, errorCode);
200     }
201     sink->addReset(resetStrength, str, errorReason, errorCode);
202     if(U_FAILURE(errorCode)) { setErrorContext(); }
203     ruleIndex = i;
204     return resetStrength;
205 }
206 
207 int32_t
parseRelationOperator(UErrorCode & errorCode)208 CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
209     if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
210     ruleIndex = skipWhiteSpace(ruleIndex);
211     if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
212     int32_t strength;
213     int32_t i = ruleIndex;
214     UChar c = rules->charAt(i++);
215     switch(c) {
216     case 0x3c:  // '<'
217         if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<
218             ++i;
219             if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<
220                 ++i;
221                 if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<<
222                     ++i;
223                     strength = UCOL_QUATERNARY;
224                 } else {
225                     strength = UCOL_TERTIARY;
226                 }
227             } else {
228                 strength = UCOL_SECONDARY;
229             }
230         } else {
231             strength = UCOL_PRIMARY;
232         }
233         if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
234             ++i;
235             strength |= STARRED_FLAG;
236         }
237         break;
238     case 0x3b:  // ';' same as <<
239         strength = UCOL_SECONDARY;
240         break;
241     case 0x2c:  // ',' same as <<<
242         strength = UCOL_TERTIARY;
243         break;
244     case 0x3d:  // '='
245         strength = UCOL_IDENTICAL;
246         if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
247             ++i;
248             strength |= STARRED_FLAG;
249         }
250         break;
251     default:
252         return UCOL_DEFAULT;
253     }
254     return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
255 }
256 
257 void
parseRelationStrings(int32_t strength,int32_t i,UErrorCode & errorCode)258 CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
259     // Parse
260     //     prefix | str / extension
261     // where prefix and extension are optional.
262     UnicodeString prefix, str, extension;
263     i = parseTailoringString(i, str, errorCode);
264     if(U_FAILURE(errorCode)) { return; }
265     UChar next = (i < rules->length()) ? rules->charAt(i) : 0;
266     if(next == 0x7c) {  // '|' separates the context prefix from the string.
267         prefix = str;
268         i = parseTailoringString(i + 1, str, errorCode);
269         if(U_FAILURE(errorCode)) { return; }
270         next = (i < rules->length()) ? rules->charAt(i) : 0;
271     }
272     if(next == 0x2f) {  // '/' separates the string from the extension.
273         i = parseTailoringString(i + 1, extension, errorCode);
274     }
275     if(!prefix.isEmpty()) {
276         UChar32 prefix0 = prefix.char32At(0);
277         UChar32 c = str.char32At(0);
278         if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
279             setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
280                           errorCode);
281             return;
282         }
283     }
284     sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
285     if(U_FAILURE(errorCode)) { setErrorContext(); }
286     ruleIndex = i;
287 }
288 
289 void
parseStarredCharacters(int32_t strength,int32_t i,UErrorCode & errorCode)290 CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
291     UnicodeString empty, raw;
292     i = parseString(skipWhiteSpace(i), raw, errorCode);
293     if(U_FAILURE(errorCode)) { return; }
294     if(raw.isEmpty()) {
295         setParseError("missing starred-relation string", errorCode);
296         return;
297     }
298     UChar32 prev = -1;
299     int32_t j = 0;
300     for(;;) {
301         while(j < raw.length()) {
302             UChar32 c = raw.char32At(j);
303             if(!nfd.isInert(c)) {
304                 setParseError("starred-relation string is not all NFD-inert", errorCode);
305                 return;
306             }
307             sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
308             if(U_FAILURE(errorCode)) {
309                 setErrorContext();
310                 return;
311             }
312             j += U16_LENGTH(c);
313             prev = c;
314         }
315         if(i >= rules->length() || rules->charAt(i) != 0x2d) {  // '-'
316             break;
317         }
318         if(prev < 0) {
319             setParseError("range without start in starred-relation string", errorCode);
320             return;
321         }
322         i = parseString(i + 1, raw, errorCode);
323         if(U_FAILURE(errorCode)) { return; }
324         if(raw.isEmpty()) {
325             setParseError("range without end in starred-relation string", errorCode);
326             return;
327         }
328         UChar32 c = raw.char32At(0);
329         if(c < prev) {
330             setParseError("range start greater than end in starred-relation string", errorCode);
331             return;
332         }
333         // range prev-c
334         UnicodeString s;
335         while(++prev <= c) {
336             if(!nfd.isInert(prev)) {
337                 setParseError("starred-relation string range is not all NFD-inert", errorCode);
338                 return;
339             }
340             if(U_IS_SURROGATE(prev)) {
341                 setParseError("starred-relation string range contains a surrogate", errorCode);
342                 return;
343             }
344             if(0xfffd <= prev && prev <= 0xffff) {
345                 setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
346                 return;
347             }
348             s.setTo(prev);
349             sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
350             if(U_FAILURE(errorCode)) {
351                 setErrorContext();
352                 return;
353             }
354         }
355         prev = -1;
356         j = U16_LENGTH(c);
357     }
358     ruleIndex = skipWhiteSpace(i);
359 }
360 
361 int32_t
parseTailoringString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)362 CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
363     i = parseString(skipWhiteSpace(i), raw, errorCode);
364     if(U_SUCCESS(errorCode) && raw.isEmpty()) {
365         setParseError("missing relation string", errorCode);
366     }
367     return skipWhiteSpace(i);
368 }
369 
370 int32_t
parseString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)371 CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
372     if(U_FAILURE(errorCode)) { return i; }
373     raw.remove();
374     while(i < rules->length()) {
375         UChar32 c = rules->charAt(i++);
376         if(isSyntaxChar(c)) {
377             if(c == 0x27) {  // apostrophe
378                 if(i < rules->length() && rules->charAt(i) == 0x27) {
379                     // Double apostrophe, encodes a single one.
380                     raw.append((UChar)0x27);
381                     ++i;
382                     continue;
383                 }
384                 // Quote literal text until the next single apostrophe.
385                 for(;;) {
386                     if(i == rules->length()) {
387                         setParseError("quoted literal text missing terminating apostrophe", errorCode);
388                         return i;
389                     }
390                     c = rules->charAt(i++);
391                     if(c == 0x27) {
392                         if(i < rules->length() && rules->charAt(i) == 0x27) {
393                             // Double apostrophe inside quoted literal text,
394                             // still encodes a single apostrophe.
395                             ++i;
396                         } else {
397                             break;
398                         }
399                     }
400                     raw.append((UChar)c);
401                 }
402             } else if(c == 0x5c) {  // backslash
403                 if(i == rules->length()) {
404                     setParseError("backslash escape at the end of the rule string", errorCode);
405                     return i;
406                 }
407                 c = rules->char32At(i);
408                 raw.append(c);
409                 i += U16_LENGTH(c);
410             } else {
411                 // Any other syntax character terminates a string.
412                 --i;
413                 break;
414             }
415         } else if(PatternProps::isWhiteSpace(c)) {
416             // Unquoted white space terminates a string.
417             --i;
418             break;
419         } else {
420             raw.append((UChar)c);
421         }
422     }
423     for(int32_t j = 0; j < raw.length();) {
424         UChar32 c = raw.char32At(j);
425         if(U_IS_SURROGATE(c)) {
426             setParseError("string contains an unpaired surrogate", errorCode);
427             return i;
428         }
429         if(0xfffd <= c && c <= 0xffff) {
430             setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
431             return i;
432         }
433         j += U16_LENGTH(c);
434     }
435     return i;
436 }
437 
namespace {

// Names of the special reset positions.
// The array index of a name is added to POS_BASE when the position is encoded,
// so the order of the entries matters.
static const char *const positions[] = {
    "first tertiary ignorable",  "last tertiary ignorable",
    "first secondary ignorable", "last secondary ignorable",
    "first primary ignorable",   "last primary ignorable",
    "first variable",            "last variable",
    "first regular",             "last regular",
    "first implicit",            "last implicit",
    "first trailing",            "last trailing"
};

}  // namespace
458 
459 int32_t
parseSpecialPosition(int32_t i,UnicodeString & str,UErrorCode & errorCode)460 CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
461     if(U_FAILURE(errorCode)) { return 0; }
462     UnicodeString raw;
463     int32_t j = readWords(i + 1, raw);
464     if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) {  // words end with ]
465         ++j;
466         for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
467             if(raw == UnicodeString(positions[pos], -1, US_INV)) {
468                 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos));
469                 return j;
470             }
471         }
472         if(raw == UNICODE_STRING_SIMPLE("top")) {
473             str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR));
474             return j;
475         }
476         if(raw == UNICODE_STRING_SIMPLE("variable top")) {
477             str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE));
478             return j;
479         }
480     }
481     setParseError("not a valid special reset position", errorCode);
482     return i;
483 }
484 
485 void
parseSetting(UErrorCode & errorCode)486 CollationRuleParser::parseSetting(UErrorCode &errorCode) {
487     if(U_FAILURE(errorCode)) { return; }
488     UnicodeString raw;
489     int32_t i = ruleIndex + 1;
490     int32_t j = readWords(i, raw);
491     if(j <= i || raw.isEmpty()) {
492         setParseError("expected a setting/option at '['", errorCode);
493     }
494     if(rules->charAt(j) == 0x5d) {  // words end with ]
495         ++j;
496         if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
497                 (raw.length() == 7 || raw.charAt(7) == 0x20)) {
498             parseReordering(raw, errorCode);
499             ruleIndex = j;
500             return;
501         }
502         if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
503             settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
504                               UCOL_ON, 0, errorCode);
505             ruleIndex = j;
506             return;
507         }
508         UnicodeString v;
509         int32_t valueIndex = raw.lastIndexOf((UChar)0x20);
510         if(valueIndex >= 0) {
511             v.setTo(raw, valueIndex + 1);
512             raw.truncate(valueIndex);
513         }
514         if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
515             int32_t value = UCOL_DEFAULT;
516             UChar c = v.charAt(0);
517             if(0x31 <= c && c <= 0x34) {  // 1..4
518                 value = UCOL_PRIMARY + (c - 0x31);
519             } else if(c == 0x49) {  // 'I'
520                 value = UCOL_IDENTICAL;
521             }
522             if(value != UCOL_DEFAULT) {
523                 settings->setStrength(value, 0, errorCode);
524                 ruleIndex = j;
525                 return;
526             }
527         } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
528             UColAttributeValue value = UCOL_DEFAULT;
529             if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
530                 value = UCOL_NON_IGNORABLE;
531             } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
532                 value = UCOL_SHIFTED;
533             }
534             if(value != UCOL_DEFAULT) {
535                 settings->setAlternateHandling(value, 0, errorCode);
536                 ruleIndex = j;
537                 return;
538             }
539         } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
540             int32_t value = UCOL_DEFAULT;
541             if(v == UNICODE_STRING_SIMPLE("space")) {
542                 value = CollationSettings::MAX_VAR_SPACE;
543             } else if(v == UNICODE_STRING_SIMPLE("punct")) {
544                 value = CollationSettings::MAX_VAR_PUNCT;
545             } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
546                 value = CollationSettings::MAX_VAR_SYMBOL;
547             } else if(v == UNICODE_STRING_SIMPLE("currency")) {
548                 value = CollationSettings::MAX_VAR_CURRENCY;
549             }
550             if(value != UCOL_DEFAULT) {
551                 settings->setMaxVariable(value, 0, errorCode);
552                 settings->variableTop = baseData->getLastPrimaryForGroup(
553                     UCOL_REORDER_CODE_FIRST + value);
554                 U_ASSERT(settings->variableTop != 0);
555                 ruleIndex = j;
556                 return;
557             }
558         } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
559             UColAttributeValue value = UCOL_DEFAULT;
560             if(v == UNICODE_STRING_SIMPLE("off")) {
561                 value = UCOL_OFF;
562             } else if(v == UNICODE_STRING_SIMPLE("lower")) {
563                 value = UCOL_LOWER_FIRST;
564             } else if(v == UNICODE_STRING_SIMPLE("upper")) {
565                 value = UCOL_UPPER_FIRST;
566             }
567             if(value != UCOL_DEFAULT) {
568                 settings->setCaseFirst(value, 0, errorCode);
569                 ruleIndex = j;
570                 return;
571             }
572         } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
573             UColAttributeValue value = getOnOffValue(v);
574             if(value != UCOL_DEFAULT) {
575                 settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
576                 ruleIndex = j;
577                 return;
578             }
579         } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
580             UColAttributeValue value = getOnOffValue(v);
581             if(value != UCOL_DEFAULT) {
582                 settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
583                 ruleIndex = j;
584                 return;
585             }
586         } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
587             UColAttributeValue value = getOnOffValue(v);
588             if(value != UCOL_DEFAULT) {
589                 settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
590                 ruleIndex = j;
591                 return;
592             }
593         } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
594             UColAttributeValue value = getOnOffValue(v);
595             if(value != UCOL_DEFAULT) {
596                 if(value == UCOL_ON) {
597                     setParseError("[hiraganaQ on] is not supported", errorCode);
598                 }
599                 ruleIndex = j;
600                 return;
601             }
602         } else if(raw == UNICODE_STRING_SIMPLE("import")) {
603             CharString lang;
604             lang.appendInvariantChars(v, errorCode);
605             if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
606             // BCP 47 language tag -> ICU locale ID
607             char localeID[ULOC_FULLNAME_CAPACITY];
608             int32_t parsedLength;
609             int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,
610                                                  &parsedLength, &errorCode);
611             if(U_FAILURE(errorCode) ||
612                     parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) {
613                 errorCode = U_ZERO_ERROR;
614                 setParseError("expected language tag in [import langTag]", errorCode);
615                 return;
616             }
617             // localeID minus all keywords
618             char baseID[ULOC_FULLNAME_CAPACITY];
619             length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
620             if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
621                 errorCode = U_ZERO_ERROR;
622                 setParseError("expected language tag in [import langTag]", errorCode);
623                 return;
624             }
625             if(length == 3 && uprv_memcmp(baseID, "und", 3) == 0) {
626                 uprv_strcpy(baseID, "root");
627             }
628             // @collation=type, or length=0 if not specified
629             char collationType[ULOC_KEYWORDS_CAPACITY];
630             length = uloc_getKeywordValue(localeID, "collation",
631                                           collationType, ULOC_KEYWORDS_CAPACITY,
632                                           &errorCode);
633             if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
634                 errorCode = U_ZERO_ERROR;
635                 setParseError("expected language tag in [import langTag]", errorCode);
636                 return;
637             }
638             if(importer == NULL) {
639                 setParseError("[import langTag] is not supported", errorCode);
640             } else {
641                 UnicodeString importedRules;
642                 importer->getRules(baseID, length > 0 ? collationType : "standard",
643                                    importedRules, errorReason, errorCode);
644                 if(U_FAILURE(errorCode)) {
645                     if(errorReason == NULL) {
646                         errorReason = "[import langTag] failed";
647                     }
648                     setErrorContext();
649                     return;
650                 }
651                 const UnicodeString *outerRules = rules;
652                 int32_t outerRuleIndex = ruleIndex;
653                 parse(importedRules, errorCode);
654                 if(U_FAILURE(errorCode)) {
655                     if(parseError != NULL) {
656                         parseError->offset = outerRuleIndex;
657                     }
658                 }
659                 rules = outerRules;
660                 ruleIndex = j;
661             }
662             return;
663         }
664     } else if(rules->charAt(j) == 0x5b) {  // words end with [
665         UnicodeSet set;
666         j = parseUnicodeSet(j, set, errorCode);
667         if(U_FAILURE(errorCode)) { return; }
668         if(raw == UNICODE_STRING_SIMPLE("optimize")) {
669             sink->optimize(set, errorReason, errorCode);
670             if(U_FAILURE(errorCode)) { setErrorContext(); }
671             ruleIndex = j;
672             return;
673         } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
674             sink->suppressContractions(set, errorReason, errorCode);
675             if(U_FAILURE(errorCode)) { setErrorContext(); }
676             ruleIndex = j;
677             return;
678         }
679     }
680     setParseError("not a valid setting/option", errorCode);
681 }
682 
683 void
parseReordering(const UnicodeString & raw,UErrorCode & errorCode)684 CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
685     if(U_FAILURE(errorCode)) { return; }
686     int32_t i = 7;  // after "reorder"
687     if(i == raw.length()) {
688         // empty [reorder] with no codes
689         settings->resetReordering();
690         return;
691     }
692     // Parse the codes in [reorder aa bb cc].
693     UVector32 reorderCodes(errorCode);
694     if(U_FAILURE(errorCode)) { return; }
695     CharString word;
696     while(i < raw.length()) {
697         ++i;  // skip the word-separating space
698         int32_t limit = raw.indexOf((UChar)0x20, i);
699         if(limit < 0) { limit = raw.length(); }
700         word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
701         if(U_FAILURE(errorCode)) { return; }
702         int32_t code = getReorderCode(word.data());
703         if(code < 0) {
704             setParseError("unknown script or reorder code", errorCode);
705             return;
706         }
707         reorderCodes.addElement(code, errorCode);
708         if(U_FAILURE(errorCode)) { return; }
709         i = limit;
710     }
711     settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
712 }
713 
// Names of the special reorder groups.
// The name at index i stands for the reorder code UCOL_REORDER_CODE_FIRST + i.
static const char *const gSpecialReorderCodes[] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
717 
718 int32_t
getReorderCode(const char * word)719 CollationRuleParser::getReorderCode(const char *word) {
720     for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {
721         if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
722             return UCOL_REORDER_CODE_FIRST + i;
723         }
724     }
725     int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
726     if(script >= 0) {
727         return script;
728     }
729     if(uprv_stricmp(word, "others") == 0) {
730         return UCOL_REORDER_CODE_OTHERS;  // same as Zzzz = USCRIPT_UNKNOWN
731     }
732     return -1;
733 }
734 
735 UColAttributeValue
getOnOffValue(const UnicodeString & s)736 CollationRuleParser::getOnOffValue(const UnicodeString &s) {
737     if(s == UNICODE_STRING_SIMPLE("on")) {
738         return UCOL_ON;
739     } else if(s == UNICODE_STRING_SIMPLE("off")) {
740         return UCOL_OFF;
741     } else {
742         return UCOL_DEFAULT;
743     }
744 }
745 
746 int32_t
parseUnicodeSet(int32_t i,UnicodeSet & set,UErrorCode & errorCode)747 CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
748     // Collect a UnicodeSet pattern between a balanced pair of [brackets].
749     int32_t level = 0;
750     int32_t j = i;
751     for(;;) {
752         if(j == rules->length()) {
753             setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
754             return j;
755         }
756         UChar c = rules->charAt(j++);
757         if(c == 0x5b) {  // '['
758             ++level;
759         } else if(c == 0x5d) {  // ']'
760             if(--level == 0) { break; }
761         }
762     }
763     set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
764     if(U_FAILURE(errorCode)) {
765         errorCode = U_ZERO_ERROR;
766         setParseError("not a valid UnicodeSet pattern", errorCode);
767         return j;
768     }
769     j = skipWhiteSpace(j);
770     if(j == rules->length() || rules->charAt(j) != 0x5d) {
771         setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
772         return j;
773     }
774     return ++j;
775 }
776 
777 int32_t
readWords(int32_t i,UnicodeString & raw) const778 CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
779     static const UChar sp = 0x20;
780     raw.remove();
781     i = skipWhiteSpace(i);
782     for(;;) {
783         if(i >= rules->length()) { return 0; }
784         UChar c = rules->charAt(i);
785         if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) {  // syntax except -_
786             if(raw.isEmpty()) { return i; }
787             if(raw.endsWith(&sp, 1)) {  // remove trailing space
788                 raw.truncate(raw.length() - 1);
789             }
790             return i;
791         }
792         if(PatternProps::isWhiteSpace(c)) {
793             raw.append(sp);
794             i = skipWhiteSpace(i + 1);
795         } else {
796             raw.append(c);
797             ++i;
798         }
799     }
800 }
801 
802 int32_t
skipComment(int32_t i) const803 CollationRuleParser::skipComment(int32_t i) const {
804     // skip to past the newline
805     while(i < rules->length()) {
806         UChar c = rules->charAt(i++);
807         // LF or FF or CR or NEL or LS or PS
808         if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
809             // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
810             // NLF (new line function) = CR or LF or CR+LF or NEL.
811             // No need to collect all of CR+LF because a following LF will be ignored anyway.
812             break;
813         }
814     }
815     return i;
816 }
817 
818 void
setParseError(const char * reason,UErrorCode & errorCode)819 CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
820     if(U_FAILURE(errorCode)) { return; }
821     // Error code consistent with the old parser (from ca. 2001),
822     // rather than U_PARSE_ERROR;
823     errorCode = U_INVALID_FORMAT_ERROR;
824     errorReason = reason;
825     if(parseError != NULL) { setErrorContext(); }
826 }
827 
828 void
setErrorContext()829 CollationRuleParser::setErrorContext() {
830     if(parseError == NULL) { return; }
831 
832     // Note: This relies on the calling code maintaining the ruleIndex
833     // at a position that is useful for debugging.
834     // For example, at the beginning of a reset or relation etc.
835     parseError->offset = ruleIndex;
836     parseError->line = 0;  // We are not counting line numbers.
837 
838     // before ruleIndex
839     int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
840     if(start < 0) {
841         start = 0;
842     } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
843         ++start;
844     }
845     int32_t length = ruleIndex - start;
846     rules->extract(start, length, parseError->preContext);
847     parseError->preContext[length] = 0;
848 
849     // starting from ruleIndex
850     length = rules->length() - ruleIndex;
851     if(length >= U_PARSE_CONTEXT_LEN) {
852         length = U_PARSE_CONTEXT_LEN - 1;
853         if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
854             --length;
855         }
856     }
857     rules->extract(ruleIndex, length, parseError->postContext);
858     parseError->postContext[length] = 0;
859 }
860 
861 UBool
isSyntaxChar(UChar32 c)862 CollationRuleParser::isSyntaxChar(UChar32 c) {
863     return 0x21 <= c && c <= 0x7e &&
864             (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
865             (0x5b <= c && c <= 0x60) || (0x7b <= c));
866 }
867 
868 int32_t
skipWhiteSpace(int32_t i) const869 CollationRuleParser::skipWhiteSpace(int32_t i) const {
870     while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
871         ++i;
872     }
873     return i;
874 }
875 
876 U_NAMESPACE_END
877 
878 #endif  // !UCONFIG_NO_COLLATION
879