1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *   Copyright (C) 1997-2015, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 ******************************************************************************
8 *   file name:  nfrule.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 * Modification history
14 * Date        Name      Comments
15 * 10/11/2001  Doug      Ported from ICU4J
16 */
17 
18 #include "nfrule.h"
19 
20 #if U_HAVE_RBNF
21 
22 #include "unicode/localpointer.h"
23 #include "unicode/rbnf.h"
24 #include "unicode/tblcoll.h"
25 #include "unicode/plurfmt.h"
26 #include "unicode/upluralrules.h"
27 #include "unicode/coleitr.h"
28 #include "unicode/uchar.h"
29 #include "nfrs.h"
30 #include "nfrlist.h"
31 #include "nfsubs.h"
32 #include "patternprops.h"
33 
34 U_NAMESPACE_BEGIN
35 
NFRule(const RuleBasedNumberFormat * _rbnf,const UnicodeString & _ruleText,UErrorCode & status)36 NFRule::NFRule(const RuleBasedNumberFormat* _rbnf, const UnicodeString &_ruleText, UErrorCode &status)
37   : baseValue((int32_t)0)
38   , radix(10)
39   , exponent(0)
40   , decimalPoint(0)
41   , ruleText(_ruleText)
42   , sub1(NULL)
43   , sub2(NULL)
44   , formatter(_rbnf)
45   , rulePatternFormat(NULL)
46 {
47     if (!ruleText.isEmpty()) {
48         parseRuleDescriptor(ruleText, status);
49     }
50 }
51 
~NFRule()52 NFRule::~NFRule()
53 {
54     if (sub1 != sub2) {
55         delete sub2;
56         sub2 = NULL;
57     }
58     delete sub1;
59     sub1 = NULL;
60     delete rulePatternFormat;
61     rulePatternFormat = NULL;
62 }
63 
64 static const UChar gLeftBracket = 0x005b;
65 static const UChar gRightBracket = 0x005d;
66 static const UChar gColon = 0x003a;
67 static const UChar gZero = 0x0030;
68 static const UChar gNine = 0x0039;
69 static const UChar gSpace = 0x0020;
70 static const UChar gSlash = 0x002f;
71 static const UChar gGreaterThan = 0x003e;
72 static const UChar gLessThan = 0x003c;
73 static const UChar gComma = 0x002c;
74 static const UChar gDot = 0x002e;
75 static const UChar gTick = 0x0027;
76 //static const UChar gMinus = 0x002d;
77 static const UChar gSemicolon = 0x003b;
78 static const UChar gX = 0x0078;
79 
80 static const UChar gMinusX[] =                  {0x2D, 0x78, 0};    /* "-x" */
81 static const UChar gInf[] =                     {0x49, 0x6E, 0x66, 0}; /* "Inf" */
82 static const UChar gNaN[] =                     {0x4E, 0x61, 0x4E, 0}; /* "NaN" */
83 
84 static const UChar gDollarOpenParenthesis[] =   {0x24, 0x28, 0}; /* "$(" */
85 static const UChar gClosedParenthesisDollar[] = {0x29, 0x24, 0}; /* ")$" */
86 
87 static const UChar gLessLess[] =                {0x3C, 0x3C, 0};    /* "<<" */
88 static const UChar gLessPercent[] =             {0x3C, 0x25, 0};    /* "<%" */
89 static const UChar gLessHash[] =                {0x3C, 0x23, 0};    /* "<#" */
90 static const UChar gLessZero[] =                {0x3C, 0x30, 0};    /* "<0" */
91 static const UChar gGreaterGreater[] =          {0x3E, 0x3E, 0};    /* ">>" */
92 static const UChar gGreaterPercent[] =          {0x3E, 0x25, 0};    /* ">%" */
93 static const UChar gGreaterHash[] =             {0x3E, 0x23, 0};    /* ">#" */
94 static const UChar gGreaterZero[] =             {0x3E, 0x30, 0};    /* ">0" */
95 static const UChar gEqualPercent[] =            {0x3D, 0x25, 0};    /* "=%" */
96 static const UChar gEqualHash[] =               {0x3D, 0x23, 0};    /* "=#" */
97 static const UChar gEqualZero[] =               {0x3D, 0x30, 0};    /* "=0" */
98 static const UChar gGreaterGreaterGreater[] =   {0x3E, 0x3E, 0x3E, 0}; /* ">>>" */
99 
100 static const UChar * const RULE_PREFIXES[] = {
101     gLessLess, gLessPercent, gLessHash, gLessZero,
102     gGreaterGreater, gGreaterPercent,gGreaterHash, gGreaterZero,
103     gEqualPercent, gEqualHash, gEqualZero, NULL
104 };
105 
106 void
makeRules(UnicodeString & description,NFRuleSet * owner,const NFRule * predecessor,const RuleBasedNumberFormat * rbnf,NFRuleList & rules,UErrorCode & status)107 NFRule::makeRules(UnicodeString& description,
108                   NFRuleSet *owner,
109                   const NFRule *predecessor,
110                   const RuleBasedNumberFormat *rbnf,
111                   NFRuleList& rules,
112                   UErrorCode& status)
113 {
114     // we know we're making at least one rule, so go ahead and
115     // new it up and initialize its basevalue and divisor
116     // (this also strips the rule descriptor, if any, off the
117     // descripton string)
118     NFRule* rule1 = new NFRule(rbnf, description, status);
119     /* test for NULL */
120     if (rule1 == 0) {
121         status = U_MEMORY_ALLOCATION_ERROR;
122         return;
123     }
124     description = rule1->ruleText;
125 
126     // check the description to see whether there's text enclosed
127     // in brackets
128     int32_t brack1 = description.indexOf(gLeftBracket);
129     int32_t brack2 = brack1 < 0 ? -1 : description.indexOf(gRightBracket);
130 
131     // if the description doesn't contain a matched pair of brackets,
132     // or if it's of a type that doesn't recognize bracketed text,
133     // then leave the description alone, initialize the rule's
134     // rule text and substitutions, and return that rule
135     if (brack2 < 0 || brack1 > brack2
136         || rule1->getType() == kProperFractionRule
137         || rule1->getType() == kNegativeNumberRule
138         || rule1->getType() == kInfinityRule
139         || rule1->getType() == kNaNRule)
140     {
141         rule1->extractSubstitutions(owner, description, predecessor, status);
142     }
143     else {
144         // if the description does contain a matched pair of brackets,
145         // then it's really shorthand for two rules (with one exception)
146         NFRule* rule2 = NULL;
147         UnicodeString sbuf;
148 
149         // we'll actually only split the rule into two rules if its
150         // base value is an even multiple of its divisor (or it's one
151         // of the special rules)
152         if ((rule1->baseValue > 0
153             && (rule1->baseValue % util64_pow(rule1->radix, rule1->exponent)) == 0)
154             || rule1->getType() == kImproperFractionRule
155             || rule1->getType() == kMasterRule) {
156 
157             // if it passes that test, new up the second rule.  If the
158             // rule set both rules will belong to is a fraction rule
159             // set, they both have the same base value; otherwise,
160             // increment the original rule's base value ("rule1" actually
161             // goes SECOND in the rule set's rule list)
162             rule2 = new NFRule(rbnf, UnicodeString(), status);
163             /* test for NULL */
164             if (rule2 == 0) {
165                 status = U_MEMORY_ALLOCATION_ERROR;
166                 return;
167             }
168             if (rule1->baseValue >= 0) {
169                 rule2->baseValue = rule1->baseValue;
170                 if (!owner->isFractionRuleSet()) {
171                     ++rule1->baseValue;
172                 }
173             }
174 
175             // if the description began with "x.x" and contains bracketed
176             // text, it describes both the improper fraction rule and
177             // the proper fraction rule
178             else if (rule1->getType() == kImproperFractionRule) {
179                 rule2->setType(kProperFractionRule);
180             }
181 
182             // if the description began with "x.0" and contains bracketed
183             // text, it describes both the master rule and the
184             // improper fraction rule
185             else if (rule1->getType() == kMasterRule) {
186                 rule2->baseValue = rule1->baseValue;
187                 rule1->setType(kImproperFractionRule);
188             }
189 
190             // both rules have the same radix and exponent (i.e., the
191             // same divisor)
192             rule2->radix = rule1->radix;
193             rule2->exponent = rule1->exponent;
194 
195             // rule2's rule text omits the stuff in brackets: initalize
196             // its rule text and substitutions accordingly
197             sbuf.append(description, 0, brack1);
198             if (brack2 + 1 < description.length()) {
199                 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1);
200             }
201             rule2->extractSubstitutions(owner, sbuf, predecessor, status);
202         }
203 
204         // rule1's text includes the text in the brackets but omits
205         // the brackets themselves: initialize _its_ rule text and
206         // substitutions accordingly
207         sbuf.setTo(description, 0, brack1);
208         sbuf.append(description, brack1 + 1, brack2 - brack1 - 1);
209         if (brack2 + 1 < description.length()) {
210             sbuf.append(description, brack2 + 1, description.length() - brack2 - 1);
211         }
212         rule1->extractSubstitutions(owner, sbuf, predecessor, status);
213 
214         // if we only have one rule, return it; if we have two, return
215         // a two-element array containing them (notice that rule2 goes
216         // BEFORE rule1 in the list: in all cases, rule2 OMITS the
217         // material in the brackets and rule1 INCLUDES the material
218         // in the brackets)
219         if (rule2 != NULL) {
220             if (rule2->baseValue >= kNoBase) {
221                 rules.add(rule2);
222             }
223             else {
224                 owner->setNonNumericalRule(rule2);
225             }
226         }
227     }
228     if (rule1->baseValue >= kNoBase) {
229         rules.add(rule1);
230     }
231     else {
232         owner->setNonNumericalRule(rule1);
233     }
234 }
235 
236 /**
237  * This function parses the rule's rule descriptor (i.e., the base
238  * value and/or other tokens that precede the rule's rule text
239  * in the description) and sets the rule's base value, radix, and
240  * exponent according to the descriptor.  (If the description doesn't
241  * include a rule descriptor, then this function sets everything to
242  * default values and the rule set sets the rule's real base value).
243  * @param description The rule's description
244  * @return If "description" included a rule descriptor, this is
245  * "description" with the descriptor and any trailing whitespace
246  * stripped off.  Otherwise; it's "descriptor" unchangd.
247  */
248 void
parseRuleDescriptor(UnicodeString & description,UErrorCode & status)249 NFRule::parseRuleDescriptor(UnicodeString& description, UErrorCode& status)
250 {
251     // the description consists of a rule descriptor and a rule body,
252     // separated by a colon.  The rule descriptor is optional.  If
253     // it's omitted, just set the base value to 0.
254     int32_t p = description.indexOf(gColon);
255     if (p != -1) {
256         // copy the descriptor out into its own string and strip it,
257         // along with any trailing whitespace, out of the original
258         // description
259         UnicodeString descriptor;
260         descriptor.setTo(description, 0, p);
261 
262         ++p;
263         while (p < description.length() && PatternProps::isWhiteSpace(description.charAt(p))) {
264             ++p;
265         }
266         description.removeBetween(0, p);
267 
268         // check first to see if the rule descriptor matches the token
269         // for one of the special rules.  If it does, set the base
270         // value to the correct identifier value
271         int descriptorLength = descriptor.length();
272         UChar firstChar = descriptor.charAt(0);
273         UChar lastChar = descriptor.charAt(descriptorLength - 1);
274         if (firstChar >= gZero && firstChar <= gNine && lastChar != gX) {
275             // if the rule descriptor begins with a digit, it's a descriptor
276             // for a normal rule
277             // since we don't have Long.parseLong, and this isn't much work anyway,
278             // just build up the value as we encounter the digits.
279             int64_t val = 0;
280             p = 0;
281             UChar c = gSpace;
282 
283             // begin parsing the descriptor: copy digits
284             // into "tempValue", skip periods, commas, and spaces,
285             // stop on a slash or > sign (or at the end of the string),
286             // and throw an exception on any other character
287             int64_t ll_10 = 10;
288             while (p < descriptorLength) {
289                 c = descriptor.charAt(p);
290                 if (c >= gZero && c <= gNine) {
291                     val = val * ll_10 + (int32_t)(c - gZero);
292                 }
293                 else if (c == gSlash || c == gGreaterThan) {
294                     break;
295                 }
296                 else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) {
297                 }
298                 else {
299                     // throw new IllegalArgumentException("Illegal character in rule descriptor");
300                     status = U_PARSE_ERROR;
301                     return;
302                 }
303                 ++p;
304             }
305 
306             // we have the base value, so set it
307             setBaseValue(val, status);
308 
309             // if we stopped the previous loop on a slash, we're
310             // now parsing the rule's radix.  Again, accumulate digits
311             // in tempValue, skip punctuation, stop on a > mark, and
312             // throw an exception on anything else
313             if (c == gSlash) {
314                 val = 0;
315                 ++p;
316                 int64_t ll_10 = 10;
317                 while (p < descriptorLength) {
318                     c = descriptor.charAt(p);
319                     if (c >= gZero && c <= gNine) {
320                         val = val * ll_10 + (int32_t)(c - gZero);
321                     }
322                     else if (c == gGreaterThan) {
323                         break;
324                     }
325                     else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) {
326                     }
327                     else {
328                         // throw new IllegalArgumentException("Illegal character is rule descriptor");
329                         status = U_PARSE_ERROR;
330                         return;
331                     }
332                     ++p;
333                 }
334 
335                 // tempValue now contain's the rule's radix.  Set it
336                 // accordingly, and recalculate the rule's exponent
337                 radix = (int32_t)val;
338                 if (radix == 0) {
339                     // throw new IllegalArgumentException("Rule can't have radix of 0");
340                     status = U_PARSE_ERROR;
341                 }
342 
343                 exponent = expectedExponent();
344             }
345 
346             // if we stopped the previous loop on a > sign, then continue
347             // for as long as we still see > signs.  For each one,
348             // decrement the exponent (unless the exponent is already 0).
349             // If we see another character before reaching the end of
350             // the descriptor, that's also a syntax error.
351             if (c == gGreaterThan) {
352                 while (p < descriptor.length()) {
353                     c = descriptor.charAt(p);
354                     if (c == gGreaterThan && exponent > 0) {
355                         --exponent;
356                     } else {
357                         // throw new IllegalArgumentException("Illegal character in rule descriptor");
358                         status = U_PARSE_ERROR;
359                         return;
360                     }
361                     ++p;
362                 }
363             }
364         }
365         else if (0 == descriptor.compare(gMinusX, 2)) {
366             setType(kNegativeNumberRule);
367         }
368         else if (descriptorLength == 3) {
369             if (firstChar == gZero && lastChar == gX) {
370                 setBaseValue(kProperFractionRule, status);
371                 decimalPoint = descriptor.charAt(1);
372             }
373             else if (firstChar == gX && lastChar == gX) {
374                 setBaseValue(kImproperFractionRule, status);
375                 decimalPoint = descriptor.charAt(1);
376             }
377             else if (firstChar == gX && lastChar == gZero) {
378                 setBaseValue(kMasterRule, status);
379                 decimalPoint = descriptor.charAt(1);
380             }
381             else if (descriptor.compare(gNaN, 3) == 0) {
382                 setBaseValue(kNaNRule, status);
383             }
384             else if (descriptor.compare(gInf, 3) == 0) {
385                 setBaseValue(kInfinityRule, status);
386             }
387         }
388     }
389     // else use the default base value for now.
390 
391     // finally, if the rule body begins with an apostrophe, strip it off
392     // (this is generally used to put whitespace at the beginning of
393     // a rule's rule text)
394     if (description.length() > 0 && description.charAt(0) == gTick) {
395         description.removeBetween(0, 1);
396     }
397 
398     // return the description with all the stuff we've just waded through
399     // stripped off the front.  It now contains just the rule body.
400     // return description;
401 }
402 
403 /**
404 * Searches the rule's rule text for the substitution tokens,
405 * creates the substitutions, and removes the substitution tokens
406 * from the rule's rule text.
407 * @param owner The rule set containing this rule
408 * @param predecessor The rule preseding this one in "owners" rule list
409 * @param ownersOwner The RuleBasedFormat that owns this rule
410 */
411 void
extractSubstitutions(const NFRuleSet * ruleSet,const UnicodeString & ruleText,const NFRule * predecessor,UErrorCode & status)412 NFRule::extractSubstitutions(const NFRuleSet* ruleSet,
413                              const UnicodeString &ruleText,
414                              const NFRule* predecessor,
415                              UErrorCode& status)
416 {
417     if (U_FAILURE(status)) {
418         return;
419     }
420     this->ruleText = ruleText;
421     sub1 = extractSubstitution(ruleSet, predecessor, status);
422     if (sub1 == NULL) {
423         // Small optimization. There is no need to create a redundant NullSubstitution.
424         sub2 = NULL;
425     }
426     else {
427         sub2 = extractSubstitution(ruleSet, predecessor, status);
428     }
429     int32_t pluralRuleStart = this->ruleText.indexOf(gDollarOpenParenthesis, -1, 0);
430     int32_t pluralRuleEnd = (pluralRuleStart >= 0 ? this->ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart) : -1);
431     if (pluralRuleEnd >= 0) {
432         int32_t endType = this->ruleText.indexOf(gComma, pluralRuleStart);
433         if (endType < 0) {
434             status = U_PARSE_ERROR;
435             return;
436         }
437         UnicodeString type(this->ruleText.tempSubString(pluralRuleStart + 2, endType - pluralRuleStart - 2));
438         UPluralType pluralType;
439         if (type.startsWith(UNICODE_STRING_SIMPLE("cardinal"))) {
440             pluralType = UPLURAL_TYPE_CARDINAL;
441         }
442         else if (type.startsWith(UNICODE_STRING_SIMPLE("ordinal"))) {
443             pluralType = UPLURAL_TYPE_ORDINAL;
444         }
445         else {
446             status = U_ILLEGAL_ARGUMENT_ERROR;
447             return;
448         }
449         rulePatternFormat = formatter->createPluralFormat(pluralType,
450                 this->ruleText.tempSubString(endType + 1, pluralRuleEnd - endType - 1), status);
451     }
452 }
453 
454 /**
455 * Searches the rule's rule text for the first substitution token,
456 * creates a substitution based on it, and removes the token from
457 * the rule's rule text.
458 * @param owner The rule set containing this rule
459 * @param predecessor The rule preceding this one in the rule set's
460 * rule list
461 * @param ownersOwner The RuleBasedNumberFormat that owns this rule
462 * @return The newly-created substitution.  This is never null; if
463 * the rule text doesn't contain any substitution tokens, this will
464 * be a NullSubstitution.
465 */
466 NFSubstitution *
extractSubstitution(const NFRuleSet * ruleSet,const NFRule * predecessor,UErrorCode & status)467 NFRule::extractSubstitution(const NFRuleSet* ruleSet,
468                             const NFRule* predecessor,
469                             UErrorCode& status)
470 {
471     NFSubstitution* result = NULL;
472 
473     // search the rule's rule text for the first two characters of
474     // a substitution token
475     int32_t subStart = indexOfAnyRulePrefix();
476     int32_t subEnd = subStart;
477 
478     // if we didn't find one, create a null substitution positioned
479     // at the end of the rule text
480     if (subStart == -1) {
481         return NULL;
482     }
483 
484     // special-case the ">>>" token, since searching for the > at the
485     // end will actually find the > in the middle
486     if (ruleText.indexOf(gGreaterGreaterGreater, 3, 0) == subStart) {
487         subEnd = subStart + 2;
488 
489         // otherwise the substitution token ends with the same character
490         // it began with
491     } else {
492         UChar c = ruleText.charAt(subStart);
493         subEnd = ruleText.indexOf(c, subStart + 1);
494         // special case for '<%foo<<'
495         if (c == gLessThan && subEnd != -1 && subEnd < ruleText.length() - 1 && ruleText.charAt(subEnd+1) == c) {
496             // ordinals use "=#,##0==%abbrev=" as their rule.  Notice that the '==' in the middle
497             // occurs because of the juxtaposition of two different rules.  The check for '<' is a hack
498             // to get around this.  Having the duplicate at the front would cause problems with
499             // rules like "<<%" to format, say, percents...
500             ++subEnd;
501         }
502    }
503 
504     // if we don't find the end of the token (i.e., if we're on a single,
505     // unmatched token character), create a null substitution positioned
506     // at the end of the rule
507     if (subEnd == -1) {
508         return NULL;
509     }
510 
511     // if we get here, we have a real substitution token (or at least
512     // some text bounded by substitution token characters).  Use
513     // makeSubstitution() to create the right kind of substitution
514     UnicodeString subToken;
515     subToken.setTo(ruleText, subStart, subEnd + 1 - subStart);
516     result = NFSubstitution::makeSubstitution(subStart, this, predecessor, ruleSet,
517         this->formatter, subToken, status);
518 
519     // remove the substitution from the rule text
520     ruleText.removeBetween(subStart, subEnd+1);
521 
522     return result;
523 }
524 
525 /**
526  * Sets the rule's base value, and causes the radix and exponent
527  * to be recalculated.  This is used during construction when we
528  * don't know the rule's base value until after it's been
529  * constructed.  It should be used at any other time.
530  * @param The new base value for the rule.
531  */
532 void
setBaseValue(int64_t newBaseValue,UErrorCode & status)533 NFRule::setBaseValue(int64_t newBaseValue, UErrorCode& status)
534 {
535     // set the base value
536     baseValue = newBaseValue;
537     radix = 10;
538 
539     // if this isn't a special rule, recalculate the radix and exponent
540     // (the radix always defaults to 10; if it's supposed to be something
541     // else, it's cleaned up by the caller and the exponent is
542     // recalculated again-- the only function that does this is
543     // NFRule.parseRuleDescriptor() )
544     if (baseValue >= 1) {
545         exponent = expectedExponent();
546 
547         // this function gets called on a fully-constructed rule whose
548         // description didn't specify a base value.  This means it
549         // has substitutions, and some substitutions hold on to copies
550         // of the rule's divisor.  Fix their copies of the divisor.
551         if (sub1 != NULL) {
552             sub1->setDivisor(radix, exponent, status);
553         }
554         if (sub2 != NULL) {
555             sub2->setDivisor(radix, exponent, status);
556         }
557 
558         // if this is a special rule, its radix and exponent are basically
559         // ignored.  Set them to "safe" default values
560     } else {
561         exponent = 0;
562     }
563 }
564 
565 /**
566 * This calculates the rule's exponent based on its radix and base
567 * value.  This will be the highest power the radix can be raised to
568 * and still produce a result less than or equal to the base value.
569 */
570 int16_t
expectedExponent() const571 NFRule::expectedExponent() const
572 {
573     // since the log of 0, or the log base 0 of something, causes an
574     // error, declare the exponent in these cases to be 0 (we also
575     // deal with the special-rule identifiers here)
576     if (radix == 0 || baseValue < 1) {
577         return 0;
578     }
579 
580     // we get rounding error in some cases-- for example, log 1000 / log 10
581     // gives us 1.9999999996 instead of 2.  The extra logic here is to take
582     // that into account
583     int16_t tempResult = (int16_t)(uprv_log((double)baseValue) / uprv_log((double)radix));
584     int64_t temp = util64_pow(radix, tempResult + 1);
585     if (temp <= baseValue) {
586         tempResult += 1;
587     }
588     return tempResult;
589 }
590 
591 /**
592  * Searches the rule's rule text for any of the specified strings.
593  * @return The index of the first match in the rule's rule text
594  * (i.e., the first substring in the rule's rule text that matches
595  * _any_ of the strings in "strings").  If none of the strings in
596  * "strings" is found in the rule's rule text, returns -1.
597  */
598 int32_t
indexOfAnyRulePrefix() const599 NFRule::indexOfAnyRulePrefix() const
600 {
601     int result = -1;
602     for (int i = 0; RULE_PREFIXES[i]; i++) {
603         int32_t pos = ruleText.indexOf(*RULE_PREFIXES[i]);
604         if (pos != -1 && (result == -1 || pos < result)) {
605             result = pos;
606         }
607     }
608     return result;
609 }
610 
611 //-----------------------------------------------------------------------
612 // boilerplate
613 //-----------------------------------------------------------------------
614 
615 static UBool
util_equalSubstitutions(const NFSubstitution * sub1,const NFSubstitution * sub2)616 util_equalSubstitutions(const NFSubstitution* sub1, const NFSubstitution* sub2)
617 {
618     if (sub1) {
619         if (sub2) {
620             return *sub1 == *sub2;
621         }
622     } else if (!sub2) {
623         return TRUE;
624     }
625     return FALSE;
626 }
627 
628 /**
629 * Tests two rules for equality.
630 * @param that The rule to compare this one against
631 * @return True is the two rules are functionally equivalent
632 */
633 UBool
operator ==(const NFRule & rhs) const634 NFRule::operator==(const NFRule& rhs) const
635 {
636     return baseValue == rhs.baseValue
637         && radix == rhs.radix
638         && exponent == rhs.exponent
639         && ruleText == rhs.ruleText
640         && util_equalSubstitutions(sub1, rhs.sub1)
641         && util_equalSubstitutions(sub2, rhs.sub2);
642 }
643 
644 /**
645 * Returns a textual representation of the rule.  This won't
646 * necessarily be the same as the description that this rule
647 * was created with, but it will produce the same result.
648 * @return A textual description of the rule
649 */
util_append64(UnicodeString & result,int64_t n)650 static void util_append64(UnicodeString& result, int64_t n)
651 {
652     UChar buffer[256];
653     int32_t len = util64_tou(n, buffer, sizeof(buffer));
654     UnicodeString temp(buffer, len);
655     result.append(temp);
656 }
657 
658 void
_appendRuleText(UnicodeString & result) const659 NFRule::_appendRuleText(UnicodeString& result) const
660 {
661     switch (getType()) {
662     case kNegativeNumberRule: result.append(gMinusX, 2); break;
663     case kImproperFractionRule: result.append(gX).append(decimalPoint == 0 ? gDot : decimalPoint).append(gX); break;
664     case kProperFractionRule: result.append(gZero).append(decimalPoint == 0 ? gDot : decimalPoint).append(gX); break;
665     case kMasterRule: result.append(gX).append(decimalPoint == 0 ? gDot : decimalPoint).append(gZero); break;
666     case kInfinityRule: result.append(gInf, 3); break;
667     case kNaNRule: result.append(gNaN, 3); break;
668     default:
669         // for a normal rule, write out its base value, and if the radix is
670         // something other than 10, write out the radix (with the preceding
671         // slash, of course).  Then calculate the expected exponent and if
672         // if isn't the same as the actual exponent, write an appropriate
673         // number of > signs.  Finally, terminate the whole thing with
674         // a colon.
675         util_append64(result, baseValue);
676         if (radix != 10) {
677             result.append(gSlash);
678             util_append64(result, radix);
679         }
680         int numCarets = expectedExponent() - exponent;
681         for (int i = 0; i < numCarets; i++) {
682             result.append(gGreaterThan);
683         }
684         break;
685     }
686     result.append(gColon);
687     result.append(gSpace);
688 
689     // if the rule text begins with a space, write an apostrophe
690     // (whitespace after the rule descriptor is ignored; the
691     // apostrophe is used to make the whitespace significant)
692     if (ruleText.charAt(0) == gSpace && (sub1 == NULL || sub1->getPos() != 0)) {
693         result.append(gTick);
694     }
695 
696     // now, write the rule's rule text, inserting appropriate
697     // substitution tokens in the appropriate places
698     UnicodeString ruleTextCopy;
699     ruleTextCopy.setTo(ruleText);
700 
701     UnicodeString temp;
702     if (sub2 != NULL) {
703         sub2->toString(temp);
704         ruleTextCopy.insert(sub2->getPos(), temp);
705     }
706     if (sub1 != NULL) {
707         sub1->toString(temp);
708         ruleTextCopy.insert(sub1->getPos(), temp);
709     }
710 
711     result.append(ruleTextCopy);
712 
713     // and finally, top the whole thing off with a semicolon and
714     // return the result
715     result.append(gSemicolon);
716 }
717 
718 //-----------------------------------------------------------------------
719 // formatting
720 //-----------------------------------------------------------------------
721 
722 /**
723 * Formats the number, and inserts the resulting text into
724 * toInsertInto.
725 * @param number The number being formatted
726 * @param toInsertInto The string where the resultant text should
727 * be inserted
728 * @param pos The position in toInsertInto where the resultant text
729 * should be inserted
730 */
731 void
doFormat(int64_t number,UnicodeString & toInsertInto,int32_t pos,int32_t recursionCount,UErrorCode & status) const732 NFRule::doFormat(int64_t number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const
733 {
734     // first, insert the rule's rule text into toInsertInto at the
735     // specified position, then insert the results of the substitutions
736     // into the right places in toInsertInto (notice we do the
737     // substitutions in reverse order so that the offsets don't get
738     // messed up)
739     int32_t pluralRuleStart = ruleText.length();
740     int32_t lengthOffset = 0;
741     if (!rulePatternFormat) {
742         toInsertInto.insert(pos, ruleText);
743     }
744     else {
745         pluralRuleStart = ruleText.indexOf(gDollarOpenParenthesis, -1, 0);
746         int pluralRuleEnd = ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart);
747         int initialLength = toInsertInto.length();
748         if (pluralRuleEnd < ruleText.length() - 1) {
749             toInsertInto.insert(pos, ruleText.tempSubString(pluralRuleEnd + 2));
750         }
751         toInsertInto.insert(pos,
752             rulePatternFormat->format((int32_t)(number/uprv_pow(radix, exponent)), status));
753         if (pluralRuleStart > 0) {
754             toInsertInto.insert(pos, ruleText.tempSubString(0, pluralRuleStart));
755         }
756         lengthOffset = ruleText.length() - (toInsertInto.length() - initialLength);
757     }
758 
759     if (sub2 != NULL) {
760         sub2->doSubstitution(number, toInsertInto, pos - (sub2->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
761     }
762     if (sub1 != NULL) {
763         sub1->doSubstitution(number, toInsertInto, pos - (sub1->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
764     }
765 }
766 
767 /**
768 * Formats the number, and inserts the resulting text into
769 * toInsertInto.
770 * @param number The number being formatted
771 * @param toInsertInto The string where the resultant text should
772 * be inserted
773 * @param pos The position in toInsertInto where the resultant text
774 * should be inserted
775 */
776 void
doFormat(double number,UnicodeString & toInsertInto,int32_t pos,int32_t recursionCount,UErrorCode & status) const777 NFRule::doFormat(double number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const
778 {
779     // first, insert the rule's rule text into toInsertInto at the
780     // specified position, then insert the results of the substitutions
781     // into the right places in toInsertInto
782     // [again, we have two copies of this routine that do the same thing
783     // so that we don't sacrifice precision in a long by casting it
784     // to a double]
785     int32_t pluralRuleStart = ruleText.length();
786     int32_t lengthOffset = 0;
787     if (!rulePatternFormat) {
788         toInsertInto.insert(pos, ruleText);
789     }
790     else {
791         pluralRuleStart = ruleText.indexOf(gDollarOpenParenthesis, -1, 0);
792         int pluralRuleEnd = ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart);
793         int initialLength = toInsertInto.length();
794         if (pluralRuleEnd < ruleText.length() - 1) {
795             toInsertInto.insert(pos, ruleText.tempSubString(pluralRuleEnd + 2));
796         }
797         double pluralVal = number;
798         if (0 <= pluralVal && pluralVal < 1) {
799             // We're in a fractional rule, and we have to match the NumeratorSubstitution behavior.
800             // 2.3 can become 0.2999999999999998 for the fraction due to rounding errors.
801             pluralVal = uprv_round(pluralVal * uprv_pow(radix, exponent));
802         }
803         else {
804             pluralVal = pluralVal / uprv_pow(radix, exponent);
805         }
806         toInsertInto.insert(pos, rulePatternFormat->format((int32_t)(pluralVal), status));
807         if (pluralRuleStart > 0) {
808             toInsertInto.insert(pos, ruleText.tempSubString(0, pluralRuleStart));
809         }
810         lengthOffset = ruleText.length() - (toInsertInto.length() - initialLength);
811     }
812 
813     if (sub2 != NULL) {
814         sub2->doSubstitution(number, toInsertInto, pos - (sub2->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
815     }
816     if (sub1 != NULL) {
817         sub1->doSubstitution(number, toInsertInto, pos - (sub1->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
818     }
819 }
820 
821 /**
822 * Used by the owning rule set to determine whether to invoke the
823 * rollback rule (i.e., whether this rule or the one that precedes
824 * it in the rule set's list should be used to format the number)
825 * @param The number being formatted
826 * @return True if the rule set should use the rule that precedes
827 * this one in its list; false if it should use this rule
828 */
829 UBool
shouldRollBack(double number) const830 NFRule::shouldRollBack(double number) const
831 {
832     // we roll back if the rule contains a modulus substitution,
833     // the number being formatted is an even multiple of the rule's
834     // divisor, and the rule's base value is NOT an even multiple
835     // of its divisor
836     // In other words, if the original description had
837     //    100: << hundred[ >>];
838     // that expands into
839     //    100: << hundred;
840     //    101: << hundred >>;
841     // internally.  But when we're formatting 200, if we use the rule
842     // at 101, which would normally apply, we get "two hundred zero".
843     // To prevent this, we roll back and use the rule at 100 instead.
844     // This is the logic that makes this happen: the rule at 101 has
845     // a modulus substitution, its base value isn't an even multiple
846     // of 100, and the value we're trying to format _is_ an even
847     // multiple of 100.  This is called the "rollback rule."
848     if ((sub1 != NULL && sub1->isModulusSubstitution()) || (sub2 != NULL && sub2->isModulusSubstitution())) {
849         int64_t re = util64_pow(radix, exponent);
850         return uprv_fmod(number, (double)re) == 0 && (baseValue % re) != 0;
851     }
852     return FALSE;
853 }
854 
855 //-----------------------------------------------------------------------
856 // parsing
857 //-----------------------------------------------------------------------
858 
/**
* Attempts to parse the string with this rule.
* @param text The string being parsed
* @param parsePosition On entry, the value is ignored and assumed to
* be 0. On exit, this has been updated with the position of the first
* character not consumed by matching the text against this rule
* (if this rule doesn't match the text at all, the parse position
* is left unchanged (presumably at 0) and resVal is set to 0).
* @param isFractionRule True if this rule is contained within a
* fraction rule set.  This is only used if the rule has no
* substitutions.
* @param upperBound Upper bound handed on to matchToDelimiter() when
* the substitutions are matched.
* @param resVal Receives the parse result.  If this rule matched the
* text, this is the rule's base value combined appropriately with the
* results of parsing the substitutions.  If nothing matched, this is 0
* and the parse position is left unchanged.
* @return TRUE.  (In the original Java code the result itself was
* returned, as a Long if it was an integer and a Double otherwise; in
* this port the result is delivered through resVal instead.)
*/
#ifdef RBNF_DEBUG
#include <stdio.h>

/**
* Debug-only helper: writes a UnicodeString to a stdio stream after
* converting it to the platform's default codepage.
* @param f The stream to write to
* @param us The string to dump
*/
static void dumpUS(FILE* f, const UnicodeString& us) {
  int32_t len = us.length();
  // One UChar can convert to more than one byte, so a buffer of len+1
  // bytes is not necessarily big enough.  Preflight with a NULL target
  // to learn the converted length, then extract with an explicit bound.
  int32_t byteLen = us.extract(0, len, (char *)NULL, (uint32_t)0);
  if (byteLen < 0) {
      byteLen = 0;
  }
  char* buf = (char *)uprv_malloc((byteLen+1)*sizeof(char)); //new char[byteLen+1];
  if (buf != NULL) {
	  us.extract(0, len, buf, (uint32_t)(byteLen + 1));
	  buf[byteLen] = 0;
	  fprintf(f, "%s", buf);
	  uprv_free(buf); //delete[] buf;
  }
}
#endif
891 UBool
doParse(const UnicodeString & text,ParsePosition & parsePosition,UBool isFractionRule,double upperBound,Formattable & resVal) const892 NFRule::doParse(const UnicodeString& text,
893                 ParsePosition& parsePosition,
894                 UBool isFractionRule,
895                 double upperBound,
896                 Formattable& resVal) const
897 {
898     // internally we operate on a copy of the string being parsed
899     // (because we're going to change it) and use our own ParsePosition
900     ParsePosition pp;
901     UnicodeString workText(text);
902 
903     int32_t sub1Pos = sub1 != NULL ? sub1->getPos() : ruleText.length();
904     int32_t sub2Pos = sub2 != NULL ? sub2->getPos() : ruleText.length();
905 
906     // check to see whether the text before the first substitution
907     // matches the text at the beginning of the string being
908     // parsed.  If it does, strip that off the front of workText;
909     // otherwise, dump out with a mismatch
910     UnicodeString prefix;
911     prefix.setTo(ruleText, 0, sub1Pos);
912 
913 #ifdef RBNF_DEBUG
914     fprintf(stderr, "doParse %p ", this);
915     {
916         UnicodeString rt;
917         _appendRuleText(rt);
918         dumpUS(stderr, rt);
919     }
920 
921     fprintf(stderr, " text: '");
922     dumpUS(stderr, text);
923     fprintf(stderr, "' prefix: '");
924     dumpUS(stderr, prefix);
925 #endif
926     stripPrefix(workText, prefix, pp);
927     int32_t prefixLength = text.length() - workText.length();
928 
929 #ifdef RBNF_DEBUG
930     fprintf(stderr, "' pl: %d ppi: %d s1p: %d\n", prefixLength, pp.getIndex(), sub1Pos);
931 #endif
932 
933     if (pp.getIndex() == 0 && sub1Pos != 0) {
934         // commented out because ParsePosition doesn't have error index in 1.1.x
935         // restored for ICU4C port
936         parsePosition.setErrorIndex(pp.getErrorIndex());
937         resVal.setLong(0);
938         return TRUE;
939     }
940     if (baseValue == kInfinityRule) {
941         // If you match this, don't try to perform any calculations on it.
942         parsePosition.setIndex(pp.getIndex());
943         resVal.setDouble(uprv_getInfinity());
944         return TRUE;
945     }
946     if (baseValue == kNaNRule) {
947         // If you match this, don't try to perform any calculations on it.
948         parsePosition.setIndex(pp.getIndex());
949         resVal.setDouble(uprv_getNaN());
950         return TRUE;
951     }
952 
953     // this is the fun part.  The basic guts of the rule-matching
954     // logic is matchToDelimiter(), which is called twice.  The first
955     // time it searches the input string for the rule text BETWEEN
956     // the substitutions and tries to match the intervening text
957     // in the input string with the first substitution.  If that
958     // succeeds, it then calls it again, this time to look for the
959     // rule text after the second substitution and to match the
960     // intervening input text against the second substitution.
961     //
962     // For example, say we have a rule that looks like this:
963     //    first << middle >> last;
964     // and input text that looks like this:
965     //    first one middle two last
966     // First we use stripPrefix() to match "first " in both places and
967     // strip it off the front, leaving
968     //    one middle two last
969     // Then we use matchToDelimiter() to match " middle " and try to
970     // match "one" against a substitution.  If it's successful, we now
971     // have
972     //    two last
973     // We use matchToDelimiter() a second time to match " last" and
974     // try to match "two" against a substitution.  If "two" matches
975     // the substitution, we have a successful parse.
976     //
977     // Since it's possible in many cases to find multiple instances
978     // of each of these pieces of rule text in the input string,
979     // we need to try all the possible combinations of these
980     // locations.  This prevents us from prematurely declaring a mismatch,
981     // and makes sure we match as much input text as we can.
982     int highWaterMark = 0;
983     double result = 0;
984     int start = 0;
985     double tempBaseValue = (double)(baseValue <= 0 ? 0 : baseValue);
986 
987     UnicodeString temp;
988     do {
989         // our partial parse result starts out as this rule's base
990         // value.  If it finds a successful match, matchToDelimiter()
991         // will compose this in some way with what it gets back from
992         // the substitution, giving us a new partial parse result
993         pp.setIndex(0);
994 
995         temp.setTo(ruleText, sub1Pos, sub2Pos - sub1Pos);
996         double partialResult = matchToDelimiter(workText, start, tempBaseValue,
997             temp, pp, sub1,
998             upperBound);
999 
1000         // if we got a successful match (or were trying to match a
1001         // null substitution), pp is now pointing at the first unmatched
1002         // character.  Take note of that, and try matchToDelimiter()
1003         // on the input text again
1004         if (pp.getIndex() != 0 || sub1 == NULL) {
1005             start = pp.getIndex();
1006 
1007             UnicodeString workText2;
1008             workText2.setTo(workText, pp.getIndex(), workText.length() - pp.getIndex());
1009             ParsePosition pp2;
1010 
1011             // the second matchToDelimiter() will compose our previous
1012             // partial result with whatever it gets back from its
1013             // substitution if there's a successful match, giving us
1014             // a real result
1015             temp.setTo(ruleText, sub2Pos, ruleText.length() - sub2Pos);
1016             partialResult = matchToDelimiter(workText2, 0, partialResult,
1017                 temp, pp2, sub2,
1018                 upperBound);
1019 
1020             // if we got a successful match on this second
1021             // matchToDelimiter() call, update the high-water mark
1022             // and result (if necessary)
1023             if (pp2.getIndex() != 0 || sub2 == NULL) {
1024                 if (prefixLength + pp.getIndex() + pp2.getIndex() > highWaterMark) {
1025                     highWaterMark = prefixLength + pp.getIndex() + pp2.getIndex();
1026                     result = partialResult;
1027                 }
1028             }
1029             else {
1030                 // commented out because ParsePosition doesn't have error index in 1.1.x
1031                 // restored for ICU4C port
1032                 int32_t temp = pp2.getErrorIndex() + sub1Pos + pp.getIndex();
1033                 if (temp> parsePosition.getErrorIndex()) {
1034                     parsePosition.setErrorIndex(temp);
1035                 }
1036             }
1037         }
1038         else {
1039             // commented out because ParsePosition doesn't have error index in 1.1.x
1040             // restored for ICU4C port
1041             int32_t temp = sub1Pos + pp.getErrorIndex();
1042             if (temp > parsePosition.getErrorIndex()) {
1043                 parsePosition.setErrorIndex(temp);
1044             }
1045         }
1046         // keep trying to match things until the outer matchToDelimiter()
1047         // call fails to make a match (each time, it picks up where it
1048         // left off the previous time)
1049     } while (sub1Pos != sub2Pos
1050         && pp.getIndex() > 0
1051         && pp.getIndex() < workText.length()
1052         && pp.getIndex() != start);
1053 
1054     // update the caller's ParsePosition with our high-water mark
1055     // (i.e., it now points at the first character this function
1056     // didn't match-- the ParsePosition is therefore unchanged if
1057     // we didn't match anything)
1058     parsePosition.setIndex(highWaterMark);
1059     // commented out because ParsePosition doesn't have error index in 1.1.x
1060     // restored for ICU4C port
1061     if (highWaterMark > 0) {
1062         parsePosition.setErrorIndex(0);
1063     }
1064 
1065     // this is a hack for one unusual condition: Normally, whether this
1066     // rule belong to a fraction rule set or not is handled by its
1067     // substitutions.  But if that rule HAS NO substitutions, then
1068     // we have to account for it here.  By definition, if the matching
1069     // rule in a fraction rule set has no substitutions, its numerator
1070     // is 1, and so the result is the reciprocal of its base value.
1071     if (isFractionRule && highWaterMark > 0 && sub1 == NULL) {
1072         result = 1 / result;
1073     }
1074 
1075     resVal.setDouble(result);
1076     return TRUE; // ??? do we need to worry if it is a long or a double?
1077 }
1078 
1079 /**
1080 * This function is used by parse() to match the text being parsed
1081 * against a possible prefix string.  This function
1082 * matches characters from the beginning of the string being parsed
1083 * to characters from the prospective prefix.  If they match, pp is
1084 * updated to the first character not matched, and the result is
1085 * the unparsed part of the string.  If they don't match, the whole
1086 * string is returned, and pp is left unchanged.
1087 * @param text The string being parsed
1088 * @param prefix The text to match against
1089 * @param pp On entry, ignored and assumed to be 0.  On exit, points
1090 * to the first unmatched character (assuming the whole prefix matched),
1091 * or is unchanged (if the whole prefix didn't match).
1092 * @return If things match, this is the unparsed part of "text";
1093 * if they didn't match, this is "text".
1094 */
1095 void
stripPrefix(UnicodeString & text,const UnicodeString & prefix,ParsePosition & pp) const1096 NFRule::stripPrefix(UnicodeString& text, const UnicodeString& prefix, ParsePosition& pp) const
1097 {
1098     // if the prefix text is empty, dump out without doing anything
1099     if (prefix.length() != 0) {
1100     	UErrorCode status = U_ZERO_ERROR;
1101         // use prefixLength() to match the beginning of
1102         // "text" against "prefix".  This function returns the
1103         // number of characters from "text" that matched (or 0 if
1104         // we didn't match the whole prefix)
1105         int32_t pfl = prefixLength(text, prefix, status);
1106         if (U_FAILURE(status)) { // Memory allocation error.
1107         	return;
1108         }
1109         if (pfl != 0) {
1110             // if we got a successful match, update the parse position
1111             // and strip the prefix off of "text"
1112             pp.setIndex(pp.getIndex() + pfl);
1113             text.remove(0, pfl);
1114         }
1115     }
1116 }
1117 
1118 /**
1119 * Used by parse() to match a substitution and any following text.
1120 * "text" is searched for instances of "delimiter".  For each instance
1121 * of delimiter, the intervening text is tested to see whether it
1122 * matches the substitution.  The longest match wins.
1123 * @param text The string being parsed
1124 * @param startPos The position in "text" where we should start looking
1125 * for "delimiter".
1126 * @param baseValue A partial parse result (often the rule's base value),
1127 * which is combined with the result from matching the substitution
1128 * @param delimiter The string to search "text" for.
1129 * @param pp Ignored and presumed to be 0 on entry.  If there's a match,
1130 * on exit this will point to the first unmatched character.
1131 * @param sub If we find "delimiter" in "text", this substitution is used
1132 * to match the text between the beginning of the string and the
1133 * position of "delimiter."  (If "delimiter" is the empty string, then
1134 * this function just matches against this substitution and updates
1135 * everything accordingly.)
1136 * @param upperBound When matching the substitution, it will only
1137 * consider rules with base values lower than this value.
1138 * @return If there's a match, this is the result of composing
1139 * baseValue with the result of matching the substitution.  Otherwise,
1140 * this is new Long(0).  It's never null.  If the result is an integer,
1141 * this will be an instance of Long; otherwise, it's an instance of
1142 * Double.
1143 *
1144 * !!! note {dlf} in point of fact, in the java code the caller always converts
1145 * the result to a double, so we might as well return one.
1146 */
1147 double
matchToDelimiter(const UnicodeString & text,int32_t startPos,double _baseValue,const UnicodeString & delimiter,ParsePosition & pp,const NFSubstitution * sub,double upperBound) const1148 NFRule::matchToDelimiter(const UnicodeString& text,
1149                          int32_t startPos,
1150                          double _baseValue,
1151                          const UnicodeString& delimiter,
1152                          ParsePosition& pp,
1153                          const NFSubstitution* sub,
1154                          double upperBound) const
1155 {
1156 	UErrorCode status = U_ZERO_ERROR;
1157     // if "delimiter" contains real (i.e., non-ignorable) text, search
1158     // it for "delimiter" beginning at "start".  If that succeeds, then
1159     // use "sub"'s doParse() method to match the text before the
1160     // instance of "delimiter" we just found.
1161     if (!allIgnorable(delimiter, status)) {
1162     	if (U_FAILURE(status)) { //Memory allocation error.
1163     		return 0;
1164     	}
1165         ParsePosition tempPP;
1166         Formattable result;
1167 
1168         // use findText() to search for "delimiter".  It returns a two-
1169         // element array: element 0 is the position of the match, and
1170         // element 1 is the number of characters that matched
1171         // "delimiter".
1172         int32_t dLen;
1173         int32_t dPos = findText(text, delimiter, startPos, &dLen);
1174 
1175         // if findText() succeeded, isolate the text preceding the
1176         // match, and use "sub" to match that text
1177         while (dPos >= 0) {
1178             UnicodeString subText;
1179             subText.setTo(text, 0, dPos);
1180             if (subText.length() > 0) {
1181                 UBool success = sub->doParse(subText, tempPP, _baseValue, upperBound,
1182 #if UCONFIG_NO_COLLATION
1183                     FALSE,
1184 #else
1185                     formatter->isLenient(),
1186 #endif
1187                     result);
1188 
1189                 // if the substitution could match all the text up to
1190                 // where we found "delimiter", then this function has
1191                 // a successful match.  Bump the caller's parse position
1192                 // to point to the first character after the text
1193                 // that matches "delimiter", and return the result
1194                 // we got from parsing the substitution.
1195                 if (success && tempPP.getIndex() == dPos) {
1196                     pp.setIndex(dPos + dLen);
1197                     return result.getDouble();
1198                 }
1199                 else {
1200                     // commented out because ParsePosition doesn't have error index in 1.1.x
1201                     // restored for ICU4C port
1202                     if (tempPP.getErrorIndex() > 0) {
1203                         pp.setErrorIndex(tempPP.getErrorIndex());
1204                     } else {
1205                         pp.setErrorIndex(tempPP.getIndex());
1206                     }
1207                 }
1208             }
1209 
1210             // if we didn't match the substitution, search for another
1211             // copy of "delimiter" in "text" and repeat the loop if
1212             // we find it
1213             tempPP.setIndex(0);
1214             dPos = findText(text, delimiter, dPos + dLen, &dLen);
1215         }
1216         // if we make it here, this was an unsuccessful match, and we
1217         // leave pp unchanged and return 0
1218         pp.setIndex(0);
1219         return 0;
1220 
1221         // if "delimiter" is empty, or consists only of ignorable characters
1222         // (i.e., is semantically empty), thwe we obviously can't search
1223         // for "delimiter".  Instead, just use "sub" to parse as much of
1224         // "text" as possible.
1225     }
1226     else if (sub == NULL) {
1227         return _baseValue;
1228     }
1229     else {
1230         ParsePosition tempPP;
1231         Formattable result;
1232 
1233         // try to match the whole string against the substitution
1234         UBool success = sub->doParse(text, tempPP, _baseValue, upperBound,
1235 #if UCONFIG_NO_COLLATION
1236             FALSE,
1237 #else
1238             formatter->isLenient(),
1239 #endif
1240             result);
1241         if (success && (tempPP.getIndex() != 0)) {
1242             // if there's a successful match (or it's a null
1243             // substitution), update pp to point to the first
1244             // character we didn't match, and pass the result from
1245             // sub.doParse() on through to the caller
1246             pp.setIndex(tempPP.getIndex());
1247             return result.getDouble();
1248         }
1249         else {
1250             // commented out because ParsePosition doesn't have error index in 1.1.x
1251             // restored for ICU4C port
1252             pp.setErrorIndex(tempPP.getErrorIndex());
1253         }
1254 
1255         // and if we get to here, then nothing matched, so we return
1256         // 0 and leave pp alone
1257         return 0;
1258     }
1259 }
1260 
1261 /**
1262 * Used by stripPrefix() to match characters.  If lenient parse mode
1263 * is off, this just calls startsWith().  If lenient parse mode is on,
1264 * this function uses CollationElementIterators to match characters in
1265 * the strings (only primary-order differences are significant in
1266 * determining whether there's a match).
1267 * @param str The string being tested
1268 * @param prefix The text we're hoping to see at the beginning
1269 * of "str"
1270 * @return If "prefix" is found at the beginning of "str", this
1271 * is the number of characters in "str" that were matched (this
1272 * isn't necessarily the same as the length of "prefix" when matching
1273 * text with a collator).  If there's no match, this is 0.
1274 */
1275 int32_t
prefixLength(const UnicodeString & str,const UnicodeString & prefix,UErrorCode & status) const1276 NFRule::prefixLength(const UnicodeString& str, const UnicodeString& prefix, UErrorCode& status) const
1277 {
1278     // if we're looking for an empty prefix, it obviously matches
1279     // zero characters.  Just go ahead and return 0.
1280     if (prefix.length() == 0) {
1281         return 0;
1282     }
1283 
1284 #if !UCONFIG_NO_COLLATION
1285     // go through all this grief if we're in lenient-parse mode
1286     if (formatter->isLenient()) {
1287         // get the formatter's collator and use it to create two
1288         // collation element iterators, one over the target string
1289         // and another over the prefix (right now, we'll throw an
1290         // exception if the collator we get back from the formatter
1291         // isn't a RuleBasedCollator, because RuleBasedCollator defines
1292         // the CollationElementIterator protocol.  Hopefully, this
1293         // will change someday.)
1294         const RuleBasedCollator* collator = formatter->getCollator();
1295         if (collator == NULL) {
1296             status = U_MEMORY_ALLOCATION_ERROR;
1297             return 0;
1298         }
1299         LocalPointer<CollationElementIterator> strIter(collator->createCollationElementIterator(str));
1300         LocalPointer<CollationElementIterator> prefixIter(collator->createCollationElementIterator(prefix));
1301         // Check for memory allocation error.
1302         if (strIter.isNull() || prefixIter.isNull()) {
1303             status = U_MEMORY_ALLOCATION_ERROR;
1304             return 0;
1305         }
1306 
1307         UErrorCode err = U_ZERO_ERROR;
1308 
1309         // The original code was problematic.  Consider this match:
1310         // prefix = "fifty-"
1311         // string = " fifty-7"
1312         // The intent is to match string up to the '7', by matching 'fifty-' at position 1
1313         // in the string.  Unfortunately, we were getting a match, and then computing where
1314         // the match terminated by rematching the string.  The rematch code was using as an
1315         // initial guess the substring of string between 0 and prefix.length.  Because of
1316         // the leading space and trailing hyphen (both ignorable) this was succeeding, leaving
1317         // the position before the hyphen in the string.  Recursing down, we then parsed the
1318         // remaining string '-7' as numeric.  The resulting number turned out as 43 (50 - 7).
1319         // This was not pretty, especially since the string "fifty-7" parsed just fine.
1320         //
1321         // We have newer APIs now, so we can use calls on the iterator to determine what we
1322         // matched up to.  If we terminate because we hit the last element in the string,
1323         // our match terminates at this length.  If we terminate because we hit the last element
1324         // in the target, our match terminates at one before the element iterator position.
1325 
1326         // match collation elements between the strings
1327         int32_t oStr = strIter->next(err);
1328         int32_t oPrefix = prefixIter->next(err);
1329 
1330         while (oPrefix != CollationElementIterator::NULLORDER) {
1331             // skip over ignorable characters in the target string
1332             while (CollationElementIterator::primaryOrder(oStr) == 0
1333                 && oStr != CollationElementIterator::NULLORDER) {
1334                 oStr = strIter->next(err);
1335             }
1336 
1337             // skip over ignorable characters in the prefix
1338             while (CollationElementIterator::primaryOrder(oPrefix) == 0
1339                 && oPrefix != CollationElementIterator::NULLORDER) {
1340                 oPrefix = prefixIter->next(err);
1341             }
1342 
1343             // dlf: move this above following test, if we consume the
1344             // entire target, aren't we ok even if the source was also
1345             // entirely consumed?
1346 
1347             // if skipping over ignorables brought to the end of
1348             // the prefix, we DID match: drop out of the loop
1349             if (oPrefix == CollationElementIterator::NULLORDER) {
1350                 break;
1351             }
1352 
1353             // if skipping over ignorables brought us to the end
1354             // of the target string, we didn't match and return 0
1355             if (oStr == CollationElementIterator::NULLORDER) {
1356                 return 0;
1357             }
1358 
1359             // match collation elements from the two strings
1360             // (considering only primary differences).  If we
1361             // get a mismatch, dump out and return 0
1362             if (CollationElementIterator::primaryOrder(oStr)
1363                 != CollationElementIterator::primaryOrder(oPrefix)) {
1364                 return 0;
1365 
1366                 // otherwise, advance to the next character in each string
1367                 // and loop (we drop out of the loop when we exhaust
1368                 // collation elements in the prefix)
1369             } else {
1370                 oStr = strIter->next(err);
1371                 oPrefix = prefixIter->next(err);
1372             }
1373         }
1374 
1375         int32_t result = strIter->getOffset();
1376         if (oStr != CollationElementIterator::NULLORDER) {
1377             --result; // back over character that we don't want to consume;
1378         }
1379 
1380 #ifdef RBNF_DEBUG
1381         fprintf(stderr, "prefix length: %d\n", result);
1382 #endif
1383         return result;
1384 #if 0
1385         //----------------------------------------------------------------
1386         // JDK 1.2-specific API call
1387         // return strIter.getOffset();
1388         //----------------------------------------------------------------
1389         // JDK 1.1 HACK (take out for 1.2-specific code)
1390 
1391         // if we make it to here, we have a successful match.  Now we
1392         // have to find out HOW MANY characters from the target string
1393         // matched the prefix (there isn't necessarily a one-to-one
1394         // mapping between collation elements and characters).
1395         // In JDK 1.2, there's a simple getOffset() call we can use.
1396         // In JDK 1.1, on the other hand, we have to go through some
1397         // ugly contortions.  First, use the collator to compare the
1398         // same number of characters from the prefix and target string.
1399         // If they're equal, we're done.
1400         collator->setStrength(Collator::PRIMARY);
1401         if (str.length() >= prefix.length()) {
1402             UnicodeString temp;
1403             temp.setTo(str, 0, prefix.length());
1404             if (collator->equals(temp, prefix)) {
1405 #ifdef RBNF_DEBUG
1406                 fprintf(stderr, "returning: %d\n", prefix.length());
1407 #endif
1408                 return prefix.length();
1409             }
1410         }
1411 
1412         // if they're not equal, then we have to compare successively
1413         // larger and larger substrings of the target string until we
1414         // get to one that matches the prefix.  At that point, we know
1415         // how many characters matched the prefix, and we can return.
1416         int32_t p = 1;
1417         while (p <= str.length()) {
1418             UnicodeString temp;
1419             temp.setTo(str, 0, p);
1420             if (collator->equals(temp, prefix)) {
1421                 return p;
1422             } else {
1423                 ++p;
1424             }
1425         }
1426 
1427         // SHOULD NEVER GET HERE!!!
1428         return 0;
1429         //----------------------------------------------------------------
1430 #endif
1431 
1432         // If lenient parsing is turned off, forget all that crap above.
1433         // Just use String.startsWith() and be done with it.
1434   } else
1435 #endif
1436   {
1437       if (str.startsWith(prefix)) {
1438           return prefix.length();
1439       } else {
1440           return 0;
1441       }
1442   }
1443 }
1444 
1445 /**
1446 * Searches a string for another string.  If lenient parsing is off,
1447 * this just calls indexOf().  If lenient parsing is on, this function
1448 * uses CollationElementIterator to match characters, and only
1449 * primary-order differences are significant in determining whether
1450 * there's a match.
1451 * @param str The string to search
1452 * @param key The string to search "str" for
1453 * @param startingAt The index into "str" where the search is to
1454 * begin
1455 * @return A two-element array of ints.  Element 0 is the position
1456 * of the match, or -1 if there was no match.  Element 1 is the
1457 * number of characters in "str" that matched (which isn't necessarily
1458 * the same as the length of "key")
1459 */
1460 int32_t
findText(const UnicodeString & str,const UnicodeString & key,int32_t startingAt,int32_t * length) const1461 NFRule::findText(const UnicodeString& str,
1462                  const UnicodeString& key,
1463                  int32_t startingAt,
1464                  int32_t* length) const
1465 {
1466     if (rulePatternFormat) {
1467         Formattable result;
1468         FieldPosition position(UNUM_INTEGER_FIELD);
1469         position.setBeginIndex(startingAt);
1470         rulePatternFormat->parseType(str, this, result, position);
1471         int start = position.getBeginIndex();
1472         if (start >= 0) {
1473             int32_t pluralRuleStart = ruleText.indexOf(gDollarOpenParenthesis, -1, 0);
1474             int32_t pluralRuleSuffix = ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart) + 2;
1475             int32_t matchLen = position.getEndIndex() - start;
1476             UnicodeString prefix(ruleText.tempSubString(0, pluralRuleStart));
1477             UnicodeString suffix(ruleText.tempSubString(pluralRuleSuffix));
1478             if (str.compare(start - prefix.length(), prefix.length(), prefix, 0, prefix.length()) == 0
1479                     && str.compare(start + matchLen, suffix.length(), suffix, 0, suffix.length()) == 0)
1480             {
1481                 *length = matchLen + prefix.length() + suffix.length();
1482                 return start - prefix.length();
1483             }
1484         }
1485         *length = 0;
1486         return -1;
1487     }
1488     if (!formatter->isLenient()) {
1489         // if lenient parsing is turned off, this is easy: just call
1490         // String.indexOf() and we're done
1491         *length = key.length();
1492         return str.indexOf(key, startingAt);
1493     }
1494     else {
1495         // but if lenient parsing is turned ON, we've got some work
1496         // ahead of us
1497         return findTextLenient(str, key, startingAt, length);
1498     }
1499 }
1500 
1501 int32_t
findTextLenient(const UnicodeString & str,const UnicodeString & key,int32_t startingAt,int32_t * length) const1502 NFRule::findTextLenient(const UnicodeString& str,
1503                  const UnicodeString& key,
1504                  int32_t startingAt,
1505                  int32_t* length) const
1506 {
1507     //----------------------------------------------------------------
1508     // JDK 1.1 HACK (take out of 1.2-specific code)
1509 
1510     // in JDK 1.2, CollationElementIterator provides us with an
1511     // API to map between character offsets and collation elements
1512     // and we can do this by marching through the string comparing
1513     // collation elements.  We can't do that in JDK 1.1.  Insted,
1514     // we have to go through this horrible slow mess:
1515     int32_t p = startingAt;
1516     int32_t keyLen = 0;
1517 
1518     // basically just isolate smaller and smaller substrings of
1519     // the target string (each running to the end of the string,
1520     // and with the first one running from startingAt to the end)
1521     // and then use prefixLength() to see if the search key is at
1522     // the beginning of each substring.  This is excruciatingly
1523     // slow, but it will locate the key and tell use how long the
1524     // matching text was.
1525     UnicodeString temp;
1526     UErrorCode status = U_ZERO_ERROR;
1527     while (p < str.length() && keyLen == 0) {
1528         temp.setTo(str, p, str.length() - p);
1529         keyLen = prefixLength(temp, key, status);
1530         if (U_FAILURE(status)) {
1531             break;
1532         }
1533         if (keyLen != 0) {
1534             *length = keyLen;
1535             return p;
1536         }
1537         ++p;
1538     }
1539     // if we make it to here, we didn't find it.  Return -1 for the
1540     // location.  The length should be ignored, but set it to 0,
1541     // which should be "safe"
1542     *length = 0;
1543     return -1;
1544 }
1545 
1546 /**
1547 * Checks to see whether a string consists entirely of ignorable
1548 * characters.
1549 * @param str The string to test.
1550 * @return true if the string is empty of consists entirely of
1551 * characters that the number formatter's collator says are
1552 * ignorable at the primary-order level.  false otherwise.
1553 */
1554 UBool
allIgnorable(const UnicodeString & str,UErrorCode & status) const1555 NFRule::allIgnorable(const UnicodeString& str, UErrorCode& status) const
1556 {
1557     // if the string is empty, we can just return true
1558     if (str.length() == 0) {
1559         return TRUE;
1560     }
1561 
1562 #if !UCONFIG_NO_COLLATION
1563     // if lenient parsing is turned on, walk through the string with
1564     // a collation element iterator and make sure each collation
1565     // element is 0 (ignorable) at the primary level
1566     if (formatter->isLenient()) {
1567         const RuleBasedCollator* collator = formatter->getCollator();
1568         if (collator == NULL) {
1569             status = U_MEMORY_ALLOCATION_ERROR;
1570             return FALSE;
1571         }
1572         LocalPointer<CollationElementIterator> iter(collator->createCollationElementIterator(str));
1573 
1574         // Memory allocation error check.
1575         if (iter.isNull()) {
1576             status = U_MEMORY_ALLOCATION_ERROR;
1577             return FALSE;
1578         }
1579 
1580         UErrorCode err = U_ZERO_ERROR;
1581         int32_t o = iter->next(err);
1582         while (o != CollationElementIterator::NULLORDER
1583             && CollationElementIterator::primaryOrder(o) == 0) {
1584             o = iter->next(err);
1585         }
1586 
1587         return o == CollationElementIterator::NULLORDER;
1588     }
1589 #endif
1590 
1591     // if lenient parsing is turned off, there is no such thing as
1592     // an ignorable character: return true only if the string is empty
1593     return FALSE;
1594 }
1595 
1596 void
setDecimalFormatSymbols(const DecimalFormatSymbols & newSymbols,UErrorCode & status)1597 NFRule::setDecimalFormatSymbols(const DecimalFormatSymbols& newSymbols, UErrorCode& status) {
1598     if (sub1 != NULL) {
1599         sub1->setDecimalFormatSymbols(newSymbols, status);
1600     }
1601     if (sub2 != NULL) {
1602         sub2->setDecimalFormatSymbols(newSymbols, status);
1603     }
1604 }
1605 
1606 U_NAMESPACE_END
1607 
1608 /* U_HAVE_RBNF */
1609 #endif
1610