1 /*
2 ******************************************************************************
3 *   Copyright (C) 1997-2015, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 ******************************************************************************
6 *   file name:  nfrule.cpp
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 * Modification history
12 * Date        Name      Comments
13 * 10/11/2001  Doug      Ported from ICU4J
14 */
15 
16 #include "nfrule.h"
17 
18 #if U_HAVE_RBNF
19 
20 #include "unicode/localpointer.h"
21 #include "unicode/rbnf.h"
22 #include "unicode/tblcoll.h"
23 #include "unicode/plurfmt.h"
24 #include "unicode/upluralrules.h"
25 #include "unicode/coleitr.h"
26 #include "unicode/uchar.h"
27 #include "nfrs.h"
28 #include "nfrlist.h"
29 #include "nfsubs.h"
30 #include "patternprops.h"
31 
32 U_NAMESPACE_BEGIN
33 
NFRule(const RuleBasedNumberFormat * _rbnf,const UnicodeString & _ruleText,UErrorCode & status)34 NFRule::NFRule(const RuleBasedNumberFormat* _rbnf, const UnicodeString &_ruleText, UErrorCode &status)
35   : baseValue((int32_t)0)
36   , radix(10)
37   , exponent(0)
38   , decimalPoint(0)
39   , ruleText(_ruleText)
40   , sub1(NULL)
41   , sub2(NULL)
42   , formatter(_rbnf)
43   , rulePatternFormat(NULL)
44 {
45     if (!ruleText.isEmpty()) {
46         parseRuleDescriptor(ruleText, status);
47     }
48 }
49 
~NFRule()50 NFRule::~NFRule()
51 {
52     if (sub1 != sub2) {
53         delete sub2;
54         sub2 = NULL;
55     }
56     delete sub1;
57     sub1 = NULL;
58     delete rulePatternFormat;
59     rulePatternFormat = NULL;
60 }
61 
62 static const UChar gLeftBracket = 0x005b;
63 static const UChar gRightBracket = 0x005d;
64 static const UChar gColon = 0x003a;
65 static const UChar gZero = 0x0030;
66 static const UChar gNine = 0x0039;
67 static const UChar gSpace = 0x0020;
68 static const UChar gSlash = 0x002f;
69 static const UChar gGreaterThan = 0x003e;
70 static const UChar gLessThan = 0x003c;
71 static const UChar gComma = 0x002c;
72 static const UChar gDot = 0x002e;
73 static const UChar gTick = 0x0027;
74 //static const UChar gMinus = 0x002d;
75 static const UChar gSemicolon = 0x003b;
76 static const UChar gX = 0x0078;
77 
78 static const UChar gMinusX[] =                  {0x2D, 0x78, 0};    /* "-x" */
79 static const UChar gInf[] =                     {0x49, 0x6E, 0x66, 0}; /* "Inf" */
80 static const UChar gNaN[] =                     {0x4E, 0x61, 0x4E, 0}; /* "NaN" */
81 
82 static const UChar gDollarOpenParenthesis[] =   {0x24, 0x28, 0}; /* "$(" */
83 static const UChar gClosedParenthesisDollar[] = {0x29, 0x24, 0}; /* ")$" */
84 
85 static const UChar gLessLess[] =                {0x3C, 0x3C, 0};    /* "<<" */
86 static const UChar gLessPercent[] =             {0x3C, 0x25, 0};    /* "<%" */
87 static const UChar gLessHash[] =                {0x3C, 0x23, 0};    /* "<#" */
88 static const UChar gLessZero[] =                {0x3C, 0x30, 0};    /* "<0" */
89 static const UChar gGreaterGreater[] =          {0x3E, 0x3E, 0};    /* ">>" */
90 static const UChar gGreaterPercent[] =          {0x3E, 0x25, 0};    /* ">%" */
91 static const UChar gGreaterHash[] =             {0x3E, 0x23, 0};    /* ">#" */
92 static const UChar gGreaterZero[] =             {0x3E, 0x30, 0};    /* ">0" */
93 static const UChar gEqualPercent[] =            {0x3D, 0x25, 0};    /* "=%" */
94 static const UChar gEqualHash[] =               {0x3D, 0x23, 0};    /* "=#" */
95 static const UChar gEqualZero[] =               {0x3D, 0x30, 0};    /* "=0" */
96 static const UChar gGreaterGreaterGreater[] =   {0x3E, 0x3E, 0x3E, 0}; /* ">>>" */
97 
98 static const UChar * const RULE_PREFIXES[] = {
99     gLessLess, gLessPercent, gLessHash, gLessZero,
100     gGreaterGreater, gGreaterPercent,gGreaterHash, gGreaterZero,
101     gEqualPercent, gEqualHash, gEqualZero, NULL
102 };
103 
104 void
makeRules(UnicodeString & description,NFRuleSet * owner,const NFRule * predecessor,const RuleBasedNumberFormat * rbnf,NFRuleList & rules,UErrorCode & status)105 NFRule::makeRules(UnicodeString& description,
106                   NFRuleSet *owner,
107                   const NFRule *predecessor,
108                   const RuleBasedNumberFormat *rbnf,
109                   NFRuleList& rules,
110                   UErrorCode& status)
111 {
112     // we know we're making at least one rule, so go ahead and
113     // new it up and initialize its basevalue and divisor
114     // (this also strips the rule descriptor, if any, off the
115     // descripton string)
116     NFRule* rule1 = new NFRule(rbnf, description, status);
117     /* test for NULL */
118     if (rule1 == 0) {
119         status = U_MEMORY_ALLOCATION_ERROR;
120         return;
121     }
122     description = rule1->ruleText;
123 
124     // check the description to see whether there's text enclosed
125     // in brackets
126     int32_t brack1 = description.indexOf(gLeftBracket);
127     int32_t brack2 = brack1 < 0 ? -1 : description.indexOf(gRightBracket);
128 
129     // if the description doesn't contain a matched pair of brackets,
130     // or if it's of a type that doesn't recognize bracketed text,
131     // then leave the description alone, initialize the rule's
132     // rule text and substitutions, and return that rule
133     if (brack2 < 0 || brack1 > brack2
134         || rule1->getType() == kProperFractionRule
135         || rule1->getType() == kNegativeNumberRule
136         || rule1->getType() == kInfinityRule
137         || rule1->getType() == kNaNRule)
138     {
139         rule1->extractSubstitutions(owner, description, predecessor, status);
140     }
141     else {
142         // if the description does contain a matched pair of brackets,
143         // then it's really shorthand for two rules (with one exception)
144         NFRule* rule2 = NULL;
145         UnicodeString sbuf;
146 
147         // we'll actually only split the rule into two rules if its
148         // base value is an even multiple of its divisor (or it's one
149         // of the special rules)
150         if ((rule1->baseValue > 0
151             && (rule1->baseValue % util64_pow(rule1->radix, rule1->exponent)) == 0)
152             || rule1->getType() == kImproperFractionRule
153             || rule1->getType() == kMasterRule) {
154 
155             // if it passes that test, new up the second rule.  If the
156             // rule set both rules will belong to is a fraction rule
157             // set, they both have the same base value; otherwise,
158             // increment the original rule's base value ("rule1" actually
159             // goes SECOND in the rule set's rule list)
160             rule2 = new NFRule(rbnf, UnicodeString(), status);
161             /* test for NULL */
162             if (rule2 == 0) {
163                 status = U_MEMORY_ALLOCATION_ERROR;
164                 return;
165             }
166             if (rule1->baseValue >= 0) {
167                 rule2->baseValue = rule1->baseValue;
168                 if (!owner->isFractionRuleSet()) {
169                     ++rule1->baseValue;
170                 }
171             }
172 
173             // if the description began with "x.x" and contains bracketed
174             // text, it describes both the improper fraction rule and
175             // the proper fraction rule
176             else if (rule1->getType() == kImproperFractionRule) {
177                 rule2->setType(kProperFractionRule);
178             }
179 
180             // if the description began with "x.0" and contains bracketed
181             // text, it describes both the master rule and the
182             // improper fraction rule
183             else if (rule1->getType() == kMasterRule) {
184                 rule2->baseValue = rule1->baseValue;
185                 rule1->setType(kImproperFractionRule);
186             }
187 
188             // both rules have the same radix and exponent (i.e., the
189             // same divisor)
190             rule2->radix = rule1->radix;
191             rule2->exponent = rule1->exponent;
192 
193             // rule2's rule text omits the stuff in brackets: initalize
194             // its rule text and substitutions accordingly
195             sbuf.append(description, 0, brack1);
196             if (brack2 + 1 < description.length()) {
197                 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1);
198             }
199             rule2->extractSubstitutions(owner, sbuf, predecessor, status);
200         }
201 
202         // rule1's text includes the text in the brackets but omits
203         // the brackets themselves: initialize _its_ rule text and
204         // substitutions accordingly
205         sbuf.setTo(description, 0, brack1);
206         sbuf.append(description, brack1 + 1, brack2 - brack1 - 1);
207         if (brack2 + 1 < description.length()) {
208             sbuf.append(description, brack2 + 1, description.length() - brack2 - 1);
209         }
210         rule1->extractSubstitutions(owner, sbuf, predecessor, status);
211 
212         // if we only have one rule, return it; if we have two, return
213         // a two-element array containing them (notice that rule2 goes
214         // BEFORE rule1 in the list: in all cases, rule2 OMITS the
215         // material in the brackets and rule1 INCLUDES the material
216         // in the brackets)
217         if (rule2 != NULL) {
218             if (rule2->baseValue >= kNoBase) {
219                 rules.add(rule2);
220             }
221             else {
222                 owner->setNonNumericalRule(rule2);
223             }
224         }
225     }
226     if (rule1->baseValue >= kNoBase) {
227         rules.add(rule1);
228     }
229     else {
230         owner->setNonNumericalRule(rule1);
231     }
232 }
233 
234 /**
235  * This function parses the rule's rule descriptor (i.e., the base
236  * value and/or other tokens that precede the rule's rule text
237  * in the description) and sets the rule's base value, radix, and
238  * exponent according to the descriptor.  (If the description doesn't
239  * include a rule descriptor, then this function sets everything to
240  * default values and the rule set sets the rule's real base value).
241  * @param description The rule's description
242  * @return If "description" included a rule descriptor, this is
243  * "description" with the descriptor and any trailing whitespace
244  * stripped off.  Otherwise; it's "descriptor" unchangd.
245  */
246 void
parseRuleDescriptor(UnicodeString & description,UErrorCode & status)247 NFRule::parseRuleDescriptor(UnicodeString& description, UErrorCode& status)
248 {
249     // the description consists of a rule descriptor and a rule body,
250     // separated by a colon.  The rule descriptor is optional.  If
251     // it's omitted, just set the base value to 0.
252     int32_t p = description.indexOf(gColon);
253     if (p != -1) {
254         // copy the descriptor out into its own string and strip it,
255         // along with any trailing whitespace, out of the original
256         // description
257         UnicodeString descriptor;
258         descriptor.setTo(description, 0, p);
259 
260         ++p;
261         while (p < description.length() && PatternProps::isWhiteSpace(description.charAt(p))) {
262             ++p;
263         }
264         description.removeBetween(0, p);
265 
266         // check first to see if the rule descriptor matches the token
267         // for one of the special rules.  If it does, set the base
268         // value to the correct identifier value
269         int descriptorLength = descriptor.length();
270         UChar firstChar = descriptor.charAt(0);
271         UChar lastChar = descriptor.charAt(descriptorLength - 1);
272         if (firstChar >= gZero && firstChar <= gNine && lastChar != gX) {
273             // if the rule descriptor begins with a digit, it's a descriptor
274             // for a normal rule
275             // since we don't have Long.parseLong, and this isn't much work anyway,
276             // just build up the value as we encounter the digits.
277             int64_t val = 0;
278             p = 0;
279             UChar c = gSpace;
280 
281             // begin parsing the descriptor: copy digits
282             // into "tempValue", skip periods, commas, and spaces,
283             // stop on a slash or > sign (or at the end of the string),
284             // and throw an exception on any other character
285             int64_t ll_10 = 10;
286             while (p < descriptorLength) {
287                 c = descriptor.charAt(p);
288                 if (c >= gZero && c <= gNine) {
289                     val = val * ll_10 + (int32_t)(c - gZero);
290                 }
291                 else if (c == gSlash || c == gGreaterThan) {
292                     break;
293                 }
294                 else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) {
295                 }
296                 else {
297                     // throw new IllegalArgumentException("Illegal character in rule descriptor");
298                     status = U_PARSE_ERROR;
299                     return;
300                 }
301                 ++p;
302             }
303 
304             // we have the base value, so set it
305             setBaseValue(val, status);
306 
307             // if we stopped the previous loop on a slash, we're
308             // now parsing the rule's radix.  Again, accumulate digits
309             // in tempValue, skip punctuation, stop on a > mark, and
310             // throw an exception on anything else
311             if (c == gSlash) {
312                 val = 0;
313                 ++p;
314                 int64_t ll_10 = 10;
315                 while (p < descriptorLength) {
316                     c = descriptor.charAt(p);
317                     if (c >= gZero && c <= gNine) {
318                         val = val * ll_10 + (int32_t)(c - gZero);
319                     }
320                     else if (c == gGreaterThan) {
321                         break;
322                     }
323                     else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) {
324                     }
325                     else {
326                         // throw new IllegalArgumentException("Illegal character is rule descriptor");
327                         status = U_PARSE_ERROR;
328                         return;
329                     }
330                     ++p;
331                 }
332 
333                 // tempValue now contain's the rule's radix.  Set it
334                 // accordingly, and recalculate the rule's exponent
335                 radix = (int32_t)val;
336                 if (radix == 0) {
337                     // throw new IllegalArgumentException("Rule can't have radix of 0");
338                     status = U_PARSE_ERROR;
339                 }
340 
341                 exponent = expectedExponent();
342             }
343 
344             // if we stopped the previous loop on a > sign, then continue
345             // for as long as we still see > signs.  For each one,
346             // decrement the exponent (unless the exponent is already 0).
347             // If we see another character before reaching the end of
348             // the descriptor, that's also a syntax error.
349             if (c == gGreaterThan) {
350                 while (p < descriptor.length()) {
351                     c = descriptor.charAt(p);
352                     if (c == gGreaterThan && exponent > 0) {
353                         --exponent;
354                     } else {
355                         // throw new IllegalArgumentException("Illegal character in rule descriptor");
356                         status = U_PARSE_ERROR;
357                         return;
358                     }
359                     ++p;
360                 }
361             }
362         }
363         else if (0 == descriptor.compare(gMinusX, 2)) {
364             setType(kNegativeNumberRule);
365         }
366         else if (descriptorLength == 3) {
367             if (firstChar == gZero && lastChar == gX) {
368                 setBaseValue(kProperFractionRule, status);
369                 decimalPoint = descriptor.charAt(1);
370             }
371             else if (firstChar == gX && lastChar == gX) {
372                 setBaseValue(kImproperFractionRule, status);
373                 decimalPoint = descriptor.charAt(1);
374             }
375             else if (firstChar == gX && lastChar == gZero) {
376                 setBaseValue(kMasterRule, status);
377                 decimalPoint = descriptor.charAt(1);
378             }
379             else if (descriptor.compare(gNaN, 3) == 0) {
380                 setBaseValue(kNaNRule, status);
381             }
382             else if (descriptor.compare(gInf, 3) == 0) {
383                 setBaseValue(kInfinityRule, status);
384             }
385         }
386     }
387     // else use the default base value for now.
388 
389     // finally, if the rule body begins with an apostrophe, strip it off
390     // (this is generally used to put whitespace at the beginning of
391     // a rule's rule text)
392     if (description.length() > 0 && description.charAt(0) == gTick) {
393         description.removeBetween(0, 1);
394     }
395 
396     // return the description with all the stuff we've just waded through
397     // stripped off the front.  It now contains just the rule body.
398     // return description;
399 }
400 
401 /**
402 * Searches the rule's rule text for the substitution tokens,
403 * creates the substitutions, and removes the substitution tokens
404 * from the rule's rule text.
405 * @param owner The rule set containing this rule
406 * @param predecessor The rule preseding this one in "owners" rule list
407 * @param ownersOwner The RuleBasedFormat that owns this rule
408 */
409 void
extractSubstitutions(const NFRuleSet * ruleSet,const UnicodeString & ruleText,const NFRule * predecessor,UErrorCode & status)410 NFRule::extractSubstitutions(const NFRuleSet* ruleSet,
411                              const UnicodeString &ruleText,
412                              const NFRule* predecessor,
413                              UErrorCode& status)
414 {
415     if (U_FAILURE(status)) {
416         return;
417     }
418     this->ruleText = ruleText;
419     sub1 = extractSubstitution(ruleSet, predecessor, status);
420     if (sub1 == NULL) {
421         // Small optimization. There is no need to create a redundant NullSubstitution.
422         sub2 = NULL;
423     }
424     else {
425         sub2 = extractSubstitution(ruleSet, predecessor, status);
426     }
427     int32_t pluralRuleStart = this->ruleText.indexOf(gDollarOpenParenthesis, -1, 0);
428     int32_t pluralRuleEnd = (pluralRuleStart >= 0 ? this->ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart) : -1);
429     if (pluralRuleEnd >= 0) {
430         int32_t endType = this->ruleText.indexOf(gComma, pluralRuleStart);
431         if (endType < 0) {
432             status = U_PARSE_ERROR;
433             return;
434         }
435         UnicodeString type(this->ruleText.tempSubString(pluralRuleStart + 2, endType - pluralRuleStart - 2));
436         UPluralType pluralType;
437         if (type.startsWith(UNICODE_STRING_SIMPLE("cardinal"))) {
438             pluralType = UPLURAL_TYPE_CARDINAL;
439         }
440         else if (type.startsWith(UNICODE_STRING_SIMPLE("ordinal"))) {
441             pluralType = UPLURAL_TYPE_ORDINAL;
442         }
443         else {
444             status = U_ILLEGAL_ARGUMENT_ERROR;
445             return;
446         }
447         rulePatternFormat = formatter->createPluralFormat(pluralType,
448                 this->ruleText.tempSubString(endType + 1, pluralRuleEnd - endType - 1), status);
449     }
450 }
451 
452 /**
453 * Searches the rule's rule text for the first substitution token,
454 * creates a substitution based on it, and removes the token from
455 * the rule's rule text.
456 * @param owner The rule set containing this rule
457 * @param predecessor The rule preceding this one in the rule set's
458 * rule list
459 * @param ownersOwner The RuleBasedNumberFormat that owns this rule
460 * @return The newly-created substitution.  This is never null; if
461 * the rule text doesn't contain any substitution tokens, this will
462 * be a NullSubstitution.
463 */
464 NFSubstitution *
extractSubstitution(const NFRuleSet * ruleSet,const NFRule * predecessor,UErrorCode & status)465 NFRule::extractSubstitution(const NFRuleSet* ruleSet,
466                             const NFRule* predecessor,
467                             UErrorCode& status)
468 {
469     NFSubstitution* result = NULL;
470 
471     // search the rule's rule text for the first two characters of
472     // a substitution token
473     int32_t subStart = indexOfAnyRulePrefix();
474     int32_t subEnd = subStart;
475 
476     // if we didn't find one, create a null substitution positioned
477     // at the end of the rule text
478     if (subStart == -1) {
479         return NULL;
480     }
481 
482     // special-case the ">>>" token, since searching for the > at the
483     // end will actually find the > in the middle
484     if (ruleText.indexOf(gGreaterGreaterGreater, 3, 0) == subStart) {
485         subEnd = subStart + 2;
486 
487         // otherwise the substitution token ends with the same character
488         // it began with
489     } else {
490         UChar c = ruleText.charAt(subStart);
491         subEnd = ruleText.indexOf(c, subStart + 1);
492         // special case for '<%foo<<'
493         if (c == gLessThan && subEnd != -1 && subEnd < ruleText.length() - 1 && ruleText.charAt(subEnd+1) == c) {
494             // ordinals use "=#,##0==%abbrev=" as their rule.  Notice that the '==' in the middle
495             // occurs because of the juxtaposition of two different rules.  The check for '<' is a hack
496             // to get around this.  Having the duplicate at the front would cause problems with
497             // rules like "<<%" to format, say, percents...
498             ++subEnd;
499         }
500    }
501 
502     // if we don't find the end of the token (i.e., if we're on a single,
503     // unmatched token character), create a null substitution positioned
504     // at the end of the rule
505     if (subEnd == -1) {
506         return NULL;
507     }
508 
509     // if we get here, we have a real substitution token (or at least
510     // some text bounded by substitution token characters).  Use
511     // makeSubstitution() to create the right kind of substitution
512     UnicodeString subToken;
513     subToken.setTo(ruleText, subStart, subEnd + 1 - subStart);
514     result = NFSubstitution::makeSubstitution(subStart, this, predecessor, ruleSet,
515         this->formatter, subToken, status);
516 
517     // remove the substitution from the rule text
518     ruleText.removeBetween(subStart, subEnd+1);
519 
520     return result;
521 }
522 
523 /**
524  * Sets the rule's base value, and causes the radix and exponent
525  * to be recalculated.  This is used during construction when we
526  * don't know the rule's base value until after it's been
527  * constructed.  It should be used at any other time.
528  * @param The new base value for the rule.
529  */
530 void
setBaseValue(int64_t newBaseValue,UErrorCode & status)531 NFRule::setBaseValue(int64_t newBaseValue, UErrorCode& status)
532 {
533     // set the base value
534     baseValue = newBaseValue;
535     radix = 10;
536 
537     // if this isn't a special rule, recalculate the radix and exponent
538     // (the radix always defaults to 10; if it's supposed to be something
539     // else, it's cleaned up by the caller and the exponent is
540     // recalculated again-- the only function that does this is
541     // NFRule.parseRuleDescriptor() )
542     if (baseValue >= 1) {
543         exponent = expectedExponent();
544 
545         // this function gets called on a fully-constructed rule whose
546         // description didn't specify a base value.  This means it
547         // has substitutions, and some substitutions hold on to copies
548         // of the rule's divisor.  Fix their copies of the divisor.
549         if (sub1 != NULL) {
550             sub1->setDivisor(radix, exponent, status);
551         }
552         if (sub2 != NULL) {
553             sub2->setDivisor(radix, exponent, status);
554         }
555 
556         // if this is a special rule, its radix and exponent are basically
557         // ignored.  Set them to "safe" default values
558     } else {
559         exponent = 0;
560     }
561 }
562 
563 /**
564 * This calculates the rule's exponent based on its radix and base
565 * value.  This will be the highest power the radix can be raised to
566 * and still produce a result less than or equal to the base value.
567 */
568 int16_t
expectedExponent() const569 NFRule::expectedExponent() const
570 {
571     // since the log of 0, or the log base 0 of something, causes an
572     // error, declare the exponent in these cases to be 0 (we also
573     // deal with the special-rule identifiers here)
574     if (radix == 0 || baseValue < 1) {
575         return 0;
576     }
577 
578     // we get rounding error in some cases-- for example, log 1000 / log 10
579     // gives us 1.9999999996 instead of 2.  The extra logic here is to take
580     // that into account
581     int16_t tempResult = (int16_t)(uprv_log((double)baseValue) / uprv_log((double)radix));
582     int64_t temp = util64_pow(radix, tempResult + 1);
583     if (temp <= baseValue) {
584         tempResult += 1;
585     }
586     return tempResult;
587 }
588 
589 /**
590  * Searches the rule's rule text for any of the specified strings.
591  * @return The index of the first match in the rule's rule text
592  * (i.e., the first substring in the rule's rule text that matches
593  * _any_ of the strings in "strings").  If none of the strings in
594  * "strings" is found in the rule's rule text, returns -1.
595  */
596 int32_t
indexOfAnyRulePrefix() const597 NFRule::indexOfAnyRulePrefix() const
598 {
599     int result = -1;
600     for (int i = 0; RULE_PREFIXES[i]; i++) {
601         int32_t pos = ruleText.indexOf(*RULE_PREFIXES[i]);
602         if (pos != -1 && (result == -1 || pos < result)) {
603             result = pos;
604         }
605     }
606     return result;
607 }
608 
609 //-----------------------------------------------------------------------
610 // boilerplate
611 //-----------------------------------------------------------------------
612 
613 static UBool
util_equalSubstitutions(const NFSubstitution * sub1,const NFSubstitution * sub2)614 util_equalSubstitutions(const NFSubstitution* sub1, const NFSubstitution* sub2)
615 {
616     if (sub1) {
617         if (sub2) {
618             return *sub1 == *sub2;
619         }
620     } else if (!sub2) {
621         return TRUE;
622     }
623     return FALSE;
624 }
625 
626 /**
627 * Tests two rules for equality.
628 * @param that The rule to compare this one against
629 * @return True is the two rules are functionally equivalent
630 */
631 UBool
operator ==(const NFRule & rhs) const632 NFRule::operator==(const NFRule& rhs) const
633 {
634     return baseValue == rhs.baseValue
635         && radix == rhs.radix
636         && exponent == rhs.exponent
637         && ruleText == rhs.ruleText
638         && util_equalSubstitutions(sub1, rhs.sub1)
639         && util_equalSubstitutions(sub2, rhs.sub2);
640 }
641 
642 /**
643 * Returns a textual representation of the rule.  This won't
644 * necessarily be the same as the description that this rule
645 * was created with, but it will produce the same result.
646 * @return A textual description of the rule
647 */
util_append64(UnicodeString & result,int64_t n)648 static void util_append64(UnicodeString& result, int64_t n)
649 {
650     UChar buffer[256];
651     int32_t len = util64_tou(n, buffer, sizeof(buffer));
652     UnicodeString temp(buffer, len);
653     result.append(temp);
654 }
655 
656 void
_appendRuleText(UnicodeString & result) const657 NFRule::_appendRuleText(UnicodeString& result) const
658 {
659     switch (getType()) {
660     case kNegativeNumberRule: result.append(gMinusX, 2); break;
661     case kImproperFractionRule: result.append(gX).append(decimalPoint == 0 ? gDot : decimalPoint).append(gX); break;
662     case kProperFractionRule: result.append(gZero).append(decimalPoint == 0 ? gDot : decimalPoint).append(gX); break;
663     case kMasterRule: result.append(gX).append(decimalPoint == 0 ? gDot : decimalPoint).append(gZero); break;
664     case kInfinityRule: result.append(gInf, 3); break;
665     case kNaNRule: result.append(gNaN, 3); break;
666     default:
667         // for a normal rule, write out its base value, and if the radix is
668         // something other than 10, write out the radix (with the preceding
669         // slash, of course).  Then calculate the expected exponent and if
670         // if isn't the same as the actual exponent, write an appropriate
671         // number of > signs.  Finally, terminate the whole thing with
672         // a colon.
673         util_append64(result, baseValue);
674         if (radix != 10) {
675             result.append(gSlash);
676             util_append64(result, radix);
677         }
678         int numCarets = expectedExponent() - exponent;
679         for (int i = 0; i < numCarets; i++) {
680             result.append(gGreaterThan);
681         }
682         break;
683     }
684     result.append(gColon);
685     result.append(gSpace);
686 
687     // if the rule text begins with a space, write an apostrophe
688     // (whitespace after the rule descriptor is ignored; the
689     // apostrophe is used to make the whitespace significant)
690     if (ruleText.charAt(0) == gSpace && (sub1 == NULL || sub1->getPos() != 0)) {
691         result.append(gTick);
692     }
693 
694     // now, write the rule's rule text, inserting appropriate
695     // substitution tokens in the appropriate places
696     UnicodeString ruleTextCopy;
697     ruleTextCopy.setTo(ruleText);
698 
699     UnicodeString temp;
700     if (sub2 != NULL) {
701         sub2->toString(temp);
702         ruleTextCopy.insert(sub2->getPos(), temp);
703     }
704     if (sub1 != NULL) {
705         sub1->toString(temp);
706         ruleTextCopy.insert(sub1->getPos(), temp);
707     }
708 
709     result.append(ruleTextCopy);
710 
711     // and finally, top the whole thing off with a semicolon and
712     // return the result
713     result.append(gSemicolon);
714 }
715 
716 //-----------------------------------------------------------------------
717 // formatting
718 //-----------------------------------------------------------------------
719 
720 /**
721 * Formats the number, and inserts the resulting text into
722 * toInsertInto.
723 * @param number The number being formatted
724 * @param toInsertInto The string where the resultant text should
725 * be inserted
726 * @param pos The position in toInsertInto where the resultant text
727 * should be inserted
728 */
729 void
doFormat(int64_t number,UnicodeString & toInsertInto,int32_t pos,int32_t recursionCount,UErrorCode & status) const730 NFRule::doFormat(int64_t number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const
731 {
732     // first, insert the rule's rule text into toInsertInto at the
733     // specified position, then insert the results of the substitutions
734     // into the right places in toInsertInto (notice we do the
735     // substitutions in reverse order so that the offsets don't get
736     // messed up)
737     int32_t pluralRuleStart = ruleText.length();
738     int32_t lengthOffset = 0;
739     if (!rulePatternFormat) {
740         toInsertInto.insert(pos, ruleText);
741     }
742     else {
743         pluralRuleStart = ruleText.indexOf(gDollarOpenParenthesis, -1, 0);
744         int pluralRuleEnd = ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart);
745         int initialLength = toInsertInto.length();
746         if (pluralRuleEnd < ruleText.length() - 1) {
747             toInsertInto.insert(pos, ruleText.tempSubString(pluralRuleEnd + 2));
748         }
749         toInsertInto.insert(pos,
750             rulePatternFormat->format((int32_t)(number/uprv_pow(radix, exponent)), status));
751         if (pluralRuleStart > 0) {
752             toInsertInto.insert(pos, ruleText.tempSubString(0, pluralRuleStart));
753         }
754         lengthOffset = ruleText.length() - (toInsertInto.length() - initialLength);
755     }
756 
757     if (sub2 != NULL) {
758         sub2->doSubstitution(number, toInsertInto, pos - (sub2->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
759     }
760     if (sub1 != NULL) {
761         sub1->doSubstitution(number, toInsertInto, pos - (sub1->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
762     }
763 }
764 
765 /**
766 * Formats the number, and inserts the resulting text into
767 * toInsertInto.
768 * @param number The number being formatted
769 * @param toInsertInto The string where the resultant text should
770 * be inserted
771 * @param pos The position in toInsertInto where the resultant text
772 * should be inserted
773 */
774 void
doFormat(double number,UnicodeString & toInsertInto,int32_t pos,int32_t recursionCount,UErrorCode & status) const775 NFRule::doFormat(double number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const
776 {
777     // first, insert the rule's rule text into toInsertInto at the
778     // specified position, then insert the results of the substitutions
779     // into the right places in toInsertInto
780     // [again, we have two copies of this routine that do the same thing
781     // so that we don't sacrifice precision in a long by casting it
782     // to a double]
783     int32_t pluralRuleStart = ruleText.length();
784     int32_t lengthOffset = 0;
785     if (!rulePatternFormat) {
786         toInsertInto.insert(pos, ruleText);
787     }
788     else {
789         pluralRuleStart = ruleText.indexOf(gDollarOpenParenthesis, -1, 0);
790         int pluralRuleEnd = ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart);
791         int initialLength = toInsertInto.length();
792         if (pluralRuleEnd < ruleText.length() - 1) {
793             toInsertInto.insert(pos, ruleText.tempSubString(pluralRuleEnd + 2));
794         }
795         double pluralVal = number;
796         if (0 <= pluralVal && pluralVal < 1) {
797             // We're in a fractional rule, and we have to match the NumeratorSubstitution behavior.
798             // 2.3 can become 0.2999999999999998 for the fraction due to rounding errors.
799             pluralVal = uprv_round(pluralVal * uprv_pow(radix, exponent));
800         }
801         else {
802             pluralVal = pluralVal / uprv_pow(radix, exponent);
803         }
804         toInsertInto.insert(pos, rulePatternFormat->format((int32_t)(pluralVal), status));
805         if (pluralRuleStart > 0) {
806             toInsertInto.insert(pos, ruleText.tempSubString(0, pluralRuleStart));
807         }
808         lengthOffset = ruleText.length() - (toInsertInto.length() - initialLength);
809     }
810 
811     if (sub2 != NULL) {
812         sub2->doSubstitution(number, toInsertInto, pos - (sub2->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
813     }
814     if (sub1 != NULL) {
815         sub1->doSubstitution(number, toInsertInto, pos - (sub1->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
816     }
817 }
818 
819 /**
820 * Used by the owning rule set to determine whether to invoke the
821 * rollback rule (i.e., whether this rule or the one that precedes
822 * it in the rule set's list should be used to format the number)
823 * @param The number being formatted
824 * @return True if the rule set should use the rule that precedes
825 * this one in its list; false if it should use this rule
826 */
827 UBool
shouldRollBack(double number) const828 NFRule::shouldRollBack(double number) const
829 {
830     // we roll back if the rule contains a modulus substitution,
831     // the number being formatted is an even multiple of the rule's
832     // divisor, and the rule's base value is NOT an even multiple
833     // of its divisor
834     // In other words, if the original description had
835     //    100: << hundred[ >>];
836     // that expands into
837     //    100: << hundred;
838     //    101: << hundred >>;
839     // internally.  But when we're formatting 200, if we use the rule
840     // at 101, which would normally apply, we get "two hundred zero".
841     // To prevent this, we roll back and use the rule at 100 instead.
842     // This is the logic that makes this happen: the rule at 101 has
843     // a modulus substitution, its base value isn't an even multiple
844     // of 100, and the value we're trying to format _is_ an even
845     // multiple of 100.  This is called the "rollback rule."
846     if ((sub1 != NULL && sub1->isModulusSubstitution()) || (sub2 != NULL && sub2->isModulusSubstitution())) {
847         int64_t re = util64_pow(radix, exponent);
848         return uprv_fmod(number, (double)re) == 0 && (baseValue % re) != 0;
849     }
850     return FALSE;
851 }
852 
853 //-----------------------------------------------------------------------
854 // parsing
855 //-----------------------------------------------------------------------
856 
/**
* Attempts to parse the string with this rule.
* @param text The string being parsed
* @param parsePosition On entry, the value is ignored and assumed to
* be 0. On exit, this has been updated with the position of the first
* character not consumed by matching the text against this rule
* (if this rule doesn't match the text at all, the parse position
* is left unchanged (presumably at 0) and resVal is set to 0).
* @param isFractionRule True if this rule is contained within a
* fraction rule set.  This is only used if the rule has no
* substitutions.
* @param upperBound Passed on to matchToDelimiter() for matching the
* substitutions.
* @param resVal Receives the parse result.  If this rule matched the
* text, this is the rule's base value combined appropriately with the
* results of parsing the substitutions.  If nothing matched, this is 0
* and the parse position is left unchanged.
* @return TRUE; the outcome of the parse is reported through resVal and
* parsePosition.
*/
875 #ifdef RBNF_DEBUG
876 #include <stdio.h>
877 
dumpUS(FILE * f,const UnicodeString & us)878 static void dumpUS(FILE* f, const UnicodeString& us) {
879   int len = us.length();
880   char* buf = (char *)uprv_malloc((len+1)*sizeof(char)); //new char[len+1];
881   if (buf != NULL) {
882 	  us.extract(0, len, buf);
883 	  buf[len] = 0;
884 	  fprintf(f, "%s", buf);
885 	  uprv_free(buf); //delete[] buf;
886   }
887 }
888 #endif
889 UBool
doParse(const UnicodeString & text,ParsePosition & parsePosition,UBool isFractionRule,double upperBound,Formattable & resVal) const890 NFRule::doParse(const UnicodeString& text,
891                 ParsePosition& parsePosition,
892                 UBool isFractionRule,
893                 double upperBound,
894                 Formattable& resVal) const
895 {
896     // internally we operate on a copy of the string being parsed
897     // (because we're going to change it) and use our own ParsePosition
898     ParsePosition pp;
899     UnicodeString workText(text);
900 
901     int32_t sub1Pos = sub1 != NULL ? sub1->getPos() : ruleText.length();
902     int32_t sub2Pos = sub2 != NULL ? sub2->getPos() : ruleText.length();
903 
904     // check to see whether the text before the first substitution
905     // matches the text at the beginning of the string being
906     // parsed.  If it does, strip that off the front of workText;
907     // otherwise, dump out with a mismatch
908     UnicodeString prefix;
909     prefix.setTo(ruleText, 0, sub1Pos);
910 
911 #ifdef RBNF_DEBUG
912     fprintf(stderr, "doParse %p ", this);
913     {
914         UnicodeString rt;
915         _appendRuleText(rt);
916         dumpUS(stderr, rt);
917     }
918 
919     fprintf(stderr, " text: '");
920     dumpUS(stderr, text);
921     fprintf(stderr, "' prefix: '");
922     dumpUS(stderr, prefix);
923 #endif
924     stripPrefix(workText, prefix, pp);
925     int32_t prefixLength = text.length() - workText.length();
926 
927 #ifdef RBNF_DEBUG
928     fprintf(stderr, "' pl: %d ppi: %d s1p: %d\n", prefixLength, pp.getIndex(), sub1Pos);
929 #endif
930 
931     if (pp.getIndex() == 0 && sub1Pos != 0) {
932         // commented out because ParsePosition doesn't have error index in 1.1.x
933         // restored for ICU4C port
934         parsePosition.setErrorIndex(pp.getErrorIndex());
935         resVal.setLong(0);
936         return TRUE;
937     }
938     if (baseValue == kInfinityRule) {
939         // If you match this, don't try to perform any calculations on it.
940         parsePosition.setIndex(pp.getIndex());
941         resVal.setDouble(uprv_getInfinity());
942         return TRUE;
943     }
944     if (baseValue == kNaNRule) {
945         // If you match this, don't try to perform any calculations on it.
946         parsePosition.setIndex(pp.getIndex());
947         resVal.setDouble(uprv_getNaN());
948         return TRUE;
949     }
950 
951     // this is the fun part.  The basic guts of the rule-matching
952     // logic is matchToDelimiter(), which is called twice.  The first
953     // time it searches the input string for the rule text BETWEEN
954     // the substitutions and tries to match the intervening text
955     // in the input string with the first substitution.  If that
956     // succeeds, it then calls it again, this time to look for the
957     // rule text after the second substitution and to match the
958     // intervening input text against the second substitution.
959     //
960     // For example, say we have a rule that looks like this:
961     //    first << middle >> last;
962     // and input text that looks like this:
963     //    first one middle two last
964     // First we use stripPrefix() to match "first " in both places and
965     // strip it off the front, leaving
966     //    one middle two last
967     // Then we use matchToDelimiter() to match " middle " and try to
968     // match "one" against a substitution.  If it's successful, we now
969     // have
970     //    two last
971     // We use matchToDelimiter() a second time to match " last" and
972     // try to match "two" against a substitution.  If "two" matches
973     // the substitution, we have a successful parse.
974     //
975     // Since it's possible in many cases to find multiple instances
976     // of each of these pieces of rule text in the input string,
977     // we need to try all the possible combinations of these
978     // locations.  This prevents us from prematurely declaring a mismatch,
979     // and makes sure we match as much input text as we can.
980     int highWaterMark = 0;
981     double result = 0;
982     int start = 0;
983     double tempBaseValue = (double)(baseValue <= 0 ? 0 : baseValue);
984 
985     UnicodeString temp;
986     do {
987         // our partial parse result starts out as this rule's base
988         // value.  If it finds a successful match, matchToDelimiter()
989         // will compose this in some way with what it gets back from
990         // the substitution, giving us a new partial parse result
991         pp.setIndex(0);
992 
993         temp.setTo(ruleText, sub1Pos, sub2Pos - sub1Pos);
994         double partialResult = matchToDelimiter(workText, start, tempBaseValue,
995             temp, pp, sub1,
996             upperBound);
997 
998         // if we got a successful match (or were trying to match a
999         // null substitution), pp is now pointing at the first unmatched
1000         // character.  Take note of that, and try matchToDelimiter()
1001         // on the input text again
1002         if (pp.getIndex() != 0 || sub1 == NULL) {
1003             start = pp.getIndex();
1004 
1005             UnicodeString workText2;
1006             workText2.setTo(workText, pp.getIndex(), workText.length() - pp.getIndex());
1007             ParsePosition pp2;
1008 
1009             // the second matchToDelimiter() will compose our previous
1010             // partial result with whatever it gets back from its
1011             // substitution if there's a successful match, giving us
1012             // a real result
1013             temp.setTo(ruleText, sub2Pos, ruleText.length() - sub2Pos);
1014             partialResult = matchToDelimiter(workText2, 0, partialResult,
1015                 temp, pp2, sub2,
1016                 upperBound);
1017 
1018             // if we got a successful match on this second
1019             // matchToDelimiter() call, update the high-water mark
1020             // and result (if necessary)
1021             if (pp2.getIndex() != 0 || sub2 == NULL) {
1022                 if (prefixLength + pp.getIndex() + pp2.getIndex() > highWaterMark) {
1023                     highWaterMark = prefixLength + pp.getIndex() + pp2.getIndex();
1024                     result = partialResult;
1025                 }
1026             }
1027             else {
1028                 // commented out because ParsePosition doesn't have error index in 1.1.x
1029                 // restored for ICU4C port
1030                 int32_t temp = pp2.getErrorIndex() + sub1Pos + pp.getIndex();
1031                 if (temp> parsePosition.getErrorIndex()) {
1032                     parsePosition.setErrorIndex(temp);
1033                 }
1034             }
1035         }
1036         else {
1037             // commented out because ParsePosition doesn't have error index in 1.1.x
1038             // restored for ICU4C port
1039             int32_t temp = sub1Pos + pp.getErrorIndex();
1040             if (temp > parsePosition.getErrorIndex()) {
1041                 parsePosition.setErrorIndex(temp);
1042             }
1043         }
1044         // keep trying to match things until the outer matchToDelimiter()
1045         // call fails to make a match (each time, it picks up where it
1046         // left off the previous time)
1047     } while (sub1Pos != sub2Pos
1048         && pp.getIndex() > 0
1049         && pp.getIndex() < workText.length()
1050         && pp.getIndex() != start);
1051 
1052     // update the caller's ParsePosition with our high-water mark
1053     // (i.e., it now points at the first character this function
1054     // didn't match-- the ParsePosition is therefore unchanged if
1055     // we didn't match anything)
1056     parsePosition.setIndex(highWaterMark);
1057     // commented out because ParsePosition doesn't have error index in 1.1.x
1058     // restored for ICU4C port
1059     if (highWaterMark > 0) {
1060         parsePosition.setErrorIndex(0);
1061     }
1062 
1063     // this is a hack for one unusual condition: Normally, whether this
1064     // rule belong to a fraction rule set or not is handled by its
1065     // substitutions.  But if that rule HAS NO substitutions, then
1066     // we have to account for it here.  By definition, if the matching
1067     // rule in a fraction rule set has no substitutions, its numerator
1068     // is 1, and so the result is the reciprocal of its base value.
1069     if (isFractionRule && highWaterMark > 0 && sub1 == NULL) {
1070         result = 1 / result;
1071     }
1072 
1073     resVal.setDouble(result);
1074     return TRUE; // ??? do we need to worry if it is a long or a double?
1075 }
1076 
1077 /**
1078 * This function is used by parse() to match the text being parsed
1079 * against a possible prefix string.  This function
1080 * matches characters from the beginning of the string being parsed
1081 * to characters from the prospective prefix.  If they match, pp is
1082 * updated to the first character not matched, and the result is
1083 * the unparsed part of the string.  If they don't match, the whole
1084 * string is returned, and pp is left unchanged.
1085 * @param text The string being parsed
1086 * @param prefix The text to match against
1087 * @param pp On entry, ignored and assumed to be 0.  On exit, points
1088 * to the first unmatched character (assuming the whole prefix matched),
1089 * or is unchanged (if the whole prefix didn't match).
1090 * @return If things match, this is the unparsed part of "text";
1091 * if they didn't match, this is "text".
1092 */
1093 void
stripPrefix(UnicodeString & text,const UnicodeString & prefix,ParsePosition & pp) const1094 NFRule::stripPrefix(UnicodeString& text, const UnicodeString& prefix, ParsePosition& pp) const
1095 {
1096     // if the prefix text is empty, dump out without doing anything
1097     if (prefix.length() != 0) {
1098     	UErrorCode status = U_ZERO_ERROR;
1099         // use prefixLength() to match the beginning of
1100         // "text" against "prefix".  This function returns the
1101         // number of characters from "text" that matched (or 0 if
1102         // we didn't match the whole prefix)
1103         int32_t pfl = prefixLength(text, prefix, status);
1104         if (U_FAILURE(status)) { // Memory allocation error.
1105         	return;
1106         }
1107         if (pfl != 0) {
1108             // if we got a successful match, update the parse position
1109             // and strip the prefix off of "text"
1110             pp.setIndex(pp.getIndex() + pfl);
1111             text.remove(0, pfl);
1112         }
1113     }
1114 }
1115 
1116 /**
1117 * Used by parse() to match a substitution and any following text.
1118 * "text" is searched for instances of "delimiter".  For each instance
1119 * of delimiter, the intervening text is tested to see whether it
1120 * matches the substitution.  The longest match wins.
1121 * @param text The string being parsed
1122 * @param startPos The position in "text" where we should start looking
1123 * for "delimiter".
1124 * @param baseValue A partial parse result (often the rule's base value),
1125 * which is combined with the result from matching the substitution
1126 * @param delimiter The string to search "text" for.
1127 * @param pp Ignored and presumed to be 0 on entry.  If there's a match,
1128 * on exit this will point to the first unmatched character.
1129 * @param sub If we find "delimiter" in "text", this substitution is used
1130 * to match the text between the beginning of the string and the
1131 * position of "delimiter."  (If "delimiter" is the empty string, then
1132 * this function just matches against this substitution and updates
1133 * everything accordingly.)
1134 * @param upperBound When matching the substitution, it will only
1135 * consider rules with base values lower than this value.
1136 * @return If there's a match, this is the result of composing
1137 * baseValue with the result of matching the substitution.  Otherwise,
1138 * this is new Long(0).  It's never null.  If the result is an integer,
1139 * this will be an instance of Long; otherwise, it's an instance of
1140 * Double.
1141 *
1142 * !!! note {dlf} in point of fact, in the java code the caller always converts
1143 * the result to a double, so we might as well return one.
1144 */
1145 double
matchToDelimiter(const UnicodeString & text,int32_t startPos,double _baseValue,const UnicodeString & delimiter,ParsePosition & pp,const NFSubstitution * sub,double upperBound) const1146 NFRule::matchToDelimiter(const UnicodeString& text,
1147                          int32_t startPos,
1148                          double _baseValue,
1149                          const UnicodeString& delimiter,
1150                          ParsePosition& pp,
1151                          const NFSubstitution* sub,
1152                          double upperBound) const
1153 {
1154 	UErrorCode status = U_ZERO_ERROR;
1155     // if "delimiter" contains real (i.e., non-ignorable) text, search
1156     // it for "delimiter" beginning at "start".  If that succeeds, then
1157     // use "sub"'s doParse() method to match the text before the
1158     // instance of "delimiter" we just found.
1159     if (!allIgnorable(delimiter, status)) {
1160     	if (U_FAILURE(status)) { //Memory allocation error.
1161     		return 0;
1162     	}
1163         ParsePosition tempPP;
1164         Formattable result;
1165 
1166         // use findText() to search for "delimiter".  It returns a two-
1167         // element array: element 0 is the position of the match, and
1168         // element 1 is the number of characters that matched
1169         // "delimiter".
1170         int32_t dLen;
1171         int32_t dPos = findText(text, delimiter, startPos, &dLen);
1172 
1173         // if findText() succeeded, isolate the text preceding the
1174         // match, and use "sub" to match that text
1175         while (dPos >= 0) {
1176             UnicodeString subText;
1177             subText.setTo(text, 0, dPos);
1178             if (subText.length() > 0) {
1179                 UBool success = sub->doParse(subText, tempPP, _baseValue, upperBound,
1180 #if UCONFIG_NO_COLLATION
1181                     FALSE,
1182 #else
1183                     formatter->isLenient(),
1184 #endif
1185                     result);
1186 
1187                 // if the substitution could match all the text up to
1188                 // where we found "delimiter", then this function has
1189                 // a successful match.  Bump the caller's parse position
1190                 // to point to the first character after the text
1191                 // that matches "delimiter", and return the result
1192                 // we got from parsing the substitution.
1193                 if (success && tempPP.getIndex() == dPos) {
1194                     pp.setIndex(dPos + dLen);
1195                     return result.getDouble();
1196                 }
1197                 else {
1198                     // commented out because ParsePosition doesn't have error index in 1.1.x
1199                     // restored for ICU4C port
1200                     if (tempPP.getErrorIndex() > 0) {
1201                         pp.setErrorIndex(tempPP.getErrorIndex());
1202                     } else {
1203                         pp.setErrorIndex(tempPP.getIndex());
1204                     }
1205                 }
1206             }
1207 
1208             // if we didn't match the substitution, search for another
1209             // copy of "delimiter" in "text" and repeat the loop if
1210             // we find it
1211             tempPP.setIndex(0);
1212             dPos = findText(text, delimiter, dPos + dLen, &dLen);
1213         }
1214         // if we make it here, this was an unsuccessful match, and we
1215         // leave pp unchanged and return 0
1216         pp.setIndex(0);
1217         return 0;
1218 
1219         // if "delimiter" is empty, or consists only of ignorable characters
1220         // (i.e., is semantically empty), thwe we obviously can't search
1221         // for "delimiter".  Instead, just use "sub" to parse as much of
1222         // "text" as possible.
1223     }
1224     else if (sub == NULL) {
1225         return _baseValue;
1226     }
1227     else {
1228         ParsePosition tempPP;
1229         Formattable result;
1230 
1231         // try to match the whole string against the substitution
1232         UBool success = sub->doParse(text, tempPP, _baseValue, upperBound,
1233 #if UCONFIG_NO_COLLATION
1234             FALSE,
1235 #else
1236             formatter->isLenient(),
1237 #endif
1238             result);
1239         if (success && (tempPP.getIndex() != 0)) {
1240             // if there's a successful match (or it's a null
1241             // substitution), update pp to point to the first
1242             // character we didn't match, and pass the result from
1243             // sub.doParse() on through to the caller
1244             pp.setIndex(tempPP.getIndex());
1245             return result.getDouble();
1246         }
1247         else {
1248             // commented out because ParsePosition doesn't have error index in 1.1.x
1249             // restored for ICU4C port
1250             pp.setErrorIndex(tempPP.getErrorIndex());
1251         }
1252 
1253         // and if we get to here, then nothing matched, so we return
1254         // 0 and leave pp alone
1255         return 0;
1256     }
1257 }
1258 
1259 /**
1260 * Used by stripPrefix() to match characters.  If lenient parse mode
1261 * is off, this just calls startsWith().  If lenient parse mode is on,
1262 * this function uses CollationElementIterators to match characters in
1263 * the strings (only primary-order differences are significant in
1264 * determining whether there's a match).
1265 * @param str The string being tested
1266 * @param prefix The text we're hoping to see at the beginning
1267 * of "str"
1268 * @return If "prefix" is found at the beginning of "str", this
1269 * is the number of characters in "str" that were matched (this
1270 * isn't necessarily the same as the length of "prefix" when matching
1271 * text with a collator).  If there's no match, this is 0.
1272 */
1273 int32_t
prefixLength(const UnicodeString & str,const UnicodeString & prefix,UErrorCode & status) const1274 NFRule::prefixLength(const UnicodeString& str, const UnicodeString& prefix, UErrorCode& status) const
1275 {
1276     // if we're looking for an empty prefix, it obviously matches
1277     // zero characters.  Just go ahead and return 0.
1278     if (prefix.length() == 0) {
1279         return 0;
1280     }
1281 
1282 #if !UCONFIG_NO_COLLATION
1283     // go through all this grief if we're in lenient-parse mode
1284     if (formatter->isLenient()) {
1285         // get the formatter's collator and use it to create two
1286         // collation element iterators, one over the target string
1287         // and another over the prefix (right now, we'll throw an
1288         // exception if the collator we get back from the formatter
1289         // isn't a RuleBasedCollator, because RuleBasedCollator defines
1290         // the CollationElementIterator protocol.  Hopefully, this
1291         // will change someday.)
1292         const RuleBasedCollator* collator = formatter->getCollator();
1293         if (collator == NULL) {
1294             status = U_MEMORY_ALLOCATION_ERROR;
1295             return 0;
1296         }
1297         LocalPointer<CollationElementIterator> strIter(collator->createCollationElementIterator(str));
1298         LocalPointer<CollationElementIterator> prefixIter(collator->createCollationElementIterator(prefix));
1299         // Check for memory allocation error.
1300         if (strIter.isNull() || prefixIter.isNull()) {
1301             status = U_MEMORY_ALLOCATION_ERROR;
1302             return 0;
1303         }
1304 
1305         UErrorCode err = U_ZERO_ERROR;
1306 
1307         // The original code was problematic.  Consider this match:
1308         // prefix = "fifty-"
1309         // string = " fifty-7"
1310         // The intent is to match string up to the '7', by matching 'fifty-' at position 1
1311         // in the string.  Unfortunately, we were getting a match, and then computing where
1312         // the match terminated by rematching the string.  The rematch code was using as an
1313         // initial guess the substring of string between 0 and prefix.length.  Because of
1314         // the leading space and trailing hyphen (both ignorable) this was succeeding, leaving
1315         // the position before the hyphen in the string.  Recursing down, we then parsed the
1316         // remaining string '-7' as numeric.  The resulting number turned out as 43 (50 - 7).
1317         // This was not pretty, especially since the string "fifty-7" parsed just fine.
1318         //
1319         // We have newer APIs now, so we can use calls on the iterator to determine what we
1320         // matched up to.  If we terminate because we hit the last element in the string,
1321         // our match terminates at this length.  If we terminate because we hit the last element
1322         // in the target, our match terminates at one before the element iterator position.
1323 
1324         // match collation elements between the strings
1325         int32_t oStr = strIter->next(err);
1326         int32_t oPrefix = prefixIter->next(err);
1327 
1328         while (oPrefix != CollationElementIterator::NULLORDER) {
1329             // skip over ignorable characters in the target string
1330             while (CollationElementIterator::primaryOrder(oStr) == 0
1331                 && oStr != CollationElementIterator::NULLORDER) {
1332                 oStr = strIter->next(err);
1333             }
1334 
1335             // skip over ignorable characters in the prefix
1336             while (CollationElementIterator::primaryOrder(oPrefix) == 0
1337                 && oPrefix != CollationElementIterator::NULLORDER) {
1338                 oPrefix = prefixIter->next(err);
1339             }
1340 
1341             // dlf: move this above following test, if we consume the
1342             // entire target, aren't we ok even if the source was also
1343             // entirely consumed?
1344 
1345             // if skipping over ignorables brought to the end of
1346             // the prefix, we DID match: drop out of the loop
1347             if (oPrefix == CollationElementIterator::NULLORDER) {
1348                 break;
1349             }
1350 
1351             // if skipping over ignorables brought us to the end
1352             // of the target string, we didn't match and return 0
1353             if (oStr == CollationElementIterator::NULLORDER) {
1354                 return 0;
1355             }
1356 
1357             // match collation elements from the two strings
1358             // (considering only primary differences).  If we
1359             // get a mismatch, dump out and return 0
1360             if (CollationElementIterator::primaryOrder(oStr)
1361                 != CollationElementIterator::primaryOrder(oPrefix)) {
1362                 return 0;
1363 
1364                 // otherwise, advance to the next character in each string
1365                 // and loop (we drop out of the loop when we exhaust
1366                 // collation elements in the prefix)
1367             } else {
1368                 oStr = strIter->next(err);
1369                 oPrefix = prefixIter->next(err);
1370             }
1371         }
1372 
1373         int32_t result = strIter->getOffset();
1374         if (oStr != CollationElementIterator::NULLORDER) {
1375             --result; // back over character that we don't want to consume;
1376         }
1377 
1378 #ifdef RBNF_DEBUG
1379         fprintf(stderr, "prefix length: %d\n", result);
1380 #endif
1381         return result;
1382 #if 0
1383         //----------------------------------------------------------------
1384         // JDK 1.2-specific API call
1385         // return strIter.getOffset();
1386         //----------------------------------------------------------------
1387         // JDK 1.1 HACK (take out for 1.2-specific code)
1388 
1389         // if we make it to here, we have a successful match.  Now we
1390         // have to find out HOW MANY characters from the target string
1391         // matched the prefix (there isn't necessarily a one-to-one
1392         // mapping between collation elements and characters).
1393         // In JDK 1.2, there's a simple getOffset() call we can use.
1394         // In JDK 1.1, on the other hand, we have to go through some
1395         // ugly contortions.  First, use the collator to compare the
1396         // same number of characters from the prefix and target string.
1397         // If they're equal, we're done.
1398         collator->setStrength(Collator::PRIMARY);
1399         if (str.length() >= prefix.length()) {
1400             UnicodeString temp;
1401             temp.setTo(str, 0, prefix.length());
1402             if (collator->equals(temp, prefix)) {
1403 #ifdef RBNF_DEBUG
1404                 fprintf(stderr, "returning: %d\n", prefix.length());
1405 #endif
1406                 return prefix.length();
1407             }
1408         }
1409 
1410         // if they're not equal, then we have to compare successively
1411         // larger and larger substrings of the target string until we
1412         // get to one that matches the prefix.  At that point, we know
1413         // how many characters matched the prefix, and we can return.
1414         int32_t p = 1;
1415         while (p <= str.length()) {
1416             UnicodeString temp;
1417             temp.setTo(str, 0, p);
1418             if (collator->equals(temp, prefix)) {
1419                 return p;
1420             } else {
1421                 ++p;
1422             }
1423         }
1424 
1425         // SHOULD NEVER GET HERE!!!
1426         return 0;
1427         //----------------------------------------------------------------
1428 #endif
1429 
1430         // If lenient parsing is turned off, forget all that crap above.
1431         // Just use String.startsWith() and be done with it.
1432   } else
1433 #endif
1434   {
1435       if (str.startsWith(prefix)) {
1436           return prefix.length();
1437       } else {
1438           return 0;
1439       }
1440   }
1441 }
1442 
1443 /**
1444 * Searches a string for another string.  If lenient parsing is off,
1445 * this just calls indexOf().  If lenient parsing is on, this function
1446 * uses CollationElementIterator to match characters, and only
1447 * primary-order differences are significant in determining whether
1448 * there's a match.
1449 * @param str The string to search
1450 * @param key The string to search "str" for
1451 * @param startingAt The index into "str" where the search is to
1452 * begin
1453 * @return A two-element array of ints.  Element 0 is the position
1454 * of the match, or -1 if there was no match.  Element 1 is the
1455 * number of characters in "str" that matched (which isn't necessarily
1456 * the same as the length of "key")
1457 */
1458 int32_t
findText(const UnicodeString & str,const UnicodeString & key,int32_t startingAt,int32_t * length) const1459 NFRule::findText(const UnicodeString& str,
1460                  const UnicodeString& key,
1461                  int32_t startingAt,
1462                  int32_t* length) const
1463 {
1464     if (rulePatternFormat) {
1465         Formattable result;
1466         FieldPosition position(UNUM_INTEGER_FIELD);
1467         position.setBeginIndex(startingAt);
1468         rulePatternFormat->parseType(str, this, result, position);
1469         int start = position.getBeginIndex();
1470         if (start >= 0) {
1471             int32_t pluralRuleStart = ruleText.indexOf(gDollarOpenParenthesis, -1, 0);
1472             int32_t pluralRuleSuffix = ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart) + 2;
1473             int32_t matchLen = position.getEndIndex() - start;
1474             UnicodeString prefix(ruleText.tempSubString(0, pluralRuleStart));
1475             UnicodeString suffix(ruleText.tempSubString(pluralRuleSuffix));
1476             if (str.compare(start - prefix.length(), prefix.length(), prefix, 0, prefix.length()) == 0
1477                     && str.compare(start + matchLen, suffix.length(), suffix, 0, suffix.length()) == 0)
1478             {
1479                 *length = matchLen + prefix.length() + suffix.length();
1480                 return start - prefix.length();
1481             }
1482         }
1483         *length = 0;
1484         return -1;
1485     }
1486     if (!formatter->isLenient()) {
1487         // if lenient parsing is turned off, this is easy: just call
1488         // String.indexOf() and we're done
1489         *length = key.length();
1490         return str.indexOf(key, startingAt);
1491     }
1492     else {
1493         // but if lenient parsing is turned ON, we've got some work
1494         // ahead of us
1495         return findTextLenient(str, key, startingAt, length);
1496     }
1497 }
1498 
1499 int32_t
findTextLenient(const UnicodeString & str,const UnicodeString & key,int32_t startingAt,int32_t * length) const1500 NFRule::findTextLenient(const UnicodeString& str,
1501                  const UnicodeString& key,
1502                  int32_t startingAt,
1503                  int32_t* length) const
1504 {
1505     //----------------------------------------------------------------
1506     // JDK 1.1 HACK (take out of 1.2-specific code)
1507 
1508     // in JDK 1.2, CollationElementIterator provides us with an
1509     // API to map between character offsets and collation elements
1510     // and we can do this by marching through the string comparing
1511     // collation elements.  We can't do that in JDK 1.1.  Insted,
1512     // we have to go through this horrible slow mess:
1513     int32_t p = startingAt;
1514     int32_t keyLen = 0;
1515 
1516     // basically just isolate smaller and smaller substrings of
1517     // the target string (each running to the end of the string,
1518     // and with the first one running from startingAt to the end)
1519     // and then use prefixLength() to see if the search key is at
1520     // the beginning of each substring.  This is excruciatingly
1521     // slow, but it will locate the key and tell use how long the
1522     // matching text was.
1523     UnicodeString temp;
1524     UErrorCode status = U_ZERO_ERROR;
1525     while (p < str.length() && keyLen == 0) {
1526         temp.setTo(str, p, str.length() - p);
1527         keyLen = prefixLength(temp, key, status);
1528         if (U_FAILURE(status)) {
1529             break;
1530         }
1531         if (keyLen != 0) {
1532             *length = keyLen;
1533             return p;
1534         }
1535         ++p;
1536     }
1537     // if we make it to here, we didn't find it.  Return -1 for the
1538     // location.  The length should be ignored, but set it to 0,
1539     // which should be "safe"
1540     *length = 0;
1541     return -1;
1542 }
1543 
1544 /**
1545 * Checks to see whether a string consists entirely of ignorable
1546 * characters.
1547 * @param str The string to test.
1548 * @return true if the string is empty of consists entirely of
1549 * characters that the number formatter's collator says are
1550 * ignorable at the primary-order level.  false otherwise.
1551 */
1552 UBool
allIgnorable(const UnicodeString & str,UErrorCode & status) const1553 NFRule::allIgnorable(const UnicodeString& str, UErrorCode& status) const
1554 {
1555     // if the string is empty, we can just return true
1556     if (str.length() == 0) {
1557         return TRUE;
1558     }
1559 
1560 #if !UCONFIG_NO_COLLATION
1561     // if lenient parsing is turned on, walk through the string with
1562     // a collation element iterator and make sure each collation
1563     // element is 0 (ignorable) at the primary level
1564     if (formatter->isLenient()) {
1565         const RuleBasedCollator* collator = formatter->getCollator();
1566         if (collator == NULL) {
1567             status = U_MEMORY_ALLOCATION_ERROR;
1568             return FALSE;
1569         }
1570         LocalPointer<CollationElementIterator> iter(collator->createCollationElementIterator(str));
1571 
1572         // Memory allocation error check.
1573         if (iter.isNull()) {
1574             status = U_MEMORY_ALLOCATION_ERROR;
1575             return FALSE;
1576         }
1577 
1578         UErrorCode err = U_ZERO_ERROR;
1579         int32_t o = iter->next(err);
1580         while (o != CollationElementIterator::NULLORDER
1581             && CollationElementIterator::primaryOrder(o) == 0) {
1582             o = iter->next(err);
1583         }
1584 
1585         return o == CollationElementIterator::NULLORDER;
1586     }
1587 #endif
1588 
1589     // if lenient parsing is turned off, there is no such thing as
1590     // an ignorable character: return true only if the string is empty
1591     return FALSE;
1592 }
1593 
1594 void
setDecimalFormatSymbols(const DecimalFormatSymbols & newSymbols,UErrorCode & status)1595 NFRule::setDecimalFormatSymbols(const DecimalFormatSymbols& newSymbols, UErrorCode& status) {
1596     if (sub1 != NULL) {
1597         sub1->setDecimalFormatSymbols(newSymbols, status);
1598     }
1599     if (sub2 != NULL) {
1600         sub2->setDecimalFormatSymbols(newSymbols, status);
1601     }
1602 }
1603 
1604 U_NAMESPACE_END
1605 
1606 /* U_HAVE_RBNF */
1607 #endif
1608