1 /*
2  * Copyright (C) 2015, International Business Machines
3  * Corporation and others.  All Rights Reserved.
4  *
5  * file name: affixpatternparser.cpp
6  */
7 
8 #include "unicode/utypes.h"
9 
10 #if !UCONFIG_NO_FORMATTING
11 
12 #include "unicode/dcfmtsym.h"
13 #include "unicode/plurrule.h"
14 #include "unicode/ucurr.h"
15 #include "affixpatternparser.h"
16 #include "charstr.h"
17 #include "precision.h"
18 #include "uassert.h"
19 #include "unistrappender.h"
20 
21         static UChar gDefaultSymbols[] = {0xa4, 0xa4, 0xa4};
22 
23 static UChar gPercent = 0x25;
24 static UChar gPerMill = 0x2030;
25 static UChar gNegative = 0x2D;
26 static UChar gPositive = 0x2B;
27 
// Token encoding used by this file: each token is stored in one UChar.
// Bits 8-14 carry the token type, bit 15 flags a continuation unit of a
// multi-unit literal length, and the low 8 bits carry one length byte.

// Packs token type t and the low 8 bits of l into a single UChar.
// Both arguments are parenthesized so that expression arguments such as
// (a | b) are masked as a whole.
#define PACK_TOKEN_AND_LENGTH(t, l) ((UChar) (((t) << 8) | ((l) & 0xFF)))

// Extracts the token type (bits 8-14), ignoring the continuation flag.
#define UNPACK_TOKEN(c) ((AffixPattern::ETokenType) (((c) >> 8) & 0x7F))

// Non-zero if the continuation flag (bit 15) is set.
#define UNPACK_LONG(c) (((c) >> 8) & 0x80)

// Extracts the length byte (low 8 bits).
#define UNPACK_LENGTH(c) ((c) & 0xFF)
35 
36 U_NAMESPACE_BEGIN
37 
38 static int32_t
nextToken(const UChar * buffer,int32_t idx,int32_t len,UChar * token)39 nextToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) {
40     if (buffer[idx] != 0x27 || idx + 1 == len) {
41         *token = buffer[idx];
42         return 1;
43     }
44     *token = buffer[idx + 1];
45     if (buffer[idx + 1] == 0xA4) {
46         int32_t i = 2;
47         for (; idx + i < len && i < 4 && buffer[idx + i] == buffer[idx + 1]; ++i);
48         return i;
49     }
50     return 2;
51 }
52 
53 static int32_t
nextUserToken(const UChar * buffer,int32_t idx,int32_t len,UChar * token)54 nextUserToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) {
55     *token = buffer[idx];
56     int32_t max;
57     switch (buffer[idx]) {
58     case 0x27:
59         max = 2;
60         break;
61     case 0xA4:
62         max = 3;
63         break;
64     default:
65         max = 1;
66         break;
67     }
68     int32_t i = 1;
69     for (; idx + i < len && i < max && buffer[idx + i] == buffer[idx]; ++i);
70     return i;
71 }
72 
CurrencyAffixInfo()73 CurrencyAffixInfo::CurrencyAffixInfo()
74         : fSymbol(gDefaultSymbols, 1),
75           fISO(gDefaultSymbols, 2),
76           fLong(DigitAffix(gDefaultSymbols, 3)),
77           fIsDefault(TRUE) {
78 }
79 
80 void
set(const char * locale,const PluralRules * rules,const UChar * currency,UErrorCode & status)81 CurrencyAffixInfo::set(
82         const char *locale,
83         const PluralRules *rules,
84         const UChar *currency,
85         UErrorCode &status) {
86     if (U_FAILURE(status)) {
87         return;
88     }
89     fIsDefault = FALSE;
90     if (currency == NULL) {
91         fSymbol.setTo(gDefaultSymbols, 1);
92         fISO.setTo(gDefaultSymbols, 2);
93         fLong.remove();
94         fLong.append(gDefaultSymbols, 3);
95         fIsDefault = TRUE;
96         return;
97     }
98     int32_t len;
99     UBool unusedIsChoice;
100     const UChar *symbol = ucurr_getName(
101             currency, locale, UCURR_SYMBOL_NAME, &unusedIsChoice,
102             &len, &status);
103     if (U_FAILURE(status)) {
104         return;
105     }
106     fSymbol.setTo(symbol, len);
107     fISO.setTo(currency, u_strlen(currency));
108     fLong.remove();
109     StringEnumeration* keywords = rules->getKeywords(status);
110     if (U_FAILURE(status)) {
111         return;
112     }
113     const UnicodeString* pluralCount;
114     while ((pluralCount = keywords->snext(status)) != NULL) {
115         CharString pCount;
116         pCount.appendInvariantChars(*pluralCount, status);
117         const UChar *pluralName = ucurr_getPluralName(
118             currency, locale, &unusedIsChoice, pCount.data(),
119             &len, &status);
120         fLong.setVariant(pCount.data(), UnicodeString(pluralName, len), status);
121     }
122     delete keywords;
123 }
124 
125 void
adjustPrecision(const UChar * currency,const UCurrencyUsage usage,FixedPrecision & precision,UErrorCode & status)126 CurrencyAffixInfo::adjustPrecision(
127         const UChar *currency, const UCurrencyUsage usage,
128         FixedPrecision &precision, UErrorCode &status) {
129     if (U_FAILURE(status)) {
130         return;
131     }
132 
133     int32_t digitCount = ucurr_getDefaultFractionDigitsForUsage(
134             currency, usage, &status);
135     precision.fMin.setFracDigitCount(digitCount);
136     precision.fMax.setFracDigitCount(digitCount);
137     double increment = ucurr_getRoundingIncrementForUsage(
138             currency, usage, &status);
139     if (increment == 0.0) {
140         precision.fRoundingIncrement.clear();
141     } else {
142         precision.fRoundingIncrement.set(increment);
143         // guard against round-off error
144         precision.fRoundingIncrement.round(6);
145     }
146 }
147 
148 void
addLiteral(const UChar * literal,int32_t start,int32_t len)149 AffixPattern::addLiteral(
150         const UChar *literal, int32_t start, int32_t len) {
151     char32Count += u_countChar32(literal + start, len);
152     literals.append(literal, start, len);
153     int32_t tlen = tokens.length();
154     // Takes 4 UChars to encode maximum literal length.
155     UChar *tokenChars = tokens.getBuffer(tlen + 4);
156 
157     // find start of literal size. May be tlen if there is no literal.
158     // While finding start of literal size, compute literal length
159     int32_t literalLength = 0;
160     int32_t tLiteralStart = tlen;
161     while (tLiteralStart > 0 && UNPACK_TOKEN(tokenChars[tLiteralStart - 1]) == kLiteral) {
162         tLiteralStart--;
163         literalLength <<= 8;
164         literalLength |= UNPACK_LENGTH(tokenChars[tLiteralStart]);
165     }
166     // Add number of chars we just added to literal
167     literalLength += len;
168 
169     // Now encode the new length starting at tLiteralStart
170     tlen = tLiteralStart;
171     tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral, literalLength & 0xFF);
172     literalLength >>= 8;
173     while (literalLength) {
174         tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral | 0x80, literalLength & 0xFF);
175         literalLength >>= 8;
176     }
177     tokens.releaseBuffer(tlen);
178 }
179 
180 void
add(ETokenType t)181 AffixPattern::add(ETokenType t) {
182     add(t, 1);
183 }
184 
185 void
addCurrency(uint8_t count)186 AffixPattern::addCurrency(uint8_t count) {
187     add(kCurrency, count);
188 }
189 
190 void
add(ETokenType t,uint8_t count)191 AffixPattern::add(ETokenType t, uint8_t count) {
192     U_ASSERT(t != kLiteral);
193     char32Count += count;
194     switch (t) {
195     case kCurrency:
196         hasCurrencyToken = TRUE;
197         break;
198     case kPercent:
199         hasPercentToken = TRUE;
200         break;
201     case kPerMill:
202         hasPermillToken = TRUE;
203         break;
204     default:
205         // Do nothing
206         break;
207     }
208     tokens.append(PACK_TOKEN_AND_LENGTH(t, count));
209 }
210 
211 AffixPattern &
append(const AffixPattern & other)212 AffixPattern::append(const AffixPattern &other) {
213     AffixPatternIterator iter;
214     other.iterator(iter);
215     UnicodeString literal;
216     while (iter.nextToken()) {
217         switch (iter.getTokenType()) {
218         case kLiteral:
219             iter.getLiteral(literal);
220             addLiteral(literal.getBuffer(), 0, literal.length());
221             break;
222         case kCurrency:
223             addCurrency(iter.getTokenLength());
224             break;
225         default:
226             add(iter.getTokenType());
227             break;
228         }
229     }
230     return *this;
231 }
232 
233 void
remove()234 AffixPattern::remove() {
235     tokens.remove();
236     literals.remove();
237     hasCurrencyToken = FALSE;
238     hasPercentToken = FALSE;
239     hasPermillToken = FALSE;
240     char32Count = 0;
241 }
242 
243 // escapes literals for strings where special characters are NOT escaped
244 // except for apostrophe.
escapeApostropheInLiteral(const UnicodeString & literal,UnicodeStringAppender & appender)245 static void escapeApostropheInLiteral(
246         const UnicodeString &literal, UnicodeStringAppender &appender) {
247     int32_t len = literal.length();
248     const UChar *buffer = literal.getBuffer();
249     for (int32_t i = 0; i < len; ++i) {
250         UChar ch = buffer[i];
251         switch (ch) {
252             case 0x27:
253                 appender.append((UChar) 0x27);
254                 appender.append((UChar) 0x27);
255                 break;
256             default:
257                 appender.append(ch);
258                 break;
259         }
260     }
261 }
262 
263 
264 // escapes literals for user strings where special characters in literals
265 // are escaped with apostrophe.
escapeLiteral(const UnicodeString & literal,UnicodeStringAppender & appender)266 static void escapeLiteral(
267         const UnicodeString &literal, UnicodeStringAppender &appender) {
268     int32_t len = literal.length();
269     const UChar *buffer = literal.getBuffer();
270     for (int32_t i = 0; i < len; ++i) {
271         UChar ch = buffer[i];
272         switch (ch) {
273             case 0x27:
274                 appender.append((UChar) 0x27);
275                 appender.append((UChar) 0x27);
276                 break;
277             case 0x25:
278                 appender.append((UChar) 0x27);
279                 appender.append((UChar) 0x25);
280                 appender.append((UChar) 0x27);
281                 break;
282             case 0x2030:
283                 appender.append((UChar) 0x27);
284                 appender.append((UChar) 0x2030);
285                 appender.append((UChar) 0x27);
286                 break;
287             case 0xA4:
288                 appender.append((UChar) 0x27);
289                 appender.append((UChar) 0xA4);
290                 appender.append((UChar) 0x27);
291                 break;
292             case 0x2D:
293                 appender.append((UChar) 0x27);
294                 appender.append((UChar) 0x2D);
295                 appender.append((UChar) 0x27);
296                 break;
297             case 0x2B:
298                 appender.append((UChar) 0x27);
299                 appender.append((UChar) 0x2B);
300                 appender.append((UChar) 0x27);
301                 break;
302             default:
303                 appender.append(ch);
304                 break;
305         }
306     }
307 }
308 
309 UnicodeString &
toString(UnicodeString & appendTo) const310 AffixPattern::toString(UnicodeString &appendTo) const {
311     AffixPatternIterator iter;
312     iterator(iter);
313     UnicodeStringAppender appender(appendTo);
314     UnicodeString literal;
315     while (iter.nextToken()) {
316         switch (iter.getTokenType()) {
317         case kLiteral:
318             escapeApostropheInLiteral(iter.getLiteral(literal), appender);
319             break;
320         case kPercent:
321             appender.append((UChar) 0x27);
322             appender.append((UChar) 0x25);
323             break;
324         case kPerMill:
325             appender.append((UChar) 0x27);
326             appender.append((UChar) 0x2030);
327             break;
328         case kCurrency:
329             {
330                 appender.append((UChar) 0x27);
331                 int32_t cl = iter.getTokenLength();
332                 for (int32_t i = 0; i < cl; ++i) {
333                     appender.append((UChar) 0xA4);
334                 }
335             }
336             break;
337         case kNegative:
338             appender.append((UChar) 0x27);
339             appender.append((UChar) 0x2D);
340             break;
341         case kPositive:
342             appender.append((UChar) 0x27);
343             appender.append((UChar) 0x2B);
344             break;
345         default:
346             U_ASSERT(FALSE);
347             break;
348         }
349     }
350     return appendTo;
351 }
352 
353 UnicodeString &
toUserString(UnicodeString & appendTo) const354 AffixPattern::toUserString(UnicodeString &appendTo) const {
355     AffixPatternIterator iter;
356     iterator(iter);
357     UnicodeStringAppender appender(appendTo);
358     UnicodeString literal;
359     while (iter.nextToken()) {
360         switch (iter.getTokenType()) {
361         case kLiteral:
362             escapeLiteral(iter.getLiteral(literal), appender);
363             break;
364         case kPercent:
365             appender.append((UChar) 0x25);
366             break;
367         case kPerMill:
368             appender.append((UChar) 0x2030);
369             break;
370         case kCurrency:
371             {
372                 int32_t cl = iter.getTokenLength();
373                 for (int32_t i = 0; i < cl; ++i) {
374                     appender.append((UChar) 0xA4);
375                 }
376             }
377             break;
378         case kNegative:
379             appender.append((UChar) 0x2D);
380             break;
381         case kPositive:
382             appender.append((UChar) 0x2B);
383             break;
384         default:
385             U_ASSERT(FALSE);
386             break;
387         }
388     }
389     return appendTo;
390 }
391 
392 class AffixPatternAppender : public UMemory {
393 public:
AffixPatternAppender(AffixPattern & dest)394     AffixPatternAppender(AffixPattern &dest) : fDest(&dest), fIdx(0) { }
395 
append(UChar x)396     inline void append(UChar x) {
397         if (fIdx == UPRV_LENGTHOF(fBuffer)) {
398             fDest->addLiteral(fBuffer, 0, fIdx);
399             fIdx = 0;
400         }
401         fBuffer[fIdx++] = x;
402     }
403 
append(UChar32 x)404     inline void append(UChar32 x) {
405         if (fIdx >= UPRV_LENGTHOF(fBuffer) - 1) {
406             fDest->addLiteral(fBuffer, 0, fIdx);
407             fIdx = 0;
408         }
409         U16_APPEND_UNSAFE(fBuffer, fIdx, x);
410     }
411 
flush()412     inline void flush() {
413         if (fIdx) {
414             fDest->addLiteral(fBuffer, 0, fIdx);
415         }
416         fIdx = 0;
417     }
418 
419     /**
420      * flush the buffer when we go out of scope.
421      */
~AffixPatternAppender()422     ~AffixPatternAppender() {
423         flush();
424     }
425 private:
426     AffixPattern *fDest;
427     int32_t fIdx;
428     UChar fBuffer[32];
429     AffixPatternAppender(const AffixPatternAppender &other);
430     AffixPatternAppender &operator=(const AffixPatternAppender &other);
431 };
432 
433 
434 AffixPattern &
parseUserAffixString(const UnicodeString & affixStr,AffixPattern & appendTo,UErrorCode & status)435 AffixPattern::parseUserAffixString(
436         const UnicodeString &affixStr,
437         AffixPattern &appendTo,
438         UErrorCode &status) {
439     if (U_FAILURE(status)) {
440         return appendTo;
441     }
442     int32_t len = affixStr.length();
443     const UChar *buffer = affixStr.getBuffer();
444     // 0 = not quoted; 1 = quoted.
445     int32_t state = 0;
446     AffixPatternAppender appender(appendTo);
447     for (int32_t i = 0; i < len; ) {
448         UChar token;
449         int32_t tokenSize = nextUserToken(buffer, i, len, &token);
450         i += tokenSize;
451         if (token == 0x27 && tokenSize == 1) { // quote
452             state = 1 - state;
453             continue;
454         }
455         if (state == 0) {
456             switch (token) {
457             case 0x25:
458                 appender.flush();
459                 appendTo.add(kPercent, 1);
460                 break;
461             case 0x27:  // double quote
462                 appender.append((UChar) 0x27);
463                 break;
464             case 0x2030:
465                 appender.flush();
466                 appendTo.add(kPerMill, 1);
467                 break;
468             case 0x2D:
469                 appender.flush();
470                 appendTo.add(kNegative, 1);
471                 break;
472             case 0x2B:
473                 appender.flush();
474                 appendTo.add(kPositive, 1);
475                 break;
476             case 0xA4:
477                 appender.flush();
478                 appendTo.add(kCurrency, tokenSize);
479                 break;
480             default:
481                 appender.append(token);
482                 break;
483             }
484         } else {
485             switch (token) {
486             case 0x27:  // double quote
487                 appender.append((UChar) 0x27);
488                 break;
489             case 0xA4: // included b/c tokenSize can be > 1
490                 for (int32_t j = 0; j < tokenSize; ++j) {
491                     appender.append((UChar) 0xA4);
492                 }
493                 break;
494             default:
495                 appender.append(token);
496                 break;
497             }
498         }
499     }
500     return appendTo;
501 }
502 
503 AffixPattern &
parseAffixString(const UnicodeString & affixStr,AffixPattern & appendTo,UErrorCode & status)504 AffixPattern::parseAffixString(
505         const UnicodeString &affixStr,
506         AffixPattern &appendTo,
507         UErrorCode &status) {
508     if (U_FAILURE(status)) {
509         return appendTo;
510     }
511     int32_t len = affixStr.length();
512     const UChar *buffer = affixStr.getBuffer();
513     for (int32_t i = 0; i < len; ) {
514         UChar token;
515         int32_t tokenSize = nextToken(buffer, i, len, &token);
516         if (tokenSize == 1) {
517             int32_t literalStart = i;
518             ++i;
519             while (i < len && (tokenSize = nextToken(buffer, i, len, &token)) == 1) {
520                 ++i;
521             }
522             appendTo.addLiteral(buffer, literalStart, i - literalStart);
523 
524             // If we reached end of string, we are done
525             if (i == len) {
526                 return appendTo;
527             }
528         }
529         i += tokenSize;
530         switch (token) {
531         case 0x25:
532             appendTo.add(kPercent, 1);
533             break;
534         case 0x2030:
535             appendTo.add(kPerMill, 1);
536             break;
537         case 0x2D:
538             appendTo.add(kNegative, 1);
539             break;
540         case 0x2B:
541             appendTo.add(kPositive, 1);
542             break;
543         case 0xA4:
544             {
545                 if (tokenSize - 1 > 3) {
546                     status = U_PARSE_ERROR;
547                     return appendTo;
548                 }
549                 appendTo.add(kCurrency, tokenSize - 1);
550             }
551             break;
552         default:
553             appendTo.addLiteral(&token, 0, 1);
554             break;
555         }
556     }
557     return appendTo;
558 }
559 
560 AffixPatternIterator &
iterator(AffixPatternIterator & result) const561 AffixPattern::iterator(AffixPatternIterator &result) const {
562     result.nextLiteralIndex = 0;
563     result.lastLiteralLength = 0;
564     result.nextTokenIndex = 0;
565     result.tokens = &tokens;
566     result.literals = &literals;
567     return result;
568 }
569 
570 UBool
nextToken()571 AffixPatternIterator::nextToken() {
572     int32_t tlen = tokens->length();
573     if (nextTokenIndex == tlen) {
574         return FALSE;
575     }
576     ++nextTokenIndex;
577     const UChar *tokenBuffer = tokens->getBuffer();
578     if (UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]) ==
579             AffixPattern::kLiteral) {
580         while (nextTokenIndex < tlen &&
581                 UNPACK_LONG(tokenBuffer[nextTokenIndex])) {
582             ++nextTokenIndex;
583         }
584         lastLiteralLength = 0;
585         int32_t i = nextTokenIndex - 1;
586         for (; UNPACK_LONG(tokenBuffer[i]); --i) {
587             lastLiteralLength <<= 8;
588             lastLiteralLength |= UNPACK_LENGTH(tokenBuffer[i]);
589         }
590         lastLiteralLength <<= 8;
591         lastLiteralLength |= UNPACK_LENGTH(tokenBuffer[i]);
592         nextLiteralIndex += lastLiteralLength;
593     }
594     return TRUE;
595 }
596 
597 AffixPattern::ETokenType
getTokenType() const598 AffixPatternIterator::getTokenType() const {
599     return UNPACK_TOKEN(tokens->charAt(nextTokenIndex - 1));
600 }
601 
602 UnicodeString &
getLiteral(UnicodeString & result) const603 AffixPatternIterator::getLiteral(UnicodeString &result) const {
604     const UChar *buffer = literals->getBuffer();
605     result.setTo(buffer + (nextLiteralIndex - lastLiteralLength), lastLiteralLength);
606     return result;
607 }
608 
609 int32_t
getTokenLength() const610 AffixPatternIterator::getTokenLength() const {
611     const UChar *tokenBuffer = tokens->getBuffer();
612     AffixPattern::ETokenType type = UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]);
613     return type == AffixPattern::kLiteral ? lastLiteralLength : UNPACK_LENGTH(tokenBuffer[nextTokenIndex - 1]);
614 }
615 
AffixPatternParser()616 AffixPatternParser::AffixPatternParser()
617         : fPercent(gPercent), fPermill(gPerMill), fNegative(gNegative), fPositive(gPositive) {
618 }
619 
AffixPatternParser(const DecimalFormatSymbols & symbols)620 AffixPatternParser::AffixPatternParser(
621         const DecimalFormatSymbols &symbols) {
622     setDecimalFormatSymbols(symbols);
623 }
624 
625 void
setDecimalFormatSymbols(const DecimalFormatSymbols & symbols)626 AffixPatternParser::setDecimalFormatSymbols(
627         const DecimalFormatSymbols &symbols) {
628     fPercent = symbols.getConstSymbol(DecimalFormatSymbols::kPercentSymbol);
629     fPermill = symbols.getConstSymbol(DecimalFormatSymbols::kPerMillSymbol);
630     fNegative = symbols.getConstSymbol(DecimalFormatSymbols::kMinusSignSymbol);
631     fPositive = symbols.getConstSymbol(DecimalFormatSymbols::kPlusSignSymbol);
632 }
633 
634 PluralAffix &
parse(const AffixPattern & affixPattern,const CurrencyAffixInfo & currencyAffixInfo,PluralAffix & appendTo,UErrorCode & status) const635 AffixPatternParser::parse(
636         const AffixPattern &affixPattern,
637         const CurrencyAffixInfo &currencyAffixInfo,
638         PluralAffix &appendTo,
639         UErrorCode &status) const {
640     if (U_FAILURE(status)) {
641         return appendTo;
642     }
643     AffixPatternIterator iter;
644     affixPattern.iterator(iter);
645     UnicodeString literal;
646     while (iter.nextToken()) {
647         switch (iter.getTokenType()) {
648         case AffixPattern::kPercent:
649             appendTo.append(fPercent, UNUM_PERCENT_FIELD);
650             break;
651         case AffixPattern::kPerMill:
652             appendTo.append(fPermill, UNUM_PERMILL_FIELD);
653             break;
654         case AffixPattern::kNegative:
655             appendTo.append(fNegative, UNUM_SIGN_FIELD);
656             break;
657         case AffixPattern::kPositive:
658             appendTo.append(fPositive, UNUM_SIGN_FIELD);
659             break;
660         case AffixPattern::kCurrency:
661             switch (iter.getTokenLength()) {
662                 case 1:
663                     appendTo.append(
664                             currencyAffixInfo.getSymbol(), UNUM_CURRENCY_FIELD);
665                     break;
666                 case 2:
667                     appendTo.append(
668                             currencyAffixInfo.getISO(), UNUM_CURRENCY_FIELD);
669                     break;
670                 case 3:
671                     appendTo.append(
672                             currencyAffixInfo.getLong(), UNUM_CURRENCY_FIELD, status);
673                     break;
674                 default:
675                     U_ASSERT(FALSE);
676                     break;
677             }
678             break;
679         case AffixPattern::kLiteral:
680             appendTo.append(iter.getLiteral(literal));
681             break;
682         default:
683             U_ASSERT(FALSE);
684             break;
685         }
686     }
687     return appendTo;
688 }
689 
690 
691 U_NAMESPACE_END
692 #endif /* #if !UCONFIG_NO_FORMATTING */
693