1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  * Copyright (C) 2015, International Business Machines
5  * Corporation and others.  All Rights Reserved.
6  *
7  * file name: affixpatternparser.cpp
8  */
9 
10 #include "unicode/utypes.h"
11 
12 #if !UCONFIG_NO_FORMATTING
13 
14 #include "unicode/dcfmtsym.h"
15 #include "unicode/plurrule.h"
16 #include "unicode/ucurr.h"
17 #include "affixpatternparser.h"
18 #include "charstr.h"
19 #include "precision.h"
20 #include "uassert.h"
21 #include "unistrappender.h"
22 
23         static UChar gDefaultSymbols[] = {0xa4, 0xa4, 0xa4};
24 
25 static UChar gPercent = 0x25;
26 static UChar gPerMill = 0x2030;
27 static UChar gNegative = 0x2D;
28 static UChar gPositive = 0x2B;
29 
// Token encoding helpers. Every token occupies one 16-bit unit:
//   bits 8-14 : token type
//   bit  15   : "long" flag, set on the continuation units of a literal length
//   bits 0-7  : a count, or one byte of a literal length

// Packs token type t and the low 8 bits of l into a single unit.
// Both arguments are fully parenthesized so that expressions such as
// (a | b) are masked as a whole.
#define PACK_TOKEN_AND_LENGTH(t, l) ((UChar) (((t) << 8) | ((l) & 0xFF)))

// Extracts the token type, ignoring the "long" flag.
#define UNPACK_TOKEN(c) ((AffixPattern::ETokenType) (((c) >> 8) & 0x7F))

// Non-zero when the "long" flag of the unit is set.
#define UNPACK_LONG(c) (((c) >> 8) & 0x80)

// Extracts the low 8 bits (count or length byte) of the unit.
#define UNPACK_LENGTH(c) ((c) & 0xFF)
37 
38 U_NAMESPACE_BEGIN
39 
40 static int32_t
nextToken(const UChar * buffer,int32_t idx,int32_t len,UChar * token)41 nextToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) {
42     if (buffer[idx] != 0x27 || idx + 1 == len) {
43         *token = buffer[idx];
44         return 1;
45     }
46     *token = buffer[idx + 1];
47     if (buffer[idx + 1] == 0xA4) {
48         int32_t i = 2;
49         for (; idx + i < len && i < 4 && buffer[idx + i] == buffer[idx + 1]; ++i)
50           ;
51         return i;
52     }
53     return 2;
54 }
55 
56 static int32_t
nextUserToken(const UChar * buffer,int32_t idx,int32_t len,UChar * token)57 nextUserToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) {
58     *token = buffer[idx];
59     int32_t max;
60     switch (buffer[idx]) {
61     case 0x27:
62         max = 2;
63         break;
64     case 0xA4:
65         max = 3;
66         break;
67     default:
68         max = 1;
69         break;
70     }
71     int32_t i = 1;
72     for (; idx + i < len && i < max && buffer[idx + i] == buffer[idx]; ++i)
73       ;
74     return i;
75 }
76 
CurrencyAffixInfo()77 CurrencyAffixInfo::CurrencyAffixInfo()
78         : fSymbol(gDefaultSymbols, 1),
79           fISO(gDefaultSymbols, 2),
80           fLong(DigitAffix(gDefaultSymbols, 3)),
81           fIsDefault(TRUE) {
82 }
83 
84 void
set(const char * locale,const PluralRules * rules,const UChar * currency,UErrorCode & status)85 CurrencyAffixInfo::set(
86         const char *locale,
87         const PluralRules *rules,
88         const UChar *currency,
89         UErrorCode &status) {
90     if (U_FAILURE(status)) {
91         return;
92     }
93     fIsDefault = FALSE;
94     if (currency == NULL) {
95         fSymbol.setTo(gDefaultSymbols, 1);
96         fISO.setTo(gDefaultSymbols, 2);
97         fLong.remove();
98         fLong.append(gDefaultSymbols, 3);
99         fIsDefault = TRUE;
100         return;
101     }
102     int32_t len;
103     UBool unusedIsChoice;
104     const UChar *symbol = ucurr_getName(
105             currency, locale, UCURR_SYMBOL_NAME, &unusedIsChoice,
106             &len, &status);
107     if (U_FAILURE(status)) {
108         return;
109     }
110     fSymbol.setTo(symbol, len);
111     fISO.setTo(currency, u_strlen(currency));
112     fLong.remove();
113     StringEnumeration* keywords = rules->getKeywords(status);
114     if (U_FAILURE(status)) {
115         return;
116     }
117     const UnicodeString* pluralCount;
118     while ((pluralCount = keywords->snext(status)) != NULL) {
119         CharString pCount;
120         pCount.appendInvariantChars(*pluralCount, status);
121         const UChar *pluralName = ucurr_getPluralName(
122             currency, locale, &unusedIsChoice, pCount.data(),
123             &len, &status);
124         fLong.setVariant(pCount.data(), UnicodeString(pluralName, len), status);
125     }
126     delete keywords;
127 }
128 
129 void
adjustPrecision(const UChar * currency,const UCurrencyUsage usage,FixedPrecision & precision,UErrorCode & status)130 CurrencyAffixInfo::adjustPrecision(
131         const UChar *currency, const UCurrencyUsage usage,
132         FixedPrecision &precision, UErrorCode &status) {
133     if (U_FAILURE(status)) {
134         return;
135     }
136 
137     int32_t digitCount = ucurr_getDefaultFractionDigitsForUsage(
138             currency, usage, &status);
139     precision.fMin.setFracDigitCount(digitCount);
140     precision.fMax.setFracDigitCount(digitCount);
141     double increment = ucurr_getRoundingIncrementForUsage(
142             currency, usage, &status);
143     if (increment == 0.0) {
144         precision.fRoundingIncrement.clear();
145     } else {
146         precision.fRoundingIncrement.set(increment);
147         // guard against round-off error
148         precision.fRoundingIncrement.round(6);
149     }
150 }
151 
152 void
addLiteral(const UChar * literal,int32_t start,int32_t len)153 AffixPattern::addLiteral(
154         const UChar *literal, int32_t start, int32_t len) {
155     char32Count += u_countChar32(literal + start, len);
156     literals.append(literal, start, len);
157     int32_t tlen = tokens.length();
158     // Takes 4 UChars to encode maximum literal length.
159     UChar *tokenChars = tokens.getBuffer(tlen + 4);
160 
161     // find start of literal size. May be tlen if there is no literal.
162     // While finding start of literal size, compute literal length
163     int32_t literalLength = 0;
164     int32_t tLiteralStart = tlen;
165     while (tLiteralStart > 0 && UNPACK_TOKEN(tokenChars[tLiteralStart - 1]) == kLiteral) {
166         tLiteralStart--;
167         literalLength <<= 8;
168         literalLength |= UNPACK_LENGTH(tokenChars[tLiteralStart]);
169     }
170     // Add number of chars we just added to literal
171     literalLength += len;
172 
173     // Now encode the new length starting at tLiteralStart
174     tlen = tLiteralStart;
175     tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral, literalLength & 0xFF);
176     literalLength >>= 8;
177     while (literalLength) {
178         tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral | 0x80, literalLength & 0xFF);
179         literalLength >>= 8;
180     }
181     tokens.releaseBuffer(tlen);
182 }
183 
184 void
add(ETokenType t)185 AffixPattern::add(ETokenType t) {
186     add(t, 1);
187 }
188 
189 void
addCurrency(uint8_t count)190 AffixPattern::addCurrency(uint8_t count) {
191     add(kCurrency, count);
192 }
193 
194 void
add(ETokenType t,uint8_t count)195 AffixPattern::add(ETokenType t, uint8_t count) {
196     U_ASSERT(t != kLiteral);
197     char32Count += count;
198     switch (t) {
199     case kCurrency:
200         hasCurrencyToken = TRUE;
201         break;
202     case kPercent:
203         hasPercentToken = TRUE;
204         break;
205     case kPerMill:
206         hasPermillToken = TRUE;
207         break;
208     default:
209         // Do nothing
210         break;
211     }
212     tokens.append(PACK_TOKEN_AND_LENGTH(t, count));
213 }
214 
215 AffixPattern &
append(const AffixPattern & other)216 AffixPattern::append(const AffixPattern &other) {
217     AffixPatternIterator iter;
218     other.iterator(iter);
219     UnicodeString literal;
220     while (iter.nextToken()) {
221         switch (iter.getTokenType()) {
222         case kLiteral:
223             iter.getLiteral(literal);
224             addLiteral(literal.getBuffer(), 0, literal.length());
225             break;
226         case kCurrency:
227             addCurrency(iter.getTokenLength());
228             break;
229         default:
230             add(iter.getTokenType());
231             break;
232         }
233     }
234     return *this;
235 }
236 
237 void
remove()238 AffixPattern::remove() {
239     tokens.remove();
240     literals.remove();
241     hasCurrencyToken = FALSE;
242     hasPercentToken = FALSE;
243     hasPermillToken = FALSE;
244     char32Count = 0;
245 }
246 
247 // escapes literals for strings where special characters are NOT escaped
248 // except for apostrophe.
escapeApostropheInLiteral(const UnicodeString & literal,UnicodeStringAppender & appender)249 static void escapeApostropheInLiteral(
250         const UnicodeString &literal, UnicodeStringAppender &appender) {
251     int32_t len = literal.length();
252     const UChar *buffer = literal.getBuffer();
253     for (int32_t i = 0; i < len; ++i) {
254         UChar ch = buffer[i];
255         switch (ch) {
256             case 0x27:
257                 appender.append((UChar) 0x27);
258                 appender.append((UChar) 0x27);
259                 break;
260             default:
261                 appender.append(ch);
262                 break;
263         }
264     }
265 }
266 
267 
268 // escapes literals for user strings where special characters in literals
269 // are escaped with apostrophe.
escapeLiteral(const UnicodeString & literal,UnicodeStringAppender & appender)270 static void escapeLiteral(
271         const UnicodeString &literal, UnicodeStringAppender &appender) {
272     int32_t len = literal.length();
273     const UChar *buffer = literal.getBuffer();
274     for (int32_t i = 0; i < len; ++i) {
275         UChar ch = buffer[i];
276         switch (ch) {
277             case 0x27:
278                 appender.append((UChar) 0x27);
279                 appender.append((UChar) 0x27);
280                 break;
281             case 0x25:
282                 appender.append((UChar) 0x27);
283                 appender.append((UChar) 0x25);
284                 appender.append((UChar) 0x27);
285                 break;
286             case 0x2030:
287                 appender.append((UChar) 0x27);
288                 appender.append((UChar) 0x2030);
289                 appender.append((UChar) 0x27);
290                 break;
291             case 0xA4:
292                 appender.append((UChar) 0x27);
293                 appender.append((UChar) 0xA4);
294                 appender.append((UChar) 0x27);
295                 break;
296             case 0x2D:
297                 appender.append((UChar) 0x27);
298                 appender.append((UChar) 0x2D);
299                 appender.append((UChar) 0x27);
300                 break;
301             case 0x2B:
302                 appender.append((UChar) 0x27);
303                 appender.append((UChar) 0x2B);
304                 appender.append((UChar) 0x27);
305                 break;
306             default:
307                 appender.append(ch);
308                 break;
309         }
310     }
311 }
312 
313 UnicodeString &
toString(UnicodeString & appendTo) const314 AffixPattern::toString(UnicodeString &appendTo) const {
315     AffixPatternIterator iter;
316     iterator(iter);
317     UnicodeStringAppender appender(appendTo);
318     UnicodeString literal;
319     while (iter.nextToken()) {
320         switch (iter.getTokenType()) {
321         case kLiteral:
322             escapeApostropheInLiteral(iter.getLiteral(literal), appender);
323             break;
324         case kPercent:
325             appender.append((UChar) 0x27);
326             appender.append((UChar) 0x25);
327             break;
328         case kPerMill:
329             appender.append((UChar) 0x27);
330             appender.append((UChar) 0x2030);
331             break;
332         case kCurrency:
333             {
334                 appender.append((UChar) 0x27);
335                 int32_t cl = iter.getTokenLength();
336                 for (int32_t i = 0; i < cl; ++i) {
337                     appender.append((UChar) 0xA4);
338                 }
339             }
340             break;
341         case kNegative:
342             appender.append((UChar) 0x27);
343             appender.append((UChar) 0x2D);
344             break;
345         case kPositive:
346             appender.append((UChar) 0x27);
347             appender.append((UChar) 0x2B);
348             break;
349         default:
350             U_ASSERT(FALSE);
351             break;
352         }
353     }
354     return appendTo;
355 }
356 
357 UnicodeString &
toUserString(UnicodeString & appendTo) const358 AffixPattern::toUserString(UnicodeString &appendTo) const {
359     AffixPatternIterator iter;
360     iterator(iter);
361     UnicodeStringAppender appender(appendTo);
362     UnicodeString literal;
363     while (iter.nextToken()) {
364         switch (iter.getTokenType()) {
365         case kLiteral:
366             escapeLiteral(iter.getLiteral(literal), appender);
367             break;
368         case kPercent:
369             appender.append((UChar) 0x25);
370             break;
371         case kPerMill:
372             appender.append((UChar) 0x2030);
373             break;
374         case kCurrency:
375             {
376                 int32_t cl = iter.getTokenLength();
377                 for (int32_t i = 0; i < cl; ++i) {
378                     appender.append((UChar) 0xA4);
379                 }
380             }
381             break;
382         case kNegative:
383             appender.append((UChar) 0x2D);
384             break;
385         case kPositive:
386             appender.append((UChar) 0x2B);
387             break;
388         default:
389             U_ASSERT(FALSE);
390             break;
391         }
392     }
393     return appendTo;
394 }
395 
396 class AffixPatternAppender : public UMemory {
397 public:
AffixPatternAppender(AffixPattern & dest)398     AffixPatternAppender(AffixPattern &dest) : fDest(&dest), fIdx(0) { }
399 
append(UChar x)400     inline void append(UChar x) {
401         if (fIdx == UPRV_LENGTHOF(fBuffer)) {
402             fDest->addLiteral(fBuffer, 0, fIdx);
403             fIdx = 0;
404         }
405         fBuffer[fIdx++] = x;
406     }
407 
append(UChar32 x)408     inline void append(UChar32 x) {
409         if (fIdx >= UPRV_LENGTHOF(fBuffer) - 1) {
410             fDest->addLiteral(fBuffer, 0, fIdx);
411             fIdx = 0;
412         }
413         U16_APPEND_UNSAFE(fBuffer, fIdx, x);
414     }
415 
flush()416     inline void flush() {
417         if (fIdx) {
418             fDest->addLiteral(fBuffer, 0, fIdx);
419         }
420         fIdx = 0;
421     }
422 
423     /**
424      * flush the buffer when we go out of scope.
425      */
~AffixPatternAppender()426     ~AffixPatternAppender() {
427         flush();
428     }
429 private:
430     AffixPattern *fDest;
431     int32_t fIdx;
432     UChar fBuffer[32];
433     AffixPatternAppender(const AffixPatternAppender &other);
434     AffixPatternAppender &operator=(const AffixPatternAppender &other);
435 };
436 
437 
438 AffixPattern &
parseUserAffixString(const UnicodeString & affixStr,AffixPattern & appendTo,UErrorCode & status)439 AffixPattern::parseUserAffixString(
440         const UnicodeString &affixStr,
441         AffixPattern &appendTo,
442         UErrorCode &status) {
443     if (U_FAILURE(status)) {
444         return appendTo;
445     }
446     int32_t len = affixStr.length();
447     const UChar *buffer = affixStr.getBuffer();
448     // 0 = not quoted; 1 = quoted.
449     int32_t state = 0;
450     AffixPatternAppender appender(appendTo);
451     for (int32_t i = 0; i < len; ) {
452         UChar token;
453         int32_t tokenSize = nextUserToken(buffer, i, len, &token);
454         i += tokenSize;
455         if (token == 0x27 && tokenSize == 1) { // quote
456             state = 1 - state;
457             continue;
458         }
459         if (state == 0) {
460             switch (token) {
461             case 0x25:
462                 appender.flush();
463                 appendTo.add(kPercent, 1);
464                 break;
465             case 0x27:  // double quote
466                 appender.append((UChar) 0x27);
467                 break;
468             case 0x2030:
469                 appender.flush();
470                 appendTo.add(kPerMill, 1);
471                 break;
472             case 0x2D:
473                 appender.flush();
474                 appendTo.add(kNegative, 1);
475                 break;
476             case 0x2B:
477                 appender.flush();
478                 appendTo.add(kPositive, 1);
479                 break;
480             case 0xA4:
481                 appender.flush();
482                 appendTo.add(kCurrency, tokenSize);
483                 break;
484             default:
485                 appender.append(token);
486                 break;
487             }
488         } else {
489             switch (token) {
490             case 0x27:  // double quote
491                 appender.append((UChar) 0x27);
492                 break;
493             case 0xA4: // included b/c tokenSize can be > 1
494                 for (int32_t j = 0; j < tokenSize; ++j) {
495                     appender.append((UChar) 0xA4);
496                 }
497                 break;
498             default:
499                 appender.append(token);
500                 break;
501             }
502         }
503     }
504     return appendTo;
505 }
506 
507 AffixPattern &
parseAffixString(const UnicodeString & affixStr,AffixPattern & appendTo,UErrorCode & status)508 AffixPattern::parseAffixString(
509         const UnicodeString &affixStr,
510         AffixPattern &appendTo,
511         UErrorCode &status) {
512     if (U_FAILURE(status)) {
513         return appendTo;
514     }
515     int32_t len = affixStr.length();
516     const UChar *buffer = affixStr.getBuffer();
517     for (int32_t i = 0; i < len; ) {
518         UChar token;
519         int32_t tokenSize = nextToken(buffer, i, len, &token);
520         if (tokenSize == 1) {
521             int32_t literalStart = i;
522             ++i;
523             while (i < len && (tokenSize = nextToken(buffer, i, len, &token)) == 1) {
524                 ++i;
525             }
526             appendTo.addLiteral(buffer, literalStart, i - literalStart);
527 
528             // If we reached end of string, we are done
529             if (i == len) {
530                 return appendTo;
531             }
532         }
533         i += tokenSize;
534         switch (token) {
535         case 0x25:
536             appendTo.add(kPercent, 1);
537             break;
538         case 0x2030:
539             appendTo.add(kPerMill, 1);
540             break;
541         case 0x2D:
542             appendTo.add(kNegative, 1);
543             break;
544         case 0x2B:
545             appendTo.add(kPositive, 1);
546             break;
547         case 0xA4:
548             {
549                 if (tokenSize - 1 > 3) {
550                     status = U_PARSE_ERROR;
551                     return appendTo;
552                 }
553                 appendTo.add(kCurrency, tokenSize - 1);
554             }
555             break;
556         default:
557             appendTo.addLiteral(&token, 0, 1);
558             break;
559         }
560     }
561     return appendTo;
562 }
563 
564 AffixPatternIterator &
iterator(AffixPatternIterator & result) const565 AffixPattern::iterator(AffixPatternIterator &result) const {
566     result.nextLiteralIndex = 0;
567     result.lastLiteralLength = 0;
568     result.nextTokenIndex = 0;
569     result.tokens = &tokens;
570     result.literals = &literals;
571     return result;
572 }
573 
574 UBool
nextToken()575 AffixPatternIterator::nextToken() {
576     int32_t tlen = tokens->length();
577     if (nextTokenIndex == tlen) {
578         return FALSE;
579     }
580     ++nextTokenIndex;
581     const UChar *tokenBuffer = tokens->getBuffer();
582     if (UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]) ==
583             AffixPattern::kLiteral) {
584         while (nextTokenIndex < tlen &&
585                 UNPACK_LONG(tokenBuffer[nextTokenIndex])) {
586             ++nextTokenIndex;
587         }
588         lastLiteralLength = 0;
589         int32_t i = nextTokenIndex - 1;
590         for (; UNPACK_LONG(tokenBuffer[i]); --i) {
591             lastLiteralLength <<= 8;
592             lastLiteralLength |= UNPACK_LENGTH(tokenBuffer[i]);
593         }
594         lastLiteralLength <<= 8;
595         lastLiteralLength |= UNPACK_LENGTH(tokenBuffer[i]);
596         nextLiteralIndex += lastLiteralLength;
597     }
598     return TRUE;
599 }
600 
601 AffixPattern::ETokenType
getTokenType() const602 AffixPatternIterator::getTokenType() const {
603     return UNPACK_TOKEN(tokens->charAt(nextTokenIndex - 1));
604 }
605 
606 UnicodeString &
getLiteral(UnicodeString & result) const607 AffixPatternIterator::getLiteral(UnicodeString &result) const {
608     const UChar *buffer = literals->getBuffer();
609     result.setTo(buffer + (nextLiteralIndex - lastLiteralLength), lastLiteralLength);
610     return result;
611 }
612 
613 int32_t
getTokenLength() const614 AffixPatternIterator::getTokenLength() const {
615     const UChar *tokenBuffer = tokens->getBuffer();
616     AffixPattern::ETokenType type = UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]);
617     return type == AffixPattern::kLiteral ? lastLiteralLength : UNPACK_LENGTH(tokenBuffer[nextTokenIndex - 1]);
618 }
619 
AffixPatternParser()620 AffixPatternParser::AffixPatternParser()
621         : fPercent(gPercent), fPermill(gPerMill), fNegative(gNegative), fPositive(gPositive) {
622 }
623 
AffixPatternParser(const DecimalFormatSymbols & symbols)624 AffixPatternParser::AffixPatternParser(
625         const DecimalFormatSymbols &symbols) {
626     setDecimalFormatSymbols(symbols);
627 }
628 
629 void
setDecimalFormatSymbols(const DecimalFormatSymbols & symbols)630 AffixPatternParser::setDecimalFormatSymbols(
631         const DecimalFormatSymbols &symbols) {
632     fPercent = symbols.getConstSymbol(DecimalFormatSymbols::kPercentSymbol);
633     fPermill = symbols.getConstSymbol(DecimalFormatSymbols::kPerMillSymbol);
634     fNegative = symbols.getConstSymbol(DecimalFormatSymbols::kMinusSignSymbol);
635     fPositive = symbols.getConstSymbol(DecimalFormatSymbols::kPlusSignSymbol);
636 }
637 
638 PluralAffix &
parse(const AffixPattern & affixPattern,const CurrencyAffixInfo & currencyAffixInfo,PluralAffix & appendTo,UErrorCode & status) const639 AffixPatternParser::parse(
640         const AffixPattern &affixPattern,
641         const CurrencyAffixInfo &currencyAffixInfo,
642         PluralAffix &appendTo,
643         UErrorCode &status) const {
644     if (U_FAILURE(status)) {
645         return appendTo;
646     }
647     AffixPatternIterator iter;
648     affixPattern.iterator(iter);
649     UnicodeString literal;
650     while (iter.nextToken()) {
651         switch (iter.getTokenType()) {
652         case AffixPattern::kPercent:
653             appendTo.append(fPercent, UNUM_PERCENT_FIELD);
654             break;
655         case AffixPattern::kPerMill:
656             appendTo.append(fPermill, UNUM_PERMILL_FIELD);
657             break;
658         case AffixPattern::kNegative:
659             appendTo.append(fNegative, UNUM_SIGN_FIELD);
660             break;
661         case AffixPattern::kPositive:
662             appendTo.append(fPositive, UNUM_SIGN_FIELD);
663             break;
664         case AffixPattern::kCurrency:
665             switch (iter.getTokenLength()) {
666                 case 1:
667                     appendTo.append(
668                             currencyAffixInfo.getSymbol(), UNUM_CURRENCY_FIELD);
669                     break;
670                 case 2:
671                     appendTo.append(
672                             currencyAffixInfo.getISO(), UNUM_CURRENCY_FIELD);
673                     break;
674                 case 3:
675                     appendTo.append(
676                             currencyAffixInfo.getLong(), UNUM_CURRENCY_FIELD, status);
677                     break;
678                 default:
679                     U_ASSERT(FALSE);
680                     break;
681             }
682             break;
683         case AffixPattern::kLiteral:
684             appendTo.append(iter.getLiteral(literal));
685             break;
686         default:
687             U_ASSERT(FALSE);
688             break;
689         }
690     }
691     return appendTo;
692 }
693 
694 
695 U_NAMESPACE_END
696 #endif /* #if !UCONFIG_NO_FORMATTING */
697