1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  * Copyright (C) 2015, International Business Machines
5  * Corporation and others.  All Rights Reserved.
6  *
7  * file name: affixpatternparser.cpp
8  */
9 
10 #include "unicode/utypes.h"
11 
12 #if !UCONFIG_NO_FORMATTING
13 
14 #include "unicode/dcfmtsym.h"
15 #include "unicode/plurrule.h"
16 #include "unicode/strenum.h"
17 #include "unicode/ucurr.h"
18 #include "unicode/ustring.h"
19 #include "affixpatternparser.h"
20 #include "charstr.h"
21 #include "precision.h"
22 #include "uassert.h"
23 #include "unistrappender.h"
24 
25 static const UChar gDefaultSymbols[] = {0xa4, 0xa4, 0xa4};
26 
27 static const UChar gPercent = 0x25;
28 static const UChar gPerMill = 0x2030;
29 static const UChar gNegative = 0x2D;
30 static const UChar gPositive = 0x2B;
31 
32 #define PACK_TOKEN_AND_LENGTH(t, l) ((UChar) (((t) << 8) | (l & 0xFF)))
33 
34 #define UNPACK_TOKEN(c) ((AffixPattern::ETokenType) (((c) >> 8) & 0x7F))
35 
36 #define UNPACK_LONG(c) (((c) >> 8) & 0x80)
37 
38 #define UNPACK_LENGTH(c) ((c) & 0xFF)
39 
40 U_NAMESPACE_BEGIN
41 
42 static int32_t
nextToken(const UChar * buffer,int32_t idx,int32_t len,UChar * token)43 nextToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) {
44     if (buffer[idx] != 0x27 || idx + 1 == len) {
45         *token = buffer[idx];
46         return 1;
47     }
48     *token = buffer[idx + 1];
49     if (buffer[idx + 1] == 0xA4) {
50         int32_t i = 2;
51         for (; idx + i < len && i < 4 && buffer[idx + i] == buffer[idx + 1]; ++i)
52           ;
53         return i;
54     }
55     return 2;
56 }
57 
58 static int32_t
nextUserToken(const UChar * buffer,int32_t idx,int32_t len,UChar * token)59 nextUserToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) {
60     *token = buffer[idx];
61     int32_t max;
62     switch (buffer[idx]) {
63     case 0x27:
64         max = 2;
65         break;
66     case 0xA4:
67         max = 3;
68         break;
69     default:
70         max = 1;
71         break;
72     }
73     int32_t i = 1;
74     for (; idx + i < len && i < max && buffer[idx + i] == buffer[idx]; ++i)
75       ;
76     return i;
77 }
78 
CurrencyAffixInfo()79 CurrencyAffixInfo::CurrencyAffixInfo()
80         : fSymbol(gDefaultSymbols, 1),
81           fISO(gDefaultSymbols, 2),
82           fLong(DigitAffix(gDefaultSymbols, 3)),
83           fIsDefault(TRUE) {
84 }
85 
86 void
set(const char * locale,const PluralRules * rules,const UChar * currency,UErrorCode & status)87 CurrencyAffixInfo::set(
88         const char *locale,
89         const PluralRules *rules,
90         const UChar *currency,
91         UErrorCode &status) {
92     if (U_FAILURE(status)) {
93         return;
94     }
95     fIsDefault = FALSE;
96     if (currency == NULL) {
97         fSymbol.setTo(gDefaultSymbols, 1);
98         fISO.setTo(gDefaultSymbols, 2);
99         fLong.remove();
100         fLong.append(gDefaultSymbols, 3);
101         fIsDefault = TRUE;
102         return;
103     }
104     int32_t len;
105     UBool unusedIsChoice;
106     const UChar *symbol = ucurr_getName(
107             currency, locale, UCURR_SYMBOL_NAME, &unusedIsChoice,
108             &len, &status);
109     if (U_FAILURE(status)) {
110         return;
111     }
112     fSymbol.setTo(symbol, len);
113     fISO.setTo(currency, u_strlen(currency));
114     fLong.remove();
115     StringEnumeration* keywords = rules->getKeywords(status);
116     if (U_FAILURE(status)) {
117         return;
118     }
119     const UnicodeString* pluralCount;
120     while ((pluralCount = keywords->snext(status)) != NULL) {
121         CharString pCount;
122         pCount.appendInvariantChars(*pluralCount, status);
123         const UChar *pluralName = ucurr_getPluralName(
124             currency, locale, &unusedIsChoice, pCount.data(),
125             &len, &status);
126         fLong.setVariant(pCount.data(), UnicodeString(pluralName, len), status);
127     }
128     delete keywords;
129 }
130 
131 void
adjustPrecision(const UChar * currency,const UCurrencyUsage usage,FixedPrecision & precision,UErrorCode & status)132 CurrencyAffixInfo::adjustPrecision(
133         const UChar *currency, const UCurrencyUsage usage,
134         FixedPrecision &precision, UErrorCode &status) {
135     if (U_FAILURE(status)) {
136         return;
137     }
138 
139     int32_t digitCount = ucurr_getDefaultFractionDigitsForUsage(
140             currency, usage, &status);
141     precision.fMin.setFracDigitCount(digitCount);
142     precision.fMax.setFracDigitCount(digitCount);
143     double increment = ucurr_getRoundingIncrementForUsage(
144             currency, usage, &status);
145     if (increment == 0.0) {
146         precision.fRoundingIncrement.clear();
147     } else {
148         precision.fRoundingIncrement.set(increment);
149         // guard against round-off error
150         precision.fRoundingIncrement.round(6);
151     }
152 }
153 
154 void
addLiteral(const UChar * literal,int32_t start,int32_t len)155 AffixPattern::addLiteral(
156         const UChar *literal, int32_t start, int32_t len) {
157     char32Count += u_countChar32(literal + start, len);
158     literals.append(literal, start, len);
159     int32_t tlen = tokens.length();
160     // Takes 4 UChars to encode maximum literal length.
161     UChar *tokenChars = tokens.getBuffer(tlen + 4);
162 
163     // find start of literal size. May be tlen if there is no literal.
164     // While finding start of literal size, compute literal length
165     int32_t literalLength = 0;
166     int32_t tLiteralStart = tlen;
167     while (tLiteralStart > 0 && UNPACK_TOKEN(tokenChars[tLiteralStart - 1]) == kLiteral) {
168         tLiteralStart--;
169         literalLength <<= 8;
170         literalLength |= UNPACK_LENGTH(tokenChars[tLiteralStart]);
171     }
172     // Add number of chars we just added to literal
173     literalLength += len;
174 
175     // Now encode the new length starting at tLiteralStart
176     tlen = tLiteralStart;
177     tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral, literalLength & 0xFF);
178     literalLength >>= 8;
179     while (literalLength) {
180         tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral | 0x80, literalLength & 0xFF);
181         literalLength >>= 8;
182     }
183     tokens.releaseBuffer(tlen);
184 }
185 
186 void
add(ETokenType t)187 AffixPattern::add(ETokenType t) {
188     add(t, 1);
189 }
190 
191 void
addCurrency(uint8_t count)192 AffixPattern::addCurrency(uint8_t count) {
193     add(kCurrency, count);
194 }
195 
196 void
add(ETokenType t,uint8_t count)197 AffixPattern::add(ETokenType t, uint8_t count) {
198     U_ASSERT(t != kLiteral);
199     char32Count += count;
200     switch (t) {
201     case kCurrency:
202         hasCurrencyToken = TRUE;
203         break;
204     case kPercent:
205         hasPercentToken = TRUE;
206         break;
207     case kPerMill:
208         hasPermillToken = TRUE;
209         break;
210     default:
211         // Do nothing
212         break;
213     }
214     tokens.append(PACK_TOKEN_AND_LENGTH(t, count));
215 }
216 
217 AffixPattern &
append(const AffixPattern & other)218 AffixPattern::append(const AffixPattern &other) {
219     AffixPatternIterator iter;
220     other.iterator(iter);
221     UnicodeString literal;
222     while (iter.nextToken()) {
223         switch (iter.getTokenType()) {
224         case kLiteral:
225             iter.getLiteral(literal);
226             addLiteral(literal.getBuffer(), 0, literal.length());
227             break;
228         case kCurrency:
229             addCurrency(static_cast<uint8_t>(iter.getTokenLength()));
230             break;
231         default:
232             add(iter.getTokenType());
233             break;
234         }
235     }
236     return *this;
237 }
238 
239 void
remove()240 AffixPattern::remove() {
241     tokens.remove();
242     literals.remove();
243     hasCurrencyToken = FALSE;
244     hasPercentToken = FALSE;
245     hasPermillToken = FALSE;
246     char32Count = 0;
247 }
248 
249 // escapes literals for strings where special characters are NOT escaped
250 // except for apostrophe.
escapeApostropheInLiteral(const UnicodeString & literal,UnicodeStringAppender & appender)251 static void escapeApostropheInLiteral(
252         const UnicodeString &literal, UnicodeStringAppender &appender) {
253     int32_t len = literal.length();
254     const UChar *buffer = literal.getBuffer();
255     for (int32_t i = 0; i < len; ++i) {
256         UChar ch = buffer[i];
257         switch (ch) {
258             case 0x27:
259                 appender.append((UChar) 0x27);
260                 appender.append((UChar) 0x27);
261                 break;
262             default:
263                 appender.append(ch);
264                 break;
265         }
266     }
267 }
268 
269 
270 // escapes literals for user strings where special characters in literals
271 // are escaped with apostrophe.
escapeLiteral(const UnicodeString & literal,UnicodeStringAppender & appender)272 static void escapeLiteral(
273         const UnicodeString &literal, UnicodeStringAppender &appender) {
274     int32_t len = literal.length();
275     const UChar *buffer = literal.getBuffer();
276     for (int32_t i = 0; i < len; ++i) {
277         UChar ch = buffer[i];
278         switch (ch) {
279             case 0x27:
280                 appender.append((UChar) 0x27);
281                 appender.append((UChar) 0x27);
282                 break;
283             case 0x25:
284                 appender.append((UChar) 0x27);
285                 appender.append((UChar) 0x25);
286                 appender.append((UChar) 0x27);
287                 break;
288             case 0x2030:
289                 appender.append((UChar) 0x27);
290                 appender.append((UChar) 0x2030);
291                 appender.append((UChar) 0x27);
292                 break;
293             case 0xA4:
294                 appender.append((UChar) 0x27);
295                 appender.append((UChar) 0xA4);
296                 appender.append((UChar) 0x27);
297                 break;
298             case 0x2D:
299                 appender.append((UChar) 0x27);
300                 appender.append((UChar) 0x2D);
301                 appender.append((UChar) 0x27);
302                 break;
303             case 0x2B:
304                 appender.append((UChar) 0x27);
305                 appender.append((UChar) 0x2B);
306                 appender.append((UChar) 0x27);
307                 break;
308             default:
309                 appender.append(ch);
310                 break;
311         }
312     }
313 }
314 
315 UnicodeString &
toString(UnicodeString & appendTo) const316 AffixPattern::toString(UnicodeString &appendTo) const {
317     AffixPatternIterator iter;
318     iterator(iter);
319     UnicodeStringAppender appender(appendTo);
320     UnicodeString literal;
321     while (iter.nextToken()) {
322         switch (iter.getTokenType()) {
323         case kLiteral:
324             escapeApostropheInLiteral(iter.getLiteral(literal), appender);
325             break;
326         case kPercent:
327             appender.append((UChar) 0x27);
328             appender.append((UChar) 0x25);
329             break;
330         case kPerMill:
331             appender.append((UChar) 0x27);
332             appender.append((UChar) 0x2030);
333             break;
334         case kCurrency:
335             {
336                 appender.append((UChar) 0x27);
337                 int32_t cl = iter.getTokenLength();
338                 for (int32_t i = 0; i < cl; ++i) {
339                     appender.append((UChar) 0xA4);
340                 }
341             }
342             break;
343         case kNegative:
344             appender.append((UChar) 0x27);
345             appender.append((UChar) 0x2D);
346             break;
347         case kPositive:
348             appender.append((UChar) 0x27);
349             appender.append((UChar) 0x2B);
350             break;
351         default:
352             U_ASSERT(FALSE);
353             break;
354         }
355     }
356     return appendTo;
357 }
358 
359 UnicodeString &
toUserString(UnicodeString & appendTo) const360 AffixPattern::toUserString(UnicodeString &appendTo) const {
361     AffixPatternIterator iter;
362     iterator(iter);
363     UnicodeStringAppender appender(appendTo);
364     UnicodeString literal;
365     while (iter.nextToken()) {
366         switch (iter.getTokenType()) {
367         case kLiteral:
368             escapeLiteral(iter.getLiteral(literal), appender);
369             break;
370         case kPercent:
371             appender.append((UChar) 0x25);
372             break;
373         case kPerMill:
374             appender.append((UChar) 0x2030);
375             break;
376         case kCurrency:
377             {
378                 int32_t cl = iter.getTokenLength();
379                 for (int32_t i = 0; i < cl; ++i) {
380                     appender.append((UChar) 0xA4);
381                 }
382             }
383             break;
384         case kNegative:
385             appender.append((UChar) 0x2D);
386             break;
387         case kPositive:
388             appender.append((UChar) 0x2B);
389             break;
390         default:
391             U_ASSERT(FALSE);
392             break;
393         }
394     }
395     return appendTo;
396 }
397 
398 class AffixPatternAppender : public UMemory {
399 public:
AffixPatternAppender(AffixPattern & dest)400     AffixPatternAppender(AffixPattern &dest) : fDest(&dest), fIdx(0) { }
401 
append(UChar x)402     inline void append(UChar x) {
403         if (fIdx == UPRV_LENGTHOF(fBuffer)) {
404             fDest->addLiteral(fBuffer, 0, fIdx);
405             fIdx = 0;
406         }
407         fBuffer[fIdx++] = x;
408     }
409 
append(UChar32 x)410     inline void append(UChar32 x) {
411         if (fIdx >= UPRV_LENGTHOF(fBuffer) - 1) {
412             fDest->addLiteral(fBuffer, 0, fIdx);
413             fIdx = 0;
414         }
415         U16_APPEND_UNSAFE(fBuffer, fIdx, x);
416     }
417 
flush()418     inline void flush() {
419         if (fIdx) {
420             fDest->addLiteral(fBuffer, 0, fIdx);
421         }
422         fIdx = 0;
423     }
424 
425     /**
426      * flush the buffer when we go out of scope.
427      */
~AffixPatternAppender()428     ~AffixPatternAppender() {
429         flush();
430     }
431 private:
432     AffixPattern *fDest;
433     int32_t fIdx;
434     UChar fBuffer[32];
435     AffixPatternAppender(const AffixPatternAppender &other);
436     AffixPatternAppender &operator=(const AffixPatternAppender &other);
437 };
438 
439 
440 AffixPattern &
parseUserAffixString(const UnicodeString & affixStr,AffixPattern & appendTo,UErrorCode & status)441 AffixPattern::parseUserAffixString(
442         const UnicodeString &affixStr,
443         AffixPattern &appendTo,
444         UErrorCode &status) {
445     if (U_FAILURE(status)) {
446         return appendTo;
447     }
448     int32_t len = affixStr.length();
449     const UChar *buffer = affixStr.getBuffer();
450     // 0 = not quoted; 1 = quoted.
451     int32_t state = 0;
452     AffixPatternAppender appender(appendTo);
453     for (int32_t i = 0; i < len; ) {
454         UChar token;
455         int32_t tokenSize = nextUserToken(buffer, i, len, &token);
456         i += tokenSize;
457         if (token == 0x27 && tokenSize == 1) { // quote
458             state = 1 - state;
459             continue;
460         }
461         if (state == 0) {
462             switch (token) {
463             case 0x25:
464                 appender.flush();
465                 appendTo.add(kPercent, 1);
466                 break;
467             case 0x27:  // double quote
468                 appender.append((UChar) 0x27);
469                 break;
470             case 0x2030:
471                 appender.flush();
472                 appendTo.add(kPerMill, 1);
473                 break;
474             case 0x2D:
475                 appender.flush();
476                 appendTo.add(kNegative, 1);
477                 break;
478             case 0x2B:
479                 appender.flush();
480                 appendTo.add(kPositive, 1);
481                 break;
482             case 0xA4:
483                 appender.flush();
484                 appendTo.add(kCurrency, static_cast<uint8_t>(tokenSize));
485                 break;
486             default:
487                 appender.append(token);
488                 break;
489             }
490         } else {
491             switch (token) {
492             case 0x27:  // double quote
493                 appender.append((UChar) 0x27);
494                 break;
495             case 0xA4: // included b/c tokenSize can be > 1
496                 for (int32_t j = 0; j < tokenSize; ++j) {
497                     appender.append((UChar) 0xA4);
498                 }
499                 break;
500             default:
501                 appender.append(token);
502                 break;
503             }
504         }
505     }
506     return appendTo;
507 }
508 
509 AffixPattern &
parseAffixString(const UnicodeString & affixStr,AffixPattern & appendTo,UErrorCode & status)510 AffixPattern::parseAffixString(
511         const UnicodeString &affixStr,
512         AffixPattern &appendTo,
513         UErrorCode &status) {
514     if (U_FAILURE(status)) {
515         return appendTo;
516     }
517     int32_t len = affixStr.length();
518     const UChar *buffer = affixStr.getBuffer();
519     for (int32_t i = 0; i < len; ) {
520         UChar token;
521         int32_t tokenSize = nextToken(buffer, i, len, &token);
522         if (tokenSize == 1) {
523             int32_t literalStart = i;
524             ++i;
525             while (i < len && (tokenSize = nextToken(buffer, i, len, &token)) == 1) {
526                 ++i;
527             }
528             appendTo.addLiteral(buffer, literalStart, i - literalStart);
529 
530             // If we reached end of string, we are done
531             if (i == len) {
532                 return appendTo;
533             }
534         }
535         i += tokenSize;
536         switch (token) {
537         case 0x25:
538             appendTo.add(kPercent, 1);
539             break;
540         case 0x2030:
541             appendTo.add(kPerMill, 1);
542             break;
543         case 0x2D:
544             appendTo.add(kNegative, 1);
545             break;
546         case 0x2B:
547             appendTo.add(kPositive, 1);
548             break;
549         case 0xA4:
550             {
551                 if (tokenSize - 1 > 3) {
552                     status = U_PARSE_ERROR;
553                     return appendTo;
554                 }
555                 appendTo.add(kCurrency, tokenSize - 1);
556             }
557             break;
558         default:
559             appendTo.addLiteral(&token, 0, 1);
560             break;
561         }
562     }
563     return appendTo;
564 }
565 
566 AffixPatternIterator &
iterator(AffixPatternIterator & result) const567 AffixPattern::iterator(AffixPatternIterator &result) const {
568     result.nextLiteralIndex = 0;
569     result.lastLiteralLength = 0;
570     result.nextTokenIndex = 0;
571     result.tokens = &tokens;
572     result.literals = &literals;
573     return result;
574 }
575 
576 UBool
nextToken()577 AffixPatternIterator::nextToken() {
578     int32_t tlen = tokens->length();
579     if (nextTokenIndex == tlen) {
580         return FALSE;
581     }
582     ++nextTokenIndex;
583     const UChar *tokenBuffer = tokens->getBuffer();
584     if (UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]) ==
585             AffixPattern::kLiteral) {
586         while (nextTokenIndex < tlen &&
587                 UNPACK_LONG(tokenBuffer[nextTokenIndex])) {
588             ++nextTokenIndex;
589         }
590         lastLiteralLength = 0;
591         int32_t i = nextTokenIndex - 1;
592         for (; UNPACK_LONG(tokenBuffer[i]); --i) {
593             lastLiteralLength <<= 8;
594             lastLiteralLength |= UNPACK_LENGTH(tokenBuffer[i]);
595         }
596         lastLiteralLength <<= 8;
597         lastLiteralLength |= UNPACK_LENGTH(tokenBuffer[i]);
598         nextLiteralIndex += lastLiteralLength;
599     }
600     return TRUE;
601 }
602 
603 AffixPattern::ETokenType
getTokenType() const604 AffixPatternIterator::getTokenType() const {
605     return UNPACK_TOKEN(tokens->charAt(nextTokenIndex - 1));
606 }
607 
608 UnicodeString &
getLiteral(UnicodeString & result) const609 AffixPatternIterator::getLiteral(UnicodeString &result) const {
610     const UChar *buffer = literals->getBuffer();
611     result.setTo(buffer + (nextLiteralIndex - lastLiteralLength), lastLiteralLength);
612     return result;
613 }
614 
615 int32_t
getTokenLength() const616 AffixPatternIterator::getTokenLength() const {
617     const UChar *tokenBuffer = tokens->getBuffer();
618     AffixPattern::ETokenType type = UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]);
619     return type == AffixPattern::kLiteral ? lastLiteralLength : UNPACK_LENGTH(tokenBuffer[nextTokenIndex - 1]);
620 }
621 
AffixPatternParser()622 AffixPatternParser::AffixPatternParser()
623         : fPercent(gPercent), fPermill(gPerMill), fNegative(gNegative), fPositive(gPositive) {
624 }
625 
AffixPatternParser(const DecimalFormatSymbols & symbols)626 AffixPatternParser::AffixPatternParser(
627         const DecimalFormatSymbols &symbols) {
628     setDecimalFormatSymbols(symbols);
629 }
630 
631 void
setDecimalFormatSymbols(const DecimalFormatSymbols & symbols)632 AffixPatternParser::setDecimalFormatSymbols(
633         const DecimalFormatSymbols &symbols) {
634     fPercent = symbols.getConstSymbol(DecimalFormatSymbols::kPercentSymbol);
635     fPermill = symbols.getConstSymbol(DecimalFormatSymbols::kPerMillSymbol);
636     fNegative = symbols.getConstSymbol(DecimalFormatSymbols::kMinusSignSymbol);
637     fPositive = symbols.getConstSymbol(DecimalFormatSymbols::kPlusSignSymbol);
638 }
639 
640 PluralAffix &
parse(const AffixPattern & affixPattern,const CurrencyAffixInfo & currencyAffixInfo,PluralAffix & appendTo,UErrorCode & status) const641 AffixPatternParser::parse(
642         const AffixPattern &affixPattern,
643         const CurrencyAffixInfo &currencyAffixInfo,
644         PluralAffix &appendTo,
645         UErrorCode &status) const {
646     if (U_FAILURE(status)) {
647         return appendTo;
648     }
649     AffixPatternIterator iter;
650     affixPattern.iterator(iter);
651     UnicodeString literal;
652     while (iter.nextToken()) {
653         switch (iter.getTokenType()) {
654         case AffixPattern::kPercent:
655             appendTo.append(fPercent, UNUM_PERCENT_FIELD);
656             break;
657         case AffixPattern::kPerMill:
658             appendTo.append(fPermill, UNUM_PERMILL_FIELD);
659             break;
660         case AffixPattern::kNegative:
661             appendTo.append(fNegative, UNUM_SIGN_FIELD);
662             break;
663         case AffixPattern::kPositive:
664             appendTo.append(fPositive, UNUM_SIGN_FIELD);
665             break;
666         case AffixPattern::kCurrency:
667             switch (iter.getTokenLength()) {
668                 case 1:
669                     appendTo.append(
670                             currencyAffixInfo.getSymbol(), UNUM_CURRENCY_FIELD);
671                     break;
672                 case 2:
673                     appendTo.append(
674                             currencyAffixInfo.getISO(), UNUM_CURRENCY_FIELD);
675                     break;
676                 case 3:
677                     appendTo.append(
678                             currencyAffixInfo.getLong(), UNUM_CURRENCY_FIELD, status);
679                     break;
680                 default:
681                     U_ASSERT(FALSE);
682                     break;
683             }
684             break;
685         case AffixPattern::kLiteral:
686             appendTo.append(iter.getLiteral(literal));
687             break;
688         default:
689             U_ASSERT(FALSE);
690             break;
691         }
692     }
693     return appendTo;
694 }
695 
696 
697 U_NAMESPACE_END
698 #endif /* #if !UCONFIG_NO_FORMATTING */
699