1 /*
2 * Copyright (C) 2015, International Business Machines
3 * Corporation and others. All Rights Reserved.
4 *
5 * file name: affixpatternparser.cpp
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_FORMATTING
11
12 #include "unicode/dcfmtsym.h"
13 #include "unicode/plurrule.h"
14 #include "unicode/ucurr.h"
15 #include "affixpatternparser.h"
16 #include "charstr.h"
17 #include "precision.h"
18 #include "uassert.h"
19 #include "unistrappender.h"
20
// Placeholder currency text: three currency signs (U+00A4). Users of this
// array take the first 1, 2 or 3 code units as the default symbol, ISO code
// and long name respectively. Read-only, hence const.
static const UChar gDefaultSymbols[] = {0xa4, 0xa4, 0xa4};

// Symbols used by a default-constructed AffixPatternParser. Read-only.
static const UChar gPercent = 0x25;    // '%'
static const UChar gPerMill = 0x2030;  // per mille sign
static const UChar gNegative = 0x2D;   // '-'
static const UChar gPositive = 0x2B;   // '+'
27
// Token encoding: every token occupies one UChar.
//   bits 8..14 : token type (AffixPattern::ETokenType)
//   bit  15    : "long" flag, read by UNPACK_LONG
//   bits 0..7  : length byte
// Every macro argument is fully parenthesized so that an expression such as
// `a | b` passed as the length is masked as a whole.
#define PACK_TOKEN_AND_LENGTH(t, l) ((UChar) (((t) << 8) | ((l) & 0xFF)))

// Extracts the token type, ignoring the "long" flag.
#define UNPACK_TOKEN(c) ((AffixPattern::ETokenType) (((c) >> 8) & 0x7F))

// Non-zero when the "long" flag is set on the packed token.
#define UNPACK_LONG(c) (((c) >> 8) & 0x80)

// Extracts the low 8 bits (length byte) of the packed token.
#define UNPACK_LENGTH(c) ((c) & 0xFF)
35
36 U_NAMESPACE_BEGIN
37
/**
 * Reads one token of an internal-form affix string, where an apostrophe
 * introduces a special character.
 *
 * @param buffer the affix string
 * @param idx    index at which the token starts; must be less than len
 * @param len    length of buffer
 * @param token  receives buffer[idx] for a 1-unit token, otherwise the
 *               code unit that follows the apostrophe
 * @return number of code units consumed: 1 when buffer[idx] is not an
 *         apostrophe or is the last code unit; 2 for apostrophe plus one
 *         code unit; 2 to 4 for apostrophe plus a run of currency signs.
 */
static int32_t
nextToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) {
    const UChar kApostrophe = 0x27;
    const UChar kCurrencySign = 0xA4;

    const bool introducesSpecial =
            buffer[idx] == kApostrophe && idx + 1 != len;
    if (!introducesSpecial) {
        *token = buffer[idx];
        return 1;
    }

    const UChar following = buffer[idx + 1];
    *token = following;
    if (following != kCurrencySign) {
        return 2;
    }

    // Apostrophe plus up to three consecutive currency signs.
    int32_t consumed = 2;
    while (idx + consumed < len
            && consumed < 4
            && buffer[idx + consumed] == following) {
        ++consumed;
    }
    return consumed;
}
52
/**
 * Reads one token of a user-form affix string.
 *
 * A token is a run of identical code units starting at idx. The run is
 * capped at 2 for an apostrophe, 3 for a currency sign (U+00A4) and 1 for
 * anything else.
 *
 * @param buffer the affix string
 * @param idx    index at which the token starts; must be less than len
 * @param len    length of buffer
 * @param token  receives buffer[idx]
 * @return number of code units in the token
 */
static int32_t
nextUserToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) {
    *token = buffer[idx];

    int32_t runLimit = 1;
    if (buffer[idx] == 0x27) {
        runLimit = 2;
    } else if (buffer[idx] == 0xA4) {
        runLimit = 3;
    }

    int32_t runLength = 1;
    while (idx + runLength < len
            && runLength < runLimit
            && buffer[idx + runLength] == buffer[idx]) {
        ++runLength;
    }
    return runLength;
}
72
/**
 * Creates a CurrencyAffixInfo in its default state: the symbol, ISO code and
 * long name are the first 1, 2 and 3 code units of gDefaultSymbols
 * respectively, and fIsDefault is TRUE.
 */
CurrencyAffixInfo::CurrencyAffixInfo()
        : fSymbol(gDefaultSymbols, 1),
          fISO(gDefaultSymbols, 2),
          fLong(DigitAffix(gDefaultSymbols, 3)),
          fIsDefault(TRUE) {
}
79
80 void
set(const char * locale,const PluralRules * rules,const UChar * currency,UErrorCode & status)81 CurrencyAffixInfo::set(
82 const char *locale,
83 const PluralRules *rules,
84 const UChar *currency,
85 UErrorCode &status) {
86 if (U_FAILURE(status)) {
87 return;
88 }
89 fIsDefault = FALSE;
90 if (currency == NULL) {
91 fSymbol.setTo(gDefaultSymbols, 1);
92 fISO.setTo(gDefaultSymbols, 2);
93 fLong.remove();
94 fLong.append(gDefaultSymbols, 3);
95 fIsDefault = TRUE;
96 return;
97 }
98 int32_t len;
99 UBool unusedIsChoice;
100 const UChar *symbol = ucurr_getName(
101 currency, locale, UCURR_SYMBOL_NAME, &unusedIsChoice,
102 &len, &status);
103 if (U_FAILURE(status)) {
104 return;
105 }
106 fSymbol.setTo(symbol, len);
107 fISO.setTo(currency, u_strlen(currency));
108 fLong.remove();
109 StringEnumeration* keywords = rules->getKeywords(status);
110 if (U_FAILURE(status)) {
111 return;
112 }
113 const UnicodeString* pluralCount;
114 while ((pluralCount = keywords->snext(status)) != NULL) {
115 CharString pCount;
116 pCount.appendInvariantChars(*pluralCount, status);
117 const UChar *pluralName = ucurr_getPluralName(
118 currency, locale, &unusedIsChoice, pCount.data(),
119 &len, &status);
120 fLong.setVariant(pCount.data(), UnicodeString(pluralName, len), status);
121 }
122 delete keywords;
123 }
124
125 void
adjustPrecision(const UChar * currency,const UCurrencyUsage usage,FixedPrecision & precision,UErrorCode & status)126 CurrencyAffixInfo::adjustPrecision(
127 const UChar *currency, const UCurrencyUsage usage,
128 FixedPrecision &precision, UErrorCode &status) {
129 if (U_FAILURE(status)) {
130 return;
131 }
132
133 int32_t digitCount = ucurr_getDefaultFractionDigitsForUsage(
134 currency, usage, &status);
135 precision.fMin.setFracDigitCount(digitCount);
136 precision.fMax.setFracDigitCount(digitCount);
137 double increment = ucurr_getRoundingIncrementForUsage(
138 currency, usage, &status);
139 if (increment == 0.0) {
140 precision.fRoundingIncrement.clear();
141 } else {
142 precision.fRoundingIncrement.set(increment);
143 // guard against round-off error
144 precision.fRoundingIncrement.round(6);
145 }
146 }
147
148 void
addLiteral(const UChar * literal,int32_t start,int32_t len)149 AffixPattern::addLiteral(
150 const UChar *literal, int32_t start, int32_t len) {
151 char32Count += u_countChar32(literal + start, len);
152 literals.append(literal, start, len);
153 int32_t tlen = tokens.length();
154 // Takes 4 UChars to encode maximum literal length.
155 UChar *tokenChars = tokens.getBuffer(tlen + 4);
156
157 // find start of literal size. May be tlen if there is no literal.
158 // While finding start of literal size, compute literal length
159 int32_t literalLength = 0;
160 int32_t tLiteralStart = tlen;
161 while (tLiteralStart > 0 && UNPACK_TOKEN(tokenChars[tLiteralStart - 1]) == kLiteral) {
162 tLiteralStart--;
163 literalLength <<= 8;
164 literalLength |= UNPACK_LENGTH(tokenChars[tLiteralStart]);
165 }
166 // Add number of chars we just added to literal
167 literalLength += len;
168
169 // Now encode the new length starting at tLiteralStart
170 tlen = tLiteralStart;
171 tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral, literalLength & 0xFF);
172 literalLength >>= 8;
173 while (literalLength) {
174 tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral | 0x80, literalLength & 0xFF);
175 literalLength >>= 8;
176 }
177 tokens.releaseBuffer(tlen);
178 }
179
/**
 * Appends one token of type t; equivalent to add(t, 1).
 */
void
AffixPattern::add(ETokenType t) {
    add(t, 1);
}
184
/**
 * Appends a currency token; equivalent to add(kCurrency, count).
 *
 * @param count value stored as the token's length
 */
void
AffixPattern::addCurrency(uint8_t count) {
    add(kCurrency, count);
}
189
190 void
add(ETokenType t,uint8_t count)191 AffixPattern::add(ETokenType t, uint8_t count) {
192 U_ASSERT(t != kLiteral);
193 char32Count += count;
194 switch (t) {
195 case kCurrency:
196 hasCurrencyToken = TRUE;
197 break;
198 case kPercent:
199 hasPercentToken = TRUE;
200 break;
201 case kPerMill:
202 hasPermillToken = TRUE;
203 break;
204 default:
205 // Do nothing
206 break;
207 }
208 tokens.append(PACK_TOKEN_AND_LENGTH(t, count));
209 }
210
211 AffixPattern &
append(const AffixPattern & other)212 AffixPattern::append(const AffixPattern &other) {
213 AffixPatternIterator iter;
214 other.iterator(iter);
215 UnicodeString literal;
216 while (iter.nextToken()) {
217 switch (iter.getTokenType()) {
218 case kLiteral:
219 iter.getLiteral(literal);
220 addLiteral(literal.getBuffer(), 0, literal.length());
221 break;
222 case kCurrency:
223 addCurrency(iter.getTokenLength());
224 break;
225 default:
226 add(iter.getTokenType());
227 break;
228 }
229 }
230 return *this;
231 }
232
233 void
remove()234 AffixPattern::remove() {
235 tokens.remove();
236 literals.remove();
237 hasCurrencyToken = FALSE;
238 hasPercentToken = FALSE;
239 hasPermillToken = FALSE;
240 char32Count = 0;
241 }
242
243 // escapes literals for strings where special characters are NOT escaped
244 // except for apostrophe.
escapeApostropheInLiteral(const UnicodeString & literal,UnicodeStringAppender & appender)245 static void escapeApostropheInLiteral(
246 const UnicodeString &literal, UnicodeStringAppender &appender) {
247 int32_t len = literal.length();
248 const UChar *buffer = literal.getBuffer();
249 for (int32_t i = 0; i < len; ++i) {
250 UChar ch = buffer[i];
251 switch (ch) {
252 case 0x27:
253 appender.append((UChar) 0x27);
254 appender.append((UChar) 0x27);
255 break;
256 default:
257 appender.append(ch);
258 break;
259 }
260 }
261 }
262
263
264 // escapes literals for user strings where special characters in literals
265 // are escaped with apostrophe.
escapeLiteral(const UnicodeString & literal,UnicodeStringAppender & appender)266 static void escapeLiteral(
267 const UnicodeString &literal, UnicodeStringAppender &appender) {
268 int32_t len = literal.length();
269 const UChar *buffer = literal.getBuffer();
270 for (int32_t i = 0; i < len; ++i) {
271 UChar ch = buffer[i];
272 switch (ch) {
273 case 0x27:
274 appender.append((UChar) 0x27);
275 appender.append((UChar) 0x27);
276 break;
277 case 0x25:
278 appender.append((UChar) 0x27);
279 appender.append((UChar) 0x25);
280 appender.append((UChar) 0x27);
281 break;
282 case 0x2030:
283 appender.append((UChar) 0x27);
284 appender.append((UChar) 0x2030);
285 appender.append((UChar) 0x27);
286 break;
287 case 0xA4:
288 appender.append((UChar) 0x27);
289 appender.append((UChar) 0xA4);
290 appender.append((UChar) 0x27);
291 break;
292 case 0x2D:
293 appender.append((UChar) 0x27);
294 appender.append((UChar) 0x2D);
295 appender.append((UChar) 0x27);
296 break;
297 case 0x2B:
298 appender.append((UChar) 0x27);
299 appender.append((UChar) 0x2B);
300 appender.append((UChar) 0x27);
301 break;
302 default:
303 appender.append(ch);
304 break;
305 }
306 }
307 }
308
309 UnicodeString &
toString(UnicodeString & appendTo) const310 AffixPattern::toString(UnicodeString &appendTo) const {
311 AffixPatternIterator iter;
312 iterator(iter);
313 UnicodeStringAppender appender(appendTo);
314 UnicodeString literal;
315 while (iter.nextToken()) {
316 switch (iter.getTokenType()) {
317 case kLiteral:
318 escapeApostropheInLiteral(iter.getLiteral(literal), appender);
319 break;
320 case kPercent:
321 appender.append((UChar) 0x27);
322 appender.append((UChar) 0x25);
323 break;
324 case kPerMill:
325 appender.append((UChar) 0x27);
326 appender.append((UChar) 0x2030);
327 break;
328 case kCurrency:
329 {
330 appender.append((UChar) 0x27);
331 int32_t cl = iter.getTokenLength();
332 for (int32_t i = 0; i < cl; ++i) {
333 appender.append((UChar) 0xA4);
334 }
335 }
336 break;
337 case kNegative:
338 appender.append((UChar) 0x27);
339 appender.append((UChar) 0x2D);
340 break;
341 case kPositive:
342 appender.append((UChar) 0x27);
343 appender.append((UChar) 0x2B);
344 break;
345 default:
346 U_ASSERT(FALSE);
347 break;
348 }
349 }
350 return appendTo;
351 }
352
353 UnicodeString &
toUserString(UnicodeString & appendTo) const354 AffixPattern::toUserString(UnicodeString &appendTo) const {
355 AffixPatternIterator iter;
356 iterator(iter);
357 UnicodeStringAppender appender(appendTo);
358 UnicodeString literal;
359 while (iter.nextToken()) {
360 switch (iter.getTokenType()) {
361 case kLiteral:
362 escapeLiteral(iter.getLiteral(literal), appender);
363 break;
364 case kPercent:
365 appender.append((UChar) 0x25);
366 break;
367 case kPerMill:
368 appender.append((UChar) 0x2030);
369 break;
370 case kCurrency:
371 {
372 int32_t cl = iter.getTokenLength();
373 for (int32_t i = 0; i < cl; ++i) {
374 appender.append((UChar) 0xA4);
375 }
376 }
377 break;
378 case kNegative:
379 appender.append((UChar) 0x2D);
380 break;
381 case kPositive:
382 appender.append((UChar) 0x2B);
383 break;
384 default:
385 U_ASSERT(FALSE);
386 break;
387 }
388 }
389 return appendTo;
390 }
391
392 class AffixPatternAppender : public UMemory {
393 public:
AffixPatternAppender(AffixPattern & dest)394 AffixPatternAppender(AffixPattern &dest) : fDest(&dest), fIdx(0) { }
395
append(UChar x)396 inline void append(UChar x) {
397 if (fIdx == UPRV_LENGTHOF(fBuffer)) {
398 fDest->addLiteral(fBuffer, 0, fIdx);
399 fIdx = 0;
400 }
401 fBuffer[fIdx++] = x;
402 }
403
append(UChar32 x)404 inline void append(UChar32 x) {
405 if (fIdx >= UPRV_LENGTHOF(fBuffer) - 1) {
406 fDest->addLiteral(fBuffer, 0, fIdx);
407 fIdx = 0;
408 }
409 U16_APPEND_UNSAFE(fBuffer, fIdx, x);
410 }
411
flush()412 inline void flush() {
413 if (fIdx) {
414 fDest->addLiteral(fBuffer, 0, fIdx);
415 }
416 fIdx = 0;
417 }
418
419 /**
420 * flush the buffer when we go out of scope.
421 */
~AffixPatternAppender()422 ~AffixPatternAppender() {
423 flush();
424 }
425 private:
426 AffixPattern *fDest;
427 int32_t fIdx;
428 UChar fBuffer[32];
429 AffixPatternAppender(const AffixPatternAppender &other);
430 AffixPatternAppender &operator=(const AffixPatternAppender &other);
431 };
432
433
434 AffixPattern &
parseUserAffixString(const UnicodeString & affixStr,AffixPattern & appendTo,UErrorCode & status)435 AffixPattern::parseUserAffixString(
436 const UnicodeString &affixStr,
437 AffixPattern &appendTo,
438 UErrorCode &status) {
439 if (U_FAILURE(status)) {
440 return appendTo;
441 }
442 int32_t len = affixStr.length();
443 const UChar *buffer = affixStr.getBuffer();
444 // 0 = not quoted; 1 = quoted.
445 int32_t state = 0;
446 AffixPatternAppender appender(appendTo);
447 for (int32_t i = 0; i < len; ) {
448 UChar token;
449 int32_t tokenSize = nextUserToken(buffer, i, len, &token);
450 i += tokenSize;
451 if (token == 0x27 && tokenSize == 1) { // quote
452 state = 1 - state;
453 continue;
454 }
455 if (state == 0) {
456 switch (token) {
457 case 0x25:
458 appender.flush();
459 appendTo.add(kPercent, 1);
460 break;
461 case 0x27: // double quote
462 appender.append((UChar) 0x27);
463 break;
464 case 0x2030:
465 appender.flush();
466 appendTo.add(kPerMill, 1);
467 break;
468 case 0x2D:
469 appender.flush();
470 appendTo.add(kNegative, 1);
471 break;
472 case 0x2B:
473 appender.flush();
474 appendTo.add(kPositive, 1);
475 break;
476 case 0xA4:
477 appender.flush();
478 appendTo.add(kCurrency, tokenSize);
479 break;
480 default:
481 appender.append(token);
482 break;
483 }
484 } else {
485 switch (token) {
486 case 0x27: // double quote
487 appender.append((UChar) 0x27);
488 break;
489 case 0xA4: // included b/c tokenSize can be > 1
490 for (int32_t j = 0; j < tokenSize; ++j) {
491 appender.append((UChar) 0xA4);
492 }
493 break;
494 default:
495 appender.append(token);
496 break;
497 }
498 }
499 }
500 return appendTo;
501 }
502
503 AffixPattern &
parseAffixString(const UnicodeString & affixStr,AffixPattern & appendTo,UErrorCode & status)504 AffixPattern::parseAffixString(
505 const UnicodeString &affixStr,
506 AffixPattern &appendTo,
507 UErrorCode &status) {
508 if (U_FAILURE(status)) {
509 return appendTo;
510 }
511 int32_t len = affixStr.length();
512 const UChar *buffer = affixStr.getBuffer();
513 for (int32_t i = 0; i < len; ) {
514 UChar token;
515 int32_t tokenSize = nextToken(buffer, i, len, &token);
516 if (tokenSize == 1) {
517 int32_t literalStart = i;
518 ++i;
519 while (i < len && (tokenSize = nextToken(buffer, i, len, &token)) == 1) {
520 ++i;
521 }
522 appendTo.addLiteral(buffer, literalStart, i - literalStart);
523
524 // If we reached end of string, we are done
525 if (i == len) {
526 return appendTo;
527 }
528 }
529 i += tokenSize;
530 switch (token) {
531 case 0x25:
532 appendTo.add(kPercent, 1);
533 break;
534 case 0x2030:
535 appendTo.add(kPerMill, 1);
536 break;
537 case 0x2D:
538 appendTo.add(kNegative, 1);
539 break;
540 case 0x2B:
541 appendTo.add(kPositive, 1);
542 break;
543 case 0xA4:
544 {
545 if (tokenSize - 1 > 3) {
546 status = U_PARSE_ERROR;
547 return appendTo;
548 }
549 appendTo.add(kCurrency, tokenSize - 1);
550 }
551 break;
552 default:
553 appendTo.addLiteral(&token, 0, 1);
554 break;
555 }
556 }
557 return appendTo;
558 }
559
560 AffixPatternIterator &
iterator(AffixPatternIterator & result) const561 AffixPattern::iterator(AffixPatternIterator &result) const {
562 result.nextLiteralIndex = 0;
563 result.lastLiteralLength = 0;
564 result.nextTokenIndex = 0;
565 result.tokens = &tokens;
566 result.literals = &literals;
567 return result;
568 }
569
570 UBool
nextToken()571 AffixPatternIterator::nextToken() {
572 int32_t tlen = tokens->length();
573 if (nextTokenIndex == tlen) {
574 return FALSE;
575 }
576 ++nextTokenIndex;
577 const UChar *tokenBuffer = tokens->getBuffer();
578 if (UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]) ==
579 AffixPattern::kLiteral) {
580 while (nextTokenIndex < tlen &&
581 UNPACK_LONG(tokenBuffer[nextTokenIndex])) {
582 ++nextTokenIndex;
583 }
584 lastLiteralLength = 0;
585 int32_t i = nextTokenIndex - 1;
586 for (; UNPACK_LONG(tokenBuffer[i]); --i) {
587 lastLiteralLength <<= 8;
588 lastLiteralLength |= UNPACK_LENGTH(tokenBuffer[i]);
589 }
590 lastLiteralLength <<= 8;
591 lastLiteralLength |= UNPACK_LENGTH(tokenBuffer[i]);
592 nextLiteralIndex += lastLiteralLength;
593 }
594 return TRUE;
595 }
596
/**
 * Returns the type of the current token, i.e. the token at
 * nextTokenIndex - 1 (the one most recently stepped onto by nextToken()).
 */
AffixPattern::ETokenType
AffixPatternIterator::getTokenType() const {
    return UNPACK_TOKEN(tokens->charAt(nextTokenIndex - 1));
}
601
602 UnicodeString &
getLiteral(UnicodeString & result) const603 AffixPatternIterator::getLiteral(UnicodeString &result) const {
604 const UChar *buffer = literals->getBuffer();
605 result.setTo(buffer + (nextLiteralIndex - lastLiteralLength), lastLiteralLength);
606 return result;
607 }
608
609 int32_t
getTokenLength() const610 AffixPatternIterator::getTokenLength() const {
611 const UChar *tokenBuffer = tokens->getBuffer();
612 AffixPattern::ETokenType type = UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]);
613 return type == AffixPattern::kLiteral ? lastLiteralLength : UNPACK_LENGTH(tokenBuffer[nextTokenIndex - 1]);
614 }
615
/**
 * Creates a parser that uses the file-level default symbols: gPercent,
 * gPerMill, gNegative and gPositive.
 */
AffixPatternParser::AffixPatternParser()
        : fPercent(gPercent), fPermill(gPerMill), fNegative(gNegative), fPositive(gPositive) {
}
619
/**
 * Creates a parser whose percent, per-mille, minus and plus symbols are
 * taken from symbols via setDecimalFormatSymbols().
 */
AffixPatternParser::AffixPatternParser(
        const DecimalFormatSymbols &symbols) {
    setDecimalFormatSymbols(symbols);
}
624
/**
 * Replaces this parser's percent, per-mille, negative and positive symbols
 * with the kPercentSymbol, kPerMillSymbol, kMinusSignSymbol and
 * kPlusSignSymbol entries of symbols.
 */
void
AffixPatternParser::setDecimalFormatSymbols(
        const DecimalFormatSymbols &symbols) {
    fPercent = symbols.getConstSymbol(DecimalFormatSymbols::kPercentSymbol);
    fPermill = symbols.getConstSymbol(DecimalFormatSymbols::kPerMillSymbol);
    fNegative = symbols.getConstSymbol(DecimalFormatSymbols::kMinusSignSymbol);
    fPositive = symbols.getConstSymbol(DecimalFormatSymbols::kPlusSignSymbol);
}
633
634 PluralAffix &
parse(const AffixPattern & affixPattern,const CurrencyAffixInfo & currencyAffixInfo,PluralAffix & appendTo,UErrorCode & status) const635 AffixPatternParser::parse(
636 const AffixPattern &affixPattern,
637 const CurrencyAffixInfo ¤cyAffixInfo,
638 PluralAffix &appendTo,
639 UErrorCode &status) const {
640 if (U_FAILURE(status)) {
641 return appendTo;
642 }
643 AffixPatternIterator iter;
644 affixPattern.iterator(iter);
645 UnicodeString literal;
646 while (iter.nextToken()) {
647 switch (iter.getTokenType()) {
648 case AffixPattern::kPercent:
649 appendTo.append(fPercent, UNUM_PERCENT_FIELD);
650 break;
651 case AffixPattern::kPerMill:
652 appendTo.append(fPermill, UNUM_PERMILL_FIELD);
653 break;
654 case AffixPattern::kNegative:
655 appendTo.append(fNegative, UNUM_SIGN_FIELD);
656 break;
657 case AffixPattern::kPositive:
658 appendTo.append(fPositive, UNUM_SIGN_FIELD);
659 break;
660 case AffixPattern::kCurrency:
661 switch (iter.getTokenLength()) {
662 case 1:
663 appendTo.append(
664 currencyAffixInfo.getSymbol(), UNUM_CURRENCY_FIELD);
665 break;
666 case 2:
667 appendTo.append(
668 currencyAffixInfo.getISO(), UNUM_CURRENCY_FIELD);
669 break;
670 case 3:
671 appendTo.append(
672 currencyAffixInfo.getLong(), UNUM_CURRENCY_FIELD, status);
673 break;
674 default:
675 U_ASSERT(FALSE);
676 break;
677 }
678 break;
679 case AffixPattern::kLiteral:
680 appendTo.append(iter.getLiteral(literal));
681 break;
682 default:
683 U_ASSERT(FALSE);
684 break;
685 }
686 }
687 return appendTo;
688 }
689
690
691 U_NAMESPACE_END
692 #endif /* #if !UCONFIG_NO_FORMATTING */
693