1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 #include "unicode/utypes.h"
5 
6 #if !UCONFIG_NO_FORMATTING
7 
8 #include "number_affixutils.h"
9 #include "unicode/utf16.h"
10 #include "unicode/uniset.h"
11 
12 using namespace icu;
13 using namespace icu::number;
14 using namespace icu::number::impl;
15 
// Defaulted destructors for TokenConsumer and SymbolProvider, defined out of line
// in this translation unit.
// NOTE(review): presumably both are declared virtual in number_affixutils.h and
// are defined here to anchor the classes to this file -- confirm against the header.
TokenConsumer::~TokenConsumer() = default;
SymbolProvider::~SymbolProvider() = default;
18 
estimateLength(const UnicodeString & patternString,UErrorCode & status)19 int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) {
20     AffixPatternState state = STATE_BASE;
21     int32_t offset = 0;
22     int32_t length = 0;
23     for (; offset < patternString.length();) {
24         UChar32 cp = patternString.char32At(offset);
25 
26         switch (state) {
27             case STATE_BASE:
28                 if (cp == u'\'') {
29                     // First quote
30                     state = STATE_FIRST_QUOTE;
31                 } else {
32                     // Unquoted symbol
33                     length++;
34                 }
35                 break;
36             case STATE_FIRST_QUOTE:
37                 if (cp == u'\'') {
38                     // Repeated quote
39                     length++;
40                     state = STATE_BASE;
41                 } else {
42                     // Quoted code point
43                     length++;
44                     state = STATE_INSIDE_QUOTE;
45                 }
46                 break;
47             case STATE_INSIDE_QUOTE:
48                 if (cp == u'\'') {
49                     // End of quoted sequence
50                     state = STATE_AFTER_QUOTE;
51                 } else {
52                     // Quoted code point
53                     length++;
54                 }
55                 break;
56             case STATE_AFTER_QUOTE:
57                 if (cp == u'\'') {
58                     // Double quote inside of quoted sequence
59                     length++;
60                     state = STATE_INSIDE_QUOTE;
61                 } else {
62                     // Unquoted symbol
63                     length++;
64                 }
65                 break;
66             default:
67                 U_ASSERT(false);
68         }
69 
70         offset += U16_LENGTH(cp);
71     }
72 
73     switch (state) {
74         case STATE_FIRST_QUOTE:
75         case STATE_INSIDE_QUOTE:
76             status = U_ILLEGAL_ARGUMENT_ERROR;
77             break;
78         default:
79             break;
80     }
81 
82     return length;
83 }
84 
// Produces an affix pattern in which every code point of `input` is a literal.
//
// Apostrophes are doubled. Each maximal run of the special characters
// '-', '+', '%', U+2030 (per mille) and U+00A4 (currency sign) is wrapped in a
// pair of apostrophes. Every other code point is copied through unchanged.
//
// @param input The raw text to escape.
// @return The escaped pattern string.
UnicodeString AffixUtils::escape(const UnicodeString &input) {
    // True while an opening apostrophe has been emitted and not yet closed.
    bool quoteOpen = false;
    UnicodeString escaped;
    int32_t pos = 0;
    while (pos < input.length()) {
        const UChar32 ch = input.char32At(pos);
        pos += U16_LENGTH(ch);

        if (ch == u'\'') {
            // A literal apostrophe is always written doubled, and it neither
            // opens nor closes a quoted run.
            escaped.append(u"''", -1);
            continue;
        }

        const bool needsQuoting =
                ch == u'-' || ch == u'+' || ch == u'%' || ch == u'‰' || ch == u'¤';
        if (needsQuoting != quoteOpen) {
            // Crossing the boundary of a run of special characters: emit the
            // apostrophe that opens or closes the quoted run.
            escaped.append(u'\'');
            quoteOpen = needsQuoting;
        }
        escaped.append(ch);
    }

    // Close a quoted run that extends to the end of the input.
    if (quoteOpen) {
        escaped.append(u'\'');
    }

    return escaped;
}
130 
getFieldForType(AffixPatternType type)131 Field AffixUtils::getFieldForType(AffixPatternType type) {
132     switch (type) {
133         case TYPE_MINUS_SIGN:
134             return Field::UNUM_SIGN_FIELD;
135         case TYPE_PLUS_SIGN:
136             return Field::UNUM_SIGN_FIELD;
137         case TYPE_PERCENT:
138             return Field::UNUM_PERCENT_FIELD;
139         case TYPE_PERMILLE:
140             return Field::UNUM_PERMILL_FIELD;
141         case TYPE_CURRENCY_SINGLE:
142             return Field::UNUM_CURRENCY_FIELD;
143         case TYPE_CURRENCY_DOUBLE:
144             return Field::UNUM_CURRENCY_FIELD;
145         case TYPE_CURRENCY_TRIPLE:
146             return Field::UNUM_CURRENCY_FIELD;
147         case TYPE_CURRENCY_QUAD:
148             return Field::UNUM_CURRENCY_FIELD;
149         case TYPE_CURRENCY_QUINT:
150             return Field::UNUM_CURRENCY_FIELD;
151         case TYPE_CURRENCY_OVERFLOW:
152             return Field::UNUM_CURRENCY_FIELD;
153         default:
154             U_ASSERT(false);
155             return Field::UNUM_FIELD_COUNT; // suppress "control reaches end of non-void function"
156     }
157 }
158 
159 int32_t
unescape(const UnicodeString & affixPattern,NumberStringBuilder & output,int32_t position,const SymbolProvider & provider,UErrorCode & status)160 AffixUtils::unescape(const UnicodeString &affixPattern, NumberStringBuilder &output, int32_t position,
161                      const SymbolProvider &provider, UErrorCode &status) {
162     int32_t length = 0;
163     AffixTag tag;
164     while (hasNext(tag, affixPattern)) {
165         tag = nextToken(tag, affixPattern, status);
166         if (U_FAILURE(status)) { return length; }
167         if (tag.type == TYPE_CURRENCY_OVERFLOW) {
168             // Don't go to the provider for this special case
169             length += output.insertCodePoint(position + length, 0xFFFD, UNUM_CURRENCY_FIELD, status);
170         } else if (tag.type < 0) {
171             length += output.insert(
172                     position + length, provider.getSymbol(tag.type), getFieldForType(tag.type), status);
173         } else {
174             length += output.insertCodePoint(position + length, tag.codePoint, UNUM_FIELD_COUNT, status);
175         }
176     }
177     return length;
178 }
179 
unescapedCodePointCount(const UnicodeString & affixPattern,const SymbolProvider & provider,UErrorCode & status)180 int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern,
181                                             const SymbolProvider &provider, UErrorCode &status) {
182     int32_t length = 0;
183     AffixTag tag;
184     while (hasNext(tag, affixPattern)) {
185         tag = nextToken(tag, affixPattern, status);
186         if (U_FAILURE(status)) { return length; }
187         if (tag.type == TYPE_CURRENCY_OVERFLOW) {
188             length += 1;
189         } else if (tag.type < 0) {
190             length += provider.getSymbol(tag.type).length();
191         } else {
192             length += U16_LENGTH(tag.codePoint);
193         }
194     }
195     return length;
196 }
197 
198 bool
containsType(const UnicodeString & affixPattern,AffixPatternType type,UErrorCode & status)199 AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) {
200     if (affixPattern.length() == 0) {
201         return false;
202     }
203     AffixTag tag;
204     while (hasNext(tag, affixPattern)) {
205         tag = nextToken(tag, affixPattern, status);
206         if (U_FAILURE(status)) { return false; }
207         if (tag.type == type) {
208             return true;
209         }
210     }
211     return false;
212 }
213 
hasCurrencySymbols(const UnicodeString & affixPattern,UErrorCode & status)214 bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) {
215     if (affixPattern.length() == 0) {
216         return false;
217     }
218     AffixTag tag;
219     while (hasNext(tag, affixPattern)) {
220         tag = nextToken(tag, affixPattern, status);
221         if (U_FAILURE(status)) { return false; }
222         if (tag.type < 0 && getFieldForType(tag.type) == UNUM_CURRENCY_FIELD) {
223             return true;
224         }
225     }
226     return false;
227 }
228 
// Returns a copy of the affix pattern in which each token of the given type has
// been overwritten by `replacementChar`.
//
// For every matching token, the single UTF-16 code unit immediately before the
// token's end offset is replaced.
//
// @param affixPattern    The escaped affix pattern.
// @param type            The token type to replace.
// @param replacementChar The code unit written in place of each match.
// @param status          Receives any error reported by nextToken().
// @return The modified copy. If tokenizing fails, the copy is returned with the
//         replacements made up to that point.
UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type,
                                      char16_t replacementChar, UErrorCode &status) {
    UnicodeString result(affixPattern); // start from a copy of the input
    if (affixPattern.length() != 0) {
        AffixTag token;
        while (hasNext(token, affixPattern)) {
            token = nextToken(token, affixPattern, status);
            if (U_FAILURE(status)) {
                break;
            }
            if (token.type != type) {
                continue;
            }
            // token.offset is where scanning resumes, so the matched character
            // ends one code unit earlier.
            const int32_t lastUnit = token.offset - 1;
            result.replace(lastUnit, 1, replacementChar);
        }
    }
    return result;
}
245 
containsOnlySymbolsAndIgnorables(const UnicodeString & affixPattern,const UnicodeSet & ignorables,UErrorCode & status)246 bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern,
247                                                   const UnicodeSet& ignorables, UErrorCode& status) {
248     if (affixPattern.length() == 0) {
249         return true;
250     };
251     AffixTag tag;
252     while (hasNext(tag, affixPattern)) {
253         tag = nextToken(tag, affixPattern, status);
254         if (U_FAILURE(status)) { return false; }
255         if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) {
256             return false;
257         }
258     }
259     return true;
260 }
261 
iterateWithConsumer(const UnicodeString & affixPattern,TokenConsumer & consumer,UErrorCode & status)262 void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer,
263                                      UErrorCode& status) {
264     if (affixPattern.length() == 0) {
265         return;
266     };
267     AffixTag tag;
268     while (hasNext(tag, affixPattern)) {
269         tag = nextToken(tag, affixPattern, status);
270         if (U_FAILURE(status)) { return; }
271         consumer.consumeToken(tag.type, tag.codePoint, status);
272         if (U_FAILURE(status)) { return; }
273     }
274 }
275 
// Reads the next token from an affix pattern.
//
// Scanning resumes at tag.offset in the state tag.state and proceeds one code
// point at a time until a token can be returned:
//   - quoting apostrophes are consumed silently and only change the state;
//   - '-', '+', '%' and U+2030 outside of quotes yield their symbol type;
//   - a run of U+00A4 outside of quotes yields one currency token whose type
//     depends on the run length (single .. quint, then overflow);
//   - any other code point, and any quoted code point, yields TYPE_CODEPOINT.
//
// Tokens are built with makeTag(); judging by the call sites its arguments are
// the offset at which scanning should resume, the token type, the state to
// resume in, and the literal code point (0 for symbols) -- confirm against the
// declaration in the header.
//
// At the end of the string, {-1} is returned when there are no more tokens
// (hasNext() treats a negative offset as "finished"). If the string ends right
// after an opening quote or inside a quoted sequence, status is set to
// U_ILLEGAL_ARGUMENT_ERROR. A pending currency run is flushed as its token.
//
// Note that `status` is only written, never inspected, by this function.
AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) {
    int32_t offset = tag.offset;
    int32_t state = tag.state;
    for (; offset < patternString.length();) {
        UChar32 cp = patternString.char32At(offset);
        // Number of UTF-16 code units occupied by cp
        int32_t count = U16_LENGTH(cp);

        // Within this switch, "break" means "keep scanning" (the for loop runs
        // again with the updated state/offset); "return" delivers a token.
        switch (state) {
            case STATE_BASE:
                switch (cp) {
                    case u'\'':
                        state = STATE_FIRST_QUOTE;
                        offset += count;
                        // continue to the next code point
                        break;
                    case u'-':
                        return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0);
                    case u'+':
                        return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0);
                    case u'%':
                        return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0);
                    case u'‰':
                        return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0);
                    case u'¤':
                        // Start of a currency run; the token is emitted once the
                        // run ends so that its width is known.
                        state = STATE_FIRST_CURR;
                        offset += count;
                        // continue to the next code point
                        break;
                    default:
                        return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
                }
                break;
            case STATE_FIRST_QUOTE:
                if (cp == u'\'') {
                    // Two apostrophes in a row outside of quotes: literal apostrophe
                    return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
                } else {
                    // First code point of a quoted sequence
                    return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
                }
            case STATE_INSIDE_QUOTE:
                if (cp == u'\'') {
                    // Either the closing quote or the first half of a doubled quote;
                    // decided by the next code point.
                    state = STATE_AFTER_QUOTE;
                    offset += count;
                    // continue to the next code point
                    break;
                } else {
                    return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
                }
            case STATE_AFTER_QUOTE:
                if (cp == u'\'') {
                    // Doubled quote inside a quoted sequence: literal apostrophe,
                    // and the quoted sequence continues
                    return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
                } else {
                    // The quoted sequence has ended; offset is deliberately not
                    // advanced so that cp is processed again in the base state.
                    state = STATE_BASE;
                    // re-evaluate this code point
                    break;
                }
            // In the currency states below, a non-currency code point ends the run.
            // The token is returned with the *unadvanced* offset so that this code
            // point is read again on the next call.
            case STATE_FIRST_CURR:
                if (cp == u'¤') {
                    state = STATE_SECOND_CURR;
                    offset += count;
                    // continue to the next code point
                    break;
                } else {
                    return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
                }
            case STATE_SECOND_CURR:
                if (cp == u'¤') {
                    state = STATE_THIRD_CURR;
                    offset += count;
                    // continue to the next code point
                    break;
                } else {
                    return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
                }
            case STATE_THIRD_CURR:
                if (cp == u'¤') {
                    state = STATE_FOURTH_CURR;
                    offset += count;
                    // continue to the next code point
                    break;
                } else {
                    return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
                }
            case STATE_FOURTH_CURR:
                if (cp == u'¤') {
                    state = STATE_FIFTH_CURR;
                    offset += count;
                    // continue to the next code point
                    break;
                } else {
                    return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
                }
            case STATE_FIFTH_CURR:
                if (cp == u'¤') {
                    state = STATE_OVERFLOW_CURR;
                    offset += count;
                    // continue to the next code point
                    break;
                } else {
                    return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
                }
            case STATE_OVERFLOW_CURR:
                if (cp == u'¤') {
                    offset += count;
                    // continue to the next code point and loop back to this state
                    break;
                } else {
                    return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
                }
            default:
                U_ASSERT(false);
        }
    }
    // End of string
    switch (state) {
        case STATE_BASE:
            // No more tokens in string.
            return {-1};
        case STATE_FIRST_QUOTE:
        case STATE_INSIDE_QUOTE:
            // For consistent behavior with the JDK and ICU 58, set an error here.
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return {-1};
        case STATE_AFTER_QUOTE:
            // No more tokens in string.
            return {-1};
        // The string ended in the middle of a currency run: emit the pending token.
        case STATE_FIRST_CURR:
            return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
        case STATE_SECOND_CURR:
            return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
        case STATE_THIRD_CURR:
            return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
        case STATE_FOURTH_CURR:
            return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
        case STATE_FIFTH_CURR:
            return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
        case STATE_OVERFLOW_CURR:
            return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
        default:
            U_ASSERT(false);
            return {-1}; // suppress "control reaches end of non-void function"
    }
}
418 
hasNext(const AffixTag & tag,const UnicodeString & string)419 bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) {
420     // First check for the {-1} and default initializer syntax.
421     if (tag.offset < 0) {
422         return false;
423     } else if (tag.offset == 0) {
424         return string.length() > 0;
425     }
426     // The rest of the fields are safe to use now.
427     // Special case: the last character in string is an end quote.
428     if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 &&
429         string.charAt(tag.offset) == u'\'') {
430         return false;
431     } else if (tag.state != STATE_BASE) {
432         return true;
433     } else {
434         return tag.offset < string.length();
435     }
436 }
437 
438 #endif /* #if !UCONFIG_NO_FORMATTING */
439