1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 #include "unicode/utypes.h"
5 
6 #if !UCONFIG_NO_FORMATTING
7 
8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
9 // Helpful in toString methods and elsewhere.
10 #define UNISTR_FROM_STRING_EXPLICIT
11 
12 #include <typeinfo>
13 #include <array>
14 #include "number_types.h"
15 #include "number_patternstring.h"
16 #include "numparse_types.h"
17 #include "numparse_impl.h"
18 #include "numparse_symbols.h"
19 #include "numparse_decimal.h"
20 #include "unicode/numberformatter.h"
21 #include "cstr.h"
22 #include "number_mapper.h"
23 #include "static_unicode_sets.h"
24 
25 using namespace icu;
26 using namespace icu::number;
27 using namespace icu::number::impl;
28 using namespace icu::numparse;
29 using namespace icu::numparse::impl;
30 
31 
32 NumberParseMatcher::~NumberParseMatcher() = default;
33 
34 
35 NumberParserImpl*
createSimpleParser(const Locale & locale,const UnicodeString & patternString,parse_flags_t parseFlags,UErrorCode & status)36 NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString& patternString,
37                                      parse_flags_t parseFlags, UErrorCode& status) {
38 
39     LocalPointer<NumberParserImpl> parser(new NumberParserImpl(parseFlags));
40     DecimalFormatSymbols symbols(locale, status);
41 
42     parser->fLocalMatchers.ignorables = {parseFlags};
43     IgnorablesMatcher& ignorables = parser->fLocalMatchers.ignorables;
44 
45     DecimalFormatSymbols dfs(locale, status);
46     dfs.setSymbol(DecimalFormatSymbols::kCurrencySymbol, u"IU$");
47     dfs.setSymbol(DecimalFormatSymbols::kIntlCurrencySymbol, u"ICU");
48     CurrencySymbols currencySymbols({u"ICU", status}, locale, dfs, status);
49 
50     ParsedPatternInfo patternInfo;
51     PatternParser::parseToPatternInfo(patternString, patternInfo, status);
52 
53     // The following statements set up the affix matchers.
54     AffixTokenMatcherSetupData affixSetupData = {
55             currencySymbols, symbols, ignorables, locale, parseFlags};
56     parser->fLocalMatchers.affixTokenMatcherWarehouse = {&affixSetupData};
57     parser->fLocalMatchers.affixMatcherWarehouse = {&parser->fLocalMatchers.affixTokenMatcherWarehouse};
58     parser->fLocalMatchers.affixMatcherWarehouse.createAffixMatchers(
59             patternInfo, *parser, ignorables, parseFlags, status);
60 
61     Grouper grouper = Grouper::forStrategy(UNUM_GROUPING_AUTO);
62     grouper.setLocaleData(patternInfo, locale);
63 
64     parser->addMatcher(parser->fLocalMatchers.ignorables);
65     parser->addMatcher(parser->fLocalMatchers.decimal = {symbols, grouper, parseFlags});
66     parser->addMatcher(parser->fLocalMatchers.minusSign = {symbols, false});
67     parser->addMatcher(parser->fLocalMatchers.plusSign = {symbols, false});
68     parser->addMatcher(parser->fLocalMatchers.percent = {symbols});
69     parser->addMatcher(parser->fLocalMatchers.permille = {symbols});
70     parser->addMatcher(parser->fLocalMatchers.nan = {symbols});
71     parser->addMatcher(parser->fLocalMatchers.infinity = {symbols});
72     parser->addMatcher(parser->fLocalMatchers.padding = {u"@"});
73     parser->addMatcher(parser->fLocalMatchers.scientific = {symbols, grouper});
74     parser->addMatcher(parser->fLocalMatchers.currency = {currencySymbols, symbols, parseFlags, status});
75     parser->addMatcher(parser->fLocalValidators.number = {});
76 
77     parser->freeze();
78     return parser.orphan();
79 }
80 
81 NumberParserImpl*
createParserFromProperties(const number::impl::DecimalFormatProperties & properties,const DecimalFormatSymbols & symbols,bool parseCurrency,UErrorCode & status)82 NumberParserImpl::createParserFromProperties(const number::impl::DecimalFormatProperties& properties,
83                                              const DecimalFormatSymbols& symbols, bool parseCurrency,
84                                              UErrorCode& status) {
85     Locale locale = symbols.getLocale();
86     AutoAffixPatternProvider affixProvider(properties, status);
87     if (U_FAILURE(status)) { return nullptr; }
88     CurrencyUnit currency = resolveCurrency(properties, locale, status);
89     CurrencySymbols currencySymbols(currency, locale, symbols, status);
90     bool isStrict = properties.parseMode.getOrDefault(PARSE_MODE_STRICT) == PARSE_MODE_STRICT;
91     Grouper grouper = Grouper::forProperties(properties);
92     int parseFlags = 0;
93     if (U_FAILURE(status)) { return nullptr; }
94     if (!properties.parseCaseSensitive) {
95         parseFlags |= PARSE_FLAG_IGNORE_CASE;
96     }
97     if (properties.parseIntegerOnly) {
98         parseFlags |= PARSE_FLAG_INTEGER_ONLY;
99     }
100     if (properties.signAlwaysShown) {
101         parseFlags |= PARSE_FLAG_PLUS_SIGN_ALLOWED;
102     }
103     if (isStrict) {
104         parseFlags |= PARSE_FLAG_STRICT_GROUPING_SIZE;
105         parseFlags |= PARSE_FLAG_STRICT_SEPARATORS;
106         parseFlags |= PARSE_FLAG_USE_FULL_AFFIXES;
107         parseFlags |= PARSE_FLAG_EXACT_AFFIX;
108         parseFlags |= PARSE_FLAG_STRICT_IGNORABLES;
109     } else {
110         parseFlags |= PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES;
111     }
112     if (grouper.getPrimary() <= 0) {
113         parseFlags |= PARSE_FLAG_GROUPING_DISABLED;
114     }
115     if (parseCurrency || affixProvider.get().hasCurrencySign()) {
116         parseFlags |= PARSE_FLAG_MONETARY_SEPARATORS;
117     }
118     if (!parseCurrency) {
119         parseFlags |= PARSE_FLAG_NO_FOREIGN_CURRENCY;
120     }
121 
122     LocalPointer<NumberParserImpl> parser(new NumberParserImpl(parseFlags));
123 
124     parser->fLocalMatchers.ignorables = {parseFlags};
125     IgnorablesMatcher& ignorables = parser->fLocalMatchers.ignorables;
126 
127     //////////////////////
128     /// AFFIX MATCHERS ///
129     //////////////////////
130 
131     // The following statements set up the affix matchers.
132     AffixTokenMatcherSetupData affixSetupData = {
133             currencySymbols, symbols, ignorables, locale, parseFlags};
134     parser->fLocalMatchers.affixTokenMatcherWarehouse = {&affixSetupData};
135     parser->fLocalMatchers.affixMatcherWarehouse = {&parser->fLocalMatchers.affixTokenMatcherWarehouse};
136     parser->fLocalMatchers.affixMatcherWarehouse.createAffixMatchers(
137             affixProvider.get(), *parser, ignorables, parseFlags, status);
138 
139     ////////////////////////
140     /// CURRENCY MATCHER ///
141     ////////////////////////
142 
143     if (parseCurrency || affixProvider.get().hasCurrencySign()) {
144         parser->addMatcher(parser->fLocalMatchers.currency = {currencySymbols, symbols, parseFlags, status});
145     }
146 
147     ///////////////
148     /// PERCENT ///
149     ///////////////
150 
151     // ICU-TC meeting, April 11, 2018: accept percent/permille only if it is in the pattern,
152     // and to maintain regressive behavior, divide by 100 even if no percent sign is present.
153     if (!isStrict && affixProvider.get().containsSymbolType(AffixPatternType::TYPE_PERCENT, status)) {
154         parser->addMatcher(parser->fLocalMatchers.percent = {symbols});
155     }
156     if (!isStrict && affixProvider.get().containsSymbolType(AffixPatternType::TYPE_PERMILLE, status)) {
157         parser->addMatcher(parser->fLocalMatchers.permille = {symbols});
158     }
159 
160     ///////////////////////////////
161     /// OTHER STANDARD MATCHERS ///
162     ///////////////////////////////
163 
164     if (!isStrict) {
165         parser->addMatcher(parser->fLocalMatchers.plusSign = {symbols, false});
166         parser->addMatcher(parser->fLocalMatchers.minusSign = {symbols, false});
167     }
168     parser->addMatcher(parser->fLocalMatchers.nan = {symbols});
169     parser->addMatcher(parser->fLocalMatchers.infinity = {symbols});
170     UnicodeString padString = properties.padString;
171     if (!padString.isBogus() && !ignorables.getSet()->contains(padString)) {
172         parser->addMatcher(parser->fLocalMatchers.padding = {padString});
173     }
174     parser->addMatcher(parser->fLocalMatchers.ignorables);
175     parser->addMatcher(parser->fLocalMatchers.decimal = {symbols, grouper, parseFlags});
176     // NOTE: parseNoExponent doesn't disable scientific parsing if we have a scientific formatter
177     if (!properties.parseNoExponent || properties.minimumExponentDigits > 0) {
178         parser->addMatcher(parser->fLocalMatchers.scientific = {symbols, grouper});
179     }
180 
181     //////////////////
182     /// VALIDATORS ///
183     //////////////////
184 
185     parser->addMatcher(parser->fLocalValidators.number = {});
186     if (isStrict) {
187         parser->addMatcher(parser->fLocalValidators.affix = {});
188     }
189     if (parseCurrency) {
190         parser->addMatcher(parser->fLocalValidators.currency = {});
191     }
192     if (properties.decimalPatternMatchRequired) {
193         bool patternHasDecimalSeparator =
194                 properties.decimalSeparatorAlwaysShown || properties.maximumFractionDigits != 0;
195         parser->addMatcher(parser->fLocalValidators.decimalSeparator = {patternHasDecimalSeparator});
196     }
197     // The multiplier takes care of scaling percentages.
198     Scale multiplier = scaleFromProperties(properties);
199     if (multiplier.isValid()) {
200         parser->addMatcher(parser->fLocalValidators.multiplier = {multiplier});
201     }
202 
203     parser->freeze();
204     return parser.orphan();
205 }
206 
NumberParserImpl(parse_flags_t parseFlags)207 NumberParserImpl::NumberParserImpl(parse_flags_t parseFlags)
208         : fParseFlags(parseFlags) {
209 }
210 
~NumberParserImpl()211 NumberParserImpl::~NumberParserImpl() {
212     fNumMatchers = 0;
213 }
214 
addMatcher(NumberParseMatcher & matcher)215 void NumberParserImpl::addMatcher(NumberParseMatcher& matcher) {
216     if (fNumMatchers + 1 > fMatchers.getCapacity()) {
217         fMatchers.resize(fNumMatchers * 2, fNumMatchers);
218     }
219     fMatchers[fNumMatchers] = &matcher;
220     fNumMatchers++;
221 }
222 
freeze()223 void NumberParserImpl::freeze() {
224     fFrozen = true;
225 }
226 
getParseFlags() const227 parse_flags_t NumberParserImpl::getParseFlags() const {
228     return fParseFlags;
229 }
230 
parse(const UnicodeString & input,bool greedy,ParsedNumber & result,UErrorCode & status) const231 void NumberParserImpl::parse(const UnicodeString& input, bool greedy, ParsedNumber& result,
232                              UErrorCode& status) const {
233     return parse(input, 0, greedy, result, status);
234 }
235 
parse(const UnicodeString & input,int32_t start,bool greedy,ParsedNumber & result,UErrorCode & status) const236 void NumberParserImpl::parse(const UnicodeString& input, int32_t start, bool greedy, ParsedNumber& result,
237                              UErrorCode& status) const {
238     if (U_FAILURE(status)) {
239         return;
240     }
241     U_ASSERT(fFrozen);
242     // TODO: Check start >= 0 and start < input.length()
243     StringSegment segment(input, 0 != (fParseFlags & PARSE_FLAG_IGNORE_CASE));
244     segment.adjustOffset(start);
245     if (greedy) {
246         parseGreedy(segment, result, status);
247     } else if (0 != (fParseFlags & PARSE_FLAG_ALLOW_INFINITE_RECURSION)) {
248         // Start at 1 so that recursionLevels never gets to 0
249         parseLongestRecursive(segment, result, 1, status);
250     } else {
251         // Arbitrary recursion safety limit: 100 levels.
252         parseLongestRecursive(segment, result, -100, status);
253     }
254     for (int32_t i = 0; i < fNumMatchers; i++) {
255         fMatchers[i]->postProcess(result);
256     }
257     result.postProcess();
258 }
259 
parseGreedy(StringSegment & segment,ParsedNumber & result,UErrorCode & status) const260 void NumberParserImpl::parseGreedy(StringSegment& segment, ParsedNumber& result,
261                                             UErrorCode& status) const {
262     // Note: this method is not recursive in order to avoid stack overflow.
263     for (int i = 0; i <fNumMatchers;) {
264         // Base Case
265         if (segment.length() == 0) {
266             return;
267         }
268         const NumberParseMatcher* matcher = fMatchers[i];
269         if (!matcher->smokeTest(segment)) {
270             // Matcher failed smoke test: try the next one
271             i++;
272             continue;
273         }
274         int32_t initialOffset = segment.getOffset();
275         matcher->match(segment, result, status);
276         if (U_FAILURE(status)) {
277             return;
278         }
279         if (segment.getOffset() != initialOffset) {
280             // Greedy heuristic: accept the match and loop back
281             i = 0;
282             continue;
283         } else {
284             // Matcher did not match: try the next one
285             i++;
286             continue;
287         }
288         UPRV_UNREACHABLE;
289     }
290 
291     // NOTE: If we get here, the greedy parse completed without consuming the entire string.
292 }
293 
parseLongestRecursive(StringSegment & segment,ParsedNumber & result,int32_t recursionLevels,UErrorCode & status) const294 void NumberParserImpl::parseLongestRecursive(StringSegment& segment, ParsedNumber& result,
295                                              int32_t recursionLevels,
296                                              UErrorCode& status) const {
297     // Base Case
298     if (segment.length() == 0) {
299         return;
300     }
301 
302     // Safety against stack overflow
303     if (recursionLevels == 0) {
304         return;
305     }
306 
307     // TODO: Give a nice way for the matcher to reset the ParsedNumber?
308     ParsedNumber initial(result);
309     ParsedNumber candidate;
310 
311     int initialOffset = segment.getOffset();
312     for (int32_t i = 0; i < fNumMatchers; i++) {
313         const NumberParseMatcher* matcher = fMatchers[i];
314         if (!matcher->smokeTest(segment)) {
315             continue;
316         }
317 
318         // In a non-greedy parse, we attempt all possible matches and pick the best.
319         for (int32_t charsToConsume = 0; charsToConsume < segment.length();) {
320             charsToConsume += U16_LENGTH(segment.codePointAt(charsToConsume));
321 
322             // Run the matcher on a segment of the current length.
323             candidate = initial;
324             segment.setLength(charsToConsume);
325             bool maybeMore = matcher->match(segment, candidate, status);
326             segment.resetLength();
327             if (U_FAILURE(status)) {
328                 return;
329             }
330 
331             // If the entire segment was consumed, recurse.
332             if (segment.getOffset() - initialOffset == charsToConsume) {
333                 parseLongestRecursive(segment, candidate, recursionLevels + 1, status);
334                 if (U_FAILURE(status)) {
335                     return;
336                 }
337                 if (candidate.isBetterThan(result)) {
338                     result = candidate;
339                 }
340             }
341 
342             // Since the segment can be re-used, reset the offset.
343             // This does not have an effect if the matcher did not consume any chars.
344             segment.setOffset(initialOffset);
345 
346             // Unless the matcher wants to see the next char, continue to the next matcher.
347             if (!maybeMore) {
348                 break;
349             }
350         }
351     }
352 }
353 
toString() const354 UnicodeString NumberParserImpl::toString() const {
355     UnicodeString result(u"<NumberParserImpl matchers:[");
356     for (int32_t i = 0; i < fNumMatchers; i++) {
357         result.append(u' ');
358         result.append(fMatchers[i]->toString());
359     }
360     result.append(u" ]>", -1);
361     return result;
362 }
363 
364 
365 #endif /* #if !UCONFIG_NO_FORMATTING */
366