1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 #include "unicode/utypes.h"
5 
6 #if !UCONFIG_NO_FORMATTING
7 
8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
9 // Helpful in toString methods and elsewhere.
10 #define UNISTR_FROM_STRING_EXPLICIT
11 
12 #include "numparse_types.h"
13 #include "numparse_affixes.h"
14 #include "numparse_utils.h"
15 #include "number_utils.h"
16 
17 using namespace icu;
18 using namespace icu::numparse;
19 using namespace icu::numparse::impl;
20 using namespace icu::number;
21 using namespace icu::number::impl;
22 
23 
24 namespace {
25 
26 /**
27  * Helper method to return whether the given AffixPatternMatcher equals the given pattern string.
28  * Either both arguments must be null or the pattern string inside the AffixPatternMatcher must equal
29  * the given pattern string.
30  */
matched(const AffixPatternMatcher * affix,const UnicodeString & patternString)31 static bool matched(const AffixPatternMatcher* affix, const UnicodeString& patternString) {
32     return (affix == nullptr && patternString.isBogus()) ||
33            (affix != nullptr && affix->getPattern() == patternString);
34 }
35 
36 /**
37  * Helper method to return the length of the given AffixPatternMatcher. Returns 0 for null.
38  */
length(const AffixPatternMatcher * matcher)39 static int32_t length(const AffixPatternMatcher* matcher) {
40     return matcher == nullptr ? 0 : matcher->getPattern().length();
41 }
42 
43 /**
44  * Helper method to return whether (1) both lhs and rhs are null/invalid, or (2) if they are both
45  * valid, whether they are equal according to operator==.  Similar to Java Objects.equals()
46  */
equals(const AffixPatternMatcher * lhs,const AffixPatternMatcher * rhs)47 static bool equals(const AffixPatternMatcher* lhs, const AffixPatternMatcher* rhs) {
48     if (lhs == nullptr && rhs == nullptr) {
49         return true;
50     }
51     if (lhs == nullptr || rhs == nullptr) {
52         return false;
53     }
54     return *lhs == *rhs;
55 }
56 
57 }
58 
59 
AffixPatternMatcherBuilder(const UnicodeString & pattern,AffixTokenMatcherWarehouse & warehouse,IgnorablesMatcher * ignorables)60 AffixPatternMatcherBuilder::AffixPatternMatcherBuilder(const UnicodeString& pattern,
61                                                        AffixTokenMatcherWarehouse& warehouse,
62                                                        IgnorablesMatcher* ignorables)
63         : fMatchersLen(0),
64           fLastTypeOrCp(0),
65           fPattern(pattern),
66           fWarehouse(warehouse),
67           fIgnorables(ignorables) {}
68 
consumeToken(AffixPatternType type,UChar32 cp,UErrorCode & status)69 void AffixPatternMatcherBuilder::consumeToken(AffixPatternType type, UChar32 cp, UErrorCode& status) {
70     // This is called by AffixUtils.iterateWithConsumer() for each token.
71 
72     // Add an ignorables matcher between tokens except between two literals, and don't put two
73     // ignorables matchers in a row.
74     if (fIgnorables != nullptr && fMatchersLen > 0 &&
75         (fLastTypeOrCp < 0 || !fIgnorables->getSet()->contains(fLastTypeOrCp))) {
76         addMatcher(*fIgnorables);
77     }
78 
79     if (type != TYPE_CODEPOINT) {
80         // Case 1: the token is a symbol.
81         switch (type) {
82             case TYPE_MINUS_SIGN:
83                 addMatcher(fWarehouse.minusSign());
84                 break;
85             case TYPE_PLUS_SIGN:
86                 addMatcher(fWarehouse.plusSign());
87                 break;
88             case TYPE_PERCENT:
89                 addMatcher(fWarehouse.percent());
90                 break;
91             case TYPE_PERMILLE:
92                 addMatcher(fWarehouse.permille());
93                 break;
94             case TYPE_CURRENCY_SINGLE:
95             case TYPE_CURRENCY_DOUBLE:
96             case TYPE_CURRENCY_TRIPLE:
97             case TYPE_CURRENCY_QUAD:
98             case TYPE_CURRENCY_QUINT:
99                 // All currency symbols use the same matcher
100                 addMatcher(fWarehouse.currency(status));
101                 break;
102             default:
103                 U_ASSERT(FALSE);
104         }
105 
106     } else if (fIgnorables != nullptr && fIgnorables->getSet()->contains(cp)) {
107         // Case 2: the token is an ignorable literal.
108         // No action necessary: the ignorables matcher has already been added.
109 
110     } else {
111         // Case 3: the token is a non-ignorable literal.
112         addMatcher(fWarehouse.nextCodePointMatcher(cp));
113     }
114     fLastTypeOrCp = type != TYPE_CODEPOINT ? type : cp;
115 }
116 
addMatcher(NumberParseMatcher & matcher)117 void AffixPatternMatcherBuilder::addMatcher(NumberParseMatcher& matcher) {
118     if (fMatchersLen >= fMatchers.getCapacity()) {
119         fMatchers.resize(fMatchersLen * 2, fMatchersLen);
120     }
121     fMatchers[fMatchersLen++] = &matcher;
122 }
123 
build()124 AffixPatternMatcher AffixPatternMatcherBuilder::build() {
125     return AffixPatternMatcher(fMatchers, fMatchersLen, fPattern);
126 }
127 
128 
CodePointMatcherWarehouse()129 CodePointMatcherWarehouse::CodePointMatcherWarehouse()
130         : codePointCount(0), codePointNumBatches(0) {}
131 
~CodePointMatcherWarehouse()132 CodePointMatcherWarehouse::~CodePointMatcherWarehouse() {
133     // Delete the variable number of batches of code point matchers
134     for (int32_t i = 0; i < codePointNumBatches; i++) {
135         delete[] codePointsOverflow[i];
136     }
137 }
138 
CodePointMatcherWarehouse(CodePointMatcherWarehouse && src)139 CodePointMatcherWarehouse::CodePointMatcherWarehouse(CodePointMatcherWarehouse&& src) U_NOEXCEPT
140         : codePoints(std::move(src.codePoints)),
141           codePointsOverflow(std::move(src.codePointsOverflow)),
142           codePointCount(src.codePointCount),
143           codePointNumBatches(src.codePointNumBatches) {}
144 
145 CodePointMatcherWarehouse&
operator =(CodePointMatcherWarehouse && src)146 CodePointMatcherWarehouse::operator=(CodePointMatcherWarehouse&& src) U_NOEXCEPT {
147     codePoints = std::move(src.codePoints);
148     codePointsOverflow = std::move(src.codePointsOverflow);
149     codePointCount = src.codePointCount;
150     codePointNumBatches = src.codePointNumBatches;
151     return *this;
152 }
153 
nextCodePointMatcher(UChar32 cp)154 NumberParseMatcher& CodePointMatcherWarehouse::nextCodePointMatcher(UChar32 cp) {
155     if (codePointCount < CODE_POINT_STACK_CAPACITY) {
156         return codePoints[codePointCount++] = {cp};
157     }
158     int32_t totalCapacity = CODE_POINT_STACK_CAPACITY + codePointNumBatches * CODE_POINT_BATCH_SIZE;
159     if (codePointCount >= totalCapacity) {
160         // Need a new batch
161         auto* nextBatch = new CodePointMatcher[CODE_POINT_BATCH_SIZE];
162         if (codePointNumBatches >= codePointsOverflow.getCapacity()) {
163             // Need more room for storing pointers to batches
164             codePointsOverflow.resize(codePointNumBatches * 2, codePointNumBatches);
165         }
166         codePointsOverflow[codePointNumBatches++] = nextBatch;
167     }
168     return codePointsOverflow[codePointNumBatches - 1][(codePointCount++ - CODE_POINT_STACK_CAPACITY) %
169                                                        CODE_POINT_BATCH_SIZE] = {cp};
170 }
171 
172 
AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData * setupData)173 AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData* setupData)
174         : fSetupData(setupData) {}
175 
minusSign()176 NumberParseMatcher& AffixTokenMatcherWarehouse::minusSign() {
177     return fMinusSign = {fSetupData->dfs, true};
178 }
179 
plusSign()180 NumberParseMatcher& AffixTokenMatcherWarehouse::plusSign() {
181     return fPlusSign = {fSetupData->dfs, true};
182 }
183 
percent()184 NumberParseMatcher& AffixTokenMatcherWarehouse::percent() {
185     return fPercent = {fSetupData->dfs};
186 }
187 
permille()188 NumberParseMatcher& AffixTokenMatcherWarehouse::permille() {
189     return fPermille = {fSetupData->dfs};
190 }
191 
currency(UErrorCode & status)192 NumberParseMatcher& AffixTokenMatcherWarehouse::currency(UErrorCode& status) {
193     return fCurrency = {fSetupData->currencySymbols, fSetupData->dfs, fSetupData->parseFlags, status};
194 }
195 
ignorables()196 IgnorablesMatcher& AffixTokenMatcherWarehouse::ignorables() {
197     return fSetupData->ignorables;
198 }
199 
nextCodePointMatcher(UChar32 cp)200 NumberParseMatcher& AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp) {
201     return fCodePoints.nextCodePointMatcher(cp);
202 }
203 
204 
CodePointMatcher(UChar32 cp)205 CodePointMatcher::CodePointMatcher(UChar32 cp)
206         : fCp(cp) {}
207 
match(StringSegment & segment,ParsedNumber & result,UErrorCode &) const208 bool CodePointMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode&) const {
209     if (segment.startsWith(fCp)) {
210         segment.adjustOffsetByCodePoint();
211         result.setCharsConsumed(segment);
212     }
213     return false;
214 }
215 
smokeTest(const StringSegment & segment) const216 bool CodePointMatcher::smokeTest(const StringSegment& segment) const {
217     return segment.startsWith(fCp);
218 }
219 
toString() const220 UnicodeString CodePointMatcher::toString() const {
221     return u"<CodePoint>";
222 }
223 
224 
fromAffixPattern(const UnicodeString & affixPattern,AffixTokenMatcherWarehouse & tokenWarehouse,parse_flags_t parseFlags,bool * success,UErrorCode & status)225 AffixPatternMatcher AffixPatternMatcher::fromAffixPattern(const UnicodeString& affixPattern,
226                                                           AffixTokenMatcherWarehouse& tokenWarehouse,
227                                                           parse_flags_t parseFlags, bool* success,
228                                                           UErrorCode& status) {
229     if (affixPattern.isEmpty()) {
230         *success = false;
231         return {};
232     }
233     *success = true;
234 
235     IgnorablesMatcher* ignorables;
236     if (0 != (parseFlags & PARSE_FLAG_EXACT_AFFIX)) {
237         ignorables = nullptr;
238     } else {
239         ignorables = &tokenWarehouse.ignorables();
240     }
241 
242     AffixPatternMatcherBuilder builder(affixPattern, tokenWarehouse, ignorables);
243     AffixUtils::iterateWithConsumer(affixPattern, builder, status);
244     return builder.build();
245 }
246 
AffixPatternMatcher(MatcherArray & matchers,int32_t matchersLen,const UnicodeString & pattern)247 AffixPatternMatcher::AffixPatternMatcher(MatcherArray& matchers, int32_t matchersLen,
248                                          const UnicodeString& pattern)
249         : ArraySeriesMatcher(matchers, matchersLen), fPattern(pattern) {}
250 
getPattern() const251 UnicodeString AffixPatternMatcher::getPattern() const {
252     return fPattern.toAliasedUnicodeString();
253 }
254 
operator ==(const AffixPatternMatcher & other) const255 bool AffixPatternMatcher::operator==(const AffixPatternMatcher& other) const {
256     return fPattern == other.fPattern;
257 }
258 
259 
AffixMatcherWarehouse(AffixTokenMatcherWarehouse * tokenWarehouse)260 AffixMatcherWarehouse::AffixMatcherWarehouse(AffixTokenMatcherWarehouse* tokenWarehouse)
261         : fTokenWarehouse(tokenWarehouse) {
262 }
263 
isInteresting(const AffixPatternProvider & patternInfo,const IgnorablesMatcher & ignorables,parse_flags_t parseFlags,UErrorCode & status)264 bool AffixMatcherWarehouse::isInteresting(const AffixPatternProvider& patternInfo,
265                                           const IgnorablesMatcher& ignorables, parse_flags_t parseFlags,
266                                           UErrorCode& status) {
267     UnicodeString posPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_PREFIX);
268     UnicodeString posSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_SUFFIX);
269     UnicodeString negPrefixString;
270     UnicodeString negSuffixString;
271     if (patternInfo.hasNegativeSubpattern()) {
272         negPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_PREFIX);
273         negSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_SUFFIX);
274     }
275 
276     if (0 == (parseFlags & PARSE_FLAG_USE_FULL_AFFIXES) &&
277         AffixUtils::containsOnlySymbolsAndIgnorables(posPrefixString, *ignorables.getSet(), status) &&
278         AffixUtils::containsOnlySymbolsAndIgnorables(posSuffixString, *ignorables.getSet(), status) &&
279         AffixUtils::containsOnlySymbolsAndIgnorables(negPrefixString, *ignorables.getSet(), status) &&
280         AffixUtils::containsOnlySymbolsAndIgnorables(negSuffixString, *ignorables.getSet(), status)
281         // HACK: Plus and minus sign are a special case: we accept them trailing only if they are
282         // trailing in the pattern string.
283         && !AffixUtils::containsType(posSuffixString, TYPE_PLUS_SIGN, status) &&
284         !AffixUtils::containsType(posSuffixString, TYPE_MINUS_SIGN, status) &&
285         !AffixUtils::containsType(negSuffixString, TYPE_PLUS_SIGN, status) &&
286         !AffixUtils::containsType(negSuffixString, TYPE_MINUS_SIGN, status)) {
287         // The affixes contain only symbols and ignorables.
288         // No need to generate affix matchers.
289         return false;
290     }
291     return true;
292 }
293 
createAffixMatchers(const AffixPatternProvider & patternInfo,MutableMatcherCollection & output,const IgnorablesMatcher & ignorables,parse_flags_t parseFlags,UErrorCode & status)294 void AffixMatcherWarehouse::createAffixMatchers(const AffixPatternProvider& patternInfo,
295                                                 MutableMatcherCollection& output,
296                                                 const IgnorablesMatcher& ignorables,
297                                                 parse_flags_t parseFlags, UErrorCode& status) {
298     if (!isInteresting(patternInfo, ignorables, parseFlags, status)) {
299         return;
300     }
301 
302     // The affixes have interesting characters, or we are in strict mode.
303     // Use initial capacity of 6, the highest possible number of AffixMatchers.
304     UnicodeString sb;
305     bool includeUnpaired = 0 != (parseFlags & PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES);
306     UNumberSignDisplay signDisplay = (0 != (parseFlags & PARSE_FLAG_PLUS_SIGN_ALLOWED)) ? UNUM_SIGN_ALWAYS
307                                                                                         : UNUM_SIGN_AUTO;
308 
309     int32_t numAffixMatchers = 0;
310     int32_t numAffixPatternMatchers = 0;
311 
312     AffixPatternMatcher* posPrefix = nullptr;
313     AffixPatternMatcher* posSuffix = nullptr;
314 
315     // Pre-process the affix strings to resolve LDML rules like sign display.
316     for (int8_t signum = 1; signum >= -1; signum--) {
317         // Generate Prefix
318         bool hasPrefix = false;
319         PatternStringUtils::patternInfoToStringBuilder(
320                 patternInfo, true, signum, signDisplay, StandardPlural::OTHER, false, sb);
321         fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern(
322                 sb, *fTokenWarehouse, parseFlags, &hasPrefix, status);
323         AffixPatternMatcher* prefix = hasPrefix ? &fAffixPatternMatchers[numAffixPatternMatchers++]
324                                                 : nullptr;
325 
326         // Generate Suffix
327         bool hasSuffix = false;
328         PatternStringUtils::patternInfoToStringBuilder(
329                 patternInfo, false, signum, signDisplay, StandardPlural::OTHER, false, sb);
330         fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern(
331                 sb, *fTokenWarehouse, parseFlags, &hasSuffix, status);
332         AffixPatternMatcher* suffix = hasSuffix ? &fAffixPatternMatchers[numAffixPatternMatchers++]
333                                                 : nullptr;
334 
335         if (signum == 1) {
336             posPrefix = prefix;
337             posSuffix = suffix;
338         } else if (equals(prefix, posPrefix) && equals(suffix, posSuffix)) {
339             // Skip adding these matchers (we already have equivalents)
340             continue;
341         }
342 
343         // Flags for setting in the ParsedNumber; the token matchers may add more.
344         int flags = (signum == -1) ? FLAG_NEGATIVE : 0;
345 
346         // Note: it is indeed possible for posPrefix and posSuffix to both be null.
347         // We still need to add that matcher for strict mode to work.
348         fAffixMatchers[numAffixMatchers++] = {prefix, suffix, flags};
349         if (includeUnpaired && prefix != nullptr && suffix != nullptr) {
350             // The following if statements are designed to prevent adding two identical matchers.
351             if (signum == 1 || !equals(prefix, posPrefix)) {
352                 fAffixMatchers[numAffixMatchers++] = {prefix, nullptr, flags};
353             }
354             if (signum == 1 || !equals(suffix, posSuffix)) {
355                 fAffixMatchers[numAffixMatchers++] = {nullptr, suffix, flags};
356             }
357         }
358     }
359 
360     // Put the AffixMatchers in order, and then add them to the output.
361     // Since there are at most 9 elements, do a simple-to-implement bubble sort.
362     bool madeChanges;
363     do {
364         madeChanges = false;
365         for (int32_t i = 1; i < numAffixMatchers; i++) {
366             if (fAffixMatchers[i - 1].compareTo(fAffixMatchers[i]) > 0) {
367                 madeChanges = true;
368                 AffixMatcher temp = std::move(fAffixMatchers[i - 1]);
369                 fAffixMatchers[i - 1] = std::move(fAffixMatchers[i]);
370                 fAffixMatchers[i] = std::move(temp);
371             }
372         }
373     } while (madeChanges);
374 
375     for (int32_t i = 0; i < numAffixMatchers; i++) {
376         // Enable the following line to debug affixes
377         //std::cout << "Adding affix matcher: " << CStr(fAffixMatchers[i].toString())() << std::endl;
378         output.addMatcher(fAffixMatchers[i]);
379     }
380 }
381 
382 
AffixMatcher(AffixPatternMatcher * prefix,AffixPatternMatcher * suffix,result_flags_t flags)383 AffixMatcher::AffixMatcher(AffixPatternMatcher* prefix, AffixPatternMatcher* suffix, result_flags_t flags)
384         : fPrefix(prefix), fSuffix(suffix), fFlags(flags) {}
385 
match(StringSegment & segment,ParsedNumber & result,UErrorCode & status) const386 bool AffixMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
387     if (!result.seenNumber()) {
388         // Prefix
389         // Do not match if:
390         // 1. We have already seen a prefix (result.prefix != null)
391         // 2. The prefix in this AffixMatcher is empty (prefix == null)
392         if (!result.prefix.isBogus() || fPrefix == nullptr) {
393             return false;
394         }
395 
396         // Attempt to match the prefix.
397         int initialOffset = segment.getOffset();
398         bool maybeMore = fPrefix->match(segment, result, status);
399         if (initialOffset != segment.getOffset()) {
400             result.prefix = fPrefix->getPattern();
401         }
402         return maybeMore;
403 
404     } else {
405         // Suffix
406         // Do not match if:
407         // 1. We have already seen a suffix (result.suffix != null)
408         // 2. The suffix in this AffixMatcher is empty (suffix == null)
409         // 3. The matched prefix does not equal this AffixMatcher's prefix
410         if (!result.suffix.isBogus() || fSuffix == nullptr || !matched(fPrefix, result.prefix)) {
411             return false;
412         }
413 
414         // Attempt to match the suffix.
415         int initialOffset = segment.getOffset();
416         bool maybeMore = fSuffix->match(segment, result, status);
417         if (initialOffset != segment.getOffset()) {
418             result.suffix = fSuffix->getPattern();
419         }
420         return maybeMore;
421     }
422 }
423 
smokeTest(const StringSegment & segment) const424 bool AffixMatcher::smokeTest(const StringSegment& segment) const {
425     return (fPrefix != nullptr && fPrefix->smokeTest(segment)) ||
426            (fSuffix != nullptr && fSuffix->smokeTest(segment));
427 }
428 
postProcess(ParsedNumber & result) const429 void AffixMatcher::postProcess(ParsedNumber& result) const {
430     // Check to see if our affix is the one that was matched. If so, set the flags in the result.
431     if (matched(fPrefix, result.prefix) && matched(fSuffix, result.suffix)) {
432         // Fill in the result prefix and suffix with non-null values (empty string).
433         // Used by strict mode to determine whether an entire affix pair was matched.
434         if (result.prefix.isBogus()) {
435             result.prefix = UnicodeString();
436         }
437         if (result.suffix.isBogus()) {
438             result.suffix = UnicodeString();
439         }
440         result.flags |= fFlags;
441         if (fPrefix != nullptr) {
442             fPrefix->postProcess(result);
443         }
444         if (fSuffix != nullptr) {
445             fSuffix->postProcess(result);
446         }
447     }
448 }
449 
compareTo(const AffixMatcher & rhs) const450 int8_t AffixMatcher::compareTo(const AffixMatcher& rhs) const {
451     const AffixMatcher& lhs = *this;
452     if (length(lhs.fPrefix) != length(rhs.fPrefix)) {
453         return length(lhs.fPrefix) > length(rhs.fPrefix) ? -1 : 1;
454     } else if (length(lhs.fSuffix) != length(rhs.fSuffix)) {
455         return length(lhs.fSuffix) > length(rhs.fSuffix) ? -1 : 1;
456     } else {
457         return 0;
458     }
459 }
460 
toString() const461 UnicodeString AffixMatcher::toString() const {
462     bool isNegative = 0 != (fFlags & FLAG_NEGATIVE);
463     return UnicodeString(u"<Affix") + (isNegative ? u":negative " : u" ") +
464            (fPrefix ? fPrefix->getPattern() : u"null") + u"#" +
465            (fSuffix ? fSuffix->getPattern() : u"null") + u">";
466 
467 }
468 
469 
470 #endif /* #if !UCONFIG_NO_FORMATTING */
471 
472 
473 
474 
475 
476 
477 
478 
479 
480 
481 
482 
483 
484 
485 
486 
487 
488 
489 
490 
491 
492 
493 
494 
495 
496