1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "annotator/number/number_test-include.h"
18 
19 #include <string>
20 #include <vector>
21 
22 #include "annotator/collections.h"
23 #include "annotator/model_generated.h"
24 #include "annotator/types-test-util.h"
25 #include "annotator/types.h"
26 #include "utils/tokenizer-utils.h"
27 #include "utils/utf8/unicodetext.h"
28 #include "gmock/gmock.h"
29 #include "gtest/gtest.h"
30 
31 namespace libtextclassifier3 {
32 namespace test_internal {
33 
34 using ::testing::AllOf;
35 using ::testing::ElementsAre;
36 using ::testing::Field;
37 using ::testing::Matcher;
38 using ::testing::UnorderedElementsAre;
39 
40 const NumberAnnotatorOptions*
TestingNumberAnnotatorOptions()41 NumberAnnotatorTest::TestingNumberAnnotatorOptions() {
42   static const flatbuffers::DetachedBuffer* options_data = []() {
43     NumberAnnotatorOptionsT options;
44     options.enabled = true;
45     options.priority_score = -10.0;
46     options.float_number_priority_score = 1.0;
47     options.enabled_annotation_usecases =
48         1 << AnnotationUsecase_ANNOTATION_USECASE_RAW;
49     options.max_number_of_digits = 20;
50 
51     options.percentage_priority_score = 1.0;
52     options.percentage_annotation_usecases =
53         (1 << AnnotationUsecase_ANNOTATION_USECASE_RAW) +
54         (1 << AnnotationUsecase_ANNOTATION_USECASE_SMART);
55     std::set<std::string> percent_suffixes({"パーセント", "percent", "pércént",
56                                             "pc", "pct", "%", "٪", "﹪", "%"});
57     for (const std::string& string_value : percent_suffixes) {
58       options.percentage_pieces_string.append(string_value);
59       options.percentage_pieces_string.push_back('\0');
60     }
61 
62     flatbuffers::FlatBufferBuilder builder;
63     builder.Finish(NumberAnnotatorOptions::Pack(builder, &options));
64     return new flatbuffers::DetachedBuffer(builder.Release());
65   }();
66 
67   return flatbuffers::GetRoot<NumberAnnotatorOptions>(options_data->data());
68 }
69 
70 MATCHER_P(IsCorrectCollection, collection, "collection is " + collection) {
71   return arg.collection == collection;
72 }
73 
74 MATCHER_P(IsCorrectNumericValue, numeric_value,
75           "numeric value is " + std::to_string(numeric_value)) {
76   return arg.numeric_value == numeric_value;
77 }
78 
79 MATCHER_P(IsCorrectNumericDoubleValue, numeric_double_value,
80           "numeric double value is " + std::to_string(numeric_double_value)) {
81   return arg.numeric_double_value == numeric_double_value;
82 }
83 
84 MATCHER_P(IsCorrectScore, score, "score is " + std::to_string(score)) {
85   return arg.score == score;
86 }
87 
88 MATCHER_P(IsCorrectPriortyScore, priority_score,
89           "priority score is " + std::to_string(priority_score)) {
90   return arg.priority_score == priority_score;
91 }
92 
93 MATCHER_P(IsCorrectSpan, span,
94           "span is (" + std::to_string(span.first) + "," +
95               std::to_string(span.second) + ")") {
96   return arg.span == span;
97 }
98 
99 MATCHER_P(Classification, inner, "") {
100   return testing::ExplainMatchResult(inner, arg.classification,
101                                      result_listener);
102 }
103 
IsAnnotatedSpan(const CodepointSpan & codepoint_span,const std::string & collection,const int int_value,const double double_value,const float priority_score=-10,const float score=1)104 static Matcher<AnnotatedSpan> IsAnnotatedSpan(
105     const CodepointSpan& codepoint_span, const std::string& collection,
106     const int int_value, const double double_value,
107     const float priority_score = -10, const float score = 1) {
108   return AllOf(
109       IsCorrectSpan(codepoint_span),
110       Classification(ElementsAre(AllOf(
111           IsCorrectCollection(collection), IsCorrectNumericValue(int_value),
112           IsCorrectNumericDoubleValue(double_value), IsCorrectScore(score),
113           IsCorrectPriortyScore(priority_score)))));
114 }
115 
// An integer selected inside surrounding text is classified as "number" and
// both its integer and floating-point values are filled in.
TEST_F(NumberAnnotatorTest, ClassifiesAndParsesNumberCorrectly) {
  const auto context = UTF8ToUnicodeText("... 12345 ...");
  ClassificationResult parsed;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {4, 9}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  EXPECT_EQ(parsed.collection, "number");
  EXPECT_EQ(parsed.numeric_value, 12345);
  EXPECT_FLOAT_EQ(parsed.numeric_double_value, 12345);
}
126 
// A decimal number is classified as "number"; the integer value holds the
// whole part and the double value holds the full decimal.
TEST_F(NumberAnnotatorTest, ClassifiesAndParsesNumberAsFloatCorrectly) {
  const auto context = UTF8ToUnicodeText("... 12345.12345 ...");
  ClassificationResult parsed;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {4, 15}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  EXPECT_EQ(parsed.collection, "number");
  EXPECT_EQ(parsed.numeric_value, 12345);
  EXPECT_FLOAT_EQ(parsed.numeric_double_value, 12345.12345);
}
137 
// The dot after a number is considered punctuation, not part of a floating
// number: selecting "12345" classifies as the integer 12345, while selecting
// "12345." (dot included) is rejected.
TEST_F(NumberAnnotatorTest,
       ClassifiesAndParsesNumberAsFloatCorrectlyWithoutDecimals) {
  const auto context = UTF8ToUnicodeText("... 12345. ...");

  // Selection without the dot. The parsed values are checked immediately
  // after the successful call so that no later call can influence them.
  ClassificationResult classification_result;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {4, 9}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification_result));
  EXPECT_EQ(classification_result.collection, "number");
  EXPECT_EQ(classification_result.numeric_value, 12345);
  EXPECT_FLOAT_EQ(classification_result.numeric_double_value, 12345);

  // Selection including the dot is rejected. A dedicated result object is
  // used so the rejected call cannot touch the values verified above.
  ClassificationResult rejected_result;
  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {4, 10}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &rejected_result));

  // Classifying the dot-free selection again after the rejection still
  // succeeds and yields the same values.
  ClassificationResult repeated_result;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {4, 9}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &repeated_result));
  EXPECT_EQ(repeated_result.collection, "number");
  EXPECT_EQ(repeated_result.numeric_value, 12345);
  EXPECT_FLOAT_EQ(repeated_result.numeric_double_value, 12345);
}
161 
// FindAll reports integers, floats and percentages found in free text.
TEST_F(NumberAnnotatorTest, FindsAllIntegerAndFloatNumbersInText) {
  // "68.9#"  -> 68.9 is a number because a single punctuation sign follows.
  // "68.9#?" -> 68.9 is not a number because two punctuation signs follow.
  const auto context = UTF8ToUnicodeText(
      "how much is 2 plus 5 divided by 7% minus 3.14 "
      "what about 68.9# or 68.9#?");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(
      annotations,
      UnorderedElementsAre(
          IsAnnotatedSpan(CodepointSpan(12, 13), "number",
                          /*int_value=*/2, /*double_value=*/2.0),
          IsAnnotatedSpan(CodepointSpan(19, 20), "number",
                          /*int_value=*/5, /*double_value=*/5.0),
          IsAnnotatedSpan(CodepointSpan(32, 33), "number",
                          /*int_value=*/7, /*double_value=*/7.0),
          IsAnnotatedSpan(CodepointSpan(32, 34), "percentage",
                          /*int_value=*/7, /*double_value=*/7.0,
                          /*priority_score=*/1),
          IsAnnotatedSpan(CodepointSpan(41, 45), "number",
                          /*int_value=*/3, /*double_value=*/3.14,
                          /*priority_score=*/1),
          IsAnnotatedSpan(CodepointSpan(57, 61), "number",
                          /*int_value=*/68, /*double_value=*/68.9,
                          /*priority_score=*/1)));
}
190 
// Selections that are not well-formed numbers are rejected.
TEST_F(NumberAnnotatorTest, ClassifiesNonNumberCorrectly) {
  ClassificationResult unused_result;

  // Letter in the middle of the digits.
  const auto letter_inside = UTF8ToUnicodeText("... 123a45 ...");
  EXPECT_FALSE(number_annotator_.ClassifyText(
      letter_inside, {4, 10}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &unused_result));

  // Two consecutive dots between the digit groups.
  const auto double_dot = UTF8ToUnicodeText("... 12345..12345 ...");
  EXPECT_FALSE(number_annotator_.ClassifyText(
      double_dot, {4, 16}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &unused_result));

  // Letter directly after the digits.
  const auto letter_after = UTF8ToUnicodeText("... 12345a ...");
  EXPECT_FALSE(number_annotator_.ClassifyText(
      letter_after, {4, 11}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &unused_result));
}
203 
// Punctuation after a number is not part of the number: the selection must
// end before the comma to be classified.
TEST_F(NumberAnnotatorTest, ClassifiesNumberSelectionCorrectly) {
  const auto context = UTF8ToUnicodeText("... 14, ...");
  ClassificationResult parsed;

  // Selection covering only the digits.
  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {4, 6}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
  EXPECT_EQ(parsed.collection, "number");
  EXPECT_EQ(parsed.numeric_value, 14);
  EXPECT_EQ(parsed.numeric_double_value, 14);

  // Selection that also covers the comma.
  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {4, 7}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
}
218 
// A number immediately followed by "%" is classified as "percentage".
TEST_F(NumberAnnotatorTest, ClassifiesPercentageSignCorrectly) {
  const auto context = UTF8ToUnicodeText("... 99% ...");
  ClassificationResult parsed;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {4, 7}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  EXPECT_EQ(parsed.collection, "percentage");
  EXPECT_EQ(parsed.numeric_value, 99);
  EXPECT_EQ(parsed.numeric_double_value, 99);
}
229 
// A number followed by the word "percent" is classified as "percentage".
TEST_F(NumberAnnotatorTest, ClassifiesPercentageWordCorrectly) {
  const auto context = UTF8ToUnicodeText("... 15 percent ...");
  ClassificationResult parsed;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {4, 14}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  EXPECT_EQ(parsed.collection, "percentage");
  EXPECT_EQ(parsed.numeric_value, 15);
  EXPECT_EQ(parsed.numeric_double_value, 15);
}
240 
// A number followed by a non-ASCII word that is not a configured percent
// suffix is rejected.
TEST_F(NumberAnnotatorTest, ClassifiesNonAsciiPercentageIncorrectSuffix) {
  const auto context = UTF8ToUnicodeText("15 café");
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 7}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &unused_result));
}
247 
// A number followed by the accented suffix "pércént" (one of the configured
// percent suffixes) is classified as "percentage".
TEST_F(NumberAnnotatorTest, ClassifiesNonAsciiFrPercentageCorrectSuffix) {
  const auto context = UTF8ToUnicodeText("25 pércént");
  ClassificationResult parsed;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {0, 10}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  EXPECT_EQ(parsed.collection, "percentage");
  EXPECT_EQ(parsed.numeric_value, 25);
  EXPECT_EQ(parsed.numeric_double_value, 25);
}
258 
// The Japanese suffix "パーセント" is recognised both when classifying a
// selection and when scanning a whole sentence.
TEST_F(NumberAnnotatorTest, ClassifiesNonAsciiJaPercentageCorrectSuffix) {
  // Single selection: number directly followed by the suffix.
  const auto selection_text = UTF8ToUnicodeText("10パーセント");
  ClassificationResult parsed;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      selection_text, {0, 7}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &parsed));
  EXPECT_EQ(parsed.collection, "percentage");
  EXPECT_EQ(parsed.numeric_value, 10);
  EXPECT_EQ(parsed.numeric_double_value, 10);

  // Whole sentence: the percentage, its bare number, and a second number.
  const auto sentence =
      UTF8ToUnicodeText("明日の降水確率は10パーセント  音量を12にセット");
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(number_annotator_.FindAll(
      sentence, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));
  EXPECT_THAT(
      annotations,
      UnorderedElementsAre(
          IsAnnotatedSpan(CodepointSpan(8, 10), "number",
                          /*int_value=*/10, /*double_value=*/10.0),
          IsAnnotatedSpan(CodepointSpan(8, 15), "percentage",
                          /*int_value=*/10, /*double_value=*/10.0,
                          /*priority_score=*/1),
          IsAnnotatedSpan(CodepointSpan(20, 22), "number",
                          /*int_value=*/12, /*double_value=*/12.0)));
}
282 
// FindAll reports every valid number and percentage in the text; "$99." is
// expected to produce no annotation.
TEST_F(NumberAnnotatorTest, FindsAllNumbersInText) {
  const auto context = UTF8ToUnicodeText(
      "... 12345 ... 9 is my number and 27% or 68# #38 #39 "
      "but not $99.");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(
      annotations,
      UnorderedElementsAre(
          IsAnnotatedSpan(CodepointSpan(4, 9), "number",
                          /*int_value=*/12345, /*double_value=*/12345.0),
          IsAnnotatedSpan(CodepointSpan(14, 15), "number",
                          /*int_value=*/9, /*double_value=*/9.0),
          IsAnnotatedSpan(CodepointSpan(33, 35), "number",
                          /*int_value=*/27, /*double_value=*/27.0),
          IsAnnotatedSpan(CodepointSpan(33, 36), "percentage",
                          /*int_value=*/27, /*double_value=*/27.0,
                          /*priority_score=*/1),
          IsAnnotatedSpan(CodepointSpan(40, 42), "number",
                          /*int_value=*/68, /*double_value=*/68.0),
          IsAnnotatedSpan(CodepointSpan(45, 47), "number",
                          /*int_value=*/38, /*double_value=*/38.0),
          IsAnnotatedSpan(CodepointSpan(49, 51), "number",
                          /*int_value=*/39, /*double_value=*/39.0)));
}
309 
// FindAll succeeds but reports nothing when the text contains only malformed
// numbers.
TEST_F(NumberAnnotatorTest, FindsNoNumberInText) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("... 12345a ... 12345..12345 and 123a45 are not valid. "
                        "And -#5% is also bad."),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
  // empty() avoids comparing the unsigned size() with a signed literal, which
  // triggers sign-compare warnings inside the gtest comparison helpers.
  EXPECT_TRUE(result.empty());
}
318 
// Numbers followed by a single punctuation sign are found; a leading minus is
// included in the annotated span.
TEST_F(NumberAnnotatorTest, FindsNumberWithPunctuation) {
  // A number may be followed by only one punctuation sign, so 15 (followed by
  // "???") is not a number.
  const auto context = UTF8ToUnicodeText(
      "It's 12, 13, 14! Or 15??? For sure 16: 17; 18. and -19");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(
      annotations,
      UnorderedElementsAre(
          IsAnnotatedSpan(CodepointSpan(5, 7), "number",
                          /*int_value=*/12, /*double_value=*/12.0),
          IsAnnotatedSpan(CodepointSpan(9, 11), "number",
                          /*int_value=*/13, /*double_value=*/13.0),
          IsAnnotatedSpan(CodepointSpan(13, 15), "number",
                          /*int_value=*/14, /*double_value=*/14.0),
          IsAnnotatedSpan(CodepointSpan(35, 37), "number",
                          /*int_value=*/16, /*double_value=*/16.0),
          IsAnnotatedSpan(CodepointSpan(39, 41), "number",
                          /*int_value=*/17, /*double_value=*/17.0),
          IsAnnotatedSpan(CodepointSpan(43, 45), "number",
                          /*int_value=*/18, /*double_value=*/18.0),
          IsAnnotatedSpan(CodepointSpan(51, 54), "number",
                          /*int_value=*/-19, /*double_value=*/-19.0)));
}
345 
// Decimal numbers followed by a single punctuation sign are found and carry
// the float priority score.
TEST_F(NumberAnnotatorTest, FindsFloatNumberWithPunctuation) {
  const auto context = UTF8ToUnicodeText(
      "It's 12.123, 13.45, 14.54321! Or 15.1? Maybe 16.33: "
      "17.21; but for sure 18.90.");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(
      annotations,
      UnorderedElementsAre(
          IsAnnotatedSpan(CodepointSpan(5, 11), "number",
                          /*int_value=*/12, /*double_value=*/12.123,
                          /*priority_score=*/1),
          IsAnnotatedSpan(CodepointSpan(13, 18), "number",
                          /*int_value=*/13, /*double_value=*/13.45,
                          /*priority_score=*/1),
          IsAnnotatedSpan(CodepointSpan(20, 28), "number",
                          /*int_value=*/14, /*double_value=*/14.54321,
                          /*priority_score=*/1),
          IsAnnotatedSpan(CodepointSpan(33, 37), "number",
                          /*int_value=*/15, /*double_value=*/15.1,
                          /*priority_score=*/1),
          IsAnnotatedSpan(CodepointSpan(45, 50), "number",
                          /*int_value=*/16, /*double_value=*/16.33,
                          /*priority_score=*/1),
          IsAnnotatedSpan(CodepointSpan(52, 57), "number",
                          /*int_value=*/17, /*double_value=*/17.21,
                          /*priority_score=*/1),
          IsAnnotatedSpan(CodepointSpan(72, 77), "number",
                          /*int_value=*/18, /*double_value=*/18.9,
                          /*priority_score=*/1)));
}
377 
// A signed number that starts at the very beginning of the text is found.
TEST_F(NumberAnnotatorTest, HandlesNumbersAtBeginning) {
  const auto context = UTF8ToUnicodeText("-5");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(IsAnnotatedSpan(
                  CodepointSpan(0, 2), "number",
                  /*int_value=*/-5, /*double_value=*/-5)));
}
388 
// Single-minus numbers and percentages are found; "--5%" produces nothing.
TEST_F(NumberAnnotatorTest, HandlesNegativeNumbers) {
  const auto context =
      UTF8ToUnicodeText("Number -5 and -5% and not number --5%");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(
      annotations,
      UnorderedElementsAre(
          IsAnnotatedSpan(CodepointSpan(7, 9), "number",
                          /*int_value=*/-5, /*double_value=*/-5),
          IsAnnotatedSpan(CodepointSpan(14, 16), "number",
                          /*int_value=*/-5, /*double_value=*/-5),
          IsAnnotatedSpan(CodepointSpan(14, 17), "percentage",
                          /*int_value=*/-5, /*double_value=*/-5,
                          /*priority_score=*/1)));
}
405 
// Every well-formed percentage yields two annotations: the bare number and
// the number together with its percent suffix.
TEST_F(NumberAnnotatorTest, FindGoodPercentageContexts) {
  const auto context = UTF8ToUnicodeText(
      "5 percent, 10 pct, 25 pc and 17%, -5 percent, 10% are percentages");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(
      annotations,
      UnorderedElementsAre(
          // "5 percent"
          IsAnnotatedSpan(CodepointSpan(0, 1), "number",
                          /*int_value=*/5, /*double_value=*/5),
          IsAnnotatedSpan(CodepointSpan(0, 9), "percentage",
                          /*int_value=*/5, /*double_value=*/5,
                          /*priority_score=*/1),
          // "10 pct"
          IsAnnotatedSpan(CodepointSpan(11, 13), "number",
                          /*int_value=*/10, /*double_value=*/10),
          IsAnnotatedSpan(CodepointSpan(11, 17), "percentage",
                          /*int_value=*/10, /*double_value=*/10,
                          /*priority_score=*/1),
          // "25 pc"
          IsAnnotatedSpan(CodepointSpan(19, 21), "number",
                          /*int_value=*/25, /*double_value=*/25),
          IsAnnotatedSpan(CodepointSpan(19, 24), "percentage",
                          /*int_value=*/25, /*double_value=*/25,
                          /*priority_score=*/1),
          // "17%"
          IsAnnotatedSpan(CodepointSpan(29, 31), "number",
                          /*int_value=*/17, /*double_value=*/17),
          IsAnnotatedSpan(CodepointSpan(29, 32), "percentage",
                          /*int_value=*/17, /*double_value=*/17,
                          /*priority_score=*/1),
          // "-5 percent"
          IsAnnotatedSpan(CodepointSpan(34, 36), "number",
                          /*int_value=*/-5, /*double_value=*/-5),
          IsAnnotatedSpan(CodepointSpan(34, 44), "percentage",
                          /*int_value=*/-5, /*double_value=*/-5,
                          /*priority_score=*/1),
          // "10%"
          IsAnnotatedSpan(CodepointSpan(46, 48), "number",
                          /*int_value=*/10, /*double_value=*/10),
          IsAnnotatedSpan(CodepointSpan(46, 49), "percentage",
                          /*int_value=*/10, /*double_value=*/10,
                          /*priority_score=*/1)));
}
446 
// A text consisting only of "5%" yields the number and the percentage.
TEST_F(NumberAnnotatorTest, FindSinglePercentageInContext) {
  const auto context = UTF8ToUnicodeText("5%");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(
      annotations,
      UnorderedElementsAre(
          IsAnnotatedSpan(CodepointSpan(0, 1), "number",
                          /*int_value=*/5, /*double_value=*/5),
          IsAnnotatedSpan(CodepointSpan(0, 2), "percentage",
                          /*int_value=*/5, /*double_value=*/5,
                          /*priority_score=*/1)));
}
460 
// Malformed percentage contexts yield at most the bare number and never a
// percentage annotation.
TEST_F(NumberAnnotatorTest, IgnoreBadPercentageContexts) {
  // A valid number is followed by only one punctuation element, so "5#:" is
  // not annotated at all.
  const auto context = UTF8ToUnicodeText(
      "10, pct, 25 prc, 5#: percentage are not percentages");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(
      annotations,
      UnorderedElementsAre(
          IsAnnotatedSpan(CodepointSpan(0, 2), "number",
                          /*int_value=*/10, /*double_value=*/10),
          IsAnnotatedSpan(CodepointSpan(9, 11), "number",
                          /*int_value=*/25, /*double_value=*/25)));
}
475 
// Numbers preceded by several punctuation signs yield no annotation, neither
// as numbers nor as percentages.
TEST_F(NumberAnnotatorTest, IgnoreBadPercentagePunctuationContexts) {
  const auto context = UTF8ToUnicodeText(
      "#!24% or :?33 percent are not valid percentages, nor numbers.");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_TRUE(annotations.empty());
}
485 
// Expected spans are given in codepoints, so they stay correct when a
// non-ASCII character ("é") precedes the numbers.
TEST_F(NumberAnnotatorTest, FindPercentageInNonAsciiContext) {
  const auto context = UTF8ToUnicodeText(
      "At the café 10% or 25 percent of people are nice. Only 10%!");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      context, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(
      annotations,
      UnorderedElementsAre(
          // First "10%"
          IsAnnotatedSpan(CodepointSpan(12, 14), "number",
                          /*int_value=*/10, /*double_value=*/10),
          IsAnnotatedSpan(CodepointSpan(12, 15), "percentage",
                          /*int_value=*/10, /*double_value=*/10,
                          /*priority_score=*/1),
          // "25 percent"
          IsAnnotatedSpan(CodepointSpan(19, 21), "number",
                          /*int_value=*/25, /*double_value=*/25),
          IsAnnotatedSpan(CodepointSpan(19, 29), "percentage",
                          /*int_value=*/25, /*double_value=*/25,
                          /*priority_score=*/1),
          // Trailing "10%" before the "!"
          IsAnnotatedSpan(CodepointSpan(55, 57), "number",
                          /*int_value=*/10, /*double_value=*/10),
          IsAnnotatedSpan(CodepointSpan(55, 58), "percentage",
                          /*int_value=*/10, /*double_value=*/10,
                          /*priority_score=*/1)));
}
511 
// Several punctuation signs between the number and the percent word make the
// whole selection unclassifiable.
TEST_F(NumberAnnotatorTest,
       WhenPercentSuffixWithAdditionalIgnoredCharactersDoesNotParseIt) {
  const auto context = UTF8ToUnicodeText("23#!? percent");
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 13}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &unused_result));
}
519 
// A selection mixing numbers, a percent suffix and unrelated tokens is
// rejected as a whole.
TEST_F(NumberAnnotatorTest,
       WhenPercentSuffixWithAdditionalRandomTokensDoesNotParseIt) {
  const auto context = UTF8ToUnicodeText("23 asdf 3.14 pct asdf");
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 21}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &unused_result));
}
527 
// Letters glued to the front of the number and to the end of the percent
// word make the selection unclassifiable.
TEST_F(NumberAnnotatorTest,
       WhenPercentSuffixWithAdditionalRandomPrefixSuffixDoesNotParseIt) {
  const auto context = UTF8ToUnicodeText("abdf23 percentabdf");
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 18}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &unused_result));
}
535 
// Runs of punctuation on both sides of "23 percent" make the selection
// unclassifiable.
TEST_F(NumberAnnotatorTest,
       WhenPercentSuffixWithAdditionalRandomStringsDoesNotParsesIt) {
  const auto context = UTF8ToUnicodeText("#?!23 percent#!?");
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 16}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &unused_result));
}
543 
// A selection containing both the percent symbol and the percent word is
// rejected.
TEST_F(NumberAnnotatorTest, WhenBothPercentSymbolAndSuffixDoesNotParseIt) {
  const auto context = UTF8ToUnicodeText("23% percent");
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 11}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &unused_result));
}
550 
// Punctuation in front of the number inside the selection prevents "23%"
// from being classified.
TEST_F(NumberAnnotatorTest,
       WhenPercentSymbolWithAdditionalPrefixCharactersDoesNotParsesIt) {
  const auto context = UTF8ToUnicodeText("#?23%");
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 5}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &unused_result));
}
558 
// A number with trailing punctuation inside the selection is rejected.
TEST_F(NumberAnnotatorTest, WhenNumberWithAdditionalCharactersDoesNotParsesIt) {
  const auto context = UTF8ToUnicodeText("23#!?");
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 5}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &unused_result));
}
565 
// "23%" is a percentage, but a selection that also covers the following "!"
// is rejected.
TEST_F(NumberAnnotatorTest,
       WhenPercentSymbolWithAdditionalCharactersDoesNotParsesIt) {
  const auto context = UTF8ToUnicodeText("23%!");
  ClassificationResult parsed;

  // The "!" does not belong to the percentage annotation.
  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {0, 3}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
  EXPECT_EQ(parsed.collection, "percentage");
  EXPECT_EQ(parsed.numeric_value, 23);
  EXPECT_EQ(parsed.numeric_double_value, 23);

  // Selection extended over the "!".
  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
}
581 
// A percent symbol separated from the number by several punctuation signs
// does not form a percentage, and the selection is rejected.
TEST_F(NumberAnnotatorTest,
       WhenAdditionalCharactersWithMisplacedPercentSymbolDoesNotParsesIt) {
  const auto context = UTF8ToUnicodeText("23.:;%");
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 6}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &unused_result));
}
589 
// With "--11", the selection starting at the second minus parses as -11,
// while the selection covering both minus signs is rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleMinusSignsDoesNotParsesIt) {
  const auto context = UTF8ToUnicodeText("--11");
  ClassificationResult parsed;

  // Selection "-11" (first minus excluded).
  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {1, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
  EXPECT_EQ(parsed.collection, "number");
  EXPECT_EQ(parsed.numeric_value, -11);
  EXPECT_EQ(parsed.numeric_double_value, -11);

  // Selection "--11".
  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
}
604 
// Percentage variant of the double-minus case: "-11%" inside "--11%" is a
// percentage, while the full "--11%" selection is rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleMinusSignsPercentSignDoesNotParsesIt) {
  constexpr char kText[] = "--11%";
  ClassificationResult parsed;

  // Selection "-11%".
  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kText), CodepointSpan(1, 5),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
  EXPECT_EQ(parsed.collection, "percentage");
  EXPECT_EQ(parsed.numeric_value, -11);
  EXPECT_EQ(parsed.numeric_double_value, -11);

  // Selection "--11%".
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kText), CodepointSpan(0, 5),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
}
619 
// For "+-11", the "-11" part is classified as a number, but the selection
// that also includes the leading '+' is rejected.
TEST_F(NumberAnnotatorTest, WhenPlusMinusSignsDoesNotParsesIt) {
  constexpr char kText[] = "+-11";
  ClassificationResult parsed;

  // Selection "-11".
  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kText), CodepointSpan(1, 4),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
  EXPECT_EQ(parsed.collection, "number");
  EXPECT_EQ(parsed.numeric_value, -11);
  EXPECT_EQ(parsed.numeric_double_value, -11);

  // Selection "+-11".
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kText), CodepointSpan(0, 4),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
}
634 
// For "-+11", neither the "+11" selection nor the full string is classified.
TEST_F(NumberAnnotatorTest, WhenMinusPlusSignsDoesNotParsesIt) {
  constexpr char kText[] = "-+11";
  ClassificationResult ignored_result;

  // A '+' right before a number is not included in the number annotation, so
  // the selection starting at the '+' is rejected.
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kText), CodepointSpan(1, 4),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &ignored_result));

  // The selection starting at the '-' is rejected as well.
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kText), CodepointSpan(0, 4),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &ignored_result));
}
645 
// A minus sign after the digits ("10-") prevents classification of the whole
// selection.
TEST_F(NumberAnnotatorTest, WhenMinusSignSuffixDoesNotParsesIt) {
  constexpr char kText[] = "10-";
  ClassificationResult ignored_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kText), CodepointSpan(0, 3),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &ignored_result));
}
652 
// For "10**", only the bare "10" selection is classified; selections that
// include one or both '*' characters are rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleCharSuffixDoesNotParsesIt) {
  constexpr char kText[] = "10**";
  ClassificationResult parsed;

  // Selection "10".
  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kText), CodepointSpan(0, 2),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
  EXPECT_EQ(parsed.collection, "number");
  EXPECT_EQ(parsed.numeric_value, 10);
  EXPECT_EQ(parsed.numeric_double_value, 10);

  // Selection "10*".
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kText), CodepointSpan(0, 3),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  // Selection "10**".
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kText), CodepointSpan(0, 4),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
}
670 
// For "**10", selections that include one or both leading '*' characters are
// rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleCharPrefixDoesNotParsesIt) {
  constexpr char kText[] = "**10";
  ClassificationResult ignored_result;

  // Selection "*10".
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kText), CodepointSpan(1, 4),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &ignored_result));

  // Selection "**10".
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kText), CodepointSpan(0, 4),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &ignored_result));
}
680 
// "-1000000000" is classified as a number with matching integer and double
// values.
TEST_F(NumberAnnotatorTest, WhenLowestSupportedNumberParsesIt) {
  constexpr char kText[] = "-1000000000";
  ClassificationResult parsed;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kText), CodepointSpan(0, 11),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  EXPECT_EQ(parsed.collection, "number");
  EXPECT_EQ(parsed.numeric_value, -1000000000);
  EXPECT_EQ(parsed.numeric_double_value, -1000000000);
}
693 
// "1000000000" is classified as a number with matching integer and double
// values.
TEST_F(NumberAnnotatorTest, WhenLargestSupportedNumberParsesIt) {
  constexpr char kText[] = "1000000000";
  ClassificationResult parsed;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kText), CodepointSpan(0, 10),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  EXPECT_EQ(parsed.collection, "number");
  EXPECT_EQ(parsed.numeric_value, 1000000000);
  EXPECT_EQ(parsed.numeric_double_value, 1000000000);
}
706 
// "-999999999.999999999" is classified as a number. The expected integer
// value is -1000000000, while the double value equals the literal itself.
TEST_F(NumberAnnotatorTest, WhenLowestSupportedFloatNumberParsesIt) {
  constexpr char kText[] = "-999999999.999999999";
  ClassificationResult parsed;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kText), CodepointSpan(0, 20),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  EXPECT_EQ(parsed.collection, "number");
  EXPECT_EQ(parsed.numeric_value, -1000000000);
  EXPECT_EQ(parsed.numeric_double_value, -999999999.999999999);
}
719 
// "999999999.999999999" is classified as a number. The expected integer value
// is 1000000000, while the double value equals the literal itself.
TEST_F(NumberAnnotatorTest, WhenLargestFloatSupportedNumberParsesIt) {
  constexpr char kText[] = "999999999.999999999";
  ClassificationResult parsed;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kText), CodepointSpan(0, 19),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  EXPECT_EQ(parsed.collection, "number");
  EXPECT_EQ(parsed.numeric_value, 1000000000);
  EXPECT_EQ(parsed.numeric_double_value, 999999999.999999999);
}
732 
// A 40-digit number is not classified (the test options set
// max_number_of_digits to 20).
TEST_F(NumberAnnotatorTest, WhenLargeNumberDoesNotParseIt) {
  constexpr char kText[] = "1234567890123456789012345678901234567890";
  ClassificationResult ignored_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kText), CodepointSpan(0, 40),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &ignored_result));
}
739 
// Two digit groups joined by a minus sign ("2016-2017") are not classified
// when selected as a whole.
TEST_F(NumberAnnotatorTest, WhenMinusInTheMiddleDoesNotParseIt) {
  constexpr char kText[] = "2016-2017";
  ClassificationResult ignored_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kText), CodepointSpan(0, 9),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &ignored_result));
}
746 
// A lone '%' with no number in front of it produces no annotations, although
// FindAll itself still succeeds.
TEST_F(NumberAnnotatorTest, WhenSuffixWithoutNumberDoesNotParseIt) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("... % ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &result));

  // IsEmpty() avoids comparing the unsigned size() with an int literal and
  // makes a failure report the unexpected spans instead of only their count.
  ASSERT_THAT(result, ::testing::IsEmpty());
}
755 
// A lone '$' with no number after it produces no annotations, although
// FindAll itself still succeeds.
TEST_F(NumberAnnotatorTest, WhenPrefixWithoutNumberDoesNotParseIt) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("... $ ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &result));

  // IsEmpty() avoids comparing the unsigned size() with an int literal and
  // makes a failure report the unexpected spans instead of only their count.
  ASSERT_THAT(result, ::testing::IsEmpty());
}
764 
// "$%" with no number in between produces no annotations, although FindAll
// itself still succeeds.
TEST_F(NumberAnnotatorTest, WhenPrefixAndSuffixWithoutNumberDoesNotParseIt) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("... $% ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &result));

  // IsEmpty() avoids comparing the unsigned size() with an int literal and
  // makes a failure report the unexpected spans instead of only their count.
  ASSERT_THAT(result, ::testing::IsEmpty());
}
773 
// An integer classified via ClassifyText carries score 1 and the number
// priority score from the test options (-10); FindAll locates both integers
// in a sentence.
TEST_F(NumberAnnotatorTest, ForNumberAnnotationsSetsScoreAndPriorityScore) {
  constexpr char kClassifyText[] = "... 12345 ...";
  constexpr char kFindAllText[] = "Come at 9 or 10 ok?";

  // ClassifyText on the "12345" selection.
  ClassificationResult parsed;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kClassifyText), CodepointSpan(4, 9),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  EXPECT_THAT(parsed,
              AllOf(Field(&ClassificationResult::collection, "number"),
                    Field(&ClassificationResult::numeric_value, 12345),
                    Field(&ClassificationResult::numeric_double_value, 12345)));
  EXPECT_EQ(parsed.score, 1);
  EXPECT_EQ(parsed.priority_score, -10);

  // FindAll over a sentence containing "9" and "10".
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText(kFindAllText),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(8, 9), "number",
                                  /*int_value=*/9, /*double_value=*/9),
                  IsAnnotatedSpan(CodepointSpan(13, 15), "number",
                                  /*int_value=*/10, /*double_value=*/10)));
}
798 
// A fractional number classified via ClassifyText carries score 1 and
// priority score 1; FindAll reports fractional numbers with priority score 1
// as well.
TEST_F(NumberAnnotatorTest,
       ForFloatNumberAnnotationsSetsScoreAndPriorityScore) {
  constexpr char kClassifyText[] = "... 12345.12345 ...";
  constexpr char kFindAllText[] = "Results are between 12.5 and 13.5, right?";

  // ClassifyText on the "12345.12345" selection.
  ClassificationResult parsed;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kClassifyText), CodepointSpan(4, 15),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
  EXPECT_THAT(
      parsed,
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, 12345),
            Field(&ClassificationResult::numeric_double_value, 12345.12345)));
  EXPECT_EQ(parsed.score, 1);
  EXPECT_EQ(parsed.priority_score, 1);

  // FindAll over a sentence containing "12.5" and "13.5".
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText(kFindAllText),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(20, 24), "number",
                                  /*int_value=*/12, /*double_value=*/12.5,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(29, 33), "number",
                                  /*int_value=*/13, /*double_value=*/13.5,
                                  /*priority_score=*/1)));
}
824 
// Percentages written with a '%' suffix or with the word "percent" are
// classified with score 1 and priority score 1. FindAll reports, for each
// occurrence, both the plain number and the enclosing percentage.
TEST_F(NumberAnnotatorTest, ForPercentageAnnotationsSetsScoreAndPriorityScore) {
  // Each ClassifyText call gets its own result object: both calls expect the
  // very same field values, so reusing one object would let stale values from
  // the first call satisfy the expectations of the second.
  ClassificationResult symbol_result;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 12345% ..."), {4, 10},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &symbol_result));
  EXPECT_EQ(symbol_result.collection, "percentage");
  EXPECT_EQ(symbol_result.numeric_value, 12345);
  EXPECT_EQ(symbol_result.numeric_double_value, 12345);
  EXPECT_EQ(symbol_result.score, 1);
  EXPECT_EQ(symbol_result.priority_score, 1);

  // Same number, with the suffix spelled out as a separate word.
  ClassificationResult word_result;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 12345 percent ..."), {4, 17},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &word_result));
  EXPECT_EQ(word_result.collection, "percentage");
  EXPECT_EQ(word_result.numeric_value, 12345);
  EXPECT_EQ(word_result.numeric_double_value, 12345);
  EXPECT_EQ(word_result.score, 1);
  EXPECT_EQ(word_result.priority_score, 1);

  // In the RAW usecase every percentage yields two overlapping spans: the
  // bare number and the number together with its percent suffix.
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("Results are between 9% and 10 percent."),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
  EXPECT_THAT(result,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(20, 21), "number",
                                  /*int_value=*/9, /*double_value=*/9),
                  IsAnnotatedSpan(CodepointSpan(20, 22), "percentage",
                                  /*int_value=*/9, /*double_value=*/9,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(27, 29), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(27, 37), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1)));
}
862 
// In the SMART usecase plain numbers are not annotated, while percentages
// (with a '%' suffix or a directly attached "percent") still are.
TEST_F(NumberAnnotatorTest, NumberDisabledPercentageEnabledForSmartUsecase) {
  // Each ClassifyText call gets its own result object so that fields written
  // by an earlier call can never satisfy the expectations of a later one.
  ClassificationResult number_result;
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 12345 ..."), {4, 9},
      AnnotationUsecase_ANNOTATION_USECASE_SMART, &number_result));

  ClassificationResult symbol_result;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 12345% ..."), {4, 10},
      AnnotationUsecase_ANNOTATION_USECASE_SMART, &symbol_result));
  EXPECT_EQ(symbol_result.collection, "percentage");
  EXPECT_EQ(symbol_result.numeric_value, 12345);
  EXPECT_EQ(symbol_result.numeric_double_value, 12345.0);
  EXPECT_EQ(symbol_result.score, 1);
  EXPECT_EQ(symbol_result.priority_score, 1);

  // Here "percent" follows the digits without a separating space.
  ClassificationResult word_result;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 12345percent ..."), {4, 16},
      AnnotationUsecase_ANNOTATION_USECASE_SMART, &word_result));
  EXPECT_EQ(word_result.collection, "percentage");
  EXPECT_EQ(word_result.numeric_value, 12345);
  EXPECT_EQ(word_result.numeric_double_value, 12345);
  EXPECT_EQ(word_result.score, 1);
  EXPECT_EQ(word_result.priority_score, 1);

  // Only the percentage is expected; neither the "3" nor the bare "9" is
  // reported as a number in the SMART usecase.
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("Accuracy for experiment 3 is 9%."),
      AnnotationUsecase_ANNOTATION_USECASE_SMART, &result));
  EXPECT_THAT(result, UnorderedElementsAre(
                          IsAnnotatedSpan(CodepointSpan(29, 31), "percentage",
                                          /*int_value=*/9, /*double_value=*/9.0,
                                          /*priority_score=*/1)));
}
896 
// In an arithmetic expression only the operands are annotated. The operators
// "+", "-" and "*" produce no spans, and "- 96" is reported as positive 96.
TEST_F(NumberAnnotatorTest, MathOperatorsNotAnnotatedAsNumbersFindAll) {
  constexpr char kText[] = "how much is 2 + 2 or 5 - 96 * 89";
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText(kText), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(12, 13), "number",
                                  /*int_value=*/2, /*double_value=*/2),
                  IsAnnotatedSpan(CodepointSpan(16, 17), "number",
                                  /*int_value=*/2, /*double_value=*/2),
                  IsAnnotatedSpan(CodepointSpan(21, 22), "number",
                                  /*int_value=*/5, /*double_value=*/5),
                  IsAnnotatedSpan(CodepointSpan(25, 27), "number",
                                  /*int_value=*/96, /*double_value=*/96),
                  IsAnnotatedSpan(CodepointSpan(30, 32), "number",
                                  /*int_value=*/89, /*double_value=*/89)));
}
916 
// Selecting just the operator character ('+' or '-') between two operands is
// not classified as a number.
TEST_F(NumberAnnotatorTest, MathOperatorsNotAnnotatedAsNumbersClassifyText) {
  constexpr char kPlusText[] = "2 + 2";
  constexpr char kMinusText[] = "2 - 96 * 89";
  ClassificationResult ignored_result;

  // Selection is the '+'.
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kPlusText), CodepointSpan(2, 3),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &ignored_result));

  // Selection is the '-'.
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kMinusText), CodepointSpan(2, 3),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &ignored_result));
}
926 
// A '/' between two numbers, with or without surrounding spaces, separates
// them: each operand is annotated as its own number.
TEST_F(NumberAnnotatorTest, SlashSeparatesTwoNumbersFindAll) {
  constexpr char kText[] = "what's 1 + 2/3 * 4/5 * 6 / 7";
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText(kText), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(7, 8), "number",
                                  /*int_value=*/1, /*double_value=*/1),
                  IsAnnotatedSpan(CodepointSpan(11, 12), "number",
                                  /*int_value=*/2, /*double_value=*/2),
                  IsAnnotatedSpan(CodepointSpan(13, 14), "number",
                                  /*int_value=*/3, /*double_value=*/3),
                  IsAnnotatedSpan(CodepointSpan(17, 18), "number",
                                  /*int_value=*/4, /*double_value=*/4),
                  IsAnnotatedSpan(CodepointSpan(19, 20), "number",
                                  /*int_value=*/5, /*double_value=*/5),
                  IsAnnotatedSpan(CodepointSpan(23, 24), "number",
                                  /*int_value=*/6, /*double_value=*/6),
                  IsAnnotatedSpan(CodepointSpan(27, 28), "number",
                                  /*int_value=*/7, /*double_value=*/7)));
}
950 
// Both operands of "2/3" can be classified individually as numbers.
TEST_F(NumberAnnotatorTest, SlashSeparatesTwoNumbersClassifyText) {
  // Each ClassifyText call gets its own result object: the expected
  // collection and score are identical for both operands, so reusing one
  // object would let stale values from the first call satisfy the second.
  ClassificationResult numerator_result;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("what's 1 + 2/3 * 4"), {11, 12},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &numerator_result));
  EXPECT_EQ(numerator_result.collection, "number");
  EXPECT_EQ(numerator_result.numeric_value, 2);
  EXPECT_EQ(numerator_result.numeric_double_value, 2);
  EXPECT_EQ(numerator_result.score, 1);

  ClassificationResult denominator_result;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("what's 1 + 2/3 * 4"), {13, 14},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &denominator_result));
  EXPECT_EQ(denominator_result.collection, "number");
  EXPECT_EQ(denominator_result.numeric_value, 3);
  EXPECT_EQ(denominator_result.numeric_double_value, 3);
  EXPECT_EQ(denominator_result.score, 1);
}
969 
// Of several slash-containing tokens, only the "2" of the standalone "2/"
// token is expected to be annotated.
TEST_F(NumberAnnotatorTest, SlashDoesNotSeparatesTwoNumbersFindAll) {
  constexpr char kText[] = "what's 2a2/3 or 2/s4 or 2/ or /3 or //3 or 2//";
  std::vector<AnnotatedSpan> annotations;

  // 2 in the "2/" context is a number because / is punctuation.
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText(kText), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // With a single expected element, ElementsAre is equivalent to
  // UnorderedElementsAre.
  EXPECT_THAT(annotations,
              ElementsAre(IsAnnotatedSpan(CodepointSpan(24, 25), "number",
                                          /*int_value=*/2,
                                          /*double_value=*/2)));
}
981 
// Numbers enclosed in round or square brackets, including negative and
// fractional ones, are annotated.
TEST_F(NumberAnnotatorTest, BracketsContextAnnotatedFindAll) {
  constexpr char kText[] = "The interval is: (12, 13) or [-12, -4.5)";
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText(kText), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(18, 20), "number",
                                  /*int_value=*/12, /*double_value=*/12),
                  IsAnnotatedSpan(CodepointSpan(22, 24), "number",
                                  /*int_value=*/13, /*double_value=*/13),
                  IsAnnotatedSpan(CodepointSpan(30, 33), "number",
                                  /*int_value=*/-12, /*double_value=*/-12),
                  IsAnnotatedSpan(CodepointSpan(35, 39), "number",
                                  /*int_value=*/-4, /*double_value=*/-4.5,
                                  /*priority_score=*/1)));
}
1000 
// With "-(" in front of the first number and "*" after the second, no number
// in the bracketed expression is annotated.
TEST_F(NumberAnnotatorTest, BracketsContextNotAnnotatedFindAll) {
  constexpr char kText[] = "The interval is: -(12, 138*)";
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText(kText), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  EXPECT_TRUE(annotations.empty());
}
1009 
// Fractional numbers written with different dot characters are all annotated
// as numbers with priority score 1.
TEST_F(NumberAnnotatorTest, FractionalNumberDotsFindAll) {
  // Dots source: https://unicode-search.net/unicode-namesearch.pl?term=period
  constexpr char kText[] = "3.1 3﹒2 3.3";
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText(kText), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 3), "number",
                                  /*int_value=*/3, /*double_value=*/3.1,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(4, 7), "number",
                                  /*int_value=*/3, /*double_value=*/3.2,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(8, 11), "number",
                                  /*int_value=*/3, /*double_value=*/3.3,
                                  /*priority_score=*/1)));
}
1028 
// An integer, two fractional numbers and a percentage are found in one
// string. The fractional number followed by '%' yields both a number span and
// a percentage span.
TEST_F(NumberAnnotatorTest, NonAsciiDigitsFindAll) {
  // Dots source: https://unicode-search.net/unicode-namesearch.pl?term=period
  // Digits source: https://unicode-search.net/unicode-namesearch.pl?term=digit
  constexpr char kText[] = "3 3﹒2 3.3%";
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText(kText), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 1), "number",
                                  /*int_value=*/3, /*double_value=*/3),
                  IsAnnotatedSpan(CodepointSpan(2, 5), "number",
                                  /*int_value=*/3, /*double_value=*/3.2,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(6, 9), "number",
                                  /*int_value=*/3, /*double_value=*/3.3,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(6, 10), "percentage",
                                  /*int_value=*/3, /*double_value=*/3.3,
                                  /*priority_score=*/1)));
}
1050 
// Numbers with leading zeros are annotated; the span covers the leading zero
// while the expected value drops it ("09" -> 9, "032310" -> 32310).
TEST_F(NumberAnnotatorTest, AnnotatedZeroPrecededNumbersFindAll) {
  constexpr char kText[] = "Numbers: 0.9 or 09 or 09.9 or 032310";
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText(kText), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(9, 12), "number",
                                  /*int_value=*/0, /*double_value=*/0.9,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(16, 18), "number",
                                  /*int_value=*/9, /*double_value=*/9),
                  IsAnnotatedSpan(CodepointSpan(22, 26), "number",
                                  /*int_value=*/9, /*double_value=*/9.9,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(30, 36), "number",
                                  /*int_value=*/32310,
                                  /*double_value=*/32310)));
}
1070 
// Numbers whose fractional part consists only of zeros are annotated with
// equal integer and double values. The expectations pass no explicit priority
// score, so IsAnnotatedSpan's default applies.
TEST_F(NumberAnnotatorTest, ZeroAfterDotFindAll) {
  constexpr char kText[] = "15.0 16.00";
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText(kText), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 4), "number",
                                  /*int_value=*/15, /*double_value=*/15),
                  IsAnnotatedSpan(CodepointSpan(5, 10), "number",
                                  /*int_value=*/16, /*double_value=*/16)));
}
1084 
// Fractional numbers made only of nines keep their exact double value and a
// truncated integer value (for example 99.9999 -> 99).
TEST_F(NumberAnnotatorTest, NineDotNineFindAll) {
  constexpr char kText[] = "9.9 9.99 99.99 99.999 99.9999";
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText(kText), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 3), "number",
                                  /*int_value=*/9, /*double_value=*/9.9,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(4, 8), "number",
                                  /*int_value=*/9, /*double_value=*/9.99,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(9, 14), "number",
                                  /*int_value=*/99, /*double_value=*/99.99,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(15, 21), "number",
                                  /*int_value=*/99, /*double_value=*/99.999,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(22, 29), "number",
                                  /*int_value=*/99, /*double_value=*/99.9999,
                                  /*priority_score=*/1)));
}
1109 
1110 }  // namespace test_internal
1111 }  // namespace libtextclassifier3
1112