1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "annotator/number/number.h"
18 
19 #include <string>
20 #include <vector>
21 
22 #include "annotator/collections.h"
23 #include "annotator/model_generated.h"
24 #include "annotator/types-test-util.h"
25 #include "annotator/types.h"
26 #include "utils/test-utils.h"
27 #include "utils/utf8/unicodetext.h"
28 #include "utils/utf8/unilib.h"
29 #include "gmock/gmock.h"
30 #include "gtest/gtest.h"
31 
32 namespace libtextclassifier3 {
33 namespace {
34 
35 using testing::AllOf;
36 using testing::ElementsAre;
37 using testing::Field;
38 
TestingNumberAnnotatorOptions()39 const NumberAnnotatorOptions* TestingNumberAnnotatorOptions() {
40   static const flatbuffers::DetachedBuffer* options_data = []() {
41     NumberAnnotatorOptionsT options;
42     options.enabled = true;
43     options.allowed_prefix_codepoints.push_back('$');
44     options.allowed_suffix_codepoints.push_back('%');
45 
46     flatbuffers::FlatBufferBuilder builder;
47     builder.Finish(NumberAnnotatorOptions::Pack(builder, &options));
48     return new flatbuffers::DetachedBuffer(builder.Release());
49   }();
50 
51   return flatbuffers::GetRoot<NumberAnnotatorOptions>(options_data->data());
52 }
53 
BuildFeatureProcessor(const UniLib * unilib)54 FeatureProcessor BuildFeatureProcessor(const UniLib* unilib) {
55   static const flatbuffers::DetachedBuffer* options_data = []() {
56     FeatureProcessorOptionsT options;
57     options.context_size = 1;
58     options.max_selection_span = 1;
59     options.snap_label_span_boundaries_to_containing_tokens = false;
60     options.ignored_span_boundary_codepoints.push_back(',');
61 
62     options.tokenization_codepoint_config.emplace_back(
63         new TokenizationCodepointRangeT());
64     auto& config = options.tokenization_codepoint_config.back();
65     config->start = 32;
66     config->end = 33;
67     config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
68 
69     flatbuffers::FlatBufferBuilder builder;
70     builder.Finish(FeatureProcessorOptions::Pack(builder, &options));
71     return new flatbuffers::DetachedBuffer(builder.Release());
72   }();
73 
74   const FeatureProcessorOptions* feature_processor_options =
75       flatbuffers::GetRoot<FeatureProcessorOptions>(options_data->data());
76 
77   return FeatureProcessor(feature_processor_options, unilib);
78 }
79 
80 class NumberAnnotatorTest : public ::testing::Test {
81  protected:
NumberAnnotatorTest()82   NumberAnnotatorTest()
83       : INIT_UNILIB_FOR_TESTING(unilib_),
84         feature_processor_(BuildFeatureProcessor(&unilib_)),
85         number_annotator_(TestingNumberAnnotatorOptions(),
86                           &feature_processor_) {}
87 
88   UniLib unilib_;
89   FeatureProcessor feature_processor_;
90   NumberAnnotator number_annotator_;
91 };
92 
// A selection that covers only digits is classified as "number" and its
// numeric value is parsed.
TEST_F(NumberAnnotatorTest, ClassifiesAndParsesNumberCorrectly) {
  ClassificationResult classification_result;
  // ASSERT rather than EXPECT: the checks below read the out-parameter, which
  // is only meaningful when classification succeeded.
  ASSERT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 12345 ..."), {4, 9},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));

  EXPECT_EQ(classification_result.collection, "number");
  EXPECT_EQ(classification_result.numeric_value, 12345);
}
102 
// A selection containing a letter between digits is rejected.
TEST_F(NumberAnnotatorTest, ClassifiesNonNumberCorrectly) {
  ClassificationResult unused_result;
  const bool classified = number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 123a45 ..."), {4, 10},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused_result);

  EXPECT_FALSE(classified);
}
109 
// FindAll() over a sentence with several candidates: the bare numbers, the
// '$'-prefixed number and the '%'-suffixed number are annotated, while "68#"
// and "#68" are not.
TEST_F(NumberAnnotatorTest, FindsAllNumbersInText) {
  std::vector<AnnotatedSpan> annotations;
  const bool found = number_annotator_.FindAll(
      UTF8ToUnicodeText("... 12345 ... 9 is my number and I paid $99 and "
                        "sometimes 27% but not 68# nor #68"),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations);
  EXPECT_TRUE(found);

  // Expected numeric values, in the order the annotations are reported.
  const int expected_values[] = {12345, 9, 99, 27};
  ASSERT_EQ(annotations.size(), 4);
  for (int i = 0; i < 4; ++i) {
    const AnnotatedSpan& annotation = annotations[i];
    ASSERT_EQ(annotation.classification.size(), 1);
    EXPECT_EQ(annotation.classification[0].collection, "number");
    EXPECT_EQ(annotation.classification[0].numeric_value, expected_values[i]);
  }
}
131 
// A number directly followed by a comma is still found, and the reported span
// covers only the digit.
TEST_F(NumberAnnotatorTest, FindsNumberWithPunctuation) {
  std::vector<AnnotatedSpan> annotations;
  const bool found = number_annotator_.FindAll(
      UTF8ToUnicodeText("Come at 9, ok?"),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations);
  EXPECT_TRUE(found);

  const auto is_number_nine =
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, 9));
  const auto is_nine_at_8_to_9 = AllOf(
      Field(&AnnotatedSpan::span, CodepointSpan(8, 9)),
      Field(&AnnotatedSpan::classification, ElementsAre(is_number_nine)));
  EXPECT_THAT(annotations, ElementsAre(is_nine_at_8_to_9));
}
147 
// A negative number that makes up the whole text is found; the span includes
// the minus sign.
TEST_F(NumberAnnotatorTest, HandlesNumbersAtBeginning) {
  std::vector<AnnotatedSpan> annotations;
  const bool found = number_annotator_.FindAll(
      UTF8ToUnicodeText("-5"), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations);
  EXPECT_TRUE(found);

  const auto is_number_minus_five =
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, -5));
  const auto is_minus_five_at_0_to_2 = AllOf(
      Field(&AnnotatedSpan::span, CodepointSpan(0, 2)),
      Field(&AnnotatedSpan::classification,
            ElementsAre(is_number_minus_five)));
  EXPECT_THAT(annotations, ElementsAre(is_minus_five_at_0_to_2));
}
163 
// The negative number with eighteen 9s is classified and parsed exactly.
TEST_F(NumberAnnotatorTest, WhenLowestSupportedNumberParsesIt) {
  ClassificationResult classification_result;
  // ASSERT rather than EXPECT: the matcher below inspects the out-parameter,
  // which is only meaningful when classification succeeded.
  ASSERT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("-999999999999999999"), {0, 19},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));

  EXPECT_THAT(
      classification_result,
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, -999999999999999999L)));
}
175 
// The positive number with eighteen 9s is classified and parsed exactly.
TEST_F(NumberAnnotatorTest, WhenLargestSupportedNumberParsesIt) {
  ClassificationResult classification_result;
  // ASSERT rather than EXPECT: the matcher below inspects the out-parameter,
  // which is only meaningful when classification succeeded.
  ASSERT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("999999999999999999"), {0, 18},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));

  EXPECT_THAT(
      classification_result,
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, 999999999999999999L)));
}
187 
// A negative number with a 20-digit magnitude (1 followed by nineteen zeros)
// is rejected.
TEST_F(NumberAnnotatorTest, WhenFirstLowestNonSupportedNumberDoesNotParseIt) {
  ClassificationResult unused_result;
  const bool classified = number_annotator_.ClassifyText(
      UTF8ToUnicodeText("-10000000000000000000"), {0, 21},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused_result);

  EXPECT_FALSE(classified);
}
194 
// A positive 20-digit number (1 followed by nineteen zeros) is rejected.
TEST_F(NumberAnnotatorTest, WhenFirstLargestNonSupportedNumberDoesNotParseIt) {
  ClassificationResult unused_result;
  const bool classified = number_annotator_.ClassifyText(
      UTF8ToUnicodeText("10000000000000000000"), {0, 20},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused_result);

  EXPECT_FALSE(classified);
}
201 
// A 40-digit number is rejected.
TEST_F(NumberAnnotatorTest, WhenLargeNumberDoesNotParseIt) {
  ClassificationResult unused_result;
  const bool classified = number_annotator_.ClassifyText(
      UTF8ToUnicodeText("1234567890123456789012345678901234567890"), {0, 40},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused_result);

  EXPECT_FALSE(classified);
}
208 
// Two leading minus signs do not form a valid number.
TEST_F(NumberAnnotatorTest, WhenMultipleMinusSignsDoesNotParseIt) {
  ClassificationResult unused_result;
  const bool classified = number_annotator_.ClassifyText(
      UTF8ToUnicodeText("--10"), {0, 4},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused_result);

  EXPECT_FALSE(classified);
}
215 
// A trailing minus sign makes the selection a non-number.
TEST_F(NumberAnnotatorTest, WhenMinusSignSuffixDoesNotParseIt) {
  ClassificationResult unused_result;
  const bool classified = number_annotator_.ClassifyText(
      UTF8ToUnicodeText("10-"), {0, 3},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused_result);

  EXPECT_FALSE(classified);
}
222 
// A minus sign between two digit groups (e.g. a year range) is not a number.
TEST_F(NumberAnnotatorTest, WhenMinusInTheMiddleDoesNotParseIt) {
  ClassificationResult unused_result;
  const bool classified = number_annotator_.ClassifyText(
      UTF8ToUnicodeText("2016-2017"), {0, 9},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused_result);

  EXPECT_FALSE(classified);
}
229 
// An allowed suffix codepoint standing alone yields no annotations.
TEST_F(NumberAnnotatorTest, WhenSuffixWithoutNumberDoesNotParseIt) {
  std::vector<AnnotatedSpan> annotations;
  const bool found = number_annotator_.FindAll(
      UTF8ToUnicodeText("... % ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations);
  EXPECT_TRUE(found);

  ASSERT_TRUE(annotations.empty());
}
238 
// An allowed prefix codepoint standing alone yields no annotations.
TEST_F(NumberAnnotatorTest, WhenPrefixWithoutNumberDoesNotParseIt) {
  std::vector<AnnotatedSpan> annotations;
  const bool found = number_annotator_.FindAll(
      UTF8ToUnicodeText("... $ ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations);
  EXPECT_TRUE(found);

  ASSERT_TRUE(annotations.empty());
}
247 
// A prefix codepoint immediately followed by a suffix codepoint, with no
// digits in between, yields no annotations.
TEST_F(NumberAnnotatorTest, WhenPrefixAndSuffixWithoutNumberDoesNotParseIt) {
  std::vector<AnnotatedSpan> annotations;
  const bool found = number_annotator_.FindAll(
      UTF8ToUnicodeText("... $% ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations);
  EXPECT_TRUE(found);

  ASSERT_TRUE(annotations.empty());
}
256 
257 }  // namespace
258 }  // namespace libtextclassifier3
259