1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "annotator/number/number.h"
18
19 #include <string>
20 #include <vector>
21
22 #include "annotator/collections.h"
23 #include "annotator/model_generated.h"
24 #include "annotator/types-test-util.h"
25 #include "annotator/types.h"
26 #include "utils/test-utils.h"
27 #include "utils/utf8/unicodetext.h"
28 #include "utils/utf8/unilib.h"
29 #include "gmock/gmock.h"
30 #include "gtest/gtest.h"
31
32 namespace libtextclassifier3 {
33 namespace {
34
35 using testing::AllOf;
36 using testing::ElementsAre;
37 using testing::Field;
38
TestingNumberAnnotatorOptions()39 const NumberAnnotatorOptions* TestingNumberAnnotatorOptions() {
40 static const flatbuffers::DetachedBuffer* options_data = []() {
41 NumberAnnotatorOptionsT options;
42 options.enabled = true;
43 options.allowed_prefix_codepoints.push_back('$');
44 options.allowed_suffix_codepoints.push_back('%');
45
46 flatbuffers::FlatBufferBuilder builder;
47 builder.Finish(NumberAnnotatorOptions::Pack(builder, &options));
48 return new flatbuffers::DetachedBuffer(builder.Release());
49 }();
50
51 return flatbuffers::GetRoot<NumberAnnotatorOptions>(options_data->data());
52 }
53
BuildFeatureProcessor(const UniLib * unilib)54 FeatureProcessor BuildFeatureProcessor(const UniLib* unilib) {
55 static const flatbuffers::DetachedBuffer* options_data = []() {
56 FeatureProcessorOptionsT options;
57 options.context_size = 1;
58 options.max_selection_span = 1;
59 options.snap_label_span_boundaries_to_containing_tokens = false;
60 options.ignored_span_boundary_codepoints.push_back(',');
61
62 options.tokenization_codepoint_config.emplace_back(
63 new TokenizationCodepointRangeT());
64 auto& config = options.tokenization_codepoint_config.back();
65 config->start = 32;
66 config->end = 33;
67 config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
68
69 flatbuffers::FlatBufferBuilder builder;
70 builder.Finish(FeatureProcessorOptions::Pack(builder, &options));
71 return new flatbuffers::DetachedBuffer(builder.Release());
72 }();
73
74 const FeatureProcessorOptions* feature_processor_options =
75 flatbuffers::GetRoot<FeatureProcessorOptions>(options_data->data());
76
77 return FeatureProcessor(feature_processor_options, unilib);
78 }
79
80 class NumberAnnotatorTest : public ::testing::Test {
81 protected:
NumberAnnotatorTest()82 NumberAnnotatorTest()
83 : INIT_UNILIB_FOR_TESTING(unilib_),
84 feature_processor_(BuildFeatureProcessor(&unilib_)),
85 number_annotator_(TestingNumberAnnotatorOptions(),
86 &feature_processor_) {}
87
88 UniLib unilib_;
89 FeatureProcessor feature_processor_;
90 NumberAnnotator number_annotator_;
91 };
92
// A selection covering exactly the digits "12345" is classified as a "number"
// with the matching numeric value.
TEST_F(NumberAnnotatorTest, ClassifiesAndParsesNumberCorrectly) {
  const UnicodeText text = UTF8ToUnicodeText("... 12345 ...");
  ClassificationResult parsed;

  // Codepoints 4 through 8 of the text are "12345".
  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, {4, 9}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  EXPECT_EQ(parsed.collection, "number");
  EXPECT_EQ(parsed.numeric_value, 12345);
}
102
// A selection that mixes digits with a letter ("123a45") is not classified.
TEST_F(NumberAnnotatorTest, ClassifiesNonNumberCorrectly) {
  const UnicodeText text = UTF8ToUnicodeText("... 123a45 ...");
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, {4, 10}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &unused_result));
}
109
// FindAll reports exactly four numbers for the sentence below, in text order:
// 12345, 9, 99 (written "$99") and 27 (written "27%"). The test therefore also
// checks that "68#" and "#68" produce no annotation.
TEST_F(NumberAnnotatorTest, FindsAllNumbersInText) {
  const UnicodeText text = UTF8ToUnicodeText(
      "... 12345 ... 9 is my number and I paid $99 and "
      "sometimes 27% but not 68# nor #68");
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  // Expected numeric values, in the order the numbers appear in the text.
  const int kExpectedValues[] = {12345, 9, 99, 27};

  ASSERT_EQ(annotations.size(), 4);
  for (int i = 0; i < 4; ++i) {
    // Every annotated span carries a single "number" classification.
    ASSERT_EQ(annotations[i].classification.size(), 1);
    EXPECT_EQ(annotations[i].classification[0].collection, "number");
    EXPECT_EQ(annotations[i].classification[0].numeric_value,
              kExpectedValues[i]);
  }
}
131
// A digit directly followed by a comma is still found, and the reported span
// (8, 9) covers only the digit, not the comma.
TEST_F(NumberAnnotatorTest, FindsNumberWithPunctuation) {
  const UnicodeText text = UTF8ToUnicodeText("Come at 9, ok?");
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  // Matches a classification result describing the number 9.
  const auto is_number_nine =
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, 9));
  // Matches a span over codepoint 8 with exactly that one classification.
  const auto is_expected_span =
      AllOf(Field(&AnnotatedSpan::span, CodepointSpan(8, 9)),
            Field(&AnnotatedSpan::classification,
                  ElementsAre(is_number_nine)));

  EXPECT_THAT(annotations, ElementsAre(is_expected_span));
}
147
// A negative number that makes up the entire text is found; the reported span
// (0, 2) includes the leading minus sign and the value is -5.
TEST_F(NumberAnnotatorTest, HandlesNumbersAtBeginning) {
  const UnicodeText text = UTF8ToUnicodeText("-5");
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  // Matches a classification result describing the number -5.
  const auto is_minus_five =
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, -5));
  // Matches a span over the whole text with exactly that one classification.
  const auto is_expected_span =
      AllOf(Field(&AnnotatedSpan::span, CodepointSpan(0, 2)),
            Field(&AnnotatedSpan::classification,
                  ElementsAre(is_minus_five)));

  EXPECT_THAT(annotations, ElementsAre(is_expected_span));
}
163
// A minus sign followed by eighteen nines is classified as a number and parsed
// to the exact value.
TEST_F(NumberAnnotatorTest, WhenLowestSupportedNumberParsesIt) {
  const UnicodeText text = UTF8ToUnicodeText("-999999999999999999");
  ClassificationResult parsed;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, {0, 19}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  const auto is_expected_number =
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, -999999999999999999L));
  EXPECT_THAT(parsed, is_expected_number);
}
175
// Eighteen nines are classified as a number and parsed to the exact value.
TEST_F(NumberAnnotatorTest, WhenLargestSupportedNumberParsesIt) {
  const UnicodeText text = UTF8ToUnicodeText("999999999999999999");
  ClassificationResult parsed;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, {0, 18}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  const auto is_expected_number =
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, 999999999999999999L));
  EXPECT_THAT(parsed, is_expected_number);
}
187
// A negative value written with 20 digits (a one followed by 19 zeros) is
// rejected.
TEST_F(NumberAnnotatorTest, WhenFirstLowestNonSupportedNumberDoesNotParseIt) {
  const UnicodeText text = UTF8ToUnicodeText("-10000000000000000000");
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, {0, 21}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &unused_result));
}
194
// A positive value written with 20 digits (a one followed by 19 zeros) is
// rejected.
TEST_F(NumberAnnotatorTest, WhenFirstLargestNonSupportedNumberDoesNotParseIt) {
  const UnicodeText text = UTF8ToUnicodeText("10000000000000000000");
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, {0, 20}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &unused_result));
}
201
// A 40-digit string is rejected.
TEST_F(NumberAnnotatorTest, WhenLargeNumberDoesNotParseIt) {
  const UnicodeText text =
      UTF8ToUnicodeText("1234567890123456789012345678901234567890");
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, {0, 40}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &unused_result));
}
208
// Two leading minus signs ("--10") are rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleMinusSignsDoesNotParseIt) {
  const UnicodeText text = UTF8ToUnicodeText("--10");
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, {0, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &unused_result));
}
215
// A trailing minus sign ("10-") is rejected.
TEST_F(NumberAnnotatorTest, WhenMinusSignSuffixDoesNotParseIt) {
  const UnicodeText text = UTF8ToUnicodeText("10-");
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, {0, 3}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &unused_result));
}
222
// A minus sign between two digit groups ("2016-2017") is rejected when the
// whole string is selected.
TEST_F(NumberAnnotatorTest, WhenMinusInTheMiddleDoesNotParseIt) {
  const UnicodeText text = UTF8ToUnicodeText("2016-2017");
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, {0, 9}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &unused_result));
}
229
// A lone '%' with no digits produces no annotations, while FindAll itself
// still returns true.
TEST_F(NumberAnnotatorTest, WhenSuffixWithoutNumberDoesNotParseIt) {
  const UnicodeText text = UTF8ToUnicodeText("... % ...");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  ASSERT_EQ(annotations.size(), 0);
}
238
// A lone '$' with no digits produces no annotations, while FindAll itself
// still returns true.
TEST_F(NumberAnnotatorTest, WhenPrefixWithoutNumberDoesNotParseIt) {
  const UnicodeText text = UTF8ToUnicodeText("... $ ...");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  ASSERT_EQ(annotations.size(), 0);
}
247
// "$%" with no digits in between produces no annotations, while FindAll
// itself still returns true.
TEST_F(NumberAnnotatorTest, WhenPrefixAndSuffixWithoutNumberDoesNotParseIt) {
  const UnicodeText text = UTF8ToUnicodeText("... $% ...");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  ASSERT_EQ(annotations.size(), 0);
}
256
257 } // namespace
258 } // namespace libtextclassifier3
259