1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
#include "annotator/duration/duration.h"

#include <memory>
#include <string>
#include <vector>

#include "annotator/collections.h"
#include "annotator/model_generated.h"
#include "annotator/types-test-util.h"
#include "annotator/types.h"
#include "utils/tokenizer-utils.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
31 
32 namespace libtextclassifier3 {
33 namespace {
34 
35 using testing::AllOf;
36 using testing::ElementsAre;
37 using testing::Field;
38 using testing::IsEmpty;
39 
TestingDurationAnnotatorOptions()40 const DurationAnnotatorOptions* TestingDurationAnnotatorOptions() {
41   static const flatbuffers::DetachedBuffer* options_data = []() {
42     DurationAnnotatorOptionsT options;
43     options.enabled = true;
44 
45     options.week_expressions.push_back("week");
46     options.week_expressions.push_back("weeks");
47 
48     options.day_expressions.push_back("day");
49     options.day_expressions.push_back("days");
50 
51     options.hour_expressions.push_back("hour");
52     options.hour_expressions.push_back("hours");
53 
54     options.minute_expressions.push_back("minute");
55     options.minute_expressions.push_back("minutes");
56 
57     options.second_expressions.push_back("second");
58     options.second_expressions.push_back("seconds");
59 
60     options.filler_expressions.push_back("and");
61     options.filler_expressions.push_back("a");
62     options.filler_expressions.push_back("an");
63     options.filler_expressions.push_back("one");
64 
65     options.half_expressions.push_back("half");
66 
67     options.sub_token_separator_codepoints.push_back('-');
68 
69     flatbuffers::FlatBufferBuilder builder;
70     builder.Finish(DurationAnnotatorOptions::Pack(builder, &options));
71     return new flatbuffers::DetachedBuffer(builder.Release());
72   }();
73 
74   return flatbuffers::GetRoot<DurationAnnotatorOptions>(options_data->data());
75 }
76 
BuildFeatureProcessor(const UniLib * unilib)77 std::unique_ptr<FeatureProcessor> BuildFeatureProcessor(const UniLib* unilib) {
78   static const flatbuffers::DetachedBuffer* options_data = []() {
79     FeatureProcessorOptionsT options;
80     options.context_size = 1;
81     options.max_selection_span = 1;
82     options.snap_label_span_boundaries_to_containing_tokens = false;
83     options.ignored_span_boundary_codepoints.push_back(',');
84 
85     options.tokenization_codepoint_config.emplace_back(
86         new TokenizationCodepointRangeT());
87     auto& config = options.tokenization_codepoint_config.back();
88     config->start = 32;
89     config->end = 33;
90     config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
91 
92     flatbuffers::FlatBufferBuilder builder;
93     builder.Finish(FeatureProcessorOptions::Pack(builder, &options));
94     return new flatbuffers::DetachedBuffer(builder.Release());
95   }();
96 
97   const FeatureProcessorOptions* feature_processor_options =
98       flatbuffers::GetRoot<FeatureProcessorOptions>(options_data->data());
99 
100   return std::unique_ptr<FeatureProcessor>(
101       new FeatureProcessor(feature_processor_options, unilib));
102 }
103 
104 class DurationAnnotatorTest : public ::testing::Test {
105  protected:
DurationAnnotatorTest()106   DurationAnnotatorTest()
107       : INIT_UNILIB_FOR_TESTING(unilib_),
108         feature_processor_(BuildFeatureProcessor(&unilib_)),
109         duration_annotator_(TestingDurationAnnotatorOptions(),
110                             feature_processor_.get(), &unilib_) {}
111 
Tokenize(const UnicodeText & text)112   std::vector<Token> Tokenize(const UnicodeText& text) {
113     return feature_processor_->Tokenize(text);
114   }
115 
116   UniLib unilib_;
117   std::unique_ptr<FeatureProcessor> feature_processor_;
118   DurationAnnotator duration_annotator_;
119 };
120 
// ClassifyText must accept a selection that exactly covers a
// "<number> <unit>" duration.
TEST_F(DurationAnnotatorTest, ClassifiesSimpleDuration) {
  const UnicodeText message =
      UTF8ToUnicodeText("Wake me up in 15 minutes ok?");
  // "15 minutes" starts at codepoint 14 and is 10 codepoints long.
  const CodepointSpan selection(14, 24);
  ClassificationResult outcome;
  EXPECT_TRUE(duration_annotator_.ClassifyText(
      message, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW, &outcome));

  const auto is_fifteen_minutes =
      AllOf(Field(&ClassificationResult::collection, "duration"),
            Field(&ClassificationResult::duration_ms, 15 * 60 * 1000));
  EXPECT_THAT(outcome, is_fifteen_minutes);
}
131 
// ClassifyText must still succeed when the selection boundaries fall inside
// whitespace-delimited tokens ("in15" and "minutesok?").
TEST_F(DurationAnnotatorTest, ClassifiesWhenTokensDontAlignWithSelection) {
  const UnicodeText message = UTF8ToUnicodeText("Wake me up in15 minutesok?");
  // "15 minutes" starts at codepoint 13 and is 10 codepoints long.
  const CodepointSpan selection(13, 23);
  ClassificationResult outcome;
  EXPECT_TRUE(duration_annotator_.ClassifyText(
      message, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW, &outcome));

  const auto is_fifteen_minutes =
      AllOf(Field(&ClassificationResult::collection, "duration"),
            Field(&ClassificationResult::duration_ms, 15 * 60 * 1000));
  EXPECT_THAT(outcome, is_fifteen_minutes);
}
142 
// ClassifyText must report failure for a selection that holds no duration.
TEST_F(DurationAnnotatorTest, DoNotClassifyWhenInputIsInvalid) {
  const UnicodeText message = UTF8ToUnicodeText("Weird space");
  // Codepoint 5 is the single space between the two words.
  const CodepointSpan selection(5, 6);
  ClassificationResult outcome;
  EXPECT_FALSE(duration_annotator_.ClassifyText(
      message, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW, &outcome));
}
149 
// FindAll must locate a plain "<number> <unit>" duration inside a sentence.
TEST_F(DurationAnnotatorTest, FindsSimpleDuration) {
  const UnicodeText input = UTF8ToUnicodeText("Wake me up in 15 minutes ok?");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // "15 minutes" starts at codepoint 14 and is 10 codepoints long.
  const auto is_fifteen_minutes =
      AllOf(Field(&ClassificationResult::collection, "duration"),
            Field(&ClassificationResult::duration_ms, 15 * 60 * 1000));
  EXPECT_THAT(
      annotations,
      ElementsAre(AllOf(Field(&AnnotatedSpan::span, CodepointSpan(14, 24)),
                        Field(&AnnotatedSpan::classification,
                              ElementsAre(is_fifteen_minutes)))));
}
167 
// "<number> and half <unit>" must be read as the number plus one half.
TEST_F(DurationAnnotatorTest, FindsDurationWithHalfExpression) {
  const UnicodeText input =
      UTF8ToUnicodeText("Set a timer for 3 and half minutes ok?");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // "3 and half minutes" starts at codepoint 16 and is 18 codepoints long.
  const auto is_three_and_half_minutes =
      AllOf(Field(&ClassificationResult::collection, "duration"),
            Field(&ClassificationResult::duration_ms, 3.5 * 60 * 1000));
  EXPECT_THAT(
      annotations,
      ElementsAre(AllOf(Field(&AnnotatedSpan::span, CodepointSpan(16, 34)),
                        Field(&AnnotatedSpan::classification,
                              ElementsAre(is_three_and_half_minutes)))));
}
186 
// Two quantity/unit pairs joined by a filler word must yield one annotation
// whose value is the sum of both parts.
TEST_F(DurationAnnotatorTest, FindsComposedDuration) {
  const UnicodeText input =
      UTF8ToUnicodeText("Wake me up in 3 hours and 5 seconds ok?");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // "3 hours and 5 seconds" starts at codepoint 14 and is 21 codepoints long.
  const auto is_three_hours_five_seconds = AllOf(
      Field(&ClassificationResult::collection, "duration"),
      Field(&ClassificationResult::duration_ms, 3 * 60 * 60 * 1000 + 5 * 1000));
  EXPECT_THAT(
      annotations,
      ElementsAre(AllOf(Field(&AnnotatedSpan::span, CodepointSpan(14, 35)),
                        Field(&AnnotatedSpan::classification,
                              ElementsAre(is_three_hours_five_seconds)))));
}
205 
// Exercises every configured unit (week, day, hour, minute, second) in one
// sentence; each unit appears once without an explicit number.
TEST_F(DurationAnnotatorTest, AllUnitsAreCovered) {
  const UnicodeText input = UTF8ToUnicodeText(
      "See you in a week and a day and an hour and a minute and a second");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // The expected span begins at "week" (codepoint 13), i.e. after the leading
  // "a", and runs to the end of the 65-codepoint text.
  const auto is_one_of_each_unit =
      AllOf(Field(&ClassificationResult::collection, "duration"),
            Field(&ClassificationResult::duration_ms,
                  7 * 24 * 60 * 60 * 1000 + 24 * 60 * 60 * 1000 +
                      60 * 60 * 1000 + 60 * 1000 + 1000));
  EXPECT_THAT(
      annotations,
      ElementsAre(AllOf(Field(&AnnotatedSpan::span, CodepointSpan(13, 65)),
                        Field(&AnnotatedSpan::classification,
                              ElementsAre(is_one_of_each_unit)))));
}
225 
// "half an <unit>" with no number must be read as half of one unit.
TEST_F(DurationAnnotatorTest, FindsHalfAnHour) {
  const UnicodeText input = UTF8ToUnicodeText("Set a timer for half an hour");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // "half an hour" starts at codepoint 16 and is 12 codepoints long.
  const auto is_half_an_hour =
      AllOf(Field(&ClassificationResult::collection, "duration"),
            Field(&ClassificationResult::duration_ms, 0.5 * 60 * 60 * 1000));
  EXPECT_THAT(
      annotations,
      ElementsAre(AllOf(Field(&AnnotatedSpan::span, CodepointSpan(16, 28)),
                        Field(&AnnotatedSpan::classification,
                              ElementsAre(is_half_an_hour)))));
}
243 
// The "half" marker may follow the unit ("1 hour and a half") and must still
// add half of that unit.
TEST_F(DurationAnnotatorTest, FindsWhenHalfIsAfterGranularitySpecification) {
  const UnicodeText input =
      UTF8ToUnicodeText("Set a timer for 1 hour and a half");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // "1 hour and a half" starts at codepoint 16 and is 17 codepoints long.
  const auto is_ninety_minutes =
      AllOf(Field(&ClassificationResult::collection, "duration"),
            Field(&ClassificationResult::duration_ms, 1.5 * 60 * 60 * 1000));
  EXPECT_THAT(
      annotations,
      ElementsAre(AllOf(Field(&AnnotatedSpan::span, CodepointSpan(16, 33)),
                        Field(&AnnotatedSpan::classification,
                              ElementsAre(is_ninety_minutes)))));
}
262 
// "an hour and a half" (no number) must be read as one and a half hours.
TEST_F(DurationAnnotatorTest, FindsAnHourAndAHalf) {
  const UnicodeText input =
      UTF8ToUnicodeText("Set a timer for an hour and a half");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // The expected span begins at "hour" (codepoint 19), i.e. after the leading
  // "an", and covers the 15 codepoints of "hour and a half".
  const auto is_ninety_minutes =
      AllOf(Field(&ClassificationResult::collection, "duration"),
            Field(&ClassificationResult::duration_ms, 1.5 * 60 * 60 * 1000));
  EXPECT_THAT(
      annotations,
      ElementsAre(AllOf(Field(&AnnotatedSpan::span, CodepointSpan(19, 34)),
                        Field(&AnnotatedSpan::classification,
                              ElementsAre(is_ninety_minutes)))));
}
281 
// A trailing unit without a number ("and a second") must count as one of
// that unit and be added to the preceding quantity.
TEST_F(DurationAnnotatorTest,
       FindsCorrectlyWhenSecondsComeSecondAndDontHaveNumber) {
  const UnicodeText input =
      UTF8ToUnicodeText("Set a timer for 10 minutes and a second ok?");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // "10 minutes and a second" starts at codepoint 16 and is 23 codepoints
  // long.
  const auto is_ten_minutes_one_second = AllOf(
      Field(&ClassificationResult::collection, "duration"),
      Field(&ClassificationResult::duration_ms, 10 * 60 * 1000 + 1 * 1000));
  EXPECT_THAT(
      annotations,
      ElementsAre(AllOf(Field(&AnnotatedSpan::span, CodepointSpan(16, 39)),
                        Field(&AnnotatedSpan::classification,
                              ElementsAre(is_ten_minutes_one_second)))));
}
301 
// Filler words before the first quantity and after the last unit must not be
// absorbed into the annotated span.
TEST_F(DurationAnnotatorTest, DoesNotGreedilyTakeFillerWords) {
  const UnicodeText input = UTF8ToUnicodeText(
      "Set a timer for a a a 10 minutes and 2 seconds an and an ok?");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // "10 minutes and 2 seconds" starts at codepoint 22 and is 24 codepoints
  // long.
  const auto is_ten_minutes_two_seconds = AllOf(
      Field(&ClassificationResult::collection, "duration"),
      Field(&ClassificationResult::duration_ms, 10 * 60 * 1000 + 2 * 1000));
  EXPECT_THAT(
      annotations,
      ElementsAre(AllOf(Field(&AnnotatedSpan::span, CodepointSpan(22, 46)),
                        Field(&AnnotatedSpan::classification,
                              ElementsAre(is_ten_minutes_two_seconds)))));
}
320 
// A lone "half" with no unit must not crash the annotator and must not
// produce any annotation.
TEST_F(DurationAnnotatorTest, DoesNotCrashWhenJustHalfIsSaid) {
  const UnicodeText text = UTF8ToUnicodeText("Set a timer for half ok?");
  std::vector<Token> tokens = Tokenize(text);
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(duration_annotator_.FindAll(
      text, tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));

  // Use the container matcher rather than comparing size() (unsigned) with
  // the signed literal 0, which can trigger -Wsign-compare; this also matches
  // how the other "no result" test in this file is written.
  EXPECT_THAT(result, IsEmpty());
}
330 
// Commas attached to either side of a token must not prevent the token from
// being recognized as a quantity, unit or filler word.
TEST_F(DurationAnnotatorTest, StripsPunctuationFromTokens) {
  const UnicodeText input =
      UTF8ToUnicodeText("Set a timer for 10 ,minutes, ,and, ,2, seconds, ok?");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // The expected span runs from "10" (codepoint 16) to just past "seconds"
  // (codepoint 46), leaving out the comma that follows it.
  const auto is_ten_minutes_two_seconds = AllOf(
      Field(&ClassificationResult::collection, "duration"),
      Field(&ClassificationResult::duration_ms, 10 * 60 * 1000 + 2 * 1000));
  EXPECT_THAT(
      annotations,
      ElementsAre(AllOf(Field(&AnnotatedSpan::span, CodepointSpan(16, 46)),
                        Field(&AnnotatedSpan::classification,
                              ElementsAre(is_ten_minutes_two_seconds)))));
}
349 
// A single token that joins quantity and unit with '-' ("5-minute") must be
// recognized as a duration.
TEST_F(DurationAnnotatorTest, FindsCorrectlyWithCombinedQuantityUnitToken) {
  const UnicodeText input = UTF8ToUnicodeText("Show 5-minute timer.");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // "5-minute" starts at codepoint 5 and is 8 codepoints long.
  const auto is_five_minutes =
      AllOf(Field(&ClassificationResult::collection, "duration"),
            Field(&ClassificationResult::duration_ms, 5 * 60 * 1000));
  EXPECT_THAT(
      annotations,
      ElementsAre(AllOf(Field(&AnnotatedSpan::span, CodepointSpan(5, 13)),
                        Field(&AnnotatedSpan::classification,
                              ElementsAre(is_five_minutes)))));
}
367 
// 1400 hours is 5,040,000,000 ms, which does not fit in a signed 32-bit
// integer; the reported duration must not be truncated or wrapped.
TEST_F(DurationAnnotatorTest,
       DoesNotIntOverflowWithDurationThatHasMoreThanInt32Millis) {
  const UnicodeText message = UTF8ToUnicodeText("1400 hours");
  // The selection covers the whole 10-codepoint text.
  const CodepointSpan selection(0, 10);
  ClassificationResult outcome;
  EXPECT_TRUE(duration_annotator_.ClassifyText(
      message, selection, AnnotationUsecase_ANNOTATION_USECASE_RAW, &outcome));

  const auto is_1400_hours =
      AllOf(Field(&ClassificationResult::collection, "duration"),
            Field(&ClassificationResult::duration_ms,
                  1400LL * 60LL * 60LL * 1000LL));
  EXPECT_THAT(outcome, is_1400_hours);
}
380 
// Unit words must be matched regardless of letter case ("MiNuTeS").
TEST_F(DurationAnnotatorTest, FindsSimpleDurationIgnoringCase) {
  const UnicodeText input = UTF8ToUnicodeText("Wake me up in 15 MiNuTeS ok?");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // "15 MiNuTeS" starts at codepoint 14 and is 10 codepoints long.
  const auto is_fifteen_minutes =
      AllOf(Field(&ClassificationResult::collection, "duration"),
            Field(&ClassificationResult::duration_ms, 15 * 60 * 1000));
  EXPECT_THAT(
      annotations,
      ElementsAre(AllOf(Field(&AnnotatedSpan::span, CodepointSpan(14, 24)),
                        Field(&AnnotatedSpan::classification,
                              ElementsAre(is_fifteen_minutes)))));
}
398 
// The "half" marker must be matched regardless of letter case ("HaLf").
TEST_F(DurationAnnotatorTest, FindsDurationWithHalfExpressionIgnoringCase) {
  const UnicodeText input =
      UTF8ToUnicodeText("Set a timer for 3 and HaLf minutes ok?");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // "3 and HaLf minutes" starts at codepoint 16 and is 18 codepoints long.
  const auto is_three_and_half_minutes =
      AllOf(Field(&ClassificationResult::collection, "duration"),
            Field(&ClassificationResult::duration_ms, 3.5 * 60 * 1000));
  EXPECT_THAT(
      annotations,
      ElementsAre(AllOf(Field(&AnnotatedSpan::span, CodepointSpan(16, 34)),
                        Field(&AnnotatedSpan::classification,
                              ElementsAre(is_three_and_half_minutes)))));
}
417 
// Filler words must be matched regardless of letter case ("AnD").
TEST_F(DurationAnnotatorTest,
       FindsDurationWithHalfExpressionIgnoringFillerWordCase) {
  const UnicodeText input =
      UTF8ToUnicodeText("Set a timer for 3 AnD half minutes ok?");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // "3 AnD half minutes" starts at codepoint 16 and is 18 codepoints long.
  const auto is_three_and_half_minutes =
      AllOf(Field(&ClassificationResult::collection, "duration"),
            Field(&ClassificationResult::duration_ms, 3.5 * 60 * 1000));
  EXPECT_THAT(
      annotations,
      ElementsAre(AllOf(Field(&AnnotatedSpan::span, CodepointSpan(16, 34)),
                        Field(&AnnotatedSpan::classification,
                              ElementsAre(is_three_and_half_minutes)))));
}
437 
// A trailing number without a unit ("20 minutes 10") is expected to be
// counted in the next smaller unit, here seconds.
TEST_F(DurationAnnotatorTest, FindsDurationWithDanglingQuantity) {
  const UnicodeText input = UTF8ToUnicodeText("20 minutes 10");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // The expected span covers the whole 13-codepoint text.
  const auto is_twenty_minutes_ten_seconds = AllOf(
      Field(&ClassificationResult::collection, "duration"),
      Field(&ClassificationResult::duration_ms, 20 * 60 * 1000 + 10 * 1000));
  EXPECT_THAT(
      annotations,
      ElementsAre(AllOf(Field(&AnnotatedSpan::span, CodepointSpan(0, 13)),
                        Field(&AnnotatedSpan::classification,
                              ElementsAre(is_twenty_minutes_ten_seconds)))));
}
455 
// A trailing number after "seconds" must be left out of the annotation.
// NOTE(review): presumably because no unit smaller than seconds is
// configured -- confirm against the annotator implementation.
TEST_F(DurationAnnotatorTest, FindsDurationWithDanglingQuantityNotSupported) {
  const UnicodeText input = UTF8ToUnicodeText("20 seconds 10");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // "20 seconds" is the first 10 codepoints; the trailing " 10" is excluded.
  const auto is_twenty_seconds =
      AllOf(Field(&ClassificationResult::collection, "duration"),
            Field(&ClassificationResult::duration_ms, 20 * 1000));
  EXPECT_THAT(
      annotations,
      ElementsAre(AllOf(Field(&AnnotatedSpan::span, CodepointSpan(0, 10)),
                        Field(&AnnotatedSpan::classification,
                              ElementsAre(is_twenty_seconds)))));
}
472 
// A decimal quantity ("10.2 hours") must be converted proportionally:
// 10.2 hours is expected as 10 hours plus 12 minutes.
TEST_F(DurationAnnotatorTest, FindsDurationWithDecimalQuantity) {
  const UnicodeText input = UTF8ToUnicodeText("in 10.2 hours");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // "10.2 hours" starts at codepoint 3 and is 10 codepoints long.
  const auto is_ten_hours_twelve_minutes =
      AllOf(Field(&ClassificationResult::collection, "duration"),
            Field(&ClassificationResult::duration_ms,
                  10 * 60 * 60 * 1000 + 12 * 60 * 1000));
  EXPECT_THAT(
      annotations,
      ElementsAre(AllOf(Field(&AnnotatedSpan::span, CodepointSpan(3, 13)),
                        Field(&AnnotatedSpan::classification,
                              ElementsAre(is_ten_hours_twelve_minutes)))));
}
490 
TestingJapaneseDurationAnnotatorOptions()491 const DurationAnnotatorOptions* TestingJapaneseDurationAnnotatorOptions() {
492   static const flatbuffers::DetachedBuffer* options_data = []() {
493     DurationAnnotatorOptionsT options;
494     options.enabled = true;
495 
496     options.week_expressions.push_back("週間");
497 
498     options.day_expressions.push_back("日間");
499 
500     options.hour_expressions.push_back("時間");
501 
502     options.minute_expressions.push_back("分");
503     options.minute_expressions.push_back("分間");
504 
505     options.second_expressions.push_back("秒");
506     options.second_expressions.push_back("秒間");
507 
508     options.half_expressions.push_back("半");
509 
510     options.require_quantity = true;
511     options.enable_dangling_quantity_interpretation = true;
512 
513     flatbuffers::FlatBufferBuilder builder;
514     builder.Finish(DurationAnnotatorOptions::Pack(builder, &options));
515     return new flatbuffers::DetachedBuffer(builder.Release());
516   }();
517 
518   return flatbuffers::GetRoot<DurationAnnotatorOptions>(options_data->data());
519 }
520 
521 class JapaneseDurationAnnotatorTest : public ::testing::Test {
522  protected:
JapaneseDurationAnnotatorTest()523   JapaneseDurationAnnotatorTest()
524       : INIT_UNILIB_FOR_TESTING(unilib_),
525         feature_processor_(BuildFeatureProcessor(&unilib_)),
526         duration_annotator_(TestingJapaneseDurationAnnotatorOptions(),
527                             feature_processor_.get(), &unilib_) {}
528 
Tokenize(const UnicodeText & text)529   std::vector<Token> Tokenize(const UnicodeText& text) {
530     return feature_processor_->Tokenize(text);
531   }
532 
533   UniLib unilib_;
534   std::unique_ptr<FeatureProcessor> feature_processor_;
535   DurationAnnotator duration_annotator_;
536 };
537 
// FindAll must locate a "<number> <unit>" duration with the Japanese options.
TEST_F(JapaneseDurationAnnotatorTest, FindsDuration) {
  const UnicodeText input = UTF8ToUnicodeText("10 分 の アラーム");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // "10 分" is the first 4 codepoints of the text.
  const auto is_ten_minutes =
      AllOf(Field(&ClassificationResult::collection, "duration"),
            Field(&ClassificationResult::duration_ms, 10 * 60 * 1000));
  EXPECT_THAT(
      annotations,
      ElementsAre(AllOf(Field(&AnnotatedSpan::span, CodepointSpan(0, 4)),
                        Field(&AnnotatedSpan::classification,
                              ElementsAre(is_ten_minutes)))));
}
555 
// A "half" marker placed after the unit must add half of that unit.
TEST_F(JapaneseDurationAnnotatorTest, FindsDurationWithHalfExpression) {
  const UnicodeText input = UTF8ToUnicodeText("2 分 半 の アラーム");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // "2 分 半" is the first 5 codepoints of the text.
  const auto is_two_and_half_minutes =
      AllOf(Field(&ClassificationResult::collection, "duration"),
            Field(&ClassificationResult::duration_ms, 2.5 * 60 * 1000));
  EXPECT_THAT(
      annotations,
      ElementsAre(AllOf(Field(&AnnotatedSpan::span, CodepointSpan(0, 5)),
                        Field(&AnnotatedSpan::classification,
                              ElementsAre(is_two_and_half_minutes)))));
}
573 
// The Japanese test options set require_quantity, so a unit word with no
// number in front of it must not be annotated.
TEST_F(JapaneseDurationAnnotatorTest, IgnoresDurationWithoutQuantity) {
  const UnicodeText input = UTF8ToUnicodeText("分 の アラーム");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  EXPECT_THAT(annotations, IsEmpty());
}
583 
// A trailing number without a unit ("2 分 10") is expected to be counted in
// the next smaller unit, here seconds.
TEST_F(JapaneseDurationAnnotatorTest, FindsDurationWithDanglingQuantity) {
  const UnicodeText input = UTF8ToUnicodeText("2 分 10 の アラーム");
  std::vector<Token> input_tokens = Tokenize(input);
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(duration_annotator_.FindAll(
      input, input_tokens, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // "2 分 10" is the first 6 codepoints of the text.
  const auto is_two_minutes_ten_seconds = AllOf(
      Field(&ClassificationResult::collection, "duration"),
      Field(&ClassificationResult::duration_ms, 2 * 60 * 1000 + 10 * 1000));
  EXPECT_THAT(
      annotations,
      ElementsAre(AllOf(Field(&AnnotatedSpan::span, CodepointSpan(0, 6)),
                        Field(&AnnotatedSpan::classification,
                              ElementsAre(is_two_minutes_ten_seconds)))));
}
601 
602 }  // namespace
603 }  // namespace libtextclassifier3
604