/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "annotator/number/number.h"

#include <string>
#include <vector>

#include "annotator/collections.h"
#include "annotator/model_generated.h"
#include "annotator/types-test-util.h"
#include "annotator/types.h"
#include "utils/test-utils.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"

namespace libtextclassifier3 {
namespace {

using testing::AllOf;
using testing::ElementsAre;
using testing::Field;

// Returns the NumberAnnotatorOptions shared by all tests in this file: the
// annotator is enabled, '$' is an allowed prefix code point and '%' is an
// allowed suffix code point.
//
// The serialized buffer is built on the first call only and is never deleted,
// so the returned pointer stays valid for the rest of the process.
const NumberAnnotatorOptions* TestingNumberAnnotatorOptions() {
  static const flatbuffers::DetachedBuffer* options_data = []() {
    NumberAnnotatorOptionsT options;
    options.enabled = true;
    options.allowed_prefix_codepoints.push_back('$');
    options.allowed_suffix_codepoints.push_back('%');

    flatbuffers::FlatBufferBuilder builder;
    builder.Finish(NumberAnnotatorOptions::Pack(builder, &options));
    return new flatbuffers::DetachedBuffer(builder.Release());
  }();

  return flatbuffers::GetRoot<NumberAnnotatorOptions>(options_data->data());
}

// Builds a FeatureProcessor on top of `unilib`, which is passed through to the
// FeatureProcessor constructor as a pointer.
//
// The options mark ',' as an ignored span-boundary code point and register a
// single tokenization range starting at code point 32 (ASCII space) with the
// WHITESPACE_SEPARATOR role. As with the annotator options, the serialized
// buffer is built once and never deleted.
FeatureProcessor BuildFeatureProcessor(const UniLib* unilib) {
  static const flatbuffers::DetachedBuffer* options_data = []() {
    FeatureProcessorOptionsT options;
    options.context_size = 1;
    options.max_selection_span = 1;
    options.snap_label_span_boundaries_to_containing_tokens = false;
    options.ignored_span_boundary_codepoints.push_back(',');

    options.tokenization_codepoint_config.emplace_back(
        new TokenizationCodepointRangeT());
    auto& config = options.tokenization_codepoint_config.back();
    config->start = 32;
    config->end = 33;
    config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

    flatbuffers::FlatBufferBuilder builder;
    builder.Finish(FeatureProcessorOptions::Pack(builder, &options));
    return new flatbuffers::DetachedBuffer(builder.Release());
  }();

  const FeatureProcessorOptions* feature_processor_options =
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_data->data());

  return FeatureProcessor(feature_processor_options, unilib);
}

// Fixture that wires a NumberAnnotator to the testing options above.
class NumberAnnotatorTest : public ::testing::Test {
 protected:
  NumberAnnotatorTest()
      : INIT_UNILIB_FOR_TESTING(unilib_),
        feature_processor_(BuildFeatureProcessor(&unilib_)),
        number_annotator_(TestingNumberAnnotatorOptions(),
                          &feature_processor_) {}

  // Declared in dependency order: each member is handed a pointer to the one
  // declared before it, and members are initialized in declaration order.
  UniLib unilib_;
  FeatureProcessor feature_processor_;
  NumberAnnotator number_annotator_;
};

// A plain run of digits selected by the span is classified as "number" and its
// value is parsed.
TEST_F(NumberAnnotatorTest, ClassifiesAndParsesNumberCorrectly) {
  ClassificationResult classification_result;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 12345 ..."), {4, 9},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));

  EXPECT_EQ(classification_result.collection, "number");
  EXPECT_EQ(classification_result.numeric_value, 12345);
}

// A selection containing a letter between digits is rejected.
TEST_F(NumberAnnotatorTest, ClassifiesNonNumberCorrectly) {
  ClassificationResult classification_result;
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 123a45 ..."), {4, 10},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));
}

// Expects exactly four numbers: 12345, 9, 99 (after the allowed '$' prefix)
// and 27 (before the allowed '%' suffix). "68#" and "#68" must not be
// reported; '#' is not among the configured prefix/suffix code points.
TEST_F(NumberAnnotatorTest, FindsAllNumbersInText) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("... 12345 ... 9 is my number and I paid $99 and "
                        "sometimes 27% but not 68# nor #68"),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));

  // ASSERT (not EXPECT) so that the indexing below never goes out of bounds.
  ASSERT_EQ(result.size(), 4);
  ASSERT_EQ(result[0].classification.size(), 1);
  EXPECT_EQ(result[0].classification[0].collection, "number");
  EXPECT_EQ(result[0].classification[0].numeric_value, 12345);
  ASSERT_EQ(result[1].classification.size(), 1);
  EXPECT_EQ(result[1].classification[0].collection, "number");
  EXPECT_EQ(result[1].classification[0].numeric_value, 9);
  ASSERT_EQ(result[2].classification.size(), 1);
  EXPECT_EQ(result[2].classification[0].collection, "number");
  EXPECT_EQ(result[2].classification[0].numeric_value, 99);
  ASSERT_EQ(result[3].classification.size(), 1);
  EXPECT_EQ(result[3].classification[0].collection, "number");
  EXPECT_EQ(result[3].classification[0].numeric_value, 27);
}

// The trailing ',' is excluded from the reported span: only "9" at [8, 9) is
// annotated.
TEST_F(NumberAnnotatorTest, FindsNumberWithPunctuation) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("Come at 9, ok?"),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));

  EXPECT_THAT(
      result,
      ElementsAre(
          AllOf(Field(&AnnotatedSpan::span, CodepointSpan(8, 9)),
                Field(&AnnotatedSpan::classification,
                      ElementsAre(AllOf(
                          Field(&ClassificationResult::collection, "number"),
                          Field(&ClassificationResult::numeric_value, 9)))))));
}

// A negative number at the very start of the text is found, and the span
// includes the leading minus sign.
TEST_F(NumberAnnotatorTest, HandlesNumbersAtBeginning) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("-5"), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &result));

  EXPECT_THAT(
      result,
      ElementsAre(
          AllOf(Field(&AnnotatedSpan::span, CodepointSpan(0, 2)),
                Field(&AnnotatedSpan::classification,
                      ElementsAre(AllOf(
                          Field(&ClassificationResult::collection, "number"),
                          Field(&ClassificationResult::numeric_value, -5)))))));
}

// An 18-digit negative value is still parsed.
TEST_F(NumberAnnotatorTest, WhenLowestSupportedNumberParsesIt) {
  ClassificationResult classification_result;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("-999999999999999999"), {0, 19},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));

  EXPECT_THAT(
      classification_result,
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, -999999999999999999L)));
}

// An 18-digit positive value is still parsed.
TEST_F(NumberAnnotatorTest, WhenLargestSupportedNumberParsesIt) {
  ClassificationResult classification_result;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("999999999999999999"), {0, 18},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));

  EXPECT_THAT(
      classification_result,
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, 999999999999999999L)));
}

// A 20-digit negative value is rejected.
TEST_F(NumberAnnotatorTest, WhenFirstLowestNonSupportedNumberDoesNotParseIt) {
  ClassificationResult classification_result;
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("-10000000000000000000"), {0, 21},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));
}

// A 20-digit positive value is rejected.
TEST_F(NumberAnnotatorTest, WhenFirstLargestNonSupportedNumberDoesNotParseIt) {
  ClassificationResult classification_result;
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("10000000000000000000"), {0, 20},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));
}

// A 40-digit value is rejected.
TEST_F(NumberAnnotatorTest, WhenLargeNumberDoesNotParseIt) {
  ClassificationResult classification_result;
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("1234567890123456789012345678901234567890"), {0, 40},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));
}

// More than one leading minus sign is rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleMinusSignsDoesNotParseIt) {
  ClassificationResult classification_result;
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("--10"), {0, 4},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));
}

// A trailing minus sign is rejected.
TEST_F(NumberAnnotatorTest, WhenMinusSignSuffixDoesNotParseIt) {
  ClassificationResult classification_result;
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("10-"), {0, 3},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));
}

// A minus sign between two digit runs is rejected when the whole string is
// selected.
TEST_F(NumberAnnotatorTest, WhenMinusInTheMiddleDoesNotParseIt) {
  ClassificationResult classification_result;
  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("2016-2017"), {0, 9},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));
}

// An allowed suffix code point with no digits attached yields no annotation.
TEST_F(NumberAnnotatorTest, WhenSuffixWithoutNumberDoesNotParseIt) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("... % ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &result));

  ASSERT_EQ(result.size(), 0);
}

// An allowed prefix code point with no digits attached yields no annotation.
TEST_F(NumberAnnotatorTest, WhenPrefixWithoutNumberDoesNotParseIt) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("... $ ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &result));

  ASSERT_EQ(result.size(), 0);
}

// A prefix immediately followed by a suffix, with no digits, yields no
// annotation.
TEST_F(NumberAnnotatorTest, WhenPrefixAndSuffixWithoutNumberDoesNotParseIt) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("... $% ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &result));

  ASSERT_EQ(result.size(), 0);
}

}  // namespace
}  // namespace libtextclassifier3