/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Tests for PodNerAnnotator: annotation, selection suggestion, text
// classification and concurrent use.
//
// NOTE(review): the system header names and every template argument in this
// file (e.g. the element types of the std::vector / std::unique_ptr
// declarations and the cast target types) were missing from the reviewed copy
// and have been reconstructed from how the values are used. Confirm them
// against the project headers before merging.

#include "annotator/pod_ner/pod-ner-impl.h"

#include <cstddef>
#include <memory>
#include <string>
#include <thread>  // NOLINT(build/c++11)
#include <utility>
#include <vector>

#include "annotator/model_generated.h"
#include "annotator/types.h"
#include "utils/jvm-test-utils.h"
#include "utils/test-data-test-utils.h"
#include "utils/tokenizer-utils.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"

namespace libtextclassifier3 {
namespace {

using ::testing::IsEmpty;
using ::testing::Not;

using PodNerModel_::Label_::BoiseType;
using PodNerModel_::Label_::BoiseType_BEGIN;
using PodNerModel_::Label_::BoiseType_END;
using PodNerModel_::Label_::BoiseType_INTERMEDIATE;
using PodNerModel_::Label_::BoiseType_O;
using PodNerModel_::Label_::BoiseType_SINGLE;
using PodNerModel_::Label_::MentionType;
using PodNerModel_::Label_::MentionType_NAM;
using PodNerModel_::Label_::MentionType_NOM;
using PodNerModel_::Label_::MentionType_UNDEFINED;

constexpr int kMinNumberOfTokens = 1;
constexpr int kMinNumberOfWordpieces = 1;
constexpr float kDefaultPriorityScore = 0.5;

// Long passage shared by the long-text tests. It contains only ASCII
// characters, so byte offsets obtained with std::string::find can be used
// directly as codepoint offsets.
constexpr char kLongText[] =
    "It is easy to spend a fun and exciting day in Seattle without a car. "
    "There are lots of ways to modify this itinerary. Add a ferry ride "
    "from the waterfront. Spending the day at the Seattle Center or at the "
    "aquarium could easily extend this from one to several days. Take the "
    "Underground Tour in Pioneer Square. Visit the Klondike Gold Rush "
    "Museum which is fun and free. In the summer months you can ride the "
    "passenger-only Water Taxi from the waterfront to West Seattle and "
    "Alki Beach. Here's a sample one day itinerary: Start at the Space "
    "Needle by taking the Seattle Monorail from downtown. Look around the "
    "Seattle Center or go to the Space Needle. If you're interested in "
    "music the EMP-SFM (Experience Music Project - Science Fiction Musuem) "
    "is located at the foot of the Space Needle. It has a lot of rock'n "
    "roll memorabilia that you may find interesting. The Chihuly Garden "
    "and Glass musuem is near the Space Needle and you can get a "
    "combination ticket for both. It gets really good reviews. If you're "
    "interested, then the Bill & Melinda Gates Foundation is across from "
    "the EMP and has a visitors center that is free. Come see how Bill "
    "Gates is giving away his millions. Take the Monorail back downtown. "
    "You will be at 5th and Pine (Westlake Center). Head west to the Pike "
    "Place Market. Look around then head for the Pike Place hill climb "
    "which is a series of steps that walk down to the waterfront. You will "
    "end up across the street from the Seattle Aquarium. Plenty of things "
    "to do on the waterfront, boat cruises, seafood restaurants, the "
    "Aquarium, or your typical tourist activities. You can walk or take "
    "the waterfront trolley bus. Note that waterfront construction has "
    "relocated the trolley Metro bus route 99 that will take you from "
    "Pioneer Square all the way to the end of the waterfront where you can "
    "visit the Seattle Art Musuem's XXX Sculpture Garden just north of "
    "Pier 70. The route goes thru Chinatown/International District, "
    "through Pioneer Square, up 1st ave past the Pike Place Market and to "
    "1st and Cedar which is walking distance to the Space Needle. It then "
    "goes down Broad Street toward the Olympic Sculpture Garden. It runs "
    "approximately every 30 minutes during the day and early evening.";

// Fixture that builds two serialized PodNerModel flatbuffers from the test
// data files: a default one and one with `append_final_period` enabled.
//
// `model_` and `model_append_final_period_` point into the corresponding
// `*_buffer_` strings, so the buffers must stay alive (and unmodified) for as
// long as the pointers are used.
class PodNerTest : public testing::Test {
 protected:
  PodNerTest() {
    PodNerModelT model;
    model.min_number_of_tokens = kMinNumberOfTokens;
    model.min_number_of_wordpieces = kMinNumberOfWordpieces;
    model.priority_score = kDefaultPriorityScore;

    const std::string tflite_model_buffer =
        GetTestFileContent("annotator/pod_ner/test_data/tflite_model.tflite");
    model.tflite_model.assign(tflite_model_buffer.begin(),
                              tflite_model_buffer.end());

    const std::string word_piece_vocab_buffer =
        GetTestFileContent("annotator/pod_ner/test_data/vocab.txt");
    model.word_piece_vocab.assign(word_piece_vocab_buffer.begin(),
                                  word_piece_vocab_buffer.end());

    // Default model.
    flatbuffers::FlatBufferBuilder builder;
    builder.Finish(PodNerModel::Pack(builder, &model));
    model_buffer_ =
        std::string(reinterpret_cast<const char*>(builder.GetBufferPointer()),
                    builder.GetSize());
    model_ = flatbuffers::GetRoot<PodNerModel>(model_buffer_.data());

    // Same model, but with `append_final_period` switched on.
    model.append_final_period = true;
    flatbuffers::FlatBufferBuilder builder_append_final_period;
    builder_append_final_period.Finish(
        PodNerModel::Pack(builder_append_final_period, &model));
    model_buffer_append_final_period_ =
        std::string(reinterpret_cast<const char*>(
                        builder_append_final_period.GetBufferPointer()),
                    builder_append_final_period.GetSize());
    model_append_final_period_ = flatbuffers::GetRoot<PodNerModel>(
        model_buffer_append_final_period_.data());

    unilib_ = CreateUniLibForTesting();
  }

  std::string model_buffer_;
  const PodNerModel* model_;
  std::string model_buffer_append_final_period_;
  const PodNerModel* model_append_final_period_;
  std::unique_ptr<UniLib> unilib_;
};

// Annotating ordinary sentences yields at least one annotation.
TEST_F(PodNerTest, AnnotateSmokeTest) {
  std::unique_ptr<PodNerAnnotator> annotator =
      PodNerAnnotator::Create(model_, *unilib_);
  ASSERT_TRUE(annotator != nullptr);

  {
    std::vector<AnnotatedSpan> annotations;
    ASSERT_TRUE(annotator->Annotate(
        UTF8ToUnicodeText("Google New York , in New York"), &annotations));
    EXPECT_THAT(annotations, Not(IsEmpty()));
  }

  {
    std::vector<AnnotatedSpan> annotations;
    ASSERT_TRUE(annotator->Annotate(
        UTF8ToUnicodeText("Jamie I'm in the first picture and Cameron and Zach "
                          "are in the second "
                          "picture."),
        &annotations));
    EXPECT_THAT(annotations, Not(IsEmpty()));
  }
}

// Annotating the empty string succeeds and yields no annotations.
TEST_F(PodNerTest, AnnotateEmptyInput) {
  std::unique_ptr<PodNerAnnotator> annotator =
      PodNerAnnotator::Create(model_, *unilib_);
  ASSERT_TRUE(annotator != nullptr);

  {
    std::vector<AnnotatedSpan> annotations;
    ASSERT_TRUE(annotator->Annotate(UTF8ToUnicodeText(""), &annotations));
    EXPECT_THAT(annotations, IsEmpty());
  }
}

// Replaces the contents of `collections` with one entry per name, taking the
// single-/multi-token priority scores from the parallel vectors.
//
// All three input vectors must have the same size; otherwise a fatal gtest
// failure is raised and `collections` is left untouched. Because this is a
// void helper, callers need ASSERT_NO_FATAL_FAILURE (or HasFatalFailure) to
// stop on that failure.
void FillCollections(
    const std::vector<std::string>& collection_names,
    const std::vector<float>& single_token_priority_scores,
    const std::vector<float>& multi_token_priority_scores,
    std::vector<std::unique_ptr<PodNerModel_::CollectionT>>* collections) {
  ASSERT_EQ(collection_names.size(), single_token_priority_scores.size());
  ASSERT_EQ(collection_names.size(), multi_token_priority_scores.size());

  collections->clear();
  collections->reserve(collection_names.size());
  for (size_t i = 0; i < collection_names.size(); ++i) {
    auto collection = std::make_unique<PodNerModel_::CollectionT>();
    collection->name = collection_names[i];
    collection->single_token_priority_score = single_token_priority_scores[i];
    collection->multi_token_priority_score = multi_token_priority_scores[i];
    collections->push_back(std::move(collection));
  }
}

// Appends a new label with the given BOISE type, mention type and collection
// id to `labels`.
void EmplaceToLabelVector(
    BoiseType boise_type, MentionType mention_type, int collection_id,
    std::vector<std::unique_ptr<PodNerModel_::LabelT>>* labels) {
  auto label = std::make_unique<PodNerModel_::LabelT>();
  label->boise_type = boise_type;
  label->mention_type = mention_type;
  label->collection_id = collection_id;
  labels->push_back(std::move(label));
}

// Replaces the contents of `labels` with the label set for `num_collections`
// collections, in this order:
//   1. BEGIN / END / INTERMEDIATE x NAM / NOM x collections [0, n - 1)
//   2. a single O label (UNDEFINED mention) for collection n - 1
//   3. SINGLE x NAM / NOM x collections [0, n - 1)
// The last collection index (n - 1) is the "undefined" one and is therefore
// skipped everywhere except for the O label.
void FillLabels(int num_collections,
                std::vector<std::unique_ptr<PodNerModel_::LabelT>>* labels) {
  const int undefined_collection_id = num_collections - 1;

  labels->clear();
  for (auto boise_type :
       {BoiseType_BEGIN, BoiseType_END, BoiseType_INTERMEDIATE}) {
    for (auto mention_type : {MentionType_NAM, MentionType_NOM}) {
      for (int i = 0; i < undefined_collection_id; ++i) {  // skip undefined
        EmplaceToLabelVector(boise_type, mention_type, i, labels);
      }
    }
  }
  EmplaceToLabelVector(BoiseType_O, MentionType_UNDEFINED,
                       undefined_collection_id, labels);
  for (auto mention_type : {MentionType_NAM, MentionType_NOM}) {
    for (int i = 0; i < undefined_collection_id; ++i) {  // skip undefined
      EmplaceToLabelVector(BoiseType_SINGLE, mention_type, i, labels);
    }
  }
}

// With the fixture's default model, locations are reported in the "location"
// collection with the model-wide default priority score.
TEST_F(PodNerTest, AnnotateDefaultCollections) {
  std::unique_ptr<PodNerAnnotator> annotator =
      PodNerAnnotator::Create(model_, *unilib_);
  ASSERT_TRUE(annotator != nullptr);

  const std::string multi_word_location = "I live in New York";
  const std::string single_word_location = "I live in Zurich";

  {
    std::vector<AnnotatedSpan> annotations;
    ASSERT_TRUE(annotator->Annotate(UTF8ToUnicodeText(multi_word_location),
                                    &annotations));
    // Fatal assertion: the first element is read below.
    ASSERT_THAT(annotations, Not(IsEmpty()));
    EXPECT_EQ(annotations[0].classification[0].collection, "location");
    EXPECT_EQ(annotations[0].classification[0].priority_score,
              kDefaultPriorityScore);

    annotations.clear();
    ASSERT_TRUE(annotator->Annotate(UTF8ToUnicodeText(single_word_location),
                                    &annotations));
    ASSERT_THAT(annotations, Not(IsEmpty()));
    EXPECT_EQ(annotations[0].classification[0].collection, "location");
    EXPECT_EQ(annotations[0].classification[0].priority_score,
              kDefaultPriorityScore);
  }
}

// Collections and their per-collection priority scores configured in the
// model are reflected in the annotations.
TEST_F(PodNerTest, AnnotateConfigurableCollections) {
  std::unique_ptr<PodNerModelT> unpacked_model(model_->UnPack());
  ASSERT_TRUE(unpacked_model != nullptr);

  const float xxx_single_token_priority = 0.9;
  const float xxx_multi_token_priority = 1.7;
  const std::vector<std::string> collection_names = {
      "art",          "consumer_good", "event",  "xxx",
      "organization", "ner_entity",    "person", "undefined"};
  const std::vector<float> single_token_priority_scores = {
      0., 0., 0., xxx_single_token_priority, 0., 0., 0., 0.};
  const std::vector<float> multi_token_priority_scores = {
      0., 0., 0., xxx_multi_token_priority, 0., 0., 0., 0.};

  // FillCollections reports bad input through a fatal failure; stop here if
  // that happens instead of running against a half-configured model.
  ASSERT_NO_FATAL_FAILURE(FillCollections(
      collection_names, single_token_priority_scores,
      multi_token_priority_scores, &(unpacked_model->collections)));
  FillLabels(static_cast<int>(collection_names.size()),
             &(unpacked_model->labels));

  flatbuffers::FlatBufferBuilder builder;
  builder.Finish(PodNerModel::Pack(builder, unpacked_model.get()));
  // The annotator is created from a pointer into this buffer, so the buffer
  // is declared before the annotator.
  const std::string model_buffer =
      std::string(reinterpret_cast<const char*>(builder.GetBufferPointer()),
                  builder.GetSize());
  std::unique_ptr<PodNerAnnotator> annotator = PodNerAnnotator::Create(
      flatbuffers::GetRoot<PodNerModel>(model_buffer.data()), *unilib_);
  ASSERT_TRUE(annotator != nullptr);

  const std::string multi_word_location = "I live in New York";
  const std::string single_word_location = "I live in Zurich";
  {
    std::vector<AnnotatedSpan> annotations;
    ASSERT_TRUE(annotator->Annotate(UTF8ToUnicodeText(multi_word_location),
                                    &annotations));
    ASSERT_THAT(annotations, Not(IsEmpty()));
    EXPECT_EQ(annotations[0].classification[0].collection, "xxx");
    EXPECT_EQ(annotations[0].classification[0].priority_score,
              xxx_multi_token_priority);

    annotations.clear();
    ASSERT_TRUE(annotator->Annotate(UTF8ToUnicodeText(single_word_location),
                                    &annotations));
    ASSERT_THAT(annotations, Not(IsEmpty()));
    EXPECT_EQ(annotations[0].classification[0].collection, "xxx");
    EXPECT_EQ(annotations[0].classification[0].priority_score,
              xxx_single_token_priority);
  }
}

// Text that is annotated with the default model yields no annotations once
// `min_number_of_tokens` is raised to 4.
TEST_F(PodNerTest, AnnotateMinNumTokens) {
  std::unique_ptr<PodNerAnnotator> annotator =
      PodNerAnnotator::Create(model_, *unilib_);
  ASSERT_TRUE(annotator != nullptr);

  const std::string text = "in New York";
  {
    std::vector<AnnotatedSpan> annotations;
    ASSERT_TRUE(annotator->Annotate(UTF8ToUnicodeText(text), &annotations));
    EXPECT_THAT(annotations, Not(IsEmpty()));
  }

  std::unique_ptr<PodNerModelT> unpacked_model(model_->UnPack());
  ASSERT_TRUE(unpacked_model != nullptr);

  unpacked_model->min_number_of_tokens = 4;
  flatbuffers::FlatBufferBuilder builder;
  builder.Finish(PodNerModel::Pack(builder, unpacked_model.get()));

  const std::string model_buffer =
      std::string(reinterpret_cast<const char*>(builder.GetBufferPointer()),
                  builder.GetSize());
  annotator = PodNerAnnotator::Create(
      flatbuffers::GetRoot<PodNerModel>(model_buffer.data()), *unilib_);
  ASSERT_TRUE(annotator != nullptr);
  {
    std::vector<AnnotatedSpan> annotations;
    ASSERT_TRUE(annotator->Annotate(UTF8ToUnicodeText(text), &annotations));
    EXPECT_THAT(annotations, IsEmpty());
  }
}

// Text that is annotated with the default model yields no annotations once
// `min_number_of_wordpieces` is raised to 10.
TEST_F(PodNerTest, AnnotateMinNumWordpieces) {
  std::unique_ptr<PodNerAnnotator> annotator =
      PodNerAnnotator::Create(model_, *unilib_);
  ASSERT_TRUE(annotator != nullptr);

  const std::string text = "in New York";
  {
    std::vector<AnnotatedSpan> annotations;
    ASSERT_TRUE(annotator->Annotate(UTF8ToUnicodeText(text), &annotations));
    EXPECT_THAT(annotations, Not(IsEmpty()));
  }

  std::unique_ptr<PodNerModelT> unpacked_model(model_->UnPack());
  ASSERT_TRUE(unpacked_model != nullptr);

  unpacked_model->min_number_of_wordpieces = 10;
  flatbuffers::FlatBufferBuilder builder;
  builder.Finish(PodNerModel::Pack(builder, unpacked_model.get()));

  const std::string model_buffer =
      std::string(reinterpret_cast<const char*>(builder.GetBufferPointer()),
                  builder.GetSize());
  annotator = PodNerAnnotator::Create(
      flatbuffers::GetRoot<PodNerModel>(model_buffer.data()), *unilib_);
  ASSERT_TRUE(annotator != nullptr);
  {
    std::vector<AnnotatedSpan> annotations;
    ASSERT_TRUE(annotator->Annotate(UTF8ToUnicodeText(text), &annotations));
    EXPECT_THAT(annotations, IsEmpty());
  }
}

// A long blob without natural-language structure is processed successfully
// and yields no annotations.
TEST_F(PodNerTest, AnnotateNonstandardText) {
  std::unique_ptr<PodNerAnnotator> annotator =
      PodNerAnnotator::Create(model_, *unilib_);
  ASSERT_TRUE(annotator != nullptr);

  const std::string nonstandard_text =
      "abcNxCDU1RWNvbXByLXI4NS8xNzcwLzE3NzA4NDY2L3J1Ymluby1raWRzLXJlY2xpbmVyLXd"
      "pdGgtY3VwLWhvbGRlci5qcGc=/"
      "UnViaW5vIEtpZHMgUmVjbGluZXIgd2l0aCBDdXAgSG9sZGVyIGJ5IEhhcnJpZXQgQmVl."
      "html>";

  std::vector<AnnotatedSpan> annotations;
  ASSERT_TRUE(
      annotator->Annotate(UTF8ToUnicodeText(nonstandard_text), &annotations));
  EXPECT_THAT(annotations, IsEmpty());
}

// A control character ("\x09", horizontal tab) directly after the entity must
// not become part of the annotated span.
TEST_F(PodNerTest, AnnotateTextWithLinefeed) {
  std::unique_ptr<PodNerAnnotator> annotator =
      PodNerAnnotator::Create(model_, *unilib_);
  ASSERT_TRUE(annotator != nullptr);

  // The text is built in two steps: writing "\x09and" in a single literal
  // would make the 'a' part of the hexadecimal escape sequence.
  std::string nonstandard_text = "My name is Kuba\x09";
  nonstandard_text += "and this is a test.";

  std::vector<AnnotatedSpan> annotations;
  ASSERT_TRUE(
      annotator->Annotate(UTF8ToUnicodeText(nonstandard_text), &annotations));
  ASSERT_THAT(annotations, Not(IsEmpty()));
  EXPECT_EQ(annotations[0].span, CodepointSpan(11, 15));

  nonstandard_text = "My name is Kuba\x09 and this is a test.";
  // Drop the results of the first call so that the checks below can only be
  // satisfied by the second call (other tests in this file clear as well).
  annotations.clear();
  ASSERT_TRUE(
      annotator->Annotate(UTF8ToUnicodeText(nonstandard_text), &annotations));
  ASSERT_THAT(annotations, Not(IsEmpty()));
  EXPECT_EQ(annotations[0].span, CodepointSpan(11, 15));
}

// Appending text made of unknown wordpieces: the short input yields no
// annotations, the long input still yields some.
TEST_F(PodNerTest, AnnotateWithUnknownWordpieces) {
  std::unique_ptr<PodNerAnnotator> annotator =
      PodNerAnnotator::Create(model_, *unilib_);
  ASSERT_TRUE(annotator != nullptr);

  const std::string long_text =
      "It is easy to spend a fun and exciting day in Seattle without a car. "
      "There are lots of ways to modify this itinerary. Add a ferry ride "
      "from the waterfront. Spending the day at the Seattle Center or at the "
      "aquarium could easily extend this from one to several days. Take the "
      "Underground Tour in Pioneer Square. Visit the Klondike Gold Rush "
      "Museum which is fun and free. In the summer months you can ride the "
      "passenger-only Water Taxi from the waterfront to West Seattle and "
      "Alki Beach. Here's a sample one day itinerary: Start at the Space "
      "Needle by taking the Seattle Monorail from downtown. Look around the "
      "Seattle Center or go to the Space Needle.";
  const std::string text_with_unknown_wordpieces = "před chvílí";

  std::vector<AnnotatedSpan> annotations;
  ASSERT_TRUE(
      annotator->Annotate(UTF8ToUnicodeText("Google New York , in New York. " +
                                            text_with_unknown_wordpieces),
                          &annotations));
  EXPECT_THAT(annotations, IsEmpty());

  // Start from a clean vector so the next check reflects only the next call.
  annotations.clear();
  ASSERT_TRUE(annotator->Annotate(
      UTF8ToUnicodeText(long_text + " " + text_with_unknown_wordpieces),
      &annotations));
  EXPECT_THAT(annotations, Not(IsEmpty()));
}

// Parameterized over whether the model has `append_final_period` enabled.
class PodNerTestWithOrWithoutFinalPeriod
    : public PodNerTest,
      public testing::WithParamInterface<bool> {};

INSTANTIATE_TEST_SUITE_P(TestAnnotateLongText,
                         PodNerTestWithOrWithoutFinalPeriod,
                         testing::Values(true, false));

// In a long text, the first annotation is the first "Seattle" and the last
// annotation is "Olympic Sculpture Garden".
TEST_P(PodNerTestWithOrWithoutFinalPeriod, AnnotateLongText) {
  std::unique_ptr<PodNerAnnotator> annotator = PodNerAnnotator::Create(
      GetParam() ? model_append_final_period_ : model_, *unilib_);
  ASSERT_TRUE(annotator != nullptr);

  const std::string long_text = kLongText;

  std::vector<AnnotatedSpan> annotations;
  ASSERT_TRUE(annotator->Annotate(UTF8ToUnicodeText(long_text), &annotations));
  // Fatal assertion: the first and last elements are read below.
  ASSERT_THAT(annotations, Not(IsEmpty()));

  const std::string location_from_beginning = "Seattle";
  const size_t pos_location_from_beginning =
      long_text.find(location_from_beginning);
  ASSERT_NE(pos_location_from_beginning, std::string::npos);
  const int start_span_location_from_beginning =
      static_cast<int>(pos_location_from_beginning);
  EXPECT_EQ(annotations.front().span,
            CodepointSpan(start_span_location_from_beginning,
                          start_span_location_from_beginning +
                              static_cast<int>(
                                  location_from_beginning.length())));

  const std::string location_from_end = "Olympic Sculpture Garden";
  const size_t pos_location_from_end = long_text.find(location_from_end);
  ASSERT_NE(pos_location_from_end, std::string::npos);
  const int start_span_location_from_end =
      static_cast<int>(pos_location_from_end);
  const AnnotatedSpan& last_annotation = annotations.back();
  EXPECT_EQ(
      last_annotation.span,
      CodepointSpan(start_span_location_from_end,
                    start_span_location_from_end +
                        static_cast<int>(location_from_end.length())));
}

// Selecting just "Klondike" inside a long text expands the selection to
// "Klondike Gold Rush Museum".
TEST_F(PodNerTest, SuggestSelectionLongText) {
  std::unique_ptr<PodNerAnnotator> annotator =
      PodNerAnnotator::Create(model_, *unilib_);
  ASSERT_TRUE(annotator != nullptr);

  const std::string long_text = kLongText;

  const std::string klondike = "Klondike Gold Rush Museum";
  const size_t klondike_pos = long_text.find(klondike);
  ASSERT_NE(klondike_pos, std::string::npos);
  const int klondike_start = static_cast<int>(klondike_pos);

  AnnotatedSpan suggested_span;
  // The clicked span covers the 8 characters of "Klondike".
  EXPECT_TRUE(annotator->SuggestSelection(UTF8ToUnicodeText(long_text),
                                          {klondike_start, klondike_start + 8},
                                          &suggested_span));
  EXPECT_EQ(suggested_span.span,
            CodepointSpan(klondike_start,
                          klondike_start +
                              static_cast<int>(klondike.length())));
}

// A click inside an entity expands to the entity; a click outside any entity
// fails and reports an invalid span.
TEST_F(PodNerTest, SuggestSelectionTest) {
  std::unique_ptr<PodNerAnnotator> annotator =
      PodNerAnnotator::Create(model_, *unilib_);
  ASSERT_TRUE(annotator != nullptr);

  AnnotatedSpan suggested_span;
  EXPECT_TRUE(annotator->SuggestSelection(
      UTF8ToUnicodeText("Google New York, in New York"), {7, 10},
      &suggested_span));
  EXPECT_EQ(suggested_span.span, CodepointSpan(7, 15));
  EXPECT_FALSE(annotator->SuggestSelection(
      UTF8ToUnicodeText("Google New York, in New York"), {17, 19},
      &suggested_span));
  EXPECT_EQ(suggested_span.span, CodepointSpan(kInvalidIndex, kInvalidIndex));
}

// Classifying the span covering "New York" yields the "location" collection.
TEST_F(PodNerTest, ClassifyTextTest) {
  std::unique_ptr<PodNerAnnotator> annotator =
      PodNerAnnotator::Create(model_, *unilib_);
  ASSERT_TRUE(annotator != nullptr);

  ClassificationResult result;
  ASSERT_TRUE(annotator->ClassifyText(UTF8ToUnicodeText("We met in New York"),
                                      {10, 18}, &result));
  EXPECT_EQ(result.collection, "location");
}

TEST_F(PodNerTest, ThreadSafety) {
  std::unique_ptr<PodNerAnnotator> annotator =
      PodNerAnnotator::Create(model_, *unilib_);
  ASSERT_TRUE(annotator != nullptr);

  // Do inference in 20 threads. When run with --config=tsan, this should fire
  // if there's a problem.
  std::vector<std::thread> thread_pool(20);
  for (std::thread& thread : thread_pool) {
    // `annotator` is captured by reference; every thread is joined below,
    // before it goes out of scope.
    thread = std::thread([&annotator]() {
      AnnotatedSpan suggested_span;
      EXPECT_TRUE(annotator->SuggestSelection(
          UTF8ToUnicodeText("Google New York, in New York"), {7, 10},
          &suggested_span));
      EXPECT_EQ(suggested_span.span, CodepointSpan(7, 15));
    });
  }
  for (std::thread& thread : thread_pool) {
    thread.join();
  }
}

}  // namespace
}  // namespace libtextclassifier3