/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "utils/bert_tokenizer.h"

#include <memory>
#include <string>
#include <vector>

#include "utils/test-data-test-utils.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"

namespace libtextclassifier3 {

using ::testing::ElementsAre;

namespace {

constexpr char kTestVocabPath[] = "annotator/pod_ner/test_data/vocab.txt";

// Checks that the given tokenizer splits "i'm question" into the expected
// subwords.
void AssertTokenizerResults(std::unique_ptr<BertTokenizer> tokenizer) {
  auto results = tokenizer->Tokenize("i'm question");

  EXPECT_THAT(results.subwords, ElementsAre("i", "'", "m", "question"));
}

TEST(BertTokenizerTest, TestTokenizerCreationFromBuffer) {
  std::string buffer = GetTestFileContent(kTestVocabPath);

  auto tokenizer =
      absl::make_unique<BertTokenizer>(buffer.data(), buffer.size());

  AssertTokenizerResults(std::move(tokenizer));
}

TEST(BertTokenizerTest, TestTokenizerCreationFromFile) {
  auto tokenizer =
      absl::make_unique<BertTokenizer>(GetTestDataPath(kTestVocabPath));

  AssertTokenizerResults(std::move(tokenizer));
}

TEST(BertTokenizerTest, TestTokenizerCreationFromVector) {
  std::vector<std::string> vocab;
  vocab.emplace_back("i");
  vocab.emplace_back("'");
  vocab.emplace_back("m");
  vocab.emplace_back("question");
  auto tokenizer = absl::make_unique<BertTokenizer>(vocab);

  AssertTokenizerResults(std::move(tokenizer));
}

TEST(BertTokenizerTest, TestTokenizerMultipleRows) {
  auto tokenizer =
      absl::make_unique<BertTokenizer>(GetTestDataPath(kTestVocabPath));

  auto results = tokenizer->Tokenize("i'm questionansweraskask");

  EXPECT_THAT(results.subwords, ElementsAre("i", "'", "m", "question", "##ans",
                                            "##wer", "##ask", "##ask"));
}

TEST(BertTokenizerTest, TestTokenizerUnknownTokens) {
  std::vector<std::string> vocab;
  vocab.emplace_back("i");
  vocab.emplace_back("'");
  vocab.emplace_back("m");
  vocab.emplace_back("question");
  auto tokenizer = absl::make_unique<BertTokenizer>(vocab);

  auto results = tokenizer->Tokenize("i'm questionansweraskask");

  // "questionansweraskask" cannot be covered by the vocabulary above, so it
  // is mapped to the unknown token.
  EXPECT_THAT(results.subwords,
              ElementsAre("i", "'", "m", kDefaultUnknownToken));
}

TEST(BertTokenizerTest, TestLookupId) {
  std::vector<std::string> vocab;
  vocab.emplace_back("i");
  vocab.emplace_back("'");
  vocab.emplace_back("m");
  vocab.emplace_back("question");
  auto tokenizer = absl::make_unique<BertTokenizer>(vocab);

  int i;
  ASSERT_FALSE(tokenizer->LookupId("iDontExist", &i));

  ASSERT_TRUE(tokenizer->LookupId("i", &i));
  ASSERT_EQ(i, 0);
  ASSERT_TRUE(tokenizer->LookupId("'", &i));
  ASSERT_EQ(i, 1);
  ASSERT_TRUE(tokenizer->LookupId("m", &i));
  ASSERT_EQ(i, 2);
  ASSERT_TRUE(tokenizer->LookupId("question", &i));
  ASSERT_EQ(i, 3);
}

TEST(BertTokenizerTest, TestLookupWord) {
  std::vector<std::string> vocab;
  vocab.emplace_back("i");
  vocab.emplace_back("'");
  vocab.emplace_back("m");
  vocab.emplace_back("question");
  auto tokenizer = absl::make_unique<BertTokenizer>(vocab);

  absl::string_view result;
  ASSERT_FALSE(tokenizer->LookupWord(6, &result));

  ASSERT_TRUE(tokenizer->LookupWord(0, &result));
  ASSERT_EQ(result, "i");
  ASSERT_TRUE(tokenizer->LookupWord(1, &result));
  ASSERT_EQ(result, "'");
  ASSERT_TRUE(tokenizer->LookupWord(2, &result));
  ASSERT_EQ(result, "m");
  ASSERT_TRUE(tokenizer->LookupWord(3, &result));
  ASSERT_EQ(result, "question");
}

TEST(BertTokenizerTest, TestContains) {
  std::vector<std::string> vocab;
  vocab.emplace_back("i");
  vocab.emplace_back("'");
  vocab.emplace_back("m");
  vocab.emplace_back("question");
  auto tokenizer = absl::make_unique<BertTokenizer>(vocab);

  bool result;
  tokenizer->Contains("iDontExist", &result);
  ASSERT_FALSE(result);

  tokenizer->Contains("i", &result);
  ASSERT_TRUE(result);
  tokenizer->Contains("'", &result);
  ASSERT_TRUE(result);
  tokenizer->Contains("m", &result);
  ASSERT_TRUE(result);
  tokenizer->Contains("question", &result);
  ASSERT_TRUE(result);
}

TEST(BertTokenizerTest, TestVocabularySize) {
  std::vector<std::string> vocab;
  vocab.emplace_back("i");
  vocab.emplace_back("'");
  vocab.emplace_back("m");
  vocab.emplace_back("question");
  auto tokenizer = absl::make_unique<BertTokenizer>(vocab);

  ASSERT_EQ(tokenizer->VocabularySize(), 4);
}

TEST(BertTokenizerTest, SimpleEnglishWithPunctuation) {
  absl::string_view input = "I am fine, thanks!";

  std::vector<std::string> tokens = BertTokenizer::PreTokenize(input);

  EXPECT_THAT(tokens, testing::ElementsAreArray(
                          {"I", "am", "fine", ",", "thanks", "!"}));
}

}  // namespace
}  // namespace libtextclassifier3