/* * Copyright (C) 2018 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include "gmock/gmock.h" #include "gtest/gtest.h" #include "utils/sentencepiece/double_array_trie.h" #include "utils/sentencepiece/normalizer.h" #include "utils/sentencepiece/test_utils.h" #include "utils/strings/stringpiece.h" namespace libtextclassifier3 { namespace { std::string GetTestConfigPath() { return ""; } TEST(NormalizerTest, NormalizesAsReferenceNormalizer) { std::ifstream test_config_stream(GetTestConfigPath()); std::string config((std::istreambuf_iterator(test_config_stream)), (std::istreambuf_iterator())); SentencePieceNormalizer normalizer = NormalizerFromSpec(config, /*add_dummy_prefix=*/true, /*remove_extra_whitespaces=*/true, /*escape_whitespaces=*/true); { std::string normalized; EXPECT_TRUE(normalizer.Normalize("hello there", &normalized)); EXPECT_EQ(normalized, "▁hello▁there"); } // Redundant whitespace. { std::string normalized; EXPECT_TRUE(normalizer.Normalize("when is the world cup?", &normalized)); EXPECT_EQ(normalized, "▁when▁is▁the▁world▁cup?"); } // Different whitespace. { std::string normalized; EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized)); EXPECT_EQ(normalized, "▁general▁kenobi"); } // NFKC char to multi-char normalization. { std::string normalized; EXPECT_TRUE(normalizer.Normalize("㍿", &normalized)); EXPECT_EQ(normalized, "▁株式会社"); } // Half width katakana, character composition happens. { std::string normalized; EXPECT_TRUE(normalizer.Normalize(" グーグル ", &normalized)); EXPECT_EQ(normalized, "▁グーグル"); } // NFKC char to char normalization. { std::string normalized; EXPECT_TRUE(normalizer.Normalize("①②③", &normalized)); EXPECT_EQ(normalized, "▁123"); } } TEST(NormalizerTest, NoDummyPrefix) { std::ifstream test_config_stream(GetTestConfigPath()); std::string config((std::istreambuf_iterator(test_config_stream)), (std::istreambuf_iterator())); SentencePieceNormalizer normalizer = NormalizerFromSpec(config, /*add_dummy_prefix=*/false, /*remove_extra_whitespaces=*/true, /*escape_whitespaces=*/true); // NFKC char to char normalization. { std::string normalized; EXPECT_TRUE(normalizer.Normalize("hello there", &normalized)); EXPECT_EQ(normalized, "hello▁there"); } // Redundant whitespace. { std::string normalized; EXPECT_TRUE(normalizer.Normalize("when is the world cup?", &normalized)); EXPECT_EQ(normalized, "when▁is▁the▁world▁cup?"); } // Different whitespace. { std::string normalized; EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized)); EXPECT_EQ(normalized, "general▁kenobi"); } // NFKC char to multi-char normalization. { std::string normalized; EXPECT_TRUE(normalizer.Normalize("㍿", &normalized)); EXPECT_EQ(normalized, "株式会社"); } // Half width katakana, character composition happens. { std::string normalized; EXPECT_TRUE(normalizer.Normalize(" グーグル ", &normalized)); EXPECT_EQ(normalized, "グーグル"); } // NFKC char to char normalization. { std::string normalized; EXPECT_TRUE(normalizer.Normalize("①②③", &normalized)); EXPECT_EQ(normalized, "123"); } } TEST(NormalizerTest, NoRemoveExtraWhitespace) { std::ifstream test_config_stream(GetTestConfigPath()); std::string config((std::istreambuf_iterator(test_config_stream)), (std::istreambuf_iterator())); SentencePieceNormalizer normalizer = NormalizerFromSpec(config, /*add_dummy_prefix=*/false, /*remove_extra_whitespaces=*/false, /*escape_whitespaces=*/true); { std::string normalized; EXPECT_TRUE(normalizer.Normalize("hello there", &normalized)); EXPECT_EQ(normalized, "hello▁there"); } // Redundant whitespace. { std::string normalized; EXPECT_TRUE(normalizer.Normalize("when is the world cup?", &normalized)); EXPECT_EQ(normalized, "when▁is▁▁the▁▁world▁cup?"); } // Different whitespace. { std::string normalized; EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized)); EXPECT_EQ(normalized, "general▁kenobi"); } } TEST(NormalizerTest, NoEscapeWhitespaces) { std::ifstream test_config_stream(GetTestConfigPath()); std::string config((std::istreambuf_iterator(test_config_stream)), (std::istreambuf_iterator())); SentencePieceNormalizer normalizer = NormalizerFromSpec(config, /*add_dummy_prefix=*/false, /*remove_extra_whitespaces=*/false, /*escape_whitespaces=*/false); { std::string normalized; EXPECT_TRUE(normalizer.Normalize("hello there", &normalized)); EXPECT_EQ(normalized, "hello there"); } // Redundant whitespace. { std::string normalized; EXPECT_TRUE(normalizer.Normalize("when is the world cup?", &normalized)); EXPECT_EQ(normalized, "when is the world cup?"); } // Different whitespace. { std::string normalized; EXPECT_TRUE(normalizer.Normalize("general\tkenobi", &normalized)); EXPECT_EQ(normalized, "general kenobi"); } } } // namespace } // namespace libtextclassifier3