/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Unit tests for the lexer.

#include "utils/grammar/parsing/lexer.h"

#include <memory>
#include <string>
#include <vector>

#include "annotator/types.h"
#include "utils/jvm-test-utils.h"
#include "utils/tokenizer.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"

namespace libtextclassifier3::grammar {

operator <<(std::ostream & os,const Symbol & symbol)35 std::ostream& operator<<(std::ostream& os, const Symbol& symbol) {
36 return os << "Symbol(type=" << static_cast<int>(symbol.type) << ", span=("
37 << symbol.codepoint_span.first << ", "
38 << symbol.codepoint_span.second
39 << "), lexeme=" << symbol.lexeme.ToString() << ")";
40 }

namespace {

using ::testing::DescribeMatcher;
using ::testing::ElementsAre;
using ::testing::ExplainMatchResult;

// Superclass of all tests here.
49 class LexerTest : public testing::Test {
50 protected:
LexerTest()51 explicit LexerTest()
52 : unilib_(libtextclassifier3::CreateUniLibForTesting()),
53 tokenizer_(TokenizationType_ICU, unilib_.get(),
54 /*codepoint_ranges=*/{},
55 /*internal_tokenizer_codepoint_ranges=*/{},
56 /*split_on_script_change=*/false,
57 /*icu_preserve_whitespace_tokens=*/false),
58 lexer_(unilib_.get()) {}
59
SymbolsForTokens(const std::vector<Token> & tokens) const60 std::vector<Symbol> SymbolsForTokens(const std::vector<Token>& tokens) const {
61 std::vector<Symbol> symbols;
62 for (const Token& token : tokens) {
63 lexer_.AppendTokenSymbols(token.value, token.start,
64 CodepointSpan{token.start, token.end},
65 &symbols);
66 }
67 return symbols;
68 }
69
70 std::unique_ptr<UniLib> unilib_;
71 Tokenizer tokenizer_;
72 Lexer lexer_;
73 };
75 MATCHER_P4(IsSymbol, type, begin, end, terminal,
76 "is symbol with type that " +
77 DescribeMatcher<Symbol::Type>(type, negation) + ", begin that " +
78 DescribeMatcher<int>(begin, negation) + ", end that " +
79 DescribeMatcher<int>(end, negation) + ", value that " +
80 DescribeMatcher<std::string>(terminal, negation)) {
81 return ExplainMatchResult(type, arg.type, result_listener) &&
82 ExplainMatchResult(CodepointSpan(begin, end), arg.codepoint_span,
83 result_listener) &&
84 ExplainMatchResult(terminal, arg.lexeme.ToString(), result_listener);
85 }
// Plain whitespace-separated words each become a single TYPE_TERM symbol.
TEST_F(LexerTest, HandlesSimpleWords) {
  const std::vector<Symbol> symbols =
      SymbolsForTokens(tokenizer_.Tokenize("This is a word"));
  EXPECT_THAT(symbols,
              ElementsAre(IsSymbol(Symbol::Type::TYPE_TERM, 0, 4, "This"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 5, 7, "is"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 8, 9, "a"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 10, 14, "word")));
}
// Runs of digits and runs of letters inside one token are split into
// alternating TYPE_DIGITS / TYPE_TERM symbols.
TEST_F(LexerTest, SplitsConcatedLettersAndDigit) {
  const std::vector<Symbol> symbols =
      SymbolsForTokens(tokenizer_.Tokenize("1234This a4321cde"));
  EXPECT_THAT(symbols,
              ElementsAre(IsSymbol(Symbol::Type::TYPE_DIGITS, 0, 4, "1234"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 4, 8, "This"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 9, 10, "a"),
                          IsSymbol(Symbol::Type::TYPE_DIGITS, 10, 14, "4321"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 14, 17, "cde")));
}
// Each ASCII punctuation codepoint becomes its own TYPE_PUNCTUATION symbol.
TEST_F(LexerTest, SplitsPunctuation) {
  const std::vector<Symbol> symbols =
      SymbolsForTokens(tokenizer_.Tokenize("10/18/2014"));
  EXPECT_THAT(symbols,
              ElementsAre(IsSymbol(Symbol::Type::TYPE_DIGITS, 0, 2, "10"),
                          IsSymbol(Symbol::Type::TYPE_PUNCTUATION, 2, 3, "/"),
                          IsSymbol(Symbol::Type::TYPE_DIGITS, 3, 5, "18"),
                          IsSymbol(Symbol::Type::TYPE_PUNCTUATION, 5, 6, "/"),
                          IsSymbol(Symbol::Type::TYPE_DIGITS, 6, 10, "2014")));
}
// Non-ASCII (fullwidth / CJK) punctuation is also split into individual
// TYPE_PUNCTUATION symbols; spans are in codepoints, not bytes.
TEST_F(LexerTest, SplitsUTF8Punctuation) {
  const std::vector<Symbol> symbols =
      SymbolsForTokens(tokenizer_.Tokenize("电话:0871—6857(曹"));
  EXPECT_THAT(
      symbols,
      ElementsAre(IsSymbol(Symbol::Type::TYPE_TERM, 0, 2, "电话"),
                  IsSymbol(Symbol::Type::TYPE_PUNCTUATION, 2, 3, ":"),
                  IsSymbol(Symbol::Type::TYPE_DIGITS, 3, 7, "0871"),
                  IsSymbol(Symbol::Type::TYPE_PUNCTUATION, 7, 8, "—"),
                  IsSymbol(Symbol::Type::TYPE_DIGITS, 8, 12, "6857"),
                  IsSymbol(Symbol::Type::TYPE_PUNCTUATION, 12, 13, "("),
                  IsSymbol(Symbol::Type::TYPE_TERM, 13, 14, "曹")));
}
// Same as SplitsUTF8Punctuation but with a space before the punctuation, so
// all subsequent codepoint offsets are shifted by one.
TEST_F(LexerTest, HandlesMixedPunctuation) {
  const std::vector<Symbol> symbols =
      SymbolsForTokens(tokenizer_.Tokenize("电话 :0871—6857(曹"));
  EXPECT_THAT(
      symbols,
      ElementsAre(IsSymbol(Symbol::Type::TYPE_TERM, 0, 2, "电话"),
                  IsSymbol(Symbol::Type::TYPE_PUNCTUATION, 3, 4, ":"),
                  IsSymbol(Symbol::Type::TYPE_DIGITS, 4, 8, "0871"),
                  IsSymbol(Symbol::Type::TYPE_PUNCTUATION, 8, 9, "—"),
                  IsSymbol(Symbol::Type::TYPE_DIGITS, 9, 13, "6857"),
                  IsSymbol(Symbol::Type::TYPE_PUNCTUATION, 13, 14, "("),
                  IsSymbol(Symbol::Type::TYPE_TERM, 14, 15, "曹")));
}
// Mixed case, embedded newlines, letter/digit boundaries, and a UTF-8 em-dash
// (\xE2\x80\x94) are all handled; whitespace contributes no symbols.
TEST_F(LexerTest, HandlesTokensWithDigits) {
  const std::vector<Symbol> symbols = SymbolsForTokens(
      tokenizer_.Tokenize("The.qUIck\n brown2345fox88 \xE2\x80\x94 the"));
  EXPECT_THAT(symbols,
              ElementsAre(IsSymbol(Symbol::Type::TYPE_TERM, 0, 3, "The"),
                          IsSymbol(Symbol::Type::TYPE_PUNCTUATION, 3, 4, "."),
                          IsSymbol(Symbol::Type::TYPE_TERM, 4, 9, "qUIck"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 11, 16, "brown"),
                          IsSymbol(Symbol::Type::TYPE_DIGITS, 16, 20, "2345"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 20, 23, "fox"),
                          IsSymbol(Symbol::Type::TYPE_DIGITS, 23, 25, "88"),
                          IsSymbol(Symbol::Type::TYPE_PUNCTUATION, 26, 27, "—"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 28, 31, "the")));
}
// '+' is lexed as a TYPE_TERM symbol (not punctuation), one symbol per sign.
TEST_F(LexerTest, SplitsPlusSigns) {
  const std::vector<Symbol> symbols =
      SymbolsForTokens(tokenizer_.Tokenize("The+2345++the +"));
  EXPECT_THAT(symbols,
              ElementsAre(IsSymbol(Symbol::Type::TYPE_TERM, 0, 3, "The"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 3, 4, "+"),
                          IsSymbol(Symbol::Type::TYPE_DIGITS, 4, 8, "2345"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 8, 9, "+"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 9, 10, "+"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 10, 13, "the"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 14, 15, "+")));
}
}  // namespace
}  // namespace libtextclassifier3::grammar
