/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Unit tests for the lexer.

#include "utils/grammar/parsing/lexer.h"

#include <memory>
#include <string>
#include <vector>

#include "annotator/types.h"
#include "utils/jvm-test-utils.h"
#include "utils/tokenizer.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"

namespace libtextclassifier3::grammar {

std::ostream& operator<<(std::ostream& os, const Symbol& symbol) {
  return os << "Symbol(type=" << static_cast<int>(symbol.type) << ", span=("
            << symbol.codepoint_span.first << ", "
            << symbol.codepoint_span.second
            << "), lexeme=" << symbol.lexeme.ToString() << ")";
}

namespace {

using ::testing::DescribeMatcher;
using ::testing::ElementsAre;
using ::testing::ExplainMatchResult;

// Test fixture for the lexer tests; sets up the UniLib, an ICU tokenizer and
// the lexer under test.
class LexerTest : public testing::Test {
 protected:
  explicit LexerTest()
      : unilib_(libtextclassifier3::CreateUniLibForTesting()),
        tokenizer_(TokenizationType_ICU, unilib_.get(),
                   /*codepoint_ranges=*/{},
                   /*internal_tokenizer_codepoint_ranges=*/{},
                   /*split_on_script_change=*/false,
                   /*icu_preserve_whitespace_tokens=*/false),
        lexer_(unilib_.get()) {}

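  // Feeds each token to the lexer and collects all symbols it emits.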
  std::vector<Symbol> SymbolsForTokens(const std::vector<Token>& tokens) const {
    std::vector<Symbol> symbols;
    for (const Token& token : tokens) {
      lexer_.AppendTokenSymbols(token.value, token.start,
                                CodepointSpan{token.start, token.end},
                                &symbols);
    }
    return symbols;
  }

  std::unique_ptr<UniLib> unilib_;
  Tokenizer tokenizer_;
  Lexer lexer_;
};

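// Matches a Symbol whose type, codepoint span (begin, end) and lexeme satisfy
// the given matchers.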
MATCHER_P4(IsSymbol, type, begin, end, terminal,
           "is symbol with type that " +
               DescribeMatcher<Symbol::Type>(type, negation) + ", begin that " +
               DescribeMatcher<int>(begin, negation) + ", end that " +
               DescribeMatcher<int>(end, negation) + ", value that " +
               DescribeMatcher<std::string>(terminal, negation)) {
  return ExplainMatchResult(type, arg.type, result_listener) &&
         ExplainMatchResult(CodepointSpan(begin, end), arg.codepoint_span,
                            result_listener) &&
         ExplainMatchResult(terminal, arg.lexeme.ToString(), result_listener);
}

TEST_F(LexerTest, HandlesSimpleWords) {
  std::vector<Token> tokens = tokenizer_.Tokenize("This is a word");
  EXPECT_THAT(SymbolsForTokens(tokens),
              ElementsAre(IsSymbol(Symbol::Type::TYPE_TERM, 0, 4, "This"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 5, 7, "is"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 8, 9, "a"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 10, 14, "word")));
}

TEST_F(LexerTest, SplitsConcatedLettersAndDigit) {
  std::vector<Token> tokens = tokenizer_.Tokenize("1234This a4321cde");
  EXPECT_THAT(SymbolsForTokens(tokens),
              ElementsAre(IsSymbol(Symbol::Type::TYPE_DIGITS, 0, 4, "1234"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 4, 8, "This"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 9, 10, "a"),
                          IsSymbol(Symbol::Type::TYPE_DIGITS, 10, 14, "4321"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 14, 17, "cde")));
}

TEST_F(LexerTest, SplitsPunctuation) {
  std::vector<Token> tokens = tokenizer_.Tokenize("10/18/2014");
  EXPECT_THAT(SymbolsForTokens(tokens),
              ElementsAre(IsSymbol(Symbol::Type::TYPE_DIGITS, 0, 2, "10"),
                          IsSymbol(Symbol::Type::TYPE_PUNCTUATION, 2, 3, "/"),
                          IsSymbol(Symbol::Type::TYPE_DIGITS, 3, 5, "18"),
                          IsSymbol(Symbol::Type::TYPE_PUNCTUATION, 5, 6, "/"),
                          IsSymbol(Symbol::Type::TYPE_DIGITS, 6, 10, "2014")));
}

TEST_F(LexerTest, SplitsUTF8Punctuation) {
  std::vector<Token> tokens = tokenizer_.Tokenize("电话:0871—6857(曹");
  EXPECT_THAT(
      SymbolsForTokens(tokens),
      ElementsAre(IsSymbol(Symbol::Type::TYPE_TERM, 0, 2, "电话"),
                  IsSymbol(Symbol::Type::TYPE_PUNCTUATION, 2, 3, ":"),
                  IsSymbol(Symbol::Type::TYPE_DIGITS, 3, 7, "0871"),
                  IsSymbol(Symbol::Type::TYPE_PUNCTUATION, 7, 8, "—"),
                  IsSymbol(Symbol::Type::TYPE_DIGITS, 8, 12, "6857"),
                  IsSymbol(Symbol::Type::TYPE_PUNCTUATION, 12, 13, "("),
                  IsSymbol(Symbol::Type::TYPE_TERM, 13, 14, "曹")));
}

TEST_F(LexerTest, HandlesMixedPunctuation) {
  std::vector<Token> tokens = tokenizer_.Tokenize("电话 :0871—6857(曹");
  EXPECT_THAT(
      SymbolsForTokens(tokens),
      ElementsAre(IsSymbol(Symbol::Type::TYPE_TERM, 0, 2, "电话"),
                  IsSymbol(Symbol::Type::TYPE_PUNCTUATION, 3, 4, ":"),
                  IsSymbol(Symbol::Type::TYPE_DIGITS, 4, 8, "0871"),
                  IsSymbol(Symbol::Type::TYPE_PUNCTUATION, 8, 9, "—"),
                  IsSymbol(Symbol::Type::TYPE_DIGITS, 9, 13, "6857"),
                  IsSymbol(Symbol::Type::TYPE_PUNCTUATION, 13, 14, "("),
                  IsSymbol(Symbol::Type::TYPE_TERM, 14, 15, "曹")));
}

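// "\xE2\x80\x94" below is the UTF-8 encoding of an em dash (U+2014).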
TEST_F(LexerTest, HandlesTokensWithDigits) {
  std::vector<Token> tokens =
      tokenizer_.Tokenize("The.qUIck\n brown2345fox88 \xE2\x80\x94 the");
  EXPECT_THAT(SymbolsForTokens(tokens),
              ElementsAre(IsSymbol(Symbol::Type::TYPE_TERM, 0, 3, "The"),
                          IsSymbol(Symbol::Type::TYPE_PUNCTUATION, 3, 4, "."),
                          IsSymbol(Symbol::Type::TYPE_TERM, 4, 9, "qUIck"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 11, 16, "brown"),
                          IsSymbol(Symbol::Type::TYPE_DIGITS, 16, 20, "2345"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 20, 23, "fox"),
                          IsSymbol(Symbol::Type::TYPE_DIGITS, 23, 25, "88"),
                          IsSymbol(Symbol::Type::TYPE_PUNCTUATION, 26, 27, "—"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 28, 31, "the")));
}

TEST_F(LexerTest, SplitsPlusSigns) {
  std::vector<Token> tokens = tokenizer_.Tokenize("The+2345++the +");
  EXPECT_THAT(SymbolsForTokens(tokens),
              ElementsAre(IsSymbol(Symbol::Type::TYPE_TERM, 0, 3, "The"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 3, 4, "+"),
                          IsSymbol(Symbol::Type::TYPE_DIGITS, 4, 8, "2345"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 8, 9, "+"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 9, 10, "+"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 10, 13, "the"),
                          IsSymbol(Symbol::Type::TYPE_TERM, 14, 15, "+")));
}

}  // namespace
}  // namespace libtextclassifier3::grammar