1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 // A lexer that (splits) and classifies tokens. 18 // 19 // Any whitespace gets absorbed into the token that follows them in the text. 20 // For example, if the text contains: 21 // 22 // ...hello there world... 23 // | | | 24 // offset=16 39 52 25 // 26 // then the output will be: 27 // 28 // "hello" [?, 16) 29 // "there" [16, 44) <-- note "16" NOT "39" 30 // "world" [44, ?) <-- note "44" NOT "52" 31 // 32 // This makes it appear to the Matcher as if the tokens are adjacent. 33 34 #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_LEXER_H_ 35 #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_LEXER_H_ 36 37 #include <vector> 38 39 #include "annotator/types.h" 40 #include "utils/grammar/parsing/parse-tree.h" 41 #include "utils/grammar/types.h" 42 #include "utils/strings/stringpiece.h" 43 #include "utils/utf8/unicodetext.h" 44 #include "utils/utf8/unilib.h" 45 46 namespace libtextclassifier3::grammar { 47 48 // A lexical symbol with an identified meaning that represents raw tokens, 49 // token categories or predefined text matches. 50 // It is the unit fed to the grammar matcher. 51 struct Symbol { 52 // The type of the lexical symbol. 53 enum class Type { 54 // A raw token. 55 TYPE_TERM, 56 57 // A symbol representing a string of digits. 58 TYPE_DIGITS, 59 60 // Punctuation characters. 61 TYPE_PUNCTUATION, 62 63 // A predefined parse tree. 64 TYPE_PARSE_TREE 65 }; 66 67 explicit Symbol() = default; 68 69 // Constructs a symbol of a given type with an anchor in the text. SymbolSymbol70 Symbol(const Type type, const CodepointSpan codepoint_span, 71 const int match_offset, StringPiece lexeme) 72 : type(type), 73 codepoint_span(codepoint_span), 74 match_offset(match_offset), 75 lexeme(lexeme) {} 76 77 // Constructs a symbol from a pre-defined parse tree. SymbolSymbol78 explicit Symbol(ParseTree* parse_tree) 79 : type(Type::TYPE_PARSE_TREE), 80 codepoint_span(parse_tree->codepoint_span), 81 match_offset(parse_tree->match_offset), 82 parse_tree(parse_tree) {} 83 84 // The type of the symbol. 85 Type type; 86 87 // The span in the text as codepoint offsets. 88 CodepointSpan codepoint_span; 89 90 // The match start offset (including preceding whitespace) as codepoint 91 // offset. 92 int match_offset; 93 94 // The symbol text value. 95 StringPiece lexeme; 96 97 // The predefined parse tree. 98 ParseTree* parse_tree; 99 }; 100 101 class Lexer { 102 public: Lexer(const UniLib * unilib)103 explicit Lexer(const UniLib* unilib) : unilib_(*unilib) {} 104 105 // Processes a single token. 106 // Splits a token into classified symbols. 107 void AppendTokenSymbols(const StringPiece value, int match_offset, 108 const CodepointSpan codepoint_span, 109 std::vector<Symbol>* symbols) const; 110 111 private: 112 // Gets the type of a character. 113 Symbol::Type GetSymbolType(const UnicodeText::const_iterator& it) const; 114 115 const UniLib& unilib_; 116 }; 117 118 } // namespace libtextclassifier3::grammar 119 120 #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_LEXER_H_ 121