1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 // A lexer that (splits) and classifies tokens.
18 //
// Any whitespace gets absorbed into the token that follows it in the text.
20 // For example, if the text contains:
21 //
22 //      ...hello                       there        world...
23 //              |                      |            |
24 //              offset=16              39           52
25 //
26 // then the output will be:
27 //
28 //      "hello" [?, 16)
29 //      "there" [16, 44)      <-- note "16" NOT "39"
30 //      "world" [44, ?)       <-- note "44" NOT "52"
31 //
32 // This makes it appear to the Matcher as if the tokens are adjacent.
33 
34 #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_LEXER_H_
35 #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_LEXER_H_
36 
37 #include <vector>
38 
39 #include "annotator/types.h"
40 #include "utils/grammar/parsing/parse-tree.h"
41 #include "utils/grammar/types.h"
42 #include "utils/strings/stringpiece.h"
43 #include "utils/utf8/unicodetext.h"
44 #include "utils/utf8/unilib.h"
45 
46 namespace libtextclassifier3::grammar {
47 
48 // A lexical symbol with an identified meaning that represents raw tokens,
49 // token categories or predefined text matches.
50 // It is the unit fed to the grammar matcher.
51 struct Symbol {
52   // The type of the lexical symbol.
53   enum class Type {
54     // A raw token.
55     TYPE_TERM,
56 
57     // A symbol representing a string of digits.
58     TYPE_DIGITS,
59 
60     // Punctuation characters.
61     TYPE_PUNCTUATION,
62 
63     // A predefined parse tree.
64     TYPE_PARSE_TREE
65   };
66 
67   explicit Symbol() = default;
68 
69   // Constructs a symbol of a given type with an anchor in the text.
SymbolSymbol70   Symbol(const Type type, const CodepointSpan codepoint_span,
71          const int match_offset, StringPiece lexeme)
72       : type(type),
73         codepoint_span(codepoint_span),
74         match_offset(match_offset),
75         lexeme(lexeme) {}
76 
77   // Constructs a symbol from a pre-defined parse tree.
SymbolSymbol78   explicit Symbol(ParseTree* parse_tree)
79       : type(Type::TYPE_PARSE_TREE),
80         codepoint_span(parse_tree->codepoint_span),
81         match_offset(parse_tree->match_offset),
82         parse_tree(parse_tree) {}
83 
84   // The type of the symbol.
85   Type type;
86 
87   // The span in the text as codepoint offsets.
88   CodepointSpan codepoint_span;
89 
90   // The match start offset (including preceding whitespace) as codepoint
91   // offset.
92   int match_offset;
93 
94   // The symbol text value.
95   StringPiece lexeme;
96 
97   // The predefined parse tree.
98   ParseTree* parse_tree;
99 };
100 
101 class Lexer {
102  public:
Lexer(const UniLib * unilib)103   explicit Lexer(const UniLib* unilib) : unilib_(*unilib) {}
104 
105   // Processes a single token.
106   // Splits a token into classified symbols.
107   void AppendTokenSymbols(const StringPiece value, int match_offset,
108                           const CodepointSpan codepoint_span,
109                           std::vector<Symbol>* symbols) const;
110 
111  private:
112   // Gets the type of a character.
113   Symbol::Type GetSymbolType(const UnicodeText::const_iterator& it) const;
114 
115   const UniLib& unilib_;
116 };
117 
118 }  // namespace libtextclassifier3::grammar
119 
120 #endif  // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_LEXER_H_
121