1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSER_H_ 18 #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSER_H_ 19 20 #include <vector> 21 22 #include "annotator/types.h" 23 #include "utils/base/arena.h" 24 #include "utils/grammar/parsing/derivation.h" 25 #include "utils/grammar/parsing/lexer.h" 26 #include "utils/grammar/parsing/matcher.h" 27 #include "utils/grammar/rules_generated.h" 28 #include "utils/grammar/text-context.h" 29 #include "utils/i18n/locale.h" 30 #include "utils/utf8/unilib.h" 31 32 namespace libtextclassifier3::grammar { 33 34 // Syntactic parsing pass. 35 // The parser validates and deduplicates candidates produced by the grammar 36 // matcher. It augments the parse trees with derivation information for semantic 37 // evaluation. 38 class Parser { 39 public: 40 explicit Parser(const UniLib* unilib, const RulesSet* rules); 41 42 // Parses an input text and returns the root rule derivations. 43 std::vector<Derivation> Parse(const TextContext& input, 44 UnsafeArena* arena) const; 45 46 private: 47 struct RegexAnnotator { 48 std::unique_ptr<UniLib::RegexPattern> pattern; 49 Nonterm nonterm; 50 }; 51 52 // Uncompresses and build the defined regex annotators. 53 std::vector<RegexAnnotator> BuildRegexAnnotators() const; 54 55 // Produces symbols for a text input to feed to a matcher. 56 // These are symbols for each tokens from the lexer, existing text annotations 57 // and regex annotations. 58 // The symbols are sorted with increasing end-positions to satisfy the matcher 59 // requirements. 60 std::vector<Symbol> SortedSymbolsForInput(const TextContext& input, 61 UnsafeArena* arena) const; 62 63 // Emits a symbol to the matcher. 64 void EmitSymbol(const Symbol& symbol, UnsafeArena* arena, 65 Matcher* matcher) const; 66 67 const UniLib& unilib_; 68 const RulesSet* rules_; 69 const Lexer lexer_; 70 71 // Pre-defined nonterminals. 72 const RulesSet_::Nonterminals* nonterminals_; 73 74 // Pre-parsed locales of the rules. 75 const std::vector<std::vector<Locale>> rules_locales_; 76 77 std::vector<RegexAnnotator> regex_annotators_; 78 }; 79 80 } // namespace libtextclassifier3::grammar 81 82 #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSER_H_ 83