1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 // A token based context-free grammar matcher.
18 //
19 // A parser passes token to the matcher: literal terminal strings and token
20 // types.
21 // The parser passes each token along with the [begin, end) position range
22 // in which it occurs.  So for an input string "Groundhog February 2, 2007", the
23 // parser would tell the matcher that:
24 //
25 //     "Groundhog" occurs at [0, 9)
26 //     "February" occurs at [9, 18)
27 //     <digits> occurs at [18, 20)
28 //     "," occurs at [20, 21)
29 //     <digits> occurs at [21, 26)
30 //
31 // Multiple overlapping symbols can be passed.
32 // The only constraint on symbol order is that they have to be passed in
33 // left-to-right order, strictly speaking, their "end" positions must be
34 // nondecreasing. This constraint allows a more efficient matching algorithm.
35 // The "begin" positions can be in any order.
36 
37 #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_MATCHER_H_
38 #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_MATCHER_H_
39 
40 #include <array>
41 #include <functional>
42 #include <vector>
43 
44 #include "annotator/types.h"
45 #include "utils/base/arena.h"
46 #include "utils/grammar/parsing/chart.h"
47 #include "utils/grammar/parsing/derivation.h"
48 #include "utils/grammar/parsing/parse-tree.h"
49 #include "utils/grammar/rules_generated.h"
50 #include "utils/strings/stringpiece.h"
51 #include "utils/utf8/unilib.h"
52 
53 namespace libtextclassifier3::grammar {
54 
55 class Matcher {
56  public:
Matcher(const UniLib * unilib,const RulesSet * rules,const std::vector<const RulesSet_::Rules * > rules_shards,UnsafeArena * arena)57   explicit Matcher(const UniLib* unilib, const RulesSet* rules,
58                    const std::vector<const RulesSet_::Rules*> rules_shards,
59                    UnsafeArena* arena)
60       : unilib_(*unilib),
61         arena_(arena),
62         last_end_(std::numeric_limits<int>().lowest()),
63         rules_(rules),
64         rules_shards_(rules_shards),
65         pending_items_(nullptr),
66         pending_exclusion_items_(nullptr) {
67     TC3_CHECK_NE(rules, nullptr);
68   }
69 
Matcher(const UniLib * unilib,const RulesSet * rules,UnsafeArena * arena)70   explicit Matcher(const UniLib* unilib, const RulesSet* rules,
71                    UnsafeArena* arena)
72       : Matcher(unilib, rules, {}, arena) {
73     rules_shards_.reserve(rules->rules()->size());
74     rules_shards_.insert(rules_shards_.end(), rules->rules()->begin(),
75                          rules->rules()->end());
76   }
77 
78   // Finish the matching.
79   void Finish();
80 
81   // Tells the matcher that the given terminal was found occupying position
82   // range [begin, end) in the input.
83   // The matcher may invoke callback functions before returning, if this
84   // terminal triggers any new matches for rules in the grammar.
85   // Calls to AddTerminal() and AddParseTree() must be in left-to-right order,
86   // that is, the sequence of `end` values must be non-decreasing.
87   void AddTerminal(const CodepointSpan codepoint_span, const int match_offset,
88                    StringPiece terminal);
AddTerminal(const CodepointIndex begin,const CodepointIndex end,StringPiece terminal)89   void AddTerminal(const CodepointIndex begin, const CodepointIndex end,
90                    StringPiece terminal) {
91     AddTerminal(CodepointSpan{begin, end}, begin, terminal);
92   }
93 
94   // Adds predefined parse tree.
95   void AddParseTree(ParseTree* parse_tree);
96 
chart()97   const Chart<> chart() const { return chart_; }
98 
99  private:
100   // Process matches from lhs set.
101   void ExecuteLhsSet(const CodepointSpan codepoint_span, const int match_offset,
102                      const int whitespace_gap,
103                      const std::function<void(ParseTree*)>& initializer_fn,
104                      const RulesSet_::LhsSet* lhs_set);
105 
106   // Queues a newly created match item.
107   void QueueForProcessing(ParseTree* item);
108 
109   // Queues a match item for later post checking of the exclusion condition.
110   // For exclusions we need to check that the `item->excluded_nonterminal`
111   // doesn't match the same span. As we cannot know which matches have already
112   // been added, we queue the item for later post checking - once all matches
113   // up to `item->codepoint_span.second` have been added.
114   void QueueForPostCheck(ExclusionNode* item);
115 
116   // Adds pending items to the chart, possibly generating new matches as a
117   // result.
118   void ProcessPendingSet();
119 
120   // Checks all pending exclusion matches that their exclusion condition is
121   // fulfilled.
122   void ProcessPendingExclusionMatches();
123 
124   UniLib unilib_;
125 
126   // Memory arena for match allocation.
127   UnsafeArena* arena_;
128 
129   // The end position of the most recent match or terminal, for sanity
130   // checking.
131   int last_end_;
132 
133   // Rules.
134   const RulesSet* rules_;
135   // The active rule shards.
136   std::vector<const RulesSet_::Rules*> rules_shards_;
137 
138   // The set of items pending to be added to the chart as a singly-linked list.
139   ParseTree* pending_items_;
140 
141   // The set of items pending to be post-checked as a singly-linked list.
142   ExclusionNode* pending_exclusion_items_;
143 
144   // The chart data structure: a hashtable containing all matches, indexed by
145   // their end positions.
146   Chart<> chart_;
147 };
148 
149 }  // namespace libtextclassifier3::grammar
150 
151 #endif  // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_MATCHER_H_
152