1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 // Utility functions for pre-processing, creating and testing context free
18 // grammars.
19 
20 #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_UTILS_RULES_H_
21 #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_UTILS_RULES_H_
22 
23 #include <unordered_map>
24 #include <vector>
25 
26 #include "utils/grammar/types.h"
27 #include "utils/grammar/utils/ir.h"
28 
29 namespace libtextclassifier3::grammar {
30 
31 // Special nonterminals.
32 constexpr const char* kFiller = "<filler>";
33 
34 // All rules for a grammar will be collected in a rules object.
35 //
36 //    Rules r;
37 //    r.Add("<date>", {"<monthname>", "<day>", <year>"});
38 //    r.Add("<monthname>", {"January"});
39 //    ...
40 //    r.Add("<monthname>", {"December"});
41 //    r.Add("<day>", {"<string_of_digits>"});
42 //    r.Add("<year>", {"<string_of_digits>"});
43 //
44 // The Add() method adds a rule with a given lhs, rhs/
45 // The rhs is just a list of terminals and nonterminals.  Anything
46 // surrounded in angle brackets is considered a nonterminal.  A "?" can follow
47 // any element of the RHS, like this:
48 //
49 //    r.Add("<date>", {"<monthname>", "<day>?", ",?", "<year>"});
50 //
51 // This indicates that the <day> and "," parts of the rhs are optional.
52 // (This is just notational shorthand for adding a bunch of rules.)
53 //
54 // Once you're done adding rules, r.Finalize() lowers the rule set into an
55 // internal representation.
56 class Rules {
57  public:
Rules(const LocaleShardMap & locale_shard_map)58   explicit Rules(const LocaleShardMap& locale_shard_map)
59       : locale_shard_map_(locale_shard_map) {}
60 
61   // Represents one item in a right-hand side, a single terminal or nonterminal.
62   struct RhsElement {
RhsElementRhsElement63     RhsElement() {}
RhsElementRhsElement64     explicit RhsElement(const std::string& terminal, const bool is_optional)
65         : is_terminal(true),
66           terminal(terminal),
67           is_optional(is_optional),
68           is_constituent(false) {}
69     explicit RhsElement(const int nonterminal, const bool is_optional,
70                         const bool is_constituent = true)
is_terminalRhsElement71         : is_terminal(false),
72           nonterminal(nonterminal),
73           is_optional(is_optional),
74           is_constituent(is_constituent) {}
75     bool is_terminal;
76     std::string terminal;
77     int nonterminal;
78     bool is_optional;
79 
80     // Whether the element is a constituent of a rule - these are the explicit
81     // nonterminals, but not terminals or implicitly added anchors.
82     bool is_constituent;
83   };
84 
85   // Represents the right-hand side, and possibly callback, of one rule.
86   struct Rule {
87     std::vector<RhsElement> rhs;
88     CallbackId callback = kNoCallback;
89     int64 callback_param = 0;
90     int8 max_whitespace_gap = -1;
91     bool case_sensitive = false;
92     int shard = 0;
93   };
94 
95   struct NontermInfo {
96     // The name of the non-terminal, if defined.
97     std::string name;
98 
99     // Whether the nonterminal is provided via an annotation.
100     bool from_annotation = false;
101 
102     // Rules that have this non-terminal as the lhs.
103     std::vector<int> rules;
104 
105     // Regex rules that have this non-terminal as the lhs.
106     std::vector<int> regex_rules;
107   };
108 
109   // Adds a rule `lhs ::= rhs` with the given callback id and parameter.
110   // Note: Nonterminal names are in angle brackets and cannot contain
111   // whitespace. The `rhs` is a list of components, each of which is either:
112   //  * A nonterminal name (in angle brackets)
113   //  * A terminal
114   // optionally followed by a `?` which indicates that the component is
115   // optional. The `rhs` must contain at least one non-optional component.
116   void Add(const std::string& lhs, const std::vector<std::string>& rhs,
117            const CallbackId callback = kNoCallback,
118            const int64 callback_param = 0, int8 max_whitespace_gap = -1,
119            bool case_sensitive = false, int shard = 0);
120 
121   // Adds a rule `lhs ::= rhs` with the given callback id and parameter.
122   // The `rhs` must contain at least one non-optional component.
123   void Add(int lhs, const std::vector<RhsElement>& rhs,
124            CallbackId callback = kNoCallback, int64 callback_param = 0,
125            int8 max_whitespace_gap = -1, bool case_sensitive = false,
126            int shard = 0);
127 
128   // Adds a rule `lhs ::= rhs` with exclusion.
129   // The rule only matches, if `excluded_nonterminal` doesn't match the same
130   // span.
131   void AddWithExclusion(const std::string& lhs,
132                         const std::vector<std::string>& rhs,
133                         const std::string& excluded_nonterminal,
134                         int8 max_whitespace_gap = -1,
135                         bool case_sensitive = false, int shard = 0);
136 
137   // Adds an assertion callback.
138   void AddAssertion(const std::string& lhs, const std::vector<std::string>& rhs,
139                     bool negative = true, int8 max_whitespace_gap = -1,
140                     bool case_sensitive = false, int shard = 0);
141 
142   // Adds a mapping callback.
143   void AddValueMapping(const std::string& lhs,
144                        const std::vector<std::string>& rhs, int64 value,
145                        int8 max_whitespace_gap = -1,
146                        bool case_sensitive = false, int shard = 0);
147   void AddValueMapping(int lhs, const std::vector<RhsElement>& rhs, int64 value,
148                        int8 max_whitespace_gap = -1,
149                        bool case_sensitive = false, int shard = 0);
150 
151   // Adds a regex rule.
152   void AddRegex(const std::string& lhs, const std::string& regex_pattern);
153   void AddRegex(int lhs, const std::string& regex_pattern);
154 
155   // Creates a nonterminal with the given name, if one doesn't already exist.
156   int AddNonterminal(const std::string& nonterminal_name);
157 
158   // Creates a new nonterminal.
159   int AddNewNonterminal();
160 
161   // Defines a nonterminal for an externally provided annotation.
162   int AddAnnotation(const std::string& annotation_name);
163 
164   // Defines a nonterminal for an externally provided annotation.
165   void BindAnnotation(const std::string& nonterminal_name,
166                       const std::string& annotation_name);
167 
168   // Adds an alias for a nonterminal. This is a separate name for the same
169   // nonterminal.
170   void AddAlias(const std::string& nonterminal_name, const std::string& alias);
171 
172   // Lowers the rule set into the intermediate representation.
173   // Treats nonterminals given by the argument `predefined_nonterminals` as
174   // defined externally. This allows to define rules that are dependent on
175   // non-terminals produced by e.g. existing text annotations and that will be
176   // fed to the matcher by the lexer.
177   Ir Finalize(const std::set<std::string>& predefined_nonterminals = {}) const;
178 
nonterminals()179   const std::vector<NontermInfo>& nonterminals() const { return nonterminals_; }
rules()180   const std::vector<Rule>& rules() const { return rules_; }
181 
182  private:
183   void ExpandOptionals(
184       int lhs, const std::vector<RhsElement>& rhs, CallbackId callback,
185       int64 callback_param, int8 max_whitespace_gap, bool case_sensitive,
186       int shard, std::vector<int>::const_iterator optional_element_indices,
187       std::vector<int>::const_iterator optional_element_indices_end,
188       std::vector<bool>* omit_these);
189 
190   // Applies optimizations to the right hand side of a rule.
191   std::vector<RhsElement> OptimizeRhs(const std::vector<RhsElement>& rhs,
192                                       int shard = 0);
193 
194   // Removes start and end anchors in case they are followed (respectively
195   // preceded) by unbounded filler.
196   std::vector<RhsElement> ResolveAnchors(
197       const std::vector<RhsElement>& rhs) const;
198 
199   // Rewrites fillers in a rule.
200   // Fillers in a rule such as `lhs ::= <a> <filler> <b>` could be lowered as
201   // <tokens> ::= <token>
202   // <tokens> ::= <tokens> <token>
203   // This has the disadvantage that it will produce a match for each possible
204   // span in the text, which is quadratic in the number of tokens.
205   // It can be more efficiently written as:
206   // `lhs ::= <a_with_tokens> <b>` with
207   // `<a_with_tokens> ::= <a>`
208   // `<a_with_tokens> ::= <a_with_tokens> <token>`
209   // In this each occurrence of `<a>` can start a sequence of tokens.
210   std::vector<RhsElement> ResolveFillers(const std::vector<RhsElement>& rhs,
211                                          int shard = 0);
212 
213   // Checks whether an element denotes a specific nonterminal.
214   bool IsNonterminalOfName(const RhsElement& element,
215                            const std::string& nonterminal) const;
216 
217   // Checks whether the fillers are used in any active rule.
218   bool UsesFillers() const;
219 
220   const LocaleShardMap& locale_shard_map_;
221 
222   // Non-terminal to id map.
223   std::unordered_map<std::string, int> nonterminal_names_;
224   std::vector<NontermInfo> nonterminals_;
225   std::unordered_map<std::string, std::string> nonterminal_alias_;
226   std::unordered_map<std::string, int> annotation_nonterminals_;
227 
228   // Rules.
229   std::vector<Rule> rules_;
230   std::vector<std::string> regex_rules_;
231 };
232 
233 }  // namespace libtextclassifier3::grammar
234 
235 #endif  // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_UTILS_RULES_H_
236