1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSE_TREE_H_
18 #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSE_TREE_H_
19 
20 #include <functional>
21 #include <vector>
22 
23 #include "annotator/types.h"
24 #include "utils/grammar/semantics/expression_generated.h"
25 #include "utils/grammar/types.h"
26 #include "utils/strings/stringpiece.h"
27 
28 namespace libtextclassifier3::grammar {
29 
30 // Represents a parse tree for a match that was found for a nonterminal.
31 struct ParseTree {
32   enum class Type : int8 {
33     // Default, untyped match.
34     kDefault = 0,
35 
36     // An assertion match (see: AssertionNode).
37     kAssertion = 1,
38 
39     // A value mapping match (see: MappingNode).
40     kMapping = 2,
41 
42     // An exclusion match (see: ExclusionNode).
43     kExclusion = 3,
44 
45     // A match for an annotation (see: AnnotationNode).
46     kAnnotation = 4,
47 
48     // A match for a semantic annotation (see: SemanticExpressionNode).
49     kExpression = 5,
50   };
51 
52   explicit ParseTree() = default;
ParseTreeParseTree53   explicit ParseTree(const Nonterm lhs, const CodepointSpan& codepoint_span,
54                      const int match_offset, const Type type)
55       : lhs(lhs),
56         type(type),
57         codepoint_span(codepoint_span),
58         match_offset(match_offset) {}
59 
60   // For binary rule matches:  rhs1 != NULL and rhs2 != NULL
61   //      unary rule matches:  rhs1 == NULL and rhs2 != NULL
62   //   terminal rule matches:  rhs1 != NULL and rhs2 == NULL
63   //           custom leaves:  rhs1 == NULL and rhs2 == NULL
IsInteriorNodeParseTree64   bool IsInteriorNode() const { return rhs2 != nullptr; }
IsLeafParseTree65   bool IsLeaf() const { return !rhs2; }
66 
IsBinaryRuleParseTree67   bool IsBinaryRule() const { return rhs1 && rhs2; }
IsUnaryRuleParseTree68   bool IsUnaryRule() const { return !rhs1 && rhs2; }
IsTerminalRuleParseTree69   bool IsTerminalRule() const { return rhs1 && !rhs2; }
HasLeadingWhitespaceParseTree70   bool HasLeadingWhitespace() const {
71     return codepoint_span.first != match_offset;
72   }
73 
unary_rule_rhsParseTree74   const ParseTree* unary_rule_rhs() const { return rhs2; }
75 
76   // Used in singly-linked queue of matches for processing.
77   ParseTree* next = nullptr;
78 
79   // Nonterminal we found a match for.
80   Nonterm lhs = kUnassignedNonterm;
81 
82   // Type of the match.
83   Type type = Type::kDefault;
84 
85   // The span in codepoints.
86   CodepointSpan codepoint_span;
87 
88   // The begin codepoint offset used during matching.
89   // This is usually including any prefix whitespace.
90   int match_offset;
91 
92   union {
93     // The first sub match for binary rules.
94     const ParseTree* rhs1 = nullptr;
95 
96     // The terminal, for terminal rules.
97     const char* terminal;
98   };
99   // First or second sub-match for interior nodes.
100   const ParseTree* rhs2 = nullptr;
101 };
102 
103 // Node type to keep track of associated values.
104 struct MappingNode : public ParseTree {
MappingNodeMappingNode105   explicit MappingNode(const Nonterm arg_lhs,
106                        const CodepointSpan arg_codepoint_span,
107                        const int arg_match_offset, const int64 arg_value)
108       : ParseTree(arg_lhs, arg_codepoint_span, arg_match_offset,
109                   Type::kMapping),
110         id(arg_value) {}
111   // The associated id or value.
112   int64 id;
113 };
114 
115 // Node type to keep track of assertions.
116 struct AssertionNode : public ParseTree {
AssertionNodeAssertionNode117   explicit AssertionNode(const Nonterm arg_lhs,
118                          const CodepointSpan arg_codepoint_span,
119                          const int arg_match_offset, const bool arg_negative)
120       : ParseTree(arg_lhs, arg_codepoint_span, arg_match_offset,
121                   Type::kAssertion),
122         negative(arg_negative) {}
123   // If true, the assertion is negative and will be valid if the input doesn't
124   // match.
125   bool negative;
126 };
127 
128 // Node type to define exclusions.
129 struct ExclusionNode : public ParseTree {
ExclusionNodeExclusionNode130   explicit ExclusionNode(const Nonterm arg_lhs,
131                          const CodepointSpan arg_codepoint_span,
132                          const int arg_match_offset,
133                          const Nonterm arg_exclusion_nonterm)
134       : ParseTree(arg_lhs, arg_codepoint_span, arg_match_offset,
135                   Type::kExclusion),
136         exclusion_nonterm(arg_exclusion_nonterm) {}
137   // The nonterminal that denotes matches to exclude from a successful match.
138   // So the match is only valid if there is no match of `exclusion_nonterm`
139   // spanning the same text range.
140   Nonterm exclusion_nonterm;
141 };
142 
143 // Match to represent an annotator annotated span in the grammar.
144 struct AnnotationNode : public ParseTree {
AnnotationNodeAnnotationNode145   explicit AnnotationNode(const Nonterm arg_lhs,
146                           const CodepointSpan arg_codepoint_span,
147                           const int arg_match_offset,
148                           const ClassificationResult* arg_annotation)
149       : ParseTree(arg_lhs, arg_codepoint_span, arg_match_offset,
150                   Type::kAnnotation),
151         annotation(arg_annotation) {}
152   const ClassificationResult* annotation;
153 };
154 
155 // Node type to represent an associated semantic expression.
156 struct SemanticExpressionNode : public ParseTree {
SemanticExpressionNodeSemanticExpressionNode157   explicit SemanticExpressionNode(const Nonterm arg_lhs,
158                                   const CodepointSpan arg_codepoint_span,
159                                   const int arg_match_offset,
160                                   const SemanticExpression* arg_expression)
161       : ParseTree(arg_lhs, arg_codepoint_span, arg_match_offset,
162                   Type::kExpression),
163         expression(arg_expression) {}
164   const SemanticExpression* expression;
165 };
166 
167 // Utility functions for parse tree traversal.
168 
169 // Does a preorder traversal, calling `node_fn` on each node.
170 // `node_fn` is expected to return whether to continue expanding a node.
171 void Traverse(const ParseTree* root,
172               const std::function<bool(const ParseTree*)>& node_fn);
173 
174 // Does a preorder traversal, selecting all nodes where `pred_fn` returns true.
175 std::vector<const ParseTree*> SelectAll(
176     const ParseTree* root,
177     const std::function<bool(const ParseTree*)>& pred_fn);
178 
179 // Retrieves all nodes of a given type.
180 template <typename T>
SelectAllOfType(const ParseTree * root,const ParseTree::Type type)181 const std::vector<const T*> SelectAllOfType(const ParseTree* root,
182                                             const ParseTree::Type type) {
183   std::vector<const T*> result;
184   Traverse(root, [&result, type](const ParseTree* node) {
185     if (node->type == type) {
186       result.push_back(static_cast<const T*>(node));
187     }
188     return true;
189   });
190   return result;
191 }
192 
193 }  // namespace libtextclassifier3::grammar
194 
195 #endif  // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSE_TREE_H_
196