1//
2// Copyright (C) 2018 The Android Open Source Project
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8//      http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15//
16
17include "utils/grammar/semantics/expression.fbs";
18include "utils/i18n/language-tag.fbs";
19include "utils/zlib/buffer.fbs";
20
21// The terminal rules map, stored as a sorted strings table.
22// The sorted terminal strings table is represented as offsets into the
23// global strings pool, which allows memory to be shared between localized
24// rule sets.
25namespace libtextclassifier3.grammar.RulesSet_.Rules_;
26table TerminalRulesMap {
27  // The offsets into the terminals pool.
  // The pool is the zero-byte delimited `terminals` string of `RulesSet`.
28  terminal_offsets:[uint];
29
30  // The lhs set associated with a terminal rule.
31  // This is an offset into the (deduplicated) global `lhs_set` vector.
  // NOTE(review): presumably parallel to `terminal_offsets` (one entry per
  // terminal) -- confirm against the lookup code.
32  lhs_set_index:[uint];
33
34  // Bounds the lengths of the terminal strings for quick early lookup
35  // abort.
  // NOTE(review): the length unit (bytes vs. codepoints) is not stated in
  // this schema -- confirm against the matcher.
36  min_terminal_length:int;
37
38  max_terminal_length:int;
39}
40
// One key, value pair entry in the unary rules map.
// Used by `Rules.unary_rules`, which maps a nonterminal (`key`) to an lhs
// set index (`value`) into the (deduplicated) global `lhs_set` vector.
41namespace libtextclassifier3.grammar.RulesSet_.Rules_;
42struct UnaryRulesEntry {
  // The `(key)` attribute marks this as the field that vectors of this
  // struct are sorted and looked up by.
43  key:uint (key);
44  value:uint;
45}
46
47// One key, value pair entry in the binary rules hash map.
48// The key is a pair of nonterminals and the value is the index of the lhs set.
49namespace libtextclassifier3.grammar.RulesSet_.Rules_;
50struct BinaryRule {
51  // The two rhs nonterminals.
  // Together, `rhs_first` and `rhs_second` form the key of the entry.
52  rhs_first:uint;
53
54  rhs_second:uint;
55
56  // The lhs set associated with this binary rule.
57  // This is an offset into the (deduplicated) global `lhs_set` vector.
58  lhs_set_index:uint;
59}
60
61// One bucket in the binary rule hash map that contains all entries for a
62// given hash value.
63namespace libtextclassifier3.grammar.RulesSet_.Rules_;
64table BinaryRuleTableBucket {
  // The binary rule entries stored in this bucket.
  // NOTE(review): the hash function that assigns a nonterminal pair to a
  // bucket is not defined in this schema -- see the lookup code.
65  rules:[BinaryRule];
66}
67
// The terminal, unary and binary rules of a grammar for a set of locales.
68namespace libtextclassifier3.grammar.RulesSet_;
69table Rules {
70  // The locales this rule set applies to (a vector of language tags).
71  locale:[LanguageTag];
72
  // The terminal rules maps.
  // NOTE(review): `lowercase_terminal_rules` presumably holds terminals that
  // are matched against lowercased input -- confirm against the matcher.
73  terminal_rules:Rules_.TerminalRulesMap;
74  lowercase_terminal_rules:Rules_.TerminalRulesMap;
75
76  // The unary rules map.
77  // This is a map from a nonterminal to an lhs set index into the
78  // (deduplicated) global `lhs_set` vector.
79  unary_rules:[Rules_.UnaryRulesEntry];
80
81  // The binary rules (hash) map.
82  // This is a map from a nonterminal pair to an lhs set index into the
83  // (deduplicated) global `lhs_set` vector.
  // Each vector element is one hash bucket.
84  binary_rules:[Rules_.BinaryRuleTableBucket];
85}
86
87// A set of lhs nonterminals associated with a rule match.
88// Most commonly, an entry is just the id of the lhs nonterminal of the rule
89// that is triggered; in this case `lhs` is set to the id of the nonterminal.
90// If a callback needs to be triggered, `lhs` is the (negated) index into the
91// `lhs` vector of `RulesSet` that specifies, in addition to the nonterminal,
92// the callback and parameter to call.
93namespace libtextclassifier3.grammar.RulesSet_;
94table LhsSet {
  // Signed on purpose: the header comment describes entries that are
  // negated indices into the `lhs` vector.
95  lhs:[int];
96}
97
// An lhs entry that carries a callback in addition to the lhs nonterminal.
// Entries are stored in the `lhs` vector of `RulesSet` and are referenced by
// (negated) index from `LhsSet.lhs`.
98namespace libtextclassifier3.grammar.RulesSet_;
99struct Lhs {
100  // The lhs nonterminal.
101  nonterminal:uint;
102
103  // The id of the callback to trigger.
104  callback_id:uint;
105
106  // A parameter to pass when invoking the callback.
107  callback_param:ulong;
108
109  // The maximum amount of whitespace allowed between the two parts.
110  // A value of -1 allows for unbounded whitespace.
  // `byte` is a signed 8-bit integer, so -1 is representable.
  // NOTE(review): "the two parts" presumably refers to the two rhs
  // components of a binary rule -- confirm against the matcher.
111  max_whitespace_gap:byte;
112}
113
// One key, value pair entry in the annotation nonterminals map.
// Used by `Nonterminals.annotation_nt`, which maps annotation/collection
// names (`key`) to non-terminal ids (`value`).
114namespace libtextclassifier3.grammar.RulesSet_.Nonterminals_;
115table AnnotationNtEntry {
  // `(key)` marks the field that vectors of this table are sorted and looked
  // up by; `shared` enables string pooling when serializing.
116  key:string (key, shared);
117  value:int;
118}
119
120// Usage of pre-defined non-terminals that the lexer can generate if they
121// are used by the grammar.
// NOTE(review): an id that is not set presumably means the nonterminal is
// unused by the grammar (unset scalar fields read as 0) -- confirm against
// the lexer.
122namespace libtextclassifier3.grammar.RulesSet_;
123table Nonterminals {
124  // Id of the nonterminal indicating the start of input.
125  start_nt:int;
126
127  // Id of the nonterminal indicating the end of input.
128  end_nt:int;
129
130  // Id of the nonterminal indicating a token.
131  token_nt:int;
132
133  // Id of the nonterminal indicating a string of digits.
134  digits_nt:int;
135
136  // `n_digits_nt[k]` is the id of the nonterminal indicating a string of
137  // `k` digits.
138  n_digits_nt:[int];
139
140  // Id of the nonterminal indicating a word or token boundary.
141  wordbreak_nt:int;
142
143  // Id of the nonterminal indicating an uppercase token.
144  uppercase_token_nt:int;
145
146  // Predefined nonterminals for annotations.
147  // Maps annotation/collection names to non-terminal ids.
148  annotation_nt:[Nonterminals_.AnnotationNtEntry];
149}
150
// One key, value pair entry in the `DebugInformation.nonterminal_names` map.
// NOTE(review): presumably maps a nonterminal id (`key`) to its
// human-readable name (`value`) -- confirm against the debug printer.
151namespace libtextclassifier3.grammar.RulesSet_.DebugInformation_;
152table NonterminalNamesEntry {
  // `(key)` marks the field that vectors of this table are sorted and looked
  // up by.
153  key:int (key);
154  value:string (shared);
155}
156
157// Debug information, e.g. for printing parse trees and showing match
158// information.
159namespace libtextclassifier3.grammar.RulesSet_;
160table DebugInformation {
  // Map entries, each pairing an int key with a string value.
161  nonterminal_names:[DebugInformation_.NonterminalNamesEntry];
162}
163
164// Regex annotators.
// Each annotator pairs a pattern with the nonterminal it triggers.
165namespace libtextclassifier3.grammar.RulesSet_;
166table RegexAnnotator {
167  // The pattern to run.
168  pattern:string (shared);
169
  // NOTE(review): presumably an alternative, compressed representation of
  // `pattern` -- confirm which of the two takes precedence when both are set.
170  compressed_pattern:CompressedBuffer;
171
172  // The nonterminal to trigger.
173  nonterminal:uint;
174}
175
176// Context free grammar rules representation.
177// Rules are represented in (mostly) Chomsky Normal Form, where all rules are
178// of the following form, either:
179// * <nonterm> ::= term
180// * <nonterm> ::= <nonterm>
181// * <nonterm> ::= <nonterm> <nonterm>
182// The `terminal_rules`, `unary_rules` and `binary_rules` maps of
183// `RulesSet_.Rules` represent these sets of rules.
184namespace libtextclassifier3.grammar;
185table RulesSet {
  // The rule sets; each entry lists the locales it applies to.
186  rules:[RulesSet_.Rules];

  // The (deduplicated) global lhs sets, referenced by the `lhs_set_index`
  // fields and by the values of the unary rules map.
187  lhs_set:[RulesSet_.LhsSet];

  // The lhs entries with callback information, referenced by (negated) index
  // from `LhsSet.lhs`.
188  lhs:[RulesSet_.Lhs];
189
190  // Terminals string pool.
191  // The strings are zero-byte delimited and offset indexed by
192  // `terminal_offsets` in the terminals rules map.
193  terminals:string (shared);
194
  // The pre-defined nonterminals used by the grammar.
195  nonterminals:RulesSet_.Nonterminals;

  // Deprecated field; the declaration is kept so that the ids of the fields
  // that follow it do not change.
196  reserved_6:int16 (deprecated);
197  debug_information:RulesSet_.DebugInformation;
198  regex_annotator:[RulesSet_.RegexAnnotator];
199
200  // If true, will compile the regexes only on first use.
201  lazy_regex_compilation:bool;
202
203  // The semantic expressions associated with rule matches.
204  semantic_expression:[SemanticExpression];
205
206  // The schema defining the semantic results.
  // NOTE(review): stored as raw bytes; presumably a serialized schema --
  // confirm the expected format with the consumer.
207  semantic_values_schema:[ubyte];
208}
209
210