1// Copyright (C) 2017 The Android Open Source Project
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//      http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Text classification model configuration.
16
syntax = "proto2";

package libtextclassifier;

import "external/libtextclassifier/common/embedding-network.proto";
import "external/libtextclassifier/smartselect/tokenizer.proto";

// Generate code that targets the lite protobuf runtime.
option optimize_for = LITE_RUNTIME;
24
// Model-wide options that apply regardless of whether the model is used for
// selection or for sharing.
message ModelOptions {
  // When true, the embeddings are taken from a different model. The main use
  // is the Sharing model reusing the embeddings of the Selection model.
  optional bool use_shared_embeddings = 1;

  // The language of the model.
  optional string language = 2;

  // The version of the model.
  optional int32 version = 3;
}
37
// Options for the selection model (post-processing of predicted selections).
message SelectionModelOptions {
  // Unicode codepoints that are stripped from predicted selections.
  repeated int32 punctuation_to_strip = 1;

  // If set, punctuation is stripped once the selection has been made.
  optional bool strip_punctuation = 2;

  // If set, selections are forced to be symmetrical.
  optional bool enforce_symmetry = 3;

  // Used when enforcing symmetry: how many inferences are made around the
  // click position, counted to one side.
  optional int32 symmetry_context_size = 4;
}
52
// Options for the sharing model (hint handling and phone number limits).
message SharingModelOptions {
  // When true, "url" is always returned if the url hint is passed in.
  optional bool always_accept_url_hint = 1;

  // When true, "email" is always returned if the e-mail hint is passed in.
  optional bool always_accept_email_hint = 2;

  // Lower limit on the number of digits in a phone number.
  optional int32 phone_min_num_digits = 3 [default = 7];

  // Upper limit on the number of digits in a phone number.
  optional int32 phone_max_num_digits = 4 [default = 15];
}
64
// Options that control how the input text is tokenized and which features are
// extracted from the tokens.
message FeatureProcessorOptions {
  // Field numbers that must not be assigned to new fields.
  reserved 7, 11, 12, 17, 26, 27, 28, 29, 32;

  // Strategy for choosing the center token.
  enum CenterTokenSelectionMethod {
    // Invalid option.
    DEFAULT_CENTER_TOKEN_METHOD = 0;

    // The center token is determined from the click indices.
    CENTER_TOKEN_FROM_CLICK = 1;

    // The selection indices are turned into a token range, and the middle of
    // that range becomes the center token.
    CENTER_TOKEN_MIDDLE_OF_SELECTION = 2;
  }

  // Kind of tokenization the model applies to the input text.
  enum TokenizationType {
    INVALID_TOKENIZATION_TYPE = 0;

    // Tokenize with the internal tokenizer.
    INTERNAL_TOKENIZER = 1;

    // Tokenize with ICU.
    ICU = 2;

    // Tokenize with ICU first; afterwards, stretches of tokens made up solely
    // of codepoints in internal_tokenizer_codepoint_ranges are re-tokenized
    // with the internal tokenizer.
    MIXED = 3;
  }

  // A codepoint range from start to end, with end being exclusive.
  message CodepointRange {
    optional int32 start = 1;
    optional int32 end = 2;
  }

  // How many buckets are used when hashing charactergrams.
  optional int32 num_buckets = 1 [default = -1];

  // Number of words on each side of the selected word that are used as its
  // context: with a context size of N, the N words to the left and the N words
  // to the right of the selected word are taken.
  optional int32 context_size = 2 [default = -1];

  // Upper bound on the total number of context words that can be selected.
  optional int32 max_selection_span = 3 [default = -1];

  // Charactergram orders to extract, e.g. 2 for character bigrams, 3 for
  // character trigrams, and so on.
  repeated int32 chargram_orders = 4;

  // Maximum word length, measured in codepoints.
  optional int32 max_word_length = 21 [default = 20];

  // When true, features are extracted with the unicode-aware functionality.
  optional bool unicode_aware_features = 19 [default = false];

  // Toggles extraction of the token case feature.
  optional bool extract_case_feature = 5 [default = false];

  // Toggles extraction of the selection mask feature.
  optional bool extract_selection_mask_feature = 6 [default = false];

  // Regexps that are run over every token. Each regexp yields a dense feature:
  // 1.0 when it matches, -1.0 when it does not.
  repeated string regexp_feature = 22;

  // Toggles remapping of all digits to a single number.
  optional bool remap_digits = 20 [default = false];

  // Toggles lower-casing of each token before hashgrams are generated.
  optional bool lowercase_tokens = 33;

  // When true, the selection classifier output holds only feasible selections
  // (e.g., those shorter than max_selection_span). When false, the output is
  // the complete cross-product of possible selections to the left and
  // possible selections to the right, infeasible ones included.
  // NOTE: Mainly kept for compatibility with older models trained with the
  // non-reduced output space.
  optional bool selection_reduced_output_space = 8 [default = true];

  // Names of the collections.
  repeated string collections = 9;

  // Index into collections of the entry to use when a collection name cannot
  // be mapped to an id.
  optional int32 default_collection = 10 [default = -1];

  // When true, the input is split by lines and only the line containing the
  // clicked token is used.
  optional bool only_use_line_with_click = 13 [default = false];

  // When true, a token that contains the selection boundary is split at the
  // position of that boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  optional bool split_tokens_on_selection_boundaries = 14 [default = false];

  // Codepoint ranges describing how different codepoints are tokenized.
  // Overlapping ranges are not allowed.
  repeated TokenizationCodepointRange tokenization_codepoint_config = 15;

  // Which CenterTokenSelectionMethod is used to pick the center token.
  optional CenterTokenSelectionMethod center_token_selection_method = 16;

  // When true, span boundaries are snapped to the containing tokens instead of
  // having to match token boundaries exactly.
  optional bool snap_label_span_boundaries_to_containing_tokens = 18;

  // The codepoint ranges that the model supports.
  repeated CodepointRange supported_codepoint_ranges = 23;

  // Codepoint ranges that, in the mixed tokenization mode, identify the
  // stretches of tokens to re-tokenize with the internal tokenizer.
  repeated CodepointRange internal_tokenizer_codepoint_ranges = 34;

  // Minimum ratio of supported codepoints in the input context. Feature
  // computation fails when the ratio is below this value.
  optional float min_supported_codepoint_ratio = 24 [default = 0.0];

  // Version of the feature format that the model expects.
  //  - feature_version == 0:
  //      Per token, the features are:
  //       - chargram embeddings
  //       - dense features
  //      The chargram embeddings of all tokens are concatenated first, and
  //      the dense features of the tokens are appended after them, so the
  //      resulting feature vector consists of two regions.
  optional int32 feature_version = 25 [default = 0];

  // Which TokenizationType is applied to the input text.
  optional TokenizationType tokenization_type = 30 [
    default = INTERNAL_TOKENIZER
  ];

  // NOTE(review): undocumented in the original; presumably controls whether
  // whitespace tokens are kept when tokenizing with ICU - confirm.
  optional bool icu_preserve_whitespace_tokens = 31 [default = false];
}
198
// Extensions that attach the option messages of this file to
// nlp_core.EmbeddingNetworkProto. Listed in ascending extension number.
extend nlp_core.EmbeddingNetworkProto {
  // Feature processor options.
  optional FeatureProcessorOptions
      feature_processor_options_in_embedding_network_proto = 146230910;

  // Selection model options.
  optional SelectionModelOptions
      selection_model_options_in_embedding_network_proto = 148190899;

  // Generic model options.
  optional ModelOptions
      model_options_in_embedding_network_proto = 150063045;

  // Sharing model options.
  optional SharingModelOptions
      sharing_model_options_in_embedding_network_proto = 151445439;
}
208