//
// Copyright (C) 2018 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

// FlatBuffers schema for the configuration of the text encoder op.

// All types declared in this file live in the `libtextclassifier3` namespace.
namespace libtextclassifier3;

// Kind of matcher used for the sentence pieces (referenced by
// `TextEncoderConfig.matcher_type`).
//
// The enum is stored as a `byte`. The numeric values are written into
// serialized buffers, so existing values must not be renumbered; new values
// may only be appended.
enum SentencePieceMatcherType : byte {
  MAPPED_TRIE = 0,
  SORTED_STRING_TABLE = 1,
}

// Configuration for the text encoder op.
//
// The fields carry no explicit `id` attributes, so their declaration order
// defines the binary layout. Do not reorder or remove fields; add new fields
// at the end of the table only.
table TextEncoderConfig {
  // Code that is used as the encoding of the start code.
  start_code:int32 = 0;

  // Code that is used as the encoding of the end code.
  end_code:int32 = 1;

  // This value is added to all codes so that they do not intersect with
  // `start_code` and `end_code`.
  encoding_offset:int32 = 2;

  // Code that is used for out-of-dictionary characters.
  unknown_code:int32 = -1;

  // Penalty associated with the unknown code.
  // No explicit default is declared, so an unset field reads as 0.0.
  unknown_score:float;

  // Normalization options.
  // Serialized normalization charsmap.
  // NOTE(review): `normalization_charsmap_values` presumably holds the values
  // the charsmap refers to -- confirm against the encoder implementation.
  normalization_charsmap:string;
  normalization_charsmap_values:string;

  // Whether to add dummy whitespace at the beginning of the text in order to
  // treat "world" in "world" and "hello world" uniformly.
  add_dummy_prefix:bool = true;

  // Whether to remove leading, trailing and duplicate internal whitespace.
  remove_extra_whitespaces:bool = true;

  // Whether to replace whitespace with a meta symbol.
  escape_whitespaces:bool = true;

  // Sentence pieces scores.
  pieces_scores:[float];

  // Serialized sentence pieces.
  // NOTE(review): `pieces_offsets` presumably indexes into `pieces` -- confirm
  // against the matcher implementation.
  pieces:string;
  pieces_offsets:[uint32];

  // Matcher type for the pieces; defaults to MAPPED_TRIE when unset.
  matcher_type:SentencePieceMatcherType = MAPPED_TRIE;
}
