/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 #ifndef LIBTEXTCLASSIFIER_SMARTSELECT_TYPES_H_
18 #define LIBTEXTCLASSIFIER_SMARTSELECT_TYPES_H_
19 
20 #include <ostream>
21 #include <string>
22 #include <utility>
23 
namespace libtextclassifier {

// Sentinel index meaning "no valid position". The padding token uses it for
// both its start and end.
constexpr int kInvalidIndex = -1;

// Index for a 0-based array of tokens.
using TokenIndex = int;

// Index for a 0-based array of codepoints.
using CodepointIndex = int;

// Marks a span in a sequence of codepoints. The first element is the index of
// the first codepoint of the span, and the second element is the index of the
// codepoint one past the end of the span.
using CodepointSpan = std::pair<CodepointIndex, CodepointIndex>;

// Marks a span in a sequence of tokens. The first element is the index of the
// first token in the span, and the second element is the index of the token one
// past the end of the span.
using TokenSpan = std::pair<TokenIndex, TokenIndex>;

// Token holds the text of a token, its start and end codepoint positions, and
// whether it is a padding token.
struct Token {
  // Text of the token. Empty for the padding token.
  std::string value;

  // Codepoint positions of the token. Both are kInvalidIndex for the padding
  // token.
  // NOTE(review): IsContainedInSpan compares `end` with `<=` against the
  // one-past-the-end bound of a CodepointSpan, which suggests `end` is itself
  // exclusive -- confirm against the tokenizer.
  CodepointIndex start;
  CodepointIndex end;

  // Whether the token is a padding token.
  bool is_padding;

  // Default constructor constructs the padding-token: empty value, start and
  // end set to kInvalidIndex, is_padding set to true.
  Token()
      : value(), start(kInvalidIndex), end(kInvalidIndex), is_padding(true) {}

  // Constructs a regular (non-padding) token with the given text and
  // codepoint positions.
  Token(const std::string& arg_value, CodepointIndex arg_start,
        CodepointIndex arg_end)
      : value(arg_value), start(arg_start), end(arg_end), is_padding(false) {}

  // Two tokens are equal when all four fields (value, start, end, is_padding)
  // are equal.
  bool operator==(const Token& other) const {
    return value == other.value && start == other.start && end == other.end &&
           is_padding == other.is_padding;
  }

  // Negation of operator==, provided so both comparison forms are available.
  bool operator!=(const Token& other) const { return !(*this == other); }

  // Returns true iff the token lies within `span`, i.e. start >= span.first
  // and end <= span.second.
  bool IsContainedInSpan(CodepointSpan span) const {
    return start >= span.first && end <= span.second;
  }
};

// Pretty-printing function for Token. Writes
//   Token("<value>", <start>, <end>, is_padding=<is_padding>)
// to `os` and returns `os`. The bool is rendered according to the stream's
// formatting flags (0/1 unless std::boolalpha is set).
inline std::ostream& operator<<(std::ostream& os, const Token& token) {
  return os << "Token(\"" << token.value << "\", " << token.start << ", "
            << token.end << ", is_padding=" << token.is_padding << ")";
}

}  // namespace libtextclassifier
79 
80 #endif  // LIBTEXTCLASSIFIER_SMARTSELECT_TYPES_H_
81