1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
// Utilities for tokenizing text.
18 
19 #ifndef LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_
20 #define LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_
21 
22 #include <string>
23 
24 #include "annotator/types.h"
25 #include "utils/codepoint-range.h"
26 #include "utils/strings/utf8.h"
27 #include "utils/utf8/unicodetext.h"
28 #include "absl/container/flat_hash_set.h"
29 #include "absl/strings/string_view.h"
30 
31 namespace libtextclassifier3 {
32 
// Per-codepoint decision returned by the filter function passed to
// TokenizeWithFilter.
//
// Both fields default to false so that a default-constructed FilterResult
// means "do not split here" instead of holding indeterminate values.
struct FilterResult {
  // Whether to split the text on this codepoint.
  bool to_split = false;
  // If the codepoint is used to split the text, whether to also output the
  // codepoint itself as a token. Only consulted when to_split is true.
  bool to_keep = false;
};
40 
// Returns a list of Tokens for the given input string, obtained by tokenizing
// on space.
// Only the declaration lives in this header; the definition is provided
// elsewhere.
std::vector<Token> TokenizeOnSpace(const std::string& text);
43 
// Returns a list of Tokens for the given input string, obtained by tokenizing
// on the given set of delimiter codepoints.
//
// Parameters:
//   text: the input string to tokenize.
//   delimiters: the codepoints on which the text is split.
//   create_tokens_for_non_space_delimiters: if true, tokens are also created
//     for delimiters which are not white spaces. For example
//     "This, is" -> {"This", ",", "is"}. Defaults to false.
//
// Only the declaration lives in this header; the definition is provided
// elsewhere.
std::vector<Token> TokenizeOnDelimiters(
    const std::string& text, const absl::flat_hash_set<char32>& delimiters,
    bool create_tokens_for_non_space_delimiters = false);
52 
// Replicates how the original bert_tokenizer from the tflite-support library
// pretokenizes text, which it does by using regex_split with the default
// regexes.
// The text is split on spaces, punctuation and Chinese characters, and all
// tokens except spaces are output.
// So far, the only difference between this and the original implementation
// that we are aware of is that the original regexes have 8 ranges of Chinese
// Unicode codepoints, whereas we have all of these 8 ranges plus two extra
// ranges.
std::vector<Token> TokenizeOnWhiteSpacePunctuationAndChineseLetter(
    const absl::string_view text);
62 
63 // Returns a list of Tokens for a given input string, by tokenizing on the
64 // given filter function. Caller can control which codepoint to split and
65 // whether a delimiter should be output as a token.
66 template <typename FilterFn>
TokenizeWithFilter(const absl::string_view input,FilterFn filter)67 std::vector<Token> TokenizeWithFilter(const absl::string_view input,
68                                       FilterFn filter) {
69   const UnicodeText input_unicode = UTF8ToUnicodeText(input, /*do_copy=*/false);
70   std::vector<Token> tokens;
71   UnicodeText::const_iterator start_it = input_unicode.begin();
72   int token_start_codepoint = 0;
73   int codepoint_idx = 0;
74 
75   for (auto it = input_unicode.begin(); it != input_unicode.end(); ++it) {
76     const char32 code_point = *it;
77     FilterResult filter_result = filter(code_point);
78     if (filter_result.to_split) {
79       const std::string token_text = UnicodeText::UTF8Substring(start_it, it);
80       if (!token_text.empty()) {
81         tokens.push_back(
82             Token{token_text, token_start_codepoint, codepoint_idx});
83       }
84       if (filter_result.to_keep) {
85         const std::string delimiter =
86             UnicodeText::UTF8Substring(it, std::next(it));
87         tokens.push_back(Token{delimiter, codepoint_idx, codepoint_idx + 1});
88       }
89       start_it = std::next(it);
90       token_start_codepoint = codepoint_idx + 1;
91     }
92     codepoint_idx++;
93   }
94   // Flush the last token if any.
95   if (start_it != input_unicode.end()) {
96     const std::string token_text =
97         UnicodeText::UTF8Substring(start_it, input_unicode.end());
98     tokens.push_back(Token{token_text, token_start_codepoint, codepoint_idx});
99   }
100   return tokens;
101 }
102 
103 }  // namespace  libtextclassifier3
104 
105 #endif  // LIBTEXTCLASSIFIER_UTILS_TOKENIZER_UTILS_H_
106