1 /* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_SMARTSELECT_TOKENIZER_H_ 18 #define LIBTEXTCLASSIFIER_SMARTSELECT_TOKENIZER_H_ 19 20 #include <string> 21 #include <vector> 22 23 #include "smartselect/tokenizer.pb.h" 24 #include "smartselect/types.h" 25 #include "util/base/integral_types.h" 26 27 namespace libtextclassifier { 28 29 // Tokenizer splits the input string into a sequence of tokens, according to the 30 // configuration. 31 class Tokenizer { 32 public: Tokenizer(const std::vector<TokenizationCodepointRange> & codepoint_range_configs)33 explicit Tokenizer( 34 const std::vector<TokenizationCodepointRange>& codepoint_range_configs) { 35 PrepareTokenizationCodepointRanges(codepoint_range_configs); 36 } 37 38 // Tokenizes the input string using the selected tokenization method. 39 std::vector<Token> Tokenize(const std::string& utf8_text) const; 40 41 protected: 42 // Represents a codepoint range [start, end) with its role for tokenization. 43 struct CodepointRange { 44 int32 start; 45 int32 end; 46 TokenizationCodepointRange::Role role; 47 CodepointRangeCodepointRange48 CodepointRange(int32 arg_start, int32 arg_end, 49 TokenizationCodepointRange::Role arg_role) 50 : start(arg_start), end(arg_end), role(arg_role) {} 51 }; 52 53 // Prepares tokenization codepoint ranges for use in tokenization. 54 void PrepareTokenizationCodepointRanges( 55 const std::vector<TokenizationCodepointRange>& codepoint_range_configs); 56 57 // Finds the tokenization role for given codepoint. 58 // If the character is not found returns DEFAULT_ROLE. 59 // Internally uses binary search so should be O(log(# of codepoint_ranges)). 60 TokenizationCodepointRange::Role FindTokenizationRole(int codepoint) const; 61 62 private: 63 // Codepoint ranges that determine how different codepoints are tokenized. 64 // The ranges must not overlap. 65 std::vector<CodepointRange> codepoint_ranges_; 66 }; 67 68 } // namespace libtextclassifier 69 70 #endif // LIBTEXTCLASSIFIER_SMARTSELECT_TOKENIZER_H_ 71