1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LIBTEXTCLASSIFIER_LANG_ID_CUSTOM_TOKENIZER_H_
18 #define LIBTEXTCLASSIFIER_LANG_ID_CUSTOM_TOKENIZER_H_
19 
20 #include <cstddef>
21 #include <string>
22 
23 #include "lang_id/light-sentence.h"
24 
25 namespace libtextclassifier {
26 namespace nlp_core {
27 namespace lang_id {
28 
29 // Performs custom tokenization of |text| for the language identification
30 // project.  Currently (Sep 15, 2016) the text is split on space, newline, and
31 // tab; empty tokens are dropped; and each remaining token gets "^" (special
32 // token begin marker) prepended and "$" (special token end marker) appended.
33 //
34 // The resulting tokens are stored into the words of the LightSentence
35 // *sentence.  NOTE(review): presumably |sentence| must be non-null -- confirm.
36 void TokenizeTextForLangId(const std::string &text, LightSentence *sentence);
37 
38 // Returns a pointer "end" inside [data, data + size] such that the prefix
39 // [data, end) is the largest one that does not contain '\0' and offers the
40 // following guarantee: if one starts with
41 //
42 //   curr = data
43 //
44 // and keeps executing
45 //
46 //   curr += utils::GetNumBytesForNonZeroUTF8Char(curr)
47 //
48 // one would eventually reach curr == end (the pointer returned by this
49 // function) without accessing bytes outside [data, data + size).  This guards
50 // against scenarios like a broken UTF-8 string which has only, e.g., the first
51 // 2 bytes of a 3-byte UTF-8 sequence.
52 const char *GetSafeEndOfString(const char *data, size_t size);
53 // Convenience overload: calls GetSafeEndOfString(text.data(), text.size()).
54 static inline const char *GetSafeEndOfString(const std::string &text) {
55   return GetSafeEndOfString(text.data(), text.size());
56 }
57 
58 }  // namespace lang_id
59 }  // namespace nlp_core
60 }  // namespace libtextclassifier
61 
62 #endif  // LIBTEXTCLASSIFIER_LANG_ID_CUSTOM_TOKENIZER_H_
63