1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef TC3_STD_STRING_IMPORT
18 #define TC3_STD_STRING_IMPORT
19 #include <string>
20 
namespace libtextclassifier3 {

// Namespace-local spellings of the standard string types, so that code inside
// libtextclassifier3 can refer to them without the std:: qualifier.
using string = ::std::string;

template <typename Char, typename CharTraits = ::std::char_traits<Char>,
          typename Alloc = ::std::allocator<Char>>
using basic_string = ::std::basic_string<Char, CharTraits, Alloc>;

}  // namespace libtextclassifier3
27 #endif
28 #ifndef NLP_SAFT_COMPONENTS_COMMON_MOBILE_UTF8_H_
29 #define NLP_SAFT_COMPONENTS_COMMON_MOBILE_UTF8_H_
30 
31 #include <stddef.h>
32 
33 #include <string>
34 
35 namespace libtextclassifier3 {
36 namespace mobile {
37 namespace utils {
38 
// Returns the length (number of bytes) of the UTF8 code point starting at src,
// by reading only the byte from address src.
//
// The length is looked up from the high nibble of that byte:
//   0x0 - 0xB -> 1  (this range also covers continuation bytes 0x80 - 0xBF,
//                    so stepping by the result always advances by at least 1)
//   0xC - 0xD -> 2
//   0xE       -> 3
//   0xF       -> 4
//
// The result is a number from the set {1, 2, 3, 4}.
//
// Preconditions: src must point to at least one readable byte.
static inline int OneCharLen(const char *src) {
  // Whether plain char is signed is implementation-defined.  Converting to
  // unsigned char first keeps the table index in [0, 15] for bytes >= 0x80.
  const unsigned char lead_byte = static_cast<unsigned char>(*src);
  return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[lead_byte >> 4];
}
49 
// Returns a pointer "end" inside [data, data + size] such that the prefix from
// [data, end) is the largest one that does not contain '\0' and offers the
// following guarantee: if one starts with
//
//   curr = data
55 //
56 // and keeps executing
57 //
58 //   curr += OneCharLen(curr)
59 //
60 // one would eventually reach curr == end (the pointer returned by this
61 // function) without accessing data outside the string.  This guards against
62 // scenarios like a broken UTF8 string which has only e.g., the first 2 bytes
63 // from a 3-byte UTF8 sequence.
64 //
65 // Preconditions: data != nullptr.
const char *GetSafeEndOfUtf8String(const char *data, size_t size);

// Convenience overload for std::string: forwards text.data() and text.size()
// to the (data, size) version above and returns its result unchanged.
//
// The returned pointer refers into text's own buffer, so it should only be
// used while text is alive and unmodified.
static inline const char *GetSafeEndOfUtf8String(const std::string &text) {
  const char *const begin = text.data();
  const size_t num_bytes = text.size();
  return GetSafeEndOfUtf8String(begin, num_bytes);
}
71 
72 }  // namespace utils
73 }  // namespace mobile
}  // namespace libtextclassifier3
75 
76 #endif  // NLP_SAFT_COMPONENTS_COMMON_MOBILE_UTF8_H_
77