1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_CORE_KERNELS_STRING_UTIL_H_
16 #define TENSORFLOW_CORE_KERNELS_STRING_UTIL_H_
17 
18 #include "tensorflow/core/lib/core/status.h"
19 
20 namespace tensorflow {
21 
// Unicode encodings understood by the string kernels.  Consumed by ops such
// as tf.strings.unicode_encode and tf.strings.unicode_decode.
enum class UnicodeEncoding {
  UTF8,
  UTF16BE,
  UTF32BE,
};
25 
// Units in which string positions and lengths can be expressed.  Used by
// string ops such as tf.strings.length and tf.substr.
// TODO(edloper): Add support for: UTF32_CHAR, etc.
enum class CharUnit {
  BYTE,
  UTF8_CHAR,
};
30 
// Returns true when `x` is a UTF-8 trailing (continuation) byte, i.e. its two
// most significant bits are `10` (byte values 0x80 through 0xBF).
inline bool IsTrailByte(char x) {
  const unsigned char byte = static_cast<unsigned char>(x);
  return (byte & 0xC0) == 0x80;
}
33 
// Sets `encoding` based on `str`.
//
// `str` is the textual name of an encoding and `encoding` is the output
// location for the matching UnicodeEncoding value.
// NOTE(review): the definition is not in this header; presumably a non-OK
// Status is returned when `str` does not name a supported encoding -- confirm
// against the implementation.
Status ParseUnicodeEncoding(const string& str, UnicodeEncoding* encoding);
36 
// Sets `unit` value based on `str`.
//
// `str` is the textual name of a character unit and `unit` is the output
// location for the matching CharUnit value.
// NOTE(review): the definition is not in this header; presumably a non-OK
// Status is returned when `str` does not name a supported unit -- confirm
// against the implementation.
Status ParseCharUnit(const string& str, CharUnit* unit);
39 
// Returns the number of Unicode characters in a UTF-8 string.
// Result may be incorrect if the input string is not valid UTF-8.
// NOTE(review): the count is returned as int32; how inputs with more
// characters than int32 can represent are handled is not visible from this
// declaration -- confirm against the implementation.
int32 UTF8StrLen(const string& str);
43 
44 // Get the next UTF8 character position starting at the given position and
45 // skipping the given number of characters. Position is a byte offset, and
46 // should never be `null`. The function return true if successful. However, if
47 // the end of the string is reached before the requested characters, then the
48 // position will point to the end of string and this function will return false.
49 template <typename T>
ForwardNUTF8CharPositions(const StringPiece in,const T num_utf8_chars_to_shift,T * pos)50 bool ForwardNUTF8CharPositions(const StringPiece in,
51                                const T num_utf8_chars_to_shift, T* pos) {
52   const size_t size = in.size();
53   T utf8_chars_counted = 0;
54   while (utf8_chars_counted < num_utf8_chars_to_shift && *pos < size) {
55     // move forward one utf-8 character
56     do {
57       ++*pos;
58     } while (IsTrailByte(in[*pos]) && *pos < size);
59     ++utf8_chars_counted;
60   }
61   return utf8_chars_counted == num_utf8_chars_to_shift;
62 }
63 
64 // Get the previous UTF8 character position starting at the given position and
65 // skipping the given number of characters. Position is a byte offset with a
66 // positive value, relative to the beginning of the string, and should never be
67 // `null`. The function return true if successful. However, if the beginning of
68 // the string is reached before the requested character, then the position will
69 // point to the beginning of the string and this function will return false.
70 template <typename T>
BackNUTF8CharPositions(const StringPiece in,const T num_utf8_chars_to_shift,T * pos)71 bool BackNUTF8CharPositions(const StringPiece in,
72                             const T num_utf8_chars_to_shift, T* pos) {
73   const size_t start = 0;
74   T utf8_chars_counted = 0;
75   while (utf8_chars_counted < num_utf8_chars_to_shift && (*pos > start)) {
76     // move back one utf-8 character
77     do {
78       --*pos;
79     } while (IsTrailByte(in[*pos]) && *pos > start);
80     ++utf8_chars_counted;
81   }
82   return utf8_chars_counted == num_utf8_chars_to_shift;
83 }
84 
85 }  // namespace tensorflow
86 
87 #endif  // TENSORFLOW_CORE_KERNELS_STRING_UTIL_H_
88