1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utils/strings/utf8.h"
18 
19 #include "utils/base/logging.h"
20 
21 namespace libtextclassifier3 {
22 
IsValidUTF8(const char * src,int size)23 bool IsValidUTF8(const char *src, int size) {
24   int char_length;
25   for (int i = 0; i < size;) {
26     if (!IsValidChar(src + i, size - i, &char_length)) {
27       return false;
28     }
29     i += char_length;
30   }
31   return true;
32 }
33 
SafeTruncateLength(const char * str,int truncate_at)34 int SafeTruncateLength(const char *str, int truncate_at) {
35   // Always want to truncate at the start of a character, so if
36   // it's in a middle, back up toward the start
37   while (IsTrailByte(str[truncate_at]) && (truncate_at > 0)) {
38     truncate_at--;
39   }
40   return truncate_at;
41 }
42 
ValidCharToRune(const char * str)43 char32 ValidCharToRune(const char *str) {
44   TC3_DCHECK(!IsTrailByte(str[0]) && GetNumBytesForUTF8Char(str) > 0);
45 
46   // Convert from UTF-8
47   unsigned char byte1 = static_cast<unsigned char>(str[0]);
48   if (byte1 < 0x80) {
49     // One character sequence: 00000 - 0007F.
50     return byte1;
51   }
52 
53   unsigned char byte2 = static_cast<unsigned char>(str[1]);
54   if (byte1 < 0xE0) {
55     // Two character sequence: 00080 - 007FF.
56     return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F);
57   }
58 
59   unsigned char byte3 = static_cast<unsigned char>(str[2]);
60   if (byte1 < 0xF0) {
61     // Three character sequence: 00800 - 0FFFF.
62     return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F);
63   }
64 
65   unsigned char byte4 = static_cast<unsigned char>(str[3]);
66   // Four character sequence: 10000 - 1FFFF.
67   return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) |
68          ((byte3 & 0x3F) << 6) | (byte4 & 0x3F);
69 }
70 
IsValidChar(const char * str,int size,int * num_bytes)71 bool IsValidChar(const char *str, int size, int *num_bytes) {
72   // Unexpected trail byte.
73   if (IsTrailByte(str[0])) {
74     return false;
75   }
76 
77   *num_bytes = GetNumBytesForUTF8Char(str);
78   if (*num_bytes <= 0 || *num_bytes > size) {
79     return false;
80   }
81 
82   // Check that remaining bytes in the codepoint are trailing bytes.
83   for (int k = 1; k < *num_bytes; k++) {
84     if (!IsTrailByte(str[k])) {
85       return false;
86     }
87   }
88 
89   // Exclude overlong encodings.
90   // Check that the codepoint is encoded with the minimum number of required
91   // bytes. An ascii value could be encoded in 4, 3 or 2 bytes but requires
92   // only 1. There is a unique valid encoding for each code point.
93   // This ensures that string comparisons and searches are well-defined.
94   // See: https://en.wikipedia.org/wiki/UTF-8
95   const char32 codepoint = ValidCharToRune(str);
96   switch (*num_bytes) {
97     case 1:
98       return true;
99     case 2:
100       // Everything below 128 can be encoded in one byte.
101       return (codepoint >= (1 << 7 /* num. payload bits in one byte */));
102     case 3:
103       return (codepoint >= (1 << 11 /* num. payload bits in two utf8 bytes */));
104     case 4:
105       return (codepoint >=
106               (1 << 16 /* num. payload bits in three utf8 bytes */)) &&
107              (codepoint < 0x10FFFF /* maximum rune value */);
108   }
109   return false;
110 }
111 
ValidRuneToChar(const char32 rune,char * dest)112 int ValidRuneToChar(const char32 rune, char *dest) {
113   // Convert to unsigned for range check.
114   uint32 c;
115 
116   // 1 char 00-7F
117   c = rune;
118   if (c <= 0x7F) {
119     dest[0] = static_cast<char>(c);
120     return 1;
121   }
122 
123   // 2 char 0080-07FF
124   if (c <= 0x07FF) {
125     dest[0] = 0xC0 | static_cast<char>(c >> 1 * 6);
126     dest[1] = 0x80 | (c & 0x3F);
127     return 2;
128   }
129 
130   // 3 char 0800-FFFF
131   if (c <= 0xFFFF) {
132     dest[0] = 0xE0 | static_cast<char>(c >> 2 * 6);
133     dest[1] = 0x80 | ((c >> 1 * 6) & 0x3F);
134     dest[2] = 0x80 | (c & 0x3F);
135     return 3;
136   }
137 
138   // 4 char 10000-1FFFFF
139   dest[0] = 0xF0 | static_cast<char>(c >> 3 * 6);
140   dest[1] = 0x80 | ((c >> 2 * 6) & 0x3F);
141   dest[2] = 0x80 | ((c >> 1 * 6) & 0x3F);
142   dest[3] = 0x80 | (c & 0x3F);
143   return 4;
144 }
145 
146 }  // namespace libtextclassifier3
147