1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utils/utf8/unicodetext.h"
18 
19 #include <string.h>
20 
21 #include <algorithm>
22 
23 #include "utils/base/logging.h"
24 #include "utils/strings/utf8.h"
25 
26 namespace libtextclassifier3 {
27 
28 // *************** Data representation **********
29 // Note: the copy constructor is undefined.
30 
operator =(Repr && src)31 UnicodeText::Repr& UnicodeText::Repr::operator=(Repr&& src) {
32   if (ours_ && data_) delete[] data_;
33   data_ = src.data_;
34   size_ = src.size_;
35   capacity_ = src.capacity_;
36   ours_ = src.ours_;
37   src.ours_ = false;
38   return *this;
39 }
40 
PointTo(const char * data,int size)41 void UnicodeText::Repr::PointTo(const char* data, int size) {
42   if (ours_ && data_) delete[] data_;  // If we owned the old buffer, free it.
43   data_ = const_cast<char*>(data);
44   size_ = size;
45   capacity_ = size;
46   ours_ = false;
47 }
48 
Copy(const char * data,int size)49 void UnicodeText::Repr::Copy(const char* data, int size) {
50   resize(size);
51   memcpy(data_, data, size);
52 }
53 
resize(int new_size)54 void UnicodeText::Repr::resize(int new_size) {
55   if (new_size == 0) {
56     clear();
57   } else {
58     if (!ours_ || new_size > capacity_) reserve(new_size);
59     // Clear the memory in the expanded part.
60     if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
61     size_ = new_size;
62     ours_ = true;
63   }
64 }
65 
reserve(int new_capacity)66 void UnicodeText::Repr::reserve(int new_capacity) {
67   // If there's already enough capacity, and we're an owner, do nothing.
68   if (capacity_ >= new_capacity && ours_) return;
69 
70   // Otherwise, allocate a new buffer.
71   capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20);
72   char* new_data = new char[capacity_];
73 
74   // If there is an old buffer, copy it into the new buffer.
75   if (data_) {
76     memcpy(new_data, data_, size_);
77     if (ours_) delete[] data_;  // If we owned the old buffer, free it.
78   }
79   data_ = new_data;
80   ours_ = true;  // We own the new buffer.
81   // size_ is unchanged.
82 }
83 
append(const char * bytes,int byte_length)84 void UnicodeText::Repr::append(const char* bytes, int byte_length) {
85   reserve(size_ + byte_length);
86   memcpy(data_ + size_, bytes, byte_length);
87   size_ += byte_length;
88 }
89 
clear()90 void UnicodeText::Repr::clear() {
91   if (ours_) delete[] data_;
92   data_ = nullptr;
93   size_ = capacity_ = 0;
94   ours_ = true;
95 }
96 
97 // *************** UnicodeText ******************
98 
UnicodeText()99 UnicodeText::UnicodeText() {}
100 
// Copy constructor: delegates to Copy(), which duplicates the bytes of `src`
// into a buffer held by this object.
UnicodeText::UnicodeText(const UnicodeText& src) {
  this->Copy(src);
}
102 
operator =(UnicodeText && src)103 UnicodeText& UnicodeText::operator=(UnicodeText&& src) {
104   this->repr_ = std::move(src.repr_);
105   return *this;
106 }
107 
// Replaces the contents of this text with a copy of the bytes of `src`.
// Returns *this to allow chaining.
UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
  const auto& source_repr = src.repr_;
  repr_.Copy(source_repr.data_, source_repr.size_);
  return *this;
}
112 
PointToUTF8(const char * buffer,int byte_length)113 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
114   repr_.PointTo(buffer, byte_length);
115   return *this;
116 }
117 
CopyUTF8(const char * buffer,int byte_length)118 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
119   repr_.Copy(buffer, byte_length);
120   return *this;
121 }
122 
AppendUTF8(const char * utf8,int len)123 UnicodeText& UnicodeText::AppendUTF8(const char* utf8, int len) {
124   repr_.append(utf8, len);
125   return *this;
126 }
127 
data() const128 const char* UnicodeText::data() const { return repr_.data_; }
129 
size_bytes() const130 int UnicodeText::size_bytes() const { return repr_.size_; }
131 
132 namespace {
133 
134 enum {
135   RuneError = 0xFFFD,  // Decoding error in UTF.
136   RuneMax = 0x10FFFF,  // Maximum rune value.
137 };
138 
runetochar(const char32 rune,char * dest)139 int runetochar(const char32 rune, char* dest) {
140   // Convert to unsigned for range check.
141   uint32 c;
142 
143   // 1 char 00-7F
144   c = rune;
145   if (c <= 0x7F) {
146     dest[0] = static_cast<char>(c);
147     return 1;
148   }
149 
150   // 2 char 0080-07FF
151   if (c <= 0x07FF) {
152     dest[0] = 0xC0 | static_cast<char>(c >> 1 * 6);
153     dest[1] = 0x80 | (c & 0x3F);
154     return 2;
155   }
156 
157   // Range check
158   if (c > RuneMax) {
159     c = RuneError;
160   }
161 
162   // 3 char 0800-FFFF
163   if (c <= 0xFFFF) {
164     dest[0] = 0xE0 | static_cast<char>(c >> 2 * 6);
165     dest[1] = 0x80 | ((c >> 1 * 6) & 0x3F);
166     dest[2] = 0x80 | (c & 0x3F);
167     return 3;
168   }
169 
170   // 4 char 10000-1FFFFF
171   dest[0] = 0xF0 | static_cast<char>(c >> 3 * 6);
172   dest[1] = 0x80 | ((c >> 2 * 6) & 0x3F);
173   dest[2] = 0x80 | ((c >> 1 * 6) & 0x3F);
174   dest[3] = 0x80 | (c & 0x3F);
175   return 4;
176 }
177 
178 }  // namespace
179 
push_back(char32 ch)180 UnicodeText& UnicodeText::push_back(char32 ch) {
181   char str[4];
182   int char_len = runetochar(ch, str);
183   repr_.append(str, char_len);
184   return *this;
185 }
186 
clear()187 void UnicodeText::clear() { repr_.clear(); }
188 
size_codepoints() const189 int UnicodeText::size_codepoints() const {
190   return std::distance(begin(), end());
191 }
192 
empty() const193 bool UnicodeText::empty() const { return size_bytes() == 0; }
194 
is_valid() const195 bool UnicodeText::is_valid() const {
196   return IsValidUTF8(repr_.data_, repr_.size_);
197 }
198 
operator ==(const UnicodeText & other) const199 bool UnicodeText::operator==(const UnicodeText& other) const {
200   if (repr_.size_ != other.repr_.size_) {
201     return false;
202   }
203   return memcmp(repr_.data_, other.repr_.data_, repr_.size_) == 0;
204 }
205 
ToUTF8String() const206 std::string UnicodeText::ToUTF8String() const {
207   return UTF8Substring(begin(), end());
208 }
209 
UTF8Substring(int begin_codepoint,int end_codepoint) const210 std::string UnicodeText::UTF8Substring(int begin_codepoint,
211                                        int end_codepoint) const {
212   auto span_begin = begin();
213   std::advance(span_begin, begin_codepoint);
214   auto span_end = begin();
215   std::advance(span_end, end_codepoint);
216   return UTF8Substring(span_begin, span_end);
217 }
218 
UTF8Substring(const const_iterator & it_begin,const const_iterator & it_end)219 std::string UnicodeText::UTF8Substring(const const_iterator& it_begin,
220                                        const const_iterator& it_end) {
221   return std::string(it_begin.it_, it_end.it_ - it_begin.it_);
222 }
223 
// Returns the codepoint range [begin_codepoint, end_codepoint) of `text` as a
// new UnicodeText. Indices are counted in codepoints from the start of `text`.
//
// With `do_copy` set, the result holds its own copy of the bytes; otherwise it
// merely points into the buffer of `text`.
UnicodeText UnicodeText::Substring(const UnicodeText& text, int begin_codepoint,
                                   int end_codepoint, bool do_copy) {
  auto range_first = text.begin();
  std::advance(range_first, begin_codepoint);
  auto range_last = text.begin();
  std::advance(range_last, end_codepoint);

  const auto byte_count = range_last.it_ - range_first.it_;

  UnicodeText result;
  if (do_copy) {
    result.repr_.Copy(range_first.it_, byte_count);
  } else {
    result.repr_.PointTo(range_first.it_, byte_count);
  }
  return result;
}
241 
~UnicodeText()242 UnicodeText::~UnicodeText() {}
243 
244 // ******************* UnicodeText::const_iterator *********************
245 
246 // The implementation of const_iterator would be nicer if it
247 // inherited from boost::iterator_facade
248 // (http://boost.org/libs/iterator/doc/iterator_facade.html).
249 
const_iterator()250 UnicodeText::const_iterator::const_iterator() : it_(0) {}
251 
operator =(const const_iterator & other)252 UnicodeText::const_iterator& UnicodeText::const_iterator::operator=(
253     const const_iterator& other) {
254   if (&other != this) it_ = other.it_;
255   return *this;
256 }
257 
begin() const258 UnicodeText::const_iterator UnicodeText::begin() const {
259   return const_iterator(repr_.data_);
260 }
261 
end() const262 UnicodeText::const_iterator UnicodeText::end() const {
263   return const_iterator(repr_.data_ + repr_.size_);
264 }
265 
operator <(const UnicodeText::const_iterator & lhs,const UnicodeText::const_iterator & rhs)266 bool operator<(const UnicodeText::const_iterator& lhs,
267                const UnicodeText::const_iterator& rhs) {
268   return lhs.it_ < rhs.it_;
269 }
270 
operator *() const271 char32 UnicodeText::const_iterator::operator*() const {
272   // (We could call chartorune here, but that does some
273   // error-checking, and we're guaranteed that our data is valid
274   // UTF-8. Also, we expect this routine to be called very often. So
275   // for speed, we do the calculation ourselves.)
276 
277   // Convert from UTF-8
278   unsigned char byte1 = static_cast<unsigned char>(it_[0]);
279   if (byte1 < 0x80) return byte1;
280 
281   unsigned char byte2 = static_cast<unsigned char>(it_[1]);
282   if (byte1 < 0xE0) return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F);
283 
284   unsigned char byte3 = static_cast<unsigned char>(it_[2]);
285   if (byte1 < 0xF0) {
286     return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F);
287   }
288 
289   unsigned char byte4 = static_cast<unsigned char>(it_[3]);
290   return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) |
291          ((byte3 & 0x3F) << 6) | (byte4 & 0x3F);
292 }
293 
operator ++()294 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
295   it_ += GetNumBytesForNonZeroUTF8Char(it_);
296   return *this;
297 }
298 
operator --()299 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
300   while (IsTrailByte(*--it_)) {
301   }
302   return *this;
303 }
304 
UTF8ToUnicodeText(const char * utf8_buf,int len,bool do_copy)305 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, bool do_copy) {
306   UnicodeText t;
307   if (do_copy) {
308     t.CopyUTF8(utf8_buf, len);
309   } else {
310     t.PointToUTF8(utf8_buf, len);
311   }
312   return t;
313 }
314 
UTF8ToUnicodeText(const char * utf8_buf,bool do_copy)315 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, bool do_copy) {
316   return UTF8ToUnicodeText(utf8_buf, strlen(utf8_buf), do_copy);
317 }
318 
UTF8ToUnicodeText(const std::string & str,bool do_copy)319 UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy) {
320   return UTF8ToUnicodeText(str.data(), str.size(), do_copy);
321 }
322 
UTF8ToUnicodeText(const std::string & str)323 UnicodeText UTF8ToUnicodeText(const std::string& str) {
324   return UTF8ToUnicodeText(str, /*do_copy=*/true);
325 }
326 
327 }  // namespace libtextclassifier3
328