1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "util/utf8/unicodetext.h"
18 
19 #include <string.h>
20 
21 #include <algorithm>
22 
23 #include "util/strings/utf8.h"
24 
25 namespace libtextclassifier2 {
26 
27 // *************** Data representation **********
28 // Note: the copy constructor is undefined.
29 
operator =(Repr && src)30 UnicodeText::Repr& UnicodeText::Repr::operator=(Repr&& src) {
31   if (ours_ && data_) delete[] data_;
32   data_ = src.data_;
33   size_ = src.size_;
34   capacity_ = src.capacity_;
35   ours_ = src.ours_;
36   src.ours_ = false;
37   return *this;
38 }
39 
PointTo(const char * data,int size)40 void UnicodeText::Repr::PointTo(const char* data, int size) {
41   if (ours_ && data_) delete[] data_;  // If we owned the old buffer, free it.
42   data_ = const_cast<char*>(data);
43   size_ = size;
44   capacity_ = size;
45   ours_ = false;
46 }
47 
Copy(const char * data,int size)48 void UnicodeText::Repr::Copy(const char* data, int size) {
49   resize(size);
50   memcpy(data_, data, size);
51 }
52 
resize(int new_size)53 void UnicodeText::Repr::resize(int new_size) {
54   if (new_size == 0) {
55     clear();
56   } else {
57     if (!ours_ || new_size > capacity_) reserve(new_size);
58     // Clear the memory in the expanded part.
59     if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
60     size_ = new_size;
61     ours_ = true;
62   }
63 }
64 
reserve(int new_capacity)65 void UnicodeText::Repr::reserve(int new_capacity) {
66   // If there's already enough capacity, and we're an owner, do nothing.
67   if (capacity_ >= new_capacity && ours_) return;
68 
69   // Otherwise, allocate a new buffer.
70   capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20);
71   char* new_data = new char[capacity_];
72 
73   // If there is an old buffer, copy it into the new buffer.
74   if (data_) {
75     memcpy(new_data, data_, size_);
76     if (ours_) delete[] data_;  // If we owned the old buffer, free it.
77   }
78   data_ = new_data;
79   ours_ = true;  // We own the new buffer.
80   // size_ is unchanged.
81 }
82 
append(const char * bytes,int byte_length)83 void UnicodeText::Repr::append(const char* bytes, int byte_length) {
84   reserve(size_ + byte_length);
85   memcpy(data_ + size_, bytes, byte_length);
86   size_ += byte_length;
87 }
88 
clear()89 void UnicodeText::Repr::clear() {
90   if (ours_) delete[] data_;
91   data_ = nullptr;
92   size_ = capacity_ = 0;
93   ours_ = true;
94 }
95 
96 // *************** UnicodeText ******************
97 
UnicodeText()98 UnicodeText::UnicodeText() {}
99 
// Copy constructor: delegates to Copy(), which duplicates src's bytes.
UnicodeText::UnicodeText(const UnicodeText& src) {
  this->Copy(src);
}
101 
operator =(UnicodeText && src)102 UnicodeText& UnicodeText::operator=(UnicodeText&& src) {
103   this->repr_ = std::move(src.repr_);
104   return *this;
105 }
106 
// Replaces the contents of this text with a copy of src's bytes.
// Returns *this to allow chaining.
UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
  const Repr& source = src.repr_;
  repr_.Copy(source.data_, source.size_);
  return *this;
}
111 
PointToUTF8(const char * buffer,int byte_length)112 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
113   repr_.PointTo(buffer, byte_length);
114   return *this;
115 }
116 
CopyUTF8(const char * buffer,int byte_length)117 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
118   repr_.Copy(buffer, byte_length);
119   return *this;
120 }
121 
AppendUTF8(const char * utf8,int len)122 UnicodeText& UnicodeText::AppendUTF8(const char* utf8, int len) {
123   repr_.append(utf8, len);
124   return *this;
125 }
126 
data() const127 const char* UnicodeText::data() const { return repr_.data_; }
128 
size_bytes() const129 int UnicodeText::size_bytes() const { return repr_.size_; }
130 
131 namespace {
132 
133 enum {
134   RuneError = 0xFFFD,  // Decoding error in UTF.
135   RuneMax = 0x10FFFF,  // Maximum rune value.
136 };
137 
runetochar(const char32 rune,char * dest)138 int runetochar(const char32 rune, char* dest) {
139   // Convert to unsigned for range check.
140   uint32 c;
141 
142   // 1 char 00-7F
143   c = rune;
144   if (c <= 0x7F) {
145     dest[0] = static_cast<char>(c);
146     return 1;
147   }
148 
149   // 2 char 0080-07FF
150   if (c <= 0x07FF) {
151     dest[0] = 0xC0 | static_cast<char>(c >> 1 * 6);
152     dest[1] = 0x80 | (c & 0x3F);
153     return 2;
154   }
155 
156   // Range check
157   if (c > RuneMax) {
158     c = RuneError;
159   }
160 
161   // 3 char 0800-FFFF
162   if (c <= 0xFFFF) {
163     dest[0] = 0xE0 | static_cast<char>(c >> 2 * 6);
164     dest[1] = 0x80 | ((c >> 1 * 6) & 0x3F);
165     dest[2] = 0x80 | (c & 0x3F);
166     return 3;
167   }
168 
169   // 4 char 10000-1FFFFF
170   dest[0] = 0xF0 | static_cast<char>(c >> 3 * 6);
171   dest[1] = 0x80 | ((c >> 2 * 6) & 0x3F);
172   dest[2] = 0x80 | ((c >> 1 * 6) & 0x3F);
173   dest[3] = 0x80 | (c & 0x3F);
174   return 4;
175 }
176 
177 }  // namespace
178 
AppendCodepoint(char32 ch)179 UnicodeText& UnicodeText::AppendCodepoint(char32 ch) {
180   char str[4];
181   int char_len = runetochar(ch, str);
182   repr_.append(str, char_len);
183   return *this;
184 }
185 
clear()186 void UnicodeText::clear() { repr_.clear(); }
187 
size_codepoints() const188 int UnicodeText::size_codepoints() const {
189   return std::distance(begin(), end());
190 }
191 
empty() const192 bool UnicodeText::empty() const { return size_bytes() == 0; }
193 
is_valid() const194 bool UnicodeText::is_valid() const {
195   return IsValidUTF8(repr_.data_, repr_.size_);
196 }
197 
operator ==(const UnicodeText & other) const198 bool UnicodeText::operator==(const UnicodeText& other) const {
199   if (repr_.size_ != other.repr_.size_) {
200     return false;
201   }
202   return memcmp(repr_.data_, other.repr_.data_, repr_.size_) == 0;
203 }
204 
ToUTF8String() const205 std::string UnicodeText::ToUTF8String() const {
206   return UTF8Substring(begin(), end());
207 }
208 
UTF8Substring(const const_iterator & first,const const_iterator & last)209 std::string UnicodeText::UTF8Substring(const const_iterator& first,
210                                        const const_iterator& last) {
211   return std::string(first.it_, last.it_ - first.it_);
212 }
213 
~UnicodeText()214 UnicodeText::~UnicodeText() {}
215 
216 // ******************* UnicodeText::const_iterator *********************
217 
218 // The implementation of const_iterator would be nicer if it
219 // inherited from boost::iterator_facade
220 // (http://boost.org/libs/iterator/doc/iterator_facade.html).
221 
const_iterator()222 UnicodeText::const_iterator::const_iterator() : it_(0) {}
223 
operator =(const const_iterator & other)224 UnicodeText::const_iterator& UnicodeText::const_iterator::operator=(
225     const const_iterator& other) {
226   if (&other != this) it_ = other.it_;
227   return *this;
228 }
229 
begin() const230 UnicodeText::const_iterator UnicodeText::begin() const {
231   return const_iterator(repr_.data_);
232 }
233 
end() const234 UnicodeText::const_iterator UnicodeText::end() const {
235   return const_iterator(repr_.data_ + repr_.size_);
236 }
237 
operator <(const UnicodeText::const_iterator & lhs,const UnicodeText::const_iterator & rhs)238 bool operator<(const UnicodeText::const_iterator& lhs,
239                const UnicodeText::const_iterator& rhs) {
240   return lhs.it_ < rhs.it_;
241 }
242 
operator *() const243 char32 UnicodeText::const_iterator::operator*() const {
244   // (We could call chartorune here, but that does some
245   // error-checking, and we're guaranteed that our data is valid
246   // UTF-8. Also, we expect this routine to be called very often. So
247   // for speed, we do the calculation ourselves.)
248 
249   // Convert from UTF-8
250   unsigned char byte1 = static_cast<unsigned char>(it_[0]);
251   if (byte1 < 0x80) return byte1;
252 
253   unsigned char byte2 = static_cast<unsigned char>(it_[1]);
254   if (byte1 < 0xE0) return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F);
255 
256   unsigned char byte3 = static_cast<unsigned char>(it_[2]);
257   if (byte1 < 0xF0) {
258     return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F);
259   }
260 
261   unsigned char byte4 = static_cast<unsigned char>(it_[3]);
262   return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) |
263          ((byte3 & 0x3F) << 6) | (byte4 & 0x3F);
264 }
265 
operator ++()266 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
267   it_ += GetNumBytesForNonZeroUTF8Char(it_);
268   return *this;
269 }
270 
operator --()271 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
272   while (IsTrailByte(*--it_)) {
273   }
274   return *this;
275 }
276 
UTF8ToUnicodeText(const char * utf8_buf,int len,bool do_copy)277 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, bool do_copy) {
278   UnicodeText t;
279   if (do_copy) {
280     t.CopyUTF8(utf8_buf, len);
281   } else {
282     t.PointToUTF8(utf8_buf, len);
283   }
284   return t;
285 }
286 
UTF8ToUnicodeText(const char * utf8_buf,bool do_copy)287 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, bool do_copy) {
288   return UTF8ToUnicodeText(utf8_buf, strlen(utf8_buf), do_copy);
289 }
290 
UTF8ToUnicodeText(const std::string & str,bool do_copy)291 UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy) {
292   return UTF8ToUnicodeText(str.data(), str.size(), do_copy);
293 }
294 
UTF8ToUnicodeText(const std::string & str)295 UnicodeText UTF8ToUnicodeText(const std::string& str) {
296   return UTF8ToUnicodeText(str, /*do_copy=*/true);
297 }
298 
299 }  // namespace libtextclassifier2
300