1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utils/utf8/unicodetext.h"
18 
19 #include <string.h>
20 
21 #include <algorithm>
22 
23 #include "utils/base/logging.h"
24 #include "utils/strings/utf8.h"
25 #include "absl/strings/string_view.h"
26 
27 namespace libtextclassifier3 {
28 
29 // *************** Data representation **********
30 // Note: the copy constructor is undefined.
31 
operator =(Repr && src)32 UnicodeText::Repr& UnicodeText::Repr::operator=(Repr&& src) {
33   if (ours_ && data_) delete[] data_;
34   data_ = src.data_;
35   size_ = src.size_;
36   capacity_ = src.capacity_;
37   ours_ = src.ours_;
38   src.ours_ = false;
39   return *this;
40 }
41 
PointTo(const char * data,int size)42 void UnicodeText::Repr::PointTo(const char* data, int size) {
43   if (ours_ && data_) delete[] data_;  // If we owned the old buffer, free it.
44   data_ = const_cast<char*>(data);
45   size_ = size;
46   capacity_ = size;
47   ours_ = false;
48 }
49 
Copy(const char * data,int size)50 void UnicodeText::Repr::Copy(const char* data, int size) {
51   resize(size);
52   memcpy(data_, data, size);
53 }
54 
resize(int new_size)55 void UnicodeText::Repr::resize(int new_size) {
56   if (new_size == 0) {
57     clear();
58   } else {
59     if (!ours_ || new_size > capacity_) reserve(new_size);
60     // Clear the memory in the expanded part.
61     if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
62     size_ = new_size;
63     ours_ = true;
64   }
65 }
66 
reserve(int new_capacity)67 void UnicodeText::Repr::reserve(int new_capacity) {
68   // If there's already enough capacity, and we're an owner, do nothing.
69   if (capacity_ >= new_capacity && ours_) return;
70 
71   // Otherwise, allocate a new buffer.
72   capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20);
73   char* new_data = new char[capacity_];
74 
75   // If there is an old buffer, copy it into the new buffer.
76   if (data_) {
77     memcpy(new_data, data_, size_);
78     if (ours_) delete[] data_;  // If we owned the old buffer, free it.
79   }
80   data_ = new_data;
81   ours_ = true;  // We own the new buffer.
82   // size_ is unchanged.
83 }
84 
append(const char * bytes,int byte_length)85 void UnicodeText::Repr::append(const char* bytes, int byte_length) {
86   reserve(size_ + byte_length);
87   memcpy(data_ + size_, bytes, byte_length);
88   size_ += byte_length;
89 }
90 
clear()91 void UnicodeText::Repr::clear() {
92   if (ours_) delete[] data_;
93   data_ = nullptr;
94   size_ = capacity_ = 0;
95   ours_ = true;
96 }
97 
98 // *************** UnicodeText ******************
99 
UnicodeText()100 UnicodeText::UnicodeText() {}
101 
// Constructs a text from |src|. With |do_copy| set the bytes are duplicated
// via Copy(); otherwise the new text merely points at src's bytes.
UnicodeText::UnicodeText(const UnicodeText& src, bool do_copy) {
  if (!do_copy) {
    const Repr& viewed = src.repr_;
    repr_.PointTo(viewed.data_, viewed.size_);
    return;
  }
  Copy(src);
}
109 
operator =(UnicodeText && src)110 UnicodeText& UnicodeText::operator=(UnicodeText&& src) {
111   this->repr_ = std::move(src.repr_);
112   return *this;
113 }
114 
// Replaces the contents of this text with a copy of the bytes of |src|.
// Returns *this.
UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
  const Repr& from = src.repr_;
  repr_.Copy(from.data_, from.size_);
  return *this;
}
119 
PointToUTF8(const char * buffer,int byte_length)120 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
121   repr_.PointTo(buffer, byte_length);
122   return *this;
123 }
124 
CopyUTF8(const char * buffer,int byte_length)125 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
126   repr_.Copy(buffer, byte_length);
127   return *this;
128 }
129 
AppendUTF8(const char * utf8,int len)130 UnicodeText& UnicodeText::AppendUTF8(const char* utf8, int len) {
131   repr_.append(utf8, len);
132   return *this;
133 }
134 
data() const135 const char* UnicodeText::data() const { return repr_.data_; }
136 
size_bytes() const137 int UnicodeText::size_bytes() const { return repr_.size_; }
138 
139 namespace {
140 
141 enum {
142   RuneError = 0xFFFD,  // Decoding error in UTF.
143   RuneMax = 0x10FFFF,  // Maximum rune value.
144 };
145 
runetochar(const char32 rune,char * dest)146 int runetochar(const char32 rune, char* dest) {
147   // Convert to unsigned for range check.
148   uint32 c;
149 
150   // 1 char 00-7F
151   c = rune;
152   if (c <= 0x7F) {
153     dest[0] = static_cast<char>(c);
154     return 1;
155   }
156 
157   // 2 char 0080-07FF
158   if (c <= 0x07FF) {
159     dest[0] = 0xC0 | static_cast<char>(c >> 1 * 6);
160     dest[1] = 0x80 | (c & 0x3F);
161     return 2;
162   }
163 
164   // Range check
165   if (c > RuneMax) {
166     c = RuneError;
167   }
168 
169   // 3 char 0800-FFFF
170   if (c <= 0xFFFF) {
171     dest[0] = 0xE0 | static_cast<char>(c >> 2 * 6);
172     dest[1] = 0x80 | ((c >> 1 * 6) & 0x3F);
173     dest[2] = 0x80 | (c & 0x3F);
174     return 3;
175   }
176 
177   // 4 char 10000-1FFFFF
178   dest[0] = 0xF0 | static_cast<char>(c >> 3 * 6);
179   dest[1] = 0x80 | ((c >> 2 * 6) & 0x3F);
180   dest[2] = 0x80 | ((c >> 1 * 6) & 0x3F);
181   dest[3] = 0x80 | (c & 0x3F);
182   return 4;
183 }
184 
185 }  // namespace
186 
push_back(char32 ch)187 UnicodeText& UnicodeText::push_back(char32 ch) {
188   char str[4];
189   int char_len = runetochar(ch, str);
190   repr_.append(str, char_len);
191   return *this;
192 }
193 
clear()194 void UnicodeText::clear() { repr_.clear(); }
195 
size_codepoints() const196 int UnicodeText::size_codepoints() const {
197   return std::distance(begin(), end());
198 }
199 
empty() const200 bool UnicodeText::empty() const { return size_bytes() == 0; }
201 
is_valid() const202 bool UnicodeText::is_valid() const {
203   return IsValidUTF8(repr_.data_, repr_.size_);
204 }
205 
Codepoints() const206 std::vector<UnicodeText::const_iterator> UnicodeText::Codepoints() const {
207   std::vector<UnicodeText::const_iterator> codepoints;
208   for (auto it = begin(); it != end(); it++) {
209     codepoints.push_back(it);
210   }
211   return codepoints;
212 }
213 
CodepointsChar32() const214 std::vector<char32> UnicodeText::CodepointsChar32() const {
215   std::vector<char32> codepoints;
216   for (auto it = begin(); it != end(); it++) {
217     codepoints.push_back(*it);
218   }
219   return codepoints;
220 }
221 
operator ==(const UnicodeText & other) const222 bool UnicodeText::operator==(const UnicodeText& other) const {
223   if (repr_.size_ != other.repr_.size_) {
224     return false;
225   }
226   return memcmp(repr_.data_, other.repr_.data_, repr_.size_) == 0;
227 }
228 
ToUTF8String() const229 std::string UnicodeText::ToUTF8String() const {
230   return UTF8Substring(begin(), end());
231 }
232 
UTF8Substring(int begin_codepoint,int end_codepoint) const233 std::string UnicodeText::UTF8Substring(int begin_codepoint,
234                                        int end_codepoint) const {
235   auto span_begin = begin();
236   std::advance(span_begin, begin_codepoint);
237   auto span_end = span_begin;
238   std::advance(span_end, end_codepoint - begin_codepoint);
239   return UTF8Substring(span_begin, span_end);
240 }
241 
UTF8Substring(const const_iterator & it_begin,const const_iterator & it_end)242 std::string UnicodeText::UTF8Substring(const const_iterator& it_begin,
243                                        const const_iterator& it_end) {
244   return std::string(it_begin.it_, it_end.it_ - it_begin.it_);
245 }
246 
// Returns the part of |text| covering codepoints [begin_codepoint,
// end_codepoint). With |do_copy| the bytes are copied; otherwise the result
// points into |text|'s bytes. The indices are not range-checked here.
UnicodeText UnicodeText::Substring(const UnicodeText& text, int begin_codepoint,
                                   int end_codepoint, bool do_copy) {
  auto it_begin = text.begin();
  std::advance(it_begin, begin_codepoint);

  // Continue from it_begin instead of walking again from the start of the
  // text: this avoids decoding the first begin_codepoint codepoints twice and
  // matches how UTF8Substring(int, int) locates its end position.
  auto it_end = it_begin;
  std::advance(it_end, end_codepoint - begin_codepoint);

  return Substring(it_begin, it_end, do_copy);
}
256 
Substring(const const_iterator & it_begin,const const_iterator & it_end,bool do_copy)257 UnicodeText UnicodeText::Substring(const const_iterator& it_begin,
258                                    const const_iterator& it_end, bool do_copy) {
259   if (do_copy) {
260     UnicodeText result;
261     result.repr_.Copy(it_begin.it_, it_end.it_ - it_begin.it_);
262     return result;
263   } else {
264     UnicodeText result;
265     result.repr_.PointTo(it_begin.it_, it_end.it_ - it_begin.it_);
266     return result;
267   }
268 }
269 
~UnicodeText()270 UnicodeText::~UnicodeText() {}
271 
272 // ******************* UnicodeText::const_iterator *********************
273 
274 // The implementation of const_iterator would be nicer if it
275 // inherited from boost::iterator_facade
276 // (http://boost.org/libs/iterator/doc/iterator_facade.html).
277 
const_iterator()278 UnicodeText::const_iterator::const_iterator() : it_(nullptr) {}
279 
operator =(const const_iterator & other)280 UnicodeText::const_iterator& UnicodeText::const_iterator::operator=(
281     const const_iterator& other) {
282   if (&other != this) it_ = other.it_;
283   return *this;
284 }
285 
begin() const286 UnicodeText::const_iterator UnicodeText::begin() const {
287   return const_iterator(repr_.data_);
288 }
289 
end() const290 UnicodeText::const_iterator UnicodeText::end() const {
291   return const_iterator(repr_.data_ + repr_.size_);
292 }
293 
operator <(const UnicodeText::const_iterator & lhs,const UnicodeText::const_iterator & rhs)294 bool operator<(const UnicodeText::const_iterator& lhs,
295                const UnicodeText::const_iterator& rhs) {
296   return lhs.it_ < rhs.it_;
297 }
298 
operator *() const299 char32 UnicodeText::const_iterator::operator*() const {
300   // (We could call chartorune here, but that does some
301   // error-checking, and we're guaranteed that our data is valid
302   // UTF-8. Also, we expect this routine to be called very often. So
303   // for speed, we do the calculation ourselves.)
304   return ValidCharToRune(it_);
305 }
306 
operator ++()307 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
308   it_ += GetNumBytesForUTF8Char(it_);
309   return *this;
310 }
311 
operator --()312 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
313   while (IsTrailByte(*--it_)) {
314   }
315   return *this;
316 }
317 
UTF8ToUnicodeText(const char * utf8_buf,int len,bool do_copy)318 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, bool do_copy) {
319   UnicodeText t;
320   if (do_copy) {
321     t.CopyUTF8(utf8_buf, len);
322   } else {
323     t.PointToUTF8(utf8_buf, len);
324   }
325   return t;
326 }
327 
UTF8ToUnicodeText(const char * utf8_buf,bool do_copy)328 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, bool do_copy) {
329   return UTF8ToUnicodeText(utf8_buf, strlen(utf8_buf), do_copy);
330 }
331 
UTF8ToUnicodeText(const std::string & str,bool do_copy)332 UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy) {
333   return UTF8ToUnicodeText(str.data(), str.size(), do_copy);
334 }
335 
UTF8ToUnicodeText(StringPiece str,bool do_copy)336 UnicodeText UTF8ToUnicodeText(StringPiece str, bool do_copy) {
337   return UTF8ToUnicodeText(str.data(), str.size(), do_copy);
338 }
339 
UTF8ToUnicodeText(absl::string_view str,bool do_copy)340 UnicodeText UTF8ToUnicodeText(absl::string_view str, bool do_copy) {
341   return UTF8ToUnicodeText(str.data(), str.size(), do_copy);
342 }
343 
344 }  // namespace libtextclassifier3
345