1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/inspector/string-16.h"
6 
7 #include <algorithm>
8 #include <cctype>
9 #include <cstdlib>
10 #include <cstring>
11 #include <iomanip>
12 #include <limits>
13 #include <locale>
14 #include <sstream>
15 #include <string>
16 
17 #include "src/base/platform/platform.h"
18 #include "src/inspector/protocol-platform.h"
19 
20 namespace v8_inspector {
21 
22 namespace {
23 
isASCII(UChar c)24 bool isASCII(UChar c) { return !(c & ~0x7F); }
25 
isSpaceOrNewLine(UChar c)26 bool isSpaceOrNewLine(UChar c) {
27   return isASCII(c) && c <= ' ' && (c == ' ' || (c <= 0xD && c >= 0x9));
28 }
29 
charactersToInteger(const UChar * characters,size_t length,bool * ok=nullptr)30 int charactersToInteger(const UChar* characters, size_t length,
31                         bool* ok = nullptr) {
32   std::vector<char> buffer;
33   buffer.reserve(length + 1);
34   for (size_t i = 0; i < length; ++i) {
35     if (!isASCII(characters[i])) {
36       if (ok) *ok = false;
37       return 0;
38     }
39     buffer.push_back(static_cast<char>(characters[i]));
40   }
41   buffer.push_back('\0');
42 
43   char* endptr;
44   int64_t result =
45       static_cast<int64_t>(std::strtol(buffer.data(), &endptr, 10));
46   if (ok) {
47     *ok = !(*endptr) && result <= std::numeric_limits<int>::max() &&
48           result >= std::numeric_limits<int>::min();
49   }
50   return static_cast<int>(result);
51 }
52 
53 const UChar replacementCharacter = 0xFFFD;
54 using UChar32 = uint32_t;
55 
// Returns the total length (2, 3 or 4) of the UTF-8 sequence introduced by
// lead byte |b0|, or 0 if |b0| is not a multi-byte lead byte (that is, it is
// an ASCII byte, a continuation byte, or 0xF8 and above).
inline int inlineUTF8SequenceLengthNonASCII(char b0) {
  const unsigned char lead = static_cast<unsigned char>(b0);
  if (lead < 0xC0) return 0;  // Top two bits are not both set.
  if (lead < 0xE0) return 2;  // 110xxxxx
  if (lead < 0xF0) return 3;  // 1110xxxx
  if (lead < 0xF8) return 4;  // 11110xxx
  return 0;
}
63 
inlineUTF8SequenceLength(char b0)64 inline int inlineUTF8SequenceLength(char b0) {
65   return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
66 }
67 
// Mask OR-ed into the first byte of an encoded UTF-8 sequence, indexed by the
// total number of bytes in that sequence. The table has one entry per
// sequence length up to six bytes, although *legal* UTF-8 never uses more
// than four.
constexpr unsigned char firstByteMark[7] = {
    0x00,  // index 0
    0x00,  // 1 byte:  0xxxxxxx
    0xC0,  // 2 bytes: 110xxxxx
    0xE0,  // 3 bytes: 1110xxxx
    0xF0,  // 4 bytes: 11110xxx
    0xF8,  // 5 bytes: 111110xx
    0xFC,  // 6 bytes: 1111110x
};
75 
// Outcome reported by the UTF-8 <-> UTF-16 converters in this file.
enum ConversionResult {
  // Conversion successful.
  conversionOK,
  // Partial character in source, but hit the end of the input.
  sourceExhausted,
  // Insufficient room in the target for the conversion.
  targetExhausted,
  // Source sequence is illegal/malformed.
  sourceIllegal
};
82 
convertUTF16ToUTF8(const UChar ** sourceStart,const UChar * sourceEnd,char ** targetStart,char * targetEnd,bool strict)83 ConversionResult convertUTF16ToUTF8(const UChar** sourceStart,
84                                     const UChar* sourceEnd, char** targetStart,
85                                     char* targetEnd, bool strict) {
86   ConversionResult result = conversionOK;
87   const UChar* source = *sourceStart;
88   char* target = *targetStart;
89   while (source < sourceEnd) {
90     UChar32 ch;
91     uint32_t bytesToWrite = 0;
92     const UChar32 byteMask = 0xBF;
93     const UChar32 byteMark = 0x80;
94     const UChar* oldSource =
95         source;  // In case we have to back up because of target overflow.
96     ch = static_cast<uint16_t>(*source++);
97     // If we have a surrogate pair, convert to UChar32 first.
98     if (ch >= 0xD800 && ch <= 0xDBFF) {
99       // If the 16 bits following the high surrogate are in the source buffer...
100       if (source < sourceEnd) {
101         UChar32 ch2 = static_cast<uint16_t>(*source);
102         // If it's a low surrogate, convert to UChar32.
103         if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
104           ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
105           ++source;
106         } else if (strict) {  // it's an unpaired high surrogate
107           --source;           // return to the illegal value itself
108           result = sourceIllegal;
109           break;
110         }
111       } else {     // We don't have the 16 bits following the high surrogate.
112         --source;  // return to the high surrogate
113         result = sourceExhausted;
114         break;
115       }
116     } else if (strict) {
117       // UTF-16 surrogate values are illegal in UTF-32
118       if (ch >= 0xDC00 && ch <= 0xDFFF) {
119         --source;  // return to the illegal value itself
120         result = sourceIllegal;
121         break;
122       }
123     }
124     // Figure out how many bytes the result will require
125     if (ch < (UChar32)0x80) {
126       bytesToWrite = 1;
127     } else if (ch < (UChar32)0x800) {
128       bytesToWrite = 2;
129     } else if (ch < (UChar32)0x10000) {
130       bytesToWrite = 3;
131     } else if (ch < (UChar32)0x110000) {
132       bytesToWrite = 4;
133     } else {
134       bytesToWrite = 3;
135       ch = replacementCharacter;
136     }
137 
138     target += bytesToWrite;
139     if (target > targetEnd) {
140       source = oldSource;  // Back up source pointer!
141       target -= bytesToWrite;
142       result = targetExhausted;
143       break;
144     }
145     switch (bytesToWrite) {  // note: everything falls through.
146       case 4:
147         *--target = static_cast<char>((ch | byteMark) & byteMask);
148         ch >>= 6;
149       case 3:
150         *--target = static_cast<char>((ch | byteMark) & byteMask);
151         ch >>= 6;
152       case 2:
153         *--target = static_cast<char>((ch | byteMark) & byteMask);
154         ch >>= 6;
155       case 1:
156         *--target = static_cast<char>(ch | firstByteMark[bytesToWrite]);
157     }
158     target += bytesToWrite;
159   }
160   *sourceStart = source;
161   *targetStart = target;
162   return result;
163 }
164 
// The helper macros below carry ICU-style names and "@stable ICU" tags;
// presumably they are local copies of the ICU definitions so that this file
// does not need ICU headers -- TODO confirm. Each argument is expected to be
// a 32-bit code point.

/**
 * Is this code point a BMP code point (U+0000..U+ffff)?
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.8
 */
#define U_IS_BMP(c) ((uint32_t)(c) <= 0xffff)

/**
 * Is this code point a supplementary code point (U+10000..U+10ffff)?
 * Values below 0x10000 wrap around in the unsigned subtraction and therefore
 * fail the comparison as well.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.8
 */
#define U_IS_SUPPLEMENTARY(c) ((uint32_t)((c)-0x10000) <= 0xfffff)

/**
 * Is this code point a surrogate (U+d800..U+dfff)?
 * Masking off the low 11 bits leaves 0xd800 exactly for that range.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_SURROGATE(c) (((c)&0xfffff800) == 0xd800)

/**
 * Get the lead surrogate (0xd800..0xdbff) for a
 * supplementary code point (0x10000..0x10ffff).
 * The constant 0xd7c0 is 0xd800 - (0x10000 >> 10), which folds the
 * subtraction of 0x10000 into the addition of the surrogate base.
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define U16_LEAD(supplementary) (UChar)(((supplementary) >> 10) + 0xd7c0)

/**
 * Get the trail surrogate (0xdc00..0xdfff) for a
 * supplementary code point (0x10000..0x10ffff).
 * Only the low 10 bits of the code point are used.
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff) | 0xdc00)
206 
// Checks whether the |length| bytes starting at |source| form a single
// well-formed UTF-8 sequence.
//
// This must be called with the length pre-determined by the first byte; the
// function does not verify that |length| matches the lead byte. A length
// outside 1..4 always yields false, since the Unicode definition of UTF-8
// stops at four-byte sequences.
static bool isLegalUTF8(const unsigned char* source, int length) {
  if (length < 1 || length > 4) return false;

  const unsigned char lead = source[0];
  // 0x80..0xC1 are continuation bytes or leads of overlong two-byte forms;
  // leads above 0xF4 would encode values beyond U+10FFFF.
  if ((lead >= 0x80 && lead < 0xC2) || lead > 0xF4) return false;
  if (length == 1) return true;

  // The second byte is a continuation byte (0x80..0xBF), narrowed further for
  // a few lead bytes. Both bounds are always enforced: checking only the
  // narrowed bound for 0xED and 0xF4 would let through second bytes below
  // 0x80 (for example ED 20 80), which are not continuation bytes at all.
  unsigned char secondMin = 0x80;
  unsigned char secondMax = 0xBF;
  switch (lead) {
    case 0xE0:
      secondMin = 0xA0;  // Exclude overlong three-byte forms.
      break;
    case 0xED:
      secondMax = 0x9F;  // Exclude the surrogate range U+D800..U+DFFF.
      break;
    case 0xF0:
      secondMin = 0x90;  // Exclude overlong four-byte forms.
      break;
    case 0xF4:
      secondMax = 0x8F;  // Exclude values above U+10FFFF.
      break;
    default:
      break;
  }
  if (source[1] < secondMin || source[1] > secondMax) return false;

  // Any remaining bytes must be plain continuation bytes.
  for (int i = 2; i < length; ++i) {
    if (source[i] < 0x80 || source[i] > 0xBF) return false;
  }
  return true;
}
248 
249 // Magic values subtracted from a buffer value during UTF8 conversion.
250 // This table contains as many values as there might be trailing bytes
251 // in a UTF-8 sequence.
252 static const UChar32 offsetsFromUTF8[6] = {0x00000000UL,
253                                            0x00003080UL,
254                                            0x000E2080UL,
255                                            0x03C82080UL,
256                                            static_cast<UChar32>(0xFA082080UL),
257                                            static_cast<UChar32>(0x82082080UL)};
258 
readUTF8Sequence(const char * & sequence,size_t length)259 static inline UChar32 readUTF8Sequence(const char*& sequence, size_t length) {
260   UChar32 character = 0;
261 
262   // The cases all fall through.
263   switch (length) {
264     case 6:
265       character += static_cast<unsigned char>(*sequence++);
266       character <<= 6;
267     case 5:
268       character += static_cast<unsigned char>(*sequence++);
269       character <<= 6;
270     case 4:
271       character += static_cast<unsigned char>(*sequence++);
272       character <<= 6;
273     case 3:
274       character += static_cast<unsigned char>(*sequence++);
275       character <<= 6;
276     case 2:
277       character += static_cast<unsigned char>(*sequence++);
278       character <<= 6;
279     case 1:
280       character += static_cast<unsigned char>(*sequence++);
281   }
282 
283   return character - offsetsFromUTF8[length - 1];
284 }
285 
convertUTF8ToUTF16(const char ** sourceStart,const char * sourceEnd,UChar ** targetStart,UChar * targetEnd,bool * sourceAllASCII,bool strict)286 ConversionResult convertUTF8ToUTF16(const char** sourceStart,
287                                     const char* sourceEnd, UChar** targetStart,
288                                     UChar* targetEnd, bool* sourceAllASCII,
289                                     bool strict) {
290   ConversionResult result = conversionOK;
291   const char* source = *sourceStart;
292   UChar* target = *targetStart;
293   UChar orAllData = 0;
294   while (source < sourceEnd) {
295     int utf8SequenceLength = inlineUTF8SequenceLength(*source);
296     if (sourceEnd - source < utf8SequenceLength) {
297       result = sourceExhausted;
298       break;
299     }
300     // Do this check whether lenient or strict
301     if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source),
302                      utf8SequenceLength)) {
303       result = sourceIllegal;
304       break;
305     }
306 
307     UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
308 
309     if (target >= targetEnd) {
310       source -= utf8SequenceLength;  // Back up source pointer!
311       result = targetExhausted;
312       break;
313     }
314 
315     if (U_IS_BMP(character)) {
316       // UTF-16 surrogate values are illegal in UTF-32
317       if (U_IS_SURROGATE(character)) {
318         if (strict) {
319           source -= utf8SequenceLength;  // return to the illegal value itself
320           result = sourceIllegal;
321           break;
322         }
323         *target++ = replacementCharacter;
324         orAllData |= replacementCharacter;
325       } else {
326         *target++ = static_cast<UChar>(character);  // normal case
327         orAllData |= character;
328       }
329     } else if (U_IS_SUPPLEMENTARY(character)) {
330       // target is a character in range 0xFFFF - 0x10FFFF
331       if (target + 1 >= targetEnd) {
332         source -= utf8SequenceLength;  // Back up source pointer!
333         result = targetExhausted;
334         break;
335       }
336       *target++ = U16_LEAD(character);
337       *target++ = U16_TRAIL(character);
338       orAllData = 0xffff;
339     } else {
340       if (strict) {
341         source -= utf8SequenceLength;  // return to the start
342         result = sourceIllegal;
343         break;  // Bail out; shouldn't continue
344       } else {
345         *target++ = replacementCharacter;
346         orAllData |= replacementCharacter;
347       }
348     }
349   }
350   *sourceStart = source;
351   *targetStart = target;
352 
353   if (sourceAllASCII) *sourceAllASCII = !(orAllData & ~0x7f);
354 
355   return result;
356 }
357 
358 // Helper to write a three-byte UTF-8 code point to the buffer, caller must
359 // check room is available.
putUTF8Triple(char * & buffer,UChar ch)360 static inline void putUTF8Triple(char*& buffer, UChar ch) {
361   *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
362   *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
363   *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
364 }
365 
366 }  // namespace
367 
368 // static
fromInteger(int number)369 String16 String16::fromInteger(int number) {
370   const size_t kBufferSize = 50;
371   char buffer[kBufferSize];
372   v8::base::OS::SNPrintF(buffer, kBufferSize, "%d", number);
373   return String16(buffer);
374 }
375 
376 // static
fromInteger(size_t number)377 String16 String16::fromInteger(size_t number) {
378   const size_t kBufferSize = 50;
379   char buffer[kBufferSize];
380 #if !defined(_WIN32) && !defined(_WIN64)
381   v8::base::OS::SNPrintF(buffer, kBufferSize, "%zu", number);
382 #else
383   v8::base::OS::SNPrintF(buffer, kBufferSize, "%Iu", number);
384 #endif
385   return String16(buffer);
386 }
387 
388 // static
fromDouble(double number)389 String16 String16::fromDouble(double number) {
390   std::ostringstream s;
391   s.imbue(std::locale("C"));
392   s << std::fixed << std::setprecision(std::numeric_limits<double>::digits10)
393     << number;
394   return String16(s.str().c_str());
395 }
396 
397 // static
fromDouble(double number,int precision)398 String16 String16::fromDouble(double number, int precision) {
399   std::ostringstream s;
400   s.imbue(std::locale("C"));
401   s << std::fixed << std::setprecision(precision) << number;
402   return String16(s.str().c_str());
403 }
404 
toInteger(bool * ok) const405 int String16::toInteger(bool* ok) const {
406   return charactersToInteger(characters16(), length(), ok);
407 }
408 
stripWhiteSpace() const409 String16 String16::stripWhiteSpace() const {
410   if (!length()) return String16();
411 
412   size_t start = 0;
413   size_t end = length() - 1;
414 
415   // skip white space from start
416   while (start <= end && isSpaceOrNewLine(characters16()[start])) ++start;
417 
418   // only white space
419   if (start > end) return String16();
420 
421   // skip white space from end
422   while (end && isSpaceOrNewLine(characters16()[end])) --end;
423 
424   if (!start && end == length() - 1) return *this;
425   return String16(characters16() + start, end + 1 - start);
426 }
427 
String16Builder()428 String16Builder::String16Builder() {}
429 
append(const String16 & s)430 void String16Builder::append(const String16& s) {
431   m_buffer.insert(m_buffer.end(), s.characters16(),
432                   s.characters16() + s.length());
433 }
434 
append(UChar c)435 void String16Builder::append(UChar c) { m_buffer.push_back(c); }
436 
append(char c)437 void String16Builder::append(char c) {
438   UChar u = c;
439   m_buffer.push_back(u);
440 }
441 
append(const UChar * characters,size_t length)442 void String16Builder::append(const UChar* characters, size_t length) {
443   m_buffer.insert(m_buffer.end(), characters, characters + length);
444 }
445 
append(const char * characters,size_t length)446 void String16Builder::append(const char* characters, size_t length) {
447   m_buffer.insert(m_buffer.end(), characters, characters + length);
448 }
449 
appendNumber(int number)450 void String16Builder::appendNumber(int number) {
451   const int kBufferSize = 11;
452   char buffer[kBufferSize];
453   int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%d", number);
454   DCHECK_GT(kBufferSize, chars);
455   m_buffer.insert(m_buffer.end(), buffer, buffer + chars);
456 }
457 
appendNumber(size_t number)458 void String16Builder::appendNumber(size_t number) {
459   const int kBufferSize = 20;
460   char buffer[kBufferSize];
461 #if !defined(_WIN32) && !defined(_WIN64)
462   int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%zu", number);
463 #else
464   int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%Iu", number);
465 #endif
466   DCHECK_GT(kBufferSize, chars);
467   m_buffer.insert(m_buffer.end(), buffer, buffer + chars);
468 }
469 
toString()470 String16 String16Builder::toString() {
471   return String16(m_buffer.data(), m_buffer.size());
472 }
473 
reserveCapacity(size_t capacity)474 void String16Builder::reserveCapacity(size_t capacity) {
475   m_buffer.reserve(capacity);
476 }
477 
fromUTF8(const char * stringStart,size_t length)478 String16 String16::fromUTF8(const char* stringStart, size_t length) {
479   if (!stringStart || !length) return String16();
480 
481   std::vector<UChar> buffer(length);
482   UChar* bufferStart = buffer.data();
483 
484   UChar* bufferCurrent = bufferStart;
485   const char* stringCurrent = stringStart;
486   if (convertUTF8ToUTF16(&stringCurrent, stringStart + length, &bufferCurrent,
487                          bufferCurrent + buffer.size(), 0,
488                          true) != conversionOK)
489     return String16();
490 
491   size_t utf16Length = bufferCurrent - bufferStart;
492   return String16(bufferStart, utf16Length);
493 }
494 
utf8() const495 std::string String16::utf8() const {
496   size_t length = this->length();
497 
498   if (!length) return std::string("");
499 
500   // Allocate a buffer big enough to hold all the characters
501   // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
502   // Optimization ideas, if we find this function is hot:
503   //  * We could speculatively create a CStringBuffer to contain 'length'
504   //    characters, and resize if necessary (i.e. if the buffer contains
505   //    non-ascii characters). (Alternatively, scan the buffer first for
506   //    ascii characters, so we know this will be sufficient).
507   //  * We could allocate a CStringBuffer with an appropriate size to
508   //    have a good chance of being able to write the string into the
509   //    buffer without reallocing (say, 1.5 x length).
510   if (length > std::numeric_limits<unsigned>::max() / 3) return std::string();
511   std::vector<char> bufferVector(length * 3);
512   char* buffer = bufferVector.data();
513   const UChar* characters = m_impl.data();
514 
515   ConversionResult result =
516       convertUTF16ToUTF8(&characters, characters + length, &buffer,
517                          buffer + bufferVector.size(), false);
518   DCHECK(
519       result !=
520       targetExhausted);  // (length * 3) should be sufficient for any conversion
521 
522   // Only produced from strict conversion.
523   DCHECK(result != sourceIllegal);
524 
525   // Check for an unconverted high surrogate.
526   if (result == sourceExhausted) {
527     // This should be one unpaired high surrogate. Treat it the same
528     // was as an unpaired high surrogate would have been handled in
529     // the middle of a string with non-strict conversion - which is
530     // to say, simply encode it to UTF-8.
531     DCHECK((characters + 1) == (m_impl.data() + length));
532     DCHECK((*characters >= 0xD800) && (*characters <= 0xDBFF));
533     // There should be room left, since one UChar hasn't been
534     // converted.
535     DCHECK((buffer + 3) <= (buffer + bufferVector.size()));
536     putUTF8Triple(buffer, *characters);
537   }
538 
539   return std::string(bufferVector.data(), buffer - bufferVector.data());
540 }
541 
542 }  // namespace v8_inspector
543