1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 // from google3/strings/strutil.cc
32 
33 #include <google/protobuf/stubs/strutil.h>
34 #include <google/protobuf/stubs/mathlimits.h>
35 
36 #include <errno.h>
37 #include <float.h>    // FLT_DIG and DBL_DIG
38 #include <limits>
39 #include <limits.h>
40 #include <stdio.h>
41 #include <iterator>
42 
43 #include <google/protobuf/stubs/stl_util.h>
44 
45 #ifdef _WIN32
46 // MSVC has only _snprintf, not snprintf.
47 //
48 // MinGW has both snprintf and _snprintf, but they appear to be different
49 // functions.  The former is buggy.  When invoked like so:
50 //   char buffer[32];
51 //   snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f);
52 // it prints "1.23000e+10".  This is plainly wrong:  %g should never print
53 // trailing zeros after the decimal point.  For some reason this bug only
54 // occurs with some input values, not all.  In any case, _snprintf does the
55 // right thing, so we use it.
56 #define snprintf _snprintf
57 #endif
58 
59 namespace google {
60 namespace protobuf {
61 
62 // These are defined as macros on some platforms.  #undef them so that we can
63 // redefine them.
64 #undef isxdigit
65 #undef isprint
66 
67 // The definitions of these in ctype.h change based on locale.  Since our
68 // string manipulation is all in relation to the protocol buffer and C++
69 // languages, we always want to use the C locale.  So, we re-define these
70 // exactly as we want them.
isxdigit(char c)71 inline bool isxdigit(char c) {
72   return ('0' <= c && c <= '9') ||
73          ('a' <= c && c <= 'f') ||
74          ('A' <= c && c <= 'F');
75 }
76 
isprint(char c)77 inline bool isprint(char c) {
78   return c >= 0x20 && c <= 0x7E;
79 }
80 
81 // ----------------------------------------------------------------------
82 // StripString
83 //    Replaces any occurrence of the character 'remove' (or the characters
84 //    in 'remove') with the character 'replacewith'.
85 // ----------------------------------------------------------------------
StripString(string * s,const char * remove,char replacewith)86 void StripString(string* s, const char* remove, char replacewith) {
87   const char * str_start = s->c_str();
88   const char * str = str_start;
89   for (str = strpbrk(str, remove);
90        str != NULL;
91        str = strpbrk(str + 1, remove)) {
92     (*s)[str - str_start] = replacewith;
93   }
94 }
95 
StripWhitespace(string * str)96 void StripWhitespace(string* str) {
97   int str_length = str->length();
98 
99   // Strip off leading whitespace.
100   int first = 0;
101   while (first < str_length && ascii_isspace(str->at(first))) {
102     ++first;
103   }
104   // If entire string is white space.
105   if (first == str_length) {
106     str->clear();
107     return;
108   }
109   if (first > 0) {
110     str->erase(0, first);
111     str_length -= first;
112   }
113 
114   // Strip off trailing whitespace.
115   int last = str_length - 1;
116   while (last >= 0 && ascii_isspace(str->at(last))) {
117     --last;
118   }
119   if (last != (str_length - 1) && last >= 0) {
120     str->erase(last + 1, string::npos);
121   }
122 }
123 
124 // ----------------------------------------------------------------------
125 // StringReplace()
126 //    Replace the "old" pattern with the "new" pattern in a string,
127 //    and append the result to "res".  If replace_all is false,
128 //    it only replaces the first instance of "old."
129 // ----------------------------------------------------------------------
130 
StringReplace(const string & s,const string & oldsub,const string & newsub,bool replace_all,string * res)131 void StringReplace(const string& s, const string& oldsub,
132                    const string& newsub, bool replace_all,
133                    string* res) {
134   if (oldsub.empty()) {
135     res->append(s);  // if empty, append the given string.
136     return;
137   }
138 
139   string::size_type start_pos = 0;
140   string::size_type pos;
141   do {
142     pos = s.find(oldsub, start_pos);
143     if (pos == string::npos) {
144       break;
145     }
146     res->append(s, start_pos, pos - start_pos);
147     res->append(newsub);
148     start_pos = pos + oldsub.size();  // start searching again after the "old"
149   } while (replace_all);
150   res->append(s, start_pos, s.length() - start_pos);
151 }
152 
153 // ----------------------------------------------------------------------
154 // StringReplace()
155 //    Give me a string and two patterns "old" and "new", and I replace
156 //    the first instance of "old" in the string with "new", if it
157 //    exists.  If "global" is true; call this repeatedly until it
158 //    fails.  RETURN a new string, regardless of whether the replacement
159 //    happened or not.
160 // ----------------------------------------------------------------------
161 
StringReplace(const string & s,const string & oldsub,const string & newsub,bool replace_all)162 string StringReplace(const string& s, const string& oldsub,
163                      const string& newsub, bool replace_all) {
164   string ret;
165   StringReplace(s, oldsub, newsub, replace_all, &ret);
166   return ret;
167 }
168 
169 // ----------------------------------------------------------------------
170 // SplitStringUsing()
171 //    Split a string using a character delimiter. Append the components
172 //    to 'result'.
173 //
174 // Note: For multi-character delimiters, this routine will split on *ANY* of
175 // the characters in the string, not the entire string as a single delimiter.
176 // ----------------------------------------------------------------------
177 template <typename ITR>
178 static inline
SplitStringToIteratorUsing(const string & full,const char * delim,ITR & result)179 void SplitStringToIteratorUsing(const string& full,
180                                 const char* delim,
181                                 ITR& result) {
182   // Optimize the common case where delim is a single character.
183   if (delim[0] != '\0' && delim[1] == '\0') {
184     char c = delim[0];
185     const char* p = full.data();
186     const char* end = p + full.size();
187     while (p != end) {
188       if (*p == c) {
189         ++p;
190       } else {
191         const char* start = p;
192         while (++p != end && *p != c);
193         *result++ = string(start, p - start);
194       }
195     }
196     return;
197   }
198 
199   string::size_type begin_index, end_index;
200   begin_index = full.find_first_not_of(delim);
201   while (begin_index != string::npos) {
202     end_index = full.find_first_of(delim, begin_index);
203     if (end_index == string::npos) {
204       *result++ = full.substr(begin_index);
205       return;
206     }
207     *result++ = full.substr(begin_index, (end_index - begin_index));
208     begin_index = full.find_first_not_of(delim, end_index);
209   }
210 }
211 
SplitStringUsing(const string & full,const char * delim,vector<string> * result)212 void SplitStringUsing(const string& full,
213                       const char* delim,
214                       vector<string>* result) {
215   back_insert_iterator< vector<string> > it(*result);
216   SplitStringToIteratorUsing(full, delim, it);
217 }
218 
219 // Split a string using a character delimiter. Append the components
220 // to 'result'.  If there are consecutive delimiters, this function
221 // will return corresponding empty strings. The string is split into
222 // at most the specified number of pieces greedily. This means that the
223 // last piece may possibly be split further. To split into as many pieces
224 // as possible, specify 0 as the number of pieces.
225 //
226 // If "full" is the empty string, yields an empty string as the only value.
227 //
228 // If "pieces" is negative for some reason, it returns the whole string
229 // ----------------------------------------------------------------------
230 template <typename StringType, typename ITR>
231 static inline
SplitStringToIteratorAllowEmpty(const StringType & full,const char * delim,int pieces,ITR & result)232 void SplitStringToIteratorAllowEmpty(const StringType& full,
233                                      const char* delim,
234                                      int pieces,
235                                      ITR& result) {
236   string::size_type begin_index, end_index;
237   begin_index = 0;
238 
239   for (int i = 0; (i < pieces-1) || (pieces == 0); i++) {
240     end_index = full.find_first_of(delim, begin_index);
241     if (end_index == string::npos) {
242       *result++ = full.substr(begin_index);
243       return;
244     }
245     *result++ = full.substr(begin_index, (end_index - begin_index));
246     begin_index = end_index + 1;
247   }
248   *result++ = full.substr(begin_index);
249 }
250 
SplitStringAllowEmpty(const string & full,const char * delim,vector<string> * result)251 void SplitStringAllowEmpty(const string& full, const char* delim,
252                            vector<string>* result) {
253   back_insert_iterator<vector<string> > it(*result);
254   SplitStringToIteratorAllowEmpty(full, delim, 0, it);
255 }
256 
257 // ----------------------------------------------------------------------
258 // JoinStrings()
259 //    This merges a vector of string components with delim inserted
260 //    as separaters between components.
261 //
262 // ----------------------------------------------------------------------
263 template <class ITERATOR>
JoinStringsIterator(const ITERATOR & start,const ITERATOR & end,const char * delim,string * result)264 static void JoinStringsIterator(const ITERATOR& start,
265                                 const ITERATOR& end,
266                                 const char* delim,
267                                 string* result) {
268   GOOGLE_CHECK(result != NULL);
269   result->clear();
270   int delim_length = strlen(delim);
271 
272   // Precompute resulting length so we can reserve() memory in one shot.
273   int length = 0;
274   for (ITERATOR iter = start; iter != end; ++iter) {
275     if (iter != start) {
276       length += delim_length;
277     }
278     length += iter->size();
279   }
280   result->reserve(length);
281 
282   // Now combine everything.
283   for (ITERATOR iter = start; iter != end; ++iter) {
284     if (iter != start) {
285       result->append(delim, delim_length);
286     }
287     result->append(iter->data(), iter->size());
288   }
289 }
290 
JoinStrings(const vector<string> & components,const char * delim,string * result)291 void JoinStrings(const vector<string>& components,
292                  const char* delim,
293                  string * result) {
294   JoinStringsIterator(components.begin(), components.end(), delim, result);
295 }
296 
297 // ----------------------------------------------------------------------
298 // UnescapeCEscapeSequences()
299 //    This does all the unescaping that C does: \ooo, \r, \n, etc
300 //    Returns length of resulting string.
301 //    The implementation of \x parses any positive number of hex digits,
302 //    but it is an error if the value requires more than 8 bits, and the
303 //    result is truncated to 8 bits.
304 //
305 //    The second call stores its errors in a supplied string vector.
306 //    If the string vector pointer is NULL, it reports the errors with LOG().
307 // ----------------------------------------------------------------------
308 
309 #define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7'))
310 
311 // Protocol buffers doesn't ever care about errors, but I don't want to remove
312 // the code.
313 #define LOG_STRING(LEVEL, VECTOR) GOOGLE_LOG_IF(LEVEL, false)
314 
UnescapeCEscapeSequences(const char * source,char * dest)315 int UnescapeCEscapeSequences(const char* source, char* dest) {
316   return UnescapeCEscapeSequences(source, dest, NULL);
317 }
318 
UnescapeCEscapeSequences(const char * source,char * dest,vector<string> * errors)319 int UnescapeCEscapeSequences(const char* source, char* dest,
320                              vector<string> *errors) {
321   GOOGLE_DCHECK(errors == NULL) << "Error reporting not implemented.";
322 
323   char* d = dest;
324   const char* p = source;
325 
326   // Small optimization for case where source = dest and there's no escaping
327   while ( p == d && *p != '\0' && *p != '\\' )
328     p++, d++;
329 
330   while (*p != '\0') {
331     if (*p != '\\') {
332       *d++ = *p++;
333     } else {
334       switch ( *++p ) {                    // skip past the '\\'
335         case '\0':
336           LOG_STRING(ERROR, errors) << "String cannot end with \\";
337           *d = '\0';
338           return d - dest;   // we're done with p
339         case 'a':  *d++ = '\a';  break;
340         case 'b':  *d++ = '\b';  break;
341         case 'f':  *d++ = '\f';  break;
342         case 'n':  *d++ = '\n';  break;
343         case 'r':  *d++ = '\r';  break;
344         case 't':  *d++ = '\t';  break;
345         case 'v':  *d++ = '\v';  break;
346         case '\\': *d++ = '\\';  break;
347         case '?':  *d++ = '\?';  break;    // \?  Who knew?
348         case '\'': *d++ = '\'';  break;
349         case '"':  *d++ = '\"';  break;
350         case '0': case '1': case '2': case '3':  // octal digit: 1 to 3 digits
351         case '4': case '5': case '6': case '7': {
352           char ch = *p - '0';
353           if ( IS_OCTAL_DIGIT(p[1]) )
354             ch = ch * 8 + *++p - '0';
355           if ( IS_OCTAL_DIGIT(p[1]) )      // safe (and easy) to do this twice
356             ch = ch * 8 + *++p - '0';      // now points at last digit
357           *d++ = ch;
358           break;
359         }
360         case 'x': case 'X': {
361           if (!isxdigit(p[1])) {
362             if (p[1] == '\0') {
363               LOG_STRING(ERROR, errors) << "String cannot end with \\x";
364             } else {
365               LOG_STRING(ERROR, errors) <<
366                 "\\x cannot be followed by non-hex digit: \\" << *p << p[1];
367             }
368             break;
369           }
370           unsigned int ch = 0;
371           const char *hex_start = p;
372           while (isxdigit(p[1]))  // arbitrarily many hex digits
373             ch = (ch << 4) + hex_digit_to_int(*++p);
374           if (ch > 0xFF)
375             LOG_STRING(ERROR, errors) << "Value of " <<
376               "\\" << string(hex_start, p+1-hex_start) << " exceeds 8 bits";
377           *d++ = ch;
378           break;
379         }
380 #if 0  // TODO(kenton):  Support \u and \U?  Requires runetochar().
381         case 'u': {
382           // \uhhhh => convert 4 hex digits to UTF-8
383           char32 rune = 0;
384           const char *hex_start = p;
385           for (int i = 0; i < 4; ++i) {
386             if (isxdigit(p[1])) {  // Look one char ahead.
387               rune = (rune << 4) + hex_digit_to_int(*++p);  // Advance p.
388             } else {
389               LOG_STRING(ERROR, errors)
390                 << "\\u must be followed by 4 hex digits: \\"
391                 <<  string(hex_start, p+1-hex_start);
392               break;
393             }
394           }
395           d += runetochar(d, &rune);
396           break;
397         }
398         case 'U': {
399           // \Uhhhhhhhh => convert 8 hex digits to UTF-8
400           char32 rune = 0;
401           const char *hex_start = p;
402           for (int i = 0; i < 8; ++i) {
403             if (isxdigit(p[1])) {  // Look one char ahead.
404               // Don't change rune until we're sure this
405               // is within the Unicode limit, but do advance p.
406               char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
407               if (newrune > 0x10FFFF) {
408                 LOG_STRING(ERROR, errors)
409                   << "Value of \\"
410                   << string(hex_start, p + 1 - hex_start)
411                   << " exceeds Unicode limit (0x10FFFF)";
412                 break;
413               } else {
414                 rune = newrune;
415               }
416             } else {
417               LOG_STRING(ERROR, errors)
418                 << "\\U must be followed by 8 hex digits: \\"
419                 <<  string(hex_start, p+1-hex_start);
420               break;
421             }
422           }
423           d += runetochar(d, &rune);
424           break;
425         }
426 #endif
427         default:
428           LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p;
429       }
430       p++;                                 // read past letter we escaped
431     }
432   }
433   *d = '\0';
434   return d - dest;
435 }
436 
437 // ----------------------------------------------------------------------
438 // UnescapeCEscapeString()
439 //    This does the same thing as UnescapeCEscapeSequences, but creates
440 //    a new string. The caller does not need to worry about allocating
441 //    a dest buffer. This should be used for non performance critical
442 //    tasks such as printing debug messages. It is safe for src and dest
443 //    to be the same.
444 //
445 //    The second call stores its errors in a supplied string vector.
446 //    If the string vector pointer is NULL, it reports the errors with LOG().
447 //
448 //    In the first and second calls, the length of dest is returned. In the
449 //    the third call, the new string is returned.
450 // ----------------------------------------------------------------------
UnescapeCEscapeString(const string & src,string * dest)451 int UnescapeCEscapeString(const string& src, string* dest) {
452   return UnescapeCEscapeString(src, dest, NULL);
453 }
454 
UnescapeCEscapeString(const string & src,string * dest,vector<string> * errors)455 int UnescapeCEscapeString(const string& src, string* dest,
456                           vector<string> *errors) {
457   scoped_array<char> unescaped(new char[src.size() + 1]);
458   int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), errors);
459   GOOGLE_CHECK(dest);
460   dest->assign(unescaped.get(), len);
461   return len;
462 }
463 
UnescapeCEscapeString(const string & src)464 string UnescapeCEscapeString(const string& src) {
465   scoped_array<char> unescaped(new char[src.size() + 1]);
466   int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), NULL);
467   return string(unescaped.get(), len);
468 }
469 
470 // ----------------------------------------------------------------------
471 // CEscapeString()
472 // CHexEscapeString()
473 //    Copies 'src' to 'dest', escaping dangerous characters using
474 //    C-style escape sequences. This is very useful for preparing query
475 //    flags. 'src' and 'dest' should not overlap. The 'Hex' version uses
476 //    hexadecimal rather than octal sequences.
477 //    Returns the number of bytes written to 'dest' (not including the \0)
478 //    or -1 if there was insufficient space.
479 //
480 //    Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped.
481 // ----------------------------------------------------------------------
CEscapeInternal(const char * src,int src_len,char * dest,int dest_len,bool use_hex,bool utf8_safe)482 int CEscapeInternal(const char* src, int src_len, char* dest,
483                     int dest_len, bool use_hex, bool utf8_safe) {
484   const char* src_end = src + src_len;
485   int used = 0;
486   bool last_hex_escape = false; // true if last output char was \xNN
487 
488   for (; src < src_end; src++) {
489     if (dest_len - used < 2)   // Need space for two letter escape
490       return -1;
491 
492     bool is_hex_escape = false;
493     switch (*src) {
494       case '\n': dest[used++] = '\\'; dest[used++] = 'n';  break;
495       case '\r': dest[used++] = '\\'; dest[used++] = 'r';  break;
496       case '\t': dest[used++] = '\\'; dest[used++] = 't';  break;
497       case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
498       case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
499       case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
500       default:
501         // Note that if we emit \xNN and the src character after that is a hex
502         // digit then that digit must be escaped too to prevent it being
503         // interpreted as part of the character code by C.
504         if ((!utf8_safe || static_cast<uint8>(*src) < 0x80) &&
505             (!isprint(*src) ||
506              (last_hex_escape && isxdigit(*src)))) {
507           if (dest_len - used < 4) // need space for 4 letter escape
508             return -1;
509           sprintf(dest + used, (use_hex ? "\\x%02x" : "\\%03o"),
510                   static_cast<uint8>(*src));
511           is_hex_escape = use_hex;
512           used += 4;
513         } else {
514           dest[used++] = *src; break;
515         }
516     }
517     last_hex_escape = is_hex_escape;
518   }
519 
520   if (dest_len - used < 1)   // make sure that there is room for \0
521     return -1;
522 
523   dest[used] = '\0';   // doesn't count towards return value though
524   return used;
525 }
526 
527 // Calculates the length of the C-style escaped version of 'src'.
528 // Assumes that non-printable characters are escaped using octal sequences, and
529 // that UTF-8 bytes are not handled specially.
CEscapedLength(StringPiece src)530 static inline size_t CEscapedLength(StringPiece src) {
531   static char c_escaped_len[256] = {
532     4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,  // \t, \n, \r
533     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
534     1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,  // ", '
535     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // '0'..'9'
536     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'A'..'O'
537     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,  // 'P'..'Z', '\'
538     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'a'..'o'
539     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,  // 'p'..'z', DEL
540     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
541     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
542     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
543     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
544     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
545     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
546     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
547     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
548   };
549 
550   size_t escaped_len = 0;
551   for (int i = 0; i < src.size(); ++i) {
552     unsigned char c = static_cast<unsigned char>(src[i]);
553     escaped_len += c_escaped_len[c];
554   }
555   return escaped_len;
556 }
557 
558 // ----------------------------------------------------------------------
559 // Escapes 'src' using C-style escape sequences, and appends the escaped string
560 // to 'dest'. This version is faster than calling CEscapeInternal as it computes
561 // the required space using a lookup table, and also does not do any special
562 // handling for Hex or UTF-8 characters.
563 // ----------------------------------------------------------------------
CEscapeAndAppend(StringPiece src,string * dest)564 void CEscapeAndAppend(StringPiece src, string* dest) {
565   size_t escaped_len = CEscapedLength(src);
566   if (escaped_len == src.size()) {
567     dest->append(src.data(), src.size());
568     return;
569   }
570 
571   size_t cur_dest_len = dest->size();
572   dest->resize(cur_dest_len + escaped_len);
573   char* append_ptr = &(*dest)[cur_dest_len];
574 
575   for (int i = 0; i < src.size(); ++i) {
576     unsigned char c = static_cast<unsigned char>(src[i]);
577     switch (c) {
578       case '\n': *append_ptr++ = '\\'; *append_ptr++ = 'n'; break;
579       case '\r': *append_ptr++ = '\\'; *append_ptr++ = 'r'; break;
580       case '\t': *append_ptr++ = '\\'; *append_ptr++ = 't'; break;
581       case '\"': *append_ptr++ = '\\'; *append_ptr++ = '\"'; break;
582       case '\'': *append_ptr++ = '\\'; *append_ptr++ = '\''; break;
583       case '\\': *append_ptr++ = '\\'; *append_ptr++ = '\\'; break;
584       default:
585         if (!isprint(c)) {
586           *append_ptr++ = '\\';
587           *append_ptr++ = '0' + c / 64;
588           *append_ptr++ = '0' + (c % 64) / 8;
589           *append_ptr++ = '0' + c % 8;
590         } else {
591           *append_ptr++ = c;
592         }
593         break;
594     }
595   }
596 }
597 
CEscape(const string & src)598 string CEscape(const string& src) {
599   string dest;
600   CEscapeAndAppend(src, &dest);
601   return dest;
602 }
603 
604 namespace strings {
605 
Utf8SafeCEscape(const string & src)606 string Utf8SafeCEscape(const string& src) {
607   const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
608   scoped_array<char> dest(new char[dest_length]);
609   const int len = CEscapeInternal(src.data(), src.size(),
610                                   dest.get(), dest_length, false, true);
611   GOOGLE_DCHECK_GE(len, 0);
612   return string(dest.get(), len);
613 }
614 
CHexEscape(const string & src)615 string CHexEscape(const string& src) {
616   const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
617   scoped_array<char> dest(new char[dest_length]);
618   const int len = CEscapeInternal(src.data(), src.size(),
619                                   dest.get(), dest_length, true, false);
620   GOOGLE_DCHECK_GE(len, 0);
621   return string(dest.get(), len);
622 }
623 
624 }  // namespace strings
625 
626 // ----------------------------------------------------------------------
627 // strto32_adaptor()
628 // strtou32_adaptor()
629 //    Implementation of strto[u]l replacements that have identical
630 //    overflow and underflow characteristics for both ILP-32 and LP-64
631 //    platforms, including errno preservation in error-free calls.
632 // ----------------------------------------------------------------------
633 
strto32_adaptor(const char * nptr,char ** endptr,int base)634 int32 strto32_adaptor(const char *nptr, char **endptr, int base) {
635   const int saved_errno = errno;
636   errno = 0;
637   const long result = strtol(nptr, endptr, base);
638   if (errno == ERANGE && result == LONG_MIN) {
639     return kint32min;
640   } else if (errno == ERANGE && result == LONG_MAX) {
641     return kint32max;
642   } else if (errno == 0 && result < kint32min) {
643     errno = ERANGE;
644     return kint32min;
645   } else if (errno == 0 && result > kint32max) {
646     errno = ERANGE;
647     return kint32max;
648   }
649   if (errno == 0)
650     errno = saved_errno;
651   return static_cast<int32>(result);
652 }
653 
strtou32_adaptor(const char * nptr,char ** endptr,int base)654 uint32 strtou32_adaptor(const char *nptr, char **endptr, int base) {
655   const int saved_errno = errno;
656   errno = 0;
657   const unsigned long result = strtoul(nptr, endptr, base);
658   if (errno == ERANGE && result == ULONG_MAX) {
659     return kuint32max;
660   } else if (errno == 0 && result > kuint32max) {
661     errno = ERANGE;
662     return kuint32max;
663   }
664   if (errno == 0)
665     errno = saved_errno;
666   return static_cast<uint32>(result);
667 }
668 
safe_parse_sign(string * text,bool * negative_ptr)669 inline bool safe_parse_sign(string* text  /*inout*/,
670                             bool* negative_ptr  /*output*/) {
671   const char* start = text->data();
672   const char* end = start + text->size();
673 
674   // Consume whitespace.
675   while (start < end && (start[0] == ' ')) {
676     ++start;
677   }
678   while (start < end && (end[-1] == ' ')) {
679     --end;
680   }
681   if (start >= end) {
682     return false;
683   }
684 
685   // Consume sign.
686   *negative_ptr = (start[0] == '-');
687   if (*negative_ptr || start[0] == '+') {
688     ++start;
689     if (start >= end) {
690       return false;
691     }
692   }
693   *text = text->substr(start - text->data(), end - start);
694   return true;
695 }
696 
697 template<typename IntType>
safe_parse_positive_int(string text,IntType * value_p)698 bool safe_parse_positive_int(
699     string text, IntType* value_p) {
700   int base = 10;
701   IntType value = 0;
702   const IntType vmax = std::numeric_limits<IntType>::max();
703   assert(vmax > 0);
704   assert(vmax >= base);
705   const IntType vmax_over_base = vmax / base;
706   const char* start = text.data();
707   const char* end = start + text.size();
708   // loop over digits
709   for (; start < end; ++start) {
710     unsigned char c = static_cast<unsigned char>(start[0]);
711     int digit = c - '0';
712     if (digit >= base || digit < 0) {
713       *value_p = value;
714       return false;
715     }
716     if (value > vmax_over_base) {
717       *value_p = vmax;
718       return false;
719     }
720     value *= base;
721     if (value > vmax - digit) {
722       *value_p = vmax;
723       return false;
724     }
725     value += digit;
726   }
727   *value_p = value;
728   return true;
729 }
730 
731 template<typename IntType>
safe_parse_negative_int(const string & text,IntType * value_p)732 bool safe_parse_negative_int(
733     const string& text, IntType* value_p) {
734   int base = 10;
735   IntType value = 0;
736   const IntType vmin = std::numeric_limits<IntType>::min();
737   assert(vmin < 0);
738   assert(vmin <= 0 - base);
739   IntType vmin_over_base = vmin / base;
740   // 2003 c++ standard [expr.mul]
741   // "... the sign of the remainder is implementation-defined."
742   // Although (vmin/base)*base + vmin%base is always vmin.
743   // 2011 c++ standard tightens the spec but we cannot rely on it.
744   if (vmin % base > 0) {
745     vmin_over_base += 1;
746   }
747   const char* start = text.data();
748   const char* end = start + text.size();
749   // loop over digits
750   for (; start < end; ++start) {
751     unsigned char c = static_cast<unsigned char>(start[0]);
752     int digit = c - '0';
753     if (digit >= base || digit < 0) {
754       *value_p = value;
755       return false;
756     }
757     if (value < vmin_over_base) {
758       *value_p = vmin;
759       return false;
760     }
761     value *= base;
762     if (value < vmin + digit) {
763       *value_p = vmin;
764       return false;
765     }
766     value -= digit;
767   }
768   *value_p = value;
769   return true;
770 }
771 
772 template<typename IntType>
safe_int_internal(string text,IntType * value_p)773 bool safe_int_internal(string text, IntType* value_p) {
774   *value_p = 0;
775   bool negative;
776   if (!safe_parse_sign(&text, &negative)) {
777     return false;
778   }
779   if (!negative) {
780     return safe_parse_positive_int(text, value_p);
781   } else {
782     return safe_parse_negative_int(text, value_p);
783   }
784 }
785 
786 template<typename IntType>
safe_uint_internal(string text,IntType * value_p)787 bool safe_uint_internal(string text, IntType* value_p) {
788   *value_p = 0;
789   bool negative;
790   if (!safe_parse_sign(&text, &negative) || negative) {
791     return false;
792   }
793   return safe_parse_positive_int(text, value_p);
794 }
795 
796 // ----------------------------------------------------------------------
797 // FastIntToBuffer()
798 // FastInt64ToBuffer()
799 // FastHexToBuffer()
800 // FastHex64ToBuffer()
801 // FastHex32ToBuffer()
802 // ----------------------------------------------------------------------
803 
804 // Offset into buffer where FastInt64ToBuffer places the end of string
805 // null character.  Also used by FastInt64ToBufferLeft.
806 static const int kFastInt64ToBufferOffset = 21;
807 
FastInt64ToBuffer(int64 i,char * buffer)808 char *FastInt64ToBuffer(int64 i, char* buffer) {
809   // We could collapse the positive and negative sections, but that
810   // would be slightly slower for positive numbers...
811   // 22 bytes is enough to store -2**64, -18446744073709551616.
812   char* p = buffer + kFastInt64ToBufferOffset;
813   *p-- = '\0';
814   if (i >= 0) {
815     do {
816       *p-- = '0' + i % 10;
817       i /= 10;
818     } while (i > 0);
819     return p + 1;
820   } else {
821     // On different platforms, % and / have different behaviors for
822     // negative numbers, so we need to jump through hoops to make sure
823     // we don't divide negative numbers.
824     if (i > -10) {
825       i = -i;
826       *p-- = '0' + i;
827       *p = '-';
828       return p;
829     } else {
830       // Make sure we aren't at MIN_INT, in which case we can't say i = -i
831       i = i + 10;
832       i = -i;
833       *p-- = '0' + i % 10;
834       // Undo what we did a moment ago
835       i = i / 10 + 1;
836       do {
837         *p-- = '0' + i % 10;
838         i /= 10;
839       } while (i > 0);
840       *p = '-';
841       return p;
842     }
843   }
844 }
845 
846 // Offset into buffer where FastInt32ToBuffer places the end of string
847 // null character.  Also used by FastInt32ToBufferLeft
848 static const int kFastInt32ToBufferOffset = 11;
849 
850 // Yes, this is a duplicate of FastInt64ToBuffer.  But, we need this for the
851 // compiler to generate 32 bit arithmetic instructions.  It's much faster, at
852 // least with 32 bit binaries.
FastInt32ToBuffer(int32 i,char * buffer)853 char *FastInt32ToBuffer(int32 i, char* buffer) {
854   // We could collapse the positive and negative sections, but that
855   // would be slightly slower for positive numbers...
856   // 12 bytes is enough to store -2**32, -4294967296.
857   char* p = buffer + kFastInt32ToBufferOffset;
858   *p-- = '\0';
859   if (i >= 0) {
860     do {
861       *p-- = '0' + i % 10;
862       i /= 10;
863     } while (i > 0);
864     return p + 1;
865   } else {
866     // On different platforms, % and / have different behaviors for
867     // negative numbers, so we need to jump through hoops to make sure
868     // we don't divide negative numbers.
869     if (i > -10) {
870       i = -i;
871       *p-- = '0' + i;
872       *p = '-';
873       return p;
874     } else {
875       // Make sure we aren't at MIN_INT, in which case we can't say i = -i
876       i = i + 10;
877       i = -i;
878       *p-- = '0' + i % 10;
879       // Undo what we did a moment ago
880       i = i / 10 + 1;
881       do {
882         *p-- = '0' + i % 10;
883         i /= 10;
884       } while (i > 0);
885       *p = '-';
886       return p;
887     }
888   }
889 }
890 
FastHexToBuffer(int i,char * buffer)891 char *FastHexToBuffer(int i, char* buffer) {
892   GOOGLE_CHECK(i >= 0) << "FastHexToBuffer() wants non-negative integers, not " << i;
893 
894   static const char *hexdigits = "0123456789abcdef";
895   char *p = buffer + 21;
896   *p-- = '\0';
897   do {
898     *p-- = hexdigits[i & 15];   // mod by 16
899     i >>= 4;                    // divide by 16
900   } while (i > 0);
901   return p + 1;
902 }
903 
InternalFastHexToBuffer(uint64 value,char * buffer,int num_byte)904 char *InternalFastHexToBuffer(uint64 value, char* buffer, int num_byte) {
905   static const char *hexdigits = "0123456789abcdef";
906   buffer[num_byte] = '\0';
907   for (int i = num_byte - 1; i >= 0; i--) {
908 #ifdef _M_X64
909     // MSVC x64 platform has a bug optimizing the uint32(value) in the #else
910     // block. Given that the uint32 cast was to improve performance on 32-bit
911     // platforms, we use 64-bit '&' directly.
912     buffer[i] = hexdigits[value & 0xf];
913 #else
914     buffer[i] = hexdigits[uint32(value) & 0xf];
915 #endif
916     value >>= 4;
917   }
918   return buffer;
919 }
920 
FastHex64ToBuffer(uint64 value,char * buffer)921 char *FastHex64ToBuffer(uint64 value, char* buffer) {
922   return InternalFastHexToBuffer(value, buffer, 16);
923 }
924 
FastHex32ToBuffer(uint32 value,char * buffer)925 char *FastHex32ToBuffer(uint32 value, char* buffer) {
926   return InternalFastHexToBuffer(value, buffer, 8);
927 }
928 
929 // ----------------------------------------------------------------------
930 // FastInt32ToBufferLeft()
931 // FastUInt32ToBufferLeft()
932 // FastInt64ToBufferLeft()
933 // FastUInt64ToBufferLeft()
934 //
935 // Like the Fast*ToBuffer() functions above, these are intended for speed.
936 // Unlike the Fast*ToBuffer() functions, however, these functions write
937 // their output to the beginning of the buffer (hence the name, as the
938 // output is left-aligned).  The caller is responsible for ensuring that
939 // the buffer has enough space to hold the output.
940 //
941 // Returns a pointer to the end of the string (i.e. the null character
942 // terminating the string).
943 // ----------------------------------------------------------------------
944 
945 static const char two_ASCII_digits[100][2] = {
946   {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'},
947   {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
948   {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'},
949   {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
950   {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'},
951   {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
952   {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'},
953   {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
954   {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'},
955   {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
956   {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'},
957   {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
958   {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'},
959   {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
960   {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'},
961   {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
962   {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'},
963   {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
964   {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'},
965   {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'}
966 };
967 
FastUInt32ToBufferLeft(uint32 u,char * buffer)968 char* FastUInt32ToBufferLeft(uint32 u, char* buffer) {
969   int digits;
970   const char *ASCII_digits = NULL;
971   // The idea of this implementation is to trim the number of divides to as few
972   // as possible by using multiplication and subtraction rather than mod (%),
973   // and by outputting two digits at a time rather than one.
974   // The huge-number case is first, in the hopes that the compiler will output
975   // that case in one branch-free block of code, and only output conditional
976   // branches into it from below.
977   if (u >= 1000000000) {  // >= 1,000,000,000
978     digits = u / 100000000;  // 100,000,000
979     ASCII_digits = two_ASCII_digits[digits];
980     buffer[0] = ASCII_digits[0];
981     buffer[1] = ASCII_digits[1];
982     buffer += 2;
983 sublt100_000_000:
984     u -= digits * 100000000;  // 100,000,000
985 lt100_000_000:
986     digits = u / 1000000;  // 1,000,000
987     ASCII_digits = two_ASCII_digits[digits];
988     buffer[0] = ASCII_digits[0];
989     buffer[1] = ASCII_digits[1];
990     buffer += 2;
991 sublt1_000_000:
992     u -= digits * 1000000;  // 1,000,000
993 lt1_000_000:
994     digits = u / 10000;  // 10,000
995     ASCII_digits = two_ASCII_digits[digits];
996     buffer[0] = ASCII_digits[0];
997     buffer[1] = ASCII_digits[1];
998     buffer += 2;
999 sublt10_000:
1000     u -= digits * 10000;  // 10,000
1001 lt10_000:
1002     digits = u / 100;
1003     ASCII_digits = two_ASCII_digits[digits];
1004     buffer[0] = ASCII_digits[0];
1005     buffer[1] = ASCII_digits[1];
1006     buffer += 2;
1007 sublt100:
1008     u -= digits * 100;
1009 lt100:
1010     digits = u;
1011     ASCII_digits = two_ASCII_digits[digits];
1012     buffer[0] = ASCII_digits[0];
1013     buffer[1] = ASCII_digits[1];
1014     buffer += 2;
1015 done:
1016     *buffer = 0;
1017     return buffer;
1018   }
1019 
1020   if (u < 100) {
1021     digits = u;
1022     if (u >= 10) goto lt100;
1023     *buffer++ = '0' + digits;
1024     goto done;
1025   }
1026   if (u  <  10000) {   // 10,000
1027     if (u >= 1000) goto lt10_000;
1028     digits = u / 100;
1029     *buffer++ = '0' + digits;
1030     goto sublt100;
1031   }
1032   if (u  <  1000000) {   // 1,000,000
1033     if (u >= 100000) goto lt1_000_000;
1034     digits = u / 10000;  //    10,000
1035     *buffer++ = '0' + digits;
1036     goto sublt10_000;
1037   }
1038   if (u  <  100000000) {   // 100,000,000
1039     if (u >= 10000000) goto lt100_000_000;
1040     digits = u / 1000000;  //   1,000,000
1041     *buffer++ = '0' + digits;
1042     goto sublt1_000_000;
1043   }
1044   // we already know that u < 1,000,000,000
1045   digits = u / 100000000;   // 100,000,000
1046   *buffer++ = '0' + digits;
1047   goto sublt100_000_000;
1048 }
1049 
FastInt32ToBufferLeft(int32 i,char * buffer)1050 char* FastInt32ToBufferLeft(int32 i, char* buffer) {
1051   uint32 u = i;
1052   if (i < 0) {
1053     *buffer++ = '-';
1054     u = -i;
1055   }
1056   return FastUInt32ToBufferLeft(u, buffer);
1057 }
1058 
FastUInt64ToBufferLeft(uint64 u64,char * buffer)1059 char* FastUInt64ToBufferLeft(uint64 u64, char* buffer) {
1060   int digits;
1061   const char *ASCII_digits = NULL;
1062 
1063   uint32 u = static_cast<uint32>(u64);
1064   if (u == u64) return FastUInt32ToBufferLeft(u, buffer);
1065 
1066   uint64 top_11_digits = u64 / 1000000000;
1067   buffer = FastUInt64ToBufferLeft(top_11_digits, buffer);
1068   u = u64 - (top_11_digits * 1000000000);
1069 
1070   digits = u / 10000000;  // 10,000,000
1071   GOOGLE_DCHECK_LT(digits, 100);
1072   ASCII_digits = two_ASCII_digits[digits];
1073   buffer[0] = ASCII_digits[0];
1074   buffer[1] = ASCII_digits[1];
1075   buffer += 2;
1076   u -= digits * 10000000;  // 10,000,000
1077   digits = u / 100000;  // 100,000
1078   ASCII_digits = two_ASCII_digits[digits];
1079   buffer[0] = ASCII_digits[0];
1080   buffer[1] = ASCII_digits[1];
1081   buffer += 2;
1082   u -= digits * 100000;  // 100,000
1083   digits = u / 1000;  // 1,000
1084   ASCII_digits = two_ASCII_digits[digits];
1085   buffer[0] = ASCII_digits[0];
1086   buffer[1] = ASCII_digits[1];
1087   buffer += 2;
1088   u -= digits * 1000;  // 1,000
1089   digits = u / 10;
1090   ASCII_digits = two_ASCII_digits[digits];
1091   buffer[0] = ASCII_digits[0];
1092   buffer[1] = ASCII_digits[1];
1093   buffer += 2;
1094   u -= digits * 10;
1095   digits = u;
1096   *buffer++ = '0' + digits;
1097   *buffer = 0;
1098   return buffer;
1099 }
1100 
FastInt64ToBufferLeft(int64 i,char * buffer)1101 char* FastInt64ToBufferLeft(int64 i, char* buffer) {
1102   uint64 u = i;
1103   if (i < 0) {
1104     *buffer++ = '-';
1105     u = -i;
1106   }
1107   return FastUInt64ToBufferLeft(u, buffer);
1108 }
1109 
1110 // ----------------------------------------------------------------------
1111 // SimpleItoa()
1112 //    Description: converts an integer to a string.
1113 //
1114 //    Return value: string
1115 // ----------------------------------------------------------------------
1116 
SimpleItoa(int i)1117 string SimpleItoa(int i) {
1118   char buffer[kFastToBufferSize];
1119   return (sizeof(i) == 4) ?
1120     FastInt32ToBuffer(i, buffer) :
1121     FastInt64ToBuffer(i, buffer);
1122 }
1123 
SimpleItoa(unsigned int i)1124 string SimpleItoa(unsigned int i) {
1125   char buffer[kFastToBufferSize];
1126   return string(buffer, (sizeof(i) == 4) ?
1127     FastUInt32ToBufferLeft(i, buffer) :
1128     FastUInt64ToBufferLeft(i, buffer));
1129 }
1130 
SimpleItoa(long i)1131 string SimpleItoa(long i) {
1132   char buffer[kFastToBufferSize];
1133   return (sizeof(i) == 4) ?
1134     FastInt32ToBuffer(i, buffer) :
1135     FastInt64ToBuffer(i, buffer);
1136 }
1137 
SimpleItoa(unsigned long i)1138 string SimpleItoa(unsigned long i) {
1139   char buffer[kFastToBufferSize];
1140   return string(buffer, (sizeof(i) == 4) ?
1141     FastUInt32ToBufferLeft(i, buffer) :
1142     FastUInt64ToBufferLeft(i, buffer));
1143 }
1144 
SimpleItoa(long long i)1145 string SimpleItoa(long long i) {
1146   char buffer[kFastToBufferSize];
1147   return (sizeof(i) == 4) ?
1148     FastInt32ToBuffer(i, buffer) :
1149     FastInt64ToBuffer(i, buffer);
1150 }
1151 
SimpleItoa(unsigned long long i)1152 string SimpleItoa(unsigned long long i) {
1153   char buffer[kFastToBufferSize];
1154   return string(buffer, (sizeof(i) == 4) ?
1155     FastUInt32ToBufferLeft(i, buffer) :
1156     FastUInt64ToBufferLeft(i, buffer));
1157 }
1158 
1159 // ----------------------------------------------------------------------
1160 // SimpleDtoa()
1161 // SimpleFtoa()
1162 // DoubleToBuffer()
1163 // FloatToBuffer()
1164 //    We want to print the value without losing precision, but we also do
1165 //    not want to print more digits than necessary.  This turns out to be
1166 //    trickier than it sounds.  Numbers like 0.2 cannot be represented
1167 //    exactly in binary.  If we print 0.2 with a very large precision,
1168 //    e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167".
1169 //    On the other hand, if we set the precision too low, we lose
1170 //    significant digits when printing numbers that actually need them.
1171 //    It turns out there is no precision value that does the right thing
1172 //    for all numbers.
1173 //
1174 //    Our strategy is to first try printing with a precision that is never
1175 //    over-precise, then parse the result with strtod() to see if it
1176 //    matches.  If not, we print again with a precision that will always
1177 //    give a precise result, but may use more digits than necessary.
1178 //
1179 //    An arguably better strategy would be to use the algorithm described
1180 //    in "How to Print Floating-Point Numbers Accurately" by Steele &
1181 //    White, e.g. as implemented by David M. Gay's dtoa().  It turns out,
1182 //    however, that the following implementation is about as fast as
1183 //    DMG's code.  Furthermore, DMG's code locks mutexes, which means it
1184 //    will not scale well on multi-core machines.  DMG's code is slightly
1185 //    more accurate (in that it will never use more digits than
1186 //    necessary), but this is probably irrelevant for most users.
1187 //
1188 //    Rob Pike and Ken Thompson also have an implementation of dtoa() in
1189 //    third_party/fmt/fltfmt.cc.  Their implementation is similar to this
1190 //    one in that it makes guesses and then uses strtod() to check them.
1191 //    Their implementation is faster because they use their own code to
1192 //    generate the digits in the first place rather than use snprintf(),
1193 //    thus avoiding format string parsing overhead.  However, this makes
1194 //    it considerably more complicated than the following implementation,
1195 //    and it is embedded in a larger library.  If speed turns out to be
1196 //    an issue, we could re-implement this in terms of their
1197 //    implementation.
1198 // ----------------------------------------------------------------------
1199 
SimpleDtoa(double value)1200 string SimpleDtoa(double value) {
1201   char buffer[kDoubleToBufferSize];
1202   return DoubleToBuffer(value, buffer);
1203 }
1204 
SimpleFtoa(float value)1205 string SimpleFtoa(float value) {
1206   char buffer[kFloatToBufferSize];
1207   return FloatToBuffer(value, buffer);
1208 }
1209 
IsValidFloatChar(char c)1210 static inline bool IsValidFloatChar(char c) {
1211   return ('0' <= c && c <= '9') ||
1212          c == 'e' || c == 'E' ||
1213          c == '+' || c == '-';
1214 }
1215 
DelocalizeRadix(char * buffer)1216 void DelocalizeRadix(char* buffer) {
1217   // Fast check:  if the buffer has a normal decimal point, assume no
1218   // translation is needed.
1219   if (strchr(buffer, '.') != NULL) return;
1220 
1221   // Find the first unknown character.
1222   while (IsValidFloatChar(*buffer)) ++buffer;
1223 
1224   if (*buffer == '\0') {
1225     // No radix character found.
1226     return;
1227   }
1228 
1229   // We are now pointing at the locale-specific radix character.  Replace it
1230   // with '.'.
1231   *buffer = '.';
1232   ++buffer;
1233 
1234   if (!IsValidFloatChar(*buffer) && *buffer != '\0') {
1235     // It appears the radix was a multi-byte character.  We need to remove the
1236     // extra bytes.
1237     char* target = buffer;
1238     do { ++buffer; } while (!IsValidFloatChar(*buffer) && *buffer != '\0');
1239     memmove(target, buffer, strlen(buffer) + 1);
1240   }
1241 }
1242 
DoubleToBuffer(double value,char * buffer)1243 char* DoubleToBuffer(double value, char* buffer) {
1244   // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all
1245   // platforms these days.  Just in case some system exists where DBL_DIG
1246   // is significantly larger -- and risks overflowing our buffer -- we have
1247   // this assert.
1248   GOOGLE_COMPILE_ASSERT(DBL_DIG < 20, DBL_DIG_is_too_big);
1249 
1250   if (value == numeric_limits<double>::infinity()) {
1251     strcpy(buffer, "inf");
1252     return buffer;
1253   } else if (value == -numeric_limits<double>::infinity()) {
1254     strcpy(buffer, "-inf");
1255     return buffer;
1256   } else if (MathLimits<double>::IsNaN(value)) {
1257     strcpy(buffer, "nan");
1258     return buffer;
1259   }
1260 
1261   int snprintf_result =
1262     snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG, value);
1263 
1264   // The snprintf should never overflow because the buffer is significantly
1265   // larger than the precision we asked for.
1266   GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1267 
1268   // We need to make parsed_value volatile in order to force the compiler to
1269   // write it out to the stack.  Otherwise, it may keep the value in a
1270   // register, and if it does that, it may keep it as a long double instead
1271   // of a double.  This long double may have extra bits that make it compare
1272   // unequal to "value" even though it would be exactly equal if it were
1273   // truncated to a double.
1274   volatile double parsed_value = strtod(buffer, NULL);
1275   if (parsed_value != value) {
1276     int snprintf_result =
1277       snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG+2, value);
1278 
1279     // Should never overflow; see above.
1280     GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1281   }
1282 
1283   DelocalizeRadix(buffer);
1284   return buffer;
1285 }
1286 
memcasecmp(const char * s1,const char * s2,size_t len)1287 static int memcasecmp(const char *s1, const char *s2, size_t len) {
1288   const unsigned char *us1 = reinterpret_cast<const unsigned char *>(s1);
1289   const unsigned char *us2 = reinterpret_cast<const unsigned char *>(s2);
1290 
1291   for ( int i = 0; i < len; i++ ) {
1292     const int diff =
1293       static_cast<int>(static_cast<unsigned char>(ascii_tolower(us1[i]))) -
1294       static_cast<int>(static_cast<unsigned char>(ascii_tolower(us2[i])));
1295     if (diff != 0) return diff;
1296   }
1297   return 0;
1298 }
1299 
CaseEqual(StringPiece s1,StringPiece s2)1300 inline bool CaseEqual(StringPiece s1, StringPiece s2) {
1301   if (s1.size() != s2.size()) return false;
1302   return memcasecmp(s1.data(), s2.data(), s1.size()) == 0;
1303 }
1304 
safe_strtob(StringPiece str,bool * value)1305 bool safe_strtob(StringPiece str, bool* value) {
1306   GOOGLE_CHECK(value != NULL) << "NULL output boolean given.";
1307   if (CaseEqual(str, "true") || CaseEqual(str, "t") ||
1308       CaseEqual(str, "yes") || CaseEqual(str, "y") ||
1309       CaseEqual(str, "1")) {
1310     *value = true;
1311     return true;
1312   }
1313   if (CaseEqual(str, "false") || CaseEqual(str, "f") ||
1314       CaseEqual(str, "no") || CaseEqual(str, "n") ||
1315       CaseEqual(str, "0")) {
1316     *value = false;
1317     return true;
1318   }
1319   return false;
1320 }
1321 
safe_strtof(const char * str,float * value)1322 bool safe_strtof(const char* str, float* value) {
1323   char* endptr;
1324   errno = 0;  // errno only gets set on errors
1325 #if defined(_WIN32) || defined (__hpux)  // has no strtof()
1326   *value = strtod(str, &endptr);
1327 #else
1328   *value = strtof(str, &endptr);
1329 #endif
1330   return *str != 0 && *endptr == 0 && errno == 0;
1331 }
1332 
safe_strtod(const char * str,double * value)1333 bool safe_strtod(const char* str, double* value) {
1334   char* endptr;
1335   *value = strtod(str, &endptr);
1336   if (endptr != str) {
1337     while (ascii_isspace(*endptr)) ++endptr;
1338   }
1339   // Ignore range errors from strtod.  The values it
1340   // returns on underflow and overflow are the right
1341   // fallback in a robust setting.
1342   return *str != '\0' && *endptr == '\0';
1343 }
1344 
safe_strto32(const string & str,int32 * value)1345 bool safe_strto32(const string& str, int32* value) {
1346   return safe_int_internal(str, value);
1347 }
1348 
safe_strtou32(const string & str,uint32 * value)1349 bool safe_strtou32(const string& str, uint32* value) {
1350   return safe_uint_internal(str, value);
1351 }
1352 
safe_strto64(const string & str,int64 * value)1353 bool safe_strto64(const string& str, int64* value) {
1354   return safe_int_internal(str, value);
1355 }
1356 
safe_strtou64(const string & str,uint64 * value)1357 bool safe_strtou64(const string& str, uint64* value) {
1358   return safe_uint_internal(str, value);
1359 }
1360 
FloatToBuffer(float value,char * buffer)1361 char* FloatToBuffer(float value, char* buffer) {
1362   // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all
1363   // platforms these days.  Just in case some system exists where FLT_DIG
1364   // is significantly larger -- and risks overflowing our buffer -- we have
1365   // this assert.
1366   GOOGLE_COMPILE_ASSERT(FLT_DIG < 10, FLT_DIG_is_too_big);
1367 
1368   if (value == numeric_limits<double>::infinity()) {
1369     strcpy(buffer, "inf");
1370     return buffer;
1371   } else if (value == -numeric_limits<double>::infinity()) {
1372     strcpy(buffer, "-inf");
1373     return buffer;
1374   } else if (MathLimits<float>::IsNaN(value)) {
1375     strcpy(buffer, "nan");
1376     return buffer;
1377   }
1378 
1379   int snprintf_result =
1380     snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG, value);
1381 
1382   // The snprintf should never overflow because the buffer is significantly
1383   // larger than the precision we asked for.
1384   GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1385 
1386   float parsed_value;
1387   if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) {
1388     int snprintf_result =
1389       snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG+2, value);
1390 
1391     // Should never overflow; see above.
1392     GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1393   }
1394 
1395   DelocalizeRadix(buffer);
1396   return buffer;
1397 }
1398 
1399 namespace strings {
1400 
AlphaNum(strings::Hex hex)1401 AlphaNum::AlphaNum(strings::Hex hex) {
1402   char *const end = &digits[kFastToBufferSize];
1403   char *writer = end;
1404   uint64 value = hex.value;
1405   uint64 width = hex.spec;
1406   // We accomplish minimum width by OR'ing in 0x10000 to the user's value,
1407   // where 0x10000 is the smallest hex number that is as wide as the user
1408   // asked for.
1409   uint64 mask = ((static_cast<uint64>(1) << (width - 1) * 4)) | value;
1410   static const char hexdigits[] = "0123456789abcdef";
1411   do {
1412     *--writer = hexdigits[value & 0xF];
1413     value >>= 4;
1414     mask >>= 4;
1415   } while (mask != 0);
1416   piece_data_ = writer;
1417   piece_size_ = end - writer;
1418 }
1419 
1420 }  // namespace strings
1421 
1422 // ----------------------------------------------------------------------
1423 // StrCat()
1424 //    This merges the given strings or integers, with no delimiter.  This
1425 //    is designed to be the fastest possible way to construct a string out
1426 //    of a mix of raw C strings, C++ strings, and integer values.
1427 // ----------------------------------------------------------------------
1428 
1429 // Append is merely a version of memcpy that returns the address of the byte
1430 // after the area just overwritten.  It comes in multiple flavors to minimize
1431 // call overhead.
Append1(char * out,const AlphaNum & x)1432 static char *Append1(char *out, const AlphaNum &x) {
1433   memcpy(out, x.data(), x.size());
1434   return out + x.size();
1435 }
1436 
Append2(char * out,const AlphaNum & x1,const AlphaNum & x2)1437 static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) {
1438   memcpy(out, x1.data(), x1.size());
1439   out += x1.size();
1440 
1441   memcpy(out, x2.data(), x2.size());
1442   return out + x2.size();
1443 }
1444 
Append4(char * out,const AlphaNum & x1,const AlphaNum & x2,const AlphaNum & x3,const AlphaNum & x4)1445 static char *Append4(char *out,
1446                      const AlphaNum &x1, const AlphaNum &x2,
1447                      const AlphaNum &x3, const AlphaNum &x4) {
1448   memcpy(out, x1.data(), x1.size());
1449   out += x1.size();
1450 
1451   memcpy(out, x2.data(), x2.size());
1452   out += x2.size();
1453 
1454   memcpy(out, x3.data(), x3.size());
1455   out += x3.size();
1456 
1457   memcpy(out, x4.data(), x4.size());
1458   return out + x4.size();
1459 }
1460 
StrCat(const AlphaNum & a,const AlphaNum & b)1461 string StrCat(const AlphaNum &a, const AlphaNum &b) {
1462   string result;
1463   result.resize(a.size() + b.size());
1464   char *const begin = &*result.begin();
1465   char *out = Append2(begin, a, b);
1466   GOOGLE_DCHECK_EQ(out, begin + result.size());
1467   return result;
1468 }
1469 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1470 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1471   string result;
1472   result.resize(a.size() + b.size() + c.size());
1473   char *const begin = &*result.begin();
1474   char *out = Append2(begin, a, b);
1475   out = Append1(out, c);
1476   GOOGLE_DCHECK_EQ(out, begin + result.size());
1477   return result;
1478 }
1479 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1480 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1481               const AlphaNum &d) {
1482   string result;
1483   result.resize(a.size() + b.size() + c.size() + d.size());
1484   char *const begin = &*result.begin();
1485   char *out = Append4(begin, a, b, c, d);
1486   GOOGLE_DCHECK_EQ(out, begin + result.size());
1487   return result;
1488 }
1489 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e)1490 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1491               const AlphaNum &d, const AlphaNum &e) {
1492   string result;
1493   result.resize(a.size() + b.size() + c.size() + d.size() + e.size());
1494   char *const begin = &*result.begin();
1495   char *out = Append4(begin, a, b, c, d);
1496   out = Append1(out, e);
1497   GOOGLE_DCHECK_EQ(out, begin + result.size());
1498   return result;
1499 }
1500 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f)1501 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1502               const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) {
1503   string result;
1504   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1505                 f.size());
1506   char *const begin = &*result.begin();
1507   char *out = Append4(begin, a, b, c, d);
1508   out = Append2(out, e, f);
1509   GOOGLE_DCHECK_EQ(out, begin + result.size());
1510   return result;
1511 }
1512 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g)1513 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1514               const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1515               const AlphaNum &g) {
1516   string result;
1517   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1518                 f.size() + g.size());
1519   char *const begin = &*result.begin();
1520   char *out = Append4(begin, a, b, c, d);
1521   out = Append2(out, e, f);
1522   out = Append1(out, g);
1523   GOOGLE_DCHECK_EQ(out, begin + result.size());
1524   return result;
1525 }
1526 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h)1527 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1528               const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1529               const AlphaNum &g, const AlphaNum &h) {
1530   string result;
1531   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1532                 f.size() + g.size() + h.size());
1533   char *const begin = &*result.begin();
1534   char *out = Append4(begin, a, b, c, d);
1535   out = Append4(out, e, f, g, h);
1536   GOOGLE_DCHECK_EQ(out, begin + result.size());
1537   return result;
1538 }
1539 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h,const AlphaNum & i)1540 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1541               const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1542               const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) {
1543   string result;
1544   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1545                 f.size() + g.size() + h.size() + i.size());
1546   char *const begin = &*result.begin();
1547   char *out = Append4(begin, a, b, c, d);
1548   out = Append4(out, e, f, g, h);
1549   out = Append1(out, i);
1550   GOOGLE_DCHECK_EQ(out, begin + result.size());
1551   return result;
1552 }
1553 
1554 // It's possible to call StrAppend with a char * pointer that is partway into
1555 // the string we're appending to.  However the results of this are random.
1556 // Therefore, check for this in debug mode.  Use unsigned math so we only have
1557 // to do one comparison.
1558 #define GOOGLE_DCHECK_NO_OVERLAP(dest, src) \
1559     GOOGLE_DCHECK_GT(uintptr_t((src).data() - (dest).data()), \
1560                      uintptr_t((dest).size()))
1561 
StrAppend(string * result,const AlphaNum & a)1562 void StrAppend(string *result, const AlphaNum &a) {
1563   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1564   result->append(a.data(), a.size());
1565 }
1566 
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b)1567 void StrAppend(string *result, const AlphaNum &a, const AlphaNum &b) {
1568   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1569   GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1570   string::size_type old_size = result->size();
1571   result->resize(old_size + a.size() + b.size());
1572   char *const begin = &*result->begin();
1573   char *out = Append2(begin + old_size, a, b);
1574   GOOGLE_DCHECK_EQ(out, begin + result->size());
1575 }
1576 
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1577 void StrAppend(string *result,
1578                const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1579   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1580   GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1581   GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1582   string::size_type old_size = result->size();
1583   result->resize(old_size + a.size() + b.size() + c.size());
1584   char *const begin = &*result->begin();
1585   char *out = Append2(begin + old_size, a, b);
1586   out = Append1(out, c);
1587   GOOGLE_DCHECK_EQ(out, begin + result->size());
1588 }
1589 
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1590 void StrAppend(string *result,
1591                const AlphaNum &a, const AlphaNum &b,
1592                const AlphaNum &c, const AlphaNum &d) {
1593   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1594   GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1595   GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1596   GOOGLE_DCHECK_NO_OVERLAP(*result, d);
1597   string::size_type old_size = result->size();
1598   result->resize(old_size + a.size() + b.size() + c.size() + d.size());
1599   char *const begin = &*result->begin();
1600   char *out = Append4(begin + old_size, a, b, c, d);
1601   GOOGLE_DCHECK_EQ(out, begin + result->size());
1602 }
1603 
GlobalReplaceSubstring(const string & substring,const string & replacement,string * s)1604 int GlobalReplaceSubstring(const string& substring,
1605                            const string& replacement,
1606                            string* s) {
1607   GOOGLE_CHECK(s != NULL);
1608   if (s->empty() || substring.empty())
1609     return 0;
1610   string tmp;
1611   int num_replacements = 0;
1612   int pos = 0;
1613   for (int match_pos = s->find(substring.data(), pos, substring.length());
1614        match_pos != string::npos;
1615        pos = match_pos + substring.length(),
1616            match_pos = s->find(substring.data(), pos, substring.length())) {
1617     ++num_replacements;
1618     // Append the original content before the match.
1619     tmp.append(*s, pos, match_pos - pos);
1620     // Append the replacement for the match.
1621     tmp.append(replacement.begin(), replacement.end());
1622   }
1623   // Append the content after the last match. If no replacements were made, the
1624   // original string is left untouched.
1625   if (num_replacements > 0) {
1626     tmp.append(*s, pos, s->length() - pos);
1627     s->swap(tmp);
1628   }
1629   return num_replacements;
1630 }
1631 
CalculateBase64EscapedLen(int input_len,bool do_padding)1632 int CalculateBase64EscapedLen(int input_len, bool do_padding) {
1633   // Base64 encodes three bytes of input at a time. If the input is not
1634   // divisible by three, we pad as appropriate.
1635   //
1636   // (from http://tools.ietf.org/html/rfc3548)
1637   // Special processing is performed if fewer than 24 bits are available
1638   // at the end of the data being encoded.  A full encoding quantum is
1639   // always completed at the end of a quantity.  When fewer than 24 input
1640   // bits are available in an input group, zero bits are added (on the
1641   // right) to form an integral number of 6-bit groups.  Padding at the
1642   // end of the data is performed using the '=' character.  Since all base
1643   // 64 input is an integral number of octets, only the following cases
1644   // can arise:
1645 
1646 
1647   // Base64 encodes each three bytes of input into four bytes of output.
1648   int len = (input_len / 3) * 4;
1649 
1650   if (input_len % 3 == 0) {
1651     // (from http://tools.ietf.org/html/rfc3548)
1652     // (1) the final quantum of encoding input is an integral multiple of 24
1653     // bits; here, the final unit of encoded output will be an integral
1654     // multiple of 4 characters with no "=" padding,
1655   } else if (input_len % 3 == 1) {
1656     // (from http://tools.ietf.org/html/rfc3548)
1657     // (2) the final quantum of encoding input is exactly 8 bits; here, the
1658     // final unit of encoded output will be two characters followed by two
1659     // "=" padding characters, or
1660     len += 2;
1661     if (do_padding) {
1662       len += 2;
1663     }
1664   } else {  // (input_len % 3 == 2)
1665     // (from http://tools.ietf.org/html/rfc3548)
1666     // (3) the final quantum of encoding input is exactly 16 bits; here, the
1667     // final unit of encoded output will be three characters followed by one
1668     // "=" padding character.
1669     len += 3;
1670     if (do_padding) {
1671       len += 1;
1672     }
1673   }
1674 
1675   assert(len >= input_len);  // make sure we didn't overflow
1676   return len;
1677 }
1678 
1679 // Base64Escape does padding, so this calculation includes padding.
CalculateBase64EscapedLen(int input_len)1680 int CalculateBase64EscapedLen(int input_len) {
1681   return CalculateBase64EscapedLen(input_len, true);
1682 }
1683 
1684 // ----------------------------------------------------------------------
1685 // int Base64Unescape() - base64 decoder
1686 // int Base64Escape() - base64 encoder
1687 // int WebSafeBase64Unescape() - Google's variation of base64 decoder
1688 // int WebSafeBase64Escape() - Google's variation of base64 encoder
1689 //
1690 // Check out
1691 // http://tools.ietf.org/html/rfc2045 for formal description, but what we
1692 // care about is that...
1693 //   Take the encoded stuff in groups of 4 characters and turn each
1694 //   character into a code 0 to 63 thus:
1695 //           A-Z map to 0 to 25
1696 //           a-z map to 26 to 51
1697 //           0-9 map to 52 to 61
1698 //           +(- for WebSafe) maps to 62
1699 //           /(_ for WebSafe) maps to 63
1700 //   There will be four numbers, all less than 64 which can be represented
1701 //   by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
1702 //   Arrange the 6 digit binary numbers into three bytes as such:
1703 //   aaaaaabb bbbbcccc ccdddddd
1704 //   Equals signs (one or two) are used at the end of the encoded block to
1705 //   indicate that the text was not an integer multiple of three bytes long.
1706 // ----------------------------------------------------------------------
1707 
Base64UnescapeInternal(const char * src_param,int szsrc,char * dest,int szdest,const signed char * unbase64)1708 int Base64UnescapeInternal(const char *src_param, int szsrc,
1709                            char *dest, int szdest,
1710                            const signed char* unbase64) {
1711   static const char kPad64Equals = '=';
1712   static const char kPad64Dot = '.';
1713 
1714   int decode = 0;
1715   int destidx = 0;
1716   int state = 0;
1717   unsigned int ch = 0;
1718   unsigned int temp = 0;
1719 
1720   // If "char" is signed by default, using *src as an array index results in
1721   // accessing negative array elements. Treat the input as a pointer to
1722   // unsigned char to avoid this.
1723   const unsigned char *src = reinterpret_cast<const unsigned char*>(src_param);
1724 
1725   // The GET_INPUT macro gets the next input character, skipping
1726   // over any whitespace, and stopping when we reach the end of the
1727   // string or when we read any non-data character.  The arguments are
1728   // an arbitrary identifier (used as a label for goto) and the number
1729   // of data bytes that must remain in the input to avoid aborting the
1730   // loop.
1731 #define GET_INPUT(label, remain)                 \
1732   label:                                         \
1733     --szsrc;                                     \
1734     ch = *src++;                                 \
1735     decode = unbase64[ch];                       \
1736     if (decode < 0) {                            \
1737       if (ascii_isspace(ch) && szsrc >= remain)  \
1738         goto label;                              \
1739       state = 4 - remain;                        \
1740       break;                                     \
1741     }
1742 
1743   // if dest is null, we're just checking to see if it's legal input
1744   // rather than producing output.  (I suspect this could just be done
1745   // with a regexp...).  We duplicate the loop so this test can be
1746   // outside it instead of in every iteration.
1747 
1748   if (dest) {
1749     // This loop consumes 4 input bytes and produces 3 output bytes
1750     // per iteration.  We can't know at the start that there is enough
1751     // data left in the string for a full iteration, so the loop may
1752     // break out in the middle; if so 'state' will be set to the
1753     // number of input bytes read.
1754 
1755     while (szsrc >= 4)  {
1756       // We'll start by optimistically assuming that the next four
1757       // bytes of the string (src[0..3]) are four good data bytes
1758       // (that is, no nulls, whitespace, padding chars, or illegal
1759       // chars).  We need to test src[0..2] for nulls individually
1760       // before constructing temp to preserve the property that we
1761       // never read past a null in the string (no matter how long
1762       // szsrc claims the string is).
1763 
1764       if (!src[0] || !src[1] || !src[2] ||
1765           (temp = ((unsigned(unbase64[src[0]]) << 18) |
1766                    (unsigned(unbase64[src[1]]) << 12) |
1767                    (unsigned(unbase64[src[2]]) << 6) |
1768                    (unsigned(unbase64[src[3]])))) & 0x80000000) {
1769         // Iff any of those four characters was bad (null, illegal,
1770         // whitespace, padding), then temp's high bit will be set
1771         // (because unbase64[] is -1 for all bad characters).
1772         //
1773         // We'll back up and resort to the slower decoder, which knows
1774         // how to handle those cases.
1775 
1776         GET_INPUT(first, 4);
1777         temp = decode;
1778         GET_INPUT(second, 3);
1779         temp = (temp << 6) | decode;
1780         GET_INPUT(third, 2);
1781         temp = (temp << 6) | decode;
1782         GET_INPUT(fourth, 1);
1783         temp = (temp << 6) | decode;
1784       } else {
1785         // We really did have four good data bytes, so advance four
1786         // characters in the string.
1787 
1788         szsrc -= 4;
1789         src += 4;
1790         decode = -1;
1791         ch = '\0';
1792       }
1793 
1794       // temp has 24 bits of input, so write that out as three bytes.
1795 
1796       if (destidx+3 > szdest) return -1;
1797       dest[destidx+2] = temp;
1798       temp >>= 8;
1799       dest[destidx+1] = temp;
1800       temp >>= 8;
1801       dest[destidx] = temp;
1802       destidx += 3;
1803     }
1804   } else {
1805     while (szsrc >= 4)  {
1806       if (!src[0] || !src[1] || !src[2] ||
1807           (temp = ((unsigned(unbase64[src[0]]) << 18) |
1808                    (unsigned(unbase64[src[1]]) << 12) |
1809                    (unsigned(unbase64[src[2]]) << 6) |
1810                    (unsigned(unbase64[src[3]])))) & 0x80000000) {
1811         GET_INPUT(first_no_dest, 4);
1812         GET_INPUT(second_no_dest, 3);
1813         GET_INPUT(third_no_dest, 2);
1814         GET_INPUT(fourth_no_dest, 1);
1815       } else {
1816         szsrc -= 4;
1817         src += 4;
1818         decode = -1;
1819         ch = '\0';
1820       }
1821       destidx += 3;
1822     }
1823   }
1824 
1825 #undef GET_INPUT
1826 
1827   // if the loop terminated because we read a bad character, return
1828   // now.
1829   if (decode < 0 && ch != '\0' &&
1830       ch != kPad64Equals && ch != kPad64Dot && !ascii_isspace(ch))
1831     return -1;
1832 
1833   if (ch == kPad64Equals || ch == kPad64Dot) {
1834     // if we stopped by hitting an '=' or '.', un-read that character -- we'll
1835     // look at it again when we count to check for the proper number of
1836     // equals signs at the end.
1837     ++szsrc;
1838     --src;
1839   } else {
1840     // This loop consumes 1 input byte per iteration.  It's used to
1841     // clean up the 0-3 input bytes remaining when the first, faster
1842     // loop finishes.  'temp' contains the data from 'state' input
1843     // characters read by the first loop.
1844     while (szsrc > 0)  {
1845       --szsrc;
1846       ch = *src++;
1847       decode = unbase64[ch];
1848       if (decode < 0) {
1849         if (ascii_isspace(ch)) {
1850           continue;
1851         } else if (ch == '\0') {
1852           break;
1853         } else if (ch == kPad64Equals || ch == kPad64Dot) {
1854           // back up one character; we'll read it again when we check
1855           // for the correct number of pad characters at the end.
1856           ++szsrc;
1857           --src;
1858           break;
1859         } else {
1860           return -1;
1861         }
1862       }
1863 
1864       // Each input character gives us six bits of output.
1865       temp = (temp << 6) | decode;
1866       ++state;
1867       if (state == 4) {
1868         // If we've accumulated 24 bits of output, write that out as
1869         // three bytes.
1870         if (dest) {
1871           if (destidx+3 > szdest) return -1;
1872           dest[destidx+2] = temp;
1873           temp >>= 8;
1874           dest[destidx+1] = temp;
1875           temp >>= 8;
1876           dest[destidx] = temp;
1877         }
1878         destidx += 3;
1879         state = 0;
1880         temp = 0;
1881       }
1882     }
1883   }
1884 
1885   // Process the leftover data contained in 'temp' at the end of the input.
1886   int expected_equals = 0;
1887   switch (state) {
1888     case 0:
1889       // Nothing left over; output is a multiple of 3 bytes.
1890       break;
1891 
1892     case 1:
1893       // Bad input; we have 6 bits left over.
1894       return -1;
1895 
1896     case 2:
1897       // Produce one more output byte from the 12 input bits we have left.
1898       if (dest) {
1899         if (destidx+1 > szdest) return -1;
1900         temp >>= 4;
1901         dest[destidx] = temp;
1902       }
1903       ++destidx;
1904       expected_equals = 2;
1905       break;
1906 
1907     case 3:
1908       // Produce two more output bytes from the 18 input bits we have left.
1909       if (dest) {
1910         if (destidx+2 > szdest) return -1;
1911         temp >>= 2;
1912         dest[destidx+1] = temp;
1913         temp >>= 8;
1914         dest[destidx] = temp;
1915       }
1916       destidx += 2;
1917       expected_equals = 1;
1918       break;
1919 
1920     default:
1921       // state should have no other values at this point.
1922       GOOGLE_LOG(FATAL) << "This can't happen; base64 decoder state = " << state;
1923   }
1924 
1925   // The remainder of the string should be all whitespace, mixed with
1926   // exactly 0 equals signs, or exactly 'expected_equals' equals
1927   // signs.  (Always accepting 0 equals signs is a google extension
1928   // not covered in the RFC, as is accepting dot as the pad character.)
1929 
1930   int equals = 0;
1931   while (szsrc > 0 && *src) {
1932     if (*src == kPad64Equals || *src == kPad64Dot)
1933       ++equals;
1934     else if (!ascii_isspace(*src))
1935       return -1;
1936     --szsrc;
1937     ++src;
1938   }
1939 
1940   return (equals == 0 || equals == expected_equals) ? destidx : -1;
1941 }
1942 
1943 // The arrays below were generated by the following code
1944 // #include <sys/time.h>
1945 // #include <stdlib.h>
1946 // #include <string.h>
1947 // main()
1948 // {
1949 //   static const char Base64[] =
1950 //     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
1951 //   char *pos;
1952 //   int idx, i, j;
1953 //   printf("    ");
1954 //   for (i = 0; i < 255; i += 8) {
1955 //     for (j = i; j < i + 8; j++) {
1956 //       pos = strchr(Base64, j);
1957 //       if ((pos == NULL) || (j == 0))
1958 //         idx = -1;
1959 //       else
1960 //         idx = pos - Base64;
1961 //       if (idx == -1)
1962 //         printf(" %2d,     ", idx);
1963 //       else
1964 //         printf(" %2d/*%c*/,", idx, j);
1965 //     }
1966 //     printf("\n    ");
1967 //   }
1968 // }
1969 //
1970 // where the value of "Base64[]" was replaced by one of the base-64 conversion
1971 // tables from the functions below.
1972 static const signed char kUnBase64[] = {
1973   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1974   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1975   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1976   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1977   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1978   -1,      -1,      -1,      62/*+*/, -1,      -1,      -1,      63/*/ */,
1979   52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
1980   60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
1981   -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
1982   07/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
1983   15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
1984   23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      -1,
1985   -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
1986   33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
1987   41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
1988   49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
1989   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1990   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1991   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1992   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1993   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1994   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1995   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1996   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1997   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1998   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1999   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2000   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2001   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2002   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2003   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2004   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
2005 };
2006 static const signed char kUnWebSafeBase64[] = {
2007   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2008   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2009   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2010   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2011   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2012   -1,      -1,      -1,      -1,      -1,      62/*-*/, -1,      -1,
2013   52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
2014   60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
2015   -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
2016   07/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
2017   15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
2018   23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      63/*_*/,
2019   -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
2020   33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
2021   41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
2022   49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
2023   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2024   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2025   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2026   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2027   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2028   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2029   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2030   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2031   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2032   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2033   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2034   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2035   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2036   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2037   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2038   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
2039 };
2040 
WebSafeBase64Unescape(const char * src,int szsrc,char * dest,int szdest)2041 int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) {
2042   return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64);
2043 }
2044 
Base64UnescapeInternal(const char * src,int slen,string * dest,const signed char * unbase64)2045 static bool Base64UnescapeInternal(const char* src, int slen, string* dest,
2046                                    const signed char* unbase64) {
2047   // Determine the size of the output string.  Base64 encodes every 3 bytes into
2048   // 4 characters.  any leftover chars are added directly for good measure.
2049   // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548
2050   const int dest_len = 3 * (slen / 4) + (slen % 4);
2051 
2052   dest->resize(dest_len);
2053 
2054   // We are getting the destination buffer by getting the beginning of the
2055   // string and converting it into a char *.
2056   const int len = Base64UnescapeInternal(src, slen, string_as_array(dest),
2057                                          dest_len, unbase64);
2058   if (len < 0) {
2059     dest->clear();
2060     return false;
2061   }
2062 
2063   // could be shorter if there was padding
2064   GOOGLE_DCHECK_LE(len, dest_len);
2065   dest->erase(len);
2066 
2067   return true;
2068 }
2069 
Base64Unescape(StringPiece src,string * dest)2070 bool Base64Unescape(StringPiece src, string* dest) {
2071   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
2072 }
2073 
WebSafeBase64Unescape(StringPiece src,string * dest)2074 bool WebSafeBase64Unescape(StringPiece src, string* dest) {
2075   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
2076 }
2077 
Base64EscapeInternal(const unsigned char * src,int szsrc,char * dest,int szdest,const char * base64,bool do_padding)2078 int Base64EscapeInternal(const unsigned char *src, int szsrc,
2079                          char *dest, int szdest, const char *base64,
2080                          bool do_padding) {
2081   static const char kPad64 = '=';
2082 
2083   if (szsrc <= 0) return 0;
2084 
2085   if (szsrc * 4 > szdest * 3) return 0;
2086 
2087   char *cur_dest = dest;
2088   const unsigned char *cur_src = src;
2089 
2090   char *limit_dest = dest + szdest;
2091   const unsigned char *limit_src = src + szsrc;
2092 
2093   // Three bytes of data encodes to four characters of cyphertext.
2094   // So we can pump through three-byte chunks atomically.
2095   while (cur_src < limit_src - 3) {  // keep going as long as we have >= 32 bits
2096     uint32 in = BigEndian::Load32(cur_src) >> 8;
2097 
2098     cur_dest[0] = base64[in >> 18];
2099     in &= 0x3FFFF;
2100     cur_dest[1] = base64[in >> 12];
2101     in &= 0xFFF;
2102     cur_dest[2] = base64[in >> 6];
2103     in &= 0x3F;
2104     cur_dest[3] = base64[in];
2105 
2106     cur_dest += 4;
2107     cur_src += 3;
2108   }
2109   // To save time, we didn't update szdest or szsrc in the loop.  So do it now.
2110   szdest = limit_dest - cur_dest;
2111   szsrc = limit_src - cur_src;
2112 
2113   /* now deal with the tail (<=3 bytes) */
2114   switch (szsrc) {
2115     case 0:
2116       // Nothing left; nothing more to do.
2117       break;
2118     case 1: {
2119       // One byte left: this encodes to two characters, and (optionally)
2120       // two pad characters to round out the four-character cypherblock.
2121       if ((szdest -= 2) < 0) return 0;
2122       uint32 in = cur_src[0];
2123       cur_dest[0] = base64[in >> 2];
2124       in &= 0x3;
2125       cur_dest[1] = base64[in << 4];
2126       cur_dest += 2;
2127       if (do_padding) {
2128         if ((szdest -= 2) < 0) return 0;
2129         cur_dest[0] = kPad64;
2130         cur_dest[1] = kPad64;
2131         cur_dest += 2;
2132       }
2133       break;
2134     }
2135     case 2: {
2136       // Two bytes left: this encodes to three characters, and (optionally)
2137       // one pad character to round out the four-character cypherblock.
2138       if ((szdest -= 3) < 0) return 0;
2139       uint32 in = BigEndian::Load16(cur_src);
2140       cur_dest[0] = base64[in >> 10];
2141       in &= 0x3FF;
2142       cur_dest[1] = base64[in >> 4];
2143       in &= 0x00F;
2144       cur_dest[2] = base64[in << 2];
2145       cur_dest += 3;
2146       if (do_padding) {
2147         if ((szdest -= 1) < 0) return 0;
2148         cur_dest[0] = kPad64;
2149         cur_dest += 1;
2150       }
2151       break;
2152     }
2153     case 3: {
2154       // Three bytes left: same as in the big loop above.  We can't do this in
2155       // the loop because the loop above always reads 4 bytes, and the fourth
2156       // byte is past the end of the input.
2157       if ((szdest -= 4) < 0) return 0;
2158       uint32 in = (cur_src[0] << 16) + BigEndian::Load16(cur_src + 1);
2159       cur_dest[0] = base64[in >> 18];
2160       in &= 0x3FFFF;
2161       cur_dest[1] = base64[in >> 12];
2162       in &= 0xFFF;
2163       cur_dest[2] = base64[in >> 6];
2164       in &= 0x3F;
2165       cur_dest[3] = base64[in];
2166       cur_dest += 4;
2167       break;
2168     }
2169     default:
2170       // Should not be reached: blocks of 4 bytes are handled
2171       // in the while loop before this switch statement.
2172       GOOGLE_LOG(FATAL) << "Logic problem? szsrc = " << szsrc;
2173       break;
2174   }
2175   return (cur_dest - dest);
2176 }
2177 
2178 static const char kBase64Chars[] =
2179 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
2180 
2181 static const char kWebSafeBase64Chars[] =
2182 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
2183 
Base64Escape(const unsigned char * src,int szsrc,char * dest,int szdest)2184 int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) {
2185   return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true);
2186 }
WebSafeBase64Escape(const unsigned char * src,int szsrc,char * dest,int szdest,bool do_padding)2187 int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest,
2188                         int szdest, bool do_padding) {
2189   return Base64EscapeInternal(src, szsrc, dest, szdest,
2190                               kWebSafeBase64Chars, do_padding);
2191 }
2192 
Base64EscapeInternal(const unsigned char * src,int szsrc,string * dest,bool do_padding,const char * base64_chars)2193 void Base64EscapeInternal(const unsigned char* src, int szsrc,
2194                           string* dest, bool do_padding,
2195                           const char* base64_chars) {
2196   const int calc_escaped_size =
2197     CalculateBase64EscapedLen(szsrc, do_padding);
2198   dest->resize(calc_escaped_size);
2199   const int escaped_len = Base64EscapeInternal(src, szsrc,
2200                                                string_as_array(dest),
2201                                                dest->size(),
2202                                                base64_chars,
2203                                                do_padding);
2204   GOOGLE_DCHECK_EQ(calc_escaped_size, escaped_len);
2205   dest->erase(escaped_len);
2206 }
2207 
Base64Escape(const unsigned char * src,int szsrc,string * dest,bool do_padding)2208 void Base64Escape(const unsigned char *src, int szsrc,
2209                   string* dest, bool do_padding) {
2210   Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars);
2211 }
2212 
WebSafeBase64Escape(const unsigned char * src,int szsrc,string * dest,bool do_padding)2213 void WebSafeBase64Escape(const unsigned char *src, int szsrc,
2214                          string *dest, bool do_padding) {
2215   Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars);
2216 }
2217 
Base64Escape(StringPiece src,string * dest)2218 void Base64Escape(StringPiece src, string* dest) {
2219   Base64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2220                src.size(), dest, true);
2221 }
2222 
WebSafeBase64Escape(StringPiece src,string * dest)2223 void WebSafeBase64Escape(StringPiece src, string* dest) {
2224   WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2225                       src.size(), dest, false);
2226 }
2227 
WebSafeBase64EscapeWithPadding(StringPiece src,string * dest)2228 void WebSafeBase64EscapeWithPadding(StringPiece src, string* dest) {
2229   WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2230                       src.size(), dest, true);
2231 }
2232 
2233 // Helper to append a Unicode code point to a string as UTF8, without bringing
2234 // in any external dependencies.
EncodeAsUTF8Char(uint32 code_point,char * output)2235 int EncodeAsUTF8Char(uint32 code_point, char* output) {
2236   uint32 tmp = 0;
2237   int len = 0;
2238   if (code_point <= 0x7f) {
2239     tmp = code_point;
2240     len = 1;
2241   } else if (code_point <= 0x07ff) {
2242     tmp = 0x0000c080 |
2243         ((code_point & 0x07c0) << 2) |
2244         (code_point & 0x003f);
2245     len = 2;
2246   } else if (code_point <= 0xffff) {
2247     tmp = 0x00e08080 |
2248         ((code_point & 0xf000) << 4) |
2249         ((code_point & 0x0fc0) << 2) |
2250         (code_point & 0x003f);
2251     len = 3;
2252   } else {
2253     // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
2254     // normally only defined up to there as well.
2255     tmp = 0xf0808080 |
2256         ((code_point & 0x1c0000) << 6) |
2257         ((code_point & 0x03f000) << 4) |
2258         ((code_point & 0x000fc0) << 2) |
2259         (code_point & 0x003f);
2260     len = 4;
2261   }
2262   tmp = ghtonl(tmp);
2263   memcpy(output, reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
2264   return len;
2265 }
2266 
2267 // Table of UTF-8 character lengths, based on first byte
2268 static const unsigned char kUTF8LenTbl[256] = {
2269   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2270   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2271   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2272   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2273 
2274   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2275   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2276   2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
2277   3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
2278 };
2279 
2280 // Return length of a single UTF-8 source character
UTF8FirstLetterNumBytes(const char * src,int len)2281 int UTF8FirstLetterNumBytes(const char* src, int len) {
2282   if (len == 0) {
2283     return 0;
2284   }
2285   return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];
2286 }
2287 
2288 }  // namespace protobuf
2289 }  // namespace google
2290