1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/lib/strings/str_util.h"
17 
18 #include <ctype.h>
19 #include <algorithm>
20 #include <cstring>
21 #include <vector>
22 #include "tensorflow/core/lib/strings/numbers.h"
23 #include "tensorflow/core/lib/strings/stringprintf.h"
24 #include "tensorflow/core/platform/logging.h"
25 
26 namespace tensorflow {
27 namespace str_util {
28 
static char hex_char[] = "0123456789abcdef";

// Returns a copy of `src` in which every byte that is awkward inside a C
// string literal has been replaced by an escape sequence.
//
// Newline, carriage return, tab, both quote characters and the backslash are
// written as their usual two-character escapes. Any other byte that is
// >= 0x80 or that isprint() rejects is written as a backslash followed by
// exactly three octal digits. All remaining bytes are copied through as-is.
string CEscape(StringPiece src) {
  string escaped;

  for (const unsigned char byte : src) {
    if (byte == '\n') {
      escaped.append("\\n");
    } else if (byte == '\r') {
      escaped.append("\\r");
    } else if (byte == '\t') {
      escaped.append("\\t");
    } else if (byte == '\"') {
      escaped.append("\\\"");
    } else if (byte == '\'') {
      escaped.append("\\'");
    } else if (byte == '\\') {
      escaped.append("\\\\");
    } else if ((byte >= 0x80) || !isprint(byte)) {
      // The escape is always three octal digits wide, so a digit that follows
      // it in the output cannot be read as part of the character code. Only
      // entries 0..7 of the digit table are ever indexed here.
      escaped.append("\\");
      escaped.push_back(hex_char[byte / 64]);
      escaped.push_back(hex_char[(byte % 64) / 8]);
      escaped.push_back(hex_char[byte % 8]);
    } else {
      escaped.push_back(byte);
    }
  }

  return escaped;
}
72 
73 namespace {  // Private helpers for CUnescape().
74 
// Reports whether `c` is one of the ASCII characters '0' through '7'.
inline bool is_octal_digit(unsigned char c) {
  return !(c < '0' || c > '7');
}
76 
// Reports whether `c` is an ASCII hexadecimal digit: 0-9, a-f or A-F.
// Unlike isxdigit() this does not consult the current locale.
inline bool ascii_isxdigit(unsigned char c) {
  if (c >= '0' && c <= '9') return true;
  if (c >= 'a' && c <= 'f') return true;
  return c >= 'A' && c <= 'F';
}
81 
// Converts an ASCII hexadecimal digit to its numeric value (0..15).
// The result is only meaningful when `c` is a hex digit; no validation is
// performed here.
inline int hex_digit_to_int(char c) {
  const int code = static_cast<unsigned char>(c);
  // For '0'..'9' the low nibble already equals the digit value. For the
  // letters the low nibble of 'a'/'A' is 1, so adding 9 first makes it 10.
  const int adjusted = (code > '9') ? code + 9 : code;
  return adjusted & 0xf;
}
89 
CUnescapeInternal(StringPiece source,string * dest,string::size_type * dest_len,string * error)90 bool CUnescapeInternal(StringPiece source, string* dest,
91                        string::size_type* dest_len, string* error) {
92   const char* p = source.data();
93   const char* end = source.end();
94   const char* last_byte = end - 1;
95 
96   // We are going to write the result to dest with its iterator. If our string
97   // implementation uses copy-on-write, this will trigger a copy-on-write of
98   // dest's buffer; that is, dest will be assigned a new buffer.
99   //
100   // Note that the following way is NOT a legal way to modify a string's
101   // content:
102   //
103   //  char* d = const_cast<char*>(dest->data());
104   //
105   // This won't trigger copy-on-write of the string, and so is dangerous when
106   // the buffer is shared.
107   auto d = dest->begin();
108 
109   // Small optimization for case where source = dest and there's no escaping
110   if (source.data() == dest->data()) {
111     while (p < end && *p != '\\') {
112       p++;
113       d++;
114     }
115   }
116 
117   while (p < end) {
118     if (*p != '\\') {
119       *d++ = *p++;
120     } else {
121       if (++p > last_byte) {  // skip past the '\\'
122         if (error) *error = "String cannot end with \\";
123         return false;
124       }
125       switch (*p) {
126         case 'a':
127           *d++ = '\a';
128           break;
129         case 'b':
130           *d++ = '\b';
131           break;
132         case 'f':
133           *d++ = '\f';
134           break;
135         case 'n':
136           *d++ = '\n';
137           break;
138         case 'r':
139           *d++ = '\r';
140           break;
141         case 't':
142           *d++ = '\t';
143           break;
144         case 'v':
145           *d++ = '\v';
146           break;
147         case '\\':
148           *d++ = '\\';
149           break;
150         case '?':
151           *d++ = '\?';
152           break;  // \?  Who knew?
153         case '\'':
154           *d++ = '\'';
155           break;
156         case '"':
157           *d++ = '\"';
158           break;
159         case '0':
160         case '1':
161         case '2':
162         case '3':  // octal digit: 1 to 3 digits
163         case '4':
164         case '5':
165         case '6':
166         case '7': {
167           const char* octal_start = p;
168           unsigned int ch = *p - '0';
169           if (p < last_byte && is_octal_digit(p[1])) ch = ch * 8 + *++p - '0';
170           if (p < last_byte && is_octal_digit(p[1]))
171             ch = ch * 8 + *++p - '0';  // now points at last digit
172           if (ch > 0xff) {
173             if (error) {
174               *error = "Value of \\" +
175                        string(octal_start, p + 1 - octal_start) +
176                        " exceeds 0xff";
177             }
178             return false;
179           }
180           *d++ = ch;
181           break;
182         }
183         case 'x':
184         case 'X': {
185           if (p >= last_byte) {
186             if (error) *error = "String cannot end with \\x";
187             return false;
188           } else if (!ascii_isxdigit(p[1])) {
189             if (error) *error = "\\x cannot be followed by a non-hex digit";
190             return false;
191           }
192           unsigned int ch = 0;
193           const char* hex_start = p;
194           while (p < last_byte && ascii_isxdigit(p[1]))
195             // Arbitrarily many hex digits
196             ch = (ch << 4) + hex_digit_to_int(*++p);
197           if (ch > 0xFF) {
198             if (error) {
199               *error = "Value of \\" + string(hex_start, p + 1 - hex_start) +
200                        " exceeds 0xff";
201             }
202             return false;
203           }
204           *d++ = ch;
205           break;
206         }
207         default: {
208           if (error) *error = string("Unknown escape sequence: \\") + *p;
209           return false;
210         }
211       }
212       p++;  // read past letter we escaped
213     }
214   }
215   *dest_len = d - dest->begin();
216   return true;
217 }
218 
219 template <typename T>
SplitAndParseAsInts(StringPiece text,char delim,std::function<bool (StringPiece,T *)> converter,std::vector<T> * result)220 bool SplitAndParseAsInts(StringPiece text, char delim,
221                          std::function<bool(StringPiece, T*)> converter,
222                          std::vector<T>* result) {
223   result->clear();
224   std::vector<string> num_strings = Split(text, delim);
225   for (const auto& s : num_strings) {
226     T num;
227     if (!converter(s, &num)) return false;
228     result->push_back(num);
229   }
230   return true;
231 }
232 
233 }  // namespace
234 
CUnescape(StringPiece source,string * dest,string * error)235 bool CUnescape(StringPiece source, string* dest, string* error) {
236   dest->resize(source.size());
237   string::size_type dest_size;
238   if (!CUnescapeInternal(source, dest, &dest_size, error)) {
239     return false;
240   }
241   dest->erase(dest_size);
242   return true;
243 }
244 
// Removes every trailing whitespace character (as classified by isspace())
// from `*s` in place.
void StripTrailingWhitespace(string* s) {
  string::size_type keep = s->size();
  // isspace() is only defined for values representable as unsigned char (or
  // EOF); a plain char holding a byte >= 0x80 may be negative, so convert
  // before classifying.
  while (keep > 0 && isspace(static_cast<unsigned char>((*s)[keep - 1]))) {
    --keep;
  }
  s->resize(keep);
}
251 
// Returns a copy of `s` with every character mapped through tolower().
string Lowercase(StringPiece s) {
  string result(s.data(), s.size());
  for (char& c : result) {
    // tolower() is only defined for values representable as unsigned char
    // (or EOF); convert first so bytes >= 0x80 are handled safely.
    c = static_cast<char>(tolower(static_cast<unsigned char>(c)));
  }
  return result;
}
260 
// Returns a copy of `s` with every character mapped through toupper().
string Uppercase(StringPiece s) {
  string result(s.data(), s.size());
  for (char& c : result) {
    // toupper() is only defined for values representable as unsigned char
    // (or EOF); convert first so bytes >= 0x80 are handled safely.
    c = static_cast<char>(toupper(static_cast<unsigned char>(c)));
  }
  return result;
}
269 
ArgDefCase(StringPiece s)270 string ArgDefCase(StringPiece s) {
271   const size_t n = s.size();
272 
273   // Compute the size of resulting string.
274   // Number of extra underscores we will need to add.
275   size_t extra_us = 0;
276   // Number of non-alpha chars in the beginning to skip.
277   size_t to_skip = 0;
278   for (size_t i = 0; i < n; ++i) {
279     // If we are skipping and current letter is non-alpha, skip it as well
280     if (i == to_skip && !isalpha(s[i])) {
281       ++to_skip;
282       continue;
283     }
284 
285     // If we are here, we are not skipping any more.
286     // If this letter is upper case, not the very first char in the
287     // resulting string, and previous letter isn't replaced with an underscore,
288     // we will need to insert an underscore.
289     if (isupper(s[i]) && i != to_skip && i > 0 && isalnum(s[i - 1])) {
290       ++extra_us;
291     }
292   }
293 
294   // Initialize result with all '_'s. There is no string
295   // constructor that does not initialize memory.
296   string result(n + extra_us - to_skip, '_');
297   // i - index into s
298   // j - index into result
299   for (size_t i = to_skip, j = 0; i < n; ++i, ++j) {
300     DCHECK_LT(j, result.size());
301     char c = s[i];
302     // If c is not alphanumeric, we don't need to do anything
303     // since there is already an underscore in its place.
304     if (isalnum(c)) {
305       if (isupper(c)) {
306         // If current char is upper case, we might need to insert an
307         // underscore.
308         if (i != to_skip) {
309           DCHECK_GT(j, 0);
310           if (result[j - 1] != '_') ++j;
311         }
312         result[j] = tolower(c);
313       } else {
314         result[j] = c;
315       }
316     }
317   }
318 
319   return result;
320 }
321 
// Upper-cases, in place, the first character of `*s` and every character that
// immediately follows one of the characters in `delimiters`. All other
// characters are left unchanged.
void TitlecaseString(string* s, StringPiece delimiters) {
  bool upper = true;
  for (char& c : *s) {
    if (upper) {
      // toupper() is only defined for values representable as unsigned char
      // (or EOF); convert first so bytes >= 0x80 are handled safely.
      c = static_cast<char>(toupper(static_cast<unsigned char>(c)));
    }
    // The next character is capitalized only if this one is a delimiter.
    upper = (delimiters.find(c) != StringPiece::npos);
  }
}
331 
// Returns a copy of `s` in which occurrences of `oldsub` are replaced by
// `newsub`. Only the first occurrence is replaced unless `replace_all` is
// true. Replaced text is never rescanned. An empty `oldsub` matches at the
// beginning of the text and after every byte.
string StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub,
                     bool replace_all) {
  // TODO(jlebar): We could avoid having to shift data around in the string if
  // we had a StringPiece::find() overload that searched for a StringPiece.
  string out(s);
  // After a replacement the search resumes just past the inserted text. For
  // an empty pattern one more byte is skipped so the scan keeps advancing.
  const size_t advance = newsub.size() + (oldsub.empty() ? 1 : 0);
  for (size_t at = out.find(oldsub.data(), 0, oldsub.size());
       at != string::npos;
       at = out.find(oldsub.data(), at + advance, oldsub.size())) {
    out.replace(at, oldsub.size(), newsub.data(), newsub.size());
    if (!replace_all) {
      break;
    }
  }
  return out;
}
350 
// Advances `*text` past any leading whitespace (as classified by isspace())
// and returns the number of characters removed.
size_t RemoveLeadingWhitespace(StringPiece* text) {
  const char* data = text->data();
  const size_t size = text->size();
  size_t count = 0;
  // isspace() is only defined for values representable as unsigned char (or
  // EOF), so convert each byte before classifying it.
  while (count < size && isspace(static_cast<unsigned char>(data[count]))) {
    ++count;
  }
  text->remove_prefix(count);
  return count;
}
361 
// Shrinks `*text` to drop any trailing whitespace (as classified by
// isspace()) and returns the number of characters removed.
size_t RemoveTrailingWhitespace(StringPiece* text) {
  const char* data = text->data();
  const size_t size = text->size();
  size_t count = 0;
  // Index from the back instead of walking a pointer that starts at
  // data + size - 1: for an empty view that pointer would lie before the
  // buffer. The cast keeps isspace() defined for bytes >= 0x80.
  while (count < size &&
         isspace(static_cast<unsigned char>(data[size - 1 - count]))) {
    ++count;
  }
  text->remove_suffix(count);
  return count;
}
372 
RemoveWhitespaceContext(StringPiece * text)373 size_t RemoveWhitespaceContext(StringPiece* text) {
374   // use RemoveLeadingWhitespace() and RemoveTrailingWhitespace() to do the job
375   return (RemoveLeadingWhitespace(text) + RemoveTrailingWhitespace(text));
376 }
377 
ConsumePrefix(StringPiece * s,StringPiece expected)378 bool ConsumePrefix(StringPiece* s, StringPiece expected) {
379   if (StartsWith(*s, expected)) {
380     s->remove_prefix(expected.size());
381     return true;
382   }
383   return false;
384 }
385 
ConsumeSuffix(StringPiece * s,StringPiece expected)386 bool ConsumeSuffix(StringPiece* s, StringPiece expected) {
387   if (EndsWith(*s, expected)) {
388     s->remove_suffix(expected.size());
389     return true;
390   }
391   return false;
392 }
393 
// Parses the run of decimal digits at the start of `*s`.
//
// On success stores the parsed value in `*val`, removes the digits from
// `*s`, and returns true. Returns false, leaving `*s` and `*val` untouched,
// if `*s` does not start with a digit or the value does not fit in uint64.
bool ConsumeLeadingDigits(StringPiece* s, uint64* val) {
  const char* const begin = s->data();
  const char* const limit = begin + s->size();
  const char* cursor = begin;
  uint64 value = 0;
  for (; cursor < limit; ++cursor) {
    const char c = *cursor;
    if (c < '0' || c > '9') break;
    const uint64 next = (value * 10) + (c - '0');
    // Unsigned arithmetic wraps on overflow. Without wrapping, next is at
    // least 10 * value, so next / 8 >= value; a wrapped result always falls
    // below 8 * value, which is what this test detects.
    if (next / 8 < value) {
      return false;
    }
    value = next;
  }
  if (cursor == begin) {
    // No digit was seen.
    return false;
  }
  s->remove_prefix(cursor - begin);
  *val = value;
  return true;
}
418 
// Splits off the run of non-whitespace characters at the start of `*s`.
//
// If that run is non-empty, stores it in `*val`, removes it from `*s`, and
// returns true. Otherwise sets `*val` to an empty StringPiece, leaves `*s`
// untouched, and returns false.
bool ConsumeNonWhitespace(StringPiece* s, StringPiece* val) {
  const char* p = s->data();
  const char* limit = p + s->size();
  // isspace() is only defined for values representable as unsigned char (or
  // EOF), so convert each byte before classifying it.
  while (p < limit && !isspace(static_cast<unsigned char>(*p))) {
    ++p;
  }
  const size_t n = p - s->data();
  if (n == 0) {
    *val = StringPiece();
    return false;
  }
  *val = StringPiece(s->data(), n);
  s->remove_prefix(n);
  return true;
}
437 
SplitAndParseAsInts(StringPiece text,char delim,std::vector<int32> * result)438 bool SplitAndParseAsInts(StringPiece text, char delim,
439                          std::vector<int32>* result) {
440   return SplitAndParseAsInts<int32>(text, delim, strings::safe_strto32, result);
441 }
442 
SplitAndParseAsInts(StringPiece text,char delim,std::vector<int64> * result)443 bool SplitAndParseAsInts(StringPiece text, char delim,
444                          std::vector<int64>* result) {
445   return SplitAndParseAsInts<int64>(text, delim, strings::safe_strto64, result);
446 }
447 
SplitAndParseAsFloats(StringPiece text,char delim,std::vector<float> * result)448 bool SplitAndParseAsFloats(StringPiece text, char delim,
449                            std::vector<float>* result) {
450   return SplitAndParseAsInts<float>(text, delim,
451                                     [](StringPiece str, float* value) {
452                                       return strings::safe_strtof(str, value);
453                                     },
454                                     result);
455 }
456 
// Returns the length of the C string `str`, examining at most
// `string_max_len` bytes. If no terminating NUL is found within that many
// bytes, returns `string_max_len`.
size_t Strnlen(const char* str, const size_t string_max_len) {
  const char* cursor = str;
  for (size_t remaining = string_max_len;
       remaining > 0 && *cursor != '\0'; --remaining) {
    ++cursor;
  }
  return static_cast<size_t>(cursor - str);
}
464 
// Returns true if `needle` occurs anywhere in `haystack`. An empty `needle`
// is contained in every string, including the empty one.
bool StrContains(StringPiece haystack, StringPiece needle) {
  // std::search returns haystack.begin() for an empty needle. When haystack
  // is empty too, that equals haystack.end() and would be misreported as
  // "not found", so handle the empty needle up front.
  if (needle.empty()) return true;
  return std::search(haystack.begin(), haystack.end(), needle.begin(),
                     needle.end()) != haystack.end();
}
469 
// Returns true if `text` begins with `prefix`. An empty `prefix` always
// matches.
bool StartsWith(StringPiece text, StringPiece prefix) {
  const size_t prefix_len = prefix.size();
  if (prefix_len == 0) return true;
  if (text.size() < prefix_len) return false;
  return memcmp(text.data(), prefix.data(), prefix_len) == 0;
}
475 
// Returns true if `text` ends with `suffix`. An empty `suffix` always
// matches.
bool EndsWith(StringPiece text, StringPiece suffix) {
  const size_t suffix_len = suffix.size();
  if (suffix_len == 0) return true;
  if (text.size() < suffix_len) return false;
  const char* tail = text.data() + (text.size() - suffix_len);
  return memcmp(tail, suffix.data(), suffix_len) == 0;
}
481 
482 }  // namespace str_util
483 }  // namespace tensorflow
484