1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_LIB_STRINGS_SCANNER_H_ 17 #define TENSORFLOW_LIB_STRINGS_SCANNER_H_ 18 19 #include <string> 20 #include "tensorflow/core/lib/core/stringpiece.h" 21 #include "tensorflow/core/platform/macros.h" 22 23 namespace tensorflow { 24 namespace strings { 25 26 // Scanner provides simplified string parsing, in which a string is parsed as a 27 // series of scanning calls (e.g. One, Any, Many, OneLiteral, Eos), and then 28 // finally GetResult is called. If GetResult returns true, then it also returns 29 // the remaining characters and any captured substring. 30 // 31 // The range to capture can be controlled with RestartCapture and StopCapture; 32 // by default, all processed characters are captured. 33 class Scanner { 34 public: 35 // Classes of characters. Each enum name is to be read as the union of the 36 // parts - e.g., class LETTER_DIGIT means the class includes all letters and 37 // all digits. 38 // 39 // LETTER means ascii letter a-zA-Z. 40 // DIGIT means ascii digit: 0-9. 41 enum CharClass { 42 // NOTE: When adding a new CharClass, update the AllCharClasses ScannerTest 43 // in scanner_test.cc 44 ALL, 45 DIGIT, 46 LETTER, 47 LETTER_DIGIT, 48 LETTER_DIGIT_DASH_UNDERSCORE, 49 LETTER_DIGIT_DASH_DOT_SLASH, // SLASH is / only, not backslash 50 LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE, // SLASH is / only, not backslash 51 LETTER_DIGIT_DOT, 52 LETTER_DIGIT_DOT_PLUS_MINUS, 53 LETTER_DIGIT_DOT_UNDERSCORE, 54 LETTER_DIGIT_UNDERSCORE, 55 LOWERLETTER, 56 LOWERLETTER_DIGIT, 57 LOWERLETTER_DIGIT_UNDERSCORE, 58 NON_ZERO_DIGIT, 59 SPACE, 60 UPPERLETTER, 61 }; 62 Scanner(StringPiece source)63 explicit Scanner(StringPiece source) : cur_(source) { RestartCapture(); } 64 65 // Consume the next character of the given class from input. If the next 66 // character is not in the class, then GetResult will ultimately return false. One(CharClass clz)67 Scanner& One(CharClass clz) { 68 if (cur_.empty() || !Matches(clz, cur_[0])) { 69 return Error(); 70 } 71 cur_.remove_prefix(1); 72 return *this; 73 } 74 75 // Consume the next s.size() characters of the input, if they match <s>. If 76 // they don't match <s>, this is a no-op. ZeroOrOneLiteral(StringPiece s)77 Scanner& ZeroOrOneLiteral(StringPiece s) { 78 cur_.Consume(s); 79 return *this; 80 } 81 82 // Consume the next s.size() characters of the input, if they match <s>. If 83 // they don't match <s>, then GetResult will ultimately return false. OneLiteral(StringPiece s)84 Scanner& OneLiteral(StringPiece s) { 85 if (!cur_.Consume(s)) { 86 error_ = true; 87 } 88 return *this; 89 } 90 91 // Consume characters from the input as long as they match <clz>. Zero 92 // characters is still considered a match, so it will never cause GetResult to 93 // return false. Any(CharClass clz)94 Scanner& Any(CharClass clz) { 95 while (!cur_.empty() && Matches(clz, cur_[0])) { 96 cur_.remove_prefix(1); 97 } 98 return *this; 99 } 100 101 // Shorthand for One(clz).Any(clz). Many(CharClass clz)102 Scanner& Many(CharClass clz) { return One(clz).Any(clz); } 103 104 // Reset the capture start point. 105 // 106 // Later, when GetResult is called and if it returns true, the capture 107 // returned will start at the position at the time this was called. RestartCapture()108 Scanner& RestartCapture() { 109 capture_start_ = cur_.data(); 110 capture_end_ = nullptr; 111 return *this; 112 } 113 114 // Stop capturing input. 115 // 116 // Later, when GetResult is called and if it returns true, the capture 117 // returned will end at the position at the time this was called. StopCapture()118 Scanner& StopCapture() { 119 capture_end_ = cur_.data(); 120 return *this; 121 } 122 123 // If not at the input of input, then GetResult will ultimately return false. Eos()124 Scanner& Eos() { 125 if (!cur_.empty()) error_ = true; 126 return *this; 127 } 128 129 // Shorthand for Any(SPACE). AnySpace()130 Scanner& AnySpace() { return Any(SPACE); } 131 132 // This scans input until <end_ch> is reached. <end_ch> is NOT consumed. ScanUntil(char end_ch)133 Scanner& ScanUntil(char end_ch) { 134 ScanUntilImpl(end_ch, false); 135 return *this; 136 } 137 138 // This scans input until <end_ch> is reached. <end_ch> is NOT consumed. 139 // Backslash escape sequences are skipped. 140 // Used for implementing quoted string scanning. ScanEscapedUntil(char end_ch)141 Scanner& ScanEscapedUntil(char end_ch) { 142 ScanUntilImpl(end_ch, true); 143 return *this; 144 } 145 146 // Return the next character that will be scanned, or <default_value> if there 147 // are no more characters to scan. 148 // Note that if a scan operation has failed (so GetResult() returns false), 149 // then the value of Peek may or may not have advanced since the scan 150 // operation that failed. 151 char Peek(char default_value = '\0') const { 152 return cur_.empty() ? default_value : cur_[0]; 153 } 154 155 // Returns false if there are no remaining characters to consume. empty()156 int empty() const { return cur_.empty(); } 157 158 // Returns true if the input string successfully matched. When true is 159 // returned, the remaining string is returned in <remaining> and the captured 160 // string returned in <capture>, if non-NULL. 161 bool GetResult(StringPiece* remaining = nullptr, 162 StringPiece* capture = nullptr); 163 164 private: 165 void ScanUntilImpl(char end_ch, bool escaped); 166 Error()167 Scanner& Error() { 168 error_ = true; 169 return *this; 170 } 171 IsLetter(char ch)172 static bool IsLetter(char ch) { 173 return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); 174 } 175 IsLowerLetter(char ch)176 static bool IsLowerLetter(char ch) { return ch >= 'a' && ch <= 'z'; } 177 IsDigit(char ch)178 static bool IsDigit(char ch) { return ch >= '0' && ch <= '9'; } 179 IsSpace(char ch)180 static bool IsSpace(char ch) { 181 return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\v' || ch == '\f' || 182 ch == '\r'); 183 } 184 Matches(CharClass clz,char ch)185 static bool Matches(CharClass clz, char ch) { 186 switch (clz) { 187 case ALL: 188 return true; 189 case DIGIT: 190 return IsDigit(ch); 191 case LETTER: 192 return IsLetter(ch); 193 case LETTER_DIGIT: 194 return IsLetter(ch) || IsDigit(ch); 195 case LETTER_DIGIT_DASH_UNDERSCORE: 196 return (IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '_'); 197 case LETTER_DIGIT_DASH_DOT_SLASH: 198 return IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '.' || 199 ch == '/'; 200 case LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE: 201 return (IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '.' || 202 ch == '/' || ch == '_'); 203 case LETTER_DIGIT_DOT: 204 return IsLetter(ch) || IsDigit(ch) || ch == '.'; 205 case LETTER_DIGIT_DOT_PLUS_MINUS: 206 return IsLetter(ch) || IsDigit(ch) || ch == '+' || ch == '-' || 207 ch == '.'; 208 case LETTER_DIGIT_DOT_UNDERSCORE: 209 return IsLetter(ch) || IsDigit(ch) || ch == '.' || ch == '_'; 210 case LETTER_DIGIT_UNDERSCORE: 211 return IsLetter(ch) || IsDigit(ch) || ch == '_'; 212 case LOWERLETTER: 213 return ch >= 'a' && ch <= 'z'; 214 case LOWERLETTER_DIGIT: 215 return IsLowerLetter(ch) || IsDigit(ch); 216 case LOWERLETTER_DIGIT_UNDERSCORE: 217 return IsLowerLetter(ch) || IsDigit(ch) || ch == '_'; 218 case NON_ZERO_DIGIT: 219 return IsDigit(ch) && ch != '0'; 220 case SPACE: 221 return IsSpace(ch); 222 case UPPERLETTER: 223 return ch >= 'A' && ch <= 'Z'; 224 } 225 return false; 226 } 227 228 StringPiece cur_; 229 const char* capture_start_ = nullptr; 230 const char* capture_end_ = nullptr; 231 bool error_ = false; 232 233 friend class ScannerTest; 234 TF_DISALLOW_COPY_AND_ASSIGN(Scanner); 235 }; 236 237 } // namespace strings 238 } // namespace tensorflow 239 240 #endif // TENSORFLOW_LIB_STRINGS_SCANNER_H_ 241