1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_PLATFORM_SCANNER_H_
17 #define TENSORFLOW_CORE_PLATFORM_SCANNER_H_
18 
19 #include <string>
20 
21 #include "tensorflow/core/platform/macros.h"
22 #include "tensorflow/core/platform/str_util.h"
23 #include "tensorflow/core/platform/stringpiece.h"
24 
25 namespace tensorflow {
26 namespace strings {
27 
28 // Scanner provides simplified string parsing, in which a string is parsed as a
29 // series of scanning calls (e.g. One, Any, Many, OneLiteral, Eos), and then
30 // finally GetResult is called. If GetResult returns true, then it also returns
31 // the remaining characters and any captured substring.
32 //
33 // The range to capture can be controlled with RestartCapture and StopCapture;
34 // by default, all processed characters are captured.
35 class Scanner {
36  public:
37   // Classes of characters. Each enum name is to be read as the union of the
38   // parts - e.g., class LETTER_DIGIT means the class includes all letters and
39   // all digits.
40   //
41   // LETTER means ascii letter a-zA-Z.
42   // DIGIT means ascii digit: 0-9.
43   enum CharClass {
44     // NOTE: When adding a new CharClass, update the AllCharClasses ScannerTest
45     // in scanner_test.cc
46     ALL,
47     DIGIT,
48     LETTER,
49     LETTER_DIGIT,
50     LETTER_DIGIT_DASH_UNDERSCORE,
51     LETTER_DIGIT_DASH_DOT_SLASH,             // SLASH is / only, not backslash
52     LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE,  // SLASH is / only, not backslash
53     LETTER_DIGIT_DOT,
54     LETTER_DIGIT_DOT_PLUS_MINUS,
55     LETTER_DIGIT_DOT_UNDERSCORE,
56     LETTER_DIGIT_UNDERSCORE,
57     LOWERLETTER,
58     LOWERLETTER_DIGIT,
59     LOWERLETTER_DIGIT_UNDERSCORE,
60     NON_ZERO_DIGIT,
61     SPACE,
62     UPPERLETTER,
63     RANGLE,
64   };
65 
Scanner(StringPiece source)66   explicit Scanner(StringPiece source) : cur_(source) { RestartCapture(); }
67 
68   // Consume the next character of the given class from input. If the next
69   // character is not in the class, then GetResult will ultimately return false.
One(CharClass clz)70   Scanner& One(CharClass clz) {
71     if (cur_.empty() || !Matches(clz, cur_[0])) {
72       return Error();
73     }
74     cur_.remove_prefix(1);
75     return *this;
76   }
77 
78   // Consume the next s.size() characters of the input, if they match <s>. If
79   // they don't match <s>, this is a no-op.
ZeroOrOneLiteral(StringPiece s)80   Scanner& ZeroOrOneLiteral(StringPiece s) {
81     str_util::ConsumePrefix(&cur_, s);
82     return *this;
83   }
84 
85   // Consume the next s.size() characters of the input, if they match <s>. If
86   // they don't match <s>, then GetResult will ultimately return false.
OneLiteral(StringPiece s)87   Scanner& OneLiteral(StringPiece s) {
88     if (!str_util::ConsumePrefix(&cur_, s)) {
89       error_ = true;
90     }
91     return *this;
92   }
93 
94   // Consume characters from the input as long as they match <clz>. Zero
95   // characters is still considered a match, so it will never cause GetResult to
96   // return false.
Any(CharClass clz)97   Scanner& Any(CharClass clz) {
98     while (!cur_.empty() && Matches(clz, cur_[0])) {
99       cur_.remove_prefix(1);
100     }
101     return *this;
102   }
103 
104   // Shorthand for One(clz).Any(clz).
Many(CharClass clz)105   Scanner& Many(CharClass clz) { return One(clz).Any(clz); }
106 
107   // Reset the capture start point.
108   //
109   // Later, when GetResult is called and if it returns true, the capture
110   // returned will start at the position at the time this was called.
RestartCapture()111   Scanner& RestartCapture() {
112     capture_start_ = cur_.data();
113     capture_end_ = nullptr;
114     return *this;
115   }
116 
117   // Stop capturing input.
118   //
119   // Later, when GetResult is called and if it returns true, the capture
120   // returned will end at the position at the time this was called.
StopCapture()121   Scanner& StopCapture() {
122     capture_end_ = cur_.data();
123     return *this;
124   }
125 
126   // If not at the input of input, then GetResult will ultimately return false.
Eos()127   Scanner& Eos() {
128     if (!cur_.empty()) error_ = true;
129     return *this;
130   }
131 
132   // Shorthand for Any(SPACE).
AnySpace()133   Scanner& AnySpace() { return Any(SPACE); }
134 
135   // This scans input until <end_ch> is reached. <end_ch> is NOT consumed.
ScanUntil(char end_ch)136   Scanner& ScanUntil(char end_ch) {
137     ScanUntilImpl(end_ch, false);
138     return *this;
139   }
140 
141   // This scans input until <end_ch> is reached. <end_ch> is NOT consumed.
142   // Backslash escape sequences are skipped.
143   // Used for implementing quoted string scanning.
ScanEscapedUntil(char end_ch)144   Scanner& ScanEscapedUntil(char end_ch) {
145     ScanUntilImpl(end_ch, true);
146     return *this;
147   }
148 
149   // Return the next character that will be scanned, or <default_value> if there
150   // are no more characters to scan.
151   // Note that if a scan operation has failed (so GetResult() returns false),
152   // then the value of Peek may or may not have advanced since the scan
153   // operation that failed.
154   char Peek(char default_value = '\0') const {
155     return cur_.empty() ? default_value : cur_[0];
156   }
157 
158   // Returns false if there are no remaining characters to consume.
empty()159   int empty() const { return cur_.empty(); }
160 
161   // Returns true if the input string successfully matched. When true is
162   // returned, the remaining string is returned in <remaining> and the captured
163   // string returned in <capture>, if non-NULL.
164   bool GetResult(StringPiece* remaining = nullptr,
165                  StringPiece* capture = nullptr);
166 
167  private:
168   void ScanUntilImpl(char end_ch, bool escaped);
169 
Error()170   Scanner& Error() {
171     error_ = true;
172     return *this;
173   }
174 
IsLetter(char ch)175   static bool IsLetter(char ch) {
176     return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
177   }
178 
IsLowerLetter(char ch)179   static bool IsLowerLetter(char ch) { return ch >= 'a' && ch <= 'z'; }
180 
IsDigit(char ch)181   static bool IsDigit(char ch) { return ch >= '0' && ch <= '9'; }
182 
IsSpace(char ch)183   static bool IsSpace(char ch) {
184     return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\v' || ch == '\f' ||
185             ch == '\r');
186   }
187 
Matches(CharClass clz,char ch)188   static bool Matches(CharClass clz, char ch) {
189     switch (clz) {
190       case ALL:
191         return true;
192       case DIGIT:
193         return IsDigit(ch);
194       case LETTER:
195         return IsLetter(ch);
196       case LETTER_DIGIT:
197         return IsLetter(ch) || IsDigit(ch);
198       case LETTER_DIGIT_DASH_UNDERSCORE:
199         return (IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '_');
200       case LETTER_DIGIT_DASH_DOT_SLASH:
201         return IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '.' ||
202                ch == '/';
203       case LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE:
204         return (IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '.' ||
205                 ch == '/' || ch == '_');
206       case LETTER_DIGIT_DOT:
207         return IsLetter(ch) || IsDigit(ch) || ch == '.';
208       case LETTER_DIGIT_DOT_PLUS_MINUS:
209         return IsLetter(ch) || IsDigit(ch) || ch == '+' || ch == '-' ||
210                ch == '.';
211       case LETTER_DIGIT_DOT_UNDERSCORE:
212         return IsLetter(ch) || IsDigit(ch) || ch == '.' || ch == '_';
213       case LETTER_DIGIT_UNDERSCORE:
214         return IsLetter(ch) || IsDigit(ch) || ch == '_';
215       case LOWERLETTER:
216         return ch >= 'a' && ch <= 'z';
217       case LOWERLETTER_DIGIT:
218         return IsLowerLetter(ch) || IsDigit(ch);
219       case LOWERLETTER_DIGIT_UNDERSCORE:
220         return IsLowerLetter(ch) || IsDigit(ch) || ch == '_';
221       case NON_ZERO_DIGIT:
222         return IsDigit(ch) && ch != '0';
223       case SPACE:
224         return IsSpace(ch);
225       case UPPERLETTER:
226         return ch >= 'A' && ch <= 'Z';
227       case RANGLE:
228         return ch == '>';
229     }
230     return false;
231   }
232 
233   StringPiece cur_;
234   const char* capture_start_ = nullptr;
235   const char* capture_end_ = nullptr;
236   bool error_ = false;
237 
238   friend class ScannerTest;
239   TF_DISALLOW_COPY_AND_ASSIGN(Scanner);
240 };
241 
242 }  // namespace strings
243 }  // namespace tensorflow
244 
245 #endif  // TENSORFLOW_CORE_PLATFORM_SCANNER_H_
246