1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_LIB_STRINGS_SCANNER_H_
17 #define TENSORFLOW_LIB_STRINGS_SCANNER_H_
18 
19 #include <string>
20 #include "tensorflow/core/lib/core/stringpiece.h"
21 #include "tensorflow/core/lib/strings/str_util.h"
22 #include "tensorflow/core/platform/macros.h"
23 
24 namespace tensorflow {
25 namespace strings {
26 
27 // Scanner provides simplified string parsing, in which a string is parsed as a
28 // series of scanning calls (e.g. One, Any, Many, OneLiteral, Eos), and then
29 // finally GetResult is called. If GetResult returns true, then it also returns
30 // the remaining characters and any captured substring.
31 //
32 // The range to capture can be controlled with RestartCapture and StopCapture;
33 // by default, all processed characters are captured.
34 class Scanner {
35  public:
36   // Classes of characters. Each enum name is to be read as the union of the
37   // parts - e.g., class LETTER_DIGIT means the class includes all letters and
38   // all digits.
39   //
40   // LETTER means ascii letter a-zA-Z.
41   // DIGIT means ascii digit: 0-9.
42   enum CharClass {
43     // NOTE: When adding a new CharClass, update the AllCharClasses ScannerTest
44     // in scanner_test.cc
45     ALL,
46     DIGIT,
47     LETTER,
48     LETTER_DIGIT,
49     LETTER_DIGIT_DASH_UNDERSCORE,
50     LETTER_DIGIT_DASH_DOT_SLASH,             // SLASH is / only, not backslash
51     LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE,  // SLASH is / only, not backslash
52     LETTER_DIGIT_DOT,
53     LETTER_DIGIT_DOT_PLUS_MINUS,
54     LETTER_DIGIT_DOT_UNDERSCORE,
55     LETTER_DIGIT_UNDERSCORE,
56     LOWERLETTER,
57     LOWERLETTER_DIGIT,
58     LOWERLETTER_DIGIT_UNDERSCORE,
59     NON_ZERO_DIGIT,
60     SPACE,
61     UPPERLETTER,
62   };
63 
Scanner(StringPiece source)64   explicit Scanner(StringPiece source) : cur_(source) { RestartCapture(); }
65 
66   // Consume the next character of the given class from input. If the next
67   // character is not in the class, then GetResult will ultimately return false.
One(CharClass clz)68   Scanner& One(CharClass clz) {
69     if (cur_.empty() || !Matches(clz, cur_[0])) {
70       return Error();
71     }
72     cur_.remove_prefix(1);
73     return *this;
74   }
75 
76   // Consume the next s.size() characters of the input, if they match <s>. If
77   // they don't match <s>, this is a no-op.
ZeroOrOneLiteral(StringPiece s)78   Scanner& ZeroOrOneLiteral(StringPiece s) {
79     str_util::ConsumePrefix(&cur_, s);
80     return *this;
81   }
82 
83   // Consume the next s.size() characters of the input, if they match <s>. If
84   // they don't match <s>, then GetResult will ultimately return false.
OneLiteral(StringPiece s)85   Scanner& OneLiteral(StringPiece s) {
86     if (!str_util::ConsumePrefix(&cur_, s)) {
87       error_ = true;
88     }
89     return *this;
90   }
91 
92   // Consume characters from the input as long as they match <clz>. Zero
93   // characters is still considered a match, so it will never cause GetResult to
94   // return false.
Any(CharClass clz)95   Scanner& Any(CharClass clz) {
96     while (!cur_.empty() && Matches(clz, cur_[0])) {
97       cur_.remove_prefix(1);
98     }
99     return *this;
100   }
101 
102   // Shorthand for One(clz).Any(clz).
Many(CharClass clz)103   Scanner& Many(CharClass clz) { return One(clz).Any(clz); }
104 
105   // Reset the capture start point.
106   //
107   // Later, when GetResult is called and if it returns true, the capture
108   // returned will start at the position at the time this was called.
RestartCapture()109   Scanner& RestartCapture() {
110     capture_start_ = cur_.data();
111     capture_end_ = nullptr;
112     return *this;
113   }
114 
115   // Stop capturing input.
116   //
117   // Later, when GetResult is called and if it returns true, the capture
118   // returned will end at the position at the time this was called.
StopCapture()119   Scanner& StopCapture() {
120     capture_end_ = cur_.data();
121     return *this;
122   }
123 
124   // If not at the input of input, then GetResult will ultimately return false.
Eos()125   Scanner& Eos() {
126     if (!cur_.empty()) error_ = true;
127     return *this;
128   }
129 
130   // Shorthand for Any(SPACE).
AnySpace()131   Scanner& AnySpace() { return Any(SPACE); }
132 
133   // This scans input until <end_ch> is reached. <end_ch> is NOT consumed.
ScanUntil(char end_ch)134   Scanner& ScanUntil(char end_ch) {
135     ScanUntilImpl(end_ch, false);
136     return *this;
137   }
138 
139   // This scans input until <end_ch> is reached. <end_ch> is NOT consumed.
140   // Backslash escape sequences are skipped.
141   // Used for implementing quoted string scanning.
ScanEscapedUntil(char end_ch)142   Scanner& ScanEscapedUntil(char end_ch) {
143     ScanUntilImpl(end_ch, true);
144     return *this;
145   }
146 
147   // Return the next character that will be scanned, or <default_value> if there
148   // are no more characters to scan.
149   // Note that if a scan operation has failed (so GetResult() returns false),
150   // then the value of Peek may or may not have advanced since the scan
151   // operation that failed.
152   char Peek(char default_value = '\0') const {
153     return cur_.empty() ? default_value : cur_[0];
154   }
155 
156   // Returns false if there are no remaining characters to consume.
empty()157   int empty() const { return cur_.empty(); }
158 
159   // Returns true if the input string successfully matched. When true is
160   // returned, the remaining string is returned in <remaining> and the captured
161   // string returned in <capture>, if non-NULL.
162   bool GetResult(StringPiece* remaining = nullptr,
163                  StringPiece* capture = nullptr);
164 
165  private:
166   void ScanUntilImpl(char end_ch, bool escaped);
167 
Error()168   Scanner& Error() {
169     error_ = true;
170     return *this;
171   }
172 
IsLetter(char ch)173   static bool IsLetter(char ch) {
174     return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
175   }
176 
IsLowerLetter(char ch)177   static bool IsLowerLetter(char ch) { return ch >= 'a' && ch <= 'z'; }
178 
IsDigit(char ch)179   static bool IsDigit(char ch) { return ch >= '0' && ch <= '9'; }
180 
IsSpace(char ch)181   static bool IsSpace(char ch) {
182     return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\v' || ch == '\f' ||
183             ch == '\r');
184   }
185 
Matches(CharClass clz,char ch)186   static bool Matches(CharClass clz, char ch) {
187     switch (clz) {
188       case ALL:
189         return true;
190       case DIGIT:
191         return IsDigit(ch);
192       case LETTER:
193         return IsLetter(ch);
194       case LETTER_DIGIT:
195         return IsLetter(ch) || IsDigit(ch);
196       case LETTER_DIGIT_DASH_UNDERSCORE:
197         return (IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '_');
198       case LETTER_DIGIT_DASH_DOT_SLASH:
199         return IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '.' ||
200                ch == '/';
201       case LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE:
202         return (IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '.' ||
203                 ch == '/' || ch == '_');
204       case LETTER_DIGIT_DOT:
205         return IsLetter(ch) || IsDigit(ch) || ch == '.';
206       case LETTER_DIGIT_DOT_PLUS_MINUS:
207         return IsLetter(ch) || IsDigit(ch) || ch == '+' || ch == '-' ||
208                ch == '.';
209       case LETTER_DIGIT_DOT_UNDERSCORE:
210         return IsLetter(ch) || IsDigit(ch) || ch == '.' || ch == '_';
211       case LETTER_DIGIT_UNDERSCORE:
212         return IsLetter(ch) || IsDigit(ch) || ch == '_';
213       case LOWERLETTER:
214         return ch >= 'a' && ch <= 'z';
215       case LOWERLETTER_DIGIT:
216         return IsLowerLetter(ch) || IsDigit(ch);
217       case LOWERLETTER_DIGIT_UNDERSCORE:
218         return IsLowerLetter(ch) || IsDigit(ch) || ch == '_';
219       case NON_ZERO_DIGIT:
220         return IsDigit(ch) && ch != '0';
221       case SPACE:
222         return IsSpace(ch);
223       case UPPERLETTER:
224         return ch >= 'A' && ch <= 'Z';
225     }
226     return false;
227   }
228 
229   StringPiece cur_;
230   const char* capture_start_ = nullptr;
231   const char* capture_end_ = nullptr;
232   bool error_ = false;
233 
234   friend class ScannerTest;
235   TF_DISALLOW_COPY_AND_ASSIGN(Scanner);
236 };
237 
238 }  // namespace strings
239 }  // namespace tensorflow
240 
241 #endif  // TENSORFLOW_LIB_STRINGS_SCANNER_H_
242