1 //===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===---------------------------------------------------------------------===//
8 //
9 // This file implements an interface defined in ResourceScriptToken.h.
10 // In particular, it defines an .rc script tokenizer.
11 //
12 //===---------------------------------------------------------------------===//
13 
14 #include "ResourceScriptToken.h"
15 #include "llvm/ADT/StringExtras.h"
16 #include "llvm/Support/raw_ostream.h"
17 
18 #include <algorithm>
19 #include <cassert>
20 #include <cctype>
21 #include <cstdlib>
22 #include <utility>
23 
24 using namespace llvm;
25 
26 using Kind = RCToken::Kind;
27 
28 // Checks if Representation is a correct description of an RC integer.
29 // It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
30 // or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
31 // character (that is the difference between our representation and
// StringRef's one). If Representation is correct, 'true' is returned and
// the parsed value is stored in Num.
rcGetAsInteger(StringRef Representation,uint32_t & Num)34 static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
35   size_t Length = Representation.size();
36   if (Length == 0)
37     return false;
38   // Strip the last 'L' if unnecessary.
39   if (std::toupper(Representation.back()) == 'L')
40     Representation = Representation.drop_back(1);
41 
42   return !Representation.getAsInteger<uint32_t>(0, Num);
43 }
44 
RCToken(RCToken::Kind RCTokenKind,StringRef Value)45 RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
46     : TokenKind(RCTokenKind), TokenValue(Value) {}
47 
intValue() const48 uint32_t RCToken::intValue() const {
49   assert(TokenKind == Kind::Int);
50   // We assume that the token already is a correct integer (checked by
51   // rcGetAsInteger).
52   uint32_t Result;
53   bool IsSuccess = rcGetAsInteger(TokenValue, Result);
54   assert(IsSuccess);
55   (void)IsSuccess;  // Silence the compiler warning when -DNDEBUG flag is on.
56   return Result;
57 }
58 
isLongInt() const59 bool RCToken::isLongInt() const {
60   return TokenKind == Kind::Int && std::toupper(TokenValue.back()) == 'L';
61 }
62 
value() const63 StringRef RCToken::value() const { return TokenValue; }
64 
kind() const65 Kind RCToken::kind() const { return TokenKind; }
66 
isBinaryOp() const67 bool RCToken::isBinaryOp() const {
68   switch (TokenKind) {
69   case Kind::Plus:
70   case Kind::Minus:
71   case Kind::Pipe:
72   case Kind::Amp:
73     return true;
74   default:
75     return false;
76   }
77 }
78 
getStringError(const Twine & message)79 static Error getStringError(const Twine &message) {
80   return make_error<StringError>("Error parsing file: " + message,
81                                  inconvertibleErrorCode());
82 }
83 
84 namespace {
85 
86 class Tokenizer {
87 public:
Tokenizer(StringRef Input)88   Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()), Pos(0) {}
89 
90   Expected<std::vector<RCToken>> run();
91 
92 private:
93   // All 'advancing' methods return boolean values; if they're equal to false,
94   // the stream has ended or failed.
95   bool advance(size_t Amount = 1);
96   bool skipWhitespaces();
97 
98   // Consumes a token. If any problem occurred, a non-empty Error is returned.
99   Error consumeToken(const Kind TokenKind);
100 
101   // Check if tokenizer is about to read FollowingChars.
102   bool willNowRead(StringRef FollowingChars) const;
103 
104   // Check if tokenizer can start reading an identifier at current position.
105   // The original tool did non specify the rules to determine what is a correct
106   // identifier. We assume they should follow the C convention:
107   // [a-zA-Z_][a-zA-Z0-9_]*.
108   bool canStartIdentifier() const;
109   // Check if tokenizer can continue reading an identifier.
110   bool canContinueIdentifier() const;
111 
112   // Check if tokenizer can start reading an integer.
113   // A correct integer always starts with a 0-9 digit,
114   // can contain characters 0-9A-Fa-f (digits),
115   // Ll (marking the integer is 32-bit), Xx (marking the representation
116   // is hexadecimal). As some kind of separator should come after the
117   // integer, we can consume the integer until a non-alphanumeric
118   // character.
119   bool canStartInt() const;
120   bool canContinueInt() const;
121 
122   bool canStartString() const;
123 
124   // Check if tokenizer can start reading a single line comment (e.g. a comment
125   // that begins with '//')
126   bool canStartLineComment() const;
127 
128   // Check if tokenizer can start or finish reading a block comment (e.g. a
129   // comment that begins with '/*' and ends with '*/')
130   bool canStartBlockComment() const;
131 
132   // Throw away all remaining characters on the current line.
133   void skipCurrentLine();
134 
135   bool streamEof() const;
136 
137   // Classify the token that is about to be read from the current position.
138   Kind classifyCurrentToken() const;
139 
140   // Process the Kind::Identifier token - check if it is
141   // an identifier describing a block start or end.
142   void processIdentifier(RCToken &token) const;
143 
144   StringRef Data;
145   size_t DataLength, Pos;
146 };
147 
skipCurrentLine()148 void Tokenizer::skipCurrentLine() {
149   Pos = Data.find_first_of("\r\n", Pos);
150   Pos = Data.find_first_not_of("\r\n", Pos);
151 
152   if (Pos == StringRef::npos)
153     Pos = DataLength;
154 }
155 
run()156 Expected<std::vector<RCToken>> Tokenizer::run() {
157   Pos = 0;
158   std::vector<RCToken> Result;
159 
160   // Consume an optional UTF-8 Byte Order Mark.
161   if (willNowRead("\xef\xbb\xbf"))
162     advance(3);
163 
164   while (!streamEof()) {
165     if (!skipWhitespaces())
166       break;
167 
168     Kind TokenKind = classifyCurrentToken();
169     if (TokenKind == Kind::Invalid)
170       return getStringError("Invalid token found at position " + Twine(Pos));
171 
172     const size_t TokenStart = Pos;
173     if (Error TokenError = consumeToken(TokenKind))
174       return std::move(TokenError);
175 
176     // Comments are just deleted, don't bother saving them.
177     if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment)
178       continue;
179 
180     RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
181     if (TokenKind == Kind::Identifier) {
182       processIdentifier(Token);
183     } else if (TokenKind == Kind::Int) {
184       uint32_t TokenInt;
185       if (!rcGetAsInteger(Token.value(), TokenInt)) {
186         // The integer has incorrect format or cannot be represented in
187         // a 32-bit integer.
188         return getStringError("Integer invalid or too large: " +
189                               Token.value().str());
190       }
191     }
192 
193     Result.push_back(Token);
194   }
195 
196   return Result;
197 }
198 
advance(size_t Amount)199 bool Tokenizer::advance(size_t Amount) {
200   Pos += Amount;
201   return !streamEof();
202 }
203 
skipWhitespaces()204 bool Tokenizer::skipWhitespaces() {
205   while (!streamEof() && isSpace(Data[Pos]))
206     advance();
207   return !streamEof();
208 }
209 
consumeToken(const Kind TokenKind)210 Error Tokenizer::consumeToken(const Kind TokenKind) {
211   switch (TokenKind) {
212   // One-character token consumption.
213 #define TOKEN(Name)
214 #define SHORT_TOKEN(Name, Ch) case Kind::Name:
215 #include "ResourceScriptTokenList.def"
216     advance();
217     return Error::success();
218 
219   case Kind::LineComment:
220     advance(2);
221     skipCurrentLine();
222     return Error::success();
223 
224   case Kind::StartComment: {
225     advance(2);
226     auto EndPos = Data.find("*/", Pos);
227     if (EndPos == StringRef::npos)
228       return getStringError(
229           "Unclosed multi-line comment beginning at position " + Twine(Pos));
230     advance(EndPos - Pos);
231     advance(2);
232     return Error::success();
233   }
234   case Kind::Identifier:
235     while (!streamEof() && canContinueIdentifier())
236       advance();
237     return Error::success();
238 
239   case Kind::Int:
240     while (!streamEof() && canContinueInt())
241       advance();
242     return Error::success();
243 
244   case Kind::String:
245     // Consume the preceding 'L', if there is any.
246     if (std::toupper(Data[Pos]) == 'L')
247       advance();
248     // Consume the double-quote.
249     advance();
250 
251     // Consume the characters until the end of the file, line or string.
252     while (true) {
253       if (streamEof()) {
254         return getStringError("Unterminated string literal.");
255       } else if (Data[Pos] == '"') {
256         // Consume the ending double-quote.
257         advance();
258         // However, if another '"' follows this double-quote, the string didn't
259         // end and we just included '"' into the string.
260         if (!willNowRead("\""))
261           return Error::success();
262       } else if (Data[Pos] == '\n') {
263         return getStringError("String literal not terminated in the line.");
264       }
265 
266       advance();
267     }
268 
269   case Kind::Invalid:
270     assert(false && "Cannot consume an invalid token.");
271   }
272 
273   llvm_unreachable("Unknown RCToken::Kind");
274 }
275 
willNowRead(StringRef FollowingChars) const276 bool Tokenizer::willNowRead(StringRef FollowingChars) const {
277   return Data.drop_front(Pos).startswith(FollowingChars);
278 }
279 
canStartIdentifier() const280 bool Tokenizer::canStartIdentifier() const {
281   assert(!streamEof());
282 
283   const char CurChar = Data[Pos];
284   return std::isalpha(CurChar) || CurChar == '_' || CurChar == '.';
285 }
286 
canContinueIdentifier() const287 bool Tokenizer::canContinueIdentifier() const {
288   assert(!streamEof());
289   const char CurChar = Data[Pos];
290   return std::isalnum(CurChar) || CurChar == '_' || CurChar == '.' ||
291          CurChar == '/' || CurChar == '\\';
292 }
293 
canStartInt() const294 bool Tokenizer::canStartInt() const {
295   assert(!streamEof());
296   return std::isdigit(Data[Pos]);
297 }
298 
canStartBlockComment() const299 bool Tokenizer::canStartBlockComment() const {
300   assert(!streamEof());
301   return Data.drop_front(Pos).startswith("/*");
302 }
303 
canStartLineComment() const304 bool Tokenizer::canStartLineComment() const {
305   assert(!streamEof());
306   return Data.drop_front(Pos).startswith("//");
307 }
308 
canContinueInt() const309 bool Tokenizer::canContinueInt() const {
310   assert(!streamEof());
311   return std::isalnum(Data[Pos]);
312 }
313 
canStartString() const314 bool Tokenizer::canStartString() const {
315   return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
316 }
317 
streamEof() const318 bool Tokenizer::streamEof() const { return Pos == DataLength; }
319 
classifyCurrentToken() const320 Kind Tokenizer::classifyCurrentToken() const {
321   if (canStartBlockComment())
322     return Kind::StartComment;
323   if (canStartLineComment())
324     return Kind::LineComment;
325 
326   if (canStartInt())
327     return Kind::Int;
328   if (canStartString())
329     return Kind::String;
330   // BEGIN and END are at this point of lexing recognized as identifiers.
331   if (canStartIdentifier())
332     return Kind::Identifier;
333 
334   const char CurChar = Data[Pos];
335 
336   switch (CurChar) {
337   // One-character token classification.
338 #define TOKEN(Name)
339 #define SHORT_TOKEN(Name, Ch)                                                  \
340   case Ch:                                                                     \
341     return Kind::Name;
342 #include "ResourceScriptTokenList.def"
343 
344   default:
345     return Kind::Invalid;
346   }
347 }
348 
processIdentifier(RCToken & Token) const349 void Tokenizer::processIdentifier(RCToken &Token) const {
350   assert(Token.kind() == Kind::Identifier);
351   StringRef Name = Token.value();
352 
353   if (Name.equals_lower("begin"))
354     Token = RCToken(Kind::BlockBegin, Name);
355   else if (Name.equals_lower("end"))
356     Token = RCToken(Kind::BlockEnd, Name);
357 }
358 
359 } // anonymous namespace
360 
361 namespace llvm {
362 
tokenizeRC(StringRef Input)363 Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
364   return Tokenizer(Input).run();
365 }
366 
367 } // namespace llvm
368