1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "xfa/fxfa/fm2js/cxfa_fmlexer.h"
8 
9 #include <algorithm>
10 
11 #include "core/fxcrt/fx_extension.h"
12 
13 namespace {
14 
// Returns true when |c| lies in one of the code point ranges this lexer
// accepts as input: 0x09-0x0D, 0x20-0xD7FF or 0xE000-0xFFFD.
bool IsFormCalcCharacter(wchar_t c) {
  // Walk the ranges from low to high, rejecting the gaps between them.
  if (c < 0x09)
    return false;
  if (c <= 0x0D)
    return true;
  if (c < 0x20)
    return false;
  if (c <= 0xD7FF)
    return true;
  return c >= 0xE000 && c <= 0xFFFD;
}
19 
IsIdentifierCharacter(wchar_t c)20 bool IsIdentifierCharacter(wchar_t c) {
21   return FXSYS_iswalnum(c) || c == 0x005F ||  // '_'
22          c == 0x0024;                         // '$'
23 }
24 
IsInitialIdentifierCharacter(wchar_t c)25 bool IsInitialIdentifierCharacter(wchar_t c) {
26   return FXSYS_iswalpha(c) || c == 0x005F ||  // '_'
27          c == 0x0024 ||                       // '$'
28          c == 0x0021;                         // '!'
29 }
30 
// Returns true for the blank characters handled by this helper: horizontal
// tab, vertical tab, form feed and space. Line feed and carriage return are
// deliberately absent; NextToken() has dedicated cases for them.
bool IsWhitespaceCharacter(wchar_t c) {
  switch (c) {
    case 0x0009:  // Horizontal tab
    case 0x000B:  // Vertical tab
    case 0x000C:  // Form feed
    case 0x0020:  // Space
      return true;
    default:
      return false;
  }
}
37 
38 const XFA_FMKeyword keyWords[] = {
39     {TOKdo, "do"},
40     {TOKkseq, "eq"},
41     {TOKksge, "ge"},
42     {TOKksgt, "gt"},
43     {TOKif, "if"},
44     {TOKin, "in"},
45     {TOKksle, "le"},
46     {TOKkslt, "lt"},
47     {TOKksne, "ne"},
48     {TOKksor, "or"},
49     {TOKnull, "null"},
50     {TOKbreak, "break"},
51     {TOKksand, "and"},
52     {TOKend, "end"},
53     {TOKeof, "eof"},
54     {TOKfor, "for"},
55     {TOKnan, "nan"},
56     {TOKksnot, "not"},
57     {TOKvar, "var"},
58     {TOKthen, "then"},
59     {TOKelse, "else"},
60     {TOKexit, "exit"},
61     {TOKdownto, "downto"},
62     {TOKreturn, "return"},
63     {TOKinfinity, "infinity"},
64     {TOKendwhile, "endwhile"},
65     {TOKforeach, "foreach"},
66     {TOKendfunc, "endfunc"},
67     {TOKelseif, "elseif"},
68     {TOKwhile, "while"},
69     {TOKendfor, "endfor"},
70     {TOKthrow, "throw"},
71     {TOKstep, "step"},
72     {TOKupto, "upto"},
73     {TOKcontinue, "continue"},
74     {TOKfunc, "func"},
75     {TOKendif, "endif"},
76 };
77 
#ifndef NDEBUG
// Debug-only printable names for token types. CXFA_FMToken::ToDebugString()
// indexes this table with the token's m_type, so the sequence of entries
// must not change.
const char* const tokenStrings[] = {
    // Punctuation and operator tokens.
    "TOKand",
    "TOKlparen",
    "TOKrparen",
    "TOKmul",
    "TOKplus",
    "TOKcomma",
    "TOKminus",
    "TOKdot",
    "TOKdiv",
    "TOKlt",
    "TOKassign",
    "TOKgt",
    "TOKlbracket",
    "TOKrbracket",
    "TOKor",
    "TOKdotscream",
    "TOKdotstar",
    "TOKdotdot",
    "TOKle",
    "TOKne",
    "TOKeq",
    "TOKge",
    // Keyword tokens, in the same sequence as the keyWords table.
    "TOKdo",
    "TOKkseq",
    "TOKksge",
    "TOKksgt",
    "TOKif",
    "TOKin",
    "TOKksle",
    "TOKkslt",
    "TOKksne",
    "TOKksor",
    "TOKnull",
    "TOKbreak",
    "TOKksand",
    "TOKend",
    "TOKeof",
    "TOKfor",
    "TOKnan",
    "TOKksnot",
    "TOKvar",
    "TOKthen",
    "TOKelse",
    "TOKexit",
    "TOKdownto",
    "TOKreturn",
    "TOKinfinity",
    "TOKendwhile",
    "TOKforeach",
    "TOKendfunc",
    "TOKelseif",
    "TOKwhile",
    "TOKendfor",
    "TOKthrow",
    "TOKstep",
    "TOKupto",
    "TOKcontinue",
    "TOKfunc",
    "TOKendif",
    // Remaining token types.
    "TOKstar",
    "TOKidentifier",
    "TOKunderscore",
    "TOKdollar",
    "TOKexclamation",
    "TOKcall",
    "TOKstring",
    "TOKnumber",
    "TOKreserver",
};
#endif  // NDEBUG
99 
TokenizeIdentifier(WideStringView str)100 XFA_FM_TOKEN TokenizeIdentifier(WideStringView str) {
101   const XFA_FMKeyword* result =
102       std::find_if(std::begin(keyWords), std::end(keyWords),
103                    [str](const XFA_FMKeyword& iter) {
104                      return str.EqualsASCII(iter.m_keyword);
105                    });
106   if (result != std::end(keyWords) && str.EqualsASCII(result->m_keyword))
107     return result->m_type;
108   return TOKidentifier;
109 }
110 
111 }  // namespace
112 
CXFA_FMToken(XFA_FM_TOKEN token)113 CXFA_FMToken::CXFA_FMToken(XFA_FM_TOKEN token) : m_type(token) {}
114 
CXFA_FMToken()115 CXFA_FMToken::CXFA_FMToken() : CXFA_FMToken(TOKreserver) {}
116 
117 CXFA_FMToken::CXFA_FMToken(const CXFA_FMToken&) = default;
118 
119 CXFA_FMToken::~CXFA_FMToken() = default;
120 
121 #ifndef NDEBUG
ToDebugString() const122 WideString CXFA_FMToken::ToDebugString() const {
123   WideString str = WideString::FromASCII("type = ");
124   str += WideString::FromASCII(tokenStrings[m_type]);
125   str += WideString::FromASCII(", string = ");
126   str += m_string;
127   return str;
128 }
129 #endif  // NDEBUG
130 
CXFA_FMLexer(WideStringView wsFormCalc)131 CXFA_FMLexer::CXFA_FMLexer(WideStringView wsFormCalc)
132     : m_spInput(wsFormCalc.span()) {}
133 
134 CXFA_FMLexer::~CXFA_FMLexer() = default;
135 
NextToken()136 CXFA_FMToken CXFA_FMLexer::NextToken() {
137   if (m_bLexerError)
138     return CXFA_FMToken();
139 
140   while (!IsComplete() && m_spInput[m_nCursor]) {
141     if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
142       RaiseError();
143       return CXFA_FMToken();
144     }
145 
146     switch (m_spInput[m_nCursor]) {
147       case '\n':
148         ++m_nCursor;
149         break;
150       case '\r':
151         ++m_nCursor;
152         break;
153       case ';':
154         AdvanceForComment();
155         break;
156       case '"':
157         return AdvanceForString();
158       case '0':
159       case '1':
160       case '2':
161       case '3':
162       case '4':
163       case '5':
164       case '6':
165       case '7':
166       case '8':
167       case '9':
168         return AdvanceForNumber();
169       case '=':
170         ++m_nCursor;
171         if (m_nCursor >= m_spInput.size())
172           return CXFA_FMToken(TOKassign);
173 
174         if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
175           RaiseError();
176           return CXFA_FMToken();
177         }
178         if (m_spInput[m_nCursor] == '=') {
179           ++m_nCursor;
180           return CXFA_FMToken(TOKeq);
181         }
182         return CXFA_FMToken(TOKassign);
183       case '<':
184         ++m_nCursor;
185         if (m_nCursor >= m_spInput.size())
186           return CXFA_FMToken(TOKlt);
187 
188         if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
189           RaiseError();
190           return CXFA_FMToken();
191         }
192         if (m_spInput[m_nCursor] == '=') {
193           ++m_nCursor;
194           return CXFA_FMToken(TOKle);
195         }
196         if (m_spInput[m_nCursor] == '>') {
197           ++m_nCursor;
198           return CXFA_FMToken(TOKne);
199         }
200         return CXFA_FMToken(TOKlt);
201       case '>':
202         ++m_nCursor;
203         if (m_nCursor >= m_spInput.size())
204           return CXFA_FMToken(TOKgt);
205 
206         if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
207           RaiseError();
208           return CXFA_FMToken();
209         }
210         if (m_spInput[m_nCursor] == '=') {
211           ++m_nCursor;
212           return CXFA_FMToken(TOKge);
213         }
214         return CXFA_FMToken(TOKgt);
215       case ',':
216         ++m_nCursor;
217         return CXFA_FMToken(TOKcomma);
218       case '(':
219         ++m_nCursor;
220         return CXFA_FMToken(TOKlparen);
221       case ')':
222         ++m_nCursor;
223         return CXFA_FMToken(TOKrparen);
224       case '[':
225         ++m_nCursor;
226         return CXFA_FMToken(TOKlbracket);
227       case ']':
228         ++m_nCursor;
229         return CXFA_FMToken(TOKrbracket);
230       case '&':
231         ++m_nCursor;
232         return CXFA_FMToken(TOKand);
233       case '|':
234         ++m_nCursor;
235         return CXFA_FMToken(TOKor);
236       case '+':
237         ++m_nCursor;
238         return CXFA_FMToken(TOKplus);
239       case '-':
240         ++m_nCursor;
241         return CXFA_FMToken(TOKminus);
242       case '*':
243         ++m_nCursor;
244         return CXFA_FMToken(TOKmul);
245       case '/': {
246         ++m_nCursor;
247         if (m_nCursor >= m_spInput.size())
248           return CXFA_FMToken(TOKdiv);
249 
250         if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
251           RaiseError();
252           return CXFA_FMToken();
253         }
254         if (m_spInput[m_nCursor] != '/')
255           return CXFA_FMToken(TOKdiv);
256 
257         AdvanceForComment();
258         break;
259       }
260       case '.':
261         ++m_nCursor;
262         if (m_nCursor >= m_spInput.size())
263           return CXFA_FMToken(TOKdot);
264 
265         if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
266           RaiseError();
267           return CXFA_FMToken();
268         }
269 
270         if (m_spInput[m_nCursor] == '.') {
271           ++m_nCursor;
272           return CXFA_FMToken(TOKdotdot);
273         }
274         if (m_spInput[m_nCursor] == '*') {
275           ++m_nCursor;
276           return CXFA_FMToken(TOKdotstar);
277         }
278         if (m_spInput[m_nCursor] == '#') {
279           ++m_nCursor;
280           return CXFA_FMToken(TOKdotscream);
281         }
282         if (FXSYS_IsDecimalDigit(m_spInput[m_nCursor])) {
283           --m_nCursor;
284           return AdvanceForNumber();
285         }
286         return CXFA_FMToken(TOKdot);
287       default:
288         if (IsWhitespaceCharacter(m_spInput[m_nCursor])) {
289           ++m_nCursor;
290           break;
291         }
292         if (!IsInitialIdentifierCharacter(m_spInput[m_nCursor])) {
293           RaiseError();
294           return CXFA_FMToken();
295         }
296         return AdvanceForIdentifier();
297     }
298   }
299   return CXFA_FMToken(TOKeof);
300 }
301 
AdvanceForNumber()302 CXFA_FMToken CXFA_FMLexer::AdvanceForNumber() {
303   // This will set end to the character after the end of the number.
304   int32_t used_length = 0;
305   if (m_nCursor < m_spInput.size()) {
306     FXSYS_wcstof(&m_spInput[m_nCursor], m_spInput.size() - m_nCursor,
307                  &used_length);
308   }
309   size_t end = m_nCursor + used_length;
310   if (used_length == 0 ||
311       (end < m_spInput.size() && FXSYS_iswalpha(m_spInput[end]))) {
312     RaiseError();
313     return CXFA_FMToken();
314   }
315   CXFA_FMToken token(TOKnumber);
316   token.m_string =
317       WideStringView(m_spInput.subspan(m_nCursor, end - m_nCursor));
318   m_nCursor = end;
319   return token;
320 }
321 
AdvanceForString()322 CXFA_FMToken CXFA_FMLexer::AdvanceForString() {
323   CXFA_FMToken token(TOKstring);
324   size_t start = m_nCursor;
325   ++m_nCursor;
326   while (!IsComplete() && m_spInput[m_nCursor]) {
327     if (!IsFormCalcCharacter(m_spInput[m_nCursor]))
328       break;
329 
330     if (m_spInput[m_nCursor] == '"') {
331       // Check for escaped "s, i.e. "".
332       ++m_nCursor;
333       // If the end of the input has been reached it was not escaped.
334       if (m_nCursor >= m_spInput.size()) {
335         token.m_string =
336             WideStringView(m_spInput.subspan(start, m_nCursor - start));
337         return token;
338       }
339       // If the next character is not a " then the end of the string has been
340       // found.
341       if (m_spInput[m_nCursor] != '"') {
342         if (!IsFormCalcCharacter(m_spInput[m_nCursor]))
343           break;
344 
345         token.m_string =
346             WideStringView(m_spInput.subspan(start, m_nCursor - start));
347         return token;
348       }
349     }
350     ++m_nCursor;
351   }
352 
353   // Didn't find the end of the string.
354   RaiseError();
355   return CXFA_FMToken();
356 }
357 
AdvanceForIdentifier()358 CXFA_FMToken CXFA_FMLexer::AdvanceForIdentifier() {
359   size_t start = m_nCursor;
360   ++m_nCursor;
361   while (!IsComplete() && m_spInput[m_nCursor]) {
362     if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
363       RaiseError();
364       return CXFA_FMToken();
365     }
366     if (!IsIdentifierCharacter(m_spInput[m_nCursor]))
367       break;
368 
369     ++m_nCursor;
370   }
371 
372   WideStringView str =
373       WideStringView(m_spInput.subspan(start, m_nCursor - start));
374   CXFA_FMToken token(TokenizeIdentifier(str));
375   token.m_string = str;
376   return token;
377 }
378 
AdvanceForComment()379 void CXFA_FMLexer::AdvanceForComment() {
380   ++m_nCursor;
381   while (!IsComplete() && m_spInput[m_nCursor]) {
382     if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
383       RaiseError();
384       return;
385     }
386     if (m_spInput[m_nCursor] == L'\r') {
387       ++m_nCursor;
388       return;
389     }
390     if (m_spInput[m_nCursor] == L'\n') {
391       ++m_nCursor;
392       return;
393     }
394     ++m_nCursor;
395   }
396 }
397