// Copyright 2014 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
#include "xfa/fxfa/fm2js/cxfa_fmlexer.h"

#include <algorithm>
#include <string>

#include "core/fxcrt/fx_extension.h"
#include "third_party/base/ptr_util.h"
#include "third_party/icu/source/common/unicode/uchar.h"
14 
15 namespace {
16 
// Reports whether |c| lies in one of the three inclusive ranges this lexer
// accepts in FormCalc source text: 0x09-0x0D, 0x20-0xD7FF and 0xE000-0xFFFD.
// Everything else, including NUL and the 0xD800-0xDFFF gap, is rejected.
bool IsFormCalcCharacter(wchar_t c) {
  if (c >= 0x09 && c <= 0x0D)
    return true;
  if (c >= 0x20 && c <= 0xD7FF)
    return true;
  return c >= 0xE000 && c <= 0xFFFD;
}
21 
IsIdentifierCharacter(wchar_t c)22 bool IsIdentifierCharacter(wchar_t c) {
23   return u_isalnum(c) || c == 0x005F ||  // '_'
24          c == 0x0024;                    // '$'
25 }
26 
IsInitialIdentifierCharacter(wchar_t c)27 bool IsInitialIdentifierCharacter(wchar_t c) {
28   return u_isalpha(c) || c == 0x005F ||  // '_'
29          c == 0x0024 ||                  // '$'
30          c == 0x0021;                    // '!'
31 }
32 
// Reports whether |c| is one of the four in-line blanks the lexer skips:
// horizontal tab, vertical tab, form feed or space. Line terminators
// (0x0A, 0x0D) are deliberately not part of this set.
bool IsWhitespaceCharacter(wchar_t c) {
  switch (c) {
    case 0x0009:  // Horizontal tab
    case 0x000B:  // Vertical tab
    case 0x000C:  // Form feed
    case 0x0020:  // Space
      return true;
    default:
      return false;
  }
}
39 
40 const XFA_FMKeyword keyWords[] = {
41     {TOKand, 0x00000026, L"&"},
42     {TOKlparen, 0x00000028, L"("},
43     {TOKrparen, 0x00000029, L")"},
44     {TOKmul, 0x0000002a, L"*"},
45     {TOKplus, 0x0000002b, L"+"},
46     {TOKcomma, 0x0000002c, L","},
47     {TOKminus, 0x0000002d, L"-"},
48     {TOKdot, 0x0000002e, L"."},
49     {TOKdiv, 0x0000002f, L"/"},
50     {TOKlt, 0x0000003c, L"<"},
51     {TOKassign, 0x0000003d, L"="},
52     {TOKgt, 0x0000003e, L">"},
53     {TOKlbracket, 0x0000005b, L"["},
54     {TOKrbracket, 0x0000005d, L"]"},
55     {TOKor, 0x0000007c, L"|"},
56     {TOKdotscream, 0x0000ec11, L".#"},
57     {TOKdotstar, 0x0000ec18, L".*"},
58     {TOKdotdot, 0x0000ec1c, L".."},
59     {TOKle, 0x000133f9, L"<="},
60     {TOKne, 0x000133fa, L"<>"},
61     {TOKeq, 0x0001391a, L"=="},
62     {TOKge, 0x00013e3b, L">="},
63     {TOKdo, 0x00020153, L"do"},
64     {TOKkseq, 0x00020676, L"eq"},
65     {TOKksge, 0x000210ac, L"ge"},
66     {TOKksgt, 0x000210bb, L"gt"},
67     {TOKif, 0x00021aef, L"if"},
68     {TOKin, 0x00021af7, L"in"},
69     {TOKksle, 0x00022a51, L"le"},
70     {TOKkslt, 0x00022a60, L"lt"},
71     {TOKksne, 0x00023493, L"ne"},
72     {TOKksor, 0x000239c1, L"or"},
73     {TOKnull, 0x052931bb, L"null"},
74     {TOKbreak, 0x05518c25, L"break"},
75     {TOKksand, 0x09f9db33, L"and"},
76     {TOKend, 0x0a631437, L"end"},
77     {TOKeof, 0x0a63195a, L"eof"},
78     {TOKfor, 0x0a7d67a7, L"for"},
79     {TOKnan, 0x0b4f91dd, L"nan"},
80     {TOKksnot, 0x0b4fd9b1, L"not"},
81     {TOKvar, 0x0c2203e9, L"var"},
82     {TOKthen, 0x2d5738cf, L"then"},
83     {TOKelse, 0x45f65ee9, L"else"},
84     {TOKexit, 0x4731d6ba, L"exit"},
85     {TOKdownto, 0x4caadc3b, L"downto"},
86     {TOKreturn, 0x4db8bd60, L"return"},
87     {TOKinfinity, 0x5c0a010a, L"infinity"},
88     {TOKendwhile, 0x5c64bff0, L"endwhile"},
89     {TOKforeach, 0x67e31f38, L"foreach"},
90     {TOKendfunc, 0x68f984a3, L"endfunc"},
91     {TOKelseif, 0x78253218, L"elseif"},
92     {TOKwhile, 0x84229259, L"while"},
93     {TOKendfor, 0x8ab49d7e, L"endfor"},
94     {TOKthrow, 0x8db05c94, L"throw"},
95     {TOKstep, 0xa7a7887c, L"step"},
96     {TOKupto, 0xb5155328, L"upto"},
97     {TOKcontinue, 0xc0340685, L"continue"},
98     {TOKfunc, 0xcdce60ec, L"func"},
99     {TOKendif, 0xe0e8fee6, L"endif"},
100 };
101 
102 const XFA_FM_TOKEN KEYWORD_START = TOKdo;
103 const XFA_FM_TOKEN KEYWORD_END = TOKendif;
104 
// Printable name of each token type, indexed by the token-type value (see
// CXFA_FMToken::ToDebugString()). The order must match the token-type values
// used as indices. Both the characters and the pointers are const so the
// table cannot be modified at run time.
const wchar_t* const tokenStrings[] = {
    L"TOKand",        L"TOKlparen",     L"TOKrparen",   L"TOKmul",
    L"TOKplus",       L"TOKcomma",      L"TOKminus",    L"TOKdot",
    L"TOKdiv",        L"TOKlt",         L"TOKassign",   L"TOKgt",
    L"TOKlbracket",   L"TOKrbracket",   L"TOKor",       L"TOKdotscream",
    L"TOKdotstar",    L"TOKdotdot",     L"TOKle",       L"TOKne",
    L"TOKeq",         L"TOKge",         L"TOKdo",       L"TOKkseq",
    L"TOKksge",       L"TOKksgt",       L"TOKif",       L"TOKin",
    L"TOKksle",       L"TOKkslt",       L"TOKksne",     L"TOKksor",
    L"TOKnull",       L"TOKbreak",      L"TOKksand",    L"TOKend",
    L"TOKeof",        L"TOKfor",        L"TOKnan",      L"TOKksnot",
    L"TOKvar",        L"TOKthen",       L"TOKelse",     L"TOKexit",
    L"TOKdownto",     L"TOKreturn",     L"TOKinfinity", L"TOKendwhile",
    L"TOKforeach",    L"TOKendfunc",    L"TOKelseif",   L"TOKwhile",
    L"TOKendfor",     L"TOKthrow",      L"TOKstep",     L"TOKupto",
    L"TOKcontinue",   L"TOKfunc",       L"TOKendif",    L"TOKstar",
    L"TOKidentifier", L"TOKunderscore", L"TOKdollar",   L"TOKexclamation",
    L"TOKcall",       L"TOKstring",     L"TOKnumber",   L"TOKreserver",
};
124 
TokenizeIdentifier(const WideStringView & str)125 XFA_FM_TOKEN TokenizeIdentifier(const WideStringView& str) {
126   uint32_t key = FX_HashCode_GetW(str, true);
127 
128   const XFA_FMKeyword* end = std::begin(keyWords) + KEYWORD_END + 1;
129   const XFA_FMKeyword* result =
130       std::lower_bound(std::begin(keyWords) + KEYWORD_START, end, key,
131                        [](const XFA_FMKeyword& iter, const uint32_t& val) {
132                          return iter.m_hash < val;
133                        });
134   if (result != end && result->m_hash == key)
135     return result->m_type;
136   return TOKidentifier;
137 }
138 
139 }  // namespace
140 
CXFA_FMToken()141 CXFA_FMToken::CXFA_FMToken() : m_type(TOKreserver), m_line_num(1) {}
142 
CXFA_FMToken(uint32_t line_num)143 CXFA_FMToken::CXFA_FMToken(uint32_t line_num)
144     : m_type(TOKreserver), m_line_num(line_num) {}
145 
~CXFA_FMToken()146 CXFA_FMToken::~CXFA_FMToken() {}
147 
ToDebugString() const148 WideString CXFA_FMToken::ToDebugString() const {
149   WideString str(L"type = ");
150   str += tokenStrings[m_type];
151   str += L", string = ";
152   str += m_string;
153   str += L", line_num = ";
154   str += std::to_wstring(m_line_num).c_str();
155   return str;
156 }
157 
CXFA_FMLexer(const WideStringView & wsFormCalc)158 CXFA_FMLexer::CXFA_FMLexer(const WideStringView& wsFormCalc)
159     : m_cursor(wsFormCalc.unterminated_c_str()),
160       m_end(m_cursor + wsFormCalc.GetLength() - 1),
161       m_current_line(1),
162       m_lexer_error(false) {}
163 
~CXFA_FMLexer()164 CXFA_FMLexer::~CXFA_FMLexer() {}
165 
NextToken()166 std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::NextToken() {
167   if (m_lexer_error)
168     return nullptr;
169 
170   m_token = pdfium::MakeUnique<CXFA_FMToken>(m_current_line);
171   while (m_cursor <= m_end && *m_cursor) {
172     if (!IsFormCalcCharacter(*m_cursor)) {
173       RaiseError();
174       return nullptr;
175     }
176 
177     switch (*m_cursor) {
178       case '\n':
179         ++m_current_line;
180         m_token->m_line_num = m_current_line;
181         ++m_cursor;
182         break;
183       case '\r':
184         ++m_cursor;
185         break;
186       case ';':
187         AdvanceForComment();
188         break;
189       case '"':
190         m_token->m_type = TOKstring;
191         AdvanceForString();
192         return std::move(m_token);
193       case '0':
194       case '1':
195       case '2':
196       case '3':
197       case '4':
198       case '5':
199       case '6':
200       case '7':
201       case '8':
202       case '9':
203         m_token->m_type = TOKnumber;
204         AdvanceForNumber();
205         return std::move(m_token);
206       case '=':
207         ++m_cursor;
208         if (m_cursor > m_end) {
209           m_token->m_type = TOKassign;
210           return std::move(m_token);
211         }
212 
213         if (!IsFormCalcCharacter(*m_cursor)) {
214           RaiseError();
215           return nullptr;
216         }
217         if (*m_cursor == '=') {
218           m_token->m_type = TOKeq;
219           ++m_cursor;
220         } else {
221           m_token->m_type = TOKassign;
222         }
223         return std::move(m_token);
224       case '<':
225         ++m_cursor;
226         if (m_cursor > m_end) {
227           m_token->m_type = TOKlt;
228           return std::move(m_token);
229         }
230 
231         if (!IsFormCalcCharacter(*m_cursor)) {
232           RaiseError();
233           return nullptr;
234         }
235         if (*m_cursor == '=') {
236           m_token->m_type = TOKle;
237           ++m_cursor;
238         } else if (*m_cursor == '>') {
239           m_token->m_type = TOKne;
240           ++m_cursor;
241         } else {
242           m_token->m_type = TOKlt;
243         }
244         return std::move(m_token);
245       case '>':
246         ++m_cursor;
247         if (m_cursor > m_end) {
248           m_token->m_type = TOKgt;
249           return std::move(m_token);
250         }
251 
252         if (!IsFormCalcCharacter(*m_cursor)) {
253           RaiseError();
254           return nullptr;
255         }
256         if (*m_cursor == '=') {
257           m_token->m_type = TOKge;
258           ++m_cursor;
259         } else {
260           m_token->m_type = TOKgt;
261         }
262         return std::move(m_token);
263       case ',':
264         m_token->m_type = TOKcomma;
265         ++m_cursor;
266         return std::move(m_token);
267       case '(':
268         m_token->m_type = TOKlparen;
269         ++m_cursor;
270         return std::move(m_token);
271       case ')':
272         m_token->m_type = TOKrparen;
273         ++m_cursor;
274         return std::move(m_token);
275       case '[':
276         m_token->m_type = TOKlbracket;
277         ++m_cursor;
278         return std::move(m_token);
279       case ']':
280         m_token->m_type = TOKrbracket;
281         ++m_cursor;
282         return std::move(m_token);
283       case '&':
284         ++m_cursor;
285         m_token->m_type = TOKand;
286         return std::move(m_token);
287       case '|':
288         ++m_cursor;
289         m_token->m_type = TOKor;
290         return std::move(m_token);
291       case '+':
292         ++m_cursor;
293         m_token->m_type = TOKplus;
294         return std::move(m_token);
295       case '-':
296         ++m_cursor;
297         m_token->m_type = TOKminus;
298         return std::move(m_token);
299       case '*':
300         ++m_cursor;
301         m_token->m_type = TOKmul;
302         return std::move(m_token);
303       case '/': {
304         ++m_cursor;
305         if (m_cursor > m_end) {
306           m_token->m_type = TOKdiv;
307           return std::move(m_token);
308         }
309 
310         if (!IsFormCalcCharacter(*m_cursor)) {
311           RaiseError();
312           return nullptr;
313         }
314         if (*m_cursor != '/') {
315           m_token->m_type = TOKdiv;
316           return std::move(m_token);
317         }
318         AdvanceForComment();
319         break;
320       }
321       case '.':
322         ++m_cursor;
323         if (m_cursor > m_end) {
324           m_token->m_type = TOKdot;
325           return std::move(m_token);
326         }
327 
328         if (!IsFormCalcCharacter(*m_cursor)) {
329           RaiseError();
330           return nullptr;
331         }
332 
333         if (*m_cursor == '.') {
334           m_token->m_type = TOKdotdot;
335           ++m_cursor;
336         } else if (*m_cursor == '*') {
337           m_token->m_type = TOKdotstar;
338           ++m_cursor;
339         } else if (*m_cursor == '#') {
340           m_token->m_type = TOKdotscream;
341           ++m_cursor;
342         } else if (*m_cursor <= '9' && *m_cursor >= '0') {
343           m_token->m_type = TOKnumber;
344           --m_cursor;
345           AdvanceForNumber();
346         } else {
347           m_token->m_type = TOKdot;
348         }
349         return std::move(m_token);
350       default:
351         if (IsWhitespaceCharacter(*m_cursor)) {
352           ++m_cursor;
353           break;
354         }
355         if (!IsInitialIdentifierCharacter(*m_cursor)) {
356           RaiseError();
357           return nullptr;
358         }
359         AdvanceForIdentifier();
360         return std::move(m_token);
361     }
362   }
363 
364   // If there isn't currently a token type then mark it EOF.
365   if (m_token->m_type == TOKreserver)
366     m_token->m_type = TOKeof;
367   return std::move(m_token);
368 }
369 
AdvanceForNumber()370 void CXFA_FMLexer::AdvanceForNumber() {
371   // This will set end to the character after the end of the number.
372   wchar_t* end = nullptr;
373   if (m_cursor)
374     wcstod(const_cast<wchar_t*>(m_cursor), &end);
375   if (!end || FXSYS_iswalpha(*end)) {
376     RaiseError();
377     return;
378   }
379 
380   m_token->m_string =
381       WideStringView(m_cursor, static_cast<size_t>(end - m_cursor));
382   m_cursor = end;
383 }
384 
AdvanceForString()385 void CXFA_FMLexer::AdvanceForString() {
386   const wchar_t* start = m_cursor;
387   ++m_cursor;
388   while (m_cursor <= m_end && *m_cursor) {
389     if (!IsFormCalcCharacter(*m_cursor))
390       break;
391 
392     if (*m_cursor == '"') {
393       // Check for escaped "s, i.e. "".
394       ++m_cursor;
395       // If the end of the input has been reached it was not escaped.
396       if (m_cursor > m_end) {
397         m_token->m_string =
398             WideStringView(start, static_cast<size_t>(m_cursor - start));
399         return;
400       }
401       // If the next character is not a " then the end of the string has been
402       // found.
403       if (*m_cursor != '"') {
404         if (!IsFormCalcCharacter(*m_cursor)) {
405           break;
406         }
407         m_token->m_string = WideStringView(start, (m_cursor - start));
408         return;
409       }
410     }
411     ++m_cursor;
412   }
413 
414   // Didn't find the end of the string.
415   RaiseError();
416 }
417 
AdvanceForIdentifier()418 void CXFA_FMLexer::AdvanceForIdentifier() {
419   const wchar_t* start = m_cursor;
420   ++m_cursor;
421   while (m_cursor <= m_end && *m_cursor) {
422     if (!IsFormCalcCharacter(*m_cursor)) {
423       RaiseError();
424       return;
425     }
426 
427     if (!IsIdentifierCharacter(*m_cursor)) {
428       break;
429     }
430     ++m_cursor;
431   }
432   m_token->m_string =
433       WideStringView(start, static_cast<size_t>(m_cursor - start));
434   m_token->m_type = TokenizeIdentifier(m_token->m_string);
435 }
436 
AdvanceForComment()437 void CXFA_FMLexer::AdvanceForComment() {
438   m_cursor++;
439   while (m_cursor <= m_end && *m_cursor) {
440     if (!IsFormCalcCharacter(*m_cursor)) {
441       RaiseError();
442       return;
443     }
444 
445     if (*m_cursor == L'\r') {
446       ++m_cursor;
447       return;
448     }
449     if (*m_cursor == L'\n') {
450       ++m_current_line;
451       ++m_cursor;
452       return;
453     }
454     ++m_cursor;
455   }
456 }
457