1 /*
2 * Copyright (C) 2003 Lars Knoll (knoll@kde.org)
3 * Copyright (C) 2005 Allan Sandfeld Jensen (kde@carewolf.com)
4 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Apple Inc. All rights reserved.
5 * Copyright (C) 2007 Nicholas Shanks <webkit@nickshanks.com>
6 * Copyright (C) 2008 Eric Seidel <eric@webkit.org>
7 * Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/)
8 * Copyright (C) 2012 Adobe Systems Incorporated. All rights reserved.
9 * Copyright (C) 2012 Intel Corporation. All rights reserved.
10 *
11 * This library is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Library General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This library is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Library General Public License for more details.
20 *
21 * You should have received a copy of the GNU Library General Public License
22 * along with this library; see the file COPYING.LIB. If not, write to
23 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
24 * Boston, MA 02110-1301, USA.
25 */
26
27 #include "config.h"
28 #include "core/css/parser/CSSTokenizer.h"
29
30 #include "core/css/CSSKeyframeRule.h"
31 #include "core/css/MediaQuery.h"
32 #include "core/css/StyleRule.h"
33 #include "core/css/parser/BisonCSSParser.h"
34 #include "core/css/parser/CSSParserValues.h"
35 #include "core/html/parser/HTMLParserIdioms.h"
36 #include "core/svg/SVGParserUtilities.h"
37
38 namespace blink {
39
40 #include "core/CSSGrammar.h"
41
// Classification of a character as used by the tokenizer's main switch.
enum CharacterType {
    // Identifier characters. These four enumerators must remain the first
    // four, in one contiguous run: isCSSLetter() tests for an identifier
    // character with "type <= CharacterDash".
    CharacterCaselessU = 0,
    CharacterIdentifierStart,
    CharacterNumber,
    CharacterDash,

    // All remaining classifications.
    CharacterOther,
    CharacterNull,
    CharacterWhiteSpace,
    CharacterEndMediaQueryOrSupports,
    CharacterEndNthChild,
    CharacterQuote,
    CharacterExclamationMark,
    CharacterHashmark,
    CharacterDollar,
    CharacterAsterisk,
    CharacterPlus,
    CharacterDot,
    CharacterSlash,
    CharacterLess,
    CharacterAt,
    CharacterBackSlash,
    CharacterXor,
    CharacterVerticalBar,
    CharacterTilde,
};
72
73 // 128 ASCII codes
74 static const CharacterType typesOfASCIICharacters[128] = {
75 /* 0 - Null */ CharacterNull,
76 /* 1 - Start of Heading */ CharacterOther,
77 /* 2 - Start of Text */ CharacterOther,
78 /* 3 - End of Text */ CharacterOther,
79 /* 4 - End of Transm. */ CharacterOther,
80 /* 5 - Enquiry */ CharacterOther,
81 /* 6 - Acknowledgment */ CharacterOther,
82 /* 7 - Bell */ CharacterOther,
83 /* 8 - Back Space */ CharacterOther,
84 /* 9 - Horizontal Tab */ CharacterWhiteSpace,
85 /* 10 - Line Feed */ CharacterWhiteSpace,
86 /* 11 - Vertical Tab */ CharacterOther,
87 /* 12 - Form Feed */ CharacterWhiteSpace,
88 /* 13 - Carriage Return */ CharacterWhiteSpace,
89 /* 14 - Shift Out */ CharacterOther,
90 /* 15 - Shift In */ CharacterOther,
91 /* 16 - Data Line Escape */ CharacterOther,
92 /* 17 - Device Control 1 */ CharacterOther,
93 /* 18 - Device Control 2 */ CharacterOther,
94 /* 19 - Device Control 3 */ CharacterOther,
95 /* 20 - Device Control 4 */ CharacterOther,
96 /* 21 - Negative Ack. */ CharacterOther,
97 /* 22 - Synchronous Idle */ CharacterOther,
98 /* 23 - End of Transmit */ CharacterOther,
99 /* 24 - Cancel */ CharacterOther,
100 /* 25 - End of Medium */ CharacterOther,
101 /* 26 - Substitute */ CharacterOther,
102 /* 27 - Escape */ CharacterOther,
103 /* 28 - File Separator */ CharacterOther,
104 /* 29 - Group Separator */ CharacterOther,
105 /* 30 - Record Separator */ CharacterOther,
106 /* 31 - Unit Separator */ CharacterOther,
107 /* 32 - Space */ CharacterWhiteSpace,
108 /* 33 - ! */ CharacterExclamationMark,
109 /* 34 - " */ CharacterQuote,
110 /* 35 - # */ CharacterHashmark,
111 /* 36 - $ */ CharacterDollar,
112 /* 37 - % */ CharacterOther,
113 /* 38 - & */ CharacterOther,
114 /* 39 - ' */ CharacterQuote,
115 /* 40 - ( */ CharacterOther,
116 /* 41 - ) */ CharacterEndNthChild,
117 /* 42 - * */ CharacterAsterisk,
118 /* 43 - + */ CharacterPlus,
119 /* 44 - , */ CharacterOther,
120 /* 45 - - */ CharacterDash,
121 /* 46 - . */ CharacterDot,
122 /* 47 - / */ CharacterSlash,
123 /* 48 - 0 */ CharacterNumber,
124 /* 49 - 1 */ CharacterNumber,
125 /* 50 - 2 */ CharacterNumber,
126 /* 51 - 3 */ CharacterNumber,
127 /* 52 - 4 */ CharacterNumber,
128 /* 53 - 5 */ CharacterNumber,
129 /* 54 - 6 */ CharacterNumber,
130 /* 55 - 7 */ CharacterNumber,
131 /* 56 - 8 */ CharacterNumber,
132 /* 57 - 9 */ CharacterNumber,
133 /* 58 - : */ CharacterOther,
134 /* 59 - ; */ CharacterEndMediaQueryOrSupports,
135 /* 60 - < */ CharacterLess,
136 /* 61 - = */ CharacterOther,
137 /* 62 - > */ CharacterOther,
138 /* 63 - ? */ CharacterOther,
139 /* 64 - @ */ CharacterAt,
140 /* 65 - A */ CharacterIdentifierStart,
141 /* 66 - B */ CharacterIdentifierStart,
142 /* 67 - C */ CharacterIdentifierStart,
143 /* 68 - D */ CharacterIdentifierStart,
144 /* 69 - E */ CharacterIdentifierStart,
145 /* 70 - F */ CharacterIdentifierStart,
146 /* 71 - G */ CharacterIdentifierStart,
147 /* 72 - H */ CharacterIdentifierStart,
148 /* 73 - I */ CharacterIdentifierStart,
149 /* 74 - J */ CharacterIdentifierStart,
150 /* 75 - K */ CharacterIdentifierStart,
151 /* 76 - L */ CharacterIdentifierStart,
152 /* 77 - M */ CharacterIdentifierStart,
153 /* 78 - N */ CharacterIdentifierStart,
154 /* 79 - O */ CharacterIdentifierStart,
155 /* 80 - P */ CharacterIdentifierStart,
156 /* 81 - Q */ CharacterIdentifierStart,
157 /* 82 - R */ CharacterIdentifierStart,
158 /* 83 - S */ CharacterIdentifierStart,
159 /* 84 - T */ CharacterIdentifierStart,
160 /* 85 - U */ CharacterCaselessU,
161 /* 86 - V */ CharacterIdentifierStart,
162 /* 87 - W */ CharacterIdentifierStart,
163 /* 88 - X */ CharacterIdentifierStart,
164 /* 89 - Y */ CharacterIdentifierStart,
165 /* 90 - Z */ CharacterIdentifierStart,
166 /* 91 - [ */ CharacterOther,
167 /* 92 - \ */ CharacterBackSlash,
168 /* 93 - ] */ CharacterOther,
169 /* 94 - ^ */ CharacterXor,
170 /* 95 - _ */ CharacterIdentifierStart,
171 /* 96 - ` */ CharacterOther,
172 /* 97 - a */ CharacterIdentifierStart,
173 /* 98 - b */ CharacterIdentifierStart,
174 /* 99 - c */ CharacterIdentifierStart,
175 /* 100 - d */ CharacterIdentifierStart,
176 /* 101 - e */ CharacterIdentifierStart,
177 /* 102 - f */ CharacterIdentifierStart,
178 /* 103 - g */ CharacterIdentifierStart,
179 /* 104 - h */ CharacterIdentifierStart,
180 /* 105 - i */ CharacterIdentifierStart,
181 /* 106 - j */ CharacterIdentifierStart,
182 /* 107 - k */ CharacterIdentifierStart,
183 /* 108 - l */ CharacterIdentifierStart,
184 /* 109 - m */ CharacterIdentifierStart,
185 /* 110 - n */ CharacterIdentifierStart,
186 /* 111 - o */ CharacterIdentifierStart,
187 /* 112 - p */ CharacterIdentifierStart,
188 /* 113 - q */ CharacterIdentifierStart,
189 /* 114 - r */ CharacterIdentifierStart,
190 /* 115 - s */ CharacterIdentifierStart,
191 /* 116 - t */ CharacterIdentifierStart,
192 /* 117 - u */ CharacterCaselessU,
193 /* 118 - v */ CharacterIdentifierStart,
194 /* 119 - w */ CharacterIdentifierStart,
195 /* 120 - x */ CharacterIdentifierStart,
196 /* 121 - y */ CharacterIdentifierStart,
197 /* 122 - z */ CharacterIdentifierStart,
198 /* 123 - { */ CharacterEndMediaQueryOrSupports,
199 /* 124 - | */ CharacterVerticalBar,
200 /* 125 - } */ CharacterOther,
201 /* 126 - ~ */ CharacterTilde,
202 /* 127 - Delete */ CharacterOther,
203 };
204
205 // Utility functions for the CSS tokenizer.
206
207 template <typename CharacterType>
isCSSLetter(CharacterType character)208 static inline bool isCSSLetter(CharacterType character)
209 {
210 return character >= 128 || typesOfASCIICharacters[character] <= CharacterDash;
211 }
212
// Returns true if |character| may follow a backslash in an escape sequence:
// anything from space upwards except DEL (127).
template <typename CharacterType>
static inline bool isCSSEscape(CharacterType character)
{
    return !(character < ' ' || character == 127);
}
218
// Returns true if |character| may appear in an unquoted URI. Accepted are
// '!', the range '#'..'&', and everything from '*' upwards except DEL (127).
template <typename CharacterType>
static inline bool isURILetter(CharacterType character)
{
    if (character >= '*')
        return character != 127;
    if (character >= '#')
        return character <= '&';
    return character == '!';
}
224
// Returns true if an identifier can start at |currentCharacter| once any
// leading dash has been skipped: an ASCII letter, '_', a non-ASCII character,
// or a backslash followed by an escapable character. The second character is
// only read when the first one is a backslash.
template <typename CharacterType>
static inline bool isIdentifierStartAfterDash(CharacterType* currentCharacter)
{
    const CharacterType first = currentCharacter[0];
    if (isASCIIAlpha(first) || first == '_' || first >= 128)
        return true;
    return first == '\\' && isCSSEscape(currentCharacter[1]);
}
231
232 template <typename CharacterType>
isEqualToCSSIdentifier(CharacterType * cssString,const char * constantString)233 static inline bool isEqualToCSSIdentifier(CharacterType* cssString, const char* constantString)
234 {
235 // Compare an character memory data with a zero terminated string.
236 do {
237 // The input must be part of an identifier if constantChar or constString
238 // contains '-'. Otherwise toASCIILowerUnchecked('\r') would be equal to '-'.
239 ASSERT((*constantString >= 'a' && *constantString <= 'z') || *constantString == '-');
240 ASSERT(*constantString != '-' || isCSSLetter(*cssString));
241 if (toASCIILowerUnchecked(*cssString++) != (*constantString++))
242 return false;
243 } while (*constantString);
244 return true;
245 }
246
247 template <typename CharacterType>
isEqualToCSSCaseSensitiveIdentifier(CharacterType * string,const char * constantString)248 static inline bool isEqualToCSSCaseSensitiveIdentifier(CharacterType* string, const char* constantString)
249 {
250 ASSERT(*constantString);
251
252 do {
253 if (*string++ != *constantString++)
254 return false;
255 } while (*constantString);
256 return true;
257 }
258
259 template <typename CharacterType>
checkAndSkipEscape(CharacterType * currentCharacter)260 static CharacterType* checkAndSkipEscape(CharacterType* currentCharacter)
261 {
262 // Returns with 0, if escape check is failed. Otherwise
263 // it returns with the following character.
264 ASSERT(*currentCharacter == '\\');
265
266 ++currentCharacter;
267 if (!isCSSEscape(*currentCharacter))
268 return 0;
269
270 if (isASCIIHexDigit(*currentCharacter)) {
271 int length = 6;
272
273 do {
274 ++currentCharacter;
275 } while (isASCIIHexDigit(*currentCharacter) && --length);
276
277 // Optional space after the escape sequence.
278 if (isHTMLSpace<CharacterType>(*currentCharacter))
279 ++currentCharacter;
280 return currentCharacter;
281 }
282 return currentCharacter + 1;
283 }
284
285 template <typename CharacterType>
skipWhiteSpace(CharacterType * currentCharacter)286 static inline CharacterType* skipWhiteSpace(CharacterType* currentCharacter)
287 {
288 while (isHTMLSpace<CharacterType>(*currentCharacter))
289 ++currentCharacter;
290 return currentCharacter;
291 }
292
293 // Main CSS tokenizer functions.
294
295 template <>
currentCharacter()296 inline LChar*& CSSTokenizer::currentCharacter<LChar>()
297 {
298 return m_currentCharacter8;
299 }
300
301 template <>
currentCharacter()302 inline UChar*& CSSTokenizer::currentCharacter<UChar>()
303 {
304 return m_currentCharacter16;
305 }
306
allocateStringBuffer16(size_t len)307 UChar* CSSTokenizer::allocateStringBuffer16(size_t len)
308 {
309 // Allocates and returns a CSSTokenizer owned buffer for storing
310 // UTF-16 data. Used to get a suitable life span for UTF-16
311 // strings, identifiers and URIs created by the tokenizer.
312 OwnPtr<UChar[]> buffer = adoptArrayPtr(new UChar[len]);
313
314 UChar* bufferPtr = buffer.get();
315
316 m_cssStrings16.append(buffer.release());
317 return bufferPtr;
318 }
319
320 template <>
dataStart()321 inline LChar* CSSTokenizer::dataStart<LChar>()
322 {
323 return m_dataStart8.get();
324 }
325
326 template <>
dataStart()327 inline UChar* CSSTokenizer::dataStart<UChar>()
328 {
329 return m_dataStart16.get();
330 }
331
332 template <typename CharacterType>
tokenLocation()333 inline CSSParserLocation CSSTokenizer::tokenLocation()
334 {
335 CSSParserLocation location;
336 location.token.init(tokenStart<CharacterType>(), currentCharacter<CharacterType>() - tokenStart<CharacterType>());
337 location.lineNumber = m_tokenStartLineNumber;
338 location.offset = tokenStart<CharacterType>() - dataStart<CharacterType>();
339 return location;
340 }
341
currentLocation()342 CSSParserLocation CSSTokenizer::currentLocation()
343 {
344 if (is8BitSource())
345 return tokenLocation<LChar>();
346 return tokenLocation<UChar>();
347 }
348
349 template <typename CharacterType>
isIdentifierStart()350 inline bool CSSTokenizer::isIdentifierStart()
351 {
352 // Check whether an identifier is started.
353 return isIdentifierStartAfterDash((*currentCharacter<CharacterType>() != '-') ? currentCharacter<CharacterType>() : currentCharacter<CharacterType>() + 1);
354 }
355
// Controls how checkAndSkipString() reacts to an unescaped newline or an
// invalid escape inside a string.
enum CheckStringValidationMode {
    // Stop and report failure (a null return).
    AbortIfInvalid = 0,
    // Step over the offending character and keep scanning.
    SkipInvalid = 1
};
360
361 template <typename CharacterType>
checkAndSkipString(CharacterType * currentCharacter,int quote,CheckStringValidationMode mode)362 static inline CharacterType* checkAndSkipString(CharacterType* currentCharacter, int quote, CheckStringValidationMode mode)
363 {
364 // If mode is AbortIfInvalid and the string check fails it returns
365 // with 0. Otherwise it returns with a pointer to the first
366 // character after the string.
367 while (true) {
368 if (UNLIKELY(*currentCharacter == quote)) {
369 // String parsing is successful.
370 return currentCharacter + 1;
371 }
372 if (UNLIKELY(!*currentCharacter)) {
373 // String parsing is successful up to end of input.
374 return currentCharacter;
375 }
376 if (mode == AbortIfInvalid && UNLIKELY(*currentCharacter <= '\r' && (*currentCharacter == '\n' || (*currentCharacter | 0x1) == '\r'))) {
377 // String parsing is failed for character '\n', '\f' or '\r'.
378 return 0;
379 }
380
381 if (LIKELY(currentCharacter[0] != '\\')) {
382 ++currentCharacter;
383 } else if (currentCharacter[1] == '\n' || currentCharacter[1] == '\f') {
384 currentCharacter += 2;
385 } else if (currentCharacter[1] == '\r') {
386 currentCharacter += currentCharacter[2] == '\n' ? 3 : 2;
387 } else {
388 CharacterType* next = checkAndSkipEscape(currentCharacter);
389 if (!next) {
390 if (mode == AbortIfInvalid)
391 return 0;
392 next = currentCharacter + 1;
393 }
394 currentCharacter = next;
395 }
396 }
397 }
398
399 template <typename CharacterType>
parseEscape(CharacterType * & src)400 unsigned CSSTokenizer::parseEscape(CharacterType*& src)
401 {
402 ASSERT(*src == '\\' && isCSSEscape(src[1]));
403
404 unsigned unicode = 0;
405
406 ++src;
407 if (isASCIIHexDigit(*src)) {
408
409 int length = 6;
410
411 do {
412 unicode = (unicode << 4) + toASCIIHexValue(*src++);
413 } while (--length && isASCIIHexDigit(*src));
414
415 // Characters above 0x10ffff are not handled.
416 if (unicode > 0x10ffff)
417 unicode = 0xfffd;
418
419 // Optional space after the escape sequence.
420 if (isHTMLSpace<CharacterType>(*src))
421 ++src;
422
423 return unicode;
424 }
425
426 return *src++;
427 }
428
429 template <>
UnicodeToChars(LChar * & result,unsigned unicode)430 inline void CSSTokenizer::UnicodeToChars<LChar>(LChar*& result, unsigned unicode)
431 {
432 ASSERT(unicode <= 0xff);
433 *result = unicode;
434
435 ++result;
436 }
437
438 template <>
UnicodeToChars(UChar * & result,unsigned unicode)439 inline void CSSTokenizer::UnicodeToChars<UChar>(UChar*& result, unsigned unicode)
440 {
441 // Replace unicode with a surrogate pairs when it is bigger than 0xffff
442 if (U16_LENGTH(unicode) == 2) {
443 *result++ = U16_LEAD(unicode);
444 *result = U16_TRAIL(unicode);
445 } else {
446 *result = unicode;
447 }
448
449 ++result;
450 }
451
452 template <typename SrcCharacterType>
peekMaxIdentifierLen(SrcCharacterType * src)453 size_t CSSTokenizer::peekMaxIdentifierLen(SrcCharacterType* src)
454 {
455 // The decoded form of an identifier (after resolving escape
456 // sequences) will not contain more characters (ASCII or UTF-16
457 // codepoints) than the input. This code can therefore ignore
458 // escape sequences completely.
459 SrcCharacterType* start = src;
460 do {
461 if (LIKELY(*src != '\\'))
462 src++;
463 else
464 parseEscape<SrcCharacterType>(src);
465 } while (isCSSLetter(src[0]) || (src[0] == '\\' && isCSSEscape(src[1])));
466
467 return src - start;
468 }
469
470 template <typename SrcCharacterType, typename DestCharacterType>
parseIdentifierInternal(SrcCharacterType * & src,DestCharacterType * & result,bool & hasEscape)471 inline bool CSSTokenizer::parseIdentifierInternal(SrcCharacterType*& src, DestCharacterType*& result, bool& hasEscape)
472 {
473 hasEscape = false;
474 do {
475 if (LIKELY(*src != '\\')) {
476 *result++ = *src++;
477 } else {
478 hasEscape = true;
479 SrcCharacterType* savedEscapeStart = src;
480 unsigned unicode = parseEscape<SrcCharacterType>(src);
481 if (unicode > 0xff && sizeof(DestCharacterType) == 1) {
482 src = savedEscapeStart;
483 return false;
484 }
485 UnicodeToChars(result, unicode);
486 }
487 } while (isCSSLetter(src[0]) || (src[0] == '\\' && isCSSEscape(src[1])));
488
489 return true;
490 }
491
492 template <typename CharacterType>
parseIdentifier(CharacterType * & result,CSSParserString & resultString,bool & hasEscape)493 inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserString& resultString, bool& hasEscape)
494 {
495 // If a valid identifier start is found, we can safely
496 // parse the identifier until the next invalid character.
497 ASSERT(isIdentifierStart<CharacterType>());
498
499 CharacterType* start = currentCharacter<CharacterType>();
500 if (UNLIKELY(!parseIdentifierInternal(currentCharacter<CharacterType>(), result, hasEscape))) {
501 // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue
502 ASSERT(is8BitSource());
503 UChar* result16 = allocateStringBuffer16((result - start) + peekMaxIdentifierLen(currentCharacter<CharacterType>()));
504 UChar* start16 = result16;
505 int i = 0;
506 for (; i < result - start; i++)
507 result16[i] = start[i];
508
509 result16 += i;
510
511 parseIdentifierInternal(currentCharacter<CharacterType>(), result16, hasEscape);
512
513 resultString.init(start16, result16 - start16);
514
515 return;
516 }
517
518 resultString.init(start, result - start);
519 }
520
521 template <typename SrcCharacterType>
peekMaxStringLen(SrcCharacterType * src,UChar quote)522 size_t CSSTokenizer::peekMaxStringLen(SrcCharacterType* src, UChar quote)
523 {
524 // The decoded form of a CSS string (after resolving escape
525 // sequences) will not contain more characters (ASCII or UTF-16
526 // codepoints) than the input. This code can therefore ignore
527 // escape sequences completely and just return the length of the
528 // input string (possibly including terminating quote if any).
529 SrcCharacterType* end = checkAndSkipString(src, quote, SkipInvalid);
530 return end ? end - src : 0;
531 }
532
533 template <typename SrcCharacterType, typename DestCharacterType>
parseStringInternal(SrcCharacterType * & src,DestCharacterType * & result,UChar quote)534 inline bool CSSTokenizer::parseStringInternal(SrcCharacterType*& src, DestCharacterType*& result, UChar quote)
535 {
536 while (true) {
537 if (UNLIKELY(*src == quote)) {
538 // String parsing is done.
539 ++src;
540 return true;
541 }
542 if (UNLIKELY(!*src)) {
543 // String parsing is done, but don't advance pointer if at the end of input.
544 return true;
545 }
546 if (LIKELY(src[0] != '\\')) {
547 *result++ = *src++;
548 } else if (src[1] == '\n' || src[1] == '\f') {
549 src += 2;
550 } else if (src[1] == '\r') {
551 src += src[2] == '\n' ? 3 : 2;
552 } else {
553 SrcCharacterType* savedEscapeStart = src;
554 unsigned unicode = parseEscape<SrcCharacterType>(src);
555 if (unicode > 0xff && sizeof(DestCharacterType) == 1) {
556 src = savedEscapeStart;
557 return false;
558 }
559 UnicodeToChars(result, unicode);
560 }
561 }
562
563 return true;
564 }
565
566 template <typename CharacterType>
parseString(CharacterType * & result,CSSParserString & resultString,UChar quote)567 inline void CSSTokenizer::parseString(CharacterType*& result, CSSParserString& resultString, UChar quote)
568 {
569 CharacterType* start = currentCharacter<CharacterType>();
570
571 if (UNLIKELY(!parseStringInternal(currentCharacter<CharacterType>(), result, quote))) {
572 // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue
573 ASSERT(is8BitSource());
574 UChar* result16 = allocateStringBuffer16((result - start) + peekMaxStringLen(currentCharacter<CharacterType>(), quote));
575 UChar* start16 = result16;
576 int i = 0;
577 for (; i < result - start; i++)
578 result16[i] = start[i];
579
580 result16 += i;
581
582 parseStringInternal(currentCharacter<CharacterType>(), result16, quote);
583
584 resultString.init(start16, result16 - start16);
585 return;
586 }
587
588 resultString.init(start, result - start);
589 }
590
591 template <typename CharacterType>
findURI(CharacterType * & start,CharacterType * & end,UChar & quote)592 inline bool CSSTokenizer::findURI(CharacterType*& start, CharacterType*& end, UChar& quote)
593 {
594 start = skipWhiteSpace(currentCharacter<CharacterType>());
595
596 if (*start == '"' || *start == '\'') {
597 quote = *start++;
598 end = checkAndSkipString(start, quote, AbortIfInvalid);
599 if (!end)
600 return false;
601 } else {
602 quote = 0;
603 end = start;
604 while (isURILetter(*end)) {
605 if (LIKELY(*end != '\\')) {
606 ++end;
607 } else {
608 end = checkAndSkipEscape(end);
609 if (!end)
610 return false;
611 }
612 }
613 }
614
615 end = skipWhiteSpace(end);
616 if (*end != ')')
617 return false;
618
619 return true;
620 }
621
622 template <typename SrcCharacterType>
peekMaxURILen(SrcCharacterType * src,UChar quote)623 inline size_t CSSTokenizer::peekMaxURILen(SrcCharacterType* src, UChar quote)
624 {
625 // The decoded form of a URI (after resolving escape sequences)
626 // will not contain more characters (ASCII or UTF-16 codepoints)
627 // than the input. This code can therefore ignore escape sequences
628 // completely.
629 SrcCharacterType* start = src;
630 if (quote) {
631 ASSERT(quote == '"' || quote == '\'');
632 return peekMaxStringLen(src, quote);
633 }
634
635 while (isURILetter(*src)) {
636 if (LIKELY(*src != '\\'))
637 src++;
638 else
639 parseEscape<SrcCharacterType>(src);
640 }
641
642 return src - start;
643 }
644
645 template <typename SrcCharacterType, typename DestCharacterType>
parseURIInternal(SrcCharacterType * & src,DestCharacterType * & dest,UChar quote)646 inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacterType*& dest, UChar quote)
647 {
648 if (quote) {
649 ASSERT(quote == '"' || quote == '\'');
650 return parseStringInternal(src, dest, quote);
651 }
652
653 while (isURILetter(*src)) {
654 if (LIKELY(*src != '\\')) {
655 *dest++ = *src++;
656 } else {
657 unsigned unicode = parseEscape<SrcCharacterType>(src);
658 if (unicode > 0xff && sizeof(DestCharacterType) == 1)
659 return false;
660 UnicodeToChars(dest, unicode);
661 }
662 }
663
664 return true;
665 }
666
667 template <typename CharacterType>
parseURI(CSSParserString & string)668 inline void CSSTokenizer::parseURI(CSSParserString& string)
669 {
670 CharacterType* uriStart;
671 CharacterType* uriEnd;
672 UChar quote;
673 if (!findURI(uriStart, uriEnd, quote))
674 return;
675
676 CharacterType* dest = currentCharacter<CharacterType>() = uriStart;
677 if (LIKELY(parseURIInternal(currentCharacter<CharacterType>(), dest, quote))) {
678 string.init(uriStart, dest - uriStart);
679 } else {
680 // An escape sequence was encountered that can't be stored in 8 bits.
681 // Reset the current character to the start of the URI and re-parse with
682 // a 16-bit destination.
683 ASSERT(is8BitSource());
684 currentCharacter<CharacterType>() = uriStart;
685 UChar* result16 = allocateStringBuffer16(peekMaxURILen(currentCharacter<CharacterType>(), quote));
686 UChar* uriStart16 = result16;
687 bool result = parseURIInternal(currentCharacter<CharacterType>(), result16, quote);
688 ASSERT_UNUSED(result, result);
689 string.init(uriStart16, result16 - uriStart16);
690 }
691
692 currentCharacter<CharacterType>() = uriEnd + 1;
693 m_token = URI;
694 }
695
696 template <typename CharacterType>
parseUnicodeRange()697 inline bool CSSTokenizer::parseUnicodeRange()
698 {
699 CharacterType* character = currentCharacter<CharacterType>() + 1;
700 int length = 6;
701 ASSERT(*currentCharacter<CharacterType>() == '+');
702
703 while (isASCIIHexDigit(*character) && length) {
704 ++character;
705 --length;
706 }
707
708 if (length && *character == '?') {
709 // At most 5 hex digit followed by a question mark.
710 do {
711 ++character;
712 --length;
713 } while (*character == '?' && length);
714 currentCharacter<CharacterType>() = character;
715 return true;
716 }
717
718 if (length < 6) {
719 // At least one hex digit.
720 if (character[0] == '-' && isASCIIHexDigit(character[1])) {
721 // Followed by a dash and a hex digit.
722 ++character;
723 length = 6;
724 do {
725 ++character;
726 } while (--length && isASCIIHexDigit(*character));
727 }
728 currentCharacter<CharacterType>() = character;
729 return true;
730 }
731 return false;
732 }
733
734 template <typename CharacterType>
parseNthChild()735 bool CSSTokenizer::parseNthChild()
736 {
737 CharacterType* character = currentCharacter<CharacterType>();
738
739 while (isASCIIDigit(*character))
740 ++character;
741 if (isASCIIAlphaCaselessEqual(*character, 'n')) {
742 currentCharacter<CharacterType>() = character + 1;
743 return true;
744 }
745 return false;
746 }
747
748 template <typename CharacterType>
parseNthChildExtra()749 bool CSSTokenizer::parseNthChildExtra()
750 {
751 CharacterType* character = skipWhiteSpace(currentCharacter<CharacterType>());
752 if (*character != '+' && *character != '-')
753 return false;
754
755 character = skipWhiteSpace(character + 1);
756 if (!isASCIIDigit(*character))
757 return false;
758
759 do {
760 ++character;
761 } while (isASCIIDigit(*character));
762
763 currentCharacter<CharacterType>() = character;
764 return true;
765 }
766
767 template <typename CharacterType>
detectFunctionTypeToken(int length)768 inline bool CSSTokenizer::detectFunctionTypeToken(int length)
769 {
770 ASSERT(length > 0);
771 CharacterType* name = tokenStart<CharacterType>();
772 SWITCH(name, length) {
773 CASE("not") {
774 m_token = NOTFUNCTION;
775 return true;
776 }
777 CASE("url") {
778 m_token = URI;
779 return true;
780 }
781 CASE("cue") {
782 m_token = CUEFUNCTION;
783 return true;
784 }
785 CASE("calc") {
786 m_token = CALCFUNCTION;
787 return true;
788 }
789 CASE("host") {
790 m_token = HOSTFUNCTION;
791 return true;
792 }
793 CASE("host-context") {
794 m_token = HOSTCONTEXTFUNCTION;
795 return true;
796 }
797 CASE("nth-child") {
798 m_parsingMode = NthChildMode;
799 return true;
800 }
801 CASE("nth-of-type") {
802 m_parsingMode = NthChildMode;
803 return true;
804 }
805 CASE("nth-last-child") {
806 m_parsingMode = NthChildMode;
807 return true;
808 }
809 CASE("nth-last-of-type") {
810 m_parsingMode = NthChildMode;
811 return true;
812 }
813 }
814 return false;
815 }
816
817 template <typename CharacterType>
detectMediaQueryToken(int length)818 inline void CSSTokenizer::detectMediaQueryToken(int length)
819 {
820 ASSERT(m_parsingMode == MediaQueryMode);
821 CharacterType* name = tokenStart<CharacterType>();
822
823 SWITCH(name, length) {
824 CASE("and") {
825 m_token = MEDIA_AND;
826 }
827 CASE("not") {
828 m_token = MEDIA_NOT;
829 }
830 CASE("only") {
831 m_token = MEDIA_ONLY;
832 }
833 CASE("or") {
834 m_token = MEDIA_OR;
835 }
836 }
837 }
838
839 template <typename CharacterType>
detectNumberToken(CharacterType * type,int length)840 inline void CSSTokenizer::detectNumberToken(CharacterType* type, int length)
841 {
842 ASSERT(length > 0);
843
844 SWITCH(type, length) {
845 CASE("cm") {
846 m_token = CMS;
847 }
848 CASE("ch") {
849 m_token = CHS;
850 }
851 CASE("deg") {
852 m_token = DEGS;
853 }
854 CASE("dppx") {
855 // There is a discussion about the name of this unit on www-style.
856 // Keep this compile time guard in place until that is resolved.
857 // http://lists.w3.org/Archives/Public/www-style/2012May/0915.html
858 m_token = DPPX;
859 }
860 CASE("dpcm") {
861 m_token = DPCM;
862 }
863 CASE("dpi") {
864 m_token = DPI;
865 }
866 CASE("em") {
867 m_token = EMS;
868 }
869 CASE("ex") {
870 m_token = EXS;
871 }
872 CASE("fr") {
873 m_token = FR;
874 }
875 CASE("grad") {
876 m_token = GRADS;
877 }
878 CASE("hz") {
879 m_token = HERTZ;
880 }
881 CASE("in") {
882 m_token = INS;
883 }
884 CASE("khz") {
885 m_token = KHERTZ;
886 }
887 CASE("mm") {
888 m_token = MMS;
889 }
890 CASE("ms") {
891 m_token = MSECS;
892 }
893 CASE("px") {
894 m_token = PXS;
895 }
896 CASE("pt") {
897 m_token = PTS;
898 }
899 CASE("pc") {
900 m_token = PCS;
901 }
902 CASE("rad") {
903 m_token = RADS;
904 }
905 CASE("rem") {
906 m_token = REMS;
907 }
908 CASE("s") {
909 m_token = SECS;
910 }
911 CASE("turn") {
912 m_token = TURNS;
913 }
914 CASE("vw") {
915 m_token = VW;
916 }
917 CASE("vh") {
918 m_token = VH;
919 }
920 CASE("vmin") {
921 m_token = VMIN;
922 }
923 CASE("vmax") {
924 m_token = VMAX;
925 }
926 CASE("__qem") {
927 m_token = QEMS;
928 }
929 }
930 }
931
932 template <typename CharacterType>
detectDashToken(int length)933 inline void CSSTokenizer::detectDashToken(int length)
934 {
935 CharacterType* name = tokenStart<CharacterType>();
936
937 // Ignore leading dash.
938 ++name;
939 --length;
940
941 SWITCH(name, length) {
942 CASE("webkit-any") {
943 m_token = ANYFUNCTION;
944 }
945 CASE("webkit-calc") {
946 m_token = CALCFUNCTION;
947 }
948 }
949 }
950
// Classifies the at-keyword whose text starts at tokenStart<CharacterType>() and,
// when the name is recognized, replaces m_token with the matching *_SYM token.
//
// length:    number of characters in the keyword, counting the leading '@'
//            (asserted to be at least 2).
// hasEscape: the escape flag the caller obtained from parseIdentifier for this
//            keyword; several rules are only recognized when it is false.
//
// Side effects: may change m_token and, for import/media/supports and
// -internal-supports-condition, also m_parsingMode.
//
// NOTE(review): SWITCH/CASE are not defined in this part of the file; presumably
// they are expanded by macros or a build-time generator into a match of
// |name|/|length| against the listed strings, and an unmatched name leaves
// m_token untouched. Confirm before restructuring the cases.
template <typename CharacterType>
inline void CSSTokenizer::detectAtToken(int length, bool hasEscape)
{
    CharacterType* name = tokenStart<CharacterType>();
    ASSERT(name[0] == '@' && length >= 2);

    // Ignore leading @.
    ++name;
    --length;

    // The page-margin rules (bottom-*, left-*, right-*, top-*) and most
    // -internal-* rules are only recognized when hasEscape is false.
    // charset, font-face, import, keyframes, media, namespace, page, supports,
    // viewport, -webkit-keyframes, -internal-keyframe-key-list and
    // -internal-supports-condition do not look at hasEscape.
    SWITCH(name, length) {
        CASE("bottom-left") {
            if (LIKELY(!hasEscape))
                m_token = BOTTOMLEFT_SYM;
        }
        CASE("bottom-right") {
            if (LIKELY(!hasEscape))
                m_token = BOTTOMRIGHT_SYM;
        }
        CASE("bottom-center") {
            if (LIKELY(!hasEscape))
                m_token = BOTTOMCENTER_SYM;
        }
        CASE("bottom-left-corner") {
            if (LIKELY(!hasEscape))
                m_token = BOTTOMLEFTCORNER_SYM;
        }
        CASE("bottom-right-corner") {
            if (LIKELY(!hasEscape))
                m_token = BOTTOMRIGHTCORNER_SYM;
        }
        CASE("charset") {
            // name was advanced past '@' above, so name - 1 is the '@' itself:
            // the rule is only recognized at the very start of the buffer.
            if (name - 1 == dataStart<CharacterType>())
                m_token = CHARSET_SYM;
        }
        CASE("font-face") {
            m_token = FONT_FACE_SYM;
        }
        CASE("import") {
            // The tokens that follow are lexed in media query mode.
            m_parsingMode = MediaQueryMode;
            m_token = IMPORT_SYM;
        }
        CASE("keyframes") {
            // Unprefixed @keyframes is gated on a runtime feature flag.
            if (RuntimeEnabledFeatures::cssAnimationUnprefixedEnabled())
                m_token = KEYFRAMES_SYM;
        }
        CASE("left-top") {
            if (LIKELY(!hasEscape))
                m_token = LEFTTOP_SYM;
        }
        CASE("left-middle") {
            if (LIKELY(!hasEscape))
                m_token = LEFTMIDDLE_SYM;
        }
        CASE("left-bottom") {
            if (LIKELY(!hasEscape))
                m_token = LEFTBOTTOM_SYM;
        }
        CASE("media") {
            // The tokens that follow are lexed in media query mode.
            m_parsingMode = MediaQueryMode;
            m_token = MEDIA_SYM;
        }
        CASE("namespace") {
            m_token = NAMESPACE_SYM;
        }
        CASE("page") {
            m_token = PAGE_SYM;
        }
        CASE("right-top") {
            if (LIKELY(!hasEscape))
                m_token = RIGHTTOP_SYM;
        }
        CASE("right-middle") {
            if (LIKELY(!hasEscape))
                m_token = RIGHTMIDDLE_SYM;
        }
        CASE("right-bottom") {
            if (LIKELY(!hasEscape))
                m_token = RIGHTBOTTOM_SYM;
        }
        CASE("supports") {
            // The tokens that follow are lexed in supports mode.
            m_parsingMode = SupportsMode;
            m_token = SUPPORTS_SYM;
        }
        CASE("top-left") {
            if (LIKELY(!hasEscape))
                m_token = TOPLEFT_SYM;
        }
        CASE("top-right") {
            if (LIKELY(!hasEscape))
                m_token = TOPRIGHT_SYM;
        }
        CASE("top-center") {
            if (LIKELY(!hasEscape))
                m_token = TOPCENTER_SYM;
        }
        CASE("top-left-corner") {
            if (LIKELY(!hasEscape))
                m_token = TOPLEFTCORNER_SYM;
        }
        CASE("top-right-corner") {
            if (LIKELY(!hasEscape))
                m_token = TOPRIGHTCORNER_SYM;
        }
        CASE("viewport") {
            m_token = VIEWPORT_RULE_SYM;
        }
        // The -internal-* rules below additionally require m_internal to be set.
        CASE("-internal-rule") {
            if (LIKELY(!hasEscape && m_internal))
                m_token = INTERNAL_RULE_SYM;
        }
        CASE("-internal-decls") {
            if (LIKELY(!hasEscape && m_internal))
                m_token = INTERNAL_DECLS_SYM;
        }
        CASE("-internal-value") {
            if (LIKELY(!hasEscape && m_internal))
                m_token = INTERNAL_VALUE_SYM;
        }
        CASE("-webkit-keyframes") {
            m_token = WEBKIT_KEYFRAMES_SYM;
        }
        CASE("-internal-selector") {
            if (LIKELY(!hasEscape && m_internal))
                m_token = INTERNAL_SELECTOR_SYM;
        }
        CASE("-internal-keyframe-rule") {
            if (LIKELY(!hasEscape && m_internal))
                m_token = INTERNAL_KEYFRAME_RULE_SYM;
        }
        CASE("-internal-keyframe-key-list") {
            // Unlike the other -internal-* rules, hasEscape is not consulted here.
            if (!m_internal)
                return;
            m_token = INTERNAL_KEYFRAME_KEY_LIST_SYM;
        }
        CASE("-internal-supports-condition") {
            // Unlike the other -internal-* rules, hasEscape is not consulted here.
            if (!m_internal)
                return;
            m_parsingMode = SupportsMode;
            m_token = INTERNAL_SUPPORTS_CONDITION_SYM;
        }
    }
}
1096
// Re-classifies the identifier that starts at tokenStart<CharacterType>() as one
// of the @supports operators: "or", "and" and "not" set m_token to SUPPORTS_OR,
// SUPPORTS_AND and SUPPORTS_NOT respectively.
//
// length: number of characters of the identifier to compare.
//
// Must only be called while m_parsingMode is SupportsMode (asserted).
//
// NOTE(review): SWITCH/CASE are not defined in this part of the file; presumably
// an identifier that matches none of the cases leaves m_token unchanged — the
// caller in realLex relies on comparing m_token against IDENT afterwards.
template <typename CharacterType>
inline void CSSTokenizer::detectSupportsToken(int length)
{
    ASSERT(m_parsingMode == SupportsMode);
    CharacterType* name = tokenStart<CharacterType>();

    SWITCH(name, length) {
        CASE("or") {
            m_token = SUPPORTS_OR;
        }
        CASE("and") {
            m_token = SUPPORTS_AND;
        }
        CASE("not") {
            m_token = SUPPORTS_NOT;
        }
    }
}
1115
// Lexes the next token from the source buffer, starting at
// currentCharacter<SrcCharacterType>(), and returns its token code (the same
// value is left in m_token).
//
// yylvalWithoutType: pointer to the parser's YYSTYPE. Tokens that carry text
//                    store it in yylval->string; numeric tokens store the
//                    parsed value in yylval->number.
//
// Side effects: advances currentCharacter, records the token start and its line
// number, counts newlines in m_lineNumber, may change m_parsingMode, and
// reports comments to m_parser.m_observer when one is installed. Comments
// produce no token: after skipping one the scan restarts.
//
// Tokens for which no longer match is found are returned as the code of their
// first character.
template <typename SrcCharacterType>
int CSSTokenizer::realLex(void* yylvalWithoutType)
{
    YYSTYPE* yylval = static_cast<YYSTYPE*>(yylvalWithoutType);
    // Write pointer for the next character.
    SrcCharacterType* result;
    CSSParserString resultString;
    bool hasEscape;

    // The input buffer is terminated by a \0 character, so
    // it is safe to read one character ahead of a known non-null.
#if ENABLE(ASSERT)
    // In debug we check with an ASSERT that the length is > 0 for string types.
    yylval->string.clear();
#endif

restartAfterComment:
    // Record where the token begins, then consume its first character.
    // m_token provisionally holds that raw character; the cases below only
    // replace it when a longer token is recognized.
    result = currentCharacter<SrcCharacterType>();
    setTokenStart(result);
    m_tokenStartLineNumber = m_lineNumber;
    m_token = *currentCharacter<SrcCharacterType>();
    ++currentCharacter<SrcCharacterType>();

    // Dispatch on the class of the first character. Values above 127 are not
    // looked up in typesOfASCIICharacters and are handled as identifier starts.
    switch ((m_token <= 127) ? typesOfASCIICharacters[m_token] : CharacterIdentifierStart) {
    case CharacterCaselessU:
        // Followed by '+', this may be a unicode range; otherwise it is an
        // ordinary identifier.
        if (UNLIKELY(*currentCharacter<SrcCharacterType>() == '+')) {
            if (parseUnicodeRange<SrcCharacterType>()) {
                m_token = UNICODERANGE;
                yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
                break;
            }
        }
        // Fall through to CharacterIdentifierStart.

    case CharacterIdentifierStart:
        // Un-consume the first character so the identifier is parsed from the
        // token start.
        --currentCharacter<SrcCharacterType>();
        parseIdentifier(result, yylval->string, hasEscape);
        m_token = IDENT;
        // From here on, result - tokenStart is used as the identifier length
        // (presumably parseIdentifier leaves result one past the last character
        // it wrote — confirm against parseIdentifier).

        if (UNLIKELY(*currentCharacter<SrcCharacterType>() == '(')) {
            if (m_parsingMode == SupportsMode && !hasEscape) {
                detectSupportsToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>());
                // A supports operator directly followed by '(' stays an operator
                // token; the parenthesis is left for the next call.
                if (m_token != IDENT)
                    break;
            }

            m_token = FUNCTION;
            if (!hasEscape)
                detectFunctionTypeToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>());

            // Skip parenthesis
            ++currentCharacter<SrcCharacterType>();
            ++result;

            if (m_token == URI) {
                // NOTE(review): m_token is reset to FUNCTION before parseURI
                // runs; presumably parseURI sets it back to URI when the
                // argument really is a url — confirm.
                m_token = FUNCTION;
                // Check whether it is really an URI.
                if (yylval->string.is8Bit())
                    parseURI<LChar>(yylval->string);
                else
                    parseURI<UChar>(yylval->string);
            }
        } else if (UNLIKELY(m_parsingMode != NormalMode) && !hasEscape) {
            // Mode-specific keywords are only looked for in unescaped identifiers.
            if (m_parsingMode == MediaQueryMode) {
                detectMediaQueryToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>());
            } else if (m_parsingMode == SupportsMode) {
                detectSupportsToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>());
            } else if (m_parsingMode == NthChildMode && isASCIIAlphaCaselessEqual(tokenStart<SrcCharacterType>()[0], 'n')) {
                if (result - tokenStart<SrcCharacterType>() == 1) {
                    // String "n" is IDENT but "n+1" is NTH.
                    if (parseNthChildExtra<SrcCharacterType>()) {
                        m_token = NTH;
                        yylval->string.m_length = currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>();
                    }
                } else if (result - tokenStart<SrcCharacterType>() >= 2 && tokenStart<SrcCharacterType>()[1] == '-') {
                    // String "n-" is IDENT but "n-1" is NTH.
                    // Set currentCharacter to '-' to continue parsing.
                    SrcCharacterType* nextCharacter = result;
                    currentCharacter<SrcCharacterType>() = tokenStart<SrcCharacterType>() + 1;
                    if (parseNthChildExtra<SrcCharacterType>()) {
                        m_token = NTH;
                        yylval->string.setLength(currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
                    } else {
                        // Revert the change to currentCharacter if unsuccessful.
                        currentCharacter<SrcCharacterType>() = nextCharacter;
                    }
                }
            }
        }
        break;

    case CharacterDot:
        // A '.' that is not followed by a digit is returned as a plain '.'.
        if (!isASCIIDigit(currentCharacter<SrcCharacterType>()[0]))
            break;
        // Fall through to CharacterNumber.

    case CharacterNumber: {
        // True when the token itself started with '.' (fallthrough from above).
        bool dotSeen = (m_token == '.');

        while (true) {
            if (!isASCIIDigit(currentCharacter<SrcCharacterType>()[0])) {
                // Only one dot is allowed for a number,
                // and it must be followed by a digit.
                if (currentCharacter<SrcCharacterType>()[0] != '.' || dotSeen || !isASCIIDigit(currentCharacter<SrcCharacterType>()[1]))
                    break;
                dotSeen = true;
            }
            ++currentCharacter<SrcCharacterType>();
        }

        if (UNLIKELY(m_parsingMode == NthChildMode) && !dotSeen && isASCIIAlphaCaselessEqual(*currentCharacter<SrcCharacterType>(), 'n')) {
            // "[0-9]+n" is always an NthChild.
            ++currentCharacter<SrcCharacterType>();
            // The result is ignored on purpose: the token is NTH whether or not
            // an offset follows.
            parseNthChildExtra<SrcCharacterType>();
            m_token = NTH;
            yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
            break;
        }

        // We need to take care of units like 'em' or 'ex'.
        // An 'e' is only consumed as an exponent marker when a sign or a digit
        // follows it; otherwise currentCharacter stays on the 'e' so that it is
        // lexed as the start of a unit below.
        SrcCharacterType* character = currentCharacter<SrcCharacterType>();
        if (isASCIIAlphaCaselessEqual(*character, 'e')) {
            ASSERT(character - tokenStart<SrcCharacterType>() > 0);
            ++character;
            if (*character == '-' || *character == '+' || isASCIIDigit(*character)) {
                ++character;
                while (isASCIIDigit(*character))
                    ++character;
                // Use FLOATTOKEN if the string contains exponents.
                dotSeen = true;
                currentCharacter<SrcCharacterType>() = character;
            }
        }

        // Convert the numeric part before looking at what follows it.
        yylval->number = charactersToDouble(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());

        // Type of the function.
        if (isIdentifierStart<SrcCharacterType>()) {
            // An identifier directly after the number is its unit.
            SrcCharacterType* type = currentCharacter<SrcCharacterType>();
            result = currentCharacter<SrcCharacterType>();

            parseIdentifier(result, resultString, hasEscape);

            m_token = DIMEN;
            // Presumably detectNumberToken replaces DIMEN with a more specific
            // token for units it knows — confirm against detectNumberToken.
            if (!hasEscape)
                detectNumberToken(type, currentCharacter<SrcCharacterType>() - type);

            if (m_token == DIMEN) {
                // The decoded number is overwritten, but this is intentional.
                yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
            }
        } else if (*currentCharacter<SrcCharacterType>() == '%') {
            // Although the CSS grammar says {num}% we follow
            // webkit at the moment which uses {num}%+.
            do {
                ++currentCharacter<SrcCharacterType>();
            } while (*currentCharacter<SrcCharacterType>() == '%');
            m_token = PERCENTAGE;
        } else {
            m_token = dotSeen ? FLOATTOKEN : INTEGER;
        }
        break;
    }

    case CharacterDash:
        if (isIdentifierStartAfterDash(currentCharacter<SrcCharacterType>())) {
            // Identifier starting with '-'. Unlike the plain identifier case, the
            // text is collected in the local resultString and only copied to
            // yylval->string at the end of this branch, after any length
            // adjustment made for NTH tokens.
            --currentCharacter<SrcCharacterType>();
            parseIdentifier(result, resultString, hasEscape);
            m_token = IDENT;

            if (*currentCharacter<SrcCharacterType>() == '(') {
                m_token = FUNCTION;
                if (!hasEscape)
                    detectDashToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>());
                // Consume the opening parenthesis.
                ++currentCharacter<SrcCharacterType>();
                ++result;
            } else if (UNLIKELY(m_parsingMode == NthChildMode) && !hasEscape && isASCIIAlphaCaselessEqual(tokenStart<SrcCharacterType>()[1], 'n')) {
                if (result - tokenStart<SrcCharacterType>() == 2) {
                    // String "-n" is IDENT but "-n+1" is NTH.
                    if (parseNthChildExtra<SrcCharacterType>()) {
                        m_token = NTH;
                        result = currentCharacter<SrcCharacterType>();
                    }
                } else if (result - tokenStart<SrcCharacterType>() >= 3 && tokenStart<SrcCharacterType>()[2] == '-') {
                    // String "-n-" is IDENT but "-n-1" is NTH.
                    // Set currentCharacter to second '-' of '-n-' to continue parsing.
                    SrcCharacterType* nextCharacter = result;
                    currentCharacter<SrcCharacterType>() = tokenStart<SrcCharacterType>() + 2;
                    if (parseNthChildExtra<SrcCharacterType>()) {
                        m_token = NTH;
                        result = currentCharacter<SrcCharacterType>();
                    } else {
                        // Revert the change to currentCharacter if unsuccessful.
                        currentCharacter<SrcCharacterType>() = nextCharacter;
                    }
                }
                // result was moved to the end of the NTH token above when one
                // was recognized; make the string length follow it.
                resultString.setLength(result - tokenStart<SrcCharacterType>());
            }
            yylval->string = resultString;
        } else if (currentCharacter<SrcCharacterType>()[0] == '-' && currentCharacter<SrcCharacterType>()[1] == '>') {
            // "-->"
            currentCharacter<SrcCharacterType>() += 2;
            m_token = SGML_CD;
        } else if (UNLIKELY(m_parsingMode == NthChildMode)) {
            // "-[0-9]+n" is always an NthChild.
            if (parseNthChild<SrcCharacterType>()) {
                parseNthChildExtra<SrcCharacterType>();
                m_token = NTH;
                yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
            }
        }
        break;

    case CharacterOther:
        // m_token is simply the current character.
        break;

    case CharacterNull:
        // Do not advance pointer at the end of input.
        --currentCharacter<SrcCharacterType>();
        break;

    case CharacterWhiteSpace:
        m_token = WHITESPACE;
        // Might start with a '\n'.
        --currentCharacter<SrcCharacterType>();
        // Consume the whole run of white space, counting newlines on the way.
        // The "<= ' '" test keeps the table lookup to small character values.
        do {
            if (*currentCharacter<SrcCharacterType>() == '\n')
                ++m_lineNumber;
            ++currentCharacter<SrcCharacterType>();
        } while (*currentCharacter<SrcCharacterType>() <= ' ' && (typesOfASCIICharacters[*currentCharacter<SrcCharacterType>()] == CharacterWhiteSpace));
        break;

    case CharacterEndMediaQueryOrSupports:
        // Leaves media query / supports mode; the character itself is the token.
        if (m_parsingMode == MediaQueryMode || m_parsingMode == SupportsMode)
            m_parsingMode = NormalMode;
        break;

    case CharacterEndNthChild:
        // Leaves nth-child mode; the character itself is the token.
        if (m_parsingMode == NthChildMode)
            m_parsingMode = NormalMode;
        break;

    case CharacterQuote:
        // m_token still holds the opening quote character here and is passed to
        // both helpers. If the check fails, the quote is returned as a plain
        // character token.
        if (checkAndSkipString(currentCharacter<SrcCharacterType>(), m_token, AbortIfInvalid)) {
            // Step over the opening quote before parsing the string contents.
            ++result;
            parseString<SrcCharacterType>(result, yylval->string, m_token);
            m_token = STRING;
        }
        break;

    case CharacterExclamationMark: {
        // '!' followed by optional white space and "important".
        SrcCharacterType* start = skipWhiteSpace(currentCharacter<SrcCharacterType>());
        if (isEqualToCSSIdentifier(start, "important")) {
            m_token = IMPORTANT_SYM;
            // 9 is the length of "important".
            currentCharacter<SrcCharacterType>() = start + 9;
        }
        break;
    }

    case CharacterHashmark: {
        // Both pointers start at the first character after '#'.
        SrcCharacterType* start = currentCharacter<SrcCharacterType>();
        result = currentCharacter<SrcCharacterType>();

        if (isASCIIDigit(*currentCharacter<SrcCharacterType>())) {
            // This must be a valid hex number token.
            do {
                ++currentCharacter<SrcCharacterType>();
            } while (isASCIIHexDigit(*currentCharacter<SrcCharacterType>()));
            m_token = HEX;
            yylval->string.init(start, currentCharacter<SrcCharacterType>() - start);
        } else if (isIdentifierStart<SrcCharacterType>()) {
            m_token = IDSEL;
            parseIdentifier(result, yylval->string, hasEscape);
            if (!hasEscape) {
                // Check whether the identifier is also a valid hex number.
                SrcCharacterType* current = start;
                m_token = HEX;
                do {
                    if (!isASCIIHexDigit(*current)) {
                        m_token = IDSEL;
                        break;
                    }
                    ++current;
                } while (current < result);
            }
        }
        break;
    }

    case CharacterSlash:
        // Ignore comments. They are not even considered as white spaces.
        if (*currentCharacter<SrcCharacterType>() == '*') {
            const CSSParserLocation startLocation = currentLocation();
            if (m_parser.m_observer) {
                unsigned startOffset = currentCharacter<SrcCharacterType>() - dataStart<SrcCharacterType>() - 1; // Start with a slash.
                m_parser.m_observer->startComment(startOffset - m_parsedTextPrefixLength);
            }
            ++currentCharacter<SrcCharacterType>();
            // Scan for the closing "*/". The second character is only read when
            // the first one is '*', so the read never goes past the terminating \0.
            while (currentCharacter<SrcCharacterType>()[0] != '*' || currentCharacter<SrcCharacterType>()[1] != '/') {
                if (*currentCharacter<SrcCharacterType>() == '\n')
                    ++m_lineNumber;
                if (*currentCharacter<SrcCharacterType>() == '\0') {
                    // Unterminated comments are simply ignored.
                    // Stepping back by two cancels the "+= 2" below, leaving
                    // currentCharacter on the terminating \0.
                    currentCharacter<SrcCharacterType>() -= 2;
                    m_parser.reportError(startLocation, UnterminatedCommentCSSError);
                    break;
                }
                ++currentCharacter<SrcCharacterType>();
            }
            // Skip the closing "*/".
            currentCharacter<SrcCharacterType>() += 2;
            if (m_parser.m_observer) {
                unsigned endOffset = currentCharacter<SrcCharacterType>() - dataStart<SrcCharacterType>();
                // Clamp the reported end so that it does not extend into the
                // suffix or the terminator; offsets are reported relative to the
                // end of the prefix.
                unsigned userTextEndOffset = static_cast<unsigned>(m_length - 1 - m_parsedTextSuffixLength);
                m_parser.m_observer->endComment(std::min(endOffset, userTextEndOffset) - m_parsedTextPrefixLength);
            }
            goto restartAfterComment;
        }
        break;

    case CharacterDollar:
        // "$="
        if (*currentCharacter<SrcCharacterType>() == '=') {
            ++currentCharacter<SrcCharacterType>();
            m_token = ENDSWITH;
        }
        break;

    case CharacterAsterisk:
        // "*="
        if (*currentCharacter<SrcCharacterType>() == '=') {
            ++currentCharacter<SrcCharacterType>();
            m_token = CONTAINS;
        }
        break;

    case CharacterPlus:
        if (UNLIKELY(m_parsingMode == NthChildMode)) {
            // Simplest case. "+[0-9]*n" is always NthChild.
            if (parseNthChild<SrcCharacterType>()) {
                parseNthChildExtra<SrcCharacterType>();
                m_token = NTH;
                yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
            }
        }
        break;

    case CharacterLess:
        // "<!--" is reported with the same SGML_CD token as "-->".
        if (currentCharacter<SrcCharacterType>()[0] == '!' && currentCharacter<SrcCharacterType>()[1] == '-' && currentCharacter<SrcCharacterType>()[2] == '-') {
            currentCharacter<SrcCharacterType>() += 3;
            m_token = SGML_CD;
        }
        break;

    case CharacterAt:
        if (isIdentifierStart<SrcCharacterType>()) {
            // Generic at-keyword unless detectAtToken recognizes the name.
            m_token = ATKEYWORD;
            // Step over the '@' before parsing the name.
            ++result;
            parseIdentifier(result, resultString, hasEscape);
            // The standard enables unicode escapes in at-rules. In this case only the resultString will contain the
            // correct identifier, hence we have to use it to determine its length instead of the usual pointer arithmetic.
            // The "+ 1" accounts for the leading '@'.
            detectAtToken<SrcCharacterType>(resultString.length() + 1, hasEscape);
        }
        break;

    case CharacterBackSlash:
        // A backslash that starts a CSS escape begins an identifier.
        if (isCSSEscape(*currentCharacter<SrcCharacterType>())) {
            --currentCharacter<SrcCharacterType>();
            parseIdentifier(result, yylval->string, hasEscape);
            m_token = IDENT;
        }
        break;

    case CharacterXor:
        // "^="
        if (*currentCharacter<SrcCharacterType>() == '=') {
            ++currentCharacter<SrcCharacterType>();
            m_token = BEGINSWITH;
        }
        break;

    case CharacterVerticalBar:
        // "|="
        if (*currentCharacter<SrcCharacterType>() == '=') {
            ++currentCharacter<SrcCharacterType>();
            m_token = DASHMATCH;
        }
        break;

    case CharacterTilde:
        // "~="
        if (*currentCharacter<SrcCharacterType>() == '=') {
            ++currentCharacter<SrcCharacterType>();
            m_token = INCLUDES;
        }
        break;

    default:
        ASSERT_NOT_REACHED();
        break;
    }

    return m_token;
}
1514
1515 template <>
setTokenStart(LChar * tokenStart)1516 inline void CSSTokenizer::setTokenStart<LChar>(LChar* tokenStart)
1517 {
1518 m_tokenStart.ptr8 = tokenStart;
1519 }
1520
1521 template <>
setTokenStart(UChar * tokenStart)1522 inline void CSSTokenizer::setTokenStart<UChar>(UChar* tokenStart)
1523 {
1524 m_tokenStart.ptr16 = tokenStart;
1525 }
1526
setupTokenizer(const char * prefix,unsigned prefixLength,const String & string,const char * suffix,unsigned suffixLength)1527 void CSSTokenizer::setupTokenizer(const char* prefix, unsigned prefixLength, const String& string, const char* suffix, unsigned suffixLength)
1528 {
1529 m_parsedTextPrefixLength = prefixLength;
1530 m_parsedTextSuffixLength = suffixLength;
1531 unsigned stringLength = string.length();
1532 unsigned length = stringLength + m_parsedTextPrefixLength + m_parsedTextSuffixLength + 1;
1533 m_length = length;
1534
1535 if (!stringLength || string.is8Bit()) {
1536 m_dataStart8 = adoptArrayPtr(new LChar[length]);
1537 for (unsigned i = 0; i < m_parsedTextPrefixLength; i++)
1538 m_dataStart8[i] = prefix[i];
1539
1540 if (stringLength)
1541 memcpy(m_dataStart8.get() + m_parsedTextPrefixLength, string.characters8(), stringLength * sizeof(LChar));
1542
1543 unsigned start = m_parsedTextPrefixLength + stringLength;
1544 unsigned end = start + suffixLength;
1545 for (unsigned i = start; i < end; i++)
1546 m_dataStart8[i] = suffix[i - start];
1547
1548 m_dataStart8[length - 1] = 0;
1549
1550 m_is8BitSource = true;
1551 m_currentCharacter8 = m_dataStart8.get();
1552 m_currentCharacter16 = 0;
1553 setTokenStart<LChar>(m_currentCharacter8);
1554 m_lexFunc = &CSSTokenizer::realLex<LChar>;
1555 return;
1556 }
1557
1558 m_dataStart16 = adoptArrayPtr(new UChar[length]);
1559 for (unsigned i = 0; i < m_parsedTextPrefixLength; i++)
1560 m_dataStart16[i] = prefix[i];
1561
1562 ASSERT(stringLength);
1563 memcpy(m_dataStart16.get() + m_parsedTextPrefixLength, string.characters16(), stringLength * sizeof(UChar));
1564
1565 unsigned start = m_parsedTextPrefixLength + stringLength;
1566 unsigned end = start + suffixLength;
1567 for (unsigned i = start; i < end; i++)
1568 m_dataStart16[i] = suffix[i - start];
1569
1570 m_dataStart16[length - 1] = 0;
1571
1572 m_is8BitSource = false;
1573 m_currentCharacter8 = 0;
1574 m_currentCharacter16 = m_dataStart16.get();
1575 setTokenStart<UChar>(m_currentCharacter16);
1576 m_lexFunc = &CSSTokenizer::realLex<UChar>;
1577 }
1578
1579 } // namespace blink
1580