1# Copyright (C) 2016 and later: Unicode, Inc. and others.
2# License & terms of use: http://www.unicode.org/copyright.html
3#*****************************************************************************
4#
5#   Copyright (C) 2002-2015, International Business Machines Corporation and others.
6#   All Rights Reserved.
7#
8#*****************************************************************************
9#
10#  file:  regexcst.txt
11#  ICU Regular Expression Parser State Table
12#
13#     This state table is used when reading and parsing a regular expression pattern
14#     The pattern parser uses a state machine; the data in this file define the
15#     state transitions that occur for each input character.
16#
17#     *** This file defines the regex pattern grammar.   This is it.
18#     *** The determination of what is accepted is here.
19#
20#     This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
21#     that are then built with the rule parser.
22#
23
24#
25# Here is the syntax of the state definitions in this file:
26#
27#
28#StateName:
29#   input-char           n next-state           ^push-state     action
30#   input-char           n next-state           ^push-state     action
31#       |                |   |                      |             |
32#       |                |   |                      |             |--- action to be performed by state machine
33#       |                |   |                      |                  See function RegexCompile::doParseActions()
34#       |                |   |                      |
35#       |                |   |                      |--- Push this named state onto the state stack.
36#       |                |   |                           Later, when next state is specified as "pop",
37#       |                |   |                           the pushed state will become the current state.
38#       |                |   |
39#       |                |   |--- Transition to this state if the current input character matches the input
40#       |                |        character or char class in the left hand column.  "pop" causes the next
41#       |                |        state to be popped from the state stack.
42#       |                |
43#       |                |--- When making the state transition specified on this line, advance to the next
44#       |                     character from the input only if 'n' appears here.
45#       |
46#       |--- Character or named character classes to test for.  If the current character being scanned
47#            matches, perform the actions and go to the state specified on this line.
48#            The input character is tested sequentially, in the order written.  The characters and
49#            character classes tested for do not need to be mutually exclusive.  The first match wins.
50#
51
52
53
54
55#
56#  start state, scan position is at the beginning of the pattern.
57#
#  The only row is a default with no 'n', so no input is consumed here:
#  doPatStart runs and the first pattern character is examined by term.
#
58start:
59   default                 term                                     doPatStart
60
61
62
63
64#
65#  term.  At a position where we can accept the start of most items in a pattern.
66#
#  '[' pushes set-finish as the state to return to when the set is complete.
#  '(' and '\' are consumed and handed to open-paren / backslash with no action here.
#  ')' pops whatever state was pushed when the matching group was opened.
#  NOTE(review): the eof row targets term again without consuming input;
#  presumably doPatFinish stops the state machine - confirm in the action code.
#
67term:
68    quoted               n expr-quant                               doLiteralChar
69    rule_char            n expr-quant                               doLiteralChar
70    '['                  n set-open       ^set-finish               doSetBegin
71    '('                  n open-paren
72    '.'                  n expr-quant                               doDotAny
73    '^'                  n expr-quant                               doCaret
74    '$'                  n expr-quant                               doDollar
75    '\'                  n backslash
76    '|'                  n  term                                    doOrOperator
77    ')'                  n  pop                                     doCloseParen
78    eof	                   term                                     doPatFinish
79    default                errorDeath                               doRuleError
80
81
82
83#
84#   expr-quant    We've just finished scanning a term, now look for the optional
85#                 trailing quantifier - *, +, ?, *?,  etc.
86#
#                 A '(' is routed through open-paren-quant so that a (?# comment
#                 can sit between a term and its quantifier.  Any other character
#                 is left unconsumed (default row has no 'n') for expr-cont.
#
87expr-quant:
88    '*'                  n  quant-star
89    '+'                  n  quant-plus
90    '?'                  n  quant-opt
91    '{'                  n  interval-open                          doIntervalInit
92    '('                  n  open-paren-quant
93    default                 expr-cont
94
95
96#
97#  expr-cont      Expression, continuation.  At a point where additional terms are
98#                                            allowed, but not required.  No Quantifiers
99#
#                 '|' and ')' are handled exactly as in term.  The default row consumes
#                 nothing, so the current character is re-examined by term.
#
100expr-cont:
101    '|'                  n  term                                    doOrOperator
102    ')'                  n  pop                                     doCloseParen
103    default                 term
104
105
106#
107#   open-paren-quant   Special case handling for comments appearing before a quantifier,
108#                        e.g.   x(?#comment )*
109#                      Open parens from expr-quant come here; anything but a (?# comment
110#                      branches into the normal parenthesis sequence as quickly as possible.
111#
#                      The '(' itself was already consumed by expr-quant.  The default row
#                      consumes nothing, so open-paren sees the same character.
#
112open-paren-quant:
113    '?'                  n  open-paren-quant2                      doSuppressComments
114    default                 open-paren
115
#
#   open-paren-quant2  Have scanned "(?" directly after a term.  "(?#" starts a comment
#                      and pushes expr-quant, so the ')' that ends the comment pops back
#                      to expr-quant and a quantifier may still follow.  Anything else is
#                      left unconsumed for open-paren-extended.
#
116open-paren-quant2:
117    '#'                  n  paren-comment   ^expr-quant
118    default                 open-paren-extended
119
120
121#
122#   open-paren    We've got an open paren.  We need to scan further to
123#                 determine what kind of paren it is - plain (, (?:, (?>, or whatever.
124#
#                 The default row (a plain '(' ) consumes nothing; it pushes expr-quant
#                 as the state to resume at the closing ')' and continues in term.
#
125open-paren:
126    '?'                  n  open-paren-extended                     doSuppressComments
127    default                 term            ^expr-quant             doOpenCaptureParen
128
#
#   open-paren-extended   Have scanned "(?".  The next character selects the construct.
#                         "(?=" and "(?!" push expr-cont rather than expr-quant as the
#                         state to pop to at the closing ')'.
#                         "(?#" pushes term, so the comment's ')' pops back to term.
#                         The flag letters and '-' are not consumed here (no 'n');
#                         paren-flag re-reads the same character.
#                         "(?(" and "(?{" are consumed and routed to errorDeath.
#
129open-paren-extended:
130    ':'                  n  term            ^expr-quant             doOpenNonCaptureParen  #  (?:
131    '>'                  n  term            ^expr-quant             doOpenAtomicParen      #  (?>
132    '='                  n  term            ^expr-cont              doOpenLookAhead        #  (?=
133    '!'                  n  term            ^expr-cont              doOpenLookAheadNeg     #  (?!
134    '<'                  n  open-paren-lookbehind
135    '#'                  n  paren-comment   ^term
136    'i'                     paren-flag                              doBeginMatchMode
137    'd'                     paren-flag                              doBeginMatchMode
138    'm'                     paren-flag                              doBeginMatchMode
139    's'                     paren-flag                              doBeginMatchMode
140    'u'                     paren-flag                              doBeginMatchMode
141    'w'                     paren-flag                              doBeginMatchMode
142    'x'                     paren-flag                              doBeginMatchMode
143    '-'                     paren-flag                              doBeginMatchMode
144    '('                  n  errorDeath                              doConditionalExpr
145    '{'                  n  errorDeath                              doPerlInline
146    default                 errorDeath                              doBadOpenParenType
147
#
#   open-paren-lookbehind  Have scanned "(?<".  '=' or '!' completes "(?<=" / "(?<!" and
#                          pushes expr-cont.  An ascii_letter is the first character of a
#                          capture group name; it is not consumed here, named-capture
#                          consumes it.
#
148open-paren-lookbehind:
149    '='                  n  term            ^expr-cont              doOpenLookBehind       #  (?<=
150    '!'                  n  term            ^expr-cont              doOpenLookBehindNeg    #  (?<!
151    ascii_letter            named-capture                           doBeginNamedCapture    #  (?<name
152    default                 errorDeath                              doBadOpenParenType
153
154
155#
156#   paren-comment    We've got a (?# ... )  style comment.  Eat pattern text till we get to the ')'
157#
#                    The state popped at ')' is the one pushed by the row that entered
#                    this state (expr-quant or term).  The eof row is listed before
#                    default so that end of pattern is an error instead of being eaten.
#
158paren-comment:
159    ')'                  n  pop
160    eof		                errorDeath                              doMismatchedParenErr
161    default              n  paren-comment
162
163#
164#  paren-flag    Scanned a (?ismx-ismx  flag setting
165#
#                Entered from open-paren-extended with the first flag character not yet
#                consumed.  ')' returns to term without pushing a state.  ':' pushes
#                expr-quant (to be popped at the closing ')') and continues in term.
#
166paren-flag:
167    'i'                  n  paren-flag                              doMatchMode
168    'd'                  n  paren-flag                              doMatchMode
169    'm'                  n  paren-flag                              doMatchMode
170    's'                  n  paren-flag                              doMatchMode
171    'u'                  n  paren-flag                              doMatchMode
172    'w'                  n  paren-flag                              doMatchMode
173    'x'                  n  paren-flag                              doMatchMode
174    '-'                  n  paren-flag                              doMatchMode
175    ')'                  n  term                                    doSetMatchMode
176    ':'                  n  term              ^expr-quant           doMatchModeParen
177    default                 errorDeath                              doBadModeFlag
178
179#
180#  named-capture    (?<name> ... ), position currently on the name.
181#
#                   Entered only from the ascii_letter row of open-paren-lookbehind, which
#                   does not consume, so the first name character is always a letter;
#                   later characters may be letters or digits.  '>' ends the name, pushes
#                   expr-quant for the closing ')' and continues in term.
#
182named-capture:
183    ascii_letter         n  named-capture                           doContinueNamedCapture
184    digit_char           n  named-capture                           doContinueNamedCapture
185    '>'                  n  term               ^expr-quant          doOpenCaptureParen      # common w non-named capture.
186    default                 errorDeath                              doBadNamedCapture
187
188#
189#  quant-star     Scanning a '*' quantifier.  Need to look ahead to decide
190#                 between plain '*', '*?', '*+'
191#
#                 The default row consumes nothing: the look-ahead character is left
#                 for expr-cont.
#
192quant-star:
193     '?'                 n  expr-cont                               doNGStar               #  *?
194     '+'                 n  expr-cont                               doPossessiveStar       #  *+
195     default                expr-cont                               doStar
196
197
198#
199#  quant-plus     Scanning a '+' quantifier.  Need to look ahead to decide
200#                 between plain '+', '+?', '++'
201#
#                 The default row consumes nothing: the look-ahead character is left
#                 for expr-cont.
#
202quant-plus:
203     '?'                 n  expr-cont                               doNGPlus               #  +?
204     '+'                 n  expr-cont                               doPossessivePlus       #  ++
205     default                expr-cont                               doPlus
206
207
208#
209#  quant-opt  Scanning a '?' quantifier.  Need to look ahead to decide
210#                  between plain '?', '??', '?+'
211#
#             The default row consumes nothing: the look-ahead character is left
#             for expr-cont.
#
212quant-opt:
213     '?'                 n  expr-cont                               doNGOpt                 #  ??
214     '+'                 n  expr-cont                               doPossessiveOpt         #  ?+
215     default                expr-cont                               doOpt                   #  ?
216
217
218#
219#   Interval         scanning a '{', the opening delimiter for an interval specification
220#                                   {number} or {min, max} or {min,}
221#
#   interval-open    The character after '{' must be a digit_char.  It is not consumed
#                    here; interval-lower consumes it.
#
222interval-open:
223    digit_char              interval-lower
224    default                 errorDeath                              doIntervalError
225
#
#   interval-lower   Scanning the digits of the first number in the interval.
#                    ',' moves on to the second number; '}' ends a single-number interval.
#
226interval-lower:
227    digit_char           n  interval-lower                          doIntevalLowerDigit
228    ','			         n  interval-upper
229    '}'                  n  interval-type                           doIntervalSame             # {n}
230    default                 errorDeath                              doIntervalError
231
#
#   interval-upper   Have scanned the ','; scanning the digits of the second number.
#                    '}' is accepted with zero digits seen, which is the {min,} form.
#
232interval-upper:
233    digit_char           n  interval-upper                          doIntervalUpperDigit
234    '}'                  n  interval-type
235    default                 errorDeath                              doIntervalError
236
#
#   interval-type    Have scanned the closing '}'.  Look ahead for a trailing '?' or '+'.
#                    The default row consumes nothing: the look-ahead character is left
#                    for expr-cont.
#
237interval-type:
238    '?'                  n  expr-cont                               doNGInterval                # {n,m}?
239    '+'                  n  expr-cont                               doPossessiveInterval        # {n,m}+
240    default                 expr-cont                               doInterval                  # {n,m}
241
242
243#
244#  backslash        #  Backslash.  Figure out which of the \thingies we have encountered.
245#                                  The low level next-char function will have preprocessed
246#                                  some of them already; those won't come here.
#
#                   Rows whose next state is term (\A \B \b \G \Q \Z \z) return directly to
#                   term, bypassing expr-quant.
#                   The 'N', 'p' and 'P' rows have no 'n': the letter is not consumed by the
#                   state machine.  Presumably the action scans the {...} text itself -
#                   confirm in the action code.
#                   eof is listed before default so that a trailing backslash is an error.
#
247backslash:
248   'A'                   n  term                                    doBackslashA
249   'B'                   n  term                                    doBackslashB
250   'b'                   n  term                                    doBackslashb
251   'd'                   n  expr-quant                              doBackslashd
252   'D'                   n  expr-quant                              doBackslashD
253   'G'                   n  term                                    doBackslashG
254   'h'                   n  expr-quant                              doBackslashh
255   'H'                   n  expr-quant                              doBackslashH
256   'k'                   n  named-backref
257   'N'                      expr-quant                              doNamedChar      #   \N{NAME}  named char
258   'p'                      expr-quant                              doProperty       #   \p{Lu}  style property
259   'P'                      expr-quant                              doProperty
260   'R'                   n  expr-quant                              doBackslashR
261   'Q'                   n  term                                    doEnterQuoteMode
262   'S'                   n  expr-quant                              doBackslashS
263   's'                   n  expr-quant                              doBackslashs
264   'v'                   n  expr-quant                              doBackslashv
265   'V'                   n  expr-quant                              doBackslashV
266   'W'                   n  expr-quant                              doBackslashW
267   'w'                   n  expr-quant                              doBackslashw
268   'X'                   n  expr-quant                              doBackslashX
269   'Z'                   n  term                                    doBackslashZ
270   'z'                   n  term                                    doBackslashz
271   digit_char            n  expr-quant                              doBackRef         #  Will scan multiple digits
272   eof                      errorDeath                              doEscapeError
273   default               n  expr-quant                              doEscapedLiteralChar
274
275
276# named-backref   Scanned \k
277#                 Leading to \k<captureName>
278#                 Failure to get the full sequence is an error.
279#
#                 This state requires the '<'.  The name itself is scanned by
#                 named-backref-2 (first character) and named-backref-3 (the rest).
#
280named-backref:
281    '<'                  n  named-backref-2                         doBeginNamedBackRef
282    default                 errorDeath                              doBadNamedCapture
283
# named-backref-2   Scanned \k<
#                   The first character of the name must be an ascii_letter.
#
284named-backref-2:
285    ascii_letter         n  named-backref-3                         doContinueNamedBackRef
286    default                 errorDeath                              doBadNamedCapture
287
# named-backref-3   Scanned \k< plus at least one name character.
#                   Further letters or digits extend the name; '>' completes the
#                   back reference, which may then take a quantifier (expr-quant).
#
288named-backref-3:
289    ascii_letter         n  named-backref-3                         doContinueNamedBackRef
290    digit_char           n  named-backref-3                         doContinueNamedBackRef
291    '>'                  n  expr-quant                              doCompleteNamedBackRef
292    default                 errorDeath                              doBadNamedCapture
293
294
295#
296# [set expression] parsing,
297#    All states involved in parsing set expressions have names beginning with "set-"
298#
299
#
#   set-open    Have just scanned the '[' that opens a set.  Handle the special leading
#               characters: '^' is consumed here; ':' is NOT consumed, it is left for
#               set-posix after doSetPosixProp has run.
#
300set-open:
301   '^'                   n  set-open2                               doSetNegate
302   ':'                      set-posix                               doSetPosixProp
303   default                  set-open2
304
#
#   set-open2   After '[' or '[^'.  A ']' in this position is consumed as a literal
#               member of the set (doSetLiteral) instead of closing it.  Anything else
#               is left unconsumed for set-start.
#
305set-open2:
306   ']'                   n  set-after-lit                           doSetLiteral
307   default                  set-start
308
309#  set-posix:
310#                  scanned a '[:'  If it really is a [:property:], doSetPosixProp will have
311#                  moved the scan to the closing ']'.  If it wasn't a property
312#                  expression, the scan will still be at the opening ':', which should
313#                  be interpreted as a normal set expression.
#                  The ':' row consumes nothing, so set-start sees the ':' itself.
314set-posix:
315    ']'                  n   pop                                    doSetEnd
316    ':'                      set-start
317    default                  errorDeath                             doRuleError  # should not be possible.
318
319#
320#   set-start   after the [ and special case leading characters (^ and/or ]) but before
321#               everything else.   A '-' is literal at this point.
322#
#               A nested '[' pushes set-after-set as the state to resume when the
#               inner set closes.
#               NOTE(review): unlike set-after-lit there is no explicit eof row here,
#               so eof falls through to the default row (doSetLiteral) - confirm that
#               an unterminated "[" is still reported as an error downstream.
#
323set-start:
324    ']'                  n  pop                                     doSetEnd
325    '['                  n  set-open      ^set-after-set            doSetBeginUnion
326    '\'                  n  set-escape
327    '-'                  n  set-start-dash
328    '&'                  n  set-start-amp
329    default              n  set-after-lit                           doSetLiteral
330
331#    set-start-dash    Turn "[--" into a syntax error.
332#                           "[-x" is good, - and x are literals.
333#
#                      Neither row consumes input: on the default row the '-' is added
#                      by doSetAddDash and the following character is handled by
#                      set-after-lit.
#
334set-start-dash:
335    '-'                     errorDeath                              doRuleError
336    default                 set-after-lit                           doSetAddDash
337
338#    set-start-amp     Turn "[&&" into a syntax error.
339#                           "[&x" is good, & and x are literals.
340#
#                      Neither row consumes input: on the default row the '&' is added
#                      by doSetAddAmp and the following character is handled by
#                      set-after-lit.
#
341set-start-amp:
342    '&'                     errorDeath                              doRuleError
343    default                 set-after-lit                           doSetAddAmp
344
345#
346#   set-after-lit    The last thing scanned was a literal character within a set.
347#                    Can be followed by anything.  Single '-' or '&' are
348#                    literals in this context, not operators.
#                    '-' and '&' are consumed here and resolved by set-lit-dash /
#                    set-lit-amp, which look at the character that follows.
#                    eof is listed before default so an unclosed set is an error.
349set-after-lit:
350    ']'                  n  pop                                     doSetEnd
351    '['                  n  set-open      ^set-after-set            doSetBeginUnion
352    '-'                  n  set-lit-dash
353    '&'                  n  set-lit-amp
354    '\'                  n  set-escape
355    eof                     errorDeath                              doSetNoCloseError
356    default              n  set-after-lit                           doSetLiteral
357
#
#   set-after-set    The last thing scanned was a complete nested [set] (this state is
#                    what the '[' rows push) or a \p / \P escape (see set-escape).
#                    Same as set-after-lit except that '-' and '&' go to set-set-dash /
#                    set-set-amp, where they may turn out to be operators.
#
358set-after-set:
359    ']'                  n  pop                                     doSetEnd
360    '['                  n  set-open      ^set-after-set            doSetBeginUnion
361    '-'                  n  set-set-dash
362    '&'                  n  set-set-amp
363    '\'                  n  set-escape
364    eof                     errorDeath                              doSetNoCloseError
365    default              n  set-after-lit                           doSetLiteral
366
#
#   set-after-range  The last thing scanned was a range (a-b) or one of the class
#                    escapes \s \S \w \W \d \D \h \H \v \V (see set-escape).
#                    Same as set-after-lit except that '-' and '&' go to set-range-dash /
#                    set-range-amp.
#
367set-after-range:
368    ']'                  n  pop                                     doSetEnd
369    '['                  n  set-open      ^set-after-set            doSetBeginUnion
370    '-'                  n  set-range-dash
371    '&'                  n  set-range-amp
372    '\'                  n  set-escape
373    eof                     errorDeath                              doSetNoCloseError
374    default              n  set-after-lit                           doSetLiteral
375
376
377# set-after-op
378#     After a --  or &&
379#     It is an error to close a set at this point.
380#
#     The ']' row does not consume; it only reports doSetOpError.
#     NOTE(review): there is no explicit eof row, so eof falls through to the default
#     row (doSetLiteral) - confirm that an unterminated set is still reported.
#
381set-after-op:
382    '['                  n  set-open         ^set-after-set         doSetBeginUnion
383    ']'                     errorDeath                              doSetOpError
384    '\'                  n  set-escape
385    default              n  set-after-lit                           doSetLiteral
386
387#
388#   set-set-amp
389#      Have scanned [[set]&
390#      Could be a '&' intersection operator, if a set follows.
391#      Could be the start of a '&&' operator.
392#      Otherwise is a literal.
#      The default row consumes nothing: the '&' is added as a literal and the
#      current character is handled by set-after-lit.
393set-set-amp:
394    '['                  n  set-open      ^set-after-set           doSetBeginIntersection1
395    '&'                  n  set-after-op                           doSetIntersection2
396    default                 set-after-lit                          doSetAddAmp
397
398
399# set-lit-amp   Have scanned "[literals&"
400#               Could be a start of "&&" operator or a literal
401#               In [abc&[def]],   the '&' is a literal
402#
#               There is no '[' row here (compare set-set-amp): a following '[' takes
#               the default row, which adds the '&' as a literal without consuming.
#
403set-lit-amp:
404    '&'                  n  set-after-op                            doSetIntersection2
405    default                 set-after-lit                           doSetAddAmp
406
407
408#
409#  set-set-dash
410#      Have scanned [set]-
411#      Could be a '-' difference operator, if a [set] follows.
412#      Could be the start of a '--' operator.
413#      Otherwise is a literal.
#      The default row consumes nothing: the '-' is added as a literal and the
#      current character is handled by set-after-lit.
414set-set-dash:
415    '['                  n  set-open      ^set-after-set           doSetBeginDifference1
416    '-'                  n  set-after-op                           doSetDifference2
417    default                 set-after-lit                          doSetAddDash
418
419
420#
421#  set-range-dash
422#      scanned  a-b-  or \w-
423#         any set or range like item where the trailing single '-' should
424#         be literal, not a set difference operation.
425#         A trailing "--" is still a difference operator.
#         The default row consumes nothing; the current character is handled by
#         set-after-lit once the '-' has been added.
426set-range-dash:
427    '-'                  n  set-after-op                           doSetDifference2
428    default                 set-after-lit                          doSetAddDash
429
430
#
#  set-range-amp
#      scanned  a-b&  or \w&
#         The counterpart of set-range-dash for '&': a single trailing '&' is a
#         literal (default row, nothing consumed); "&&" is the intersection operator.
#
431set-range-amp:
432    '&'                  n  set-after-op                           doSetIntersection2
433    default                 set-after-lit                          doSetAddAmp
434
435
436#  set-lit-dash
437#     Have scanned "[literals-" Could be a range or a -- operator or a literal
438#     In [abc-[def]], the '-' is a literal (confirmed with a Java test)
439#        [abc-\p{xx}  the '-' is an error
440#        [abc-]       the '-' is a literal
441#        [ab-xy]      the '-' is a range
442#
#     The '[' and ']' rows do not consume: the '-' is added as a literal and
#     set-after-lit then sees the bracket.  A '\' is consumed and resolved by
#     set-lit-dash-escape.  Any other character is consumed as the range end.
#     NOTE(review): the "[abc-\p{xx}" error case is not an explicit row in this state
#     or in set-lit-dash-escape; presumably doSetRange rejects it - confirm.
#
443set-lit-dash:
444    '-'                  n  set-after-op                            doSetDifference2
445    '['                     set-after-lit                           doSetAddDash
446    ']'                     set-after-lit                           doSetAddDash
447    '\'                  n  set-lit-dash-escape
448    default              n  set-after-range                         doSetRange
449
450# set-lit-dash-escape
451#
452#    scanned "[literal-\"
453#    Could be a range, if the \ introduces an escaped literal char or a named char.
454#    Otherwise it is an error.
455#
#    The error rows and the 'N' row do not consume the letter.
#    NOTE(review): only \s \S \w \W \d \D are rejected explicitly.  \p \P \h \H \v \V,
#    which set-escape treats as sets, have no row here and take the default row
#    (doSetRange) - confirm that this is intended or that doSetRange reports the error.
#
456set-lit-dash-escape:
457   's'                      errorDeath                             doSetOpError
458   'S'                      errorDeath                             doSetOpError
459   'w'                      errorDeath                             doSetOpError
460   'W'                      errorDeath                             doSetOpError
461   'd'                      errorDeath                             doSetOpError
462   'D'                      errorDeath                             doSetOpError
463   'N'                      set-after-range                        doSetNamedRange
464   default               n  set-after-range                        doSetRange
465
466
467#
468#  set-escape
469#       Common back-slash escape processing within set expressions
470#
#       The next state records what kind of item was just scanned, which decides how a
#       following '-' or '&' is treated:
#          \p \P               -> set-after-set
#          \s \w \d \h \v etc. -> set-after-range
#          \N and everything else -> set-after-lit
#       The 'p', 'P' and 'N' rows have no 'n': the letter is not consumed by the state
#       machine.  Presumably the action scans the {...} text itself - confirm.
#
471set-escape:
472   'p'                      set-after-set                           doSetProp
473   'P'                      set-after-set                           doSetProp
474   'N'                      set-after-lit                           doSetNamedChar
475   's'                   n  set-after-range                         doSetBackslash_s
476   'S'                   n  set-after-range                         doSetBackslash_S
477   'w'                   n  set-after-range                         doSetBackslash_w
478   'W'                   n  set-after-range                         doSetBackslash_W
479   'd'                   n  set-after-range                         doSetBackslash_d
480   'D'                   n  set-after-range                         doSetBackslash_D
481   'h'                   n  set-after-range                         doSetBackslash_h
482   'H'                   n  set-after-range                         doSetBackslash_H
483   'v'                   n  set-after-range                         doSetBackslash_v
484   'V'                   n  set-after-range                         doSetBackslash_V
485   default               n  set-after-lit                           doSetLiteralEscaped
486
487#
488# set-finish
489#     Have just encountered the final ']' that completes a [set], and
490#     arrived here via a pop.  From here, we exit the set parsing world, and go
491#     back to generic regular expression parsing.
492#
#     This is the state pushed by the '[' row of term.  Its single default row consumes
#     nothing, so the character after the ']' is examined by expr-quant.
#
493set-finish:
494    default                 expr-quant                              doSetFinish
495
496
497#
498# errorDeath.   This state is specified as the next state whenever a syntax error
499#               in the pattern is detected.  Barring bugs, the state machine will never
500#               actually get here, but will stop because of the action associated with the error.
501#               But, just in case, this state asks the state machine to exit.
#               The single row consumes a character and stays in errorDeath.
502errorDeath:
503    default              n errorDeath                               doExit
504
505
506