1
2#*****************************************************************************
3#
4#   Copyright (C) 2002-2015, International Business Machines Corporation and others.
5#   All Rights Reserved.
6#
7#*****************************************************************************
8#
9#  file:  regexcst.txt
10#  ICU Regular Expression Parser State Table
11#
12#     This state table is used when reading and parsing a regular expression pattern.
13#     The pattern parser uses a state machine; the data in this file define the
14#     state transitions that occur for each input character.
15#
16#     *** This file defines the regex pattern grammar.   This is it.
17#     *** The determination of what is accepted is here.
18#
19#     This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
20#     that are then built with the rule parser.
21#
22
23#
24# Here is the syntax of the state definitions in this file:
25#
26#
27#StateName:
28#   input-char           n next-state           ^push-state     action
29#   input-char           n next-state           ^push-state     action
30#       |                |   |                      |             |
31#       |                |   |                      |             |--- action to be performed by state machine
32#       |                |   |                      |                  See function RegexCompile::doParseActions()
33#       |                |   |                      |
34#       |                |   |                      |--- Push this named state onto the state stack.
35#       |                |   |                           Later, when next state is specified as "pop",
36#       |                |   |                           the pushed state will become the current state.
37#       |                |   |
38#       |                |   |--- Transition to this state if the current input character matches the input
39#       |                |        character or char class in the left hand column.  "pop" causes the next
40#       |                |        state to be popped from the state stack.
41#       |                |
42#       |                |--- When making the state transition specified on this line, advance to the next
43#       |                     character from the input only if 'n' appears here.
44#       |
45#       |--- Character or named character classes to test for.  If the current character being scanned
46#            matches, perform the actions and go to the state specified on this line.
47#            The input character is tested sequentially, in the order written.  The characters and
48#            character classes tested for do not need to be mutually exclusive.  The first match wins.
49#
50
51
52
53
54#
55#  start     Initial state; the scan position is at the beginning of the pattern.
56#            Performs doPatStart without advancing the input, then continues in "term".
57start:
58   default                 term                                     doPatStart
59
60
61
62
63#
64#  term.  At a position where we can accept the start of most items in a pattern.
65#
66term:
67    quoted               n expr-quant                               doLiteralChar
68    rule_char            n expr-quant                               doLiteralChar
69    '['                  n set-open       ^set-finish               doSetBegin          # [set]; the ']' that ends it pops to set-finish
70    '('                  n open-paren                                                   # kind of group is decided in open-paren
71    '.'                  n expr-quant                               doDotAny
72    '^'                  n expr-quant                               doCaret
73    '$'                  n expr-quant                               doDollar
74    '\'                  n backslash                                                    # which escape it is gets decided in backslash
75    '|'                  n  term                                    doOrOperator
76    ')'                  n  pop                                     doCloseParen        # resume in the state pushed when the group was opened
77    eof	                   term                                     doPatFinish         # no 'n': the input is not advanced at eof
78    default                errorDeath                               doRuleError         # nothing else may begin a term
79
80
81
82#
83#   expr-quant    We've just finished scanning a term, now look for the optional
84#                 trailing quantifier - *, +, ?, *?,  etc.
85#
86expr-quant:
87    '*'                  n  quant-star                                                     # look ahead for  *?  or  *+
88    '+'                  n  quant-plus                                                     # look ahead for  +?  or  ++
89    '?'                  n  quant-opt                                                      # look ahead for  ??  or  ?+
90    '{'                  n  interval-open                          doIntervalInit         # start of {n}, {min,} or {min,max}
91    '('                  n  open-paren-quant                                               # may be a (?# comment ) sitting before the quantifier
92    default                 expr-cont                                                      # no quantifier; same character is re-examined in expr-cont
93
94
95#
96#  expr-cont      Expression, continuation.  At a point where additional terms are
97#                                            allowed, but not required.  No quantifiers are accepted here.
98#
99expr-cont:
100    '|'                  n  term                                    doOrOperator
101    ')'                  n  pop                                     doCloseParen       # resume in the state pushed when the group was opened
102    default                 term                                                       # not advanced; the same character is re-examined as the start of a term
103
104
105#
106#   open-paren-quant   Special case handling for comments appearing before a quantifier,
107#                        e.g.   x(?#comment )*
108#                      Open parens from expr-quant come here; anything but a (?# comment
109#                      branches into the normal parenthesis sequence as quickly as possible.
110#
111open-paren-quant:
112    '?'                  n  open-paren-quant2                      doSuppressComments     # "(?" seen; open-paren-quant2 checks for the '#'
113    default                 open-paren                                                    # plain '(' : not advanced, handled by open-paren
114
#
#   open-paren-quant2  Have scanned "(?" directly after a term (arrived from open-paren-quant).
#                      Only "(?#" is special here; every other "(?x" is passed, without advancing,
#                      to open-paren-extended.
#
115open-paren-quant2:
116    '#'                  n  paren-comment   ^expr-quant                                   # after the comment's ')' pop back to expr-quant, so a quantifier can still follow
117    default                 open-paren-extended
118
119
120#
121#   open-paren    We've got an open paren.  We need to scan further to
122#                 determine what kind of group it is - plain (, (?:, (?>, or whatever.
123#
124open-paren:
125    '?'                  n  open-paren-extended                     doSuppressComments
126    default                 term            ^expr-quant             doOpenCaptureParen     # plain '(' : not advanced; the matching ')' pops to expr-quant
127
#
#   open-paren-extended   Have scanned "(?".  The next character selects the kind of group.
#                         Groups that push expr-quant may be followed by a quantifier after their ')';
#                         the look-ahead forms push expr-cont, where no quantifier is accepted.
#                         The flag rows have no 'n': the first flag character is left in the input
#                         and is read again by paren-flag.
#
128open-paren-extended:
129    ':'                  n  term            ^expr-quant             doOpenNonCaptureParen  #  (?:
130    '>'                  n  term            ^expr-quant             doOpenAtomicParen      #  (?>
131    '='                  n  term            ^expr-cont              doOpenLookAhead        #  (?=
132    '!'                  n  term            ^expr-cont              doOpenLookAheadNeg     #  (?!
133    '<'                  n  open-paren-lookbehind                                          #  (?<   look-behind or named capture
134    '#'                  n  paren-comment   ^term                                          #  (?#   after the comment's ')' pop back to term
135    'i'                     paren-flag                              doBeginMatchMode
136    'd'                     paren-flag                              doBeginMatchMode
137    'm'                     paren-flag                              doBeginMatchMode
138    's'                     paren-flag                              doBeginMatchMode
139    'u'                     paren-flag                              doBeginMatchMode
140    'w'                     paren-flag                              doBeginMatchMode
141    'x'                     paren-flag                              doBeginMatchMode
142    '-'                     paren-flag                              doBeginMatchMode
143    '('                  n  errorDeath                              doConditionalExpr      #  (?(   always an error here; presumably reported as unsupported - confirm in doParseActions
144    '{'                  n  errorDeath                              doPerlInline           #  (?{   always an error here; presumably reported as unsupported - confirm in doParseActions
145    default                 errorDeath                              doBadOpenParenType
146
#
#   open-paren-lookbehind   Have scanned "(?<".  Either a look-behind, (?<= or (?<!, or the start of
#                           a named capture group, (?<name> ... ).
#                           The ascii_letter row has no 'n': the first letter of the name is left in
#                           the input and is read again by named-capture.
#
147open-paren-lookbehind:
148    '='                  n  term            ^expr-cont              doOpenLookBehind       #  (?<=
149    '!'                  n  term            ^expr-cont              doOpenLookBehindNeg    #  (?<!
150    ascii_letter            named-capture                           doBeginNamedCapture    #  (?<name
151    default                 errorDeath                              doBadOpenParenType
152
153
154#
155#   paren-comment    We've got a (?# ... )  style comment.  Eat pattern text till we get to the ')'
156#                    The state to resume in was pushed by whichever row brought us here.
157paren-comment:
158    ')'                  n  pop                                                            # end of comment
159    eof		                errorDeath                              doMismatchedParenErr    # pattern ended inside the comment
160    default              n  paren-comment                                                  # discard comment text
161
162#
163#  paren-flag    Scanned a (?ismx-ismx  flag setting
164#                Each flag character, including the first one (left unconsumed by
165#                open-paren-extended), is consumed here with doMatchMode.
165paren-flag:
166    'i'                  n  paren-flag                              doMatchMode
167    'd'                  n  paren-flag                              doMatchMode
168    'm'                  n  paren-flag                              doMatchMode
169    's'                  n  paren-flag                              doMatchMode
170    'u'                  n  paren-flag                              doMatchMode
171    'w'                  n  paren-flag                              doMatchMode
172    'x'                  n  paren-flag                              doMatchMode
173    '-'                  n  paren-flag                              doMatchMode
174    ')'                  n  term                                    doSetMatchMode         #  (?flags)     nothing pushed; parsing continues with term
175    ':'                  n  term              ^expr-quant           doMatchModeParen       #  (?flags: ... )  a group; its ')' pops to expr-quant
176    default                 errorDeath                              doBadModeFlag
177
178#
179#  named-capture    (?<name> ... ), position currently on the name.
180#                   Name characters are ascii letters and digits; the '>' ends the name.
181named-capture:
182    ascii_letter         n  named-capture                           doContinueNamedCapture
183    digit_char           n  named-capture                           doContinueNamedCapture
184    '>'                  n  term               ^expr-quant          doOpenCaptureParen      # common w non-named capture.
185    default                 errorDeath                              doBadNamedCapture
186
187#
188#  quant-star     Scanning a '*' quantifier.  Need to look ahead to decide
189#                 between plain '*', '*?', '*+'
190#                 All rows continue in expr-cont, so no second quantifier is accepted.
191quant-star:
192     '?'                 n  expr-cont                               doNGStar               #  *?
193     '+'                 n  expr-cont                               doPossessiveStar       #  *+
194     default                expr-cont                               doStar                 #  *
195
196
197#
198#  quant-plus     Scanning a '+' quantifier.  Need to look ahead to decide
199#                 between plain '+', '+?', '++'
200#                 All rows continue in expr-cont, so no second quantifier is accepted.
201quant-plus:
202     '?'                 n  expr-cont                               doNGPlus               #  +?
203     '+'                 n  expr-cont                               doPossessivePlus       #  ++
204     default                expr-cont                               doPlus                 #  +
205
206
207#
208#  quant-opt  Scanning a '?' quantifier.  Need to look ahead to decide
209#                  between plain '?', '??', '?+'
210#                  All rows continue in expr-cont, so no second quantifier is accepted.
211quant-opt:
212     '?'                 n  expr-cont                               doNGOpt                 #  ??
213     '+'                 n  expr-cont                               doPossessiveOpt         #  ?+
214     default                expr-cont                               doOpt                   #  ?
215
216
217#
218#   Interval         scanning a '{', the opening delimiter for an interval specification
219#                                   {number} or {min, max} or {min,}
220#                    At least one digit must follow the '{'.
221interval-open:
222    digit_char              interval-lower                                                     # not advanced; interval-lower consumes the digit
223    default                 errorDeath                              doIntervalError
224
#
#   interval-lower   Scanning the digits of the lower bound, up to the ',' or '}' that ends it.
#
225interval-lower:
226    digit_char           n  interval-lower                          doIntevalLowerDigit
227    ','			         n  interval-upper                                                     # an upper bound (possibly empty) follows
228    '}'                  n  interval-type                           doIntervalSame             # {n}
229    default                 errorDeath                              doIntervalError
230
#
#   interval-upper   After the ',' : scanning the digits of the upper bound.  A '}' is accepted
#                    with no digits at all, which is the {min,} form.
#
231interval-upper:
232    digit_char           n  interval-upper                          doIntervalUpperDigit
233    '}'                  n  interval-type                                                      # {min,} or {min,max}
234    default                 errorDeath                              doIntervalError
235
#
#   interval-type    After the closing '}' of an interval.  Look ahead to decide between the
#                    plain, '?' and '+' suffixed forms.  All rows continue in expr-cont.
#
236interval-type:
237    '?'                  n  expr-cont                               doNGInterval                # {n,m}?
238    '+'                  n  expr-cont                               doPossessiveInterval        # {n,m}+
239    default                 expr-cont                               doInterval                  # {n,m}
240
241
242#
243#  backslash        #  Backslash.  Figure out which of the \thingies we have encountered.
244#                                  The low level next-char function will have preprocessed
245#                                  some of them already; those won't come here.
#
#                   Rows that continue in "term" bypass expr-quant, so no quantifier check follows them;
#                   rows that continue in "expr-quant" may take a trailing quantifier.
#                   The 'N', 'p' and 'P' rows have no 'n': the table does not advance the input for them
#                   (presumably the action scans the {...} text itself - confirm in doParseActions).
#                   The specific rows must stay ahead of digit_char / eof / default: first match wins.
#
246backslash:
247   'A'                   n  term                                    doBackslashA
248   'B'                   n  term                                    doBackslashB
249   'b'                   n  term                                    doBackslashb
250   'd'                   n  expr-quant                              doBackslashd
251   'D'                   n  expr-quant                              doBackslashD
252   'G'                   n  term                                    doBackslashG
253   'h'                   n  expr-quant                              doBackslashh
254   'H'                   n  expr-quant                              doBackslashH
255   'k'                   n  named-backref                                                #   \k<name>  named back reference
256   'N'                      expr-quant                              doNamedChar      #   \N{NAME}  named char
257   'p'                      expr-quant                              doProperty       #   \p{Lu}  style property
258   'P'                      expr-quant                              doProperty
259   'R'                   n  expr-quant                              doBackslashR
260   'Q'                   n  term                                    doEnterQuoteMode
261   'S'                   n  expr-quant                              doBackslashS
262   's'                   n  expr-quant                              doBackslashs
263   'v'                   n  expr-quant                              doBackslashv
264   'V'                   n  expr-quant                              doBackslashV
265   'W'                   n  expr-quant                              doBackslashW
266   'w'                   n  expr-quant                              doBackslashw
267   'X'                   n  expr-quant                              doBackslashX
268   'Z'                   n  term                                    doBackslashZ
269   'z'                   n  term                                    doBackslashz
270   digit_char            n  expr-quant                              doBackRef         #  Will scan multiple digits
271   eof                      errorDeath                              doEscapeError     #  pattern ends with a lone backslash
272   default               n  expr-quant                              doEscapedLiteralChar
273
274
275# named-backref   Scanned \k
276#                 Leading to \k<captureName>
277#                 Failure to get the full sequence is an error.
278#                 The '<' must follow immediately.
279named-backref:
280    '<'                  n  named-backref-2                         doBeginNamedBackRef
281    default                 errorDeath                              doBadNamedCapture
282
#
# named-backref-2   Scanned \k<
#                   The first character of the name must be an ascii letter.
#
283named-backref-2:
284    ascii_letter         n  named-backref-3                         doContinueNamedBackRef
285    default                 errorDeath                              doBadNamedCapture
286
#
# named-backref-3   Scanned \k< and at least one letter of the name.
#                   Further name characters are ascii letters or digits; '>' completes the
#                   reference, which may then take a quantifier (continues in expr-quant).
#
287named-backref-3:
288    ascii_letter         n  named-backref-3                         doContinueNamedBackRef
289    digit_char           n  named-backref-3                         doContinueNamedBackRef
290    '>'                  n  expr-quant                              doCompleteNamedBackRef
291    default                 errorDeath                              doBadNamedCapture
292
293
294#
295# [set expression] parsing,
296#    All states involved in parsing set expressions have names beginning with "set-"
297#
298
#
#   set-open    Just scanned the '[' that opens a set.  Handle an immediately following '^'
#               or ':' before anything else.
#               The ':' test is made only directly after '[' ; after '^' the scan goes to set-open2.
#
299set-open:
300   '^'                   n  set-open2                               doSetNegate
301   ':'                      set-posix                               doSetPosixProp       # not advanced by the table; set-posix looks at where the action left the scan
302   default                  set-open2
303
#
#   set-open2   After '[' or '[^'.  A ']' in this position is added as a literal (doSetLiteral)
#               rather than ending the set.  Anything else is passed, unconsumed, to set-start.
#
304set-open2:
305   ']'                   n  set-after-lit                           doSetLiteral
306   default                  set-start
307
308#  set-posix:
309#                  scanned a '[:'  If it really is a [:property:], doSetPosixProp will have
310#                  moved the scan to the closing ']'.  If it wasn't a property
311#                  expression, the scan will still be at the opening ':', which should
312#                  be interpreted as a normal set expression.
313set-posix:
314    ']'                  n   pop                                    doSetEnd              # it was a [:property:] ; the set is complete
315    ':'                      set-start                                                    # not a property; the ':' is handled by set-start
316    default                  errorDeath                             doRuleError  # should not be possible.
317
318#
319#   set-start   after the [ and special case leading characters (^ and/or ]) but before
320#               everything else.   A '-' is literal at this point.
321#
322set-start:
323    ']'                  n  pop                                     doSetEnd
324    '['                  n  set-open      ^set-after-set            doSetBeginUnion      # nested set; its ']' pops to set-after-set
325    '\'                  n  set-escape
326    '-'                  n  set-start-dash                                               # literal '-', unless it begins "--" (an error here)
327    '&'                  n  set-start-amp                                                # literal '&', unless it begins "&&" (an error here)
328    default              n  set-after-lit                           doSetLiteral         # NOTE(review): no eof row in this state, unlike set-after-lit; eof presumably lands here - confirm intended
329
330#    set-start-dash    Turn "[--" into a syntax error.
331#                           "[-x" is good, - and x are literals.
332#                      Neither row advances: the character after the '-' is handled by the next state.
333set-start-dash:
334    '-'                     errorDeath                              doRuleError
335    default                 set-after-lit                           doSetAddDash
336
337#    set-start-amp     Turn "[&&" into a syntax error.
338#                           "[&x" is good, & and x are literals.
339#                      Neither row advances: the character after the '&' is handled by the next state.
340set-start-amp:
341    '&'                     errorDeath                              doRuleError
342    default                 set-after-lit                           doSetAddAmp
343
344#
345#   set-after-lit    The last thing scanned was a literal character within a set.
346#                    Can be followed by anything.  A single '&' is a literal here; a single '-'
347#                    is a range or a literal (see set-lit-dash).  Only "--" / "&&" are set operators.
348set-after-lit:
349    ']'                  n  pop                                     doSetEnd
350    '['                  n  set-open      ^set-after-set            doSetBeginUnion
351    '-'                  n  set-lit-dash
352    '&'                  n  set-lit-amp
353    '\'                  n  set-escape
354    eof                     errorDeath                              doSetNoCloseError     # pattern ended before the closing ']'
355    default              n  set-after-lit                           doSetLiteral
356
#
#   set-after-set    The last thing scanned was a complete nested [set] (this state is what the
#                    nested-set rows push) or a \p / \P property (see set-escape).
#                    Same rows as set-after-lit, except that '-' and '&' go to set-set-dash and
#                    set-set-amp, where a single '-' or '&' followed by '[' is a set operator.
#
357set-after-set:
358    ']'                  n  pop                                     doSetEnd
359    '['                  n  set-open      ^set-after-set            doSetBeginUnion
360    '-'                  n  set-set-dash
361    '&'                  n  set-set-amp
362    '\'                  n  set-escape
363    eof                     errorDeath                              doSetNoCloseError     # pattern ended before the closing ']'
364    default              n  set-after-lit                           doSetLiteral
365
#
#   set-after-range  The last thing scanned was a range (a-b) or a class escape such as \w
#                    (see set-escape).
#                    Same rows as set-after-lit, except that '-' and '&' go to set-range-dash and
#                    set-range-amp, where a single '-' or '&' is a literal and only "--" / "&&"
#                    are operators.
#
366set-after-range:
367    ']'                  n  pop                                     doSetEnd
368    '['                  n  set-open      ^set-after-set            doSetBeginUnion
369    '-'                  n  set-range-dash
370    '&'                  n  set-range-amp
371    '\'                  n  set-escape
372    eof                     errorDeath                              doSetNoCloseError     # pattern ended before the closing ']'
373    default              n  set-after-lit                           doSetLiteral
374
375
376# set-after-op
377#     After a --  or &&
378#     It is an error to close a set at this point.
379#
380set-after-op:
381    '['                  n  set-open         ^set-after-set         doSetBeginUnion
382    ']'                     errorDeath                              doSetOpError          # operator with no right-hand operand
383    '\'                  n  set-escape
384    default              n  set-after-lit                           doSetLiteral          # NOTE(review): no eof row in this state; eof presumably lands here - confirm intended
385
386#
387#   set-set-amp
388#      Have scanned [[set]&
389#      Could be a '&' intersection operator, if a set follows.
390#      Could be the start of a '&&' operator.
391#      Otherwise is a literal.
392set-set-amp:
393    '['                  n  set-open      ^set-after-set           doSetBeginIntersection1
394    '&'                  n  set-after-op                           doSetIntersection2
395    default                 set-after-lit                          doSetAddAmp            # literal '&'; not advanced, next character handled by set-after-lit
396
397
398# set-lit-amp   Have scanned "[literals&"
399#               Could be a start of "&&" operator or a literal
400#               In [abc&[def]],   the '&' is a literal
401#
402set-lit-amp:
403    '&'                  n  set-after-op                            doSetIntersection2
404    default                 set-after-lit                           doSetAddAmp           # literal '&'; not advanced, next character handled by set-after-lit
405
406
407#
408#  set-set-dash
409#      Have scanned [set]-
410#      Could be a '-' difference operator, if a [set] follows.
411#      Could be the start of a '--' operator.
412#      Otherwise is a literal.
413set-set-dash:
414    '['                  n  set-open      ^set-after-set           doSetBeginDifference1
415    '-'                  n  set-after-op                           doSetDifference2
416    default                 set-after-lit                          doSetAddDash           # literal '-'; not advanced, next character handled by set-after-lit
417
418
419#
420#  set-range-dash
421#      scanned  a-b-  or \w-
422#         any set or range like item where the trailing single '-' should
423#         be literal, not a set difference operation.
424#         A trailing "--" is still a difference operator.
425set-range-dash:
426    '-'                  n  set-after-op                           doSetDifference2
427    default                 set-after-lit                          doSetAddDash           # literal '-'; not advanced, next character handled by set-after-lit
428
429
#
#  set-range-amp
#      Have scanned '&' from set-after-range, i.e. after a range or class escape.
#      A following '&' makes it the "&&" operator; otherwise the '&' is a literal.
#
430set-range-amp:
431    '&'                  n  set-after-op                           doSetIntersection2
432    default                 set-after-lit                          doSetAddAmp            # literal '&'; not advanced, next character handled by set-after-lit
433
434
435#  set-lit-dash
436#     Have scanned "[literals-" Could be a range or a -- operator or a literal
437#     In [abc-[def]], the '-' is a literal (confirmed with a Java test)
438#        [abc-\p{xx}  the '-' is an error
439#        [abc-]       the '-' is a literal
440#        [ab-xy]      the '-' is a range
441#
442set-lit-dash:
443    '-'                  n  set-after-op                            doSetDifference2
444    '['                     set-after-lit                           doSetAddDash          # literal '-'; the '[' is not consumed, set-after-lit opens the nested set
445    ']'                     set-after-lit                           doSetAddDash          # literal '-'; the ']' is not consumed, set-after-lit closes the set
446    '\'                  n  set-lit-dash-escape                                           # range end may be an escaped character
447    default              n  set-after-range                         doSetRange            # current character is the range end
448
449# set-lit-dash-escape
450#
451#    scanned "[literal-\"
452#    Could be a range, if the \ introduces an escaped literal char or a named char.
453#    Otherwise it is an error.
454#    NOTE(review): 'p', 'P', 'h', 'H', 'v', 'V' have no row here and so take the default (doSetRange) - confirm intended.
455set-lit-dash-escape:
456   's'                      errorDeath                             doSetOpError
457   'S'                      errorDeath                             doSetOpError
458   'w'                      errorDeath                             doSetOpError
459   'W'                      errorDeath                             doSetOpError
460   'd'                      errorDeath                             doSetOpError
461   'D'                      errorDeath                             doSetOpError
462   'N'                      set-after-range                        doSetNamedRange        # \N not advanced by the table; presumably the action scans the name itself - confirm
463   default               n  set-after-range                        doSetRange
464
465
466#
467#  set-escape
468#       Common back-slash escape processing within set expressions
469#       The next state records what kind of item the escape produced:
#         \p \P            -> set-after-set    (treated like a nested set)
#         \N, other chars  -> set-after-lit    (treated like a literal)
#         class escapes    -> set-after-range  (a following single '-' or '&' is a literal)
#       The 'p', 'P' and 'N' rows have no 'n': the table does not advance the input for them
#       (presumably the action scans the {...} text itself - confirm in doParseActions).
#
470set-escape:
471   'p'                      set-after-set                           doSetProp
472   'P'                      set-after-set                           doSetProp
473   'N'                      set-after-lit                           doSetNamedChar
474   's'                   n  set-after-range                         doSetBackslash_s
475   'S'                   n  set-after-range                         doSetBackslash_S
476   'w'                   n  set-after-range                         doSetBackslash_w
477   'W'                   n  set-after-range                         doSetBackslash_W
478   'd'                   n  set-after-range                         doSetBackslash_d
479   'D'                   n  set-after-range                         doSetBackslash_D
480   'h'                   n  set-after-range                         doSetBackslash_h
481   'H'                   n  set-after-range                         doSetBackslash_H
482   'v'                   n  set-after-range                         doSetBackslash_v
483   'V'                   n  set-after-range                         doSetBackslash_V
484   default               n  set-after-lit                           doSetLiteralEscaped
485
486#
487# set-finish
488#     Have just encountered the final ']' that completes a [set], and
489#     arrived here via a pop.  From here, we exit the set parsing world, and go
490#     back to generic regular expression parsing.
491#     The input is not advanced; the set may be followed by a quantifier (expr-quant).
492set-finish:
493    default                 expr-quant                              doSetFinish
494
495
496#
497# errorDeath.   This state is specified as the next state whenever a syntax error
498#               in the pattern is detected.  Barring bugs, the state machine will never
499#               actually get here, but will stop because of the action associated with the error.
500#               But, just in case, this state asks the state machine to exit.
501errorDeath:
502    default              n errorDeath                               doExit
503
504
505