1#
2#  start state, scan position is at the beginning of the pattern.
3#
4start:
5    '['                  n set-open       ^set-finish   # consume '[' and parse a set; set-finish is pushed as the state resumed by the matching pop
6    '\'                  n set-escape     ^set-finish   # consume '\'; NOTE(review): no set-escape path pops directly, so a pattern that starts with an escape continues in a set-after-* state and reaches set-finish only on a ']' - confirm this is intended
7    default                errorDeath                               doRuleError   # any other first character cannot begin a set expression
8
9#
10# [set expression] parsing.
11#    All states involved in parsing set expressions have names beginning with "set-"
12#
13
14set-open:
15   '^'                   n  set-open2                               doSetNegate   # leading '^' is consumed; doSetNegate presumably marks the set as complemented
16   ':'                      set-posix                               doSetPosixProp   # no 'n': the table does not consume the ':'; the action decides whether this is a [:property:]
17   default                  set-open2   # anything else is left unconsumed for set-open2 to examine
18
19set-open2:
20   ']'                   n  set-after-lit                           doSetLiteral   # a ']' directly after '[' or '[^' is consumed and handed to doSetLiteral rather than closing the set
21   default                  set-start   # otherwise rescan the same character in set-start
22
23#  set-posix:
24#                  scanned a '[:'  If it really is a [:property:], doSetPosixProp will have
25#                  moved the scan to the closing ']'.  If it wasn't a property
26#                  expression, the scan will still be at the opening ':', which should
27#                  be interpreted as a normal set expression.
28set-posix:
29    ']'                  n   pop                                    doSetEnd   # the action moved the scan to the closing ']' of a [:property:]; consume it and return to the pushed state
30    ':'                      set-start   # scan is still on the opening ':', so it was not a property; rescan the ':' as ordinary set content
31    default                  errorDeath                             doRuleError  # should not be possible.
32
33#
34#   set-start   after the [ and special case leading characters (^ and/or ]) but before
35#               everything else.   A '-' is literal at this point.
36#
37set-start:
38    ']'                  n  pop                                     doSetEnd   # NOTE(review): looks unreachable from the transitions visible in this section (set-open2 intercepts a leading ']', set-posix enters here only on ':')
39    '['                  n  set-open      ^set-after-set            doSetBeginUnion   # nested set: parse it via set-open, then resume in set-after-set when it pops
40    '\'                  n  set-escape   # backslash escape; set-escape selects the follow-on state
41    '-'                  n  set-start-dash   # a single leading '-' is a literal; set-start-dash rejects "--"
42    '&'                  n  set-start-amp   # a single leading '&' is a literal; set-start-amp rejects "&&"
43    default              n  set-after-lit                           doSetLiteral   # ordinary literal; NOTE(review): there is no eof row, so eof also lands here - confirm doSetLiteral tolerates it
44
45#    set-start-dash    Turn "[--" into a syntax error.
46#                           "[-x" is good, - and x are literals.
47#
48set-start-dash:
49    '-'                     errorDeath                              doRuleError   # "[--" is a syntax error
50    default                 set-after-lit                           doSetAddDash   # not consumed: the action handles the pending '-', then the current character is rescanned in set-after-lit
51
52#    set-start-amp     Turn "[&&" into a syntax error.
53#                           "[&x" is good, & and x are literals.
54#
55set-start-amp:
56    '&'                     errorDeath                              doRuleError   # "[&&" is a syntax error
57    default                 set-after-lit                           doSetAddAmp   # not consumed: the action handles the pending '&', then the current character is rescanned in set-after-lit
58
59#
60#   set-after-lit    The last thing scanned was a literal character within a set.
61#                    Can be followed by anything.  Single '-' or '&' are
62#                    literals in this context, not operators.
63set-after-lit:
64    ']'                  n  pop                                     doSetEnd   # end of this set; return to the state pushed when it was opened
65    '['                  n  set-open      ^set-after-set            doSetBeginUnion   # nested set; resume in set-after-set once it closes
66    '-'                  n  set-lit-dash   # may start a range, a "--" operator, or be a literal; decided in set-lit-dash
67    '&'                  n  set-lit-amp   # may start a "&&" operator or be a literal; decided in set-lit-amp
68    '\'                  n  set-escape   # backslash escape
69    eof                     errorDeath                              doSetNoCloseError   # input ended before the closing ']'
70    default              n  set-after-lit                           doSetLiteral   # another literal; stay in this state
71
72set-after-set:
73    ']'                  n  pop                                     doSetEnd   # end of the enclosing set; return to the pushed state
74    '['                  n  set-open      ^set-after-set            doSetBeginUnion   # another nested set directly after the previous one
75    '-'                  n  set-set-dash   # after a set, '-' may be a difference operator, the start of "--", or a literal
76    '&'                  n  set-set-amp   # after a set, '&' may be an intersection operator, the start of "&&", or a literal
77    '\'                  n  set-escape   # backslash escape
78    eof                     errorDeath                              doSetNoCloseError   # input ended before the closing ']'
79    default              n  set-after-lit                           doSetLiteral   # a literal follows the nested set
80
81set-after-range:
82    ']'                  n  pop                                     doSetEnd   # end of this set; return to the pushed state
83    '['                  n  set-open      ^set-after-set            doSetBeginUnion   # nested set; resume in set-after-set once it closes
84    '-'                  n  set-range-dash   # a single '-' after a range-like item is a literal; only "--" is an operator
85    '&'                  n  set-range-amp   # a single '&' after a range-like item is a literal; only "&&" is an operator
86    '\'                  n  set-escape   # backslash escape
87    eof                     errorDeath                              doSetNoCloseError   # input ended before the closing ']'
88    default              n  set-after-lit                           doSetLiteral   # a literal follows the range
89
90
91# set-after-op
92#     After a --  or &&
93#     It is an error to close a set at this point.
94#
95set-after-op:
96    '['                  n  set-open         ^set-after-set         doSetBeginUnion   # right-hand operand is a nested set
97    ']'                     errorDeath                              doSetOpError   # an operator with no right-hand operand
98    '\'                  n  set-escape   # right-hand operand begins with an escape
99    default              n  set-after-lit                           doSetLiteral   # right-hand operand begins with a literal; NOTE(review): there is no eof row, so eof also lands here - confirm doSetLiteral tolerates it
100
101#
102#   set-set-amp
103#      Have scanned [[set]&
104#      Could be a '&' intersection operator, if a set follows.
105#      Could be the start of a '&&' operator.
106#      Otherwise is a literal.
107set-set-amp:
108    '['                  n  set-open      ^set-after-set           doSetBeginIntersection1   # "[set]&[": single-'&' intersection with the set that follows
109    '&'                  n  set-after-op                           doSetIntersection2   # "&&" operator; an operand must follow
110    default                 set-after-lit                          doSetAddAmp   # the '&' was a literal; the current character is not consumed and is rescanned in set-after-lit
111
112
113# set-lit-amp   Have scanned "[literals&"
114#               Could be a start of "&&" operator or a literal
115#               In [abc&[def]],   the '&' is a literal
116#
117set-lit-amp:
118    '&'                  n  set-after-op                            doSetIntersection2   # "&&" operator; an operand must follow
119    default                 set-after-lit                           doSetAddAmp   # the '&' was a literal (this includes a following '['); the current character is rescanned in set-after-lit
120
121
122#
123#  set-set-dash
124#      Have scanned [set]-
125#      Could be a '-' difference operator, if a [set] follows.
126#      Could be the start of a '--' operator.
127#      Otherwise is a literal.
128set-set-dash:
129    '['                  n  set-open      ^set-after-set           doSetBeginDifference1   # "[set]-[": single-'-' difference with the set that follows
130    '-'                  n  set-after-op                           doSetDifference2   # "--" operator; an operand must follow
131    default                 set-after-lit                          doSetAddDash   # the '-' was a literal; the current character is not consumed and is rescanned in set-after-lit
132
133
134#
135#  set-range-dash
136#      scanned  a-b-  or \w-
137#         any set or range like item where the trailing single '-' should
138#         be literal, not a set difference operation.
139#         A trailing "--" is still a difference operator.
140set-range-dash:
141    '-'                  n  set-after-op                           doSetDifference2   # "--" operator; an operand must follow
142    default                 set-after-lit                          doSetAddDash   # the single '-' was a literal; the current character is rescanned in set-after-lit
143
144
145set-range-amp:
146    '&'                  n  set-after-op                           doSetIntersection2   # "&&" operator after a range-like item; an operand must follow
147    default                 set-after-lit                          doSetAddAmp   # the single '&' was a literal; the current character is rescanned in set-after-lit
148
149
150#  set-lit-dash
151#     Have scanned "[literals-" Could be a range or a -- operator or a literal
152#     In [abc-[def]], the '-' is a literal (confirmed with a Java test)
153#        [abc-\p{xx}] the '-' is an error
154#        [abc-]       the '-' is a literal
155#        [ab-xy]      the '-' is a range
156#
157set-lit-dash:
158    '-'                  n  set-after-op                            doSetDifference2   # "--" operator; an operand must follow
159    '['                     set-after-lit                           doSetAddDash   # "[abc-[": the '-' is a literal; the '[' is not consumed and is rescanned in set-after-lit
160    ']'                     set-after-lit                           doSetAddDash   # "[abc-]": the '-' is a literal; the ']' is rescanned and closes the set in set-after-lit
161    '\'                  n  set-lit-dash-escape   # the range end, if any, is written as an escape
162    default              n  set-after-range                         doSetRange   # "[ab-xy]": the current character is the end of a range
163
164# set-lit-dash-escape
165#
166#    scanned "[literal-\"
167#    Could be a range, if the \ introduces an escaped literal char or a named char.
168#    Otherwise it is an error.
169#
170set-lit-dash-escape:
171   's'                      errorDeath                             doSetOpError   # \s \S \w \W \d \D denote sets and cannot end a range
172   'S'                      errorDeath                             doSetOpError
173   'w'                      errorDeath                             doSetOpError
174   'W'                      errorDeath                             doSetOpError
175   'd'                      errorDeath                             doSetOpError
176   'D'                      errorDeath                             doSetOpError
   'p'                      errorDeath                             doSetOpError   # \p{..} and \P{..} are property sets and cannot end a range;
   'P'                      errorDeath                             doSetOpError   #   without these rows they fell to the default row and 'p' / 'P' became the range end
177   'N'                      set-name-start    ^set-after-range          doStartNamedChar
178   'x'                      set-hex-start    ^set-after-range          doStartHex
179   default               n  set-after-range                        doSetRange   # any other escaped character is the literal end of the range
180# TODO fix 'N', 'x'
#      The 'N' and 'x' rows have no 'n', so set-name-start / set-hex-start are entered with the
#      scan still on the 'N' / 'x' and take their default (error) row instead of seeing '{'.
#      NOTE(review): adding 'n' alone may not be enough - confirm that the name / hex end actions
#      complete the pending range rather than adding a plain literal.
181
182#
183#  set-escape
184#       Common back-slash escape processing within set expressions
185#
186set-escape:
187   'p'                   n  set-prop-start    ^set-after-set          doStartSetProp   # \p{...}: scan the property, then resume in set-after-set (a property behaves like a nested set)
188   'P'                   n  set-prop-start    ^set-after-set          doStartSetProp   # \P{...}: same path as \p; presumably the action records the negation - confirm
189   'N'                   n  set-name-start    ^set-after-lit          doStartNamedChar   # \N{name}: a named character, treated as a literal afterwards
190   'x'                   n  set-hex-start ^set-after-lit         doStartHex   # \x{hex}: a hex code point, treated as a literal afterwards
191   's'                   n  set-after-range                         doSetBackslash_s   # \s \S \w \W \d \D continue in set-after-range, where a following single '-' or '&' is a literal
192   'S'                   n  set-after-range                         doSetBackslash_S
193   'w'                   n  set-after-range                         doSetBackslash_w
194   'W'                   n  set-after-range                         doSetBackslash_W
195   'd'                   n  set-after-range                         doSetBackslash_d
196   'D'                   n  set-after-range                         doSetBackslash_D
197   default               n  set-after-lit                           doSetLiteralEscaped   # any other escaped character is handed to doSetLiteralEscaped as a literal
198# TODO add \r, \n, etc
199
200set-prop-start:
201    '{'                  n  set-prop-cont   # \p or \P must be followed by '{'
202    default                 errorDeath   # NOTE(review): no action on this error row, unlike the doRuleError / doSetOpError rows elsewhere - confirm an error is still reported
203
204set-prop-cont:
205    '}'                  n  pop                                     doPropName   # end of \p{name}; return to the state pushed by the escape row
206    '='                  n  set-value                               doPropRelation   # name=value form; the value is scanned in set-value
207    '≠'                  n  set-value                               doPropRelation   # name≠value form
    eof                     errorDeath                              doSetNoCloseError   # unterminated \p{...: without this row eof matched the default row and the state was never left
208    default              n  set-prop-cont   # any other character is part of the property name
209
210set-value:
211    '}'                  n  pop                                     doPropValue   # end of the property value; return to the state pushed by the escape row
    eof                     errorDeath                              doSetNoCloseError   # unterminated \p{name=...: without this row eof matched the default row and the state was never left
212    default              n  set-value   # any other character is part of the property value
213
214set-name-start:
215    '{'                  n  set-name-cont   # \N must be followed by '{'
216    default                 errorDeath   # NOTE(review): no action on this error row - confirm an error is still reported
217
218set-name-cont:
219    '}'                  n  pop                                     doName   # end of \N{name}; return to the state pushed by the escape row
220    [\ \-0-9A-Za-z]      n  set-name-cont   # name characters: space, '-', digits, ASCII letters; NOTE(review): the class contains an escaped space - confirm the table generator keeps it as one field
221    default              n  errorDeath   # any other character is an error; NOTE(review): no action on this row - confirm an error is still reported
222
223set-hex-start:
224    '{'                  n  set-hex-cont   # \x must be followed by '{'
225    default                 errorDeath   # NOTE(review): no action on this error row - confirm an error is still reported
226
227set-hex-cont:
228    '}'                  n  pop                                     doHex   # end of \x{hex}; return to the state pushed by the escape row
229    [0-9A-Fa-f]          n  set-hex-cont   # hex digits, either case
230    default              n  errorDeath   # any other character is an error; NOTE(review): no action on this row - confirm an error is still reported
231
232#
233# set-finish
234#     Have just encountered the final ']' that completes a [set], and
235#     arrived here via a pop.  From here, we exit the set parsing world, and go
236#     back to generic regular expression parsing.
237#
238set-finish:
239    default                 exit                              doSetFinish   # unconditional: leave set parsing without consuming the current character; doSetFinish presumably finalizes the set
240