1
2#*****************************************************************************
3#
4#   Copyright (C) 2016 and later: Unicode, Inc. and others.
5#   License & terms of use: http://www.unicode.org/copyright.html#License
6#
7#*****************************************************************************
8#*****************************************************************************
9#
10#   Copyright (C) 2002-2016, International Business Machines Corporation and others.
11#   All Rights Reserved.
12#
13#*****************************************************************************
14#
15#  file:  rbbirpt.txt
16#  ICU Break Iterator Rule Parser State Table
17#
18#     This state table is used when reading and parsing a set of RBBI rules
19#     The rule parser uses a state machine; the data in this file define the
20#     state transitions that occur for each input character.
21#
22#     *** This file defines the RBBI rule grammar.   This is it.
23#     *** The determination of what is accepted is here.
24#
25#     This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
26#     that are then built with the rule parser.
27#
28#    perl rbbicst.pl    < rbbirpt.txt > rbbirpt.h
29
30#
31# Here is the syntax of the state definitions in this file:
32#
33#
34#StateName:
35#   input-char           n next-state           ^push-state     action
36#   input-char           n next-state           ^push-state     action
37#       |                |   |                      |             |
38#       |                |   |                      |             |--- action to be performed by state machine
39#       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
40#       |                |   |                      |
41#       |                |   |                      |--- Push this named state onto the state stack.
42#       |                |   |                           Later, when next state is specified as "pop",
43#       |                |   |                           the pushed state will become the current state.
44#       |                |   |
45#       |                |   |--- Transition to this state if the current input character matches the input
46#       |                |        character or char class in the left hand column.  "pop" causes the next
47#       |                |        state to be popped from the state stack.
48#       |                |
49#       |                |--- When making the state transition specified on this line, advance to the next
50#       |                     character from the input only if 'n' appears here.
51#       |
52#       |--- Character or named character classes to test for.  If the current character being scanned
53#            matches, perform the actions and go to the state specified on this line.
54#            The input character is tested sequentially, in the order written.  The characters and
55#            character classes tested for do not need to be mutually exclusive.  The first match wins.
56#
57
58
59
60
61#
62#  start state, scan position is at the beginning of the rules file, or in between two rules.
63#
64start:
65    escaped                term                  ^break-rule-end    doExprStart      # tested first, so an 'escaped' char never reaches the syntax rows below; not consumed, 'term' re-reads it
66    white_space          n start                                                     # skip white space between rules
67    '^'                  n start-after-caret     ^break-rule-end    doNoChain        # '^' consumed; break-rule-end is pushed here on behalf of start-after-caret
68    '$'                    scan-var-name         ^assign-or-rule    doExprStart      # '$' not consumed (scan-var-name eats it); name scan pops to assign-or-rule
69    '!'                  n rev-option                                                # '!' consumed; rev-option tells "!!option" from "!reverse rule"
70    ';'                  n start                                                  # ignore empty rules.
71    eof                    exit                                                      # end of input between rules
72    default                term                  ^break-rule-end    doExprStart      # any other char begins a rule expression; not consumed
73
74#
75#  break-rule-end:  Returned from doing a break-rule expression.
76#
77break-rule-end:
78    ';'	                 n start                                    doEndOfRule    # rule terminator consumed; go scan the next rule
79    white_space          n break-rule-end                                              # skip white space before the ';'
80    default                errorDeath                               doRuleError    # anything other than ';' after the expression
81
82#
83# start of a rule, after having seen a '^' (inhibits rule chain in).
84#     Similar to the main 'start' state in most respects, except
85#          - empty rule is an error.
86#          - A second '^' is an error.
87#
88start-after-caret:
89    escaped                term                                     doExprStart    # no push here: 'start' pushed break-rule-end when it consumed the '^'
90    white_space          n start-after-caret                                         # skip white space after the '^'
91    '^'                    errorDeath                               doRuleError    # two '^'s
92    '$'                    scan-var-name         ^term-var-ref      doExprStart    # pops to term-var-ref, not assign-or-rule: no assignment path after '^'
93    ';'                    errorDeath                               doRuleError    # ^ ;
94    eof                    errorDeath                               doRuleError    # input ended right after the '^'
95    default                term                                     doExprStart    # any other char begins the expression; not consumed
96
97#
98#   !               We've just scanned a '!', indicating either a !!key word flag or a
99#                   !Reverse rule.
100#
101rev-option:
102    '!'                  n option-scan1                                             # second '!' consumed: a "!!" key word follows
103    default                reverse-rule           ^break-rule-end   doReverseDir   # single '!'; char not consumed. NOTE(review): reverse-rule pushes break-rule-end again - confirm the extra stack entry is intended
104
105option-scan1:
106    name_start_char      n option-scan2                             doOptionStart   # first character of the key word after "!!"
107    default                errorDeath                               doRuleError     # "!!" not followed by a name start character
108
109option-scan2:
110    name_char            n option-scan2                                             # consume the remaining characters of the key word
111    default                option-scan3                             doOptionEnd     # first non-name char ends the key word; not consumed, option-scan3 examines it
112
113option-scan3:
114    ';'                  n start                                                    # option statement complete; back to scanning rules
115    white_space          n option-scan3                                             # white space is allowed between the key word and ';'
116    default                errorDeath                               doRuleError     # anything else after the key word
117
118
119reverse-rule:
120    default                term                   ^break-rule-end   doExprStart    # begin the rule expression; nothing consumed. NOTE(review): rev-option already pushed break-rule-end on the way here - confirm the second push is intended
121
122
123#
124#  term.  Eat through a single rule character, or a composite thing, which
125#         could be a parenthesized expression, a variable name, or a Unicode Set.
126#
127term:
128    escaped              n expr-mod                                 doRuleChar      # tested before white_space and the syntax characters (first match wins)
129    white_space          n term                                                     # skip white space before a term
130    rule_char            n expr-mod                                 doRuleChar      # a single literal character is a complete term
131    '['                    scan-unicode-set      ^expr-mod                          # '[' not consumed here; scan-unicode-set pops to expr-mod
132    '('                  n term                  ^expr-mod          doLParen        # expr-mod is pushed so the pop made on the matching ')' resumes there
133    '$'                    scan-var-name         ^term-var-ref                      # '$' not consumed here; the name scan pops to term-var-ref
134    '.'                  n expr-mod                                 doDotAny
135    default                errorDeath                               doRuleError     # no term can start with this character
136
137
138
139#
140#  term-var-ref   We've just finished scanning a reference to a $variable.
141#                 Check that the variable was defined.
142#                 The variable name scanning is in common with assignment statements,
143#                 so the check can't be done there.
144term-var-ref:
145    default                expr-mod                                 doCheckVarDef   # single unconditional row: no input consumed, run the check, continue as after any term
146
147
148#
149#   expr-mod      We've just finished scanning a term, now look for the optional
150#                 trailing '*', '?', '+'
151#
152expr-mod:
153    white_space          n  expr-mod                                                # white space may separate a term from its operator
154    '*'                  n  expr-cont                               doUnaryOpStar
155    '+'                  n  expr-cont                               doUnaryOpPlus
156    '?'                  n  expr-cont                               doUnaryOpQuestion
157    default                 expr-cont                                               # no trailing operator; char not consumed, expr-cont examines it
158
159
160#
161#  expr-cont      Expression, continuation.  At a point where additional terms are
162#                                            allowed, but not required.
163#
164expr-cont:
165    escaped                 term                                    doExprCatOperator   # rows that go to 'term' without 'n' leave the char for 'term' to re-read
166    white_space          n  expr-cont                                                    # skip white space between terms
167    rule_char               term                                    doExprCatOperator
168    '['                     term                                    doExprCatOperator
169    '('                     term                                    doExprCatOperator
170    '$'                     term                                    doExprCatOperator
171    '.'                     term                                    doExprCatOperator
172    '/'                     look-ahead                              doExprCatOperator   # '/' not consumed here; look-ahead consumes it
173    '{'                  n  tag-open                                doExprCatOperator   # '{' consumed here; tag-open scans the tag contents
174    '|'                  n  term                                    doExprOrOperator    # alternation: '|' consumed, another term must follow
175    ')'                  n  pop                                     doExprRParen        # ')' consumed; return to the state pushed for this expression
176    default                 pop                                     doExprFinished      # not part of an expression; char not consumed, the popped state examines it
177
178
179#
180#   look-ahead    Scanning a '/', which identifies a break point, assuming that the
181#                 remainder of the expression matches.
182#
183#                 Generate a parse tree as if this was a special kind of input symbol
184#                 appearing in an otherwise normal concatenation expression.
185#
186look-ahead:
187    '/'                   n expr-cont-no-slash                      doSlash             # consume the '/'; the continuation state that follows has no '/' row
188    default                 errorDeath                                                  # the rows that branch here matched '/' without consuming it, so this is not expected to fire
189
190
191#
192#  expr-cont-no-slash    Expression, continuation.  At a point where additional terms are
193#                                            allowed, but not required.  Just like
194#                                            expr-cont, above, except that no '/'
195#                                            look-ahead symbol is permitted.
196#
197expr-cont-no-slash:
198    escaped                 term                                    doExprCatOperator   # rows that go to 'term' without 'n' leave the char for 'term' to re-read
199    white_space          n  expr-cont-no-slash                                           # stay in this state: going to expr-cont here would re-enable its '/' and '{' rows after white space
200    rule_char               term                                    doExprCatOperator
201    '['                     term                                    doExprCatOperator
202    '('                     term                                    doExprCatOperator
203    '$'                     term                                    doExprCatOperator
204    '.'                     term                                    doExprCatOperator
205    '|'                  n  term                                    doExprOrOperator    # alternation: '|' consumed, another term must follow
206    ')'                  n  pop                                     doExprRParen        # ')' consumed; return to the state pushed for this expression
207    default                 pop                                     doExprFinished      # includes '/' and '{', which have no row here; char not consumed
208
209
210#
211#   tags             scanning a '{', the opening delimiter for a tag that identifies
212#                    the kind of match.  Scan the whole {dddd} tag, where d=digit
213#
214tag-open:
215    white_space          n  tag-open                                                     # white space allowed after the '{'
216    digit_char              tag-value                               doStartTagValue     # first digit is not consumed here; tag-value consumes it
217    default                 errorDeath                              doTagExpectedError  # '{' not followed by a digit
218
219tag-value:
220    white_space          n  tag-close                                                    # white space ends the digits; tag-close then accepts only more white space or '}'
221    '}'                     tag-close                                                    # '}' not consumed here; tag-close consumes it
222    digit_char           n  tag-value                               doTagDigit          # consume one digit and stay
223    default                 errorDeath                              doTagExpectedError
224
225tag-close:
226    white_space          n  tag-close                                                    # white space allowed before the '}'
227    '}'                  n  expr-cont-no-tag                        doTagValue          # tag complete; the continuation state that follows has no '{' row
228    default                 errorDeath                              doTagExpectedError
229
230
231
232#
233#  expr-cont-no-tag    Expression, continuation.  At a point where additional terms are
234#                                            allowed, but not required.  Just like
235#                                            expr-cont, above, except that no "{ddd}"
236#                                            tagging is permitted.
237#
238expr-cont-no-tag:
239    escaped                 term                                    doExprCatOperator   # rows that go to 'term' without 'n' leave the char for 'term' to re-read
240    white_space          n  expr-cont-no-tag                                             # stays in this state, so the no-tag restriction survives white space
241    rule_char               term                                    doExprCatOperator
242    '['                     term                                    doExprCatOperator
243    '('                     term                                    doExprCatOperator
244    '$'                     term                                    doExprCatOperator
245    '.'                     term                                    doExprCatOperator
246    '/'                     look-ahead                              doExprCatOperator   # '/' not consumed here; look-ahead consumes it
247    '|'                  n  term                                    doExprOrOperator    # alternation: '|' consumed, another term must follow
248    ')'                  n  pop                                     doExprRParen        # ')' consumed; return to the state pushed for this expression
249    default                 pop                                     doExprFinished      # includes '{', which has no row here; char not consumed
250
251
252
253
254#
255#   Variable Name Scanning.
256#
257#                    The state that branched to here must have pushed a return state
258#                    to go to after completion of the variable name scanning.
259#
260#                    The current input character must be the $ that introduces the name.
261#                    The $ is consumed here rather than in the state that first detected it
262#                    so that the doStartVariableName action only needs to happen in one
263#                    place (here), and the other states don't need to worry about it.
264#
265scan-var-name:
266   '$'                  n scan-var-start                            doStartVariableName   # consume the '$' that the branching state left in place
267   default                errorDeath                                                      # every row that branches here matched '$' without consuming it, so this is not expected to fire
268
269
270scan-var-start:
271    name_start_char      n scan-var-body                                                  # first character of the variable name
272    default                errorDeath                               doVariableNameExpectedErr   # '$' not followed by a name start character
273
274scan-var-body:
275    name_char            n scan-var-body                                                  # consume the remaining characters of the name
276    default                pop                                      doEndVariableName    # first non-name char ends the name; not consumed, return to the pushed state
277
278
279
280#
281#  scan-unicode-set   Unicode Sets are parsed by the UnicodeSet class.
282#                     Within the RBBI parser, after finding the first character
283#                     of a Unicode Set, we just hand the rule input at that
284#                     point off to the Unicode Set constructor, then pick
285#                     up parsing after the close of the set.
286#
287#                     The action for this state invokes the UnicodeSet parser.
288#
289scan-unicode-set:
290    '['                   n pop                                      doScanUnicodeSet    # the action hands the set to the UnicodeSet parser; then return to the pushed state
291    'p'                   n pop                                      doScanUnicodeSet    # NOTE(review): the only visible row that branches here matches '[' - confirm whether the 'p'/'P' rows are reachable
292    'P'                   n pop                                      doScanUnicodeSet
293    default		    errorDeath
294
295
296
297
298
299
300
301#
302#  assign-or-rule.   A $variable was encountered at the start of something, could be
303#                    either an assignment statement or a rule, depending on whether an '='
304#                    follows the variable name.  We get to this state when the variable name
305#                    scanning does a return.
306#
307assign-or-rule:
308    white_space          n assign-or-rule                                                 # white space allowed between the variable name and '='
309    '='                  n term                  ^assign-end        doStartAssign   # variable was target of assignment; the right hand side pops to assign-end
310    default                term-var-ref          ^break-rule-end                    # variable was a term in a rule; char not consumed
311
312
313
314#
315#  assign-end        This state is entered when the end of the expression on the
316#                    right hand side of an assignment is found.  We get here via
317#                    a pop; this state is pushed when the '=' in an assignment is found.
318#
319#                    The only thing allowed at this point is a ';'.  The RHS of an
320#                    assignment must look like a rule expression, and we come here
321#                    when what is being scanned no longer looks like an expression.
322#
323assign-end:
324    ';'                  n start                                    doEndAssign            # assignment complete; go scan the next rule
325    default                errorDeath                               doRuleErrorAssignExpr  # the right hand side ended on something other than ';'
326
327
328
329#
330# errorDeath.   This state is specified as the next state whenever a syntax error
331#               in the source rules is detected.  Barring bugs, the state machine will never
332#               actually get here, but will stop because of the action associated with the error.
333#               But, just in case, this state asks the state machine to exit.
334errorDeath:
335    default              n errorDeath                               doExit                 # matches every input: consume it, stay here, and request exit
336
337
338