1
2#*****************************************************************************
3#
4#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
5#   All Rights Reserved.
6#
7#*****************************************************************************
8#
9#  file:  rbbirpt.txt
10#  ICU Break Iterator Rule Parser State Table
11#
12#     This state table is used when reading and parsing a set of RBBI rules
13#     The rule parser uses a state machine; the data in this file define the
14#     state transitions that occur for each input character.
15#
16#     *** This file defines the RBBI rule grammar.   This is it.
17#     *** The determination of what is accepted is here.
18#
19#     This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
20#     that are then built with the rule parser.
21#
22
23#
24# Here is the syntax of the state definitions in this file:
25#
26#
27#StateName:
28#   input-char           n next-state           ^push-state     action
29#   input-char           n next-state           ^push-state     action
30#       |                |   |                      |             |
31#       |                |   |                      |             |--- action to be performed by state machine
32#       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
33#       |                |   |                      |
34#       |                |   |                      |--- Push this named state onto the state stack.
35#       |                |   |                           Later, when next state is specified as "pop",
36#       |                |   |                           the pushed state will become the current state.
37#       |                |   |
38#       |                |   |--- Transition to this state if the current input character matches the input
39#       |                |        character or char class in the left hand column.  "pop" causes the next
40#       |                |        state to be popped from the state stack.
41#       |                |
42#       |                |--- When making the state transition specified on this line, advance to the next
43#       |                     character from the input only if 'n' appears here.
44#       |
45#       |--- Character or named character classes to test for.  If the current character being scanned
46#            matches, perform the actions and go to the state specified on this line.
47#            The input character is tested sequentially, in the order written.  The characters and
48#            character classes tested for do not need to be mutually exclusive.  The first match wins.
49#
50
51
52
53
54#
55#  start state, scan position is at the beginning of the rules file, or in between two rules.
56#
57start:
58    escaped                term                  ^break-rule-end    doExprStart     # escaped char begins a rule; left unconsumed for term
59    white_space          n start                                                  # skip whitespace between rules
60    '$'                    scan-var-name         ^assign-or-rule    doExprStart     # '$' left unconsumed; assign-or-rule resumes after the name is scanned
61    '!'                  n rev-option                                             # consume '!'; rev-option checks for a second '!'
62    ';'                  n start                                                  # ignore empty rules.
63    eof                    exit                                                   # 'exit' is not a state defined in this file; presumably handled by the state machine itself - confirm
64    default                term                  ^break-rule-end    doExprStart     # anything else begins a rule; break-rule-end resumes when the expression pops
65
66#
67#  break-rule-end:  Returned from doing a break-rule expression.
68#
69break-rule-end:
70    ';'	                 n start                                    doEndOfRule       # consume the terminating ';' and go back to start
71    white_space          n break-rule-end                                            # skip whitespace ahead of the ';'
72    default                errorDeath                               doRuleError       # anything other than ';' after the expression is an error
73
74
75#
76#   !               We've just scanned a '!', indicating either a !!key word flag or a
77#                   !Reverse rule.
78#
79rev-option:
80    '!'                  n option-scan1                                              # second '!' consumed: scan a "!!" option keyword
81    default                reverse-rule           ^break-rule-end   doReverseDir      # single '!': reverse rule; current char is not consumed
82
83option-scan1:
84    name_start_char      n option-scan2                             doOptionStart     # consume the first character of the option name
85    default                errorDeath                               doRuleError       # "!!" not followed by a name-start character
86
87option-scan2:
88    name_char            n option-scan2                                              # consume the remaining characters of the option name
89    default                option-scan3                             doOptionEnd       # first non-name char ends the name; it is left for option-scan3
90
91option-scan3:
92    ';'                  n start                                                     # consume the ';' that ends the option; back to start
93    white_space          n option-scan3                                              # skip whitespace between the option name and ';'
94    default                errorDeath                               doRuleError       # only whitespace and ';' may follow the option name
95
96
97reverse-rule:
98    default                term                   ^break-rule-end   doExprStart       # unconditional, no char consumed: parse the rule body.  NOTE(review): rev-option pushed break-rule-end before coming here, so this pushes a second copy - confirm this is intended
99
100
101#
102#  term.  Eat through a single rule character, or a composite thing, which
103#         could be a parenthesized expression, a variable name, or a Unicode Set.
104#
105term:
106    escaped              n expr-mod                                 doRuleChar        # escaped character is consumed as a single-character term
107    white_space          n term                                                      # skip whitespace ahead of the term
108    rule_char            n expr-mod                                 doRuleChar        # plain rule character is consumed as a single-character term
109    '['                    scan-unicode-set      ^expr-mod                             # '[' left unconsumed for scan-unicode-set; expr-mod resumes on its pop
110    '('                  n term                  ^expr-mod          doLParen          # consume '(' and parse a nested term; pushed expr-mod resumes when the sub-expression pops
111    '$'                    scan-var-name         ^term-var-ref                         # '$' left unconsumed for scan-var-name; term-var-ref resumes on its pop
112    '.'                  n expr-mod                                 doDotAny          # consume '.'
113    default                errorDeath                               doRuleError       # current character cannot start a term
114
115
116
117#
118#  term-var-ref   We've just finished scanning a reference to a $variable.
119#                 Check that the variable was defined.
120#                 The variable name scanning is in common with assignment statements,
121#                 so the check can't be done there.
122term-var-ref:
123    default                expr-mod                                 doCheckVarDef     # unconditional; no input is consumed by this transition
124
125
126#
127#   expr-mod      We've just finished scanning a term, now look for the optional
128#                 trailing '*', '?', '+'
129#
130expr-mod:
131    white_space          n  expr-mod                                                 # skip whitespace between the term and an optional operator
132    '*'                  n  expr-cont                               doUnaryOpStar     # consume trailing '*'
133    '+'                  n  expr-cont                               doUnaryOpPlus     # consume trailing '+'
134    '?'                  n  expr-cont                               doUnaryOpQuestion # consume trailing '?'
135    default                 expr-cont                                                 # no trailing operator; char left for expr-cont
136
137
138#
139#  expr-cont      Expression, continuation.  At a point where additional terms are
140#                                            allowed, but not required.
141#
142expr-cont:
143    escaped                 term                                    doExprCatOperator # another term follows; char left for term to consume
144    white_space          n  expr-cont                                                 # skip whitespace between terms
145    rule_char               term                                    doExprCatOperator # another term follows; char left for term to consume
146    '['                     term                                    doExprCatOperator # another term follows; '[' left for term
147    '('                     term                                    doExprCatOperator # another term follows; '(' left for term
148    '$'                     term                                    doExprCatOperator # another term follows; '$' left for term
149    '.'                     term                                    doExprCatOperator # another term follows; '.' left for term
150    '/'                     look-ahead                              doExprCatOperator # '/' left unconsumed; look-ahead consumes it
151    '{'                  n  tag-open                                doExprCatOperator # consume '{' and scan a tag
152    '|'                  n  term                                    doExprOrOperator  # consume '|' and parse the next term
153    ')'                  n  pop                                     doExprRParen      # consume ')' and resume the state on top of the state stack
154    default                 pop                                     doExprFinished    # char cannot continue the expression; resume the pushed state without consuming it
155
156
157#
158#   look-ahead    Scanning a '/', which identifies a break point, assuming that the
159#                 remainder of the expression matches.
160#
161#                 Generate a parse tree as if this was a special kind of input symbol
162#                 appearing in an otherwise normal concatenation expression.
163#
164look-ahead:
165    '/'                   n expr-cont-no-slash                      doSlash           # consume the '/' that the previous state left in the input
166    default                 errorDeath                                                # no action named on this row; the states that come here do so only on '/'
167
168
169#
170#  expr-cont-no-slash    Expression, continuation.  At a point where additional terms are
171#                                            allowed, but not required.  Just like
172#                                            expr-cont, above, except that no '/'
173#                                            look-ahead symbol is permitted.
174#
175expr-cont-no-slash:
176    escaped                 term                                    doExprCatOperator # another term follows; char left for term to consume
177    white_space          n  expr-cont-no-slash                                        # skip whitespace but stay in this state, so '/' stays disallowed after whitespace too
178    rule_char               term                                    doExprCatOperator # another term follows; char left for term to consume
179    '['                     term                                    doExprCatOperator # another term follows; '[' left for term
180    '('                     term                                    doExprCatOperator # another term follows; '(' left for term
181    '$'                     term                                    doExprCatOperator # another term follows; '$' left for term
182    '.'                     term                                    doExprCatOperator # another term follows; '.' left for term
183    '|'                  n  term                                    doExprOrOperator  # consume '|' and parse the next term
184    ')'                  n  pop                                     doExprRParen      # consume ')' and resume the state on top of the state stack
185    default                 pop                                     doExprFinished    # includes '/' and '{': resume the pushed state without consuming the char
186
187
188#
189#   tags             scanning a '{', the opening delimiter for a tag that identifies
190#                    the kind of match.  Scan the whole {dddd} tag, where d=digit
191#
192tag-open:
193    white_space          n  tag-open                                                  # skip whitespace after the '{'
194    digit_char              tag-value                               doStartTagValue   # first digit is left unconsumed; tag-value consumes it
195    default                 errorDeath                              doTagExpectedError # '{' not followed by a digit
196
197tag-value:
198    white_space          n  tag-close                                                 # whitespace ends the digit run; only '}' may follow
199    '}'                     tag-close                                                 # '}' left unconsumed; tag-close consumes it
200    digit_char           n  tag-value                               doTagDigit        # consume one digit of the tag value
201    default                 errorDeath                              doTagExpectedError # any other character inside the tag is an error
202
203tag-close:
204    white_space          n  tag-close                                                 # skip whitespace ahead of the closing '}'
205    '}'                  n  expr-cont-no-tag                        doTagValue        # consume '}'; continue the expression with tags disallowed
206    default                 errorDeath                              doTagExpectedError # anything other than '}' here is an error
207
208
209
210#
211#  expr-cont-no-tag    Expression, continuation.  At a point where additional terms are
212#                                            allowed, but not required.  Just like
213#                                            expr-cont, above, except that no "{ddd}"
214#                                            tagging is permitted.
215#
216expr-cont-no-tag:
217    escaped                 term                                    doExprCatOperator # another term follows; char left for term to consume
218    white_space          n  expr-cont-no-tag                                          # skip whitespace, staying in this state
219    rule_char               term                                    doExprCatOperator # another term follows; char left for term to consume
220    '['                     term                                    doExprCatOperator # another term follows; '[' left for term
221    '('                     term                                    doExprCatOperator # another term follows; '(' left for term
222    '$'                     term                                    doExprCatOperator # another term follows; '$' left for term
223    '.'                     term                                    doExprCatOperator # another term follows; '.' left for term
224    '/'                     look-ahead                              doExprCatOperator # '/' left unconsumed; look-ahead consumes it
225    '|'                  n  term                                    doExprOrOperator  # consume '|' and parse the next term
226    ')'                  n  pop                                     doExprRParen      # consume ')' and resume the state on top of the state stack
227    default                 pop                                     doExprFinished    # includes '{': resume the pushed state without consuming the char
228
229
230
231
232#
233#   Variable Name Scanning.
234#
235#                    The state that branched to here must have pushed a return state
236#                    to go to after completion of the variable name scanning.
237#
238#                    The current input character must be the $ that introduces the name.
239#                    The $ is consumed here rather than in the state that first detected it
240#                    so that the doStartVariableName action only needs to happen in one
241#                    place (here), and the other states don't need to worry about it.
242#
243scan-var-name:
244   '$'                  n scan-var-start                            doStartVariableName # consume the '$' that introduces the name
245   default                errorDeath                                                   # no action named on this row; the states that come here do so only on '$'
246
247
248scan-var-start:
249    name_start_char      n scan-var-body                                              # consume the first character of the variable name
250    default                errorDeath                               doVariableNameExpectedErr # '$' not followed by a name-start character
251
252scan-var-body:
253    name_char            n scan-var-body                                              # consume the remaining characters of the name
254    default                pop                                      doEndVariableName  # first non-name char ends the name; it is left for the popped state
255
256
257
258#
259#  scan-unicode-set   Unicode Sets are parsed by the UnicodeSet class.
260#                     Within the RBBI parser, after finding the first character
261#                     of a Unicode Set, we just hand the rule input at that
262#                     point off to the Unicode Set constructor, then pick
263#                     up parsing after the close of the set.
264#
265#                     The action for this state invokes the UnicodeSet parser.
266#
267scan-unicode-set:
268    '['                   n pop                                      doScanUnicodeSet  # resume the pushed state once the action has run
269    'p'                   n pop                                      doScanUnicodeSet  # NOTE(review): the only transition into this state visible in this file is on '[' - confirm whether this row is reachable
270    'P'                   n pop                                      doScanUnicodeSet  # NOTE(review): same as the 'p' row - confirm whether reachable
271    default		    errorDeath                                                   # no action named on this row
272
273
274
275
276
277
278
279#
280#  assign-or-rule.   A $variable was encountered at the start of something, could be
281#                    either an assignment statement or a rule, depending on whether an '='
282#                    follows the variable name.  We get to this state when the variable name
283#                    scanning does a return.
284#
285assign-or-rule:
286    white_space          n assign-or-rule                                             # skip whitespace between the variable name and what follows
287    '='                  n term                  ^assign-end        doStartAssign   # variable was target of assignment
288    default                term-var-ref          ^break-rule-end                    # variable was a term in a rule
289
290
291
292#
293#  assign-end        This state is entered when the end of the expression on the
294#                    right hand side of an assignment is found.  We get here via
295#                    a pop; this state is pushed when the '=' in an assignment is found.
296#
297#                    The only thing allowed at this point is a ';'.  The RHS of an
298#                    assignment must look like a rule expression, and we come here
299#                    when what is being scanned no longer looks like an expression.
300#
301assign-end:
302    ';'                  n start                                    doEndAssign       # consume the ';' that ends the assignment; back to start
303    default                errorDeath                               doRuleErrorAssignExpr # right-hand side was not followed by ';'
304
305
306
307#
308# errorDeath.   This state is specified as the next state whenever a syntax error
309#               in the source rules is detected.  Barring bugs, the state machine will never
310#               actually get here, but will stop because of the action associated with the error.
311#               But, just in case, this state asks the state machine to exit.
312errorDeath:
313    default              n errorDeath                               doExit            # on any input: advance, stay in this state, and run doExit
314
315
316