1 2#***************************************************************************** 3# 4# Copyright (C) 2002-2015, International Business Machines Corporation and others. 5# All Rights Reserved. 6# 7#***************************************************************************** 8# 9# file: regexcst.txt 10# ICU Regular Expression Parser State Table 11# 12# This state table is used when reading and parsing a regular expression pattern. 13# The pattern parser uses a state machine; the data in this file define the 14# state transitions that occur for each input character. 15# 16# *** This file defines the regex pattern grammar. This is it. 17# *** The determination of what is accepted is here. 18# 19# This file is processed by a perl script "regexcst.pl" to produce initialized C arrays 20# that are then built with the pattern parser. 21# 22 23# 24# Here is the syntax of the state definitions in this file: 25# 26# 27#StateName: 28# input-char n next-state ^push-state action 29# input-char n next-state ^push-state action 30# | | | | | 31# | | | | |--- action to be performed by state machine 32# | | | | See function RegexCompile::doParseActions() 33# | | | | 34# | | | |--- Push this named state onto the state stack. 35# | | | Later, when next state is specified as "pop", 36# | | | the pushed state will become the current state. 37# | | | 38# | | |--- Transition to this state if the current input character matches the input 39# | | character or char class in the left hand column. "pop" causes the next 40# | | state to be popped from the state stack. 41# | | 42# | |--- When making the state transition specified on this line, advance to the next 43# | character from the input only if 'n' appears here. 44# | 45# |--- Character or named character classes to test for. If the current character being scanned 46# matches, perform the actions and go to the state specified on this line. 47# The input character is tested sequentially, in the order written. 
The characters and 48# character classes tested for do not need to be mutually exclusive. The first match wins. 49# 50 51 52 53 54# 55# start state, scan position is at the beginning of the pattern. 56# 57start: 58 default term doPatStart 59 60 61 62 63# 64# term. At a position where we can accept the start of most items in a pattern. 65# 66term: 67 quoted n expr-quant doLiteralChar 68 rule_char n expr-quant doLiteralChar 69 '[' n set-open ^set-finish doSetBegin 70 '(' n open-paren 71 '.' n expr-quant doDotAny 72 '^' n expr-quant doCaret 73 '$' n expr-quant doDollar 74 '\' n backslash 75 '|' n term doOrOperator 76 ')' n pop doCloseParen 77 eof term doPatFinish 78 default errorDeath doRuleError 79 80 81 82# 83# expr-quant We've just finished scanning a term, now look for the optional 84# trailing quantifier - *, +, ?, *?, etc. 85# 86expr-quant: 87 '*' n quant-star 88 '+' n quant-plus 89 '?' n quant-opt 90 '{' n interval-open doIntervalInit 91 '(' n open-paren-quant 92 default expr-cont 93 94 95# 96# expr-cont Expression, continuation. At a point where additional terms are 97# allowed, but not required. No quantifiers. 98# 99expr-cont: 100 '|' n term doOrOperator 101 ')' n pop doCloseParen 102 default term 103 104 105# 106# open-paren-quant Special case handling for comments appearing before a quantifier, 107# e.g. x(?#comment )* 108# Open parens from expr-quant come here; anything but a (?# comment 109# branches into the normal parenthesis sequence as quickly as possible. 110# 111open-paren-quant: 112 '?' n open-paren-quant2 doSuppressComments 113 default open-paren 114 115open-paren-quant2: 116 '#' n paren-comment ^expr-quant 117 default open-paren-extended 118 119 120# 121# open-paren We've got an open paren. We need to scan further to 122# determine what kind of paren it is - plain (, (?:, (?>, or whatever. 123# 124open-paren: 125 '?' 
n open-paren-extended doSuppressComments 126 default term ^expr-quant doOpenCaptureParen 127 128open-paren-extended: 129 ':' n term ^expr-quant doOpenNonCaptureParen # (?: 130 '>' n term ^expr-quant doOpenAtomicParen # (?> 131 '=' n term ^expr-cont doOpenLookAhead # (?= 132 '!' n term ^expr-cont doOpenLookAheadNeg # (?! 133 '<' n open-paren-lookbehind # (?< 134 '#' n paren-comment ^term # (?# 135 'i' paren-flag doBeginMatchMode 136 'd' paren-flag doBeginMatchMode 137 'm' paren-flag doBeginMatchMode 138 's' paren-flag doBeginMatchMode 139 'u' paren-flag doBeginMatchMode 140 'w' paren-flag doBeginMatchMode 141 'x' paren-flag doBeginMatchMode 142 '-' paren-flag doBeginMatchMode 143 '(' n errorDeath doConditionalExpr # (?( 144 '{' n errorDeath doPerlInline # (?{ 145 default errorDeath doBadOpenParenType 146 147open-paren-lookbehind: 148 '=' n term ^expr-cont doOpenLookBehind # (?<= 149 '!' n term ^expr-cont doOpenLookBehindNeg # (?<! 150 ascii_letter named-capture doBeginNamedCapture # (?<name 151 default errorDeath doBadOpenParenType 152 153 154# 155# paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')' 156# 157paren-comment: 158 ')' n pop 159 eof errorDeath doMismatchedParenErr 160 default n paren-comment 161 162# 163# paren-flag Scanned a (?ismx-ismx style flag setting. Accepted flag letters: i d m s u w x, plus '-'. 164# 165paren-flag: 166 'i' n paren-flag doMatchMode 167 'd' n paren-flag doMatchMode 168 'm' n paren-flag doMatchMode 169 's' n paren-flag doMatchMode 170 'u' n paren-flag doMatchMode 171 'w' n paren-flag doMatchMode 172 'x' n paren-flag doMatchMode 173 '-' n paren-flag doMatchMode 174 ')' n term doSetMatchMode # (?flags) 175 ':' n term ^expr-quant doMatchModeParen # (?flags: 176 default errorDeath doBadModeFlag 177 178# 179# named-capture (?<name> ... ), position currently on the name. 180# 181named-capture: 182 ascii_letter n named-capture doContinueNamedCapture 183 digit_char n named-capture doContinueNamedCapture 184 '>' n term ^expr-quant doOpenCaptureParen # common with non-named capture. 
185 default errorDeath doBadNamedCapture 186 187# 188# quant-star Scanning a '*' quantifier. Need to look ahead to decide 189# between plain '*', '*?', '*+' 190# 191quant-star: 192 '?' n expr-cont doNGStar # *? 193 '+' n expr-cont doPossessiveStar # *+ 194 default expr-cont doStar # * 195 196 197# 198# quant-plus Scanning a '+' quantifier. Need to look ahead to decide 199# between plain '+', '+?', '++' 200# 201quant-plus: 202 '?' n expr-cont doNGPlus # +? 203 '+' n expr-cont doPossessivePlus # ++ 204 default expr-cont doPlus # + 205 206 207# 208# quant-opt Scanning a '?' quantifier. Need to look ahead to decide 209# between plain '?', '??', '?+' 210# 211quant-opt: 212 '?' n expr-cont doNGOpt # ?? 213 '+' n expr-cont doPossessiveOpt # ?+ 214 default expr-cont doOpt # ? 215 216 217# 218# Interval scanning a '{', the opening delimiter for an interval specification 219# {number} or {min,max} or {min,} 220# 221interval-open: 222 digit_char interval-lower 223 default errorDeath doIntervalError 224 225interval-lower: 226 digit_char n interval-lower doIntevalLowerDigit 227 ',' n interval-upper 228 '}' n interval-type doIntervalSame # {n} 229 default errorDeath doIntervalError 230 231interval-upper: 232 digit_char n interval-upper doIntervalUpperDigit 233 '}' n interval-type # {n,} or {n,m} 234 default errorDeath doIntervalError 235 236interval-type: 237 '?' n expr-cont doNGInterval # {n,m}? 238 '+' n expr-cont doPossessiveInterval # {n,m}+ 239 default expr-cont doInterval # {n,m} 240 241 242# 243# backslash # Backslash. Figure out which of the \thingies we have encountered. 244# The low level next-char function will have preprocessed 245# some of them already; those won't come here. 
246backslash: 247 'A' n term doBackslashA 248 'B' n term doBackslashB 249 'b' n term doBackslashb 250 'd' n expr-quant doBackslashd 251 'D' n expr-quant doBackslashD 252 'G' n term doBackslashG 253 'h' n expr-quant doBackslashh 254 'H' n expr-quant doBackslashH 255 'k' n named-backref # \k<name> named back reference 256 'N' expr-quant doNamedChar # \N{NAME} named char 257 'p' expr-quant doProperty # \p{Lu} style property 258 'P' expr-quant doProperty # \P{Lu} form, same action as \p 259 'R' n expr-quant doBackslashR 260 'Q' n term doEnterQuoteMode 261 'S' n expr-quant doBackslashS 262 's' n expr-quant doBackslashs 263 'v' n expr-quant doBackslashv 264 'V' n expr-quant doBackslashV 265 'W' n expr-quant doBackslashW 266 'w' n expr-quant doBackslashw 267 'X' n expr-quant doBackslashX 268 'Z' n term doBackslashZ 269 'z' n term doBackslashz 270 digit_char n expr-quant doBackRef # Will scan multiple digits 271 eof errorDeath doEscapeError 272 default n expr-quant doEscapedLiteralChar 273 274 275# named-backref Scanned \k 276# Leading to \k<captureName> 277# Failure to get the full sequence is an error. 278# 279named-backref: 280 '<' n named-backref-2 doBeginNamedBackRef 281 default errorDeath doBadNamedCapture 282 283named-backref-2: 284 ascii_letter n named-backref-3 doContinueNamedBackRef # first character of the name must be a letter 285 default errorDeath doBadNamedCapture 286 287named-backref-3: 288 ascii_letter n named-backref-3 doContinueNamedBackRef 289 digit_char n named-backref-3 doContinueNamedBackRef 290 '>' n expr-quant doCompleteNamedBackRef 291 default errorDeath doBadNamedCapture 292 293 294# 295# [set expression] parsing, 296# All states involved in parsing set expressions have names beginning with "set-" 297# 298 299set-open: 300 '^' n set-open2 doSetNegate 301 ':' set-posix doSetPosixProp 302 default set-open2 303 304set-open2: 305 ']' n set-after-lit doSetLiteral # a ']' directly after '[' or '[^' is taken as a literal 306 default set-start 307 308# set-posix: 309# scanned a '[:' If it really is a [:property:], doSetPosixProp will have 310# moved the scan to the closing ']'. 
If it wasn't a property 311# expression, the scan will still be at the opening ':', which should 312# be interpreted as a normal set expression. 313set-posix: 314 ']' n pop doSetEnd 315 ':' set-start 316 default errorDeath doRuleError # should not be possible. 317 318# 319# set-start after the [ and special case leading characters (^ and/or ]) but before 320# everything else. A single '-' or '&' is literal at this point. 321# NOTE(review): no eof row here (unlike set-after-lit); eof falls to the default row. Confirm intended. 322set-start: 323 ']' n pop doSetEnd 324 '[' n set-open ^set-after-set doSetBeginUnion 325 '\' n set-escape 326 '-' n set-start-dash 327 '&' n set-start-amp 328 default n set-after-lit doSetLiteral 329 330# set-start-dash Turn "[--" into a syntax error. 331# "[-x" is good, - and x are literals. 332# 333set-start-dash: 334 '-' errorDeath doRuleError 335 default set-after-lit doSetAddDash 336 337# set-start-amp Turn "[&&" into a syntax error. 338# "[&x" is good, & and x are literals. 339# 340set-start-amp: 341 '&' errorDeath doRuleError 342 default set-after-lit doSetAddAmp 343 344# 345# set-after-lit The last thing scanned was a literal character within a set. 346# Can be followed by anything. Single '-' or '&' are 347# literals in this context, not operators. 
348set-after-lit: 349 ']' n pop doSetEnd 350 '[' n set-open ^set-after-set doSetBeginUnion 351 '-' n set-lit-dash 352 '&' n set-lit-amp 353 '\' n set-escape 354 eof errorDeath doSetNoCloseError 355 default n set-after-lit doSetLiteral 356# set-after-set The last thing scanned was a nested [set], or a \p / \P property (see set-escape). 357set-after-set: 358 ']' n pop doSetEnd 359 '[' n set-open ^set-after-set doSetBeginUnion 360 '-' n set-set-dash 361 '&' n set-set-amp 362 '\' n set-escape 363 eof errorDeath doSetNoCloseError 364 default n set-after-lit doSetLiteral 365# set-after-range The last thing scanned was a range, or a \s \w \d \h \v style escape (see set-escape). 366set-after-range: 367 ']' n pop doSetEnd 368 '[' n set-open ^set-after-set doSetBeginUnion 369 '-' n set-range-dash 370 '&' n set-range-amp 371 '\' n set-escape 372 eof errorDeath doSetNoCloseError 373 default n set-after-lit doSetLiteral 374 375 376# set-after-op 377# After a -- or && 378# It is an error to close a set at this point. 379# NOTE(review): no eof row here (unlike set-after-lit/-set/-range); eof falls to the default row. Confirm intended. 380set-after-op: 381 '[' n set-open ^set-after-set doSetBeginUnion 382 ']' errorDeath doSetOpError 383 '\' n set-escape 384 default n set-after-lit doSetLiteral 385 386# 387# set-set-amp 388# Have scanned [[set]& 389# Could be a '&' intersection operator, if a set follows. 390# Could be the start of a '&&' operator. 391# Otherwise is a literal. 392set-set-amp: 393 '[' n set-open ^set-after-set doSetBeginIntersection1 394 '&' n set-after-op doSetIntersection2 395 default set-after-lit doSetAddAmp 396 397 398# set-lit-amp Have scanned "[literals&" 399# Could be a start of "&&" operator or a literal 400# In [abc&[def]], the '&' is a literal 401# 402set-lit-amp: 403 '&' n set-after-op doSetIntersection2 404 default set-after-lit doSetAddAmp 405 406 407# 408# set-set-dash 409# Have scanned [set]- 410# Could be a '-' difference operator, if a [set] follows. 411# Could be the start of a '--' operator. 412# Otherwise is a literal. 
413set-set-dash: 414 '[' n set-open ^set-after-set doSetBeginDifference1 415 '-' n set-after-op doSetDifference2 416 default set-after-lit doSetAddDash 417 418 419# 420# set-range-dash 421# scanned a-b- or \w- 422# any set or range like item where the trailing single '-' should 423# be literal, not a set difference operation. 424# A trailing "--" is still a difference operator. 425set-range-dash: 426 '-' n set-after-op doSetDifference2 427 default set-after-lit doSetAddDash 428 429# set-range-amp scanned a-b& or \w& ; the '&' counterpart of set-range-dash: "&&" goes to set-after-op, anything else goes to set-after-lit. 430set-range-amp: 431 '&' n set-after-op doSetIntersection2 432 default set-after-lit doSetAddAmp 433 434 435# set-lit-dash 436# Have scanned "[literals-" Could be a range or a -- operator or a literal 437# In [abc-[def]], the '-' is a literal (confirmed with a Java test) 438# [abc-\p{xx}] the '-' is an error 439# [abc-] the '-' is a literal 440# [ab-xy] the '-' is a range 441# 442set-lit-dash: 443 '-' n set-after-op doSetDifference2 444 '[' set-after-lit doSetAddDash 445 ']' set-after-lit doSetAddDash 446 '\' n set-lit-dash-escape 447 default n set-after-range doSetRange 448 449# set-lit-dash-escape 450# 451# scanned "[literal-\" 452# Could be a range, if the \ introduces an escaped literal char or a named char. 453# Otherwise it is an error. 
454# NOTE(review): only s S w W d D are rejected below; p P h H v V have no row and fall to the default row (doSetRange). Confirm that is intended. 455set-lit-dash-escape: 456 's' errorDeath doSetOpError 457 'S' errorDeath doSetOpError 458 'w' errorDeath doSetOpError 459 'W' errorDeath doSetOpError 460 'd' errorDeath doSetOpError 461 'D' errorDeath doSetOpError 462 'N' set-after-range doSetNamedRange 463 default n set-after-range doSetRange 464 465 466# 467# set-escape 468# Common back-slash escape processing within set expressions 469# 470set-escape: 471 'p' set-after-set doSetProp 472 'P' set-after-set doSetProp 473 'N' set-after-lit doSetNamedChar 474 's' n set-after-range doSetBackslash_s 475 'S' n set-after-range doSetBackslash_S 476 'w' n set-after-range doSetBackslash_w 477 'W' n set-after-range doSetBackslash_W 478 'd' n set-after-range doSetBackslash_d 479 'D' n set-after-range doSetBackslash_D 480 'h' n set-after-range doSetBackslash_h 481 'H' n set-after-range doSetBackslash_H 482 'v' n set-after-range doSetBackslash_v 483 'V' n set-after-range doSetBackslash_V 484 default n set-after-lit doSetLiteralEscaped 485 486# 487# set-finish 488# Have just encountered the final ']' that completes a [set], and 489# arrived here via a pop. From here, we exit the set parsing world, and go 490# back to generic regular expression parsing. 491# 492set-finish: 493 default expr-quant doSetFinish 494 495 496# 497# errorDeath. This state is specified as the next state whenever a syntax error 498# in the pattern is detected. Barring bugs, the state machine will never 499# actually get here, but will stop because of the action associated with the error. 500# But, just in case, this state asks the state machine to exit. 501errorDeath: 502 default n errorDeath doExit 503 504 505