1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2003-2016, International Business Machines Corporation and others. All Rights Reserved. 6 ******************************************************************************* 7 */ 8 9 package com.ibm.icu.text; 10 11 import java.text.ParsePosition; 12 import java.util.HashMap; 13 14 import com.ibm.icu.impl.Assert; 15 import com.ibm.icu.impl.Utility; 16 import com.ibm.icu.lang.UCharacter; 17 import com.ibm.icu.lang.UProperty; 18 19 /** 20 * This class is part of the Rule Based Break Iterator rule compiler. 21 * It scans the rules and builds the parse tree. 22 * There is no public API here. 23 */ 24 class RBBIRuleScanner { 25 26 private final static int kStackSize = 100; // The size of the state stack for 27 // rules parsing. Corresponds roughly 28 // to the depth of parentheses nesting 29 // that is allowed in the rules. 30 31 static class RBBIRuleChar { 32 int fChar; 33 boolean fEscaped; 34 } 35 36 37 38 RBBIRuleBuilder fRB; // The rule builder that we are part of. 39 40 int fScanIndex; // Index of current character being processed 41 // in the rule input string. 42 int fNextIndex; // Index of the next character, which 43 // is the first character not yet scanned. 44 boolean fQuoteMode; // Scan is in a 'quoted region' 45 int fLineNum; // Line number in input file. 46 int fCharNum; // Char position within the line. 47 int fLastChar; // Previous char, needed to count CR-LF 48 // as a single line, not two. 49 50 RBBIRuleChar fC = new RBBIRuleChar(); // Current char for parse state machine 51 // processing. 52 53 54 short fStack[] = new short[kStackSize]; // State stack, holds state pushes 55 int fStackPtr; // and pops as specified in the state 56 // transition rules. 57 58 RBBINode fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created 59 // during the parse of a rule 60 int fNodeStackPtr; 61 62 63 boolean fReverseRule; // True if the rule currently being scanned 64 // is a reverse direction rule (if it 65 // starts with a '!') 66 67 boolean fLookAheadRule; // True if the rule includes a '/' 68 // somewhere within it. 69 70 boolean fNoChainInRule; // True if the current rule starts with a '^'. 71 72 73 RBBISymbolTable fSymbolTable; // symbol table, holds definitions of 74 // $variable symbols. 75 76 HashMap<String, RBBISetTableEl> fSetTable = new HashMap<String, RBBISetTableEl>(); // UnicocodeSet hash table, holds indexes to 77 // the sets created while parsing rules. 78 // The key is the string used for creating 79 // the set. 80 81 UnicodeSet fRuleSets[] = new UnicodeSet[10]; // Unicode Sets that are needed during 82 // the scanning of RBBI rules. The 83 // Indices for these are assigned by the 84 // perl script that builds the state tables. 85 // See rbbirpt.h. 86 87 int fRuleNum; // Counts each rule as it is scanned. 88 89 int fOptionStart; // Input index of start of a !!option 90 // keyword, while being scanned. 91 92 93 // gRuleSet_rule_char_pattern is characters that may appear as literals in patterns without escaping or quoting. 94 static private String gRuleSet_rule_char_pattern = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]"; 95 static private String gRuleSet_name_char_pattern = "[_\\p{L}\\p{N}]"; 96 static private String gRuleSet_digit_char_pattern = "[0-9]"; 97 static private String gRuleSet_name_start_char_pattern = "[_\\p{L}]"; 98 static private String gRuleSet_white_space_pattern = "[\\p{Pattern_White_Space}]"; 99 static private String kAny = "any"; 100 101 102 103 104 //---------------------------------------------------------------------------------------- 105 // 106 // Constructor. 107 // 108 //---------------------------------------------------------------------------------------- RBBIRuleScanner(RBBIRuleBuilder rb)109 RBBIRuleScanner(RBBIRuleBuilder rb) { 110 fRB = rb; 111 fLineNum = 1; 112 113 // 114 // Set up the constant Unicode Sets. 115 // Note: These could be made static and shared among 116 // all instances of RBBIRuleScanners. 117 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128] = new UnicodeSet(gRuleSet_rule_char_pattern); 118 fRuleSets[RBBIRuleParseTable.kRuleSet_white_space - 128] = new UnicodeSet(gRuleSet_white_space_pattern); 119 fRuleSets[RBBIRuleParseTable.kRuleSet_name_char - 128] = new UnicodeSet(gRuleSet_name_char_pattern); 120 fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char - 128] = new UnicodeSet(gRuleSet_name_start_char_pattern); 121 fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char - 128] = new UnicodeSet(gRuleSet_digit_char_pattern); 122 123 fSymbolTable = new RBBISymbolTable(this); 124 } 125 126 //---------------------------------------------------------------------------------------- 127 // 128 // doParseAction Do some action during rule parsing. 129 // Called by the parse state machine. 130 // Actions build the parse tree and Unicode Sets, 131 // and maintain the parse stack for nested expressions. 132 // 133 //---------------------------------------------------------------------------------------- doParseActions(int action)134 boolean doParseActions(int action) { 135 RBBINode n = null; 136 137 boolean returnVal = true; 138 139 switch (action) { 140 141 case RBBIRuleParseTable.doExprStart: 142 pushNewNode(RBBINode.opStart); 143 fRuleNum++; 144 break; 145 146 case RBBIRuleParseTable.doNoChain: 147 // Scanned a '^' while on the rule start state. 148 fNoChainInRule = true; 149 break; 150 151 152 case RBBIRuleParseTable.doExprOrOperator: { 153 fixOpStack(RBBINode.precOpCat); 154 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 155 RBBINode orNode = pushNewNode(RBBINode.opOr); 156 orNode.fLeftChild = operandNode; 157 operandNode.fParent = orNode; 158 } 159 break; 160 161 case RBBIRuleParseTable.doExprCatOperator: 162 // concatenation operator. 163 // For the implicit concatenation of adjacent terms in an expression 164 // that are 165 // not separated by any other operator. Action is invoked between the 166 // actions for the two terms. 167 { 168 fixOpStack(RBBINode.precOpCat); 169 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 170 RBBINode catNode = pushNewNode(RBBINode.opCat); 171 catNode.fLeftChild = operandNode; 172 operandNode.fParent = catNode; 173 } 174 break; 175 176 case RBBIRuleParseTable.doLParen: 177 // Open Paren. 178 // The openParen node is a dummy operation type with a low 179 // precedence, 180 // which has the affect of ensuring that any real binary op that 181 // follows within the parens binds more tightly to the operands than 182 // stuff outside of the parens. 183 pushNewNode(RBBINode.opLParen); 184 break; 185 186 case RBBIRuleParseTable.doExprRParen: 187 fixOpStack(RBBINode.precLParen); 188 break; 189 190 case RBBIRuleParseTable.doNOP: 191 break; 192 193 case RBBIRuleParseTable.doStartAssign: 194 // We've just scanned "$variable = " 195 // The top of the node stack has the $variable ref node. 196 197 // Save the start position of the RHS text in the StartExpression 198 // node 199 // that precedes the $variableReference node on the stack. 200 // This will eventually be used when saving the full $variable 201 // replacement 202 // text as a string. 203 n = fNodeStack[fNodeStackPtr - 1]; 204 n.fFirstPos = fNextIndex; // move past the '=' 205 206 // Push a new start-of-expression node; needed to keep parse of the 207 // RHS expression happy. 208 pushNewNode(RBBINode.opStart); 209 break; 210 211 case RBBIRuleParseTable.doEndAssign: { 212 // We have reached the end of an assignement statement. 213 // Current scan char is the ';' that terminates the assignment. 214 215 // Terminate expression, leaves expression parse tree rooted in TOS 216 // node. 217 fixOpStack(RBBINode.precStart); 218 219 RBBINode startExprNode = fNodeStack[fNodeStackPtr - 2]; 220 RBBINode varRefNode = fNodeStack[fNodeStackPtr - 1]; 221 RBBINode RHSExprNode = fNodeStack[fNodeStackPtr]; 222 223 // Save original text of right side of assignment, excluding the 224 // terminating ';' 225 // in the root of the node for the right-hand-side expression. 226 RHSExprNode.fFirstPos = startExprNode.fFirstPos; 227 RHSExprNode.fLastPos = fScanIndex; 228 // fRB.fRules.extractBetween(RHSExprNode.fFirstPos, 229 // RHSExprNode.fLastPos, RHSExprNode.fText); 230 RHSExprNode.fText = fRB.fRules.substring(RHSExprNode.fFirstPos, 231 RHSExprNode.fLastPos); 232 233 // Expression parse tree becomes l. child of the $variable reference 234 // node. 235 varRefNode.fLeftChild = RHSExprNode; 236 RHSExprNode.fParent = varRefNode; 237 238 // Make a symbol table entry for the $variableRef node. 239 fSymbolTable.addEntry(varRefNode.fText, varRefNode); 240 241 // Clean up the stack. 242 fNodeStackPtr -= 3; 243 break; 244 } 245 246 case RBBIRuleParseTable.doEndOfRule: { 247 fixOpStack(RBBINode.precStart); // Terminate expression, leaves 248 // expression 249 250 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("rtree") >= 0) { 251 printNodeStack("end of rule"); 252 } 253 Assert.assrt(fNodeStackPtr == 1); 254 RBBINode thisRule = fNodeStack[fNodeStackPtr]; 255 256 // If this rule includes a look-ahead '/', add a endMark node to the 257 // expression tree. 258 if (fLookAheadRule) { 259 RBBINode endNode = pushNewNode(RBBINode.endMark); 260 RBBINode catNode = pushNewNode(RBBINode.opCat); 261 fNodeStackPtr -= 2; 262 catNode.fLeftChild = thisRule; 263 catNode.fRightChild = endNode; 264 fNodeStack[fNodeStackPtr] = catNode; 265 endNode.fVal = fRuleNum; 266 endNode.fLookAheadEnd = true; 267 thisRule = catNode; 268 269 // TODO: Disable chaining out of look-ahead (hard break) rules. 270 // The break on rule match is forced, so there is no point in building up 271 // the state table to chain into another rule for a longer match. 272 } 273 274 // Mark this node as being the root of a rule. 275 thisRule.fRuleRoot = true; 276 277 // Flag if chaining into this rule is wanted. 278 // 279 if (fRB.fChainRules && // If rule chaining is enabled globally via !!chain 280 !fNoChainInRule) { // and no '^' chain-in inhibit was on this rule 281 thisRule.fChainIn = true; 282 } 283 284 285 // All rule expressions are ORed together. 286 // The ';' that terminates an expression really just functions as a 287 // '|' with 288 // a low operator precedence. 289 // 290 // Each of the four sets of rules are collected separately. 291 // (forward, reverse, safe_forward, safe_reverse) 292 // OR this rule into the appropriate group of them. 293 // 294 295 int destRules = (fReverseRule ? RBBIRuleBuilder.fSafeRevTree : fRB.fDefaultTree); 296 297 if (fRB.fTreeRoots[destRules] != null) { 298 // This is not the first rule encountered. 299 // OR previous stuff (from *destRules) 300 // with the current rule expression (on the Node Stack) 301 // with the resulting OR expression going to *destRules 302 // 303 thisRule = fNodeStack[fNodeStackPtr]; 304 RBBINode prevRules = fRB.fTreeRoots[destRules]; 305 RBBINode orNode = pushNewNode(RBBINode.opOr); 306 orNode.fLeftChild = prevRules; 307 prevRules.fParent = orNode; 308 orNode.fRightChild = thisRule; 309 thisRule.fParent = orNode; 310 fRB.fTreeRoots[destRules] = orNode; 311 } else { 312 // This is the first rule encountered (for this direction). 313 // Just move its parse tree from the stack to *destRules. 314 fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr]; 315 } 316 fReverseRule = false; // in preparation for the next rule. 317 fLookAheadRule = false; 318 fNoChainInRule = false; 319 fNodeStackPtr = 0; 320 } 321 break; 322 323 case RBBIRuleParseTable.doRuleError: 324 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX); 325 returnVal = false; 326 break; 327 328 case RBBIRuleParseTable.doVariableNameExpectedErr: 329 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX); 330 break; 331 332 // 333 // Unary operands + ? * 334 // These all appear after the operand to which they apply. 335 // When we hit one, the operand (may be a whole sub expression) 336 // will be on the top of the stack. 337 // Unary Operator becomes TOS, with the old TOS as its one child. 338 case RBBIRuleParseTable.doUnaryOpPlus: { 339 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 340 RBBINode plusNode = pushNewNode(RBBINode.opPlus); 341 plusNode.fLeftChild = operandNode; 342 operandNode.fParent = plusNode; 343 } 344 break; 345 346 case RBBIRuleParseTable.doUnaryOpQuestion: { 347 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 348 RBBINode qNode = pushNewNode(RBBINode.opQuestion); 349 qNode.fLeftChild = operandNode; 350 operandNode.fParent = qNode; 351 } 352 break; 353 354 case RBBIRuleParseTable.doUnaryOpStar: { 355 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 356 RBBINode starNode = pushNewNode(RBBINode.opStar); 357 starNode.fLeftChild = operandNode; 358 operandNode.fParent = starNode; 359 } 360 break; 361 362 case RBBIRuleParseTable.doRuleChar: 363 // A "Rule Character" is any single character that is a literal part 364 // of the regular expression. Like a, b and c in the expression "(abc*) 365 // | [:L:]" 366 // These are pretty uncommon in break rules; the terms are more commonly 367 // sets. To keep things uniform, treat these characters like as 368 // sets that just happen to contain only one character. 369 { 370 n = pushNewNode(RBBINode.setRef); 371 String s = String.valueOf((char)fC.fChar); 372 findSetFor(s, n, null); 373 n.fFirstPos = fScanIndex; 374 n.fLastPos = fNextIndex; 375 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 376 break; 377 } 378 379 case RBBIRuleParseTable.doDotAny: 380 // scanned a ".", meaning match any single character. 381 { 382 n = pushNewNode(RBBINode.setRef); 383 findSetFor(kAny, n, null); 384 n.fFirstPos = fScanIndex; 385 n.fLastPos = fNextIndex; 386 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 387 break; 388 } 389 390 case RBBIRuleParseTable.doSlash: 391 // Scanned a '/', which identifies a look-ahead break position in a 392 // rule. 393 n = pushNewNode(RBBINode.lookAhead); 394 n.fVal = fRuleNum; 395 n.fFirstPos = fScanIndex; 396 n.fLastPos = fNextIndex; 397 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 398 fLookAheadRule = true; 399 break; 400 401 case RBBIRuleParseTable.doStartTagValue: 402 // Scanned a '{', the opening delimiter for a tag value within a 403 // rule. 404 n = pushNewNode(RBBINode.tag); 405 n.fVal = 0; 406 n.fFirstPos = fScanIndex; 407 n.fLastPos = fNextIndex; 408 break; 409 410 case RBBIRuleParseTable.doTagDigit: 411 // Just scanned a decimal digit that's part of a tag value 412 { 413 n = fNodeStack[fNodeStackPtr]; 414 int v = UCharacter.digit((char) fC.fChar, 10); 415 n.fVal = n.fVal * 10 + v; 416 break; 417 } 418 419 case RBBIRuleParseTable.doTagValue: 420 n = fNodeStack[fNodeStackPtr]; 421 n.fLastPos = fNextIndex; 422 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 423 break; 424 425 case RBBIRuleParseTable.doTagExpectedError: 426 error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG); 427 returnVal = false; 428 break; 429 430 case RBBIRuleParseTable.doOptionStart: 431 // Scanning a !!option. At the start of string. 432 fOptionStart = fScanIndex; 433 break; 434 435 case RBBIRuleParseTable.doOptionEnd: { 436 String opt = fRB.fRules.substring(fOptionStart, fScanIndex); 437 if (opt.equals("chain")) { 438 fRB.fChainRules = true; 439 } else if (opt.equals("LBCMNoChain")) { 440 fRB.fLBCMNoChain = true; 441 } else if (opt.equals("forward")) { 442 fRB.fDefaultTree = RBBIRuleBuilder.fForwardTree; 443 } else if (opt.equals("reverse")) { 444 fRB.fDefaultTree = RBBIRuleBuilder.fReverseTree; 445 } else if (opt.equals("safe_forward")) { 446 fRB.fDefaultTree = RBBIRuleBuilder.fSafeFwdTree; 447 } else if (opt.equals("safe_reverse")) { 448 fRB.fDefaultTree = RBBIRuleBuilder.fSafeRevTree; 449 } else if (opt.equals("lookAheadHardBreak")) { 450 fRB.fLookAheadHardBreak = true; 451 } else if (opt.equals("quoted_literals_only")) { 452 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128].clear(); 453 } else if (opt.equals("unquoted_literals")) { 454 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128].applyPattern(gRuleSet_rule_char_pattern); 455 } else { 456 error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION); 457 } 458 break; 459 } 460 461 case RBBIRuleParseTable.doReverseDir: 462 fReverseRule = true; 463 break; 464 465 case RBBIRuleParseTable.doStartVariableName: 466 n = pushNewNode(RBBINode.varRef); 467 n.fFirstPos = fScanIndex; 468 break; 469 470 case RBBIRuleParseTable.doEndVariableName: 471 n = fNodeStack[fNodeStackPtr]; 472 if (n == null || n.fType != RBBINode.varRef) { 473 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 474 break; 475 } 476 n.fLastPos = fScanIndex; 477 n.fText = fRB.fRules.substring(n.fFirstPos + 1, n.fLastPos); 478 // Look the newly scanned name up in the symbol table 479 // If there's an entry, set the l. child of the var ref to the 480 // replacement expression. 481 // (We also pass through here when scanning assignments, but no harm 482 // is done, other 483 // than a slight wasted effort that seems hard to avoid. Lookup will 484 // be null) 485 n.fLeftChild = fSymbolTable.lookupNode(n.fText); 486 break; 487 488 case RBBIRuleParseTable.doCheckVarDef: 489 n = fNodeStack[fNodeStackPtr]; 490 if (n.fLeftChild == null) { 491 error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE); 492 returnVal = false; 493 } 494 break; 495 496 case RBBIRuleParseTable.doExprFinished: 497 break; 498 499 case RBBIRuleParseTable.doRuleErrorAssignExpr: 500 error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR); 501 returnVal = false; 502 break; 503 504 case RBBIRuleParseTable.doExit: 505 returnVal = false; 506 break; 507 508 case RBBIRuleParseTable.doScanUnicodeSet: 509 scanSet(); 510 break; 511 512 default: 513 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 514 returnVal = false; 515 break; 516 } 517 return returnVal; 518 } 519 520 //---------------------------------------------------------------------------------------- 521 // 522 // Error Throw and IllegalArgumentException in response to a rule parse 523 // error. 524 // 525 //---------------------------------------------------------------------------------------- error(int e)526 void error(int e) { 527 String s = "Error " + e + " at line " + fLineNum + " column " 528 + fCharNum; 529 IllegalArgumentException ex = new IllegalArgumentException(s); 530 throw ex; 531 532 } 533 534 //---------------------------------------------------------------------------------------- 535 // 536 // fixOpStack The parse stack holds partially assembled chunks of the parse 537 // tree. 538 // An entry on the stack may be as small as a single setRef node, 539 // or as large as the parse tree 540 // for an entire expression (this will be the one item left on the stack 541 // when the parsing of an RBBI rule completes. 542 // 543 // This function is called when a binary operator is encountered. 544 // It looks back up the stack for operators that are not yet associated 545 // with a right operand, and if the precedence of the stacked operator >= 546 // the precedence of the current operator, binds the operand left, 547 // to the previously encountered operator. 548 // 549 //---------------------------------------------------------------------------------------- fixOpStack(int p)550 void fixOpStack(int p) { 551 RBBINode n; 552 // printNodeStack("entering fixOpStack()"); 553 for (;;) { 554 n = fNodeStack[fNodeStackPtr - 1]; // an operator node 555 if (n.fPrecedence == 0) { 556 System.out.print("RBBIRuleScanner.fixOpStack, bad operator node"); 557 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 558 return; 559 } 560 561 if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) { 562 // The most recent operand goes with the current operator, 563 // not with the previously stacked one. 564 break; 565 } 566 // Stack operator is a binary op ( '|' or concatenation) 567 // TOS operand becomes right child of this operator. 568 // Resulting subexpression becomes the TOS operand. 569 n.fRightChild = fNodeStack[fNodeStackPtr]; 570 fNodeStack[fNodeStackPtr].fParent = n; 571 fNodeStackPtr--; 572 // printNodeStack("looping in fixOpStack() "); 573 } 574 575 if (p <= RBBINode.precLParen) { 576 // Scan is at a right paren or end of expression. 577 // The scanned item must match the stack, or else there was an 578 // error. 579 // Discard the left paren (or start expr) node from the stack, 580 // leaving the completed (sub)expression as TOS. 581 if (n.fPrecedence != p) { 582 // Right paren encountered matched start of expression node, or 583 // end of expression matched with a left paren node. 584 error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN); 585 } 586 fNodeStack[fNodeStackPtr - 1] = fNodeStack[fNodeStackPtr]; 587 fNodeStackPtr--; 588 // Delete the now-discarded LParen or Start node. 589 // delete n; 590 } 591 // printNodeStack("leaving fixOpStack()"); 592 } 593 594 //---------------------------------------------------------------------------- 595 // 596 // RBBISetTableEl is an entry in the hash table of UnicodeSets that have 597 // been encountered. The val Node will be of nodetype uset 598 // and contain pointers to the actual UnicodeSets. 599 // The Key is the source string for initializing the set. 600 // 601 // The hash table is used to avoid creating duplicate 602 // unnamed (not $var references) UnicodeSets. 603 // 604 //---------------------------------------------------------------------------- 605 static class RBBISetTableEl { 606 String key; 607 608 RBBINode val; 609 } 610 611 612 //---------------------------------------------------------------------------------------- 613 // 614 // findSetFor given a String, 615 // - find the corresponding Unicode Set (uset node) 616 // (create one if necessary) 617 // - Set fLeftChild of the caller's node (should be a setRef node) 618 // to the uset node 619 // Maintain a hash table of uset nodes, so the same one is always used 620 // for the same string. 621 // If a "to adopt" set is provided and we haven't seen this key before, 622 // add the provided set to the hash table. 623 // If the string is one (32 bit) char in length, the set contains 624 // just one element which is the char in question. 625 // If the string is "any", return a set containing all chars. 626 // 627 //---------------------------------------------------------------------------------------- findSetFor(String s, RBBINode node, UnicodeSet setToAdopt)628 void findSetFor(String s, RBBINode node, UnicodeSet setToAdopt) { 629 630 RBBISetTableEl el; 631 632 // First check whether we've already cached a set for this string. 633 // If so, just use the cached set in the new node. 634 // delete any set provided by the caller, since we own it. 635 el = fSetTable.get(s); 636 if (el != null) { 637 node.fLeftChild = el.val; 638 Assert.assrt(node.fLeftChild.fType == RBBINode.uset); 639 return; 640 } 641 642 // Haven't seen this set before. 643 // If the caller didn't provide us with a prebuilt set, 644 // create a new UnicodeSet now. 645 if (setToAdopt == null) { 646 if (s.equals(kAny)) { 647 setToAdopt = new UnicodeSet(0x000000, 0x10ffff); 648 } else { 649 int c; 650 c = UTF16.charAt(s, 0); 651 setToAdopt = new UnicodeSet(c, c); 652 } 653 } 654 655 // 656 // Make a new uset node to refer to this UnicodeSet 657 // This new uset node becomes the child of the caller's setReference 658 // node. 659 // 660 RBBINode usetNode = new RBBINode(RBBINode.uset); 661 usetNode.fInputSet = setToAdopt; 662 usetNode.fParent = node; 663 node.fLeftChild = usetNode; 664 usetNode.fText = s; 665 666 // 667 // Add the new uset node to the list of all uset nodes. 668 // 669 fRB.fUSetNodes.add(usetNode); 670 671 // 672 // Add the new set to the set hash table. 673 // 674 el = new RBBISetTableEl(); 675 el.key = s; 676 el.val = usetNode; 677 fSetTable.put(el.key, el); 678 679 return; 680 } 681 682 // 683 // Assorted Unicode character constants. 684 // Numeric because there is no portable way to enter them as literals. 685 // (Think EBCDIC). 686 // 687 static final int chNEL = 0x85; // NEL newline variant 688 689 static final int chLS = 0x2028; // Unicode Line Separator 690 691 //---------------------------------------------------------------------------------------- 692 // 693 // stripRules Return a rules string without unnecessary 694 // characters. 695 // 696 //---------------------------------------------------------------------------------------- stripRules(String rules)697 static String stripRules(String rules) { 698 StringBuilder strippedRules = new StringBuilder(); 699 int rulesLength = rules.length(); 700 boolean skippingSpaces = false; 701 702 for (int idx = 0; idx < rulesLength; idx = rules.offsetByCodePoints(idx, 1)) { 703 int cp = rules.codePointAt(idx); 704 boolean whiteSpace = UCharacter.hasBinaryProperty(cp, UProperty.PATTERN_WHITE_SPACE); 705 if (skippingSpaces && whiteSpace) { 706 continue; 707 } 708 strippedRules.appendCodePoint(cp); 709 skippingSpaces = whiteSpace; 710 } 711 return strippedRules.toString(); 712 } 713 714 //---------------------------------------------------------------------------------------- 715 // 716 // nextCharLL Low Level Next Char from rule input source. 717 // Get a char from the input character iterator, 718 // keep track of input position for error reporting. 719 // 720 //---------------------------------------------------------------------------------------- nextCharLL()721 int nextCharLL() { 722 int ch; 723 724 if (fNextIndex >= fRB.fRules.length()) { 725 return -1; 726 } 727 ch = UTF16.charAt(fRB.fRules, fNextIndex); 728 fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1); 729 730 if (ch == '\r' || 731 ch == chNEL || 732 ch == chLS || 733 ch == '\n' && fLastChar != '\r') { 734 // Character is starting a new line. Bump up the line number, and 735 // reset the column to 0. 736 fLineNum++; 737 fCharNum = 0; 738 if (fQuoteMode) { 739 error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING); 740 fQuoteMode = false; 741 } 742 } else { 743 // Character is not starting a new line. Except in the case of a 744 // LF following a CR, increment the column position. 745 if (ch != '\n') { 746 fCharNum++; 747 } 748 } 749 fLastChar = ch; 750 return ch; 751 } 752 753 //--------------------------------------------------------------------------------- 754 // 755 // nextChar for rules scanning. At this level, we handle stripping 756 // out comments and processing backslash character escapes. 757 // The rest of the rules grammar is handled at the next level up. 758 // 759 //--------------------------------------------------------------------------------- nextChar(RBBIRuleChar c)760 void nextChar(RBBIRuleChar c) { 761 762 // Unicode Character constants needed for the processing done by nextChar(), 763 // in hex because literals wont work on EBCDIC machines. 764 765 fScanIndex = fNextIndex; 766 c.fChar = nextCharLL(); 767 c.fEscaped = false; 768 769 // 770 // check for '' sequence. 771 // These are recognized in all contexts, whether in quoted text or not. 772 // 773 if (c.fChar == '\'') { 774 if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') { 775 c.fChar = nextCharLL(); // get nextChar officially so character counts 776 c.fEscaped = true; // stay correct. 777 } else { 778 // Single quote, by itself. 779 // Toggle quoting mode. 780 // Return either '(' or ')', because quotes cause a grouping of the quoted text. 781 fQuoteMode = !fQuoteMode; 782 if (fQuoteMode == true) { 783 c.fChar = '('; 784 } else { 785 c.fChar = ')'; 786 } 787 c.fEscaped = false; // The paren that we return is not escaped. 788 return; 789 } 790 } 791 792 if (fQuoteMode) { 793 c.fEscaped = true; 794 } else { 795 // We are not in a 'quoted region' of the source. 796 // 797 if (c.fChar == '#') { 798 // Start of a comment. Consume the rest of it. 799 // The new-line char that terminates the comment is always returned. 800 // It will be treated as white-space, and serves to break up anything 801 // that might otherwise incorrectly clump together with a comment in 802 // the middle (a variable name, for example.) 803 int commentStart = fScanIndex; 804 for (;;) { 805 c.fChar = nextCharLL(); 806 if (c.fChar == -1 || // EOF 807 c.fChar == '\r' || 808 c.fChar == '\n' || 809 c.fChar == chNEL || 810 c.fChar == chLS) 811 { 812 break; 813 } 814 } 815 for (int i=commentStart; i<fNextIndex-1; ++i) { 816 fRB.fStrippedRules.setCharAt(i, ' '); 817 } 818 } 819 if (c.fChar == -1) { 820 return; 821 } 822 823 // 824 // check for backslash escaped characters. 825 // Use String.unescapeAt() to handle them. 826 // 827 if (c.fChar == '\\') { 828 c.fEscaped = true; 829 int[] unescapeIndex = new int[1]; 830 unescapeIndex[0] = fNextIndex; 831 c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex); 832 if (unescapeIndex[0] == fNextIndex) { 833 error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED); 834 } 835 836 fCharNum += unescapeIndex[0] - fNextIndex; 837 fNextIndex = unescapeIndex[0]; 838 } 839 } 840 // putc(c.fChar, stdout); 841 } 842 843 //--------------------------------------------------------------------------------- 844 // 845 // Parse RBBI rules. The state machine for rules parsing is here. 846 // The state tables are hand-written in the file rbbirpt.txt, 847 // and converted to the form used here by a perl 848 // script rbbicst.pl 849 // 850 //--------------------------------------------------------------------------------- parse()851 void parse() { 852 int state; 853 RBBIRuleParseTable.RBBIRuleTableElement tableEl; 854 855 state = 1; 856 nextChar(fC); 857 // 858 // Main loop for the rule parsing state machine. 859 // Runs once per state transition. 860 // Each time through optionally performs, depending on the state table, 861 // - an advance to the the next input char 862 // - an action to be performed. 863 // - pushing or popping a state to/from the local state return stack. 864 // 865 for (;;) { 866 // Quit if state == 0. This is the normal way to exit the state machine. 867 // 868 if (state == 0) { 869 break; 870 } 871 872 // Find the state table element that matches the input char from the rule, or the 873 // class of the input character. Start with the first table row for this 874 // state, then linearly scan forward until we find a row that matches the 875 // character. The last row for each state always matches all characters, so 876 // the search will stop there, if not before. 877 // 878 tableEl = RBBIRuleParseTable.gRuleParseStateTable[state]; 879 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 880 System.out.println("char, line, col = (\'" + (char) fC.fChar 881 + "\', " + fLineNum + ", " + fCharNum + " state = " 882 + tableEl.fStateName); 883 } 884 885 for (int tableRow = state;; tableRow++) { // loop over the state table rows associated with this state. 886 tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow]; 887 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 888 System.out.print("."); 889 } 890 if (tableEl.fCharClass < 127 && fC.fEscaped == false 891 && tableEl.fCharClass == fC.fChar) { 892 // Table row specified an individual character, not a set, and 893 // the input character is not escaped, and 894 // the input character matched it. 895 break; 896 } 897 if (tableEl.fCharClass == 255) { 898 // Table row specified default, match anything character class. 899 break; 900 } 901 if (tableEl.fCharClass == 254 && fC.fEscaped) { 902 // Table row specified "escaped" and the char was escaped. 903 break; 904 } 905 if (tableEl.fCharClass == 253 && fC.fEscaped 906 && (fC.fChar == 0x50 || fC.fChar == 0x70)) { 907 // Table row specified "escaped P" and the char is either 'p' or 'P'. 908 break; 909 } 910 if (tableEl.fCharClass == 252 && fC.fChar == -1) { 911 // Table row specified eof and we hit eof on the input. 912 break; 913 } 914 915 if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class && 916 fC.fEscaped == false && // char is not escaped && 917 fC.fChar != -1) { // char is not EOF 918 UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128]; 919 if (uniset.contains(fC.fChar)) { 920 // Table row specified a character class, or set of characters, 921 // and the current char matches it. 922 break; 923 } 924 } 925 } 926 927 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 928 System.out.println(""); 929 } 930 // 931 // We've found the row of the state table that matches the current input 932 // character from the rules string. 933 // Perform any action specified by this row in the state table. 934 if (doParseActions(tableEl.fAction) == false) { 935 // Break out of the state machine loop if the 936 // the action signalled some kind of error, or 937 // the action was to exit, occurs on normal end-of-rules-input. 938 break; 939 } 940 941 if (tableEl.fPushState != 0) { 942 fStackPtr++; 943 if (fStackPtr >= kStackSize) { 944 System.out.println("RBBIRuleScanner.parse() - state stack overflow."); 945 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 946 } 947 fStack[fStackPtr] = tableEl.fPushState; 948 } 949 950 if (tableEl.fNextChar) { 951 nextChar(fC); 952 } 953 954 // Get the next state from the table entry, or from the 955 // state stack if the next state was specified as "pop". 956 if (tableEl.fNextState != 255) { 957 state = tableEl.fNextState; 958 } else { 959 state = fStack[fStackPtr]; 960 fStackPtr--; 961 if (fStackPtr < 0) { 962 System.out.println("RBBIRuleScanner.parse() - state stack underflow."); 963 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 964 } 965 } 966 967 } 968 969 // If there are no forward rules throw an error. 970 // 971 if (fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree] == null) { 972 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX); 973 } 974 975 // 976 // Parsing of the input RBBI rules is complete. 977 // We now have a parse tree for the rule expressions 978 // and a list of all UnicodeSets that are referenced. 979 // 980 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("symbols") >= 0) { 981 fSymbolTable.rbbiSymtablePrint(); 982 } 983 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("ptree") >= 0) { 984 System.out.println("Completed Forward Rules Parse Tree..."); 985 fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true); 986 System.out.println("\nCompleted Reverse Rules Parse Tree..."); 987 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true); 988 System.out.println("\nCompleted Safe Point Forward Rules Parse Tree..."); 989 if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) { 990 System.out.println(" -- null -- "); 991 } else { 992 fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true); 993 } 994 System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree..."); 995 if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) { 996 System.out.println(" -- null -- "); 997 } else { 998 fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true); 999 } 1000 } 1001 } 1002 1003 //--------------------------------------------------------------------------------- 1004 // 1005 // printNodeStack for debugging... 1006 // 1007 //--------------------------------------------------------------------------------- 1008 ///CLOVER:OFF printNodeStack(String title)1009 void printNodeStack(String title) { 1010 int i; 1011 System.out.println(title + ". Dumping node stack...\n"); 1012 for (i = fNodeStackPtr; i > 0; i--) { 1013 fNodeStack[i].printTree(true); 1014 } 1015 } 1016 ///CLOVER:ON 1017 1018 //--------------------------------------------------------------------------------- 1019 // 1020 // pushNewNode create a new RBBINode of the specified type and push it 1021 // onto the stack of nodes. 1022 // 1023 //--------------------------------------------------------------------------------- pushNewNode(int nodeType)1024 RBBINode pushNewNode(int nodeType) { 1025 fNodeStackPtr++; 1026 if (fNodeStackPtr >= kStackSize) { 1027 System.out.println("RBBIRuleScanner.pushNewNode - stack overflow."); 1028 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 1029 } 1030 fNodeStack[fNodeStackPtr] = new RBBINode(nodeType); 1031 return fNodeStack[fNodeStackPtr]; 1032 } 1033 1034 //--------------------------------------------------------------------------------- 1035 // 1036 // scanSet Construct a UnicodeSet from the text at the current scan 1037 // position. Advance the scan position to the first character 1038 // after the set. 1039 // 1040 // A new RBBI setref node referring to the set is pushed onto the node 1041 // stack. 1042 // 1043 // The scan position is normally under the control of the state machine 1044 // that controls rule parsing. UnicodeSets, however, are parsed by 1045 // the UnicodeSet constructor, not by the RBBI rule parser. 1046 // 1047 //--------------------------------------------------------------------------------- scanSet()1048 void scanSet() { 1049 UnicodeSet uset = null; 1050 int startPos; 1051 ParsePosition pos = new ParsePosition(fScanIndex); 1052 int i; 1053 1054 startPos = fScanIndex; 1055 try { 1056 uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE); 1057 } catch (Exception e) { // TODO: catch fewer exception types. 1058 // Repackage UnicodeSet errors as RBBI rule builder errors, with location info. 1059 error(RBBIRuleBuilder.U_BRK_MALFORMED_SET); 1060 } 1061 1062 // Verify that the set contains at least one code point. 1063 // 1064 if (uset.isEmpty()) { 1065 // This set is empty. 1066 // Make it an error, because it almost certainly is not what the user wanted. 1067 // Also, avoids having to think about corner cases in the tree manipulation code 1068 // that occurs later on. 1069 // TODO: this shouldn't be an error; it does happen. 1070 error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET); 1071 } 1072 1073 // Advance the RBBI parse postion over the UnicodeSet pattern. 1074 // Don't just set fScanIndex because the line/char positions maintained 1075 // for error reporting would be thrown off. 1076 i = pos.getIndex(); 1077 for (;;) { 1078 if (fNextIndex >= i) { 1079 break; 1080 } 1081 nextCharLL(); 1082 } 1083 1084 RBBINode n; 1085 1086 n = pushNewNode(RBBINode.setRef); 1087 n.fFirstPos = startPos; 1088 n.fLastPos = fNextIndex; 1089 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 1090 // findSetFor() serves several purposes here: 1091 // - Adopts storage for the UnicodeSet, will be responsible for deleting. 1092 // - Mantains collection of all sets in use, needed later for establishing 1093 // character categories for run time engine. 1094 // - Eliminates mulitiple instances of the same set. 1095 // - Creates a new uset node if necessary (if this isn't a duplicate.) 1096 findSetFor(n.fText, n, uset); 1097 } 1098 1099 } 1100 1101