1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  * Copyright (C) 2003-2016, International Business Machines Corporation and others. All Rights Reserved.
6  *******************************************************************************
7  */
8 
9 package com.ibm.icu.text;
10 
11 import java.text.ParsePosition;
12 import java.util.HashMap;
13 
14 import com.ibm.icu.impl.Assert;
15 import com.ibm.icu.impl.Utility;
16 import com.ibm.icu.lang.UCharacter;
17 import com.ibm.icu.lang.UProperty;
18 
19 /**
20   *  This class is part of the Rule Based Break Iterator rule compiler.
21   *  It scans the rules and builds the parse tree.
22   *  There is no public API here.
23   */
24 class RBBIRuleScanner {
25 
26     private final static int    kStackSize = 100;               // The size of the state stack for
27     //   rules parsing.  Corresponds roughly
28     //   to the depth of parentheses nesting
29     //   that is allowed in the rules.
30 
31     static class RBBIRuleChar {
32         int             fChar;
33         boolean         fEscaped;
34     }
35 
36 
37 
38     RBBIRuleBuilder               fRB;              // The rule builder that we are part of.
39 
40     int                       fScanIndex;        // Index of current character being processed
41                                                      //   in the rule input string.
42     int                       fNextIndex;        // Index of the next character, which
43                                                      //   is the first character not yet scanned.
44     boolean                  fQuoteMode;        // Scan is in a 'quoted region'
45     int                       fLineNum;          // Line number in input file.
46     int                       fCharNum;          // Char position within the line.
47     int                       fLastChar;         // Previous char, needed to count CR-LF
48                                                      //   as a single line, not two.
49 
50     RBBIRuleChar              fC = new RBBIRuleChar();    // Current char for parse state machine
51                                                      //   processing.
52 
53 
54     short  fStack[] = new short[kStackSize];  // State stack, holds state pushes
55     int                       fStackPtr;           //  and pops as specified in the state
56                                                        //  transition rules.
57 
58     RBBINode  fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created
59                                                            //  during the parse of a rule
60     int                        fNodeStackPtr;
61 
62 
63     boolean                    fReverseRule;         // True if the rule currently being scanned
64                                                      //  is a reverse direction rule (if it
65                                                      //  starts with a '!')
66 
67     boolean                    fLookAheadRule;       // True if the rule includes a '/'
68                                                      //   somewhere within it.
69 
70     boolean                    fNoChainInRule;       // True if the current rule starts with a '^'.
71 
72 
73     RBBISymbolTable            fSymbolTable;         // symbol table, holds definitions of
74                                                      //   $variable symbols.
75 
76     HashMap<String, RBBISetTableEl> fSetTable = new HashMap<String, RBBISetTableEl>(); // UnicocodeSet hash table, holds indexes to
77                                                                                        //   the sets created while parsing rules.
78                                                                                        //   The key is the string used for creating
79                                                                                        //   the set.
80 
81     UnicodeSet      fRuleSets[] = new UnicodeSet[10];    // Unicode Sets that are needed during
82                                                      //  the scanning of RBBI rules.  The
83                                                      //  Indices for these are assigned by the
84                                                      //  perl script that builds the state tables.
85                                                      //  See rbbirpt.h.
86 
87     int                        fRuleNum;         // Counts each rule as it is scanned.
88 
89     int                        fOptionStart;     // Input index of start of a !!option
90                                                  //   keyword, while being scanned.
91 
92 
93    // gRuleSet_rule_char_pattern is characters that may appear as literals in patterns without escaping or quoting.
94    static private String gRuleSet_rule_char_pattern       = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]";
95    static private String gRuleSet_name_char_pattern       = "[_\\p{L}\\p{N}]";
96    static private String gRuleSet_digit_char_pattern      = "[0-9]";
97    static private String gRuleSet_name_start_char_pattern = "[_\\p{L}]";
98    static private String gRuleSet_white_space_pattern     = "[\\p{Pattern_White_Space}]";
99    static private String kAny =  "any";
100 
101 
102 
103 
104     //----------------------------------------------------------------------------------------
105     //
106     //  Constructor.
107     //
108     //----------------------------------------------------------------------------------------
RBBIRuleScanner(RBBIRuleBuilder rb)109     RBBIRuleScanner(RBBIRuleBuilder rb) {
110         fRB = rb;
111         fLineNum = 1;
112 
113         //
114         //  Set up the constant Unicode Sets.
115         //     Note: These could be made static and shared among
116         //            all instances of RBBIRuleScanners.
117         fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128] = new UnicodeSet(gRuleSet_rule_char_pattern);
118         fRuleSets[RBBIRuleParseTable.kRuleSet_white_space - 128] = new UnicodeSet(gRuleSet_white_space_pattern);
119         fRuleSets[RBBIRuleParseTable.kRuleSet_name_char - 128] = new UnicodeSet(gRuleSet_name_char_pattern);
120         fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char - 128] = new UnicodeSet(gRuleSet_name_start_char_pattern);
121         fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char - 128] = new UnicodeSet(gRuleSet_digit_char_pattern);
122 
123         fSymbolTable = new RBBISymbolTable(this);
124     }
125 
126     //----------------------------------------------------------------------------------------
127     //
128     //  doParseAction Do some action during rule parsing.
129     //                       Called by the parse state machine.
130     //                       Actions build the parse tree and Unicode Sets,
131     //                       and maintain the parse stack for nested expressions.
132     //
133     //----------------------------------------------------------------------------------------
doParseActions(int action)134     boolean doParseActions(int action) {
135         RBBINode n = null;
136 
137         boolean returnVal = true;
138 
139         switch (action) {
140 
141         case RBBIRuleParseTable.doExprStart:
142             pushNewNode(RBBINode.opStart);
143             fRuleNum++;
144             break;
145 
146         case RBBIRuleParseTable.doNoChain:
147             // Scanned a '^' while on the rule start state.
148             fNoChainInRule = true;
149             break;
150 
151 
152         case RBBIRuleParseTable.doExprOrOperator: {
153             fixOpStack(RBBINode.precOpCat);
154             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
155             RBBINode orNode = pushNewNode(RBBINode.opOr);
156             orNode.fLeftChild = operandNode;
157             operandNode.fParent = orNode;
158         }
159             break;
160 
161         case RBBIRuleParseTable.doExprCatOperator:
162         // concatenation operator.
163         // For the implicit concatenation of adjacent terms in an expression
164         // that are
165         //   not separated by any other operator. Action is invoked between the
166         //   actions for the two terms.
167         {
168             fixOpStack(RBBINode.precOpCat);
169             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
170             RBBINode catNode = pushNewNode(RBBINode.opCat);
171             catNode.fLeftChild = operandNode;
172             operandNode.fParent = catNode;
173         }
174             break;
175 
176         case RBBIRuleParseTable.doLParen:
177             // Open Paren.
178             //   The openParen node is a dummy operation type with a low
179             // precedence,
180             //     which has the affect of ensuring that any real binary op that
181             //     follows within the parens binds more tightly to the operands than
182             //     stuff outside of the parens.
183             pushNewNode(RBBINode.opLParen);
184             break;
185 
186         case RBBIRuleParseTable.doExprRParen:
187             fixOpStack(RBBINode.precLParen);
188             break;
189 
190         case RBBIRuleParseTable.doNOP:
191             break;
192 
193         case RBBIRuleParseTable.doStartAssign:
194             // We've just scanned "$variable = "
195             // The top of the node stack has the $variable ref node.
196 
197             // Save the start position of the RHS text in the StartExpression
198             // node
199             //   that precedes the $variableReference node on the stack.
200             //   This will eventually be used when saving the full $variable
201             // replacement
202             //   text as a string.
203             n = fNodeStack[fNodeStackPtr - 1];
204             n.fFirstPos = fNextIndex; // move past the '='
205 
206             // Push a new start-of-expression node; needed to keep parse of the
207             //   RHS expression happy.
208             pushNewNode(RBBINode.opStart);
209             break;
210 
211         case RBBIRuleParseTable.doEndAssign: {
212             // We have reached the end of an assignement statement.
213             //   Current scan char is the ';' that terminates the assignment.
214 
215             // Terminate expression, leaves expression parse tree rooted in TOS
216             // node.
217             fixOpStack(RBBINode.precStart);
218 
219             RBBINode startExprNode = fNodeStack[fNodeStackPtr - 2];
220             RBBINode varRefNode = fNodeStack[fNodeStackPtr - 1];
221             RBBINode RHSExprNode = fNodeStack[fNodeStackPtr];
222 
223             // Save original text of right side of assignment, excluding the
224             // terminating ';'
225             //  in the root of the node for the right-hand-side expression.
226             RHSExprNode.fFirstPos = startExprNode.fFirstPos;
227             RHSExprNode.fLastPos = fScanIndex;
228             // fRB.fRules.extractBetween(RHSExprNode.fFirstPos,
229             // RHSExprNode.fLastPos, RHSExprNode.fText);
230             RHSExprNode.fText = fRB.fRules.substring(RHSExprNode.fFirstPos,
231                     RHSExprNode.fLastPos);
232 
233             // Expression parse tree becomes l. child of the $variable reference
234             // node.
235             varRefNode.fLeftChild = RHSExprNode;
236             RHSExprNode.fParent = varRefNode;
237 
238             // Make a symbol table entry for the $variableRef node.
239             fSymbolTable.addEntry(varRefNode.fText, varRefNode);
240 
241             // Clean up the stack.
242             fNodeStackPtr -= 3;
243             break;
244         }
245 
246         case RBBIRuleParseTable.doEndOfRule: {
247             fixOpStack(RBBINode.precStart); // Terminate expression, leaves
248                                             // expression
249 
250             if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("rtree") >= 0) {
251                 printNodeStack("end of rule");
252             }
253             Assert.assrt(fNodeStackPtr == 1);
254             RBBINode thisRule = fNodeStack[fNodeStackPtr];
255 
256             // If this rule includes a look-ahead '/', add a endMark node to the
257             //   expression tree.
258             if (fLookAheadRule) {
259                 RBBINode endNode = pushNewNode(RBBINode.endMark);
260                 RBBINode catNode = pushNewNode(RBBINode.opCat);
261                 fNodeStackPtr -= 2;
262                 catNode.fLeftChild = thisRule;
263                 catNode.fRightChild = endNode;
264                 fNodeStack[fNodeStackPtr] = catNode;
265                 endNode.fVal = fRuleNum;
266                 endNode.fLookAheadEnd = true;
267                 thisRule = catNode;
268 
269                 // TODO: Disable chaining out of look-ahead (hard break) rules.
270                 //   The break on rule match is forced, so there is no point in building up
271                 //   the state table to chain into another rule for a longer match.
272             }
273 
274             // Mark this node as being the root of a rule.
275             thisRule.fRuleRoot = true;
276 
277             // Flag if chaining into this rule is wanted.
278             //
279             if (fRB.fChainRules &&          // If rule chaining is enabled globally via !!chain
280                     !fNoChainInRule) {      //     and no '^' chain-in inhibit was on this rule
281                 thisRule.fChainIn = true;
282             }
283 
284 
285             // All rule expressions are ORed together.
286             // The ';' that terminates an expression really just functions as a
287             // '|' with
288             //   a low operator precedence.
289             //
290             // Each of the four sets of rules are collected separately.
291             //  (forward, reverse, safe_forward, safe_reverse)
292             //  OR this rule into the appropriate group of them.
293             //
294 
295             int destRules = (fReverseRule ? RBBIRuleBuilder.fSafeRevTree : fRB.fDefaultTree);
296 
297             if (fRB.fTreeRoots[destRules] != null) {
298                 // This is not the first rule encountered.
299                 // OR previous stuff (from *destRules)
300                 // with the current rule expression (on the Node Stack)
301                 //  with the resulting OR expression going to *destRules
302                 //
303                 thisRule = fNodeStack[fNodeStackPtr];
304                 RBBINode prevRules = fRB.fTreeRoots[destRules];
305                 RBBINode orNode = pushNewNode(RBBINode.opOr);
306                 orNode.fLeftChild = prevRules;
307                 prevRules.fParent = orNode;
308                 orNode.fRightChild = thisRule;
309                 thisRule.fParent = orNode;
310                 fRB.fTreeRoots[destRules] = orNode;
311             } else {
312                 // This is the first rule encountered (for this direction).
313                 // Just move its parse tree from the stack to *destRules.
314                 fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr];
315             }
316             fReverseRule = false; // in preparation for the next rule.
317             fLookAheadRule = false;
318             fNoChainInRule = false;
319             fNodeStackPtr = 0;
320         }
321             break;
322 
323         case RBBIRuleParseTable.doRuleError:
324             error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
325             returnVal = false;
326             break;
327 
328         case RBBIRuleParseTable.doVariableNameExpectedErr:
329             error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
330             break;
331 
332         //
333         //  Unary operands + ? *
334         //    These all appear after the operand to which they apply.
335         //    When we hit one, the operand (may be a whole sub expression)
336         //    will be on the top of the stack.
337         //    Unary Operator becomes TOS, with the old TOS as its one child.
338         case RBBIRuleParseTable.doUnaryOpPlus: {
339             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
340             RBBINode plusNode = pushNewNode(RBBINode.opPlus);
341             plusNode.fLeftChild = operandNode;
342             operandNode.fParent = plusNode;
343         }
344             break;
345 
346         case RBBIRuleParseTable.doUnaryOpQuestion: {
347             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
348             RBBINode qNode = pushNewNode(RBBINode.opQuestion);
349             qNode.fLeftChild = operandNode;
350             operandNode.fParent = qNode;
351         }
352             break;
353 
354         case RBBIRuleParseTable.doUnaryOpStar: {
355             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
356             RBBINode starNode = pushNewNode(RBBINode.opStar);
357             starNode.fLeftChild = operandNode;
358             operandNode.fParent = starNode;
359         }
360             break;
361 
362         case RBBIRuleParseTable.doRuleChar:
363         // A "Rule Character" is any single character that is a literal part
364         // of the regular expression. Like a, b and c in the expression "(abc*)
365         // | [:L:]"
366         // These are pretty uncommon in break rules; the terms are more commonly
367         //  sets. To keep things uniform, treat these characters like as
368         // sets that just happen to contain only one character.
369         {
370             n = pushNewNode(RBBINode.setRef);
371             String s = String.valueOf((char)fC.fChar);
372             findSetFor(s, n, null);
373             n.fFirstPos = fScanIndex;
374             n.fLastPos = fNextIndex;
375             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
376             break;
377         }
378 
379         case RBBIRuleParseTable.doDotAny:
380         // scanned a ".", meaning match any single character.
381         {
382             n = pushNewNode(RBBINode.setRef);
383             findSetFor(kAny, n, null);
384             n.fFirstPos = fScanIndex;
385             n.fLastPos = fNextIndex;
386             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
387             break;
388         }
389 
390         case RBBIRuleParseTable.doSlash:
391             // Scanned a '/', which identifies a look-ahead break position in a
392             // rule.
393             n = pushNewNode(RBBINode.lookAhead);
394             n.fVal = fRuleNum;
395             n.fFirstPos = fScanIndex;
396             n.fLastPos = fNextIndex;
397             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
398             fLookAheadRule = true;
399             break;
400 
401         case RBBIRuleParseTable.doStartTagValue:
402             // Scanned a '{', the opening delimiter for a tag value within a
403             // rule.
404             n = pushNewNode(RBBINode.tag);
405             n.fVal = 0;
406             n.fFirstPos = fScanIndex;
407             n.fLastPos = fNextIndex;
408             break;
409 
410         case RBBIRuleParseTable.doTagDigit:
411         // Just scanned a decimal digit that's part of a tag value
412         {
413             n = fNodeStack[fNodeStackPtr];
414             int v = UCharacter.digit((char) fC.fChar, 10);
415             n.fVal = n.fVal * 10 + v;
416             break;
417         }
418 
419         case RBBIRuleParseTable.doTagValue:
420             n = fNodeStack[fNodeStackPtr];
421             n.fLastPos = fNextIndex;
422             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
423             break;
424 
425         case RBBIRuleParseTable.doTagExpectedError:
426             error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG);
427             returnVal = false;
428             break;
429 
430         case RBBIRuleParseTable.doOptionStart:
431             // Scanning a !!option. At the start of string.
432             fOptionStart = fScanIndex;
433             break;
434 
435         case RBBIRuleParseTable.doOptionEnd: {
436             String opt = fRB.fRules.substring(fOptionStart, fScanIndex);
437             if (opt.equals("chain")) {
438                 fRB.fChainRules = true;
439             } else if (opt.equals("LBCMNoChain")) {
440                 fRB.fLBCMNoChain = true;
441             } else if (opt.equals("forward")) {
442                 fRB.fDefaultTree = RBBIRuleBuilder.fForwardTree;
443             } else if (opt.equals("reverse")) {
444                 fRB.fDefaultTree = RBBIRuleBuilder.fReverseTree;
445             } else if (opt.equals("safe_forward")) {
446                 fRB.fDefaultTree = RBBIRuleBuilder.fSafeFwdTree;
447             } else if (opt.equals("safe_reverse")) {
448                 fRB.fDefaultTree = RBBIRuleBuilder.fSafeRevTree;
449             } else if (opt.equals("lookAheadHardBreak")) {
450                 fRB.fLookAheadHardBreak = true;
451             } else if (opt.equals("quoted_literals_only")) {
452                 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128].clear();
453             } else if (opt.equals("unquoted_literals")) {
454                 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128].applyPattern(gRuleSet_rule_char_pattern);
455             } else {
456                 error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION);
457             }
458             break;
459         }
460 
461         case RBBIRuleParseTable.doReverseDir:
462             fReverseRule = true;
463             break;
464 
465         case RBBIRuleParseTable.doStartVariableName:
466             n = pushNewNode(RBBINode.varRef);
467             n.fFirstPos = fScanIndex;
468             break;
469 
470         case RBBIRuleParseTable.doEndVariableName:
471             n = fNodeStack[fNodeStackPtr];
472             if (n == null || n.fType != RBBINode.varRef) {
473                 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
474                 break;
475             }
476             n.fLastPos = fScanIndex;
477             n.fText = fRB.fRules.substring(n.fFirstPos + 1, n.fLastPos);
478             // Look the newly scanned name up in the symbol table
479             //   If there's an entry, set the l. child of the var ref to the
480             // replacement expression.
481             //   (We also pass through here when scanning assignments, but no harm
482             // is done, other
483             //    than a slight wasted effort that seems hard to avoid. Lookup will
484             // be null)
485             n.fLeftChild = fSymbolTable.lookupNode(n.fText);
486             break;
487 
488         case RBBIRuleParseTable.doCheckVarDef:
489             n = fNodeStack[fNodeStackPtr];
490             if (n.fLeftChild == null) {
491                 error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE);
492                 returnVal = false;
493             }
494             break;
495 
496         case RBBIRuleParseTable.doExprFinished:
497             break;
498 
499         case RBBIRuleParseTable.doRuleErrorAssignExpr:
500             error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR);
501             returnVal = false;
502             break;
503 
504         case RBBIRuleParseTable.doExit:
505             returnVal = false;
506             break;
507 
508         case RBBIRuleParseTable.doScanUnicodeSet:
509             scanSet();
510             break;
511 
512         default:
513             error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
514             returnVal = false;
515             break;
516         }
517         return returnVal;
518     }
519 
520     //----------------------------------------------------------------------------------------
521     //
522     //  Error Throw and IllegalArgumentException in response to a rule parse
523     // error.
524     //
525     //----------------------------------------------------------------------------------------
error(int e)526     void error(int e) {
527         String s = "Error " + e + " at line " + fLineNum + " column "
528                 + fCharNum;
529         IllegalArgumentException ex = new IllegalArgumentException(s);
530         throw ex;
531 
532     }
533 
534     //----------------------------------------------------------------------------------------
535     //
536     //  fixOpStack The parse stack holds partially assembled chunks of the parse
537     // tree.
538     //               An entry on the stack may be as small as a single setRef node,
539     //               or as large as the parse tree
540     //               for an entire expression (this will be the one item left on the stack
541     //               when the parsing of an RBBI rule completes.
542     //
543     //               This function is called when a binary operator is encountered.
544     //               It looks back up the stack for operators that are not yet associated
545     //               with a right operand, and if the precedence of the stacked operator >=
546     //               the precedence of the current operator, binds the operand left,
547     //               to the previously encountered operator.
548     //
549     //----------------------------------------------------------------------------------------
fixOpStack(int p)550     void fixOpStack(int p) {
551         RBBINode n;
552         // printNodeStack("entering fixOpStack()");
553         for (;;) {
554             n = fNodeStack[fNodeStackPtr - 1]; // an operator node
555             if (n.fPrecedence == 0) {
556                 System.out.print("RBBIRuleScanner.fixOpStack, bad operator node");
557                 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
558                 return;
559             }
560 
561             if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) {
562                 // The most recent operand goes with the current operator,
563                 //   not with the previously stacked one.
564                 break;
565             }
566             // Stack operator is a binary op ( '|' or concatenation)
567             //   TOS operand becomes right child of this operator.
568             //   Resulting subexpression becomes the TOS operand.
569             n.fRightChild = fNodeStack[fNodeStackPtr];
570             fNodeStack[fNodeStackPtr].fParent = n;
571             fNodeStackPtr--;
572             // printNodeStack("looping in fixOpStack() ");
573         }
574 
575         if (p <= RBBINode.precLParen) {
576             // Scan is at a right paren or end of expression.
577             //  The scanned item must match the stack, or else there was an
578             // error.
579             //  Discard the left paren (or start expr) node from the stack,
580             //  leaving the completed (sub)expression as TOS.
581             if (n.fPrecedence != p) {
582                 // Right paren encountered matched start of expression node, or
583                 // end of expression matched with a left paren node.
584                 error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN);
585             }
586             fNodeStack[fNodeStackPtr - 1] = fNodeStack[fNodeStackPtr];
587             fNodeStackPtr--;
588             // Delete the now-discarded LParen or Start node.
589             // delete n;
590         }
591         // printNodeStack("leaving fixOpStack()");
592     }
593 
594     //----------------------------------------------------------------------------
595     //
596     //       RBBISetTableEl is an entry in the hash table of UnicodeSets that have
597     //                        been encountered. The val Node will be of nodetype uset
598     //                        and contain pointers to the actual UnicodeSets.
599     //                        The Key is the source string for initializing the set.
600     //
601     //                        The hash table is used to avoid creating duplicate
602     //                        unnamed (not $var references) UnicodeSets.
603     //
604     //----------------------------------------------------------------------------
605     static class RBBISetTableEl {
606         String key;
607 
608         RBBINode val;
609     }
610 
611 
612     //----------------------------------------------------------------------------------------
613     //
614     //   findSetFor given a String,
615     //                  - find the corresponding Unicode Set (uset node)
616     //                         (create one if necessary)
617     //                  - Set fLeftChild of the caller's node (should be a setRef node)
618     //                         to the uset node
619     //                 Maintain a hash table of uset nodes, so the same one is always used
620     //                    for the same string.
621     //                 If a "to adopt" set is provided and we haven't seen this key before,
622     //                    add the provided set to the hash table.
623     //                 If the string is one (32 bit) char in length, the set contains
624     //                    just one element which is the char in question.
625     //                 If the string is "any", return a set containing all chars.
626     //
627     //----------------------------------------------------------------------------------------
findSetFor(String s, RBBINode node, UnicodeSet setToAdopt)628     void findSetFor(String s, RBBINode node, UnicodeSet setToAdopt) {
629 
630         RBBISetTableEl el;
631 
632         // First check whether we've already cached a set for this string.
633         // If so, just use the cached set in the new node.
634         //   delete any set provided by the caller, since we own it.
635         el = fSetTable.get(s);
636         if (el != null) {
637             node.fLeftChild = el.val;
638             Assert.assrt(node.fLeftChild.fType == RBBINode.uset);
639             return;
640         }
641 
642         // Haven't seen this set before.
643         // If the caller didn't provide us with a prebuilt set,
644         //   create a new UnicodeSet now.
645         if (setToAdopt == null) {
646             if (s.equals(kAny)) {
647                 setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
648             } else {
649                 int c;
650                 c = UTF16.charAt(s, 0);
651                 setToAdopt = new UnicodeSet(c, c);
652             }
653         }
654 
655         //
656         // Make a new uset node to refer to this UnicodeSet
657         // This new uset node becomes the child of the caller's setReference
658         // node.
659         //
660         RBBINode usetNode = new RBBINode(RBBINode.uset);
661         usetNode.fInputSet = setToAdopt;
662         usetNode.fParent = node;
663         node.fLeftChild = usetNode;
664         usetNode.fText = s;
665 
666         //
667         // Add the new uset node to the list of all uset nodes.
668         //
669         fRB.fUSetNodes.add(usetNode);
670 
671         //
672         // Add the new set to the set hash table.
673         //
674         el = new RBBISetTableEl();
675         el.key = s;
676         el.val = usetNode;
677         fSetTable.put(el.key, el);
678 
679         return;
680     }
681 
682     //
683     //  Assorted Unicode character constants.
684     //     Numeric because there is no portable way to enter them as literals.
685     //     (Think EBCDIC).
686     //
687     static final int chNEL = 0x85; //    NEL newline variant
688 
689     static final int chLS = 0x2028; //    Unicode Line Separator
690 
691     //----------------------------------------------------------------------------------------
692     //
693     //  stripRules    Return a rules string without unnecessary
694     //                characters.
695     //
696     //----------------------------------------------------------------------------------------
stripRules(String rules)697     static String stripRules(String rules) {
698         StringBuilder strippedRules = new StringBuilder();
699         int rulesLength = rules.length();
700         boolean skippingSpaces = false;
701 
702         for (int idx = 0; idx < rulesLength; idx = rules.offsetByCodePoints(idx, 1)) {
703             int cp = rules.codePointAt(idx);
704             boolean whiteSpace = UCharacter.hasBinaryProperty(cp, UProperty.PATTERN_WHITE_SPACE);
705             if (skippingSpaces && whiteSpace) {
706                 continue;
707             }
708             strippedRules.appendCodePoint(cp);
709             skippingSpaces = whiteSpace;
710         }
711         return strippedRules.toString();
712     }
713 
714     //----------------------------------------------------------------------------------------
715     //
716     //  nextCharLL    Low Level Next Char from rule input source.
717     //                Get a char from the input character iterator,
718     //                keep track of input position for error reporting.
719     //
720     //----------------------------------------------------------------------------------------
nextCharLL()721     int nextCharLL() {
722         int ch;
723 
724         if (fNextIndex >= fRB.fRules.length()) {
725             return -1;
726         }
727         ch = UTF16.charAt(fRB.fRules, fNextIndex);
728         fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1);
729 
730         if (ch == '\r' ||
731             ch == chNEL ||
732             ch == chLS ||
733             ch == '\n' && fLastChar != '\r') {
734             // Character is starting a new line.  Bump up the line number, and
735             //  reset the column to 0.
736             fLineNum++;
737             fCharNum = 0;
738             if (fQuoteMode) {
739                 error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING);
740                 fQuoteMode = false;
741             }
742         } else {
743             // Character is not starting a new line.  Except in the case of a
744             //   LF following a CR, increment the column position.
745             if (ch != '\n') {
746                 fCharNum++;
747             }
748         }
749         fLastChar = ch;
750         return ch;
751     }
752 
753     //---------------------------------------------------------------------------------
754     //
755     //   nextChar     for rules scanning.  At this level, we handle stripping
756     //                out comments and processing backslash character escapes.
757     //                The rest of the rules grammar is handled at the next level up.
758     //
759     //---------------------------------------------------------------------------------
nextChar(RBBIRuleChar c)760     void nextChar(RBBIRuleChar c) {
761 
762         // Unicode Character constants needed for the processing done by nextChar(),
763         //   in hex because literals wont work on EBCDIC machines.
764 
765         fScanIndex = fNextIndex;
766         c.fChar = nextCharLL();
767         c.fEscaped = false;
768 
769         //
770         //  check for '' sequence.
771         //  These are recognized in all contexts, whether in quoted text or not.
772         //
773         if (c.fChar == '\'') {
774             if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') {
775                 c.fChar = nextCharLL(); // get nextChar officially so character counts
776                 c.fEscaped = true; //   stay correct.
777             } else {
778                 // Single quote, by itself.
779                 //   Toggle quoting mode.
780                 //   Return either '('  or ')', because quotes cause a grouping of the quoted text.
781                 fQuoteMode = !fQuoteMode;
782                 if (fQuoteMode == true) {
783                     c.fChar = '(';
784                 } else {
785                     c.fChar = ')';
786                 }
787                 c.fEscaped = false; // The paren that we return is not escaped.
788                 return;
789             }
790         }
791 
792         if (fQuoteMode) {
793             c.fEscaped = true;
794         } else {
795             // We are not in a 'quoted region' of the source.
796             //
797             if (c.fChar == '#') {
798                 // Start of a comment.  Consume the rest of it.
799                 //  The new-line char that terminates the comment is always returned.
800                 //  It will be treated as white-space, and serves to break up anything
801                 //    that might otherwise incorrectly clump together with a comment in
802                 //    the middle (a variable name, for example.)
803                 int commentStart = fScanIndex;
804                 for (;;) {
805                     c.fChar = nextCharLL();
806                     if (c.fChar == -1 || // EOF
807                         c.fChar == '\r' ||
808                         c.fChar == '\n' ||
809                         c.fChar == chNEL ||
810                         c.fChar == chLS)
811                     {
812                         break;
813                     }
814                 }
815                 for (int i=commentStart; i<fNextIndex-1; ++i) {
816                     fRB.fStrippedRules.setCharAt(i, ' ');
817                 }
818             }
819             if (c.fChar == -1) {
820                 return;
821             }
822 
823             //
824             //  check for backslash escaped characters.
825             //  Use String.unescapeAt() to handle them.
826             //
827             if (c.fChar == '\\') {
828                 c.fEscaped = true;
829                 int[] unescapeIndex = new int[1];
830                 unescapeIndex[0] = fNextIndex;
831                 c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex);
832                 if (unescapeIndex[0] == fNextIndex) {
833                     error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
834                 }
835 
836                 fCharNum += unescapeIndex[0] - fNextIndex;
837                 fNextIndex = unescapeIndex[0];
838             }
839         }
840         // putc(c.fChar, stdout);
841     }
842 
843     //---------------------------------------------------------------------------------
844     //
845     //  Parse RBBI rules.   The state machine for rules parsing is here.
846     //                      The state tables are hand-written in the file rbbirpt.txt,
847     //                      and converted to the form used here by a perl
848     //                      script rbbicst.pl
849     //
850     //---------------------------------------------------------------------------------
parse()851     void parse() {
852         int state;
853         RBBIRuleParseTable.RBBIRuleTableElement tableEl;
854 
855         state = 1;
856         nextChar(fC);
857         //
858         // Main loop for the rule parsing state machine.
859         //   Runs once per state transition.
860         //   Each time through optionally performs, depending on the state table,
861         //      - an advance to the the next input char
862         //      - an action to be performed.
863         //      - pushing or popping a state to/from the local state return stack.
864         //
865         for (;;) {
866             // Quit if state == 0.  This is the normal way to exit the state machine.
867             //
868             if (state == 0) {
869                 break;
870             }
871 
872             // Find the state table element that matches the input char from the rule, or the
873             //    class of the input character.  Start with the first table row for this
874             //    state, then linearly scan forward until we find a row that matches the
875             //    character.  The last row for each state always matches all characters, so
876             //    the search will stop there, if not before.
877             //
878             tableEl = RBBIRuleParseTable.gRuleParseStateTable[state];
879             if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
880                 System.out.println("char, line, col = (\'" + (char) fC.fChar
881                         + "\', " + fLineNum + ", " + fCharNum + "    state = "
882                         + tableEl.fStateName);
883             }
884 
885             for (int tableRow = state;; tableRow++) { // loop over the state table rows associated with this state.
886                 tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow];
887                 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
888                     System.out.print(".");
889                 }
890                 if (tableEl.fCharClass < 127 && fC.fEscaped == false
891                         && tableEl.fCharClass == fC.fChar) {
892                     // Table row specified an individual character, not a set, and
893                     //   the input character is not escaped, and
894                     //   the input character matched it.
895                     break;
896                 }
897                 if (tableEl.fCharClass == 255) {
898                     // Table row specified default, match anything character class.
899                     break;
900                 }
901                 if (tableEl.fCharClass == 254 && fC.fEscaped) {
902                     // Table row specified "escaped" and the char was escaped.
903                     break;
904                 }
905                 if (tableEl.fCharClass == 253 && fC.fEscaped
906                         && (fC.fChar == 0x50 || fC.fChar == 0x70)) {
907                     // Table row specified "escaped P" and the char is either 'p' or 'P'.
908                     break;
909                 }
910                 if (tableEl.fCharClass == 252 && fC.fChar == -1) {
911                     // Table row specified eof and we hit eof on the input.
912                     break;
913                 }
914 
915                 if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class &&
916                         fC.fEscaped == false && //   char is not escaped &&
917                         fC.fChar != -1) { //   char is not EOF
918                     UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128];
919                     if (uniset.contains(fC.fChar)) {
920                         // Table row specified a character class, or set of characters,
921                         //   and the current char matches it.
922                         break;
923                     }
924                 }
925             }
926 
927             if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
928                 System.out.println("");
929             }
930             //
931             // We've found the row of the state table that matches the current input
932             //   character from the rules string.
933             // Perform any action specified  by this row in the state table.
934             if (doParseActions(tableEl.fAction) == false) {
935                 // Break out of the state machine loop if the
936                 //   the action signalled some kind of error, or
937                 //   the action was to exit, occurs on normal end-of-rules-input.
938                 break;
939             }
940 
941             if (tableEl.fPushState != 0) {
942                 fStackPtr++;
943                 if (fStackPtr >= kStackSize) {
944                     System.out.println("RBBIRuleScanner.parse() - state stack overflow.");
945                     error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
946                 }
947                 fStack[fStackPtr] = tableEl.fPushState;
948             }
949 
950             if (tableEl.fNextChar) {
951                 nextChar(fC);
952             }
953 
954             // Get the next state from the table entry, or from the
955             //   state stack if the next state was specified as "pop".
956             if (tableEl.fNextState != 255) {
957                 state = tableEl.fNextState;
958             } else {
959                 state = fStack[fStackPtr];
960                 fStackPtr--;
961                 if (fStackPtr < 0) {
962                     System.out.println("RBBIRuleScanner.parse() - state stack underflow.");
963                     error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
964                 }
965             }
966 
967         }
968 
969         // If there are no forward rules throw an error.
970         //
971         if (fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree] == null) {
972             error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
973         }
974 
975         //
976         // Parsing of the input RBBI rules is complete.
977         // We now have a parse tree for the rule expressions
978         // and a list of all UnicodeSets that are referenced.
979         //
980         if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("symbols") >= 0) {
981             fSymbolTable.rbbiSymtablePrint();
982         }
983         if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("ptree") >= 0) {
984             System.out.println("Completed Forward Rules Parse Tree...");
985             fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true);
986             System.out.println("\nCompleted Reverse Rules Parse Tree...");
987             fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true);
988             System.out.println("\nCompleted Safe Point Forward Rules Parse Tree...");
989             if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) {
990                 System.out.println("  -- null -- ");
991             } else {
992                 fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true);
993             }
994             System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree...");
995             if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) {
996                 System.out.println("  -- null -- ");
997             } else {
998                 fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true);
999             }
1000         }
1001     }
1002 
1003     //---------------------------------------------------------------------------------
1004     //
1005     //  printNodeStack     for debugging...
1006     //
1007     //---------------------------------------------------------------------------------
1008     ///CLOVER:OFF
printNodeStack(String title)1009     void printNodeStack(String title) {
1010         int i;
1011         System.out.println(title + ".  Dumping node stack...\n");
1012         for (i = fNodeStackPtr; i > 0; i--) {
1013             fNodeStack[i].printTree(true);
1014         }
1015     }
1016     ///CLOVER:ON
1017 
1018     //---------------------------------------------------------------------------------
1019     //
1020     //  pushNewNode   create a new RBBINode of the specified type and push it
1021     //                onto the stack of nodes.
1022     //
1023     //---------------------------------------------------------------------------------
pushNewNode(int nodeType)1024     RBBINode pushNewNode(int nodeType) {
1025         fNodeStackPtr++;
1026         if (fNodeStackPtr >= kStackSize) {
1027             System.out.println("RBBIRuleScanner.pushNewNode - stack overflow.");
1028             error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
1029         }
1030         fNodeStack[fNodeStackPtr] = new RBBINode(nodeType);
1031         return fNodeStack[fNodeStackPtr];
1032     }
1033 
1034     //---------------------------------------------------------------------------------
1035     //
1036     //  scanSet    Construct a UnicodeSet from the text at the current scan
1037     //             position.  Advance the scan position to the first character
1038     //             after the set.
1039     //
1040     //             A new RBBI setref node referring to the set is pushed onto the node
1041     //             stack.
1042     //
1043     //             The scan position is normally under the control of the state machine
1044     //             that controls rule parsing.  UnicodeSets, however, are parsed by
1045     //             the UnicodeSet constructor, not by the RBBI rule parser.
1046     //
1047     //---------------------------------------------------------------------------------
scanSet()1048     void scanSet() {
1049         UnicodeSet uset = null;
1050         int startPos;
1051         ParsePosition pos = new ParsePosition(fScanIndex);
1052         int i;
1053 
1054         startPos = fScanIndex;
1055         try {
1056             uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE);
1057         } catch (Exception e) { // TODO:  catch fewer exception types.
1058             // Repackage UnicodeSet errors as RBBI rule builder errors, with location info.
1059             error(RBBIRuleBuilder.U_BRK_MALFORMED_SET);
1060         }
1061 
1062         // Verify that the set contains at least one code point.
1063         //
1064         if (uset.isEmpty()) {
1065             // This set is empty.
1066             //  Make it an error, because it almost certainly is not what the user wanted.
1067             //  Also, avoids having to think about corner cases in the tree manipulation code
1068             //   that occurs later on.
1069             //  TODO:  this shouldn't be an error; it does happen.
1070             error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
1071         }
1072 
1073         // Advance the RBBI parse postion over the UnicodeSet pattern.
1074         //   Don't just set fScanIndex because the line/char positions maintained
1075         //   for error reporting would be thrown off.
1076         i = pos.getIndex();
1077         for (;;) {
1078             if (fNextIndex >= i) {
1079                 break;
1080             }
1081             nextCharLL();
1082         }
1083 
1084         RBBINode n;
1085 
1086         n = pushNewNode(RBBINode.setRef);
1087         n.fFirstPos = startPos;
1088         n.fLastPos = fNextIndex;
1089         n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
1090         //  findSetFor() serves several purposes here:
1091         //     - Adopts storage for the UnicodeSet, will be responsible for deleting.
1092         //     - Mantains collection of all sets in use, needed later for establishing
1093         //          character categories for run time engine.
1094         //     - Eliminates mulitiple instances of the same set.
1095         //     - Creates a new uset node if necessary (if this isn't a duplicate.)
1096         findSetFor(n.fText, n, uset);
1097     }
1098 
1099 }
1100 
1101