1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (c) 2002-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 */
9 //
10 //  rbbitblb.cpp
11 //
12 
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_BREAK_ITERATION
17 
18 #include "unicode/unistr.h"
19 #include "rbbitblb.h"
20 #include "rbbirb.h"
21 #include "rbbisetb.h"
22 #include "rbbidata.h"
23 #include "cstring.h"
24 #include "uassert.h"
25 #include "uvectr32.h"
26 #include "cmemory.h"
27 
28 U_NAMESPACE_BEGIN
29 
RBBITableBuilder(RBBIRuleBuilder * rb,RBBINode ** rootNode,UErrorCode & status)30 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
31         fRB(rb),
32         fTree(*rootNode),
33         fStatus(&status),
34         fDStates(nullptr),
35         fSafeTable(nullptr) {
36     if (U_FAILURE(status)) {
37         return;
38     }
39     // fDStates is UVector<RBBIStateDescriptor *>
40     fDStates = new UVector(status);
41     if (U_SUCCESS(status) && fDStates == nullptr ) {
42         status = U_MEMORY_ALLOCATION_ERROR;
43     }
44 }
45 
46 
47 
~RBBITableBuilder()48 RBBITableBuilder::~RBBITableBuilder() {
49     int i;
50     for (i=0; i<fDStates->size(); i++) {
51         delete (RBBIStateDescriptor *)fDStates->elementAt(i);
52     }
53     delete fDStates;
54     delete fSafeTable;
55 }
56 
57 
58 //-----------------------------------------------------------------------------
59 //
60 //   RBBITableBuilder::buildForwardTable  -  This is the main function for building
61 //                               the DFA state transition table from the RBBI rules parse tree.
62 //
63 //-----------------------------------------------------------------------------
buildForwardTable()64 void  RBBITableBuilder::buildForwardTable() {
65 
66     if (U_FAILURE(*fStatus)) {
67         return;
68     }
69 
70     // If there were no rules, just return.  This situation can easily arise
71     //   for the reverse rules.
72     if (fTree==NULL) {
73         return;
74     }
75 
76     //
77     // Walk through the tree, replacing any references to $variables with a copy of the
78     //   parse tree for the substition expression.
79     //
80     fTree = fTree->flattenVariables();
81 #ifdef RBBI_DEBUG
82     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
83         RBBIDebugPuts("\nParse tree after flattening variable references.");
84         RBBINode::printTree(fTree, TRUE);
85     }
86 #endif
87 
88     //
89     // If the rules contained any references to {bof}
90     //   add a {bof} <cat> <former root of tree> to the
91     //   tree.  Means that all matches must start out with the
92     //   {bof} fake character.
93     //
94     if (fRB->fSetBuilder->sawBOF()) {
95         RBBINode *bofTop    = new RBBINode(RBBINode::opCat);
96         RBBINode *bofLeaf   = new RBBINode(RBBINode::leafChar);
97         // Delete and exit if memory allocation failed.
98         if (bofTop == NULL || bofLeaf == NULL) {
99             *fStatus = U_MEMORY_ALLOCATION_ERROR;
100             delete bofTop;
101             delete bofLeaf;
102             return;
103         }
104         bofTop->fLeftChild  = bofLeaf;
105         bofTop->fRightChild = fTree;
106         bofLeaf->fParent    = bofTop;
107         bofLeaf->fVal       = 2;      // Reserved value for {bof}.
108         fTree               = bofTop;
109     }
110 
111     //
112     // Add a unique right-end marker to the expression.
113     //   Appears as a cat-node, left child being the original tree,
114     //   right child being the end marker.
115     //
116     RBBINode *cn = new RBBINode(RBBINode::opCat);
117     // Exit if memory allocation failed.
118     if (cn == NULL) {
119         *fStatus = U_MEMORY_ALLOCATION_ERROR;
120         return;
121     }
122     cn->fLeftChild = fTree;
123     fTree->fParent = cn;
124     cn->fRightChild = new RBBINode(RBBINode::endMark);
125     // Delete and exit if memory allocation failed.
126     if (cn->fRightChild == NULL) {
127         *fStatus = U_MEMORY_ALLOCATION_ERROR;
128         delete cn;
129         return;
130     }
131     cn->fRightChild->fParent = cn;
132     fTree = cn;
133 
134     //
135     //  Replace all references to UnicodeSets with the tree for the equivalent
136     //      expression.
137     //
138     fTree->flattenSets();
139 #ifdef RBBI_DEBUG
140     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
141         RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
142         RBBINode::printTree(fTree, TRUE);
143     }
144 #endif
145 
146 
147     //
148     // calculate the functions nullable, firstpos, lastpos and followpos on
149     // nodes in the parse tree.
150     //    See the alogrithm description in Aho.
151     //    Understanding how this works by looking at the code alone will be
152     //       nearly impossible.
153     //
154     calcNullable(fTree);
155     calcFirstPos(fTree);
156     calcLastPos(fTree);
157     calcFollowPos(fTree);
158     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
159         RBBIDebugPuts("\n");
160         printPosSets(fTree);
161     }
162 
163     //
164     //  For "chained" rules, modify the followPos sets
165     //
166     if (fRB->fChainRules) {
167         calcChainedFollowPos(fTree);
168     }
169 
170     //
171     //  BOF (start of input) test fixup.
172     //
173     if (fRB->fSetBuilder->sawBOF()) {
174         bofFixup();
175     }
176 
177     //
178     // Build the DFA state transition tables.
179     //
180     buildStateTable();
181     flagAcceptingStates();
182     flagLookAheadStates();
183     flagTaggedStates();
184 
185     //
186     // Update the global table of rule status {tag} values
187     // The rule builder has a global vector of status values that are common
188     //    for all tables.  Merge the ones from this table into the global set.
189     //
190     mergeRuleStatusVals();
191 }
192 
193 
194 
195 //-----------------------------------------------------------------------------
196 //
197 //   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
198 //
199 //-----------------------------------------------------------------------------
calcNullable(RBBINode * n)200 void RBBITableBuilder::calcNullable(RBBINode *n) {
201     if (n == NULL) {
202         return;
203     }
204     if (n->fType == RBBINode::setRef ||
205         n->fType == RBBINode::endMark ) {
206         // These are non-empty leaf node types.
207         n->fNullable = FALSE;
208         return;
209     }
210 
211     if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
212         // Lookahead marker node.  It's a leaf, so no recursion on children.
213         // It's nullable because it does not match any literal text from the input stream.
214         n->fNullable = TRUE;
215         return;
216     }
217 
218 
219     // The node is not a leaf.
220     //  Calculate nullable on its children.
221     calcNullable(n->fLeftChild);
222     calcNullable(n->fRightChild);
223 
224     // Apply functions from table 3.40 in Aho
225     if (n->fType == RBBINode::opOr) {
226         n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
227     }
228     else if (n->fType == RBBINode::opCat) {
229         n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
230     }
231     else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
232         n->fNullable = TRUE;
233     }
234     else {
235         n->fNullable = FALSE;
236     }
237 }
238 
239 
240 
241 
242 //-----------------------------------------------------------------------------
243 //
244 //   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
245 //
246 //-----------------------------------------------------------------------------
calcFirstPos(RBBINode * n)247 void RBBITableBuilder::calcFirstPos(RBBINode *n) {
248     if (n == NULL) {
249         return;
250     }
251     if (n->fType == RBBINode::leafChar  ||
252         n->fType == RBBINode::endMark   ||
253         n->fType == RBBINode::lookAhead ||
254         n->fType == RBBINode::tag) {
255         // These are non-empty leaf node types.
256         // Note: In order to maintain the sort invariant on the set,
257         // this function should only be called on a node whose set is
258         // empty to start with.
259         n->fFirstPosSet->addElement(n, *fStatus);
260         return;
261     }
262 
263     // The node is not a leaf.
264     //  Calculate firstPos on its children.
265     calcFirstPos(n->fLeftChild);
266     calcFirstPos(n->fRightChild);
267 
268     // Apply functions from table 3.40 in Aho
269     if (n->fType == RBBINode::opOr) {
270         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
271         setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
272     }
273     else if (n->fType == RBBINode::opCat) {
274         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
275         if (n->fLeftChild->fNullable) {
276             setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
277         }
278     }
279     else if (n->fType == RBBINode::opStar ||
280              n->fType == RBBINode::opQuestion ||
281              n->fType == RBBINode::opPlus) {
282         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
283     }
284 }
285 
286 
287 
288 //-----------------------------------------------------------------------------
289 //
290 //   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
291 //
292 //-----------------------------------------------------------------------------
calcLastPos(RBBINode * n)293 void RBBITableBuilder::calcLastPos(RBBINode *n) {
294     if (n == NULL) {
295         return;
296     }
297     if (n->fType == RBBINode::leafChar  ||
298         n->fType == RBBINode::endMark   ||
299         n->fType == RBBINode::lookAhead ||
300         n->fType == RBBINode::tag) {
301         // These are non-empty leaf node types.
302         // Note: In order to maintain the sort invariant on the set,
303         // this function should only be called on a node whose set is
304         // empty to start with.
305         n->fLastPosSet->addElement(n, *fStatus);
306         return;
307     }
308 
309     // The node is not a leaf.
310     //  Calculate lastPos on its children.
311     calcLastPos(n->fLeftChild);
312     calcLastPos(n->fRightChild);
313 
314     // Apply functions from table 3.40 in Aho
315     if (n->fType == RBBINode::opOr) {
316         setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
317         setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
318     }
319     else if (n->fType == RBBINode::opCat) {
320         setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
321         if (n->fRightChild->fNullable) {
322             setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
323         }
324     }
325     else if (n->fType == RBBINode::opStar     ||
326              n->fType == RBBINode::opQuestion ||
327              n->fType == RBBINode::opPlus) {
328         setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
329     }
330 }
331 
332 
333 
334 //-----------------------------------------------------------------------------
335 //
336 //   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
337 //
338 //-----------------------------------------------------------------------------
calcFollowPos(RBBINode * n)339 void RBBITableBuilder::calcFollowPos(RBBINode *n) {
340     if (n == NULL ||
341         n->fType == RBBINode::leafChar ||
342         n->fType == RBBINode::endMark) {
343         return;
344     }
345 
346     calcFollowPos(n->fLeftChild);
347     calcFollowPos(n->fRightChild);
348 
349     // Aho rule #1
350     if (n->fType == RBBINode::opCat) {
351         RBBINode *i;   // is 'i' in Aho's description
352         uint32_t     ix;
353 
354         UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
355 
356         for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
357             i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
358             setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
359         }
360     }
361 
362     // Aho rule #2
363     if (n->fType == RBBINode::opStar ||
364         n->fType == RBBINode::opPlus) {
365         RBBINode   *i;  // again, n and i are the names from Aho's description.
366         uint32_t    ix;
367 
368         for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
369             i = (RBBINode *)n->fLastPosSet->elementAt(ix);
370             setAdd(i->fFollowPos, n->fFirstPosSet);
371         }
372     }
373 
374 
375 
376 }
377 
378 //-----------------------------------------------------------------------------
379 //
380 //    addRuleRootNodes    Recursively walk a parse tree, adding all nodes flagged
381 //                        as roots of a rule to a destination vector.
382 //
383 //-----------------------------------------------------------------------------
addRuleRootNodes(UVector * dest,RBBINode * node)384 void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
385     if (node == NULL || U_FAILURE(*fStatus)) {
386         return;
387     }
388     if (node->fRuleRoot) {
389         dest->addElement(node, *fStatus);
390         // Note: rules cannot nest. If we found a rule start node,
391         //       no child node can also be a start node.
392         return;
393     }
394     addRuleRootNodes(dest, node->fLeftChild);
395     addRuleRootNodes(dest, node->fRightChild);
396 }
397 
398 //-----------------------------------------------------------------------------
399 //
400 //   calcChainedFollowPos.    Modify the previously calculated followPos sets
401 //                            to implement rule chaining.  NOT described by Aho
402 //
403 //-----------------------------------------------------------------------------
calcChainedFollowPos(RBBINode * tree)404 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
405 
406     UVector         endMarkerNodes(*fStatus);
407     UVector         leafNodes(*fStatus);
408     int32_t         i;
409 
410     if (U_FAILURE(*fStatus)) {
411         return;
412     }
413 
414     // get a list of all endmarker nodes.
415     tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
416 
417     // get a list all leaf nodes
418     tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
419     if (U_FAILURE(*fStatus)) {
420         return;
421     }
422 
423     // Collect all leaf nodes that can start matches for rules
424     // with inbound chaining enabled, which is the union of the
425     // firstPosition sets from each of the rule root nodes.
426 
427     UVector ruleRootNodes(*fStatus);
428     addRuleRootNodes(&ruleRootNodes, tree);
429 
430     UVector matchStartNodes(*fStatus);
431     for (int j=0; j<ruleRootNodes.size(); ++j) {
432         RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(j));
433         if (node->fChainIn) {
434             setAdd(&matchStartNodes, node->fFirstPosSet);
435         }
436     }
437     if (U_FAILURE(*fStatus)) {
438         return;
439     }
440 
441     int32_t  endNodeIx;
442     int32_t  startNodeIx;
443 
444     for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
445         RBBINode *tNode   = (RBBINode *)leafNodes.elementAt(endNodeIx);
446         RBBINode *endNode = NULL;
447 
448         // Identify leaf nodes that correspond to overall rule match positions.
449         //   These include an endMarkerNode in their followPos sets.
450         for (i=0; i<endMarkerNodes.size(); i++) {
451             if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
452                 endNode = tNode;
453                 break;
454             }
455         }
456         if (endNode == NULL) {
457             // node wasn't an end node.  Try again with the next.
458             continue;
459         }
460 
461         // We've got a node that can end a match.
462 
463         // Line Break Specific hack:  If this node's val correspond to the $CM char class,
464         //                            don't chain from it.
465         // TODO:  Add rule syntax for this behavior, get specifics out of here and
466         //        into the rule file.
467         if (fRB->fLBCMNoChain) {
468             UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
469             if (c != -1) {
470                 // c == -1 occurs with sets containing only the {eof} marker string.
471                 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
472                 if (cLBProp == U_LB_COMBINING_MARK) {
473                     continue;
474                 }
475             }
476         }
477 
478 
479         // Now iterate over the nodes that can start a match, looking for ones
480         //   with the same char class as our ending node.
481         RBBINode *startNode;
482         for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
483             startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
484             if (startNode->fType != RBBINode::leafChar) {
485                 continue;
486             }
487 
488             if (endNode->fVal == startNode->fVal) {
489                 // The end val (character class) of one possible match is the
490                 //   same as the start of another.
491 
492                 // Add all nodes from the followPos of the start node to the
493                 //  followPos set of the end node, which will have the effect of
494                 //  letting matches transition from a match state at endNode
495                 //  to the second char of a match starting with startNode.
496                 setAdd(endNode->fFollowPos, startNode->fFollowPos);
497             }
498         }
499     }
500 }
501 
502 
503 //-----------------------------------------------------------------------------
504 //
505 //   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
506 //                Do an swizzle similar to chaining, modifying the followPos set of
507 //                the bofNode to include the followPos nodes from other {bot} nodes
508 //                scattered through the tree.
509 //
510 //                This function has much in common with calcChainedFollowPos().
511 //
512 //-----------------------------------------------------------------------------
bofFixup()513 void RBBITableBuilder::bofFixup() {
514 
515     if (U_FAILURE(*fStatus)) {
516         return;
517     }
518 
519     //   The parse tree looks like this ...
520     //         fTree root  --->       <cat>
521     //                               /     \       .
522     //                            <cat>   <#end node>
523     //                           /     \  .
524     //                     <bofNode>   rest
525     //                               of tree
526     //
527     //    We will be adding things to the followPos set of the <bofNode>
528     //
529     RBBINode  *bofNode = fTree->fLeftChild->fLeftChild;
530     U_ASSERT(bofNode->fType == RBBINode::leafChar);
531     U_ASSERT(bofNode->fVal == 2);
532 
533     // Get all nodes that can be the start a match of the user-written rules
534     //  (excluding the fake bofNode)
535     //  We want the nodes that can start a match in the
536     //     part labeled "rest of tree"
537     //
538     UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
539 
540     RBBINode *startNode;
541     int       startNodeIx;
542     for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
543         startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
544         if (startNode->fType != RBBINode::leafChar) {
545             continue;
546         }
547 
548         if (startNode->fVal == bofNode->fVal) {
549             //  We found a leaf node corresponding to a {bof} that was
550             //    explicitly written into a rule.
551             //  Add everything from the followPos set of this node to the
552             //    followPos set of the fake bofNode at the start of the tree.
553             //
554             setAdd(bofNode->fFollowPos, startNode->fFollowPos);
555         }
556     }
557 }
558 
559 //-----------------------------------------------------------------------------
560 //
561 //   buildStateTable()    Determine the set of runtime DFA states and the
562 //                        transition tables for these states, by the algorithm
563 //                        of fig. 3.44 in Aho.
564 //
565 //                        Most of the comments are quotes of Aho's psuedo-code.
566 //
567 //-----------------------------------------------------------------------------
buildStateTable()568 void RBBITableBuilder::buildStateTable() {
569     if (U_FAILURE(*fStatus)) {
570         return;
571     }
572     RBBIStateDescriptor *failState;
573     // Set it to NULL to avoid uninitialized warning
574     RBBIStateDescriptor *initialState = NULL;
575     //
576     // Add a dummy state 0 - the stop state.  Not from Aho.
577     int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
578     failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
579     if (failState == NULL) {
580         *fStatus = U_MEMORY_ALLOCATION_ERROR;
581         goto ExitBuildSTdeleteall;
582     }
583     failState->fPositions = new UVector(*fStatus);
584     if (failState->fPositions == NULL) {
585         *fStatus = U_MEMORY_ALLOCATION_ERROR;
586     }
587     if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
588         goto ExitBuildSTdeleteall;
589     }
590     fDStates->addElement(failState, *fStatus);
591     if (U_FAILURE(*fStatus)) {
592         goto ExitBuildSTdeleteall;
593     }
594 
595     // initially, the only unmarked state in Dstates is firstpos(root),
596     //       where toot is the root of the syntax tree for (r)#;
597     initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
598     if (initialState == NULL) {
599         *fStatus = U_MEMORY_ALLOCATION_ERROR;
600     }
601     if (U_FAILURE(*fStatus)) {
602         goto ExitBuildSTdeleteall;
603     }
604     initialState->fPositions = new UVector(*fStatus);
605     if (initialState->fPositions == NULL) {
606         *fStatus = U_MEMORY_ALLOCATION_ERROR;
607     }
608     if (U_FAILURE(*fStatus)) {
609         goto ExitBuildSTdeleteall;
610     }
611     setAdd(initialState->fPositions, fTree->fFirstPosSet);
612     fDStates->addElement(initialState, *fStatus);
613     if (U_FAILURE(*fStatus)) {
614         goto ExitBuildSTdeleteall;
615     }
616 
617     // while there is an unmarked state T in Dstates do begin
618     for (;;) {
619         RBBIStateDescriptor *T = NULL;
620         int32_t              tx;
621         for (tx=1; tx<fDStates->size(); tx++) {
622             RBBIStateDescriptor *temp;
623             temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
624             if (temp->fMarked == FALSE) {
625                 T = temp;
626                 break;
627             }
628         }
629         if (T == NULL) {
630             break;
631         }
632 
633         // mark T;
634         T->fMarked = TRUE;
635 
636         // for each input symbol a do begin
637         int32_t  a;
638         for (a = 1; a<=lastInputSymbol; a++) {
639             // let U be the set of positions that are in followpos(p)
640             //    for some position p in T
641             //    such that the symbol at position p is a;
642             UVector    *U = NULL;
643             RBBINode   *p;
644             int32_t     px;
645             for (px=0; px<T->fPositions->size(); px++) {
646                 p = (RBBINode *)T->fPositions->elementAt(px);
647                 if ((p->fType == RBBINode::leafChar) &&  (p->fVal == a)) {
648                     if (U == NULL) {
649                         U = new UVector(*fStatus);
650                         if (U == NULL) {
651                         	*fStatus = U_MEMORY_ALLOCATION_ERROR;
652                         	goto ExitBuildSTdeleteall;
653                         }
654                     }
655                     setAdd(U, p->fFollowPos);
656                 }
657             }
658 
659             // if U is not empty and not in DStates then
660             int32_t  ux = 0;
661             UBool    UinDstates = FALSE;
662             if (U != NULL) {
663                 U_ASSERT(U->size() > 0);
664                 int  ix;
665                 for (ix=0; ix<fDStates->size(); ix++) {
666                     RBBIStateDescriptor *temp2;
667                     temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
668                     if (setEquals(U, temp2->fPositions)) {
669                         delete U;
670                         U  = temp2->fPositions;
671                         ux = ix;
672                         UinDstates = TRUE;
673                         break;
674                     }
675                 }
676 
677                 // Add U as an unmarked state to Dstates
678                 if (!UinDstates)
679                 {
680                     RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
681                     if (newState == NULL) {
682                     	*fStatus = U_MEMORY_ALLOCATION_ERROR;
683                     }
684                     if (U_FAILURE(*fStatus)) {
685                         goto ExitBuildSTdeleteall;
686                     }
687                     newState->fPositions = U;
688                     fDStates->addElement(newState, *fStatus);
689                     if (U_FAILURE(*fStatus)) {
690                         return;
691                     }
692                     ux = fDStates->size()-1;
693                 }
694 
695                 // Dtran[T, a] := U;
696                 T->fDtran->setElementAt(ux, a);
697             }
698         }
699     }
700     return;
701     // delete local pointers only if error occured.
702 ExitBuildSTdeleteall:
703     delete initialState;
704     delete failState;
705 }
706 
707 
708 
709 //-----------------------------------------------------------------------------
710 //
711 //   flagAcceptingStates    Identify accepting states.
712 //                          First get a list of all of the end marker nodes.
713 //                          Then, for each state s,
714 //                              if s contains one of the end marker nodes in its list of tree positions then
715 //                                  s is an accepting state.
716 //
717 //-----------------------------------------------------------------------------
flagAcceptingStates()718 void     RBBITableBuilder::flagAcceptingStates() {
719     if (U_FAILURE(*fStatus)) {
720         return;
721     }
722     UVector     endMarkerNodes(*fStatus);
723     RBBINode    *endMarker;
724     int32_t     i;
725     int32_t     n;
726 
727     if (U_FAILURE(*fStatus)) {
728         return;
729     }
730 
731     fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
732     if (U_FAILURE(*fStatus)) {
733         return;
734     }
735 
736     for (i=0; i<endMarkerNodes.size(); i++) {
737         endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
738         for (n=0; n<fDStates->size(); n++) {
739             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
740             if (sd->fPositions->indexOf(endMarker) >= 0) {
741                 // Any non-zero value for fAccepting means this is an accepting node.
742                 // The value is what will be returned to the user as the break status.
743                 // If no other value was specified, force it to -1.
744 
745                 if (sd->fAccepting==0) {
746                     // State hasn't been marked as accepting yet.  Do it now.
747                     sd->fAccepting = endMarker->fVal;
748                     if (sd->fAccepting == 0) {
749                         sd->fAccepting = -1;
750                     }
751                 }
752                 if (sd->fAccepting==-1 && endMarker->fVal != 0) {
753                     // Both lookahead and non-lookahead accepting for this state.
754                     // Favor the look-ahead.  Expedient for line break.
755                     // TODO:  need a more elegant resolution for conflicting rules.
756                     sd->fAccepting = endMarker->fVal;
757                 }
758                 // implicit else:
759                 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
760 
761                 // If the end marker node is from a look-ahead rule, set
762                 //   the fLookAhead field for this state also.
763                 if (endMarker->fLookAheadEnd) {
764                     // TODO:  don't change value if already set?
765                     // TODO:  allow for more than one active look-ahead rule in engine.
766                     //        Make value here an index to a side array in engine?
767                     sd->fLookAhead = sd->fAccepting;
768                 }
769             }
770         }
771     }
772 }
773 
774 
775 //-----------------------------------------------------------------------------
776 //
777 //    flagLookAheadStates   Very similar to flagAcceptingStates, above.
778 //
779 //-----------------------------------------------------------------------------
flagLookAheadStates()780 void     RBBITableBuilder::flagLookAheadStates() {
781     if (U_FAILURE(*fStatus)) {
782         return;
783     }
784     UVector     lookAheadNodes(*fStatus);
785     RBBINode    *lookAheadNode;
786     int32_t     i;
787     int32_t     n;
788 
789     fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
790     if (U_FAILURE(*fStatus)) {
791         return;
792     }
793     for (i=0; i<lookAheadNodes.size(); i++) {
794         lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
795 
796         for (n=0; n<fDStates->size(); n++) {
797             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
798             if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
799                 sd->fLookAhead = lookAheadNode->fVal;
800             }
801         }
802     }
803 }
804 
805 
806 
807 
808 //-----------------------------------------------------------------------------
809 //
810 //    flagTaggedStates
811 //
812 //-----------------------------------------------------------------------------
flagTaggedStates()813 void     RBBITableBuilder::flagTaggedStates() {
814     if (U_FAILURE(*fStatus)) {
815         return;
816     }
817     UVector     tagNodes(*fStatus);
818     RBBINode    *tagNode;
819     int32_t     i;
820     int32_t     n;
821 
822     if (U_FAILURE(*fStatus)) {
823         return;
824     }
825     fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
826     if (U_FAILURE(*fStatus)) {
827         return;
828     }
829     for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
830         tagNode = (RBBINode *)tagNodes.elementAt(i);
831 
832         for (n=0; n<fDStates->size(); n++) {              //    For each state  s (row in the state table)
833             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
834             if (sd->fPositions->indexOf(tagNode) >= 0) {  //       if  s include the tag node t
835                 sortedAdd(&sd->fTagVals, tagNode->fVal);
836             }
837         }
838     }
839 }
840 
841 
842 
843 
844 //-----------------------------------------------------------------------------
845 //
846 //  mergeRuleStatusVals
847 //
848 //      Update the global table of rule status {tag} values
849 //      The rule builder has a global vector of status values that are common
850 //      for all tables.  Merge the ones from this table into the global set.
851 //
852 //-----------------------------------------------------------------------------
mergeRuleStatusVals()853 void  RBBITableBuilder::mergeRuleStatusVals() {
854     //
855     //  The basic outline of what happens here is this...
856     //
857     //    for each state in this state table
858     //       if the status tag list for this state is in the global statuses list
859     //           record where and
860     //           continue with the next state
861     //       else
862     //           add the tag list for this state to the global list.
863     //
864     int i;
865     int n;
866 
867     // Pre-set a single tag of {0} into the table.
868     //   We will need this as a default, for rule sets with no explicit tagging.
869     if (fRB->fRuleStatusVals->size() == 0) {
870         fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
871         fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus);  //   and our single status of zero
872     }
873 
874     //    For each state
875     for (n=0; n<fDStates->size(); n++) {
876         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
877         UVector *thisStatesTagValues = sd->fTagVals;
878         if (thisStatesTagValues == NULL) {
879             // No tag values are explicitly associated with this state.
880             //   Set the default tag value.
881             sd->fTagsIdx = 0;
882             continue;
883         }
884 
885         // There are tag(s) associated with this state.
886         //   fTagsIdx will be the index into the global tag list for this state's tag values.
887         //   Initial value of -1 flags that we haven't got it set yet.
888         sd->fTagsIdx = -1;
889         int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
890         int32_t  nextTagGroupStart = 0;
891 
892         // Loop runs once per group of tags in the global list
893         while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
894             thisTagGroupStart = nextTagGroupStart;
895             nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
896             if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
897                 // The number of tags for this state is different from
898                 //    the number of tags in this group from the global list.
899                 //    Continue with the next group from the global list.
900                 continue;
901             }
902             // The lengths match, go ahead and compare the actual tag values
903             //    between this state and the group from the global list.
904             for (i=0; i<thisStatesTagValues->size(); i++) {
905                 if (thisStatesTagValues->elementAti(i) !=
906                     fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
907                     // Mismatch.
908                     break;
909                 }
910             }
911 
912             if (i == thisStatesTagValues->size()) {
913                 // We found a set of tag values in the global list that match
914                 //   those for this state.  Use them.
915                 sd->fTagsIdx = thisTagGroupStart;
916                 break;
917             }
918         }
919 
920         if (sd->fTagsIdx == -1) {
921             // No suitable entry in the global tag list already.  Add one
922             sd->fTagsIdx = fRB->fRuleStatusVals->size();
923             fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
924             for (i=0; i<thisStatesTagValues->size(); i++) {
925                 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
926             }
927         }
928     }
929 }
930 
931 
932 
933 
934 
935 
936 
937 //-----------------------------------------------------------------------------
938 //
939 //  sortedAdd  Add a value to a vector of sorted values (ints).
940 //             Do not replicate entries; if the value is already there, do not
941 //                add a second one.
942 //             Lazily create the vector if it does not already exist.
943 //
944 //-----------------------------------------------------------------------------
sortedAdd(UVector ** vector,int32_t val)945 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
946     int32_t i;
947 
948     if (*vector == NULL) {
949         *vector = new UVector(*fStatus);
950     }
951     if (*vector == NULL || U_FAILURE(*fStatus)) {
952         return;
953     }
954     UVector *vec = *vector;
955     int32_t  vSize = vec->size();
956     for (i=0; i<vSize; i++) {
957         int32_t valAtI = vec->elementAti(i);
958         if (valAtI == val) {
959             // The value is already in the vector.  Don't add it again.
960             return;
961         }
962         if (valAtI > val) {
963             break;
964         }
965     }
966     vec->insertElementAt(val, i, *fStatus);
967 }
968 
969 
970 
971 //-----------------------------------------------------------------------------
972 //
973 //  setAdd     Set operation on UVector
974 //             dest = dest union source
975 //             Elements may only appear once and must be sorted.
976 //
977 //-----------------------------------------------------------------------------
setAdd(UVector * dest,UVector * source)978 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
979     int32_t destOriginalSize = dest->size();
980     int32_t sourceSize       = source->size();
981     int32_t di           = 0;
982     MaybeStackArray<void *, 16> destArray, sourceArray;  // Handle small cases without malloc
983     void **destPtr, **sourcePtr;
984     void **destLim, **sourceLim;
985 
986     if (destOriginalSize > destArray.getCapacity()) {
987         if (destArray.resize(destOriginalSize) == NULL) {
988             return;
989         }
990     }
991     destPtr = destArray.getAlias();
992     destLim = destPtr + destOriginalSize;  // destArray.getArrayLimit()?
993 
994     if (sourceSize > sourceArray.getCapacity()) {
995         if (sourceArray.resize(sourceSize) == NULL) {
996             return;
997         }
998     }
999     sourcePtr = sourceArray.getAlias();
1000     sourceLim = sourcePtr + sourceSize;  // sourceArray.getArrayLimit()?
1001 
1002     // Avoid multiple "get element" calls by getting the contents into arrays
1003     (void) dest->toArray(destPtr);
1004     (void) source->toArray(sourcePtr);
1005 
1006     dest->setSize(sourceSize+destOriginalSize, *fStatus);
1007 
1008     while (sourcePtr < sourceLim && destPtr < destLim) {
1009         if (*destPtr == *sourcePtr) {
1010             dest->setElementAt(*sourcePtr++, di++);
1011             destPtr++;
1012         }
1013         // This check is required for machines with segmented memory, like i5/OS.
1014         // Direct pointer comparison is not recommended.
1015         else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
1016             dest->setElementAt(*destPtr++, di++);
1017         }
1018         else { /* *sourcePtr < *destPtr */
1019             dest->setElementAt(*sourcePtr++, di++);
1020         }
1021     }
1022 
1023     // At most one of these two cleanup loops will execute
1024     while (destPtr < destLim) {
1025         dest->setElementAt(*destPtr++, di++);
1026     }
1027     while (sourcePtr < sourceLim) {
1028         dest->setElementAt(*sourcePtr++, di++);
1029     }
1030 
1031     dest->setSize(di, *fStatus);
1032 }
1033 
1034 
1035 
1036 //-----------------------------------------------------------------------------
1037 //
1038 //  setEqual    Set operation on UVector.
1039 //              Compare for equality.
1040 //              Elements must be sorted.
1041 //
1042 //-----------------------------------------------------------------------------
setEquals(UVector * a,UVector * b)1043 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
1044     return a->equals(*b);
1045 }
1046 
1047 
1048 //-----------------------------------------------------------------------------
1049 //
1050 //  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
1051 //                 for each node in the tree.
1052 //
1053 //-----------------------------------------------------------------------------
1054 #ifdef RBBI_DEBUG
printPosSets(RBBINode * n)1055 void RBBITableBuilder::printPosSets(RBBINode *n) {
1056     if (n==NULL) {
1057         return;
1058     }
1059     printf("\n");
1060     RBBINode::printNodeHeader();
1061     RBBINode::printNode(n);
1062     RBBIDebugPrintf("         Nullable:  %s\n", n->fNullable?"TRUE":"FALSE");
1063 
1064     RBBIDebugPrintf("         firstpos:  ");
1065     printSet(n->fFirstPosSet);
1066 
1067     RBBIDebugPrintf("         lastpos:   ");
1068     printSet(n->fLastPosSet);
1069 
1070     RBBIDebugPrintf("         followpos: ");
1071     printSet(n->fFollowPos);
1072 
1073     printPosSets(n->fLeftChild);
1074     printPosSets(n->fRightChild);
1075 }
1076 #endif
1077 
1078 //
1079 //    findDuplCharClassFrom()
1080 //
findDuplCharClassFrom(IntPair * categories)1081 bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
1082     int32_t numStates = fDStates->size();
1083     int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1084 
1085     for (; categories->first < numCols-1; categories->first++) {
1086         for (categories->second=categories->first+1; categories->second < numCols; categories->second++) {
1087             // Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates).
1088             uint16_t table_base = 0;
1089             uint16_t table_dupl = 1;
1090             for (int32_t state=0; state<numStates; state++) {
1091                 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1092                 table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
1093                 table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
1094                 if (table_base != table_dupl) {
1095                     break;
1096                 }
1097             }
1098             if (table_base == table_dupl) {
1099                 return true;
1100             }
1101         }
1102     }
1103     return false;
1104 }
1105 
1106 
1107 //
1108 //    removeColumn()
1109 //
removeColumn(int32_t column)1110 void RBBITableBuilder::removeColumn(int32_t column) {
1111     int32_t numStates = fDStates->size();
1112     for (int32_t state=0; state<numStates; state++) {
1113         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1114         U_ASSERT(column < sd->fDtran->size());
1115         sd->fDtran->removeElementAt(column);
1116     }
1117 }
1118 
1119 /*
1120  * findDuplicateState
1121  */
findDuplicateState(IntPair * states)1122 bool RBBITableBuilder::findDuplicateState(IntPair *states) {
1123     int32_t numStates = fDStates->size();
1124     int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1125 
1126     for (; states->first<numStates-1; states->first++) {
1127         RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first);
1128         for (states->second=states->first+1; states->second<numStates; states->second++) {
1129             RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second);
1130             if (firstSD->fAccepting != duplSD->fAccepting ||
1131                 firstSD->fLookAhead != duplSD->fLookAhead ||
1132                 firstSD->fTagsIdx   != duplSD->fTagsIdx) {
1133                 continue;
1134             }
1135             bool rowsMatch = true;
1136             for (int32_t col=0; col < numCols; ++col) {
1137                 int32_t firstVal = firstSD->fDtran->elementAti(col);
1138                 int32_t duplVal = duplSD->fDtran->elementAti(col);
1139                 if (!((firstVal == duplVal) ||
1140                         ((firstVal == states->first || firstVal == states->second) &&
1141                         (duplVal  == states->first || duplVal  == states->second)))) {
1142                     rowsMatch = false;
1143                     break;
1144                 }
1145             }
1146             if (rowsMatch) {
1147                 return true;
1148             }
1149         }
1150     }
1151     return false;
1152 }
1153 
1154 
findDuplicateSafeState(IntPair * states)1155 bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
1156     int32_t numStates = fSafeTable->size();
1157 
1158     for (; states->first<numStates-1; states->first++) {
1159         UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
1160         for (states->second=states->first+1; states->second<numStates; states->second++) {
1161             UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
1162             bool rowsMatch = true;
1163             int32_t numCols = firstRow->length();
1164             for (int32_t col=0; col < numCols; ++col) {
1165                 int32_t firstVal = firstRow->charAt(col);
1166                 int32_t duplVal = duplRow->charAt(col);
1167                 if (!((firstVal == duplVal) ||
1168                         ((firstVal == states->first || firstVal == states->second) &&
1169                         (duplVal  == states->first || duplVal  == states->second)))) {
1170                     rowsMatch = false;
1171                     break;
1172                 }
1173             }
1174             if (rowsMatch) {
1175                 return true;
1176             }
1177         }
1178     }
1179     return false;
1180 }
1181 
1182 
removeState(IntPair duplStates)1183 void RBBITableBuilder::removeState(IntPair duplStates) {
1184     const int32_t keepState = duplStates.first;
1185     const int32_t duplState = duplStates.second;
1186     U_ASSERT(keepState < duplState);
1187     U_ASSERT(duplState < fDStates->size());
1188 
1189     RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
1190     fDStates->removeElementAt(duplState);
1191     delete duplSD;
1192 
1193     int32_t numStates = fDStates->size();
1194     int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1195     for (int32_t state=0; state<numStates; ++state) {
1196         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1197         for (int32_t col=0; col<numCols; col++) {
1198             int32_t existingVal = sd->fDtran->elementAti(col);
1199             int32_t newVal = existingVal;
1200             if (existingVal == duplState) {
1201                 newVal = keepState;
1202             } else if (existingVal > duplState) {
1203                 newVal = existingVal - 1;
1204             }
1205             sd->fDtran->setElementAt(newVal, col);
1206         }
1207         if (sd->fAccepting == duplState) {
1208             sd->fAccepting = keepState;
1209         } else if (sd->fAccepting > duplState) {
1210             sd->fAccepting--;
1211         }
1212         if (sd->fLookAhead == duplState) {
1213             sd->fLookAhead = keepState;
1214         } else if (sd->fLookAhead > duplState) {
1215             sd->fLookAhead--;
1216         }
1217     }
1218 }
1219 
removeSafeState(IntPair duplStates)1220 void RBBITableBuilder::removeSafeState(IntPair duplStates) {
1221     const int32_t keepState = duplStates.first;
1222     const int32_t duplState = duplStates.second;
1223     U_ASSERT(keepState < duplState);
1224     U_ASSERT(duplState < fSafeTable->size());
1225 
1226     fSafeTable->removeElementAt(duplState);   // Note that fSafeTable has a deleter function
1227                                               // and will auto-delete the removed element.
1228     int32_t numStates = fSafeTable->size();
1229     for (int32_t state=0; state<numStates; ++state) {
1230         UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
1231         int32_t numCols = sd->length();
1232         for (int32_t col=0; col<numCols; col++) {
1233             int32_t existingVal = sd->charAt(col);
1234             int32_t newVal = existingVal;
1235             if (existingVal == duplState) {
1236                 newVal = keepState;
1237             } else if (existingVal > duplState) {
1238                 newVal = existingVal - 1;
1239             }
1240             sd->setCharAt(col, static_cast<char16_t>(newVal));
1241         }
1242     }
1243 }
1244 
1245 
1246 /*
1247  * RemoveDuplicateStates
1248  */
removeDuplicateStates()1249 int32_t RBBITableBuilder::removeDuplicateStates() {
1250     IntPair dupls = {3, 0};
1251     int32_t numStatesRemoved = 0;
1252 
1253     while (findDuplicateState(&dupls)) {
1254         // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
1255         removeState(dupls);
1256         ++numStatesRemoved;
1257     }
1258     return numStatesRemoved;
1259 }
1260 
1261 
1262 //-----------------------------------------------------------------------------
1263 //
1264 //   getTableSize()    Calculate the size of the runtime form of this
1265 //                     state transition table.
1266 //
1267 //-----------------------------------------------------------------------------
getTableSize() const1268 int32_t  RBBITableBuilder::getTableSize() const {
1269     int32_t    size = 0;
1270     int32_t    numRows;
1271     int32_t    numCols;
1272     int32_t    rowSize;
1273 
1274     if (fTree == NULL) {
1275         return 0;
1276     }
1277 
1278     size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
1279 
1280     numRows = fDStates->size();
1281     numCols = fRB->fSetBuilder->getNumCharCategories();
1282 
1283     rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
1284     size   += numRows * rowSize;
1285     return size;
1286 }
1287 
1288 
1289 //-----------------------------------------------------------------------------
1290 //
1291 //   exportTable()    export the state transition table in the format required
1292 //                    by the runtime engine.  getTableSize() bytes of memory
1293 //                    must be available at the output address "where".
1294 //
1295 //-----------------------------------------------------------------------------
exportTable(void * where)1296 void RBBITableBuilder::exportTable(void *where) {
1297     RBBIStateTable    *table = (RBBIStateTable *)where;
1298     uint32_t           state;
1299     int                col;
1300 
1301     if (U_FAILURE(*fStatus) || fTree == NULL) {
1302         return;
1303     }
1304 
1305     int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1306     if (catCount > 0x7fff ||
1307         fDStates->size() > 0x7fff) {
1308         *fStatus = U_BRK_INTERNAL_ERROR;
1309         return;
1310     }
1311 
1312     table->fRowLen    = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
1313     table->fNumStates = fDStates->size();
1314     table->fFlags     = 0;
1315     if (fRB->fLookAheadHardBreak) {
1316         table->fFlags  |= RBBI_LOOKAHEAD_HARD_BREAK;
1317     }
1318     if (fRB->fSetBuilder->sawBOF()) {
1319         table->fFlags  |= RBBI_BOF_REQUIRED;
1320     }
1321     table->fReserved  = 0;
1322 
1323     for (state=0; state<table->fNumStates; state++) {
1324         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1325         RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1326         U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
1327         U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
1328         row->fAccepting = (int16_t)sd->fAccepting;
1329         row->fLookAhead = (int16_t)sd->fLookAhead;
1330         row->fTagIdx    = (int16_t)sd->fTagsIdx;
1331         for (col=0; col<catCount; col++) {
1332             row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
1333         }
1334     }
1335 }
1336 
1337 
1338 /**
1339  *   Synthesize a safe state table from the main state table.
1340  */
buildSafeReverseTable(UErrorCode & status)1341 void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
1342     // The safe table creation has three steps:
1343 
1344     // 1. Identifiy pairs of character classes that are "safe." Safe means that boundaries
1345     // following the pair do not depend on context or state before the pair. To test
1346     // whether a pair is safe, run it through the main forward state table, starting
1347     // from each state. If the the final state is the same, no matter what the starting state,
1348     // the pair is safe.
1349     //
1350     // 2. Build a state table that recognizes the safe pairs. It's similar to their
1351     // forward table, with a column for each input character [class], and a row for
1352     // each state. Row 1 is the start state, and row 0 is the stop state. Initially
1353     // create an additional state for each input character category; being in
1354     // one of these states means that the character has been seen, and is potentially
1355     // the first of a pair. In each of these rows, the entry for the second character
1356     // of a safe pair is set to the stop state (0), indicating that a match was found.
1357     // All other table entries are set to the state corresponding the current input
1358     // character, allowing that charcter to be the of a start following pair.
1359     //
1360     // Because the safe rules are to be run in reverse, moving backwards in the text,
1361     // the first and second pair categories are swapped when building the table.
1362     //
1363     // 3. Compress the table. There are typically many rows (states) that are
1364     // equivalent - that have zeroes (match completed) in the same columns -
1365     // and can be folded together.
1366 
1367     // Each safe pair is stored as two UChars in the safePair string.
1368     UnicodeString safePairs;
1369 
1370     int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
1371     int32_t numStates = fDStates->size();
1372 
1373     for (int32_t c1=0; c1<numCharClasses; ++c1) {
1374         for (int32_t c2=0; c2 < numCharClasses; ++c2) {
1375             int32_t wantedEndState = -1;
1376             int32_t endState = 0;
1377             for (int32_t startState = 1; startState < numStates; ++startState) {
1378                 RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
1379                 int32_t s2 = startStateD->fDtran->elementAti(c1);
1380                 RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
1381                 endState = s2StateD->fDtran->elementAti(c2);
1382                 if (wantedEndState < 0) {
1383                     wantedEndState = endState;
1384                 } else {
1385                     if (wantedEndState != endState) {
1386                         break;
1387                     }
1388                 }
1389             }
1390             if (wantedEndState == endState) {
1391                 safePairs.append((char16_t)c1);
1392                 safePairs.append((char16_t)c2);
1393                 // printf("(%d, %d) ", c1, c2);
1394             }
1395         }
1396         // printf("\n");
1397     }
1398 
1399     // Populate the initial safe table.
1400     // The table as a whole is UVector<UnicodeString>
1401     // Each row is represented by a UnicodeString, being used as a Vector<int16>.
1402     // Row 0 is the stop state.
1403     // Row 1 is the start sate.
1404     // Row 2 and beyond are other states, initially one per char class, but
1405     //   after initial construction, many of the states will be combined, compacting the table.
1406     // The String holds the nextState data only. The four leading fields of a row, fAccepting,
1407     // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
1408 
1409     U_ASSERT(fSafeTable == nullptr);
1410     fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
1411     for (int32_t row=0; row<numCharClasses + 2; ++row) {
1412         fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
1413     }
1414 
1415     // From the start state, each input char class transitions to the state for that input.
1416     UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
1417     for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
1418         // Note: +2 for the start & stop state.
1419         startState.setCharAt(charClass, static_cast<char16_t>(charClass+2));
1420     }
1421 
1422     // Initially make every other state table row look like the start state row,
1423     for (int32_t row=2; row<numCharClasses+2; ++row) {
1424         UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
1425         rowState = startState;   // UnicodeString assignment, copies contents.
1426     }
1427 
1428     // Run through the safe pairs, set the next state to zero when pair has been seen.
1429     // Zero being the stop state, meaning we found a safe point.
1430     for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
1431         int32_t c1 = safePairs.charAt(pairIdx);
1432         int32_t c2 = safePairs.charAt(pairIdx + 1);
1433 
1434         UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
1435         rowState.setCharAt(c1, 0);
1436     }
1437 
1438     // Remove duplicate or redundant rows from the table.
1439     IntPair states = {1, 0};
1440     while (findDuplicateSafeState(&states)) {
1441         // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
1442         removeSafeState(states);
1443     }
1444 }
1445 
1446 
1447 //-----------------------------------------------------------------------------
1448 //
1449 //   getSafeTableSize()    Calculate the size of the runtime form of this
1450 //                         safe state table.
1451 //
1452 //-----------------------------------------------------------------------------
getSafeTableSize() const1453 int32_t  RBBITableBuilder::getSafeTableSize() const {
1454     int32_t    size = 0;
1455     int32_t    numRows;
1456     int32_t    numCols;
1457     int32_t    rowSize;
1458 
1459     if (fSafeTable == nullptr) {
1460         return 0;
1461     }
1462 
1463     size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
1464 
1465     numRows = fSafeTable->size();
1466     numCols = fRB->fSetBuilder->getNumCharCategories();
1467 
1468     rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
1469     size   += numRows * rowSize;
1470     return size;
1471 }
1472 
1473 
1474 //-----------------------------------------------------------------------------
1475 //
1476 //   exportSafeTable()   export the state transition table in the format required
1477 //                       by the runtime engine.  getTableSize() bytes of memory
1478 //                       must be available at the output address "where".
1479 //
1480 //-----------------------------------------------------------------------------
exportSafeTable(void * where)1481 void RBBITableBuilder::exportSafeTable(void *where) {
1482     RBBIStateTable    *table = (RBBIStateTable *)where;
1483     uint32_t           state;
1484     int                col;
1485 
1486     if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
1487         return;
1488     }
1489 
1490     int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1491     if (catCount > 0x7fff ||
1492             fSafeTable->size() > 0x7fff) {
1493         *fStatus = U_BRK_INTERNAL_ERROR;
1494         return;
1495     }
1496 
1497     table->fRowLen    = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
1498     table->fNumStates = fSafeTable->size();
1499     table->fFlags     = 0;
1500     table->fReserved  = 0;
1501 
1502     for (state=0; state<table->fNumStates; state++) {
1503         UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
1504         RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1505         row->fAccepting = 0;
1506         row->fLookAhead = 0;
1507         row->fTagIdx    = 0;
1508         row->fReserved  = 0;
1509         for (col=0; col<catCount; col++) {
1510             row->fNextState[col] = rowString->charAt(col);
1511         }
1512     }
1513 }
1514 
1515 
1516 
1517 
1518 //-----------------------------------------------------------------------------
1519 //
1520 //   printSet    Debug function.   Print the contents of a UVector
1521 //
1522 //-----------------------------------------------------------------------------
1523 #ifdef RBBI_DEBUG
printSet(UVector * s)1524 void RBBITableBuilder::printSet(UVector *s) {
1525     int32_t  i;
1526     for (i=0; i<s->size(); i++) {
1527         const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
1528         RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum);
1529     }
1530     RBBIDebugPrintf("\n");
1531 }
1532 #endif
1533 
1534 
1535 //-----------------------------------------------------------------------------
1536 //
1537 //   printStates    Debug Function.  Dump the fully constructed state transition table.
1538 //
1539 //-----------------------------------------------------------------------------
1540 #ifdef RBBI_DEBUG
printStates()1541 void RBBITableBuilder::printStates() {
1542     int     c;    // input "character"
1543     int     n;    // state number
1544 
1545     RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
1546     RBBIDebugPrintf("      | Acc  LA    Tag");
1547     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1548         RBBIDebugPrintf(" %2d", c);
1549     }
1550     RBBIDebugPrintf("\n");
1551     RBBIDebugPrintf("      |---------------");
1552     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1553         RBBIDebugPrintf("---");
1554     }
1555     RBBIDebugPrintf("\n");
1556 
1557     for (n=0; n<fDStates->size(); n++) {
1558         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1559         RBBIDebugPrintf("  %3d | " , n);
1560         RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
1561         for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1562             RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
1563         }
1564         RBBIDebugPrintf("\n");
1565     }
1566     RBBIDebugPrintf("\n\n");
1567 }
1568 #endif
1569 
1570 
1571 //-----------------------------------------------------------------------------
1572 //
1573 //   printSafeTable    Debug Function.  Dump the fully constructed safe table.
1574 //
1575 //-----------------------------------------------------------------------------
1576 #ifdef RBBI_DEBUG
printReverseTable()1577 void RBBITableBuilder::printReverseTable() {
1578     int     c;    // input "character"
1579     int     n;    // state number
1580 
1581     RBBIDebugPrintf("    Safe Reverse Table \n");
1582     if (fSafeTable == nullptr) {
1583         RBBIDebugPrintf("   --- nullptr ---\n");
1584         return;
1585     }
1586     RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
1587     RBBIDebugPrintf("      | Acc  LA    Tag");
1588     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1589         RBBIDebugPrintf(" %2d", c);
1590     }
1591     RBBIDebugPrintf("\n");
1592     RBBIDebugPrintf("      |---------------");
1593     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1594         RBBIDebugPrintf("---");
1595     }
1596     RBBIDebugPrintf("\n");
1597 
1598     for (n=0; n<fSafeTable->size(); n++) {
1599         UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
1600         RBBIDebugPrintf("  %3d | " , n);
1601         RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0);  // Accepting, LookAhead, Tags
1602         for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1603             RBBIDebugPrintf(" %2d", rowString->charAt(c));
1604         }
1605         RBBIDebugPrintf("\n");
1606     }
1607     RBBIDebugPrintf("\n\n");
1608 }
1609 #endif
1610 
1611 
1612 
1613 //-----------------------------------------------------------------------------
1614 //
1615 //   printRuleStatusTable    Debug Function.  Dump the common rule status table
1616 //
1617 //-----------------------------------------------------------------------------
1618 #ifdef RBBI_DEBUG
printRuleStatusTable()1619 void RBBITableBuilder::printRuleStatusTable() {
1620     int32_t  thisRecord = 0;
1621     int32_t  nextRecord = 0;
1622     int      i;
1623     UVector  *tbl = fRB->fRuleStatusVals;
1624 
1625     RBBIDebugPrintf("index |  tags \n");
1626     RBBIDebugPrintf("-------------------\n");
1627 
1628     while (nextRecord < tbl->size()) {
1629         thisRecord = nextRecord;
1630         nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1631         RBBIDebugPrintf("%4d   ", thisRecord);
1632         for (i=thisRecord+1; i<nextRecord; i++) {
1633             RBBIDebugPrintf("  %5d", tbl->elementAti(i));
1634         }
1635         RBBIDebugPrintf("\n");
1636     }
1637     RBBIDebugPrintf("\n\n");
1638 }
1639 #endif
1640 
1641 
1642 //-----------------------------------------------------------------------------
1643 //
1644 //   RBBIStateDescriptor     Methods.  This is a very struct-like class
1645 //                           Most access is directly to the fields.
1646 //
1647 //-----------------------------------------------------------------------------
1648 
RBBIStateDescriptor(int lastInputSymbol,UErrorCode * fStatus)1649 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1650     fMarked    = FALSE;
1651     fAccepting = 0;
1652     fLookAhead = 0;
1653     fTagsIdx   = 0;
1654     fTagVals   = NULL;
1655     fPositions = NULL;
1656     fDtran     = NULL;
1657 
1658     fDtran     = new UVector32(lastInputSymbol+1, *fStatus);
1659     if (U_FAILURE(*fStatus)) {
1660         return;
1661     }
1662     if (fDtran == NULL) {
1663         *fStatus = U_MEMORY_ALLOCATION_ERROR;
1664         return;
1665     }
1666     fDtran->setSize(lastInputSymbol+1);    // fDtran needs to be pre-sized.
1667                                            //   It is indexed by input symbols, and will
1668                                            //   hold  the next state number for each
1669                                            //   symbol.
1670 }
1671 
1672 
~RBBIStateDescriptor()1673 RBBIStateDescriptor::~RBBIStateDescriptor() {
1674     delete       fPositions;
1675     delete       fDtran;
1676     delete       fTagVals;
1677     fPositions = NULL;
1678     fDtran     = NULL;
1679     fTagVals   = NULL;
1680 }
1681 
1682 U_NAMESPACE_END
1683 
1684 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1685