1 /*
2 **********************************************************************
3 *   Copyright (c) 2002-2009, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7 //
8 //  rbbitblb.cpp
9 //
10 
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_BREAK_ITERATION
15 
16 #include "unicode/unistr.h"
17 #include "rbbitblb.h"
18 #include "rbbirb.h"
19 #include "rbbisetb.h"
20 #include "rbbidata.h"
21 #include "cstring.h"
22 #include "uassert.h"
23 #include "cmemory.h"
24 
25 U_NAMESPACE_BEGIN
26 
RBBITableBuilder(RBBIRuleBuilder * rb,RBBINode ** rootNode)27 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
28  fTree(*rootNode) {
29     fRB                 = rb;
30     fStatus             = fRB->fStatus;
31     UErrorCode status   = U_ZERO_ERROR;
32     fDStates            = new UVector(status);
33     if (U_FAILURE(*fStatus)) {
34         return;
35     }
36     if (U_FAILURE(status)) {
37         *fStatus = status;
38         return;
39     }
40     if (fDStates == NULL) {
41         *fStatus = U_MEMORY_ALLOCATION_ERROR;;
42     }
43 }
44 
45 
46 
~RBBITableBuilder()47 RBBITableBuilder::~RBBITableBuilder() {
48     int i;
49     for (i=0; i<fDStates->size(); i++) {
50         delete (RBBIStateDescriptor *)fDStates->elementAt(i);
51     }
52     delete   fDStates;
53 }
54 
55 
56 //-----------------------------------------------------------------------------
57 //
58 //   RBBITableBuilder::build  -  This is the main function for building the DFA state transtion
59 //                               table from the RBBI rules parse tree.
60 //
61 //-----------------------------------------------------------------------------
build()62 void  RBBITableBuilder::build() {
63 
64     if (U_FAILURE(*fStatus)) {
65         return;
66     }
67 
68     // If there were no rules, just return.  This situation can easily arise
69     //   for the reverse rules.
70     if (fTree==NULL) {
71         return;
72     }
73 
74     //
75     // Walk through the tree, replacing any references to $variables with a copy of the
76     //   parse tree for the substition expression.
77     //
78     fTree = fTree->flattenVariables();
79 #ifdef RBBI_DEBUG
80     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
81         RBBIDebugPuts("Parse tree after flattening variable references.");
82         fTree->printTree(TRUE);
83     }
84 #endif
85 
86     //
87     // If the rules contained any references to {bof}
88     //   add a {bof} <cat> <former root of tree> to the
89     //   tree.  Means that all matches must start out with the
90     //   {bof} fake character.
91     //
92     if (fRB->fSetBuilder->sawBOF()) {
93         RBBINode *bofTop    = new RBBINode(RBBINode::opCat);
94         RBBINode *bofLeaf   = new RBBINode(RBBINode::leafChar);
95         // Delete and exit if memory allocation failed.
96         if (bofTop == NULL || bofLeaf == NULL) {
97             *fStatus = U_MEMORY_ALLOCATION_ERROR;
98             delete bofTop;
99             delete bofLeaf;
100             return;
101         }
102         bofTop->fLeftChild  = bofLeaf;
103         bofTop->fRightChild = fTree;
104         bofLeaf->fParent    = bofTop;
105         bofLeaf->fVal       = 2;      // Reserved value for {bof}.
106         fTree               = bofTop;
107     }
108 
109     //
110     // Add a unique right-end marker to the expression.
111     //   Appears as a cat-node, left child being the original tree,
112     //   right child being the end marker.
113     //
114     RBBINode *cn = new RBBINode(RBBINode::opCat);
115     // Exit if memory allocation failed.
116     if (cn == NULL) {
117         *fStatus = U_MEMORY_ALLOCATION_ERROR;
118         return;
119     }
120     cn->fLeftChild = fTree;
121     fTree->fParent = cn;
122     cn->fRightChild = new RBBINode(RBBINode::endMark);
123     // Delete and exit if memory allocation failed.
124     if (cn->fRightChild == NULL) {
125         *fStatus = U_MEMORY_ALLOCATION_ERROR;
126         delete cn;
127         return;
128     }
129     cn->fRightChild->fParent = cn;
130     fTree = cn;
131 
132     //
133     //  Replace all references to UnicodeSets with the tree for the equivalent
134     //      expression.
135     //
136     fTree->flattenSets();
137 #ifdef RBBI_DEBUG
138     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
139         RBBIDebugPuts("Parse tree after flattening Unicode Set references.");
140         fTree->printTree(TRUE);
141     }
142 #endif
143 
144 
145     //
146     // calculate the functions nullable, firstpos, lastpos and followpos on
147     // nodes in the parse tree.
148     //    See the alogrithm description in Aho.
149     //    Understanding how this works by looking at the code alone will be
150     //       nearly impossible.
151     //
152     calcNullable(fTree);
153     calcFirstPos(fTree);
154     calcLastPos(fTree);
155     calcFollowPos(fTree);
156     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
157         RBBIDebugPuts("\n");
158         printPosSets(fTree);
159     }
160 
161     //
162     //  For "chained" rules, modify the followPos sets
163     //
164     if (fRB->fChainRules) {
165         calcChainedFollowPos(fTree);
166     }
167 
168     //
169     //  BOF (start of input) test fixup.
170     //
171     if (fRB->fSetBuilder->sawBOF()) {
172         bofFixup();
173     }
174 
175     //
176     // Build the DFA state transition tables.
177     //
178     buildStateTable();
179     flagAcceptingStates();
180     flagLookAheadStates();
181     flagTaggedStates();
182 
183     //
184     // Update the global table of rule status {tag} values
185     // The rule builder has a global vector of status values that are common
186     //    for all tables.  Merge the ones from this table into the global set.
187     //
188     mergeRuleStatusVals();
189 
190     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
191 }
192 
193 
194 
195 //-----------------------------------------------------------------------------
196 //
197 //   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
198 //
199 //-----------------------------------------------------------------------------
calcNullable(RBBINode * n)200 void RBBITableBuilder::calcNullable(RBBINode *n) {
201     if (n == NULL) {
202         return;
203     }
204     if (n->fType == RBBINode::setRef ||
205         n->fType == RBBINode::endMark ) {
206         // These are non-empty leaf node types.
207         n->fNullable = FALSE;
208         return;
209     }
210 
211     if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
212         // Lookahead marker node.  It's a leaf, so no recursion on children.
213         // It's nullable because it does not match any literal text from the input stream.
214         n->fNullable = TRUE;
215         return;
216     }
217 
218 
219     // The node is not a leaf.
220     //  Calculate nullable on its children.
221     calcNullable(n->fLeftChild);
222     calcNullable(n->fRightChild);
223 
224     // Apply functions from table 3.40 in Aho
225     if (n->fType == RBBINode::opOr) {
226         n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
227     }
228     else if (n->fType == RBBINode::opCat) {
229         n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
230     }
231     else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
232         n->fNullable = TRUE;
233     }
234     else {
235         n->fNullable = FALSE;
236     }
237 }
238 
239 
240 
241 
242 //-----------------------------------------------------------------------------
243 //
244 //   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
245 //
246 //-----------------------------------------------------------------------------
calcFirstPos(RBBINode * n)247 void RBBITableBuilder::calcFirstPos(RBBINode *n) {
248     if (n == NULL) {
249         return;
250     }
251     if (n->fType == RBBINode::leafChar  ||
252         n->fType == RBBINode::endMark   ||
253         n->fType == RBBINode::lookAhead ||
254         n->fType == RBBINode::tag) {
255         // These are non-empty leaf node types.
256         // Note: In order to maintain the sort invariant on the set,
257         // this function should only be called on a node whose set is
258         // empty to start with.
259         n->fFirstPosSet->addElement(n, *fStatus);
260         return;
261     }
262 
263     // The node is not a leaf.
264     //  Calculate firstPos on its children.
265     calcFirstPos(n->fLeftChild);
266     calcFirstPos(n->fRightChild);
267 
268     // Apply functions from table 3.40 in Aho
269     if (n->fType == RBBINode::opOr) {
270         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
271         setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
272     }
273     else if (n->fType == RBBINode::opCat) {
274         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
275         if (n->fLeftChild->fNullable) {
276             setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
277         }
278     }
279     else if (n->fType == RBBINode::opStar ||
280              n->fType == RBBINode::opQuestion ||
281              n->fType == RBBINode::opPlus) {
282         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
283     }
284 }
285 
286 
287 
288 //-----------------------------------------------------------------------------
289 //
290 //   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
291 //
292 //-----------------------------------------------------------------------------
calcLastPos(RBBINode * n)293 void RBBITableBuilder::calcLastPos(RBBINode *n) {
294     if (n == NULL) {
295         return;
296     }
297     if (n->fType == RBBINode::leafChar  ||
298         n->fType == RBBINode::endMark   ||
299         n->fType == RBBINode::lookAhead ||
300         n->fType == RBBINode::tag) {
301         // These are non-empty leaf node types.
302         // Note: In order to maintain the sort invariant on the set,
303         // this function should only be called on a node whose set is
304         // empty to start with.
305         n->fLastPosSet->addElement(n, *fStatus);
306         return;
307     }
308 
309     // The node is not a leaf.
310     //  Calculate lastPos on its children.
311     calcLastPos(n->fLeftChild);
312     calcLastPos(n->fRightChild);
313 
314     // Apply functions from table 3.40 in Aho
315     if (n->fType == RBBINode::opOr) {
316         setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
317         setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
318     }
319     else if (n->fType == RBBINode::opCat) {
320         setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
321         if (n->fRightChild->fNullable) {
322             setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
323         }
324     }
325     else if (n->fType == RBBINode::opStar     ||
326              n->fType == RBBINode::opQuestion ||
327              n->fType == RBBINode::opPlus) {
328         setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
329     }
330 }
331 
332 
333 
334 //-----------------------------------------------------------------------------
335 //
336 //   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
337 //
338 //-----------------------------------------------------------------------------
calcFollowPos(RBBINode * n)339 void RBBITableBuilder::calcFollowPos(RBBINode *n) {
340     if (n == NULL ||
341         n->fType == RBBINode::leafChar ||
342         n->fType == RBBINode::endMark) {
343         return;
344     }
345 
346     calcFollowPos(n->fLeftChild);
347     calcFollowPos(n->fRightChild);
348 
349     // Aho rule #1
350     if (n->fType == RBBINode::opCat) {
351         RBBINode *i;   // is 'i' in Aho's description
352         uint32_t     ix;
353 
354         UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
355 
356         for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
357             i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
358             setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
359         }
360     }
361 
362     // Aho rule #2
363     if (n->fType == RBBINode::opStar ||
364         n->fType == RBBINode::opPlus) {
365         RBBINode   *i;  // again, n and i are the names from Aho's description.
366         uint32_t    ix;
367 
368         for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
369             i = (RBBINode *)n->fLastPosSet->elementAt(ix);
370             setAdd(i->fFollowPos, n->fFirstPosSet);
371         }
372     }
373 
374 
375 
376 }
377 
378 
379 //-----------------------------------------------------------------------------
380 //
381 //   calcChainedFollowPos.    Modify the previously calculated followPos sets
382 //                            to implement rule chaining.  NOT described by Aho
383 //
384 //-----------------------------------------------------------------------------
calcChainedFollowPos(RBBINode * tree)385 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
386 
387     UVector         endMarkerNodes(*fStatus);
388     UVector         leafNodes(*fStatus);
389     int32_t         i;
390 
391     if (U_FAILURE(*fStatus)) {
392         return;
393     }
394 
395     // get a list of all endmarker nodes.
396     tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
397 
398     // get a list all leaf nodes
399     tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
400     if (U_FAILURE(*fStatus)) {
401         return;
402     }
403 
404     // Get all nodes that can be the start a match, which is FirstPosition()
405     // of the portion of the tree corresponding to user-written rules.
406     // See the tree description in bofFixup().
407     RBBINode *userRuleRoot = tree;
408     if (fRB->fSetBuilder->sawBOF()) {
409         userRuleRoot = tree->fLeftChild->fRightChild;
410     }
411     U_ASSERT(userRuleRoot != NULL);
412     UVector *matchStartNodes = userRuleRoot->fFirstPosSet;
413 
414 
415     // Iteratate over all leaf nodes,
416     //
417     int32_t  endNodeIx;
418     int32_t  startNodeIx;
419 
420     for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
421         RBBINode *tNode   = (RBBINode *)leafNodes.elementAt(endNodeIx);
422         RBBINode *endNode = NULL;
423 
424         // Identify leaf nodes that correspond to overall rule match positions.
425         //   These include an endMarkerNode in their followPos sets.
426         for (i=0; i<endMarkerNodes.size(); i++) {
427             if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
428                 endNode = tNode;
429                 break;
430             }
431         }
432         if (endNode == NULL) {
433             // node wasn't an end node.  Try again with the next.
434             continue;
435         }
436 
437         // We've got a node that can end a match.
438 
439         // Line Break Specific hack:  If this node's val correspond to the $CM char class,
440         //                            don't chain from it.
441         // TODO:  Add rule syntax for this behavior, get specifics out of here and
442         //        into the rule file.
443         if (fRB->fLBCMNoChain) {
444             UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
445             if (c != -1) {
446                 // c == -1 occurs with sets containing only the {eof} marker string.
447                 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
448                 if (cLBProp == U_LB_COMBINING_MARK) {
449                     continue;
450                 }
451             }
452         }
453 
454 
455         // Now iterate over the nodes that can start a match, looking for ones
456         //   with the same char class as our ending node.
457         RBBINode *startNode;
458         for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
459             startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
460             if (startNode->fType != RBBINode::leafChar) {
461                 continue;
462             }
463 
464             if (endNode->fVal == startNode->fVal) {
465                 // The end val (character class) of one possible match is the
466                 //   same as the start of another.
467 
468                 // Add all nodes from the followPos of the start node to the
469                 //  followPos set of the end node, which will have the effect of
470                 //  letting matches transition from a match state at endNode
471                 //  to the second char of a match starting with startNode.
472                 setAdd(endNode->fFollowPos, startNode->fFollowPos);
473             }
474         }
475     }
476 }
477 
478 
479 //-----------------------------------------------------------------------------
480 //
481 //   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
482 //                Do an swizzle similar to chaining, modifying the followPos set of
483 //                the bofNode to include the followPos nodes from other {bot} nodes
484 //                scattered through the tree.
485 //
486 //                This function has much in common with calcChainedFollowPos().
487 //
488 //-----------------------------------------------------------------------------
bofFixup()489 void RBBITableBuilder::bofFixup() {
490 
491     if (U_FAILURE(*fStatus)) {
492         return;
493     }
494 
495     //   The parse tree looks like this ...
496     //         fTree root  --->       <cat>
497     //                               /     \       .
498     //                            <cat>   <#end node>
499     //                           /     \  .
500     //                     <bofNode>   rest
501     //                               of tree
502     //
503     //    We will be adding things to the followPos set of the <bofNode>
504     //
505     RBBINode  *bofNode = fTree->fLeftChild->fLeftChild;
506     U_ASSERT(bofNode->fType == RBBINode::leafChar);
507     U_ASSERT(bofNode->fVal == 2);
508 
509     // Get all nodes that can be the start a match of the user-written rules
510     //  (excluding the fake bofNode)
511     //  We want the nodes that can start a match in the
512     //     part labeled "rest of tree"
513     //
514     UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
515 
516     RBBINode *startNode;
517     int       startNodeIx;
518     for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
519         startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
520         if (startNode->fType != RBBINode::leafChar) {
521             continue;
522         }
523 
524         if (startNode->fVal == bofNode->fVal) {
525             //  We found a leaf node corresponding to a {bof} that was
526             //    explicitly written into a rule.
527             //  Add everything from the followPos set of this node to the
528             //    followPos set of the fake bofNode at the start of the tree.
529             //
530             setAdd(bofNode->fFollowPos, startNode->fFollowPos);
531         }
532     }
533 }
534 
535 //-----------------------------------------------------------------------------
536 //
537 //   buildStateTable()    Determine the set of runtime DFA states and the
538 //                        transition tables for these states, by the algorithm
539 //                        of fig. 3.44 in Aho.
540 //
541 //                        Most of the comments are quotes of Aho's psuedo-code.
542 //
543 //-----------------------------------------------------------------------------
buildStateTable()544 void RBBITableBuilder::buildStateTable() {
545     if (U_FAILURE(*fStatus)) {
546         return;
547     }
548     RBBIStateDescriptor *failState;
549     // Set it to NULL to avoid uninitialized warning
550     RBBIStateDescriptor *initialState = NULL;
551     //
552     // Add a dummy state 0 - the stop state.  Not from Aho.
553     int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
554     failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
555     if (failState == NULL) {
556         *fStatus = U_MEMORY_ALLOCATION_ERROR;
557         goto ExitBuildSTdeleteall;
558     }
559     failState->fPositions = new UVector(*fStatus);
560     if (failState->fPositions == NULL) {
561         *fStatus = U_MEMORY_ALLOCATION_ERROR;
562     }
563     if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
564         goto ExitBuildSTdeleteall;
565     }
566     fDStates->addElement(failState, *fStatus);
567     if (U_FAILURE(*fStatus)) {
568         goto ExitBuildSTdeleteall;
569     }
570 
571     // initially, the only unmarked state in Dstates is firstpos(root),
572     //       where toot is the root of the syntax tree for (r)#;
573     initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
574     if (initialState == NULL) {
575         *fStatus = U_MEMORY_ALLOCATION_ERROR;
576     }
577     if (U_FAILURE(*fStatus)) {
578         goto ExitBuildSTdeleteall;
579     }
580     initialState->fPositions = new UVector(*fStatus);
581     if (initialState->fPositions == NULL) {
582         *fStatus = U_MEMORY_ALLOCATION_ERROR;
583     }
584     if (U_FAILURE(*fStatus)) {
585         goto ExitBuildSTdeleteall;
586     }
587     setAdd(initialState->fPositions, fTree->fFirstPosSet);
588     fDStates->addElement(initialState, *fStatus);
589     if (U_FAILURE(*fStatus)) {
590         goto ExitBuildSTdeleteall;
591     }
592 
593     // while there is an unmarked state T in Dstates do begin
594     for (;;) {
595         RBBIStateDescriptor *T = NULL;
596         int32_t              tx;
597         for (tx=1; tx<fDStates->size(); tx++) {
598             RBBIStateDescriptor *temp;
599             temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
600             if (temp->fMarked == FALSE) {
601                 T = temp;
602                 break;
603             }
604         }
605         if (T == NULL) {
606             break;
607         }
608 
609         // mark T;
610         T->fMarked = TRUE;
611 
612         // for each input symbol a do begin
613         int32_t  a;
614         for (a = 1; a<=lastInputSymbol; a++) {
615             // let U be the set of positions that are in followpos(p)
616             //    for some position p in T
617             //    such that the symbol at position p is a;
618             UVector    *U = NULL;
619             RBBINode   *p;
620             int32_t     px;
621             for (px=0; px<T->fPositions->size(); px++) {
622                 p = (RBBINode *)T->fPositions->elementAt(px);
623                 if ((p->fType == RBBINode::leafChar) &&  (p->fVal == a)) {
624                     if (U == NULL) {
625                         U = new UVector(*fStatus);
626                         if (U == NULL) {
627                         	*fStatus = U_MEMORY_ALLOCATION_ERROR;
628                         	goto ExitBuildSTdeleteall;
629                         }
630                     }
631                     setAdd(U, p->fFollowPos);
632                 }
633             }
634 
635             // if U is not empty and not in DStates then
636             int32_t  ux = 0;
637             UBool    UinDstates = FALSE;
638             if (U != NULL) {
639                 U_ASSERT(U->size() > 0);
640                 int  ix;
641                 for (ix=0; ix<fDStates->size(); ix++) {
642                     RBBIStateDescriptor *temp2;
643                     temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
644                     if (setEquals(U, temp2->fPositions)) {
645                         delete U;
646                         U  = temp2->fPositions;
647                         ux = ix;
648                         UinDstates = TRUE;
649                         break;
650                     }
651                 }
652 
653                 // Add U as an unmarked state to Dstates
654                 if (!UinDstates)
655                 {
656                     RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
657                     if (newState == NULL) {
658                     	*fStatus = U_MEMORY_ALLOCATION_ERROR;
659                     }
660                     if (U_FAILURE(*fStatus)) {
661                         goto ExitBuildSTdeleteall;
662                     }
663                     newState->fPositions = U;
664                     fDStates->addElement(newState, *fStatus);
665                     if (U_FAILURE(*fStatus)) {
666                         return;
667                     }
668                     ux = fDStates->size()-1;
669                 }
670 
671                 // Dtran[T, a] := U;
672                 T->fDtran->setElementAt(ux, a);
673             }
674         }
675     }
676     return;
677     // delete local pointers only if error occured.
678 ExitBuildSTdeleteall:
679     delete initialState;
680     delete failState;
681 }
682 
683 
684 
685 //-----------------------------------------------------------------------------
686 //
687 //   flagAcceptingStates    Identify accepting states.
688 //                          First get a list of all of the end marker nodes.
689 //                          Then, for each state s,
690 //                              if s contains one of the end marker nodes in its list of tree positions then
691 //                                  s is an accepting state.
692 //
693 //-----------------------------------------------------------------------------
flagAcceptingStates()694 void     RBBITableBuilder::flagAcceptingStates() {
695     if (U_FAILURE(*fStatus)) {
696         return;
697     }
698     UVector     endMarkerNodes(*fStatus);
699     RBBINode    *endMarker;
700     int32_t     i;
701     int32_t     n;
702 
703     if (U_FAILURE(*fStatus)) {
704         return;
705     }
706 
707     fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
708     if (U_FAILURE(*fStatus)) {
709         return;
710     }
711 
712     for (i=0; i<endMarkerNodes.size(); i++) {
713         endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
714         for (n=0; n<fDStates->size(); n++) {
715             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
716             if (sd->fPositions->indexOf(endMarker) >= 0) {
717                 // Any non-zero value for fAccepting means this is an accepting node.
718                 // The value is what will be returned to the user as the break status.
719                 // If no other value was specified, force it to -1.
720 
721                 if (sd->fAccepting==0) {
722                     // State hasn't been marked as accepting yet.  Do it now.
723                     sd->fAccepting = endMarker->fVal;
724                     if (sd->fAccepting == 0) {
725                         sd->fAccepting = -1;
726                     }
727                 }
728                 if (sd->fAccepting==-1 && endMarker->fVal != 0) {
729                     // Both lookahead and non-lookahead accepting for this state.
730                     // Favor the look-ahead.  Expedient for line break.
731                     // TODO:  need a more elegant resolution for conflicting rules.
732                     sd->fAccepting = endMarker->fVal;
733                 }
734                 // implicit else:
735                 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
736 
737                 // If the end marker node is from a look-ahead rule, set
738                 //   the fLookAhead field or this state also.
739                 if (endMarker->fLookAheadEnd) {
740                     // TODO:  don't change value if already set?
741                     // TODO:  allow for more than one active look-ahead rule in engine.
742                     //        Make value here an index to a side array in engine?
743                     sd->fLookAhead = sd->fAccepting;
744                 }
745             }
746         }
747     }
748 }
749 
750 
751 //-----------------------------------------------------------------------------
752 //
753 //    flagLookAheadStates   Very similar to flagAcceptingStates, above.
754 //
755 //-----------------------------------------------------------------------------
flagLookAheadStates()756 void     RBBITableBuilder::flagLookAheadStates() {
757     if (U_FAILURE(*fStatus)) {
758         return;
759     }
760     UVector     lookAheadNodes(*fStatus);
761     RBBINode    *lookAheadNode;
762     int32_t     i;
763     int32_t     n;
764 
765     fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
766     if (U_FAILURE(*fStatus)) {
767         return;
768     }
769     for (i=0; i<lookAheadNodes.size(); i++) {
770         lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
771 
772         for (n=0; n<fDStates->size(); n++) {
773             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
774             if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
775                 sd->fLookAhead = lookAheadNode->fVal;
776             }
777         }
778     }
779 }
780 
781 
782 
783 
784 //-----------------------------------------------------------------------------
785 //
786 //    flagTaggedStates
787 //
788 //-----------------------------------------------------------------------------
flagTaggedStates()789 void     RBBITableBuilder::flagTaggedStates() {
790     if (U_FAILURE(*fStatus)) {
791         return;
792     }
793     UVector     tagNodes(*fStatus);
794     RBBINode    *tagNode;
795     int32_t     i;
796     int32_t     n;
797 
798     if (U_FAILURE(*fStatus)) {
799         return;
800     }
801     fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
802     if (U_FAILURE(*fStatus)) {
803         return;
804     }
805     for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
806         tagNode = (RBBINode *)tagNodes.elementAt(i);
807 
808         for (n=0; n<fDStates->size(); n++) {              //    For each state  s (row in the state table)
809             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
810             if (sd->fPositions->indexOf(tagNode) >= 0) {  //       if  s include the tag node t
811                 sortedAdd(&sd->fTagVals, tagNode->fVal);
812             }
813         }
814     }
815 }
816 
817 
818 
819 
820 //-----------------------------------------------------------------------------
821 //
822 //  mergeRuleStatusVals
823 //
824 //      Update the global table of rule status {tag} values
825 //      The rule builder has a global vector of status values that are common
826 //      for all tables.  Merge the ones from this table into the global set.
827 //
828 //-----------------------------------------------------------------------------
mergeRuleStatusVals()829 void  RBBITableBuilder::mergeRuleStatusVals() {
830     //
831     //  The basic outline of what happens here is this...
832     //
833     //    for each state in this state table
834     //       if the status tag list for this state is in the global statuses list
835     //           record where and
836     //           continue with the next state
837     //       else
838     //           add the tag list for this state to the global list.
839     //
840     int i;
841     int n;
842 
843     // Pre-set a single tag of {0} into the table.
844     //   We will need this as a default, for rule sets with no explicit tagging.
845     if (fRB->fRuleStatusVals->size() == 0) {
846         fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
847         fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus);  //   and our single status of zero
848     }
849 
850     //    For each state
851     for (n=0; n<fDStates->size(); n++) {
852         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
853         UVector *thisStatesTagValues = sd->fTagVals;
854         if (thisStatesTagValues == NULL) {
855             // No tag values are explicitly associated with this state.
856             //   Set the default tag value.
857             sd->fTagsIdx = 0;
858             continue;
859         }
860 
861         // There are tag(s) associated with this state.
862         //   fTagsIdx will be the index into the global tag list for this state's tag values.
863         //   Initial value of -1 flags that we haven't got it set yet.
864         sd->fTagsIdx = -1;
865         int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
866         int32_t  nextTagGroupStart = 0;
867 
868         // Loop runs once per group of tags in the global list
869         while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
870             thisTagGroupStart = nextTagGroupStart;
871             nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
872             if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
873                 // The number of tags for this state is different from
874                 //    the number of tags in this group from the global list.
875                 //    Continue with the next group from the global list.
876                 continue;
877             }
878             // The lengths match, go ahead and compare the actual tag values
879             //    between this state and the group from the global list.
880             for (i=0; i<thisStatesTagValues->size(); i++) {
881                 if (thisStatesTagValues->elementAti(i) !=
882                     fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
883                     // Mismatch.
884                     break;
885                 }
886             }
887 
888             if (i == thisStatesTagValues->size()) {
889                 // We found a set of tag values in the global list that match
890                 //   those for this state.  Use them.
891                 sd->fTagsIdx = thisTagGroupStart;
892                 break;
893             }
894         }
895 
896         if (sd->fTagsIdx == -1) {
897             // No suitable entry in the global tag list already.  Add one
898             sd->fTagsIdx = fRB->fRuleStatusVals->size();
899             fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
900             for (i=0; i<thisStatesTagValues->size(); i++) {
901                 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
902             }
903         }
904     }
905 }
906 
907 
908 
909 
910 
911 
912 
913 //-----------------------------------------------------------------------------
914 //
915 //  sortedAdd  Add a value to a vector of sorted values (ints).
916 //             Do not replicate entries; if the value is already there, do not
917 //                add a second one.
918 //             Lazily create the vector if it does not already exist.
919 //
920 //-----------------------------------------------------------------------------
sortedAdd(UVector ** vector,int32_t val)921 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
922     int32_t i;
923 
924     if (*vector == NULL) {
925         *vector = new UVector(*fStatus);
926     }
927     if (*vector == NULL || U_FAILURE(*fStatus)) {
928         return;
929     }
930     UVector *vec = *vector;
931     int32_t  vSize = vec->size();
932     for (i=0; i<vSize; i++) {
933         int32_t valAtI = vec->elementAti(i);
934         if (valAtI == val) {
935             // The value is already in the vector.  Don't add it again.
936             return;
937         }
938         if (valAtI > val) {
939             break;
940         }
941     }
942     vec->insertElementAt(val, i, *fStatus);
943 }
944 
945 
946 
947 //-----------------------------------------------------------------------------
948 //
949 //  setAdd     Set operation on UVector
950 //             dest = dest union source
951 //             Elements may only appear once and must be sorted.
952 //
953 //-----------------------------------------------------------------------------
setAdd(UVector * dest,UVector * source)954 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
955     int32_t destOriginalSize = dest->size();
956     int32_t sourceSize       = source->size();
957     int32_t di           = 0;
958     MaybeStackArray<void *, 16> destArray, sourceArray;  // Handle small cases without malloc
959     void **destPtr, **sourcePtr;
960     void **destLim, **sourceLim;
961 
962     if (destOriginalSize > destArray.getCapacity()) {
963         if (destArray.resize(destOriginalSize) == NULL) {
964             return;
965         }
966     }
967     destPtr = destArray.getAlias();
968     destLim = destPtr + destOriginalSize;  // destArray.getArrayLimit()?
969 
970     if (sourceSize > sourceArray.getCapacity()) {
971         if (sourceArray.resize(sourceSize) == NULL) {
972             return;
973         }
974     }
975     sourcePtr = sourceArray.getAlias();
976     sourceLim = sourcePtr + sourceSize;  // sourceArray.getArrayLimit()?
977 
978     // Avoid multiple "get element" calls by getting the contents into arrays
979     (void) dest->toArray(destPtr);
980     (void) source->toArray(sourcePtr);
981 
982     dest->setSize(sourceSize+destOriginalSize, *fStatus);
983 
984     while (sourcePtr < sourceLim && destPtr < destLim) {
985         if (*destPtr == *sourcePtr) {
986             dest->setElementAt(*sourcePtr++, di++);
987             destPtr++;
988         }
989         // This check is required for machines with segmented memory, like i5/OS.
990         // Direct pointer comparison is not recommended.
991         else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
992             dest->setElementAt(*destPtr++, di++);
993         }
994         else { /* *sourcePtr < *destPtr */
995             dest->setElementAt(*sourcePtr++, di++);
996         }
997     }
998 
999     // At most one of these two cleanup loops will execute
1000     while (destPtr < destLim) {
1001         dest->setElementAt(*destPtr++, di++);
1002     }
1003     while (sourcePtr < sourceLim) {
1004         dest->setElementAt(*sourcePtr++, di++);
1005     }
1006 
1007     dest->setSize(di, *fStatus);
1008 }
1009 
1010 
1011 
1012 //-----------------------------------------------------------------------------
1013 //
1014 //  setEqual    Set operation on UVector.
1015 //              Compare for equality.
1016 //              Elements must be sorted.
1017 //
1018 //-----------------------------------------------------------------------------
setEquals(UVector * a,UVector * b)1019 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
1020     return a->equals(*b);
1021 }
1022 
1023 
1024 //-----------------------------------------------------------------------------
1025 //
1026 //  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
1027 //                 for each node in the tree.
1028 //
1029 //-----------------------------------------------------------------------------
1030 #ifdef RBBI_DEBUG
printPosSets(RBBINode * n)1031 void RBBITableBuilder::printPosSets(RBBINode *n) {
1032     if (n==NULL) {
1033         return;
1034     }
1035     n->printNode();
1036     RBBIDebugPrintf("         Nullable:  %s\n", n->fNullable?"TRUE":"FALSE");
1037 
1038     RBBIDebugPrintf("         firstpos:  ");
1039     printSet(n->fFirstPosSet);
1040 
1041     RBBIDebugPrintf("         lastpos:   ");
1042     printSet(n->fLastPosSet);
1043 
1044     RBBIDebugPrintf("         followpos: ");
1045     printSet(n->fFollowPos);
1046 
1047     printPosSets(n->fLeftChild);
1048     printPosSets(n->fRightChild);
1049 }
1050 #endif
1051 
1052 
1053 
1054 //-----------------------------------------------------------------------------
1055 //
1056 //   getTableSize()    Calculate the size of the runtime form of this
1057 //                     state transition table.
1058 //
1059 //-----------------------------------------------------------------------------
getTableSize() const1060 int32_t  RBBITableBuilder::getTableSize() const {
1061     int32_t    size = 0;
1062     int32_t    numRows;
1063     int32_t    numCols;
1064     int32_t    rowSize;
1065 
1066     if (fTree == NULL) {
1067         return 0;
1068     }
1069 
1070     size    = sizeof(RBBIStateTable) - 4;    // The header, with no rows to the table.
1071 
1072     numRows = fDStates->size();
1073     numCols = fRB->fSetBuilder->getNumCharCategories();
1074 
1075     //  Note  The declaration of RBBIStateTableRow is for a table of two columns.
1076     //        Therefore we subtract two from numCols when determining
1077     //        how much storage to add to a row for the total columns.
1078     rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
1079     size   += numRows * rowSize;
1080     return size;
1081 }
1082 
1083 
1084 
1085 //-----------------------------------------------------------------------------
1086 //
1087 //   exportTable()    export the state transition table in the format required
1088 //                    by the runtime engine.  getTableSize() bytes of memory
1089 //                    must be available at the output address "where".
1090 //
1091 //-----------------------------------------------------------------------------
exportTable(void * where)1092 void RBBITableBuilder::exportTable(void *where) {
1093     RBBIStateTable    *table = (RBBIStateTable *)where;
1094     uint32_t           state;
1095     int                col;
1096 
1097     if (U_FAILURE(*fStatus) || fTree == NULL) {
1098         return;
1099     }
1100 
1101     if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff ||
1102         fDStates->size() > 0x7fff) {
1103         *fStatus = U_BRK_INTERNAL_ERROR;
1104         return;
1105     }
1106 
1107     table->fRowLen    = sizeof(RBBIStateTableRow) +
1108                             sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2);
1109     table->fNumStates = fDStates->size();
1110     table->fFlags     = 0;
1111     if (fRB->fLookAheadHardBreak) {
1112         table->fFlags  |= RBBI_LOOKAHEAD_HARD_BREAK;
1113     }
1114     if (fRB->fSetBuilder->sawBOF()) {
1115         table->fFlags  |= RBBI_BOF_REQUIRED;
1116     }
1117     table->fReserved  = 0;
1118 
1119     for (state=0; state<table->fNumStates; state++) {
1120         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1121         RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1122         U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
1123         U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
1124         row->fAccepting = (int16_t)sd->fAccepting;
1125         row->fLookAhead = (int16_t)sd->fLookAhead;
1126         row->fTagIdx    = (int16_t)sd->fTagsIdx;
1127         for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) {
1128             row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
1129         }
1130     }
1131 }
1132 
1133 
1134 
1135 //-----------------------------------------------------------------------------
1136 //
1137 //   printSet    Debug function.   Print the contents of a UVector
1138 //
1139 //-----------------------------------------------------------------------------
1140 #ifdef RBBI_DEBUG
printSet(UVector * s)1141 void RBBITableBuilder::printSet(UVector *s) {
1142     int32_t  i;
1143     for (i=0; i<s->size(); i++) {
1144         void *v = s->elementAt(i);
1145         RBBIDebugPrintf("%10p", v);
1146     }
1147     RBBIDebugPrintf("\n");
1148 }
1149 #endif
1150 
1151 
1152 //-----------------------------------------------------------------------------
1153 //
1154 //   printStates    Debug Function.  Dump the fully constructed state transition table.
1155 //
1156 //-----------------------------------------------------------------------------
1157 #ifdef RBBI_DEBUG
printStates()1158 void RBBITableBuilder::printStates() {
1159     int     c;    // input "character"
1160     int     n;    // state number
1161 
1162     RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
1163     RBBIDebugPrintf("      | Acc  LA    Tag");
1164     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1165         RBBIDebugPrintf(" %2d", c);
1166     }
1167     RBBIDebugPrintf("\n");
1168     RBBIDebugPrintf("      |---------------");
1169     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1170         RBBIDebugPrintf("---");
1171     }
1172     RBBIDebugPrintf("\n");
1173 
1174     for (n=0; n<fDStates->size(); n++) {
1175         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1176         RBBIDebugPrintf("  %3d | " , n);
1177         RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
1178         for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1179             RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
1180         }
1181         RBBIDebugPrintf("\n");
1182     }
1183     RBBIDebugPrintf("\n\n");
1184 }
1185 #endif
1186 
1187 
1188 
1189 //-----------------------------------------------------------------------------
1190 //
1191 //   printRuleStatusTable    Debug Function.  Dump the common rule status table
1192 //
1193 //-----------------------------------------------------------------------------
1194 #ifdef RBBI_DEBUG
printRuleStatusTable()1195 void RBBITableBuilder::printRuleStatusTable() {
1196     int32_t  thisRecord = 0;
1197     int32_t  nextRecord = 0;
1198     int      i;
1199     UVector  *tbl = fRB->fRuleStatusVals;
1200 
1201     RBBIDebugPrintf("index |  tags \n");
1202     RBBIDebugPrintf("-------------------\n");
1203 
1204     while (nextRecord < tbl->size()) {
1205         thisRecord = nextRecord;
1206         nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1207         RBBIDebugPrintf("%4d   ", thisRecord);
1208         for (i=thisRecord+1; i<nextRecord; i++) {
1209             RBBIDebugPrintf("  %5d", tbl->elementAti(i));
1210         }
1211         RBBIDebugPrintf("\n");
1212     }
1213     RBBIDebugPrintf("\n\n");
1214 }
1215 #endif
1216 
1217 
1218 //-----------------------------------------------------------------------------
1219 //
1220 //   RBBIStateDescriptor     Methods.  This is a very struct-like class
1221 //                           Most access is directly to the fields.
1222 //
1223 //-----------------------------------------------------------------------------
1224 
RBBIStateDescriptor(int lastInputSymbol,UErrorCode * fStatus)1225 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1226     fMarked    = FALSE;
1227     fAccepting = 0;
1228     fLookAhead = 0;
1229     fTagsIdx   = 0;
1230     fTagVals   = NULL;
1231     fPositions = NULL;
1232     fDtran     = NULL;
1233 
1234     fDtran     = new UVector(lastInputSymbol+1, *fStatus);
1235     if (U_FAILURE(*fStatus)) {
1236         return;
1237     }
1238     if (fDtran == NULL) {
1239         *fStatus = U_MEMORY_ALLOCATION_ERROR;
1240         return;
1241     }
1242     fDtran->setSize(lastInputSymbol+1, *fStatus);    // fDtran needs to be pre-sized.
1243                                            //   It is indexed by input symbols, and will
1244                                            //   hold  the next state number for each
1245                                            //   symbol.
1246 }
1247 
1248 
~RBBIStateDescriptor()1249 RBBIStateDescriptor::~RBBIStateDescriptor() {
1250     delete       fPositions;
1251     delete       fDtran;
1252     delete       fTagVals;
1253     fPositions = NULL;
1254     fDtran     = NULL;
1255     fTagVals   = NULL;
1256 }
1257 
1258 U_NAMESPACE_END
1259 
1260 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1261