1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 //  rbbirb.h
5 //
6 //  Copyright (C) 2002-2008, International Business Machines Corporation and others.
7 //  All Rights Reserved.
8 //
9 //  This file contains declarations for several classes from the
10 //    Rule Based Break Iterator rule builder.
11 //
12 
13 
14 #ifndef RBBIRB_H
15 #define RBBIRB_H
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_BREAK_ITERATION
20 
21 #include <utility>
22 
23 #include "unicode/uobject.h"
24 #include "unicode/rbbi.h"
25 #include "unicode/uniset.h"
26 #include "unicode/parseerr.h"
27 #include "uhash.h"
28 #include "uvector.h"
29 #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
30                              //    looks up references to $variables within a set.
31 
32 
33 U_NAMESPACE_BEGIN
34 
35 class               RBBIRuleScanner;
36 struct              RBBIRuleTableEl;
37 class               RBBISetBuilder;
38 class               RBBINode;
39 class               RBBITableBuilder;
40 
41 
42 
43 //--------------------------------------------------------------------------------
44 //
45 //   RBBISymbolTable.    Implements SymbolTable interface that is used by the
46 //                       UnicodeSet parser to resolve references to $variables.
47 //
48 //--------------------------------------------------------------------------------
49 class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one
50 public:                                       //   of these structs for each entry.
51     RBBISymbolTableEntry();
52     UnicodeString          key;
53     RBBINode               *val;
54     ~RBBISymbolTableEntry();
55 
56 private:
57     RBBISymbolTableEntry(const RBBISymbolTableEntry &other); // forbid copying of this class
58     RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other); // forbid copying of this class
59 };
60 
61 
62 class RBBISymbolTable : public UMemory, public SymbolTable {
63 private:
64     const UnicodeString      &fRules;
65     UHashtable               *fHashTable;
66     RBBIRuleScanner          *fRuleScanner;
67 
68     // These next two fields are part of the mechanism for passing references to
69     //   already-constructed UnicodeSets back to the UnicodeSet constructor
70     //   when the pattern includes $variable references.
71     const UnicodeString      ffffString;      // = "/uffff"
72     UnicodeSet              *fCachedSetLookup;
73 
74 public:
75     //  API inherited from class SymbolTable
76     virtual const UnicodeString*  lookup(const UnicodeString& s) const;
77     virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const;
78     virtual UnicodeString parseReference(const UnicodeString& text,
79                                          ParsePosition& pos, int32_t limit) const;
80 
81     //  Additional Functions
82     RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status);
83     virtual ~RBBISymbolTable();
84 
85     virtual RBBINode *lookupNode(const UnicodeString &key) const;
86     virtual void      addEntry  (const UnicodeString &key, RBBINode *val, UErrorCode &err);
87 
88 #ifdef RBBI_DEBUG
89     virtual void      rbbiSymtablePrint() const;
90 #else
91     // A do-nothing inline function for non-debug builds.  Member funcs can't be empty
92     //  or the call sites won't compile.
93     int32_t fFakeField;
94     #define rbbiSymtablePrint() fFakeField=0;
95 #endif
96 
97 private:
98     RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class
99     RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class
100 };
101 
102 
103 //--------------------------------------------------------------------------------
104 //
105 //  class RBBIRuleBuilder       The top-level class handling RBBI rule compiling.
106 //
107 //--------------------------------------------------------------------------------
108 class RBBIRuleBuilder : public UMemory {
109 public:
110 
111     //  Create a rule based break iterator from a set of rules.
112     //  This function is the main entry point into the rule builder.  The
113     //   public ICU API for creating RBBIs uses this function to do the actual work.
114     //
115     static BreakIterator * createRuleBasedBreakIterator( const UnicodeString    &rules,
116                                     UParseError      *parseError,
117                                     UErrorCode       &status);
118 
119 public:
120     // The "public" functions and data members that appear below are accessed
121     //  (and shared) by the various parts that make up the rule builder.  They
122     //  are NOT intended to be accessed by anything outside of the
123     //  rule builder implementation.
124     RBBIRuleBuilder(const UnicodeString  &rules,
125                     UParseError          *parseErr,
126                     UErrorCode           &status
127     );
128 
129     virtual    ~RBBIRuleBuilder();
130 
131     /**
132      *  Build the state tables and char class Trie from the source rules.
133      */
134     RBBIDataHeader  *build(UErrorCode &status);
135 
136 
137     /**
138      * Fold together redundant character classes (table columns) and
139      * redundant states (table rows). Done after initial table generation,
140      * before serializing the result.
141      */
142     void optimizeTables();
143 
144     char                          *fDebugEnv;        // controls debug trace output
145     UErrorCode                    *fStatus;          // Error reporting.  Keeping status
146     UParseError                   *fParseError;      //   here avoids passing it everywhere.
147     const UnicodeString           &fRules;           // The rule string that we are compiling
148     UnicodeString                 fStrippedRules;    // The rule string, with comments stripped.
149 
150     RBBIRuleScanner               *fScanner;         // The scanner.
151     RBBINode                      *fForwardTree;     // The parse trees, generated by the scanner,
152     RBBINode                      *fReverseTree;     //   then manipulated by subsequent steps.
153     RBBINode                      *fSafeFwdTree;
154     RBBINode                      *fSafeRevTree;
155 
156     RBBINode                      **fDefaultTree;    // For rules not qualified with a !
157                                                      //   the tree to which they belong to.
158 
159     UBool                         fChainRules;       // True for chained Unicode TR style rules.
160                                                      // False for traditional regexp rules.
161 
162     UBool                         fLBCMNoChain;      // True:  suppress chaining of rules on
163                                                      //   chars with LineBreak property == CM.
164 
165     UBool                         fLookAheadHardBreak;  // True:  Look ahead matches cause an
166                                                      // immediate break, no continuing for the
167                                                      // longest match.
168 
169     RBBISetBuilder                *fSetBuilder;      // Set and Character Category builder.
170     UVector                       *fUSetNodes;       // Vector of all uset nodes.
171 
172     RBBITableBuilder              *fForwardTable;    // State transition table, build time form.
173 
174     UVector                       *fRuleStatusVals;  // The values that can be returned
175                                                      //   from getRuleStatus().
176 
177     RBBIDataHeader                *flattenData();    // Create the flattened (runtime format)
178                                                      // data tables..
179 private:
180     RBBIRuleBuilder(const RBBIRuleBuilder &other); // forbid copying of this class
181     RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other); // forbid copying of this class
182 };
183 
184 
185 
186 
187 //----------------------------------------------------------------------------
188 //
189 //   RBBISetTableEl   is an entry in the hash table of UnicodeSets that have
190 //                    been encountered.  The val Node will be of nodetype uset
191 //                    and contain pointers to the actual UnicodeSets.
192 //                    The Key is the source string for initializing the set.
193 //
194 //                    The hash table is used to avoid creating duplicate
195 //                    unnamed (not $var references) UnicodeSets.
196 //
197 //                    Memory Management:
198 //                       The Hash Table owns these RBBISetTableEl structs and
199 //                            the key strings.  It does NOT own the val nodes.
200 //
201 //----------------------------------------------------------------------------
// Entry type for the hash table of UnicodeSets encountered while scanning rules.
struct RBBISetTableEl {
    // Source string used to initialize the set; this is the hash key and is
    //   owned by the hash table.
    UnicodeString *key;
    // Node of type uset for that source string; NOT owned by the hash table.
    RBBINode      *val;
};
206 
/**
 *   Two 32-bit ints bundled together; used for a pair of states or a pair
 *   of character classes.
 */
using IntPair = std::pair<int32_t, int32_t>;
211 
212 
213 //----------------------------------------------------------------------------
214 //
215 //   RBBIDebugPrintf    Printf equivalent, for debugging output.
216 //                      Conditional compilation of the implementation lets us
217 //                      get rid of the stdio dependency in environments where it
218 //                      is unavailable.
219 //
220 //----------------------------------------------------------------------------
#ifdef RBBI_DEBUG
// Debug builds: route the debug output macros straight to the stdio functions.
// NOTE(review): this #include appears between U_NAMESPACE_BEGIN and
//   U_NAMESPACE_END; if <stdio.h> has not already been included by the time
//   this header is processed, its declarations would be introduced inside
//   that region -- confirm this is harmless for the debug configuration.
#include <stdio.h>
#define RBBIDebugPrintf printf
#define RBBIDebugPuts puts
#else
// Non-debug builds: RBBIDebugPrintf is left undefined, so any use of it must
//   itself be guarded by RBBI_DEBUG.  RBBIDebugPuts(arg) expands to nothing.
#undef RBBIDebugPrintf
#define RBBIDebugPuts(arg)
#endif
229 
230 U_NAMESPACE_END
231 
232 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
233 
234 #endif
235 
236 
237 
238