1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 //  rbbirb.h
5 //
6 //  Copyright (C) 2002-2008, International Business Machines Corporation and others.
7 //  All Rights Reserved.
8 //
9 //  This file contains declarations for several classes from the
10 //    Rule Based Break Iterator rule builder.
11 //
12 
13 
14 #ifndef RBBIRB_H
15 #define RBBIRB_H
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_BREAK_ITERATION
20 
#include "unicode/uobject.h"
#include "unicode/rbbi.h"
#include "unicode/uniset.h"
#include "unicode/parseerr.h"
#include "uhash.h"
#include "uvector.h"
#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
                          //    looks up references to $variables within a set.

#ifdef RBBI_DEBUG
#include <stdio.h>        // printf / puts, used by RBBIDebugPrintf / RBBIDebugPuts.
                          //    Included here, ahead of U_NAMESPACE_BEGIN, so that the
                          //    C library declarations are not pulled into the ICU namespace.
#endif
29 
30 
31 
32 U_NAMESPACE_BEGIN
33 
34 class               RBBIRuleScanner;
35 struct              RBBIRuleTableEl;
36 class               RBBISetBuilder;
37 class               RBBINode;
38 class               RBBITableBuilder;
39 
40 
41 
42 //--------------------------------------------------------------------------------
43 //
44 //   RBBISymbolTable.    Implements SymbolTable interface that is used by the
45 //                       UnicodeSet parser to resolve references to $variables.
46 //
47 //--------------------------------------------------------------------------------
48 class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one
49 public:                                       //   of these structs for each entry.
50     RBBISymbolTableEntry();
51     UnicodeString          key;
52     RBBINode               *val;
53     ~RBBISymbolTableEntry();
54 
55 private:
56     RBBISymbolTableEntry(const RBBISymbolTableEntry &other); // forbid copying of this class
57     RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other); // forbid copying of this class
58 };
59 
60 
61 class RBBISymbolTable : public UMemory, public SymbolTable {
62 private:
63     const UnicodeString      &fRules;
64     UHashtable               *fHashTable;
65     RBBIRuleScanner          *fRuleScanner;
66 
67     // These next two fields are part of the mechanism for passing references to
68     //   already-constructed UnicodeSets back to the UnicodeSet constructor
69     //   when the pattern includes $variable references.
70     const UnicodeString      ffffString;      // = "/uffff"
71     UnicodeSet              *fCachedSetLookup;
72 
73 public:
74     //  API inherited from class SymbolTable
75     virtual const UnicodeString*  lookup(const UnicodeString& s) const;
76     virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const;
77     virtual UnicodeString parseReference(const UnicodeString& text,
78                                          ParsePosition& pos, int32_t limit) const;
79 
80     //  Additional Functions
81     RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status);
82     virtual ~RBBISymbolTable();
83 
84     virtual RBBINode *lookupNode(const UnicodeString &key) const;
85     virtual void      addEntry  (const UnicodeString &key, RBBINode *val, UErrorCode &err);
86 
87 #ifdef RBBI_DEBUG
88     virtual void      rbbiSymtablePrint() const;
89 #else
90     // A do-nothing inline function for non-debug builds.  Member funcs can't be empty
91     //  or the call sites won't compile.
92     int32_t fFakeField;
93     #define rbbiSymtablePrint() fFakeField=0;
94 #endif
95 
96 private:
97     RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class
98     RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class
99 };
100 
101 
102 //--------------------------------------------------------------------------------
103 //
104 //  class RBBIRuleBuilder       The top-level class handling RBBI rule compiling.
105 //
106 //--------------------------------------------------------------------------------
107 class RBBIRuleBuilder : public UMemory {
108 public:
109 
110     //  Create a rule based break iterator from a set of rules.
111     //  This function is the main entry point into the rule builder.  The
112     //   public ICU API for creating RBBIs uses this function to do the actual work.
113     //
114     static BreakIterator * createRuleBasedBreakIterator( const UnicodeString    &rules,
115                                     UParseError      *parseError,
116                                     UErrorCode       &status);
117 
118 public:
119     // The "public" functions and data members that appear below are accessed
120     //  (and shared) by the various parts that make up the rule builder.  They
121     //  are NOT intended to be accessed by anything outside of the
122     //  rule builder implementation.
123     RBBIRuleBuilder(const UnicodeString  &rules,
124                     UParseError          *parseErr,
125                     UErrorCode           &status
126         );
127 
128     virtual    ~RBBIRuleBuilder();
129     char                          *fDebugEnv;        // controls debug trace output
130     UErrorCode                    *fStatus;          // Error reporting.  Keeping status
131     UParseError                   *fParseError;      //   here avoids passing it everywhere.
132     const UnicodeString           &fRules;           // The rule string that we are compiling
133 
134     RBBIRuleScanner               *fScanner;         // The scanner.
135     RBBINode                      *fForwardTree;     // The parse trees, generated by the scanner,
136     RBBINode                      *fReverseTree;     //   then manipulated by subsequent steps.
137     RBBINode                      *fSafeFwdTree;
138     RBBINode                      *fSafeRevTree;
139 
140     RBBINode                      **fDefaultTree;    // For rules not qualified with a !
141                                                      //   the tree to which they belong to.
142 
143     UBool                         fChainRules;       // True for chained Unicode TR style rules.
144                                                      // False for traditional regexp rules.
145 
146     UBool                         fLBCMNoChain;      // True:  suppress chaining of rules on
147                                                      //   chars with LineBreak property == CM.
148 
149     UBool                         fLookAheadHardBreak;  // True:  Look ahead matches cause an
150                                                      // immediate break, no continuing for the
151                                                      // longest match.
152 
153     RBBISetBuilder                *fSetBuilder;      // Set and Character Category builder.
154     UVector                       *fUSetNodes;       // Vector of all uset nodes.
155 
156     RBBITableBuilder              *fForwardTables;   // State transition tables
157     RBBITableBuilder              *fReverseTables;
158     RBBITableBuilder              *fSafeFwdTables;
159     RBBITableBuilder              *fSafeRevTables;
160 
161     UVector                       *fRuleStatusVals;  // The values that can be returned
162                                                      //   from getRuleStatus().
163 
164     RBBIDataHeader                *flattenData();    // Create the flattened (runtime format)
165                                                      // data tables..
166 private:
167     RBBIRuleBuilder(const RBBIRuleBuilder &other); // forbid copying of this class
168     RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other); // forbid copying of this class
169 };
170 
171 
172 
173 
174 //----------------------------------------------------------------------------
175 //
176 //   RBBISetTableEl   is an entry in the hash table of UnicodeSets that have
177 //                    been encountered.  The val Node will be of nodetype uset
178 //                    and contain pointers to the actual UnicodeSets.
179 //                    The Key is the source string for initializing the set.
180 //
181 //                    The hash table is used to avoid creating duplicate
182 //                    unnamed (not $var references) UnicodeSets.
183 //
184 //                    Memory Management:
185 //                       The Hash Table owns these RBBISetTableEl structs and
186 //                            the key strings.  It does NOT own the val nodes.
187 //
188 //----------------------------------------------------------------------------
189 struct RBBISetTableEl {
190     UnicodeString *key;
191     RBBINode      *val;
192 };
193 
194 
195 //----------------------------------------------------------------------------
196 //
197 //   RBBIDebugPrintf    Printf equivalent, for debugging output.
198 //                      Conditional compilation of the implementation lets us
199 //                      get rid of the stdio dependency in environments where it
200 //                      is unavailable.
201 //
202 //----------------------------------------------------------------------------
203 #ifdef RBBI_DEBUG
204 #include <stdio.h>
205 #define RBBIDebugPrintf printf
206 #define RBBIDebugPuts puts
207 #else
208 #undef RBBIDebugPrintf
209 #define RBBIDebugPuts(arg)
210 #endif
211 
212 U_NAMESPACE_END
213 
214 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
215 
216 #endif
217 
218 
219 
220