1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 // 4 // rbbirb.h 5 // 6 // Copyright (C) 2002-2008, International Business Machines Corporation and others. 7 // All Rights Reserved. 8 // 9 // This file contains declarations for several classes from the 10 // Rule Based Break Iterator rule builder. 11 // 12 13 14 #ifndef RBBIRB_H 15 #define RBBIRB_H 16 17 #include "unicode/utypes.h" 18 #include "unicode/uobject.h" 19 #include "unicode/rbbi.h" 20 #include "unicode/uniset.h" 21 #include "unicode/parseerr.h" 22 #include "uhash.h" 23 #include "uvector.h" 24 #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that 25 // looks up references to $variables within a set. 26 27 28 29 U_NAMESPACE_BEGIN 30 31 class RBBIRuleScanner; 32 struct RBBIRuleTableEl; 33 class RBBISetBuilder; 34 class RBBINode; 35 class RBBITableBuilder; 36 37 38 39 //-------------------------------------------------------------------------------- 40 // 41 // RBBISymbolTable. Implements SymbolTable interface that is used by the 42 // UnicodeSet parser to resolve references to $variables. 43 // 44 //-------------------------------------------------------------------------------- 45 class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one 46 public: // of these structs for each entry. 47 RBBISymbolTableEntry(); 48 UnicodeString key; 49 RBBINode *val; 50 ~RBBISymbolTableEntry(); 51 52 private: 53 RBBISymbolTableEntry(const RBBISymbolTableEntry &other); // forbid copying of this class 54 RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other); // forbid copying of this class 55 }; 56 57 58 class RBBISymbolTable : public UMemory, public SymbolTable { 59 private: 60 const UnicodeString &fRules; 61 UHashtable *fHashTable; 62 RBBIRuleScanner *fRuleScanner; 63 64 // These next two fields are part of the mechanism for passing references to 65 // already-constructed UnicodeSets back to the UnicodeSet constructor 66 // when the pattern includes $variable references. 67 const UnicodeString ffffString; // = "/uffff" 68 UnicodeSet *fCachedSetLookup; 69 70 public: 71 // API inherited from class SymbolTable 72 virtual const UnicodeString* lookup(const UnicodeString& s) const; 73 virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const; 74 virtual UnicodeString parseReference(const UnicodeString& text, 75 ParsePosition& pos, int32_t limit) const; 76 77 // Additional Functions 78 RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status); 79 virtual ~RBBISymbolTable(); 80 81 virtual RBBINode *lookupNode(const UnicodeString &key) const; 82 virtual void addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err); 83 84 #ifdef RBBI_DEBUG 85 virtual void rbbiSymtablePrint() const; 86 #else 87 // A do-nothing inline function for non-debug builds. Member funcs can't be empty 88 // or the call sites won't compile. 89 int32_t fFakeField; 90 #define rbbiSymtablePrint() fFakeField=0; 91 #endif 92 93 private: 94 RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class 95 RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class 96 }; 97 98 99 //-------------------------------------------------------------------------------- 100 // 101 // class RBBIRuleBuilder The top-level class handling RBBI rule compiling. 102 // 103 //-------------------------------------------------------------------------------- 104 class RBBIRuleBuilder : public UMemory { 105 public: 106 107 // Create a rule based break iterator from a set of rules. 108 // This function is the main entry point into the rule builder. The 109 // public ICU API for creating RBBIs uses this function to do the actual work. 110 // 111 static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules, 112 UParseError *parseError, 113 UErrorCode &status); 114 115 public: 116 // The "public" functions and data members that appear below are accessed 117 // (and shared) by the various parts that make up the rule builder. They 118 // are NOT intended to be accessed by anything outside of the 119 // rule builder implementation. 120 RBBIRuleBuilder(const UnicodeString &rules, 121 UParseError *parseErr, 122 UErrorCode &status 123 ); 124 125 virtual ~RBBIRuleBuilder(); 126 char *fDebugEnv; // controls debug trace output 127 UErrorCode *fStatus; // Error reporting. Keeping status 128 UParseError *fParseError; // here avoids passing it everywhere. 129 const UnicodeString &fRules; // The rule string that we are compiling 130 131 RBBIRuleScanner *fScanner; // The scanner. 132 RBBINode *fForwardTree; // The parse trees, generated by the scanner, 133 RBBINode *fReverseTree; // then manipulated by subsequent steps. 134 RBBINode *fSafeFwdTree; 135 RBBINode *fSafeRevTree; 136 137 RBBINode **fDefaultTree; // For rules not qualified with a ! 138 // the tree to which they belong to. 139 140 UBool fChainRules; // True for chained Unicode TR style rules. 141 // False for traditional regexp rules. 142 143 UBool fLBCMNoChain; // True: suppress chaining of rules on 144 // chars with LineBreak property == CM. 145 146 UBool fLookAheadHardBreak; // True: Look ahead matches cause an 147 // immediate break, no continuing for the 148 // longest match. 149 150 RBBISetBuilder *fSetBuilder; // Set and Character Category builder. 151 UVector *fUSetNodes; // Vector of all uset nodes. 152 153 RBBITableBuilder *fForwardTables; // State transition tables 154 RBBITableBuilder *fReverseTables; 155 RBBITableBuilder *fSafeFwdTables; 156 RBBITableBuilder *fSafeRevTables; 157 158 UVector *fRuleStatusVals; // The values that can be returned 159 // from getRuleStatus(). 160 161 RBBIDataHeader *flattenData(); // Create the flattened (runtime format) 162 // data tables.. 163 private: 164 RBBIRuleBuilder(const RBBIRuleBuilder &other); // forbid copying of this class 165 RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other); // forbid copying of this class 166 }; 167 168 169 170 171 //---------------------------------------------------------------------------- 172 // 173 // RBBISetTableEl is an entry in the hash table of UnicodeSets that have 174 // been encountered. The val Node will be of nodetype uset 175 // and contain pointers to the actual UnicodeSets. 176 // The Key is the source string for initializing the set. 177 // 178 // The hash table is used to avoid creating duplicate 179 // unnamed (not $var references) UnicodeSets. 180 // 181 // Memory Management: 182 // The Hash Table owns these RBBISetTableEl structs and 183 // the key strings. It does NOT own the val nodes. 184 // 185 //---------------------------------------------------------------------------- 186 struct RBBISetTableEl { 187 UnicodeString *key; 188 RBBINode *val; 189 }; 190 191 192 //---------------------------------------------------------------------------- 193 // 194 // RBBIDebugPrintf Printf equivalent, for debugging output. 195 // Conditional compilation of the implementation lets us 196 // get rid of the stdio dependency in environments where it 197 // is unavailable. 198 // 199 //---------------------------------------------------------------------------- 200 #ifdef RBBI_DEBUG 201 #include <stdio.h> 202 #define RBBIDebugPrintf printf 203 #define RBBIDebugPuts puts 204 #else 205 #undef RBBIDebugPrintf 206 #define RBBIDebugPuts(arg) 207 #endif 208 209 U_NAMESPACE_END 210 #endif 211 212 213 214