1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 // 4 // file: rbbirb.cpp 5 // 6 // Copyright (C) 2002-2011, International Business Machines Corporation and others. 7 // All Rights Reserved. 8 // 9 // This file contains the RBBIRuleBuilder class implementation. This is the main class for 10 // building (compiling) break rules into the tables required by the runtime 11 // RBBI engine. 12 // 13 14 #include "unicode/utypes.h" 15 16 #if !UCONFIG_NO_BREAK_ITERATION 17 18 #include "unicode/brkiter.h" 19 #include "unicode/rbbi.h" 20 #include "unicode/ubrk.h" 21 #include "unicode/unistr.h" 22 #include "unicode/uniset.h" 23 #include "unicode/uchar.h" 24 #include "unicode/uchriter.h" 25 #include "unicode/parsepos.h" 26 #include "unicode/parseerr.h" 27 28 #include "cmemory.h" 29 #include "cstring.h" 30 #include "rbbirb.h" 31 #include "rbbinode.h" 32 #include "rbbiscan.h" 33 #include "rbbisetb.h" 34 #include "rbbitblb.h" 35 #include "rbbidata.h" 36 #include "uassert.h" 37 38 39 U_NAMESPACE_BEGIN 40 41 42 //---------------------------------------------------------------------------------------- 43 // 44 // Constructor. 45 // 46 //---------------------------------------------------------------------------------------- 47 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules, 48 UParseError *parseErr, 49 UErrorCode &status) 50 : fRules(rules) 51 { 52 fStatus = &status; // status is checked below 53 fParseError = parseErr; 54 fDebugEnv = NULL; 55 #ifdef RBBI_DEBUG 56 fDebugEnv = getenv("U_RBBIDEBUG"); 57 #endif 58 59 60 fForwardTree = NULL; 61 fReverseTree = NULL; 62 fSafeFwdTree = NULL; 63 fSafeRevTree = NULL; 64 fDefaultTree = &fForwardTree; 65 fForwardTables = NULL; 66 fReverseTables = NULL; 67 fSafeFwdTables = NULL; 68 fSafeRevTables = NULL; 69 fRuleStatusVals = NULL; 70 fChainRules = FALSE; 71 fLBCMNoChain = FALSE; 72 fLookAheadHardBreak = FALSE; 73 fUSetNodes = NULL; 74 fRuleStatusVals = NULL; 75 fScanner = NULL; 76 fSetBuilder = NULL; 77 if (parseErr) { 78 uprv_memset(parseErr, 0, sizeof(UParseError)); 79 } 80 81 if (U_FAILURE(status)) { 82 return; 83 } 84 85 fUSetNodes = new UVector(status); // bcos status gets overwritten here 86 fRuleStatusVals = new UVector(status); 87 fScanner = new RBBIRuleScanner(this); 88 fSetBuilder = new RBBISetBuilder(this); 89 if (U_FAILURE(status)) { 90 return; 91 } 92 if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) { 93 status = U_MEMORY_ALLOCATION_ERROR; 94 } 95 } 96 97 98 99 //---------------------------------------------------------------------------------------- 100 // 101 // Destructor 102 // 103 //---------------------------------------------------------------------------------------- 104 RBBIRuleBuilder::~RBBIRuleBuilder() { 105 106 int i; 107 for (i=0; ; i++) { 108 RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i); 109 if (n==NULL) { 110 break; 111 } 112 delete n; 113 } 114 115 delete fUSetNodes; 116 delete fSetBuilder; 117 delete fForwardTables; 118 delete fReverseTables; 119 delete fSafeFwdTables; 120 delete fSafeRevTables; 121 122 delete fForwardTree; 123 delete fReverseTree; 124 delete fSafeFwdTree; 125 delete fSafeRevTree; 126 delete fScanner; 127 delete fRuleStatusVals; 128 } 129 130 131 132 133 134 //---------------------------------------------------------------------------------------- 135 // 136 // flattenData() - Collect up the compiled RBBI rule data and put it into 137 // the format for saving in ICU data files, 138 // which is also the format needed by the RBBI runtime engine. 139 // 140 //---------------------------------------------------------------------------------------- 141 static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;} 142 143 RBBIDataHeader *RBBIRuleBuilder::flattenData() { 144 int32_t i; 145 146 if (U_FAILURE(*fStatus)) { 147 return NULL; 148 } 149 150 // Remove comments and whitespace from the rules to make it smaller. 151 UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules)); 152 153 // Calculate the size of each section in the data. 154 // Sizes here are padded up to a multiple of 8 for better memory alignment. 155 // Sections sizes actually stored in the header are for the actual data 156 // without the padding. 157 // 158 int32_t headerSize = align8(sizeof(RBBIDataHeader)); 159 int32_t forwardTableSize = align8(fForwardTables->getTableSize()); 160 int32_t reverseTableSize = align8(fReverseTables->getTableSize()); 161 int32_t safeFwdTableSize = align8(fSafeFwdTables->getTableSize()); 162 int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize()); 163 int32_t trieSize = align8(fSetBuilder->getTrieSize()); 164 int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t)); 165 int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar)); 166 167 (void)safeFwdTableSize; 168 169 int32_t totalSize = headerSize 170 + forwardTableSize 171 + /* reverseTableSize */ 0 172 + /* safeFwdTableSize */ 0 173 + (safeRevTableSize ? safeRevTableSize : reverseTableSize) 174 + statusTableSize + trieSize + rulesSize; 175 176 RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize); 177 if (data == NULL) { 178 *fStatus = U_MEMORY_ALLOCATION_ERROR; 179 return NULL; 180 } 181 uprv_memset(data, 0, totalSize); 182 183 184 data->fMagic = 0xb1a0; 185 data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0]; 186 data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1]; 187 data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2]; 188 data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3]; 189 data->fLength = totalSize; 190 data->fCatCount = fSetBuilder->getNumCharCategories(); 191 192 // Only save the forward table and the safe reverse table, 193 // because these are the only ones used at run-time. 194 // 195 // For the moment, we still build the other tables if they are present in the rule source files, 196 // for backwards compatibility. Old rule files need to work, and this is the simplest approach. 197 // 198 // Additional backwards compatibility consideration: if no safe rules are provided, consider the 199 // reverse rules to actually be the safe reverse rules. 200 201 data->fFTable = headerSize; 202 data->fFTableLen = forwardTableSize; 203 204 // Do not save Reverse Table. 205 data->fRTable = data->fFTable + forwardTableSize; 206 data->fRTableLen = 0; 207 208 // Do not save the Safe Forward table. 209 data->fSFTable = data->fRTable + 0; 210 data->fSFTableLen = 0; 211 212 data->fSRTable = data->fSFTable + 0; 213 if (safeRevTableSize > 0) { 214 data->fSRTableLen = safeRevTableSize; 215 } else if (reverseTableSize > 0) { 216 data->fSRTableLen = reverseTableSize; 217 } else { 218 U_ASSERT(FALSE); // Rule build should have failed for lack of a reverse table 219 // before reaching this point. 220 } 221 222 223 data->fTrie = data->fSRTable + data->fSRTableLen; 224 data->fTrieLen = fSetBuilder->getTrieSize(); 225 data->fStatusTable = data->fTrie + trieSize; 226 data->fStatusTableLen= statusTableSize; 227 data->fRuleSource = data->fStatusTable + statusTableSize; 228 data->fRuleSourceLen = strippedRules.length() * sizeof(UChar); 229 230 uprv_memset(data->fReserved, 0, sizeof(data->fReserved)); 231 232 fForwardTables->exportTable((uint8_t *)data + data->fFTable); 233 // fReverseTables->exportTable((uint8_t *)data + data->fRTable); 234 // fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable); 235 if (safeRevTableSize > 0) { 236 fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable); 237 } else { 238 fReverseTables->exportTable((uint8_t *)data + data->fSRTable); 239 } 240 241 fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie); 242 243 int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable); 244 for (i=0; i<fRuleStatusVals->size(); i++) { 245 ruleStatusTable[i] = fRuleStatusVals->elementAti(i); 246 } 247 248 strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus); 249 250 return data; 251 } 252 253 254 255 256 257 258 //---------------------------------------------------------------------------------------- 259 // 260 // createRuleBasedBreakIterator construct from source rules that are passed in 261 // in a UnicodeString 262 // 263 //---------------------------------------------------------------------------------------- 264 BreakIterator * 265 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules, 266 UParseError *parseError, 267 UErrorCode &status) 268 { 269 // status checked below 270 271 // 272 // Read the input rules, generate a parse tree, symbol table, 273 // and list of all Unicode Sets referenced by the rules. 274 // 275 RBBIRuleBuilder builder(rules, parseError, status); 276 if (U_FAILURE(status)) { // status checked here bcos build below doesn't 277 return NULL; 278 } 279 builder.fScanner->parse(); 280 281 // 282 // UnicodeSet processing. 283 // Munge the Unicode Sets to create a set of character categories. 284 // Generate the mapping tables (TRIE) from input 32-bit characters to 285 // the character categories. 286 // 287 builder.fSetBuilder->build(); 288 289 290 // 291 // Generate the DFA state transition table. 292 // 293 builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree); 294 builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree); 295 builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree); 296 builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree); 297 if (builder.fForwardTables == NULL || builder.fReverseTables == NULL || 298 builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL) 299 { 300 status = U_MEMORY_ALLOCATION_ERROR; 301 delete builder.fForwardTables; builder.fForwardTables = NULL; 302 delete builder.fReverseTables; builder.fReverseTables = NULL; 303 delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL; 304 delete builder.fSafeRevTables; builder.fSafeRevTables = NULL; 305 return NULL; 306 } 307 308 builder.fForwardTables->build(); 309 builder.fReverseTables->build(); 310 builder.fSafeFwdTables->build(); 311 builder.fSafeRevTables->build(); 312 313 #ifdef RBBI_DEBUG 314 if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) { 315 builder.fForwardTables->printRuleStatusTable(); 316 } 317 #endif 318 319 // 320 // Package up the compiled data into a memory image 321 // in the run-time format. 322 // 323 RBBIDataHeader *data = builder.flattenData(); // returns NULL if error 324 if (U_FAILURE(*builder.fStatus)) { 325 return NULL; 326 } 327 328 329 // 330 // Clean up the compiler related stuff 331 // 332 333 334 // 335 // Create a break iterator from the compiled rules. 336 // (Identical to creation from stored pre-compiled rules) 337 // 338 // status is checked after init in construction. 339 RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status); 340 if (U_FAILURE(status)) { 341 delete This; 342 This = NULL; 343 } 344 else if(This == NULL) { // test for NULL 345 status = U_MEMORY_ALLOCATION_ERROR; 346 } 347 return This; 348 } 349 350 U_NAMESPACE_END 351 352 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 353