1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 //  file:  rbbirb.cpp
5 //
6 //  Copyright (C) 2002-2011, International Business Machines Corporation and others.
7 //  All Rights Reserved.
8 //
9 //  This file contains the RBBIRuleBuilder class implementation.  This is the main class for
10 //    building (compiling) break rules into the tables required by the runtime
11 //    RBBI engine.
12 //
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_BREAK_ITERATION
17 
18 #include "unicode/brkiter.h"
19 #include "unicode/rbbi.h"
20 #include "unicode/ubrk.h"
21 #include "unicode/unistr.h"
22 #include "unicode/uniset.h"
23 #include "unicode/uchar.h"
24 #include "unicode/uchriter.h"
25 #include "unicode/parsepos.h"
26 #include "unicode/parseerr.h"
27 
28 #include "cmemory.h"
29 #include "cstring.h"
30 #include "rbbirb.h"
31 #include "rbbinode.h"
32 #include "rbbiscan.h"
33 #include "rbbisetb.h"
34 #include "rbbitblb.h"
35 #include "rbbidata.h"
36 #include "uassert.h"
37 
38 
39 U_NAMESPACE_BEGIN
40 
41 
42 //----------------------------------------------------------------------------------------
43 //
44 //  Constructor.
45 //
46 //----------------------------------------------------------------------------------------
RBBIRuleBuilder(const UnicodeString & rules,UParseError * parseErr,UErrorCode & status)47 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
48                                        UParseError     *parseErr,
49                                        UErrorCode      &status)
50  : fRules(rules), fStrippedRules(rules)
51 {
52     fStatus = &status; // status is checked below
53     fParseError = parseErr;
54     fDebugEnv   = NULL;
55 #ifdef RBBI_DEBUG
56     fDebugEnv   = getenv("U_RBBIDEBUG");
57 #endif
58 
59 
60     fForwardTree        = NULL;
61     fReverseTree        = NULL;
62     fSafeFwdTree        = NULL;
63     fSafeRevTree        = NULL;
64     fDefaultTree        = &fForwardTree;
65     fForwardTable       = NULL;
66     fRuleStatusVals     = NULL;
67     fChainRules         = FALSE;
68     fLBCMNoChain        = FALSE;
69     fLookAheadHardBreak = FALSE;
70     fUSetNodes          = NULL;
71     fRuleStatusVals     = NULL;
72     fScanner            = NULL;
73     fSetBuilder         = NULL;
74     if (parseErr) {
75         uprv_memset(parseErr, 0, sizeof(UParseError));
76     }
77 
78     if (U_FAILURE(status)) {
79         return;
80     }
81 
82     fUSetNodes          = new UVector(status); // bcos status gets overwritten here
83     fRuleStatusVals     = new UVector(status);
84     fScanner            = new RBBIRuleScanner(this);
85     fSetBuilder         = new RBBISetBuilder(this);
86     if (U_FAILURE(status)) {
87         return;
88     }
89     if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
90         status = U_MEMORY_ALLOCATION_ERROR;
91     }
92 }
93 
94 
95 
96 //----------------------------------------------------------------------------------------
97 //
98 //  Destructor
99 //
100 //----------------------------------------------------------------------------------------
~RBBIRuleBuilder()101 RBBIRuleBuilder::~RBBIRuleBuilder() {
102 
103     int        i;
104     for (i=0; ; i++) {
105         RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
106         if (n==NULL) {
107             break;
108         }
109         delete n;
110     }
111 
112     delete fUSetNodes;
113     delete fSetBuilder;
114     delete fForwardTable;
115     delete fForwardTree;
116     delete fReverseTree;
117     delete fSafeFwdTree;
118     delete fSafeRevTree;
119     delete fScanner;
120     delete fRuleStatusVals;
121 }
122 
123 
124 
125 
126 
127 //----------------------------------------------------------------------------------------
128 //
129 //   flattenData() -  Collect up the compiled RBBI rule data and put it into
130 //                    the format for saving in ICU data files,
131 //                    which is also the format needed by the RBBI runtime engine.
132 //
133 //----------------------------------------------------------------------------------------
align8(int32_t i)134 static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;}
135 
flattenData()136 RBBIDataHeader *RBBIRuleBuilder::flattenData() {
137     int32_t    i;
138 
139     if (U_FAILURE(*fStatus)) {
140         return NULL;
141     }
142 
143     // Remove whitespace from the rules to make it smaller.
144     // The rule parser has already removed comments.
145     fStrippedRules = fScanner->stripRules(fStrippedRules);
146 
147     // Calculate the size of each section in the data.
148     //   Sizes here are padded up to a multiple of 8 for better memory alignment.
149     //   Sections sizes actually stored in the header are for the actual data
150     //     without the padding.
151     //
152     int32_t headerSize        = align8(sizeof(RBBIDataHeader));
153     int32_t forwardTableSize  = align8(fForwardTable->getTableSize());
154     int32_t reverseTableSize  = align8(fForwardTable->getSafeTableSize());
155     int32_t trieSize          = align8(fSetBuilder->getTrieSize());
156     int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
157     int32_t rulesSize         = align8((fStrippedRules.length()+1) * sizeof(UChar));
158 
159     int32_t         totalSize = headerSize
160                                 + forwardTableSize
161                                 + reverseTableSize
162                                 + statusTableSize + trieSize + rulesSize;
163 
164     RBBIDataHeader  *data     = (RBBIDataHeader *)uprv_malloc(totalSize);
165     if (data == NULL) {
166         *fStatus = U_MEMORY_ALLOCATION_ERROR;
167         return NULL;
168     }
169     uprv_memset(data, 0, totalSize);
170 
171 
172     data->fMagic            = 0xb1a0;
173     data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
174     data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
175     data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
176     data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
177     data->fLength           = totalSize;
178     data->fCatCount         = fSetBuilder->getNumCharCategories();
179 
180     data->fFTable        = headerSize;
181     data->fFTableLen     = forwardTableSize;
182 
183     data->fRTable        = data->fFTable  + data->fFTableLen;
184     data->fRTableLen     = reverseTableSize;
185 
186     data->fTrie          = data->fRTable + data->fRTableLen;
187     data->fTrieLen       = fSetBuilder->getTrieSize();
188     data->fStatusTable   = data->fTrie    + trieSize;
189     data->fStatusTableLen= statusTableSize;
190     data->fRuleSource    = data->fStatusTable + statusTableSize;
191     data->fRuleSourceLen = fStrippedRules.length() * sizeof(UChar);
192 
193     uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
194 
195     fForwardTable->exportTable((uint8_t *)data + data->fFTable);
196     fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable);
197     fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
198 
199     int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
200     for (i=0; i<fRuleStatusVals->size(); i++) {
201         ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
202     }
203 
204     fStrippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
205 
206     return data;
207 }
208 
209 
210 //----------------------------------------------------------------------------------------
211 //
212 //  createRuleBasedBreakIterator    construct from source rules that are passed in
213 //                                  in a UnicodeString
214 //
215 //----------------------------------------------------------------------------------------
216 BreakIterator *
createRuleBasedBreakIterator(const UnicodeString & rules,UParseError * parseError,UErrorCode & status)217 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
218                                     UParseError      *parseError,
219                                     UErrorCode       &status)
220 {
221     //
222     // Read the input rules, generate a parse tree, symbol table,
223     // and list of all Unicode Sets referenced by the rules.
224     //
225     RBBIRuleBuilder  builder(rules, parseError, status);
226     if (U_FAILURE(status)) { // status checked here bcos build below doesn't
227         return NULL;
228     }
229 
230     RBBIDataHeader *data = builder.build(status);
231 
232     if (U_FAILURE(status)) {
233         return nullptr;
234     }
235 
236     //
237     //  Create a break iterator from the compiled rules.
238     //     (Identical to creation from stored pre-compiled rules)
239     //
240     // status is checked after init in construction.
241     RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
242     if (U_FAILURE(status)) {
243         delete This;
244         This = NULL;
245     }
246     else if(This == NULL) { // test for NULL
247         status = U_MEMORY_ALLOCATION_ERROR;
248     }
249     return This;
250 }
251 
build(UErrorCode & status)252 RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
253     if (U_FAILURE(status)) {
254         return nullptr;
255     }
256 
257     fScanner->parse();
258     if (U_FAILURE(status)) {
259         return nullptr;
260     }
261 
262     //
263     // UnicodeSet processing.
264     //    Munge the Unicode Sets to create a set of character categories.
265     //    Generate the mapping tables (TRIE) from input code points to
266     //    the character categories.
267     //
268     fSetBuilder->buildRanges();
269 
270     //
271     //   Generate the DFA state transition table.
272     //
273     fForwardTable = new RBBITableBuilder(this, &fForwardTree, status);
274     if (fForwardTable == nullptr) {
275         status = U_MEMORY_ALLOCATION_ERROR;
276         return nullptr;
277     }
278 
279     fForwardTable->buildForwardTable();
280     optimizeTables();
281     fForwardTable->buildSafeReverseTable(status);
282 
283 
284 #ifdef RBBI_DEBUG
285     if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
286         fForwardTable->printStates();
287         fForwardTable->printRuleStatusTable();
288         fForwardTable->printReverseTable();
289     }
290 #endif
291 
292     fSetBuilder->buildTrie();
293 
294     //
295     //   Package up the compiled data into a memory image
296     //      in the run-time format.
297     //
298     RBBIDataHeader *data = flattenData(); // returns NULL if error
299     if (U_FAILURE(status)) {
300         return nullptr;
301     }
302     return data;
303 }
304 
optimizeTables()305 void RBBIRuleBuilder::optimizeTables() {
306     bool didSomething;
307     do {
308         didSomething = false;
309 
310         // Begin looking for duplicates with char class 3.
311         // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
312         // and should not have other categories merged into them.
313         IntPair duplPair = {3, 0};
314         while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
315             fSetBuilder->mergeCategories(duplPair);
316             fForwardTable->removeColumn(duplPair.second);
317             didSomething = true;
318         }
319 
320         while (fForwardTable->removeDuplicateStates() > 0) {
321             didSomething = true;
322         }
323     } while (didSomething);
324 }
325 
326 U_NAMESPACE_END
327 
328 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
329