1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 //  file:  rbbirb.cpp
5 //
6 //  Copyright (C) 2002-2011, International Business Machines Corporation and others.
7 //  All Rights Reserved.
8 //
9 //  This file contains the RBBIRuleBuilder class implementation.  This is the main class for
10 //    building (compiling) break rules into the tables required by the runtime
11 //    RBBI engine.
12 //
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_BREAK_ITERATION
17 
18 #include "unicode/brkiter.h"
19 #include "unicode/rbbi.h"
20 #include "unicode/ubrk.h"
21 #include "unicode/unistr.h"
22 #include "unicode/uniset.h"
23 #include "unicode/uchar.h"
24 #include "unicode/uchriter.h"
25 #include "unicode/parsepos.h"
26 #include "unicode/parseerr.h"
27 
28 #include "cmemory.h"
29 #include "cstring.h"
30 #include "rbbirb.h"
31 #include "rbbinode.h"
32 #include "rbbiscan.h"
33 #include "rbbisetb.h"
34 #include "rbbitblb.h"
35 #include "rbbidata.h"
36 #include "uassert.h"
37 
38 
39 U_NAMESPACE_BEGIN
40 
41 
42 //----------------------------------------------------------------------------------------
43 //
44 //  Constructor.
45 //
46 //----------------------------------------------------------------------------------------
47 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
48                                        UParseError     *parseErr,
49                                        UErrorCode      &status)
50  : fRules(rules)
51 {
52     fStatus = &status; // status is checked below
53     fParseError = parseErr;
54     fDebugEnv   = NULL;
55 #ifdef RBBI_DEBUG
56     fDebugEnv   = getenv("U_RBBIDEBUG");
57 #endif
58 
59 
60     fForwardTree        = NULL;
61     fReverseTree        = NULL;
62     fSafeFwdTree        = NULL;
63     fSafeRevTree        = NULL;
64     fDefaultTree        = &fForwardTree;
65     fForwardTables      = NULL;
66     fReverseTables      = NULL;
67     fSafeFwdTables      = NULL;
68     fSafeRevTables      = NULL;
69     fRuleStatusVals     = NULL;
70     fChainRules         = FALSE;
71     fLBCMNoChain        = FALSE;
72     fLookAheadHardBreak = FALSE;
73     fUSetNodes          = NULL;
74     fRuleStatusVals     = NULL;
75     fScanner            = NULL;
76     fSetBuilder         = NULL;
77     if (parseErr) {
78         uprv_memset(parseErr, 0, sizeof(UParseError));
79     }
80 
81     if (U_FAILURE(status)) {
82         return;
83     }
84 
85     fUSetNodes          = new UVector(status); // bcos status gets overwritten here
86     fRuleStatusVals     = new UVector(status);
87     fScanner            = new RBBIRuleScanner(this);
88     fSetBuilder         = new RBBISetBuilder(this);
89     if (U_FAILURE(status)) {
90         return;
91     }
92     if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
93         status = U_MEMORY_ALLOCATION_ERROR;
94     }
95 }
96 
97 
98 
99 //----------------------------------------------------------------------------------------
100 //
101 //  Destructor
102 //
103 //----------------------------------------------------------------------------------------
104 RBBIRuleBuilder::~RBBIRuleBuilder() {
105 
106     int        i;
107     for (i=0; ; i++) {
108         RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
109         if (n==NULL) {
110             break;
111         }
112         delete n;
113     }
114 
115     delete fUSetNodes;
116     delete fSetBuilder;
117     delete fForwardTables;
118     delete fReverseTables;
119     delete fSafeFwdTables;
120     delete fSafeRevTables;
121 
122     delete fForwardTree;
123     delete fReverseTree;
124     delete fSafeFwdTree;
125     delete fSafeRevTree;
126     delete fScanner;
127     delete fRuleStatusVals;
128 }
129 
130 
131 
132 
133 
134 //----------------------------------------------------------------------------------------
135 //
136 //   flattenData() -  Collect up the compiled RBBI rule data and put it into
137 //                    the format for saving in ICU data files,
138 //                    which is also the format needed by the RBBI runtime engine.
139 //
140 //----------------------------------------------------------------------------------------
// Round i up to the next multiple of 8; a value that is already a multiple of 8 is unchanged.
static int32_t align8(int32_t i) {
    const int32_t lowBits = 7;
    return (i + lowBits) & ~lowBits;
}
142 
143 RBBIDataHeader *RBBIRuleBuilder::flattenData() {
144     int32_t    i;
145 
146     if (U_FAILURE(*fStatus)) {
147         return NULL;
148     }
149 
150     // Remove comments and whitespace from the rules to make it smaller.
151     UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules));
152 
153     // Calculate the size of each section in the data.
154     //   Sizes here are padded up to a multiple of 8 for better memory alignment.
155     //   Sections sizes actually stored in the header are for the actual data
156     //     without the padding.
157     //
158     int32_t headerSize        = align8(sizeof(RBBIDataHeader));
159     int32_t forwardTableSize  = align8(fForwardTables->getTableSize());
160     int32_t reverseTableSize  = align8(fReverseTables->getTableSize());
161     int32_t safeFwdTableSize  = align8(fSafeFwdTables->getTableSize());
162     int32_t safeRevTableSize  = align8(fSafeRevTables->getTableSize());
163     int32_t trieSize          = align8(fSetBuilder->getTrieSize());
164     int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
165     int32_t rulesSize         = align8((strippedRules.length()+1) * sizeof(UChar));
166 
167     (void)safeFwdTableSize;
168 
169     int32_t         totalSize = headerSize
170                                 + forwardTableSize
171                                 + /* reverseTableSize */ 0
172                                 + /* safeFwdTableSize */ 0
173                                 + (safeRevTableSize ? safeRevTableSize : reverseTableSize)
174                                 + statusTableSize + trieSize + rulesSize;
175 
176     RBBIDataHeader  *data     = (RBBIDataHeader *)uprv_malloc(totalSize);
177     if (data == NULL) {
178         *fStatus = U_MEMORY_ALLOCATION_ERROR;
179         return NULL;
180     }
181     uprv_memset(data, 0, totalSize);
182 
183 
184     data->fMagic            = 0xb1a0;
185     data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
186     data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
187     data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
188     data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
189     data->fLength           = totalSize;
190     data->fCatCount         = fSetBuilder->getNumCharCategories();
191 
192     // Only save the forward table and the safe reverse table,
193     // because these are the only ones used at run-time.
194     //
195     // For the moment, we still build the other tables if they are present in the rule source files,
196     // for backwards compatibility. Old rule files need to work, and this is the simplest approach.
197     //
198     // Additional backwards compatibility consideration: if no safe rules are provided, consider the
199     // reverse rules to actually be the safe reverse rules.
200 
201     data->fFTable        = headerSize;
202     data->fFTableLen     = forwardTableSize;
203 
204     // Do not save Reverse Table.
205     data->fRTable        = data->fFTable  + forwardTableSize;
206     data->fRTableLen     = 0;
207 
208     // Do not save the Safe Forward table.
209     data->fSFTable       = data->fRTable + 0;
210     data->fSFTableLen    = 0;
211 
212     data->fSRTable       = data->fSFTable + 0;
213     if (safeRevTableSize > 0) {
214         data->fSRTableLen    = safeRevTableSize;
215     } else if (reverseTableSize > 0) {
216         data->fSRTableLen    = reverseTableSize;
217     } else {
218         U_ASSERT(FALSE);    // Rule build should have failed for lack of a reverse table
219                             // before reaching this point.
220     }
221 
222 
223     data->fTrie          = data->fSRTable + data->fSRTableLen;
224     data->fTrieLen       = fSetBuilder->getTrieSize();
225     data->fStatusTable   = data->fTrie    + trieSize;
226     data->fStatusTableLen= statusTableSize;
227     data->fRuleSource    = data->fStatusTable + statusTableSize;
228     data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
229 
230     uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
231 
232     fForwardTables->exportTable((uint8_t *)data + data->fFTable);
233     // fReverseTables->exportTable((uint8_t *)data + data->fRTable);
234     // fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
235     if (safeRevTableSize > 0) {
236         fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
237     } else {
238         fReverseTables->exportTable((uint8_t *)data + data->fSRTable);
239     }
240 
241     fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
242 
243     int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
244     for (i=0; i<fRuleStatusVals->size(); i++) {
245         ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
246     }
247 
248     strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
249 
250     return data;
251 }
252 
253 
254 
255 
256 
257 
258 //----------------------------------------------------------------------------------------
259 //
260 //  createRuleBasedBreakIterator    construct from source rules that are passed in
261 //                                  in a UnicodeString
262 //
263 //----------------------------------------------------------------------------------------
264 BreakIterator *
265 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
266                                     UParseError      *parseError,
267                                     UErrorCode       &status)
268 {
269     // status checked below
270 
271     //
272     // Read the input rules, generate a parse tree, symbol table,
273     // and list of all Unicode Sets referenced by the rules.
274     //
275     RBBIRuleBuilder  builder(rules, parseError, status);
276     if (U_FAILURE(status)) { // status checked here bcos build below doesn't
277         return NULL;
278     }
279     builder.fScanner->parse();
280 
281     //
282     // UnicodeSet processing.
283     //    Munge the Unicode Sets to create a set of character categories.
284     //    Generate the mapping tables (TRIE) from input 32-bit characters to
285     //    the character categories.
286     //
287     builder.fSetBuilder->build();
288 
289 
290     //
291     //   Generate the DFA state transition table.
292     //
293     builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
294     builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
295     builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
296     builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
297     if (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
298         builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL)
299     {
300         status = U_MEMORY_ALLOCATION_ERROR;
301         delete builder.fForwardTables; builder.fForwardTables = NULL;
302         delete builder.fReverseTables; builder.fReverseTables = NULL;
303         delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL;
304         delete builder.fSafeRevTables; builder.fSafeRevTables = NULL;
305         return NULL;
306     }
307 
308     builder.fForwardTables->build();
309     builder.fReverseTables->build();
310     builder.fSafeFwdTables->build();
311     builder.fSafeRevTables->build();
312 
313 #ifdef RBBI_DEBUG
314     if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) {
315         builder.fForwardTables->printRuleStatusTable();
316     }
317 #endif
318 
319     //
320     //   Package up the compiled data into a memory image
321     //      in the run-time format.
322     //
323     RBBIDataHeader *data = builder.flattenData(); // returns NULL if error
324     if (U_FAILURE(*builder.fStatus)) {
325         return NULL;
326     }
327 
328 
329     //
330     //  Clean up the compiler related stuff
331     //
332 
333 
334     //
335     //  Create a break iterator from the compiled rules.
336     //     (Identical to creation from stored pre-compiled rules)
337     //
338     // status is checked after init in construction.
339     RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
340     if (U_FAILURE(status)) {
341         delete This;
342         This = NULL;
343     }
344     else if(This == NULL) { // test for NULL
345         status = U_MEMORY_ALLOCATION_ERROR;
346     }
347     return This;
348 }
349 
350 U_NAMESPACE_END
351 
352 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
353