1 //
2 //  file:  rbbirb.cpp
3 //
4 //  Copyright (C) 2002-2011, International Business Machines Corporation and others.
5 //  All Rights Reserved.
6 //
7 //  This file contains the RBBIRuleBuilder class implementation.  This is the main class for
8 //    building (compiling) break rules into the tables required by the runtime
9 //    RBBI engine.
10 //
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_BREAK_ITERATION
15 
16 #include "unicode/brkiter.h"
17 #include "unicode/rbbi.h"
18 #include "unicode/ubrk.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/uchriter.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/parseerr.h"
25 #include "cmemory.h"
26 #include "cstring.h"
27 
28 #include "rbbirb.h"
29 #include "rbbinode.h"
30 
31 #include "rbbiscan.h"
32 #include "rbbisetb.h"
33 #include "rbbitblb.h"
34 #include "rbbidata.h"
35 
36 
37 U_NAMESPACE_BEGIN
38 
39 
40 //----------------------------------------------------------------------------------------
41 //
42 //  Constructor.
43 //
44 //----------------------------------------------------------------------------------------
RBBIRuleBuilder(const UnicodeString & rules,UParseError * parseErr,UErrorCode & status)45 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
46                                        UParseError     *parseErr,
47                                        UErrorCode      &status)
48  : fRules(rules)
49 {
50     fStatus = &status; // status is checked below
51     fParseError = parseErr;
52     fDebugEnv   = NULL;
53 #ifdef RBBI_DEBUG
54     fDebugEnv   = getenv("U_RBBIDEBUG");
55 #endif
56 
57 
58     fForwardTree        = NULL;
59     fReverseTree        = NULL;
60     fSafeFwdTree        = NULL;
61     fSafeRevTree        = NULL;
62     fDefaultTree        = &fForwardTree;
63     fForwardTables      = NULL;
64     fReverseTables      = NULL;
65     fSafeFwdTables      = NULL;
66     fSafeRevTables      = NULL;
67     fRuleStatusVals     = NULL;
68     fChainRules         = FALSE;
69     fLBCMNoChain        = FALSE;
70     fLookAheadHardBreak = FALSE;
71     fUSetNodes          = NULL;
72     fRuleStatusVals     = NULL;
73     fScanner            = NULL;
74     fSetBuilder         = NULL;
75     if (parseErr) {
76         uprv_memset(parseErr, 0, sizeof(UParseError));
77     }
78 
79     if (U_FAILURE(status)) {
80         return;
81     }
82 
83     fUSetNodes          = new UVector(status); // bcos status gets overwritten here
84     fRuleStatusVals     = new UVector(status);
85     fScanner            = new RBBIRuleScanner(this);
86     fSetBuilder         = new RBBISetBuilder(this);
87     if (U_FAILURE(status)) {
88         return;
89     }
90     if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
91         status = U_MEMORY_ALLOCATION_ERROR;
92     }
93 }
94 
95 
96 
97 //----------------------------------------------------------------------------------------
98 //
99 //  Destructor
100 //
101 //----------------------------------------------------------------------------------------
~RBBIRuleBuilder()102 RBBIRuleBuilder::~RBBIRuleBuilder() {
103 
104     int        i;
105     for (i=0; ; i++) {
106         RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
107         if (n==NULL) {
108             break;
109         }
110         delete n;
111     }
112 
113     delete fUSetNodes;
114     delete fSetBuilder;
115     delete fForwardTables;
116     delete fReverseTables;
117     delete fSafeFwdTables;
118     delete fSafeRevTables;
119 
120     delete fForwardTree;
121     delete fReverseTree;
122     delete fSafeFwdTree;
123     delete fSafeRevTree;
124     delete fScanner;
125     delete fRuleStatusVals;
126 }
127 
128 
129 
130 
131 
132 //----------------------------------------------------------------------------------------
133 //
134 //   flattenData() -  Collect up the compiled RBBI rule data and put it into
135 //                    the format for saving in ICU data files,
136 //                    which is also the format needed by the RBBI runtime engine.
137 //
138 //----------------------------------------------------------------------------------------
//  Round i up to the next multiple of 8 (values that are already a multiple
//  of 8 are returned unchanged).  The mask is written as ~7 so the whole
//  expression stays in signed int32_t arithmetic.
static int32_t align8(int32_t i) {return (i+7) & ~7;}
140 
flattenData()141 RBBIDataHeader *RBBIRuleBuilder::flattenData() {
142     int32_t    i;
143 
144     if (U_FAILURE(*fStatus)) {
145         return NULL;
146     }
147 
148     // Remove comments and whitespace from the rules to make it smaller.
149     UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules));
150 
151     // Calculate the size of each section in the data.
152     //   Sizes here are padded up to a multiple of 8 for better memory alignment.
153     //   Sections sizes actually stored in the header are for the actual data
154     //     without the padding.
155     //
156     int32_t headerSize        = align8(sizeof(RBBIDataHeader));
157     int32_t forwardTableSize  = align8(fForwardTables->getTableSize());
158     int32_t reverseTableSize  = align8(fReverseTables->getTableSize());
159     int32_t safeFwdTableSize  = align8(fSafeFwdTables->getTableSize());
160     int32_t safeRevTableSize  = align8(fSafeRevTables->getTableSize());
161     int32_t trieSize          = align8(fSetBuilder->getTrieSize());
162     int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
163     int32_t rulesSize         = align8((strippedRules.length()+1) * sizeof(UChar));
164 
165     int32_t         totalSize = headerSize + forwardTableSize + reverseTableSize
166                                 + safeFwdTableSize + safeRevTableSize
167                                 + statusTableSize + trieSize + rulesSize;
168 
169     RBBIDataHeader  *data     = (RBBIDataHeader *)uprv_malloc(totalSize);
170     if (data == NULL) {
171         *fStatus = U_MEMORY_ALLOCATION_ERROR;
172         return NULL;
173     }
174     uprv_memset(data, 0, totalSize);
175 
176 
177     data->fMagic            = 0xb1a0;
178     data->fFormatVersion[0] = 3;
179     data->fFormatVersion[1] = 1;
180     data->fFormatVersion[2] = 0;
181     data->fFormatVersion[3] = 0;
182     data->fLength           = totalSize;
183     data->fCatCount         = fSetBuilder->getNumCharCategories();
184 
185     data->fFTable        = headerSize;
186     data->fFTableLen     = forwardTableSize;
187     data->fRTable        = data->fFTable  + forwardTableSize;
188     data->fRTableLen     = reverseTableSize;
189     data->fSFTable       = data->fRTable  + reverseTableSize;
190     data->fSFTableLen    = safeFwdTableSize;
191     data->fSRTable       = data->fSFTable + safeFwdTableSize;
192     data->fSRTableLen    = safeRevTableSize;
193 
194     data->fTrie          = data->fSRTable + safeRevTableSize;
195     data->fTrieLen       = fSetBuilder->getTrieSize();
196     data->fStatusTable   = data->fTrie    + trieSize;
197     data->fStatusTableLen= statusTableSize;
198     data->fRuleSource    = data->fStatusTable + statusTableSize;
199     data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
200 
201     uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
202 
203     fForwardTables->exportTable((uint8_t *)data + data->fFTable);
204     fReverseTables->exportTable((uint8_t *)data + data->fRTable);
205     fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
206     fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
207     fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
208 
209     int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
210     for (i=0; i<fRuleStatusVals->size(); i++) {
211         ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
212     }
213 
214     strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
215 
216     return data;
217 }
218 
219 
220 
221 
222 
223 
224 //----------------------------------------------------------------------------------------
225 //
226 //  createRuleBasedBreakIterator    construct from source rules that are passed in
227 //                                  in a UnicodeString
228 //
229 //----------------------------------------------------------------------------------------
230 BreakIterator *
createRuleBasedBreakIterator(const UnicodeString & rules,UParseError * parseError,UErrorCode & status)231 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
232                                     UParseError      *parseError,
233                                     UErrorCode       &status)
234 {
235     // status checked below
236 
237     //
238     // Read the input rules, generate a parse tree, symbol table,
239     // and list of all Unicode Sets referenced by the rules.
240     //
241     RBBIRuleBuilder  builder(rules, parseError, status);
242     if (U_FAILURE(status)) { // status checked here bcos build below doesn't
243         return NULL;
244     }
245     builder.fScanner->parse();
246 
247     //
248     // UnicodeSet processing.
249     //    Munge the Unicode Sets to create a set of character categories.
250     //    Generate the mapping tables (TRIE) from input 32-bit characters to
251     //    the character categories.
252     //
253     builder.fSetBuilder->build();
254 
255 
256     //
257     //   Generate the DFA state transition table.
258     //
259     builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
260     builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
261     builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
262     builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
263     if (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
264         builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL)
265     {
266         status = U_MEMORY_ALLOCATION_ERROR;
267         delete builder.fForwardTables; builder.fForwardTables = NULL;
268         delete builder.fReverseTables; builder.fReverseTables = NULL;
269         delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL;
270         delete builder.fSafeRevTables; builder.fSafeRevTables = NULL;
271         return NULL;
272     }
273 
274     builder.fForwardTables->build();
275     builder.fReverseTables->build();
276     builder.fSafeFwdTables->build();
277     builder.fSafeRevTables->build();
278 
279 #ifdef RBBI_DEBUG
280     if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) {
281         builder.fForwardTables->printRuleStatusTable();
282     }
283 #endif
284 
285     //
286     //   Package up the compiled data into a memory image
287     //      in the run-time format.
288     //
289     RBBIDataHeader *data = builder.flattenData(); // returns NULL if error
290     if (U_FAILURE(*builder.fStatus)) {
291         return NULL;
292     }
293 
294 
295     //
296     //  Clean up the compiler related stuff
297     //
298 
299 
300     //
301     //  Create a break iterator from the compiled rules.
302     //     (Identical to creation from stored pre-compiled rules)
303     //
304     // status is checked after init in construction.
305     RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
306     if (U_FAILURE(status)) {
307         delete This;
308         This = NULL;
309     }
310     else if(This == NULL) { // test for NULL
311         status = U_MEMORY_ALLOCATION_ERROR;
312     }
313     return This;
314 }
315 
316 U_NAMESPACE_END
317 
318 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
319