1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ***************************************************************************
5 *   Copyright (C) 1999-2014 International Business Machines Corporation   *
6 *   and others. All rights reserved.                                      *
7 ***************************************************************************
8 */
9 
10 #include "unicode/utypes.h"
11 
12 #if !UCONFIG_NO_BREAK_ITERATION
13 
14 #include "unicode/utypes.h"
15 #include "rbbidata.h"
16 #include "rbbirb.h"
17 #include "utrie.h"
18 #include "udatamem.h"
19 #include "cmemory.h"
20 #include "cstring.h"
21 #include "umutex.h"
22 
23 #include "uassert.h"
24 
25 
26 //-----------------------------------------------------------------------------------
27 //
28 //   Trie access folding function.  Copied as-is from properties code in uchar.c
29 //
30 //-----------------------------------------------------------------------------------
31 U_CDECL_BEGIN
32 static int32_t U_CALLCONV
getFoldingOffset(uint32_t data)33 getFoldingOffset(uint32_t data) {
34     /* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */
35     if(data&0x8000) {
36         return (int32_t)(data&0x7fff);
37     } else {
38         return 0;
39     }
40 }
41 U_CDECL_END
42 
43 U_NAMESPACE_BEGIN
44 
45 //-----------------------------------------------------------------------------
46 //
47 //    Constructors.
48 //
49 //-----------------------------------------------------------------------------
RBBIDataWrapper(const RBBIDataHeader * data,UErrorCode & status)50 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) {
51     init0();
52     init(data, status);
53 }
54 
RBBIDataWrapper(const RBBIDataHeader * data,enum EDontAdopt,UErrorCode & status)55 RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) {
56     init0();
57     init(data, status);
58     fDontFreeData = TRUE;
59 }
60 
RBBIDataWrapper(UDataMemory * udm,UErrorCode & status)61 RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) {
62     init0();
63     if (U_FAILURE(status)) {
64         return;
65     }
66     const DataHeader *dh = udm->pHeader;
67     int32_t headerSize = dh->dataHeader.headerSize;
68     if (  !(headerSize >= 20 &&
69             dh->info.isBigEndian == U_IS_BIG_ENDIAN &&
70             dh->info.charsetFamily == U_CHARSET_FAMILY &&
71             dh->info.dataFormat[0] == 0x42 &&  // dataFormat="Brk "
72             dh->info.dataFormat[1] == 0x72 &&
73             dh->info.dataFormat[2] == 0x6b &&
74             dh->info.dataFormat[3] == 0x20)
75             // Note: info.fFormatVersion is duplicated in the RBBIDataHeader, and is
76             //       validated when checking that.
77         ) {
78         status = U_INVALID_FORMAT_ERROR;
79         return;
80     }
81     const char *dataAsBytes = reinterpret_cast<const char *>(dh);
82     const RBBIDataHeader *rbbidh = reinterpret_cast<const RBBIDataHeader *>(dataAsBytes + headerSize);
83     init(rbbidh, status);
84     fUDataMem = udm;
85 }
86 
87 //-----------------------------------------------------------------------------
88 //
89 //    init().   Does most of the work of construction, shared between the
90 //              constructors.
91 //
92 //-----------------------------------------------------------------------------
init0()93 void RBBIDataWrapper::init0() {
94     fHeader = NULL;
95     fForwardTable = NULL;
96     fReverseTable = NULL;
97     fSafeFwdTable = NULL;
98     fSafeRevTable = NULL;
99     fRuleSource = NULL;
100     fRuleStatusTable = NULL;
101     fUDataMem = NULL;
102     fRefCount = 0;
103     fDontFreeData = TRUE;
104 }
105 
init(const RBBIDataHeader * data,UErrorCode & status)106 void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
107     if (U_FAILURE(status)) {
108         return;
109     }
110     fHeader = data;
111     if (fHeader->fMagic != 0xb1a0 || fHeader->fFormatVersion[0] != 3)
112     {
113         status = U_INVALID_FORMAT_ERROR;
114         return;
115     }
116     // Note: in ICU version 3.2 and earlier, there was a formatVersion 1
117     //       that is no longer supported.  At that time fFormatVersion was
118     //       an int32_t field, rather than an array of 4 bytes.
119 
120     fDontFreeData = FALSE;
121     if (data->fFTableLen != 0) {
122         fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
123     }
124     if (data->fRTableLen != 0) {
125         fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
126     }
127     if (data->fSFTableLen != 0) {
128         fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable);
129     }
130     if (data->fSRTableLen != 0) {
131         fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
132     }
133 
134 
135     utrie_unserialize(&fTrie,
136                        (uint8_t *)data + fHeader->fTrie,
137                        fHeader->fTrieLen,
138                        &status);
139     if (U_FAILURE(status)) {
140         return;
141     }
142     fTrie.getFoldingOffset=getFoldingOffset;
143 
144 
145     fRuleSource   = (UChar *)((char *)data + fHeader->fRuleSource);
146     fRuleString.setTo(TRUE, fRuleSource, -1);
147     U_ASSERT(data->fRuleSourceLen > 0);
148 
149     fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable);
150     fStatusMaxIdx    = data->fStatusTableLen / sizeof(int32_t);
151 
152     fRefCount = 1;
153 
154 #ifdef RBBI_DEBUG
155     char *debugEnv = getenv("U_RBBIDEBUG");
156     if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();}
157 #endif
158 }
159 
160 
161 //-----------------------------------------------------------------------------
162 //
163 //    Destructor.     Don't call this - use removeReference() instead.
164 //
165 //-----------------------------------------------------------------------------
~RBBIDataWrapper()166 RBBIDataWrapper::~RBBIDataWrapper() {
167     U_ASSERT(fRefCount == 0);
168     if (fUDataMem) {
169         udata_close(fUDataMem);
170     } else if (!fDontFreeData) {
171         uprv_free((void *)fHeader);
172     }
173 }
174 
175 
176 
177 //-----------------------------------------------------------------------------
178 //
179 //   Operator ==    Consider two RBBIDataWrappers to be equal if they
180 //                  refer to the same underlying data.  Although
181 //                  the data wrappers are normally shared between
182 //                  iterator instances, it's possible to independently
183 //                  open the same data twice, and get two instances, which
184 //                  should still be ==.
185 //
186 //-----------------------------------------------------------------------------
operator ==(const RBBIDataWrapper & other) const187 UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const {
188     if (fHeader == other.fHeader) {
189         return TRUE;
190     }
191     if (fHeader->fLength != other.fHeader->fLength) {
192         return FALSE;
193     }
194     if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) {
195         return TRUE;
196     }
197     return FALSE;
198 }
199 
hashCode()200 int32_t  RBBIDataWrapper::hashCode() {
201     return fHeader->fFTableLen;
202 }
203 
204 
205 
206 //-----------------------------------------------------------------------------
207 //
208 //    Reference Counting.   A single RBBIDataWrapper object is shared among
209 //                          however many RulesBasedBreakIterator instances are
210 //                          referencing the same data.
211 //
212 //-----------------------------------------------------------------------------
removeReference()213 void RBBIDataWrapper::removeReference() {
214     if (umtx_atomic_dec(&fRefCount) == 0) {
215         delete this;
216     }
217 }
218 
219 
addReference()220 RBBIDataWrapper *RBBIDataWrapper::addReference() {
221    umtx_atomic_inc(&fRefCount);
222    return this;
223 }
224 
225 
226 
227 //-----------------------------------------------------------------------------
228 //
229 //  getRuleSourceString
230 //
231 //-----------------------------------------------------------------------------
getRuleSourceString() const232 const UnicodeString &RBBIDataWrapper::getRuleSourceString() const {
233     return fRuleString;
234 }
235 
236 
237 //-----------------------------------------------------------------------------
238 //
239 //  print   -  debugging function to dump the runtime data tables.
240 //
241 //-----------------------------------------------------------------------------
242 #ifdef RBBI_DEBUG
printTable(const char * heading,const RBBIStateTable * table)243 void  RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) {
244     uint32_t   c;
245     uint32_t   s;
246 
247     RBBIDebugPrintf("   %s\n", heading);
248 
249     RBBIDebugPrintf("State |  Acc  LA TagIx");
250     for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
251     RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {
252         RBBIDebugPrintf("----");
253     }
254     RBBIDebugPrintf("\n");
255 
256     if (table == NULL) {
257         RBBIDebugPrintf("         N U L L   T A B L E\n\n");
258         return;
259     }
260     for (s=0; s<table->fNumStates; s++) {
261         RBBIStateTableRow *row = (RBBIStateTableRow *)
262                                   (table->fTableData + (table->fRowLen * s));
263         RBBIDebugPrintf("%4d  |  %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx);
264         for (c=0; c<fHeader->fCatCount; c++)  {
265             RBBIDebugPrintf("%3d ", row->fNextState[c]);
266         }
267         RBBIDebugPrintf("\n");
268     }
269     RBBIDebugPrintf("\n");
270 }
271 #endif
272 
273 
274 #ifdef RBBI_DEBUG
printData()275 void  RBBIDataWrapper::printData() {
276     RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
277     RBBIDebugPrintf("   Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1],
278                                                     fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]);
279     RBBIDebugPrintf("   total length of data  = %d\n", fHeader->fLength);
280     RBBIDebugPrintf("   number of character categories = %d\n\n", fHeader->fCatCount);
281 
282     printTable("Forward State Transition Table", fForwardTable);
283     printTable("Reverse State Transition Table", fReverseTable);
284     printTable("Safe Forward State Transition Table", fSafeFwdTable);
285     printTable("Safe Reverse State Transition Table", fSafeRevTable);
286 
287     RBBIDebugPrintf("\nOrignal Rules source:\n");
288     for (int32_t c=0; fRuleSource[c] != 0; c++) {
289         RBBIDebugPrintf("%c", fRuleSource[c]);
290     }
291     RBBIDebugPrintf("\n\n");
292 }
293 #endif
294 
295 
296 U_NAMESPACE_END
297 U_NAMESPACE_USE
298 
299 //-----------------------------------------------------------------------------
300 //
301 //  ubrk_swap   -  byte swap and char encoding swap of RBBI data
302 //
303 //-----------------------------------------------------------------------------
304 
305 U_CAPI int32_t U_EXPORT2
ubrk_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * status)306 ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
307            UErrorCode *status) {
308 
309     if (status == NULL || U_FAILURE(*status)) {
310         return 0;
311     }
312     if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
313         *status=U_ILLEGAL_ARGUMENT_ERROR;
314         return 0;
315     }
316 
317     //
318     //  Check that the data header is for for break data.
319     //    (Header contents are defined in genbrk.cpp)
320     //
321     const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
322     if(!(  pInfo->dataFormat[0]==0x42 &&   /* dataFormat="Brk " */
323            pInfo->dataFormat[1]==0x72 &&
324            pInfo->dataFormat[2]==0x6b &&
325            pInfo->dataFormat[3]==0x20 &&
326            pInfo->formatVersion[0]==3  )) {
327         udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
328                          pInfo->dataFormat[0], pInfo->dataFormat[1],
329                          pInfo->dataFormat[2], pInfo->dataFormat[3],
330                          pInfo->formatVersion[0]);
331         *status=U_UNSUPPORTED_ERROR;
332         return 0;
333     }
334 
335     //
336     // Swap the data header.  (This is the generic ICU Data Header, not the RBBI Specific
337     //                         RBBIDataHeader).  This swap also conveniently gets us
338     //                         the size of the ICU d.h., which lets us locate the start
339     //                         of the RBBI specific data.
340     //
341     int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
342 
343 
344     //
345     // Get the RRBI Data Header, and check that it appears to be OK.
346     //
347     //    Note:  ICU 3.2 and earlier, RBBIDataHeader::fDataFormat was actually
348     //           an int32_t with a value of 1.  Starting with ICU 3.4,
349     //           RBBI's fDataFormat matches the dataFormat field from the
350     //           UDataInfo header, four int8_t bytes.  The value is {3,1,0,0}
351     //
352     const uint8_t  *inBytes =(const uint8_t *)inData+headerSize;
353     RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes;
354     if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 ||
355         rbbiDH->fFormatVersion[0] != 3 ||
356         ds->readUInt32(rbbiDH->fLength)  <  sizeof(RBBIDataHeader))
357     {
358         udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n");
359         *status=U_UNSUPPORTED_ERROR;
360         return 0;
361     }
362 
363     //
364     // Prefight operation?  Just return the size
365     //
366     int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength);
367     int32_t totalSize = headerSize + breakDataLength;
368     if (length < 0) {
369         return totalSize;
370     }
371 
372     //
373     // Check that length passed in is consistent with length from RBBI data header.
374     //
375     if (length < totalSize) {
376         udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n",
377                             breakDataLength);
378         *status=U_INDEX_OUTOFBOUNDS_ERROR;
379         return 0;
380         }
381 
382 
383     //
384     // Swap the Data.  Do the data itself first, then the RBBI Data Header, because
385     //                 we need to reference the header to locate the data, and an
386     //                 inplace swap of the header leaves it unusable.
387     //
388     uint8_t         *outBytes = (uint8_t *)outData + headerSize;
389     RBBIDataHeader  *outputDH = (RBBIDataHeader *)outBytes;
390 
391     int32_t   tableStartOffset;
392     int32_t   tableLength;
393 
394     //
395     // If not swapping in place, zero out the output buffer before starting.
396     //    Individual tables and other data items within are aligned to 8 byte boundaries
397     //    when originally created.  Any unused space between items needs to be zero.
398     //
399     if (inBytes != outBytes) {
400         uprv_memset(outBytes, 0, breakDataLength);
401     }
402 
403     //
404     // Each state table begins with several 32 bit fields.  Calculate the size
405     //   in bytes of these.
406     //
407     int32_t         topSize = offsetof(RBBIStateTable, fTableData);
408 
409     // Forward state table.
410     tableStartOffset = ds->readUInt32(rbbiDH->fFTable);
411     tableLength      = ds->readUInt32(rbbiDH->fFTableLen);
412 
413     if (tableLength > 0) {
414         ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
415                             outBytes+tableStartOffset, status);
416         ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
417                             outBytes+tableStartOffset+topSize, status);
418     }
419 
420     // Reverse state table.  Same layout as forward table, above.
421     tableStartOffset = ds->readUInt32(rbbiDH->fRTable);
422     tableLength      = ds->readUInt32(rbbiDH->fRTableLen);
423 
424     if (tableLength > 0) {
425         ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
426                             outBytes+tableStartOffset, status);
427         ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
428                             outBytes+tableStartOffset+topSize, status);
429     }
430 
431     // Safe Forward state table.  Same layout as forward table, above.
432     tableStartOffset = ds->readUInt32(rbbiDH->fSFTable);
433     tableLength      = ds->readUInt32(rbbiDH->fSFTableLen);
434 
435     if (tableLength > 0) {
436         ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
437                             outBytes+tableStartOffset, status);
438         ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
439                             outBytes+tableStartOffset+topSize, status);
440     }
441 
442     // Safe Reverse state table.  Same layout as forward table, above.
443     tableStartOffset = ds->readUInt32(rbbiDH->fSRTable);
444     tableLength      = ds->readUInt32(rbbiDH->fSRTableLen);
445 
446     if (tableLength > 0) {
447         ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
448                             outBytes+tableStartOffset, status);
449         ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
450                             outBytes+tableStartOffset+topSize, status);
451     }
452 
453     // Trie table for character categories
454     utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
455                             outBytes+ds->readUInt32(rbbiDH->fTrie), status);
456 
457     // Source Rules Text.  It's UChar data
458     ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen),
459                         outBytes+ds->readUInt32(rbbiDH->fRuleSource), status);
460 
461     // Table of rule status values.  It's all int_32 values
462     ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen),
463                         outBytes+ds->readUInt32(rbbiDH->fStatusTable), status);
464 
465     // And, last, the header.
466     //   It is all int32_t values except for fFormataVersion, which is an array of four bytes.
467     //   Swap the whole thing as int32_t, then re-swap the one field.
468     //
469     ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status);
470     ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status);
471 
472     return totalSize;
473 }
474 
475 
476 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
477