1 /*
2 *******************************************************************************
3 * Copyright (C) 2014, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
* dictionarydata.cpp
7 *
8 * created on: 2012may31
9 * created by: Markus W. Scherer & Maxime Serrano
10 */
11 
12 #include "dictionarydata.h"
13 #include "unicode/ucharstrie.h"
14 #include "unicode/bytestrie.h"
15 #include "unicode/udata.h"
16 #include "cmemory.h"
17 
18 #if !UCONFIG_NO_BREAK_ITERATION
19 
20 U_NAMESPACE_BEGIN
21 
22 const int32_t  DictionaryData::TRIE_TYPE_BYTES = 0;
23 const int32_t  DictionaryData::TRIE_TYPE_UCHARS = 1;
24 const int32_t  DictionaryData::TRIE_TYPE_MASK = 7;
25 const int32_t  DictionaryData::TRIE_HAS_VALUES = 8;
26 
27 const int32_t  DictionaryData::TRANSFORM_NONE = 0;
28 const int32_t  DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
29 const int32_t  DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
30 const int32_t  DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
31 
~DictionaryMatcher()32 DictionaryMatcher::~DictionaryMatcher() {
33 }
34 
~UCharsDictionaryMatcher()35 UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
36     udata_close(file);
37 }
38 
getType() const39 int32_t UCharsDictionaryMatcher::getType() const {
40     return DictionaryData::TRIE_TYPE_UCHARS;
41 }
42 
matches(UText * text,int32_t maxLength,int32_t limit,int32_t * lengths,int32_t * cpLengths,int32_t * values,int32_t * prefix) const43 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
44                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
45                             int32_t *prefix) const {
46 
47     UCharsTrie uct(characters);
48     int32_t startingTextIndex = utext_getNativeIndex(text);
49     int32_t wordCount = 0;
50     int32_t codePointsMatched = 0;
51 
52     for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
53         UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
54         int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex;
55         codePointsMatched += 1;
56         if (USTRINGTRIE_HAS_VALUE(result)) {
57             if (wordCount < limit) {
58                 if (values != NULL) {
59                     values[wordCount] = uct.getValue();
60                 }
61                 if (lengths != NULL) {
62                     lengths[wordCount] = lengthMatched;
63                 }
64                 if (cpLengths != NULL) {
65                     cpLengths[wordCount] = codePointsMatched;
66                 }
67                 ++wordCount;
68             }
69             if (result == USTRINGTRIE_FINAL_VALUE) {
70                 break;
71             }
72         }
73         else if (result == USTRINGTRIE_NO_MATCH) {
74             break;
75         }
76         if (lengthMatched >= maxLength) {
77             break;
78         }
79     }
80 
81     if (prefix != NULL) {
82         *prefix = codePointsMatched;
83     }
84     return wordCount;
85 }
86 
~BytesDictionaryMatcher()87 BytesDictionaryMatcher::~BytesDictionaryMatcher() {
88     udata_close(file);
89 }
90 
transform(UChar32 c) const91 UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
92     if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
93         if (c == 0x200D) {
94             return 0xFF;
95         } else if (c == 0x200C) {
96             return 0xFE;
97         }
98         int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
99         if (delta < 0 || 0xFD < delta) {
100             return U_SENTINEL;
101         }
102         return (UChar32)delta;
103     }
104     return c;
105 }
106 
getType() const107 int32_t BytesDictionaryMatcher::getType() const {
108     return DictionaryData::TRIE_TYPE_BYTES;
109 }
110 
matches(UText * text,int32_t maxLength,int32_t limit,int32_t * lengths,int32_t * cpLengths,int32_t * values,int32_t * prefix) const111 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
112                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
113                             int32_t *prefix) const {
114     BytesTrie bt(characters);
115     int32_t startingTextIndex = utext_getNativeIndex(text);
116     int32_t wordCount = 0;
117     int32_t codePointsMatched = 0;
118 
119     for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
120         UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
121         int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex;
122         codePointsMatched += 1;
123         if (USTRINGTRIE_HAS_VALUE(result)) {
124             if (wordCount < limit) {
125                 if (values != NULL) {
126                     values[wordCount] = bt.getValue();
127                 }
128                 if (lengths != NULL) {
129                     lengths[wordCount] = lengthMatched;
130                 }
131                 if (cpLengths != NULL) {
132                     cpLengths[wordCount] = codePointsMatched;
133                 }
134                 ++wordCount;
135             }
136             if (result == USTRINGTRIE_FINAL_VALUE) {
137                 break;
138             }
139         }
140         else if (result == USTRINGTRIE_NO_MATCH) {
141             break;
142         }
143         if (lengthMatched >= maxLength) {
144             break;
145         }
146     }
147 
148     if (prefix != NULL) {
149         *prefix = codePointsMatched;
150     }
151     return wordCount;
152 }
153 
154 
155 U_NAMESPACE_END
156 
157 U_NAMESPACE_USE
158 
159 U_CAPI int32_t U_EXPORT2
udict_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)160 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
161            void *outData, UErrorCode *pErrorCode) {
162     const UDataInfo *pInfo;
163     int32_t headerSize;
164     const uint8_t *inBytes;
165     uint8_t *outBytes;
166     const int32_t *inIndexes;
167     int32_t indexes[DictionaryData::IX_COUNT];
168     int32_t i, offset, size;
169 
170     headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
171     if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
172     pInfo = (const UDataInfo *)((const char *)inData + 4);
173     if (!(pInfo->dataFormat[0] == 0x44 &&
174           pInfo->dataFormat[1] == 0x69 &&
175           pInfo->dataFormat[2] == 0x63 &&
176           pInfo->dataFormat[3] == 0x74 &&
177           pInfo->formatVersion[0] == 1)) {
178         udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
179                          pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
180         *pErrorCode = U_UNSUPPORTED_ERROR;
181         return 0;
182     }
183 
184     inBytes = (const uint8_t *)inData + headerSize;
185     outBytes = (uint8_t *)outData + headerSize;
186 
187     inIndexes = (const int32_t *)inBytes;
188     if (length >= 0) {
189         length -= headerSize;
190         if (length < (int32_t)(sizeof(indexes))) {
191             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
192             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
193             return 0;
194         }
195     }
196 
197     for (i = 0; i < DictionaryData::IX_COUNT; i++) {
198         indexes[i] = udata_readInt32(ds, inIndexes[i]);
199     }
200 
201     size = indexes[DictionaryData::IX_TOTAL_SIZE];
202 
203     if (length >= 0) {
204         if (length < size) {
205             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
206             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
207             return 0;
208         }
209 
210         if (inBytes != outBytes) {
211             uprv_memcpy(outBytes, inBytes, size);
212         }
213 
214         offset = 0;
215         ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
216         offset = (int32_t)sizeof(indexes);
217         int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
218         int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
219 
220         if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
221             ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
222         } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
223             // nothing to do
224         } else {
225             udata_printError(ds, "udict_swap(): unknown trie type!\n");
226             *pErrorCode = U_UNSUPPORTED_ERROR;
227             return 0;
228         }
229 
230         // these next two sections are empty in the current format,
231         // but may be used later.
232         offset = nextOffset;
233         nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
234         offset = nextOffset;
235         nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
236         offset = nextOffset;
237     }
238     return headerSize + size;
239 }
240 #endif
241