1 /*
2 *******************************************************************************
3 * Copyright (C) 2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * dictionarydata.h
7 *
8 * created on: 2012may31
9 * created by: Markus W. Scherer & Maxime Serrano
10 */
11
12 #include "dictionarydata.h"
13 #include "unicode/ucharstrie.h"
14 #include "unicode/bytestrie.h"
15 #include "unicode/udata.h"
16 #include "cmemory.h"
17
18 #if !UCONFIG_NO_BREAK_ITERATION
19
20 U_NAMESPACE_BEGIN
21
22 const int32_t DictionaryData::TRIE_TYPE_BYTES = 0;
23 const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1;
24 const int32_t DictionaryData::TRIE_TYPE_MASK = 7;
25 const int32_t DictionaryData::TRIE_HAS_VALUES = 8;
26
27 const int32_t DictionaryData::TRANSFORM_NONE = 0;
28 const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
29 const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
30 const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
31
~DictionaryMatcher()32 DictionaryMatcher::~DictionaryMatcher() {
33 }
34
~UCharsDictionaryMatcher()35 UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
36 udata_close(file);
37 }
38
getType() const39 int32_t UCharsDictionaryMatcher::getType() const {
40 return DictionaryData::TRIE_TYPE_UCHARS;
41 }
42
matches(UText * text,int32_t maxLength,int32_t limit,int32_t * lengths,int32_t * cpLengths,int32_t * values,int32_t * prefix) const43 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
44 int32_t *lengths, int32_t *cpLengths, int32_t *values,
45 int32_t *prefix) const {
46
47 UCharsTrie uct(characters);
48 int32_t startingTextIndex = utext_getNativeIndex(text);
49 int32_t wordCount = 0;
50 int32_t codePointsMatched = 0;
51
52 for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
53 UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
54 int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex;
55 codePointsMatched += 1;
56 if (USTRINGTRIE_HAS_VALUE(result)) {
57 if (wordCount < limit) {
58 if (values != NULL) {
59 values[wordCount] = uct.getValue();
60 }
61 if (lengths != NULL) {
62 lengths[wordCount] = lengthMatched;
63 }
64 if (cpLengths != NULL) {
65 cpLengths[wordCount] = codePointsMatched;
66 }
67 ++wordCount;
68 }
69 if (result == USTRINGTRIE_FINAL_VALUE) {
70 break;
71 }
72 }
73 else if (result == USTRINGTRIE_NO_MATCH) {
74 break;
75 }
76 if (lengthMatched >= maxLength) {
77 break;
78 }
79 }
80
81 if (prefix != NULL) {
82 *prefix = codePointsMatched;
83 }
84 return wordCount;
85 }
86
~BytesDictionaryMatcher()87 BytesDictionaryMatcher::~BytesDictionaryMatcher() {
88 udata_close(file);
89 }
90
transform(UChar32 c) const91 UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
92 if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
93 if (c == 0x200D) {
94 return 0xFF;
95 } else if (c == 0x200C) {
96 return 0xFE;
97 }
98 int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
99 if (delta < 0 || 0xFD < delta) {
100 return U_SENTINEL;
101 }
102 return (UChar32)delta;
103 }
104 return c;
105 }
106
getType() const107 int32_t BytesDictionaryMatcher::getType() const {
108 return DictionaryData::TRIE_TYPE_BYTES;
109 }
110
matches(UText * text,int32_t maxLength,int32_t limit,int32_t * lengths,int32_t * cpLengths,int32_t * values,int32_t * prefix) const111 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
112 int32_t *lengths, int32_t *cpLengths, int32_t *values,
113 int32_t *prefix) const {
114 BytesTrie bt(characters);
115 int32_t startingTextIndex = utext_getNativeIndex(text);
116 int32_t wordCount = 0;
117 int32_t codePointsMatched = 0;
118
119 for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
120 UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
121 int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex;
122 codePointsMatched += 1;
123 if (USTRINGTRIE_HAS_VALUE(result)) {
124 if (wordCount < limit) {
125 if (values != NULL) {
126 values[wordCount] = bt.getValue();
127 }
128 if (lengths != NULL) {
129 lengths[wordCount] = lengthMatched;
130 }
131 if (cpLengths != NULL) {
132 cpLengths[wordCount] = codePointsMatched;
133 }
134 ++wordCount;
135 }
136 if (result == USTRINGTRIE_FINAL_VALUE) {
137 break;
138 }
139 }
140 else if (result == USTRINGTRIE_NO_MATCH) {
141 break;
142 }
143 if (lengthMatched >= maxLength) {
144 break;
145 }
146 }
147
148 if (prefix != NULL) {
149 *prefix = codePointsMatched;
150 }
151 return wordCount;
152 }
153
154
155 U_NAMESPACE_END
156
157 U_NAMESPACE_USE
158
159 U_CAPI int32_t U_EXPORT2
udict_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)160 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
161 void *outData, UErrorCode *pErrorCode) {
162 const UDataInfo *pInfo;
163 int32_t headerSize;
164 const uint8_t *inBytes;
165 uint8_t *outBytes;
166 const int32_t *inIndexes;
167 int32_t indexes[DictionaryData::IX_COUNT];
168 int32_t i, offset, size;
169
170 headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
171 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
172 pInfo = (const UDataInfo *)((const char *)inData + 4);
173 if (!(pInfo->dataFormat[0] == 0x44 &&
174 pInfo->dataFormat[1] == 0x69 &&
175 pInfo->dataFormat[2] == 0x63 &&
176 pInfo->dataFormat[3] == 0x74 &&
177 pInfo->formatVersion[0] == 1)) {
178 udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
179 pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
180 *pErrorCode = U_UNSUPPORTED_ERROR;
181 return 0;
182 }
183
184 inBytes = (const uint8_t *)inData + headerSize;
185 outBytes = (uint8_t *)outData + headerSize;
186
187 inIndexes = (const int32_t *)inBytes;
188 if (length >= 0) {
189 length -= headerSize;
190 if (length < (int32_t)(sizeof(indexes))) {
191 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
192 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
193 return 0;
194 }
195 }
196
197 for (i = 0; i < DictionaryData::IX_COUNT; i++) {
198 indexes[i] = udata_readInt32(ds, inIndexes[i]);
199 }
200
201 size = indexes[DictionaryData::IX_TOTAL_SIZE];
202
203 if (length >= 0) {
204 if (length < size) {
205 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
206 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
207 return 0;
208 }
209
210 if (inBytes != outBytes) {
211 uprv_memcpy(outBytes, inBytes, size);
212 }
213
214 offset = 0;
215 ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
216 offset = (int32_t)sizeof(indexes);
217 int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
218 int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
219
220 if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
221 ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
222 } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
223 // nothing to do
224 } else {
225 udata_printError(ds, "udict_swap(): unknown trie type!\n");
226 *pErrorCode = U_UNSUPPORTED_ERROR;
227 return 0;
228 }
229
230 // these next two sections are empty in the current format,
231 // but may be used later.
232 offset = nextOffset;
233 nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
234 offset = nextOffset;
235 nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
236 offset = nextOffset;
237 }
238 return headerSize + size;
239 }
240 #endif
241