1 /*
2 ************************************************************************************
3 * Copyright (C) 2006-2014, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 ************************************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_BREAK_ITERATION
11
12 #include "brkeng.h"
13 #include "dictbe.h"
14 #include "unicode/uchar.h"
15 #include "unicode/uniset.h"
16 #include "unicode/chariter.h"
17 #include "unicode/ures.h"
18 #include "unicode/udata.h"
19 #include "unicode/putil.h"
20 #include "unicode/ustring.h"
21 #include "unicode/uscript.h"
22 #include "unicode/ucharstrie.h"
23 #include "unicode/bytestrie.h"
24 #include "charstr.h"
25 #include "dictionarydata.h"
26 #include "uvector.h"
27 #include "umutex.h"
28 #include "uresimp.h"
29 #include "ubrkimpl.h"
30
31 U_NAMESPACE_BEGIN
32
33 /*
34 ******************************************************************
35 */
36
LanguageBreakEngine()37 LanguageBreakEngine::LanguageBreakEngine() {
38 }
39
~LanguageBreakEngine()40 LanguageBreakEngine::~LanguageBreakEngine() {
41 }
42
43 /*
44 ******************************************************************
45 */
46
LanguageBreakFactory()47 LanguageBreakFactory::LanguageBreakFactory() {
48 }
49
~LanguageBreakFactory()50 LanguageBreakFactory::~LanguageBreakFactory() {
51 }
52
53 /*
54 ******************************************************************
55 */
56
UnhandledEngine(UErrorCode &)57 UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
58 for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
59 fHandled[i] = 0;
60 }
61 }
62
~UnhandledEngine()63 UnhandledEngine::~UnhandledEngine() {
64 for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
65 if (fHandled[i] != 0) {
66 delete fHandled[i];
67 }
68 }
69 }
70
71 UBool
handles(UChar32 c,int32_t breakType) const72 UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
73 return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))
74 && fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
75 }
76
77 int32_t
findBreaks(UText * text,int32_t startPos,int32_t endPos,UBool reverse,int32_t breakType,UStack &) const78 UnhandledEngine::findBreaks( UText *text,
79 int32_t startPos,
80 int32_t endPos,
81 UBool reverse,
82 int32_t breakType,
83 UStack &/*foundBreaks*/ ) const {
84 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
85 UChar32 c = utext_current32(text);
86 if (reverse) {
87 while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
88 c = utext_previous32(text);
89 }
90 }
91 else {
92 while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
93 utext_next32(text); // TODO: recast loop to work with post-increment operations.
94 c = utext_current32(text);
95 }
96 }
97 }
98 return 0;
99 }
100
101 void
handleCharacter(UChar32 c,int32_t breakType)102 UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
103 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
104 if (fHandled[breakType] == 0) {
105 fHandled[breakType] = new UnicodeSet();
106 if (fHandled[breakType] == 0) {
107 return;
108 }
109 }
110 if (!fHandled[breakType]->contains(c)) {
111 UErrorCode status = U_ZERO_ERROR;
112 // Apply the entire script of the character.
113 int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
114 fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
115 }
116 }
117 }
118
119 /*
120 ******************************************************************
121 */
122
ICULanguageBreakFactory(UErrorCode &)123 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
124 fEngines = 0;
125 }
126
~ICULanguageBreakFactory()127 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
128 if (fEngines != 0) {
129 delete fEngines;
130 }
131 }
132
133 U_NAMESPACE_END
134 U_CDECL_BEGIN
_deleteEngine(void * obj)135 static void U_CALLCONV _deleteEngine(void *obj) {
136 delete (const icu::LanguageBreakEngine *) obj;
137 }
138 U_CDECL_END
139 U_NAMESPACE_BEGIN
140
141 const LanguageBreakEngine *
getEngineFor(UChar32 c,int32_t breakType)142 ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
143 UBool needsInit;
144 int32_t i;
145 const LanguageBreakEngine *lbe = NULL;
146 UErrorCode status = U_ZERO_ERROR;
147
148 // TODO: The global mutex should not be used.
149 // The global mutex should only be used for short periods.
150 // A ICULanguageBreakFactory specific mutex should be used.
151 umtx_lock(NULL);
152 needsInit = (UBool)(fEngines == NULL);
153 if (!needsInit) {
154 i = fEngines->size();
155 while (--i >= 0) {
156 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
157 if (lbe != NULL && lbe->handles(c, breakType)) {
158 break;
159 }
160 lbe = NULL;
161 }
162 }
163 umtx_unlock(NULL);
164
165 if (lbe != NULL) {
166 return lbe;
167 }
168
169 if (needsInit) {
170 UStack *engines = new UStack(_deleteEngine, NULL, status);
171 if (U_SUCCESS(status) && engines == NULL) {
172 status = U_MEMORY_ALLOCATION_ERROR;
173 }
174 else if (U_FAILURE(status)) {
175 delete engines;
176 engines = NULL;
177 }
178 else {
179 umtx_lock(NULL);
180 if (fEngines == NULL) {
181 fEngines = engines;
182 engines = NULL;
183 }
184 umtx_unlock(NULL);
185 delete engines;
186 }
187 }
188
189 if (fEngines == NULL) {
190 return NULL;
191 }
192
193 // We didn't find an engine the first time through, or there was no
194 // stack. Create an engine.
195 const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType);
196
197 // Now get the lock, and see if someone else has created it in the
198 // meantime
199 umtx_lock(NULL);
200 i = fEngines->size();
201 while (--i >= 0) {
202 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
203 if (lbe != NULL && lbe->handles(c, breakType)) {
204 break;
205 }
206 lbe = NULL;
207 }
208 if (lbe == NULL && newlbe != NULL) {
209 fEngines->push((void *)newlbe, status);
210 lbe = newlbe;
211 newlbe = NULL;
212 }
213 umtx_unlock(NULL);
214
215 delete newlbe;
216
217 return lbe;
218 }
219
220 const LanguageBreakEngine *
loadEngineFor(UChar32 c,int32_t breakType)221 ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
222 UErrorCode status = U_ZERO_ERROR;
223 UScriptCode code = uscript_getScript(c, &status);
224 if (U_SUCCESS(status)) {
225 DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
226 if (m != NULL) {
227 const LanguageBreakEngine *engine = NULL;
228 switch(code) {
229 case USCRIPT_THAI:
230 engine = new ThaiBreakEngine(m, status);
231 break;
232 case USCRIPT_LAO:
233 engine = new LaoBreakEngine(m, status);
234 break;
235 case USCRIPT_MYANMAR:
236 engine = new BurmeseBreakEngine(m, status);
237 break;
238 case USCRIPT_KHMER:
239 engine = new KhmerBreakEngine(m, status);
240 break;
241
242 #if !UCONFIG_NO_NORMALIZATION
243 // CJK not available w/o normalization
244 case USCRIPT_HANGUL:
245 engine = new CjkBreakEngine(m, kKorean, status);
246 break;
247
248 // use same BreakEngine and dictionary for both Chinese and Japanese
249 case USCRIPT_HIRAGANA:
250 case USCRIPT_KATAKANA:
251 case USCRIPT_HAN:
252 engine = new CjkBreakEngine(m, kChineseJapanese, status);
253 break;
254 #if 0
255 // TODO: Have to get some characters with script=common handled
256 // by CjkBreakEngine (e.g. U+309B). Simply subjecting
257 // them to CjkBreakEngine does not work. The engine has to
258 // special-case them.
259 case USCRIPT_COMMON:
260 {
261 UBlockCode block = ublock_getCode(code);
262 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
263 engine = new CjkBreakEngine(dict, kChineseJapanese, status);
264 break;
265 }
266 #endif
267 #endif
268
269 default:
270 break;
271 }
272 if (engine == NULL) {
273 delete m;
274 }
275 else if (U_FAILURE(status)) {
276 delete engine;
277 engine = NULL;
278 }
279 return engine;
280 }
281 }
282 return NULL;
283 }
284
285 DictionaryMatcher *
loadDictionaryMatcherFor(UScriptCode script,int32_t)286 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
287 UErrorCode status = U_ZERO_ERROR;
288 // open root from brkitr tree.
289 UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
290 b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
291 int32_t dictnlength = 0;
292 const UChar *dictfname =
293 ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
294 if (U_FAILURE(status)) {
295 ures_close(b);
296 return NULL;
297 }
298 CharString dictnbuf;
299 CharString ext;
300 const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot
301 if (extStart != NULL) {
302 int32_t len = (int32_t)(extStart - dictfname);
303 ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
304 dictnlength = len;
305 }
306 dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
307 ures_close(b);
308
309 UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
310 if (U_SUCCESS(status)) {
311 // build trie
312 const uint8_t *data = (const uint8_t *)udata_getMemory(file);
313 const int32_t *indexes = (const int32_t *)data;
314 const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
315 const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
316 DictionaryMatcher *m = NULL;
317 if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
318 const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
319 const char *characters = (const char *)(data + offset);
320 m = new BytesDictionaryMatcher(characters, transform, file);
321 }
322 else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
323 const UChar *characters = (const UChar *)(data + offset);
324 m = new UCharsDictionaryMatcher(characters, file);
325 }
326 if (m == NULL) {
327 // no matcher exists to take ownership - either we are an invalid
328 // type or memory allocation failed
329 udata_close(file);
330 }
331 return m;
332 } else if (dictfname != NULL) {
333 // we don't have a dictionary matcher.
334 // returning NULL here will cause us to fail to find a dictionary break engine, as expected
335 status = U_ZERO_ERROR;
336 return NULL;
337 }
338 return NULL;
339 }
340
341 U_NAMESPACE_END
342
343 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
344