1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  ************************************************************************************
5  * Copyright (C) 2006-2016, International Business Machines Corporation
6  * and others. All Rights Reserved.
7  ************************************************************************************
8  */
9 
10 #include "unicode/utypes.h"
11 
12 #if !UCONFIG_NO_BREAK_ITERATION
13 
14 #include "unicode/uchar.h"
15 #include "unicode/uniset.h"
16 #include "unicode/chariter.h"
17 #include "unicode/ures.h"
18 #include "unicode/udata.h"
19 #include "unicode/putil.h"
20 #include "unicode/ustring.h"
21 #include "unicode/uscript.h"
22 #include "unicode/ucharstrie.h"
23 #include "unicode/bytestrie.h"
24 
25 #include "brkeng.h"
26 #include "cmemory.h"
27 #include "dictbe.h"
28 #include "charstr.h"
29 #include "dictionarydata.h"
30 #include "mutex.h"
31 #include "uvector.h"
32 #include "umutex.h"
33 #include "uresimp.h"
34 #include "ubrkimpl.h"
35 
36 U_NAMESPACE_BEGIN
37 
38 /*
39  ******************************************************************
40  */
41 
LanguageBreakEngine()42 LanguageBreakEngine::LanguageBreakEngine() {
43 }
44 
~LanguageBreakEngine()45 LanguageBreakEngine::~LanguageBreakEngine() {
46 }
47 
48 /*
49  ******************************************************************
50  */
51 
LanguageBreakFactory()52 LanguageBreakFactory::LanguageBreakFactory() {
53 }
54 
~LanguageBreakFactory()55 LanguageBreakFactory::~LanguageBreakFactory() {
56 }
57 
58 /*
59  ******************************************************************
60  */
61 
UnhandledEngine(UErrorCode & status)62 UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) {
63     (void)status;
64 }
65 
~UnhandledEngine()66 UnhandledEngine::~UnhandledEngine() {
67     delete fHandled;
68     fHandled = nullptr;
69 }
70 
71 UBool
handles(UChar32 c) const72 UnhandledEngine::handles(UChar32 c) const {
73     return fHandled && fHandled->contains(c);
74 }
75 
76 int32_t
findBreaks(UText * text,int32_t,int32_t endPos,UVector32 &) const77 UnhandledEngine::findBreaks( UText *text,
78                              int32_t /* startPos */,
79                              int32_t endPos,
80                              UVector32 &/*foundBreaks*/ ) const {
81     UChar32 c = utext_current32(text);
82     while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
83         utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
84         c = utext_current32(text);
85     }
86     return 0;
87 }
88 
89 void
handleCharacter(UChar32 c)90 UnhandledEngine::handleCharacter(UChar32 c) {
91     if (fHandled == nullptr) {
92         fHandled = new UnicodeSet();
93         if (fHandled == nullptr) {
94             return;
95         }
96     }
97     if (!fHandled->contains(c)) {
98         UErrorCode status = U_ZERO_ERROR;
99         // Apply the entire script of the character.
100         int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
101         fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
102     }
103 }
104 
105 /*
106  ******************************************************************
107  */
108 
ICULanguageBreakFactory(UErrorCode &)109 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
110     fEngines = 0;
111 }
112 
~ICULanguageBreakFactory()113 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
114     if (fEngines != 0) {
115         delete fEngines;
116     }
117 }
118 
119 U_NAMESPACE_END
120 U_CDECL_BEGIN
_deleteEngine(void * obj)121 static void U_CALLCONV _deleteEngine(void *obj) {
122     delete (const icu::LanguageBreakEngine *) obj;
123 }
124 U_CDECL_END
125 U_NAMESPACE_BEGIN
126 
127 static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER;
128 
129 const LanguageBreakEngine *
getEngineFor(UChar32 c)130 ICULanguageBreakFactory::getEngineFor(UChar32 c) {
131     const LanguageBreakEngine *lbe = NULL;
132     UErrorCode  status = U_ZERO_ERROR;
133 
134     Mutex m(&gBreakEngineMutex);
135 
136     if (fEngines == NULL) {
137         UStack  *engines = new UStack(_deleteEngine, NULL, status);
138         if (U_FAILURE(status) || engines == NULL) {
139             // Note: no way to return error code to caller.
140             delete engines;
141             return NULL;
142         }
143         fEngines = engines;
144     } else {
145         int32_t i = fEngines->size();
146         while (--i >= 0) {
147             lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
148             if (lbe != NULL && lbe->handles(c)) {
149                 return lbe;
150             }
151         }
152     }
153 
154     // We didn't find an engine. Create one.
155     lbe = loadEngineFor(c);
156     if (lbe != NULL) {
157         fEngines->push((void *)lbe, status);
158     }
159     return lbe;
160 }
161 
162 const LanguageBreakEngine *
loadEngineFor(UChar32 c)163 ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
164     UErrorCode status = U_ZERO_ERROR;
165     UScriptCode code = uscript_getScript(c, &status);
166     if (U_SUCCESS(status)) {
167         DictionaryMatcher *m = loadDictionaryMatcherFor(code);
168         if (m != NULL) {
169             const LanguageBreakEngine *engine = NULL;
170             switch(code) {
171             case USCRIPT_THAI:
172                 engine = new ThaiBreakEngine(m, status);
173                 break;
174             case USCRIPT_LAO:
175                 engine = new LaoBreakEngine(m, status);
176                 break;
177             case USCRIPT_MYANMAR:
178                 engine = new BurmeseBreakEngine(m, status);
179                 break;
180             case USCRIPT_KHMER:
181                 engine = new KhmerBreakEngine(m, status);
182                 break;
183 
184 #if !UCONFIG_NO_NORMALIZATION
185                 // CJK not available w/o normalization
186             case USCRIPT_HANGUL:
187                 engine = new CjkBreakEngine(m, kKorean, status);
188                 break;
189 
190             // use same BreakEngine and dictionary for both Chinese and Japanese
191             case USCRIPT_HIRAGANA:
192             case USCRIPT_KATAKANA:
193             case USCRIPT_HAN:
194                 engine = new CjkBreakEngine(m, kChineseJapanese, status);
195                 break;
196 #if 0
197             // TODO: Have to get some characters with script=common handled
198             // by CjkBreakEngine (e.g. U+309B). Simply subjecting
199             // them to CjkBreakEngine does not work. The engine has to
200             // special-case them.
201             case USCRIPT_COMMON:
202             {
203                 UBlockCode block = ublock_getCode(code);
204                 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
205                    engine = new CjkBreakEngine(dict, kChineseJapanese, status);
206                 break;
207             }
208 #endif
209 #endif
210 
211             default:
212                 break;
213             }
214             if (engine == NULL) {
215                 delete m;
216             }
217             else if (U_FAILURE(status)) {
218                 delete engine;
219                 engine = NULL;
220             }
221             return engine;
222         }
223     }
224     return NULL;
225 }
226 
227 DictionaryMatcher *
loadDictionaryMatcherFor(UScriptCode script)228 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
229     UErrorCode status = U_ZERO_ERROR;
230     // open root from brkitr tree.
231     UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
232     b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
233     int32_t dictnlength = 0;
234     const UChar *dictfname =
235         ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
236     if (U_FAILURE(status)) {
237         ures_close(b);
238         return NULL;
239     }
240     CharString dictnbuf;
241     CharString ext;
242     const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength);  // last dot
243     if (extStart != NULL) {
244         int32_t len = (int32_t)(extStart - dictfname);
245         ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
246         dictnlength = len;
247     }
248     dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
249     ures_close(b);
250 
251     UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
252     if (U_SUCCESS(status)) {
253         // build trie
254         const uint8_t *data = (const uint8_t *)udata_getMemory(file);
255         const int32_t *indexes = (const int32_t *)data;
256         const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
257         const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
258         DictionaryMatcher *m = NULL;
259         if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
260             const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
261             const char *characters = (const char *)(data + offset);
262             m = new BytesDictionaryMatcher(characters, transform, file);
263         }
264         else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
265             const UChar *characters = (const UChar *)(data + offset);
266             m = new UCharsDictionaryMatcher(characters, file);
267         }
268         if (m == NULL) {
269             // no matcher exists to take ownership - either we are an invalid
270             // type or memory allocation failed
271             udata_close(file);
272         }
273         return m;
274     } else if (dictfname != NULL) {
275         // we don't have a dictionary matcher.
276         // returning NULL here will cause us to fail to find a dictionary break engine, as expected
277         status = U_ZERO_ERROR;
278         return NULL;
279     }
280     return NULL;
281 }
282 
283 U_NAMESPACE_END
284 
285 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
286