1 /*
2 *****************************************************************
3 * Copyright (c) 2002-2014, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 *****************************************************************
6 * Date Name Description
7 * 06/06/2002 aliu Creation.
8 *****************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_TRANSLITERATION
14
15 #include "unicode/uobject.h"
16 #include "unicode/uscript.h"
17
18 #include "anytrans.h"
19 #include "hash.h"
20 #include "mutex.h"
21 #include "nultrans.h"
22 #include "putilimp.h"
23 #include "tridpars.h"
24 #include "uinvchar.h"
25 #include "uvector.h"
26
27 //------------------------------------------------------------
28 // Constants
29
30 static const UChar TARGET_SEP = 45; // '-'
31 static const UChar VARIANT_SEP = 47; // '/'
32 static const UChar ANY[] = {65,110,121,0}; // "Any"
33 static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
34 static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"
35
36 //------------------------------------------------------------
37
38 U_CDECL_BEGIN
39 /**
40 * Deleter function for Transliterator*.
41 */
42 static void U_CALLCONV
_deleteTransliterator(void * obj)43 _deleteTransliterator(void *obj) {
44 delete (icu::Transliterator*) obj;
45 }
46 U_CDECL_END
47
48 //------------------------------------------------------------
49
50 U_NAMESPACE_BEGIN
51
52 //------------------------------------------------------------
53 // ScriptRunIterator
54
/**
 * Iterates over a range of text and reports a series of script runs.
 * The runs have the form:
 *
 *    ccccSScSSccccTTcTcccc   - c = common, S = first script, T = second
 *    |            |          - first run (start, limit)
 *             |            | - second run (start, limit)
 *
 * That is, consecutive runs overlap on the COMMON/INHERITED characters
 * between them, so that a transliterator can consider common characters
 * both before and after the characters of its script.
 */
class ScriptRunIterator : public UMemory {
private:
    // The text being scanned; held by reference, never modified here.
    const Replaceable& text;
    // Lower bound of the scanned range; runs never start before it.
    int32_t textStart;
    // Upper bound of the scanned range; moved by adjustLimit().
    int32_t textLimit;

public:
    /**
     * The script of the current run, valid after next() returns TRUE.
     * It is USCRIPT_INVALID_CODE when the run contains only
     * COMMON/INHERITED characters.
     */
    UScriptCode scriptCode;

    /**
     * The start of the run, inclusive, valid after next() returns.
     */
    int32_t start;

    /**
     * The end of the run, exclusive, valid after next() returns.
     * The following call to next() resumes scanning from here.
     */
    int32_t limit;

    /**
     * Constructs a run iterator over the given text from start
     * (inclusive) to limit (exclusive).
     */
    ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);

    /**
     * Advances to the next run. Returns TRUE if a run was found, in
     * which case the caller should examine scriptCode, start, and
     * limit. Returns FALSE once the previous run has reached the end
     * of the range (which happens on the first call if the range is
     * empty).
     */
    UBool next();

    /**
     * Adjusts internal indices for a change in the limit index of the
     * given delta. A positive delta means the limit has increased.
     * Both the current run limit and the end of the range are shifted.
     */
    void adjustLimit(int32_t delta);

private:
    ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
    ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
};
114
ScriptRunIterator(const Replaceable & theText,int32_t myStart,int32_t myLimit)115 ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
116 int32_t myStart, int32_t myLimit) :
117 text(theText)
118 {
119 textStart = myStart;
120 textLimit = myLimit;
121 limit = myStart;
122 }
123
next()124 UBool ScriptRunIterator::next() {
125 UChar32 ch;
126 UScriptCode s;
127 UErrorCode ec = U_ZERO_ERROR;
128
129 scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
130 start = limit;
131
132 // Are we done?
133 if (start == textLimit) {
134 return FALSE;
135 }
136
137 // Move start back to include adjacent COMMON or INHERITED
138 // characters
139 while (start > textStart) {
140 ch = text.char32At(start - 1); // look back
141 s = uscript_getScript(ch, &ec);
142 if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
143 --start;
144 } else {
145 break;
146 }
147 }
148
149 // Move limit ahead to include COMMON, INHERITED, and characters
150 // of the current script.
151 while (limit < textLimit) {
152 ch = text.char32At(limit); // look ahead
153 s = uscript_getScript(ch, &ec);
154 if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
155 if (scriptCode == USCRIPT_INVALID_CODE) {
156 scriptCode = s;
157 } else if (s != scriptCode) {
158 break;
159 }
160 }
161 ++limit;
162 }
163
164 // Return TRUE even if the entire text is COMMON / INHERITED, in
165 // which case scriptCode will be USCRIPT_INVALID_CODE.
166 return TRUE;
167 }
168
adjustLimit(int32_t delta)169 void ScriptRunIterator::adjustLimit(int32_t delta) {
170 limit += delta;
171 textLimit += delta;
172 }
173
174 //------------------------------------------------------------
175 // AnyTransliterator
176
// ICU RTTI boilerplate for AnyTransliterator.
// NOTE(review): presumably expands to the class-ID accessor definitions
// declared in the class header -- confirm against unicode/uobject.h.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)
178
179 AnyTransliterator::AnyTransliterator(const UnicodeString& id,
180 const UnicodeString& theTarget,
181 const UnicodeString& theVariant,
182 UScriptCode theTargetScript,
183 UErrorCode& ec) :
184 Transliterator(id, NULL),
185 targetScript(theTargetScript)
186 {
187 cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
188 if (U_FAILURE(ec)) {
189 return;
190 }
191 uhash_setValueDeleter(cache, _deleteTransliterator);
192
193 target = theTarget;
194 if (theVariant.length() > 0) {
195 target.append(VARIANT_SEP).append(theVariant);
196 }
197 }
198
/**
 * Destructor. Closes the per-script transliterator cache.
 * NOTE(review): the constructors register _deleteTransliterator as the
 * cache's value deleter, so closing the table presumably also deletes
 * the cached transliterators -- confirm against uhash.
 */
AnyTransliterator::~AnyTransliterator() {
    uhash_close(cache);
}
202
203 /**
204 * Copy constructor.
205 */
AnyTransliterator(const AnyTransliterator & o)206 AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
207 Transliterator(o),
208 target(o.target),
209 targetScript(o.targetScript)
210 {
211 // Don't copy the cache contents
212 UErrorCode ec = U_ZERO_ERROR;
213 cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
214 if (U_FAILURE(ec)) {
215 return;
216 }
217 uhash_setValueDeleter(cache, _deleteTransliterator);
218 }
219
220 /**
221 * Transliterator API.
222 */
clone() const223 Transliterator* AnyTransliterator::clone() const {
224 return new AnyTransliterator(*this);
225 }
226
227 /**
228 * Implements {@link Transliterator#handleTransliterate}.
229 */
handleTransliterate(Replaceable & text,UTransPosition & pos,UBool isIncremental) const230 void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
231 UBool isIncremental) const {
232 int32_t allStart = pos.start;
233 int32_t allLimit = pos.limit;
234
235 ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
236
237 while (it.next()) {
238 // Ignore runs in the ante context
239 if (it.limit <= allStart) continue;
240
241 // Try to instantiate transliterator from it.scriptCode to
242 // our target or target/variant
243 Transliterator* t = getTransliterator(it.scriptCode);
244
245 if (t == NULL) {
246 // We have no transliterator. Do nothing, but keep
247 // pos.start up to date.
248 pos.start = it.limit;
249 continue;
250 }
251
252 // If the run end is before the transliteration limit, do
253 // a non-incremental transliteration. Otherwise do an
254 // incremental one.
255 UBool incremental = isIncremental && (it.limit >= allLimit);
256
257 pos.start = uprv_max(allStart, it.start);
258 pos.limit = uprv_min(allLimit, it.limit);
259 int32_t limit = pos.limit;
260 t->filteredTransliterate(text, pos, incremental);
261 int32_t delta = pos.limit - limit;
262 allLimit += delta;
263 it.adjustLimit(delta);
264
265 // We're done if we enter the post context
266 if (it.limit >= allLimit) break;
267 }
268
269 // Restore limit. pos.start is fine where the last transliterator
270 // left it, or at the end of the last run.
271 pos.limit = allLimit;
272 }
273
getTransliterator(UScriptCode source) const274 Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
275
276 if (source == targetScript || source == USCRIPT_INVALID_CODE) {
277 return NULL;
278 }
279
280 Transliterator* t = NULL;
281 {
282 Mutex m(NULL);
283 t = (Transliterator*) uhash_iget(cache, (int32_t) source);
284 }
285 if (t == NULL) {
286 UErrorCode ec = U_ZERO_ERROR;
287 UnicodeString sourceName(uscript_getName(source), -1, US_INV);
288 UnicodeString id(sourceName);
289 id.append(TARGET_SEP).append(target);
290
291 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
292 if (U_FAILURE(ec) || t == NULL) {
293 delete t;
294
295 // Try to pivot around Latin, our most common script
296 id = sourceName;
297 id.append(LATIN_PIVOT, -1).append(target);
298 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
299 if (U_FAILURE(ec) || t == NULL) {
300 delete t;
301 t = NULL;
302 }
303 }
304
305 if (t != NULL) {
306 Transliterator *rt = NULL;
307 {
308 Mutex m(NULL);
309 rt = static_cast<Transliterator *> (uhash_iget(cache, (int32_t) source));
310 if (rt == NULL) {
311 // Common case, no race to cache this new transliterator.
312 uhash_iput(cache, (int32_t) source, t, &ec);
313 } else {
314 // Race case, some other thread beat us to caching this transliterator.
315 Transliterator *temp = rt;
316 rt = t; // Our newly created transliterator that lost the race & now needs deleting.
317 t = temp; // The transliterator from the cache that we will return.
318 }
319 }
320 delete rt; // will be non-null only in case of races.
321 }
322 }
323 return t;
324 }
325
326 /**
327 * Return the script code for a given name, or -1 if not found.
328 */
scriptNameToCode(const UnicodeString & name)329 static UScriptCode scriptNameToCode(const UnicodeString& name) {
330 char buf[128];
331 UScriptCode code;
332 UErrorCode ec = U_ZERO_ERROR;
333 int32_t nameLen = name.length();
334 UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);
335
336 if (isInvariant) {
337 name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
338 buf[127] = 0; // Make sure that we NULL terminate the string.
339 }
340 if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
341 {
342 code = USCRIPT_INVALID_CODE;
343 }
344 return code;
345 }
346
347 /**
348 * Registers standard transliterators with the system. Called by
349 * Transliterator during initialization. Scan all current targets and
350 * register those that are scripts T as Any-T/V.
351 */
registerIDs()352 void AnyTransliterator::registerIDs() {
353
354 UErrorCode ec = U_ZERO_ERROR;
355 Hashtable seen(TRUE, ec);
356
357 int32_t sourceCount = Transliterator::_countAvailableSources();
358 for (int32_t s=0; s<sourceCount; ++s) {
359 UnicodeString source;
360 Transliterator::_getAvailableSource(s, source);
361
362 // Ignore the "Any" source
363 if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
364
365 int32_t targetCount = Transliterator::_countAvailableTargets(source);
366 for (int32_t t=0; t<targetCount; ++t) {
367 UnicodeString target;
368 Transliterator::_getAvailableTarget(t, source, target);
369
370 // Only process each target once
371 if (seen.geti(target) != 0) continue;
372 ec = U_ZERO_ERROR;
373 seen.puti(target, 1, ec);
374
375 // Get the script code for the target. If not a script, ignore.
376 UScriptCode targetScript = scriptNameToCode(target);
377 if (targetScript == USCRIPT_INVALID_CODE) continue;
378
379 int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
380 // assert(variantCount >= 1);
381 for (int32_t v=0; v<variantCount; ++v) {
382 UnicodeString variant;
383 Transliterator::_getAvailableVariant(v, source, target, variant);
384
385 UnicodeString id;
386 TransliteratorIDParser::STVtoID(UnicodeString(TRUE, ANY, 3), target, variant, id);
387 ec = U_ZERO_ERROR;
388 AnyTransliterator* t = new AnyTransliterator(id, target, variant,
389 targetScript, ec);
390 if (U_FAILURE(ec)) {
391 delete t;
392 } else {
393 Transliterator::_registerInstance(t);
394 Transliterator::_registerSpecialInverse(target, UnicodeString(TRUE, NULL_ID, 4), FALSE);
395 }
396 }
397 }
398 }
399 }
400
401 U_NAMESPACE_END
402
403 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
404
405 //eof
406