1 /*
2 *****************************************************************
3 * Copyright (c) 2002-2014, International Business Machines Corporation
4 * and others.  All Rights Reserved.
5 *****************************************************************
6 * Date        Name        Description
7 * 06/06/2002  aliu        Creation.
8 *****************************************************************
9 */
10 
11 #include "unicode/utypes.h"
12 
13 #if !UCONFIG_NO_TRANSLITERATION
14 
15 #include "unicode/uobject.h"
16 #include "unicode/uscript.h"
17 
18 #include "anytrans.h"
19 #include "hash.h"
20 #include "mutex.h"
21 #include "nultrans.h"
22 #include "putilimp.h"
23 #include "tridpars.h"
24 #include "uinvchar.h"
25 #include "uvector.h"
26 
27 //------------------------------------------------------------
28 // Constants
29 
30 static const UChar TARGET_SEP = 45; // '-'
31 static const UChar VARIANT_SEP = 47; // '/'
32 static const UChar ANY[] = {65,110,121,0}; // "Any"
33 static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
34 static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"
35 
36 //------------------------------------------------------------
37 
38 U_CDECL_BEGIN
39 /**
40  * Deleter function for Transliterator*.
41  */
42 static void U_CALLCONV
_deleteTransliterator(void * obj)43 _deleteTransliterator(void *obj) {
44     delete (icu::Transliterator*) obj;
45 }
46 U_CDECL_END
47 
48 //------------------------------------------------------------
49 
50 U_NAMESPACE_BEGIN
51 
52 //------------------------------------------------------------
53 // ScriptRunIterator
54 
55 /**
56  * Returns a series of ranges corresponding to scripts. They will be
57  * of the form:
58  *
59  * ccccSScSSccccTTcTcccc   - c = common, S = first script, T = second
60  * |            |          - first run (start, limit)
61  *          |           |  - second run (start, limit)
62  *
63  * That is, the runs will overlap. The reason for this is so that a
64  * transliterator can consider common characters both before and after
65  * the scripts.
66  */
67 class ScriptRunIterator : public UMemory {
68 private:
69     const Replaceable& text;
70     int32_t textStart;
71     int32_t textLimit;
72 
73 public:
74     /**
75      * The code of the current run, valid after next() returns.  May
76      * be USCRIPT_INVALID_CODE if and only if the entire text is
77      * COMMON/INHERITED.
78      */
79     UScriptCode scriptCode;
80 
81     /**
82      * The start of the run, inclusive, valid after next() returns.
83      */
84     int32_t start;
85 
86     /**
87      * The end of the run, exclusive, valid after next() returns.
88      */
89     int32_t limit;
90 
91     /**
92      * Constructs a run iterator over the given text from start
93      * (inclusive) to limit (exclusive).
94      */
95     ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);
96 
97     /**
98      * Returns TRUE if there are any more runs.  TRUE is always
99      * returned at least once.  Upon return, the caller should
100      * examine scriptCode, start, and limit.
101      */
102     UBool next();
103 
104     /**
105      * Adjusts internal indices for a change in the limit index of the
106      * given delta.  A positive delta means the limit has increased.
107      */
108     void adjustLimit(int32_t delta);
109 
110 private:
111     ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
112     ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
113 };
114 
ScriptRunIterator(const Replaceable & theText,int32_t myStart,int32_t myLimit)115 ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
116                                      int32_t myStart, int32_t myLimit) :
117     text(theText)
118 {
119     textStart = myStart;
120     textLimit = myLimit;
121     limit = myStart;
122 }
123 
next()124 UBool ScriptRunIterator::next() {
125     UChar32 ch;
126     UScriptCode s;
127     UErrorCode ec = U_ZERO_ERROR;
128 
129     scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
130     start = limit;
131 
132     // Are we done?
133     if (start == textLimit) {
134         return FALSE;
135     }
136 
137     // Move start back to include adjacent COMMON or INHERITED
138     // characters
139     while (start > textStart) {
140         ch = text.char32At(start - 1); // look back
141         s = uscript_getScript(ch, &ec);
142         if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
143             --start;
144         } else {
145             break;
146         }
147     }
148 
149     // Move limit ahead to include COMMON, INHERITED, and characters
150     // of the current script.
151     while (limit < textLimit) {
152         ch = text.char32At(limit); // look ahead
153         s = uscript_getScript(ch, &ec);
154         if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
155             if (scriptCode == USCRIPT_INVALID_CODE) {
156                 scriptCode = s;
157             } else if (s != scriptCode) {
158                 break;
159             }
160         }
161         ++limit;
162     }
163 
164     // Return TRUE even if the entire text is COMMON / INHERITED, in
165     // which case scriptCode will be USCRIPT_INVALID_CODE.
166     return TRUE;
167 }
168 
adjustLimit(int32_t delta)169 void ScriptRunIterator::adjustLimit(int32_t delta) {
170     limit += delta;
171     textLimit += delta;
172 }
173 
174 //------------------------------------------------------------
175 // AnyTransliterator
176 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)

/**
 * Builds an Any-to-target transliterator.
 *
 * @param id               ID passed to the Transliterator base class.
 * @param theTarget        target name.
 * @param theVariant       variant name; when non-empty it is appended to
 *                         the stored target after a '/' separator.
 * @param theTargetScript  script code of the target.
 * @param ec               receives the status of opening the cache table;
 *                         on failure the stored target is left unset.
 */
AnyTransliterator::AnyTransliterator(const UnicodeString& id,
                                     const UnicodeString& theTarget,
                                     const UnicodeString& theVariant,
                                     UScriptCode theTargetScript,
                                     UErrorCode& ec) :
    Transliterator(id, NULL),
    targetScript(theTargetScript)
{
    cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
    if (!U_FAILURE(ec)) {
        // Cached values are Transliterator objects, deleted via this deleter.
        uhash_setValueDeleter(cache, _deleteTransliterator);

        target = theTarget;
        if (theVariant.length() > 0) {
            target.append(VARIANT_SEP);
            target.append(theVariant);
        }
    }
}
198 
~AnyTransliterator()199 AnyTransliterator::~AnyTransliterator() {
200     uhash_close(cache);
201 }
202 
203 /**
204  * Copy constructor.
205  */
AnyTransliterator(const AnyTransliterator & o)206 AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
207     Transliterator(o),
208     target(o.target),
209     targetScript(o.targetScript)
210 {
211     // Don't copy the cache contents
212     UErrorCode ec = U_ZERO_ERROR;
213     cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
214     if (U_FAILURE(ec)) {
215         return;
216     }
217     uhash_setValueDeleter(cache, _deleteTransliterator);
218 }
219 
220 /**
221  * Transliterator API.
222  */
clone() const223 Transliterator* AnyTransliterator::clone() const {
224     return new AnyTransliterator(*this);
225 }
226 
227 /**
228  * Implements {@link Transliterator#handleTransliterate}.
229  */
handleTransliterate(Replaceable & text,UTransPosition & pos,UBool isIncremental) const230 void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
231                                             UBool isIncremental) const {
232     int32_t allStart = pos.start;
233     int32_t allLimit = pos.limit;
234 
235     ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
236 
237     while (it.next()) {
238         // Ignore runs in the ante context
239         if (it.limit <= allStart) continue;
240 
241         // Try to instantiate transliterator from it.scriptCode to
242         // our target or target/variant
243         Transliterator* t = getTransliterator(it.scriptCode);
244 
245         if (t == NULL) {
246             // We have no transliterator.  Do nothing, but keep
247             // pos.start up to date.
248             pos.start = it.limit;
249             continue;
250         }
251 
252         // If the run end is before the transliteration limit, do
253         // a non-incremental transliteration.  Otherwise do an
254         // incremental one.
255         UBool incremental = isIncremental && (it.limit >= allLimit);
256 
257         pos.start = uprv_max(allStart, it.start);
258         pos.limit = uprv_min(allLimit, it.limit);
259         int32_t limit = pos.limit;
260         t->filteredTransliterate(text, pos, incremental);
261         int32_t delta = pos.limit - limit;
262         allLimit += delta;
263         it.adjustLimit(delta);
264 
265         // We're done if we enter the post context
266         if (it.limit >= allLimit) break;
267     }
268 
269     // Restore limit.  pos.start is fine where the last transliterator
270     // left it, or at the end of the last run.
271     pos.limit = allLimit;
272 }
273 
getTransliterator(UScriptCode source) const274 Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
275 
276     if (source == targetScript || source == USCRIPT_INVALID_CODE) {
277         return NULL;
278     }
279 
280     Transliterator* t = NULL;
281     {
282         Mutex m(NULL);
283         t = (Transliterator*) uhash_iget(cache, (int32_t) source);
284     }
285     if (t == NULL) {
286         UErrorCode ec = U_ZERO_ERROR;
287         UnicodeString sourceName(uscript_getName(source), -1, US_INV);
288         UnicodeString id(sourceName);
289         id.append(TARGET_SEP).append(target);
290 
291         t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
292         if (U_FAILURE(ec) || t == NULL) {
293             delete t;
294 
295             // Try to pivot around Latin, our most common script
296             id = sourceName;
297             id.append(LATIN_PIVOT, -1).append(target);
298             t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
299             if (U_FAILURE(ec) || t == NULL) {
300                 delete t;
301                 t = NULL;
302             }
303         }
304 
305         if (t != NULL) {
306             Transliterator *rt = NULL;
307             {
308                 Mutex m(NULL);
309                 rt = static_cast<Transliterator *> (uhash_iget(cache, (int32_t) source));
310                 if (rt == NULL) {
311                     // Common case, no race to cache this new transliterator.
312                     uhash_iput(cache, (int32_t) source, t, &ec);
313                 } else {
314                     // Race case, some other thread beat us to caching this transliterator.
315                     Transliterator *temp = rt;
316                     rt = t;    // Our newly created transliterator that lost the race & now needs deleting.
317                     t  = temp; // The transliterator from the cache that we will return.
318                 }
319             }
320             delete rt;    // will be non-null only in case of races.
321         }
322     }
323     return t;
324 }
325 
326 /**
327  * Return the script code for a given name, or -1 if not found.
328  */
scriptNameToCode(const UnicodeString & name)329 static UScriptCode scriptNameToCode(const UnicodeString& name) {
330     char buf[128];
331     UScriptCode code;
332     UErrorCode ec = U_ZERO_ERROR;
333     int32_t nameLen = name.length();
334     UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);
335 
336     if (isInvariant) {
337         name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
338         buf[127] = 0;   // Make sure that we NULL terminate the string.
339     }
340     if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
341     {
342         code = USCRIPT_INVALID_CODE;
343     }
344     return code;
345 }
346 
347 /**
348  * Registers standard transliterators with the system.  Called by
349  * Transliterator during initialization.  Scan all current targets and
350  * register those that are scripts T as Any-T/V.
351  */
registerIDs()352 void AnyTransliterator::registerIDs() {
353 
354     UErrorCode ec = U_ZERO_ERROR;
355     Hashtable seen(TRUE, ec);
356 
357     int32_t sourceCount = Transliterator::_countAvailableSources();
358     for (int32_t s=0; s<sourceCount; ++s) {
359         UnicodeString source;
360         Transliterator::_getAvailableSource(s, source);
361 
362         // Ignore the "Any" source
363         if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
364 
365         int32_t targetCount = Transliterator::_countAvailableTargets(source);
366         for (int32_t t=0; t<targetCount; ++t) {
367             UnicodeString target;
368             Transliterator::_getAvailableTarget(t, source, target);
369 
370             // Only process each target once
371             if (seen.geti(target) != 0) continue;
372             ec = U_ZERO_ERROR;
373             seen.puti(target, 1, ec);
374 
375             // Get the script code for the target.  If not a script, ignore.
376             UScriptCode targetScript = scriptNameToCode(target);
377             if (targetScript == USCRIPT_INVALID_CODE) continue;
378 
379             int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
380             // assert(variantCount >= 1);
381             for (int32_t v=0; v<variantCount; ++v) {
382                 UnicodeString variant;
383                 Transliterator::_getAvailableVariant(v, source, target, variant);
384 
385                 UnicodeString id;
386                 TransliteratorIDParser::STVtoID(UnicodeString(TRUE, ANY, 3), target, variant, id);
387                 ec = U_ZERO_ERROR;
388                 AnyTransliterator* t = new AnyTransliterator(id, target, variant,
389                                                              targetScript, ec);
390                 if (U_FAILURE(ec)) {
391                     delete t;
392                 } else {
393                     Transliterator::_registerInstance(t);
394                     Transliterator::_registerSpecialInverse(target, UnicodeString(TRUE, NULL_ID, 4), FALSE);
395                 }
396             }
397         }
398     }
399 }
400 
401 U_NAMESPACE_END
402 
403 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
404 
405 //eof
406